diff --git "a/qwen25_0.5b_lora_official_5e-05/checkpoint-14718/trainer_state.json" "b/qwen25_0.5b_lora_official_5e-05/checkpoint-14718/trainer_state.json" --- "a/qwen25_0.5b_lora_official_5e-05/checkpoint-14718/trainer_state.json" +++ "b/qwen25_0.5b_lora_official_5e-05/checkpoint-14718/trainer_state.json" @@ -1,5 +1,5 @@ { - "best_metric": 0.002085434390429638, + "best_metric": 0.013088030408351149, "best_model_checkpoint": "./results-cc/qwen25-0.5b-instruct/qwen25_0.5b_lora_official_5e-05/checkpoint-14718", "epoch": 1.0, "eval_steps": 500, @@ -10,20611 +10,20611 @@ "log_history": [ { "epoch": 0.0003397200706617747, - "grad_norm": 2.2115156650543213, + "grad_norm": 2.220684289932251, "learning_rate": 4.9997876749558366e-05, - "loss": 3.2474, + "loss": 3.2464, "step": 5 }, { "epoch": 0.0006794401413235494, - "grad_norm": 1.9220579862594604, + "grad_norm": 1.9180666208267212, "learning_rate": 4.999575349911673e-05, - "loss": 2.8647, + "loss": 2.8655, "step": 10 }, { "epoch": 0.0010191602119853241, - "grad_norm": 1.5465667247772217, + "grad_norm": 1.5330926179885864, "learning_rate": 4.9993630248675094e-05, - "loss": 2.6643, + "loss": 2.6645, "step": 15 }, { "epoch": 0.001358880282647099, - "grad_norm": 1.5638021230697632, + "grad_norm": 1.553266167640686, "learning_rate": 4.999150699823346e-05, - "loss": 2.3563, + "loss": 2.3561, "step": 20 }, { "epoch": 0.0016986003533088735, - "grad_norm": 1.5559604167938232, + "grad_norm": 1.5376861095428467, "learning_rate": 4.998938374779182e-05, - "loss": 2.2025, + "loss": 2.2031, "step": 25 }, { "epoch": 0.0020383204239706482, - "grad_norm": 1.4563536643981934, + "grad_norm": 1.4365992546081543, "learning_rate": 4.9987260497350186e-05, - "loss": 1.9061, + "loss": 1.9062, "step": 30 }, { "epoch": 0.002378040494632423, - "grad_norm": 1.561199426651001, + "grad_norm": 1.5631930828094482, "learning_rate": 4.998513724690855e-05, - "loss": 1.6777, + "loss": 1.6762, "step": 35 }, { "epoch": 0.002717760565294198, - "grad_norm": 0.7528368234634399, + "grad_norm": 0.6962679624557495, "learning_rate": 4.9983013996466914e-05, - "loss": 1.522, + "loss": 1.5196, "step": 40 }, { "epoch": 0.0030574806359559724, - "grad_norm": 0.5895223021507263, + "grad_norm": 0.5944058895111084, "learning_rate": 4.998089074602528e-05, - "loss": 1.5657, + "loss": 1.5653, "step": 45 }, { "epoch": 0.003397200706617747, - "grad_norm": 0.6270008087158203, + "grad_norm": 0.5761585831642151, "learning_rate": 4.997876749558364e-05, - "loss": 1.5043, + "loss": 1.5031, "step": 50 }, { "epoch": 0.0037369207772795215, - "grad_norm": 0.5577742457389832, + "grad_norm": 0.5524360537528992, "learning_rate": 4.9976644245142006e-05, - "loss": 1.5139, + "loss": 1.513, "step": 55 }, { "epoch": 0.0040766408479412965, - "grad_norm": 0.5614880323410034, + "grad_norm": 0.5626854300498962, "learning_rate": 4.997452099470037e-05, - "loss": 1.5069, + "loss": 1.5058, "step": 60 }, { "epoch": 0.0044163609186030715, - "grad_norm": 0.4737943410873413, + "grad_norm": 0.4755340814590454, "learning_rate": 4.997239774425873e-05, - "loss": 1.5826, + "loss": 1.5827, "step": 65 }, { "epoch": 0.004756080989264846, - "grad_norm": 0.5027161240577698, + "grad_norm": 0.5004586577415466, "learning_rate": 4.99702744938171e-05, - "loss": 1.3716, + "loss": 1.372, "step": 70 }, { "epoch": 0.005095801059926621, - "grad_norm": 0.6184471845626831, + "grad_norm": 0.6223646998405457, "learning_rate": 4.996815124337546e-05, - "loss": 1.4837, + "loss": 1.4841, "step": 75 }, { "epoch": 0.005435521130588396, - "grad_norm": 0.507949948310852, + "grad_norm": 0.5142911076545715, "learning_rate": 4.996602799293382e-05, - "loss": 1.5033, + "loss": 1.5036, "step": 80 }, { "epoch": 0.00577524120125017, - "grad_norm": 0.43082889914512634, + "grad_norm": 0.43239352107048035, "learning_rate": 4.996390474249219e-05, - "loss": 1.5037, + "loss": 1.5031, "step": 85 }, { "epoch": 0.006114961271911945, - "grad_norm": 0.5485135912895203, + "grad_norm": 0.5478119254112244, "learning_rate": 4.9961781492050554e-05, - "loss": 1.5205, + "loss": 1.5213, "step": 90 }, { "epoch": 0.006454681342573719, - "grad_norm": 0.6378966569900513, + "grad_norm": 0.6446812152862549, "learning_rate": 4.995965824160891e-05, - "loss": 1.505, + "loss": 1.5048, "step": 95 }, { "epoch": 0.006794401413235494, - "grad_norm": 0.545911431312561, + "grad_norm": 0.5452167987823486, "learning_rate": 4.995753499116728e-05, - "loss": 1.5321, + "loss": 1.5325, "step": 100 }, { "epoch": 0.007134121483897269, - "grad_norm": 0.5056548714637756, + "grad_norm": 0.5040423274040222, "learning_rate": 4.9955411740725646e-05, - "loss": 1.4546, + "loss": 1.455, "step": 105 }, { "epoch": 0.007473841554559043, - "grad_norm": 0.47536423802375793, + "grad_norm": 0.4777057468891144, "learning_rate": 4.9953288490284004e-05, - "loss": 1.465, + "loss": 1.4649, "step": 110 }, { "epoch": 0.007813561625220818, - "grad_norm": 0.5124016404151917, + "grad_norm": 0.5126002430915833, "learning_rate": 4.9951165239842374e-05, - "loss": 1.5854, + "loss": 1.5857, "step": 115 }, { "epoch": 0.008153281695882593, - "grad_norm": 0.4447226822376251, + "grad_norm": 0.44543054699897766, "learning_rate": 4.994904198940074e-05, - "loss": 1.4954, + "loss": 1.4953, "step": 120 }, { "epoch": 0.008493001766544368, - "grad_norm": 0.5351713299751282, + "grad_norm": 0.5366056561470032, "learning_rate": 4.9946918738959096e-05, - "loss": 1.512, + "loss": 1.5116, "step": 125 }, { "epoch": 0.008832721837206143, - "grad_norm": 0.49306538701057434, + "grad_norm": 0.49342411756515503, "learning_rate": 4.9944795488517466e-05, "loss": 1.4245, "step": 130 }, { "epoch": 0.009172441907867916, - "grad_norm": 0.5303800702095032, + "grad_norm": 0.5293818712234497, "learning_rate": 4.994267223807583e-05, - "loss": 1.4689, + "loss": 1.4691, "step": 135 }, { "epoch": 0.009512161978529691, - "grad_norm": 0.5413838624954224, + "grad_norm": 0.5411449074745178, "learning_rate": 4.994054898763419e-05, - "loss": 1.4714, + "loss": 1.4711, "step": 140 }, { "epoch": 0.009851882049191466, - "grad_norm": 0.5445604920387268, + "grad_norm": 0.5442690253257751, "learning_rate": 4.993842573719256e-05, - "loss": 1.4894, + "loss": 1.4896, "step": 145 }, { "epoch": 0.010191602119853241, - "grad_norm": 0.5994873642921448, + "grad_norm": 0.6027960777282715, "learning_rate": 4.9936302486750916e-05, - "loss": 1.5265, + "loss": 1.5266, "step": 150 }, { "epoch": 0.010531322190515016, - "grad_norm": 0.5787465572357178, + "grad_norm": 0.5794451832771301, "learning_rate": 4.993417923630928e-05, - "loss": 1.546, + "loss": 1.5466, "step": 155 }, { "epoch": 0.010871042261176791, - "grad_norm": 0.5157501101493835, + "grad_norm": 0.5141589641571045, "learning_rate": 4.993205598586765e-05, - "loss": 1.4844, + "loss": 1.4846, "step": 160 }, { "epoch": 0.011210762331838564, - "grad_norm": 0.5613402724266052, + "grad_norm": 0.5596498250961304, "learning_rate": 4.992993273542601e-05, - "loss": 1.5223, + "loss": 1.5219, "step": 165 }, { "epoch": 0.01155048240250034, - "grad_norm": 0.5664464235305786, + "grad_norm": 0.5729048252105713, "learning_rate": 4.992780948498437e-05, - "loss": 1.4891, + "loss": 1.4898, "step": 170 }, { "epoch": 0.011890202473162114, - "grad_norm": 0.5007671117782593, + "grad_norm": 0.5001816153526306, "learning_rate": 4.992568623454274e-05, - "loss": 1.4462, + "loss": 1.4461, "step": 175 }, { "epoch": 0.01222992254382389, - "grad_norm": 0.5545527935028076, + "grad_norm": 0.5542340278625488, "learning_rate": 4.99235629841011e-05, - "loss": 1.5052, + "loss": 1.5054, "step": 180 }, { "epoch": 0.012569642614485664, - "grad_norm": 0.48418471217155457, + "grad_norm": 0.4830928146839142, "learning_rate": 4.992143973365947e-05, - "loss": 1.5056, + "loss": 1.5046, "step": 185 }, { "epoch": 0.012909362685147438, - "grad_norm": 0.5149874687194824, + "grad_norm": 0.5120685696601868, "learning_rate": 4.9919316483217835e-05, - "loss": 1.5157, + "loss": 1.5163, "step": 190 }, { "epoch": 0.013249082755809213, - "grad_norm": 0.41576215624809265, + "grad_norm": 0.4157141149044037, "learning_rate": 4.991719323277619e-05, - "loss": 1.5507, + "loss": 1.5509, "step": 195 }, { "epoch": 0.013588802826470988, - "grad_norm": 0.6348430514335632, + "grad_norm": 0.631702721118927, "learning_rate": 4.991506998233456e-05, - "loss": 1.4514, + "loss": 1.451, "step": 200 }, { "epoch": 0.013928522897132763, - "grad_norm": 0.43560290336608887, + "grad_norm": 0.43671202659606934, "learning_rate": 4.9912946731892927e-05, - "loss": 1.4208, + "loss": 1.4205, "step": 205 }, { "epoch": 0.014268242967794538, - "grad_norm": 0.5400347709655762, + "grad_norm": 0.5415316224098206, "learning_rate": 4.9910823481451284e-05, - "loss": 1.4863, + "loss": 1.4865, "step": 210 }, { "epoch": 0.014607963038456313, - "grad_norm": 0.49760323762893677, + "grad_norm": 0.5014939308166504, "learning_rate": 4.9908700231009655e-05, - "loss": 1.5158, + "loss": 1.5154, "step": 215 }, { "epoch": 0.014947683109118086, - "grad_norm": 0.5227531790733337, + "grad_norm": 0.5216646790504456, "learning_rate": 4.990657698056801e-05, - "loss": 1.5481, + "loss": 1.5477, "step": 220 }, { "epoch": 0.015287403179779861, - "grad_norm": 0.6145650148391724, + "grad_norm": 0.6209692358970642, "learning_rate": 4.9904453730126376e-05, - "loss": 1.5456, + "loss": 1.545, "step": 225 }, { "epoch": 0.015627123250441636, - "grad_norm": 0.469058096408844, + "grad_norm": 0.4683520197868347, "learning_rate": 4.9902330479684747e-05, - "loss": 1.4884, + "loss": 1.4886, "step": 230 }, { "epoch": 0.01596684332110341, - "grad_norm": 0.5135950446128845, + "grad_norm": 0.513251006603241, "learning_rate": 4.9900207229243104e-05, - "loss": 1.4663, + "loss": 1.4672, "step": 235 }, { "epoch": 0.016306563391765186, - "grad_norm": 0.5521801710128784, + "grad_norm": 0.5497923493385315, "learning_rate": 4.989808397880147e-05, - "loss": 1.423, + "loss": 1.4225, "step": 240 }, { "epoch": 0.01664628346242696, - "grad_norm": 0.5495691895484924, + "grad_norm": 0.5504872798919678, "learning_rate": 4.989596072835984e-05, - "loss": 1.4743, + "loss": 1.4745, "step": 245 }, { "epoch": 0.016986003533088736, - "grad_norm": 0.444443017244339, + "grad_norm": 0.44505226612091064, "learning_rate": 4.9893837477918196e-05, - "loss": 1.477, + "loss": 1.4769, "step": 250 }, { "epoch": 0.01732572360375051, - "grad_norm": 0.5638461709022522, + "grad_norm": 0.5649851560592651, "learning_rate": 4.989171422747656e-05, - "loss": 1.3996, + "loss": 1.4008, "step": 255 }, { "epoch": 0.017665443674412286, - "grad_norm": 0.6463044881820679, + "grad_norm": 0.6474494934082031, "learning_rate": 4.988959097703493e-05, - "loss": 1.4509, + "loss": 1.4512, "step": 260 }, { "epoch": 0.01800516374507406, - "grad_norm": 0.489048033952713, + "grad_norm": 0.48915228247642517, "learning_rate": 4.988746772659329e-05, - "loss": 1.5476, + "loss": 1.548, "step": 265 }, { "epoch": 0.018344883815735832, - "grad_norm": 0.5454437732696533, + "grad_norm": 0.5431952476501465, "learning_rate": 4.988534447615165e-05, - "loss": 1.3992, + "loss": 1.3993, "step": 270 }, { "epoch": 0.01868460388639761, - "grad_norm": 0.47111111879348755, + "grad_norm": 0.4723069369792938, "learning_rate": 4.988322122571002e-05, - "loss": 1.5034, + "loss": 1.5038, "step": 275 }, { "epoch": 0.019024323957059382, - "grad_norm": 0.5251321792602539, + "grad_norm": 0.5241596102714539, "learning_rate": 4.988109797526838e-05, - "loss": 1.4966, + "loss": 1.4976, "step": 280 }, { "epoch": 0.01936404402772116, - "grad_norm": 0.5076407194137573, + "grad_norm": 0.505593478679657, "learning_rate": 4.9878974724826744e-05, - "loss": 1.423, + "loss": 1.4228, "step": 285 }, { "epoch": 0.019703764098382932, - "grad_norm": 0.4797162413597107, + "grad_norm": 0.4799843728542328, "learning_rate": 4.987685147438511e-05, "loss": 1.4852, "step": 290 }, { "epoch": 0.020043484169044706, - "grad_norm": 0.5633953213691711, + "grad_norm": 0.5613768100738525, "learning_rate": 4.987472822394347e-05, - "loss": 1.5244, + "loss": 1.5239, "step": 295 }, { "epoch": 0.020383204239706482, - "grad_norm": 0.5671972632408142, + "grad_norm": 0.5662097334861755, "learning_rate": 4.9872604973501836e-05, - "loss": 1.4672, + "loss": 1.4671, "step": 300 }, { "epoch": 0.020722924310368256, - "grad_norm": 0.5827104449272156, + "grad_norm": 0.5767617225646973, "learning_rate": 4.98704817230602e-05, - "loss": 1.3738, + "loss": 1.3733, "step": 305 }, { "epoch": 0.021062644381030032, - "grad_norm": 0.5388350486755371, + "grad_norm": 0.5416541695594788, "learning_rate": 4.9868358472618564e-05, - "loss": 1.4657, + "loss": 1.4656, "step": 310 }, { "epoch": 0.021402364451691806, - "grad_norm": 0.5764205455780029, + "grad_norm": 0.576328456401825, "learning_rate": 4.986623522217693e-05, - "loss": 1.4862, + "loss": 1.4863, "step": 315 }, { "epoch": 0.021742084522353582, - "grad_norm": 0.5410416722297668, + "grad_norm": 0.5416762232780457, "learning_rate": 4.986411197173529e-05, - "loss": 1.4702, + "loss": 1.4703, "step": 320 }, { "epoch": 0.022081804593015356, - "grad_norm": 0.5311737656593323, + "grad_norm": 0.5314561724662781, "learning_rate": 4.9861988721293656e-05, - "loss": 1.5223, + "loss": 1.5224, "step": 325 }, { "epoch": 0.02242152466367713, - "grad_norm": 0.4751683473587036, + "grad_norm": 0.47489133477211, "learning_rate": 4.985986547085202e-05, - "loss": 1.4521, + "loss": 1.4526, "step": 330 }, { "epoch": 0.022761244734338906, - "grad_norm": 0.5192616581916809, + "grad_norm": 0.5184615850448608, "learning_rate": 4.9857742220410384e-05, - "loss": 1.5345, + "loss": 1.5341, "step": 335 }, { "epoch": 0.02310096480500068, - "grad_norm": 0.5677399039268494, + "grad_norm": 0.574855387210846, "learning_rate": 4.985561896996875e-05, - "loss": 1.4637, + "loss": 1.4629, "step": 340 }, { "epoch": 0.023440684875662456, - "grad_norm": 0.5919628143310547, + "grad_norm": 0.5987824201583862, "learning_rate": 4.985349571952711e-05, - "loss": 1.4891, + "loss": 1.4895, "step": 345 }, { "epoch": 0.02378040494632423, - "grad_norm": 0.5812225937843323, + "grad_norm": 0.5849152207374573, "learning_rate": 4.9851372469085476e-05, - "loss": 1.3587, + "loss": 1.3584, "step": 350 }, { "epoch": 0.024120125016986002, - "grad_norm": 0.5220487713813782, + "grad_norm": 0.5208017826080322, "learning_rate": 4.984924921864384e-05, - "loss": 1.4029, + "loss": 1.403, "step": 355 }, { "epoch": 0.02445984508764778, - "grad_norm": 0.5572966933250427, + "grad_norm": 0.5547541379928589, "learning_rate": 4.9847125968202204e-05, - "loss": 1.4791, + "loss": 1.479, "step": 360 }, { "epoch": 0.024799565158309552, - "grad_norm": 0.49765080213546753, + "grad_norm": 0.49849653244018555, "learning_rate": 4.984500271776057e-05, - "loss": 1.4598, + "loss": 1.4603, "step": 365 }, { "epoch": 0.02513928522897133, - "grad_norm": 0.5231608152389526, + "grad_norm": 0.5223923921585083, "learning_rate": 4.984287946731893e-05, - "loss": 1.5089, + "loss": 1.5092, "step": 370 }, { "epoch": 0.025479005299633102, - "grad_norm": 0.5667971968650818, + "grad_norm": 0.5657708644866943, "learning_rate": 4.9840756216877296e-05, - "loss": 1.5159, + "loss": 1.5155, "step": 375 }, { "epoch": 0.025818725370294875, - "grad_norm": 0.5797260999679565, + "grad_norm": 0.5795098543167114, "learning_rate": 4.983863296643566e-05, - "loss": 1.5268, + "loss": 1.527, "step": 380 }, { "epoch": 0.026158445440956652, - "grad_norm": 0.512911856174469, + "grad_norm": 0.5121040344238281, "learning_rate": 4.9836509715994024e-05, - "loss": 1.5491, + "loss": 1.549, "step": 385 }, { "epoch": 0.026498165511618425, - "grad_norm": 0.5526360273361206, + "grad_norm": 0.5503144860267639, "learning_rate": 4.983438646555239e-05, - "loss": 1.5025, + "loss": 1.5023, "step": 390 }, { "epoch": 0.026837885582280202, - "grad_norm": 0.5525634288787842, + "grad_norm": 0.5591674447059631, "learning_rate": 4.983226321511075e-05, - "loss": 1.4455, + "loss": 1.4452, "step": 395 }, { "epoch": 0.027177605652941975, - "grad_norm": 0.5490344762802124, + "grad_norm": 0.5500125288963318, "learning_rate": 4.9830139964669116e-05, - "loss": 1.4632, + "loss": 1.4631, "step": 400 }, { "epoch": 0.027517325723603752, - "grad_norm": 0.5567771792411804, + "grad_norm": 0.556633472442627, "learning_rate": 4.982801671422748e-05, - "loss": 1.4736, + "loss": 1.4741, "step": 405 }, { "epoch": 0.027857045794265525, - "grad_norm": 0.5474903583526611, + "grad_norm": 0.5470479130744934, "learning_rate": 4.9825893463785844e-05, "loss": 1.4769, "step": 410 }, { "epoch": 0.0281967658649273, - "grad_norm": 0.6983373761177063, + "grad_norm": 0.6944449543952942, "learning_rate": 4.982377021334421e-05, - "loss": 1.4939, + "loss": 1.4933, "step": 415 }, { "epoch": 0.028536485935589075, - "grad_norm": 0.5765412449836731, + "grad_norm": 0.5708439350128174, "learning_rate": 4.9821646962902565e-05, - "loss": 1.4371, + "loss": 1.4373, "step": 420 }, { "epoch": 0.02887620600625085, - "grad_norm": 0.5680785775184631, + "grad_norm": 0.565213680267334, "learning_rate": 4.9819523712460936e-05, - "loss": 1.4653, + "loss": 1.4646, "step": 425 }, { "epoch": 0.029215926076912625, - "grad_norm": 0.5412982106208801, + "grad_norm": 0.5410686135292053, "learning_rate": 4.98174004620193e-05, - "loss": 1.4598, + "loss": 1.4593, "step": 430 }, { "epoch": 0.0295556461475744, - "grad_norm": 0.5168296694755554, + "grad_norm": 0.5180397033691406, "learning_rate": 4.981527721157766e-05, - "loss": 1.4298, + "loss": 1.4304, "step": 435 }, { "epoch": 0.029895366218236172, - "grad_norm": 0.4910033643245697, + "grad_norm": 0.492260605096817, "learning_rate": 4.981315396113603e-05, - "loss": 1.4402, + "loss": 1.4404, "step": 440 }, { "epoch": 0.03023508628889795, - "grad_norm": 0.5194764137268066, + "grad_norm": 0.5216431617736816, "learning_rate": 4.981103071069439e-05, "loss": 1.4435, "step": 445 }, { "epoch": 0.030574806359559722, - "grad_norm": 0.45196419954299927, + "grad_norm": 0.4491364657878876, "learning_rate": 4.980890746025275e-05, - "loss": 1.3709, + "loss": 1.371, "step": 450 }, { "epoch": 0.0309145264302215, - "grad_norm": 0.5187885761260986, + "grad_norm": 0.5196198225021362, "learning_rate": 4.980678420981112e-05, - "loss": 1.5127, + "loss": 1.5134, "step": 455 }, { "epoch": 0.03125424650088327, - "grad_norm": 0.6062313914299011, + "grad_norm": 0.6152990460395813, "learning_rate": 4.9804660959369484e-05, - "loss": 1.4292, + "loss": 1.4299, "step": 460 }, { "epoch": 0.03159396657154505, - "grad_norm": 0.5382944345474243, + "grad_norm": 0.5448952913284302, "learning_rate": 4.980253770892784e-05, - "loss": 1.4779, + "loss": 1.4784, "step": 465 }, { "epoch": 0.03193368664220682, - "grad_norm": 0.5015461444854736, + "grad_norm": 0.4982541501522064, "learning_rate": 4.980041445848621e-05, - "loss": 1.4652, + "loss": 1.4654, "step": 470 }, { "epoch": 0.032273406712868595, - "grad_norm": 0.5272130370140076, + "grad_norm": 0.5250760316848755, "learning_rate": 4.9798291208044576e-05, - "loss": 1.4272, + "loss": 1.4276, "step": 475 }, { "epoch": 0.03261312678353037, - "grad_norm": 0.5345466732978821, + "grad_norm": 0.5274128913879395, "learning_rate": 4.979616795760293e-05, - "loss": 1.435, + "loss": 1.4359, "step": 480 }, { "epoch": 0.03295284685419215, - "grad_norm": 0.5273901224136353, + "grad_norm": 0.5279980301856995, "learning_rate": 4.9794044707161304e-05, - "loss": 1.4544, + "loss": 1.4545, "step": 485 }, { "epoch": 0.03329256692485392, - "grad_norm": 0.6292443871498108, + "grad_norm": 0.6279738545417786, "learning_rate": 4.979192145671966e-05, "loss": 1.4516, "step": 490 }, { "epoch": 0.033632286995515695, - "grad_norm": 0.656648576259613, + "grad_norm": 0.6582928895950317, "learning_rate": 4.9789798206278025e-05, - "loss": 1.4173, + "loss": 1.4178, "step": 495 }, { "epoch": 0.03397200706617747, - "grad_norm": 0.5808439254760742, + "grad_norm": 0.5777362585067749, "learning_rate": 4.9787674955836396e-05, - "loss": 1.4431, + "loss": 1.4436, "step": 500 }, { "epoch": 0.03431172713683924, - "grad_norm": 0.502206027507782, + "grad_norm": 0.5016368627548218, "learning_rate": 4.978555170539475e-05, "loss": 1.4618, "step": 505 }, { "epoch": 0.03465144720750102, - "grad_norm": 0.5035110712051392, + "grad_norm": 0.5017910003662109, "learning_rate": 4.978342845495312e-05, - "loss": 1.3757, + "loss": 1.3759, "step": 510 }, { "epoch": 0.034991167278162795, - "grad_norm": 0.5452114939689636, + "grad_norm": 0.5451638698577881, "learning_rate": 4.978130520451149e-05, - "loss": 1.4199, + "loss": 1.42, "step": 515 }, { "epoch": 0.03533088734882457, - "grad_norm": 0.5839388370513916, + "grad_norm": 0.5852941870689392, "learning_rate": 4.9779181954069845e-05, - "loss": 1.5315, + "loss": 1.5313, "step": 520 }, { "epoch": 0.03567060741948634, - "grad_norm": 0.5461863875389099, + "grad_norm": 0.5483387112617493, "learning_rate": 4.9777058703628216e-05, - "loss": 1.5461, + "loss": 1.5458, "step": 525 }, { "epoch": 0.03601032749014812, - "grad_norm": 0.5233834385871887, + "grad_norm": 0.5246821045875549, "learning_rate": 4.977493545318658e-05, - "loss": 1.4343, + "loss": 1.4339, "step": 530 }, { "epoch": 0.036350047560809895, - "grad_norm": 0.5493288040161133, + "grad_norm": 0.5500034093856812, "learning_rate": 4.977281220274494e-05, - "loss": 1.5356, + "loss": 1.536, "step": 535 }, { "epoch": 0.036689767631471665, - "grad_norm": 0.5625514984130859, + "grad_norm": 0.5636404752731323, "learning_rate": 4.977068895230331e-05, - "loss": 1.6131, + "loss": 1.6127, "step": 540 }, { "epoch": 0.03702948770213344, - "grad_norm": 0.5404923558235168, + "grad_norm": 0.5450454354286194, "learning_rate": 4.976856570186167e-05, "loss": 1.4323, "step": 545 }, { "epoch": 0.03736920777279522, - "grad_norm": 0.4866144359111786, + "grad_norm": 0.48811179399490356, "learning_rate": 4.976644245142003e-05, - "loss": 1.4887, + "loss": 1.4881, "step": 550 }, { "epoch": 0.03770892784345699, - "grad_norm": 0.5609140396118164, + "grad_norm": 0.5600892901420593, "learning_rate": 4.97643192009784e-05, - "loss": 1.4837, + "loss": 1.4841, "step": 555 }, { "epoch": 0.038048647914118765, - "grad_norm": 0.5729807019233704, + "grad_norm": 0.5723892450332642, "learning_rate": 4.9762195950536764e-05, "loss": 1.5659, "step": 560 }, { "epoch": 0.03838836798478054, - "grad_norm": 0.5228670239448547, + "grad_norm": 0.5241605043411255, "learning_rate": 4.976007270009512e-05, - "loss": 1.4805, + "loss": 1.4799, "step": 565 }, { "epoch": 0.03872808805544232, - "grad_norm": 0.5944375991821289, + "grad_norm": 0.5933988690376282, "learning_rate": 4.975794944965349e-05, - "loss": 1.4658, + "loss": 1.4652, "step": 570 }, { "epoch": 0.03906780812610409, - "grad_norm": 0.5736159682273865, + "grad_norm": 0.5706843733787537, "learning_rate": 4.975582619921185e-05, - "loss": 1.4871, + "loss": 1.4868, "step": 575 }, { "epoch": 0.039407528196765865, - "grad_norm": 0.5262963175773621, + "grad_norm": 0.5259225964546204, "learning_rate": 4.975370294877021e-05, - "loss": 1.4684, + "loss": 1.468, "step": 580 }, { "epoch": 0.03974724826742764, - "grad_norm": 0.5111757516860962, + "grad_norm": 0.5126088857650757, "learning_rate": 4.9751579698328584e-05, - "loss": 1.46, + "loss": 1.4601, "step": 585 }, { "epoch": 0.04008696833808941, - "grad_norm": 0.575961172580719, + "grad_norm": 0.5756707787513733, "learning_rate": 4.974945644788694e-05, - "loss": 1.4456, + "loss": 1.4454, "step": 590 }, { "epoch": 0.04042668840875119, - "grad_norm": 0.5009347200393677, + "grad_norm": 0.5006542801856995, "learning_rate": 4.9747333197445305e-05, - "loss": 1.4215, + "loss": 1.4222, "step": 595 }, { "epoch": 0.040766408479412965, - "grad_norm": 0.566643476486206, + "grad_norm": 0.5641801357269287, "learning_rate": 4.9745209947003676e-05, - "loss": 1.3709, + "loss": 1.3713, "step": 600 }, { "epoch": 0.04110612855007474, - "grad_norm": 0.5933975577354431, + "grad_norm": 0.5969881415367126, "learning_rate": 4.974308669656203e-05, - "loss": 1.4551, + "loss": 1.455, "step": 605 }, { "epoch": 0.04144584862073651, - "grad_norm": 0.6027320623397827, + "grad_norm": 0.6034646034240723, "learning_rate": 4.97409634461204e-05, "loss": 1.4568, "step": 610 }, { "epoch": 0.04178556869139829, - "grad_norm": 0.6281547546386719, + "grad_norm": 0.6286112666130066, "learning_rate": 4.973884019567877e-05, "loss": 1.4873, "step": 615 }, { "epoch": 0.042125288762060065, - "grad_norm": 0.5826748609542847, + "grad_norm": 0.5790241956710815, "learning_rate": 4.9736716945237125e-05, "loss": 1.4697, "step": 620 }, { "epoch": 0.042465008832721834, - "grad_norm": 0.5156673789024353, + "grad_norm": 0.517672598361969, "learning_rate": 4.973459369479549e-05, - "loss": 1.4339, + "loss": 1.4343, "step": 625 }, { "epoch": 0.04280472890338361, - "grad_norm": 0.5677419900894165, + "grad_norm": 0.5668666958808899, "learning_rate": 4.973247044435386e-05, - "loss": 1.3631, + "loss": 1.3632, "step": 630 }, { "epoch": 0.04314444897404539, - "grad_norm": 0.5827733278274536, + "grad_norm": 0.5779948830604553, "learning_rate": 4.973034719391222e-05, - "loss": 1.4643, + "loss": 1.4639, "step": 635 }, { "epoch": 0.043484169044707165, - "grad_norm": 0.49551665782928467, + "grad_norm": 0.49769327044487, "learning_rate": 4.972822394347058e-05, - "loss": 1.3798, + "loss": 1.3802, "step": 640 }, { "epoch": 0.043823889115368934, - "grad_norm": 0.5372036099433899, + "grad_norm": 0.539963960647583, "learning_rate": 4.9726100693028945e-05, - "loss": 1.4887, + "loss": 1.4883, "step": 645 }, { "epoch": 0.04416360918603071, - "grad_norm": 0.5611512660980225, + "grad_norm": 0.5626983642578125, "learning_rate": 4.972397744258731e-05, - "loss": 1.4698, + "loss": 1.4693, "step": 650 }, { "epoch": 0.04450332925669249, - "grad_norm": 0.6036912202835083, + "grad_norm": 0.6038297414779663, "learning_rate": 4.972185419214567e-05, - "loss": 1.4582, + "loss": 1.4587, "step": 655 }, { "epoch": 0.04484304932735426, - "grad_norm": 0.6422983407974243, + "grad_norm": 0.6336650252342224, "learning_rate": 4.971973094170404e-05, - "loss": 1.3964, + "loss": 1.3966, "step": 660 }, { "epoch": 0.045182769398016034, - "grad_norm": 0.4992437958717346, + "grad_norm": 0.4973067343235016, "learning_rate": 4.97176076912624e-05, - "loss": 1.4224, + "loss": 1.4218, "step": 665 }, { "epoch": 0.04552248946867781, - "grad_norm": 0.5885098576545715, + "grad_norm": 0.5825999975204468, "learning_rate": 4.9715484440820765e-05, - "loss": 1.4885, + "loss": 1.4886, "step": 670 }, { "epoch": 0.04586220953933958, - "grad_norm": 0.6584553718566895, + "grad_norm": 0.6596586108207703, "learning_rate": 4.971336119037913e-05, - "loss": 1.3926, + "loss": 1.3933, "step": 675 }, { "epoch": 0.04620192961000136, - "grad_norm": 0.5546813607215881, + "grad_norm": 0.5573785901069641, "learning_rate": 4.971123793993749e-05, "loss": 1.4001, "step": 680 }, { "epoch": 0.046541649680663134, - "grad_norm": 0.5703087449073792, + "grad_norm": 0.5661247372627258, "learning_rate": 4.970911468949586e-05, - "loss": 1.4585, + "loss": 1.458, "step": 685 }, { "epoch": 0.04688136975132491, - "grad_norm": 0.5438553690910339, + "grad_norm": 0.5434086918830872, "learning_rate": 4.970699143905422e-05, - "loss": 1.5377, + "loss": 1.5383, "step": 690 }, { "epoch": 0.04722108982198668, - "grad_norm": 0.5606414675712585, + "grad_norm": 0.5578226447105408, "learning_rate": 4.9704868188612585e-05, - "loss": 1.4817, + "loss": 1.4809, "step": 695 }, { "epoch": 0.04756080989264846, - "grad_norm": 0.5491502285003662, + "grad_norm": 0.5472956895828247, "learning_rate": 4.970274493817095e-05, - "loss": 1.504, + "loss": 1.5043, "step": 700 }, { "epoch": 0.047900529963310234, - "grad_norm": 0.6139447093009949, + "grad_norm": 0.6101970672607422, "learning_rate": 4.970062168772931e-05, - "loss": 1.5721, + "loss": 1.5714, "step": 705 }, { "epoch": 0.048240250033972004, - "grad_norm": 0.5045318603515625, + "grad_norm": 0.5049817562103271, "learning_rate": 4.969849843728768e-05, - "loss": 1.5299, + "loss": 1.5302, "step": 710 }, { "epoch": 0.04857997010463378, - "grad_norm": 0.5747368931770325, + "grad_norm": 0.5746999382972717, "learning_rate": 4.969637518684604e-05, - "loss": 1.5473, + "loss": 1.5476, "step": 715 }, { "epoch": 0.04891969017529556, - "grad_norm": 0.5617840886116028, + "grad_norm": 0.5637834072113037, "learning_rate": 4.9694251936404405e-05, - "loss": 1.5111, + "loss": 1.5114, "step": 720 }, { "epoch": 0.049259410245957334, - "grad_norm": 0.5222535729408264, + "grad_norm": 0.5218318104743958, "learning_rate": 4.969212868596277e-05, - "loss": 1.3683, + "loss": 1.3687, "step": 725 }, { "epoch": 0.049599130316619104, - "grad_norm": 0.4956512749195099, + "grad_norm": 0.49561241269111633, "learning_rate": 4.9690005435521133e-05, "loss": 1.4828, "step": 730 }, { "epoch": 0.04993885038728088, - "grad_norm": 0.6255701780319214, + "grad_norm": 0.6286036372184753, "learning_rate": 4.96878821850795e-05, - "loss": 1.4093, + "loss": 1.41, "step": 735 }, { "epoch": 0.05027857045794266, - "grad_norm": 0.5820271968841553, + "grad_norm": 0.5821284055709839, "learning_rate": 4.968575893463786e-05, - "loss": 1.4821, + "loss": 1.4825, "step": 740 }, { "epoch": 0.05061829052860443, - "grad_norm": 0.5157151818275452, + "grad_norm": 0.5134862065315247, "learning_rate": 4.9683635684196225e-05, - "loss": 1.4265, + "loss": 1.4267, "step": 745 }, { "epoch": 0.050958010599266204, - "grad_norm": 0.5513901710510254, + "grad_norm": 0.5511249303817749, "learning_rate": 4.968151243375459e-05, - "loss": 1.5006, + "loss": 1.5003, "step": 750 }, { "epoch": 0.05129773066992798, - "grad_norm": 0.5603316426277161, + "grad_norm": 0.5605166554450989, "learning_rate": 4.9679389183312953e-05, - "loss": 1.5731, + "loss": 1.5732, "step": 755 }, { "epoch": 0.05163745074058975, - "grad_norm": 0.52471524477005, + "grad_norm": 0.5246262550354004, "learning_rate": 4.967726593287132e-05, - "loss": 1.474, + "loss": 1.4744, "step": 760 }, { "epoch": 0.05197717081125153, - "grad_norm": 0.588664174079895, + "grad_norm": 0.5897058248519897, "learning_rate": 4.967514268242968e-05, - "loss": 1.4958, + "loss": 1.4955, "step": 765 }, { "epoch": 0.052316890881913304, - "grad_norm": 0.5561087131500244, + "grad_norm": 0.5559947490692139, "learning_rate": 4.9673019431988045e-05, "loss": 1.4695, "step": 770 }, { "epoch": 0.05265661095257508, - "grad_norm": 0.5775383710861206, + "grad_norm": 0.5775343179702759, "learning_rate": 4.96708961815464e-05, - "loss": 1.6117, + "loss": 1.6115, "step": 775 }, { "epoch": 0.05299633102323685, - "grad_norm": 0.5280258655548096, + "grad_norm": 0.5265573859214783, "learning_rate": 4.9668772931104773e-05, - "loss": 1.5171, + "loss": 1.5172, "step": 780 }, { "epoch": 0.05333605109389863, - "grad_norm": 0.589007556438446, + "grad_norm": 0.5910927057266235, "learning_rate": 4.966664968066314e-05, - "loss": 1.388, + "loss": 1.3883, "step": 785 }, { "epoch": 0.053675771164560404, - "grad_norm": 0.6063833236694336, + "grad_norm": 0.6103203892707825, "learning_rate": 4.9664526430221495e-05, - "loss": 1.5372, + "loss": 1.5376, "step": 790 }, { "epoch": 0.054015491235222174, - "grad_norm": 0.5418073534965515, + "grad_norm": 0.5413540005683899, "learning_rate": 4.9662403179779865e-05, - "loss": 1.4476, + "loss": 1.448, "step": 795 }, { "epoch": 0.05435521130588395, - "grad_norm": 0.5852705240249634, + "grad_norm": 0.5844071507453918, "learning_rate": 4.966027992933823e-05, - "loss": 1.5328, + "loss": 1.5329, "step": 800 }, { "epoch": 0.05469493137654573, - "grad_norm": 0.6038408875465393, + "grad_norm": 0.6013526320457458, "learning_rate": 4.965815667889659e-05, - "loss": 1.4551, + "loss": 1.456, "step": 805 }, { "epoch": 0.055034651447207504, - "grad_norm": 0.5931604504585266, + "grad_norm": 0.5928500294685364, "learning_rate": 4.965603342845496e-05, - "loss": 1.5199, + "loss": 1.5192, "step": 810 }, { "epoch": 0.055374371517869274, - "grad_norm": 0.5286008715629578, + "grad_norm": 0.527395486831665, "learning_rate": 4.965391017801332e-05, - "loss": 1.3396, + "loss": 1.3395, "step": 815 }, { "epoch": 0.05571409158853105, - "grad_norm": 0.6526750326156616, + "grad_norm": 0.6465798616409302, "learning_rate": 4.965178692757168e-05, - "loss": 1.4556, + "loss": 1.4562, "step": 820 }, { "epoch": 0.05605381165919283, - "grad_norm": 0.6939230561256409, + "grad_norm": 0.6946747899055481, "learning_rate": 4.964966367713005e-05, - "loss": 1.4602, + "loss": 1.4597, "step": 825 }, { "epoch": 0.0563935317298546, - "grad_norm": 0.5691767334938049, + "grad_norm": 0.5718979239463806, "learning_rate": 4.9647540426688413e-05, - "loss": 1.5759, + "loss": 1.5762, "step": 830 }, { "epoch": 0.056733251800516374, - "grad_norm": 0.6197113394737244, + "grad_norm": 0.6205467581748962, "learning_rate": 4.964541717624677e-05, - "loss": 1.4188, + "loss": 1.4182, "step": 835 }, { "epoch": 0.05707297187117815, - "grad_norm": 0.5608623027801514, + "grad_norm": 0.5629431009292603, "learning_rate": 4.964329392580514e-05, - "loss": 1.4783, + "loss": 1.4786, "step": 840 }, { "epoch": 0.05741269194183993, - "grad_norm": 0.6069940328598022, + "grad_norm": 0.5980949997901917, "learning_rate": 4.96411706753635e-05, - "loss": 1.5059, + "loss": 1.5065, "step": 845 }, { "epoch": 0.0577524120125017, - "grad_norm": 0.5475958585739136, + "grad_norm": 0.5427051782608032, "learning_rate": 4.963904742492186e-05, - "loss": 1.4701, + "loss": 1.4698, "step": 850 }, { "epoch": 0.058092132083163474, - "grad_norm": 0.580152690410614, + "grad_norm": 0.5818867087364197, "learning_rate": 4.9636924174480234e-05, - "loss": 1.3949, + "loss": 1.395, "step": 855 }, { "epoch": 0.05843185215382525, - "grad_norm": 0.6389477849006653, + "grad_norm": 0.6426949501037598, "learning_rate": 4.963480092403859e-05, - "loss": 1.5599, + "loss": 1.5598, "step": 860 }, { "epoch": 0.05877157222448702, - "grad_norm": 0.6286891102790833, + "grad_norm": 0.6321501731872559, "learning_rate": 4.963267767359696e-05, - "loss": 1.5362, + "loss": 1.5365, "step": 865 }, { "epoch": 0.0591112922951488, - "grad_norm": 0.6547082662582397, + "grad_norm": 0.6546335220336914, "learning_rate": 4.9630554423155326e-05, - "loss": 1.4031, + "loss": 1.4033, "step": 870 }, { "epoch": 0.059451012365810574, - "grad_norm": 0.4733664095401764, + "grad_norm": 0.47296005487442017, "learning_rate": 4.962843117271368e-05, - "loss": 1.5581, + "loss": 1.5583, "step": 875 }, { "epoch": 0.059790732436472344, - "grad_norm": 0.515631377696991, + "grad_norm": 0.5152649283409119, "learning_rate": 4.9626307922272054e-05, - "loss": 1.4553, + "loss": 1.4555, "step": 880 }, { "epoch": 0.06013045250713412, - "grad_norm": 0.6194367408752441, + "grad_norm": 0.6229832768440247, "learning_rate": 4.962418467183042e-05, "loss": 1.4622, "step": 885 }, { "epoch": 0.0604701725777959, - "grad_norm": 0.5636453032493591, + "grad_norm": 0.563595712184906, "learning_rate": 4.9622061421388775e-05, - "loss": 1.4836, + "loss": 1.4828, "step": 890 }, { "epoch": 0.060809892648457674, - "grad_norm": 0.519422173500061, + "grad_norm": 0.5186006426811218, "learning_rate": 4.9619938170947146e-05, - "loss": 1.4085, + "loss": 1.4091, "step": 895 }, { "epoch": 0.061149612719119444, - "grad_norm": 0.5942779779434204, + "grad_norm": 0.5956375598907471, "learning_rate": 4.961781492050551e-05, - "loss": 1.4996, + "loss": 1.4992, "step": 900 }, { "epoch": 0.06148933278978122, - "grad_norm": 0.6055877208709717, + "grad_norm": 0.6079341173171997, "learning_rate": 4.961569167006387e-05, - "loss": 1.4698, + "loss": 1.4701, "step": 905 }, { "epoch": 0.061829052860443, - "grad_norm": 0.5699920058250427, + "grad_norm": 0.5693296790122986, "learning_rate": 4.961356841962224e-05, - "loss": 1.4858, + "loss": 1.4861, "step": 910 }, { "epoch": 0.06216877293110477, - "grad_norm": 0.5864441394805908, + "grad_norm": 0.5857205986976624, "learning_rate": 4.9611445169180595e-05, "loss": 1.4402, "step": 915 }, { "epoch": 0.06250849300176654, - "grad_norm": 0.44144207239151, + "grad_norm": 0.44130274653434753, "learning_rate": 4.960932191873896e-05, - "loss": 1.4507, + "loss": 1.451, "step": 920 }, { "epoch": 0.06284821307242831, - "grad_norm": 0.6052708625793457, + "grad_norm": 0.6073679327964783, "learning_rate": 4.960719866829733e-05, "loss": 1.4825, "step": 925 }, { "epoch": 0.0631879331430901, - "grad_norm": 0.5513001680374146, + "grad_norm": 0.544926643371582, "learning_rate": 4.960507541785569e-05, - "loss": 1.4192, + "loss": 1.4195, "step": 930 }, { "epoch": 0.06352765321375187, - "grad_norm": 0.5435033440589905, + "grad_norm": 0.5487568974494934, "learning_rate": 4.960295216741405e-05, - "loss": 1.45, + "loss": 1.4501, "step": 935 }, { "epoch": 0.06386737328441364, - "grad_norm": 0.5514296889305115, + "grad_norm": 0.5522922873497009, "learning_rate": 4.960082891697242e-05, "loss": 1.4641, "step": 940 }, { "epoch": 0.06420709335507542, - "grad_norm": 0.6034606695175171, + "grad_norm": 0.603104829788208, "learning_rate": 4.959870566653078e-05, - "loss": 1.4519, + "loss": 1.4523, "step": 945 }, { "epoch": 0.06454681342573719, - "grad_norm": 0.5675321221351624, + "grad_norm": 0.5638898015022278, "learning_rate": 4.959658241608914e-05, - "loss": 1.4714, + "loss": 1.471, "step": 950 }, { "epoch": 0.06488653349639897, - "grad_norm": 0.6714404225349426, + "grad_norm": 0.6686537861824036, "learning_rate": 4.9594459165647514e-05, - "loss": 1.4705, + "loss": 1.4699, "step": 955 }, { "epoch": 0.06522625356706074, - "grad_norm": 0.552984356880188, + "grad_norm": 0.5517327189445496, "learning_rate": 4.959233591520587e-05, - "loss": 1.3832, + "loss": 1.3837, "step": 960 }, { "epoch": 0.06556597363772251, - "grad_norm": 0.6472477316856384, + "grad_norm": 0.6409295797348022, "learning_rate": 4.9590212664764235e-05, - "loss": 1.5058, + "loss": 1.5052, "step": 965 }, { "epoch": 0.0659056937083843, - "grad_norm": 0.6469607353210449, + "grad_norm": 0.6419216394424438, "learning_rate": 4.9588089414322606e-05, - "loss": 1.464, + "loss": 1.463, "step": 970 }, { "epoch": 0.06624541377904607, - "grad_norm": 0.5577409863471985, + "grad_norm": 0.5590230822563171, "learning_rate": 4.958596616388096e-05, - "loss": 1.4982, + "loss": 1.4987, "step": 975 }, { "epoch": 0.06658513384970784, - "grad_norm": 0.5370805263519287, + "grad_norm": 0.5382899641990662, "learning_rate": 4.958384291343933e-05, - "loss": 1.5008, + "loss": 1.5011, "step": 980 }, { "epoch": 0.06692485392036962, - "grad_norm": 0.48894742131233215, + "grad_norm": 0.4882492423057556, "learning_rate": 4.95817196629977e-05, - "loss": 1.3468, + "loss": 1.3467, "step": 985 }, { "epoch": 0.06726457399103139, - "grad_norm": 0.6260393261909485, + "grad_norm": 0.6203342080116272, "learning_rate": 4.9579596412556055e-05, - "loss": 1.5571, + "loss": 1.5574, "step": 990 }, { "epoch": 0.06760429406169316, - "grad_norm": 0.5318644642829895, + "grad_norm": 0.53333979845047, "learning_rate": 4.957747316211442e-05, - "loss": 1.4583, + "loss": 1.4582, "step": 995 }, { "epoch": 0.06794401413235494, - "grad_norm": 0.5583365559577942, + "grad_norm": 0.5553420186042786, "learning_rate": 4.957534991167278e-05, - "loss": 1.3278, + "loss": 1.3271, "step": 1000 }, { "epoch": 0.06828373420301671, - "grad_norm": 0.5711193084716797, + "grad_norm": 0.5705015659332275, "learning_rate": 4.957322666123115e-05, - "loss": 1.4424, + "loss": 1.4425, "step": 1005 }, { "epoch": 0.06862345427367848, - "grad_norm": 0.5027225017547607, + "grad_norm": 0.5034784078598022, "learning_rate": 4.957110341078951e-05, - "loss": 1.4588, + "loss": 1.4589, "step": 1010 }, { "epoch": 0.06896317434434027, - "grad_norm": 0.5902908444404602, + "grad_norm": 0.5891916751861572, "learning_rate": 4.9568980160347875e-05, - "loss": 1.5481, + "loss": 1.5477, "step": 1015 }, { "epoch": 0.06930289441500204, - "grad_norm": 0.5455006957054138, + "grad_norm": 0.5455055236816406, "learning_rate": 4.956685690990624e-05, - "loss": 1.4043, + "loss": 1.4046, "step": 1020 }, { "epoch": 0.0696426144856638, - "grad_norm": 0.5535411834716797, + "grad_norm": 0.5530799031257629, "learning_rate": 4.95647336594646e-05, - "loss": 1.4274, + "loss": 1.4272, "step": 1025 }, { "epoch": 0.06998233455632559, - "grad_norm": 0.5849533677101135, + "grad_norm": 0.585089385509491, "learning_rate": 4.956261040902297e-05, - "loss": 1.4464, + "loss": 1.4463, "step": 1030 }, { "epoch": 0.07032205462698736, - "grad_norm": 0.5384759902954102, + "grad_norm": 0.5371710658073425, "learning_rate": 4.956048715858133e-05, - "loss": 1.4941, + "loss": 1.4939, "step": 1035 }, { "epoch": 0.07066177469764914, - "grad_norm": 0.5651819109916687, + "grad_norm": 0.5678858757019043, "learning_rate": 4.9558363908139695e-05, - "loss": 1.4649, + "loss": 1.4647, "step": 1040 }, { "epoch": 0.07100149476831091, - "grad_norm": 0.6011462211608887, + "grad_norm": 0.5847295522689819, "learning_rate": 4.955624065769806e-05, - "loss": 1.4573, + "loss": 1.4571, "step": 1045 }, { "epoch": 0.07134121483897268, - "grad_norm": 0.6316829323768616, + "grad_norm": 0.6287067532539368, "learning_rate": 4.955411740725642e-05, - "loss": 1.456, + "loss": 1.4554, "step": 1050 }, { "epoch": 0.07168093490963447, - "grad_norm": 0.593286395072937, + "grad_norm": 0.5941911339759827, "learning_rate": 4.955199415681479e-05, - "loss": 1.3989, + "loss": 1.3991, "step": 1055 }, { "epoch": 0.07202065498029624, - "grad_norm": 0.5714511871337891, + "grad_norm": 0.5726537704467773, "learning_rate": 4.954987090637315e-05, - "loss": 1.3714, + "loss": 1.3721, "step": 1060 }, { "epoch": 0.072360375050958, - "grad_norm": 0.5762469172477722, + "grad_norm": 0.5782477259635925, "learning_rate": 4.9547747655931515e-05, "loss": 1.3775, "step": 1065 }, { "epoch": 0.07270009512161979, - "grad_norm": 0.5935518741607666, + "grad_norm": 0.5908689498901367, "learning_rate": 4.954562440548988e-05, "loss": 1.4325, "step": 1070 }, { "epoch": 0.07303981519228156, - "grad_norm": 0.4853309392929077, + "grad_norm": 0.4875369369983673, "learning_rate": 4.954350115504824e-05, - "loss": 1.3883, + "loss": 1.3882, "step": 1075 }, { "epoch": 0.07337953526294333, - "grad_norm": 0.5568351149559021, + "grad_norm": 0.5570701956748962, "learning_rate": 4.954137790460661e-05, - "loss": 1.454, + "loss": 1.4534, "step": 1080 }, { "epoch": 0.07371925533360511, - "grad_norm": 0.5317642688751221, + "grad_norm": 0.5313116312026978, "learning_rate": 4.953925465416497e-05, - "loss": 1.4239, + "loss": 1.4249, "step": 1085 }, { "epoch": 0.07405897540426688, - "grad_norm": 0.5457822680473328, + "grad_norm": 0.5517529845237732, "learning_rate": 4.9537131403723335e-05, - "loss": 1.4276, + "loss": 1.4282, "step": 1090 }, { "epoch": 0.07439869547492865, - "grad_norm": 0.4952144920825958, + "grad_norm": 0.49552908539772034, "learning_rate": 4.95350081532817e-05, - "loss": 1.4777, + "loss": 1.4767, "step": 1095 }, { "epoch": 0.07473841554559044, - "grad_norm": 0.6003117561340332, + "grad_norm": 0.6010189056396484, "learning_rate": 4.953288490284006e-05, - "loss": 1.3981, + "loss": 1.3977, "step": 1100 }, { "epoch": 0.0750781356162522, - "grad_norm": 0.5776582360267639, + "grad_norm": 0.5819920301437378, "learning_rate": 4.953076165239843e-05, - "loss": 1.456, + "loss": 1.4559, "step": 1105 }, { "epoch": 0.07541785568691398, - "grad_norm": 0.58598792552948, + "grad_norm": 0.5904499292373657, "learning_rate": 4.952863840195679e-05, - "loss": 1.4497, + "loss": 1.4499, "step": 1110 }, { "epoch": 0.07575757575757576, - "grad_norm": 0.5512722730636597, + "grad_norm": 0.5508612394332886, "learning_rate": 4.952651515151515e-05, - "loss": 1.491, + "loss": 1.4906, "step": 1115 }, { "epoch": 0.07609729582823753, - "grad_norm": 0.5397170782089233, + "grad_norm": 0.5387109518051147, "learning_rate": 4.952439190107352e-05, "loss": 1.4648, "step": 1120 }, { "epoch": 0.07643701589889931, - "grad_norm": 0.5717841386795044, + "grad_norm": 0.5709366798400879, "learning_rate": 4.952226865063188e-05, - "loss": 1.4529, + "loss": 1.4531, "step": 1125 }, { "epoch": 0.07677673596956108, - "grad_norm": 0.5555099248886108, + "grad_norm": 0.554914116859436, "learning_rate": 4.952014540019024e-05, - "loss": 1.379, + "loss": 1.3788, "step": 1130 }, { "epoch": 0.07711645604022285, - "grad_norm": 0.6435902714729309, + "grad_norm": 0.6410313248634338, "learning_rate": 4.951802214974861e-05, - "loss": 1.427, + "loss": 1.4267, "step": 1135 }, { "epoch": 0.07745617611088464, - "grad_norm": 0.6450826525688171, + "grad_norm": 0.6456186771392822, "learning_rate": 4.9515898899306975e-05, "loss": 1.3249, "step": 1140 }, { "epoch": 0.0777958961815464, - "grad_norm": 0.6193044781684875, + "grad_norm": 0.6202892661094666, "learning_rate": 4.951377564886533e-05, - "loss": 1.5011, + "loss": 1.502, "step": 1145 }, { "epoch": 0.07813561625220818, - "grad_norm": 0.6083833575248718, + "grad_norm": 0.6102027893066406, "learning_rate": 4.95116523984237e-05, - "loss": 1.3717, + "loss": 1.3707, "step": 1150 }, { "epoch": 0.07847533632286996, - "grad_norm": 0.5254198908805847, + "grad_norm": 0.5255089402198792, "learning_rate": 4.950952914798207e-05, - "loss": 1.4496, + "loss": 1.4495, "step": 1155 }, { "epoch": 0.07881505639353173, - "grad_norm": 0.5539206862449646, + "grad_norm": 0.5510827302932739, "learning_rate": 4.9507405897540424e-05, - "loss": 1.4013, + "loss": 1.4011, "step": 1160 }, { "epoch": 0.0791547764641935, - "grad_norm": 0.597885012626648, + "grad_norm": 0.5971511602401733, "learning_rate": 4.9505282647098795e-05, - "loss": 1.4537, + "loss": 1.4536, "step": 1165 }, { "epoch": 0.07949449653485528, - "grad_norm": 0.5942360758781433, + "grad_norm": 0.5953072905540466, "learning_rate": 4.950315939665716e-05, - "loss": 1.4627, + "loss": 1.4632, "step": 1170 }, { "epoch": 0.07983421660551705, - "grad_norm": 0.5833660364151001, + "grad_norm": 0.5839298367500305, "learning_rate": 4.9501036146215516e-05, - "loss": 1.4865, + "loss": 1.4862, "step": 1175 }, { "epoch": 0.08017393667617882, - "grad_norm": 0.5241972208023071, + "grad_norm": 0.5240070819854736, "learning_rate": 4.949891289577389e-05, - "loss": 1.4778, + "loss": 1.4773, "step": 1180 }, { "epoch": 0.0805136567468406, - "grad_norm": 0.48506563901901245, + "grad_norm": 0.4830525517463684, "learning_rate": 4.949678964533225e-05, - "loss": 1.5036, + "loss": 1.5033, "step": 1185 }, { "epoch": 0.08085337681750238, - "grad_norm": 0.5333223342895508, + "grad_norm": 0.5352356433868408, "learning_rate": 4.949466639489061e-05, - "loss": 1.428, + "loss": 1.4278, "step": 1190 }, { "epoch": 0.08119309688816416, - "grad_norm": 0.6433978080749512, + "grad_norm": 0.649842381477356, "learning_rate": 4.949254314444898e-05, - "loss": 1.3285, + "loss": 1.3289, "step": 1195 }, { "epoch": 0.08153281695882593, - "grad_norm": 0.6657341122627258, + "grad_norm": 0.6652253270149231, "learning_rate": 4.9490419894007336e-05, - "loss": 1.4037, + "loss": 1.4038, "step": 1200 }, { "epoch": 0.0818725370294877, - "grad_norm": 0.5360137224197388, + "grad_norm": 0.5364483594894409, "learning_rate": 4.948829664356571e-05, - "loss": 1.5553, + "loss": 1.555, "step": 1205 }, { "epoch": 0.08221225710014948, - "grad_norm": 0.5867024064064026, + "grad_norm": 0.5863167643547058, "learning_rate": 4.948617339312407e-05, - "loss": 1.4141, + "loss": 1.4135, "step": 1210 }, { "epoch": 0.08255197717081125, - "grad_norm": 0.5557378530502319, + "grad_norm": 0.5541061162948608, "learning_rate": 4.948405014268243e-05, - "loss": 1.4631, + "loss": 1.4634, "step": 1215 }, { "epoch": 0.08289169724147302, - "grad_norm": 0.6246277689933777, + "grad_norm": 0.62068110704422, "learning_rate": 4.94819268922408e-05, - "loss": 1.3083, + "loss": 1.3086, "step": 1220 }, { "epoch": 0.0832314173121348, - "grad_norm": 0.5651764869689941, + "grad_norm": 0.5657665133476257, "learning_rate": 4.947980364179916e-05, - "loss": 1.4385, + "loss": 1.4384, "step": 1225 }, { "epoch": 0.08357113738279658, - "grad_norm": 0.619651198387146, + "grad_norm": 0.6227665543556213, "learning_rate": 4.947768039135752e-05, - "loss": 1.4379, + "loss": 1.4378, "step": 1230 }, { "epoch": 0.08391085745345835, - "grad_norm": 0.5746042728424072, + "grad_norm": 0.5737194418907166, "learning_rate": 4.947555714091589e-05, - "loss": 1.534, + "loss": 1.5343, "step": 1235 }, { "epoch": 0.08425057752412013, - "grad_norm": 0.6215890049934387, + "grad_norm": 0.6211442947387695, "learning_rate": 4.9473433890474255e-05, - "loss": 1.3487, + "loss": 1.3493, "step": 1240 }, { "epoch": 0.0845902975947819, - "grad_norm": 0.5779523849487305, + "grad_norm": 0.5718501210212708, "learning_rate": 4.947131064003261e-05, - "loss": 1.3078, + "loss": 1.3084, "step": 1245 }, { "epoch": 0.08493001766544367, - "grad_norm": 0.6031992435455322, + "grad_norm": 0.6049017906188965, "learning_rate": 4.946918738959098e-05, "loss": 1.387, "step": 1250 }, { "epoch": 0.08526973773610545, - "grad_norm": 0.5986976027488708, + "grad_norm": 0.5942484736442566, "learning_rate": 4.946706413914935e-05, - "loss": 1.4365, + "loss": 1.4361, "step": 1255 }, { "epoch": 0.08560945780676722, - "grad_norm": 0.5761814713478088, + "grad_norm": 0.5764678120613098, "learning_rate": 4.9464940888707704e-05, - "loss": 1.4215, + "loss": 1.4214, "step": 1260 }, { "epoch": 0.08594917787742899, - "grad_norm": 0.5750184655189514, + "grad_norm": 0.5732192397117615, "learning_rate": 4.9462817638266075e-05, - "loss": 1.4771, + "loss": 1.4763, "step": 1265 }, { "epoch": 0.08628889794809078, - "grad_norm": 0.608689546585083, + "grad_norm": 0.6050506830215454, "learning_rate": 4.946069438782443e-05, - "loss": 1.4386, + "loss": 1.438, "step": 1270 }, { "epoch": 0.08662861801875255, - "grad_norm": 0.609923779964447, + "grad_norm": 0.6066972613334656, "learning_rate": 4.9458571137382796e-05, "loss": 1.4066, "step": 1275 }, { "epoch": 0.08696833808941433, - "grad_norm": 0.5635353922843933, + "grad_norm": 0.5636779069900513, "learning_rate": 4.945644788694117e-05, "loss": 1.4556, "step": 1280 }, { "epoch": 0.0873080581600761, - "grad_norm": 0.6076200604438782, + "grad_norm": 0.6095977425575256, "learning_rate": 4.9454324636499524e-05, - "loss": 1.5373, + "loss": 1.5379, "step": 1285 }, { "epoch": 0.08764777823073787, - "grad_norm": 0.5832569599151611, + "grad_norm": 0.5810807347297668, "learning_rate": 4.945220138605789e-05, - "loss": 1.4948, + "loss": 1.4946, "step": 1290 }, { "epoch": 0.08798749830139965, - "grad_norm": 0.5482774972915649, + "grad_norm": 0.5459557771682739, "learning_rate": 4.945007813561626e-05, - "loss": 1.4732, + "loss": 1.4726, "step": 1295 }, { "epoch": 0.08832721837206142, - "grad_norm": 0.567272961139679, + "grad_norm": 0.5690489411354065, "learning_rate": 4.9447954885174616e-05, - "loss": 1.4416, + "loss": 1.4418, "step": 1300 }, { "epoch": 0.08866693844272319, - "grad_norm": 0.6006086468696594, + "grad_norm": 0.603033721446991, "learning_rate": 4.944583163473298e-05, - "loss": 1.431, + "loss": 1.4308, "step": 1305 }, { "epoch": 0.08900665851338498, - "grad_norm": 0.5554898381233215, + "grad_norm": 0.5555094480514526, "learning_rate": 4.944370838429135e-05, - "loss": 1.4923, + "loss": 1.4926, "step": 1310 }, { "epoch": 0.08934637858404675, - "grad_norm": 0.7005094289779663, + "grad_norm": 0.7002037763595581, "learning_rate": 4.944158513384971e-05, - "loss": 1.3814, + "loss": 1.3811, "step": 1315 }, { "epoch": 0.08968609865470852, - "grad_norm": 0.5457586050033569, + "grad_norm": 0.5486272573471069, "learning_rate": 4.943946188340807e-05, - "loss": 1.3287, + "loss": 1.3288, "step": 1320 }, { "epoch": 0.0900258187253703, - "grad_norm": 0.5568594932556152, + "grad_norm": 0.5570945739746094, "learning_rate": 4.943733863296644e-05, - "loss": 1.4231, + "loss": 1.4233, "step": 1325 }, { "epoch": 0.09036553879603207, - "grad_norm": 0.564529538154602, + "grad_norm": 0.5638700127601624, "learning_rate": 4.94352153825248e-05, - "loss": 1.4203, + "loss": 1.42, "step": 1330 }, { "epoch": 0.09070525886669384, - "grad_norm": 0.7224603891372681, + "grad_norm": 0.707033097743988, "learning_rate": 4.9433092132083164e-05, - "loss": 1.4963, + "loss": 1.496, "step": 1335 }, { "epoch": 0.09104497893735562, - "grad_norm": 0.5889787077903748, + "grad_norm": 0.5878371596336365, "learning_rate": 4.9430968881641535e-05, - "loss": 1.5203, + "loss": 1.52, "step": 1340 }, { "epoch": 0.09138469900801739, - "grad_norm": 0.6516150236129761, + "grad_norm": 0.6477514505386353, "learning_rate": 4.942884563119989e-05, - "loss": 1.3814, + "loss": 1.3809, "step": 1345 }, { "epoch": 0.09172441907867916, - "grad_norm": 0.5434114336967468, + "grad_norm": 0.5446731448173523, "learning_rate": 4.9426722380758256e-05, - "loss": 1.4401, + "loss": 1.4397, "step": 1350 }, { "epoch": 0.09206413914934095, - "grad_norm": 0.5956789255142212, + "grad_norm": 0.590691089630127, "learning_rate": 4.942459913031662e-05, - "loss": 1.435, + "loss": 1.4348, "step": 1355 }, { "epoch": 0.09240385922000272, - "grad_norm": 0.5733022093772888, + "grad_norm": 0.5734403729438782, "learning_rate": 4.9422475879874984e-05, - "loss": 1.3873, + "loss": 1.3877, "step": 1360 }, { "epoch": 0.0927435792906645, - "grad_norm": 0.591184139251709, + "grad_norm": 0.5889521241188049, "learning_rate": 4.942035262943335e-05, - "loss": 1.3833, + "loss": 1.3837, "step": 1365 }, { "epoch": 0.09308329936132627, - "grad_norm": 0.5225968360900879, + "grad_norm": 0.5227224230766296, "learning_rate": 4.941822937899171e-05, - "loss": 1.4241, + "loss": 1.4249, "step": 1370 }, { "epoch": 0.09342301943198804, - "grad_norm": 0.46089863777160645, + "grad_norm": 0.462613046169281, "learning_rate": 4.9416106128550076e-05, - "loss": 1.3174, + "loss": 1.3167, "step": 1375 }, { "epoch": 0.09376273950264982, - "grad_norm": 0.5939785838127136, + "grad_norm": 0.5963701605796814, "learning_rate": 4.941398287810844e-05, - "loss": 1.5062, + "loss": 1.506, "step": 1380 }, { "epoch": 0.09410245957331159, - "grad_norm": 0.5519104599952698, + "grad_norm": 0.5501843094825745, "learning_rate": 4.9411859627666804e-05, - "loss": 1.3897, + "loss": 1.3896, "step": 1385 }, { "epoch": 0.09444217964397336, - "grad_norm": 0.5778692960739136, + "grad_norm": 0.5779409408569336, "learning_rate": 4.940973637722517e-05, "loss": 1.4834, "step": 1390 }, { "epoch": 0.09478189971463515, - "grad_norm": 0.5841971039772034, + "grad_norm": 0.5808186531066895, "learning_rate": 4.940761312678353e-05, - "loss": 1.3967, + "loss": 1.3972, "step": 1395 }, { "epoch": 0.09512161978529692, - "grad_norm": 0.5745139122009277, + "grad_norm": 0.5755193829536438, "learning_rate": 4.9405489876341896e-05, - "loss": 1.528, + "loss": 1.5284, "step": 1400 }, { "epoch": 0.09546133985595869, - "grad_norm": 0.6057448983192444, + "grad_norm": 0.6077845096588135, "learning_rate": 4.940336662590026e-05, - "loss": 1.5101, + "loss": 1.5102, "step": 1405 }, { "epoch": 0.09580105992662047, - "grad_norm": 0.5619961023330688, + "grad_norm": 0.5620288848876953, "learning_rate": 4.9401243375458624e-05, - "loss": 1.4567, + "loss": 1.4569, "step": 1410 }, { "epoch": 0.09614077999728224, - "grad_norm": 0.5457148551940918, + "grad_norm": 0.5455212593078613, "learning_rate": 4.939912012501699e-05, - "loss": 1.4855, + "loss": 1.4856, "step": 1415 }, { "epoch": 0.09648050006794401, - "grad_norm": 0.5889705419540405, + "grad_norm": 0.5875693559646606, "learning_rate": 4.939699687457535e-05, - "loss": 1.5395, + "loss": 1.5402, "step": 1420 }, { "epoch": 0.09682022013860579, - "grad_norm": 0.6001344323158264, + "grad_norm": 0.5983798503875732, "learning_rate": 4.9394873624133716e-05, - "loss": 1.5335, + "loss": 1.5332, "step": 1425 }, { "epoch": 0.09715994020926756, - "grad_norm": 0.5506335496902466, + "grad_norm": 0.5512588024139404, "learning_rate": 4.939275037369208e-05, - "loss": 1.4368, + "loss": 1.4367, "step": 1430 }, { "epoch": 0.09749966027992933, - "grad_norm": 0.6067984104156494, + "grad_norm": 0.6075851917266846, "learning_rate": 4.9390627123250444e-05, - "loss": 1.4778, + "loss": 1.4777, "step": 1435 }, { "epoch": 0.09783938035059112, - "grad_norm": 0.5627113580703735, + "grad_norm": 0.5639499425888062, "learning_rate": 4.938850387280881e-05, "loss": 1.3986, "step": 1440 }, { "epoch": 0.09817910042125289, - "grad_norm": 0.5547177195549011, + "grad_norm": 0.5548174381256104, "learning_rate": 4.938638062236717e-05, - "loss": 1.3866, + "loss": 1.3862, "step": 1445 }, { "epoch": 0.09851882049191467, - "grad_norm": 0.5425217151641846, + "grad_norm": 0.5431373119354248, "learning_rate": 4.9384257371925536e-05, - "loss": 1.52, + "loss": 1.5206, "step": 1450 }, { "epoch": 0.09885854056257644, - "grad_norm": 0.5709924697875977, + "grad_norm": 0.5803036093711853, "learning_rate": 4.93821341214839e-05, - "loss": 1.4488, + "loss": 1.449, "step": 1455 }, { "epoch": 0.09919826063323821, - "grad_norm": 0.6176159977912903, + "grad_norm": 0.6221015453338623, "learning_rate": 4.9380010871042264e-05, - "loss": 1.5285, + "loss": 1.5283, "step": 1460 }, { "epoch": 0.09953798070389999, - "grad_norm": 0.6142645478248596, + "grad_norm": 0.6184520125389099, "learning_rate": 4.937788762060063e-05, - "loss": 1.4332, + "loss": 1.4327, "step": 1465 }, { "epoch": 0.09987770077456176, - "grad_norm": 0.6106401681900024, + "grad_norm": 0.6029756665229797, "learning_rate": 4.9375764370158986e-05, - "loss": 1.4246, + "loss": 1.4247, "step": 1470 }, { "epoch": 0.10021742084522353, - "grad_norm": 0.5476092100143433, + "grad_norm": 0.5496050119400024, "learning_rate": 4.9373641119717356e-05, - "loss": 1.4671, + "loss": 1.4673, "step": 1475 }, { "epoch": 0.10055714091588532, - "grad_norm": 0.705704391002655, + "grad_norm": 0.7015076875686646, "learning_rate": 4.937151786927572e-05, - "loss": 1.4213, + "loss": 1.4209, "step": 1480 }, { "epoch": 0.10089686098654709, - "grad_norm": 0.5743557810783386, + "grad_norm": 0.573707640171051, "learning_rate": 4.936939461883408e-05, - "loss": 1.4137, + "loss": 1.4143, "step": 1485 }, { "epoch": 0.10123658105720885, - "grad_norm": 0.5322557687759399, + "grad_norm": 0.5321272611618042, "learning_rate": 4.936727136839245e-05, "loss": 1.4304, "step": 1490 }, { "epoch": 0.10157630112787064, - "grad_norm": 0.6251279711723328, + "grad_norm": 0.6251848340034485, "learning_rate": 4.936514811795081e-05, - "loss": 1.4182, + "loss": 1.4174, "step": 1495 }, { "epoch": 0.10191602119853241, - "grad_norm": 0.654549241065979, + "grad_norm": 0.6612779498100281, "learning_rate": 4.936302486750917e-05, - "loss": 1.3279, + "loss": 1.3281, "step": 1500 }, { "epoch": 0.10225574126919418, - "grad_norm": 0.6309332251548767, + "grad_norm": 0.6360726356506348, "learning_rate": 4.936090161706754e-05, - "loss": 1.3739, + "loss": 1.3742, "step": 1505 }, { "epoch": 0.10259546133985596, - "grad_norm": 0.547921895980835, + "grad_norm": 0.5478609204292297, "learning_rate": 4.9358778366625904e-05, - "loss": 1.346, + "loss": 1.3463, "step": 1510 }, { "epoch": 0.10293518141051773, - "grad_norm": 0.5620610117912292, + "grad_norm": 0.5622805953025818, "learning_rate": 4.935665511618426e-05, "loss": 1.4221, "step": 1515 }, { "epoch": 0.1032749014811795, - "grad_norm": 0.5969321727752686, + "grad_norm": 0.5957236289978027, "learning_rate": 4.935453186574263e-05, - "loss": 1.462, + "loss": 1.4626, "step": 1520 }, { "epoch": 0.10361462155184128, - "grad_norm": 0.6860018968582153, + "grad_norm": 0.6936641335487366, "learning_rate": 4.9352408615300996e-05, "loss": 1.4466, "step": 1525 }, { "epoch": 0.10395434162250305, - "grad_norm": 0.5293998718261719, + "grad_norm": 0.529825747013092, "learning_rate": 4.9350285364859354e-05, - "loss": 1.4329, + "loss": 1.4325, "step": 1530 }, { "epoch": 0.10429406169316484, - "grad_norm": 0.5381416082382202, + "grad_norm": 0.5374847650527954, "learning_rate": 4.9348162114417724e-05, - "loss": 1.3761, + "loss": 1.3757, "step": 1535 }, { "epoch": 0.10463378176382661, - "grad_norm": 0.5802014470100403, + "grad_norm": 0.5831882357597351, "learning_rate": 4.934603886397609e-05, - "loss": 1.4226, + "loss": 1.4223, "step": 1540 }, { "epoch": 0.10497350183448838, - "grad_norm": 0.5551945567131042, + "grad_norm": 0.5555147528648376, "learning_rate": 4.934391561353445e-05, "loss": 1.4017, "step": 1545 }, { "epoch": 0.10531322190515016, - "grad_norm": 0.6843209862709045, + "grad_norm": 0.6835488080978394, "learning_rate": 4.9341792363092816e-05, - "loss": 1.424, + "loss": 1.4242, "step": 1550 }, { "epoch": 0.10565294197581193, - "grad_norm": 0.6528891921043396, + "grad_norm": 0.6528186798095703, "learning_rate": 4.9339669112651174e-05, - "loss": 1.3912, + "loss": 1.3913, "step": 1555 }, { "epoch": 0.1059926620464737, - "grad_norm": 0.5132463574409485, + "grad_norm": 0.5208539962768555, "learning_rate": 4.9337545862209544e-05, - "loss": 1.3568, + "loss": 1.3573, "step": 1560 }, { "epoch": 0.10633238211713548, - "grad_norm": 0.6335332989692688, + "grad_norm": 0.639661431312561, "learning_rate": 4.933542261176791e-05, - "loss": 1.3855, + "loss": 1.3848, "step": 1565 }, { "epoch": 0.10667210218779725, - "grad_norm": 0.6554496884346008, + "grad_norm": 0.6527217030525208, "learning_rate": 4.9333299361326266e-05, - "loss": 1.4204, + "loss": 1.4202, "step": 1570 }, { "epoch": 0.10701182225845902, - "grad_norm": 0.5501953959465027, + "grad_norm": 0.5507591366767883, "learning_rate": 4.9331176110884637e-05, - "loss": 1.5018, + "loss": 1.5019, "step": 1575 }, { "epoch": 0.10735154232912081, - "grad_norm": 0.5601421594619751, + "grad_norm": 0.5593494176864624, "learning_rate": 4.9329052860443e-05, - "loss": 1.4294, + "loss": 1.4293, "step": 1580 }, { "epoch": 0.10769126239978258, - "grad_norm": 0.5142511129379272, + "grad_norm": 0.5148162841796875, "learning_rate": 4.932692961000136e-05, - "loss": 1.36, + "loss": 1.3599, "step": 1585 }, { "epoch": 0.10803098247044435, - "grad_norm": 0.7431572675704956, + "grad_norm": 0.7443335652351379, "learning_rate": 4.932480635955973e-05, - "loss": 1.4361, + "loss": 1.4362, "step": 1590 }, { "epoch": 0.10837070254110613, - "grad_norm": 0.6073015332221985, + "grad_norm": 0.6011170148849487, "learning_rate": 4.932268310911809e-05, - "loss": 1.4543, + "loss": 1.4545, "step": 1595 }, { "epoch": 0.1087104226117679, - "grad_norm": 0.5889863967895508, + "grad_norm": 0.5892181396484375, "learning_rate": 4.932055985867645e-05, - "loss": 1.4554, + "loss": 1.4558, "step": 1600 }, { "epoch": 0.10905014268242967, - "grad_norm": 0.5507410764694214, + "grad_norm": 0.5500453114509583, "learning_rate": 4.931843660823482e-05, - "loss": 1.4994, + "loss": 1.4996, "step": 1605 }, { "epoch": 0.10938986275309145, - "grad_norm": 0.6453044414520264, + "grad_norm": 0.6437133550643921, "learning_rate": 4.9316313357793185e-05, - "loss": 1.4872, + "loss": 1.4873, "step": 1610 }, { "epoch": 0.10972958282375322, - "grad_norm": 0.5991227030754089, + "grad_norm": 0.5981764197349548, "learning_rate": 4.931419010735154e-05, - "loss": 1.3944, + "loss": 1.3943, "step": 1615 }, { "epoch": 0.11006930289441501, - "grad_norm": 0.5487539768218994, + "grad_norm": 0.5495855212211609, "learning_rate": 4.931206685690991e-05, - "loss": 1.4355, + "loss": 1.4359, "step": 1620 }, { "epoch": 0.11040902296507678, - "grad_norm": 0.5705694556236267, + "grad_norm": 0.5727270245552063, "learning_rate": 4.930994360646827e-05, - "loss": 1.3654, + "loss": 1.3653, "step": 1625 }, { "epoch": 0.11074874303573855, - "grad_norm": 0.5932425260543823, + "grad_norm": 0.5957831740379333, "learning_rate": 4.9307820356026634e-05, "loss": 1.4701, "step": 1630 }, { "epoch": 0.11108846310640033, - "grad_norm": 0.6856851577758789, + "grad_norm": 0.6832654476165771, "learning_rate": 4.9305697105585005e-05, - "loss": 1.4945, + "loss": 1.494, "step": 1635 }, { "epoch": 0.1114281831770621, - "grad_norm": 0.5697663426399231, + "grad_norm": 0.5719290971755981, "learning_rate": 4.930357385514336e-05, - "loss": 1.4453, + "loss": 1.446, "step": 1640 }, { "epoch": 0.11176790324772387, - "grad_norm": 0.4906790554523468, + "grad_norm": 0.48680078983306885, "learning_rate": 4.9301450604701726e-05, - "loss": 1.4755, + "loss": 1.4761, "step": 1645 }, { "epoch": 0.11210762331838565, - "grad_norm": 0.5060974359512329, + "grad_norm": 0.5075876712799072, "learning_rate": 4.9299327354260097e-05, - "loss": 1.4535, + "loss": 1.4534, "step": 1650 }, { "epoch": 0.11244734338904742, - "grad_norm": 0.5901421308517456, + "grad_norm": 0.5911957621574402, "learning_rate": 4.9297204103818454e-05, - "loss": 1.5331, + "loss": 1.5338, "step": 1655 }, { "epoch": 0.1127870634597092, - "grad_norm": 0.5493902564048767, + "grad_norm": 0.5495116710662842, "learning_rate": 4.929508085337682e-05, - "loss": 1.4116, + "loss": 1.4115, "step": 1660 }, { "epoch": 0.11312678353037098, - "grad_norm": 0.6108385324478149, + "grad_norm": 0.6108766794204712, "learning_rate": 4.929295760293519e-05, - "loss": 1.385, + "loss": 1.3849, "step": 1665 }, { "epoch": 0.11346650360103275, - "grad_norm": 0.5650813579559326, + "grad_norm": 0.565472424030304, "learning_rate": 4.9290834352493546e-05, - "loss": 1.4503, + "loss": 1.4507, "step": 1670 }, { "epoch": 0.11380622367169452, - "grad_norm": 0.5575889945030212, + "grad_norm": 0.555022120475769, "learning_rate": 4.928871110205191e-05, - "loss": 1.2971, + "loss": 1.2967, "step": 1675 }, { "epoch": 0.1141459437423563, - "grad_norm": 0.5342539548873901, + "grad_norm": 0.5347650647163391, "learning_rate": 4.928658785161028e-05, - "loss": 1.4955, + "loss": 1.4958, "step": 1680 }, { "epoch": 0.11448566381301807, - "grad_norm": 0.577377438545227, + "grad_norm": 0.5791923999786377, "learning_rate": 4.928446460116864e-05, - "loss": 1.3362, + "loss": 1.3366, "step": 1685 }, { "epoch": 0.11482538388367985, - "grad_norm": 0.5332703590393066, + "grad_norm": 0.5344591736793518, "learning_rate": 4.9282341350727e-05, - "loss": 1.3495, + "loss": 1.3494, "step": 1690 }, { "epoch": 0.11516510395434162, - "grad_norm": 0.560100257396698, + "grad_norm": 0.5606475472450256, "learning_rate": 4.9280218100285366e-05, "loss": 1.4691, "step": 1695 }, { "epoch": 0.1155048240250034, - "grad_norm": 0.5314947366714478, + "grad_norm": 0.5325570702552795, "learning_rate": 4.927809484984373e-05, - "loss": 1.4083, + "loss": 1.4079, "step": 1700 }, { "epoch": 0.11584454409566518, - "grad_norm": 0.6132587790489197, + "grad_norm": 0.6115857362747192, "learning_rate": 4.9275971599402094e-05, - "loss": 1.5005, + "loss": 1.4996, "step": 1705 }, { "epoch": 0.11618426416632695, - "grad_norm": 0.6340333223342896, + "grad_norm": 0.6252047419548035, "learning_rate": 4.927384834896046e-05, - "loss": 1.4116, + "loss": 1.4111, "step": 1710 }, { "epoch": 0.11652398423698872, - "grad_norm": 0.6274508237838745, + "grad_norm": 0.6282231211662292, "learning_rate": 4.927172509851882e-05, - "loss": 1.3378, + "loss": 1.3383, "step": 1715 }, { "epoch": 0.1168637043076505, - "grad_norm": 0.6226668953895569, + "grad_norm": 0.6186545491218567, "learning_rate": 4.9269601848077186e-05, - "loss": 1.3891, + "loss": 1.3894, "step": 1720 }, { "epoch": 0.11720342437831227, - "grad_norm": 0.6084844470024109, + "grad_norm": 0.6081210970878601, "learning_rate": 4.926747859763555e-05, - "loss": 1.3585, + "loss": 1.3581, "step": 1725 }, { "epoch": 0.11754314444897404, - "grad_norm": 0.48321905732154846, + "grad_norm": 0.48397549986839294, "learning_rate": 4.9265355347193914e-05, "loss": 1.4552, "step": 1730 }, { "epoch": 0.11788286451963582, - "grad_norm": 0.5236528515815735, + "grad_norm": 0.5217854380607605, "learning_rate": 4.926323209675228e-05, - "loss": 1.4124, + "loss": 1.4132, "step": 1735 }, { "epoch": 0.1182225845902976, - "grad_norm": 0.593169629573822, + "grad_norm": 0.5856433510780334, "learning_rate": 4.926110884631064e-05, - "loss": 1.474, + "loss": 1.4744, "step": 1740 }, { "epoch": 0.11856230466095936, - "grad_norm": 0.6106769442558289, + "grad_norm": 0.6093284487724304, "learning_rate": 4.9258985595869006e-05, - "loss": 1.3808, + "loss": 1.3805, "step": 1745 }, { "epoch": 0.11890202473162115, - "grad_norm": 0.5643929839134216, + "grad_norm": 0.5639051795005798, "learning_rate": 4.925686234542737e-05, - "loss": 1.4238, + "loss": 1.4233, "step": 1750 }, { "epoch": 0.11924174480228292, - "grad_norm": 0.5895077586174011, + "grad_norm": 0.5924566984176636, "learning_rate": 4.9254739094985734e-05, - "loss": 1.4292, + "loss": 1.4294, "step": 1755 }, { "epoch": 0.11958146487294469, - "grad_norm": 0.6480593681335449, + "grad_norm": 0.6433087587356567, "learning_rate": 4.92526158445441e-05, - "loss": 1.5694, + "loss": 1.5687, "step": 1760 }, { "epoch": 0.11992118494360647, - "grad_norm": 0.5908152461051941, + "grad_norm": 0.5897480845451355, "learning_rate": 4.925049259410246e-05, "loss": 1.4193, "step": 1765 }, { "epoch": 0.12026090501426824, - "grad_norm": 0.5783700942993164, + "grad_norm": 0.5793489217758179, "learning_rate": 4.9248369343660826e-05, - "loss": 1.3689, + "loss": 1.3688, "step": 1770 }, { "epoch": 0.12060062508493002, - "grad_norm": 0.5613039135932922, + "grad_norm": 0.5564126968383789, "learning_rate": 4.924624609321919e-05, - "loss": 1.4366, + "loss": 1.4365, "step": 1775 }, { "epoch": 0.1209403451555918, - "grad_norm": 0.5893939733505249, + "grad_norm": 0.5775806903839111, "learning_rate": 4.9244122842777554e-05, - "loss": 1.4309, + "loss": 1.4303, "step": 1780 }, { "epoch": 0.12128006522625356, - "grad_norm": 0.5168840289115906, + "grad_norm": 0.5161932110786438, "learning_rate": 4.924199959233592e-05, - "loss": 1.4472, + "loss": 1.4476, "step": 1785 }, { "epoch": 0.12161978529691535, - "grad_norm": 0.5954933166503906, + "grad_norm": 0.5869646668434143, "learning_rate": 4.923987634189428e-05, - "loss": 1.3857, + "loss": 1.3854, "step": 1790 }, { "epoch": 0.12195950536757712, - "grad_norm": 0.6200373768806458, + "grad_norm": 0.6200955510139465, "learning_rate": 4.9237753091452646e-05, - "loss": 1.3801, + "loss": 1.3806, "step": 1795 }, { "epoch": 0.12229922543823889, - "grad_norm": 0.5800577402114868, + "grad_norm": 0.5824678540229797, "learning_rate": 4.923562984101101e-05, - "loss": 1.4184, + "loss": 1.4186, "step": 1800 }, { "epoch": 0.12263894550890067, - "grad_norm": 0.5880134105682373, + "grad_norm": 0.5879980325698853, "learning_rate": 4.9233506590569374e-05, - "loss": 1.3974, + "loss": 1.3976, "step": 1805 }, { "epoch": 0.12297866557956244, - "grad_norm": 0.5568594932556152, + "grad_norm": 0.5551352500915527, "learning_rate": 4.923138334012774e-05, - "loss": 1.4636, + "loss": 1.4638, "step": 1810 }, { "epoch": 0.12331838565022421, - "grad_norm": 0.5803167819976807, + "grad_norm": 0.582542896270752, "learning_rate": 4.92292600896861e-05, - "loss": 1.4275, + "loss": 1.4282, "step": 1815 }, { "epoch": 0.123658105720886, - "grad_norm": 0.6611342430114746, + "grad_norm": 0.6597841382026672, "learning_rate": 4.9227136839244466e-05, - "loss": 1.4995, + "loss": 1.4993, "step": 1820 }, { "epoch": 0.12399782579154776, - "grad_norm": 0.5750046372413635, + "grad_norm": 0.5749479532241821, "learning_rate": 4.922501358880282e-05, "loss": 1.4043, "step": 1825 }, { "epoch": 0.12433754586220953, - "grad_norm": 0.5646703839302063, + "grad_norm": 0.5649749040603638, "learning_rate": 4.9222890338361194e-05, - "loss": 1.3777, + "loss": 1.3778, "step": 1830 }, { "epoch": 0.12467726593287132, - "grad_norm": 0.46412211656570435, + "grad_norm": 0.4707821011543274, "learning_rate": 4.922076708791956e-05, - "loss": 1.427, + "loss": 1.4273, "step": 1835 }, { "epoch": 0.1250169860035331, - "grad_norm": 0.6101703643798828, + "grad_norm": 0.6115847229957581, "learning_rate": 4.9218643837477915e-05, - "loss": 1.4434, + "loss": 1.4437, "step": 1840 }, { "epoch": 0.12535670607419486, - "grad_norm": 0.5917672514915466, + "grad_norm": 0.5928316116333008, "learning_rate": 4.9216520587036286e-05, - "loss": 1.435, + "loss": 1.4348, "step": 1845 }, { "epoch": 0.12569642614485663, - "grad_norm": 0.6468296647071838, + "grad_norm": 0.6487690210342407, "learning_rate": 4.921439733659465e-05, - "loss": 1.4413, + "loss": 1.4412, "step": 1850 }, { "epoch": 0.12603614621551842, - "grad_norm": 0.6298085451126099, + "grad_norm": 0.6372539401054382, "learning_rate": 4.921227408615301e-05, "loss": 1.4177, "step": 1855 }, { "epoch": 0.1263758662861802, - "grad_norm": 0.6000416874885559, + "grad_norm": 0.5986123085021973, "learning_rate": 4.921015083571138e-05, - "loss": 1.4596, + "loss": 1.4595, "step": 1860 }, { "epoch": 0.12671558635684196, - "grad_norm": 0.5701744556427002, + "grad_norm": 0.5672442317008972, "learning_rate": 4.920802758526974e-05, - "loss": 1.2559, + "loss": 1.256, "step": 1865 }, { "epoch": 0.12705530642750373, - "grad_norm": 0.6048057675361633, + "grad_norm": 0.6015438437461853, "learning_rate": 4.92059043348281e-05, - "loss": 1.4887, + "loss": 1.4885, "step": 1870 }, { "epoch": 0.1273950264981655, - "grad_norm": 0.6061444878578186, + "grad_norm": 0.6073367595672607, "learning_rate": 4.920378108438647e-05, - "loss": 1.4442, + "loss": 1.4447, "step": 1875 }, { "epoch": 0.12773474656882727, - "grad_norm": 0.6482565402984619, + "grad_norm": 0.6518848538398743, "learning_rate": 4.9201657833944834e-05, - "loss": 1.5101, + "loss": 1.5095, "step": 1880 }, { "epoch": 0.12807446663948907, - "grad_norm": 0.6278590559959412, + "grad_norm": 0.6262403726577759, "learning_rate": 4.91995345835032e-05, - "loss": 1.4861, + "loss": 1.4859, "step": 1885 }, { "epoch": 0.12841418671015084, - "grad_norm": 0.5761988759040833, + "grad_norm": 0.5696122646331787, "learning_rate": 4.919741133306156e-05, - "loss": 1.4632, + "loss": 1.4633, "step": 1890 }, { "epoch": 0.1287539067808126, - "grad_norm": 0.5002067685127258, + "grad_norm": 0.49948787689208984, "learning_rate": 4.919528808261992e-05, "loss": 1.3721, "step": 1895 }, { "epoch": 0.12909362685147438, - "grad_norm": 0.5973879098892212, + "grad_norm": 0.595719039440155, "learning_rate": 4.919316483217829e-05, - "loss": 1.4422, + "loss": 1.4415, "step": 1900 }, { "epoch": 0.12943334692213615, - "grad_norm": 0.6070477366447449, + "grad_norm": 0.6072402000427246, "learning_rate": 4.9191041581736654e-05, - "loss": 1.446, + "loss": 1.4464, "step": 1905 }, { "epoch": 0.12977306699279795, - "grad_norm": 0.562765896320343, + "grad_norm": 0.5648959875106812, "learning_rate": 4.918891833129501e-05, - "loss": 1.3384, + "loss": 1.3386, "step": 1910 }, { "epoch": 0.13011278706345972, - "grad_norm": 0.5640690922737122, + "grad_norm": 0.5625189542770386, "learning_rate": 4.918679508085338e-05, "loss": 1.4031, "step": 1915 }, { "epoch": 0.1304525071341215, - "grad_norm": 0.639013946056366, + "grad_norm": 0.6404390931129456, "learning_rate": 4.9184671830411746e-05, - "loss": 1.3823, + "loss": 1.3821, "step": 1920 }, { "epoch": 0.13079222720478326, - "grad_norm": 0.5957516431808472, + "grad_norm": 0.5951664447784424, "learning_rate": 4.91825485799701e-05, - "loss": 1.4052, + "loss": 1.4058, "step": 1925 }, { "epoch": 0.13113194727544503, - "grad_norm": 0.6282196044921875, + "grad_norm": 0.6264207363128662, "learning_rate": 4.9180425329528474e-05, - "loss": 1.371, + "loss": 1.3712, "step": 1930 }, { "epoch": 0.1314716673461068, - "grad_norm": 0.5092763304710388, + "grad_norm": 0.5121632814407349, "learning_rate": 4.917830207908684e-05, - "loss": 1.4383, + "loss": 1.4384, "step": 1935 }, { "epoch": 0.1318113874167686, - "grad_norm": 0.539523720741272, + "grad_norm": 0.5396740436553955, "learning_rate": 4.9176178828645195e-05, - "loss": 1.3717, + "loss": 1.3718, "step": 1940 }, { "epoch": 0.13215110748743036, - "grad_norm": 0.6316164135932922, + "grad_norm": 0.6380709409713745, "learning_rate": 4.9174055578203566e-05, - "loss": 1.3851, + "loss": 1.3852, "step": 1945 }, { "epoch": 0.13249082755809213, - "grad_norm": 0.5689708590507507, + "grad_norm": 0.5703464150428772, "learning_rate": 4.917193232776193e-05, - "loss": 1.4547, + "loss": 1.4552, "step": 1950 }, { "epoch": 0.1328305476287539, - "grad_norm": 0.5140627026557922, + "grad_norm": 0.5154551863670349, "learning_rate": 4.916980907732029e-05, - "loss": 1.4956, + "loss": 1.4955, "step": 1955 }, { "epoch": 0.13317026769941567, - "grad_norm": 0.6105151772499084, + "grad_norm": 0.6107110381126404, "learning_rate": 4.916768582687866e-05, - "loss": 1.4766, + "loss": 1.477, "step": 1960 }, { "epoch": 0.13350998777007744, - "grad_norm": 0.541977047920227, + "grad_norm": 0.5411398410797119, "learning_rate": 4.916556257643702e-05, - "loss": 1.3756, + "loss": 1.3754, "step": 1965 }, { "epoch": 0.13384970784073924, - "grad_norm": 0.6562601923942566, + "grad_norm": 0.6577611565589905, "learning_rate": 4.916343932599538e-05, - "loss": 1.4917, + "loss": 1.4919, "step": 1970 }, { "epoch": 0.134189427911401, - "grad_norm": 0.5527669787406921, + "grad_norm": 0.5500496029853821, "learning_rate": 4.916131607555375e-05, "loss": 1.3802, "step": 1975 }, { "epoch": 0.13452914798206278, - "grad_norm": 0.6226165294647217, + "grad_norm": 0.6189011335372925, "learning_rate": 4.915919282511211e-05, - "loss": 1.3691, + "loss": 1.3693, "step": 1980 }, { "epoch": 0.13486886805272455, - "grad_norm": 0.56634521484375, + "grad_norm": 0.5683608055114746, "learning_rate": 4.915706957467047e-05, - "loss": 1.4878, + "loss": 1.4879, "step": 1985 }, { "epoch": 0.13520858812338632, - "grad_norm": 0.5585847496986389, + "grad_norm": 0.558321475982666, "learning_rate": 4.915494632422884e-05, - "loss": 1.3215, + "loss": 1.3219, "step": 1990 }, { "epoch": 0.13554830819404812, - "grad_norm": 0.5256451368331909, + "grad_norm": 0.5274139642715454, "learning_rate": 4.91528230737872e-05, "loss": 1.4867, "step": 1995 }, { "epoch": 0.1358880282647099, - "grad_norm": 0.6245139837265015, + "grad_norm": 0.6212067604064941, "learning_rate": 4.915069982334556e-05, - "loss": 1.4282, + "loss": 1.428, "step": 2000 }, { "epoch": 0.13622774833537166, - "grad_norm": 0.5906956195831299, + "grad_norm": 0.5847666263580322, "learning_rate": 4.9148576572903934e-05, - "loss": 1.4018, + "loss": 1.4027, "step": 2005 }, { "epoch": 0.13656746840603343, - "grad_norm": 0.6229907870292664, + "grad_norm": 0.619773805141449, "learning_rate": 4.914645332246229e-05, - "loss": 1.531, + "loss": 1.5314, "step": 2010 }, { "epoch": 0.1369071884766952, - "grad_norm": 0.5922444462776184, + "grad_norm": 0.5914238095283508, "learning_rate": 4.9144330072020655e-05, - "loss": 1.5008, + "loss": 1.5009, "step": 2015 }, { "epoch": 0.13724690854735697, - "grad_norm": 0.6321733593940735, + "grad_norm": 0.6325128078460693, "learning_rate": 4.9142206821579026e-05, - "loss": 1.5288, + "loss": 1.5284, "step": 2020 }, { "epoch": 0.13758662861801876, - "grad_norm": 0.6001906394958496, + "grad_norm": 0.6023498773574829, "learning_rate": 4.914008357113738e-05, "loss": 1.4587, "step": 2025 }, { "epoch": 0.13792634868868053, - "grad_norm": 0.6095107793807983, + "grad_norm": 0.6098828911781311, "learning_rate": 4.913796032069575e-05, - "loss": 1.3526, + "loss": 1.3527, "step": 2030 }, { "epoch": 0.1382660687593423, - "grad_norm": 0.6127040386199951, + "grad_norm": 0.6126969456672668, "learning_rate": 4.913583707025412e-05, - "loss": 1.4356, + "loss": 1.4359, "step": 2035 }, { "epoch": 0.13860578883000407, - "grad_norm": 0.5953993201255798, + "grad_norm": 0.5946942567825317, "learning_rate": 4.9133713819812475e-05, - "loss": 1.4753, + "loss": 1.4746, "step": 2040 }, { "epoch": 0.13894550890066584, - "grad_norm": 0.5859862565994263, + "grad_norm": 0.5871629118919373, "learning_rate": 4.913159056937084e-05, "loss": 1.5111, "step": 2045 }, { "epoch": 0.1392852289713276, - "grad_norm": 0.642520546913147, + "grad_norm": 0.6401618123054504, "learning_rate": 4.91294673189292e-05, - "loss": 1.4449, + "loss": 1.4457, "step": 2050 }, { "epoch": 0.1396249490419894, - "grad_norm": 0.6965727806091309, + "grad_norm": 0.6944628953933716, "learning_rate": 4.912734406848757e-05, - "loss": 1.3786, + "loss": 1.3792, "step": 2055 }, { "epoch": 0.13996466911265118, - "grad_norm": 0.562048077583313, + "grad_norm": 0.561116635799408, "learning_rate": 4.912522081804593e-05, - "loss": 1.3695, + "loss": 1.3699, "step": 2060 }, { "epoch": 0.14030438918331295, - "grad_norm": 0.7077406644821167, + "grad_norm": 0.7094035744667053, "learning_rate": 4.9123097567604295e-05, - "loss": 1.4872, + "loss": 1.4879, "step": 2065 }, { "epoch": 0.14064410925397472, - "grad_norm": 0.5612382292747498, + "grad_norm": 0.5590683817863464, "learning_rate": 4.912097431716266e-05, - "loss": 1.4583, + "loss": 1.4576, "step": 2070 }, { "epoch": 0.1409838293246365, - "grad_norm": 0.618161141872406, + "grad_norm": 0.6172699928283691, "learning_rate": 4.911885106672102e-05, - "loss": 1.4502, + "loss": 1.45, "step": 2075 }, { "epoch": 0.1413235493952983, - "grad_norm": 0.6133080720901489, + "grad_norm": 0.595493733882904, "learning_rate": 4.911672781627939e-05, - "loss": 1.4537, + "loss": 1.4543, "step": 2080 }, { "epoch": 0.14166326946596006, - "grad_norm": 0.6027617454528809, + "grad_norm": 0.6013965010643005, "learning_rate": 4.911460456583775e-05, - "loss": 1.4806, + "loss": 1.4808, "step": 2085 }, { "epoch": 0.14200298953662183, - "grad_norm": 0.5253147482872009, + "grad_norm": 0.5255105495452881, "learning_rate": 4.9112481315396115e-05, - "loss": 1.409, + "loss": 1.4088, "step": 2090 }, { "epoch": 0.1423427096072836, - "grad_norm": 0.6017151474952698, + "grad_norm": 0.602545440196991, "learning_rate": 4.911035806495448e-05, - "loss": 1.4645, + "loss": 1.4646, "step": 2095 }, { "epoch": 0.14268242967794537, - "grad_norm": 0.5726915597915649, + "grad_norm": 0.5700831413269043, "learning_rate": 4.910823481451284e-05, - "loss": 1.3991, + "loss": 1.3989, "step": 2100 }, { "epoch": 0.14302214974860714, - "grad_norm": 0.5525890588760376, + "grad_norm": 0.5524193644523621, "learning_rate": 4.910611156407121e-05, - "loss": 1.4447, + "loss": 1.4444, "step": 2105 }, { "epoch": 0.14336186981926893, - "grad_norm": 0.5599091649055481, + "grad_norm": 0.5617913007736206, "learning_rate": 4.910398831362957e-05, - "loss": 1.4747, + "loss": 1.4748, "step": 2110 }, { "epoch": 0.1437015898899307, - "grad_norm": 0.5579841136932373, + "grad_norm": 0.5601038932800293, "learning_rate": 4.9101865063187935e-05, - "loss": 1.4366, + "loss": 1.4373, "step": 2115 }, { "epoch": 0.14404130996059247, - "grad_norm": 0.6631743907928467, + "grad_norm": 0.6611192226409912, "learning_rate": 4.90997418127463e-05, - "loss": 1.4922, + "loss": 1.4925, "step": 2120 }, { "epoch": 0.14438103003125424, - "grad_norm": 0.6793774366378784, + "grad_norm": 0.6762072443962097, "learning_rate": 4.909761856230466e-05, - "loss": 1.4246, + "loss": 1.4248, "step": 2125 }, { "epoch": 0.144720750101916, - "grad_norm": 0.622520923614502, + "grad_norm": 0.6223987340927124, "learning_rate": 4.909549531186303e-05, - "loss": 1.3589, + "loss": 1.358, "step": 2130 }, { "epoch": 0.14506047017257778, - "grad_norm": 0.5355719327926636, + "grad_norm": 0.5363806486129761, "learning_rate": 4.909337206142139e-05, "loss": 1.3717, "step": 2135 }, { "epoch": 0.14540019024323958, - "grad_norm": 0.6087984442710876, + "grad_norm": 0.6105725169181824, "learning_rate": 4.9091248810979755e-05, - "loss": 1.4251, + "loss": 1.4256, "step": 2140 }, { "epoch": 0.14573991031390135, - "grad_norm": 0.5506510734558105, + "grad_norm": 0.5507329106330872, "learning_rate": 4.908912556053812e-05, - "loss": 1.4563, + "loss": 1.4571, "step": 2145 }, { "epoch": 0.14607963038456312, - "grad_norm": 0.5640601515769958, + "grad_norm": 0.5728895664215088, "learning_rate": 4.9087002310096483e-05, "loss": 1.2312, "step": 2150 }, { "epoch": 0.1464193504552249, - "grad_norm": 0.6096499562263489, + "grad_norm": 0.6046918630599976, "learning_rate": 4.908487905965485e-05, "loss": 1.571, "step": 2155 }, { "epoch": 0.14675907052588666, - "grad_norm": 0.5384127497673035, + "grad_norm": 0.5339946746826172, "learning_rate": 4.908275580921321e-05, - "loss": 1.3816, + "loss": 1.3825, "step": 2160 }, { "epoch": 0.14709879059654846, - "grad_norm": 0.6377132534980774, + "grad_norm": 0.6427642703056335, "learning_rate": 4.9080632558771575e-05, "loss": 1.3667, "step": 2165 }, { "epoch": 0.14743851066721023, - "grad_norm": 0.5406596064567566, + "grad_norm": 0.5397003889083862, "learning_rate": 4.907850930832994e-05, - "loss": 1.3812, + "loss": 1.3811, "step": 2170 }, { "epoch": 0.147778230737872, - "grad_norm": 0.6148573160171509, + "grad_norm": 0.6125214099884033, "learning_rate": 4.9076386057888303e-05, - "loss": 1.5092, + "loss": 1.5091, "step": 2175 }, { "epoch": 0.14811795080853377, - "grad_norm": 0.5781053900718689, + "grad_norm": 0.578341543674469, "learning_rate": 4.907426280744666e-05, "loss": 1.3639, "step": 2180 }, { "epoch": 0.14845767087919554, - "grad_norm": 0.5883496403694153, + "grad_norm": 0.5864296555519104, "learning_rate": 4.907213955700503e-05, "loss": 1.531, "step": 2185 }, { "epoch": 0.1487973909498573, - "grad_norm": 0.5491074323654175, + "grad_norm": 0.5502539873123169, "learning_rate": 4.9070016306563395e-05, - "loss": 1.4468, + "loss": 1.4462, "step": 2190 }, { "epoch": 0.1491371110205191, - "grad_norm": 0.5220900177955627, + "grad_norm": 0.5269922018051147, "learning_rate": 4.906789305612175e-05, - "loss": 1.3442, + "loss": 1.3444, "step": 2195 }, { "epoch": 0.14947683109118087, - "grad_norm": 0.5424416065216064, + "grad_norm": 0.5420787334442139, "learning_rate": 4.9065769805680123e-05, - "loss": 1.4565, + "loss": 1.4561, "step": 2200 }, { "epoch": 0.14981655116184264, - "grad_norm": 0.5749632716178894, + "grad_norm": 0.5739856362342834, "learning_rate": 4.906364655523849e-05, - "loss": 1.3293, + "loss": 1.3292, "step": 2205 }, { "epoch": 0.1501562712325044, - "grad_norm": 0.581767201423645, + "grad_norm": 0.5831305384635925, "learning_rate": 4.9061523304796845e-05, - "loss": 1.4812, + "loss": 1.481, "step": 2210 }, { "epoch": 0.15049599130316618, - "grad_norm": 0.6531165838241577, + "grad_norm": 0.6508095860481262, "learning_rate": 4.9059400054355215e-05, - "loss": 1.4215, + "loss": 1.4212, "step": 2215 }, { "epoch": 0.15083571137382795, - "grad_norm": 0.5991101264953613, + "grad_norm": 0.5992867350578308, "learning_rate": 4.905727680391358e-05, - "loss": 1.4545, + "loss": 1.4549, "step": 2220 }, { "epoch": 0.15117543144448975, - "grad_norm": 0.660378098487854, + "grad_norm": 0.6591667532920837, "learning_rate": 4.9055153553471943e-05, - "loss": 1.4995, + "loss": 1.499, "step": 2225 }, { "epoch": 0.15151515151515152, - "grad_norm": 0.5730030536651611, + "grad_norm": 0.5719335079193115, "learning_rate": 4.905303030303031e-05, - "loss": 1.3848, + "loss": 1.3847, "step": 2230 }, { "epoch": 0.1518548715858133, - "grad_norm": 0.5338610410690308, + "grad_norm": 0.5334826111793518, "learning_rate": 4.905090705258867e-05, - "loss": 1.437, + "loss": 1.4372, "step": 2235 }, { "epoch": 0.15219459165647506, - "grad_norm": 0.5345319509506226, + "grad_norm": 0.5362840294837952, "learning_rate": 4.9048783802147035e-05, - "loss": 1.501, + "loss": 1.5011, "step": 2240 }, { "epoch": 0.15253431172713683, - "grad_norm": 0.5088982582092285, + "grad_norm": 0.5110069513320923, "learning_rate": 4.90466605517054e-05, "loss": 1.4801, "step": 2245 }, { "epoch": 0.15287403179779863, - "grad_norm": 0.5582383871078491, + "grad_norm": 0.5597143173217773, "learning_rate": 4.904453730126376e-05, - "loss": 1.3736, + "loss": 1.3729, "step": 2250 }, { "epoch": 0.1532137518684604, - "grad_norm": 0.5619956254959106, + "grad_norm": 0.5623816251754761, "learning_rate": 4.904241405082213e-05, - "loss": 1.325, + "loss": 1.3252, "step": 2255 }, { "epoch": 0.15355347193912217, - "grad_norm": 0.5197911858558655, + "grad_norm": 0.5228548645973206, "learning_rate": 4.904029080038049e-05, - "loss": 1.4212, + "loss": 1.4215, "step": 2260 }, { "epoch": 0.15389319200978394, - "grad_norm": 0.6320146322250366, + "grad_norm": 0.6350699663162231, "learning_rate": 4.903816754993885e-05, - "loss": 1.4784, + "loss": 1.4786, "step": 2265 }, { "epoch": 0.1542329120804457, - "grad_norm": 0.6770452260971069, + "grad_norm": 0.6745502948760986, "learning_rate": 4.903604429949722e-05, - "loss": 1.3964, + "loss": 1.3966, "step": 2270 }, { "epoch": 0.15457263215110748, - "grad_norm": 0.6808265447616577, + "grad_norm": 0.683824360370636, "learning_rate": 4.9033921049055583e-05, - "loss": 1.4104, + "loss": 1.4105, "step": 2275 }, { "epoch": 0.15491235222176927, - "grad_norm": 0.5745781660079956, + "grad_norm": 0.5739218592643738, "learning_rate": 4.903179779861394e-05, - "loss": 1.4284, + "loss": 1.4286, "step": 2280 }, { "epoch": 0.15525207229243104, - "grad_norm": 0.6451144218444824, + "grad_norm": 0.6470056772232056, "learning_rate": 4.902967454817231e-05, - "loss": 1.3964, + "loss": 1.3956, "step": 2285 }, { "epoch": 0.1555917923630928, - "grad_norm": 0.7113416790962219, + "grad_norm": 0.7059006094932556, "learning_rate": 4.9027551297730676e-05, - "loss": 1.4505, + "loss": 1.4501, "step": 2290 }, { "epoch": 0.15593151243375458, - "grad_norm": 0.5574209094047546, + "grad_norm": 0.5586583614349365, "learning_rate": 4.902542804728903e-05, - "loss": 1.4106, + "loss": 1.4107, "step": 2295 }, { "epoch": 0.15627123250441635, - "grad_norm": 0.5501256585121155, + "grad_norm": 0.549902617931366, "learning_rate": 4.9023304796847404e-05, - "loss": 1.473, + "loss": 1.4727, "step": 2300 }, { "epoch": 0.15661095257507815, - "grad_norm": 0.5758779048919678, + "grad_norm": 0.5777664184570312, "learning_rate": 4.902118154640577e-05, - "loss": 1.4395, + "loss": 1.4391, "step": 2305 }, { "epoch": 0.15695067264573992, - "grad_norm": 0.561896562576294, + "grad_norm": 0.5615882873535156, "learning_rate": 4.9019058295964125e-05, - "loss": 1.427, + "loss": 1.4272, "step": 2310 }, { "epoch": 0.1572903927164017, - "grad_norm": 0.6061218976974487, + "grad_norm": 0.6097357869148254, "learning_rate": 4.9016935045522496e-05, - "loss": 1.384, + "loss": 1.3842, "step": 2315 }, { "epoch": 0.15763011278706346, - "grad_norm": 0.5345364809036255, + "grad_norm": 0.5372126698493958, "learning_rate": 4.901481179508085e-05, - "loss": 1.4105, + "loss": 1.4103, "step": 2320 }, { "epoch": 0.15796983285772523, - "grad_norm": 0.6396936774253845, + "grad_norm": 0.6201612949371338, "learning_rate": 4.901268854463922e-05, - "loss": 1.4066, + "loss": 1.4054, "step": 2325 }, { "epoch": 0.158309552928387, - "grad_norm": 0.5536668300628662, + "grad_norm": 0.5524593591690063, "learning_rate": 4.901056529419759e-05, - "loss": 1.4842, + "loss": 1.4839, "step": 2330 }, { "epoch": 0.1586492729990488, - "grad_norm": 0.5542172193527222, + "grad_norm": 0.553835928440094, "learning_rate": 4.9008442043755945e-05, - "loss": 1.4228, + "loss": 1.4226, "step": 2335 }, { "epoch": 0.15898899306971057, - "grad_norm": 0.5791119337081909, + "grad_norm": 0.5874102115631104, "learning_rate": 4.900631879331431e-05, - "loss": 1.5023, + "loss": 1.5016, "step": 2340 }, { "epoch": 0.15932871314037234, - "grad_norm": 0.5835673213005066, + "grad_norm": 0.575965166091919, "learning_rate": 4.900419554287268e-05, - "loss": 1.4371, + "loss": 1.4367, "step": 2345 }, { "epoch": 0.1596684332110341, - "grad_norm": 0.6104651689529419, + "grad_norm": 0.6099889278411865, "learning_rate": 4.900207229243104e-05, - "loss": 1.3615, + "loss": 1.3614, "step": 2350 }, { "epoch": 0.16000815328169588, - "grad_norm": 0.5057826042175293, + "grad_norm": 0.5063363909721375, "learning_rate": 4.89999490419894e-05, - "loss": 1.4277, + "loss": 1.4281, "step": 2355 }, { "epoch": 0.16034787335235764, - "grad_norm": 0.506723165512085, + "grad_norm": 0.5066052079200745, "learning_rate": 4.899782579154777e-05, - "loss": 1.4386, + "loss": 1.4387, "step": 2360 }, { "epoch": 0.16068759342301944, - "grad_norm": 0.592574954032898, + "grad_norm": 0.594146192073822, "learning_rate": 4.899570254110613e-05, - "loss": 1.465, + "loss": 1.4648, "step": 2365 }, { "epoch": 0.1610273134936812, - "grad_norm": 0.6453089714050293, + "grad_norm": 0.6465543508529663, "learning_rate": 4.899357929066449e-05, "loss": 1.3677, "step": 2370 }, { "epoch": 0.16136703356434298, - "grad_norm": 0.6195045113563538, + "grad_norm": 0.6143479943275452, "learning_rate": 4.8991456040222864e-05, - "loss": 1.3894, + "loss": 1.3891, "step": 2375 }, { "epoch": 0.16170675363500475, - "grad_norm": 0.5778237581253052, + "grad_norm": 0.5801399350166321, "learning_rate": 4.898933278978122e-05, - "loss": 1.3848, + "loss": 1.3854, "step": 2380 }, { "epoch": 0.16204647370566652, - "grad_norm": 0.5943452715873718, + "grad_norm": 0.597212553024292, "learning_rate": 4.8987209539339585e-05, - "loss": 1.4076, + "loss": 1.4072, "step": 2385 }, { "epoch": 0.16238619377632832, - "grad_norm": 0.6236830949783325, + "grad_norm": 0.6248683333396912, "learning_rate": 4.8985086288897956e-05, - "loss": 1.4971, + "loss": 1.4964, "step": 2390 }, { "epoch": 0.1627259138469901, - "grad_norm": 0.45929020643234253, + "grad_norm": 0.4583134949207306, "learning_rate": 4.898296303845631e-05, - "loss": 1.4483, + "loss": 1.4485, "step": 2395 }, { "epoch": 0.16306563391765186, - "grad_norm": 0.6343552470207214, + "grad_norm": 0.6305431127548218, "learning_rate": 4.898083978801468e-05, - "loss": 1.4013, + "loss": 1.4016, "step": 2400 }, { "epoch": 0.16340535398831363, - "grad_norm": 0.5927261710166931, + "grad_norm": 0.5952189564704895, "learning_rate": 4.897871653757304e-05, - "loss": 1.3257, + "loss": 1.3266, "step": 2405 }, { "epoch": 0.1637450740589754, - "grad_norm": 0.5226753354072571, + "grad_norm": 0.5233879089355469, "learning_rate": 4.8976593287131405e-05, - "loss": 1.4674, + "loss": 1.4677, "step": 2410 }, { "epoch": 0.16408479412963717, - "grad_norm": 0.5839881300926208, + "grad_norm": 0.5826775431632996, "learning_rate": 4.897447003668977e-05, - "loss": 1.5248, + "loss": 1.5247, "step": 2415 }, { "epoch": 0.16442451420029897, - "grad_norm": 0.6374481320381165, + "grad_norm": 0.637760579586029, "learning_rate": 4.897234678624813e-05, - "loss": 1.4081, + "loss": 1.4075, "step": 2420 }, { "epoch": 0.16476423427096074, - "grad_norm": 0.5726152658462524, + "grad_norm": 0.5712495446205139, "learning_rate": 4.89702235358065e-05, - "loss": 1.4522, + "loss": 1.4527, "step": 2425 }, { "epoch": 0.1651039543416225, - "grad_norm": 0.6431419253349304, + "grad_norm": 0.6425234079360962, "learning_rate": 4.896810028536486e-05, - "loss": 1.3736, + "loss": 1.3738, "step": 2430 }, { "epoch": 0.16544367441228428, - "grad_norm": 0.5664803981781006, + "grad_norm": 0.5666119456291199, "learning_rate": 4.8965977034923225e-05, - "loss": 1.6692, + "loss": 1.669, "step": 2435 }, { "epoch": 0.16578339448294604, - "grad_norm": 0.6327283978462219, + "grad_norm": 0.6323779225349426, "learning_rate": 4.896385378448159e-05, - "loss": 1.4597, + "loss": 1.4601, "step": 2440 }, { "epoch": 0.16612311455360781, - "grad_norm": 0.6446481943130493, + "grad_norm": 0.649678647518158, "learning_rate": 4.896173053403995e-05, - "loss": 1.5186, + "loss": 1.5189, "step": 2445 }, { "epoch": 0.1664628346242696, - "grad_norm": 0.6424679160118103, + "grad_norm": 0.6421571373939514, "learning_rate": 4.895960728359832e-05, - "loss": 1.4282, + "loss": 1.4288, "step": 2450 }, { "epoch": 0.16680255469493138, - "grad_norm": 0.6918651461601257, + "grad_norm": 0.6975622177124023, "learning_rate": 4.895748403315668e-05, - "loss": 1.3549, + "loss": 1.3553, "step": 2455 }, { "epoch": 0.16714227476559315, - "grad_norm": 0.6785680651664734, + "grad_norm": 0.6754254102706909, "learning_rate": 4.8955360782715045e-05, - "loss": 1.4485, + "loss": 1.4479, "step": 2460 }, { "epoch": 0.16748199483625492, - "grad_norm": 0.5688127875328064, + "grad_norm": 0.5676441788673401, "learning_rate": 4.895323753227341e-05, - "loss": 1.3694, + "loss": 1.3693, "step": 2465 }, { "epoch": 0.1678217149069167, - "grad_norm": 0.5057792067527771, + "grad_norm": 0.5039764642715454, "learning_rate": 4.895111428183177e-05, - "loss": 1.4069, + "loss": 1.4065, "step": 2470 }, { "epoch": 0.1681614349775785, - "grad_norm": 0.6678634881973267, + "grad_norm": 0.6798781752586365, "learning_rate": 4.894899103139014e-05, - "loss": 1.4047, + "loss": 1.4049, "step": 2475 }, { "epoch": 0.16850115504824026, - "grad_norm": 0.6946313977241516, + "grad_norm": 0.6955761909484863, "learning_rate": 4.89468677809485e-05, - "loss": 1.4301, + "loss": 1.4293, "step": 2480 }, { "epoch": 0.16884087511890203, - "grad_norm": 0.5884628295898438, + "grad_norm": 0.5921393036842346, "learning_rate": 4.8944744530506865e-05, - "loss": 1.5095, + "loss": 1.5092, "step": 2485 }, { "epoch": 0.1691805951895638, - "grad_norm": 0.5525778532028198, + "grad_norm": 0.552216112613678, "learning_rate": 4.894262128006523e-05, - "loss": 1.4648, + "loss": 1.4646, "step": 2490 }, { "epoch": 0.16952031526022557, - "grad_norm": 0.6558218002319336, + "grad_norm": 0.6471473574638367, "learning_rate": 4.894049802962359e-05, "loss": 1.4296, "step": 2495 }, { "epoch": 0.16986003533088734, - "grad_norm": 0.5888893604278564, + "grad_norm": 0.5919592380523682, "learning_rate": 4.893837477918196e-05, "loss": 1.4795, "step": 2500 }, { "epoch": 0.17019975540154914, - "grad_norm": 0.6494855284690857, + "grad_norm": 0.6483196020126343, "learning_rate": 4.893625152874032e-05, - "loss": 1.4289, + "loss": 1.4293, "step": 2505 }, { "epoch": 0.1705394754722109, - "grad_norm": 0.5585719347000122, + "grad_norm": 0.5579963326454163, "learning_rate": 4.8934128278298685e-05, - "loss": 1.3688, + "loss": 1.3687, "step": 2510 }, { "epoch": 0.17087919554287267, - "grad_norm": 0.566693902015686, + "grad_norm": 0.5652419924736023, "learning_rate": 4.893200502785705e-05, - "loss": 1.4202, + "loss": 1.4196, "step": 2515 }, { "epoch": 0.17121891561353444, - "grad_norm": 0.5679460167884827, + "grad_norm": 0.5704705715179443, "learning_rate": 4.8929881777415406e-05, - "loss": 1.4117, + "loss": 1.4116, "step": 2520 }, { "epoch": 0.17155863568419621, - "grad_norm": 0.5545666217803955, + "grad_norm": 0.5532625317573547, "learning_rate": 4.892775852697378e-05, - "loss": 1.4807, + "loss": 1.4809, "step": 2525 }, { "epoch": 0.17189835575485798, - "grad_norm": 0.650221049785614, + "grad_norm": 0.647752046585083, "learning_rate": 4.892563527653214e-05, - "loss": 1.5084, + "loss": 1.5078, "step": 2530 }, { "epoch": 0.17223807582551978, - "grad_norm": 0.5717597603797913, + "grad_norm": 0.5708048939704895, "learning_rate": 4.89235120260905e-05, - "loss": 1.4395, + "loss": 1.4393, "step": 2535 }, { "epoch": 0.17257779589618155, - "grad_norm": 0.7092136144638062, + "grad_norm": 0.7074378728866577, "learning_rate": 4.892138877564887e-05, - "loss": 1.4863, + "loss": 1.4861, "step": 2540 }, { "epoch": 0.17291751596684332, - "grad_norm": 0.5754497051239014, + "grad_norm": 0.5761881470680237, "learning_rate": 4.891926552520723e-05, - "loss": 1.2872, + "loss": 1.2866, "step": 2545 }, { "epoch": 0.1732572360375051, - "grad_norm": 0.6163140535354614, + "grad_norm": 0.6130074858665466, "learning_rate": 4.891714227476559e-05, - "loss": 1.4011, + "loss": 1.4007, "step": 2550 }, { "epoch": 0.17359695610816686, - "grad_norm": 0.5960626006126404, + "grad_norm": 0.5959569811820984, "learning_rate": 4.891501902432396e-05, - "loss": 1.4145, + "loss": 1.4147, "step": 2555 }, { "epoch": 0.17393667617882866, - "grad_norm": 0.5820543169975281, + "grad_norm": 0.5839036107063293, "learning_rate": 4.8912895773882325e-05, - "loss": 1.3803, + "loss": 1.38, "step": 2560 }, { "epoch": 0.17427639624949043, - "grad_norm": 0.6925805807113647, + "grad_norm": 0.6866303086280823, "learning_rate": 4.891077252344069e-05, - "loss": 1.3168, + "loss": 1.3169, "step": 2565 }, { "epoch": 0.1746161163201522, - "grad_norm": 0.5974335670471191, + "grad_norm": 0.5958876013755798, "learning_rate": 4.890864927299905e-05, - "loss": 1.4439, + "loss": 1.4442, "step": 2570 }, { "epoch": 0.17495583639081397, - "grad_norm": 0.6052380204200745, + "grad_norm": 0.6119503974914551, "learning_rate": 4.890652602255742e-05, - "loss": 1.3726, + "loss": 1.3728, "step": 2575 }, { "epoch": 0.17529555646147574, - "grad_norm": 0.6102190017700195, + "grad_norm": 0.6121228933334351, "learning_rate": 4.890440277211578e-05, - "loss": 1.379, + "loss": 1.3798, "step": 2580 }, { "epoch": 0.1756352765321375, - "grad_norm": 0.5870862603187561, + "grad_norm": 0.5901484489440918, "learning_rate": 4.8902279521674145e-05, - "loss": 1.3191, + "loss": 1.3184, "step": 2585 }, { "epoch": 0.1759749966027993, - "grad_norm": 0.6346774697303772, + "grad_norm": 0.6334368586540222, "learning_rate": 4.890015627123251e-05, - "loss": 1.4171, + "loss": 1.4167, "step": 2590 }, { "epoch": 0.17631471667346107, - "grad_norm": 0.5704734325408936, + "grad_norm": 0.5719745755195618, "learning_rate": 4.889803302079087e-05, - "loss": 1.4367, + "loss": 1.4364, "step": 2595 }, { "epoch": 0.17665443674412284, - "grad_norm": 0.6515428423881531, + "grad_norm": 0.6542099118232727, "learning_rate": 4.889590977034924e-05, - "loss": 1.4435, + "loss": 1.4431, "step": 2600 }, { "epoch": 0.17699415681478461, - "grad_norm": 0.6457876563072205, + "grad_norm": 0.6459152698516846, "learning_rate": 4.8893786519907594e-05, "loss": 1.4706, "step": 2605 }, { "epoch": 0.17733387688544638, - "grad_norm": 0.6365540623664856, + "grad_norm": 0.6380369663238525, "learning_rate": 4.8891663269465965e-05, - "loss": 1.3534, + "loss": 1.3538, "step": 2610 }, { "epoch": 0.17767359695610815, - "grad_norm": 0.5984424352645874, + "grad_norm": 0.6021358966827393, "learning_rate": 4.888954001902433e-05, - "loss": 1.4365, + "loss": 1.4366, "step": 2615 }, { "epoch": 0.17801331702676995, - "grad_norm": 0.7218442559242249, + "grad_norm": 0.7277151942253113, "learning_rate": 4.8887416768582686e-05, - "loss": 1.4468, + "loss": 1.4471, "step": 2620 }, { "epoch": 0.17835303709743172, - "grad_norm": 0.5970467925071716, + "grad_norm": 0.5979676842689514, "learning_rate": 4.888529351814106e-05, - "loss": 1.3557, + "loss": 1.3554, "step": 2625 }, { "epoch": 0.1786927571680935, - "grad_norm": 0.6055663228034973, + "grad_norm": 0.6203859448432922, "learning_rate": 4.888317026769942e-05, - "loss": 1.3872, + "loss": 1.3862, "step": 2630 }, { "epoch": 0.17903247723875526, - "grad_norm": 0.5495836138725281, + "grad_norm": 0.5479990839958191, "learning_rate": 4.888104701725778e-05, - "loss": 1.5108, + "loss": 1.5111, "step": 2635 }, { "epoch": 0.17937219730941703, - "grad_norm": 0.6123842597007751, + "grad_norm": 0.6130913496017456, "learning_rate": 4.887892376681615e-05, - "loss": 1.4518, + "loss": 1.4515, "step": 2640 }, { "epoch": 0.17971191738007883, - "grad_norm": 0.6095494031906128, + "grad_norm": 0.6137251257896423, "learning_rate": 4.887680051637451e-05, - "loss": 1.4356, + "loss": 1.4351, "step": 2645 }, { "epoch": 0.1800516374507406, - "grad_norm": 0.6432989239692688, + "grad_norm": 0.6461949944496155, "learning_rate": 4.887467726593287e-05, "loss": 1.546, "step": 2650 }, { "epoch": 0.18039135752140237, - "grad_norm": 0.5585070848464966, + "grad_norm": 0.5597972869873047, "learning_rate": 4.887255401549124e-05, - "loss": 1.4513, + "loss": 1.4511, "step": 2655 }, { "epoch": 0.18073107759206414, - "grad_norm": 0.6016882658004761, + "grad_norm": 0.6015893816947937, "learning_rate": 4.8870430765049605e-05, - "loss": 1.3715, + "loss": 1.3714, "step": 2660 }, { "epoch": 0.1810707976627259, - "grad_norm": 0.6021006107330322, + "grad_norm": 0.6013908982276917, "learning_rate": 4.886830751460796e-05, - "loss": 1.4023, + "loss": 1.4015, "step": 2665 }, { "epoch": 0.18141051773338768, - "grad_norm": 0.6008695363998413, + "grad_norm": 0.5986375212669373, "learning_rate": 4.886618426416633e-05, "loss": 1.3748, "step": 2670 }, { "epoch": 0.18175023780404947, - "grad_norm": 0.5484296083450317, + "grad_norm": 0.5517350435256958, "learning_rate": 4.886406101372469e-05, - "loss": 1.4534, + "loss": 1.4536, "step": 2675 }, { "epoch": 0.18208995787471124, - "grad_norm": 0.6692857146263123, + "grad_norm": 0.670529305934906, "learning_rate": 4.8861937763283054e-05, - "loss": 1.4976, + "loss": 1.4975, "step": 2680 }, { "epoch": 0.18242967794537301, - "grad_norm": 0.6512504816055298, + "grad_norm": 0.6527950167655945, "learning_rate": 4.8859814512841425e-05, - "loss": 1.5091, + "loss": 1.509, "step": 2685 }, { "epoch": 0.18276939801603478, - "grad_norm": 0.574049174785614, + "grad_norm": 0.5700352191925049, "learning_rate": 4.885769126239978e-05, - "loss": 1.4236, + "loss": 1.4237, "step": 2690 }, { "epoch": 0.18310911808669655, - "grad_norm": 0.6614393591880798, + "grad_norm": 0.6623737812042236, "learning_rate": 4.8855568011958146e-05, - "loss": 1.4393, + "loss": 1.4396, "step": 2695 }, { "epoch": 0.18344883815735832, - "grad_norm": 0.5550003051757812, + "grad_norm": 0.5555988550186157, "learning_rate": 4.885344476151652e-05, - "loss": 1.4089, + "loss": 1.4092, "step": 2700 }, { "epoch": 0.18378855822802012, - "grad_norm": 0.5351262092590332, + "grad_norm": 0.5355735421180725, "learning_rate": 4.8851321511074874e-05, - "loss": 1.3451, + "loss": 1.3458, "step": 2705 }, { "epoch": 0.1841282782986819, - "grad_norm": 0.63856440782547, + "grad_norm": 0.6372920274734497, "learning_rate": 4.884919826063324e-05, - "loss": 1.3656, + "loss": 1.3651, "step": 2710 }, { "epoch": 0.18446799836934366, - "grad_norm": 0.6448277831077576, + "grad_norm": 0.6451325416564941, "learning_rate": 4.884707501019161e-05, - "loss": 1.4136, + "loss": 1.4137, "step": 2715 }, { "epoch": 0.18480771844000543, - "grad_norm": 0.6709998250007629, + "grad_norm": 0.6668452024459839, "learning_rate": 4.8844951759749966e-05, "loss": 1.3612, "step": 2720 }, { "epoch": 0.1851474385106672, - "grad_norm": 0.6180108785629272, + "grad_norm": 0.6308901309967041, "learning_rate": 4.884282850930833e-05, - "loss": 1.4253, + "loss": 1.4246, "step": 2725 }, { "epoch": 0.185487158581329, - "grad_norm": 0.6439008712768555, + "grad_norm": 0.6463460922241211, "learning_rate": 4.88407052588667e-05, - "loss": 1.4858, + "loss": 1.4863, "step": 2730 }, { "epoch": 0.18582687865199077, - "grad_norm": 0.5395865440368652, + "grad_norm": 0.540107250213623, "learning_rate": 4.883858200842506e-05, - "loss": 1.4399, + "loss": 1.4405, "step": 2735 }, { "epoch": 0.18616659872265254, - "grad_norm": 0.607215940952301, + "grad_norm": 0.6086602807044983, "learning_rate": 4.883645875798342e-05, - "loss": 1.5088, + "loss": 1.5089, "step": 2740 }, { "epoch": 0.1865063187933143, - "grad_norm": 0.6101176142692566, + "grad_norm": 0.6101647019386292, "learning_rate": 4.8834335507541786e-05, - "loss": 1.3556, + "loss": 1.3563, "step": 2745 }, { "epoch": 0.18684603886397608, - "grad_norm": 0.6299176216125488, + "grad_norm": 0.6288184523582458, "learning_rate": 4.883221225710015e-05, - "loss": 1.3241, + "loss": 1.3238, "step": 2750 }, { "epoch": 0.18718575893463785, - "grad_norm": 0.5913377404212952, + "grad_norm": 0.5928890109062195, "learning_rate": 4.8830089006658514e-05, - "loss": 1.5029, + "loss": 1.5026, "step": 2755 }, { "epoch": 0.18752547900529964, - "grad_norm": 0.6693141460418701, + "grad_norm": 0.6701631546020508, "learning_rate": 4.882796575621688e-05, - "loss": 1.5591, + "loss": 1.5589, "step": 2760 }, { "epoch": 0.18786519907596141, - "grad_norm": 0.6031198501586914, + "grad_norm": 0.6082215309143066, "learning_rate": 4.882584250577524e-05, - "loss": 1.4366, + "loss": 1.4376, "step": 2765 }, { "epoch": 0.18820491914662318, - "grad_norm": 0.5741726160049438, + "grad_norm": 0.576653242111206, "learning_rate": 4.8823719255333606e-05, - "loss": 1.4799, + "loss": 1.4803, "step": 2770 }, { "epoch": 0.18854463921728495, - "grad_norm": 0.6373092532157898, + "grad_norm": 0.6377544403076172, "learning_rate": 4.882159600489197e-05, - "loss": 1.5948, + "loss": 1.5944, "step": 2775 }, { "epoch": 0.18888435928794672, - "grad_norm": 0.6143050789833069, + "grad_norm": 0.6133949160575867, "learning_rate": 4.8819472754450334e-05, - "loss": 1.4609, + "loss": 1.461, "step": 2780 }, { "epoch": 0.1892240793586085, - "grad_norm": 0.6247331500053406, + "grad_norm": 0.6236678957939148, "learning_rate": 4.88173495040087e-05, - "loss": 1.4965, + "loss": 1.4967, "step": 2785 }, { "epoch": 0.1895637994292703, - "grad_norm": 0.6008833050727844, + "grad_norm": 0.60072922706604, "learning_rate": 4.881522625356706e-05, - "loss": 1.4772, + "loss": 1.4759, "step": 2790 }, { "epoch": 0.18990351949993206, - "grad_norm": 0.5636752843856812, + "grad_norm": 0.5638582110404968, "learning_rate": 4.8813103003125426e-05, - "loss": 1.3657, + "loss": 1.3656, "step": 2795 }, { "epoch": 0.19024323957059383, - "grad_norm": 0.6358240842819214, + "grad_norm": 0.6338021755218506, "learning_rate": 4.881097975268379e-05, - "loss": 1.337, + "loss": 1.3368, "step": 2800 }, { "epoch": 0.1905829596412556, - "grad_norm": 0.565941333770752, + "grad_norm": 0.5661375522613525, "learning_rate": 4.8808856502242154e-05, - "loss": 1.3339, + "loss": 1.3341, "step": 2805 }, { "epoch": 0.19092267971191737, - "grad_norm": 0.5850551128387451, + "grad_norm": 0.5836489200592041, "learning_rate": 4.880673325180052e-05, - "loss": 1.352, + "loss": 1.3523, "step": 2810 }, { "epoch": 0.19126239978257917, - "grad_norm": 0.5507126450538635, + "grad_norm": 0.5482072234153748, "learning_rate": 4.880461000135888e-05, - "loss": 1.4257, + "loss": 1.426, "step": 2815 }, { "epoch": 0.19160211985324094, - "grad_norm": 0.6068183183670044, + "grad_norm": 0.6090626120567322, "learning_rate": 4.8802486750917246e-05, - "loss": 1.5025, + "loss": 1.5029, "step": 2820 }, { "epoch": 0.1919418399239027, - "grad_norm": 0.5308849215507507, + "grad_norm": 0.5296637415885925, "learning_rate": 4.880036350047561e-05, - "loss": 1.4407, + "loss": 1.4406, "step": 2825 }, { "epoch": 0.19228155999456448, - "grad_norm": 0.587657630443573, + "grad_norm": 0.5892622470855713, "learning_rate": 4.8798240250033974e-05, - "loss": 1.4402, + "loss": 1.4403, "step": 2830 }, { "epoch": 0.19262128006522625, - "grad_norm": 0.6322230100631714, + "grad_norm": 0.6317263245582581, "learning_rate": 4.879611699959234e-05, - "loss": 1.3885, + "loss": 1.3883, "step": 2835 }, { "epoch": 0.19296100013588802, - "grad_norm": 0.4599972069263458, + "grad_norm": 0.4616105854511261, "learning_rate": 4.87939937491507e-05, - "loss": 1.3561, + "loss": 1.356, "step": 2840 }, { "epoch": 0.19330072020654981, - "grad_norm": 0.5593650937080383, + "grad_norm": 0.559741735458374, "learning_rate": 4.8791870498709066e-05, - "loss": 1.4226, + "loss": 1.4227, "step": 2845 }, { "epoch": 0.19364044027721158, - "grad_norm": 0.6431747674942017, + "grad_norm": 0.6514472365379333, "learning_rate": 4.878974724826743e-05, - "loss": 1.4919, + "loss": 1.4911, "step": 2850 }, { "epoch": 0.19398016034787335, - "grad_norm": 0.553047776222229, + "grad_norm": 0.5514373779296875, "learning_rate": 4.8787623997825794e-05, - "loss": 1.4808, + "loss": 1.4806, "step": 2855 }, { "epoch": 0.19431988041853512, - "grad_norm": 0.574626624584198, + "grad_norm": 0.5796857476234436, "learning_rate": 4.878550074738416e-05, - "loss": 1.4788, + "loss": 1.4779, "step": 2860 }, { "epoch": 0.1946596004891969, - "grad_norm": 0.498710960149765, + "grad_norm": 0.4963955283164978, "learning_rate": 4.878337749694252e-05, - "loss": 1.3874, + "loss": 1.3878, "step": 2865 }, { "epoch": 0.19499932055985866, - "grad_norm": 0.5680547952651978, + "grad_norm": 0.5704041719436646, "learning_rate": 4.8781254246500886e-05, - "loss": 1.4522, + "loss": 1.4525, "step": 2870 }, { "epoch": 0.19533904063052046, - "grad_norm": 0.6454386115074158, + "grad_norm": 0.6432074904441833, "learning_rate": 4.8779130996059244e-05, - "loss": 1.4065, + "loss": 1.4063, "step": 2875 }, { "epoch": 0.19567876070118223, - "grad_norm": 0.6895508170127869, + "grad_norm": 0.6874633431434631, "learning_rate": 4.8777007745617614e-05, - "loss": 1.4277, + "loss": 1.4273, "step": 2880 }, { "epoch": 0.196018480771844, - "grad_norm": 0.6072851419448853, + "grad_norm": 0.6058900952339172, "learning_rate": 4.877488449517598e-05, - "loss": 1.4315, + "loss": 1.4317, "step": 2885 }, { "epoch": 0.19635820084250577, - "grad_norm": 0.5200562477111816, + "grad_norm": 0.5183410048484802, "learning_rate": 4.8772761244734336e-05, - "loss": 1.3517, + "loss": 1.3516, "step": 2890 }, { "epoch": 0.19669792091316754, - "grad_norm": 0.547443151473999, + "grad_norm": 0.5470945835113525, "learning_rate": 4.8770637994292706e-05, - "loss": 1.324, + "loss": 1.3235, "step": 2895 }, { "epoch": 0.19703764098382934, - "grad_norm": 0.7084436416625977, + "grad_norm": 0.7027068138122559, "learning_rate": 4.876851474385107e-05, - "loss": 1.3544, + "loss": 1.3549, "step": 2900 }, { "epoch": 0.1973773610544911, - "grad_norm": 0.5730024576187134, + "grad_norm": 0.5728862285614014, "learning_rate": 4.8766391493409434e-05, - "loss": 1.4499, + "loss": 1.4507, "step": 2905 }, { "epoch": 0.19771708112515288, - "grad_norm": 0.6365293860435486, + "grad_norm": 0.6337721347808838, "learning_rate": 4.87642682429678e-05, - "loss": 1.4875, + "loss": 1.4878, "step": 2910 }, { "epoch": 0.19805680119581465, - "grad_norm": 0.7301263809204102, + "grad_norm": 0.7279835939407349, "learning_rate": 4.876214499252616e-05, - "loss": 1.4084, + "loss": 1.4082, "step": 2915 }, { "epoch": 0.19839652126647642, - "grad_norm": 0.6117473840713501, + "grad_norm": 0.6118570566177368, "learning_rate": 4.8760021742084526e-05, "loss": 1.4031, "step": 2920 }, { "epoch": 0.1987362413371382, - "grad_norm": 0.5654906034469604, + "grad_norm": 0.5653524398803711, "learning_rate": 4.875789849164289e-05, - "loss": 1.3552, + "loss": 1.3548, "step": 2925 }, { "epoch": 0.19907596140779998, - "grad_norm": 0.5986347794532776, + "grad_norm": 0.5978901386260986, "learning_rate": 4.8755775241201254e-05, - "loss": 1.3089, + "loss": 1.3095, "step": 2930 }, { "epoch": 0.19941568147846175, - "grad_norm": 0.5379148125648499, + "grad_norm": 0.5396924614906311, "learning_rate": 4.875365199075962e-05, - "loss": 1.4151, + "loss": 1.4153, "step": 2935 }, { "epoch": 0.19975540154912352, - "grad_norm": 0.6762790679931641, + "grad_norm": 0.6776423454284668, "learning_rate": 4.875152874031798e-05, "loss": 1.3418, "step": 2940 }, { "epoch": 0.2000951216197853, - "grad_norm": 0.6298921704292297, + "grad_norm": 0.627900242805481, "learning_rate": 4.874940548987634e-05, - "loss": 1.4135, + "loss": 1.4134, "step": 2945 }, { "epoch": 0.20043484169044706, - "grad_norm": 0.6401534080505371, + "grad_norm": 0.6267917156219482, "learning_rate": 4.874728223943471e-05, "loss": 1.4582, "step": 2950 }, { "epoch": 0.20077456176110883, - "grad_norm": 0.66017085313797, + "grad_norm": 0.6598471403121948, "learning_rate": 4.8745158988993074e-05, - "loss": 1.4479, + "loss": 1.4478, "step": 2955 }, { "epoch": 0.20111428183177063, - "grad_norm": 0.5370712280273438, + "grad_norm": 0.5372079014778137, "learning_rate": 4.874303573855143e-05, - "loss": 1.4445, + "loss": 1.4456, "step": 2960 }, { "epoch": 0.2014540019024324, - "grad_norm": 0.5681670904159546, + "grad_norm": 0.568881094455719, "learning_rate": 4.87409124881098e-05, - "loss": 1.4424, + "loss": 1.4425, "step": 2965 }, { "epoch": 0.20179372197309417, - "grad_norm": 0.564467191696167, + "grad_norm": 0.5657125115394592, "learning_rate": 4.8738789237668166e-05, - "loss": 1.4026, + "loss": 1.4032, "step": 2970 }, { "epoch": 0.20213344204375594, - "grad_norm": 0.5779341459274292, + "grad_norm": 0.5747621655464172, "learning_rate": 4.8736665987226524e-05, - "loss": 1.4891, + "loss": 1.4888, "step": 2975 }, { "epoch": 0.2024731621144177, - "grad_norm": 0.5849860310554504, + "grad_norm": 0.5849245190620422, "learning_rate": 4.8734542736784894e-05, "loss": 1.4685, "step": 2980 }, { "epoch": 0.2028128821850795, - "grad_norm": 0.5843042135238647, + "grad_norm": 0.5853272676467896, "learning_rate": 4.873241948634326e-05, "loss": 1.4741, "step": 2985 }, { "epoch": 0.20315260225574128, - "grad_norm": 0.6117401719093323, + "grad_norm": 0.6143925189971924, "learning_rate": 4.8730296235901616e-05, - "loss": 1.4677, + "loss": 1.4676, "step": 2990 }, { "epoch": 0.20349232232640305, - "grad_norm": 0.5531707406044006, + "grad_norm": 0.5527306795120239, "learning_rate": 4.8728172985459986e-05, "loss": 1.4241, "step": 2995 }, { "epoch": 0.20383204239706482, - "grad_norm": 0.5719231963157654, + "grad_norm": 0.573491632938385, "learning_rate": 4.872604973501835e-05, - "loss": 1.3577, + "loss": 1.3582, "step": 3000 }, { "epoch": 0.2041717624677266, - "grad_norm": 0.6049824953079224, + "grad_norm": 0.6034144163131714, "learning_rate": 4.872392648457671e-05, - "loss": 1.3973, + "loss": 1.3972, "step": 3005 }, { "epoch": 0.20451148253838836, - "grad_norm": 0.6646301746368408, + "grad_norm": 0.6673248410224915, "learning_rate": 4.872180323413508e-05, - "loss": 1.3812, + "loss": 1.3804, "step": 3010 }, { "epoch": 0.20485120260905015, - "grad_norm": 0.6004433631896973, + "grad_norm": 0.6003086566925049, "learning_rate": 4.871967998369344e-05, - "loss": 1.4524, + "loss": 1.4518, "step": 3015 }, { "epoch": 0.20519092267971192, - "grad_norm": 0.6404062509536743, + "grad_norm": 0.6409633755683899, "learning_rate": 4.87175567332518e-05, - "loss": 1.424, + "loss": 1.4242, "step": 3020 }, { "epoch": 0.2055306427503737, - "grad_norm": 0.6715559363365173, + "grad_norm": 0.6687463521957397, "learning_rate": 4.871543348281017e-05, "loss": 1.3809, "step": 3025 }, { "epoch": 0.20587036282103546, - "grad_norm": 0.655387818813324, + "grad_norm": 0.6560158133506775, "learning_rate": 4.871331023236853e-05, - "loss": 1.4434, + "loss": 1.4433, "step": 3030 }, { "epoch": 0.20621008289169723, - "grad_norm": 0.5823382139205933, + "grad_norm": 0.5824029445648193, "learning_rate": 4.871118698192689e-05, - "loss": 1.5534, + "loss": 1.553, "step": 3035 }, { "epoch": 0.206549802962359, - "grad_norm": 0.5753922462463379, + "grad_norm": 0.5668882727622986, "learning_rate": 4.870906373148526e-05, - "loss": 1.3642, + "loss": 1.3641, "step": 3040 }, { "epoch": 0.2068895230330208, - "grad_norm": 0.6257250905036926, + "grad_norm": 0.6222567558288574, "learning_rate": 4.870694048104362e-05, - "loss": 1.486, + "loss": 1.4847, "step": 3045 }, { "epoch": 0.20722924310368257, - "grad_norm": 0.5532388687133789, + "grad_norm": 0.5509527325630188, "learning_rate": 4.8704817230601984e-05, - "loss": 1.338, + "loss": 1.3379, "step": 3050 }, { "epoch": 0.20756896317434434, - "grad_norm": 0.6227984428405762, + "grad_norm": 0.6247271299362183, "learning_rate": 4.8702693980160355e-05, - "loss": 1.3807, + "loss": 1.3809, "step": 3055 }, { "epoch": 0.2079086832450061, - "grad_norm": 0.5318966507911682, + "grad_norm": 0.5325573086738586, "learning_rate": 4.870057072971871e-05, - "loss": 1.406, + "loss": 1.4056, "step": 3060 }, { "epoch": 0.20824840331566788, - "grad_norm": 0.5637968182563782, + "grad_norm": 0.5652135014533997, "learning_rate": 4.8698447479277076e-05, - "loss": 1.3692, + "loss": 1.3696, "step": 3065 }, { "epoch": 0.20858812338632968, - "grad_norm": 0.5600734353065491, + "grad_norm": 0.556892991065979, "learning_rate": 4.8696324228835447e-05, "loss": 1.4261, "step": 3070 }, { "epoch": 0.20892784345699145, - "grad_norm": 0.5506009459495544, + "grad_norm": 0.5482226014137268, "learning_rate": 4.8694200978393804e-05, - "loss": 1.392, + "loss": 1.3921, "step": 3075 }, { "epoch": 0.20926756352765322, - "grad_norm": 0.5840604305267334, + "grad_norm": 0.5838587284088135, "learning_rate": 4.869207772795217e-05, - "loss": 1.4166, + "loss": 1.4155, "step": 3080 }, { "epoch": 0.209607283598315, - "grad_norm": 0.5566538572311401, + "grad_norm": 0.5585147142410278, "learning_rate": 4.868995447751054e-05, - "loss": 1.5199, + "loss": 1.5201, "step": 3085 }, { "epoch": 0.20994700366897676, - "grad_norm": 0.6005268692970276, + "grad_norm": 0.5999969244003296, "learning_rate": 4.8687831227068896e-05, - "loss": 1.4194, + "loss": 1.4198, "step": 3090 }, { "epoch": 0.21028672373963853, - "grad_norm": 0.5995813608169556, + "grad_norm": 0.6017451882362366, "learning_rate": 4.868570797662726e-05, - "loss": 1.4782, + "loss": 1.4779, "step": 3095 }, { "epoch": 0.21062644381030032, - "grad_norm": 0.6140902042388916, + "grad_norm": 0.6124809980392456, "learning_rate": 4.8683584726185624e-05, - "loss": 1.4995, + "loss": 1.5, "step": 3100 }, { "epoch": 0.2109661638809621, - "grad_norm": 0.8333086967468262, + "grad_norm": 0.8275682926177979, "learning_rate": 4.868146147574399e-05, - "loss": 1.2436, + "loss": 1.2434, "step": 3105 }, { "epoch": 0.21130588395162386, - "grad_norm": 0.6089223027229309, + "grad_norm": 0.6090074181556702, "learning_rate": 4.867933822530235e-05, "loss": 1.4624, "step": 3110 }, { "epoch": 0.21164560402228563, - "grad_norm": 0.6053603291511536, + "grad_norm": 0.605712354183197, "learning_rate": 4.8677214974860716e-05, - "loss": 1.437, + "loss": 1.4379, "step": 3115 }, { "epoch": 0.2119853240929474, - "grad_norm": 0.6245133876800537, + "grad_norm": 0.6314756274223328, "learning_rate": 4.867509172441908e-05, - "loss": 1.379, + "loss": 1.3789, "step": 3120 }, { "epoch": 0.21232504416360917, - "grad_norm": 0.5423717498779297, + "grad_norm": 0.5423336029052734, "learning_rate": 4.8672968473977444e-05, - "loss": 1.4302, + "loss": 1.4307, "step": 3125 }, { "epoch": 0.21266476423427097, - "grad_norm": 0.550070583820343, + "grad_norm": 0.5594645738601685, "learning_rate": 4.867084522353581e-05, - "loss": 1.3904, + "loss": 1.3906, "step": 3130 }, { "epoch": 0.21300448430493274, - "grad_norm": 0.5501512289047241, + "grad_norm": 0.5493056178092957, "learning_rate": 4.866872197309417e-05, "loss": 1.4069, "step": 3135 }, { "epoch": 0.2133442043755945, - "grad_norm": 0.6174147725105286, + "grad_norm": 0.6178697943687439, "learning_rate": 4.8666598722652536e-05, - "loss": 1.3189, + "loss": 1.3188, "step": 3140 }, { "epoch": 0.21368392444625628, - "grad_norm": 0.6710749268531799, + "grad_norm": 0.6707695722579956, "learning_rate": 4.86644754722109e-05, - "loss": 1.372, + "loss": 1.3721, "step": 3145 }, { "epoch": 0.21402364451691805, - "grad_norm": 0.7570081949234009, + "grad_norm": 0.7310335636138916, "learning_rate": 4.8662352221769264e-05, "loss": 1.421, "step": 3150 }, { "epoch": 0.21436336458757985, - "grad_norm": 0.6627448201179504, + "grad_norm": 0.658669650554657, "learning_rate": 4.866022897132763e-05, - "loss": 1.3698, + "loss": 1.3695, "step": 3155 }, { "epoch": 0.21470308465824162, - "grad_norm": 0.5601295232772827, + "grad_norm": 0.5620143413543701, "learning_rate": 4.865810572088599e-05, - "loss": 1.4537, + "loss": 1.4535, "step": 3160 }, { "epoch": 0.2150428047289034, - "grad_norm": 0.577513575553894, + "grad_norm": 0.580302357673645, "learning_rate": 4.8655982470444356e-05, - "loss": 1.5005, + "loss": 1.5008, "step": 3165 }, { "epoch": 0.21538252479956516, - "grad_norm": 0.5896574854850769, + "grad_norm": 0.5908560156822205, "learning_rate": 4.865385922000272e-05, - "loss": 1.2676, + "loss": 1.2674, "step": 3170 }, { "epoch": 0.21572224487022693, - "grad_norm": 0.536129355430603, + "grad_norm": 0.538489043712616, "learning_rate": 4.8651735969561084e-05, - "loss": 1.4556, + "loss": 1.4563, "step": 3175 }, { "epoch": 0.2160619649408887, - "grad_norm": 0.5639377236366272, + "grad_norm": 0.5643789172172546, "learning_rate": 4.864961271911945e-05, - "loss": 1.3672, + "loss": 1.367, "step": 3180 }, { "epoch": 0.2164016850115505, - "grad_norm": 0.564048171043396, + "grad_norm": 0.5664741396903992, "learning_rate": 4.864748946867781e-05, "loss": 1.3609, "step": 3185 }, { "epoch": 0.21674140508221226, - "grad_norm": 0.604621946811676, + "grad_norm": 0.606351912021637, "learning_rate": 4.8645366218236176e-05, - "loss": 1.3545, + "loss": 1.3541, "step": 3190 }, { "epoch": 0.21708112515287403, - "grad_norm": 0.5847653150558472, + "grad_norm": 0.5868536233901978, "learning_rate": 4.864324296779454e-05, - "loss": 1.4451, + "loss": 1.4457, "step": 3195 }, { "epoch": 0.2174208452235358, - "grad_norm": 0.643519937992096, + "grad_norm": 0.6379307508468628, "learning_rate": 4.8641119717352904e-05, - "loss": 1.5368, + "loss": 1.5369, "step": 3200 }, { "epoch": 0.21776056529419757, - "grad_norm": 0.6260031461715698, + "grad_norm": 0.6265453696250916, "learning_rate": 4.863899646691127e-05, - "loss": 1.359, + "loss": 1.3587, "step": 3205 }, { "epoch": 0.21810028536485934, - "grad_norm": 0.7368310689926147, + "grad_norm": 0.7493308186531067, "learning_rate": 4.863687321646963e-05, - "loss": 1.4048, + "loss": 1.4052, "step": 3210 }, { "epoch": 0.21844000543552114, - "grad_norm": 0.5465880036354065, + "grad_norm": 0.5469814538955688, "learning_rate": 4.8634749966027996e-05, - "loss": 1.4155, + "loss": 1.4159, "step": 3215 }, { "epoch": 0.2187797255061829, - "grad_norm": 0.691990852355957, + "grad_norm": 0.6846829056739807, "learning_rate": 4.863262671558636e-05, - "loss": 1.4225, + "loss": 1.4228, "step": 3220 }, { "epoch": 0.21911944557684468, - "grad_norm": 0.5825974941253662, + "grad_norm": 0.5838154554367065, "learning_rate": 4.8630503465144724e-05, - "loss": 1.3682, + "loss": 1.368, "step": 3225 }, { "epoch": 0.21945916564750645, - "grad_norm": 0.5493308305740356, + "grad_norm": 0.5501458048820496, "learning_rate": 4.862838021470308e-05, - "loss": 1.3828, + "loss": 1.3831, "step": 3230 }, { "epoch": 0.21979888571816822, - "grad_norm": 0.6052065491676331, + "grad_norm": 0.6049789786338806, "learning_rate": 4.862625696426145e-05, - "loss": 1.4233, + "loss": 1.4229, "step": 3235 }, { "epoch": 0.22013860578883002, - "grad_norm": 0.6335605978965759, + "grad_norm": 0.6357119083404541, "learning_rate": 4.8624133713819816e-05, - "loss": 1.4469, + "loss": 1.4468, "step": 3240 }, { "epoch": 0.22047832585949179, - "grad_norm": 0.612486720085144, + "grad_norm": 0.6167799234390259, "learning_rate": 4.862201046337818e-05, - "loss": 1.4722, + "loss": 1.4723, "step": 3245 }, { "epoch": 0.22081804593015356, - "grad_norm": 0.6906188130378723, + "grad_norm": 0.6940762400627136, "learning_rate": 4.8619887212936544e-05, - "loss": 1.4672, + "loss": 1.4679, "step": 3250 }, { "epoch": 0.22115776600081533, - "grad_norm": 0.5844310522079468, + "grad_norm": 0.5851900577545166, "learning_rate": 4.861776396249491e-05, - "loss": 1.4467, + "loss": 1.4463, "step": 3255 }, { "epoch": 0.2214974860714771, - "grad_norm": 0.6048876643180847, + "grad_norm": 0.607383668422699, "learning_rate": 4.861564071205327e-05, - "loss": 1.4849, + "loss": 1.4852, "step": 3260 }, { "epoch": 0.22183720614213887, - "grad_norm": 0.6119795441627502, + "grad_norm": 0.6150192618370056, "learning_rate": 4.8613517461611636e-05, - "loss": 1.4486, + "loss": 1.4489, "step": 3265 }, { "epoch": 0.22217692621280066, - "grad_norm": 0.6853047013282776, + "grad_norm": 0.6845738291740417, "learning_rate": 4.861139421117e-05, - "loss": 1.3078, + "loss": 1.3079, "step": 3270 }, { "epoch": 0.22251664628346243, - "grad_norm": 0.5909206867218018, + "grad_norm": 0.5912994146347046, "learning_rate": 4.8609270960728364e-05, - "loss": 1.4173, + "loss": 1.4172, "step": 3275 }, { "epoch": 0.2228563663541242, - "grad_norm": 0.5987277626991272, + "grad_norm": 0.5995525121688843, "learning_rate": 4.860714771028673e-05, "loss": 1.4653, "step": 3280 }, { "epoch": 0.22319608642478597, - "grad_norm": 0.6398468613624573, + "grad_norm": 0.6410917639732361, "learning_rate": 4.860502445984509e-05, - "loss": 1.4372, + "loss": 1.4374, "step": 3285 }, { "epoch": 0.22353580649544774, - "grad_norm": 0.6329526901245117, + "grad_norm": 0.6326131224632263, "learning_rate": 4.8602901209403456e-05, - "loss": 1.3694, + "loss": 1.3696, "step": 3290 }, { "epoch": 0.22387552656610954, - "grad_norm": 0.5358933806419373, + "grad_norm": 0.5359666347503662, "learning_rate": 4.860077795896182e-05, - "loss": 1.3054, + "loss": 1.3062, "step": 3295 }, { "epoch": 0.2242152466367713, - "grad_norm": 0.5731387734413147, + "grad_norm": 0.5736616849899292, "learning_rate": 4.859865470852018e-05, - "loss": 1.4455, + "loss": 1.4446, "step": 3300 }, { "epoch": 0.22455496670743308, - "grad_norm": 0.5447611212730408, + "grad_norm": 0.5416898727416992, "learning_rate": 4.859653145807855e-05, - "loss": 1.5764, + "loss": 1.5761, "step": 3305 }, { "epoch": 0.22489468677809485, - "grad_norm": 0.5443009734153748, + "grad_norm": 0.5432747602462769, "learning_rate": 4.859440820763691e-05, - "loss": 1.3469, + "loss": 1.3473, "step": 3310 }, { "epoch": 0.22523440684875662, - "grad_norm": 0.6105886697769165, + "grad_norm": 0.6102696061134338, "learning_rate": 4.859228495719527e-05, - "loss": 1.3663, + "loss": 1.367, "step": 3315 }, { "epoch": 0.2255741269194184, - "grad_norm": 0.5600051879882812, + "grad_norm": 0.5593622922897339, "learning_rate": 4.859016170675364e-05, - "loss": 1.325, + "loss": 1.3252, "step": 3320 }, { "epoch": 0.22591384699008019, - "grad_norm": 0.5838090777397156, + "grad_norm": 0.584184467792511, "learning_rate": 4.8588038456312004e-05, - "loss": 1.3707, + "loss": 1.371, "step": 3325 }, { "epoch": 0.22625356706074196, - "grad_norm": 0.5875625014305115, + "grad_norm": 0.5876114368438721, "learning_rate": 4.858591520587036e-05, - "loss": 1.5171, + "loss": 1.5178, "step": 3330 }, { "epoch": 0.22659328713140373, - "grad_norm": 0.5414556860923767, + "grad_norm": 0.5425088405609131, "learning_rate": 4.858379195542873e-05, - "loss": 1.3891, + "loss": 1.3889, "step": 3335 }, { "epoch": 0.2269330072020655, - "grad_norm": 0.6923336982727051, + "grad_norm": 0.6951043605804443, "learning_rate": 4.8581668704987096e-05, - "loss": 1.4551, + "loss": 1.4552, "step": 3340 }, { "epoch": 0.22727272727272727, - "grad_norm": 0.5776564478874207, + "grad_norm": 0.5775253176689148, "learning_rate": 4.857954545454545e-05, - "loss": 1.3825, + "loss": 1.3823, "step": 3345 }, { "epoch": 0.22761244734338903, - "grad_norm": 0.6969068050384521, + "grad_norm": 0.695652425289154, "learning_rate": 4.8577422204103824e-05, - "loss": 1.4429, + "loss": 1.4427, "step": 3350 }, { "epoch": 0.22795216741405083, - "grad_norm": 0.6452533006668091, + "grad_norm": 0.6455203890800476, "learning_rate": 4.857529895366219e-05, - "loss": 1.356, + "loss": 1.3558, "step": 3355 }, { "epoch": 0.2282918874847126, - "grad_norm": 0.5599813461303711, + "grad_norm": 0.5605155825614929, "learning_rate": 4.8573175703220545e-05, - "loss": 1.4299, + "loss": 1.4297, "step": 3360 }, { "epoch": 0.22863160755537437, - "grad_norm": 0.6101433038711548, + "grad_norm": 0.6076835989952087, "learning_rate": 4.8571052452778916e-05, - "loss": 1.48, + "loss": 1.4801, "step": 3365 }, { "epoch": 0.22897132762603614, - "grad_norm": 0.517551064491272, + "grad_norm": 0.5249786376953125, "learning_rate": 4.856892920233727e-05, "loss": 1.3787, "step": 3370 }, { "epoch": 0.2293110476966979, - "grad_norm": 0.59132319688797, + "grad_norm": 0.5931435823440552, "learning_rate": 4.856680595189564e-05, - "loss": 1.3269, + "loss": 1.3274, "step": 3375 }, { "epoch": 0.2296507677673597, - "grad_norm": 0.6300364136695862, + "grad_norm": 0.6320357918739319, "learning_rate": 4.856468270145401e-05, - "loss": 1.4273, + "loss": 1.4266, "step": 3380 }, { "epoch": 0.22999048783802148, - "grad_norm": 0.5361374616622925, + "grad_norm": 0.5326262712478638, "learning_rate": 4.8562559451012365e-05, - "loss": 1.4379, + "loss": 1.4386, "step": 3385 }, { "epoch": 0.23033020790868325, - "grad_norm": 0.5496021509170532, + "grad_norm": 0.549776017665863, "learning_rate": 4.856043620057073e-05, - "loss": 1.4591, + "loss": 1.4587, "step": 3390 }, { "epoch": 0.23066992797934502, - "grad_norm": 0.5989214181900024, + "grad_norm": 0.597612738609314, "learning_rate": 4.85583129501291e-05, - "loss": 1.3554, + "loss": 1.355, "step": 3395 }, { "epoch": 0.2310096480500068, - "grad_norm": 0.6240394115447998, + "grad_norm": 0.6248374581336975, "learning_rate": 4.855618969968746e-05, "loss": 1.4081, "step": 3400 }, { "epoch": 0.23134936812066856, - "grad_norm": 0.6648991703987122, + "grad_norm": 0.6618469953536987, "learning_rate": 4.855406644924582e-05, - "loss": 1.4669, + "loss": 1.4664, "step": 3405 }, { "epoch": 0.23168908819133036, - "grad_norm": 0.5599499344825745, + "grad_norm": 0.5612849593162537, "learning_rate": 4.855194319880419e-05, - "loss": 1.309, + "loss": 1.3088, "step": 3410 }, { "epoch": 0.23202880826199213, - "grad_norm": 0.6417104601860046, + "grad_norm": 0.6400346755981445, "learning_rate": 4.854981994836255e-05, - "loss": 1.4025, + "loss": 1.4022, "step": 3415 }, { "epoch": 0.2323685283326539, - "grad_norm": 0.5643985271453857, + "grad_norm": 0.5645342469215393, "learning_rate": 4.854769669792091e-05, - "loss": 1.384, + "loss": 1.3838, "step": 3420 }, { "epoch": 0.23270824840331567, - "grad_norm": 0.5993593335151672, + "grad_norm": 0.6000213623046875, "learning_rate": 4.8545573447479284e-05, - "loss": 1.4265, + "loss": 1.427, "step": 3425 }, { "epoch": 0.23304796847397743, - "grad_norm": 0.612234354019165, + "grad_norm": 0.6100979447364807, "learning_rate": 4.854345019703764e-05, - "loss": 1.461, + "loss": 1.4614, "step": 3430 }, { "epoch": 0.2333876885446392, - "grad_norm": 0.5936884880065918, + "grad_norm": 0.595273494720459, "learning_rate": 4.8541326946596005e-05, - "loss": 1.4586, + "loss": 1.4589, "step": 3435 }, { "epoch": 0.233727408615301, - "grad_norm": 0.7210749983787537, + "grad_norm": 0.7224227786064148, "learning_rate": 4.8539203696154376e-05, - "loss": 1.3428, + "loss": 1.3424, "step": 3440 }, { "epoch": 0.23406712868596277, - "grad_norm": 0.6005198359489441, + "grad_norm": 0.6003565192222595, "learning_rate": 4.853708044571273e-05, - "loss": 1.3284, + "loss": 1.3282, "step": 3445 }, { "epoch": 0.23440684875662454, - "grad_norm": 0.6938565373420715, + "grad_norm": 0.6943996548652649, "learning_rate": 4.85349571952711e-05, "loss": 1.4068, "step": 3450 }, { "epoch": 0.2347465688272863, - "grad_norm": 0.563916027545929, + "grad_norm": 0.5645086169242859, "learning_rate": 4.853283394482946e-05, "loss": 1.478, "step": 3455 }, { "epoch": 0.23508628889794808, - "grad_norm": 0.6281059384346008, + "grad_norm": 0.6273751258850098, "learning_rate": 4.8530710694387825e-05, "loss": 1.3657, "step": 3460 }, { "epoch": 0.23542600896860988, - "grad_norm": 0.6494596600532532, + "grad_norm": 0.6490116119384766, "learning_rate": 4.852858744394619e-05, - "loss": 1.4288, + "loss": 1.4285, "step": 3465 }, { "epoch": 0.23576572903927165, - "grad_norm": 0.5783873200416565, + "grad_norm": 0.5815773606300354, "learning_rate": 4.852646419350455e-05, "loss": 1.3344, "step": 3470 }, { "epoch": 0.23610544910993342, - "grad_norm": 0.6288142800331116, + "grad_norm": 0.6247827410697937, "learning_rate": 4.852434094306292e-05, - "loss": 1.3888, + "loss": 1.3886, "step": 3475 }, { "epoch": 0.2364451691805952, - "grad_norm": 0.7065832018852234, + "grad_norm": 0.7069764733314514, "learning_rate": 4.852221769262128e-05, - "loss": 1.4344, + "loss": 1.4343, "step": 3480 }, { "epoch": 0.23678488925125696, - "grad_norm": 0.48714709281921387, + "grad_norm": 0.48644599318504333, "learning_rate": 4.8520094442179645e-05, - "loss": 1.3114, + "loss": 1.3107, "step": 3485 }, { "epoch": 0.23712460932191873, - "grad_norm": 0.5135933756828308, + "grad_norm": 0.5178372263908386, "learning_rate": 4.851797119173801e-05, - "loss": 1.3019, + "loss": 1.3017, "step": 3490 }, { "epoch": 0.23746432939258053, - "grad_norm": 0.5906147360801697, + "grad_norm": 0.5912247896194458, "learning_rate": 4.851584794129637e-05, - "loss": 1.3449, + "loss": 1.3447, "step": 3495 }, { "epoch": 0.2378040494632423, - "grad_norm": 0.5758570432662964, + "grad_norm": 0.5779603123664856, "learning_rate": 4.851372469085474e-05, - "loss": 1.3763, + "loss": 1.3761, "step": 3500 }, { "epoch": 0.23814376953390406, - "grad_norm": 0.6372262239456177, + "grad_norm": 0.6465055346488953, "learning_rate": 4.85116014404131e-05, - "loss": 1.4366, + "loss": 1.4368, "step": 3505 }, { "epoch": 0.23848348960456583, - "grad_norm": 0.6069319844245911, + "grad_norm": 0.601302981376648, "learning_rate": 4.8509478189971465e-05, - "loss": 1.3901, + "loss": 1.3899, "step": 3510 }, { "epoch": 0.2388232096752276, - "grad_norm": 0.5084763765335083, + "grad_norm": 0.5073055624961853, "learning_rate": 4.850735493952983e-05, - "loss": 1.4231, + "loss": 1.4233, "step": 3515 }, { "epoch": 0.23916292974588937, - "grad_norm": 0.6538741588592529, + "grad_norm": 0.6559848189353943, "learning_rate": 4.850523168908819e-05, - "loss": 1.446, + "loss": 1.4453, "step": 3520 }, { "epoch": 0.23950264981655117, - "grad_norm": 0.5911784172058105, + "grad_norm": 0.5950323343276978, "learning_rate": 4.850310843864656e-05, - "loss": 1.4551, + "loss": 1.4549, "step": 3525 }, { "epoch": 0.23984236988721294, - "grad_norm": 0.5534785389900208, + "grad_norm": 0.5542881488800049, "learning_rate": 4.850098518820492e-05, - "loss": 1.3407, + "loss": 1.3408, "step": 3530 }, { "epoch": 0.2401820899578747, - "grad_norm": 0.6726526618003845, + "grad_norm": 0.6722670197486877, "learning_rate": 4.8498861937763285e-05, - "loss": 1.4309, + "loss": 1.4301, "step": 3535 }, { "epoch": 0.24052181002853648, - "grad_norm": 0.5891156792640686, + "grad_norm": 0.5826908946037292, "learning_rate": 4.849673868732165e-05, - "loss": 1.4469, + "loss": 1.4463, "step": 3540 }, { "epoch": 0.24086153009919825, - "grad_norm": 0.7022532820701599, + "grad_norm": 0.706889271736145, "learning_rate": 4.849461543688001e-05, - "loss": 1.3612, + "loss": 1.3614, "step": 3545 }, { "epoch": 0.24120125016986005, - "grad_norm": 0.6173405051231384, + "grad_norm": 0.6127579808235168, "learning_rate": 4.849249218643838e-05, - "loss": 1.4501, + "loss": 1.4504, "step": 3550 }, { "epoch": 0.24154097024052182, - "grad_norm": 0.573545515537262, + "grad_norm": 0.5714594721794128, "learning_rate": 4.849036893599674e-05, - "loss": 1.4623, + "loss": 1.4619, "step": 3555 }, { "epoch": 0.2418806903111836, - "grad_norm": 0.5794370174407959, + "grad_norm": 0.5779387354850769, "learning_rate": 4.8488245685555105e-05, - "loss": 1.3815, + "loss": 1.3818, "step": 3560 }, { "epoch": 0.24222041038184536, - "grad_norm": 0.6289984583854675, + "grad_norm": 0.6277913451194763, "learning_rate": 4.848612243511347e-05, - "loss": 1.392, + "loss": 1.3921, "step": 3565 }, { "epoch": 0.24256013045250713, - "grad_norm": 0.6021126508712769, + "grad_norm": 0.6017010807991028, "learning_rate": 4.848399918467183e-05, - "loss": 1.4036, + "loss": 1.4031, "step": 3570 }, { "epoch": 0.2428998505231689, - "grad_norm": 0.6499696373939514, + "grad_norm": 0.6492795348167419, "learning_rate": 4.84818759342302e-05, - "loss": 1.4753, + "loss": 1.4749, "step": 3575 }, { "epoch": 0.2432395705938307, - "grad_norm": 0.5753048062324524, + "grad_norm": 0.5649400949478149, "learning_rate": 4.847975268378856e-05, - "loss": 1.6739, + "loss": 1.674, "step": 3580 }, { "epoch": 0.24357929066449246, - "grad_norm": 0.49970388412475586, + "grad_norm": 0.5031910538673401, "learning_rate": 4.8477629433346925e-05, "loss": 1.496, "step": 3585 }, { "epoch": 0.24391901073515423, - "grad_norm": 0.6353665590286255, + "grad_norm": 0.6367409825325012, "learning_rate": 4.847550618290529e-05, - "loss": 1.3704, + "loss": 1.3701, "step": 3590 }, { "epoch": 0.244258730805816, - "grad_norm": 0.6091659665107727, + "grad_norm": 0.607036828994751, "learning_rate": 4.8473382932463653e-05, - "loss": 1.4259, + "loss": 1.4257, "step": 3595 }, { "epoch": 0.24459845087647777, - "grad_norm": 0.5981108546257019, + "grad_norm": 0.5988706350326538, "learning_rate": 4.847125968202202e-05, - "loss": 1.4009, + "loss": 1.401, "step": 3600 }, { "epoch": 0.24493817094713954, - "grad_norm": 0.5597249269485474, + "grad_norm": 0.5594316720962524, "learning_rate": 4.846913643158038e-05, "loss": 1.4112, "step": 3605 }, { "epoch": 0.24527789101780134, - "grad_norm": 0.6595554947853088, + "grad_norm": 0.6705377101898193, "learning_rate": 4.8467013181138745e-05, - "loss": 1.4225, + "loss": 1.4223, "step": 3610 }, { "epoch": 0.2456176110884631, - "grad_norm": 0.6585940718650818, + "grad_norm": 0.6580055952072144, "learning_rate": 4.846488993069711e-05, - "loss": 1.3301, + "loss": 1.33, "step": 3615 }, { "epoch": 0.24595733115912488, - "grad_norm": 0.7183786034584045, + "grad_norm": 0.7203112244606018, "learning_rate": 4.8462766680255473e-05, - "loss": 1.3734, + "loss": 1.3738, "step": 3620 }, { "epoch": 0.24629705122978665, - "grad_norm": 0.584247887134552, + "grad_norm": 0.5851079225540161, "learning_rate": 4.846064342981384e-05, "loss": 1.3636, "step": 3625 }, { "epoch": 0.24663677130044842, - "grad_norm": 0.56235671043396, + "grad_norm": 0.562060534954071, "learning_rate": 4.84585201793722e-05, - "loss": 1.3583, + "loss": 1.3578, "step": 3630 }, { "epoch": 0.24697649137111022, - "grad_norm": 0.614459753036499, + "grad_norm": 0.6153395175933838, "learning_rate": 4.8456396928930565e-05, - "loss": 1.4291, + "loss": 1.4293, "step": 3635 }, { "epoch": 0.247316211441772, - "grad_norm": 0.6260278820991516, + "grad_norm": 0.621080756187439, "learning_rate": 4.845427367848893e-05, - "loss": 1.4347, + "loss": 1.4349, "step": 3640 }, { "epoch": 0.24765593151243376, - "grad_norm": 0.5813421607017517, + "grad_norm": 0.5764373540878296, "learning_rate": 4.8452150428047293e-05, - "loss": 1.3232, + "loss": 1.323, "step": 3645 }, { "epoch": 0.24799565158309553, - "grad_norm": 0.5234394669532776, + "grad_norm": 0.5209351181983948, "learning_rate": 4.845002717760566e-05, - "loss": 1.375, + "loss": 1.3752, "step": 3650 }, { "epoch": 0.2483353716537573, - "grad_norm": 0.5626548528671265, + "grad_norm": 0.5659106373786926, "learning_rate": 4.8447903927164015e-05, - "loss": 1.2804, + "loss": 1.2802, "step": 3655 }, { "epoch": 0.24867509172441907, - "grad_norm": 0.5892276167869568, + "grad_norm": 0.5918408036231995, "learning_rate": 4.8445780676722385e-05, - "loss": 1.3753, + "loss": 1.3748, "step": 3660 }, { "epoch": 0.24901481179508086, - "grad_norm": 0.5537261962890625, + "grad_norm": 0.5539352893829346, "learning_rate": 4.844365742628075e-05, - "loss": 1.3937, + "loss": 1.3939, "step": 3665 }, { "epoch": 0.24935453186574263, - "grad_norm": 0.6903495788574219, + "grad_norm": 0.6864833235740662, "learning_rate": 4.844153417583911e-05, "loss": 1.4007, "step": 3670 }, { "epoch": 0.2496942519364044, - "grad_norm": 0.6384214162826538, + "grad_norm": 0.6352394223213196, "learning_rate": 4.843941092539748e-05, - "loss": 1.4474, + "loss": 1.4473, "step": 3675 }, { "epoch": 0.2500339720070662, - "grad_norm": 0.5833981037139893, + "grad_norm": 0.5873843431472778, "learning_rate": 4.843728767495584e-05, - "loss": 1.4378, + "loss": 1.438, "step": 3680 }, { "epoch": 0.25037369207772797, - "grad_norm": 0.7025521993637085, + "grad_norm": 0.7023875713348389, "learning_rate": 4.84351644245142e-05, - "loss": 1.3458, + "loss": 1.3454, "step": 3685 }, { "epoch": 0.2507134121483897, - "grad_norm": 0.5626585483551025, + "grad_norm": 0.5609140992164612, "learning_rate": 4.843304117407257e-05, - "loss": 1.3712, + "loss": 1.3714, "step": 3690 }, { "epoch": 0.2510531322190515, - "grad_norm": 0.6970829367637634, + "grad_norm": 0.6881881356239319, "learning_rate": 4.8430917923630933e-05, - "loss": 1.4608, + "loss": 1.4613, "step": 3695 }, { "epoch": 0.25139285228971325, - "grad_norm": 0.764566957950592, + "grad_norm": 0.7551148533821106, "learning_rate": 4.842879467318929e-05, - "loss": 1.3965, + "loss": 1.3964, "step": 3700 }, { "epoch": 0.25173257236037505, - "grad_norm": 0.5882022976875305, + "grad_norm": 0.5884304046630859, "learning_rate": 4.842667142274766e-05, - "loss": 1.3826, + "loss": 1.3822, "step": 3705 }, { "epoch": 0.25207229243103685, - "grad_norm": 0.559703528881073, + "grad_norm": 0.5577235817909241, "learning_rate": 4.8424548172306025e-05, - "loss": 1.4295, + "loss": 1.4299, "step": 3710 }, { "epoch": 0.2524120125016986, - "grad_norm": 0.6943153142929077, + "grad_norm": 0.6934741735458374, "learning_rate": 4.842242492186438e-05, - "loss": 1.4013, + "loss": 1.401, "step": 3715 }, { "epoch": 0.2527517325723604, - "grad_norm": 0.6310819387435913, + "grad_norm": 0.6377744078636169, "learning_rate": 4.8420301671422754e-05, "loss": 1.3209, "step": 3720 }, { "epoch": 0.25309145264302213, - "grad_norm": 0.4891515076160431, + "grad_norm": 0.4872899651527405, "learning_rate": 4.841817842098111e-05, - "loss": 1.3769, + "loss": 1.3762, "step": 3725 }, { "epoch": 0.2534311727136839, - "grad_norm": 0.5766716003417969, + "grad_norm": 0.5762884020805359, "learning_rate": 4.8416055170539475e-05, - "loss": 1.2176, + "loss": 1.2179, "step": 3730 }, { "epoch": 0.2537708927843457, - "grad_norm": 0.7314422130584717, + "grad_norm": 0.7268533706665039, "learning_rate": 4.8413931920097846e-05, - "loss": 1.495, + "loss": 1.4952, "step": 3735 }, { "epoch": 0.25411061285500747, - "grad_norm": 0.6508923172950745, + "grad_norm": 0.6482822299003601, "learning_rate": 4.84118086696562e-05, - "loss": 1.3688, + "loss": 1.369, "step": 3740 }, { "epoch": 0.25445033292566926, - "grad_norm": 0.6452445983886719, + "grad_norm": 0.6496025323867798, "learning_rate": 4.840968541921457e-05, - "loss": 1.3935, + "loss": 1.3946, "step": 3745 }, { "epoch": 0.254790052996331, - "grad_norm": 0.6243131756782532, + "grad_norm": 0.6108883023262024, "learning_rate": 4.840756216877294e-05, - "loss": 1.497, + "loss": 1.4966, "step": 3750 }, { "epoch": 0.2551297730669928, - "grad_norm": 0.6898776888847351, + "grad_norm": 0.6868425011634827, "learning_rate": 4.8405438918331295e-05, - "loss": 1.5104, + "loss": 1.5105, "step": 3755 }, { "epoch": 0.25546949313765455, - "grad_norm": 0.5548217296600342, + "grad_norm": 0.5538015961647034, "learning_rate": 4.840331566788966e-05, - "loss": 1.385, + "loss": 1.3847, "step": 3760 }, { "epoch": 0.25580921320831634, - "grad_norm": 0.5216907858848572, + "grad_norm": 0.5221551656723022, "learning_rate": 4.840119241744803e-05, - "loss": 1.4015, + "loss": 1.4017, "step": 3765 }, { "epoch": 0.25614893327897814, - "grad_norm": 0.5976521372795105, + "grad_norm": 0.6003884673118591, "learning_rate": 4.839906916700639e-05, - "loss": 1.3784, + "loss": 1.3781, "step": 3770 }, { "epoch": 0.2564886533496399, - "grad_norm": 0.5819798111915588, + "grad_norm": 0.5883747935295105, "learning_rate": 4.839694591656475e-05, - "loss": 1.4777, + "loss": 1.4782, "step": 3775 }, { "epoch": 0.2568283734203017, - "grad_norm": 0.5949716567993164, + "grad_norm": 0.5916510820388794, "learning_rate": 4.839482266612312e-05, - "loss": 1.3815, + "loss": 1.3816, "step": 3780 }, { "epoch": 0.2571680934909634, - "grad_norm": 0.55443274974823, + "grad_norm": 0.5524773001670837, "learning_rate": 4.839269941568148e-05, "loss": 1.4397, "step": 3785 }, { "epoch": 0.2575078135616252, - "grad_norm": 0.581902801990509, + "grad_norm": 0.581365704536438, "learning_rate": 4.839057616523984e-05, - "loss": 1.4122, + "loss": 1.412, "step": 3790 }, { "epoch": 0.257847533632287, - "grad_norm": 0.5934801697731018, + "grad_norm": 0.5927204489707947, "learning_rate": 4.8388452914798214e-05, - "loss": 1.3265, + "loss": 1.3264, "step": 3795 }, { "epoch": 0.25818725370294876, - "grad_norm": 0.5715326070785522, + "grad_norm": 0.573259711265564, "learning_rate": 4.838632966435657e-05, - "loss": 1.3898, + "loss": 1.3903, "step": 3800 }, { "epoch": 0.25852697377361056, - "grad_norm": 0.5871545672416687, + "grad_norm": 0.5900419354438782, "learning_rate": 4.8384206413914935e-05, - "loss": 1.3728, + "loss": 1.3731, "step": 3805 }, { "epoch": 0.2588666938442723, - "grad_norm": 0.6707566976547241, + "grad_norm": 0.6695264577865601, "learning_rate": 4.83820831634733e-05, - "loss": 1.3844, + "loss": 1.3845, "step": 3810 }, { "epoch": 0.2592064139149341, - "grad_norm": 0.6047499179840088, + "grad_norm": 0.6028416156768799, "learning_rate": 4.837995991303166e-05, - "loss": 1.3889, + "loss": 1.3895, "step": 3815 }, { "epoch": 0.2595461339855959, - "grad_norm": 0.62021404504776, + "grad_norm": 0.6131296157836914, "learning_rate": 4.837783666259003e-05, - "loss": 1.4028, + "loss": 1.4032, "step": 3820 }, { "epoch": 0.25988585405625764, - "grad_norm": 0.6598581075668335, + "grad_norm": 0.6599849462509155, "learning_rate": 4.837571341214839e-05, - "loss": 1.4629, + "loss": 1.463, "step": 3825 }, { "epoch": 0.26022557412691943, - "grad_norm": 0.6117517352104187, + "grad_norm": 0.6092532873153687, "learning_rate": 4.8373590161706755e-05, - "loss": 1.325, + "loss": 1.3248, "step": 3830 }, { "epoch": 0.2605652941975812, - "grad_norm": 0.49917417764663696, + "grad_norm": 0.4998961091041565, "learning_rate": 4.837146691126512e-05, - "loss": 1.3048, + "loss": 1.3052, "step": 3835 }, { "epoch": 0.260905014268243, - "grad_norm": 0.6131162643432617, + "grad_norm": 0.6077843308448792, "learning_rate": 4.836934366082348e-05, - "loss": 1.4287, + "loss": 1.4284, "step": 3840 }, { "epoch": 0.2612447343389047, - "grad_norm": 0.6981179118156433, + "grad_norm": 0.7081494331359863, "learning_rate": 4.836722041038185e-05, - "loss": 1.4482, + "loss": 1.4489, "step": 3845 }, { "epoch": 0.2615844544095665, - "grad_norm": 0.5723700523376465, + "grad_norm": 0.5706216096878052, "learning_rate": 4.836509715994021e-05, - "loss": 1.4416, + "loss": 1.4411, "step": 3850 }, { "epoch": 0.2619241744802283, - "grad_norm": 0.6441469192504883, + "grad_norm": 0.6442426443099976, "learning_rate": 4.8362973909498575e-05, "loss": 1.4, "step": 3855 }, { "epoch": 0.26226389455089005, - "grad_norm": 0.6017731428146362, + "grad_norm": 0.6036747694015503, "learning_rate": 4.836085065905694e-05, - "loss": 1.4324, + "loss": 1.4327, "step": 3860 }, { "epoch": 0.26260361462155185, - "grad_norm": 0.6027065515518188, + "grad_norm": 0.6006507277488708, "learning_rate": 4.83587274086153e-05, - "loss": 1.405, + "loss": 1.4046, "step": 3865 }, { "epoch": 0.2629433346922136, - "grad_norm": 0.573409378528595, + "grad_norm": 0.5771772265434265, "learning_rate": 4.835660415817367e-05, - "loss": 1.3482, + "loss": 1.3475, "step": 3870 }, { "epoch": 0.2632830547628754, - "grad_norm": 0.6863682270050049, + "grad_norm": 0.6821397542953491, "learning_rate": 4.835448090773203e-05, - "loss": 1.3839, + "loss": 1.3844, "step": 3875 }, { "epoch": 0.2636227748335372, - "grad_norm": 0.6273049116134644, + "grad_norm": 0.6229386925697327, "learning_rate": 4.8352357657290395e-05, - "loss": 1.3573, + "loss": 1.3572, "step": 3880 }, { "epoch": 0.26396249490419893, - "grad_norm": 0.593453586101532, + "grad_norm": 0.591820240020752, "learning_rate": 4.835023440684876e-05, - "loss": 1.4102, + "loss": 1.4107, "step": 3885 }, { "epoch": 0.2643022149748607, - "grad_norm": 0.6387911438941956, + "grad_norm": 0.6376118063926697, "learning_rate": 4.834811115640712e-05, - "loss": 1.4568, + "loss": 1.4565, "step": 3890 }, { "epoch": 0.26464193504552247, - "grad_norm": 0.6193586587905884, + "grad_norm": 0.6222611665725708, "learning_rate": 4.834598790596549e-05, - "loss": 1.3945, + "loss": 1.395, "step": 3895 }, { "epoch": 0.26498165511618427, - "grad_norm": 0.6583086252212524, + "grad_norm": 0.6593599915504456, "learning_rate": 4.834386465552385e-05, - "loss": 1.3465, + "loss": 1.3467, "step": 3900 }, { "epoch": 0.26532137518684606, - "grad_norm": 0.6301378011703491, + "grad_norm": 0.6298807859420776, "learning_rate": 4.8341741405082215e-05, "loss": 1.4004, "step": 3905 }, { "epoch": 0.2656610952575078, - "grad_norm": 0.577540934085846, + "grad_norm": 0.5784420967102051, "learning_rate": 4.833961815464058e-05, - "loss": 1.4346, + "loss": 1.4341, "step": 3910 }, { "epoch": 0.2660008153281696, - "grad_norm": 0.6303648948669434, + "grad_norm": 0.6301862597465515, "learning_rate": 4.833749490419894e-05, - "loss": 1.3866, + "loss": 1.3864, "step": 3915 }, { "epoch": 0.26634053539883135, - "grad_norm": 0.6956437230110168, + "grad_norm": 0.6954297423362732, "learning_rate": 4.833537165375731e-05, - "loss": 1.4491, + "loss": 1.4488, "step": 3920 }, { "epoch": 0.26668025546949314, - "grad_norm": 0.656446635723114, + "grad_norm": 0.6545997858047485, "learning_rate": 4.833324840331567e-05, - "loss": 1.4148, + "loss": 1.415, "step": 3925 }, { "epoch": 0.2670199755401549, - "grad_norm": 0.5804231762886047, + "grad_norm": 0.5803295373916626, "learning_rate": 4.8331125152874035e-05, - "loss": 1.3895, + "loss": 1.3897, "step": 3930 }, { "epoch": 0.2673596956108167, - "grad_norm": 0.5970335602760315, + "grad_norm": 0.598288357257843, "learning_rate": 4.83290019024324e-05, - "loss": 1.3782, + "loss": 1.3778, "step": 3935 }, { "epoch": 0.2676994156814785, - "grad_norm": 0.6738160848617554, + "grad_norm": 0.6756088137626648, "learning_rate": 4.832687865199076e-05, - "loss": 1.4232, + "loss": 1.4233, "step": 3940 }, { "epoch": 0.2680391357521402, - "grad_norm": 0.5744298696517944, + "grad_norm": 0.5749544501304626, "learning_rate": 4.832475540154913e-05, - "loss": 1.4554, + "loss": 1.4555, "step": 3945 }, { "epoch": 0.268378855822802, - "grad_norm": 0.6450674533843994, + "grad_norm": 0.6492464542388916, "learning_rate": 4.832263215110749e-05, - "loss": 1.4713, + "loss": 1.4718, "step": 3950 }, { "epoch": 0.26871857589346376, - "grad_norm": 0.6713495850563049, + "grad_norm": 0.6712719202041626, "learning_rate": 4.8320508900665855e-05, - "loss": 1.3377, + "loss": 1.3366, "step": 3955 }, { "epoch": 0.26905829596412556, - "grad_norm": 0.6296396255493164, + "grad_norm": 0.6295887231826782, "learning_rate": 4.831838565022422e-05, - "loss": 1.4575, + "loss": 1.4576, "step": 3960 }, { "epoch": 0.26939801603478736, - "grad_norm": 0.6520622968673706, + "grad_norm": 0.6524296998977661, "learning_rate": 4.831626239978258e-05, - "loss": 1.4549, + "loss": 1.4551, "step": 3965 }, { "epoch": 0.2697377361054491, - "grad_norm": 0.6124261617660522, + "grad_norm": 0.6094898581504822, "learning_rate": 4.831413914934095e-05, - "loss": 1.3898, + "loss": 1.389, "step": 3970 }, { "epoch": 0.2700774561761109, - "grad_norm": 0.6756829023361206, + "grad_norm": 0.6753742694854736, "learning_rate": 4.831201589889931e-05, - "loss": 1.5082, + "loss": 1.5079, "step": 3975 }, { "epoch": 0.27041717624677264, - "grad_norm": 0.6021645069122314, + "grad_norm": 0.5980693697929382, "learning_rate": 4.8309892648457675e-05, - "loss": 1.4192, + "loss": 1.4195, "step": 3980 }, { "epoch": 0.27075689631743444, - "grad_norm": 0.6154415011405945, + "grad_norm": 0.6172364354133606, "learning_rate": 4.830776939801604e-05, - "loss": 1.4664, + "loss": 1.4667, "step": 3985 }, { "epoch": 0.27109661638809623, - "grad_norm": 0.6155113577842712, + "grad_norm": 0.6151544451713562, "learning_rate": 4.83056461475744e-05, "loss": 1.4474, "step": 3990 }, { "epoch": 0.271436336458758, - "grad_norm": 0.565564751625061, + "grad_norm": 0.5651228427886963, "learning_rate": 4.830352289713277e-05, - "loss": 1.4619, + "loss": 1.462, "step": 3995 }, { "epoch": 0.2717760565294198, - "grad_norm": 0.699712336063385, + "grad_norm": 0.6997990012168884, "learning_rate": 4.830139964669113e-05, - "loss": 1.5082, + "loss": 1.5085, "step": 4000 }, { "epoch": 0.2721157766000815, - "grad_norm": 0.5771880149841309, + "grad_norm": 0.5755518674850464, "learning_rate": 4.8299276396249495e-05, - "loss": 1.4625, + "loss": 1.4622, "step": 4005 }, { "epoch": 0.2724554966707433, - "grad_norm": 0.6200066804885864, + "grad_norm": 0.6201203465461731, "learning_rate": 4.829715314580785e-05, - "loss": 1.4102, + "loss": 1.4101, "step": 4010 }, { "epoch": 0.27279521674140506, - "grad_norm": 0.5667638182640076, + "grad_norm": 0.5651349425315857, "learning_rate": 4.829502989536622e-05, - "loss": 1.4367, + "loss": 1.4359, "step": 4015 }, { "epoch": 0.27313493681206685, - "grad_norm": 0.6307473182678223, + "grad_norm": 0.6333818435668945, "learning_rate": 4.829290664492459e-05, - "loss": 1.4044, + "loss": 1.4051, "step": 4020 }, { "epoch": 0.27347465688272865, - "grad_norm": 0.5659202933311462, + "grad_norm": 0.565485417842865, "learning_rate": 4.8290783394482944e-05, "loss": 1.4817, "step": 4025 }, { "epoch": 0.2738143769533904, - "grad_norm": 0.6943925619125366, + "grad_norm": 0.6985927224159241, "learning_rate": 4.8288660144041315e-05, - "loss": 1.3818, + "loss": 1.3824, "step": 4030 }, { "epoch": 0.2741540970240522, - "grad_norm": 0.9133588075637817, + "grad_norm": 0.9063515067100525, "learning_rate": 4.828653689359968e-05, - "loss": 1.4241, + "loss": 1.4245, "step": 4035 }, { "epoch": 0.27449381709471393, - "grad_norm": 0.5766347050666809, + "grad_norm": 0.5766139030456543, "learning_rate": 4.8284413643158036e-05, - "loss": 1.3824, + "loss": 1.3825, "step": 4040 }, { "epoch": 0.27483353716537573, - "grad_norm": 0.5434534549713135, + "grad_norm": 0.5437709093093872, "learning_rate": 4.828229039271641e-05, - "loss": 1.5095, + "loss": 1.5094, "step": 4045 }, { "epoch": 0.2751732572360375, - "grad_norm": 0.6625797152519226, + "grad_norm": 0.6624595522880554, "learning_rate": 4.828016714227477e-05, - "loss": 1.4583, + "loss": 1.459, "step": 4050 }, { "epoch": 0.27551297730669927, - "grad_norm": 0.5697526335716248, + "grad_norm": 0.5664342045783997, "learning_rate": 4.827804389183313e-05, - "loss": 1.4301, + "loss": 1.4304, "step": 4055 }, { "epoch": 0.27585269737736107, - "grad_norm": 0.6194468140602112, + "grad_norm": 0.6219688057899475, "learning_rate": 4.82759206413915e-05, - "loss": 1.3168, + "loss": 1.3171, "step": 4060 }, { "epoch": 0.2761924174480228, - "grad_norm": 0.6437966823577881, + "grad_norm": 0.6463744044303894, "learning_rate": 4.827379739094986e-05, - "loss": 1.3462, + "loss": 1.3474, "step": 4065 }, { "epoch": 0.2765321375186846, - "grad_norm": 0.6066455841064453, + "grad_norm": 0.6074214577674866, "learning_rate": 4.827167414050822e-05, - "loss": 1.3853, + "loss": 1.3854, "step": 4070 }, { "epoch": 0.2768718575893464, - "grad_norm": 0.6828492879867554, + "grad_norm": 0.6838439106941223, "learning_rate": 4.826955089006659e-05, - "loss": 1.4077, + "loss": 1.4076, "step": 4075 }, { "epoch": 0.27721157766000815, - "grad_norm": 0.5618029236793518, + "grad_norm": 0.5595834255218506, "learning_rate": 4.826742763962495e-05, - "loss": 1.4275, + "loss": 1.4271, "step": 4080 }, { "epoch": 0.27755129773066994, - "grad_norm": 0.5699742436408997, + "grad_norm": 0.5707030296325684, "learning_rate": 4.826530438918331e-05, - "loss": 1.3868, + "loss": 1.3872, "step": 4085 }, { "epoch": 0.2778910178013317, - "grad_norm": 0.600041925907135, + "grad_norm": 0.6009557843208313, "learning_rate": 4.826318113874168e-05, - "loss": 1.4206, + "loss": 1.4209, "step": 4090 }, { "epoch": 0.2782307378719935, - "grad_norm": 0.6188768148422241, + "grad_norm": 0.6216113567352295, "learning_rate": 4.826105788830004e-05, - "loss": 1.4262, + "loss": 1.4257, "step": 4095 }, { "epoch": 0.2785704579426552, - "grad_norm": 0.5706759691238403, + "grad_norm": 0.571537435054779, "learning_rate": 4.8258934637858404e-05, - "loss": 1.4351, + "loss": 1.435, "step": 4100 }, { "epoch": 0.278910178013317, - "grad_norm": 0.5694918036460876, + "grad_norm": 0.5683637857437134, "learning_rate": 4.8256811387416775e-05, - "loss": 1.4526, + "loss": 1.4527, "step": 4105 }, { "epoch": 0.2792498980839788, - "grad_norm": 0.5923181772232056, + "grad_norm": 0.5943649411201477, "learning_rate": 4.825468813697513e-05, - "loss": 1.3486, + "loss": 1.3492, "step": 4110 }, { "epoch": 0.27958961815464056, - "grad_norm": 0.5981913805007935, + "grad_norm": 0.5963008403778076, "learning_rate": 4.8252564886533496e-05, - "loss": 1.3979, + "loss": 1.3975, "step": 4115 }, { "epoch": 0.27992933822530236, - "grad_norm": 0.6309303045272827, + "grad_norm": 0.6311737298965454, "learning_rate": 4.825044163609187e-05, "loss": 1.4769, "step": 4120 }, { "epoch": 0.2802690582959641, - "grad_norm": 0.6503332853317261, + "grad_norm": 0.6508136987686157, "learning_rate": 4.8248318385650224e-05, - "loss": 1.3807, + "loss": 1.3803, "step": 4125 }, { "epoch": 0.2806087783666259, - "grad_norm": 0.5759245157241821, + "grad_norm": 0.5775586366653442, "learning_rate": 4.824619513520859e-05, - "loss": 1.3595, + "loss": 1.3589, "step": 4130 }, { "epoch": 0.2809484984372877, - "grad_norm": 0.5755437016487122, + "grad_norm": 0.5729120373725891, "learning_rate": 4.824407188476696e-05, - "loss": 1.425, + "loss": 1.4251, "step": 4135 }, { "epoch": 0.28128821850794944, - "grad_norm": 0.5048924684524536, + "grad_norm": 0.5057596564292908, "learning_rate": 4.8241948634325316e-05, - "loss": 1.3901, + "loss": 1.39, "step": 4140 }, { "epoch": 0.28162793857861124, - "grad_norm": 0.694521427154541, + "grad_norm": 0.6964313387870789, "learning_rate": 4.823982538388368e-05, - "loss": 1.3302, + "loss": 1.3308, "step": 4145 }, { "epoch": 0.281967658649273, - "grad_norm": 0.6683419346809387, + "grad_norm": 0.6690431833267212, "learning_rate": 4.8237702133442044e-05, - "loss": 1.3654, + "loss": 1.3652, "step": 4150 }, { "epoch": 0.2823073787199348, - "grad_norm": 0.6541835069656372, + "grad_norm": 0.6540568470954895, "learning_rate": 4.823557888300041e-05, - "loss": 1.463, + "loss": 1.4631, "step": 4155 }, { "epoch": 0.2826470987905966, - "grad_norm": 0.6378797292709351, + "grad_norm": 0.6362757682800293, "learning_rate": 4.823345563255877e-05, - "loss": 1.4953, + "loss": 1.495, "step": 4160 }, { "epoch": 0.2829868188612583, - "grad_norm": 0.6032472252845764, + "grad_norm": 0.6043899655342102, "learning_rate": 4.8231332382117136e-05, "loss": 1.3275, "step": 4165 }, { "epoch": 0.2833265389319201, - "grad_norm": 0.5532385110855103, + "grad_norm": 0.5535279512405396, "learning_rate": 4.82292091316755e-05, - "loss": 1.3732, + "loss": 1.3738, "step": 4170 }, { "epoch": 0.28366625900258186, - "grad_norm": 0.6906827092170715, + "grad_norm": 0.692501425743103, "learning_rate": 4.8227085881233864e-05, - "loss": 1.4511, + "loss": 1.451, "step": 4175 }, { "epoch": 0.28400597907324365, - "grad_norm": 0.5115543603897095, + "grad_norm": 0.5101706981658936, "learning_rate": 4.822496263079223e-05, - "loss": 1.2993, + "loss": 1.2991, "step": 4180 }, { "epoch": 0.2843456991439054, - "grad_norm": 0.6229546070098877, + "grad_norm": 0.6251339912414551, "learning_rate": 4.822283938035059e-05, - "loss": 1.4264, + "loss": 1.4267, "step": 4185 }, { "epoch": 0.2846854192145672, - "grad_norm": 0.610550045967102, + "grad_norm": 0.6104265451431274, "learning_rate": 4.8220716129908956e-05, "loss": 1.3518, "step": 4190 }, { "epoch": 0.285025139285229, - "grad_norm": 0.6253818273544312, + "grad_norm": 0.6231160759925842, "learning_rate": 4.821859287946732e-05, - "loss": 1.3582, + "loss": 1.358, "step": 4195 }, { "epoch": 0.28536485935589073, - "grad_norm": 0.6032271981239319, + "grad_norm": 0.6019293665885925, "learning_rate": 4.8216469629025684e-05, - "loss": 1.3786, + "loss": 1.3775, "step": 4200 }, { "epoch": 0.28570457942655253, - "grad_norm": 0.5876577496528625, + "grad_norm": 0.5898943543434143, "learning_rate": 4.821434637858405e-05, "loss": 1.432, "step": 4205 }, { "epoch": 0.28604429949721427, - "grad_norm": 0.5510315299034119, + "grad_norm": 0.5517131686210632, "learning_rate": 4.821222312814241e-05, - "loss": 1.4849, + "loss": 1.4852, "step": 4210 }, { "epoch": 0.28638401956787607, - "grad_norm": 0.6315826773643494, + "grad_norm": 0.6252326965332031, "learning_rate": 4.8210099877700776e-05, - "loss": 1.3832, + "loss": 1.3829, "step": 4215 }, { "epoch": 0.28672373963853787, - "grad_norm": 0.5919789671897888, + "grad_norm": 0.5831483006477356, "learning_rate": 4.820797662725914e-05, - "loss": 1.4017, + "loss": 1.4022, "step": 4220 }, { "epoch": 0.2870634597091996, - "grad_norm": 0.578884482383728, + "grad_norm": 0.5779646635055542, "learning_rate": 4.8205853376817504e-05, - "loss": 1.5031, + "loss": 1.5028, "step": 4225 }, { "epoch": 0.2874031797798614, - "grad_norm": 0.5814526677131653, + "grad_norm": 0.5757778882980347, "learning_rate": 4.820373012637587e-05, - "loss": 1.359, + "loss": 1.3595, "step": 4230 }, { "epoch": 0.28774289985052315, - "grad_norm": 0.5195465683937073, + "grad_norm": 0.5195366144180298, "learning_rate": 4.820160687593423e-05, - "loss": 1.3718, + "loss": 1.3712, "step": 4235 }, { "epoch": 0.28808261992118495, - "grad_norm": 0.6810558438301086, + "grad_norm": 0.662595272064209, "learning_rate": 4.8199483625492596e-05, - "loss": 1.3671, + "loss": 1.3675, "step": 4240 }, { "epoch": 0.28842233999184674, - "grad_norm": 0.622515857219696, + "grad_norm": 0.6281705498695374, "learning_rate": 4.819736037505096e-05, - "loss": 1.4952, + "loss": 1.4949, "step": 4245 }, { "epoch": 0.2887620600625085, - "grad_norm": 0.6229998469352722, + "grad_norm": 0.6233288645744324, "learning_rate": 4.8195237124609324e-05, - "loss": 1.3636, + "loss": 1.3639, "step": 4250 }, { "epoch": 0.2891017801331703, - "grad_norm": 0.5896877646446228, + "grad_norm": 0.5876872539520264, "learning_rate": 4.819311387416769e-05, - "loss": 1.4003, + "loss": 1.3998, "step": 4255 }, { "epoch": 0.289441500203832, - "grad_norm": 0.676675021648407, + "grad_norm": 0.6752828359603882, "learning_rate": 4.819099062372605e-05, "loss": 1.5011, "step": 4260 }, { "epoch": 0.2897812202744938, - "grad_norm": 0.6762051582336426, + "grad_norm": 0.6781887412071228, "learning_rate": 4.8188867373284416e-05, - "loss": 1.4601, + "loss": 1.4602, "step": 4265 }, { "epoch": 0.29012094034515556, - "grad_norm": 0.5926623940467834, + "grad_norm": 0.5928335785865784, "learning_rate": 4.818674412284278e-05, - "loss": 1.3434, + "loss": 1.3432, "step": 4270 }, { "epoch": 0.29046066041581736, - "grad_norm": 0.49280211329460144, + "grad_norm": 0.4953343868255615, "learning_rate": 4.8184620872401144e-05, - "loss": 1.3949, + "loss": 1.3953, "step": 4275 }, { "epoch": 0.29080038048647916, - "grad_norm": 0.6341902613639832, + "grad_norm": 0.6354355216026306, "learning_rate": 4.818249762195951e-05, - "loss": 1.3213, + "loss": 1.3221, "step": 4280 }, { "epoch": 0.2911401005571409, - "grad_norm": 0.6441182494163513, + "grad_norm": 0.6414484977722168, "learning_rate": 4.818037437151787e-05, - "loss": 1.3687, + "loss": 1.3681, "step": 4285 }, { "epoch": 0.2914798206278027, - "grad_norm": 0.6745972037315369, + "grad_norm": 0.6791452169418335, "learning_rate": 4.8178251121076236e-05, - "loss": 1.3677, + "loss": 1.3684, "step": 4290 }, { "epoch": 0.29181954069846444, - "grad_norm": 0.6068249940872192, + "grad_norm": 0.606411874294281, "learning_rate": 4.81761278706346e-05, "loss": 1.4581, "step": 4295 }, { "epoch": 0.29215926076912624, - "grad_norm": 0.6574903130531311, + "grad_norm": 0.659292459487915, "learning_rate": 4.8174004620192964e-05, - "loss": 1.4814, + "loss": 1.4813, "step": 4300 }, { "epoch": 0.29249898083978804, - "grad_norm": 0.5898470878601074, + "grad_norm": 0.5910828709602356, "learning_rate": 4.817188136975133e-05, - "loss": 1.3846, + "loss": 1.3851, "step": 4305 }, { "epoch": 0.2928387009104498, - "grad_norm": 0.7001326084136963, + "grad_norm": 0.7041095495223999, "learning_rate": 4.816975811930969e-05, - "loss": 1.4393, + "loss": 1.4396, "step": 4310 }, { "epoch": 0.2931784209811116, - "grad_norm": 0.6270405054092407, + "grad_norm": 0.6285356879234314, "learning_rate": 4.8167634868868056e-05, - "loss": 1.3712, + "loss": 1.372, "step": 4315 }, { "epoch": 0.2935181410517733, - "grad_norm": 0.597356379032135, + "grad_norm": 0.6167028546333313, "learning_rate": 4.816551161842642e-05, - "loss": 1.3049, + "loss": 1.3046, "step": 4320 }, { "epoch": 0.2938578611224351, - "grad_norm": 0.6218376755714417, + "grad_norm": 0.6217319965362549, "learning_rate": 4.8163388367984784e-05, - "loss": 1.3632, + "loss": 1.3642, "step": 4325 }, { "epoch": 0.2941975811930969, - "grad_norm": 0.7287354469299316, + "grad_norm": 0.7174155116081238, "learning_rate": 4.816126511754315e-05, - "loss": 1.4058, + "loss": 1.4048, "step": 4330 }, { "epoch": 0.29453730126375866, - "grad_norm": 0.6104965209960938, + "grad_norm": 0.6086079478263855, "learning_rate": 4.815914186710151e-05, - "loss": 1.5259, + "loss": 1.5257, "step": 4335 }, { "epoch": 0.29487702133442045, - "grad_norm": 0.6057714223861694, + "grad_norm": 0.6024938225746155, "learning_rate": 4.8157018616659876e-05, - "loss": 1.3611, + "loss": 1.3619, "step": 4340 }, { "epoch": 0.2952167414050822, - "grad_norm": 0.6120069622993469, + "grad_norm": 0.6108648777008057, "learning_rate": 4.815489536621824e-05, - "loss": 1.3453, + "loss": 1.3451, "step": 4345 }, { "epoch": 0.295556461475744, - "grad_norm": 0.6051473617553711, + "grad_norm": 0.6056551933288574, "learning_rate": 4.81527721157766e-05, - "loss": 1.4418, + "loss": 1.442, "step": 4350 }, { "epoch": 0.29589618154640573, - "grad_norm": 0.6352353096008301, + "grad_norm": 0.6350167393684387, "learning_rate": 4.815064886533497e-05, - "loss": 1.3684, + "loss": 1.3686, "step": 4355 }, { "epoch": 0.29623590161706753, - "grad_norm": 0.5874910950660706, + "grad_norm": 0.5853134989738464, "learning_rate": 4.814852561489333e-05, - "loss": 1.3508, + "loss": 1.3509, "step": 4360 }, { "epoch": 0.29657562168772933, - "grad_norm": 0.6882878541946411, + "grad_norm": 0.6776278018951416, "learning_rate": 4.814640236445169e-05, - "loss": 1.4451, + "loss": 1.4446, "step": 4365 }, { "epoch": 0.29691534175839107, - "grad_norm": 0.6234210133552551, + "grad_norm": 0.6220254302024841, "learning_rate": 4.814427911401006e-05, - "loss": 1.4546, + "loss": 1.4547, "step": 4370 }, { "epoch": 0.29725506182905287, - "grad_norm": 0.544707179069519, + "grad_norm": 0.545853316783905, "learning_rate": 4.8142155863568424e-05, - "loss": 1.265, + "loss": 1.2649, "step": 4375 }, { "epoch": 0.2975947818997146, - "grad_norm": 0.6247130632400513, + "grad_norm": 0.6324424147605896, "learning_rate": 4.814003261312678e-05, "loss": 1.3925, "step": 4380 }, { "epoch": 0.2979345019703764, - "grad_norm": 0.5478698015213013, + "grad_norm": 0.5474621057510376, "learning_rate": 4.813790936268515e-05, - "loss": 1.3705, + "loss": 1.37, "step": 4385 }, { "epoch": 0.2982742220410382, - "grad_norm": 0.628241777420044, + "grad_norm": 0.625907301902771, "learning_rate": 4.8135786112243516e-05, - "loss": 1.4246, + "loss": 1.4245, "step": 4390 }, { "epoch": 0.29861394211169995, - "grad_norm": 0.5739772915840149, + "grad_norm": 0.571111798286438, "learning_rate": 4.8133662861801874e-05, - "loss": 1.4448, + "loss": 1.4453, "step": 4395 }, { "epoch": 0.29895366218236175, - "grad_norm": 0.6296387314796448, + "grad_norm": 0.6292002201080322, "learning_rate": 4.8131539611360244e-05, - "loss": 1.4738, + "loss": 1.4734, "step": 4400 }, { "epoch": 0.2992933822530235, - "grad_norm": 0.5494669675827026, + "grad_norm": 0.5520553588867188, "learning_rate": 4.812941636091861e-05, - "loss": 1.4006, + "loss": 1.401, "step": 4405 }, { "epoch": 0.2996331023236853, - "grad_norm": 0.5814257860183716, + "grad_norm": 0.5858537554740906, "learning_rate": 4.8127293110476966e-05, - "loss": 1.2864, + "loss": 1.2872, "step": 4410 }, { "epoch": 0.2999728223943471, - "grad_norm": 0.656559944152832, + "grad_norm": 0.6561074256896973, "learning_rate": 4.8125169860035336e-05, - "loss": 1.3349, + "loss": 1.3341, "step": 4415 }, { "epoch": 0.3003125424650088, - "grad_norm": 0.6489969491958618, + "grad_norm": 0.6490188837051392, "learning_rate": 4.81230466095937e-05, - "loss": 1.4294, + "loss": 1.4297, "step": 4420 }, { "epoch": 0.3006522625356706, - "grad_norm": 0.6143659949302673, + "grad_norm": 0.6167186498641968, "learning_rate": 4.812092335915206e-05, - "loss": 1.4744, + "loss": 1.4746, "step": 4425 }, { "epoch": 0.30099198260633236, - "grad_norm": 0.6614595055580139, + "grad_norm": 0.6646005511283875, "learning_rate": 4.811880010871043e-05, - "loss": 1.5482, + "loss": 1.548, "step": 4430 }, { "epoch": 0.30133170267699416, - "grad_norm": 0.5838325619697571, + "grad_norm": 0.5838183164596558, "learning_rate": 4.8116676858268786e-05, - "loss": 1.4307, + "loss": 1.4312, "step": 4435 }, { "epoch": 0.3016714227476559, - "grad_norm": 0.7493719458580017, + "grad_norm": 0.7486035227775574, "learning_rate": 4.811455360782715e-05, - "loss": 1.3723, + "loss": 1.3714, "step": 4440 }, { "epoch": 0.3020111428183177, - "grad_norm": 0.5825745463371277, + "grad_norm": 0.5858824849128723, "learning_rate": 4.811243035738552e-05, - "loss": 1.4164, + "loss": 1.4168, "step": 4445 }, { "epoch": 0.3023508628889795, - "grad_norm": 0.6245297789573669, + "grad_norm": 0.6223589777946472, "learning_rate": 4.811030710694388e-05, - "loss": 1.3359, + "loss": 1.3361, "step": 4450 }, { "epoch": 0.30269058295964124, - "grad_norm": 0.5819307565689087, + "grad_norm": 0.5818971991539001, "learning_rate": 4.810818385650224e-05, - "loss": 1.3274, + "loss": 1.3269, "step": 4455 }, { "epoch": 0.30303030303030304, - "grad_norm": 0.6597985625267029, + "grad_norm": 0.6612294316291809, "learning_rate": 4.810606060606061e-05, - "loss": 1.3779, + "loss": 1.3786, "step": 4460 }, { "epoch": 0.3033700231009648, - "grad_norm": 0.6612825989723206, + "grad_norm": 0.659691572189331, "learning_rate": 4.810393735561897e-05, "loss": 1.4092, "step": 4465 }, { "epoch": 0.3037097431716266, - "grad_norm": 0.5472775101661682, + "grad_norm": 0.5466449856758118, "learning_rate": 4.8101814105177334e-05, - "loss": 1.3949, + "loss": 1.3951, "step": 4470 }, { "epoch": 0.3040494632422884, - "grad_norm": 0.603412389755249, + "grad_norm": 0.6039320826530457, "learning_rate": 4.8099690854735705e-05, - "loss": 1.4901, + "loss": 1.4902, "step": 4475 }, { "epoch": 0.3043891833129501, - "grad_norm": 0.878606379032135, + "grad_norm": 0.8951917886734009, "learning_rate": 4.809756760429406e-05, - "loss": 1.3746, + "loss": 1.374, "step": 4480 }, { "epoch": 0.3047289033836119, - "grad_norm": 0.5390682220458984, + "grad_norm": 0.541711688041687, "learning_rate": 4.8095444353852426e-05, - "loss": 1.4478, + "loss": 1.4479, "step": 4485 }, { "epoch": 0.30506862345427366, - "grad_norm": 0.6324414014816284, + "grad_norm": 0.6341110467910767, "learning_rate": 4.8093321103410797e-05, - "loss": 1.5412, + "loss": 1.5414, "step": 4490 }, { "epoch": 0.30540834352493546, - "grad_norm": 0.6175958514213562, + "grad_norm": 0.6160107851028442, "learning_rate": 4.8091197852969154e-05, - "loss": 1.3482, + "loss": 1.3481, "step": 4495 }, { "epoch": 0.30574806359559725, - "grad_norm": 0.6746761798858643, + "grad_norm": 0.676036536693573, "learning_rate": 4.808907460252752e-05, - "loss": 1.4718, + "loss": 1.4719, "step": 4500 }, { "epoch": 0.306087783666259, - "grad_norm": 0.6800394058227539, + "grad_norm": 0.6832454800605774, "learning_rate": 4.808695135208588e-05, - "loss": 1.34, + "loss": 1.3403, "step": 4505 }, { "epoch": 0.3064275037369208, - "grad_norm": 0.5797684788703918, + "grad_norm": 0.5796002745628357, "learning_rate": 4.8084828101644246e-05, - "loss": 1.433, + "loss": 1.4332, "step": 4510 }, { "epoch": 0.30676722380758253, - "grad_norm": 0.5536337494850159, + "grad_norm": 0.5533788800239563, "learning_rate": 4.808270485120261e-05, - "loss": 1.3905, + "loss": 1.3908, "step": 4515 }, { "epoch": 0.30710694387824433, - "grad_norm": 0.6156224608421326, + "grad_norm": 0.6180282235145569, "learning_rate": 4.8080581600760974e-05, - "loss": 1.3857, + "loss": 1.3859, "step": 4520 }, { "epoch": 0.3074466639489061, - "grad_norm": 0.5743491649627686, + "grad_norm": 0.5741366744041443, "learning_rate": 4.807845835031934e-05, - "loss": 1.329, + "loss": 1.3284, "step": 4525 }, { "epoch": 0.30778638401956787, - "grad_norm": 0.46847978234291077, + "grad_norm": 0.4675374925136566, "learning_rate": 4.80763350998777e-05, - "loss": 1.4378, + "loss": 1.4372, "step": 4530 }, { "epoch": 0.30812610409022967, - "grad_norm": 0.7120422124862671, + "grad_norm": 0.7097048163414001, "learning_rate": 4.8074211849436066e-05, - "loss": 1.3574, + "loss": 1.3571, "step": 4535 }, { "epoch": 0.3084658241608914, - "grad_norm": 0.6666555404663086, + "grad_norm": 0.6655773520469666, "learning_rate": 4.807208859899443e-05, - "loss": 1.4085, + "loss": 1.4088, "step": 4540 }, { "epoch": 0.3088055442315532, - "grad_norm": 0.5958794355392456, + "grad_norm": 0.5943617820739746, "learning_rate": 4.8069965348552794e-05, - "loss": 1.3687, + "loss": 1.3681, "step": 4545 }, { "epoch": 0.30914526430221495, - "grad_norm": 0.6510292291641235, + "grad_norm": 0.6509490609169006, "learning_rate": 4.806784209811116e-05, - "loss": 1.4446, + "loss": 1.4442, "step": 4550 }, { "epoch": 0.30948498437287675, - "grad_norm": 0.5929175019264221, + "grad_norm": 0.5945768356323242, "learning_rate": 4.806571884766952e-05, - "loss": 1.3678, + "loss": 1.3674, "step": 4555 }, { "epoch": 0.30982470444353855, - "grad_norm": 0.6229990720748901, + "grad_norm": 0.6225332021713257, "learning_rate": 4.8063595597227886e-05, - "loss": 1.3999, + "loss": 1.3995, "step": 4560 }, { "epoch": 0.3101644245142003, - "grad_norm": 0.6406978368759155, + "grad_norm": 0.6413206458091736, "learning_rate": 4.806147234678625e-05, - "loss": 1.4608, + "loss": 1.4604, "step": 4565 }, { "epoch": 0.3105041445848621, - "grad_norm": 0.596836268901825, + "grad_norm": 0.595911979675293, "learning_rate": 4.8059349096344614e-05, "loss": 1.3946, "step": 4570 }, { "epoch": 0.3108438646555238, - "grad_norm": 0.6312773823738098, + "grad_norm": 0.632885217666626, "learning_rate": 4.805722584590298e-05, - "loss": 1.3129, + "loss": 1.3128, "step": 4575 }, { "epoch": 0.3111835847261856, - "grad_norm": 0.5499838590621948, + "grad_norm": 0.5501091480255127, "learning_rate": 4.805510259546134e-05, - "loss": 1.4556, + "loss": 1.4558, "step": 4580 }, { "epoch": 0.3115233047968474, - "grad_norm": 0.5884018540382385, + "grad_norm": 0.583938479423523, "learning_rate": 4.8052979345019706e-05, - "loss": 1.4935, + "loss": 1.4931, "step": 4585 }, { "epoch": 0.31186302486750916, - "grad_norm": 0.5463150143623352, + "grad_norm": 0.5499637722969055, "learning_rate": 4.805085609457807e-05, - "loss": 1.373, + "loss": 1.3731, "step": 4590 }, { "epoch": 0.31220274493817096, - "grad_norm": 0.6012735366821289, + "grad_norm": 0.5993675589561462, "learning_rate": 4.8048732844136434e-05, - "loss": 1.4114, + "loss": 1.4111, "step": 4595 }, { "epoch": 0.3125424650088327, - "grad_norm": 0.6352843046188354, + "grad_norm": 0.6362263560295105, "learning_rate": 4.80466095936948e-05, - "loss": 1.3933, + "loss": 1.3928, "step": 4600 }, { "epoch": 0.3128821850794945, - "grad_norm": 0.5974364280700684, + "grad_norm": 0.5940797924995422, "learning_rate": 4.804448634325316e-05, - "loss": 1.3988, + "loss": 1.3984, "step": 4605 }, { "epoch": 0.3132219051501563, - "grad_norm": 0.6205219626426697, + "grad_norm": 0.615020751953125, "learning_rate": 4.8042363092811526e-05, - "loss": 1.3811, + "loss": 1.3818, "step": 4610 }, { "epoch": 0.31356162522081804, - "grad_norm": 0.5919428467750549, + "grad_norm": 0.5921596884727478, "learning_rate": 4.804023984236989e-05, - "loss": 1.3911, + "loss": 1.3917, "step": 4615 }, { "epoch": 0.31390134529147984, - "grad_norm": 0.59086674451828, + "grad_norm": 0.585711658000946, "learning_rate": 4.8038116591928254e-05, - "loss": 1.336, + "loss": 1.3356, "step": 4620 }, { "epoch": 0.3142410653621416, - "grad_norm": 0.5678482055664062, + "grad_norm": 0.5674652457237244, "learning_rate": 4.803599334148662e-05, - "loss": 1.3585, + "loss": 1.3584, "step": 4625 }, { "epoch": 0.3145807854328034, - "grad_norm": 0.6020316481590271, + "grad_norm": 0.6049450039863586, "learning_rate": 4.803387009104498e-05, - "loss": 1.3429, + "loss": 1.3432, "step": 4630 }, { "epoch": 0.3149205055034651, - "grad_norm": 0.6483088731765747, + "grad_norm": 0.6482475996017456, "learning_rate": 4.8031746840603346e-05, - "loss": 1.3824, + "loss": 1.3834, "step": 4635 }, { "epoch": 0.3152602255741269, - "grad_norm": 0.5800559520721436, + "grad_norm": 0.577788770198822, "learning_rate": 4.802962359016171e-05, - "loss": 1.3792, + "loss": 1.3796, "step": 4640 }, { "epoch": 0.3155999456447887, - "grad_norm": 0.529157280921936, + "grad_norm": 0.5292919874191284, "learning_rate": 4.8027500339720074e-05, - "loss": 1.454, + "loss": 1.4535, "step": 4645 }, { "epoch": 0.31593966571545046, - "grad_norm": 0.673725426197052, + "grad_norm": 0.6775003671646118, "learning_rate": 4.802537708927844e-05, - "loss": 1.3433, + "loss": 1.3441, "step": 4650 }, { "epoch": 0.31627938578611225, - "grad_norm": 0.5634777545928955, + "grad_norm": 0.5632394552230835, "learning_rate": 4.80232538388368e-05, - "loss": 1.408, + "loss": 1.4085, "step": 4655 }, { "epoch": 0.316619105856774, - "grad_norm": 0.6213387250900269, + "grad_norm": 0.6283419728279114, "learning_rate": 4.8021130588395166e-05, - "loss": 1.4259, + "loss": 1.426, "step": 4660 }, { "epoch": 0.3169588259274358, - "grad_norm": 0.6602832078933716, + "grad_norm": 0.6614745259284973, "learning_rate": 4.801900733795353e-05, - "loss": 1.3895, + "loss": 1.3894, "step": 4665 }, { "epoch": 0.3172985459980976, - "grad_norm": 0.6077116131782532, + "grad_norm": 0.6065006852149963, "learning_rate": 4.8016884087511894e-05, - "loss": 1.3893, + "loss": 1.3892, "step": 4670 }, { "epoch": 0.31763826606875933, - "grad_norm": 0.6477828621864319, + "grad_norm": 0.6503626704216003, "learning_rate": 4.801476083707026e-05, - "loss": 1.5709, + "loss": 1.5708, "step": 4675 }, { "epoch": 0.31797798613942113, - "grad_norm": 0.6097428798675537, + "grad_norm": 0.6123086214065552, "learning_rate": 4.801263758662862e-05, "loss": 1.3114, "step": 4680 }, { "epoch": 0.3183177062100829, - "grad_norm": 0.5983066558837891, + "grad_norm": 0.6000961661338806, "learning_rate": 4.8010514336186986e-05, - "loss": 1.5342, + "loss": 1.5345, "step": 4685 }, { "epoch": 0.31865742628074467, - "grad_norm": 0.5761337280273438, + "grad_norm": 0.5794433951377869, "learning_rate": 4.800839108574535e-05, - "loss": 1.4056, + "loss": 1.4064, "step": 4690 }, { "epoch": 0.31899714635140647, - "grad_norm": 0.5824152827262878, + "grad_norm": 0.5820723176002502, "learning_rate": 4.8006267835303714e-05, - "loss": 1.3726, + "loss": 1.373, "step": 4695 }, { "epoch": 0.3193368664220682, - "grad_norm": 0.5764522552490234, + "grad_norm": 0.5793818235397339, "learning_rate": 4.800414458486208e-05, - "loss": 1.3504, + "loss": 1.3501, "step": 4700 }, { "epoch": 0.31967658649273, - "grad_norm": 0.6585021615028381, + "grad_norm": 0.6611591577529907, "learning_rate": 4.8002021334420435e-05, - "loss": 1.3808, + "loss": 1.3812, "step": 4705 }, { "epoch": 0.32001630656339175, - "grad_norm": 0.6844765543937683, + "grad_norm": 0.6872620582580566, "learning_rate": 4.7999898083978806e-05, - "loss": 1.3319, + "loss": 1.332, "step": 4710 }, { "epoch": 0.32035602663405355, - "grad_norm": 0.6688312292098999, + "grad_norm": 0.6676487922668457, "learning_rate": 4.799777483353717e-05, - "loss": 1.3902, + "loss": 1.39, "step": 4715 }, { "epoch": 0.3206957467047153, - "grad_norm": 0.6278951168060303, + "grad_norm": 0.630530834197998, "learning_rate": 4.799565158309553e-05, - "loss": 1.4044, + "loss": 1.4042, "step": 4720 }, { "epoch": 0.3210354667753771, - "grad_norm": 0.7092724442481995, + "grad_norm": 0.7141803503036499, "learning_rate": 4.79935283326539e-05, - "loss": 1.4092, + "loss": 1.4088, "step": 4725 }, { "epoch": 0.3213751868460389, - "grad_norm": 0.5436230301856995, + "grad_norm": 0.542120099067688, "learning_rate": 4.799140508221226e-05, - "loss": 1.3973, + "loss": 1.3972, "step": 4730 }, { "epoch": 0.3217149069167006, - "grad_norm": 0.6359293460845947, + "grad_norm": 0.6312313079833984, "learning_rate": 4.798928183177062e-05, - "loss": 1.3636, + "loss": 1.3645, "step": 4735 }, { "epoch": 0.3220546269873624, - "grad_norm": 0.6571273803710938, + "grad_norm": 0.6562297940254211, "learning_rate": 4.798715858132899e-05, - "loss": 1.4008, + "loss": 1.4012, "step": 4740 }, { "epoch": 0.32239434705802417, - "grad_norm": 0.6457809209823608, + "grad_norm": 0.6460033655166626, "learning_rate": 4.7985035330887354e-05, - "loss": 1.3815, + "loss": 1.3816, "step": 4745 }, { "epoch": 0.32273406712868596, - "grad_norm": 0.7295517325401306, + "grad_norm": 0.7296239137649536, "learning_rate": 4.798291208044571e-05, - "loss": 1.4018, + "loss": 1.4013, "step": 4750 }, { "epoch": 0.32307378719934776, - "grad_norm": 0.598146378993988, + "grad_norm": 0.5994329452514648, "learning_rate": 4.798078883000408e-05, - "loss": 1.4444, + "loss": 1.445, "step": 4755 }, { "epoch": 0.3234135072700095, - "grad_norm": 0.5251185894012451, + "grad_norm": 0.5263556241989136, "learning_rate": 4.7978665579562446e-05, - "loss": 1.3763, + "loss": 1.3762, "step": 4760 }, { "epoch": 0.3237532273406713, - "grad_norm": 0.5642194747924805, + "grad_norm": 0.5676795840263367, "learning_rate": 4.79765423291208e-05, - "loss": 1.4491, + "loss": 1.4489, "step": 4765 }, { "epoch": 0.32409294741133304, - "grad_norm": 0.6763331890106201, + "grad_norm": 0.6814931631088257, "learning_rate": 4.7974419078679174e-05, - "loss": 1.393, + "loss": 1.3934, "step": 4770 }, { "epoch": 0.32443266748199484, - "grad_norm": 0.6245210766792297, + "grad_norm": 0.6255049705505371, "learning_rate": 4.797229582823753e-05, - "loss": 1.3899, + "loss": 1.3901, "step": 4775 }, { "epoch": 0.32477238755265664, - "grad_norm": 0.6132177114486694, + "grad_norm": 0.6123117208480835, "learning_rate": 4.7970172577795895e-05, - "loss": 1.4692, + "loss": 1.4696, "step": 4780 }, { "epoch": 0.3251121076233184, - "grad_norm": 0.6960394382476807, + "grad_norm": 0.6963241100311279, "learning_rate": 4.7968049327354266e-05, - "loss": 1.3672, + "loss": 1.367, "step": 4785 }, { "epoch": 0.3254518276939802, - "grad_norm": 0.6208332777023315, + "grad_norm": 0.6139285564422607, "learning_rate": 4.796592607691262e-05, - "loss": 1.3814, + "loss": 1.381, "step": 4790 }, { "epoch": 0.3257915477646419, - "grad_norm": 0.5507853627204895, + "grad_norm": 0.5506198406219482, "learning_rate": 4.796380282647099e-05, - "loss": 1.3566, + "loss": 1.3559, "step": 4795 }, { "epoch": 0.3261312678353037, - "grad_norm": 0.6600543856620789, + "grad_norm": 0.6621894240379333, "learning_rate": 4.796167957602936e-05, "loss": 1.44, "step": 4800 }, { "epoch": 0.32647098790596546, - "grad_norm": 0.638710081577301, + "grad_norm": 0.6410127878189087, "learning_rate": 4.7959556325587715e-05, - "loss": 1.3183, + "loss": 1.3178, "step": 4805 }, { "epoch": 0.32681070797662726, - "grad_norm": 0.6161012053489685, + "grad_norm": 0.614829957485199, "learning_rate": 4.795743307514608e-05, - "loss": 1.4818, + "loss": 1.4821, "step": 4810 }, { "epoch": 0.32715042804728905, - "grad_norm": 0.6707820296287537, + "grad_norm": 0.6711080074310303, "learning_rate": 4.795530982470445e-05, - "loss": 1.397, + "loss": 1.3969, "step": 4815 }, { "epoch": 0.3274901481179508, - "grad_norm": 0.5956471562385559, + "grad_norm": 0.5962970852851868, "learning_rate": 4.795318657426281e-05, - "loss": 1.3902, + "loss": 1.3907, "step": 4820 }, { "epoch": 0.3278298681886126, - "grad_norm": 0.4893324375152588, + "grad_norm": 0.48773762583732605, "learning_rate": 4.795106332382117e-05, - "loss": 1.4835, + "loss": 1.4834, "step": 4825 }, { "epoch": 0.32816958825927434, - "grad_norm": 0.6486138701438904, + "grad_norm": 0.6517683863639832, "learning_rate": 4.794894007337954e-05, - "loss": 1.2775, + "loss": 1.278, "step": 4830 }, { "epoch": 0.32850930832993613, - "grad_norm": 0.6149250864982605, + "grad_norm": 0.6137312054634094, "learning_rate": 4.79468168229379e-05, - "loss": 1.3381, + "loss": 1.3383, "step": 4835 }, { "epoch": 0.32884902840059793, - "grad_norm": 0.6293796300888062, + "grad_norm": 0.6289145946502686, "learning_rate": 4.794469357249626e-05, - "loss": 1.4361, + "loss": 1.4366, "step": 4840 }, { "epoch": 0.3291887484712597, - "grad_norm": 0.5803507566452026, + "grad_norm": 0.5792094469070435, "learning_rate": 4.7942570322054634e-05, - "loss": 1.4186, + "loss": 1.4181, "step": 4845 }, { "epoch": 0.32952846854192147, - "grad_norm": 0.6761592030525208, + "grad_norm": 0.6721256375312805, "learning_rate": 4.794044707161299e-05, "loss": 1.371, "step": 4850 }, { "epoch": 0.3298681886125832, - "grad_norm": 0.7355863451957703, + "grad_norm": 0.7336487770080566, "learning_rate": 4.7938323821171355e-05, - "loss": 1.2935, + "loss": 1.2929, "step": 4855 }, { "epoch": 0.330207908683245, - "grad_norm": 0.7543905377388, + "grad_norm": 0.7571314573287964, "learning_rate": 4.793620057072972e-05, - "loss": 1.3664, + "loss": 1.3661, "step": 4860 }, { "epoch": 0.3305476287539068, - "grad_norm": 0.6539553999900818, + "grad_norm": 0.6576473116874695, "learning_rate": 4.793407732028808e-05, - "loss": 1.3888, + "loss": 1.3889, "step": 4865 }, { "epoch": 0.33088734882456855, - "grad_norm": 0.5584560036659241, + "grad_norm": 0.5599566102027893, "learning_rate": 4.793195406984645e-05, - "loss": 1.4666, + "loss": 1.4667, "step": 4870 }, { "epoch": 0.33122706889523035, - "grad_norm": 0.5838732719421387, + "grad_norm": 0.581777036190033, "learning_rate": 4.792983081940481e-05, - "loss": 1.2935, + "loss": 1.2933, "step": 4875 }, { "epoch": 0.3315667889658921, - "grad_norm": 0.6014986634254456, + "grad_norm": 0.5999479293823242, "learning_rate": 4.7927707568963175e-05, "loss": 1.3325, "step": 4880 }, { "epoch": 0.3319065090365539, - "grad_norm": 0.571640133857727, + "grad_norm": 0.5716682076454163, "learning_rate": 4.792558431852154e-05, - "loss": 1.4879, + "loss": 1.4883, "step": 4885 }, { "epoch": 0.33224622910721563, - "grad_norm": 0.628282368183136, + "grad_norm": 0.6342703700065613, "learning_rate": 4.79234610680799e-05, - "loss": 1.3761, + "loss": 1.376, "step": 4890 }, { "epoch": 0.3325859491778774, - "grad_norm": 0.6059534549713135, + "grad_norm": 0.6060812473297119, "learning_rate": 4.792133781763827e-05, - "loss": 1.3459, + "loss": 1.346, "step": 4895 }, { "epoch": 0.3329256692485392, - "grad_norm": 0.588287353515625, + "grad_norm": 0.5847216844558716, "learning_rate": 4.791921456719663e-05, - "loss": 1.4798, + "loss": 1.4799, "step": 4900 }, { "epoch": 0.33326538931920097, - "grad_norm": 0.5479865670204163, + "grad_norm": 0.5480058789253235, "learning_rate": 4.7917091316754995e-05, - "loss": 1.3439, + "loss": 1.3442, "step": 4905 }, { "epoch": 0.33360510938986276, - "grad_norm": 0.6028078198432922, + "grad_norm": 0.6012808680534363, "learning_rate": 4.791496806631336e-05, - "loss": 1.3949, + "loss": 1.3947, "step": 4910 }, { "epoch": 0.3339448294605245, - "grad_norm": 0.6484723091125488, + "grad_norm": 0.6489436030387878, "learning_rate": 4.791284481587172e-05, "loss": 1.5038, "step": 4915 }, { "epoch": 0.3342845495311863, - "grad_norm": 0.629385769367218, + "grad_norm": 0.6316866278648376, "learning_rate": 4.791072156543009e-05, - "loss": 1.3792, + "loss": 1.3793, "step": 4920 }, { "epoch": 0.3346242696018481, - "grad_norm": 0.7283436059951782, + "grad_norm": 0.730665922164917, "learning_rate": 4.790859831498845e-05, - "loss": 1.4609, + "loss": 1.4612, "step": 4925 }, { "epoch": 0.33496398967250984, - "grad_norm": 0.6129261255264282, + "grad_norm": 0.6132574677467346, "learning_rate": 4.7906475064546815e-05, - "loss": 1.5769, + "loss": 1.5773, "step": 4930 }, { "epoch": 0.33530370974317164, - "grad_norm": 0.7182016372680664, + "grad_norm": 0.7169525623321533, "learning_rate": 4.790435181410518e-05, - "loss": 1.413, + "loss": 1.4129, "step": 4935 }, { "epoch": 0.3356434298138334, - "grad_norm": 0.5703354477882385, + "grad_norm": 0.5712332129478455, "learning_rate": 4.790222856366354e-05, - "loss": 1.4701, + "loss": 1.4708, "step": 4940 }, { "epoch": 0.3359831498844952, - "grad_norm": 0.5589520335197449, + "grad_norm": 0.5597065091133118, "learning_rate": 4.790010531322191e-05, "loss": 1.4668, "step": 4945 }, { "epoch": 0.336322869955157, - "grad_norm": 0.6707218885421753, + "grad_norm": 0.6731829047203064, "learning_rate": 4.789798206278027e-05, - "loss": 1.3746, + "loss": 1.374, "step": 4950 }, { "epoch": 0.3366625900258187, - "grad_norm": 0.5754794478416443, + "grad_norm": 0.5751031041145325, "learning_rate": 4.7895858812338635e-05, - "loss": 1.4444, + "loss": 1.4449, "step": 4955 }, { "epoch": 0.3370023100964805, - "grad_norm": 0.5749678611755371, + "grad_norm": 0.5820136070251465, "learning_rate": 4.7893735561897e-05, - "loss": 1.3014, + "loss": 1.3008, "step": 4960 }, { "epoch": 0.33734203016714226, - "grad_norm": 0.5664737224578857, + "grad_norm": 0.5675442814826965, "learning_rate": 4.789161231145536e-05, - "loss": 1.4334, + "loss": 1.4339, "step": 4965 }, { "epoch": 0.33768175023780406, - "grad_norm": 0.632757842540741, + "grad_norm": 0.6340497732162476, "learning_rate": 4.788948906101373e-05, - "loss": 1.4134, + "loss": 1.4136, "step": 4970 }, { "epoch": 0.3380214703084658, - "grad_norm": 0.5536924600601196, + "grad_norm": 0.5544374585151672, "learning_rate": 4.788736581057209e-05, - "loss": 1.4924, + "loss": 1.4916, "step": 4975 }, { "epoch": 0.3383611903791276, - "grad_norm": 0.5794201493263245, + "grad_norm": 0.5790332555770874, "learning_rate": 4.7885242560130455e-05, "loss": 1.38, "step": 4980 }, { "epoch": 0.3387009104497894, - "grad_norm": 0.6089941263198853, + "grad_norm": 0.6176489591598511, "learning_rate": 4.788311930968882e-05, - "loss": 1.3928, + "loss": 1.3934, "step": 4985 }, { "epoch": 0.33904063052045114, - "grad_norm": 0.6675647497177124, + "grad_norm": 0.6694954037666321, "learning_rate": 4.788099605924718e-05, - "loss": 1.3697, + "loss": 1.3702, "step": 4990 }, { "epoch": 0.33938035059111293, - "grad_norm": 0.6070053577423096, + "grad_norm": 0.6057488322257996, "learning_rate": 4.787887280880555e-05, - "loss": 1.4913, + "loss": 1.4914, "step": 4995 }, { "epoch": 0.3397200706617747, - "grad_norm": 0.538594126701355, + "grad_norm": 0.5366824269294739, "learning_rate": 4.787674955836391e-05, - "loss": 1.3906, + "loss": 1.3905, "step": 5000 }, { "epoch": 0.3400597907324365, - "grad_norm": 0.5597215294837952, + "grad_norm": 0.5644224286079407, "learning_rate": 4.7874626307922275e-05, - "loss": 1.4269, + "loss": 1.4278, "step": 5005 }, { "epoch": 0.34039951080309827, - "grad_norm": 0.6286970973014832, + "grad_norm": 0.6292764544487, "learning_rate": 4.787250305748064e-05, - "loss": 1.39, + "loss": 1.3901, "step": 5010 }, { "epoch": 0.34073923087376, - "grad_norm": 0.5263593792915344, + "grad_norm": 0.5256039500236511, "learning_rate": 4.7870379807039003e-05, - "loss": 1.356, + "loss": 1.3558, "step": 5015 }, { "epoch": 0.3410789509444218, - "grad_norm": 0.5326294898986816, + "grad_norm": 0.5306893587112427, "learning_rate": 4.786825655659737e-05, - "loss": 1.3172, + "loss": 1.3175, "step": 5020 }, { "epoch": 0.34141867101508355, - "grad_norm": 0.6011534929275513, + "grad_norm": 0.5975860953330994, "learning_rate": 4.786613330615573e-05, - "loss": 1.3477, + "loss": 1.3481, "step": 5025 }, { "epoch": 0.34175839108574535, - "grad_norm": 0.6430546641349792, + "grad_norm": 0.6439404487609863, "learning_rate": 4.7864010055714095e-05, - "loss": 1.3919, + "loss": 1.3918, "step": 5030 }, { "epoch": 0.34209811115640715, - "grad_norm": 0.6264625787734985, + "grad_norm": 0.6255616545677185, "learning_rate": 4.786188680527246e-05, - "loss": 1.3747, + "loss": 1.3752, "step": 5035 }, { "epoch": 0.3424378312270689, - "grad_norm": 0.6695388555526733, + "grad_norm": 0.6796082854270935, "learning_rate": 4.7859763554830823e-05, - "loss": 1.4466, + "loss": 1.4469, "step": 5040 }, { "epoch": 0.3427775512977307, - "grad_norm": 0.694168746471405, + "grad_norm": 0.6959269046783447, "learning_rate": 4.785764030438919e-05, - "loss": 1.4146, + "loss": 1.4145, "step": 5045 }, { "epoch": 0.34311727136839243, - "grad_norm": 0.616524338722229, + "grad_norm": 0.6152186393737793, "learning_rate": 4.785551705394755e-05, - "loss": 1.3124, + "loss": 1.3119, "step": 5050 }, { "epoch": 0.3434569914390542, - "grad_norm": 0.5950208902359009, + "grad_norm": 0.5967111587524414, "learning_rate": 4.7853393803505915e-05, - "loss": 1.4781, + "loss": 1.4778, "step": 5055 }, { "epoch": 0.34379671150971597, - "grad_norm": 0.5522637963294983, + "grad_norm": 0.5502579212188721, "learning_rate": 4.785127055306427e-05, - "loss": 1.2603, + "loss": 1.2597, "step": 5060 }, { "epoch": 0.34413643158037777, - "grad_norm": 0.6136479377746582, + "grad_norm": 0.612746000289917, "learning_rate": 4.7849147302622643e-05, - "loss": 1.2652, + "loss": 1.2645, "step": 5065 }, { "epoch": 0.34447615165103956, - "grad_norm": 0.603760302066803, + "grad_norm": 0.6044663786888123, "learning_rate": 4.784702405218101e-05, - "loss": 1.4905, + "loss": 1.4913, "step": 5070 }, { "epoch": 0.3448158717217013, - "grad_norm": 0.6027374863624573, + "grad_norm": 0.6014927625656128, "learning_rate": 4.7844900801739365e-05, - "loss": 1.357, + "loss": 1.3572, "step": 5075 }, { "epoch": 0.3451555917923631, - "grad_norm": 0.6933466792106628, + "grad_norm": 0.6972743272781372, "learning_rate": 4.7842777551297735e-05, - "loss": 1.4252, + "loss": 1.4247, "step": 5080 }, { "epoch": 0.34549531186302485, - "grad_norm": 0.5912922024726868, + "grad_norm": 0.5933302044868469, "learning_rate": 4.78406543008561e-05, "loss": 1.3562, "step": 5085 }, { "epoch": 0.34583503193368664, - "grad_norm": 0.8580370545387268, + "grad_norm": 0.8374489545822144, "learning_rate": 4.783853105041446e-05, "loss": 1.3646, "step": 5090 }, { "epoch": 0.34617475200434844, - "grad_norm": 0.5268974304199219, + "grad_norm": 0.5257875323295593, "learning_rate": 4.783640779997283e-05, - "loss": 1.4347, + "loss": 1.4353, "step": 5095 }, { "epoch": 0.3465144720750102, - "grad_norm": 0.6514886617660522, + "grad_norm": 0.6514118313789368, "learning_rate": 4.783428454953119e-05, - "loss": 1.3731, + "loss": 1.3736, "step": 5100 }, { "epoch": 0.346854192145672, - "grad_norm": 0.6381573677062988, + "grad_norm": 0.638463020324707, "learning_rate": 4.783216129908955e-05, - "loss": 1.4298, + "loss": 1.4304, "step": 5105 }, { "epoch": 0.3471939122163337, - "grad_norm": 0.5856737494468689, + "grad_norm": 0.58790522813797, "learning_rate": 4.783003804864792e-05, - "loss": 1.3209, + "loss": 1.3211, "step": 5110 }, { "epoch": 0.3475336322869955, - "grad_norm": 0.670164167881012, + "grad_norm": 0.6760638952255249, "learning_rate": 4.7827914798206283e-05, - "loss": 1.3555, + "loss": 1.3552, "step": 5115 }, { "epoch": 0.3478733523576573, - "grad_norm": 0.6281108260154724, + "grad_norm": 0.6283975839614868, "learning_rate": 4.782579154776464e-05, - "loss": 1.4509, + "loss": 1.4507, "step": 5120 }, { "epoch": 0.34821307242831906, - "grad_norm": 0.5642656683921814, + "grad_norm": 0.5674806833267212, "learning_rate": 4.782366829732301e-05, - "loss": 1.5178, + "loss": 1.5176, "step": 5125 }, { "epoch": 0.34855279249898086, - "grad_norm": 0.543973445892334, + "grad_norm": 0.5445407629013062, "learning_rate": 4.782154504688137e-05, - "loss": 1.3711, + "loss": 1.3712, "step": 5130 }, { "epoch": 0.3488925125696426, - "grad_norm": 0.5834305882453918, + "grad_norm": 0.5807813405990601, "learning_rate": 4.781942179643973e-05, - "loss": 1.4175, + "loss": 1.4169, "step": 5135 }, { "epoch": 0.3492322326403044, - "grad_norm": 0.5386638641357422, + "grad_norm": 0.5430293083190918, "learning_rate": 4.7817298545998103e-05, - "loss": 1.3985, + "loss": 1.3979, "step": 5140 }, { "epoch": 0.34957195271096614, - "grad_norm": 0.5610645413398743, + "grad_norm": 0.5600992441177368, "learning_rate": 4.781517529555646e-05, - "loss": 1.3601, + "loss": 1.3603, "step": 5145 }, { "epoch": 0.34991167278162794, - "grad_norm": 0.5059294700622559, + "grad_norm": 0.5062242150306702, "learning_rate": 4.7813052045114825e-05, - "loss": 1.3197, + "loss": 1.32, "step": 5150 }, { "epoch": 0.35025139285228973, - "grad_norm": 0.588741660118103, + "grad_norm": 0.5865479111671448, "learning_rate": 4.7810928794673196e-05, - "loss": 1.4182, + "loss": 1.4181, "step": 5155 }, { "epoch": 0.3505911129229515, - "grad_norm": 0.6061328649520874, + "grad_norm": 0.6084330081939697, "learning_rate": 4.780880554423155e-05, - "loss": 1.4193, + "loss": 1.4196, "step": 5160 }, { "epoch": 0.3509308329936133, - "grad_norm": 0.6105550527572632, + "grad_norm": 0.6114481091499329, "learning_rate": 4.780668229378992e-05, - "loss": 1.3509, + "loss": 1.3506, "step": 5165 }, { "epoch": 0.351270553064275, - "grad_norm": 0.5791309475898743, + "grad_norm": 0.5776077508926392, "learning_rate": 4.780455904334829e-05, - "loss": 1.4309, + "loss": 1.4305, "step": 5170 }, { "epoch": 0.3516102731349368, - "grad_norm": 0.6164483428001404, + "grad_norm": 0.6255431771278381, "learning_rate": 4.7802435792906645e-05, - "loss": 1.3154, + "loss": 1.3156, "step": 5175 }, { "epoch": 0.3519499932055986, - "grad_norm": 0.5944452881813049, + "grad_norm": 0.5948196053504944, "learning_rate": 4.780031254246501e-05, - "loss": 1.3854, + "loss": 1.3858, "step": 5180 }, { "epoch": 0.35228971327626035, - "grad_norm": 0.624857485294342, + "grad_norm": 0.6264603137969971, "learning_rate": 4.779818929202338e-05, - "loss": 1.386, + "loss": 1.3857, "step": 5185 }, { "epoch": 0.35262943334692215, - "grad_norm": 0.6018693447113037, + "grad_norm": 0.6026751399040222, "learning_rate": 4.779606604158174e-05, - "loss": 1.4586, + "loss": 1.459, "step": 5190 }, { "epoch": 0.3529691534175839, - "grad_norm": 0.6194534301757812, + "grad_norm": 0.6205053329467773, "learning_rate": 4.77939427911401e-05, - "loss": 1.335, + "loss": 1.3346, "step": 5195 }, { "epoch": 0.3533088734882457, - "grad_norm": 0.701894998550415, + "grad_norm": 0.7030936479568481, "learning_rate": 4.7791819540698465e-05, - "loss": 1.4342, + "loss": 1.4335, "step": 5200 }, { "epoch": 0.3536485935589075, - "grad_norm": 0.607342541217804, + "grad_norm": 0.6005493402481079, "learning_rate": 4.778969629025683e-05, - "loss": 1.3867, + "loss": 1.3869, "step": 5205 }, { "epoch": 0.35398831362956923, - "grad_norm": 0.6307671666145325, + "grad_norm": 0.6331180930137634, "learning_rate": 4.778757303981519e-05, - "loss": 1.3592, + "loss": 1.3599, "step": 5210 }, { "epoch": 0.354328033700231, - "grad_norm": 0.6012018322944641, + "grad_norm": 0.6039384007453918, "learning_rate": 4.778544978937356e-05, - "loss": 1.4269, + "loss": 1.4274, "step": 5215 }, { "epoch": 0.35466775377089277, - "grad_norm": 0.5351936221122742, + "grad_norm": 0.5330010652542114, "learning_rate": 4.778332653893192e-05, - "loss": 1.5335, + "loss": 1.5336, "step": 5220 }, { "epoch": 0.35500747384155457, - "grad_norm": 0.6091983914375305, + "grad_norm": 0.6083765625953674, "learning_rate": 4.7781203288490285e-05, - "loss": 1.3533, + "loss": 1.3535, "step": 5225 }, { "epoch": 0.3553471939122163, - "grad_norm": 0.6138412952423096, + "grad_norm": 0.6125687956809998, "learning_rate": 4.777908003804865e-05, - "loss": 1.4093, + "loss": 1.4095, "step": 5230 }, { "epoch": 0.3556869139828781, - "grad_norm": 0.6481094360351562, + "grad_norm": 0.6483888626098633, "learning_rate": 4.777695678760701e-05, - "loss": 1.434, + "loss": 1.4338, "step": 5235 }, { "epoch": 0.3560266340535399, - "grad_norm": 0.5672658085823059, + "grad_norm": 0.567366361618042, "learning_rate": 4.777483353716538e-05, - "loss": 1.4019, + "loss": 1.4011, "step": 5240 }, { "epoch": 0.35636635412420165, - "grad_norm": 0.6443695425987244, + "grad_norm": 0.6421518325805664, "learning_rate": 4.777271028672374e-05, - "loss": 1.3913, + "loss": 1.3908, "step": 5245 }, { "epoch": 0.35670607419486344, - "grad_norm": 0.6255459189414978, + "grad_norm": 0.6260653734207153, "learning_rate": 4.7770587036282105e-05, - "loss": 1.5109, + "loss": 1.5108, "step": 5250 }, { "epoch": 0.3570457942655252, - "grad_norm": 0.48603206872940063, + "grad_norm": 0.48495861887931824, "learning_rate": 4.776846378584047e-05, - "loss": 1.361, + "loss": 1.3607, "step": 5255 }, { "epoch": 0.357385514336187, - "grad_norm": 0.5364346504211426, + "grad_norm": 0.5363582968711853, "learning_rate": 4.776634053539883e-05, - "loss": 1.4004, + "loss": 1.3993, "step": 5260 }, { "epoch": 0.3577252344068488, - "grad_norm": 0.6334765553474426, + "grad_norm": 0.6344860792160034, "learning_rate": 4.77642172849572e-05, - "loss": 1.2438, + "loss": 1.2442, "step": 5265 }, { "epoch": 0.3580649544775105, - "grad_norm": 0.6408407092094421, + "grad_norm": 0.6419795751571655, "learning_rate": 4.776209403451556e-05, - "loss": 1.4691, + "loss": 1.4692, "step": 5270 }, { "epoch": 0.3584046745481723, - "grad_norm": 0.6182148456573486, + "grad_norm": 0.6224852800369263, "learning_rate": 4.7759970784073925e-05, - "loss": 1.5037, + "loss": 1.5036, "step": 5275 }, { "epoch": 0.35874439461883406, - "grad_norm": 0.5836244821548462, + "grad_norm": 0.5826894640922546, "learning_rate": 4.775784753363229e-05, - "loss": 1.4265, + "loss": 1.4262, "step": 5280 }, { "epoch": 0.35908411468949586, - "grad_norm": 0.6044645309448242, + "grad_norm": 0.6049767732620239, "learning_rate": 4.775572428319065e-05, - "loss": 1.2618, + "loss": 1.2621, "step": 5285 }, { "epoch": 0.35942383476015766, - "grad_norm": 0.61807781457901, + "grad_norm": 0.6157771944999695, "learning_rate": 4.775360103274902e-05, - "loss": 1.3845, + "loss": 1.384, "step": 5290 }, { "epoch": 0.3597635548308194, - "grad_norm": 0.554994523525238, + "grad_norm": 0.5538929104804993, "learning_rate": 4.775147778230738e-05, - "loss": 1.4331, + "loss": 1.4333, "step": 5295 }, { "epoch": 0.3601032749014812, - "grad_norm": 0.645104706287384, + "grad_norm": 0.6432947516441345, "learning_rate": 4.7749354531865745e-05, - "loss": 1.3954, + "loss": 1.3953, "step": 5300 }, { "epoch": 0.36044299497214294, - "grad_norm": 0.6977760791778564, + "grad_norm": 0.6998238563537598, "learning_rate": 4.774723128142411e-05, "loss": 1.4616, "step": 5305 }, { "epoch": 0.36078271504280474, - "grad_norm": 0.6513646841049194, + "grad_norm": 0.6479151844978333, "learning_rate": 4.774510803098247e-05, - "loss": 1.3543, + "loss": 1.3539, "step": 5310 }, { "epoch": 0.3611224351134665, - "grad_norm": 0.5543293356895447, + "grad_norm": 0.5551134943962097, "learning_rate": 4.774298478054084e-05, - "loss": 1.4529, + "loss": 1.4534, "step": 5315 }, { "epoch": 0.3614621551841283, - "grad_norm": 0.6506486535072327, + "grad_norm": 0.6508130431175232, "learning_rate": 4.77408615300992e-05, - "loss": 1.3163, + "loss": 1.3164, "step": 5320 }, { "epoch": 0.3618018752547901, - "grad_norm": 0.6141875982284546, + "grad_norm": 0.6130679845809937, "learning_rate": 4.7738738279657565e-05, - "loss": 1.4583, + "loss": 1.4579, "step": 5325 }, { "epoch": 0.3621415953254518, - "grad_norm": 0.6069737672805786, + "grad_norm": 0.6091111898422241, "learning_rate": 4.773661502921593e-05, - "loss": 1.3349, + "loss": 1.334, "step": 5330 }, { "epoch": 0.3624813153961136, - "grad_norm": 0.6170814037322998, + "grad_norm": 0.6193397045135498, "learning_rate": 4.773449177877429e-05, - "loss": 1.4384, + "loss": 1.4388, "step": 5335 }, { "epoch": 0.36282103546677535, - "grad_norm": 0.6338891386985779, + "grad_norm": 0.6356204748153687, "learning_rate": 4.773236852833266e-05, "loss": 1.4742, "step": 5340 }, { "epoch": 0.36316075553743715, - "grad_norm": 0.5873185396194458, + "grad_norm": 0.588175356388092, "learning_rate": 4.773024527789102e-05, - "loss": 1.3871, + "loss": 1.3875, "step": 5345 }, { "epoch": 0.36350047560809895, - "grad_norm": 0.6128943562507629, + "grad_norm": 0.6111719012260437, "learning_rate": 4.7728122027449385e-05, - "loss": 1.364, + "loss": 1.3644, "step": 5350 }, { "epoch": 0.3638401956787607, - "grad_norm": 0.6030228734016418, + "grad_norm": 0.6030589938163757, "learning_rate": 4.772599877700775e-05, - "loss": 1.4258, + "loss": 1.4255, "step": 5355 }, { "epoch": 0.3641799157494225, - "grad_norm": 0.6233001947402954, + "grad_norm": 0.6208328604698181, "learning_rate": 4.772387552656611e-05, "loss": 1.2788, "step": 5360 }, { "epoch": 0.36451963582008423, - "grad_norm": 0.6130266785621643, + "grad_norm": 0.6149685978889465, "learning_rate": 4.772175227612448e-05, - "loss": 1.3763, + "loss": 1.3764, "step": 5365 }, { "epoch": 0.36485935589074603, - "grad_norm": 0.6724543571472168, + "grad_norm": 0.6742793917655945, "learning_rate": 4.771962902568284e-05, - "loss": 1.3895, + "loss": 1.3896, "step": 5370 }, { "epoch": 0.3651990759614078, - "grad_norm": 0.57244473695755, + "grad_norm": 0.5725171566009521, "learning_rate": 4.7717505775241205e-05, - "loss": 1.4469, + "loss": 1.4466, "step": 5375 }, { "epoch": 0.36553879603206957, - "grad_norm": 0.619618833065033, + "grad_norm": 0.6239153146743774, "learning_rate": 4.771538252479957e-05, - "loss": 1.413, + "loss": 1.4129, "step": 5380 }, { "epoch": 0.36587851610273137, - "grad_norm": 0.5769147276878357, + "grad_norm": 0.5756824016571045, "learning_rate": 4.771325927435793e-05, - "loss": 1.3105, + "loss": 1.3099, "step": 5385 }, { "epoch": 0.3662182361733931, - "grad_norm": 0.5992787480354309, + "grad_norm": 0.5991590619087219, "learning_rate": 4.77111360239163e-05, - "loss": 1.328, + "loss": 1.3289, "step": 5390 }, { "epoch": 0.3665579562440549, - "grad_norm": 0.528085470199585, + "grad_norm": 0.5290319323539734, "learning_rate": 4.770901277347466e-05, "loss": 1.3521, "step": 5395 }, { "epoch": 0.36689767631471665, - "grad_norm": 0.6295778751373291, + "grad_norm": 0.6290856003761292, "learning_rate": 4.770688952303302e-05, - "loss": 1.4191, + "loss": 1.42, "step": 5400 }, { "epoch": 0.36723739638537845, - "grad_norm": 0.7183342576026917, + "grad_norm": 0.7184908986091614, "learning_rate": 4.770476627259139e-05, - "loss": 1.3113, + "loss": 1.3112, "step": 5405 }, { "epoch": 0.36757711645604024, - "grad_norm": 0.6314020752906799, + "grad_norm": 0.6327075958251953, "learning_rate": 4.770264302214975e-05, - "loss": 1.3726, + "loss": 1.3725, "step": 5410 }, { "epoch": 0.367916836526702, - "grad_norm": 0.6158671975135803, + "grad_norm": 0.616466224193573, "learning_rate": 4.770051977170811e-05, - "loss": 1.4362, + "loss": 1.4366, "step": 5415 }, { "epoch": 0.3682565565973638, - "grad_norm": 0.5616465210914612, + "grad_norm": 0.5604059100151062, "learning_rate": 4.769839652126648e-05, - "loss": 1.3334, + "loss": 1.3339, "step": 5420 }, { "epoch": 0.3685962766680255, - "grad_norm": 0.585128664970398, + "grad_norm": 0.5854315757751465, "learning_rate": 4.7696273270824845e-05, - "loss": 1.4112, + "loss": 1.4113, "step": 5425 }, { "epoch": 0.3689359967386873, - "grad_norm": 0.6653167009353638, + "grad_norm": 0.6522176265716553, "learning_rate": 4.76941500203832e-05, - "loss": 1.3972, + "loss": 1.3976, "step": 5430 }, { "epoch": 0.3692757168093491, - "grad_norm": 0.6336387395858765, + "grad_norm": 0.6328961849212646, "learning_rate": 4.769202676994157e-05, - "loss": 1.3333, + "loss": 1.3331, "step": 5435 }, { "epoch": 0.36961543688001086, - "grad_norm": 0.5875614881515503, + "grad_norm": 0.5880500674247742, "learning_rate": 4.768990351949994e-05, - "loss": 1.3727, + "loss": 1.3733, "step": 5440 }, { "epoch": 0.36995515695067266, - "grad_norm": 0.650518000125885, + "grad_norm": 0.6528835296630859, "learning_rate": 4.7687780269058294e-05, - "loss": 1.4229, + "loss": 1.423, "step": 5445 }, { "epoch": 0.3702948770213344, - "grad_norm": 0.6364461779594421, + "grad_norm": 0.6349461674690247, "learning_rate": 4.7685657018616665e-05, - "loss": 1.365, + "loss": 1.3649, "step": 5450 }, { "epoch": 0.3706345970919962, - "grad_norm": 0.5882337689399719, + "grad_norm": 0.589169442653656, "learning_rate": 4.768353376817503e-05, - "loss": 1.4432, + "loss": 1.4427, "step": 5455 }, { "epoch": 0.370974317162658, - "grad_norm": 0.627549409866333, + "grad_norm": 0.6249655485153198, "learning_rate": 4.7681410517733386e-05, - "loss": 1.4261, + "loss": 1.4259, "step": 5460 }, { "epoch": 0.37131403723331974, - "grad_norm": 0.6792019605636597, + "grad_norm": 0.6778886318206787, "learning_rate": 4.767928726729176e-05, - "loss": 1.4128, + "loss": 1.4127, "step": 5465 }, { "epoch": 0.37165375730398154, - "grad_norm": 0.6191064715385437, + "grad_norm": 0.6213449835777283, "learning_rate": 4.767716401685012e-05, - "loss": 1.4145, + "loss": 1.4143, "step": 5470 }, { "epoch": 0.3719934773746433, - "grad_norm": 0.5747395753860474, + "grad_norm": 0.574975848197937, "learning_rate": 4.767504076640848e-05, "loss": 1.339, "step": 5475 }, { "epoch": 0.3723331974453051, - "grad_norm": 0.546324610710144, + "grad_norm": 0.5462049245834351, "learning_rate": 4.767291751596685e-05, "loss": 1.3541, "step": 5480 }, { "epoch": 0.3726729175159668, - "grad_norm": 0.6170159578323364, + "grad_norm": 0.6181663870811462, "learning_rate": 4.7670794265525206e-05, - "loss": 1.4755, + "loss": 1.4756, "step": 5485 }, { "epoch": 0.3730126375866286, - "grad_norm": 0.5728192925453186, + "grad_norm": 0.5738523006439209, "learning_rate": 4.766867101508357e-05, - "loss": 1.388, + "loss": 1.3883, "step": 5490 }, { "epoch": 0.3733523576572904, - "grad_norm": 0.6069906949996948, + "grad_norm": 0.609873354434967, "learning_rate": 4.766654776464194e-05, "loss": 1.3868, "step": 5495 }, { "epoch": 0.37369207772795215, - "grad_norm": 0.6560340523719788, + "grad_norm": 0.6587361097335815, "learning_rate": 4.76644245142003e-05, - "loss": 1.3731, + "loss": 1.3724, "step": 5500 }, { "epoch": 0.37403179779861395, - "grad_norm": 0.5605098605155945, + "grad_norm": 0.5617695450782776, "learning_rate": 4.766230126375866e-05, - "loss": 1.4622, + "loss": 1.462, "step": 5505 }, { "epoch": 0.3743715178692757, - "grad_norm": 0.5942585468292236, + "grad_norm": 0.5982805490493774, "learning_rate": 4.766017801331703e-05, - "loss": 1.3181, + "loss": 1.3182, "step": 5510 }, { "epoch": 0.3747112379399375, - "grad_norm": 0.6005749702453613, + "grad_norm": 0.5999040007591248, "learning_rate": 4.765805476287539e-05, - "loss": 1.463, + "loss": 1.4629, "step": 5515 }, { "epoch": 0.3750509580105993, - "grad_norm": 0.5901667475700378, + "grad_norm": 0.5922342538833618, "learning_rate": 4.7655931512433754e-05, - "loss": 1.3492, + "loss": 1.3494, "step": 5520 }, { "epoch": 0.37539067808126103, - "grad_norm": 0.6109676361083984, + "grad_norm": 0.6112759113311768, "learning_rate": 4.7653808261992125e-05, - "loss": 1.3842, + "loss": 1.3835, "step": 5525 }, { "epoch": 0.37573039815192283, - "grad_norm": 0.5087992548942566, + "grad_norm": 0.5034341216087341, "learning_rate": 4.765168501155048e-05, - "loss": 1.4801, + "loss": 1.4809, "step": 5530 }, { "epoch": 0.37607011822258457, - "grad_norm": 0.6232153177261353, + "grad_norm": 0.6231547594070435, "learning_rate": 4.7649561761108846e-05, - "loss": 1.4592, + "loss": 1.4594, "step": 5535 }, { "epoch": 0.37640983829324637, - "grad_norm": 0.5623194575309753, + "grad_norm": 0.5642001628875732, "learning_rate": 4.764743851066722e-05, - "loss": 1.324, + "loss": 1.3243, "step": 5540 }, { "epoch": 0.37674955836390817, - "grad_norm": 0.5809667706489563, + "grad_norm": 0.5763465166091919, "learning_rate": 4.7645315260225574e-05, - "loss": 1.4363, + "loss": 1.4359, "step": 5545 }, { "epoch": 0.3770892784345699, - "grad_norm": 0.6279406547546387, + "grad_norm": 0.6270535588264465, "learning_rate": 4.764319200978394e-05, - "loss": 1.4373, + "loss": 1.4367, "step": 5550 }, { "epoch": 0.3774289985052317, - "grad_norm": 0.5912439823150635, + "grad_norm": 0.5896164178848267, "learning_rate": 4.76410687593423e-05, "loss": 1.3755, "step": 5555 }, { "epoch": 0.37776871857589345, - "grad_norm": 0.667495608329773, + "grad_norm": 0.6713253855705261, "learning_rate": 4.7638945508900666e-05, - "loss": 1.3669, + "loss": 1.3667, "step": 5560 }, { "epoch": 0.37810843864655524, - "grad_norm": 0.6271250247955322, + "grad_norm": 0.6284233927726746, "learning_rate": 4.763682225845903e-05, - "loss": 1.3308, + "loss": 1.331, "step": 5565 }, { "epoch": 0.378448158717217, - "grad_norm": 0.6690099835395813, + "grad_norm": 0.6684582233428955, "learning_rate": 4.7634699008017394e-05, "loss": 1.4808, "step": 5570 }, { "epoch": 0.3787878787878788, - "grad_norm": 0.6592546701431274, + "grad_norm": 0.6624757051467896, "learning_rate": 4.763257575757576e-05, - "loss": 1.4697, + "loss": 1.4692, "step": 5575 }, { "epoch": 0.3791275988585406, - "grad_norm": 0.6392249464988708, + "grad_norm": 0.6392044425010681, "learning_rate": 4.763045250713412e-05, - "loss": 1.4297, + "loss": 1.43, "step": 5580 }, { "epoch": 0.3794673189292023, - "grad_norm": 0.6348369717597961, + "grad_norm": 0.6301529407501221, "learning_rate": 4.7628329256692486e-05, - "loss": 1.4131, + "loss": 1.413, "step": 5585 }, { "epoch": 0.3798070389998641, - "grad_norm": 0.5649213790893555, + "grad_norm": 0.5599331855773926, "learning_rate": 4.762620600625085e-05, - "loss": 1.3963, + "loss": 1.3964, "step": 5590 }, { "epoch": 0.38014675907052586, - "grad_norm": 0.5986794233322144, + "grad_norm": 0.6008949279785156, "learning_rate": 4.7624082755809214e-05, - "loss": 1.3051, + "loss": 1.3057, "step": 5595 }, { "epoch": 0.38048647914118766, - "grad_norm": 0.6087474822998047, + "grad_norm": 0.6099117994308472, "learning_rate": 4.762195950536758e-05, - "loss": 1.5329, + "loss": 1.5326, "step": 5600 }, { "epoch": 0.38082619921184946, - "grad_norm": 0.6305631399154663, + "grad_norm": 0.6330830454826355, "learning_rate": 4.761983625492594e-05, - "loss": 1.4441, + "loss": 1.4446, "step": 5605 }, { "epoch": 0.3811659192825112, - "grad_norm": 0.6152334213256836, + "grad_norm": 0.6134799122810364, "learning_rate": 4.7617713004484306e-05, - "loss": 1.42, + "loss": 1.4191, "step": 5610 }, { "epoch": 0.381505639353173, - "grad_norm": 0.5399892926216125, + "grad_norm": 0.5395935773849487, "learning_rate": 4.761558975404267e-05, - "loss": 1.366, + "loss": 1.3665, "step": 5615 }, { "epoch": 0.38184535942383474, - "grad_norm": 0.6757349371910095, + "grad_norm": 0.6795154213905334, "learning_rate": 4.7613466503601034e-05, - "loss": 1.4201, + "loss": 1.4199, "step": 5620 }, { "epoch": 0.38218507949449654, - "grad_norm": 0.6649751663208008, + "grad_norm": 0.6648789048194885, "learning_rate": 4.76113432531594e-05, - "loss": 1.297, + "loss": 1.2966, "step": 5625 }, { "epoch": 0.38252479956515834, - "grad_norm": 0.6459630131721497, + "grad_norm": 0.6486769914627075, "learning_rate": 4.760922000271776e-05, - "loss": 1.4591, + "loss": 1.4599, "step": 5630 }, { "epoch": 0.3828645196358201, - "grad_norm": 0.6597029566764832, + "grad_norm": 0.6407860517501831, "learning_rate": 4.7607096752276126e-05, - "loss": 1.3731, + "loss": 1.3736, "step": 5635 }, { "epoch": 0.3832042397064819, - "grad_norm": 0.610629141330719, + "grad_norm": 0.6121465563774109, "learning_rate": 4.760497350183449e-05, - "loss": 1.4348, + "loss": 1.435, "step": 5640 }, { "epoch": 0.3835439597771436, - "grad_norm": 0.6628695130348206, + "grad_norm": 0.6615270972251892, "learning_rate": 4.7602850251392854e-05, - "loss": 1.4356, + "loss": 1.4359, "step": 5645 }, { "epoch": 0.3838836798478054, - "grad_norm": 0.5976195931434631, + "grad_norm": 0.5996428728103638, "learning_rate": 4.760072700095122e-05, - "loss": 1.3779, + "loss": 1.3785, "step": 5650 }, { "epoch": 0.38422339991846716, - "grad_norm": 0.5131832361221313, + "grad_norm": 0.5110504031181335, "learning_rate": 4.759860375050958e-05, - "loss": 1.2832, + "loss": 1.2837, "step": 5655 }, { "epoch": 0.38456311998912895, - "grad_norm": 0.5103899836540222, + "grad_norm": 0.5070738792419434, "learning_rate": 4.7596480500067946e-05, - "loss": 1.3908, + "loss": 1.3901, "step": 5660 }, { "epoch": 0.38490284005979075, - "grad_norm": 0.5472924113273621, + "grad_norm": 0.5480765104293823, "learning_rate": 4.759435724962631e-05, - "loss": 1.2945, + "loss": 1.2951, "step": 5665 }, { "epoch": 0.3852425601304525, - "grad_norm": 0.5743058323860168, + "grad_norm": 0.5790027379989624, "learning_rate": 4.7592233999184674e-05, - "loss": 1.3783, + "loss": 1.3785, "step": 5670 }, { "epoch": 0.3855822802011143, - "grad_norm": 0.6685713529586792, + "grad_norm": 0.6676103472709656, "learning_rate": 4.759011074874304e-05, - "loss": 1.404, + "loss": 1.4043, "step": 5675 }, { "epoch": 0.38592200027177603, - "grad_norm": 0.6060322523117065, + "grad_norm": 0.6065981984138489, "learning_rate": 4.75879874983014e-05, - "loss": 1.3298, + "loss": 1.3301, "step": 5680 }, { "epoch": 0.38626172034243783, - "grad_norm": 0.5998486876487732, + "grad_norm": 0.6018291711807251, "learning_rate": 4.7585864247859766e-05, - "loss": 1.4065, + "loss": 1.4055, "step": 5685 }, { "epoch": 0.38660144041309963, - "grad_norm": 0.6170439124107361, + "grad_norm": 0.6162451505661011, "learning_rate": 4.758374099741813e-05, - "loss": 1.5164, + "loss": 1.5158, "step": 5690 }, { "epoch": 0.38694116048376137, - "grad_norm": 0.5666913390159607, + "grad_norm": 0.5658090710639954, "learning_rate": 4.7581617746976494e-05, "loss": 1.3716, "step": 5695 }, { "epoch": 0.38728088055442317, - "grad_norm": 0.6925976872444153, + "grad_norm": 0.6854243278503418, "learning_rate": 4.757949449653486e-05, - "loss": 1.3122, + "loss": 1.3124, "step": 5700 }, { "epoch": 0.3876206006250849, - "grad_norm": 0.5945137739181519, + "grad_norm": 0.5956252813339233, "learning_rate": 4.757737124609322e-05, - "loss": 1.4064, + "loss": 1.4062, "step": 5705 }, { "epoch": 0.3879603206957467, - "grad_norm": 0.644679844379425, + "grad_norm": 0.64044189453125, "learning_rate": 4.7575247995651586e-05, - "loss": 1.3737, + "loss": 1.374, "step": 5710 }, { "epoch": 0.3883000407664085, - "grad_norm": 0.5794373750686646, + "grad_norm": 0.578926146030426, "learning_rate": 4.757312474520995e-05, - "loss": 1.3891, + "loss": 1.3896, "step": 5715 }, { "epoch": 0.38863976083707025, - "grad_norm": 0.6075679659843445, + "grad_norm": 0.6070773601531982, "learning_rate": 4.7571001494768314e-05, - "loss": 1.3963, + "loss": 1.3962, "step": 5720 }, { "epoch": 0.38897948090773204, - "grad_norm": 0.5855366587638855, + "grad_norm": 0.5850543975830078, "learning_rate": 4.756887824432668e-05, "loss": 1.3753, "step": 5725 }, { "epoch": 0.3893192009783938, - "grad_norm": 0.598223090171814, + "grad_norm": 0.5980214476585388, "learning_rate": 4.756675499388504e-05, - "loss": 1.3687, + "loss": 1.3694, "step": 5730 }, { "epoch": 0.3896589210490556, - "grad_norm": 0.6207942366600037, + "grad_norm": 0.6222603917121887, "learning_rate": 4.7564631743443406e-05, - "loss": 1.3976, + "loss": 1.3984, "step": 5735 }, { "epoch": 0.3899986411197173, - "grad_norm": 0.5999290943145752, + "grad_norm": 0.6029056310653687, "learning_rate": 4.756250849300177e-05, - "loss": 1.4766, + "loss": 1.476, "step": 5740 }, { "epoch": 0.3903383611903791, - "grad_norm": 0.6329728960990906, + "grad_norm": 0.633120059967041, "learning_rate": 4.7560385242560134e-05, - "loss": 1.4594, + "loss": 1.4591, "step": 5745 }, { "epoch": 0.3906780812610409, - "grad_norm": 0.7645265460014343, + "grad_norm": 0.7686992287635803, "learning_rate": 4.75582619921185e-05, - "loss": 1.3919, + "loss": 1.392, "step": 5750 }, { "epoch": 0.39101780133170266, - "grad_norm": 0.6652759313583374, + "grad_norm": 0.6642864346504211, "learning_rate": 4.7556138741676856e-05, - "loss": 1.4261, + "loss": 1.426, "step": 5755 }, { "epoch": 0.39135752140236446, - "grad_norm": 0.6099786162376404, + "grad_norm": 0.610150158405304, "learning_rate": 4.7554015491235226e-05, - "loss": 1.3679, + "loss": 1.3684, "step": 5760 }, { "epoch": 0.3916972414730262, - "grad_norm": 0.6915930509567261, + "grad_norm": 0.6935614347457886, "learning_rate": 4.755189224079359e-05, - "loss": 1.3873, + "loss": 1.3875, "step": 5765 }, { "epoch": 0.392036961543688, - "grad_norm": 0.5593711137771606, + "grad_norm": 0.5581985712051392, "learning_rate": 4.754976899035195e-05, - "loss": 1.3472, + "loss": 1.347, "step": 5770 }, { "epoch": 0.3923766816143498, - "grad_norm": 0.5792405605316162, + "grad_norm": 0.5772132873535156, "learning_rate": 4.754764573991032e-05, - "loss": 1.4887, + "loss": 1.4886, "step": 5775 }, { "epoch": 0.39271640168501154, - "grad_norm": 0.5475907325744629, + "grad_norm": 0.5454913377761841, "learning_rate": 4.754552248946868e-05, - "loss": 1.2726, + "loss": 1.2722, "step": 5780 }, { "epoch": 0.39305612175567334, - "grad_norm": 0.593908429145813, + "grad_norm": 0.5950626730918884, "learning_rate": 4.754339923902704e-05, - "loss": 1.4216, + "loss": 1.4215, "step": 5785 }, { "epoch": 0.3933958418263351, - "grad_norm": 0.6711115837097168, + "grad_norm": 0.6738715767860413, "learning_rate": 4.754127598858541e-05, - "loss": 1.3659, + "loss": 1.366, "step": 5790 }, { "epoch": 0.3937355618969969, - "grad_norm": 0.610977292060852, + "grad_norm": 0.6112919449806213, "learning_rate": 4.7539152738143774e-05, - "loss": 1.4279, + "loss": 1.4282, "step": 5795 }, { "epoch": 0.3940752819676587, - "grad_norm": 0.602996289730072, + "grad_norm": 0.6035336256027222, "learning_rate": 4.753702948770213e-05, - "loss": 1.3001, + "loss": 1.3006, "step": 5800 }, { "epoch": 0.3944150020383204, - "grad_norm": 0.5804190039634705, + "grad_norm": 0.5825509428977966, "learning_rate": 4.75349062372605e-05, "loss": 1.4123, "step": 5805 }, { "epoch": 0.3947547221089822, - "grad_norm": 0.7110206484794617, + "grad_norm": 0.7108162641525269, "learning_rate": 4.7532782986818866e-05, - "loss": 1.4464, + "loss": 1.4463, "step": 5810 }, { "epoch": 0.39509444217964396, - "grad_norm": 0.5763588547706604, + "grad_norm": 0.5828922390937805, "learning_rate": 4.7530659736377224e-05, - "loss": 1.2695, + "loss": 1.2698, "step": 5815 }, { "epoch": 0.39543416225030575, - "grad_norm": 0.5757023096084595, + "grad_norm": 0.5771903395652771, "learning_rate": 4.7528536485935594e-05, - "loss": 1.4458, + "loss": 1.4456, "step": 5820 }, { "epoch": 0.3957738823209675, - "grad_norm": 0.6958516836166382, + "grad_norm": 0.6933791041374207, "learning_rate": 4.752641323549395e-05, - "loss": 1.4825, + "loss": 1.483, "step": 5825 }, { "epoch": 0.3961136023916293, - "grad_norm": 0.6314806938171387, + "grad_norm": 0.630571186542511, "learning_rate": 4.7524289985052316e-05, - "loss": 1.3996, + "loss": 1.4002, "step": 5830 }, { "epoch": 0.3964533224622911, - "grad_norm": 0.6348795890808105, + "grad_norm": 0.6317381858825684, "learning_rate": 4.7522166734610686e-05, - "loss": 1.4085, + "loss": 1.4088, "step": 5835 }, { "epoch": 0.39679304253295283, - "grad_norm": 0.6376756429672241, + "grad_norm": 0.640292227268219, "learning_rate": 4.7520043484169044e-05, - "loss": 1.3745, + "loss": 1.3747, "step": 5840 }, { "epoch": 0.39713276260361463, - "grad_norm": 0.6278203725814819, + "grad_norm": 0.6272284388542175, "learning_rate": 4.751792023372741e-05, - "loss": 1.5009, + "loss": 1.4999, "step": 5845 }, { "epoch": 0.3974724826742764, - "grad_norm": 0.5627636313438416, + "grad_norm": 0.5624070167541504, "learning_rate": 4.751579698328578e-05, - "loss": 1.3236, + "loss": 1.3228, "step": 5850 }, { "epoch": 0.39781220274493817, - "grad_norm": 0.6148563623428345, + "grad_norm": 0.6139281392097473, "learning_rate": 4.7513673732844136e-05, - "loss": 1.3983, + "loss": 1.3991, "step": 5855 }, { "epoch": 0.39815192281559997, - "grad_norm": 0.6606502532958984, + "grad_norm": 0.6615352630615234, "learning_rate": 4.75115504824025e-05, - "loss": 1.4323, + "loss": 1.432, "step": 5860 }, { "epoch": 0.3984916428862617, - "grad_norm": 0.6654682159423828, + "grad_norm": 0.6683053374290466, "learning_rate": 4.750942723196087e-05, - "loss": 1.4555, + "loss": 1.455, "step": 5865 }, { "epoch": 0.3988313629569235, - "grad_norm": 0.6830945611000061, + "grad_norm": 0.6728695631027222, "learning_rate": 4.750730398151923e-05, - "loss": 1.3652, + "loss": 1.3659, "step": 5870 }, { "epoch": 0.39917108302758525, - "grad_norm": 0.5865166783332825, + "grad_norm": 0.5872734189033508, "learning_rate": 4.750518073107759e-05, - "loss": 1.3804, + "loss": 1.38, "step": 5875 }, { "epoch": 0.39951080309824705, - "grad_norm": 0.6635303497314453, + "grad_norm": 0.6586723923683167, "learning_rate": 4.750305748063596e-05, - "loss": 1.3655, + "loss": 1.3648, "step": 5880 }, { "epoch": 0.39985052316890884, - "grad_norm": 0.5407352447509766, + "grad_norm": 0.5397301912307739, "learning_rate": 4.750093423019432e-05, - "loss": 1.4497, + "loss": 1.4495, "step": 5885 }, { "epoch": 0.4001902432395706, - "grad_norm": 0.5308975577354431, + "grad_norm": 0.5320855379104614, "learning_rate": 4.7498810979752684e-05, - "loss": 1.4051, + "loss": 1.4056, "step": 5890 }, { "epoch": 0.4005299633102324, - "grad_norm": 0.7656110525131226, + "grad_norm": 0.7650267481803894, "learning_rate": 4.7496687729311055e-05, - "loss": 1.2989, + "loss": 1.299, "step": 5895 }, { "epoch": 0.4008696833808941, - "grad_norm": 0.5741212964057922, + "grad_norm": 0.5761499404907227, "learning_rate": 4.749456447886941e-05, - "loss": 1.327, + "loss": 1.3271, "step": 5900 }, { "epoch": 0.4012094034515559, - "grad_norm": 0.6385553479194641, + "grad_norm": 0.6393672823905945, "learning_rate": 4.7492441228427776e-05, - "loss": 1.3209, + "loss": 1.3212, "step": 5905 }, { "epoch": 0.40154912352221767, - "grad_norm": 0.5651522874832153, + "grad_norm": 0.5650556087493896, "learning_rate": 4.749031797798614e-05, - "loss": 1.3567, + "loss": 1.357, "step": 5910 }, { "epoch": 0.40188884359287946, - "grad_norm": 0.675216019153595, + "grad_norm": 0.6776626706123352, "learning_rate": 4.7488194727544504e-05, - "loss": 1.4391, + "loss": 1.4395, "step": 5915 }, { "epoch": 0.40222856366354126, - "grad_norm": 0.6836060881614685, + "grad_norm": 0.6792265772819519, "learning_rate": 4.748607147710287e-05, "loss": 1.4123, "step": 5920 }, { "epoch": 0.402568283734203, - "grad_norm": 0.5851414799690247, + "grad_norm": 0.5842483043670654, "learning_rate": 4.748394822666123e-05, - "loss": 1.3718, + "loss": 1.3727, "step": 5925 }, { "epoch": 0.4029080038048648, - "grad_norm": 0.6235272884368896, + "grad_norm": 0.6231877207756042, "learning_rate": 4.7481824976219596e-05, "loss": 1.4089, "step": 5930 }, { "epoch": 0.40324772387552654, - "grad_norm": 0.6178550720214844, + "grad_norm": 0.6191523671150208, "learning_rate": 4.747970172577796e-05, - "loss": 1.4801, + "loss": 1.4798, "step": 5935 }, { "epoch": 0.40358744394618834, - "grad_norm": 0.6094355583190918, + "grad_norm": 0.6142128109931946, "learning_rate": 4.7477578475336324e-05, - "loss": 1.508, + "loss": 1.5086, "step": 5940 }, { "epoch": 0.40392716401685014, - "grad_norm": 0.6139050126075745, + "grad_norm": 0.616356372833252, "learning_rate": 4.747545522489469e-05, - "loss": 1.2982, + "loss": 1.2979, "step": 5945 }, { "epoch": 0.4042668840875119, - "grad_norm": 0.6272382736206055, + "grad_norm": 0.625674843788147, "learning_rate": 4.747333197445305e-05, - "loss": 1.4177, + "loss": 1.4182, "step": 5950 }, { "epoch": 0.4046066041581737, - "grad_norm": 0.6022531390190125, + "grad_norm": 0.6020771861076355, "learning_rate": 4.7471208724011416e-05, - "loss": 1.4028, + "loss": 1.4021, "step": 5955 }, { "epoch": 0.4049463242288354, - "grad_norm": 0.734524667263031, + "grad_norm": 0.7337747812271118, "learning_rate": 4.746908547356978e-05, - "loss": 1.4994, + "loss": 1.5001, "step": 5960 }, { "epoch": 0.4052860442994972, - "grad_norm": 0.5173041224479675, + "grad_norm": 0.519357442855835, "learning_rate": 4.746696222312815e-05, - "loss": 1.3659, + "loss": 1.3663, "step": 5965 }, { "epoch": 0.405625764370159, - "grad_norm": 0.5936630964279175, + "grad_norm": 0.5908923149108887, "learning_rate": 4.746483897268651e-05, - "loss": 1.443, + "loss": 1.4431, "step": 5970 }, { "epoch": 0.40596548444082076, - "grad_norm": 0.6323860287666321, + "grad_norm": 0.6336833238601685, "learning_rate": 4.746271572224487e-05, - "loss": 1.318, + "loss": 1.3173, "step": 5975 }, { "epoch": 0.40630520451148255, - "grad_norm": 0.6829825639724731, + "grad_norm": 0.6870870590209961, "learning_rate": 4.7460592471803236e-05, - "loss": 1.4725, + "loss": 1.4724, "step": 5980 }, { "epoch": 0.4066449245821443, - "grad_norm": 0.5354074835777283, + "grad_norm": 0.5343552231788635, "learning_rate": 4.74584692213616e-05, - "loss": 1.3214, + "loss": 1.3211, "step": 5985 }, { "epoch": 0.4069846446528061, - "grad_norm": 0.6558411717414856, + "grad_norm": 0.6540526151657104, "learning_rate": 4.7456345970919964e-05, - "loss": 1.3154, + "loss": 1.3152, "step": 5990 }, { "epoch": 0.40732436472346784, - "grad_norm": 0.5801601409912109, + "grad_norm": 0.5807486176490784, "learning_rate": 4.745422272047833e-05, - "loss": 1.3867, + "loss": 1.3866, "step": 5995 }, { "epoch": 0.40766408479412963, - "grad_norm": 0.5719489455223083, + "grad_norm": 0.5710402131080627, "learning_rate": 4.745209947003669e-05, - "loss": 1.2751, + "loss": 1.2745, "step": 6000 }, { "epoch": 0.40800380486479143, - "grad_norm": 0.5202120542526245, + "grad_norm": 0.5205429196357727, "learning_rate": 4.7449976219595056e-05, - "loss": 1.3704, + "loss": 1.37, "step": 6005 }, { "epoch": 0.4083435249354532, - "grad_norm": 0.5829079151153564, + "grad_norm": 0.5858888030052185, "learning_rate": 4.744785296915342e-05, - "loss": 1.3562, + "loss": 1.3558, "step": 6010 }, { "epoch": 0.40868324500611497, - "grad_norm": 0.6678234934806824, + "grad_norm": 0.6672476530075073, "learning_rate": 4.7445729718711784e-05, - "loss": 1.3721, + "loss": 1.3727, "step": 6015 }, { "epoch": 0.4090229650767767, - "grad_norm": 0.6354725956916809, + "grad_norm": 0.6258479356765747, "learning_rate": 4.744360646827015e-05, - "loss": 1.4208, + "loss": 1.421, "step": 6020 }, { "epoch": 0.4093626851474385, - "grad_norm": 0.6208111047744751, + "grad_norm": 0.6207389831542969, "learning_rate": 4.744148321782851e-05, - "loss": 1.3284, + "loss": 1.3285, "step": 6025 }, { "epoch": 0.4097024052181003, - "grad_norm": 0.6854109764099121, + "grad_norm": 0.688257098197937, "learning_rate": 4.7439359967386876e-05, - "loss": 1.386, + "loss": 1.3855, "step": 6030 }, { "epoch": 0.41004212528876205, - "grad_norm": 0.6217085123062134, + "grad_norm": 0.622156023979187, "learning_rate": 4.743723671694524e-05, - "loss": 1.2589, + "loss": 1.2581, "step": 6035 }, { "epoch": 0.41038184535942385, - "grad_norm": 0.6386720538139343, + "grad_norm": 0.6398622393608093, "learning_rate": 4.7435113466503604e-05, "loss": 1.3122, "step": 6040 }, { "epoch": 0.4107215654300856, - "grad_norm": 0.630185067653656, + "grad_norm": 0.6293922066688538, "learning_rate": 4.743299021606197e-05, "loss": 1.3928, "step": 6045 }, { "epoch": 0.4110612855007474, - "grad_norm": 0.756481945514679, + "grad_norm": 0.7502629160881042, "learning_rate": 4.743086696562033e-05, - "loss": 1.3392, + "loss": 1.3398, "step": 6050 }, { "epoch": 0.4114010055714092, - "grad_norm": 0.6218680739402771, + "grad_norm": 0.6202749609947205, "learning_rate": 4.7428743715178696e-05, - "loss": 1.365, + "loss": 1.3646, "step": 6055 }, { "epoch": 0.4117407256420709, - "grad_norm": 0.6535066366195679, + "grad_norm": 0.653184175491333, "learning_rate": 4.742662046473706e-05, - "loss": 1.3425, + "loss": 1.3424, "step": 6060 }, { "epoch": 0.4120804457127327, - "grad_norm": 0.6885753870010376, + "grad_norm": 0.68714839220047, "learning_rate": 4.7424497214295424e-05, - "loss": 1.4964, + "loss": 1.497, "step": 6065 }, { "epoch": 0.41242016578339447, - "grad_norm": 0.6080855131149292, + "grad_norm": 0.6084105372428894, "learning_rate": 4.742237396385379e-05, - "loss": 1.411, + "loss": 1.4113, "step": 6070 }, { "epoch": 0.41275988585405626, - "grad_norm": 0.6426699757575989, + "grad_norm": 0.6415480971336365, "learning_rate": 4.742025071341215e-05, - "loss": 1.2876, + "loss": 1.2868, "step": 6075 }, { "epoch": 0.413099605924718, - "grad_norm": 0.6667383909225464, + "grad_norm": 0.6692948937416077, "learning_rate": 4.7418127462970516e-05, - "loss": 1.3941, + "loss": 1.3943, "step": 6080 }, { "epoch": 0.4134393259953798, - "grad_norm": 0.7255922555923462, + "grad_norm": 0.7205021381378174, "learning_rate": 4.741600421252888e-05, - "loss": 1.4121, + "loss": 1.4116, "step": 6085 }, { "epoch": 0.4137790460660416, - "grad_norm": 0.6203028559684753, + "grad_norm": 0.6211341619491577, "learning_rate": 4.7413880962087244e-05, - "loss": 1.4324, + "loss": 1.4323, "step": 6090 }, { "epoch": 0.41411876613670334, - "grad_norm": 0.784636378288269, + "grad_norm": 0.7924069166183472, "learning_rate": 4.741175771164561e-05, - "loss": 1.3818, + "loss": 1.3816, "step": 6095 }, { "epoch": 0.41445848620736514, - "grad_norm": 0.5628244280815125, + "grad_norm": 0.5620571374893188, "learning_rate": 4.740963446120397e-05, - "loss": 1.3401, + "loss": 1.3396, "step": 6100 }, { "epoch": 0.4147982062780269, - "grad_norm": 0.6107689738273621, + "grad_norm": 0.6123418807983398, "learning_rate": 4.7407511210762336e-05, - "loss": 1.4518, + "loss": 1.4517, "step": 6105 }, { "epoch": 0.4151379263486887, - "grad_norm": 0.6618290543556213, + "grad_norm": 0.6645820140838623, "learning_rate": 4.740538796032069e-05, - "loss": 1.4626, + "loss": 1.4627, "step": 6110 }, { "epoch": 0.4154776464193505, - "grad_norm": 0.7461044192314148, + "grad_norm": 0.7415497303009033, "learning_rate": 4.7403264709879064e-05, - "loss": 1.4806, + "loss": 1.4801, "step": 6115 }, { "epoch": 0.4158173664900122, - "grad_norm": 0.6222243309020996, + "grad_norm": 0.620860755443573, "learning_rate": 4.740114145943743e-05, - "loss": 1.4335, + "loss": 1.434, "step": 6120 }, { "epoch": 0.416157086560674, - "grad_norm": 0.60787034034729, + "grad_norm": 0.6059554815292358, "learning_rate": 4.7399018208995785e-05, - "loss": 1.4104, + "loss": 1.4109, "step": 6125 }, { "epoch": 0.41649680663133576, - "grad_norm": 0.6704518795013428, + "grad_norm": 0.6707401871681213, "learning_rate": 4.7396894958554156e-05, - "loss": 1.4775, + "loss": 1.478, "step": 6130 }, { "epoch": 0.41683652670199756, - "grad_norm": 0.589547872543335, + "grad_norm": 0.5887949466705322, "learning_rate": 4.739477170811252e-05, - "loss": 1.385, + "loss": 1.3851, "step": 6135 }, { "epoch": 0.41717624677265935, - "grad_norm": 0.6546774506568909, + "grad_norm": 0.6567453742027283, "learning_rate": 4.739264845767088e-05, - "loss": 1.3662, + "loss": 1.3661, "step": 6140 }, { "epoch": 0.4175159668433211, - "grad_norm": 0.609470546245575, + "grad_norm": 0.6083785891532898, "learning_rate": 4.739052520722925e-05, - "loss": 1.387, + "loss": 1.3864, "step": 6145 }, { "epoch": 0.4178556869139829, - "grad_norm": 0.6994268894195557, + "grad_norm": 0.6930893659591675, "learning_rate": 4.738840195678761e-05, - "loss": 1.4109, + "loss": 1.4107, "step": 6150 }, { "epoch": 0.41819540698464464, - "grad_norm": 0.6468284726142883, + "grad_norm": 0.6451560258865356, "learning_rate": 4.738627870634597e-05, - "loss": 1.314, + "loss": 1.3133, "step": 6155 }, { "epoch": 0.41853512705530643, - "grad_norm": 0.5958006978034973, + "grad_norm": 0.5979086756706238, "learning_rate": 4.738415545590434e-05, - "loss": 1.3969, + "loss": 1.3967, "step": 6160 }, { "epoch": 0.4188748471259682, - "grad_norm": 0.6362236142158508, + "grad_norm": 0.634627103805542, "learning_rate": 4.7382032205462704e-05, - "loss": 1.4263, + "loss": 1.4261, "step": 6165 }, { "epoch": 0.41921456719663, - "grad_norm": 0.7022944092750549, + "grad_norm": 0.7017971873283386, "learning_rate": 4.737990895502106e-05, - "loss": 1.3434, + "loss": 1.3419, "step": 6170 }, { "epoch": 0.41955428726729177, - "grad_norm": 0.6964726448059082, + "grad_norm": 0.6954447627067566, "learning_rate": 4.737778570457943e-05, - "loss": 1.3334, + "loss": 1.3335, "step": 6175 }, { "epoch": 0.4198940073379535, - "grad_norm": 0.553797721862793, + "grad_norm": 0.5540419220924377, "learning_rate": 4.737566245413779e-05, - "loss": 1.3971, + "loss": 1.3965, "step": 6180 }, { "epoch": 0.4202337274086153, - "grad_norm": 0.6053094267845154, + "grad_norm": 0.6059179306030273, "learning_rate": 4.737353920369615e-05, - "loss": 1.3486, + "loss": 1.3487, "step": 6185 }, { "epoch": 0.42057344747927705, - "grad_norm": 0.6094326972961426, + "grad_norm": 0.606228768825531, "learning_rate": 4.7371415953254524e-05, - "loss": 1.4286, + "loss": 1.4287, "step": 6190 }, { "epoch": 0.42091316754993885, - "grad_norm": 0.557897686958313, + "grad_norm": 0.5567349791526794, "learning_rate": 4.736929270281288e-05, - "loss": 1.4682, + "loss": 1.4678, "step": 6195 }, { "epoch": 0.42125288762060065, - "grad_norm": 0.6170535683631897, + "grad_norm": 0.6164939403533936, "learning_rate": 4.7367169452371245e-05, "loss": 1.3753, "step": 6200 }, { "epoch": 0.4215926076912624, - "grad_norm": 0.600373387336731, + "grad_norm": 0.6007863879203796, "learning_rate": 4.7365046201929616e-05, - "loss": 1.3378, + "loss": 1.3375, "step": 6205 }, { "epoch": 0.4219323277619242, - "grad_norm": 0.594676673412323, + "grad_norm": 0.5968669056892395, "learning_rate": 4.736292295148797e-05, - "loss": 1.3615, + "loss": 1.3613, "step": 6210 }, { "epoch": 0.42227204783258593, - "grad_norm": 0.6378969550132751, + "grad_norm": 0.6416547894477844, "learning_rate": 4.736079970104634e-05, - "loss": 1.3388, + "loss": 1.3392, "step": 6215 }, { "epoch": 0.4226117679032477, - "grad_norm": 0.5239458680152893, + "grad_norm": 0.5225838422775269, "learning_rate": 4.735867645060471e-05, - "loss": 1.3513, + "loss": 1.3521, "step": 6220 }, { "epoch": 0.4229514879739095, - "grad_norm": 0.6183655261993408, + "grad_norm": 0.6146356463432312, "learning_rate": 4.7356553200163065e-05, - "loss": 1.4161, + "loss": 1.4157, "step": 6225 }, { "epoch": 0.42329120804457127, - "grad_norm": 0.6560379862785339, + "grad_norm": 0.6584218740463257, "learning_rate": 4.735442994972143e-05, - "loss": 1.505, + "loss": 1.5055, "step": 6230 }, { "epoch": 0.42363092811523306, - "grad_norm": 0.7305738925933838, + "grad_norm": 0.7320394515991211, "learning_rate": 4.73523066992798e-05, - "loss": 1.4154, + "loss": 1.416, "step": 6235 }, { "epoch": 0.4239706481858948, - "grad_norm": 0.6894407868385315, + "grad_norm": 0.6887832880020142, "learning_rate": 4.735018344883816e-05, - "loss": 1.444, + "loss": 1.4441, "step": 6240 }, { "epoch": 0.4243103682565566, - "grad_norm": 0.6726522445678711, + "grad_norm": 0.6725772023200989, "learning_rate": 4.734806019839652e-05, - "loss": 1.3739, + "loss": 1.3745, "step": 6245 }, { "epoch": 0.42465008832721834, - "grad_norm": 0.6662222743034363, + "grad_norm": 0.6659643650054932, "learning_rate": 4.734593694795489e-05, - "loss": 1.3381, + "loss": 1.3372, "step": 6250 }, { "epoch": 0.42498980839788014, - "grad_norm": 0.6684558391571045, + "grad_norm": 0.6676824688911438, "learning_rate": 4.734381369751325e-05, - "loss": 1.3474, + "loss": 1.3479, "step": 6255 }, { "epoch": 0.42532952846854194, - "grad_norm": 0.6196736693382263, + "grad_norm": 0.6172699928283691, "learning_rate": 4.734169044707161e-05, - "loss": 1.3213, + "loss": 1.3218, "step": 6260 }, { "epoch": 0.4256692485392037, - "grad_norm": 0.6730858683586121, + "grad_norm": 0.6719139218330383, "learning_rate": 4.733956719662998e-05, - "loss": 1.3643, + "loss": 1.3641, "step": 6265 }, { "epoch": 0.4260089686098655, - "grad_norm": 0.8188268542289734, + "grad_norm": 0.8430404663085938, "learning_rate": 4.733744394618834e-05, - "loss": 1.4328, + "loss": 1.4327, "step": 6270 }, { "epoch": 0.4263486886805272, - "grad_norm": 0.6468964219093323, + "grad_norm": 0.6477616429328918, "learning_rate": 4.7335320695746705e-05, - "loss": 1.3889, + "loss": 1.3887, "step": 6275 }, { "epoch": 0.426688408751189, - "grad_norm": 0.6821523308753967, + "grad_norm": 0.6852474212646484, "learning_rate": 4.733319744530507e-05, - "loss": 1.4177, + "loss": 1.4181, "step": 6280 }, { "epoch": 0.4270281288218508, - "grad_norm": 0.6204740405082703, + "grad_norm": 0.6132010817527771, "learning_rate": 4.733107419486343e-05, - "loss": 1.4181, + "loss": 1.418, "step": 6285 }, { "epoch": 0.42736784889251256, - "grad_norm": 0.688260018825531, + "grad_norm": 0.696260929107666, "learning_rate": 4.73289509444218e-05, - "loss": 1.3448, + "loss": 1.3447, "step": 6290 }, { "epoch": 0.42770756896317436, - "grad_norm": 0.5745126008987427, + "grad_norm": 0.5720667839050293, "learning_rate": 4.732682769398016e-05, - "loss": 1.4101, + "loss": 1.4105, "step": 6295 }, { "epoch": 0.4280472890338361, - "grad_norm": 0.7786775827407837, + "grad_norm": 0.782663881778717, "learning_rate": 4.7324704443538525e-05, - "loss": 1.514, + "loss": 1.5142, "step": 6300 }, { "epoch": 0.4283870091044979, - "grad_norm": 0.6279415488243103, + "grad_norm": 0.6301887631416321, "learning_rate": 4.7322581193096896e-05, - "loss": 1.3362, + "loss": 1.3361, "step": 6305 }, { "epoch": 0.4287267291751597, - "grad_norm": 0.5530821084976196, + "grad_norm": 0.55410236120224, "learning_rate": 4.732045794265525e-05, - "loss": 1.3756, + "loss": 1.3752, "step": 6310 }, { "epoch": 0.42906644924582144, - "grad_norm": 0.7006688714027405, + "grad_norm": 0.6983581781387329, "learning_rate": 4.731833469221362e-05, - "loss": 1.4522, + "loss": 1.4517, "step": 6315 }, { "epoch": 0.42940616931648323, - "grad_norm": 0.6122555136680603, + "grad_norm": 0.6091383695602417, "learning_rate": 4.731621144177199e-05, - "loss": 1.3682, + "loss": 1.3683, "step": 6320 }, { "epoch": 0.429745889387145, - "grad_norm": 0.618565559387207, + "grad_norm": 0.6261497735977173, "learning_rate": 4.7314088191330345e-05, - "loss": 1.3329, + "loss": 1.3326, "step": 6325 }, { "epoch": 0.4300856094578068, - "grad_norm": 0.6874955296516418, + "grad_norm": 0.6840320229530334, "learning_rate": 4.731196494088871e-05, - "loss": 1.3354, + "loss": 1.3363, "step": 6330 }, { "epoch": 0.4304253295284685, - "grad_norm": 0.6150823831558228, + "grad_norm": 0.6141173839569092, "learning_rate": 4.730984169044707e-05, "loss": 1.4378, "step": 6335 }, { "epoch": 0.4307650495991303, - "grad_norm": 0.6561357975006104, + "grad_norm": 0.658711850643158, "learning_rate": 4.730771844000544e-05, - "loss": 1.3912, + "loss": 1.3917, "step": 6340 }, { "epoch": 0.4311047696697921, - "grad_norm": 0.579881489276886, + "grad_norm": 0.5769929885864258, "learning_rate": 4.73055951895638e-05, - "loss": 1.4251, + "loss": 1.4253, "step": 6345 }, { "epoch": 0.43144448974045385, - "grad_norm": 0.6598808765411377, + "grad_norm": 0.6589663028717041, "learning_rate": 4.7303471939122165e-05, - "loss": 1.4157, + "loss": 1.4162, "step": 6350 }, { "epoch": 0.43178420981111565, - "grad_norm": 0.6837253570556641, + "grad_norm": 0.6820507645606995, "learning_rate": 4.730134868868053e-05, - "loss": 1.401, + "loss": 1.4015, "step": 6355 }, { "epoch": 0.4321239298817774, - "grad_norm": 0.6009359359741211, + "grad_norm": 0.6008535027503967, "learning_rate": 4.729922543823889e-05, - "loss": 1.3655, + "loss": 1.3654, "step": 6360 }, { "epoch": 0.4324636499524392, - "grad_norm": 0.6403203010559082, + "grad_norm": 0.6409998536109924, "learning_rate": 4.729710218779726e-05, - "loss": 1.4184, + "loss": 1.4191, "step": 6365 }, { "epoch": 0.432803370023101, - "grad_norm": 0.6336822509765625, + "grad_norm": 0.6326683759689331, "learning_rate": 4.729497893735562e-05, - "loss": 1.3008, + "loss": 1.3005, "step": 6370 }, { "epoch": 0.43314309009376273, - "grad_norm": 0.6576248407363892, + "grad_norm": 0.6593410968780518, "learning_rate": 4.7292855686913985e-05, "loss": 1.4843, "step": 6375 }, { "epoch": 0.4334828101644245, - "grad_norm": 0.5695661306381226, + "grad_norm": 0.5687089562416077, "learning_rate": 4.729073243647235e-05, - "loss": 1.4633, + "loss": 1.4639, "step": 6380 }, { "epoch": 0.43382253023508627, - "grad_norm": 0.678441047668457, + "grad_norm": 0.6777456402778625, "learning_rate": 4.728860918603071e-05, - "loss": 1.4119, + "loss": 1.4116, "step": 6385 }, { "epoch": 0.43416225030574807, - "grad_norm": 0.6367850303649902, + "grad_norm": 0.6386072635650635, "learning_rate": 4.728648593558908e-05, - "loss": 1.3968, + "loss": 1.3972, "step": 6390 }, { "epoch": 0.43450197037640986, - "grad_norm": 0.6021522283554077, + "grad_norm": 0.6020061373710632, "learning_rate": 4.728436268514744e-05, - "loss": 1.4134, + "loss": 1.4131, "step": 6395 }, { "epoch": 0.4348416904470716, - "grad_norm": 0.592147707939148, + "grad_norm": 0.5907840728759766, "learning_rate": 4.7282239434705805e-05, - "loss": 1.3061, + "loss": 1.3059, "step": 6400 }, { "epoch": 0.4351814105177334, - "grad_norm": 0.6148473620414734, + "grad_norm": 0.6151184439659119, "learning_rate": 4.728011618426417e-05, - "loss": 1.4722, + "loss": 1.4725, "step": 6405 }, { "epoch": 0.43552113058839514, - "grad_norm": 0.5737019777297974, + "grad_norm": 0.5740171074867249, "learning_rate": 4.727799293382253e-05, - "loss": 1.3992, + "loss": 1.3993, "step": 6410 }, { "epoch": 0.43586085065905694, - "grad_norm": 0.6439921259880066, + "grad_norm": 0.64751136302948, "learning_rate": 4.72758696833809e-05, - "loss": 1.3698, + "loss": 1.3701, "step": 6415 }, { "epoch": 0.4362005707297187, - "grad_norm": 0.553864598274231, + "grad_norm": 0.5530461072921753, "learning_rate": 4.727374643293926e-05, - "loss": 1.291, + "loss": 1.2912, "step": 6420 }, { "epoch": 0.4365402908003805, - "grad_norm": 0.5933894515037537, + "grad_norm": 0.5970993041992188, "learning_rate": 4.7271623182497625e-05, - "loss": 1.3912, + "loss": 1.391, "step": 6425 }, { "epoch": 0.4368800108710423, - "grad_norm": 0.6752405762672424, + "grad_norm": 0.6758865714073181, "learning_rate": 4.726949993205599e-05, - "loss": 1.4306, + "loss": 1.4319, "step": 6430 }, { "epoch": 0.437219730941704, - "grad_norm": 0.6308922171592712, + "grad_norm": 0.6340398788452148, "learning_rate": 4.7267376681614353e-05, - "loss": 1.3051, + "loss": 1.3048, "step": 6435 }, { "epoch": 0.4375594510123658, - "grad_norm": 0.6437475681304932, + "grad_norm": 0.644960880279541, "learning_rate": 4.726525343117272e-05, - "loss": 1.3897, + "loss": 1.3898, "step": 6440 }, { "epoch": 0.43789917108302756, - "grad_norm": 0.7018872499465942, + "grad_norm": 0.700422465801239, "learning_rate": 4.726313018073108e-05, "loss": 1.3172, "step": 6445 }, { "epoch": 0.43823889115368936, - "grad_norm": 0.662291944026947, + "grad_norm": 0.6629453301429749, "learning_rate": 4.7261006930289445e-05, - "loss": 1.4606, + "loss": 1.461, "step": 6450 }, { "epoch": 0.43857861122435116, - "grad_norm": 0.5564162731170654, + "grad_norm": 0.5581923127174377, "learning_rate": 4.725888367984781e-05, "loss": 1.4022, "step": 6455 }, { "epoch": 0.4389183312950129, - "grad_norm": 0.677634060382843, + "grad_norm": 0.6781660318374634, "learning_rate": 4.7256760429406173e-05, - "loss": 1.3791, + "loss": 1.3787, "step": 6460 }, { "epoch": 0.4392580513656747, - "grad_norm": 0.5525080561637878, + "grad_norm": 0.552969217300415, "learning_rate": 4.725463717896453e-05, - "loss": 1.4171, + "loss": 1.417, "step": 6465 }, { "epoch": 0.43959777143633644, - "grad_norm": 0.5328442454338074, + "grad_norm": 0.5314246416091919, "learning_rate": 4.72525139285229e-05, - "loss": 1.5618, + "loss": 1.5625, "step": 6470 }, { "epoch": 0.43993749150699824, - "grad_norm": 0.6505507826805115, + "grad_norm": 0.6540617346763611, "learning_rate": 4.7250390678081265e-05, - "loss": 1.3914, + "loss": 1.3905, "step": 6475 }, { "epoch": 0.44027721157766003, - "grad_norm": 0.6275343894958496, + "grad_norm": 0.6251617670059204, "learning_rate": 4.724826742763962e-05, - "loss": 1.4201, + "loss": 1.42, "step": 6480 }, { "epoch": 0.4406169316483218, - "grad_norm": 0.670893669128418, + "grad_norm": 0.6699917316436768, "learning_rate": 4.7246144177197993e-05, - "loss": 1.426, + "loss": 1.4255, "step": 6485 }, { "epoch": 0.44095665171898357, - "grad_norm": 0.6744541525840759, + "grad_norm": 0.674936056137085, "learning_rate": 4.724402092675636e-05, - "loss": 1.4765, + "loss": 1.477, "step": 6490 }, { "epoch": 0.4412963717896453, - "grad_norm": 0.620466411113739, + "grad_norm": 0.6209253072738647, "learning_rate": 4.7241897676314715e-05, - "loss": 1.3884, + "loss": 1.3887, "step": 6495 }, { "epoch": 0.4416360918603071, - "grad_norm": 0.6874172687530518, + "grad_norm": 0.6823062300682068, "learning_rate": 4.7239774425873085e-05, - "loss": 1.3943, + "loss": 1.3942, "step": 6500 }, { "epoch": 0.4419758119309689, - "grad_norm": 0.6267426013946533, + "grad_norm": 0.6257312297821045, "learning_rate": 4.723765117543145e-05, - "loss": 1.3539, + "loss": 1.3548, "step": 6505 }, { "epoch": 0.44231553200163065, - "grad_norm": 0.5028911828994751, + "grad_norm": 0.505099356174469, "learning_rate": 4.723552792498981e-05, - "loss": 1.4493, + "loss": 1.4496, "step": 6510 }, { "epoch": 0.44265525207229245, - "grad_norm": 0.5759350061416626, + "grad_norm": 0.5741568803787231, "learning_rate": 4.723340467454818e-05, - "loss": 1.3223, + "loss": 1.3224, "step": 6515 }, { "epoch": 0.4429949721429542, - "grad_norm": 0.5979635119438171, + "grad_norm": 0.5960662961006165, "learning_rate": 4.723128142410654e-05, - "loss": 1.3262, + "loss": 1.3268, "step": 6520 }, { "epoch": 0.443334692213616, - "grad_norm": 0.6062570810317993, + "grad_norm": 0.6063078045845032, "learning_rate": 4.72291581736649e-05, - "loss": 1.3175, + "loss": 1.3177, "step": 6525 }, { "epoch": 0.44367441228427773, - "grad_norm": 0.6158884167671204, + "grad_norm": 0.6161885857582092, "learning_rate": 4.722703492322327e-05, - "loss": 1.311, + "loss": 1.3116, "step": 6530 }, { "epoch": 0.44401413235493953, - "grad_norm": 0.7551906108856201, + "grad_norm": 0.7544776201248169, "learning_rate": 4.722491167278163e-05, - "loss": 1.3355, + "loss": 1.3347, "step": 6535 }, { "epoch": 0.4443538524256013, - "grad_norm": 0.6330267190933228, + "grad_norm": 0.6324429512023926, "learning_rate": 4.722278842233999e-05, - "loss": 1.3874, + "loss": 1.3883, "step": 6540 }, { "epoch": 0.44469357249626307, - "grad_norm": 0.5953761339187622, + "grad_norm": 0.5945907235145569, "learning_rate": 4.722066517189836e-05, - "loss": 1.4243, + "loss": 1.4241, "step": 6545 }, { "epoch": 0.44503329256692487, - "grad_norm": 0.6187408566474915, + "grad_norm": 0.6212255358695984, "learning_rate": 4.721854192145672e-05, - "loss": 1.3054, + "loss": 1.3053, "step": 6550 }, { "epoch": 0.4453730126375866, - "grad_norm": 0.607340395450592, + "grad_norm": 0.6077522039413452, "learning_rate": 4.721641867101508e-05, - "loss": 1.3926, + "loss": 1.3922, "step": 6555 }, { "epoch": 0.4457127327082484, - "grad_norm": 0.5794751644134521, + "grad_norm": 0.5807745456695557, "learning_rate": 4.7214295420573453e-05, - "loss": 1.3777, + "loss": 1.3781, "step": 6560 }, { "epoch": 0.4460524527789102, - "grad_norm": 0.6140636205673218, + "grad_norm": 0.6122899651527405, "learning_rate": 4.721217217013181e-05, - "loss": 1.3956, + "loss": 1.3951, "step": 6565 }, { "epoch": 0.44639217284957194, - "grad_norm": 0.5925490856170654, + "grad_norm": 0.5924504399299622, "learning_rate": 4.7210048919690175e-05, - "loss": 1.3495, + "loss": 1.3496, "step": 6570 }, { "epoch": 0.44673189292023374, - "grad_norm": 0.6611554026603699, + "grad_norm": 0.6637756824493408, "learning_rate": 4.7207925669248545e-05, - "loss": 1.43, + "loss": 1.4299, "step": 6575 }, { "epoch": 0.4470716129908955, - "grad_norm": 0.588813841342926, + "grad_norm": 0.5896127820014954, "learning_rate": 4.72058024188069e-05, - "loss": 1.4257, + "loss": 1.4254, "step": 6580 }, { "epoch": 0.4474113330615573, - "grad_norm": 0.6727645397186279, + "grad_norm": 0.6748072504997253, "learning_rate": 4.720367916836527e-05, - "loss": 1.3628, + "loss": 1.3637, "step": 6585 }, { "epoch": 0.4477510531322191, - "grad_norm": 0.6347823739051819, + "grad_norm": 0.6335247159004211, "learning_rate": 4.720155591792364e-05, - "loss": 1.387, + "loss": 1.3862, "step": 6590 }, { "epoch": 0.4480907732028808, - "grad_norm": 0.624548077583313, + "grad_norm": 0.621158242225647, "learning_rate": 4.7199432667481995e-05, - "loss": 1.4544, + "loss": 1.4547, "step": 6595 }, { "epoch": 0.4484304932735426, - "grad_norm": 0.6414671540260315, + "grad_norm": 0.6408329606056213, "learning_rate": 4.719730941704036e-05, - "loss": 1.3541, + "loss": 1.3542, "step": 6600 }, { "epoch": 0.44877021334420436, - "grad_norm": 0.6082646250724792, + "grad_norm": 0.6085910201072693, "learning_rate": 4.719518616659872e-05, - "loss": 1.394, + "loss": 1.3937, "step": 6605 }, { "epoch": 0.44910993341486616, - "grad_norm": 0.6644629836082458, + "grad_norm": 0.6685971617698669, "learning_rate": 4.719306291615709e-05, - "loss": 1.5086, + "loss": 1.5087, "step": 6610 }, { "epoch": 0.4494496534855279, - "grad_norm": 0.6724450588226318, + "grad_norm": 0.6658419370651245, "learning_rate": 4.719093966571545e-05, "loss": 1.3885, "step": 6615 }, { "epoch": 0.4497893735561897, - "grad_norm": 0.5682170987129211, + "grad_norm": 0.5672845840454102, "learning_rate": 4.7188816415273815e-05, - "loss": 1.5027, + "loss": 1.5025, "step": 6620 }, { "epoch": 0.4501290936268515, - "grad_norm": 0.6639217138290405, + "grad_norm": 0.6638292670249939, "learning_rate": 4.718669316483218e-05, - "loss": 1.3845, + "loss": 1.385, "step": 6625 }, { "epoch": 0.45046881369751324, - "grad_norm": 0.6022807955741882, + "grad_norm": 0.6031318306922913, "learning_rate": 4.718456991439054e-05, - "loss": 1.331, + "loss": 1.3313, "step": 6630 }, { "epoch": 0.45080853376817503, - "grad_norm": 0.6910311579704285, + "grad_norm": 0.6934178471565247, "learning_rate": 4.718244666394891e-05, - "loss": 1.4313, + "loss": 1.4312, "step": 6635 }, { "epoch": 0.4511482538388368, - "grad_norm": 0.5989499688148499, + "grad_norm": 0.6001390218734741, "learning_rate": 4.718032341350727e-05, - "loss": 1.4147, + "loss": 1.4154, "step": 6640 }, { "epoch": 0.4514879739094986, - "grad_norm": 0.6600388288497925, + "grad_norm": 0.6572665572166443, "learning_rate": 4.717820016306564e-05, - "loss": 1.376, + "loss": 1.3757, "step": 6645 }, { "epoch": 0.45182769398016037, - "grad_norm": 0.5953102111816406, + "grad_norm": 0.5944000482559204, "learning_rate": 4.7176076912624e-05, - "loss": 1.2933, + "loss": 1.293, "step": 6650 }, { "epoch": 0.4521674140508221, - "grad_norm": 0.6237321496009827, + "grad_norm": 0.6239500045776367, "learning_rate": 4.717395366218236e-05, - "loss": 1.3621, + "loss": 1.3619, "step": 6655 }, { "epoch": 0.4525071341214839, - "grad_norm": 0.6195266246795654, + "grad_norm": 0.6148386597633362, "learning_rate": 4.7171830411740734e-05, "loss": 1.4022, "step": 6660 }, { "epoch": 0.45284685419214565, - "grad_norm": 0.6216328740119934, + "grad_norm": 0.6227109432220459, "learning_rate": 4.716970716129909e-05, - "loss": 1.2973, + "loss": 1.2977, "step": 6665 }, { "epoch": 0.45318657426280745, - "grad_norm": 0.5761063694953918, + "grad_norm": 0.5775809288024902, "learning_rate": 4.7167583910857455e-05, - "loss": 1.3438, + "loss": 1.3436, "step": 6670 }, { "epoch": 0.45352629433346925, - "grad_norm": 0.5767399668693542, + "grad_norm": 0.5750589370727539, "learning_rate": 4.7165460660415826e-05, - "loss": 1.4085, + "loss": 1.4083, "step": 6675 }, { "epoch": 0.453866014404131, - "grad_norm": 0.7233198285102844, + "grad_norm": 0.724828839302063, "learning_rate": 4.716333740997418e-05, - "loss": 1.4204, + "loss": 1.42, "step": 6680 }, { "epoch": 0.4542057344747928, - "grad_norm": 0.5569356679916382, + "grad_norm": 0.5527727603912354, "learning_rate": 4.716121415953255e-05, - "loss": 1.3326, + "loss": 1.3323, "step": 6685 }, { "epoch": 0.45454545454545453, - "grad_norm": 0.6534759998321533, + "grad_norm": 0.6516443490982056, "learning_rate": 4.715909090909091e-05, - "loss": 1.2859, + "loss": 1.2856, "step": 6690 }, { "epoch": 0.45488517461611633, - "grad_norm": 0.7906098365783691, + "grad_norm": 0.7347457408905029, "learning_rate": 4.7156967658649275e-05, - "loss": 1.4885, + "loss": 1.4888, "step": 6695 }, { "epoch": 0.45522489468677807, - "grad_norm": 0.6368966102600098, + "grad_norm": 0.6313270926475525, "learning_rate": 4.715484440820764e-05, - "loss": 1.2917, + "loss": 1.2916, "step": 6700 }, { "epoch": 0.45556461475743987, - "grad_norm": 0.6911798119544983, + "grad_norm": 0.6921724081039429, "learning_rate": 4.7152721157766e-05, "loss": 1.3531, "step": 6705 }, { "epoch": 0.45590433482810166, - "grad_norm": 0.6642591953277588, + "grad_norm": 0.6605074405670166, "learning_rate": 4.715059790732437e-05, - "loss": 1.4106, + "loss": 1.411, "step": 6710 }, { "epoch": 0.4562440548987634, - "grad_norm": 0.6652267575263977, + "grad_norm": 0.6658073663711548, "learning_rate": 4.714847465688273e-05, - "loss": 1.3623, + "loss": 1.362, "step": 6715 }, { "epoch": 0.4565837749694252, - "grad_norm": 0.7334158420562744, + "grad_norm": 0.7271640300750732, "learning_rate": 4.7146351406441095e-05, - "loss": 1.4278, + "loss": 1.4268, "step": 6720 }, { "epoch": 0.45692349504008695, - "grad_norm": 0.6598769426345825, + "grad_norm": 0.6611621975898743, "learning_rate": 4.714422815599946e-05, - "loss": 1.3, + "loss": 1.299, "step": 6725 }, { "epoch": 0.45726321511074874, - "grad_norm": 0.6216796636581421, + "grad_norm": 0.622531533241272, "learning_rate": 4.714210490555782e-05, - "loss": 1.4718, + "loss": 1.4717, "step": 6730 }, { "epoch": 0.45760293518141054, - "grad_norm": 0.7299495339393616, + "grad_norm": 0.7435928583145142, "learning_rate": 4.713998165511619e-05, - "loss": 1.3517, + "loss": 1.3521, "step": 6735 }, { "epoch": 0.4579426552520723, - "grad_norm": 0.6052029132843018, + "grad_norm": 0.6066542863845825, "learning_rate": 4.713785840467455e-05, - "loss": 1.3797, + "loss": 1.3794, "step": 6740 }, { "epoch": 0.4582823753227341, - "grad_norm": 0.7223475575447083, + "grad_norm": 0.7201389670372009, "learning_rate": 4.7135735154232915e-05, - "loss": 1.4449, + "loss": 1.4451, "step": 6745 }, { "epoch": 0.4586220953933958, - "grad_norm": 0.6296783685684204, + "grad_norm": 0.6423707008361816, "learning_rate": 4.713361190379128e-05, - "loss": 1.3496, + "loss": 1.3493, "step": 6750 }, { "epoch": 0.4589618154640576, - "grad_norm": 0.7194005846977234, + "grad_norm": 0.7198963165283203, "learning_rate": 4.713148865334964e-05, - "loss": 1.5065, + "loss": 1.5063, "step": 6755 }, { "epoch": 0.4593015355347194, - "grad_norm": 0.579858124256134, + "grad_norm": 0.579082190990448, "learning_rate": 4.712936540290801e-05, - "loss": 1.3154, + "loss": 1.3155, "step": 6760 }, { "epoch": 0.45964125560538116, - "grad_norm": 0.6932454705238342, + "grad_norm": 0.689719021320343, "learning_rate": 4.712724215246637e-05, - "loss": 1.3644, + "loss": 1.3636, "step": 6765 }, { "epoch": 0.45998097567604296, - "grad_norm": 0.6134516596794128, + "grad_norm": 0.615871012210846, "learning_rate": 4.7125118902024735e-05, - "loss": 1.4632, + "loss": 1.4637, "step": 6770 }, { "epoch": 0.4603206957467047, - "grad_norm": 0.647254467010498, + "grad_norm": 0.6502019166946411, "learning_rate": 4.71229956515831e-05, "loss": 1.4343, "step": 6775 }, { "epoch": 0.4606604158173665, - "grad_norm": 0.6321381330490112, + "grad_norm": 0.6338366270065308, "learning_rate": 4.712087240114146e-05, - "loss": 1.4337, + "loss": 1.4334, "step": 6780 }, { "epoch": 0.46100013588802824, - "grad_norm": 0.6225603222846985, + "grad_norm": 0.6241445541381836, "learning_rate": 4.711874915069983e-05, - "loss": 1.3577, + "loss": 1.3578, "step": 6785 }, { "epoch": 0.46133985595869004, - "grad_norm": 0.6371875405311584, + "grad_norm": 0.6385154128074646, "learning_rate": 4.711662590025819e-05, - "loss": 1.4572, + "loss": 1.4571, "step": 6790 }, { "epoch": 0.46167957602935183, - "grad_norm": 0.5947543382644653, + "grad_norm": 0.5958842635154724, "learning_rate": 4.7114502649816555e-05, - "loss": 1.3498, + "loss": 1.3508, "step": 6795 }, { "epoch": 0.4620192961000136, - "grad_norm": 0.6039636731147766, + "grad_norm": 0.603630006313324, "learning_rate": 4.711237939937492e-05, - "loss": 1.4372, + "loss": 1.4375, "step": 6800 }, { "epoch": 0.4623590161706754, - "grad_norm": 0.6076428294181824, + "grad_norm": 0.6063517332077026, "learning_rate": 4.7110256148933276e-05, - "loss": 1.3423, + "loss": 1.3419, "step": 6805 }, { "epoch": 0.4626987362413371, - "grad_norm": 0.623203456401825, + "grad_norm": 0.6224209070205688, "learning_rate": 4.710813289849165e-05, - "loss": 1.3663, + "loss": 1.3659, "step": 6810 }, { "epoch": 0.4630384563119989, - "grad_norm": 0.5835606455802917, + "grad_norm": 0.5842647552490234, "learning_rate": 4.710600964805001e-05, - "loss": 1.3292, + "loss": 1.3285, "step": 6815 }, { "epoch": 0.4633781763826607, - "grad_norm": 0.6508517861366272, + "grad_norm": 0.6530916094779968, "learning_rate": 4.710388639760837e-05, - "loss": 1.3595, + "loss": 1.3597, "step": 6820 }, { "epoch": 0.46371789645332245, - "grad_norm": 0.5497671961784363, + "grad_norm": 0.5500802397727966, "learning_rate": 4.710176314716674e-05, - "loss": 1.3624, + "loss": 1.3627, "step": 6825 }, { "epoch": 0.46405761652398425, - "grad_norm": 0.6289246082305908, + "grad_norm": 0.6246926188468933, "learning_rate": 4.70996398967251e-05, - "loss": 1.3744, + "loss": 1.3742, "step": 6830 }, { "epoch": 0.464397336594646, - "grad_norm": 0.7137528657913208, + "grad_norm": 0.7113204002380371, "learning_rate": 4.709751664628346e-05, - "loss": 1.3934, + "loss": 1.3925, "step": 6835 }, { "epoch": 0.4647370566653078, - "grad_norm": 0.638431191444397, + "grad_norm": 0.6359570622444153, "learning_rate": 4.709539339584183e-05, - "loss": 1.4849, + "loss": 1.4842, "step": 6840 }, { "epoch": 0.4650767767359696, - "grad_norm": 0.5792069435119629, + "grad_norm": 0.582706868648529, "learning_rate": 4.7093270145400195e-05, - "loss": 1.3843, + "loss": 1.3845, "step": 6845 }, { "epoch": 0.46541649680663133, - "grad_norm": 0.7117582559585571, + "grad_norm": 0.7090787291526794, "learning_rate": 4.709114689495855e-05, - "loss": 1.5094, + "loss": 1.5093, "step": 6850 }, { "epoch": 0.4657562168772931, - "grad_norm": 0.5959650278091431, + "grad_norm": 0.5955531001091003, "learning_rate": 4.708902364451692e-05, - "loss": 1.3869, + "loss": 1.3876, "step": 6855 }, { "epoch": 0.46609593694795487, - "grad_norm": 0.6324079632759094, + "grad_norm": 0.6333985328674316, "learning_rate": 4.708690039407529e-05, - "loss": 1.3351, + "loss": 1.3355, "step": 6860 }, { "epoch": 0.46643565701861667, - "grad_norm": 0.6711081266403198, + "grad_norm": 0.6683228015899658, "learning_rate": 4.7084777143633644e-05, - "loss": 1.4543, + "loss": 1.4547, "step": 6865 }, { "epoch": 0.4667753770892784, - "grad_norm": 0.6800785660743713, + "grad_norm": 0.6778053641319275, "learning_rate": 4.7082653893192015e-05, - "loss": 1.302, + "loss": 1.3014, "step": 6870 }, { "epoch": 0.4671150971599402, - "grad_norm": 0.6625350117683411, + "grad_norm": 0.6553548574447632, "learning_rate": 4.708053064275038e-05, - "loss": 1.2314, + "loss": 1.231, "step": 6875 }, { "epoch": 0.467454817230602, - "grad_norm": 0.7001163959503174, + "grad_norm": 0.7071673274040222, "learning_rate": 4.7078407392308736e-05, - "loss": 1.4239, + "loss": 1.4236, "step": 6880 }, { "epoch": 0.46779453730126375, - "grad_norm": 0.6386942267417908, + "grad_norm": 0.6369908452033997, "learning_rate": 4.707628414186711e-05, - "loss": 1.4019, + "loss": 1.4017, "step": 6885 }, { "epoch": 0.46813425737192554, - "grad_norm": 0.6393757462501526, + "grad_norm": 0.639376699924469, "learning_rate": 4.7074160891425464e-05, - "loss": 1.4489, + "loss": 1.4488, "step": 6890 }, { "epoch": 0.4684739774425873, - "grad_norm": 0.6970923542976379, + "grad_norm": 0.6954451203346252, "learning_rate": 4.707203764098383e-05, - "loss": 1.368, + "loss": 1.3681, "step": 6895 }, { "epoch": 0.4688136975132491, - "grad_norm": 0.599149227142334, + "grad_norm": 0.5978522300720215, "learning_rate": 4.70699143905422e-05, - "loss": 1.4188, + "loss": 1.4189, "step": 6900 }, { "epoch": 0.4691534175839109, - "grad_norm": 0.6489973664283752, + "grad_norm": 0.6491764187812805, "learning_rate": 4.7067791140100556e-05, - "loss": 1.3446, + "loss": 1.3448, "step": 6905 }, { "epoch": 0.4694931376545726, - "grad_norm": 0.6070247888565063, + "grad_norm": 0.6084409356117249, "learning_rate": 4.706566788965892e-05, "loss": 1.3729, "step": 6910 }, { "epoch": 0.4698328577252344, - "grad_norm": 0.6290463805198669, + "grad_norm": 0.626599907875061, "learning_rate": 4.706354463921729e-05, - "loss": 1.4297, + "loss": 1.4301, "step": 6915 }, { "epoch": 0.47017257779589616, - "grad_norm": 0.6896281242370605, + "grad_norm": 0.6886815428733826, "learning_rate": 4.706142138877565e-05, - "loss": 1.3642, + "loss": 1.3644, "step": 6920 }, { "epoch": 0.47051229786655796, - "grad_norm": 0.6045809984207153, + "grad_norm": 0.6031310558319092, "learning_rate": 4.705929813833401e-05, - "loss": 1.4051, + "loss": 1.4053, "step": 6925 }, { "epoch": 0.47085201793721976, - "grad_norm": 0.5476675033569336, + "grad_norm": 0.5482912659645081, "learning_rate": 4.705717488789238e-05, - "loss": 1.3467, + "loss": 1.347, "step": 6930 }, { "epoch": 0.4711917380078815, - "grad_norm": 0.6183815598487854, + "grad_norm": 0.6204922199249268, "learning_rate": 4.705505163745074e-05, - "loss": 1.4159, + "loss": 1.4157, "step": 6935 }, { "epoch": 0.4715314580785433, - "grad_norm": 0.6676488518714905, + "grad_norm": 0.6717233657836914, "learning_rate": 4.7052928387009104e-05, - "loss": 1.4024, + "loss": 1.4025, "step": 6940 }, { "epoch": 0.47187117814920504, - "grad_norm": 0.6127513647079468, + "grad_norm": 0.6122251152992249, "learning_rate": 4.7050805136567475e-05, - "loss": 1.4613, + "loss": 1.4612, "step": 6945 }, { "epoch": 0.47221089821986684, - "grad_norm": 0.6065813899040222, + "grad_norm": 0.6094078421592712, "learning_rate": 4.704868188612583e-05, - "loss": 1.2956, + "loss": 1.2958, "step": 6950 }, { "epoch": 0.4725506182905286, - "grad_norm": 0.6002477407455444, + "grad_norm": 0.5995004773139954, "learning_rate": 4.7046558635684196e-05, - "loss": 1.5058, + "loss": 1.5056, "step": 6955 }, { "epoch": 0.4728903383611904, - "grad_norm": 0.7777348756790161, + "grad_norm": 0.7737948894500732, "learning_rate": 4.704443538524256e-05, - "loss": 1.4562, + "loss": 1.4566, "step": 6960 }, { "epoch": 0.4732300584318522, - "grad_norm": 0.6998386979103088, + "grad_norm": 0.6943872570991516, "learning_rate": 4.7042312134800924e-05, - "loss": 1.4011, + "loss": 1.401, "step": 6965 }, { "epoch": 0.4735697785025139, - "grad_norm": 0.5755128264427185, + "grad_norm": 0.576105535030365, "learning_rate": 4.704018888435929e-05, - "loss": 1.4089, + "loss": 1.4086, "step": 6970 }, { "epoch": 0.4739094985731757, - "grad_norm": 0.5363697409629822, + "grad_norm": 0.5370616316795349, "learning_rate": 4.703806563391765e-05, - "loss": 1.4084, + "loss": 1.4086, "step": 6975 }, { "epoch": 0.47424921864383746, - "grad_norm": 0.6444015502929688, + "grad_norm": 0.6401310563087463, "learning_rate": 4.7035942383476016e-05, - "loss": 1.3412, + "loss": 1.3419, "step": 6980 }, { "epoch": 0.47458893871449925, - "grad_norm": 0.6073984503746033, + "grad_norm": 0.6063711643218994, "learning_rate": 4.703381913303439e-05, - "loss": 1.3084, + "loss": 1.3077, "step": 6985 }, { "epoch": 0.47492865878516105, - "grad_norm": 0.6416052579879761, + "grad_norm": 0.6448118090629578, "learning_rate": 4.7031695882592744e-05, - "loss": 1.393, + "loss": 1.3925, "step": 6990 }, { "epoch": 0.4752683788558228, - "grad_norm": 0.7063747644424438, + "grad_norm": 0.7061852216720581, "learning_rate": 4.702957263215111e-05, - "loss": 1.3068, + "loss": 1.3067, "step": 6995 }, { "epoch": 0.4756080989264846, - "grad_norm": 0.5851465463638306, + "grad_norm": 0.5849624872207642, "learning_rate": 4.702744938170948e-05, - "loss": 1.3124, + "loss": 1.3121, "step": 7000 }, { "epoch": 0.47594781899714633, - "grad_norm": 0.625334620475769, + "grad_norm": 0.6268022656440735, "learning_rate": 4.7025326131267836e-05, "loss": 1.3969, "step": 7005 }, { "epoch": 0.47628753906780813, - "grad_norm": 0.7166358828544617, + "grad_norm": 0.7078633904457092, "learning_rate": 4.70232028808262e-05, - "loss": 1.4506, + "loss": 1.4508, "step": 7010 }, { "epoch": 0.4766272591384699, - "grad_norm": 0.663744330406189, + "grad_norm": 0.6659931540489197, "learning_rate": 4.702107963038457e-05, - "loss": 1.4552, + "loss": 1.4556, "step": 7015 }, { "epoch": 0.47696697920913167, - "grad_norm": 0.6093037724494934, + "grad_norm": 0.608733057975769, "learning_rate": 4.701895637994293e-05, - "loss": 1.3591, + "loss": 1.3588, "step": 7020 }, { "epoch": 0.47730669927979347, - "grad_norm": 0.5953529477119446, + "grad_norm": 0.5917583703994751, "learning_rate": 4.701683312950129e-05, "loss": 1.2685, "step": 7025 }, { "epoch": 0.4776464193504552, - "grad_norm": 0.7081599235534668, + "grad_norm": 0.7081303000450134, "learning_rate": 4.7014709879059656e-05, - "loss": 1.4126, + "loss": 1.4122, "step": 7030 }, { "epoch": 0.477986139421117, - "grad_norm": 0.6899991035461426, + "grad_norm": 0.6908140182495117, "learning_rate": 4.701258662861802e-05, - "loss": 1.3675, + "loss": 1.3677, "step": 7035 }, { "epoch": 0.47832585949177875, - "grad_norm": 0.6287958025932312, + "grad_norm": 0.6233030557632446, "learning_rate": 4.7010463378176384e-05, - "loss": 1.3333, + "loss": 1.3332, "step": 7040 }, { "epoch": 0.47866557956244055, - "grad_norm": 0.5969959497451782, + "grad_norm": 0.5986418724060059, "learning_rate": 4.700834012773475e-05, - "loss": 1.3552, + "loss": 1.3556, "step": 7045 }, { "epoch": 0.47900529963310234, - "grad_norm": 0.5409806370735168, + "grad_norm": 0.5415254235267639, "learning_rate": 4.700621687729311e-05, - "loss": 1.3008, + "loss": 1.3007, "step": 7050 }, { "epoch": 0.4793450197037641, - "grad_norm": 0.6205386519432068, + "grad_norm": 0.6188207864761353, "learning_rate": 4.7004093626851476e-05, - "loss": 1.3809, + "loss": 1.381, "step": 7055 }, { "epoch": 0.4796847397744259, - "grad_norm": 0.5811137557029724, + "grad_norm": 0.5825943946838379, "learning_rate": 4.700197037640984e-05, "loss": 1.4104, "step": 7060 }, { "epoch": 0.4800244598450876, - "grad_norm": 0.6639488339424133, + "grad_norm": 0.6652608513832092, "learning_rate": 4.6999847125968204e-05, - "loss": 1.3618, + "loss": 1.3627, "step": 7065 }, { "epoch": 0.4803641799157494, - "grad_norm": 0.6906101107597351, + "grad_norm": 0.682145357131958, "learning_rate": 4.699772387552657e-05, - "loss": 1.3211, + "loss": 1.3214, "step": 7070 }, { "epoch": 0.4807038999864112, - "grad_norm": 0.6427626609802246, + "grad_norm": 0.6436914205551147, "learning_rate": 4.699560062508493e-05, - "loss": 1.4017, + "loss": 1.4024, "step": 7075 }, { "epoch": 0.48104362005707296, - "grad_norm": 0.6597930788993835, + "grad_norm": 0.6593771576881409, "learning_rate": 4.6993477374643296e-05, - "loss": 1.391, + "loss": 1.3915, "step": 7080 }, { "epoch": 0.48138334012773476, - "grad_norm": 0.5214569568634033, + "grad_norm": 0.521743655204773, "learning_rate": 4.699135412420166e-05, - "loss": 1.4231, + "loss": 1.4233, "step": 7085 }, { "epoch": 0.4817230601983965, - "grad_norm": 0.6299998760223389, + "grad_norm": 0.6319026947021484, "learning_rate": 4.6989230873760024e-05, - "loss": 1.4031, + "loss": 1.4033, "step": 7090 }, { "epoch": 0.4820627802690583, - "grad_norm": 0.6915963292121887, + "grad_norm": 0.6919960975646973, "learning_rate": 4.698710762331839e-05, - "loss": 1.3353, + "loss": 1.3355, "step": 7095 }, { "epoch": 0.4824025003397201, - "grad_norm": 0.5781539082527161, + "grad_norm": 0.5764263868331909, "learning_rate": 4.698498437287675e-05, - "loss": 1.3934, + "loss": 1.3939, "step": 7100 }, { "epoch": 0.48274222041038184, - "grad_norm": 0.8143801689147949, + "grad_norm": 0.8115857839584351, "learning_rate": 4.6982861122435116e-05, - "loss": 1.3657, + "loss": 1.3656, "step": 7105 }, { "epoch": 0.48308194048104364, - "grad_norm": 0.5864207744598389, + "grad_norm": 0.5859213471412659, "learning_rate": 4.698073787199348e-05, "loss": 1.4226, "step": 7110 }, { "epoch": 0.4834216605517054, - "grad_norm": 0.6527549028396606, + "grad_norm": 0.6513347029685974, "learning_rate": 4.6978614621551844e-05, "loss": 1.2369, "step": 7115 }, { "epoch": 0.4837613806223672, - "grad_norm": 0.6397340893745422, + "grad_norm": 0.6403194665908813, "learning_rate": 4.697649137111021e-05, - "loss": 1.4503, + "loss": 1.4505, "step": 7120 }, { "epoch": 0.4841011006930289, - "grad_norm": 0.7032833099365234, + "grad_norm": 0.7004779577255249, "learning_rate": 4.697436812066857e-05, - "loss": 1.4234, + "loss": 1.4236, "step": 7125 }, { "epoch": 0.4844408207636907, - "grad_norm": 0.6597045063972473, + "grad_norm": 0.6604002118110657, "learning_rate": 4.6972244870226936e-05, - "loss": 1.3776, + "loss": 1.3782, "step": 7130 }, { "epoch": 0.4847805408343525, - "grad_norm": 0.6307904124259949, + "grad_norm": 0.6332085728645325, "learning_rate": 4.69701216197853e-05, - "loss": 1.3324, + "loss": 1.3314, "step": 7135 }, { "epoch": 0.48512026090501426, - "grad_norm": 0.597841203212738, + "grad_norm": 0.5978452563285828, "learning_rate": 4.6967998369343664e-05, - "loss": 1.3702, + "loss": 1.3697, "step": 7140 }, { "epoch": 0.48545998097567605, - "grad_norm": 0.6850742101669312, + "grad_norm": 0.6851106286048889, "learning_rate": 4.696587511890203e-05, - "loss": 1.3493, + "loss": 1.349, "step": 7145 }, { "epoch": 0.4857997010463378, - "grad_norm": 0.6079716682434082, + "grad_norm": 0.6114030480384827, "learning_rate": 4.696375186846039e-05, - "loss": 1.3664, + "loss": 1.3662, "step": 7150 }, { "epoch": 0.4861394211169996, - "grad_norm": 0.6594722270965576, + "grad_norm": 0.6580917835235596, "learning_rate": 4.6961628618018756e-05, - "loss": 1.3821, + "loss": 1.3823, "step": 7155 }, { "epoch": 0.4864791411876614, - "grad_norm": 0.6235490441322327, + "grad_norm": 0.6250133514404297, "learning_rate": 4.6959505367577114e-05, - "loss": 1.3944, + "loss": 1.394, "step": 7160 }, { "epoch": 0.48681886125832313, - "grad_norm": 0.6533124446868896, + "grad_norm": 0.6537445783615112, "learning_rate": 4.6957382117135484e-05, - "loss": 1.314, + "loss": 1.3144, "step": 7165 }, { "epoch": 0.48715858132898493, - "grad_norm": 0.6049984693527222, + "grad_norm": 0.6050564050674438, "learning_rate": 4.695525886669385e-05, - "loss": 1.4004, + "loss": 1.4, "step": 7170 }, { "epoch": 0.48749830139964667, - "grad_norm": 0.6813513040542603, + "grad_norm": 0.6814160346984863, "learning_rate": 4.6953135616252206e-05, - "loss": 1.4451, + "loss": 1.445, "step": 7175 }, { "epoch": 0.48783802147030847, - "grad_norm": 0.707304060459137, + "grad_norm": 0.7019910216331482, "learning_rate": 4.6951012365810576e-05, - "loss": 1.331, + "loss": 1.3308, "step": 7180 }, { "epoch": 0.48817774154097027, - "grad_norm": 0.596437394618988, + "grad_norm": 0.5910308957099915, "learning_rate": 4.694888911536894e-05, - "loss": 1.3058, + "loss": 1.3062, "step": 7185 }, { "epoch": 0.488517461611632, - "grad_norm": 0.6496755480766296, + "grad_norm": 0.6503949165344238, "learning_rate": 4.69467658649273e-05, - "loss": 1.3408, + "loss": 1.3404, "step": 7190 }, { "epoch": 0.4888571816822938, - "grad_norm": 0.6542823910713196, + "grad_norm": 0.6556570529937744, "learning_rate": 4.694464261448567e-05, - "loss": 1.4765, + "loss": 1.4776, "step": 7195 }, { "epoch": 0.48919690175295555, - "grad_norm": 0.5419815182685852, + "grad_norm": 0.5468158721923828, "learning_rate": 4.694251936404403e-05, - "loss": 1.4465, + "loss": 1.4467, "step": 7200 }, { "epoch": 0.48953662182361735, - "grad_norm": 0.6572680473327637, + "grad_norm": 0.6573325991630554, "learning_rate": 4.694039611360239e-05, - "loss": 1.376, + "loss": 1.3766, "step": 7205 }, { "epoch": 0.4898763418942791, - "grad_norm": 0.5701210498809814, + "grad_norm": 0.5701974630355835, "learning_rate": 4.693827286316076e-05, - "loss": 1.3777, + "loss": 1.3783, "step": 7210 }, { "epoch": 0.4902160619649409, - "grad_norm": 0.7354646325111389, + "grad_norm": 0.737570583820343, "learning_rate": 4.6936149612719124e-05, - "loss": 1.3926, + "loss": 1.3928, "step": 7215 }, { "epoch": 0.4905557820356027, - "grad_norm": 0.6235008239746094, + "grad_norm": 0.6262993216514587, "learning_rate": 4.693402636227748e-05, - "loss": 1.3472, + "loss": 1.3473, "step": 7220 }, { "epoch": 0.4908955021062644, - "grad_norm": 0.5395787954330444, + "grad_norm": 0.5372124910354614, "learning_rate": 4.693190311183585e-05, - "loss": 1.4022, + "loss": 1.4018, "step": 7225 }, { "epoch": 0.4912352221769262, - "grad_norm": 0.6403634548187256, + "grad_norm": 0.6388150453567505, "learning_rate": 4.692977986139421e-05, - "loss": 1.3633, + "loss": 1.3631, "step": 7230 }, { "epoch": 0.49157494224758796, - "grad_norm": 0.724759578704834, + "grad_norm": 0.7263844609260559, "learning_rate": 4.6927656610952574e-05, - "loss": 1.3742, + "loss": 1.3747, "step": 7235 }, { "epoch": 0.49191466231824976, - "grad_norm": 0.6368885636329651, + "grad_norm": 0.6385419368743896, "learning_rate": 4.6925533360510944e-05, - "loss": 1.2781, + "loss": 1.2778, "step": 7240 }, { "epoch": 0.49225438238891156, - "grad_norm": 0.635615348815918, + "grad_norm": 0.6355118155479431, "learning_rate": 4.69234101100693e-05, - "loss": 1.328, + "loss": 1.3281, "step": 7245 }, { "epoch": 0.4925941024595733, - "grad_norm": 0.669722855091095, + "grad_norm": 0.6698096394538879, "learning_rate": 4.6921286859627666e-05, - "loss": 1.4532, + "loss": 1.4534, "step": 7250 }, { "epoch": 0.4929338225302351, - "grad_norm": 0.584342360496521, + "grad_norm": 0.58675217628479, "learning_rate": 4.6919163609186036e-05, - "loss": 1.3522, + "loss": 1.352, "step": 7255 }, { "epoch": 0.49327354260089684, - "grad_norm": 0.6784916520118713, + "grad_norm": 0.6844244599342346, "learning_rate": 4.6917040358744394e-05, - "loss": 1.4906, + "loss": 1.4905, "step": 7260 }, { "epoch": 0.49361326267155864, - "grad_norm": 0.6705710291862488, + "grad_norm": 0.672153651714325, "learning_rate": 4.691491710830276e-05, - "loss": 1.3883, + "loss": 1.3888, "step": 7265 }, { "epoch": 0.49395298274222044, - "grad_norm": 0.65829998254776, + "grad_norm": 0.6564855575561523, "learning_rate": 4.691279385786113e-05, - "loss": 1.3488, + "loss": 1.3479, "step": 7270 }, { "epoch": 0.4942927028128822, - "grad_norm": 0.6567642688751221, + "grad_norm": 0.6594576239585876, "learning_rate": 4.6910670607419486e-05, - "loss": 1.4068, + "loss": 1.4074, "step": 7275 }, { "epoch": 0.494632422883544, - "grad_norm": 0.7130700349807739, + "grad_norm": 0.7132222056388855, "learning_rate": 4.690854735697785e-05, - "loss": 1.3641, + "loss": 1.3644, "step": 7280 }, { "epoch": 0.4949721429542057, - "grad_norm": 0.6480490565299988, + "grad_norm": 0.6510976552963257, "learning_rate": 4.690642410653622e-05, - "loss": 1.4546, + "loss": 1.4538, "step": 7285 }, { "epoch": 0.4953118630248675, - "grad_norm": 0.6228196620941162, + "grad_norm": 0.6218722462654114, "learning_rate": 4.690430085609458e-05, "loss": 1.3729, "step": 7290 }, { "epoch": 0.49565158309552926, - "grad_norm": 0.5975043773651123, + "grad_norm": 0.6013529300689697, "learning_rate": 4.690217760565294e-05, - "loss": 1.3468, + "loss": 1.3466, "step": 7295 }, { "epoch": 0.49599130316619106, - "grad_norm": 0.6154775023460388, + "grad_norm": 0.6124358177185059, "learning_rate": 4.690005435521131e-05, - "loss": 1.4006, + "loss": 1.4007, "step": 7300 }, { "epoch": 0.49633102323685285, - "grad_norm": 0.6062136888504028, + "grad_norm": 0.6021935343742371, "learning_rate": 4.689793110476967e-05, - "loss": 1.3717, + "loss": 1.3714, "step": 7305 }, { "epoch": 0.4966707433075146, - "grad_norm": 0.6884073615074158, + "grad_norm": 0.6878105401992798, "learning_rate": 4.6895807854328034e-05, - "loss": 1.3792, + "loss": 1.3796, "step": 7310 }, { "epoch": 0.4970104633781764, - "grad_norm": 0.6220802068710327, + "grad_norm": 0.6230244636535645, "learning_rate": 4.68936846038864e-05, "loss": 1.3939, "step": 7315 }, { "epoch": 0.49735018344883813, - "grad_norm": 0.6321655511856079, + "grad_norm": 0.637650191783905, "learning_rate": 4.689156135344476e-05, "loss": 1.4618, "step": 7320 }, { "epoch": 0.49768990351949993, - "grad_norm": 0.6812154650688171, + "grad_norm": 0.6793188452720642, "learning_rate": 4.688943810300313e-05, - "loss": 1.4334, + "loss": 1.4335, "step": 7325 }, { "epoch": 0.49802962359016173, - "grad_norm": 0.5642015337944031, + "grad_norm": 0.561997652053833, "learning_rate": 4.688731485256149e-05, - "loss": 1.4009, + "loss": 1.4006, "step": 7330 }, { "epoch": 0.49836934366082347, - "grad_norm": 0.6238872408866882, + "grad_norm": 0.6245251297950745, "learning_rate": 4.6885191602119854e-05, - "loss": 1.4417, + "loss": 1.4416, "step": 7335 }, { "epoch": 0.49870906373148527, - "grad_norm": 0.575141191482544, + "grad_norm": 0.5713440775871277, "learning_rate": 4.6883068351678225e-05, - "loss": 1.3855, + "loss": 1.3852, "step": 7340 }, { "epoch": 0.499048783802147, - "grad_norm": 0.5635263919830322, + "grad_norm": 0.5639804601669312, "learning_rate": 4.688094510123658e-05, - "loss": 1.3929, + "loss": 1.3937, "step": 7345 }, { "epoch": 0.4993885038728088, - "grad_norm": 0.6585545539855957, + "grad_norm": 0.6597671508789062, "learning_rate": 4.6878821850794946e-05, - "loss": 1.2866, + "loss": 1.2871, "step": 7350 }, { "epoch": 0.4997282239434706, - "grad_norm": 0.6060187220573425, + "grad_norm": 0.6116788387298584, "learning_rate": 4.6876698600353317e-05, "loss": 1.3086, "step": 7355 }, { "epoch": 0.5000679440141323, - "grad_norm": 0.7069248557090759, + "grad_norm": 0.7007910013198853, "learning_rate": 4.6874575349911674e-05, "loss": 1.4091, "step": 7360 }, { "epoch": 0.5004076640847941, - "grad_norm": 0.5305033922195435, + "grad_norm": 0.5318356156349182, "learning_rate": 4.687245209947004e-05, - "loss": 1.3546, + "loss": 1.3551, "step": 7365 }, { "epoch": 0.5007473841554559, - "grad_norm": 0.5994262099266052, + "grad_norm": 0.5937709808349609, "learning_rate": 4.687032884902841e-05, "loss": 1.3083, "step": 7370 }, { "epoch": 0.5010871042261177, - "grad_norm": 0.5297023057937622, + "grad_norm": 0.562004804611206, "learning_rate": 4.6868205598586766e-05, - "loss": 1.2246, + "loss": 1.2248, "step": 7375 }, { "epoch": 0.5014268242967794, - "grad_norm": 0.5558198690414429, + "grad_norm": 0.5559371113777161, "learning_rate": 4.686608234814513e-05, - "loss": 1.4296, + "loss": 1.4298, "step": 7380 }, { "epoch": 0.5017665443674413, - "grad_norm": 0.6724082827568054, + "grad_norm": 0.6699419617652893, "learning_rate": 4.6863959097703494e-05, - "loss": 1.4768, + "loss": 1.477, "step": 7385 }, { "epoch": 0.502106264438103, - "grad_norm": 0.6454963684082031, + "grad_norm": 0.6447699666023254, "learning_rate": 4.686183584726186e-05, - "loss": 1.3367, + "loss": 1.3362, "step": 7390 }, { "epoch": 0.5024459845087648, - "grad_norm": 0.6249825358390808, + "grad_norm": 0.6252833008766174, "learning_rate": 4.685971259682022e-05, - "loss": 1.3552, + "loss": 1.3558, "step": 7395 }, { "epoch": 0.5027857045794265, - "grad_norm": 0.6613280773162842, + "grad_norm": 0.6689853668212891, "learning_rate": 4.6857589346378586e-05, - "loss": 1.4176, + "loss": 1.4179, "step": 7400 }, { "epoch": 0.5031254246500884, - "grad_norm": 0.6486158967018127, + "grad_norm": 0.6476678252220154, "learning_rate": 4.685546609593695e-05, - "loss": 1.3796, + "loss": 1.3794, "step": 7405 }, { "epoch": 0.5034651447207501, - "grad_norm": 0.5767577886581421, + "grad_norm": 0.5764955282211304, "learning_rate": 4.6853342845495314e-05, - "loss": 1.3338, + "loss": 1.3337, "step": 7410 }, { "epoch": 0.5038048647914118, - "grad_norm": 0.6602384448051453, + "grad_norm": 0.6616225242614746, "learning_rate": 4.685121959505368e-05, - "loss": 1.4648, + "loss": 1.4646, "step": 7415 }, { "epoch": 0.5041445848620737, - "grad_norm": 0.725269079208374, + "grad_norm": 0.723371684551239, "learning_rate": 4.684909634461204e-05, - "loss": 1.4488, + "loss": 1.4493, "step": 7420 }, { "epoch": 0.5044843049327354, - "grad_norm": 0.6460714936256409, + "grad_norm": 0.6469457745552063, "learning_rate": 4.6846973094170406e-05, - "loss": 1.3039, + "loss": 1.3033, "step": 7425 }, { "epoch": 0.5048240250033972, - "grad_norm": 0.6101348996162415, + "grad_norm": 0.6126435995101929, "learning_rate": 4.684484984372877e-05, - "loss": 1.3841, + "loss": 1.3847, "step": 7430 }, { "epoch": 0.5051637450740589, - "grad_norm": 0.5442931652069092, + "grad_norm": 0.5432808995246887, "learning_rate": 4.6842726593287134e-05, - "loss": 1.4632, + "loss": 1.4634, "step": 7435 }, { "epoch": 0.5055034651447208, - "grad_norm": 0.6163715124130249, + "grad_norm": 0.6145235896110535, "learning_rate": 4.68406033428455e-05, "loss": 1.4037, "step": 7440 }, { "epoch": 0.5058431852153825, - "grad_norm": 0.5447462201118469, + "grad_norm": 0.544209897518158, "learning_rate": 4.683848009240386e-05, - "loss": 1.3181, + "loss": 1.3185, "step": 7445 }, { "epoch": 0.5061829052860443, - "grad_norm": 0.5884779691696167, + "grad_norm": 0.5874150991439819, "learning_rate": 4.6836356841962226e-05, - "loss": 1.3845, + "loss": 1.3852, "step": 7450 }, { "epoch": 0.5065226253567061, - "grad_norm": 0.675547182559967, + "grad_norm": 0.6760624647140503, "learning_rate": 4.683423359152059e-05, - "loss": 1.3882, + "loss": 1.3886, "step": 7455 }, { "epoch": 0.5068623454273679, - "grad_norm": 0.5374890565872192, + "grad_norm": 0.5365296006202698, "learning_rate": 4.6832110341078954e-05, - "loss": 1.4428, + "loss": 1.4418, "step": 7460 }, { "epoch": 0.5072020654980296, - "grad_norm": 0.662009596824646, + "grad_norm": 0.6599185466766357, "learning_rate": 4.682998709063732e-05, - "loss": 1.4613, + "loss": 1.4616, "step": 7465 }, { "epoch": 0.5075417855686915, - "grad_norm": 0.5851577520370483, + "grad_norm": 0.5860413312911987, "learning_rate": 4.682786384019568e-05, - "loss": 1.3581, + "loss": 1.3582, "step": 7470 }, { "epoch": 0.5078815056393532, - "grad_norm": 0.6344434022903442, + "grad_norm": 0.6371375322341919, "learning_rate": 4.6825740589754046e-05, - "loss": 1.4882, + "loss": 1.4876, "step": 7475 }, { "epoch": 0.5082212257100149, - "grad_norm": 0.6997050642967224, + "grad_norm": 0.6983456015586853, "learning_rate": 4.682361733931241e-05, - "loss": 1.3075, + "loss": 1.3069, "step": 7480 }, { "epoch": 0.5085609457806767, - "grad_norm": 0.6474372744560242, + "grad_norm": 0.6485677361488342, "learning_rate": 4.6821494088870774e-05, - "loss": 1.4286, + "loss": 1.428, "step": 7485 }, { "epoch": 0.5089006658513385, - "grad_norm": 0.5623887181282043, + "grad_norm": 0.5630195736885071, "learning_rate": 4.681937083842914e-05, - "loss": 1.3027, + "loss": 1.3026, "step": 7490 }, { "epoch": 0.5092403859220003, - "grad_norm": 0.6566697359085083, + "grad_norm": 0.6584610939025879, "learning_rate": 4.68172475879875e-05, - "loss": 1.4586, + "loss": 1.4588, "step": 7495 }, { "epoch": 0.509580105992662, - "grad_norm": 0.6457698345184326, + "grad_norm": 0.6512005925178528, "learning_rate": 4.6815124337545866e-05, - "loss": 1.3691, + "loss": 1.3695, "step": 7500 }, { "epoch": 0.5099198260633239, - "grad_norm": 0.6728169918060303, + "grad_norm": 0.6735360026359558, "learning_rate": 4.681300108710423e-05, - "loss": 1.4661, + "loss": 1.466, "step": 7505 }, { "epoch": 0.5102595461339856, - "grad_norm": 0.5606579780578613, + "grad_norm": 0.5595372915267944, "learning_rate": 4.6810877836662594e-05, - "loss": 1.4124, + "loss": 1.4129, "step": 7510 }, { "epoch": 0.5105992662046474, - "grad_norm": 0.5920879244804382, + "grad_norm": 0.5912380814552307, "learning_rate": 4.680875458622095e-05, - "loss": 1.4345, + "loss": 1.4349, "step": 7515 }, { "epoch": 0.5109389862753091, - "grad_norm": 0.7026877403259277, + "grad_norm": 0.6980835199356079, "learning_rate": 4.680663133577932e-05, - "loss": 1.4688, + "loss": 1.4692, "step": 7520 }, { "epoch": 0.511278706345971, - "grad_norm": 0.6280520558357239, + "grad_norm": 0.6300933361053467, "learning_rate": 4.6804508085337686e-05, - "loss": 1.4535, + "loss": 1.4534, "step": 7525 }, { "epoch": 0.5116184264166327, - "grad_norm": 0.6298320889472961, + "grad_norm": 0.6297606825828552, "learning_rate": 4.680238483489604e-05, - "loss": 1.3375, + "loss": 1.3378, "step": 7530 }, { "epoch": 0.5119581464872944, - "grad_norm": 0.6901682019233704, + "grad_norm": 0.6973382830619812, "learning_rate": 4.6800261584454414e-05, - "loss": 1.3174, + "loss": 1.3175, "step": 7535 }, { "epoch": 0.5122978665579563, - "grad_norm": 0.6426547765731812, + "grad_norm": 0.644025444984436, "learning_rate": 4.679813833401278e-05, - "loss": 1.4635, + "loss": 1.463, "step": 7540 }, { "epoch": 0.512637586628618, - "grad_norm": 0.6120851039886475, + "grad_norm": 0.6101284027099609, "learning_rate": 4.6796015083571135e-05, "loss": 1.3263, "step": 7545 }, { "epoch": 0.5129773066992798, - "grad_norm": 0.6297520995140076, + "grad_norm": 0.6308725476264954, "learning_rate": 4.6793891833129506e-05, - "loss": 1.4802, + "loss": 1.4799, "step": 7550 }, { "epoch": 0.5133170267699416, - "grad_norm": 0.6352310180664062, + "grad_norm": 0.6367284655570984, "learning_rate": 4.679176858268787e-05, "loss": 1.3879, "step": 7555 }, { "epoch": 0.5136567468406034, - "grad_norm": 0.633722186088562, + "grad_norm": 0.6372681856155396, "learning_rate": 4.678964533224623e-05, - "loss": 1.4923, + "loss": 1.4918, "step": 7560 }, { "epoch": 0.5139964669112651, - "grad_norm": 0.7394413948059082, + "grad_norm": 0.7383389472961426, "learning_rate": 4.67875220818046e-05, - "loss": 1.4159, + "loss": 1.4162, "step": 7565 }, { "epoch": 0.5143361869819268, - "grad_norm": 0.631238579750061, + "grad_norm": 0.6316145658493042, "learning_rate": 4.678539883136296e-05, - "loss": 1.3724, + "loss": 1.3715, "step": 7570 }, { "epoch": 0.5146759070525887, - "grad_norm": 0.7046985626220703, + "grad_norm": 0.7056677937507629, "learning_rate": 4.678327558092132e-05, - "loss": 1.2557, + "loss": 1.2562, "step": 7575 }, { "epoch": 0.5150156271232504, - "grad_norm": 0.5655717253684998, + "grad_norm": 0.5713226199150085, "learning_rate": 4.678115233047969e-05, - "loss": 1.3346, + "loss": 1.3345, "step": 7580 }, { "epoch": 0.5153553471939122, - "grad_norm": 0.620157778263092, + "grad_norm": 0.6236425638198853, "learning_rate": 4.677902908003805e-05, - "loss": 1.4225, + "loss": 1.4228, "step": 7585 }, { "epoch": 0.515695067264574, - "grad_norm": 0.6900854706764221, + "grad_norm": 0.6932271718978882, "learning_rate": 4.677690582959641e-05, - "loss": 1.2892, + "loss": 1.289, "step": 7590 }, { "epoch": 0.5160347873352358, - "grad_norm": 0.6468934416770935, + "grad_norm": 0.6513332724571228, "learning_rate": 4.677478257915478e-05, - "loss": 1.3578, + "loss": 1.3571, "step": 7595 }, { "epoch": 0.5163745074058975, - "grad_norm": 0.6362219452857971, + "grad_norm": 0.6367542743682861, "learning_rate": 4.677265932871314e-05, - "loss": 1.3582, + "loss": 1.3581, "step": 7600 }, { "epoch": 0.5167142274765593, - "grad_norm": 0.5996149182319641, + "grad_norm": 0.5993014574050903, "learning_rate": 4.67705360782715e-05, "loss": 1.3891, "step": 7605 }, { "epoch": 0.5170539475472211, - "grad_norm": 0.6514232754707336, + "grad_norm": 0.6513985991477966, "learning_rate": 4.6768412827829874e-05, - "loss": 1.3815, + "loss": 1.3812, "step": 7610 }, { "epoch": 0.5173936676178829, - "grad_norm": 0.7216064929962158, + "grad_norm": 0.7239281535148621, "learning_rate": 4.676628957738823e-05, - "loss": 1.2489, + "loss": 1.2491, "step": 7615 }, { "epoch": 0.5177333876885446, - "grad_norm": 0.6348338723182678, + "grad_norm": 0.6326451897621155, "learning_rate": 4.6764166326946595e-05, - "loss": 1.4793, + "loss": 1.4792, "step": 7620 }, { "epoch": 0.5180731077592065, - "grad_norm": 0.5960789918899536, + "grad_norm": 0.5947378277778625, "learning_rate": 4.6762043076504966e-05, - "loss": 1.4501, + "loss": 1.4495, "step": 7625 }, { "epoch": 0.5184128278298682, - "grad_norm": 0.6377695798873901, + "grad_norm": 0.6373794674873352, "learning_rate": 4.675991982606332e-05, - "loss": 1.3961, + "loss": 1.3972, "step": 7630 }, { "epoch": 0.5187525479005299, - "grad_norm": 0.6256961822509766, + "grad_norm": 0.6286939382553101, "learning_rate": 4.675779657562169e-05, - "loss": 1.3389, + "loss": 1.3391, "step": 7635 }, { "epoch": 0.5190922679711918, - "grad_norm": 0.5948920249938965, + "grad_norm": 0.6001981496810913, "learning_rate": 4.675567332518006e-05, - "loss": 1.4922, + "loss": 1.4924, "step": 7640 }, { "epoch": 0.5194319880418535, - "grad_norm": 0.6825036406517029, + "grad_norm": 0.6814448237419128, "learning_rate": 4.6753550074738415e-05, - "loss": 1.3309, + "loss": 1.3305, "step": 7645 }, { "epoch": 0.5197717081125153, - "grad_norm": 0.6182436943054199, + "grad_norm": 0.6129140257835388, "learning_rate": 4.675142682429678e-05, - "loss": 1.3435, + "loss": 1.3432, "step": 7650 }, { "epoch": 0.520111428183177, - "grad_norm": 0.6601023077964783, + "grad_norm": 0.6579285264015198, "learning_rate": 4.674930357385514e-05, - "loss": 1.3334, + "loss": 1.333, "step": 7655 }, { "epoch": 0.5204511482538389, - "grad_norm": 0.5339378714561462, + "grad_norm": 0.5319764614105225, "learning_rate": 4.674718032341351e-05, - "loss": 1.3559, + "loss": 1.3553, "step": 7660 }, { "epoch": 0.5207908683245006, - "grad_norm": 0.6746454834938049, + "grad_norm": 0.6780913472175598, "learning_rate": 4.674505707297187e-05, - "loss": 1.3395, + "loss": 1.3393, "step": 7665 }, { "epoch": 0.5211305883951624, - "grad_norm": 0.6270281076431274, + "grad_norm": 0.6283904910087585, "learning_rate": 4.6742933822530235e-05, "loss": 1.4089, "step": 7670 }, { "epoch": 0.5214703084658242, - "grad_norm": 0.7043746709823608, + "grad_norm": 0.7081140279769897, "learning_rate": 4.67408105720886e-05, - "loss": 1.3482, + "loss": 1.3486, "step": 7675 }, { "epoch": 0.521810028536486, - "grad_norm": 0.6281951069831848, + "grad_norm": 0.6251125931739807, "learning_rate": 4.673868732164697e-05, - "loss": 1.2795, + "loss": 1.2796, "step": 7680 }, { "epoch": 0.5221497486071477, - "grad_norm": 0.7626495361328125, + "grad_norm": 0.7619231343269348, "learning_rate": 4.673656407120533e-05, - "loss": 1.4372, + "loss": 1.4379, "step": 7685 }, { "epoch": 0.5224894686778094, - "grad_norm": 0.672217607498169, + "grad_norm": 0.6748984456062317, "learning_rate": 4.673444082076369e-05, - "loss": 1.4279, + "loss": 1.4285, "step": 7690 }, { "epoch": 0.5228291887484713, - "grad_norm": 0.6980194449424744, + "grad_norm": 0.6956965923309326, "learning_rate": 4.673231757032206e-05, - "loss": 1.3344, + "loss": 1.3336, "step": 7695 }, { "epoch": 0.523168908819133, - "grad_norm": 0.6135451793670654, + "grad_norm": 0.6167689561843872, "learning_rate": 4.673019431988042e-05, - "loss": 1.399, + "loss": 1.3987, "step": 7700 }, { "epoch": 0.5235086288897948, - "grad_norm": 0.5958032608032227, + "grad_norm": 0.5954800248146057, "learning_rate": 4.672807106943878e-05, - "loss": 1.4593, + "loss": 1.4592, "step": 7705 }, { "epoch": 0.5238483489604566, - "grad_norm": 0.6722570657730103, + "grad_norm": 0.6722667813301086, "learning_rate": 4.6725947818997154e-05, - "loss": 1.2588, + "loss": 1.2584, "step": 7710 }, { "epoch": 0.5241880690311184, - "grad_norm": 0.6172633767127991, + "grad_norm": 0.6174940466880798, "learning_rate": 4.672382456855551e-05, - "loss": 1.3507, + "loss": 1.351, "step": 7715 }, { "epoch": 0.5245277891017801, - "grad_norm": 0.6627243757247925, + "grad_norm": 0.6645811796188354, "learning_rate": 4.6721701318113875e-05, - "loss": 1.4994, + "loss": 1.4995, "step": 7720 }, { "epoch": 0.524867509172442, - "grad_norm": 0.7068824172019958, + "grad_norm": 0.712710440158844, "learning_rate": 4.6719578067672246e-05, - "loss": 1.3198, + "loss": 1.3195, "step": 7725 }, { "epoch": 0.5252072292431037, - "grad_norm": 0.6820070743560791, + "grad_norm": 0.6817407011985779, "learning_rate": 4.67174548172306e-05, - "loss": 1.4842, + "loss": 1.485, "step": 7730 }, { "epoch": 0.5255469493137654, - "grad_norm": 0.6187071204185486, + "grad_norm": 0.6206430196762085, "learning_rate": 4.671533156678897e-05, - "loss": 1.3429, + "loss": 1.3431, "step": 7735 }, { "epoch": 0.5258866693844272, - "grad_norm": 0.5970851182937622, + "grad_norm": 0.5985031723976135, "learning_rate": 4.671320831634733e-05, - "loss": 1.3337, + "loss": 1.3335, "step": 7740 }, { "epoch": 0.526226389455089, - "grad_norm": 0.6725092530250549, + "grad_norm": 0.672912061214447, "learning_rate": 4.6711085065905695e-05, - "loss": 1.3842, + "loss": 1.3838, "step": 7745 }, { "epoch": 0.5265661095257508, - "grad_norm": 0.6618882417678833, + "grad_norm": 0.6649622321128845, "learning_rate": 4.670896181546406e-05, - "loss": 1.4507, + "loss": 1.4509, "step": 7750 }, { "epoch": 0.5269058295964125, - "grad_norm": 0.670928955078125, + "grad_norm": 0.6698585748672485, "learning_rate": 4.670683856502242e-05, - "loss": 1.5447, + "loss": 1.5448, "step": 7755 }, { "epoch": 0.5272455496670744, - "grad_norm": 0.7112619280815125, + "grad_norm": 0.7101792097091675, "learning_rate": 4.670471531458079e-05, - "loss": 1.3642, + "loss": 1.3648, "step": 7760 }, { "epoch": 0.5275852697377361, - "grad_norm": 0.6146485805511475, + "grad_norm": 0.6136994361877441, "learning_rate": 4.670259206413915e-05, - "loss": 1.2933, + "loss": 1.293, "step": 7765 }, { "epoch": 0.5279249898083979, - "grad_norm": 0.8029811382293701, + "grad_norm": 0.8013178706169128, "learning_rate": 4.6700468813697515e-05, - "loss": 1.2899, + "loss": 1.2893, "step": 7770 }, { "epoch": 0.5282647098790596, - "grad_norm": 0.6654508113861084, + "grad_norm": 0.6643385887145996, "learning_rate": 4.669834556325588e-05, - "loss": 1.4213, + "loss": 1.4216, "step": 7775 }, { "epoch": 0.5286044299497215, - "grad_norm": 0.5974158644676208, + "grad_norm": 0.5992156863212585, "learning_rate": 4.669622231281424e-05, - "loss": 1.2864, + "loss": 1.2866, "step": 7780 }, { "epoch": 0.5289441500203832, - "grad_norm": 0.572313666343689, + "grad_norm": 0.5723800659179688, "learning_rate": 4.669409906237261e-05, - "loss": 1.4704, + "loss": 1.4709, "step": 7785 }, { "epoch": 0.5292838700910449, - "grad_norm": 0.6661244034767151, + "grad_norm": 0.6667107939720154, "learning_rate": 4.669197581193097e-05, - "loss": 1.4355, + "loss": 1.435, "step": 7790 }, { "epoch": 0.5296235901617068, - "grad_norm": 0.6460347175598145, + "grad_norm": 0.6472682356834412, "learning_rate": 4.6689852561489335e-05, - "loss": 1.468, + "loss": 1.4681, "step": 7795 }, { "epoch": 0.5299633102323685, - "grad_norm": 0.6542794704437256, + "grad_norm": 0.6517109274864197, "learning_rate": 4.66877293110477e-05, - "loss": 1.3652, + "loss": 1.365, "step": 7800 }, { "epoch": 0.5303030303030303, - "grad_norm": 0.7438021302223206, + "grad_norm": 0.7099937796592712, "learning_rate": 4.668560606060606e-05, - "loss": 1.3834, + "loss": 1.3841, "step": 7805 }, { "epoch": 0.5306427503736921, - "grad_norm": 0.6667519807815552, + "grad_norm": 0.6689038276672363, "learning_rate": 4.668348281016443e-05, "loss": 1.2816, "step": 7810 }, { "epoch": 0.5309824704443539, - "grad_norm": 0.6257959008216858, + "grad_norm": 0.6253542304039001, "learning_rate": 4.668135955972279e-05, - "loss": 1.3173, + "loss": 1.3174, "step": 7815 }, { "epoch": 0.5313221905150156, - "grad_norm": 0.6496632695198059, + "grad_norm": 0.648605227470398, "learning_rate": 4.6679236309281155e-05, "loss": 1.3937, "step": 7820 }, { "epoch": 0.5316619105856774, - "grad_norm": 0.6605326533317566, + "grad_norm": 0.6577568650245667, "learning_rate": 4.667711305883952e-05, - "loss": 1.3761, + "loss": 1.3762, "step": 7825 }, { "epoch": 0.5320016306563392, - "grad_norm": 0.6309987306594849, + "grad_norm": 0.6312741637229919, "learning_rate": 4.667498980839788e-05, - "loss": 1.3127, + "loss": 1.3124, "step": 7830 }, { "epoch": 0.532341350727001, - "grad_norm": 0.6613396406173706, + "grad_norm": 0.6628729701042175, "learning_rate": 4.667286655795625e-05, "loss": 1.3807, "step": 7835 }, { "epoch": 0.5326810707976627, - "grad_norm": 0.6253994107246399, + "grad_norm": 0.624671220779419, "learning_rate": 4.667074330751461e-05, - "loss": 1.2943, + "loss": 1.2947, "step": 7840 }, { "epoch": 0.5330207908683245, - "grad_norm": 0.6277719140052795, + "grad_norm": 0.6267369389533997, "learning_rate": 4.6668620057072975e-05, "loss": 1.3851, "step": 7845 }, { "epoch": 0.5333605109389863, - "grad_norm": 0.6118496060371399, + "grad_norm": 0.6101575493812561, "learning_rate": 4.666649680663134e-05, - "loss": 1.4124, + "loss": 1.4131, "step": 7850 }, { "epoch": 0.533700231009648, - "grad_norm": 0.6762769818305969, + "grad_norm": 0.6726583242416382, "learning_rate": 4.6664373556189697e-05, - "loss": 1.3867, + "loss": 1.3875, "step": 7855 }, { "epoch": 0.5340399510803098, - "grad_norm": 0.5931509137153625, + "grad_norm": 0.5945740342140198, "learning_rate": 4.666225030574807e-05, - "loss": 1.3406, + "loss": 1.3409, "step": 7860 }, { "epoch": 0.5343796711509716, - "grad_norm": 0.5516414046287537, + "grad_norm": 0.552399754524231, "learning_rate": 4.666012705530643e-05, - "loss": 1.426, + "loss": 1.4261, "step": 7865 }, { "epoch": 0.5347193912216334, - "grad_norm": 0.539910078048706, + "grad_norm": 0.5406373739242554, "learning_rate": 4.665800380486479e-05, - "loss": 1.4105, + "loss": 1.4112, "step": 7870 }, { "epoch": 0.5350591112922951, - "grad_norm": 0.6131874322891235, + "grad_norm": 0.6137151122093201, "learning_rate": 4.665588055442316e-05, "loss": 1.3148, "step": 7875 }, { "epoch": 0.535398831362957, - "grad_norm": 0.6400133967399597, + "grad_norm": 0.6404659748077393, "learning_rate": 4.6653757303981523e-05, - "loss": 1.3658, + "loss": 1.3662, "step": 7880 }, { "epoch": 0.5357385514336187, - "grad_norm": 0.5909250974655151, + "grad_norm": 0.5880735516548157, "learning_rate": 4.665163405353988e-05, - "loss": 1.3665, + "loss": 1.3661, "step": 7885 }, { "epoch": 0.5360782715042804, - "grad_norm": 0.6109614372253418, + "grad_norm": 0.6104915142059326, "learning_rate": 4.664951080309825e-05, - "loss": 1.4881, + "loss": 1.4884, "step": 7890 }, { "epoch": 0.5364179915749423, - "grad_norm": 0.6652154922485352, + "grad_norm": 0.6660661101341248, "learning_rate": 4.6647387552656615e-05, - "loss": 1.4135, + "loss": 1.4144, "step": 7895 }, { "epoch": 0.536757711645604, - "grad_norm": 0.5965635180473328, + "grad_norm": 0.5970953702926636, "learning_rate": 4.664526430221497e-05, - "loss": 1.2594, + "loss": 1.2587, "step": 7900 }, { "epoch": 0.5370974317162658, - "grad_norm": 0.6420058608055115, + "grad_norm": 0.6456806659698486, "learning_rate": 4.6643141051773343e-05, - "loss": 1.411, + "loss": 1.4117, "step": 7905 }, { "epoch": 0.5374371517869275, - "grad_norm": 0.7349154353141785, + "grad_norm": 0.7306447625160217, "learning_rate": 4.664101780133171e-05, "loss": 1.5397, "step": 7910 }, { "epoch": 0.5377768718575894, - "grad_norm": 0.6341392397880554, + "grad_norm": 0.6343594789505005, "learning_rate": 4.6638894550890065e-05, - "loss": 1.5346, + "loss": 1.5341, "step": 7915 }, { "epoch": 0.5381165919282511, - "grad_norm": 0.5322096943855286, + "grad_norm": 0.5315430760383606, "learning_rate": 4.6636771300448435e-05, "loss": 1.4165, "step": 7920 }, { "epoch": 0.5384563119989129, - "grad_norm": 0.6493896245956421, + "grad_norm": 0.6469089984893799, "learning_rate": 4.66346480500068e-05, - "loss": 1.3916, + "loss": 1.3926, "step": 7925 }, { "epoch": 0.5387960320695747, - "grad_norm": 0.6232993602752686, + "grad_norm": 0.6222955584526062, "learning_rate": 4.663252479956516e-05, - "loss": 1.3467, + "loss": 1.3455, "step": 7930 }, { "epoch": 0.5391357521402365, - "grad_norm": 0.6491148471832275, + "grad_norm": 0.6444337368011475, "learning_rate": 4.663040154912353e-05, - "loss": 1.4513, + "loss": 1.4514, "step": 7935 }, { "epoch": 0.5394754722108982, - "grad_norm": 0.6049793362617493, + "grad_norm": 0.6063283681869507, "learning_rate": 4.6628278298681885e-05, - "loss": 1.3655, + "loss": 1.3656, "step": 7940 }, { "epoch": 0.5398151922815599, - "grad_norm": 0.6911430954933167, + "grad_norm": 0.6866796612739563, "learning_rate": 4.662615504824025e-05, - "loss": 1.3749, + "loss": 1.3748, "step": 7945 }, { "epoch": 0.5401549123522218, - "grad_norm": 0.7091353535652161, + "grad_norm": 0.7051680088043213, "learning_rate": 4.662403179779862e-05, - "loss": 1.4294, + "loss": 1.4288, "step": 7950 }, { "epoch": 0.5404946324228835, - "grad_norm": 0.6359396576881409, + "grad_norm": 0.6366180777549744, "learning_rate": 4.662190854735698e-05, "loss": 1.3657, "step": 7955 }, { "epoch": 0.5408343524935453, - "grad_norm": 0.6963664293289185, + "grad_norm": 0.6941373348236084, "learning_rate": 4.661978529691534e-05, - "loss": 1.3415, + "loss": 1.3416, "step": 7960 }, { "epoch": 0.5411740725642071, - "grad_norm": 0.6002401113510132, + "grad_norm": 0.5993899703025818, "learning_rate": 4.661766204647371e-05, - "loss": 1.4231, + "loss": 1.4229, "step": 7965 }, { "epoch": 0.5415137926348689, - "grad_norm": 0.6737387776374817, + "grad_norm": 0.6739424467086792, "learning_rate": 4.661553879603207e-05, - "loss": 1.4223, + "loss": 1.4219, "step": 7970 }, { "epoch": 0.5418535127055306, - "grad_norm": 0.7201437950134277, + "grad_norm": 0.720251202583313, "learning_rate": 4.661341554559043e-05, - "loss": 1.4063, + "loss": 1.4056, "step": 7975 }, { "epoch": 0.5421932327761925, - "grad_norm": 0.5882259607315063, + "grad_norm": 0.5877358913421631, "learning_rate": 4.6611292295148803e-05, - "loss": 1.3551, + "loss": 1.3546, "step": 7980 }, { "epoch": 0.5425329528468542, - "grad_norm": 0.5975483059883118, + "grad_norm": 0.5986196994781494, "learning_rate": 4.660916904470716e-05, - "loss": 1.4396, + "loss": 1.4391, "step": 7985 }, { "epoch": 0.542872672917516, - "grad_norm": 0.4384988844394684, + "grad_norm": 0.43787240982055664, "learning_rate": 4.6607045794265525e-05, - "loss": 1.3125, + "loss": 1.3121, "step": 7990 }, { "epoch": 0.5432123929881777, - "grad_norm": 0.7761228680610657, + "grad_norm": 0.7927671670913696, "learning_rate": 4.6604922543823895e-05, - "loss": 1.3367, + "loss": 1.3374, "step": 7995 }, { "epoch": 0.5435521130588395, - "grad_norm": 0.6241304278373718, + "grad_norm": 0.6257770657539368, "learning_rate": 4.660279929338225e-05, - "loss": 1.4024, + "loss": 1.4031, "step": 8000 }, { "epoch": 0.5438918331295013, - "grad_norm": 0.649651288986206, + "grad_norm": 0.6502427458763123, "learning_rate": 4.660067604294062e-05, - "loss": 1.3397, + "loss": 1.3405, "step": 8005 }, { "epoch": 0.544231553200163, - "grad_norm": 0.6383991241455078, + "grad_norm": 0.6403571963310242, "learning_rate": 4.659855279249898e-05, - "loss": 1.4675, + "loss": 1.4681, "step": 8010 }, { "epoch": 0.5445712732708249, - "grad_norm": 0.6509715914726257, + "grad_norm": 0.6504490971565247, "learning_rate": 4.6596429542057345e-05, - "loss": 1.3761, + "loss": 1.3764, "step": 8015 }, { "epoch": 0.5449109933414866, - "grad_norm": 0.685106098651886, + "grad_norm": 0.6836479306221008, "learning_rate": 4.6594306291615716e-05, - "loss": 1.3551, + "loss": 1.3546, "step": 8020 }, { "epoch": 0.5452507134121484, - "grad_norm": 0.6405821442604065, + "grad_norm": 0.6382858753204346, "learning_rate": 4.659218304117407e-05, - "loss": 1.3918, + "loss": 1.3923, "step": 8025 }, { "epoch": 0.5455904334828101, - "grad_norm": 0.6775296926498413, + "grad_norm": 0.6755823493003845, "learning_rate": 4.659005979073244e-05, - "loss": 1.3949, + "loss": 1.3945, "step": 8030 }, { "epoch": 0.545930153553472, - "grad_norm": 0.6392462849617004, + "grad_norm": 0.6402759552001953, "learning_rate": 4.658793654029081e-05, - "loss": 1.4426, + "loss": 1.443, "step": 8035 }, { "epoch": 0.5462698736241337, - "grad_norm": 0.5327250957489014, + "grad_norm": 0.5340790152549744, "learning_rate": 4.6585813289849165e-05, - "loss": 1.322, + "loss": 1.3226, "step": 8040 }, { "epoch": 0.5466095936947954, - "grad_norm": 1.0725865364074707, + "grad_norm": 1.0451440811157227, "learning_rate": 4.658369003940753e-05, - "loss": 1.3749, + "loss": 1.3738, "step": 8045 }, { "epoch": 0.5469493137654573, - "grad_norm": 0.6358645558357239, + "grad_norm": 0.6350591778755188, "learning_rate": 4.65815667889659e-05, - "loss": 1.4291, + "loss": 1.4297, "step": 8050 }, { "epoch": 0.547289033836119, - "grad_norm": 0.6378662586212158, + "grad_norm": 0.6282897591590881, "learning_rate": 4.657944353852426e-05, - "loss": 1.3221, + "loss": 1.3229, "step": 8055 }, { "epoch": 0.5476287539067808, - "grad_norm": 0.607462465763092, + "grad_norm": 0.6089807152748108, "learning_rate": 4.657732028808262e-05, - "loss": 1.3914, + "loss": 1.3908, "step": 8060 }, { "epoch": 0.5479684739774426, - "grad_norm": 0.6590172052383423, + "grad_norm": 0.6571972966194153, "learning_rate": 4.657519703764099e-05, - "loss": 1.2552, + "loss": 1.2551, "step": 8065 }, { "epoch": 0.5483081940481044, - "grad_norm": 0.6041439771652222, + "grad_norm": 0.5988997220993042, "learning_rate": 4.657307378719935e-05, - "loss": 1.4059, + "loss": 1.406, "step": 8070 }, { "epoch": 0.5486479141187661, - "grad_norm": 0.5647345185279846, + "grad_norm": 0.5657494068145752, "learning_rate": 4.657095053675771e-05, - "loss": 1.4185, + "loss": 1.4186, "step": 8075 }, { "epoch": 0.5489876341894279, - "grad_norm": 0.5834894776344299, + "grad_norm": 0.5830298662185669, "learning_rate": 4.656882728631608e-05, - "loss": 1.3359, + "loss": 1.3357, "step": 8080 }, { "epoch": 0.5493273542600897, - "grad_norm": 0.637019693851471, + "grad_norm": 0.6386149525642395, "learning_rate": 4.656670403587444e-05, "loss": 1.385, "step": 8085 }, { "epoch": 0.5496670743307515, - "grad_norm": 0.635460376739502, + "grad_norm": 0.6361541152000427, "learning_rate": 4.6564580785432805e-05, "loss": 1.3736, "step": 8090 }, { "epoch": 0.5500067944014132, - "grad_norm": 0.6867489814758301, + "grad_norm": 0.6874611377716064, "learning_rate": 4.656245753499117e-05, - "loss": 1.3258, + "loss": 1.3253, "step": 8095 }, { "epoch": 0.550346514472075, - "grad_norm": 0.7199000120162964, + "grad_norm": 0.7213795781135559, "learning_rate": 4.656033428454953e-05, - "loss": 1.3913, + "loss": 1.3921, "step": 8100 }, { "epoch": 0.5506862345427368, - "grad_norm": 0.6548787951469421, + "grad_norm": 0.6527374386787415, "learning_rate": 4.65582110341079e-05, - "loss": 1.3841, + "loss": 1.3846, "step": 8105 }, { "epoch": 0.5510259546133985, - "grad_norm": 0.6761186122894287, + "grad_norm": 0.673852264881134, "learning_rate": 4.655608778366626e-05, - "loss": 1.3442, + "loss": 1.3447, "step": 8110 }, { "epoch": 0.5513656746840603, - "grad_norm": 0.592474639415741, + "grad_norm": 0.5918948650360107, "learning_rate": 4.6553964533224625e-05, - "loss": 1.38, + "loss": 1.3806, "step": 8115 }, { "epoch": 0.5517053947547221, - "grad_norm": 0.66417396068573, + "grad_norm": 0.6634514331817627, "learning_rate": 4.655184128278299e-05, "loss": 1.5308, "step": 8120 }, { "epoch": 0.5520451148253839, - "grad_norm": 0.623815655708313, + "grad_norm": 0.6248072981834412, "learning_rate": 4.654971803234135e-05, - "loss": 1.4102, + "loss": 1.4109, "step": 8125 }, { "epoch": 0.5523848348960456, - "grad_norm": 0.6209930181503296, + "grad_norm": 0.6207954287528992, "learning_rate": 4.654759478189972e-05, "loss": 1.3864, "step": 8130 }, { "epoch": 0.5527245549667075, - "grad_norm": 0.6548243165016174, + "grad_norm": 0.6567164659500122, "learning_rate": 4.654547153145808e-05, - "loss": 1.4629, + "loss": 1.4624, "step": 8135 }, { "epoch": 0.5530642750373692, - "grad_norm": 0.6529601812362671, + "grad_norm": 0.6514599919319153, "learning_rate": 4.6543348281016445e-05, - "loss": 1.4546, + "loss": 1.4548, "step": 8140 }, { "epoch": 0.553403995108031, - "grad_norm": 0.667816162109375, + "grad_norm": 0.6703718304634094, "learning_rate": 4.654122503057481e-05, - "loss": 1.4255, + "loss": 1.4254, "step": 8145 }, { "epoch": 0.5537437151786928, - "grad_norm": 0.6340169906616211, + "grad_norm": 0.6354977488517761, "learning_rate": 4.653910178013317e-05, - "loss": 1.343, + "loss": 1.3427, "step": 8150 }, { "epoch": 0.5540834352493546, - "grad_norm": 0.6256027221679688, + "grad_norm": 0.6242473125457764, "learning_rate": 4.653697852969154e-05, - "loss": 1.3906, + "loss": 1.3908, "step": 8155 }, { "epoch": 0.5544231553200163, - "grad_norm": 0.648058295249939, + "grad_norm": 0.6458278894424438, "learning_rate": 4.65348552792499e-05, "loss": 1.3785, "step": 8160 }, { "epoch": 0.554762875390678, - "grad_norm": 0.6203805208206177, + "grad_norm": 0.6222134232521057, "learning_rate": 4.6532732028808265e-05, - "loss": 1.4438, + "loss": 1.4444, "step": 8165 }, { "epoch": 0.5551025954613399, - "grad_norm": 0.6117679476737976, + "grad_norm": 0.610227108001709, "learning_rate": 4.653060877836663e-05, - "loss": 1.4236, + "loss": 1.4233, "step": 8170 }, { "epoch": 0.5554423155320016, - "grad_norm": 0.6029890775680542, + "grad_norm": 0.6047002673149109, "learning_rate": 4.652848552792499e-05, - "loss": 1.3134, + "loss": 1.3135, "step": 8175 }, { "epoch": 0.5557820356026634, - "grad_norm": 0.6157516241073608, + "grad_norm": 0.608928918838501, "learning_rate": 4.652636227748336e-05, - "loss": 1.3097, + "loss": 1.309, "step": 8180 }, { "epoch": 0.5561217556733252, - "grad_norm": 0.6411975622177124, + "grad_norm": 0.6414384841918945, "learning_rate": 4.652423902704172e-05, - "loss": 1.2708, + "loss": 1.2711, "step": 8185 }, { "epoch": 0.556461475743987, - "grad_norm": 0.5759761929512024, + "grad_norm": 0.5755688548088074, "learning_rate": 4.6522115776600085e-05, - "loss": 1.3998, + "loss": 1.3994, "step": 8190 }, { "epoch": 0.5568011958146487, - "grad_norm": 0.7343934774398804, + "grad_norm": 0.7352962493896484, "learning_rate": 4.651999252615845e-05, - "loss": 1.3454, + "loss": 1.3447, "step": 8195 }, { "epoch": 0.5571409158853105, - "grad_norm": 0.6397271752357483, + "grad_norm": 0.6444336175918579, "learning_rate": 4.651786927571681e-05, - "loss": 1.3922, + "loss": 1.3925, "step": 8200 }, { "epoch": 0.5574806359559723, - "grad_norm": 0.6115093231201172, + "grad_norm": 0.6135734915733337, "learning_rate": 4.651574602527518e-05, - "loss": 1.3425, + "loss": 1.3424, "step": 8205 }, { "epoch": 0.557820356026634, - "grad_norm": 0.6679167151451111, + "grad_norm": 0.6665275692939758, "learning_rate": 4.6513622774833534e-05, - "loss": 1.3271, + "loss": 1.3272, "step": 8210 }, { "epoch": 0.5581600760972958, - "grad_norm": 0.5975149273872375, + "grad_norm": 0.598777711391449, "learning_rate": 4.6511499524391905e-05, - "loss": 1.2803, + "loss": 1.2799, "step": 8215 }, { "epoch": 0.5584997961679576, - "grad_norm": 0.5187572240829468, + "grad_norm": 0.5173119306564331, "learning_rate": 4.650937627395027e-05, - "loss": 1.2844, + "loss": 1.2846, "step": 8220 }, { "epoch": 0.5588395162386194, - "grad_norm": 0.6356489062309265, + "grad_norm": 0.6348473429679871, "learning_rate": 4.6507253023508626e-05, - "loss": 1.3259, + "loss": 1.3255, "step": 8225 }, { "epoch": 0.5591792363092811, - "grad_norm": 0.7586395740509033, + "grad_norm": 0.7525264620780945, "learning_rate": 4.6505129773067e-05, - "loss": 1.4771, + "loss": 1.4766, "step": 8230 }, { "epoch": 0.559518956379943, - "grad_norm": 0.5931305289268494, + "grad_norm": 0.5922448039054871, "learning_rate": 4.650300652262536e-05, - "loss": 1.3238, + "loss": 1.3237, "step": 8235 }, { "epoch": 0.5598586764506047, - "grad_norm": 0.6305395364761353, + "grad_norm": 0.6306609511375427, "learning_rate": 4.650088327218372e-05, - "loss": 1.4735, + "loss": 1.474, "step": 8240 }, { "epoch": 0.5601983965212665, - "grad_norm": 0.6581220626831055, + "grad_norm": 0.6600781679153442, "learning_rate": 4.649876002174209e-05, - "loss": 1.4475, + "loss": 1.4476, "step": 8245 }, { "epoch": 0.5605381165919282, - "grad_norm": 0.6286765933036804, + "grad_norm": 0.6282966136932373, "learning_rate": 4.649663677130045e-05, - "loss": 1.2987, + "loss": 1.2988, "step": 8250 }, { "epoch": 0.5608778366625901, - "grad_norm": 0.6141452193260193, + "grad_norm": 0.6167344450950623, "learning_rate": 4.649451352085881e-05, - "loss": 1.2827, + "loss": 1.2825, "step": 8255 }, { "epoch": 0.5612175567332518, - "grad_norm": 0.6577660441398621, + "grad_norm": 0.6633920669555664, "learning_rate": 4.649239027041718e-05, - "loss": 1.3723, + "loss": 1.3725, "step": 8260 }, { "epoch": 0.5615572768039135, - "grad_norm": 0.6323842406272888, + "grad_norm": 0.631795346736908, "learning_rate": 4.6490267019975545e-05, - "loss": 1.3636, + "loss": 1.3641, "step": 8265 }, { "epoch": 0.5618969968745754, - "grad_norm": 0.5749620199203491, + "grad_norm": 0.5734362602233887, "learning_rate": 4.64881437695339e-05, - "loss": 1.3261, + "loss": 1.3252, "step": 8270 }, { "epoch": 0.5622367169452371, - "grad_norm": 0.6635522246360779, + "grad_norm": 0.6658808588981628, "learning_rate": 4.648602051909227e-05, - "loss": 1.3411, + "loss": 1.3408, "step": 8275 }, { "epoch": 0.5625764370158989, - "grad_norm": 0.7397140264511108, + "grad_norm": 0.736308217048645, "learning_rate": 4.648389726865063e-05, - "loss": 1.3515, + "loss": 1.3512, "step": 8280 }, { "epoch": 0.5629161570865606, - "grad_norm": 0.5669236779212952, + "grad_norm": 0.5675844550132751, "learning_rate": 4.6481774018208994e-05, - "loss": 1.3943, + "loss": 1.3944, "step": 8285 }, { "epoch": 0.5632558771572225, - "grad_norm": 0.5314184427261353, + "grad_norm": 0.5291290879249573, "learning_rate": 4.6479650767767365e-05, - "loss": 1.4635, + "loss": 1.4634, "step": 8290 }, { "epoch": 0.5635955972278842, - "grad_norm": 0.6521828174591064, + "grad_norm": 0.6529996991157532, "learning_rate": 4.647752751732572e-05, - "loss": 1.3649, + "loss": 1.3657, "step": 8295 }, { "epoch": 0.563935317298546, - "grad_norm": 0.6266273260116577, + "grad_norm": 0.6253494024276733, "learning_rate": 4.6475404266884086e-05, - "loss": 1.4524, + "loss": 1.4531, "step": 8300 }, { "epoch": 0.5642750373692078, - "grad_norm": 0.5829536318778992, + "grad_norm": 0.5826250314712524, "learning_rate": 4.647328101644246e-05, - "loss": 1.354, + "loss": 1.3538, "step": 8305 }, { "epoch": 0.5646147574398696, - "grad_norm": 0.6950231194496155, + "grad_norm": 0.6933238506317139, "learning_rate": 4.6471157766000814e-05, - "loss": 1.3899, + "loss": 1.3898, "step": 8310 }, { "epoch": 0.5649544775105313, - "grad_norm": 0.6778619885444641, + "grad_norm": 0.6768309473991394, "learning_rate": 4.646903451555918e-05, - "loss": 1.3522, + "loss": 1.3518, "step": 8315 }, { "epoch": 0.5652941975811931, - "grad_norm": 0.6076153516769409, + "grad_norm": 0.6072086095809937, "learning_rate": 4.646691126511755e-05, - "loss": 1.3722, + "loss": 1.3723, "step": 8320 }, { "epoch": 0.5656339176518549, - "grad_norm": 0.6762505769729614, + "grad_norm": 0.6726799011230469, "learning_rate": 4.6464788014675906e-05, - "loss": 1.4833, + "loss": 1.4832, "step": 8325 }, { "epoch": 0.5659736377225166, - "grad_norm": 0.7496538758277893, + "grad_norm": 0.7483469247817993, "learning_rate": 4.646266476423427e-05, - "loss": 1.433, + "loss": 1.4326, "step": 8330 }, { "epoch": 0.5663133577931784, - "grad_norm": 0.692680299282074, + "grad_norm": 0.691436231136322, "learning_rate": 4.646054151379264e-05, - "loss": 1.5166, + "loss": 1.5168, "step": 8335 }, { "epoch": 0.5666530778638402, - "grad_norm": 0.5774497985839844, + "grad_norm": 0.5793526768684387, "learning_rate": 4.6458418263351e-05, - "loss": 1.3301, + "loss": 1.3298, "step": 8340 }, { "epoch": 0.566992797934502, - "grad_norm": 0.6338465213775635, + "grad_norm": 0.6343037486076355, "learning_rate": 4.645629501290936e-05, - "loss": 1.463, + "loss": 1.4637, "step": 8345 }, { "epoch": 0.5673325180051637, - "grad_norm": 0.6851441860198975, + "grad_norm": 0.6868380904197693, "learning_rate": 4.645417176246773e-05, - "loss": 1.3758, + "loss": 1.3761, "step": 8350 }, { "epoch": 0.5676722380758256, - "grad_norm": 0.7029812335968018, + "grad_norm": 0.7009192705154419, "learning_rate": 4.645204851202609e-05, - "loss": 1.4005, + "loss": 1.4004, "step": 8355 }, { "epoch": 0.5680119581464873, - "grad_norm": 0.5923866033554077, + "grad_norm": 0.5923925042152405, "learning_rate": 4.644992526158446e-05, - "loss": 1.4234, + "loss": 1.4236, "step": 8360 }, { "epoch": 0.568351678217149, - "grad_norm": 0.6720953583717346, + "grad_norm": 0.6747313141822815, "learning_rate": 4.644780201114282e-05, - "loss": 1.4136, + "loss": 1.4142, "step": 8365 }, { "epoch": 0.5686913982878108, - "grad_norm": 0.6519859433174133, + "grad_norm": 0.6399051547050476, "learning_rate": 4.644567876070118e-05, - "loss": 1.346, + "loss": 1.3458, "step": 8370 }, { "epoch": 0.5690311183584726, - "grad_norm": 0.6643994450569153, + "grad_norm": 0.6773597002029419, "learning_rate": 4.644355551025955e-05, - "loss": 1.3092, + "loss": 1.3089, "step": 8375 }, { "epoch": 0.5693708384291344, - "grad_norm": 0.6579692363739014, + "grad_norm": 0.6531585454940796, "learning_rate": 4.644143225981791e-05, - "loss": 1.4446, + "loss": 1.4441, "step": 8380 }, { "epoch": 0.5697105584997961, - "grad_norm": 0.6603977084159851, + "grad_norm": 0.6561786532402039, "learning_rate": 4.6439309009376274e-05, - "loss": 1.4177, + "loss": 1.4166, "step": 8385 }, { "epoch": 0.570050278570458, - "grad_norm": 0.6441840529441833, + "grad_norm": 0.6465388536453247, "learning_rate": 4.6437185758934645e-05, - "loss": 1.3774, + "loss": 1.3778, "step": 8390 }, { "epoch": 0.5703899986411197, - "grad_norm": 0.62338787317276, + "grad_norm": 0.6234838962554932, "learning_rate": 4.6435062508493e-05, - "loss": 1.4207, + "loss": 1.421, "step": 8395 }, { "epoch": 0.5707297187117815, - "grad_norm": 0.6295756697654724, + "grad_norm": 0.6319364309310913, "learning_rate": 4.6432939258051366e-05, - "loss": 1.3859, + "loss": 1.3867, "step": 8400 }, { "epoch": 0.5710694387824433, - "grad_norm": 0.6080472469329834, + "grad_norm": 0.6094376444816589, "learning_rate": 4.643081600760974e-05, - "loss": 1.4107, + "loss": 1.4106, "step": 8405 }, { "epoch": 0.5714091588531051, - "grad_norm": 0.6909534931182861, + "grad_norm": 0.6924925446510315, "learning_rate": 4.6428692757168094e-05, - "loss": 1.3254, + "loss": 1.3257, "step": 8410 }, { "epoch": 0.5717488789237668, - "grad_norm": 0.7021681666374207, + "grad_norm": 0.6983892321586609, "learning_rate": 4.642656950672646e-05, - "loss": 1.3105, + "loss": 1.3111, "step": 8415 }, { "epoch": 0.5720885989944285, - "grad_norm": 0.6477012634277344, + "grad_norm": 0.651628315448761, "learning_rate": 4.642444625628483e-05, - "loss": 1.4423, + "loss": 1.4418, "step": 8420 }, { "epoch": 0.5724283190650904, - "grad_norm": 0.6198950409889221, + "grad_norm": 0.6166821718215942, "learning_rate": 4.6422323005843186e-05, - "loss": 1.4801, + "loss": 1.4797, "step": 8425 }, { "epoch": 0.5727680391357521, - "grad_norm": 0.6134934425354004, + "grad_norm": 0.6128708720207214, "learning_rate": 4.642019975540155e-05, - "loss": 1.3673, + "loss": 1.3676, "step": 8430 }, { "epoch": 0.5731077592064139, - "grad_norm": 0.8099306225776672, + "grad_norm": 0.815666913986206, "learning_rate": 4.6418076504959914e-05, - "loss": 1.3798, + "loss": 1.3801, "step": 8435 }, { "epoch": 0.5734474792770757, - "grad_norm": 0.6538095474243164, + "grad_norm": 0.6564454436302185, "learning_rate": 4.641595325451828e-05, "loss": 1.3588, "step": 8440 }, { "epoch": 0.5737871993477375, - "grad_norm": 0.7463709712028503, + "grad_norm": 0.7451840043067932, "learning_rate": 4.641383000407664e-05, - "loss": 1.3673, + "loss": 1.3665, "step": 8445 }, { "epoch": 0.5741269194183992, - "grad_norm": 0.6359599232673645, + "grad_norm": 0.6364842057228088, "learning_rate": 4.6411706753635006e-05, - "loss": 1.3445, + "loss": 1.3444, "step": 8450 }, { "epoch": 0.574466639489061, - "grad_norm": 1.1299289464950562, + "grad_norm": 0.7477369904518127, "learning_rate": 4.640958350319337e-05, - "loss": 1.3005, + "loss": 1.2977, "step": 8455 }, { "epoch": 0.5748063595597228, - "grad_norm": 0.6729915142059326, + "grad_norm": 0.6721308827400208, "learning_rate": 4.6407460252751734e-05, - "loss": 1.3593, + "loss": 1.3598, "step": 8460 }, { "epoch": 0.5751460796303846, - "grad_norm": 0.6672472953796387, + "grad_norm": 0.6728067398071289, "learning_rate": 4.64053370023101e-05, - "loss": 1.3796, + "loss": 1.3799, "step": 8465 }, { "epoch": 0.5754857997010463, - "grad_norm": 0.6459980607032776, + "grad_norm": 0.6459260582923889, "learning_rate": 4.640321375186846e-05, - "loss": 1.3767, + "loss": 1.3764, "step": 8470 }, { "epoch": 0.5758255197717081, - "grad_norm": 0.7157725095748901, + "grad_norm": 0.7147847414016724, "learning_rate": 4.6401090501426826e-05, - "loss": 1.3143, + "loss": 1.3139, "step": 8475 }, { "epoch": 0.5761652398423699, - "grad_norm": 0.6390661597251892, + "grad_norm": 0.6413534879684448, "learning_rate": 4.639896725098519e-05, - "loss": 1.3659, + "loss": 1.3657, "step": 8480 }, { "epoch": 0.5765049599130316, - "grad_norm": 0.6072826981544495, + "grad_norm": 0.605827271938324, "learning_rate": 4.6396844000543554e-05, "loss": 1.3133, "step": 8485 }, { "epoch": 0.5768446799836935, - "grad_norm": 0.6549344658851624, + "grad_norm": 0.6577711701393127, "learning_rate": 4.639472075010192e-05, - "loss": 1.3423, + "loss": 1.3428, "step": 8490 }, { "epoch": 0.5771844000543552, - "grad_norm": 0.6404393315315247, + "grad_norm": 0.6369004249572754, "learning_rate": 4.639259749966028e-05, - "loss": 1.3167, + "loss": 1.3166, "step": 8495 }, { "epoch": 0.577524120125017, - "grad_norm": 0.6117603182792664, + "grad_norm": 0.6144927144050598, "learning_rate": 4.6390474249218646e-05, - "loss": 1.4581, + "loss": 1.4575, "step": 8500 }, { "epoch": 0.5778638401956787, - "grad_norm": 0.6287073493003845, + "grad_norm": 0.6302551627159119, "learning_rate": 4.638835099877701e-05, - "loss": 1.3583, + "loss": 1.3591, "step": 8505 }, { "epoch": 0.5782035602663406, - "grad_norm": 0.6488497257232666, + "grad_norm": 0.646761417388916, "learning_rate": 4.6386227748335374e-05, - "loss": 1.3389, + "loss": 1.3387, "step": 8510 }, { "epoch": 0.5785432803370023, - "grad_norm": 0.6550486087799072, + "grad_norm": 0.6460202932357788, "learning_rate": 4.638410449789374e-05, "loss": 1.3674, "step": 8515 }, { "epoch": 0.578883000407664, - "grad_norm": 0.6923273205757141, + "grad_norm": 0.6932740211486816, "learning_rate": 4.63819812474521e-05, - "loss": 1.3788, + "loss": 1.3798, "step": 8520 }, { "epoch": 0.5792227204783259, - "grad_norm": 0.6016242504119873, + "grad_norm": 0.6054776310920715, "learning_rate": 4.6379857997010466e-05, - "loss": 1.3521, + "loss": 1.3518, "step": 8525 }, { "epoch": 0.5795624405489876, - "grad_norm": 0.5966993570327759, + "grad_norm": 0.5954487323760986, "learning_rate": 4.637773474656883e-05, - "loss": 1.3855, + "loss": 1.3851, "step": 8530 }, { "epoch": 0.5799021606196494, - "grad_norm": 0.7263109683990479, + "grad_norm": 0.713200569152832, "learning_rate": 4.6375611496127194e-05, - "loss": 1.3474, + "loss": 1.3472, "step": 8535 }, { "epoch": 0.5802418806903111, - "grad_norm": 0.6226108074188232, + "grad_norm": 0.6199910640716553, "learning_rate": 4.637348824568556e-05, - "loss": 1.3671, + "loss": 1.3675, "step": 8540 }, { "epoch": 0.580581600760973, - "grad_norm": 0.5628734230995178, + "grad_norm": 0.5645411610603333, "learning_rate": 4.637136499524392e-05, - "loss": 1.4494, + "loss": 1.4496, "step": 8545 }, { "epoch": 0.5809213208316347, - "grad_norm": 0.5334903001785278, + "grad_norm": 0.5331540107727051, "learning_rate": 4.6369241744802286e-05, - "loss": 1.3946, + "loss": 1.3942, "step": 8550 }, { "epoch": 0.5812610409022965, - "grad_norm": 0.6473321318626404, + "grad_norm": 0.6536462306976318, "learning_rate": 4.636711849436065e-05, - "loss": 1.3325, + "loss": 1.3327, "step": 8555 }, { "epoch": 0.5816007609729583, - "grad_norm": 0.6834282875061035, + "grad_norm": 0.6808878779411316, "learning_rate": 4.6364995243919014e-05, - "loss": 1.3812, + "loss": 1.3808, "step": 8560 }, { "epoch": 0.5819404810436201, - "grad_norm": 0.6114623546600342, + "grad_norm": 0.6141053438186646, "learning_rate": 4.636287199347737e-05, - "loss": 1.3266, + "loss": 1.327, "step": 8565 }, { "epoch": 0.5822802011142818, - "grad_norm": 0.6906845569610596, + "grad_norm": 0.6924583315849304, "learning_rate": 4.636074874303574e-05, - "loss": 1.4233, + "loss": 1.4238, "step": 8570 }, { "epoch": 0.5826199211849437, - "grad_norm": 0.5607256889343262, + "grad_norm": 0.5604522228240967, "learning_rate": 4.6358625492594106e-05, - "loss": 1.4599, + "loss": 1.4602, "step": 8575 }, { "epoch": 0.5829596412556054, - "grad_norm": 0.6390836238861084, + "grad_norm": 0.6367323994636536, "learning_rate": 4.6356502242152464e-05, - "loss": 1.426, + "loss": 1.4261, "step": 8580 }, { "epoch": 0.5832993613262671, - "grad_norm": 0.5814246535301208, + "grad_norm": 0.5791518688201904, "learning_rate": 4.6354378991710834e-05, - "loss": 1.3193, + "loss": 1.3197, "step": 8585 }, { "epoch": 0.5836390813969289, - "grad_norm": 0.6813691258430481, + "grad_norm": 0.6824930310249329, "learning_rate": 4.63522557412692e-05, - "loss": 1.4357, + "loss": 1.4359, "step": 8590 }, { "epoch": 0.5839788014675907, - "grad_norm": 0.6934071183204651, + "grad_norm": 0.6955567598342896, "learning_rate": 4.6350132490827556e-05, - "loss": 1.3752, + "loss": 1.3755, "step": 8595 }, { "epoch": 0.5843185215382525, - "grad_norm": 0.6603383421897888, + "grad_norm": 0.6599175930023193, "learning_rate": 4.6348009240385926e-05, - "loss": 1.2453, + "loss": 1.2451, "step": 8600 }, { "epoch": 0.5846582416089142, - "grad_norm": 0.71683669090271, + "grad_norm": 0.7156389951705933, "learning_rate": 4.634588598994429e-05, - "loss": 1.5543, + "loss": 1.5544, "step": 8605 }, { "epoch": 0.5849979616795761, - "grad_norm": 0.6986023783683777, + "grad_norm": 0.6917446255683899, "learning_rate": 4.634376273950265e-05, - "loss": 1.4098, + "loss": 1.4106, "step": 8610 }, { "epoch": 0.5853376817502378, - "grad_norm": 0.6881933212280273, + "grad_norm": 0.6872302293777466, "learning_rate": 4.634163948906102e-05, - "loss": 1.4021, + "loss": 1.4015, "step": 8615 }, { "epoch": 0.5856774018208996, - "grad_norm": 0.6830471158027649, + "grad_norm": 0.6834813952445984, "learning_rate": 4.633951623861938e-05, - "loss": 1.3863, + "loss": 1.3865, "step": 8620 }, { "epoch": 0.5860171218915613, - "grad_norm": 0.5090907216072083, + "grad_norm": 0.5086743235588074, "learning_rate": 4.633739298817774e-05, "loss": 1.4505, "step": 8625 }, { "epoch": 0.5863568419622232, - "grad_norm": 0.6305417418479919, + "grad_norm": 0.6322169899940491, "learning_rate": 4.633526973773611e-05, "loss": 1.3958, "step": 8630 }, { "epoch": 0.5866965620328849, - "grad_norm": 0.627467691898346, + "grad_norm": 0.6302802562713623, "learning_rate": 4.633314648729447e-05, "loss": 1.4374, "step": 8635 }, { "epoch": 0.5870362821035466, - "grad_norm": 0.6345938444137573, + "grad_norm": 0.6355620622634888, "learning_rate": 4.633102323685283e-05, - "loss": 1.2474, + "loss": 1.2476, "step": 8640 }, { "epoch": 0.5873760021742085, - "grad_norm": 0.6117781400680542, + "grad_norm": 0.6129981279373169, "learning_rate": 4.63288999864112e-05, - "loss": 1.3993, + "loss": 1.3989, "step": 8645 }, { "epoch": 0.5877157222448702, - "grad_norm": 0.6087744235992432, + "grad_norm": 0.6112029552459717, "learning_rate": 4.632677673596956e-05, - "loss": 1.5465, + "loss": 1.5466, "step": 8650 }, { "epoch": 0.588055442315532, - "grad_norm": 0.6593524217605591, + "grad_norm": 0.6566773056983948, "learning_rate": 4.6324653485527924e-05, - "loss": 1.3562, + "loss": 1.357, "step": 8655 }, { "epoch": 0.5883951623861938, - "grad_norm": 0.6665415167808533, + "grad_norm": 0.6758537888526917, "learning_rate": 4.6322530235086294e-05, - "loss": 1.4428, + "loss": 1.4427, "step": 8660 }, { "epoch": 0.5887348824568556, - "grad_norm": 0.6231390237808228, + "grad_norm": 0.6215658187866211, "learning_rate": 4.632040698464465e-05, "loss": 1.3792, "step": 8665 }, { "epoch": 0.5890746025275173, - "grad_norm": 0.5803211331367493, + "grad_norm": 0.5827229619026184, "learning_rate": 4.6318283734203016e-05, - "loss": 1.3635, + "loss": 1.3642, "step": 8670 }, { "epoch": 0.589414322598179, - "grad_norm": 0.6013233065605164, + "grad_norm": 0.6018844246864319, "learning_rate": 4.6316160483761386e-05, - "loss": 1.515, + "loss": 1.5147, "step": 8675 }, { "epoch": 0.5897540426688409, - "grad_norm": 0.6222535967826843, + "grad_norm": 0.6212862133979797, "learning_rate": 4.6314037233319744e-05, - "loss": 1.4195, + "loss": 1.42, "step": 8680 }, { "epoch": 0.5900937627395026, - "grad_norm": 0.6328880786895752, + "grad_norm": 0.6286620497703552, "learning_rate": 4.631191398287811e-05, - "loss": 1.3758, + "loss": 1.3759, "step": 8685 }, { "epoch": 0.5904334828101644, - "grad_norm": 0.6459696292877197, + "grad_norm": 0.6469038128852844, "learning_rate": 4.630979073243648e-05, - "loss": 1.2561, + "loss": 1.2562, "step": 8690 }, { "epoch": 0.5907732028808262, - "grad_norm": 0.5639860033988953, + "grad_norm": 0.5655760169029236, "learning_rate": 4.6307667481994836e-05, - "loss": 1.267, + "loss": 1.2665, "step": 8695 }, { "epoch": 0.591112922951488, - "grad_norm": 0.6313180327415466, + "grad_norm": 0.6300448775291443, "learning_rate": 4.6305544231553206e-05, - "loss": 1.3963, + "loss": 1.3966, "step": 8700 }, { "epoch": 0.5914526430221497, - "grad_norm": 0.6009690761566162, + "grad_norm": 0.6009717583656311, "learning_rate": 4.630342098111157e-05, - "loss": 1.3293, + "loss": 1.3297, "step": 8705 }, { "epoch": 0.5917923630928115, - "grad_norm": 0.5952971577644348, + "grad_norm": 0.5948302149772644, "learning_rate": 4.630129773066993e-05, - "loss": 1.4078, + "loss": 1.4084, "step": 8710 }, { "epoch": 0.5921320831634733, - "grad_norm": 0.6673296689987183, + "grad_norm": 0.6646355986595154, "learning_rate": 4.62991744802283e-05, - "loss": 1.2694, + "loss": 1.2696, "step": 8715 }, { "epoch": 0.5924718032341351, - "grad_norm": 0.5815384984016418, + "grad_norm": 0.5794880986213684, "learning_rate": 4.6297051229786656e-05, - "loss": 1.2968, + "loss": 1.2964, "step": 8720 }, { "epoch": 0.5928115233047968, - "grad_norm": 0.6946033239364624, + "grad_norm": 0.6965275406837463, "learning_rate": 4.629492797934502e-05, - "loss": 1.3911, + "loss": 1.3914, "step": 8725 }, { "epoch": 0.5931512433754587, - "grad_norm": 0.6673735976219177, + "grad_norm": 0.6684483289718628, "learning_rate": 4.629280472890339e-05, - "loss": 1.3914, + "loss": 1.3915, "step": 8730 }, { "epoch": 0.5934909634461204, - "grad_norm": 0.7089139819145203, + "grad_norm": 0.7138425707817078, "learning_rate": 4.629068147846175e-05, - "loss": 1.428, + "loss": 1.4282, "step": 8735 }, { "epoch": 0.5938306835167821, - "grad_norm": 0.6138942837715149, + "grad_norm": 0.617935836315155, "learning_rate": 4.628855822802011e-05, - "loss": 1.3783, + "loss": 1.3781, "step": 8740 }, { "epoch": 0.594170403587444, - "grad_norm": 0.6514503359794617, + "grad_norm": 0.6548699736595154, "learning_rate": 4.628643497757848e-05, - "loss": 1.3654, + "loss": 1.365, "step": 8745 }, { "epoch": 0.5945101236581057, - "grad_norm": 0.6299140453338623, + "grad_norm": 0.6273828148841858, "learning_rate": 4.628431172713684e-05, - "loss": 1.4794, + "loss": 1.479, "step": 8750 }, { "epoch": 0.5948498437287675, - "grad_norm": 0.5932992696762085, + "grad_norm": 0.5947832465171814, "learning_rate": 4.6282188476695204e-05, - "loss": 1.342, + "loss": 1.3419, "step": 8755 }, { "epoch": 0.5951895637994292, - "grad_norm": 0.9953003525733948, + "grad_norm": 0.9850634932518005, "learning_rate": 4.6280065226253575e-05, - "loss": 1.4064, + "loss": 1.4073, "step": 8760 }, { "epoch": 0.5955292838700911, - "grad_norm": 0.6479867696762085, + "grad_norm": 0.6491472125053406, "learning_rate": 4.627794197581193e-05, - "loss": 1.3773, + "loss": 1.3776, "step": 8765 }, { "epoch": 0.5958690039407528, - "grad_norm": 0.6373013854026794, + "grad_norm": 0.6347814202308655, "learning_rate": 4.6275818725370296e-05, - "loss": 1.3575, + "loss": 1.3573, "step": 8770 }, { "epoch": 0.5962087240114146, - "grad_norm": 0.6302305459976196, + "grad_norm": 0.6294604539871216, "learning_rate": 4.6273695474928667e-05, - "loss": 1.3272, + "loss": 1.3277, "step": 8775 }, { "epoch": 0.5965484440820764, - "grad_norm": 0.676969051361084, + "grad_norm": 0.674103319644928, "learning_rate": 4.6271572224487024e-05, "loss": 1.3772, "step": 8780 }, { "epoch": 0.5968881641527382, - "grad_norm": 0.5809783339500427, + "grad_norm": 0.5823981165885925, "learning_rate": 4.626944897404539e-05, - "loss": 1.322, + "loss": 1.3225, "step": 8785 }, { "epoch": 0.5972278842233999, - "grad_norm": 0.5713885426521301, + "grad_norm": 0.5748230814933777, "learning_rate": 4.626732572360375e-05, - "loss": 1.4192, + "loss": 1.4187, "step": 8790 }, { "epoch": 0.5975676042940616, - "grad_norm": 0.6405071020126343, + "grad_norm": 0.641822338104248, "learning_rate": 4.6265202473162116e-05, - "loss": 1.4343, + "loss": 1.4342, "step": 8795 }, { "epoch": 0.5979073243647235, - "grad_norm": 0.6022502183914185, + "grad_norm": 0.6004037857055664, "learning_rate": 4.626307922272048e-05, - "loss": 1.3822, + "loss": 1.3819, "step": 8800 }, { "epoch": 0.5982470444353852, - "grad_norm": 0.7166122198104858, + "grad_norm": 0.7222495675086975, "learning_rate": 4.6260955972278844e-05, - "loss": 1.3959, + "loss": 1.3965, "step": 8805 }, { "epoch": 0.598586764506047, - "grad_norm": 0.5999231934547424, + "grad_norm": 0.602742612361908, "learning_rate": 4.625883272183721e-05, - "loss": 1.3961, + "loss": 1.3971, "step": 8810 }, { "epoch": 0.5989264845767088, - "grad_norm": 0.6405361294746399, + "grad_norm": 0.6365458965301514, "learning_rate": 4.625670947139557e-05, - "loss": 1.3848, + "loss": 1.3852, "step": 8815 }, { "epoch": 0.5992662046473706, - "grad_norm": 0.6125131845474243, + "grad_norm": 0.6098758578300476, "learning_rate": 4.6254586220953936e-05, - "loss": 1.2984, + "loss": 1.2977, "step": 8820 }, { "epoch": 0.5996059247180323, - "grad_norm": 0.555432915687561, + "grad_norm": 0.5579818487167358, "learning_rate": 4.62524629705123e-05, - "loss": 1.3137, + "loss": 1.3141, "step": 8825 }, { "epoch": 0.5999456447886942, - "grad_norm": 0.6365654468536377, + "grad_norm": 0.6309992671012878, "learning_rate": 4.6250339720070664e-05, - "loss": 1.294, + "loss": 1.2934, "step": 8830 }, { "epoch": 0.6002853648593559, - "grad_norm": 0.5567549467086792, + "grad_norm": 0.5560474991798401, "learning_rate": 4.624821646962903e-05, "loss": 1.4571, "step": 8835 }, { "epoch": 0.6006250849300176, - "grad_norm": 0.6197007298469543, + "grad_norm": 0.6199814677238464, "learning_rate": 4.624609321918739e-05, "loss": 1.4303, "step": 8840 }, { "epoch": 0.6009648050006794, - "grad_norm": 0.6491262912750244, + "grad_norm": 0.6530081033706665, "learning_rate": 4.6243969968745756e-05, - "loss": 1.3946, + "loss": 1.3948, "step": 8845 }, { "epoch": 0.6013045250713412, - "grad_norm": 0.6252356171607971, + "grad_norm": 0.6249587535858154, "learning_rate": 4.624184671830412e-05, - "loss": 1.4354, + "loss": 1.4353, "step": 8850 }, { "epoch": 0.601644245142003, - "grad_norm": 0.6818503141403198, + "grad_norm": 0.6813602447509766, "learning_rate": 4.6239723467862484e-05, - "loss": 1.3727, + "loss": 1.3726, "step": 8855 }, { "epoch": 0.6019839652126647, - "grad_norm": 0.5524508357048035, + "grad_norm": 0.5447959899902344, "learning_rate": 4.623760021742085e-05, - "loss": 1.3857, + "loss": 1.3854, "step": 8860 }, { "epoch": 0.6023236852833266, - "grad_norm": 0.6112775206565857, + "grad_norm": 0.6103903651237488, "learning_rate": 4.623547696697921e-05, - "loss": 1.3369, + "loss": 1.3374, "step": 8865 }, { "epoch": 0.6026634053539883, - "grad_norm": 0.6917428374290466, + "grad_norm": 0.6915332674980164, "learning_rate": 4.6233353716537576e-05, - "loss": 1.4105, + "loss": 1.4107, "step": 8870 }, { "epoch": 0.6030031254246501, - "grad_norm": 0.6560357213020325, + "grad_norm": 0.6536495685577393, "learning_rate": 4.623123046609594e-05, - "loss": 1.3502, + "loss": 1.3493, "step": 8875 }, { "epoch": 0.6033428454953118, - "grad_norm": 0.6382858753204346, + "grad_norm": 0.6379966735839844, "learning_rate": 4.6229107215654304e-05, - "loss": 1.2918, + "loss": 1.2924, "step": 8880 }, { "epoch": 0.6036825655659737, - "grad_norm": 0.7183751463890076, + "grad_norm": 0.7142328023910522, "learning_rate": 4.622698396521267e-05, "loss": 1.3846, "step": 8885 }, { "epoch": 0.6040222856366354, - "grad_norm": 0.627293050289154, + "grad_norm": 0.626592218875885, "learning_rate": 4.622486071477103e-05, - "loss": 1.4626, + "loss": 1.4629, "step": 8890 }, { "epoch": 0.6043620057072971, - "grad_norm": 0.6370736360549927, + "grad_norm": 0.6349679827690125, "learning_rate": 4.6222737464329396e-05, - "loss": 1.31, + "loss": 1.3104, "step": 8895 }, { "epoch": 0.604701725777959, - "grad_norm": 0.6612152457237244, + "grad_norm": 0.6626892685890198, "learning_rate": 4.622061421388776e-05, - "loss": 1.4417, + "loss": 1.4422, "step": 8900 }, { "epoch": 0.6050414458486207, - "grad_norm": 0.6770370006561279, + "grad_norm": 0.6759049296379089, "learning_rate": 4.6218490963446124e-05, "loss": 1.3413, "step": 8905 }, { "epoch": 0.6053811659192825, - "grad_norm": 0.6521413326263428, + "grad_norm": 0.6569634079933167, "learning_rate": 4.621636771300449e-05, - "loss": 1.4035, + "loss": 1.4037, "step": 8910 }, { "epoch": 0.6057208859899443, - "grad_norm": 0.6531280279159546, + "grad_norm": 0.6542913317680359, "learning_rate": 4.621424446256285e-05, - "loss": 1.2742, + "loss": 1.2744, "step": 8915 }, { "epoch": 0.6060606060606061, - "grad_norm": 0.6770360469818115, + "grad_norm": 0.6765264272689819, "learning_rate": 4.621212121212121e-05, "loss": 1.4134, "step": 8920 }, { "epoch": 0.6064003261312678, - "grad_norm": 0.6506828665733337, + "grad_norm": 0.6517230868339539, "learning_rate": 4.620999796167958e-05, - "loss": 1.3772, + "loss": 1.3771, "step": 8925 }, { "epoch": 0.6067400462019296, - "grad_norm": 0.6468803882598877, + "grad_norm": 0.6478938460350037, "learning_rate": 4.6207874711237944e-05, - "loss": 1.3124, + "loss": 1.3123, "step": 8930 }, { "epoch": 0.6070797662725914, - "grad_norm": 0.6341661810874939, + "grad_norm": 0.6326414942741394, "learning_rate": 4.62057514607963e-05, "loss": 1.4299, "step": 8935 }, { "epoch": 0.6074194863432532, - "grad_norm": 0.6457089781761169, + "grad_norm": 0.648455023765564, "learning_rate": 4.620362821035467e-05, - "loss": 1.4044, + "loss": 1.4042, "step": 8940 }, { "epoch": 0.6077592064139149, - "grad_norm": 0.7401508092880249, + "grad_norm": 0.7403650879859924, "learning_rate": 4.6201504959913036e-05, "loss": 1.3108, "step": 8945 }, { "epoch": 0.6080989264845768, - "grad_norm": 0.6043117046356201, + "grad_norm": 0.608013391494751, "learning_rate": 4.619938170947139e-05, - "loss": 1.3466, + "loss": 1.3464, "step": 8950 }, { "epoch": 0.6084386465552385, - "grad_norm": 0.6644378900527954, + "grad_norm": 0.6651491522789001, "learning_rate": 4.6197258459029764e-05, - "loss": 1.3746, + "loss": 1.3744, "step": 8955 }, { "epoch": 0.6087783666259002, - "grad_norm": 0.6129704713821411, + "grad_norm": 0.6120297908782959, "learning_rate": 4.619513520858813e-05, - "loss": 1.376, + "loss": 1.3755, "step": 8960 }, { "epoch": 0.609118086696562, - "grad_norm": 0.7116718888282776, + "grad_norm": 0.713447630405426, "learning_rate": 4.6193011958146485e-05, - "loss": 1.3112, + "loss": 1.3117, "step": 8965 }, { "epoch": 0.6094578067672238, - "grad_norm": 0.6450315713882446, + "grad_norm": 0.6527226567268372, "learning_rate": 4.6190888707704856e-05, - "loss": 1.4244, + "loss": 1.4245, "step": 8970 }, { "epoch": 0.6097975268378856, - "grad_norm": 0.708161473274231, + "grad_norm": 0.7073986530303955, "learning_rate": 4.618876545726322e-05, - "loss": 1.3267, + "loss": 1.3271, "step": 8975 }, { "epoch": 0.6101372469085473, - "grad_norm": 0.5978407859802246, + "grad_norm": 0.5979574918746948, "learning_rate": 4.618664220682158e-05, - "loss": 1.3002, + "loss": 1.3001, "step": 8980 }, { "epoch": 0.6104769669792092, - "grad_norm": 0.7121630907058716, + "grad_norm": 0.7243133783340454, "learning_rate": 4.618451895637995e-05, - "loss": 1.364, + "loss": 1.3643, "step": 8985 }, { "epoch": 0.6108166870498709, - "grad_norm": 0.6190058588981628, + "grad_norm": 0.6200166940689087, "learning_rate": 4.6182395705938305e-05, - "loss": 1.3812, + "loss": 1.3813, "step": 8990 }, { "epoch": 0.6111564071205327, - "grad_norm": 0.621415913105011, + "grad_norm": 0.6203546524047852, "learning_rate": 4.618027245549667e-05, - "loss": 1.3124, + "loss": 1.3127, "step": 8995 }, { "epoch": 0.6114961271911945, - "grad_norm": 0.6940446496009827, + "grad_norm": 0.6952609419822693, "learning_rate": 4.617814920505504e-05, - "loss": 1.3833, + "loss": 1.3839, "step": 9000 }, { "epoch": 0.6118358472618562, - "grad_norm": 0.6798141002655029, + "grad_norm": 0.6873037815093994, "learning_rate": 4.61760259546134e-05, - "loss": 1.3791, + "loss": 1.3792, "step": 9005 }, { "epoch": 0.612175567332518, - "grad_norm": 0.6257941126823425, + "grad_norm": 0.6277706027030945, "learning_rate": 4.617390270417176e-05, - "loss": 1.3625, + "loss": 1.3624, "step": 9010 }, { "epoch": 0.6125152874031797, - "grad_norm": 0.7014443278312683, + "grad_norm": 0.7151670455932617, "learning_rate": 4.617177945373013e-05, - "loss": 1.3498, + "loss": 1.3494, "step": 9015 }, { "epoch": 0.6128550074738416, - "grad_norm": 0.6132319569587708, + "grad_norm": 0.6136572360992432, "learning_rate": 4.616965620328849e-05, - "loss": 1.2609, + "loss": 1.2608, "step": 9020 }, { "epoch": 0.6131947275445033, - "grad_norm": 0.665908694267273, + "grad_norm": 0.6658135652542114, "learning_rate": 4.616753295284685e-05, - "loss": 1.2704, + "loss": 1.2708, "step": 9025 }, { "epoch": 0.6135344476151651, - "grad_norm": 0.4162493050098419, + "grad_norm": 0.416273295879364, "learning_rate": 4.6165409702405224e-05, - "loss": 1.3329, + "loss": 1.3323, "step": 9030 }, { "epoch": 0.6138741676858269, - "grad_norm": 0.6778438091278076, + "grad_norm": 0.6818050146102905, "learning_rate": 4.616328645196358e-05, - "loss": 1.4025, + "loss": 1.4032, "step": 9035 }, { "epoch": 0.6142138877564887, - "grad_norm": 0.6079490780830383, + "grad_norm": 0.6065679788589478, "learning_rate": 4.616116320152195e-05, - "loss": 1.4282, + "loss": 1.4278, "step": 9040 }, { "epoch": 0.6145536078271504, - "grad_norm": 0.6796638369560242, + "grad_norm": 0.6794381141662598, "learning_rate": 4.6159039951080316e-05, - "loss": 1.3924, + "loss": 1.3928, "step": 9045 }, { "epoch": 0.6148933278978121, - "grad_norm": 0.6430286765098572, + "grad_norm": 0.6409249901771545, "learning_rate": 4.615691670063867e-05, - "loss": 1.3837, + "loss": 1.3842, "step": 9050 }, { "epoch": 0.615233047968474, - "grad_norm": 0.6890765428543091, + "grad_norm": 0.6892575025558472, "learning_rate": 4.6154793450197044e-05, - "loss": 1.5043, + "loss": 1.5037, "step": 9055 }, { "epoch": 0.6155727680391357, - "grad_norm": 0.634694516658783, + "grad_norm": 0.6356381177902222, "learning_rate": 4.61526701997554e-05, - "loss": 1.3749, + "loss": 1.3751, "step": 9060 }, { "epoch": 0.6159124881097975, - "grad_norm": 0.6552861928939819, + "grad_norm": 0.6551557779312134, "learning_rate": 4.6150546949313765e-05, - "loss": 1.4356, + "loss": 1.4361, "step": 9065 }, { "epoch": 0.6162522081804593, - "grad_norm": 0.6340364217758179, + "grad_norm": 0.6361265778541565, "learning_rate": 4.6148423698872136e-05, - "loss": 1.3417, + "loss": 1.3418, "step": 9070 }, { "epoch": 0.6165919282511211, - "grad_norm": 0.6809887886047363, + "grad_norm": 0.6796130537986755, "learning_rate": 4.614630044843049e-05, - "loss": 1.3178, + "loss": 1.318, "step": 9075 }, { "epoch": 0.6169316483217828, - "grad_norm": 0.6862459182739258, + "grad_norm": 0.6846082806587219, "learning_rate": 4.614417719798886e-05, - "loss": 1.334, + "loss": 1.3337, "step": 9080 }, { "epoch": 0.6172713683924447, - "grad_norm": 0.6462175250053406, + "grad_norm": 0.6449984312057495, "learning_rate": 4.614205394754723e-05, - "loss": 1.3098, + "loss": 1.3101, "step": 9085 }, { "epoch": 0.6176110884631064, - "grad_norm": 0.6594539284706116, + "grad_norm": 0.6591634750366211, "learning_rate": 4.6139930697105585e-05, - "loss": 1.3112, + "loss": 1.3118, "step": 9090 }, { "epoch": 0.6179508085337682, - "grad_norm": 0.6995545625686646, + "grad_norm": 0.6992005109786987, "learning_rate": 4.613780744666395e-05, - "loss": 1.3699, + "loss": 1.37, "step": 9095 }, { "epoch": 0.6182905286044299, - "grad_norm": 0.6023643016815186, + "grad_norm": 0.6026031374931335, "learning_rate": 4.613568419622232e-05, - "loss": 1.446, + "loss": 1.4458, "step": 9100 }, { "epoch": 0.6186302486750918, - "grad_norm": 0.6412670016288757, + "grad_norm": 0.6417717337608337, "learning_rate": 4.613356094578068e-05, "loss": 1.3672, "step": 9105 }, { "epoch": 0.6189699687457535, - "grad_norm": 0.6643955707550049, + "grad_norm": 0.6635472178459167, "learning_rate": 4.613143769533904e-05, - "loss": 1.2998, + "loss": 1.299, "step": 9110 }, { "epoch": 0.6193096888164152, - "grad_norm": 0.7588699460029602, + "grad_norm": 0.7598823308944702, "learning_rate": 4.612931444489741e-05, - "loss": 1.4095, + "loss": 1.41, "step": 9115 }, { "epoch": 0.6196494088870771, - "grad_norm": 0.5423884987831116, + "grad_norm": 0.5403327345848083, "learning_rate": 4.612719119445577e-05, - "loss": 1.3938, + "loss": 1.3931, "step": 9120 }, { "epoch": 0.6199891289577388, - "grad_norm": 0.5800355672836304, + "grad_norm": 0.581428050994873, "learning_rate": 4.612506794401413e-05, - "loss": 1.358, + "loss": 1.3575, "step": 9125 }, { "epoch": 0.6203288490284006, - "grad_norm": 0.7485507130622864, + "grad_norm": 0.7497872710227966, "learning_rate": 4.6122944693572504e-05, - "loss": 1.3613, + "loss": 1.3612, "step": 9130 }, { "epoch": 0.6206685690990623, - "grad_norm": 0.6042966842651367, + "grad_norm": 0.6001119017601013, "learning_rate": 4.612082144313086e-05, - "loss": 1.4026, + "loss": 1.403, "step": 9135 }, { "epoch": 0.6210082891697242, - "grad_norm": 0.6381845474243164, + "grad_norm": 0.6408796310424805, "learning_rate": 4.6118698192689225e-05, - "loss": 1.3385, + "loss": 1.3386, "step": 9140 }, { "epoch": 0.6213480092403859, - "grad_norm": 0.6302849650382996, + "grad_norm": 0.6310064792633057, "learning_rate": 4.611657494224759e-05, - "loss": 1.342, + "loss": 1.3423, "step": 9145 }, { "epoch": 0.6216877293110477, - "grad_norm": 0.6172165870666504, + "grad_norm": 0.6168960928916931, "learning_rate": 4.611445169180595e-05, - "loss": 1.4811, + "loss": 1.4808, "step": 9150 }, { "epoch": 0.6220274493817095, - "grad_norm": 0.641302764415741, + "grad_norm": 0.6384915709495544, "learning_rate": 4.611232844136432e-05, - "loss": 1.3648, + "loss": 1.3642, "step": 9155 }, { "epoch": 0.6223671694523712, - "grad_norm": 0.6315203905105591, + "grad_norm": 0.6317213773727417, "learning_rate": 4.611020519092268e-05, - "loss": 1.4134, + "loss": 1.4137, "step": 9160 }, { "epoch": 0.622706889523033, - "grad_norm": 0.67299485206604, + "grad_norm": 0.6732621788978577, "learning_rate": 4.6108081940481045e-05, - "loss": 1.3513, + "loss": 1.352, "step": 9165 }, { "epoch": 0.6230466095936948, - "grad_norm": 0.6160862445831299, + "grad_norm": 0.6161730289459229, "learning_rate": 4.610595869003941e-05, - "loss": 1.3066, + "loss": 1.3057, "step": 9170 }, { "epoch": 0.6233863296643566, - "grad_norm": 0.7062851190567017, + "grad_norm": 0.7071179747581482, "learning_rate": 4.610383543959777e-05, - "loss": 1.3928, + "loss": 1.3933, "step": 9175 }, { "epoch": 0.6237260497350183, - "grad_norm": 0.8130349516868591, + "grad_norm": 0.8129214644432068, "learning_rate": 4.610171218915614e-05, - "loss": 1.345, + "loss": 1.3449, "step": 9180 }, { "epoch": 0.6240657698056801, - "grad_norm": 0.739722490310669, + "grad_norm": 0.7444182634353638, "learning_rate": 4.60995889387145e-05, - "loss": 1.4844, + "loss": 1.4851, "step": 9185 }, { "epoch": 0.6244054898763419, - "grad_norm": 0.6197490692138672, + "grad_norm": 0.6194591522216797, "learning_rate": 4.6097465688272865e-05, - "loss": 1.417, + "loss": 1.4171, "step": 9190 }, { "epoch": 0.6247452099470037, - "grad_norm": 0.642519474029541, + "grad_norm": 0.6405376195907593, "learning_rate": 4.609534243783123e-05, - "loss": 1.4504, + "loss": 1.4501, "step": 9195 }, { "epoch": 0.6250849300176654, - "grad_norm": 0.6041985750198364, + "grad_norm": 0.604480504989624, "learning_rate": 4.609321918738959e-05, - "loss": 1.3105, + "loss": 1.3109, "step": 9200 }, { "epoch": 0.6254246500883273, - "grad_norm": 0.7667335867881775, + "grad_norm": 0.7666611671447754, "learning_rate": 4.609109593694796e-05, "loss": 1.407, "step": 9205 }, { "epoch": 0.625764370158989, - "grad_norm": 0.6304047703742981, + "grad_norm": 0.6320334076881409, "learning_rate": 4.608897268650632e-05, - "loss": 1.4449, + "loss": 1.4445, "step": 9210 }, { "epoch": 0.6261040902296507, - "grad_norm": 0.6934990882873535, + "grad_norm": 0.6995032429695129, "learning_rate": 4.6086849436064685e-05, - "loss": 1.344, + "loss": 1.3441, "step": 9215 }, { "epoch": 0.6264438103003126, - "grad_norm": 0.6668086051940918, + "grad_norm": 0.6667202711105347, "learning_rate": 4.608472618562305e-05, - "loss": 1.4584, + "loss": 1.4573, "step": 9220 }, { "epoch": 0.6267835303709743, - "grad_norm": 0.711803138256073, + "grad_norm": 0.7114541530609131, "learning_rate": 4.608260293518141e-05, - "loss": 1.4036, + "loss": 1.4031, "step": 9225 }, { "epoch": 0.6271232504416361, - "grad_norm": 0.6676630973815918, + "grad_norm": 0.6686276793479919, "learning_rate": 4.608047968473978e-05, - "loss": 1.3854, + "loss": 1.3867, "step": 9230 }, { "epoch": 0.6274629705122978, - "grad_norm": 0.5944895148277283, + "grad_norm": 0.5926764011383057, "learning_rate": 4.607835643429814e-05, - "loss": 1.3905, + "loss": 1.391, "step": 9235 }, { "epoch": 0.6278026905829597, - "grad_norm": 0.5850406289100647, + "grad_norm": 0.5864980816841125, "learning_rate": 4.6076233183856505e-05, - "loss": 1.3627, + "loss": 1.3628, "step": 9240 }, { "epoch": 0.6281424106536214, - "grad_norm": 0.7356735467910767, + "grad_norm": 0.7356739044189453, "learning_rate": 4.607410993341487e-05, - "loss": 1.436, + "loss": 1.4355, "step": 9245 }, { "epoch": 0.6284821307242832, - "grad_norm": 0.7528555989265442, + "grad_norm": 0.7520934343338013, "learning_rate": 4.607198668297323e-05, - "loss": 1.3113, + "loss": 1.3112, "step": 9250 }, { "epoch": 0.628821850794945, - "grad_norm": 0.6859064698219299, + "grad_norm": 0.6856005787849426, "learning_rate": 4.60698634325316e-05, - "loss": 1.3161, + "loss": 1.3167, "step": 9255 }, { "epoch": 0.6291615708656068, - "grad_norm": 0.642941415309906, + "grad_norm": 0.638933002948761, "learning_rate": 4.6067740182089955e-05, - "loss": 1.4547, + "loss": 1.4544, "step": 9260 }, { "epoch": 0.6295012909362685, - "grad_norm": 0.6627421379089355, + "grad_norm": 0.6620590686798096, "learning_rate": 4.6065616931648325e-05, - "loss": 1.2929, + "loss": 1.2925, "step": 9265 }, { "epoch": 0.6298410110069302, - "grad_norm": 0.6617950201034546, + "grad_norm": 0.6601917147636414, "learning_rate": 4.606349368120669e-05, - "loss": 1.3382, + "loss": 1.338, "step": 9270 }, { "epoch": 0.6301807310775921, - "grad_norm": 0.6738972663879395, + "grad_norm": 0.675144374370575, "learning_rate": 4.6061370430765047e-05, - "loss": 1.3662, + "loss": 1.3655, "step": 9275 }, { "epoch": 0.6305204511482538, - "grad_norm": 0.5935032963752747, + "grad_norm": 0.5944033265113831, "learning_rate": 4.605924718032342e-05, - "loss": 1.3452, + "loss": 1.3447, "step": 9280 }, { "epoch": 0.6308601712189156, - "grad_norm": 0.641117513179779, + "grad_norm": 0.6410486698150635, "learning_rate": 4.605712392988178e-05, - "loss": 1.4126, + "loss": 1.4124, "step": 9285 }, { "epoch": 0.6311998912895774, - "grad_norm": 0.6209895014762878, + "grad_norm": 0.6212640404701233, "learning_rate": 4.605500067944014e-05, - "loss": 1.3441, + "loss": 1.3444, "step": 9290 }, { "epoch": 0.6315396113602392, - "grad_norm": 0.6649410724639893, + "grad_norm": 0.6639439463615417, "learning_rate": 4.605287742899851e-05, - "loss": 1.4278, + "loss": 1.4283, "step": 9295 }, { "epoch": 0.6318793314309009, - "grad_norm": 0.6906175017356873, + "grad_norm": 0.6898661851882935, "learning_rate": 4.6050754178556873e-05, - "loss": 1.3647, + "loss": 1.3648, "step": 9300 }, { "epoch": 0.6322190515015628, - "grad_norm": 0.5536985993385315, + "grad_norm": 0.5536724328994751, "learning_rate": 4.604863092811523e-05, - "loss": 1.3098, + "loss": 1.3091, "step": 9305 }, { "epoch": 0.6325587715722245, - "grad_norm": 0.6450605988502502, + "grad_norm": 0.6451136469841003, "learning_rate": 4.60465076776736e-05, - "loss": 1.322, + "loss": 1.3219, "step": 9310 }, { "epoch": 0.6328984916428863, - "grad_norm": 0.6113890409469604, + "grad_norm": 0.6017425656318665, "learning_rate": 4.6044384427231965e-05, - "loss": 1.2808, + "loss": 1.2811, "step": 9315 }, { "epoch": 0.633238211713548, - "grad_norm": 0.6422805190086365, + "grad_norm": 0.6427118182182312, "learning_rate": 4.604226117679032e-05, - "loss": 1.2798, + "loss": 1.2796, "step": 9320 }, { "epoch": 0.6335779317842098, - "grad_norm": 0.6318135857582092, + "grad_norm": 0.6267493367195129, "learning_rate": 4.6040137926348693e-05, - "loss": 1.3633, + "loss": 1.3632, "step": 9325 }, { "epoch": 0.6339176518548716, - "grad_norm": 0.62293940782547, + "grad_norm": 0.6224197745323181, "learning_rate": 4.603801467590706e-05, - "loss": 1.4161, + "loss": 1.4162, "step": 9330 }, { "epoch": 0.6342573719255333, - "grad_norm": 0.7394024729728699, + "grad_norm": 0.736854076385498, "learning_rate": 4.6035891425465415e-05, - "loss": 1.3523, + "loss": 1.3528, "step": 9335 }, { "epoch": 0.6345970919961952, - "grad_norm": 0.6278140544891357, + "grad_norm": 0.6301673650741577, "learning_rate": 4.6033768175023785e-05, - "loss": 1.3903, + "loss": 1.3899, "step": 9340 }, { "epoch": 0.6349368120668569, - "grad_norm": 0.702151358127594, + "grad_norm": 0.700330913066864, "learning_rate": 4.603164492458214e-05, - "loss": 1.3482, + "loss": 1.3476, "step": 9345 }, { "epoch": 0.6352765321375187, - "grad_norm": 0.5546267032623291, + "grad_norm": 0.5521506071090698, "learning_rate": 4.602952167414051e-05, - "loss": 1.3292, + "loss": 1.3297, "step": 9350 }, { "epoch": 0.6356162522081804, - "grad_norm": 0.6712154150009155, + "grad_norm": 0.6707620620727539, "learning_rate": 4.602739842369888e-05, - "loss": 1.3712, + "loss": 1.3705, "step": 9355 }, { "epoch": 0.6359559722788423, - "grad_norm": 0.6703802943229675, + "grad_norm": 0.6677772998809814, "learning_rate": 4.6025275173257235e-05, - "loss": 1.3229, + "loss": 1.3228, "step": 9360 }, { "epoch": 0.636295692349504, - "grad_norm": 0.6230586767196655, + "grad_norm": 0.6219545006752014, "learning_rate": 4.60231519228156e-05, "loss": 1.2958, "step": 9365 }, { "epoch": 0.6366354124201657, - "grad_norm": 0.5519856214523315, + "grad_norm": 0.5497100353240967, "learning_rate": 4.602102867237397e-05, - "loss": 1.3783, + "loss": 1.3784, "step": 9370 }, { "epoch": 0.6369751324908276, - "grad_norm": 0.6753631234169006, + "grad_norm": 0.6751018762588501, "learning_rate": 4.601890542193233e-05, - "loss": 1.3171, + "loss": 1.3173, "step": 9375 }, { "epoch": 0.6373148525614893, - "grad_norm": 0.7067842483520508, + "grad_norm": 0.7054197788238525, "learning_rate": 4.60167821714907e-05, - "loss": 1.3057, + "loss": 1.3054, "step": 9380 }, { "epoch": 0.6376545726321511, - "grad_norm": 0.6295456290245056, + "grad_norm": 0.629658579826355, "learning_rate": 4.601465892104906e-05, - "loss": 1.3678, + "loss": 1.3676, "step": 9385 }, { "epoch": 0.6379942927028129, - "grad_norm": 0.6710845828056335, + "grad_norm": 0.6685439944267273, "learning_rate": 4.601253567060742e-05, - "loss": 1.3506, + "loss": 1.3503, "step": 9390 }, { "epoch": 0.6383340127734747, - "grad_norm": 0.5668164491653442, + "grad_norm": 0.5651018619537354, "learning_rate": 4.601041242016579e-05, "loss": 1.2924, "step": 9395 }, { "epoch": 0.6386737328441364, - "grad_norm": 0.5902765989303589, + "grad_norm": 0.589766263961792, "learning_rate": 4.6008289169724153e-05, "loss": 1.3587, "step": 9400 }, { "epoch": 0.6390134529147982, - "grad_norm": 0.6761592626571655, + "grad_norm": 0.6739794611930847, "learning_rate": 4.600616591928251e-05, - "loss": 1.3528, + "loss": 1.3524, "step": 9405 }, { "epoch": 0.63935317298546, - "grad_norm": 0.6213817000389099, + "grad_norm": 0.6235234141349792, "learning_rate": 4.600404266884088e-05, - "loss": 1.3861, + "loss": 1.3863, "step": 9410 }, { "epoch": 0.6396928930561218, - "grad_norm": 0.6939463019371033, + "grad_norm": 0.7025445699691772, "learning_rate": 4.600191941839924e-05, - "loss": 1.3352, + "loss": 1.3351, "step": 9415 }, { "epoch": 0.6400326131267835, - "grad_norm": 0.6145976781845093, + "grad_norm": 0.6129694581031799, "learning_rate": 4.59997961679576e-05, - "loss": 1.3684, + "loss": 1.3678, "step": 9420 }, { "epoch": 0.6403723331974454, - "grad_norm": 0.5914034247398376, + "grad_norm": 0.5915600657463074, "learning_rate": 4.5997672917515973e-05, - "loss": 1.4238, + "loss": 1.4242, "step": 9425 }, { "epoch": 0.6407120532681071, - "grad_norm": 0.6548230051994324, + "grad_norm": 0.6478348970413208, "learning_rate": 4.599554966707433e-05, - "loss": 1.3183, + "loss": 1.3184, "step": 9430 }, { "epoch": 0.6410517733387688, - "grad_norm": 0.6701985001564026, + "grad_norm": 0.6688113808631897, "learning_rate": 4.5993426416632695e-05, - "loss": 1.2767, + "loss": 1.2775, "step": 9435 }, { "epoch": 0.6413914934094306, - "grad_norm": 0.6515761613845825, + "grad_norm": 0.6526153683662415, "learning_rate": 4.5991303166191066e-05, - "loss": 1.2968, + "loss": 1.2971, "step": 9440 }, { "epoch": 0.6417312134800924, - "grad_norm": 0.6450496315956116, + "grad_norm": 0.6435157060623169, "learning_rate": 4.598917991574942e-05, - "loss": 1.4845, + "loss": 1.4842, "step": 9445 }, { "epoch": 0.6420709335507542, - "grad_norm": 0.6368112564086914, + "grad_norm": 0.6378527283668518, "learning_rate": 4.598705666530779e-05, - "loss": 1.3298, + "loss": 1.3299, "step": 9450 }, { "epoch": 0.6424106536214159, - "grad_norm": 0.7002708315849304, + "grad_norm": 0.6959858536720276, "learning_rate": 4.598493341486616e-05, - "loss": 1.3902, + "loss": 1.3899, "step": 9455 }, { "epoch": 0.6427503736920778, - "grad_norm": 0.6493120193481445, + "grad_norm": 0.6492866277694702, "learning_rate": 4.5982810164424515e-05, - "loss": 1.4037, + "loss": 1.4038, "step": 9460 }, { "epoch": 0.6430900937627395, - "grad_norm": 0.6865414381027222, + "grad_norm": 0.6869975924491882, "learning_rate": 4.598068691398288e-05, "loss": 1.2946, "step": 9465 }, { "epoch": 0.6434298138334013, - "grad_norm": 0.6673260927200317, + "grad_norm": 0.673953652381897, "learning_rate": 4.597856366354125e-05, - "loss": 1.3747, + "loss": 1.3746, "step": 9470 }, { "epoch": 0.6437695339040631, - "grad_norm": 0.6680063009262085, + "grad_norm": 0.6707149744033813, "learning_rate": 4.597644041309961e-05, - "loss": 1.3585, + "loss": 1.3581, "step": 9475 }, { "epoch": 0.6441092539747248, - "grad_norm": 0.6357535123825073, + "grad_norm": 0.6396323442459106, "learning_rate": 4.597431716265797e-05, - "loss": 1.3602, + "loss": 1.3606, "step": 9480 }, { "epoch": 0.6444489740453866, - "grad_norm": 0.7351832985877991, + "grad_norm": 0.7113803029060364, "learning_rate": 4.5972193912216335e-05, - "loss": 1.4211, + "loss": 1.421, "step": 9485 }, { "epoch": 0.6447886941160483, - "grad_norm": 0.6761483550071716, + "grad_norm": 0.6755771040916443, "learning_rate": 4.59700706617747e-05, - "loss": 1.4096, + "loss": 1.4094, "step": 9490 }, { "epoch": 0.6451284141867102, - "grad_norm": 0.6655097007751465, + "grad_norm": 0.6690597534179688, "learning_rate": 4.596794741133306e-05, - "loss": 1.373, + "loss": 1.3733, "step": 9495 }, { "epoch": 0.6454681342573719, - "grad_norm": 0.6212973594665527, + "grad_norm": 0.6255130171775818, "learning_rate": 4.596582416089143e-05, - "loss": 1.388, + "loss": 1.3867, "step": 9500 }, { "epoch": 0.6458078543280337, - "grad_norm": 0.6282244920730591, + "grad_norm": 0.6306777000427246, "learning_rate": 4.596370091044979e-05, - "loss": 1.3183, + "loss": 1.3181, "step": 9505 }, { "epoch": 0.6461475743986955, - "grad_norm": 0.6414934396743774, + "grad_norm": 0.6393272280693054, "learning_rate": 4.5961577660008155e-05, - "loss": 1.3416, + "loss": 1.3423, "step": 9510 }, { "epoch": 0.6464872944693573, - "grad_norm": 0.6590577960014343, + "grad_norm": 0.6595920920372009, "learning_rate": 4.595945440956652e-05, - "loss": 1.4681, + "loss": 1.468, "step": 9515 }, { "epoch": 0.646827014540019, - "grad_norm": 0.6200308203697205, + "grad_norm": 0.6203466057777405, "learning_rate": 4.595733115912488e-05, - "loss": 1.3792, + "loss": 1.3788, "step": 9520 }, { "epoch": 0.6471667346106807, - "grad_norm": 0.7878032326698303, + "grad_norm": 0.7850175499916077, "learning_rate": 4.595520790868325e-05, - "loss": 1.366, + "loss": 1.365, "step": 9525 }, { "epoch": 0.6475064546813426, - "grad_norm": 0.5594908595085144, + "grad_norm": 0.5596631765365601, "learning_rate": 4.595308465824161e-05, - "loss": 1.3368, + "loss": 1.3373, "step": 9530 }, { "epoch": 0.6478461747520043, - "grad_norm": 0.6782909035682678, + "grad_norm": 0.6840354800224304, "learning_rate": 4.5950961407799975e-05, - "loss": 1.3882, + "loss": 1.3888, "step": 9535 }, { "epoch": 0.6481858948226661, - "grad_norm": 0.6730229258537292, + "grad_norm": 0.6667349338531494, "learning_rate": 4.594883815735834e-05, - "loss": 1.4006, + "loss": 1.4004, "step": 9540 }, { "epoch": 0.6485256148933279, - "grad_norm": 0.6307126879692078, + "grad_norm": 0.6300223469734192, "learning_rate": 4.59467149069167e-05, - "loss": 1.3597, + "loss": 1.3593, "step": 9545 }, { "epoch": 0.6488653349639897, - "grad_norm": 0.7037644982337952, + "grad_norm": 0.7056833505630493, "learning_rate": 4.594459165647507e-05, - "loss": 1.4252, + "loss": 1.426, "step": 9550 }, { "epoch": 0.6492050550346514, - "grad_norm": 0.5555718541145325, + "grad_norm": 0.5542994141578674, "learning_rate": 4.594246840603343e-05, - "loss": 1.3795, + "loss": 1.3797, "step": 9555 }, { "epoch": 0.6495447751053133, - "grad_norm": 0.6250681281089783, + "grad_norm": 0.625950038433075, "learning_rate": 4.5940345155591795e-05, - "loss": 1.2769, + "loss": 1.2773, "step": 9560 }, { "epoch": 0.649884495175975, - "grad_norm": 0.6747601628303528, + "grad_norm": 0.674426257610321, "learning_rate": 4.593822190515016e-05, - "loss": 1.4018, + "loss": 1.4013, "step": 9565 }, { "epoch": 0.6502242152466368, - "grad_norm": 0.6627749800682068, + "grad_norm": 0.6635863184928894, "learning_rate": 4.593609865470852e-05, - "loss": 1.3658, + "loss": 1.3659, "step": 9570 }, { "epoch": 0.6505639353172985, - "grad_norm": 0.6767715811729431, + "grad_norm": 0.6762595176696777, "learning_rate": 4.593397540426689e-05, - "loss": 1.3927, + "loss": 1.3924, "step": 9575 }, { "epoch": 0.6509036553879604, - "grad_norm": 0.6625750660896301, + "grad_norm": 0.6594296097755432, "learning_rate": 4.593185215382525e-05, - "loss": 1.3192, + "loss": 1.3187, "step": 9580 }, { "epoch": 0.6512433754586221, - "grad_norm": 0.6238129734992981, + "grad_norm": 0.6226010322570801, "learning_rate": 4.5929728903383615e-05, - "loss": 1.3058, + "loss": 1.3064, "step": 9585 }, { "epoch": 0.6515830955292838, - "grad_norm": 0.5913823843002319, + "grad_norm": 0.5923200845718384, "learning_rate": 4.592760565294198e-05, - "loss": 1.3467, + "loss": 1.3468, "step": 9590 }, { "epoch": 0.6519228155999457, - "grad_norm": 0.6385979652404785, + "grad_norm": 0.6385905146598816, "learning_rate": 4.592548240250034e-05, - "loss": 1.4386, + "loss": 1.4394, "step": 9595 }, { "epoch": 0.6522625356706074, - "grad_norm": 0.6688828468322754, + "grad_norm": 0.6688965559005737, "learning_rate": 4.592335915205871e-05, - "loss": 1.43, + "loss": 1.4302, "step": 9600 }, { "epoch": 0.6526022557412692, - "grad_norm": 0.662375271320343, + "grad_norm": 0.6555618047714233, "learning_rate": 4.592123590161707e-05, - "loss": 1.3591, + "loss": 1.3586, "step": 9605 }, { "epoch": 0.6529419758119309, - "grad_norm": 0.6747382283210754, + "grad_norm": 0.6792871952056885, "learning_rate": 4.5919112651175435e-05, - "loss": 1.362, + "loss": 1.3614, "step": 9610 }, { "epoch": 0.6532816958825928, - "grad_norm": 0.6197258830070496, + "grad_norm": 0.6181502938270569, "learning_rate": 4.591698940073379e-05, - "loss": 1.3253, + "loss": 1.325, "step": 9615 }, { "epoch": 0.6536214159532545, - "grad_norm": 0.7278828024864197, + "grad_norm": 0.7264374494552612, "learning_rate": 4.591486615029216e-05, - "loss": 1.3492, + "loss": 1.3482, "step": 9620 }, { "epoch": 0.6539611360239163, - "grad_norm": 0.6371850967407227, + "grad_norm": 0.6408815979957581, "learning_rate": 4.591274289985053e-05, - "loss": 1.4012, + "loss": 1.4011, "step": 9625 }, { "epoch": 0.6543008560945781, - "grad_norm": 0.6267068386077881, + "grad_norm": 0.626911461353302, "learning_rate": 4.5910619649408884e-05, - "loss": 1.3798, + "loss": 1.3799, "step": 9630 }, { "epoch": 0.6546405761652399, - "grad_norm": 0.666438102722168, + "grad_norm": 0.6695504188537598, "learning_rate": 4.5908496398967255e-05, - "loss": 1.3579, + "loss": 1.3571, "step": 9635 }, { "epoch": 0.6549802962359016, - "grad_norm": 0.7566459774971008, + "grad_norm": 0.7539008259773254, "learning_rate": 4.590637314852562e-05, - "loss": 1.3679, + "loss": 1.3686, "step": 9640 }, { "epoch": 0.6553200163065634, - "grad_norm": 0.6004300713539124, + "grad_norm": 0.6009779572486877, "learning_rate": 4.5904249898083976e-05, - "loss": 1.3402, + "loss": 1.3398, "step": 9645 }, { "epoch": 0.6556597363772252, - "grad_norm": 0.6538270115852356, + "grad_norm": 0.6565635204315186, "learning_rate": 4.590212664764235e-05, - "loss": 1.3925, + "loss": 1.3924, "step": 9650 }, { "epoch": 0.6559994564478869, - "grad_norm": 0.6357282400131226, + "grad_norm": 0.6381716132164001, "learning_rate": 4.590000339720071e-05, - "loss": 1.3934, + "loss": 1.3937, "step": 9655 }, { "epoch": 0.6563391765185487, - "grad_norm": 0.6436564326286316, + "grad_norm": 0.6425492763519287, "learning_rate": 4.589788014675907e-05, - "loss": 1.439, + "loss": 1.4389, "step": 9660 }, { "epoch": 0.6566788965892105, - "grad_norm": 0.6942059993743896, + "grad_norm": 0.692817747592926, "learning_rate": 4.589575689631744e-05, - "loss": 1.3961, + "loss": 1.3965, "step": 9665 }, { "epoch": 0.6570186166598723, - "grad_norm": 0.6094388961791992, + "grad_norm": 0.6088672280311584, "learning_rate": 4.58936336458758e-05, - "loss": 1.3108, + "loss": 1.311, "step": 9670 }, { "epoch": 0.657358336730534, - "grad_norm": 0.8097217679023743, + "grad_norm": 0.6743701696395874, "learning_rate": 4.589151039543416e-05, - "loss": 1.2711, + "loss": 1.2705, "step": 9675 }, { "epoch": 0.6576980568011959, - "grad_norm": 0.6026155948638916, + "grad_norm": 0.6033024787902832, "learning_rate": 4.588938714499253e-05, - "loss": 1.3313, + "loss": 1.3314, "step": 9680 }, { "epoch": 0.6580377768718576, - "grad_norm": 0.7359176874160767, + "grad_norm": 0.7370173931121826, "learning_rate": 4.588726389455089e-05, "loss": 1.2888, "step": 9685 }, { "epoch": 0.6583774969425193, - "grad_norm": 0.6421589255332947, + "grad_norm": 0.6422519683837891, "learning_rate": 4.588514064410925e-05, - "loss": 1.3364, + "loss": 1.3362, "step": 9690 }, { "epoch": 0.6587172170131811, - "grad_norm": 0.6737176775932312, + "grad_norm": 0.6741033792495728, "learning_rate": 4.588301739366762e-05, "loss": 1.3345, "step": 9695 }, { "epoch": 0.6590569370838429, - "grad_norm": 0.6418071389198303, + "grad_norm": 0.6446143984794617, "learning_rate": 4.588089414322598e-05, "loss": 1.3952, "step": 9700 }, { "epoch": 0.6593966571545047, - "grad_norm": 0.7056443095207214, + "grad_norm": 0.6968781352043152, "learning_rate": 4.5878770892784344e-05, - "loss": 1.3663, + "loss": 1.3665, "step": 9705 }, { "epoch": 0.6597363772251664, - "grad_norm": 0.6683934926986694, + "grad_norm": 0.6711854934692383, "learning_rate": 4.5876647642342715e-05, - "loss": 1.3471, + "loss": 1.3465, "step": 9710 }, { "epoch": 0.6600760972958283, - "grad_norm": 0.6746886372566223, + "grad_norm": 0.6789722442626953, "learning_rate": 4.587452439190107e-05, - "loss": 1.3707, + "loss": 1.3713, "step": 9715 }, { "epoch": 0.66041581736649, - "grad_norm": 0.634894073009491, + "grad_norm": 0.6360574960708618, "learning_rate": 4.587240114145944e-05, - "loss": 1.4886, + "loss": 1.4884, "step": 9720 }, { "epoch": 0.6607555374371518, - "grad_norm": 0.6497269868850708, + "grad_norm": 0.6462368965148926, "learning_rate": 4.587027789101781e-05, - "loss": 1.3776, + "loss": 1.3774, "step": 9725 }, { "epoch": 0.6610952575078136, - "grad_norm": 0.577656090259552, + "grad_norm": 0.5708618760108948, "learning_rate": 4.5868154640576164e-05, - "loss": 1.2446, + "loss": 1.2455, "step": 9730 }, { "epoch": 0.6614349775784754, - "grad_norm": 0.5996268391609192, + "grad_norm": 0.5966699719429016, "learning_rate": 4.5866031390134535e-05, - "loss": 1.3608, + "loss": 1.3607, "step": 9735 }, { "epoch": 0.6617746976491371, - "grad_norm": 0.602403461933136, + "grad_norm": 0.6049942970275879, "learning_rate": 4.58639081396929e-05, - "loss": 1.4305, + "loss": 1.4309, "step": 9740 }, { "epoch": 0.6621144177197988, - "grad_norm": 0.5692216157913208, + "grad_norm": 0.5673356652259827, "learning_rate": 4.5861784889251256e-05, - "loss": 1.376, + "loss": 1.3757, "step": 9745 }, { "epoch": 0.6624541377904607, - "grad_norm": 0.6115549206733704, + "grad_norm": 0.6122317910194397, "learning_rate": 4.585966163880963e-05, - "loss": 1.3595, + "loss": 1.3597, "step": 9750 }, { "epoch": 0.6627938578611224, - "grad_norm": 0.6958981156349182, + "grad_norm": 0.6996778249740601, "learning_rate": 4.585753838836799e-05, - "loss": 1.3745, + "loss": 1.3742, "step": 9755 }, { "epoch": 0.6631335779317842, - "grad_norm": 0.6716538667678833, + "grad_norm": 0.6699128150939941, "learning_rate": 4.585541513792635e-05, - "loss": 1.3938, + "loss": 1.3939, "step": 9760 }, { "epoch": 0.663473298002446, - "grad_norm": 0.641048014163971, + "grad_norm": 0.6456990242004395, "learning_rate": 4.585329188748472e-05, - "loss": 1.2594, + "loss": 1.2597, "step": 9765 }, { "epoch": 0.6638130180731078, - "grad_norm": 0.6822525858879089, + "grad_norm": 0.6850539445877075, "learning_rate": 4.5851168637043076e-05, - "loss": 1.4685, + "loss": 1.4684, "step": 9770 }, { "epoch": 0.6641527381437695, - "grad_norm": 0.617434024810791, + "grad_norm": 0.6190826892852783, "learning_rate": 4.584904538660144e-05, - "loss": 1.3618, + "loss": 1.3611, "step": 9775 }, { "epoch": 0.6644924582144313, - "grad_norm": 0.6560618281364441, + "grad_norm": 0.6523025631904602, "learning_rate": 4.584692213615981e-05, - "loss": 1.4008, + "loss": 1.4005, "step": 9780 }, { "epoch": 0.6648321782850931, - "grad_norm": 0.7547979950904846, + "grad_norm": 0.7554864287376404, "learning_rate": 4.584479888571817e-05, - "loss": 1.4516, + "loss": 1.4512, "step": 9785 }, { "epoch": 0.6651718983557549, - "grad_norm": 0.615734338760376, + "grad_norm": 0.6170873641967773, "learning_rate": 4.584267563527653e-05, - "loss": 1.4049, + "loss": 1.4055, "step": 9790 }, { "epoch": 0.6655116184264166, - "grad_norm": 0.741465151309967, + "grad_norm": 0.7452453374862671, "learning_rate": 4.58405523848349e-05, - "loss": 1.2591, + "loss": 1.2586, "step": 9795 }, { "epoch": 0.6658513384970784, - "grad_norm": 0.5768656730651855, + "grad_norm": 0.5752387642860413, "learning_rate": 4.583842913439326e-05, - "loss": 1.2828, + "loss": 1.2832, "step": 9800 }, { "epoch": 0.6661910585677402, - "grad_norm": 0.6337997317314148, + "grad_norm": 0.6374873518943787, "learning_rate": 4.5836305883951624e-05, - "loss": 1.3354, + "loss": 1.335, "step": 9805 }, { "epoch": 0.6665307786384019, - "grad_norm": 0.6725940704345703, + "grad_norm": 0.6738415956497192, "learning_rate": 4.5834182633509995e-05, - "loss": 1.3705, + "loss": 1.3709, "step": 9810 }, { "epoch": 0.6668704987090638, - "grad_norm": 0.750545084476471, + "grad_norm": 0.7572739124298096, "learning_rate": 4.583205938306835e-05, - "loss": 1.3244, + "loss": 1.3243, "step": 9815 }, { "epoch": 0.6672102187797255, - "grad_norm": 0.7528793215751648, + "grad_norm": 0.7511005401611328, "learning_rate": 4.5829936132626716e-05, - "loss": 1.339, + "loss": 1.3385, "step": 9820 }, { "epoch": 0.6675499388503873, - "grad_norm": 0.7245656251907349, + "grad_norm": 0.7180032730102539, "learning_rate": 4.582781288218509e-05, - "loss": 1.3719, + "loss": 1.3722, "step": 9825 }, { "epoch": 0.667889658921049, - "grad_norm": 0.6033870577812195, + "grad_norm": 0.6026372909545898, "learning_rate": 4.5825689631743444e-05, "loss": 1.3602, "step": 9830 }, { "epoch": 0.6682293789917109, - "grad_norm": 0.6469632983207703, + "grad_norm": 0.6468170285224915, "learning_rate": 4.582356638130181e-05, - "loss": 1.3224, + "loss": 1.3223, "step": 9835 }, { "epoch": 0.6685690990623726, - "grad_norm": 0.6322104930877686, + "grad_norm": 0.624422550201416, "learning_rate": 4.582144313086017e-05, - "loss": 1.4193, + "loss": 1.419, "step": 9840 }, { "epoch": 0.6689088191330343, - "grad_norm": 0.7348239421844482, + "grad_norm": 0.7368254065513611, "learning_rate": 4.5819319880418536e-05, - "loss": 1.2968, + "loss": 1.2967, "step": 9845 }, { "epoch": 0.6692485392036962, - "grad_norm": 0.6168177127838135, + "grad_norm": 0.6162317991256714, "learning_rate": 4.58171966299769e-05, - "loss": 1.3212, + "loss": 1.3213, "step": 9850 }, { "epoch": 0.6695882592743579, - "grad_norm": 0.7116106748580933, + "grad_norm": 0.7086217999458313, "learning_rate": 4.5815073379535264e-05, - "loss": 1.3936, + "loss": 1.3939, "step": 9855 }, { "epoch": 0.6699279793450197, - "grad_norm": 0.6527019143104553, + "grad_norm": 0.6512290835380554, "learning_rate": 4.581295012909363e-05, - "loss": 1.4694, + "loss": 1.4692, "step": 9860 }, { "epoch": 0.6702676994156814, - "grad_norm": 0.5700908899307251, + "grad_norm": 0.5678028464317322, "learning_rate": 4.581082687865199e-05, - "loss": 1.3862, + "loss": 1.386, "step": 9865 }, { "epoch": 0.6706074194863433, - "grad_norm": 0.6820536851882935, + "grad_norm": 0.6825844645500183, "learning_rate": 4.5808703628210356e-05, - "loss": 1.4069, + "loss": 1.4076, "step": 9870 }, { "epoch": 0.670947139557005, - "grad_norm": 0.6084043383598328, + "grad_norm": 0.6090874075889587, "learning_rate": 4.580658037776872e-05, - "loss": 1.4985, + "loss": 1.4981, "step": 9875 }, { "epoch": 0.6712868596276668, - "grad_norm": 0.6403409242630005, + "grad_norm": 0.6391855478286743, "learning_rate": 4.5804457127327084e-05, - "loss": 1.2438, + "loss": 1.2444, "step": 9880 }, { "epoch": 0.6716265796983286, - "grad_norm": 0.7354403138160706, + "grad_norm": 0.7378010749816895, "learning_rate": 4.580233387688545e-05, - "loss": 1.4273, + "loss": 1.4272, "step": 9885 }, { "epoch": 0.6719662997689904, - "grad_norm": 0.6923107504844666, + "grad_norm": 0.6893441081047058, "learning_rate": 4.580021062644381e-05, "loss": 1.3991, "step": 9890 }, { "epoch": 0.6723060198396521, - "grad_norm": 0.65451580286026, + "grad_norm": 0.6527566909790039, "learning_rate": 4.5798087376002176e-05, - "loss": 1.2395, + "loss": 1.2391, "step": 9895 }, { "epoch": 0.672645739910314, - "grad_norm": 0.6222007274627686, + "grad_norm": 0.6231962442398071, "learning_rate": 4.579596412556054e-05, - "loss": 1.414, + "loss": 1.4133, "step": 9900 }, { "epoch": 0.6729854599809757, - "grad_norm": 0.6794261932373047, + "grad_norm": 0.6785210371017456, "learning_rate": 4.5793840875118904e-05, "loss": 1.3881, "step": 9905 }, { "epoch": 0.6733251800516374, - "grad_norm": 0.6541102528572083, + "grad_norm": 0.6544910073280334, "learning_rate": 4.579171762467727e-05, - "loss": 1.4314, + "loss": 1.4312, "step": 9910 }, { "epoch": 0.6736649001222992, - "grad_norm": 0.6328390836715698, + "grad_norm": 0.6469367146492004, "learning_rate": 4.578959437423563e-05, - "loss": 1.5039, + "loss": 1.5042, "step": 9915 }, { "epoch": 0.674004620192961, - "grad_norm": 0.6314879059791565, + "grad_norm": 0.633521318435669, "learning_rate": 4.5787471123793996e-05, - "loss": 1.3484, + "loss": 1.3483, "step": 9920 }, { "epoch": 0.6743443402636228, - "grad_norm": 0.6886136531829834, + "grad_norm": 0.6853405237197876, "learning_rate": 4.578534787335236e-05, - "loss": 1.3884, + "loss": 1.3889, "step": 9925 }, { "epoch": 0.6746840603342845, - "grad_norm": 0.6789883971214294, + "grad_norm": 0.6813361048698425, "learning_rate": 4.5783224622910724e-05, - "loss": 1.4977, + "loss": 1.4982, "step": 9930 }, { "epoch": 0.6750237804049464, - "grad_norm": 0.6783531904220581, + "grad_norm": 0.6802077889442444, "learning_rate": 4.578110137246909e-05, - "loss": 1.3998, + "loss": 1.3999, "step": 9935 }, { "epoch": 0.6753635004756081, - "grad_norm": 0.6744592785835266, + "grad_norm": 0.6750180125236511, "learning_rate": 4.577897812202745e-05, - "loss": 1.4035, + "loss": 1.4044, "step": 9940 }, { "epoch": 0.6757032205462699, - "grad_norm": 0.64136803150177, + "grad_norm": 0.6422401666641235, "learning_rate": 4.5776854871585816e-05, - "loss": 1.333, + "loss": 1.3332, "step": 9945 }, { "epoch": 0.6760429406169316, - "grad_norm": 0.6484505534172058, + "grad_norm": 0.6511632800102234, "learning_rate": 4.577473162114418e-05, - "loss": 1.4407, + "loss": 1.4405, "step": 9950 }, { "epoch": 0.6763826606875935, - "grad_norm": 0.6182332634925842, + "grad_norm": 0.6183945536613464, "learning_rate": 4.5772608370702544e-05, - "loss": 1.293, + "loss": 1.2933, "step": 9955 }, { "epoch": 0.6767223807582552, - "grad_norm": 0.6932333111763, + "grad_norm": 0.6999959349632263, "learning_rate": 4.577048512026091e-05, - "loss": 1.273, + "loss": 1.2722, "step": 9960 }, { "epoch": 0.6770621008289169, - "grad_norm": 0.646960437297821, + "grad_norm": 0.6455107927322388, "learning_rate": 4.576836186981927e-05, - "loss": 1.3672, + "loss": 1.3674, "step": 9965 }, { "epoch": 0.6774018208995788, - "grad_norm": 0.691676139831543, + "grad_norm": 0.6968721747398376, "learning_rate": 4.576623861937763e-05, - "loss": 1.318, + "loss": 1.3183, "step": 9970 }, { "epoch": 0.6777415409702405, - "grad_norm": 0.6424114108085632, + "grad_norm": 0.64108806848526, "learning_rate": 4.5764115368936e-05, - "loss": 1.3387, + "loss": 1.3394, "step": 9975 }, { "epoch": 0.6780812610409023, - "grad_norm": 0.6947415471076965, + "grad_norm": 0.694321870803833, "learning_rate": 4.5761992118494364e-05, - "loss": 1.3571, + "loss": 1.3573, "step": 9980 }, { "epoch": 0.6784209811115641, - "grad_norm": 0.6696570515632629, + "grad_norm": 0.6680850982666016, "learning_rate": 4.575986886805272e-05, - "loss": 1.3673, + "loss": 1.3671, "step": 9985 }, { "epoch": 0.6787607011822259, - "grad_norm": 0.6752480864524841, + "grad_norm": 0.6760141253471375, "learning_rate": 4.575774561761109e-05, - "loss": 1.3672, + "loss": 1.3673, "step": 9990 }, { "epoch": 0.6791004212528876, - "grad_norm": 0.6025106310844421, + "grad_norm": 0.6011412739753723, "learning_rate": 4.5755622367169456e-05, - "loss": 1.3795, + "loss": 1.3799, "step": 9995 }, { "epoch": 0.6794401413235494, - "grad_norm": 0.6306586861610413, + "grad_norm": 0.6371522545814514, "learning_rate": 4.5753499116727814e-05, - "loss": 1.2824, + "loss": 1.2827, "step": 10000 }, { "epoch": 0.6797798613942112, - "grad_norm": 0.7083040475845337, + "grad_norm": 0.707524836063385, "learning_rate": 4.5751375866286184e-05, - "loss": 1.3134, + "loss": 1.3139, "step": 10005 }, { "epoch": 0.680119581464873, - "grad_norm": 0.6891843676567078, + "grad_norm": 0.6912394762039185, "learning_rate": 4.574925261584455e-05, "loss": 1.2929, "step": 10010 }, { "epoch": 0.6804593015355347, - "grad_norm": 0.6544457077980042, + "grad_norm": 0.6603009700775146, "learning_rate": 4.5747129365402906e-05, - "loss": 1.4114, + "loss": 1.4115, "step": 10015 }, { "epoch": 0.6807990216061965, - "grad_norm": 0.607488751411438, + "grad_norm": 0.6073808670043945, "learning_rate": 4.5745006114961276e-05, "loss": 1.3442, "step": 10020 }, { "epoch": 0.6811387416768583, - "grad_norm": 0.6513368487358093, + "grad_norm": 0.6521961688995361, "learning_rate": 4.574288286451964e-05, - "loss": 1.3319, + "loss": 1.3314, "step": 10025 }, { "epoch": 0.68147846174752, - "grad_norm": 0.5297539234161377, + "grad_norm": 0.5271780490875244, "learning_rate": 4.5740759614078e-05, - "loss": 1.3075, + "loss": 1.3074, "step": 10030 }, { "epoch": 0.6818181818181818, - "grad_norm": 0.6227191686630249, + "grad_norm": 0.620006799697876, "learning_rate": 4.573863636363637e-05, - "loss": 1.3797, + "loss": 1.3792, "step": 10035 }, { "epoch": 0.6821579018888436, - "grad_norm": 0.6473566293716431, + "grad_norm": 0.6503050327301025, "learning_rate": 4.5736513113194726e-05, - "loss": 1.3847, + "loss": 1.3848, "step": 10040 }, { "epoch": 0.6824976219595054, - "grad_norm": 0.6239564418792725, + "grad_norm": 0.625462532043457, "learning_rate": 4.573438986275309e-05, - "loss": 1.343, + "loss": 1.3428, "step": 10045 }, { "epoch": 0.6828373420301671, - "grad_norm": 0.6086986660957336, + "grad_norm": 0.6132872104644775, "learning_rate": 4.573226661231146e-05, - "loss": 1.2983, + "loss": 1.2985, "step": 10050 }, { "epoch": 0.683177062100829, - "grad_norm": 0.6269145011901855, + "grad_norm": 0.6292125582695007, "learning_rate": 4.573014336186982e-05, - "loss": 1.2954, + "loss": 1.2955, "step": 10055 }, { "epoch": 0.6835167821714907, - "grad_norm": 0.6437265276908875, + "grad_norm": 0.6404580473899841, "learning_rate": 4.572802011142819e-05, - "loss": 1.3948, + "loss": 1.3945, "step": 10060 }, { "epoch": 0.6838565022421524, - "grad_norm": 0.5740530490875244, + "grad_norm": 0.5758258104324341, "learning_rate": 4.572589686098655e-05, - "loss": 1.4117, + "loss": 1.4113, "step": 10065 }, { "epoch": 0.6841962223128143, - "grad_norm": 0.6062965393066406, + "grad_norm": 0.6057860255241394, "learning_rate": 4.572377361054491e-05, - "loss": 1.378, + "loss": 1.3777, "step": 10070 }, { "epoch": 0.684535942383476, - "grad_norm": 0.6366291046142578, + "grad_norm": 0.6405147910118103, "learning_rate": 4.572165036010328e-05, - "loss": 1.3278, + "loss": 1.3277, "step": 10075 }, { "epoch": 0.6848756624541378, - "grad_norm": 0.6265395283699036, + "grad_norm": 0.6282364726066589, "learning_rate": 4.5719527109661644e-05, - "loss": 1.4207, + "loss": 1.4214, "step": 10080 }, { "epoch": 0.6852153825247995, - "grad_norm": 0.5947545766830444, + "grad_norm": 0.595890998840332, "learning_rate": 4.571740385922e-05, - "loss": 1.4252, + "loss": 1.4256, "step": 10085 }, { "epoch": 0.6855551025954614, - "grad_norm": 0.6057619452476501, + "grad_norm": 0.6078627705574036, "learning_rate": 4.571528060877837e-05, - "loss": 1.3236, + "loss": 1.3241, "step": 10090 }, { "epoch": 0.6858948226661231, - "grad_norm": 0.698820173740387, + "grad_norm": 0.6993517875671387, "learning_rate": 4.5713157358336736e-05, - "loss": 1.2652, + "loss": 1.2655, "step": 10095 }, { "epoch": 0.6862345427367849, - "grad_norm": 0.6159490942955017, + "grad_norm": 0.6145053505897522, "learning_rate": 4.5711034107895094e-05, - "loss": 1.3296, + "loss": 1.3291, "step": 10100 }, { "epoch": 0.6865742628074467, - "grad_norm": 0.6031413078308105, + "grad_norm": 0.6031091809272766, "learning_rate": 4.5708910857453464e-05, - "loss": 1.3923, + "loss": 1.3926, "step": 10105 }, { "epoch": 0.6869139828781085, - "grad_norm": 0.6347100734710693, + "grad_norm": 0.6412459015846252, "learning_rate": 4.570678760701182e-05, - "loss": 1.2772, + "loss": 1.2767, "step": 10110 }, { "epoch": 0.6872537029487702, - "grad_norm": 0.6727352142333984, + "grad_norm": 0.6731709837913513, "learning_rate": 4.5704664356570186e-05, - "loss": 1.3894, + "loss": 1.3893, "step": 10115 }, { "epoch": 0.6875934230194319, - "grad_norm": 0.6287788152694702, + "grad_norm": 0.6308213472366333, "learning_rate": 4.5702541106128556e-05, - "loss": 1.3129, + "loss": 1.3125, "step": 10120 }, { "epoch": 0.6879331430900938, - "grad_norm": 0.7411888837814331, + "grad_norm": 0.7440937757492065, "learning_rate": 4.5700417855686914e-05, - "loss": 1.4734, + "loss": 1.4728, "step": 10125 }, { "epoch": 0.6882728631607555, - "grad_norm": 0.6954744458198547, + "grad_norm": 0.6960479617118835, "learning_rate": 4.569829460524528e-05, - "loss": 1.3616, + "loss": 1.3621, "step": 10130 }, { "epoch": 0.6886125832314173, - "grad_norm": 0.6447896957397461, + "grad_norm": 0.6436660289764404, "learning_rate": 4.569617135480365e-05, - "loss": 1.4141, + "loss": 1.4139, "step": 10135 }, { "epoch": 0.6889523033020791, - "grad_norm": 0.6385108232498169, + "grad_norm": 0.6377391219139099, "learning_rate": 4.5694048104362006e-05, "loss": 1.4046, "step": 10140 }, { "epoch": 0.6892920233727409, - "grad_norm": 0.7570716142654419, + "grad_norm": 0.7574175596237183, "learning_rate": 4.569192485392037e-05, "loss": 1.4226, "step": 10145 }, { "epoch": 0.6896317434434026, - "grad_norm": 0.6235305070877075, + "grad_norm": 0.6190837025642395, "learning_rate": 4.568980160347874e-05, - "loss": 1.2798, + "loss": 1.2792, "step": 10150 }, { "epoch": 0.6899714635140645, - "grad_norm": 0.6237177848815918, + "grad_norm": 0.6173886656761169, "learning_rate": 4.56876783530371e-05, - "loss": 1.3472, + "loss": 1.3473, "step": 10155 }, { "epoch": 0.6903111835847262, - "grad_norm": 0.6608687043190002, + "grad_norm": 0.6608256101608276, "learning_rate": 4.568555510259546e-05, - "loss": 1.3718, + "loss": 1.3714, "step": 10160 }, { "epoch": 0.690650903655388, - "grad_norm": 0.6903308033943176, + "grad_norm": 0.6893666386604309, "learning_rate": 4.568343185215383e-05, - "loss": 1.3937, + "loss": 1.3935, "step": 10165 }, { "epoch": 0.6909906237260497, - "grad_norm": 0.6505640149116516, + "grad_norm": 0.651647686958313, "learning_rate": 4.568130860171219e-05, - "loss": 1.4443, + "loss": 1.444, "step": 10170 }, { "epoch": 0.6913303437967115, - "grad_norm": 0.7226462364196777, + "grad_norm": 0.7247260808944702, "learning_rate": 4.5679185351270554e-05, - "loss": 1.3357, + "loss": 1.3352, "step": 10175 }, { "epoch": 0.6916700638673733, - "grad_norm": 0.6706532835960388, + "grad_norm": 0.6749725937843323, "learning_rate": 4.5677062100828925e-05, - "loss": 1.4377, + "loss": 1.4373, "step": 10180 }, { "epoch": 0.692009783938035, - "grad_norm": 0.5964548587799072, + "grad_norm": 0.5937505960464478, "learning_rate": 4.567493885038728e-05, - "loss": 1.3318, + "loss": 1.3319, "step": 10185 }, { "epoch": 0.6923495040086969, - "grad_norm": 0.4633779525756836, + "grad_norm": 0.46302735805511475, "learning_rate": 4.5672815599945646e-05, - "loss": 1.3428, + "loss": 1.3431, "step": 10190 }, { "epoch": 0.6926892240793586, - "grad_norm": 0.7144092917442322, + "grad_norm": 0.7327203750610352, "learning_rate": 4.567069234950401e-05, - "loss": 1.4012, + "loss": 1.4005, "step": 10195 }, { "epoch": 0.6930289441500204, - "grad_norm": 0.6145314574241638, + "grad_norm": 0.6168038845062256, "learning_rate": 4.5668569099062374e-05, - "loss": 1.3638, + "loss": 1.3641, "step": 10200 }, { "epoch": 0.6933686642206821, - "grad_norm": 0.699490487575531, + "grad_norm": 0.7014379501342773, "learning_rate": 4.566644584862074e-05, - "loss": 1.2711, + "loss": 1.2718, "step": 10205 }, { "epoch": 0.693708384291344, - "grad_norm": 0.6997410655021667, + "grad_norm": 0.7002038955688477, "learning_rate": 4.56643225981791e-05, - "loss": 1.3239, + "loss": 1.3234, "step": 10210 }, { "epoch": 0.6940481043620057, - "grad_norm": 0.63850998878479, + "grad_norm": 0.636362612247467, "learning_rate": 4.5662199347737466e-05, - "loss": 1.3865, + "loss": 1.3872, "step": 10215 }, { "epoch": 0.6943878244326674, - "grad_norm": 0.6392037272453308, + "grad_norm": 0.637187123298645, "learning_rate": 4.566007609729583e-05, - "loss": 1.4092, + "loss": 1.4091, "step": 10220 }, { "epoch": 0.6947275445033293, - "grad_norm": 0.7251085638999939, + "grad_norm": 0.7229210734367371, "learning_rate": 4.5657952846854194e-05, - "loss": 1.35, + "loss": 1.3496, "step": 10225 }, { "epoch": 0.695067264573991, - "grad_norm": 0.70627361536026, + "grad_norm": 0.7098286151885986, "learning_rate": 4.565582959641256e-05, - "loss": 1.3771, + "loss": 1.3765, "step": 10230 }, { "epoch": 0.6954069846446528, - "grad_norm": 0.5996231436729431, + "grad_norm": 0.59779953956604, "learning_rate": 4.565370634597092e-05, - "loss": 1.3957, + "loss": 1.3949, "step": 10235 }, { "epoch": 0.6957467047153146, - "grad_norm": 0.6541487574577332, + "grad_norm": 0.6550020575523376, "learning_rate": 4.5651583095529286e-05, - "loss": 1.294, + "loss": 1.2943, "step": 10240 }, { "epoch": 0.6960864247859764, - "grad_norm": 0.6588912010192871, + "grad_norm": 0.6625080704689026, "learning_rate": 4.564945984508765e-05, - "loss": 1.4421, + "loss": 1.4422, "step": 10245 }, { "epoch": 0.6964261448566381, - "grad_norm": 0.7020395994186401, + "grad_norm": 0.7003403902053833, "learning_rate": 4.5647336594646014e-05, - "loss": 1.3956, + "loss": 1.395, "step": 10250 }, { "epoch": 0.6967658649272999, - "grad_norm": 0.6050479412078857, + "grad_norm": 0.604834258556366, "learning_rate": 4.564521334420438e-05, - "loss": 1.3626, + "loss": 1.3625, "step": 10255 }, { "epoch": 0.6971055849979617, - "grad_norm": 0.6568503379821777, + "grad_norm": 0.6567161083221436, "learning_rate": 4.564309009376274e-05, - "loss": 1.3887, + "loss": 1.3896, "step": 10260 }, { "epoch": 0.6974453050686235, - "grad_norm": 0.6809664368629456, + "grad_norm": 0.6779438257217407, "learning_rate": 4.5640966843321106e-05, - "loss": 1.327, + "loss": 1.3273, "step": 10265 }, { "epoch": 0.6977850251392852, - "grad_norm": 0.6142020225524902, + "grad_norm": 0.6112393736839294, "learning_rate": 4.563884359287947e-05, - "loss": 1.388, + "loss": 1.3883, "step": 10270 }, { "epoch": 0.698124745209947, - "grad_norm": 0.5905758142471313, + "grad_norm": 0.5904213190078735, "learning_rate": 4.5636720342437834e-05, - "loss": 1.2479, + "loss": 1.2481, "step": 10275 }, { "epoch": 0.6984644652806088, - "grad_norm": 0.7039254307746887, + "grad_norm": 0.705613374710083, "learning_rate": 4.56345970919962e-05, - "loss": 1.3374, + "loss": 1.3381, "step": 10280 }, { "epoch": 0.6988041853512705, - "grad_norm": 0.6263798475265503, + "grad_norm": 0.625584602355957, "learning_rate": 4.563247384155456e-05, - "loss": 1.4462, + "loss": 1.4468, "step": 10285 }, { "epoch": 0.6991439054219323, - "grad_norm": 0.6036354899406433, + "grad_norm": 0.6043965816497803, "learning_rate": 4.5630350591112926e-05, - "loss": 1.4544, + "loss": 1.4551, "step": 10290 }, { "epoch": 0.6994836254925941, - "grad_norm": 0.7210366725921631, + "grad_norm": 0.733506441116333, "learning_rate": 4.562822734067129e-05, - "loss": 1.253, + "loss": 1.2532, "step": 10295 }, { "epoch": 0.6998233455632559, - "grad_norm": 0.5882965326309204, + "grad_norm": 0.5869718194007874, "learning_rate": 4.5626104090229654e-05, "loss": 1.3674, "step": 10300 }, { "epoch": 0.7001630656339176, - "grad_norm": 0.6516746878623962, + "grad_norm": 0.6503603458404541, "learning_rate": 4.562398083978802e-05, - "loss": 1.3727, + "loss": 1.3724, "step": 10305 }, { "epoch": 0.7005027857045795, - "grad_norm": 0.6511339545249939, + "grad_norm": 0.6521415114402771, "learning_rate": 4.5621857589346375e-05, - "loss": 1.337, + "loss": 1.3371, "step": 10310 }, { "epoch": 0.7008425057752412, - "grad_norm": 0.7368311882019043, + "grad_norm": 0.7386031150817871, "learning_rate": 4.5619734338904746e-05, - "loss": 1.3485, + "loss": 1.3482, "step": 10315 }, { "epoch": 0.701182225845903, - "grad_norm": 0.6463493704795837, + "grad_norm": 0.6511077880859375, "learning_rate": 4.561761108846311e-05, - "loss": 1.3034, + "loss": 1.3036, "step": 10320 }, { "epoch": 0.7015219459165648, - "grad_norm": 0.6115689873695374, + "grad_norm": 0.6134814023971558, "learning_rate": 4.561548783802147e-05, - "loss": 1.3668, + "loss": 1.3666, "step": 10325 }, { "epoch": 0.7018616659872265, - "grad_norm": 0.6443473100662231, + "grad_norm": 0.6482341885566711, "learning_rate": 4.561336458757984e-05, - "loss": 1.3558, + "loss": 1.3559, "step": 10330 }, { "epoch": 0.7022013860578883, - "grad_norm": 0.677765429019928, + "grad_norm": 0.6766654849052429, "learning_rate": 4.56112413371382e-05, - "loss": 1.3628, + "loss": 1.3626, "step": 10335 }, { "epoch": 0.70254110612855, - "grad_norm": 0.6149821877479553, + "grad_norm": 0.6150861978530884, "learning_rate": 4.560911808669656e-05, "loss": 1.3836, "step": 10340 }, { "epoch": 0.7028808261992119, - "grad_norm": 0.6386359333992004, + "grad_norm": 0.6367886066436768, "learning_rate": 4.560699483625493e-05, - "loss": 1.395, + "loss": 1.3957, "step": 10345 }, { "epoch": 0.7032205462698736, - "grad_norm": 0.605919361114502, + "grad_norm": 0.6064703464508057, "learning_rate": 4.5604871585813294e-05, - "loss": 1.462, + "loss": 1.4613, "step": 10350 }, { "epoch": 0.7035602663405354, - "grad_norm": 0.6383451819419861, + "grad_norm": 0.6364635825157166, "learning_rate": 4.560274833537165e-05, - "loss": 1.2691, + "loss": 1.2683, "step": 10355 }, { "epoch": 0.7038999864111972, - "grad_norm": 0.7361869215965271, + "grad_norm": 0.736740231513977, "learning_rate": 4.560062508493002e-05, - "loss": 1.3405, + "loss": 1.3404, "step": 10360 }, { "epoch": 0.704239706481859, - "grad_norm": 0.8459182977676392, + "grad_norm": 0.8430366516113281, "learning_rate": 4.5598501834488386e-05, - "loss": 1.4351, + "loss": 1.4343, "step": 10365 }, { "epoch": 0.7045794265525207, - "grad_norm": 0.6890857219696045, + "grad_norm": 0.6905868649482727, "learning_rate": 4.559637858404674e-05, - "loss": 1.3238, + "loss": 1.3237, "step": 10370 }, { "epoch": 0.7049191466231824, - "grad_norm": 0.6507689356803894, + "grad_norm": 0.6493223905563354, "learning_rate": 4.5594255333605114e-05, - "loss": 1.3762, + "loss": 1.3765, "step": 10375 }, { "epoch": 0.7052588666938443, - "grad_norm": 0.7193464040756226, + "grad_norm": 0.7178914546966553, "learning_rate": 4.559213208316348e-05, - "loss": 1.3918, + "loss": 1.3919, "step": 10380 }, { "epoch": 0.705598586764506, - "grad_norm": 0.6927496194839478, + "grad_norm": 0.6919376254081726, "learning_rate": 4.5590008832721835e-05, - "loss": 1.4717, + "loss": 1.4721, "step": 10385 }, { "epoch": 0.7059383068351678, - "grad_norm": 0.6915751695632935, + "grad_norm": 0.693134605884552, "learning_rate": 4.5587885582280206e-05, - "loss": 1.3379, + "loss": 1.3388, "step": 10390 }, { "epoch": 0.7062780269058296, - "grad_norm": 0.7506421208381653, + "grad_norm": 0.7530926465988159, "learning_rate": 4.558576233183856e-05, - "loss": 1.3583, + "loss": 1.3578, "step": 10395 }, { "epoch": 0.7066177469764914, - "grad_norm": 0.6764235496520996, + "grad_norm": 0.6762392520904541, "learning_rate": 4.5583639081396934e-05, - "loss": 1.3166, + "loss": 1.3165, "step": 10400 }, { "epoch": 0.7069574670471531, - "grad_norm": 0.6450632810592651, + "grad_norm": 0.6448944211006165, "learning_rate": 4.55815158309553e-05, - "loss": 1.2878, + "loss": 1.2882, "step": 10405 }, { "epoch": 0.707297187117815, - "grad_norm": 0.6657335758209229, + "grad_norm": 0.6622769236564636, "learning_rate": 4.5579392580513655e-05, - "loss": 1.4115, + "loss": 1.4111, "step": 10410 }, { "epoch": 0.7076369071884767, - "grad_norm": 0.6178698539733887, + "grad_norm": 0.6176264882087708, "learning_rate": 4.5577269330072026e-05, - "loss": 1.3599, + "loss": 1.3603, "step": 10415 }, { "epoch": 0.7079766272591385, - "grad_norm": 0.6750048398971558, + "grad_norm": 0.6757957339286804, "learning_rate": 4.557514607963039e-05, - "loss": 1.3979, + "loss": 1.3974, "step": 10420 }, { "epoch": 0.7083163473298002, - "grad_norm": 0.6003528237342834, + "grad_norm": 0.5989627838134766, "learning_rate": 4.557302282918875e-05, "loss": 1.4673, "step": 10425 }, { "epoch": 0.708656067400462, - "grad_norm": 0.6799806356430054, + "grad_norm": 0.6779143214225769, "learning_rate": 4.557089957874712e-05, - "loss": 1.3712, + "loss": 1.3709, "step": 10430 }, { "epoch": 0.7089957874711238, - "grad_norm": 0.6539444327354431, + "grad_norm": 0.6558088660240173, "learning_rate": 4.556877632830548e-05, "loss": 1.4233, "step": 10435 }, { "epoch": 0.7093355075417855, - "grad_norm": 0.7056052684783936, + "grad_norm": 0.7029901146888733, "learning_rate": 4.556665307786384e-05, - "loss": 1.4099, + "loss": 1.4095, "step": 10440 }, { "epoch": 0.7096752276124474, - "grad_norm": 0.6594746112823486, + "grad_norm": 0.6598241925239563, "learning_rate": 4.556452982742221e-05, - "loss": 1.4822, + "loss": 1.4823, "step": 10445 }, { "epoch": 0.7100149476831091, - "grad_norm": 0.636754035949707, + "grad_norm": 0.6395412087440491, "learning_rate": 4.5562406576980574e-05, - "loss": 1.3653, + "loss": 1.3655, "step": 10450 }, { "epoch": 0.7103546677537709, - "grad_norm": 0.6520203948020935, + "grad_norm": 0.6508654952049255, "learning_rate": 4.556028332653893e-05, - "loss": 1.2665, + "loss": 1.2663, "step": 10455 }, { "epoch": 0.7106943878244326, - "grad_norm": 0.6684704422950745, + "grad_norm": 0.6673070788383484, "learning_rate": 4.55581600760973e-05, - "loss": 1.4546, + "loss": 1.455, "step": 10460 }, { "epoch": 0.7110341078950945, - "grad_norm": 0.7014590501785278, + "grad_norm": 0.7030982971191406, "learning_rate": 4.555603682565566e-05, - "loss": 1.3714, + "loss": 1.3712, "step": 10465 }, { "epoch": 0.7113738279657562, - "grad_norm": 0.677331805229187, + "grad_norm": 0.676998496055603, "learning_rate": 4.555391357521402e-05, - "loss": 1.4425, + "loss": 1.4422, "step": 10470 }, { "epoch": 0.711713548036418, - "grad_norm": 0.6193029880523682, + "grad_norm": 0.6186684370040894, "learning_rate": 4.5551790324772394e-05, "loss": 1.4065, "step": 10475 }, { "epoch": 0.7120532681070798, - "grad_norm": 0.8266158699989319, + "grad_norm": 0.8217729330062866, "learning_rate": 4.554966707433075e-05, - "loss": 1.3732, + "loss": 1.3737, "step": 10480 }, { "epoch": 0.7123929881777415, - "grad_norm": 0.6649559140205383, + "grad_norm": 0.6646506190299988, "learning_rate": 4.5547543823889115e-05, - "loss": 1.2375, + "loss": 1.2376, "step": 10485 }, { "epoch": 0.7127327082484033, - "grad_norm": 0.6089277863502502, + "grad_norm": 0.6087135076522827, "learning_rate": 4.5545420573447486e-05, - "loss": 1.3748, + "loss": 1.3749, "step": 10490 }, { "epoch": 0.7130724283190651, - "grad_norm": 0.7148370146751404, + "grad_norm": 0.7140599489212036, "learning_rate": 4.554329732300584e-05, - "loss": 1.2781, + "loss": 1.278, "step": 10495 }, { "epoch": 0.7134121483897269, - "grad_norm": 0.6397234201431274, + "grad_norm": 0.6406027674674988, "learning_rate": 4.554117407256421e-05, - "loss": 1.305, + "loss": 1.3056, "step": 10500 }, { "epoch": 0.7137518684603886, - "grad_norm": 0.6898680925369263, + "grad_norm": 0.6885027289390564, "learning_rate": 4.553905082212258e-05, - "loss": 1.4395, + "loss": 1.4386, "step": 10505 }, { "epoch": 0.7140915885310504, - "grad_norm": 0.6065067648887634, + "grad_norm": 0.6064291000366211, "learning_rate": 4.5536927571680935e-05, - "loss": 1.4486, + "loss": 1.4494, "step": 10510 }, { "epoch": 0.7144313086017122, - "grad_norm": 0.6090561151504517, + "grad_norm": 0.6101377606391907, "learning_rate": 4.55348043212393e-05, - "loss": 1.4499, + "loss": 1.4502, "step": 10515 }, { "epoch": 0.714771028672374, - "grad_norm": 0.7384569048881531, + "grad_norm": 0.7396570444107056, "learning_rate": 4.553268107079767e-05, - "loss": 1.3616, + "loss": 1.3617, "step": 10520 }, { "epoch": 0.7151107487430357, - "grad_norm": 0.6976181268692017, + "grad_norm": 0.6990511417388916, "learning_rate": 4.553055782035603e-05, - "loss": 1.3167, + "loss": 1.3166, "step": 10525 }, { "epoch": 0.7154504688136976, - "grad_norm": 0.6307021975517273, + "grad_norm": 0.6317633986473083, "learning_rate": 4.552843456991439e-05, - "loss": 1.388, + "loss": 1.3877, "step": 10530 }, { "epoch": 0.7157901888843593, - "grad_norm": 0.4543355703353882, + "grad_norm": 0.4551231563091278, "learning_rate": 4.5526311319472755e-05, "loss": 1.2206, "step": 10535 }, { "epoch": 0.716129908955021, - "grad_norm": 0.6298014521598816, + "grad_norm": 0.6331966519355774, "learning_rate": 4.552418806903112e-05, - "loss": 1.3189, + "loss": 1.3185, "step": 10540 }, { "epoch": 0.7164696290256828, - "grad_norm": 0.7497329115867615, + "grad_norm": 0.7497586011886597, "learning_rate": 4.552206481858948e-05, - "loss": 1.3096, + "loss": 1.3099, "step": 10545 }, { "epoch": 0.7168093490963446, - "grad_norm": 0.7202334403991699, + "grad_norm": 0.7156761884689331, "learning_rate": 4.551994156814785e-05, - "loss": 1.3713, + "loss": 1.3706, "step": 10550 }, { "epoch": 0.7171490691670064, - "grad_norm": 0.6543573141098022, + "grad_norm": 0.6575216054916382, "learning_rate": 4.551781831770621e-05, - "loss": 1.3518, + "loss": 1.3515, "step": 10555 }, { "epoch": 0.7174887892376681, - "grad_norm": 0.6194443106651306, + "grad_norm": 0.6201654076576233, "learning_rate": 4.5515695067264575e-05, - "loss": 1.4825, + "loss": 1.4822, "step": 10560 }, { "epoch": 0.71782850930833, - "grad_norm": 0.6676272749900818, + "grad_norm": 0.6678162217140198, "learning_rate": 4.551357181682294e-05, - "loss": 1.4397, + "loss": 1.4394, "step": 10565 }, { "epoch": 0.7181682293789917, - "grad_norm": 0.6669387221336365, + "grad_norm": 0.6674899458885193, "learning_rate": 4.55114485663813e-05, - "loss": 1.4975, + "loss": 1.4971, "step": 10570 }, { "epoch": 0.7185079494496535, - "grad_norm": 0.7604942321777344, + "grad_norm": 0.7626459002494812, "learning_rate": 4.550932531593967e-05, - "loss": 1.4554, + "loss": 1.455, "step": 10575 }, { "epoch": 0.7188476695203153, - "grad_norm": 0.7051596641540527, + "grad_norm": 0.7065680027008057, "learning_rate": 4.550720206549803e-05, - "loss": 1.4082, + "loss": 1.4089, "step": 10580 }, { "epoch": 0.719187389590977, - "grad_norm": 0.7017998695373535, + "grad_norm": 0.6959729194641113, "learning_rate": 4.5505078815056395e-05, - "loss": 1.4139, + "loss": 1.4132, "step": 10585 }, { "epoch": 0.7195271096616388, - "grad_norm": 0.6412633657455444, + "grad_norm": 0.641036331653595, "learning_rate": 4.550295556461476e-05, - "loss": 1.3885, + "loss": 1.3886, "step": 10590 }, { "epoch": 0.7198668297323005, - "grad_norm": 0.6595339179039001, + "grad_norm": 0.664600670337677, "learning_rate": 4.550083231417312e-05, - "loss": 1.2869, + "loss": 1.2873, "step": 10595 }, { "epoch": 0.7202065498029624, - "grad_norm": 0.6425991654396057, + "grad_norm": 0.6408098340034485, "learning_rate": 4.549870906373149e-05, "loss": 1.3251, "step": 10600 }, { "epoch": 0.7205462698736241, - "grad_norm": 0.6715484261512756, + "grad_norm": 0.6610189080238342, "learning_rate": 4.549658581328985e-05, - "loss": 1.3397, + "loss": 1.3402, "step": 10605 }, { "epoch": 0.7208859899442859, - "grad_norm": 0.6597923636436462, + "grad_norm": 0.6628747582435608, "learning_rate": 4.5494462562848215e-05, "loss": 1.3952, "step": 10610 }, { "epoch": 0.7212257100149477, - "grad_norm": 0.704318106174469, + "grad_norm": 0.7054851055145264, "learning_rate": 4.549233931240658e-05, "loss": 1.3744, "step": 10615 }, { "epoch": 0.7215654300856095, - "grad_norm": 0.7401502132415771, + "grad_norm": 0.7391473054885864, "learning_rate": 4.549021606196494e-05, - "loss": 1.417, + "loss": 1.4177, "step": 10620 }, { "epoch": 0.7219051501562712, - "grad_norm": 0.661598801612854, + "grad_norm": 0.6622463464736938, "learning_rate": 4.548809281152331e-05, - "loss": 1.426, + "loss": 1.4256, "step": 10625 }, { "epoch": 0.722244870226933, - "grad_norm": 0.5903460383415222, + "grad_norm": 0.5930067300796509, "learning_rate": 4.548596956108167e-05, - "loss": 1.4073, + "loss": 1.4075, "step": 10630 }, { "epoch": 0.7225845902975948, - "grad_norm": 0.628123939037323, + "grad_norm": 0.6282181143760681, "learning_rate": 4.5483846310640035e-05, - "loss": 1.3774, + "loss": 1.3769, "step": 10635 }, { "epoch": 0.7229243103682566, - "grad_norm": 0.5890015363693237, + "grad_norm": 0.5910596251487732, "learning_rate": 4.54817230601984e-05, - "loss": 1.2802, + "loss": 1.2801, "step": 10640 }, { "epoch": 0.7232640304389183, - "grad_norm": 0.7097578048706055, + "grad_norm": 0.7116647958755493, "learning_rate": 4.547959980975676e-05, - "loss": 1.2938, + "loss": 1.2937, "step": 10645 }, { "epoch": 0.7236037505095801, - "grad_norm": 0.6969096064567566, + "grad_norm": 0.6981189250946045, "learning_rate": 4.547747655931513e-05, - "loss": 1.3474, + "loss": 1.3482, "step": 10650 }, { "epoch": 0.7239434705802419, - "grad_norm": 0.6347588300704956, + "grad_norm": 0.6340263485908508, "learning_rate": 4.547535330887349e-05, - "loss": 1.3791, + "loss": 1.379, "step": 10655 }, { "epoch": 0.7242831906509036, - "grad_norm": 0.7243689894676208, + "grad_norm": 0.723906934261322, "learning_rate": 4.5473230058431855e-05, - "loss": 1.3636, + "loss": 1.3648, "step": 10660 }, { "epoch": 0.7246229107215655, - "grad_norm": 0.6821337342262268, + "grad_norm": 0.6840580701828003, "learning_rate": 4.547110680799021e-05, - "loss": 1.4354, + "loss": 1.4356, "step": 10665 }, { "epoch": 0.7249626307922272, - "grad_norm": 0.6853811144828796, + "grad_norm": 0.6821880340576172, "learning_rate": 4.546898355754858e-05, - "loss": 1.31, + "loss": 1.3101, "step": 10670 }, { "epoch": 0.725302350862889, - "grad_norm": 0.6670143604278564, + "grad_norm": 0.6662035584449768, "learning_rate": 4.546686030710695e-05, - "loss": 1.4195, + "loss": 1.4198, "step": 10675 }, { "epoch": 0.7256420709335507, - "grad_norm": 0.711848258972168, + "grad_norm": 0.7115731239318848, "learning_rate": 4.5464737056665305e-05, - "loss": 1.2501, + "loss": 1.2509, "step": 10680 }, { "epoch": 0.7259817910042126, - "grad_norm": 0.7048289775848389, + "grad_norm": 0.7010862231254578, "learning_rate": 4.5462613806223675e-05, - "loss": 1.3292, + "loss": 1.3293, "step": 10685 }, { "epoch": 0.7263215110748743, - "grad_norm": 0.6657536029815674, + "grad_norm": 0.6642447113990784, "learning_rate": 4.546049055578204e-05, - "loss": 1.2919, + "loss": 1.2926, "step": 10690 }, { "epoch": 0.726661231145536, - "grad_norm": 0.6005533933639526, + "grad_norm": 0.6001885533332825, "learning_rate": 4.5458367305340397e-05, "loss": 1.3737, "step": 10695 }, { "epoch": 0.7270009512161979, - "grad_norm": 0.6253713965415955, + "grad_norm": 0.6260944604873657, "learning_rate": 4.545624405489877e-05, - "loss": 1.4649, + "loss": 1.465, "step": 10700 }, { "epoch": 0.7273406712868596, - "grad_norm": 0.7332903146743774, + "grad_norm": 0.7355566620826721, "learning_rate": 4.545412080445713e-05, - "loss": 1.4178, + "loss": 1.4179, "step": 10705 }, { "epoch": 0.7276803913575214, - "grad_norm": 0.7133682370185852, + "grad_norm": 0.7166079878807068, "learning_rate": 4.545199755401549e-05, - "loss": 1.3826, + "loss": 1.3835, "step": 10710 }, { "epoch": 0.7280201114281831, - "grad_norm": 0.6334335803985596, + "grad_norm": 0.63198322057724, "learning_rate": 4.544987430357386e-05, - "loss": 1.3523, + "loss": 1.3525, "step": 10715 }, { "epoch": 0.728359831498845, - "grad_norm": 0.6501543521881104, + "grad_norm": 0.6585511565208435, "learning_rate": 4.544775105313222e-05, - "loss": 1.3774, + "loss": 1.3772, "step": 10720 }, { "epoch": 0.7286995515695067, - "grad_norm": 0.6430721282958984, + "grad_norm": 0.6431351900100708, "learning_rate": 4.544562780269058e-05, - "loss": 1.3669, + "loss": 1.3672, "step": 10725 }, { "epoch": 0.7290392716401685, - "grad_norm": 0.6951696872711182, + "grad_norm": 0.6915704011917114, "learning_rate": 4.544350455224895e-05, - "loss": 1.3315, + "loss": 1.3311, "step": 10730 }, { "epoch": 0.7293789917108303, - "grad_norm": 0.6585467457771301, + "grad_norm": 0.6578543186187744, "learning_rate": 4.5441381301807315e-05, - "loss": 1.3278, + "loss": 1.3273, "step": 10735 }, { "epoch": 0.7297187117814921, - "grad_norm": 0.6286793351173401, + "grad_norm": 0.6241704225540161, "learning_rate": 4.543925805136568e-05, - "loss": 1.442, + "loss": 1.4423, "step": 10740 }, { "epoch": 0.7300584318521538, - "grad_norm": 0.6113405227661133, + "grad_norm": 0.6110124588012695, "learning_rate": 4.5437134800924043e-05, - "loss": 1.3904, + "loss": 1.3893, "step": 10745 }, { "epoch": 0.7303981519228157, - "grad_norm": 0.6477635502815247, + "grad_norm": 0.6513340473175049, "learning_rate": 4.54350115504824e-05, - "loss": 1.285, + "loss": 1.2856, "step": 10750 }, { "epoch": 0.7307378719934774, - "grad_norm": 0.6457996368408203, + "grad_norm": 0.6451296806335449, "learning_rate": 4.543288830004077e-05, "loss": 1.4113, "step": 10755 }, { "epoch": 0.7310775920641391, - "grad_norm": 0.6372414827346802, + "grad_norm": 0.6378304958343506, "learning_rate": 4.5430765049599135e-05, - "loss": 1.384, + "loss": 1.3845, "step": 10760 }, { "epoch": 0.7314173121348009, - "grad_norm": 0.6154240369796753, + "grad_norm": 0.6160060167312622, "learning_rate": 4.542864179915749e-05, - "loss": 1.3949, + "loss": 1.3952, "step": 10765 }, { "epoch": 0.7317570322054627, - "grad_norm": 0.670173168182373, + "grad_norm": 0.6640422344207764, "learning_rate": 4.5426518548715863e-05, - "loss": 1.2533, + "loss": 1.2531, "step": 10770 }, { "epoch": 0.7320967522761245, - "grad_norm": 0.6684125661849976, + "grad_norm": 0.6686874628067017, "learning_rate": 4.542439529827423e-05, - "loss": 1.4462, + "loss": 1.4461, "step": 10775 }, { "epoch": 0.7324364723467862, - "grad_norm": 0.5796847343444824, + "grad_norm": 0.5797076225280762, "learning_rate": 4.5422272047832585e-05, - "loss": 1.386, + "loss": 1.3857, "step": 10780 }, { "epoch": 0.7327761924174481, - "grad_norm": 0.7200872898101807, + "grad_norm": 0.718817412853241, "learning_rate": 4.5420148797390955e-05, - "loss": 1.3339, + "loss": 1.3342, "step": 10785 }, { "epoch": 0.7331159124881098, - "grad_norm": 0.6216338276863098, + "grad_norm": 0.6188486218452454, "learning_rate": 4.541802554694932e-05, - "loss": 1.3574, + "loss": 1.3581, "step": 10790 }, { "epoch": 0.7334556325587716, - "grad_norm": 0.6423646807670593, + "grad_norm": 0.6419550776481628, "learning_rate": 4.541590229650768e-05, - "loss": 1.3184, + "loss": 1.3189, "step": 10795 }, { "epoch": 0.7337953526294333, - "grad_norm": 0.7440258860588074, + "grad_norm": 0.7423009872436523, "learning_rate": 4.541377904606605e-05, - "loss": 1.3108, + "loss": 1.311, "step": 10800 }, { "epoch": 0.7341350727000951, - "grad_norm": 0.6222683191299438, + "grad_norm": 0.6231959462165833, "learning_rate": 4.541165579562441e-05, - "loss": 1.3767, + "loss": 1.3761, "step": 10805 }, { "epoch": 0.7344747927707569, - "grad_norm": 0.6356257796287537, + "grad_norm": 0.6309270858764648, "learning_rate": 4.540953254518277e-05, - "loss": 1.3499, + "loss": 1.3498, "step": 10810 }, { "epoch": 0.7348145128414186, - "grad_norm": 0.6644449830055237, + "grad_norm": 0.6639755368232727, "learning_rate": 4.540740929474114e-05, - "loss": 1.3655, + "loss": 1.3657, "step": 10815 }, { "epoch": 0.7351542329120805, - "grad_norm": 0.6641932129859924, + "grad_norm": 0.6616642475128174, "learning_rate": 4.54052860442995e-05, "loss": 1.394, "step": 10820 }, { "epoch": 0.7354939529827422, - "grad_norm": 0.6184588074684143, + "grad_norm": 0.6215910315513611, "learning_rate": 4.540316279385786e-05, - "loss": 1.4549, + "loss": 1.4554, "step": 10825 }, { "epoch": 0.735833673053404, - "grad_norm": 0.5978854894638062, + "grad_norm": 0.5965304374694824, "learning_rate": 4.540103954341623e-05, - "loss": 1.4342, + "loss": 1.4341, "step": 10830 }, { "epoch": 0.7361733931240658, - "grad_norm": 0.6878186464309692, + "grad_norm": 0.6912020444869995, "learning_rate": 4.539891629297459e-05, - "loss": 1.3338, + "loss": 1.3343, "step": 10835 }, { "epoch": 0.7365131131947276, - "grad_norm": 0.5620693564414978, + "grad_norm": 0.5654614567756653, "learning_rate": 4.539679304253295e-05, - "loss": 1.3268, + "loss": 1.3275, "step": 10840 }, { "epoch": 0.7368528332653893, - "grad_norm": 0.8356578946113586, + "grad_norm": 0.8462305665016174, "learning_rate": 4.5394669792091323e-05, - "loss": 1.3955, + "loss": 1.3959, "step": 10845 }, { "epoch": 0.737192553336051, - "grad_norm": 0.7349069118499756, + "grad_norm": 0.7331716418266296, "learning_rate": 4.539254654164968e-05, - "loss": 1.4559, + "loss": 1.4552, "step": 10850 }, { "epoch": 0.7375322734067129, - "grad_norm": 0.6967862248420715, + "grad_norm": 0.6954794526100159, "learning_rate": 4.5390423291208045e-05, - "loss": 1.2999, + "loss": 1.3004, "step": 10855 }, { "epoch": 0.7378719934773746, - "grad_norm": 0.6220554709434509, + "grad_norm": 0.6251990795135498, "learning_rate": 4.5388300040766415e-05, - "loss": 1.4476, + "loss": 1.4478, "step": 10860 }, { "epoch": 0.7382117135480364, - "grad_norm": 0.5988366007804871, + "grad_norm": 0.599622905254364, "learning_rate": 4.538617679032477e-05, - "loss": 1.3814, + "loss": 1.3815, "step": 10865 }, { "epoch": 0.7385514336186982, - "grad_norm": 0.667321503162384, + "grad_norm": 0.6646989583969116, "learning_rate": 4.538405353988314e-05, - "loss": 1.3919, + "loss": 1.3928, "step": 10870 }, { "epoch": 0.73889115368936, - "grad_norm": 0.7849789261817932, + "grad_norm": 0.7846508026123047, "learning_rate": 4.538193028944151e-05, - "loss": 1.3866, + "loss": 1.3863, "step": 10875 }, { "epoch": 0.7392308737600217, - "grad_norm": 0.6790648102760315, + "grad_norm": 0.6792385578155518, "learning_rate": 4.5379807038999865e-05, - "loss": 1.4061, + "loss": 1.4058, "step": 10880 }, { "epoch": 0.7395705938306835, - "grad_norm": 0.7118006348609924, + "grad_norm": 0.7135400772094727, "learning_rate": 4.537768378855823e-05, - "loss": 1.3913, + "loss": 1.3919, "step": 10885 }, { "epoch": 0.7399103139013453, - "grad_norm": 0.6077026724815369, + "grad_norm": 0.6083499193191528, "learning_rate": 4.537556053811659e-05, - "loss": 1.3604, + "loss": 1.3601, "step": 10890 }, { "epoch": 0.7402500339720071, - "grad_norm": 0.641882061958313, + "grad_norm": 0.6423265933990479, "learning_rate": 4.537343728767496e-05, - "loss": 1.3833, + "loss": 1.3834, "step": 10895 }, { "epoch": 0.7405897540426688, - "grad_norm": 0.6631798148155212, + "grad_norm": 0.6602912545204163, "learning_rate": 4.537131403723332e-05, - "loss": 1.3602, + "loss": 1.3597, "step": 10900 }, { "epoch": 0.7409294741133307, - "grad_norm": 0.6099927425384521, + "grad_norm": 0.6107264757156372, "learning_rate": 4.5369190786791685e-05, - "loss": 1.2839, + "loss": 1.2835, "step": 10905 }, { "epoch": 0.7412691941839924, - "grad_norm": 0.6890628933906555, + "grad_norm": 0.6893656849861145, "learning_rate": 4.536706753635005e-05, - "loss": 1.3124, + "loss": 1.3123, "step": 10910 }, { "epoch": 0.7416089142546541, - "grad_norm": 0.678059995174408, + "grad_norm": 0.6777758002281189, "learning_rate": 4.536494428590841e-05, - "loss": 1.3665, + "loss": 1.3659, "step": 10915 }, { "epoch": 0.741948634325316, - "grad_norm": 0.5877015590667725, + "grad_norm": 0.5877455472946167, "learning_rate": 4.536282103546678e-05, - "loss": 1.3879, + "loss": 1.388, "step": 10920 }, { "epoch": 0.7422883543959777, - "grad_norm": 0.7351785898208618, + "grad_norm": 0.7320488095283508, "learning_rate": 4.536069778502514e-05, - "loss": 1.4093, + "loss": 1.41, "step": 10925 }, { "epoch": 0.7426280744666395, - "grad_norm": 0.5812264680862427, + "grad_norm": 0.581148624420166, "learning_rate": 4.5358574534583505e-05, - "loss": 1.3514, + "loss": 1.3511, "step": 10930 }, { "epoch": 0.7429677945373012, - "grad_norm": 0.7274196743965149, + "grad_norm": 0.7245442867279053, "learning_rate": 4.535645128414187e-05, - "loss": 1.4067, + "loss": 1.4069, "step": 10935 }, { "epoch": 0.7433075146079631, - "grad_norm": 0.7358936071395874, + "grad_norm": 0.731349527835846, "learning_rate": 4.535432803370023e-05, - "loss": 1.3441, + "loss": 1.3442, "step": 10940 }, { "epoch": 0.7436472346786248, - "grad_norm": 0.5588825941085815, + "grad_norm": 0.5593454837799072, "learning_rate": 4.53522047832586e-05, - "loss": 1.3177, + "loss": 1.3176, "step": 10945 }, { "epoch": 0.7439869547492866, - "grad_norm": 0.6864970922470093, + "grad_norm": 0.6847903728485107, "learning_rate": 4.535008153281696e-05, - "loss": 1.4194, + "loss": 1.4191, "step": 10950 }, { "epoch": 0.7443266748199484, - "grad_norm": 0.6844140887260437, + "grad_norm": 0.6952393054962158, "learning_rate": 4.5347958282375325e-05, - "loss": 1.3596, + "loss": 1.3601, "step": 10955 }, { "epoch": 0.7446663948906102, - "grad_norm": 0.628604531288147, + "grad_norm": 0.6241804361343384, "learning_rate": 4.534583503193369e-05, - "loss": 1.3307, + "loss": 1.3303, "step": 10960 }, { "epoch": 0.7450061149612719, - "grad_norm": 0.6032277941703796, + "grad_norm": 0.5958861708641052, "learning_rate": 4.534371178149205e-05, - "loss": 1.3565, + "loss": 1.356, "step": 10965 }, { "epoch": 0.7453458350319336, - "grad_norm": 0.6987267136573792, + "grad_norm": 0.6996466517448425, "learning_rate": 4.534158853105042e-05, - "loss": 1.3718, + "loss": 1.3723, "step": 10970 }, { "epoch": 0.7456855551025955, - "grad_norm": 0.6521071791648865, + "grad_norm": 0.6502295136451721, "learning_rate": 4.533946528060878e-05, - "loss": 1.2881, + "loss": 1.2878, "step": 10975 }, { "epoch": 0.7460252751732572, - "grad_norm": 0.7675398588180542, + "grad_norm": 0.7723934054374695, "learning_rate": 4.5337342030167145e-05, - "loss": 1.3546, + "loss": 1.3538, "step": 10980 }, { "epoch": 0.746364995243919, - "grad_norm": 0.697261393070221, + "grad_norm": 0.6962344646453857, "learning_rate": 4.533521877972551e-05, - "loss": 1.4466, + "loss": 1.4465, "step": 10985 }, { "epoch": 0.7467047153145808, - "grad_norm": 0.6618101000785828, + "grad_norm": 0.6601017117500305, "learning_rate": 4.533309552928387e-05, - "loss": 1.3422, + "loss": 1.342, "step": 10990 }, { "epoch": 0.7470444353852426, - "grad_norm": 0.6806418299674988, + "grad_norm": 0.6849114894866943, "learning_rate": 4.533097227884224e-05, - "loss": 1.2398, + "loss": 1.2404, "step": 10995 }, { "epoch": 0.7473841554559043, - "grad_norm": 0.6811333894729614, + "grad_norm": 0.6837900280952454, "learning_rate": 4.53288490284006e-05, - "loss": 1.3156, + "loss": 1.3162, "step": 11000 }, { "epoch": 0.7477238755265662, - "grad_norm": 0.6440121531486511, + "grad_norm": 0.6448536515235901, "learning_rate": 4.5326725777958965e-05, - "loss": 1.3333, + "loss": 1.3343, "step": 11005 }, { "epoch": 0.7480635955972279, - "grad_norm": 0.6451594233512878, + "grad_norm": 0.6454798579216003, "learning_rate": 4.532460252751733e-05, - "loss": 1.3979, + "loss": 1.3975, "step": 11010 }, { "epoch": 0.7484033156678896, - "grad_norm": 0.6549493670463562, + "grad_norm": 0.6622368693351746, "learning_rate": 4.532247927707569e-05, - "loss": 1.2734, + "loss": 1.2728, "step": 11015 }, { "epoch": 0.7487430357385514, - "grad_norm": 0.6674458384513855, + "grad_norm": 0.6631676554679871, "learning_rate": 4.532035602663405e-05, - "loss": 1.3546, + "loss": 1.3548, "step": 11020 }, { "epoch": 0.7490827558092132, - "grad_norm": 0.645881712436676, + "grad_norm": 0.6411242485046387, "learning_rate": 4.531823277619242e-05, - "loss": 1.3222, + "loss": 1.3224, "step": 11025 }, { "epoch": 0.749422475879875, - "grad_norm": 0.7385879158973694, + "grad_norm": 0.7421648502349854, "learning_rate": 4.5316109525750785e-05, - "loss": 1.3321, + "loss": 1.3323, "step": 11030 }, { "epoch": 0.7497621959505367, - "grad_norm": 0.6517509818077087, + "grad_norm": 0.6513067483901978, "learning_rate": 4.531398627530914e-05, - "loss": 1.3645, + "loss": 1.365, "step": 11035 }, { "epoch": 0.7501019160211986, - "grad_norm": 0.7039206624031067, + "grad_norm": 0.7099272608757019, "learning_rate": 4.531186302486751e-05, - "loss": 1.4079, + "loss": 1.4076, "step": 11040 }, { "epoch": 0.7504416360918603, - "grad_norm": 0.6504554748535156, + "grad_norm": 0.652755081653595, "learning_rate": 4.530973977442588e-05, - "loss": 1.2961, + "loss": 1.2965, "step": 11045 }, { "epoch": 0.7507813561625221, - "grad_norm": 0.6780588626861572, + "grad_norm": 0.6693432331085205, "learning_rate": 4.5307616523984234e-05, - "loss": 1.428, + "loss": 1.4276, "step": 11050 }, { "epoch": 0.7511210762331838, - "grad_norm": 0.718646764755249, + "grad_norm": 0.7202014327049255, "learning_rate": 4.5305493273542605e-05, "loss": 1.2856, "step": 11055 }, { "epoch": 0.7514607963038457, - "grad_norm": 0.7259412407875061, + "grad_norm": 0.7204832434654236, "learning_rate": 4.530337002310097e-05, "loss": 1.4011, "step": 11060 }, { "epoch": 0.7518005163745074, - "grad_norm": 0.6184737682342529, + "grad_norm": 0.6166843175888062, "learning_rate": 4.5301246772659326e-05, - "loss": 1.3844, + "loss": 1.3843, "step": 11065 }, { "epoch": 0.7521402364451691, - "grad_norm": 0.7287435531616211, + "grad_norm": 0.7297067046165466, "learning_rate": 4.52991235222177e-05, - "loss": 1.268, + "loss": 1.2675, "step": 11070 }, { "epoch": 0.752479956515831, - "grad_norm": 0.5963881611824036, + "grad_norm": 0.5976271033287048, "learning_rate": 4.529700027177606e-05, - "loss": 1.3586, + "loss": 1.3582, "step": 11075 }, { "epoch": 0.7528196765864927, - "grad_norm": 0.6301303505897522, + "grad_norm": 0.6300629377365112, "learning_rate": 4.5294877021334425e-05, - "loss": 1.4247, + "loss": 1.4249, "step": 11080 }, { "epoch": 0.7531593966571545, - "grad_norm": 0.6454952955245972, + "grad_norm": 0.6415272355079651, "learning_rate": 4.529275377089279e-05, - "loss": 1.381, + "loss": 1.3809, "step": 11085 }, { "epoch": 0.7534991167278163, - "grad_norm": 0.6887718439102173, + "grad_norm": 0.6890599131584167, "learning_rate": 4.5290630520451146e-05, - "loss": 1.3288, + "loss": 1.3291, "step": 11090 }, { "epoch": 0.7538388367984781, - "grad_norm": 0.698443591594696, + "grad_norm": 0.6974114179611206, "learning_rate": 4.528850727000952e-05, - "loss": 1.3923, + "loss": 1.3918, "step": 11095 }, { "epoch": 0.7541785568691398, - "grad_norm": 0.6156599521636963, + "grad_norm": 0.6167287230491638, "learning_rate": 4.528638401956788e-05, - "loss": 1.3565, + "loss": 1.3566, "step": 11100 }, { "epoch": 0.7545182769398016, - "grad_norm": 0.6258707642555237, + "grad_norm": 0.6300950050354004, "learning_rate": 4.528426076912624e-05, - "loss": 1.4253, + "loss": 1.4251, "step": 11105 }, { "epoch": 0.7548579970104634, - "grad_norm": 0.748116135597229, + "grad_norm": 0.7475053071975708, "learning_rate": 4.528213751868461e-05, - "loss": 1.3582, + "loss": 1.3579, "step": 11110 }, { "epoch": 0.7551977170811252, - "grad_norm": 0.6185370683670044, + "grad_norm": 0.6139084100723267, "learning_rate": 4.528001426824297e-05, - "loss": 1.3557, + "loss": 1.3553, "step": 11115 }, { "epoch": 0.7555374371517869, - "grad_norm": 0.6756013631820679, + "grad_norm": 0.6925303936004639, "learning_rate": 4.527789101780133e-05, - "loss": 1.3789, + "loss": 1.3787, "step": 11120 }, { "epoch": 0.7558771572224487, - "grad_norm": 0.7011415958404541, + "grad_norm": 0.7046883702278137, "learning_rate": 4.52757677673597e-05, - "loss": 1.4337, + "loss": 1.4343, "step": 11125 }, { "epoch": 0.7562168772931105, - "grad_norm": 0.6777071356773376, + "grad_norm": 0.6794979572296143, "learning_rate": 4.5273644516918065e-05, - "loss": 1.3069, + "loss": 1.3065, "step": 11130 }, { "epoch": 0.7565565973637722, - "grad_norm": 0.5916514992713928, + "grad_norm": 0.5941647887229919, "learning_rate": 4.527152126647642e-05, - "loss": 1.4871, + "loss": 1.4869, "step": 11135 }, { "epoch": 0.756896317434434, - "grad_norm": 0.6710848808288574, + "grad_norm": 0.6628198623657227, "learning_rate": 4.526939801603479e-05, - "loss": 1.3507, + "loss": 1.3503, "step": 11140 }, { "epoch": 0.7572360375050958, - "grad_norm": 0.633577287197113, + "grad_norm": 0.6325341463088989, "learning_rate": 4.526727476559316e-05, - "loss": 1.3479, + "loss": 1.3477, "step": 11145 }, { "epoch": 0.7575757575757576, - "grad_norm": 0.6314858794212341, + "grad_norm": 0.6358028650283813, "learning_rate": 4.5265151515151514e-05, - "loss": 1.4401, + "loss": 1.4398, "step": 11150 }, { "epoch": 0.7579154776464193, - "grad_norm": 0.6551578044891357, + "grad_norm": 0.6528627872467041, "learning_rate": 4.5263028264709885e-05, - "loss": 1.2934, + "loss": 1.2929, "step": 11155 }, { "epoch": 0.7582551977170812, - "grad_norm": 0.7416176795959473, + "grad_norm": 0.7415926456451416, "learning_rate": 4.526090501426825e-05, "loss": 1.4084, "step": 11160 }, { "epoch": 0.7585949177877429, - "grad_norm": 0.6715858578681946, + "grad_norm": 0.6729483604431152, "learning_rate": 4.5258781763826606e-05, - "loss": 1.269, + "loss": 1.2691, "step": 11165 }, { "epoch": 0.7589346378584046, - "grad_norm": 0.6637235879898071, + "grad_norm": 0.6640619039535522, "learning_rate": 4.525665851338498e-05, - "loss": 1.3334, + "loss": 1.333, "step": 11170 }, { "epoch": 0.7592743579290665, - "grad_norm": 0.6463482975959778, + "grad_norm": 0.6433826684951782, "learning_rate": 4.5254535262943334e-05, - "loss": 1.4485, + "loss": 1.4486, "step": 11175 }, { "epoch": 0.7596140779997282, - "grad_norm": 0.7097727656364441, + "grad_norm": 0.7102058529853821, "learning_rate": 4.52524120125017e-05, - "loss": 1.3639, + "loss": 1.3633, "step": 11180 }, { "epoch": 0.75995379807039, - "grad_norm": 0.6772062182426453, + "grad_norm": 0.681615948677063, "learning_rate": 4.525028876206007e-05, - "loss": 1.3756, + "loss": 1.3754, "step": 11185 }, { "epoch": 0.7602935181410517, - "grad_norm": 0.6018824577331543, + "grad_norm": 0.6040261387825012, "learning_rate": 4.5248165511618426e-05, - "loss": 1.3822, + "loss": 1.3824, "step": 11190 }, { "epoch": 0.7606332382117136, - "grad_norm": 0.610094428062439, + "grad_norm": 0.6134982705116272, "learning_rate": 4.524604226117679e-05, - "loss": 1.4147, + "loss": 1.4138, "step": 11195 }, { "epoch": 0.7609729582823753, - "grad_norm": 0.6564494967460632, + "grad_norm": 0.6598483324050903, "learning_rate": 4.524391901073516e-05, - "loss": 1.3623, + "loss": 1.3615, "step": 11200 }, { "epoch": 0.7613126783530371, - "grad_norm": 0.6258195638656616, + "grad_norm": 0.6277610063552856, "learning_rate": 4.524179576029352e-05, - "loss": 1.3278, + "loss": 1.328, "step": 11205 }, { "epoch": 0.7616523984236989, - "grad_norm": 0.5058313012123108, + "grad_norm": 0.5048632621765137, "learning_rate": 4.523967250985188e-05, - "loss": 1.2296, + "loss": 1.2299, "step": 11210 }, { "epoch": 0.7619921184943607, - "grad_norm": 0.6882098317146301, + "grad_norm": 0.6930842995643616, "learning_rate": 4.523754925941025e-05, - "loss": 1.283, + "loss": 1.2831, "step": 11215 }, { "epoch": 0.7623318385650224, - "grad_norm": 0.6522939801216125, + "grad_norm": 0.6508983969688416, "learning_rate": 4.523542600896861e-05, - "loss": 1.3471, + "loss": 1.3469, "step": 11220 }, { "epoch": 0.7626715586356841, - "grad_norm": 0.7013705968856812, + "grad_norm": 0.7002859711647034, "learning_rate": 4.5233302758526974e-05, - "loss": 1.3625, + "loss": 1.3623, "step": 11225 }, { "epoch": 0.763011278706346, - "grad_norm": 0.669908881187439, + "grad_norm": 0.6685367226600647, "learning_rate": 4.5231179508085345e-05, - "loss": 1.4546, + "loss": 1.4551, "step": 11230 }, { "epoch": 0.7633509987770077, - "grad_norm": 0.6184269785881042, + "grad_norm": 0.6189066171646118, "learning_rate": 4.52290562576437e-05, - "loss": 1.4344, + "loss": 1.435, "step": 11235 }, { "epoch": 0.7636907188476695, - "grad_norm": 0.6487805247306824, + "grad_norm": 0.6493113040924072, "learning_rate": 4.5226933007202066e-05, - "loss": 1.4197, + "loss": 1.4188, "step": 11240 }, { "epoch": 0.7640304389183313, - "grad_norm": 0.7183907628059387, + "grad_norm": 0.7224499583244324, "learning_rate": 4.522480975676043e-05, - "loss": 1.268, + "loss": 1.2678, "step": 11245 }, { "epoch": 0.7643701589889931, - "grad_norm": 0.6929051876068115, + "grad_norm": 0.695044755935669, "learning_rate": 4.5222686506318794e-05, - "loss": 1.3469, + "loss": 1.3466, "step": 11250 }, { "epoch": 0.7647098790596548, - "grad_norm": 0.6434996724128723, + "grad_norm": 0.6477655172348022, "learning_rate": 4.522056325587716e-05, - "loss": 1.3583, + "loss": 1.3574, "step": 11255 }, { "epoch": 0.7650495991303167, - "grad_norm": 0.7112532258033752, + "grad_norm": 0.7149754166603088, "learning_rate": 4.521844000543552e-05, - "loss": 1.3904, + "loss": 1.3901, "step": 11260 }, { "epoch": 0.7653893192009784, - "grad_norm": 0.6957665085792542, + "grad_norm": 0.6924655437469482, "learning_rate": 4.5216316754993886e-05, - "loss": 1.4094, + "loss": 1.4092, "step": 11265 }, { "epoch": 0.7657290392716402, - "grad_norm": 0.6419957280158997, + "grad_norm": 0.6422463059425354, "learning_rate": 4.521419350455225e-05, - "loss": 1.2995, + "loss": 1.3004, "step": 11270 }, { "epoch": 0.7660687593423019, - "grad_norm": 0.5854098796844482, + "grad_norm": 0.5859542489051819, "learning_rate": 4.5212070254110614e-05, - "loss": 1.2422, + "loss": 1.242, "step": 11275 }, { "epoch": 0.7664084794129638, - "grad_norm": 0.6475620269775391, + "grad_norm": 0.6455865502357483, "learning_rate": 4.520994700366898e-05, - "loss": 1.3331, + "loss": 1.3326, "step": 11280 }, { "epoch": 0.7667481994836255, - "grad_norm": 0.6581407785415649, + "grad_norm": 0.6567974090576172, "learning_rate": 4.520782375322734e-05, - "loss": 1.2795, + "loss": 1.2799, "step": 11285 }, { "epoch": 0.7670879195542872, - "grad_norm": 0.6953970789909363, + "grad_norm": 0.6924960017204285, "learning_rate": 4.5205700502785706e-05, - "loss": 1.4028, + "loss": 1.4032, "step": 11290 }, { "epoch": 0.7674276396249491, - "grad_norm": 0.6748022437095642, + "grad_norm": 0.6770350337028503, "learning_rate": 4.520357725234407e-05, - "loss": 1.3456, + "loss": 1.3461, "step": 11295 }, { "epoch": 0.7677673596956108, - "grad_norm": 0.6086054444313049, + "grad_norm": 0.6109882593154907, "learning_rate": 4.5201454001902434e-05, - "loss": 1.4028, + "loss": 1.4026, "step": 11300 }, { "epoch": 0.7681070797662726, - "grad_norm": 0.5455803871154785, + "grad_norm": 0.5447876453399658, "learning_rate": 4.51993307514608e-05, - "loss": 1.3613, + "loss": 1.3616, "step": 11305 }, { "epoch": 0.7684467998369343, - "grad_norm": 0.7180493474006653, + "grad_norm": 0.7147044539451599, "learning_rate": 4.519720750101916e-05, - "loss": 1.357, + "loss": 1.3572, "step": 11310 }, { "epoch": 0.7687865199075962, - "grad_norm": 0.6505820751190186, + "grad_norm": 0.6510125994682312, "learning_rate": 4.5195084250577526e-05, - "loss": 1.2822, + "loss": 1.2821, "step": 11315 }, { "epoch": 0.7691262399782579, - "grad_norm": 0.7178351879119873, + "grad_norm": 0.7191102504730225, "learning_rate": 4.519296100013589e-05, - "loss": 1.3484, + "loss": 1.3488, "step": 11320 }, { "epoch": 0.7694659600489197, - "grad_norm": 0.7940161228179932, + "grad_norm": 0.7910858392715454, "learning_rate": 4.5190837749694254e-05, "loss": 1.2942, "step": 11325 }, { "epoch": 0.7698056801195815, - "grad_norm": 0.693401575088501, + "grad_norm": 0.69256192445755, "learning_rate": 4.518871449925262e-05, "loss": 1.3506, "step": 11330 }, { "epoch": 0.7701454001902432, - "grad_norm": 0.7177097201347351, + "grad_norm": 0.717030942440033, "learning_rate": 4.518659124881098e-05, - "loss": 1.2475, + "loss": 1.2473, "step": 11335 }, { "epoch": 0.770485120260905, - "grad_norm": 0.7371872067451477, + "grad_norm": 0.7348435521125793, "learning_rate": 4.5184467998369346e-05, - "loss": 1.3756, + "loss": 1.375, "step": 11340 }, { "epoch": 0.7708248403315668, - "grad_norm": 0.7532777190208435, + "grad_norm": 0.7518489360809326, "learning_rate": 4.518234474792771e-05, - "loss": 1.3471, + "loss": 1.3467, "step": 11345 }, { "epoch": 0.7711645604022286, - "grad_norm": 0.8056697845458984, + "grad_norm": 0.806397557258606, "learning_rate": 4.5180221497486074e-05, - "loss": 1.4821, + "loss": 1.4825, "step": 11350 }, { "epoch": 0.7715042804728903, - "grad_norm": 0.6422814726829529, + "grad_norm": 0.6400620341300964, "learning_rate": 4.517809824704444e-05, "loss": 1.3969, "step": 11355 }, { "epoch": 0.7718440005435521, - "grad_norm": 0.6496434807777405, + "grad_norm": 0.655762791633606, "learning_rate": 4.51759749966028e-05, - "loss": 1.395, + "loss": 1.3959, "step": 11360 }, { "epoch": 0.7721837206142139, - "grad_norm": 0.6860586404800415, + "grad_norm": 0.6830344796180725, "learning_rate": 4.5173851746161166e-05, - "loss": 1.2887, + "loss": 1.2896, "step": 11365 }, { "epoch": 0.7725234406848757, - "grad_norm": 0.6781855821609497, + "grad_norm": 0.6784651875495911, "learning_rate": 4.517172849571953e-05, - "loss": 1.4128, + "loss": 1.4137, "step": 11370 }, { "epoch": 0.7728631607555374, - "grad_norm": 0.6779969930648804, + "grad_norm": 0.6836321353912354, "learning_rate": 4.516960524527789e-05, - "loss": 1.2631, + "loss": 1.2628, "step": 11375 }, { "epoch": 0.7732028808261993, - "grad_norm": 0.7600628137588501, + "grad_norm": 0.760604977607727, "learning_rate": 4.516748199483626e-05, - "loss": 1.3465, + "loss": 1.347, "step": 11380 }, { "epoch": 0.773542600896861, - "grad_norm": 0.6441403031349182, + "grad_norm": 0.645848274230957, "learning_rate": 4.516535874439462e-05, - "loss": 1.446, + "loss": 1.4459, "step": 11385 }, { "epoch": 0.7738823209675227, - "grad_norm": 0.6897464990615845, + "grad_norm": 0.6907637119293213, "learning_rate": 4.516323549395298e-05, - "loss": 1.3506, + "loss": 1.351, "step": 11390 }, { "epoch": 0.7742220410381845, - "grad_norm": 0.7222656607627869, + "grad_norm": 0.7220481038093567, "learning_rate": 4.516111224351135e-05, - "loss": 1.3308, + "loss": 1.3305, "step": 11395 }, { "epoch": 0.7745617611088463, - "grad_norm": 0.6664167642593384, + "grad_norm": 0.668342113494873, "learning_rate": 4.5158988993069714e-05, - "loss": 1.3773, + "loss": 1.3779, "step": 11400 }, { "epoch": 0.7749014811795081, - "grad_norm": 0.690582275390625, + "grad_norm": 0.6928859949111938, "learning_rate": 4.515686574262807e-05, "loss": 1.4416, "step": 11405 }, { "epoch": 0.7752412012501698, - "grad_norm": 0.6101199984550476, + "grad_norm": 0.6050866842269897, "learning_rate": 4.515474249218644e-05, - "loss": 1.3748, + "loss": 1.3752, "step": 11410 }, { "epoch": 0.7755809213208317, - "grad_norm": 0.8245213627815247, + "grad_norm": 0.8344361782073975, "learning_rate": 4.5152619241744806e-05, - "loss": 1.3361, + "loss": 1.3366, "step": 11415 }, { "epoch": 0.7759206413914934, - "grad_norm": 0.7068120241165161, + "grad_norm": 0.7082160115242004, "learning_rate": 4.515049599130317e-05, - "loss": 1.3667, + "loss": 1.3669, "step": 11420 }, { "epoch": 0.7762603614621552, - "grad_norm": 0.6961685419082642, + "grad_norm": 0.6966045498847961, "learning_rate": 4.5148372740861534e-05, - "loss": 1.4166, + "loss": 1.4167, "step": 11425 }, { "epoch": 0.776600081532817, - "grad_norm": 0.6665591597557068, + "grad_norm": 0.6719191074371338, "learning_rate": 4.51462494904199e-05, - "loss": 1.3002, + "loss": 1.3001, "step": 11430 }, { "epoch": 0.7769398016034788, - "grad_norm": 0.6144057512283325, + "grad_norm": 0.6135929226875305, "learning_rate": 4.514412623997826e-05, - "loss": 1.4215, + "loss": 1.4218, "step": 11435 }, { "epoch": 0.7772795216741405, - "grad_norm": 0.6899815797805786, + "grad_norm": 0.6948690414428711, "learning_rate": 4.5142002989536626e-05, - "loss": 1.4774, + "loss": 1.4782, "step": 11440 }, { "epoch": 0.7776192417448022, - "grad_norm": 0.6336473226547241, + "grad_norm": 0.6334467530250549, "learning_rate": 4.5139879739094984e-05, - "loss": 1.4458, + "loss": 1.4462, "step": 11445 }, { "epoch": 0.7779589618154641, - "grad_norm": 0.734282374382019, + "grad_norm": 0.7368757724761963, "learning_rate": 4.5137756488653354e-05, - "loss": 1.3422, + "loss": 1.3427, "step": 11450 }, { "epoch": 0.7782986818861258, - "grad_norm": 0.6325424313545227, + "grad_norm": 0.6353341937065125, "learning_rate": 4.513563323821172e-05, - "loss": 1.6687, + "loss": 1.669, "step": 11455 }, { "epoch": 0.7786384019567876, - "grad_norm": 0.7124289274215698, + "grad_norm": 0.7112290263175964, "learning_rate": 4.5133509987770076e-05, - "loss": 1.3919, + "loss": 1.3924, "step": 11460 }, { "epoch": 0.7789781220274494, - "grad_norm": 0.6993489861488342, + "grad_norm": 0.687079906463623, "learning_rate": 4.5131386737328446e-05, "loss": 1.3735, "step": 11465 }, { "epoch": 0.7793178420981112, - "grad_norm": 0.6520628929138184, + "grad_norm": 0.6535384654998779, "learning_rate": 4.512926348688681e-05, - "loss": 1.3458, + "loss": 1.3453, "step": 11470 }, { "epoch": 0.7796575621687729, - "grad_norm": 0.6258121132850647, + "grad_norm": 0.6254514455795288, "learning_rate": 4.512714023644517e-05, - "loss": 1.3438, + "loss": 1.3442, "step": 11475 }, { "epoch": 0.7799972822394347, - "grad_norm": 0.6752614378929138, + "grad_norm": 0.6793270707130432, "learning_rate": 4.512501698600354e-05, - "loss": 1.4234, + "loss": 1.4227, "step": 11480 }, { "epoch": 0.7803370023100965, - "grad_norm": 0.7024703621864319, + "grad_norm": 0.7051302790641785, "learning_rate": 4.51228937355619e-05, - "loss": 1.4341, + "loss": 1.4348, "step": 11485 }, { "epoch": 0.7806767223807582, - "grad_norm": 0.65691739320755, + "grad_norm": 0.6549004912376404, "learning_rate": 4.512077048512026e-05, - "loss": 1.319, + "loss": 1.3189, "step": 11490 }, { "epoch": 0.78101644245142, - "grad_norm": 0.7620543837547302, + "grad_norm": 0.7588554620742798, "learning_rate": 4.511864723467863e-05, - "loss": 1.3211, + "loss": 1.3209, "step": 11495 }, { "epoch": 0.7813561625220818, - "grad_norm": 0.6572378873825073, + "grad_norm": 0.6572644114494324, "learning_rate": 4.5116523984236994e-05, - "loss": 1.3941, + "loss": 1.3943, "step": 11500 }, { "epoch": 0.7816958825927436, - "grad_norm": 0.6216370463371277, + "grad_norm": 0.6224039793014526, "learning_rate": 4.511440073379535e-05, - "loss": 1.2718, + "loss": 1.272, "step": 11505 }, { "epoch": 0.7820356026634053, - "grad_norm": 0.6395732164382935, + "grad_norm": 0.6367835998535156, "learning_rate": 4.511227748335372e-05, - "loss": 1.3318, + "loss": 1.3314, "step": 11510 }, { "epoch": 0.7823753227340672, - "grad_norm": 0.6153049468994141, + "grad_norm": 0.6178695559501648, "learning_rate": 4.511015423291208e-05, - "loss": 1.33, + "loss": 1.3303, "step": 11515 }, { "epoch": 0.7827150428047289, - "grad_norm": 0.6759575605392456, + "grad_norm": 0.6782312989234924, "learning_rate": 4.5108030982470444e-05, - "loss": 1.4836, + "loss": 1.4843, "step": 11520 }, { "epoch": 0.7830547628753907, - "grad_norm": 0.6762524247169495, + "grad_norm": 0.676954984664917, "learning_rate": 4.5105907732028814e-05, - "loss": 1.427, + "loss": 1.4273, "step": 11525 }, { "epoch": 0.7833944829460524, - "grad_norm": 0.6496263146400452, + "grad_norm": 0.6481071710586548, "learning_rate": 4.510378448158717e-05, - "loss": 1.3195, + "loss": 1.3188, "step": 11530 }, { "epoch": 0.7837342030167143, - "grad_norm": 0.6128015518188477, + "grad_norm": 0.6138940453529358, "learning_rate": 4.5101661231145536e-05, - "loss": 1.3555, + "loss": 1.3554, "step": 11535 }, { "epoch": 0.784073923087376, - "grad_norm": 0.6299436092376709, + "grad_norm": 0.6302028894424438, "learning_rate": 4.5099537980703906e-05, "loss": 1.4056, "step": 11540 }, { "epoch": 0.7844136431580377, - "grad_norm": 0.6537281274795532, + "grad_norm": 0.6524173617362976, "learning_rate": 4.5097414730262264e-05, - "loss": 1.3819, + "loss": 1.3826, "step": 11545 }, { "epoch": 0.7847533632286996, - "grad_norm": 0.6531508564949036, + "grad_norm": 0.6524701118469238, "learning_rate": 4.509529147982063e-05, - "loss": 1.3104, + "loss": 1.31, "step": 11550 }, { "epoch": 0.7850930832993613, - "grad_norm": 0.6828337907791138, + "grad_norm": 0.6821550130844116, "learning_rate": 4.5093168229379e-05, - "loss": 1.4078, + "loss": 1.4075, "step": 11555 }, { "epoch": 0.7854328033700231, - "grad_norm": 0.6737443804740906, + "grad_norm": 0.6780343651771545, "learning_rate": 4.5091044978937356e-05, - "loss": 1.4144, + "loss": 1.414, "step": 11560 }, { "epoch": 0.7857725234406848, - "grad_norm": 0.7346624732017517, + "grad_norm": 0.735101044178009, "learning_rate": 4.508892172849572e-05, - "loss": 1.4487, + "loss": 1.4489, "step": 11565 }, { "epoch": 0.7861122435113467, - "grad_norm": 0.6856018304824829, + "grad_norm": 0.6832066774368286, "learning_rate": 4.508679847805409e-05, - "loss": 1.2994, + "loss": 1.3, "step": 11570 }, { "epoch": 0.7864519635820084, - "grad_norm": 0.6291006207466125, + "grad_norm": 0.6269567608833313, "learning_rate": 4.508467522761245e-05, - "loss": 1.1959, + "loss": 1.1961, "step": 11575 }, { "epoch": 0.7867916836526702, - "grad_norm": 0.6951645612716675, + "grad_norm": 0.6947938799858093, "learning_rate": 4.508255197717081e-05, - "loss": 1.3675, + "loss": 1.3677, "step": 11580 }, { "epoch": 0.787131403723332, - "grad_norm": 0.8353897333145142, + "grad_norm": 0.8312880992889404, "learning_rate": 4.508042872672918e-05, - "loss": 1.3176, + "loss": 1.3172, "step": 11585 }, { "epoch": 0.7874711237939938, - "grad_norm": 0.6127036809921265, + "grad_norm": 0.6092492938041687, "learning_rate": 4.507830547628754e-05, - "loss": 1.3862, + "loss": 1.3859, "step": 11590 }, { "epoch": 0.7878108438646555, - "grad_norm": 0.6927005648612976, + "grad_norm": 0.6941688060760498, "learning_rate": 4.5076182225845904e-05, - "loss": 1.362, + "loss": 1.3617, "step": 11595 }, { "epoch": 0.7881505639353173, - "grad_norm": 0.6582446098327637, + "grad_norm": 0.6590314507484436, "learning_rate": 4.507405897540427e-05, - "loss": 1.3313, + "loss": 1.3309, "step": 11600 }, { "epoch": 0.7884902840059791, - "grad_norm": 0.6648750901222229, + "grad_norm": 0.6663656234741211, "learning_rate": 4.507193572496263e-05, - "loss": 1.411, + "loss": 1.4113, "step": 11605 }, { "epoch": 0.7888300040766408, - "grad_norm": 0.6673555374145508, + "grad_norm": 0.6651059985160828, "learning_rate": 4.5069812474520996e-05, - "loss": 1.4298, + "loss": 1.4299, "step": 11610 }, { "epoch": 0.7891697241473026, - "grad_norm": 0.6484569907188416, + "grad_norm": 0.6429831385612488, "learning_rate": 4.506768922407936e-05, - "loss": 1.3281, + "loss": 1.3277, "step": 11615 }, { "epoch": 0.7895094442179644, - "grad_norm": 0.7276854515075684, + "grad_norm": 0.7276691794395447, "learning_rate": 4.5065565973637724e-05, - "loss": 1.3447, + "loss": 1.3451, "step": 11620 }, { "epoch": 0.7898491642886262, - "grad_norm": 0.7185012698173523, + "grad_norm": 0.7179198861122131, "learning_rate": 4.506344272319609e-05, "loss": 1.3579, "step": 11625 }, { "epoch": 0.7901888843592879, - "grad_norm": 0.6514365077018738, + "grad_norm": 0.6476083993911743, "learning_rate": 4.506131947275445e-05, - "loss": 1.4761, + "loss": 1.4758, "step": 11630 }, { "epoch": 0.7905286044299498, - "grad_norm": 0.6557944416999817, + "grad_norm": 0.6560049653053284, "learning_rate": 4.5059196222312816e-05, "loss": 1.4036, "step": 11635 }, { "epoch": 0.7908683245006115, - "grad_norm": 0.648170530796051, + "grad_norm": 0.6476988196372986, "learning_rate": 4.505707297187118e-05, - "loss": 1.385, + "loss": 1.3845, "step": 11640 }, { "epoch": 0.7912080445712733, - "grad_norm": 0.6744073629379272, + "grad_norm": 0.6732839345932007, "learning_rate": 4.5054949721429544e-05, "loss": 1.3894, "step": 11645 }, { "epoch": 0.791547764641935, - "grad_norm": 0.6395312547683716, + "grad_norm": 0.6337847113609314, "learning_rate": 4.505282647098791e-05, - "loss": 1.4086, + "loss": 1.4083, "step": 11650 }, { "epoch": 0.7918874847125968, - "grad_norm": 0.7469674944877625, + "grad_norm": 0.7510994672775269, "learning_rate": 4.505070322054627e-05, - "loss": 1.3691, + "loss": 1.3689, "step": 11655 }, { "epoch": 0.7922272047832586, - "grad_norm": 0.676816999912262, + "grad_norm": 0.6778708100318909, "learning_rate": 4.5048579970104636e-05, - "loss": 1.4049, + "loss": 1.4046, "step": 11660 }, { "epoch": 0.7925669248539203, - "grad_norm": 0.6831178069114685, + "grad_norm": 0.6825473308563232, "learning_rate": 4.5046456719663e-05, - "loss": 1.4065, + "loss": 1.4066, "step": 11665 }, { "epoch": 0.7929066449245822, - "grad_norm": 0.6235385537147522, + "grad_norm": 0.6301274299621582, "learning_rate": 4.5044333469221364e-05, - "loss": 1.3501, + "loss": 1.3506, "step": 11670 }, { "epoch": 0.7932463649952439, - "grad_norm": 0.7086701393127441, + "grad_norm": 0.7098168134689331, "learning_rate": 4.504221021877973e-05, - "loss": 1.3325, + "loss": 1.333, "step": 11675 }, { "epoch": 0.7935860850659057, - "grad_norm": 0.629204511642456, + "grad_norm": 0.6336970329284668, "learning_rate": 4.504008696833809e-05, - "loss": 1.309, + "loss": 1.3094, "step": 11680 }, { "epoch": 0.7939258051365675, - "grad_norm": 0.6631782650947571, + "grad_norm": 0.6636404395103455, "learning_rate": 4.5037963717896456e-05, - "loss": 1.4683, + "loss": 1.4685, "step": 11685 }, { "epoch": 0.7942655252072293, - "grad_norm": 0.6834977865219116, + "grad_norm": 0.6821151971817017, "learning_rate": 4.503584046745482e-05, - "loss": 1.2518, + "loss": 1.2516, "step": 11690 }, { "epoch": 0.794605245277891, - "grad_norm": 0.7612637281417847, + "grad_norm": 0.7716544270515442, "learning_rate": 4.5033717217013184e-05, - "loss": 1.4312, + "loss": 1.4313, "step": 11695 }, { "epoch": 0.7949449653485527, - "grad_norm": 0.7162277698516846, + "grad_norm": 0.7126001715660095, "learning_rate": 4.503159396657155e-05, - "loss": 1.4017, + "loss": 1.402, "step": 11700 }, { "epoch": 0.7952846854192146, - "grad_norm": 0.7021288275718689, + "grad_norm": 0.7018154263496399, "learning_rate": 4.502947071612991e-05, - "loss": 1.2941, + "loss": 1.2942, "step": 11705 }, { "epoch": 0.7956244054898763, - "grad_norm": 0.6390158534049988, + "grad_norm": 0.6400017142295837, "learning_rate": 4.5027347465688276e-05, - "loss": 1.3193, + "loss": 1.3197, "step": 11710 }, { "epoch": 0.7959641255605381, - "grad_norm": 0.7118582129478455, + "grad_norm": 0.7118335366249084, "learning_rate": 4.502522421524663e-05, - "loss": 1.4165, + "loss": 1.4164, "step": 11715 }, { "epoch": 0.7963038456311999, - "grad_norm": 0.6698769927024841, + "grad_norm": 0.6630871295928955, "learning_rate": 4.5023100964805004e-05, - "loss": 1.3383, + "loss": 1.3384, "step": 11720 }, { "epoch": 0.7966435657018617, - "grad_norm": 0.6821621656417847, + "grad_norm": 0.683229386806488, "learning_rate": 4.502097771436337e-05, "loss": 1.3457, "step": 11725 }, { "epoch": 0.7969832857725234, - "grad_norm": 0.6107878088951111, + "grad_norm": 0.6115513443946838, "learning_rate": 4.5018854463921725e-05, - "loss": 1.2819, + "loss": 1.2821, "step": 11730 }, { "epoch": 0.7973230058431852, - "grad_norm": 0.783470630645752, + "grad_norm": 0.7887835502624512, "learning_rate": 4.5016731213480096e-05, - "loss": 1.3601, + "loss": 1.361, "step": 11735 }, { "epoch": 0.797662725913847, - "grad_norm": 0.5205401182174683, + "grad_norm": 0.5211535692214966, "learning_rate": 4.501460796303846e-05, "loss": 1.2042, "step": 11740 }, { "epoch": 0.7980024459845088, - "grad_norm": 0.6711729168891907, + "grad_norm": 0.6784567832946777, "learning_rate": 4.501248471259682e-05, - "loss": 1.2931, + "loss": 1.2929, "step": 11745 }, { "epoch": 0.7983421660551705, - "grad_norm": 0.7575369477272034, + "grad_norm": 0.7575064897537231, "learning_rate": 4.501036146215519e-05, - "loss": 1.32, + "loss": 1.3199, "step": 11750 }, { "epoch": 0.7986818861258324, - "grad_norm": 0.7468956708908081, + "grad_norm": 0.7438192963600159, "learning_rate": 4.500823821171355e-05, - "loss": 1.3422, + "loss": 1.3426, "step": 11755 }, { "epoch": 0.7990216061964941, - "grad_norm": 0.6427664756774902, + "grad_norm": 0.6408671736717224, "learning_rate": 4.5006114961271916e-05, - "loss": 1.3344, + "loss": 1.3346, "step": 11760 }, { "epoch": 0.7993613262671558, - "grad_norm": 0.6748515963554382, + "grad_norm": 0.6752783060073853, "learning_rate": 4.500399171083028e-05, - "loss": 1.3656, + "loss": 1.3658, "step": 11765 }, { "epoch": 0.7997010463378177, - "grad_norm": 0.6909468173980713, + "grad_norm": 0.6903826594352722, "learning_rate": 4.5001868460388644e-05, - "loss": 1.5675, + "loss": 1.5679, "step": 11770 }, { "epoch": 0.8000407664084794, - "grad_norm": 0.6876988410949707, + "grad_norm": 0.6887668371200562, "learning_rate": 4.499974520994701e-05, - "loss": 1.3812, + "loss": 1.3814, "step": 11775 }, { "epoch": 0.8003804864791412, - "grad_norm": 0.698937714099884, + "grad_norm": 0.696759819984436, "learning_rate": 4.499762195950537e-05, - "loss": 1.3926, + "loss": 1.3924, "step": 11780 }, { "epoch": 0.8007202065498029, - "grad_norm": 0.5930280089378357, + "grad_norm": 0.5966751575469971, "learning_rate": 4.4995498709063736e-05, - "loss": 1.4472, + "loss": 1.4467, "step": 11785 }, { "epoch": 0.8010599266204648, - "grad_norm": 0.6637153625488281, + "grad_norm": 0.665764570236206, "learning_rate": 4.49933754586221e-05, - "loss": 1.4034, + "loss": 1.4033, "step": 11790 }, { "epoch": 0.8013996466911265, - "grad_norm": 0.665380597114563, + "grad_norm": 0.6665855050086975, "learning_rate": 4.4991252208180464e-05, - "loss": 1.4277, + "loss": 1.4269, "step": 11795 }, { "epoch": 0.8017393667617883, - "grad_norm": 0.7033234238624573, + "grad_norm": 0.7055405378341675, "learning_rate": 4.498912895773882e-05, - "loss": 1.4312, + "loss": 1.4314, "step": 11800 }, { "epoch": 0.8020790868324501, - "grad_norm": 0.7284811735153198, + "grad_norm": 0.7230882048606873, "learning_rate": 4.498700570729719e-05, - "loss": 1.4031, + "loss": 1.4029, "step": 11805 }, { "epoch": 0.8024188069031118, - "grad_norm": 0.7314227819442749, + "grad_norm": 0.7312173247337341, "learning_rate": 4.4984882456855556e-05, - "loss": 1.3191, + "loss": 1.3185, "step": 11810 }, { "epoch": 0.8027585269737736, - "grad_norm": 0.8094779253005981, + "grad_norm": 0.8093398809432983, "learning_rate": 4.498275920641391e-05, - "loss": 1.3541, + "loss": 1.3536, "step": 11815 }, { "epoch": 0.8030982470444353, - "grad_norm": 0.6797764301300049, + "grad_norm": 0.6809344291687012, "learning_rate": 4.4980635955972284e-05, - "loss": 1.3905, + "loss": 1.3902, "step": 11820 }, { "epoch": 0.8034379671150972, - "grad_norm": 0.7100040912628174, + "grad_norm": 0.7096199989318848, "learning_rate": 4.497851270553065e-05, "loss": 1.4634, "step": 11825 }, { "epoch": 0.8037776871857589, - "grad_norm": 0.730214536190033, + "grad_norm": 0.7357856631278992, "learning_rate": 4.4976389455089005e-05, - "loss": 1.3458, + "loss": 1.3457, "step": 11830 }, { "epoch": 0.8041174072564207, - "grad_norm": 0.6728085875511169, + "grad_norm": 0.6706922054290771, "learning_rate": 4.4974266204647376e-05, - "loss": 1.4842, + "loss": 1.484, "step": 11835 }, { "epoch": 0.8044571273270825, - "grad_norm": 0.7188294529914856, + "grad_norm": 0.7177525162696838, "learning_rate": 4.497214295420574e-05, - "loss": 1.3956, + "loss": 1.3954, "step": 11840 }, { "epoch": 0.8047968473977443, - "grad_norm": 0.7234100699424744, + "grad_norm": 0.7204999923706055, "learning_rate": 4.49700197037641e-05, - "loss": 1.3436, + "loss": 1.3428, "step": 11845 }, { "epoch": 0.805136567468406, - "grad_norm": 0.6598080992698669, + "grad_norm": 0.6633571982383728, "learning_rate": 4.496789645332247e-05, - "loss": 1.3716, + "loss": 1.3717, "step": 11850 }, { "epoch": 0.8054762875390679, - "grad_norm": 0.697017252445221, + "grad_norm": 0.6955575346946716, "learning_rate": 4.496577320288083e-05, - "loss": 1.2724, + "loss": 1.2719, "step": 11855 }, { "epoch": 0.8058160076097296, - "grad_norm": 0.7265612483024597, + "grad_norm": 0.7265917062759399, "learning_rate": 4.496364995243919e-05, - "loss": 1.3161, + "loss": 1.3162, "step": 11860 }, { "epoch": 0.8061557276803913, - "grad_norm": 0.7386727333068848, + "grad_norm": 0.7366938591003418, "learning_rate": 4.496152670199756e-05, - "loss": 1.312, + "loss": 1.3118, "step": 11865 }, { "epoch": 0.8064954477510531, - "grad_norm": 0.6545016169548035, + "grad_norm": 0.6567254066467285, "learning_rate": 4.495940345155592e-05, - "loss": 1.3828, + "loss": 1.3826, "step": 11870 }, { "epoch": 0.8068351678217149, - "grad_norm": 0.6210530400276184, + "grad_norm": 0.6215032935142517, "learning_rate": 4.495728020111428e-05, - "loss": 1.3055, + "loss": 1.3054, "step": 11875 }, { "epoch": 0.8071748878923767, - "grad_norm": 0.6984265446662903, + "grad_norm": 0.6992623805999756, "learning_rate": 4.495515695067265e-05, - "loss": 1.315, + "loss": 1.3152, "step": 11880 }, { "epoch": 0.8075146079630384, - "grad_norm": 0.8059458136558533, + "grad_norm": 0.8025512099266052, "learning_rate": 4.495303370023101e-05, - "loss": 1.412, + "loss": 1.4121, "step": 11885 }, { "epoch": 0.8078543280337003, - "grad_norm": 0.6970521211624146, + "grad_norm": 0.6963446140289307, "learning_rate": 4.495091044978937e-05, - "loss": 1.3041, + "loss": 1.3038, "step": 11890 }, { "epoch": 0.808194048104362, - "grad_norm": 0.680242657661438, + "grad_norm": 0.682207465171814, "learning_rate": 4.4948787199347744e-05, - "loss": 1.3809, + "loss": 1.381, "step": 11895 }, { "epoch": 0.8085337681750238, - "grad_norm": 0.6085513234138489, + "grad_norm": 0.6070653200149536, "learning_rate": 4.49466639489061e-05, "loss": 1.2587, "step": 11900 }, { "epoch": 0.8088734882456855, - "grad_norm": 0.7571601867675781, + "grad_norm": 0.7624881267547607, "learning_rate": 4.4944540698464465e-05, - "loss": 1.4282, + "loss": 1.4281, "step": 11905 }, { "epoch": 0.8092132083163474, - "grad_norm": 0.7318002581596375, + "grad_norm": 0.7156019806861877, "learning_rate": 4.4942417448022836e-05, - "loss": 1.5599, + "loss": 1.5593, "step": 11910 }, { "epoch": 0.8095529283870091, - "grad_norm": 0.6579176187515259, + "grad_norm": 0.6608614325523376, "learning_rate": 4.494029419758119e-05, - "loss": 1.3842, + "loss": 1.3838, "step": 11915 }, { "epoch": 0.8098926484576708, - "grad_norm": 0.6124897003173828, + "grad_norm": 0.6124957799911499, "learning_rate": 4.493817094713956e-05, "loss": 1.3003, "step": 11920 }, { "epoch": 0.8102323685283327, - "grad_norm": 0.6390940546989441, + "grad_norm": 0.6411773562431335, "learning_rate": 4.493604769669793e-05, "loss": 1.4045, "step": 11925 }, { "epoch": 0.8105720885989944, - "grad_norm": 0.7170646786689758, + "grad_norm": 0.7191134095191956, "learning_rate": 4.4933924446256285e-05, - "loss": 1.397, + "loss": 1.3965, "step": 11930 }, { "epoch": 0.8109118086696562, - "grad_norm": 0.6255606412887573, + "grad_norm": 0.6288970112800598, "learning_rate": 4.493180119581465e-05, - "loss": 1.3251, + "loss": 1.3257, "step": 11935 }, { "epoch": 0.811251528740318, - "grad_norm": 0.5758702158927917, + "grad_norm": 0.5766484141349792, "learning_rate": 4.492967794537301e-05, - "loss": 1.4673, + "loss": 1.4674, "step": 11940 }, { "epoch": 0.8115912488109798, - "grad_norm": 0.6808621287345886, + "grad_norm": 0.6819114089012146, "learning_rate": 4.492755469493138e-05, - "loss": 1.3583, + "loss": 1.3579, "step": 11945 }, { "epoch": 0.8119309688816415, - "grad_norm": 0.5954741835594177, + "grad_norm": 0.5944585800170898, "learning_rate": 4.492543144448974e-05, - "loss": 1.4841, + "loss": 1.4847, "step": 11950 }, { "epoch": 0.8122706889523033, - "grad_norm": 0.7672919034957886, + "grad_norm": 0.7691552042961121, "learning_rate": 4.4923308194048105e-05, - "loss": 1.3046, + "loss": 1.304, "step": 11955 }, { "epoch": 0.8126104090229651, - "grad_norm": 0.6502673625946045, + "grad_norm": 0.6514336466789246, "learning_rate": 4.492118494360647e-05, - "loss": 1.4264, + "loss": 1.4266, "step": 11960 }, { "epoch": 0.8129501290936268, - "grad_norm": 0.6518172025680542, + "grad_norm": 0.6502189636230469, "learning_rate": 4.491906169316483e-05, - "loss": 1.3076, + "loss": 1.3074, "step": 11965 }, { "epoch": 0.8132898491642886, - "grad_norm": 0.6700319051742554, + "grad_norm": 0.6712385416030884, "learning_rate": 4.49169384427232e-05, - "loss": 1.4476, + "loss": 1.4475, "step": 11970 }, { "epoch": 0.8136295692349504, - "grad_norm": 0.7004209160804749, + "grad_norm": 0.7015194296836853, "learning_rate": 4.491481519228156e-05, - "loss": 1.3513, + "loss": 1.3509, "step": 11975 }, { "epoch": 0.8139692893056122, - "grad_norm": 0.6354067921638489, + "grad_norm": 0.6361408829689026, "learning_rate": 4.4912691941839925e-05, - "loss": 1.3639, + "loss": 1.3642, "step": 11980 }, { "epoch": 0.8143090093762739, - "grad_norm": 0.6545446515083313, + "grad_norm": 0.6641202569007874, "learning_rate": 4.491056869139829e-05, "loss": 1.3883, "step": 11985 }, { "epoch": 0.8146487294469357, - "grad_norm": 0.6701433062553406, + "grad_norm": 0.6706225275993347, "learning_rate": 4.490844544095665e-05, - "loss": 1.3951, + "loss": 1.395, "step": 11990 }, { "epoch": 0.8149884495175975, - "grad_norm": 0.666741669178009, + "grad_norm": 0.6683604121208191, "learning_rate": 4.490632219051502e-05, - "loss": 1.2614, + "loss": 1.2625, "step": 11995 }, { "epoch": 0.8153281695882593, - "grad_norm": 0.6113370656967163, + "grad_norm": 0.6118583083152771, "learning_rate": 4.490419894007338e-05, - "loss": 1.3702, + "loss": 1.3701, "step": 12000 }, { "epoch": 0.815667889658921, - "grad_norm": 0.9769265651702881, + "grad_norm": 0.9945932626724243, "learning_rate": 4.4902075689631745e-05, - "loss": 1.3561, + "loss": 1.3559, "step": 12005 }, { "epoch": 0.8160076097295829, - "grad_norm": 0.6849420070648193, + "grad_norm": 0.687251627445221, "learning_rate": 4.489995243919011e-05, "loss": 1.3951, "step": 12010 }, { "epoch": 0.8163473298002446, - "grad_norm": 0.6160445809364319, + "grad_norm": 0.6168782114982605, "learning_rate": 4.489782918874847e-05, - "loss": 1.3413, + "loss": 1.341, "step": 12015 }, { "epoch": 0.8166870498709063, - "grad_norm": 0.7271686792373657, + "grad_norm": 0.7266376614570618, "learning_rate": 4.489570593830684e-05, - "loss": 1.3699, + "loss": 1.3698, "step": 12020 }, { "epoch": 0.8170267699415682, - "grad_norm": 0.6870571374893188, + "grad_norm": 0.6880416870117188, "learning_rate": 4.48935826878652e-05, - "loss": 1.3483, + "loss": 1.3488, "step": 12025 }, { "epoch": 0.8173664900122299, - "grad_norm": 0.7074158191680908, + "grad_norm": 0.7098824977874756, "learning_rate": 4.4891459437423565e-05, - "loss": 1.3876, + "loss": 1.3875, "step": 12030 }, { "epoch": 0.8177062100828917, - "grad_norm": 0.7424540519714355, + "grad_norm": 0.7400995492935181, "learning_rate": 4.488933618698193e-05, - "loss": 1.3534, + "loss": 1.3536, "step": 12035 }, { "epoch": 0.8180459301535534, - "grad_norm": 0.6281335353851318, + "grad_norm": 0.6271576881408691, "learning_rate": 4.488721293654029e-05, "loss": 1.3199, "step": 12040 }, { "epoch": 0.8183856502242153, - "grad_norm": 0.7014420628547668, + "grad_norm": 0.6992721557617188, "learning_rate": 4.488508968609866e-05, - "loss": 1.4242, + "loss": 1.4244, "step": 12045 }, { "epoch": 0.818725370294877, - "grad_norm": 0.5700911283493042, + "grad_norm": 0.5690619945526123, "learning_rate": 4.488296643565702e-05, - "loss": 1.3779, + "loss": 1.378, "step": 12050 }, { "epoch": 0.8190650903655388, - "grad_norm": 0.7599695920944214, + "grad_norm": 0.7555146813392639, "learning_rate": 4.4880843185215385e-05, - "loss": 1.391, + "loss": 1.3909, "step": 12055 }, { "epoch": 0.8194048104362006, - "grad_norm": 0.6629148721694946, + "grad_norm": 0.6658848524093628, "learning_rate": 4.487871993477375e-05, - "loss": 1.4024, + "loss": 1.4026, "step": 12060 }, { "epoch": 0.8197445305068624, - "grad_norm": 0.6432792544364929, + "grad_norm": 0.6397156715393066, "learning_rate": 4.487659668433211e-05, - "loss": 1.3843, + "loss": 1.3853, "step": 12065 }, { "epoch": 0.8200842505775241, - "grad_norm": 0.6555769443511963, + "grad_norm": 0.6570034027099609, "learning_rate": 4.487447343389047e-05, - "loss": 1.3477, + "loss": 1.3478, "step": 12070 }, { "epoch": 0.8204239706481858, - "grad_norm": 0.7444199919700623, + "grad_norm": 0.7424708604812622, "learning_rate": 4.487235018344884e-05, - "loss": 1.4641, + "loss": 1.4633, "step": 12075 }, { "epoch": 0.8207636907188477, - "grad_norm": 0.6874467730522156, + "grad_norm": 0.689038097858429, "learning_rate": 4.4870226933007205e-05, - "loss": 1.3105, + "loss": 1.3099, "step": 12080 }, { "epoch": 0.8211034107895094, - "grad_norm": 0.6294021010398865, + "grad_norm": 0.6298597455024719, "learning_rate": 4.486810368256556e-05, - "loss": 1.2731, + "loss": 1.2728, "step": 12085 }, { "epoch": 0.8214431308601712, - "grad_norm": 0.6811396479606628, + "grad_norm": 0.6815755367279053, "learning_rate": 4.486598043212393e-05, - "loss": 1.4021, + "loss": 1.4022, "step": 12090 }, { "epoch": 0.821782850930833, - "grad_norm": 0.6806802153587341, + "grad_norm": 0.6822925806045532, "learning_rate": 4.48638571816823e-05, - "loss": 1.4288, + "loss": 1.4296, "step": 12095 }, { "epoch": 0.8221225710014948, - "grad_norm": 0.7208311557769775, + "grad_norm": 0.7204053401947021, "learning_rate": 4.486173393124066e-05, - "loss": 1.4121, + "loss": 1.4114, "step": 12100 }, { "epoch": 0.8224622910721565, - "grad_norm": 0.7082199454307556, + "grad_norm": 0.7051594853401184, "learning_rate": 4.4859610680799025e-05, - "loss": 1.3365, + "loss": 1.3367, "step": 12105 }, { "epoch": 0.8228020111428184, - "grad_norm": 0.6614202260971069, + "grad_norm": 0.6640490889549255, "learning_rate": 4.485748743035739e-05, - "loss": 1.2902, + "loss": 1.2908, "step": 12110 }, { "epoch": 0.8231417312134801, - "grad_norm": 0.6267545819282532, + "grad_norm": 0.6257873773574829, "learning_rate": 4.485536417991575e-05, - "loss": 1.355, + "loss": 1.3549, "step": 12115 }, { "epoch": 0.8234814512841419, - "grad_norm": 0.6980351209640503, + "grad_norm": 0.7227895855903625, "learning_rate": 4.485324092947412e-05, - "loss": 1.3563, + "loss": 1.3567, "step": 12120 }, { "epoch": 0.8238211713548036, - "grad_norm": 0.690074622631073, + "grad_norm": 0.6888381242752075, "learning_rate": 4.485111767903248e-05, - "loss": 1.3108, + "loss": 1.3106, "step": 12125 }, { "epoch": 0.8241608914254654, - "grad_norm": 0.5915892720222473, + "grad_norm": 0.5898076295852661, "learning_rate": 4.4848994428590845e-05, - "loss": 1.4385, + "loss": 1.4389, "step": 12130 }, { "epoch": 0.8245006114961272, - "grad_norm": 0.716480016708374, + "grad_norm": 0.7172653079032898, "learning_rate": 4.484687117814921e-05, - "loss": 1.4252, + "loss": 1.4251, "step": 12135 }, { "epoch": 0.8248403315667889, - "grad_norm": 0.6369530558586121, + "grad_norm": 0.6389583349227905, "learning_rate": 4.4844747927707567e-05, "loss": 1.3827, "step": 12140 }, { "epoch": 0.8251800516374508, - "grad_norm": 0.650497555732727, + "grad_norm": 0.652115523815155, "learning_rate": 4.484262467726594e-05, "loss": 1.4093, "step": 12145 }, { "epoch": 0.8255197717081125, - "grad_norm": 0.7560164928436279, + "grad_norm": 0.7555417418479919, "learning_rate": 4.48405014268243e-05, - "loss": 1.3747, + "loss": 1.3753, "step": 12150 }, { "epoch": 0.8258594917787743, - "grad_norm": 0.6294389963150024, + "grad_norm": 0.6318958401679993, "learning_rate": 4.483837817638266e-05, - "loss": 1.4042, + "loss": 1.4043, "step": 12155 }, { "epoch": 0.826199211849436, - "grad_norm": 0.66148442029953, + "grad_norm": 0.6668136119842529, "learning_rate": 4.483625492594103e-05, - "loss": 1.2952, + "loss": 1.2958, "step": 12160 }, { "epoch": 0.8265389319200979, - "grad_norm": 0.7001969218254089, + "grad_norm": 0.6985938549041748, "learning_rate": 4.4834131675499393e-05, - "loss": 1.3641, + "loss": 1.3645, "step": 12165 }, { "epoch": 0.8268786519907596, - "grad_norm": 0.6744056940078735, + "grad_norm": 0.6776582598686218, "learning_rate": 4.483200842505775e-05, - "loss": 1.3479, + "loss": 1.3474, "step": 12170 }, { "epoch": 0.8272183720614213, - "grad_norm": 0.7539799809455872, + "grad_norm": 0.7490609288215637, "learning_rate": 4.482988517461612e-05, - "loss": 1.3364, + "loss": 1.3367, "step": 12175 }, { "epoch": 0.8275580921320832, - "grad_norm": 0.6546370983123779, + "grad_norm": 0.6527763605117798, "learning_rate": 4.4827761924174485e-05, - "loss": 1.4197, + "loss": 1.4196, "step": 12180 }, { "epoch": 0.8278978122027449, - "grad_norm": 0.6741619110107422, + "grad_norm": 0.669812023639679, "learning_rate": 4.482563867373284e-05, - "loss": 1.3835, + "loss": 1.3836, "step": 12185 }, { "epoch": 0.8282375322734067, - "grad_norm": 0.6156356334686279, + "grad_norm": 0.6137902736663818, "learning_rate": 4.4823515423291213e-05, "loss": 1.2733, "step": 12190 }, { "epoch": 0.8285772523440685, - "grad_norm": 0.6626706123352051, + "grad_norm": 0.6600733399391174, "learning_rate": 4.482139217284958e-05, - "loss": 1.3214, + "loss": 1.3209, "step": 12195 }, { "epoch": 0.8289169724147303, - "grad_norm": 0.6759384870529175, + "grad_norm": 0.6764907240867615, "learning_rate": 4.4819268922407935e-05, - "loss": 1.3404, + "loss": 1.3412, "step": 12200 }, { "epoch": 0.829256692485392, - "grad_norm": 0.6920154094696045, + "grad_norm": 0.6915063261985779, "learning_rate": 4.4817145671966305e-05, - "loss": 1.419, + "loss": 1.4182, "step": 12205 }, { "epoch": 0.8295964125560538, - "grad_norm": 0.6718053817749023, + "grad_norm": 0.6697565317153931, "learning_rate": 4.481502242152467e-05, - "loss": 1.3467, + "loss": 1.3471, "step": 12210 }, { "epoch": 0.8299361326267156, - "grad_norm": 0.7080114483833313, + "grad_norm": 0.7094232439994812, "learning_rate": 4.481289917108303e-05, - "loss": 1.4269, + "loss": 1.427, "step": 12215 }, { "epoch": 0.8302758526973774, - "grad_norm": 0.6120716333389282, + "grad_norm": 0.6134877800941467, "learning_rate": 4.48107759206414e-05, "loss": 1.2772, "step": 12220 }, { "epoch": 0.8306155727680391, - "grad_norm": 0.6118149757385254, + "grad_norm": 0.6153057217597961, "learning_rate": 4.4808652670199755e-05, "loss": 1.4172, "step": 12225 }, { "epoch": 0.830955292838701, - "grad_norm": 0.6838310956954956, + "grad_norm": 0.6888895034790039, "learning_rate": 4.480652941975812e-05, - "loss": 1.3848, + "loss": 1.3842, "step": 12230 }, { "epoch": 0.8312950129093627, - "grad_norm": 0.7534611225128174, + "grad_norm": 0.7486812472343445, "learning_rate": 4.480440616931649e-05, - "loss": 1.2555, + "loss": 1.2548, "step": 12235 }, { "epoch": 0.8316347329800244, - "grad_norm": 0.6162254214286804, + "grad_norm": 0.6141130924224854, "learning_rate": 4.480228291887485e-05, - "loss": 1.3061, + "loss": 1.3057, "step": 12240 }, { "epoch": 0.8319744530506862, - "grad_norm": 0.6374291181564331, + "grad_norm": 0.6364213228225708, "learning_rate": 4.480015966843321e-05, - "loss": 1.3659, + "loss": 1.3664, "step": 12245 }, { "epoch": 0.832314173121348, - "grad_norm": 0.69317227602005, + "grad_norm": 0.6969799399375916, "learning_rate": 4.479803641799158e-05, - "loss": 1.3199, + "loss": 1.3203, "step": 12250 }, { "epoch": 0.8326538931920098, - "grad_norm": 0.5970177054405212, + "grad_norm": 0.596636176109314, "learning_rate": 4.479591316754994e-05, - "loss": 1.3317, + "loss": 1.3323, "step": 12255 }, { "epoch": 0.8329936132626715, - "grad_norm": 0.6941751837730408, + "grad_norm": 0.6884980797767639, "learning_rate": 4.47937899171083e-05, - "loss": 1.2643, + "loss": 1.2641, "step": 12260 }, { "epoch": 0.8333333333333334, - "grad_norm": 0.651773989200592, + "grad_norm": 0.6520838141441345, "learning_rate": 4.4791666666666673e-05, - "loss": 1.3537, + "loss": 1.3535, "step": 12265 }, { "epoch": 0.8336730534039951, - "grad_norm": 0.7591174840927124, + "grad_norm": 0.7542118430137634, "learning_rate": 4.478954341622503e-05, - "loss": 1.203, + "loss": 1.2029, "step": 12270 }, { "epoch": 0.8340127734746569, - "grad_norm": 0.666493833065033, + "grad_norm": 0.661770224571228, "learning_rate": 4.4787420165783395e-05, - "loss": 1.3851, + "loss": 1.3852, "step": 12275 }, { "epoch": 0.8343524935453187, - "grad_norm": 0.6488901376724243, + "grad_norm": 0.650799036026001, "learning_rate": 4.4785296915341765e-05, - "loss": 1.3747, + "loss": 1.3741, "step": 12280 }, { "epoch": 0.8346922136159804, - "grad_norm": 0.6691904067993164, + "grad_norm": 0.6699033379554749, "learning_rate": 4.478317366490012e-05, "loss": 1.4109, "step": 12285 }, { "epoch": 0.8350319336866422, - "grad_norm": 0.6803374290466309, + "grad_norm": 0.6797670722007751, "learning_rate": 4.478105041445849e-05, - "loss": 1.3148, + "loss": 1.3146, "step": 12290 }, { "epoch": 0.8353716537573039, - "grad_norm": 0.770541787147522, + "grad_norm": 0.7680563926696777, "learning_rate": 4.477892716401685e-05, - "loss": 1.3877, + "loss": 1.3878, "step": 12295 }, { "epoch": 0.8357113738279658, - "grad_norm": 0.6622931361198425, + "grad_norm": 0.6613762974739075, "learning_rate": 4.4776803913575215e-05, "loss": 1.3414, "step": 12300 }, { "epoch": 0.8360510938986275, - "grad_norm": 0.7337367534637451, + "grad_norm": 0.7226099967956543, "learning_rate": 4.477468066313358e-05, - "loss": 1.312, + "loss": 1.3112, "step": 12305 }, { "epoch": 0.8363908139692893, - "grad_norm": 0.7819303870201111, + "grad_norm": 0.7867833971977234, "learning_rate": 4.477255741269194e-05, - "loss": 1.337, + "loss": 1.3375, "step": 12310 }, { "epoch": 0.8367305340399511, - "grad_norm": 0.6312582492828369, + "grad_norm": 0.6290665864944458, "learning_rate": 4.477043416225031e-05, - "loss": 1.3172, + "loss": 1.3171, "step": 12315 }, { "epoch": 0.8370702541106129, - "grad_norm": 0.6198135018348694, + "grad_norm": 0.6231785416603088, "learning_rate": 4.476831091180867e-05, - "loss": 1.3957, + "loss": 1.3964, "step": 12320 }, { "epoch": 0.8374099741812746, - "grad_norm": 0.7267417311668396, + "grad_norm": 0.7289491295814514, "learning_rate": 4.4766187661367035e-05, - "loss": 1.3964, + "loss": 1.3959, "step": 12325 }, { "epoch": 0.8377496942519363, - "grad_norm": 0.524419903755188, + "grad_norm": 0.5235462188720703, "learning_rate": 4.47640644109254e-05, - "loss": 1.2128, + "loss": 1.2131, "step": 12330 }, { "epoch": 0.8380894143225982, - "grad_norm": 0.6941156387329102, + "grad_norm": 0.6954105496406555, "learning_rate": 4.476194116048376e-05, - "loss": 1.4847, + "loss": 1.4843, "step": 12335 }, { "epoch": 0.83842913439326, - "grad_norm": 0.7097057700157166, + "grad_norm": 0.7089130282402039, "learning_rate": 4.475981791004213e-05, - "loss": 1.3109, + "loss": 1.3114, "step": 12340 }, { "epoch": 0.8387688544639217, - "grad_norm": 0.6544502377510071, + "grad_norm": 0.6544634699821472, "learning_rate": 4.475769465960049e-05, - "loss": 1.3919, + "loss": 1.3923, "step": 12345 }, { "epoch": 0.8391085745345835, - "grad_norm": 0.7121051549911499, + "grad_norm": 0.722102165222168, "learning_rate": 4.4755571409158855e-05, - "loss": 1.2416, + "loss": 1.2423, "step": 12350 }, { "epoch": 0.8394482946052453, - "grad_norm": 0.6496121883392334, + "grad_norm": 0.6494455933570862, "learning_rate": 4.475344815871722e-05, - "loss": 1.3091, + "loss": 1.309, "step": 12355 }, { "epoch": 0.839788014675907, - "grad_norm": 0.6753186583518982, + "grad_norm": 0.674356997013092, "learning_rate": 4.475132490827558e-05, - "loss": 1.4142, + "loss": 1.414, "step": 12360 }, { "epoch": 0.8401277347465689, - "grad_norm": 0.6260736584663391, + "grad_norm": 0.6246697306632996, "learning_rate": 4.474920165783395e-05, - "loss": 1.4386, + "loss": 1.4387, "step": 12365 }, { "epoch": 0.8404674548172306, - "grad_norm": 0.7074244022369385, + "grad_norm": 0.7075207233428955, "learning_rate": 4.474707840739231e-05, - "loss": 1.2946, + "loss": 1.2945, "step": 12370 }, { "epoch": 0.8408071748878924, - "grad_norm": 0.7217336297035217, + "grad_norm": 0.718105137348175, "learning_rate": 4.4744955156950675e-05, - "loss": 1.3625, + "loss": 1.3624, "step": 12375 }, { "epoch": 0.8411468949585541, - "grad_norm": 0.6912894248962402, + "grad_norm": 0.6920083165168762, "learning_rate": 4.474283190650904e-05, - "loss": 1.3473, + "loss": 1.3467, "step": 12380 }, { "epoch": 0.841486615029216, - "grad_norm": 0.7598395347595215, + "grad_norm": 0.7647325992584229, "learning_rate": 4.47407086560674e-05, - "loss": 1.3052, + "loss": 1.3057, "step": 12385 }, { "epoch": 0.8418263350998777, - "grad_norm": 0.5946134924888611, + "grad_norm": 0.5941290855407715, "learning_rate": 4.473858540562577e-05, - "loss": 1.3984, + "loss": 1.3987, "step": 12390 }, { "epoch": 0.8421660551705394, - "grad_norm": 0.646136462688446, + "grad_norm": 0.6482870578765869, "learning_rate": 4.473646215518413e-05, - "loss": 1.3275, + "loss": 1.3276, "step": 12395 }, { "epoch": 0.8425057752412013, - "grad_norm": 0.6367974281311035, + "grad_norm": 0.6364887952804565, "learning_rate": 4.4734338904742495e-05, - "loss": 1.3249, + "loss": 1.3251, "step": 12400 }, { "epoch": 0.842845495311863, - "grad_norm": 0.5981682538986206, + "grad_norm": 0.5961602330207825, "learning_rate": 4.473221565430086e-05, - "loss": 1.3666, + "loss": 1.3663, "step": 12405 }, { "epoch": 0.8431852153825248, - "grad_norm": 0.7241153717041016, + "grad_norm": 0.7211259603500366, "learning_rate": 4.473009240385922e-05, - "loss": 1.3762, + "loss": 1.3758, "step": 12410 }, { "epoch": 0.8435249354531865, - "grad_norm": 0.5798956751823425, + "grad_norm": 0.5806518197059631, "learning_rate": 4.472796915341759e-05, - "loss": 1.2933, + "loss": 1.2935, "step": 12415 }, { "epoch": 0.8438646555238484, - "grad_norm": 0.6971281170845032, + "grad_norm": 0.6961290836334229, "learning_rate": 4.472584590297595e-05, - "loss": 1.3748, + "loss": 1.3745, "step": 12420 }, { "epoch": 0.8442043755945101, - "grad_norm": 0.8242133855819702, + "grad_norm": 0.8246265053749084, "learning_rate": 4.472372265253431e-05, - "loss": 1.3884, + "loss": 1.3886, "step": 12425 }, { "epoch": 0.8445440956651719, - "grad_norm": 0.7227784991264343, + "grad_norm": 0.7210039496421814, "learning_rate": 4.472159940209268e-05, - "loss": 1.362, + "loss": 1.3625, "step": 12430 }, { "epoch": 0.8448838157358337, - "grad_norm": 0.7591477632522583, + "grad_norm": 0.7564595341682434, "learning_rate": 4.471947615165104e-05, - "loss": 1.3163, + "loss": 1.317, "step": 12435 }, { "epoch": 0.8452235358064955, - "grad_norm": 0.7205374240875244, + "grad_norm": 0.7167272567749023, "learning_rate": 4.471735290120941e-05, - "loss": 1.2853, + "loss": 1.2851, "step": 12440 }, { "epoch": 0.8455632558771572, - "grad_norm": 0.6931034922599792, + "grad_norm": 0.694625973701477, "learning_rate": 4.471522965076777e-05, - "loss": 1.4047, + "loss": 1.4048, "step": 12445 }, { "epoch": 0.845902975947819, - "grad_norm": 0.5156599283218384, + "grad_norm": 0.5154675245285034, "learning_rate": 4.4713106400326135e-05, - "loss": 1.4269, + "loss": 1.4275, "step": 12450 }, { "epoch": 0.8462426960184808, - "grad_norm": 0.7908781170845032, + "grad_norm": 0.7953997254371643, "learning_rate": 4.47109831498845e-05, "loss": 1.4037, "step": 12455 }, { "epoch": 0.8465824160891425, - "grad_norm": 0.6748826503753662, + "grad_norm": 0.6764335036277771, "learning_rate": 4.470885989944286e-05, - "loss": 1.3054, + "loss": 1.3057, "step": 12460 }, { "epoch": 0.8469221361598043, - "grad_norm": 0.7158135771751404, + "grad_norm": 0.7150081396102905, "learning_rate": 4.470673664900123e-05, - "loss": 1.3301, + "loss": 1.3309, "step": 12465 }, { "epoch": 0.8472618562304661, - "grad_norm": 0.6487123370170593, + "grad_norm": 0.6471602916717529, "learning_rate": 4.470461339855959e-05, - "loss": 1.2662, + "loss": 1.2665, "step": 12470 }, { "epoch": 0.8476015763011279, - "grad_norm": 0.7046877145767212, + "grad_norm": 0.7071182131767273, "learning_rate": 4.4702490148117955e-05, - "loss": 1.339, + "loss": 1.3383, "step": 12475 }, { "epoch": 0.8479412963717896, - "grad_norm": 0.673567533493042, + "grad_norm": 0.6749264597892761, "learning_rate": 4.470036689767632e-05, - "loss": 1.3771, + "loss": 1.3765, "step": 12480 }, { "epoch": 0.8482810164424515, - "grad_norm": 0.7173770070075989, + "grad_norm": 0.7154066562652588, "learning_rate": 4.469824364723468e-05, - "loss": 1.4022, + "loss": 1.4025, "step": 12485 }, { "epoch": 0.8486207365131132, - "grad_norm": 0.7046067714691162, + "grad_norm": 0.7056087851524353, "learning_rate": 4.469612039679305e-05, - "loss": 1.4082, + "loss": 1.4076, "step": 12490 }, { "epoch": 0.848960456583775, - "grad_norm": 0.7364188432693481, + "grad_norm": 0.7351692914962769, "learning_rate": 4.4693997146351404e-05, - "loss": 1.3989, + "loss": 1.3986, "step": 12495 }, { "epoch": 0.8493001766544367, - "grad_norm": 0.7246367931365967, + "grad_norm": 0.7247734665870667, "learning_rate": 4.4691873895909775e-05, - "loss": 1.4032, + "loss": 1.4036, "step": 12500 }, { "epoch": 0.8496398967250985, - "grad_norm": 0.7582381963729858, + "grad_norm": 0.754787802696228, "learning_rate": 4.468975064546814e-05, - "loss": 1.2776, + "loss": 1.2772, "step": 12505 }, { "epoch": 0.8499796167957603, - "grad_norm": 0.7966914772987366, + "grad_norm": 0.7986366152763367, "learning_rate": 4.4687627395026496e-05, - "loss": 1.3539, + "loss": 1.3534, "step": 12510 }, { "epoch": 0.850319336866422, - "grad_norm": 0.5626029372215271, + "grad_norm": 0.5621582865715027, "learning_rate": 4.468550414458487e-05, - "loss": 1.3589, + "loss": 1.359, "step": 12515 }, { "epoch": 0.8506590569370839, - "grad_norm": 0.5940312147140503, + "grad_norm": 0.5939100980758667, "learning_rate": 4.468338089414323e-05, "loss": 1.298, "step": 12520 }, { "epoch": 0.8509987770077456, - "grad_norm": 0.7309895753860474, + "grad_norm": 0.705571174621582, "learning_rate": 4.468125764370159e-05, - "loss": 1.4446, + "loss": 1.4443, "step": 12525 }, { "epoch": 0.8513384970784074, - "grad_norm": 0.7690249681472778, + "grad_norm": 0.7720006108283997, "learning_rate": 4.467913439325996e-05, - "loss": 1.3459, + "loss": 1.3456, "step": 12530 }, { "epoch": 0.8516782171490692, - "grad_norm": 0.6707097887992859, + "grad_norm": 0.6723877191543579, "learning_rate": 4.467701114281832e-05, "loss": 1.3827, "step": 12535 }, { "epoch": 0.852017937219731, - "grad_norm": 0.7316380739212036, + "grad_norm": 0.7317809462547302, "learning_rate": 4.467488789237668e-05, - "loss": 1.3802, + "loss": 1.3799, "step": 12540 }, { "epoch": 0.8523576572903927, - "grad_norm": 0.6959583163261414, + "grad_norm": 0.6934905052185059, "learning_rate": 4.467276464193505e-05, - "loss": 1.3125, + "loss": 1.313, "step": 12545 }, { "epoch": 0.8526973773610544, - "grad_norm": 0.6206613183021545, + "grad_norm": 0.6207981109619141, "learning_rate": 4.4670641391493415e-05, - "loss": 1.3474, + "loss": 1.3478, "step": 12550 }, { "epoch": 0.8530370974317163, - "grad_norm": 0.683360755443573, + "grad_norm": 0.6838963031768799, "learning_rate": 4.466851814105177e-05, - "loss": 1.3881, + "loss": 1.3878, "step": 12555 }, { "epoch": 0.853376817502378, - "grad_norm": 0.5920791029930115, + "grad_norm": 0.5934872031211853, "learning_rate": 4.466639489061014e-05, - "loss": 1.3983, + "loss": 1.3977, "step": 12560 }, { "epoch": 0.8537165375730398, - "grad_norm": 0.7246683239936829, + "grad_norm": 0.7263369560241699, "learning_rate": 4.46642716401685e-05, "loss": 1.3337, "step": 12565 }, { "epoch": 0.8540562576437016, - "grad_norm": 0.6815592646598816, + "grad_norm": 0.6849284172058105, "learning_rate": 4.4662148389726864e-05, - "loss": 1.4058, + "loss": 1.4062, "step": 12570 }, { "epoch": 0.8543959777143634, - "grad_norm": 0.6942914724349976, + "grad_norm": 0.6925882697105408, "learning_rate": 4.4660025139285235e-05, - "loss": 1.3565, + "loss": 1.3555, "step": 12575 }, { "epoch": 0.8547356977850251, - "grad_norm": 0.7076385021209717, + "grad_norm": 0.7062354683876038, "learning_rate": 4.465790188884359e-05, - "loss": 1.214, + "loss": 1.2144, "step": 12580 }, { "epoch": 0.8550754178556869, - "grad_norm": 0.714252769947052, + "grad_norm": 0.7143131494522095, "learning_rate": 4.4655778638401956e-05, "loss": 1.3316, "step": 12585 }, { "epoch": 0.8554151379263487, - "grad_norm": 0.629565417766571, + "grad_norm": 0.6309350728988647, "learning_rate": 4.465365538796033e-05, - "loss": 1.3956, + "loss": 1.395, "step": 12590 }, { "epoch": 0.8557548579970105, - "grad_norm": 0.7133734822273254, + "grad_norm": 0.7168522477149963, "learning_rate": 4.4651532137518684e-05, - "loss": 1.4328, + "loss": 1.4331, "step": 12595 }, { "epoch": 0.8560945780676722, - "grad_norm": 0.556172788143158, + "grad_norm": 0.5570439100265503, "learning_rate": 4.464940888707705e-05, - "loss": 1.3339, + "loss": 1.3343, "step": 12600 }, { "epoch": 0.856434298138334, - "grad_norm": 0.7105357050895691, + "grad_norm": 0.7075490355491638, "learning_rate": 4.464728563663542e-05, - "loss": 1.3449, + "loss": 1.3443, "step": 12605 }, { "epoch": 0.8567740182089958, - "grad_norm": 0.6951549649238586, + "grad_norm": 0.6956871747970581, "learning_rate": 4.4645162386193776e-05, - "loss": 1.3716, + "loss": 1.3718, "step": 12610 }, { "epoch": 0.8571137382796575, - "grad_norm": 0.6994973421096802, + "grad_norm": 0.698732316493988, "learning_rate": 4.464303913575214e-05, - "loss": 1.3174, + "loss": 1.3178, "step": 12615 }, { "epoch": 0.8574534583503194, - "grad_norm": 0.6157001256942749, + "grad_norm": 0.6176219582557678, "learning_rate": 4.464091588531051e-05, - "loss": 1.3073, + "loss": 1.3071, "step": 12620 }, { "epoch": 0.8577931784209811, - "grad_norm": 0.7500584721565247, + "grad_norm": 0.7516471147537231, "learning_rate": 4.463879263486887e-05, - "loss": 1.3058, + "loss": 1.3055, "step": 12625 }, { "epoch": 0.8581328984916429, - "grad_norm": 0.6024318337440491, + "grad_norm": 0.6056381464004517, "learning_rate": 4.463666938442723e-05, "loss": 1.4033, "step": 12630 }, { "epoch": 0.8584726185623046, - "grad_norm": 0.619443953037262, + "grad_norm": 0.6195839643478394, "learning_rate": 4.46345461339856e-05, - "loss": 1.4824, + "loss": 1.4831, "step": 12635 }, { "epoch": 0.8588123386329665, - "grad_norm": 0.7248201966285706, + "grad_norm": 0.7258051633834839, "learning_rate": 4.463242288354396e-05, "loss": 1.4398, "step": 12640 }, { "epoch": 0.8591520587036282, - "grad_norm": 0.631622850894928, + "grad_norm": 0.6330260038375854, "learning_rate": 4.4630299633102324e-05, - "loss": 1.4092, + "loss": 1.4098, "step": 12645 }, { "epoch": 0.85949177877429, - "grad_norm": 0.6558489799499512, + "grad_norm": 0.6575252413749695, "learning_rate": 4.462817638266069e-05, - "loss": 1.3589, + "loss": 1.3591, "step": 12650 }, { "epoch": 0.8598314988449518, - "grad_norm": 0.7223998308181763, + "grad_norm": 0.7204355001449585, "learning_rate": 4.462605313221905e-05, - "loss": 1.4554, + "loss": 1.4556, "step": 12655 }, { "epoch": 0.8601712189156135, - "grad_norm": 0.7063769102096558, + "grad_norm": 0.7091907262802124, "learning_rate": 4.4623929881777416e-05, - "loss": 1.403, + "loss": 1.4027, "step": 12660 }, { "epoch": 0.8605109389862753, - "grad_norm": 0.6834850311279297, + "grad_norm": 0.6831023097038269, "learning_rate": 4.462180663133578e-05, "loss": 1.34, "step": 12665 }, { "epoch": 0.860850659056937, - "grad_norm": 0.6297242641448975, + "grad_norm": 0.6293261647224426, "learning_rate": 4.4619683380894144e-05, - "loss": 1.2888, + "loss": 1.2887, "step": 12670 }, { "epoch": 0.8611903791275989, - "grad_norm": 0.6202226877212524, + "grad_norm": 0.6196449398994446, "learning_rate": 4.461756013045251e-05, - "loss": 1.433, + "loss": 1.4328, "step": 12675 }, { "epoch": 0.8615300991982606, - "grad_norm": 0.7646043300628662, + "grad_norm": 0.7638828754425049, "learning_rate": 4.461543688001087e-05, - "loss": 1.3271, + "loss": 1.3272, "step": 12680 }, { "epoch": 0.8618698192689224, - "grad_norm": 0.6283760070800781, + "grad_norm": 0.6317503452301025, "learning_rate": 4.4613313629569236e-05, - "loss": 1.4297, + "loss": 1.43, "step": 12685 }, { "epoch": 0.8622095393395842, - "grad_norm": 0.7506958842277527, + "grad_norm": 0.7556169033050537, "learning_rate": 4.46111903791276e-05, - "loss": 1.3527, + "loss": 1.3531, "step": 12690 }, { "epoch": 0.862549259410246, - "grad_norm": 0.709775984287262, + "grad_norm": 0.7089202404022217, "learning_rate": 4.4609067128685964e-05, - "loss": 1.3299, + "loss": 1.3305, "step": 12695 }, { "epoch": 0.8628889794809077, - "grad_norm": 0.8392066359519958, + "grad_norm": 0.8301008939743042, "learning_rate": 4.460694387824433e-05, - "loss": 1.3447, + "loss": 1.3444, "step": 12700 }, { "epoch": 0.8632286995515696, - "grad_norm": 0.7184649109840393, + "grad_norm": 0.7169464230537415, "learning_rate": 4.460482062780269e-05, - "loss": 1.3057, + "loss": 1.3054, "step": 12705 }, { "epoch": 0.8635684196222313, - "grad_norm": 0.7541151642799377, + "grad_norm": 0.7545297145843506, "learning_rate": 4.4602697377361056e-05, - "loss": 1.3778, + "loss": 1.3776, "step": 12710 }, { "epoch": 0.863908139692893, - "grad_norm": 0.6973274350166321, + "grad_norm": 0.6988163590431213, "learning_rate": 4.460057412691942e-05, - "loss": 1.3023, + "loss": 1.3025, "step": 12715 }, { "epoch": 0.8642478597635548, - "grad_norm": 0.5811592936515808, + "grad_norm": 0.5839446187019348, "learning_rate": 4.4598450876477784e-05, - "loss": 1.4126, + "loss": 1.4127, "step": 12720 }, { "epoch": 0.8645875798342166, - "grad_norm": 0.8044971823692322, + "grad_norm": 0.7998316884040833, "learning_rate": 4.459632762603615e-05, - "loss": 1.3891, + "loss": 1.3894, "step": 12725 }, { "epoch": 0.8649272999048784, - "grad_norm": 0.6741278171539307, + "grad_norm": 0.6778241395950317, "learning_rate": 4.459420437559451e-05, - "loss": 1.4562, + "loss": 1.4568, "step": 12730 }, { "epoch": 0.8652670199755401, - "grad_norm": 0.6165005564689636, + "grad_norm": 0.6163250207901001, "learning_rate": 4.4592081125152876e-05, - "loss": 1.4003, + "loss": 1.4013, "step": 12735 }, { "epoch": 0.865606740046202, - "grad_norm": 0.7122872471809387, + "grad_norm": 0.7056950330734253, "learning_rate": 4.458995787471124e-05, - "loss": 1.4728, + "loss": 1.4731, "step": 12740 }, { "epoch": 0.8659464601168637, - "grad_norm": 0.7181335687637329, + "grad_norm": 0.7195923328399658, "learning_rate": 4.4587834624269604e-05, - "loss": 1.4214, + "loss": 1.4209, "step": 12745 }, { "epoch": 0.8662861801875255, - "grad_norm": 0.5936729907989502, + "grad_norm": 0.5925621390342712, "learning_rate": 4.458571137382797e-05, - "loss": 1.3632, + "loss": 1.3635, "step": 12750 }, { "epoch": 0.8666259002581872, - "grad_norm": 0.6827543377876282, + "grad_norm": 0.687329888343811, "learning_rate": 4.458358812338633e-05, "loss": 1.3793, "step": 12755 }, { "epoch": 0.866965620328849, - "grad_norm": 0.7110022306442261, + "grad_norm": 0.7102918028831482, "learning_rate": 4.4581464872944696e-05, - "loss": 1.381, + "loss": 1.3811, "step": 12760 }, { "epoch": 0.8673053403995108, - "grad_norm": 0.6814020872116089, + "grad_norm": 0.6777439117431641, "learning_rate": 4.4579341622503054e-05, - "loss": 1.3126, + "loss": 1.3122, "step": 12765 }, { "epoch": 0.8676450604701725, - "grad_norm": 0.7111496925354004, + "grad_norm": 0.7026997208595276, "learning_rate": 4.4577218372061424e-05, - "loss": 1.2872, + "loss": 1.2871, "step": 12770 }, { "epoch": 0.8679847805408344, - "grad_norm": 0.6328551173210144, + "grad_norm": 0.6339812874794006, "learning_rate": 4.457509512161979e-05, - "loss": 1.3097, + "loss": 1.3092, "step": 12775 }, { "epoch": 0.8683245006114961, - "grad_norm": 0.7361866235733032, + "grad_norm": 0.7375497221946716, "learning_rate": 4.457297187117815e-05, - "loss": 1.3533, + "loss": 1.3539, "step": 12780 }, { "epoch": 0.8686642206821579, - "grad_norm": 0.6520326733589172, + "grad_norm": 0.6528058648109436, "learning_rate": 4.4570848620736516e-05, - "loss": 1.4271, + "loss": 1.4264, "step": 12785 }, { "epoch": 0.8690039407528197, - "grad_norm": 0.7593867182731628, + "grad_norm": 0.757419228553772, "learning_rate": 4.456872537029488e-05, - "loss": 1.3221, + "loss": 1.3224, "step": 12790 }, { "epoch": 0.8693436608234815, - "grad_norm": 0.6264833211898804, + "grad_norm": 0.6312508583068848, "learning_rate": 4.4566602119853244e-05, - "loss": 1.3859, + "loss": 1.3854, "step": 12795 }, { "epoch": 0.8696833808941432, - "grad_norm": 0.5434083938598633, + "grad_norm": 0.5474633574485779, "learning_rate": 4.456447886941161e-05, - "loss": 1.3825, + "loss": 1.3837, "step": 12800 }, { "epoch": 0.870023100964805, - "grad_norm": 0.583276629447937, + "grad_norm": 0.582330584526062, "learning_rate": 4.456235561896997e-05, - "loss": 1.3597, + "loss": 1.3594, "step": 12805 }, { "epoch": 0.8703628210354668, - "grad_norm": 0.65021812915802, + "grad_norm": 0.6532167792320251, "learning_rate": 4.4560232368528336e-05, - "loss": 1.4308, + "loss": 1.431, "step": 12810 }, { "epoch": 0.8707025411061285, - "grad_norm": 0.7106450200080872, + "grad_norm": 0.7104457020759583, "learning_rate": 4.45581091180867e-05, - "loss": 1.4236, + "loss": 1.4229, "step": 12815 }, { "epoch": 0.8710422611767903, - "grad_norm": 0.6209539771080017, + "grad_norm": 0.6242319941520691, "learning_rate": 4.4555985867645064e-05, - "loss": 1.3181, + "loss": 1.3179, "step": 12820 }, { "epoch": 0.8713819812474521, - "grad_norm": 0.6393656730651855, + "grad_norm": 0.6393834352493286, "learning_rate": 4.455386261720343e-05, - "loss": 1.4685, + "loss": 1.4682, "step": 12825 }, { "epoch": 0.8717217013181139, - "grad_norm": 0.7080166935920715, + "grad_norm": 0.7054137587547302, "learning_rate": 4.455173936676179e-05, - "loss": 1.2752, + "loss": 1.2759, "step": 12830 }, { "epoch": 0.8720614213887756, - "grad_norm": 0.6802095770835876, + "grad_norm": 0.6701720952987671, "learning_rate": 4.4549616116320156e-05, - "loss": 1.3952, + "loss": 1.3955, "step": 12835 }, { "epoch": 0.8724011414594374, - "grad_norm": 0.6535228490829468, + "grad_norm": 0.6506157517433167, "learning_rate": 4.454749286587852e-05, - "loss": 1.3435, + "loss": 1.3437, "step": 12840 }, { "epoch": 0.8727408615300992, - "grad_norm": 0.8674468398094177, + "grad_norm": 0.8501390218734741, "learning_rate": 4.4545369615436884e-05, - "loss": 1.3229, + "loss": 1.3226, "step": 12845 }, { "epoch": 0.873080581600761, - "grad_norm": 0.6909651756286621, + "grad_norm": 0.6892210841178894, "learning_rate": 4.454324636499524e-05, - "loss": 1.4718, + "loss": 1.4715, "step": 12850 }, { "epoch": 0.8734203016714227, - "grad_norm": 0.6655237078666687, + "grad_norm": 0.6669777035713196, "learning_rate": 4.454112311455361e-05, - "loss": 1.3768, + "loss": 1.3763, "step": 12855 }, { "epoch": 0.8737600217420846, - "grad_norm": 0.6951791048049927, + "grad_norm": 0.7042840719223022, "learning_rate": 4.4538999864111976e-05, - "loss": 1.2887, + "loss": 1.2889, "step": 12860 }, { "epoch": 0.8740997418127463, - "grad_norm": 0.676295816898346, + "grad_norm": 0.6754158735275269, "learning_rate": 4.4536876613670334e-05, - "loss": 1.3273, + "loss": 1.3275, "step": 12865 }, { "epoch": 0.874439461883408, - "grad_norm": 0.7393082976341248, + "grad_norm": 0.7350449562072754, "learning_rate": 4.4534753363228704e-05, - "loss": 1.1891, + "loss": 1.1897, "step": 12870 }, { "epoch": 0.8747791819540699, - "grad_norm": 0.6166176199913025, + "grad_norm": 0.6162439584732056, "learning_rate": 4.453263011278707e-05, - "loss": 1.3926, + "loss": 1.3925, "step": 12875 }, { "epoch": 0.8751189020247316, - "grad_norm": 0.7175416350364685, + "grad_norm": 0.7142587304115295, "learning_rate": 4.4530506862345426e-05, - "loss": 1.3168, + "loss": 1.3173, "step": 12880 }, { "epoch": 0.8754586220953934, - "grad_norm": 0.665534496307373, + "grad_norm": 0.6636133790016174, "learning_rate": 4.4528383611903796e-05, - "loss": 1.3882, + "loss": 1.3877, "step": 12885 }, { "epoch": 0.8757983421660551, - "grad_norm": 0.6802185773849487, + "grad_norm": 0.6794610619544983, "learning_rate": 4.452626036146216e-05, - "loss": 1.366, + "loss": 1.3665, "step": 12890 }, { "epoch": 0.876138062236717, - "grad_norm": 0.6779190897941589, + "grad_norm": 0.6782474517822266, "learning_rate": 4.452413711102052e-05, - "loss": 1.3629, + "loss": 1.3636, "step": 12895 }, { "epoch": 0.8764777823073787, - "grad_norm": 0.622493326663971, + "grad_norm": 0.6222137808799744, "learning_rate": 4.452201386057889e-05, - "loss": 1.4485, + "loss": 1.4481, "step": 12900 }, { "epoch": 0.8768175023780405, - "grad_norm": 0.7519457340240479, + "grad_norm": 0.7548596262931824, "learning_rate": 4.451989061013725e-05, - "loss": 1.3622, + "loss": 1.3627, "step": 12905 }, { "epoch": 0.8771572224487023, - "grad_norm": 0.6253534555435181, + "grad_norm": 0.6249526739120483, "learning_rate": 4.451776735969561e-05, - "loss": 1.3836, + "loss": 1.3833, "step": 12910 }, { "epoch": 0.877496942519364, - "grad_norm": 0.6690431833267212, + "grad_norm": 0.6661779284477234, "learning_rate": 4.451564410925398e-05, - "loss": 1.3331, + "loss": 1.3333, "step": 12915 }, { "epoch": 0.8778366625900258, - "grad_norm": 0.6430015563964844, + "grad_norm": 0.6423152685165405, "learning_rate": 4.451352085881234e-05, - "loss": 1.3918, + "loss": 1.3921, "step": 12920 }, { "epoch": 0.8781763826606876, - "grad_norm": 0.6269593834877014, + "grad_norm": 0.6272457838058472, "learning_rate": 4.45113976083707e-05, "loss": 1.2831, "step": 12925 }, { "epoch": 0.8785161027313494, - "grad_norm": 0.6534284353256226, + "grad_norm": 0.6506603956222534, "learning_rate": 4.450927435792907e-05, - "loss": 1.329, + "loss": 1.3294, "step": 12930 }, { "epoch": 0.8788558228020111, - "grad_norm": 0.685684859752655, + "grad_norm": 0.6870989203453064, "learning_rate": 4.450715110748743e-05, - "loss": 1.3478, + "loss": 1.3479, "step": 12935 }, { "epoch": 0.8791955428726729, - "grad_norm": 0.7119500041007996, + "grad_norm": 0.7044215202331543, "learning_rate": 4.4505027857045794e-05, - "loss": 1.2784, + "loss": 1.2785, "step": 12940 }, { "epoch": 0.8795352629433347, - "grad_norm": 0.6209441423416138, + "grad_norm": 0.6179230809211731, "learning_rate": 4.4502904606604164e-05, - "loss": 1.3552, + "loss": 1.3551, "step": 12945 }, { "epoch": 0.8798749830139965, - "grad_norm": 0.7009634375572205, + "grad_norm": 0.6969861388206482, "learning_rate": 4.450078135616252e-05, - "loss": 1.305, + "loss": 1.3044, "step": 12950 }, { "epoch": 0.8802147030846582, - "grad_norm": 0.6846415996551514, + "grad_norm": 0.6904934048652649, "learning_rate": 4.4498658105720886e-05, - "loss": 1.343, + "loss": 1.3433, "step": 12955 }, { "epoch": 0.8805544231553201, - "grad_norm": 0.7048271894454956, + "grad_norm": 0.7056502103805542, "learning_rate": 4.4496534855279256e-05, - "loss": 1.3897, + "loss": 1.3904, "step": 12960 }, { "epoch": 0.8808941432259818, - "grad_norm": 0.6972647905349731, + "grad_norm": 0.6928659081459045, "learning_rate": 4.4494411604837614e-05, - "loss": 1.3743, + "loss": 1.3737, "step": 12965 }, { "epoch": 0.8812338632966435, - "grad_norm": 0.6757969856262207, + "grad_norm": 0.6760026812553406, "learning_rate": 4.449228835439598e-05, - "loss": 1.3773, + "loss": 1.3776, "step": 12970 }, { "epoch": 0.8815735833673053, - "grad_norm": 0.6842308044433594, + "grad_norm": 0.6865389943122864, "learning_rate": 4.449016510395435e-05, - "loss": 1.3798, + "loss": 1.3789, "step": 12975 }, { "epoch": 0.8819133034379671, - "grad_norm": 0.6644297242164612, + "grad_norm": 0.6639211773872375, "learning_rate": 4.4488041853512706e-05, - "loss": 1.3698, + "loss": 1.3695, "step": 12980 }, { "epoch": 0.8822530235086289, - "grad_norm": 0.7060693502426147, + "grad_norm": 0.7143428921699524, "learning_rate": 4.448591860307107e-05, "loss": 1.3364, "step": 12985 }, { "epoch": 0.8825927435792906, - "grad_norm": 0.767492413520813, + "grad_norm": 0.7689117789268494, "learning_rate": 4.448379535262944e-05, - "loss": 1.4915, + "loss": 1.4913, "step": 12990 }, { "epoch": 0.8829324636499525, - "grad_norm": 0.6986490488052368, + "grad_norm": 0.6992553472518921, "learning_rate": 4.44816721021878e-05, - "loss": 1.356, + "loss": 1.3561, "step": 12995 }, { "epoch": 0.8832721837206142, - "grad_norm": 0.8189640641212463, + "grad_norm": 0.815887451171875, "learning_rate": 4.447954885174616e-05, - "loss": 1.3555, + "loss": 1.3554, "step": 13000 }, { "epoch": 0.883611903791276, - "grad_norm": 0.822814404964447, + "grad_norm": 0.8343554139137268, "learning_rate": 4.4477425601304526e-05, - "loss": 1.4497, + "loss": 1.4508, "step": 13005 }, { "epoch": 0.8839516238619378, - "grad_norm": 0.7144615650177002, + "grad_norm": 0.7152528762817383, "learning_rate": 4.447530235086289e-05, - "loss": 1.3439, + "loss": 1.3438, "step": 13010 }, { "epoch": 0.8842913439325996, - "grad_norm": 0.6712161302566528, + "grad_norm": 0.6673587560653687, "learning_rate": 4.4473179100421254e-05, - "loss": 1.4226, + "loss": 1.4224, "step": 13015 }, { "epoch": 0.8846310640032613, - "grad_norm": 0.6807404160499573, + "grad_norm": 0.6807577013969421, "learning_rate": 4.447105584997962e-05, - "loss": 1.3964, + "loss": 1.3973, "step": 13020 }, { "epoch": 0.884970784073923, - "grad_norm": 0.6526500582695007, + "grad_norm": 0.6526590585708618, "learning_rate": 4.446893259953798e-05, - "loss": 1.257, + "loss": 1.2576, "step": 13025 }, { "epoch": 0.8853105041445849, - "grad_norm": 0.791623055934906, + "grad_norm": 0.7920680642127991, "learning_rate": 4.4466809349096346e-05, - "loss": 1.3861, + "loss": 1.3859, "step": 13030 }, { "epoch": 0.8856502242152466, - "grad_norm": 0.7261638045310974, + "grad_norm": 0.723523736000061, "learning_rate": 4.446468609865471e-05, - "loss": 1.3642, + "loss": 1.3635, "step": 13035 }, { "epoch": 0.8859899442859084, - "grad_norm": 0.7141435742378235, + "grad_norm": 0.712665319442749, "learning_rate": 4.4462562848213074e-05, - "loss": 1.3801, + "loss": 1.3803, "step": 13040 }, { "epoch": 0.8863296643565702, - "grad_norm": 0.6886386275291443, + "grad_norm": 0.6879522800445557, "learning_rate": 4.446043959777144e-05, - "loss": 1.441, + "loss": 1.4413, "step": 13045 }, { "epoch": 0.886669384427232, - "grad_norm": 0.6873751282691956, + "grad_norm": 0.6967764496803284, "learning_rate": 4.44583163473298e-05, - "loss": 1.3655, + "loss": 1.3658, "step": 13050 }, { "epoch": 0.8870091044978937, - "grad_norm": 0.7220982909202576, + "grad_norm": 0.7210564017295837, "learning_rate": 4.4456193096888166e-05, - "loss": 1.3556, + "loss": 1.3562, "step": 13055 }, { "epoch": 0.8873488245685555, - "grad_norm": 0.6405097842216492, + "grad_norm": 0.6411513090133667, "learning_rate": 4.445406984644653e-05, - "loss": 1.3033, + "loss": 1.3032, "step": 13060 }, { "epoch": 0.8876885446392173, - "grad_norm": 0.6902096271514893, + "grad_norm": 0.6906110644340515, "learning_rate": 4.4451946596004894e-05, "loss": 1.6509, "step": 13065 }, { "epoch": 0.8880282647098791, - "grad_norm": 0.8238720893859863, + "grad_norm": 0.8249316215515137, "learning_rate": 4.444982334556326e-05, - "loss": 1.3401, + "loss": 1.3406, "step": 13070 }, { "epoch": 0.8883679847805408, - "grad_norm": 0.7122278809547424, + "grad_norm": 0.7104088068008423, "learning_rate": 4.444770009512162e-05, - "loss": 1.4091, + "loss": 1.409, "step": 13075 }, { "epoch": 0.8887077048512027, - "grad_norm": 0.6530733108520508, + "grad_norm": 0.6525675058364868, "learning_rate": 4.4445576844679986e-05, - "loss": 1.3942, + "loss": 1.394, "step": 13080 }, { "epoch": 0.8890474249218644, - "grad_norm": 0.7043144702911377, + "grad_norm": 0.7093170881271362, "learning_rate": 4.444345359423835e-05, - "loss": 1.4006, + "loss": 1.4013, "step": 13085 }, { "epoch": 0.8893871449925261, - "grad_norm": 0.6719483137130737, + "grad_norm": 0.6713474988937378, "learning_rate": 4.4441330343796714e-05, - "loss": 1.3808, + "loss": 1.3812, "step": 13090 }, { "epoch": 0.889726865063188, - "grad_norm": 0.6334452033042908, + "grad_norm": 0.6353588700294495, "learning_rate": 4.443920709335508e-05, - "loss": 1.4171, + "loss": 1.4173, "step": 13095 }, { "epoch": 0.8900665851338497, - "grad_norm": 0.686010479927063, + "grad_norm": 0.6828039884567261, "learning_rate": 4.443708384291344e-05, - "loss": 1.3543, + "loss": 1.3548, "step": 13100 }, { "epoch": 0.8904063052045115, - "grad_norm": 0.6849266290664673, + "grad_norm": 0.6836593747138977, "learning_rate": 4.4434960592471806e-05, - "loss": 1.3469, + "loss": 1.3465, "step": 13105 }, { "epoch": 0.8907460252751732, - "grad_norm": 0.6323592066764832, + "grad_norm": 0.6327558159828186, "learning_rate": 4.443283734203017e-05, - "loss": 1.3562, + "loss": 1.3557, "step": 13110 }, { "epoch": 0.8910857453458351, - "grad_norm": 0.7662525773048401, + "grad_norm": 0.8014131188392639, "learning_rate": 4.4430714091588534e-05, - "loss": 1.3266, + "loss": 1.3276, "step": 13115 }, { "epoch": 0.8914254654164968, - "grad_norm": 0.6142858266830444, + "grad_norm": 0.6161994338035583, "learning_rate": 4.44285908411469e-05, - "loss": 1.3438, + "loss": 1.344, "step": 13120 }, { "epoch": 0.8917651854871586, - "grad_norm": 0.6289658546447754, + "grad_norm": 0.6258159279823303, "learning_rate": 4.442646759070526e-05, - "loss": 1.2906, + "loss": 1.2909, "step": 13125 }, { "epoch": 0.8921049055578204, - "grad_norm": 0.7273269295692444, + "grad_norm": 0.7282395362854004, "learning_rate": 4.4424344340263626e-05, "loss": 1.3285, "step": 13130 }, { "epoch": 0.8924446256284821, - "grad_norm": 0.7088006734848022, + "grad_norm": 0.7108657360076904, "learning_rate": 4.442222108982199e-05, - "loss": 1.2829, + "loss": 1.2822, "step": 13135 }, { "epoch": 0.8927843456991439, - "grad_norm": 0.663790762424469, + "grad_norm": 0.6663039922714233, "learning_rate": 4.4420097839380354e-05, - "loss": 1.3886, + "loss": 1.3881, "step": 13140 }, { "epoch": 0.8931240657698056, - "grad_norm": 0.5893279314041138, + "grad_norm": 0.5917723178863525, "learning_rate": 4.441797458893872e-05, - "loss": 1.3726, + "loss": 1.3724, "step": 13145 }, { "epoch": 0.8934637858404675, - "grad_norm": 0.6438887119293213, + "grad_norm": 0.6445760726928711, "learning_rate": 4.441585133849708e-05, - "loss": 1.4389, + "loss": 1.4399, "step": 13150 }, { "epoch": 0.8938035059111292, - "grad_norm": 0.6501761674880981, + "grad_norm": 0.6504111289978027, "learning_rate": 4.4413728088055446e-05, - "loss": 1.4758, + "loss": 1.4754, "step": 13155 }, { "epoch": 0.894143225981791, - "grad_norm": 0.693305253982544, + "grad_norm": 0.6911579966545105, "learning_rate": 4.441160483761381e-05, - "loss": 1.336, + "loss": 1.3363, "step": 13160 }, { "epoch": 0.8944829460524528, - "grad_norm": 0.641102135181427, + "grad_norm": 0.6377180218696594, "learning_rate": 4.4409481587172174e-05, - "loss": 1.2956, + "loss": 1.2945, "step": 13165 }, { "epoch": 0.8948226661231146, - "grad_norm": 0.6698890328407288, + "grad_norm": 0.6682319641113281, "learning_rate": 4.440735833673054e-05, "loss": 1.3542, "step": 13170 }, { "epoch": 0.8951623861937763, - "grad_norm": 0.6819259524345398, + "grad_norm": 0.6849210858345032, "learning_rate": 4.44052350862889e-05, - "loss": 1.4348, + "loss": 1.4342, "step": 13175 }, { "epoch": 0.8955021062644382, - "grad_norm": 0.7356864809989929, + "grad_norm": 0.7349604368209839, "learning_rate": 4.4403111835847266e-05, - "loss": 1.4219, + "loss": 1.4223, "step": 13180 }, { "epoch": 0.8958418263350999, - "grad_norm": 0.7078156471252441, + "grad_norm": 0.7068150043487549, "learning_rate": 4.440098858540563e-05, - "loss": 1.4111, + "loss": 1.4116, "step": 13185 }, { "epoch": 0.8961815464057616, - "grad_norm": 0.6777173280715942, + "grad_norm": 0.6776065826416016, "learning_rate": 4.4398865334963994e-05, - "loss": 1.3411, + "loss": 1.3416, "step": 13190 }, { "epoch": 0.8965212664764234, - "grad_norm": 0.6705132722854614, + "grad_norm": 0.6758216023445129, "learning_rate": 4.439674208452236e-05, - "loss": 1.337, + "loss": 1.3374, "step": 13195 }, { "epoch": 0.8968609865470852, - "grad_norm": 0.6225191950798035, + "grad_norm": 0.6204516887664795, "learning_rate": 4.439461883408072e-05, - "loss": 1.3543, + "loss": 1.3539, "step": 13200 }, { "epoch": 0.897200706617747, - "grad_norm": 0.7024900913238525, + "grad_norm": 0.7022368907928467, "learning_rate": 4.439249558363908e-05, - "loss": 1.3405, + "loss": 1.3408, "step": 13205 }, { "epoch": 0.8975404266884087, - "grad_norm": 0.6447491645812988, + "grad_norm": 0.6432115435600281, "learning_rate": 4.439037233319745e-05, - "loss": 1.2934, + "loss": 1.2933, "step": 13210 }, { "epoch": 0.8978801467590706, - "grad_norm": 0.6816796660423279, + "grad_norm": 0.678573727607727, "learning_rate": 4.4388249082755814e-05, - "loss": 1.2964, + "loss": 1.2961, "step": 13215 }, { "epoch": 0.8982198668297323, - "grad_norm": 0.7214002013206482, + "grad_norm": 0.7202540040016174, "learning_rate": 4.438612583231417e-05, - "loss": 1.3165, + "loss": 1.3164, "step": 13220 }, { "epoch": 0.8985595869003941, - "grad_norm": 0.7090854048728943, + "grad_norm": 0.7087196707725525, "learning_rate": 4.438400258187254e-05, - "loss": 1.3587, + "loss": 1.3584, "step": 13225 }, { "epoch": 0.8988993069710558, - "grad_norm": 0.5795954465866089, + "grad_norm": 0.5790719985961914, "learning_rate": 4.4381879331430906e-05, - "loss": 1.3832, + "loss": 1.3835, "step": 13230 }, { "epoch": 0.8992390270417177, - "grad_norm": 0.7099621295928955, + "grad_norm": 0.7106484174728394, "learning_rate": 4.437975608098926e-05, - "loss": 1.3428, + "loss": 1.3431, "step": 13235 }, { "epoch": 0.8995787471123794, - "grad_norm": 0.6782974600791931, + "grad_norm": 0.6797402501106262, "learning_rate": 4.4377632830547634e-05, - "loss": 1.369, + "loss": 1.3687, "step": 13240 }, { "epoch": 0.8999184671830411, - "grad_norm": 0.6810418367385864, + "grad_norm": 0.6789610981941223, "learning_rate": 4.4375509580106e-05, - "loss": 1.3566, + "loss": 1.3561, "step": 13245 }, { "epoch": 0.900258187253703, - "grad_norm": 0.7035811543464661, + "grad_norm": 0.7008369565010071, "learning_rate": 4.4373386329664355e-05, - "loss": 1.3677, + "loss": 1.3671, "step": 13250 }, { "epoch": 0.9005979073243647, - "grad_norm": 0.7452874779701233, + "grad_norm": 0.6668129563331604, "learning_rate": 4.4371263079222726e-05, - "loss": 1.3218, + "loss": 1.3216, "step": 13255 }, { "epoch": 0.9009376273950265, - "grad_norm": 0.6274643540382385, + "grad_norm": 0.6269006729125977, "learning_rate": 4.436913982878109e-05, "loss": 1.3924, "step": 13260 }, { "epoch": 0.9012773474656883, - "grad_norm": 0.6658132672309875, + "grad_norm": 0.6719803810119629, "learning_rate": 4.436701657833945e-05, - "loss": 1.3177, + "loss": 1.3174, "step": 13265 }, { "epoch": 0.9016170675363501, - "grad_norm": 0.7131541967391968, + "grad_norm": 0.7149995565414429, "learning_rate": 4.436489332789782e-05, - "loss": 1.3075, + "loss": 1.3072, "step": 13270 }, { "epoch": 0.9019567876070118, - "grad_norm": 0.6533933877944946, + "grad_norm": 0.6575379371643066, "learning_rate": 4.4362770077456175e-05, - "loss": 1.3699, + "loss": 1.3698, "step": 13275 }, { "epoch": 0.9022965076776736, - "grad_norm": 0.7328100800514221, + "grad_norm": 0.7339080572128296, "learning_rate": 4.436064682701454e-05, - "loss": 1.3425, + "loss": 1.3437, "step": 13280 }, { "epoch": 0.9026362277483354, - "grad_norm": 0.6254872679710388, + "grad_norm": 0.6248759031295776, "learning_rate": 4.435852357657291e-05, "loss": 1.3822, "step": 13285 }, { "epoch": 0.9029759478189971, - "grad_norm": 0.6984084844589233, + "grad_norm": 0.6994850635528564, "learning_rate": 4.435640032613127e-05, - "loss": 1.3947, + "loss": 1.3952, "step": 13290 }, { "epoch": 0.9033156678896589, - "grad_norm": 0.5597203969955444, + "grad_norm": 0.560857355594635, "learning_rate": 4.435427707568963e-05, - "loss": 1.3589, + "loss": 1.3591, "step": 13295 }, { "epoch": 0.9036553879603207, - "grad_norm": 0.668630838394165, + "grad_norm": 0.6680623888969421, "learning_rate": 4.4352153825248e-05, - "loss": 1.4675, + "loss": 1.4678, "step": 13300 }, { "epoch": 0.9039951080309825, - "grad_norm": 0.7078883647918701, + "grad_norm": 0.7115943431854248, "learning_rate": 4.435003057480636e-05, - "loss": 1.3876, + "loss": 1.3873, "step": 13305 }, { "epoch": 0.9043348281016442, - "grad_norm": 0.6608153581619263, + "grad_norm": 0.6609895825386047, "learning_rate": 4.434790732436472e-05, - "loss": 1.4429, + "loss": 1.4431, "step": 13310 }, { "epoch": 0.904674548172306, - "grad_norm": 0.6872571706771851, + "grad_norm": 0.6817240715026855, "learning_rate": 4.4345784073923094e-05, - "loss": 1.4909, + "loss": 1.4905, "step": 13315 }, { "epoch": 0.9050142682429678, - "grad_norm": 0.7475165724754333, + "grad_norm": 0.7412055730819702, "learning_rate": 4.434366082348145e-05, - "loss": 1.3526, + "loss": 1.3527, "step": 13320 }, { "epoch": 0.9053539883136296, - "grad_norm": 0.7566111087799072, + "grad_norm": 0.7256126999855042, "learning_rate": 4.4341537573039815e-05, - "loss": 1.2801, + "loss": 1.2794, "step": 13325 }, { "epoch": 0.9056937083842913, - "grad_norm": 0.6893516778945923, + "grad_norm": 0.667883038520813, "learning_rate": 4.4339414322598186e-05, - "loss": 1.3676, + "loss": 1.3678, "step": 13330 }, { "epoch": 0.9060334284549532, - "grad_norm": 0.7658765912055969, + "grad_norm": 0.7668097019195557, "learning_rate": 4.433729107215654e-05, - "loss": 1.3154, + "loss": 1.3151, "step": 13335 }, { "epoch": 0.9063731485256149, - "grad_norm": 0.6320791840553284, + "grad_norm": 0.5999295711517334, "learning_rate": 4.433516782171491e-05, - "loss": 1.3613, + "loss": 1.361, "step": 13340 }, { "epoch": 0.9067128685962766, - "grad_norm": 0.7179452180862427, + "grad_norm": 0.7142606377601624, "learning_rate": 4.433304457127327e-05, - "loss": 1.4489, + "loss": 1.4491, "step": 13345 }, { "epoch": 0.9070525886669385, - "grad_norm": 0.7113944292068481, + "grad_norm": 0.7077226042747498, "learning_rate": 4.4330921320831635e-05, - "loss": 1.2941, + "loss": 1.2947, "step": 13350 }, { "epoch": 0.9073923087376002, - "grad_norm": 0.7318709492683411, + "grad_norm": 0.7274554967880249, "learning_rate": 4.432879807039e-05, - "loss": 1.3876, + "loss": 1.3877, "step": 13355 }, { "epoch": 0.907732028808262, - "grad_norm": 0.7524285912513733, + "grad_norm": 0.7544350028038025, "learning_rate": 4.432667481994836e-05, - "loss": 1.4184, + "loss": 1.4183, "step": 13360 }, { "epoch": 0.9080717488789237, - "grad_norm": 0.6286381483078003, + "grad_norm": 0.6292847394943237, "learning_rate": 4.432455156950673e-05, - "loss": 1.3671, + "loss": 1.3677, "step": 13365 }, { "epoch": 0.9084114689495856, - "grad_norm": 0.6272299289703369, + "grad_norm": 0.628112256526947, "learning_rate": 4.432242831906509e-05, - "loss": 1.3946, + "loss": 1.3944, "step": 13370 }, { "epoch": 0.9087511890202473, - "grad_norm": 0.5813237428665161, + "grad_norm": 0.5800889730453491, "learning_rate": 4.4320305068623455e-05, - "loss": 1.2941, + "loss": 1.2942, "step": 13375 }, { "epoch": 0.9090909090909091, - "grad_norm": 0.6842602491378784, + "grad_norm": 0.6845670342445374, "learning_rate": 4.431818181818182e-05, - "loss": 1.4783, + "loss": 1.4775, "step": 13380 }, { "epoch": 0.9094306291615709, - "grad_norm": 0.6329503655433655, + "grad_norm": 0.6318212747573853, "learning_rate": 4.431605856774018e-05, - "loss": 1.374, + "loss": 1.3737, "step": 13385 }, { "epoch": 0.9097703492322327, - "grad_norm": 0.7969860434532166, + "grad_norm": 0.7932705879211426, "learning_rate": 4.431393531729855e-05, - "loss": 1.2046, + "loss": 1.2042, "step": 13390 }, { "epoch": 0.9101100693028944, - "grad_norm": 0.7060311436653137, + "grad_norm": 0.7063025832176208, "learning_rate": 4.431181206685691e-05, - "loss": 1.3383, + "loss": 1.3391, "step": 13395 }, { "epoch": 0.9104497893735561, - "grad_norm": 0.5485876798629761, + "grad_norm": 0.5513021349906921, "learning_rate": 4.4309688816415275e-05, - "loss": 1.2818, + "loss": 1.2827, "step": 13400 }, { "epoch": 0.910789509444218, - "grad_norm": 0.6860634088516235, + "grad_norm": 0.6828407645225525, "learning_rate": 4.430756556597364e-05, - "loss": 1.4044, + "loss": 1.4047, "step": 13405 }, { "epoch": 0.9111292295148797, - "grad_norm": 0.6536875367164612, + "grad_norm": 0.6503252983093262, "learning_rate": 4.4305442315532e-05, - "loss": 1.3108, + "loss": 1.3101, "step": 13410 }, { "epoch": 0.9114689495855415, - "grad_norm": 0.689294159412384, + "grad_norm": 0.691767156124115, "learning_rate": 4.430331906509037e-05, - "loss": 1.39, + "loss": 1.3903, "step": 13415 }, { "epoch": 0.9118086696562033, - "grad_norm": 0.6730312705039978, + "grad_norm": 0.6738830208778381, "learning_rate": 4.430119581464873e-05, - "loss": 1.3091, + "loss": 1.3093, "step": 13420 }, { "epoch": 0.9121483897268651, - "grad_norm": 0.6333032250404358, + "grad_norm": 0.634270191192627, "learning_rate": 4.4299072564207095e-05, - "loss": 1.4008, + "loss": 1.401, "step": 13425 }, { "epoch": 0.9124881097975268, - "grad_norm": 0.7166286706924438, + "grad_norm": 0.7178604602813721, "learning_rate": 4.429694931376546e-05, - "loss": 1.222, + "loss": 1.2215, "step": 13430 }, { "epoch": 0.9128278298681887, - "grad_norm": 0.7296689748764038, + "grad_norm": 0.7218911051750183, "learning_rate": 4.429482606332382e-05, - "loss": 1.3895, + "loss": 1.3892, "step": 13435 }, { "epoch": 0.9131675499388504, - "grad_norm": 0.6304849982261658, + "grad_norm": 0.6336371302604675, "learning_rate": 4.429270281288219e-05, - "loss": 1.3599, + "loss": 1.3596, "step": 13440 }, { "epoch": 0.9135072700095122, - "grad_norm": 0.6465513706207275, + "grad_norm": 0.6472822427749634, "learning_rate": 4.429057956244055e-05, - "loss": 1.315, + "loss": 1.3146, "step": 13445 }, { "epoch": 0.9138469900801739, - "grad_norm": 0.7055228352546692, + "grad_norm": 0.7048435807228088, "learning_rate": 4.4288456311998915e-05, - "loss": 1.374, + "loss": 1.3742, "step": 13450 }, { "epoch": 0.9141867101508357, - "grad_norm": 0.6719948053359985, + "grad_norm": 0.6702203750610352, "learning_rate": 4.428633306155728e-05, - "loss": 1.4075, + "loss": 1.408, "step": 13455 }, { "epoch": 0.9145264302214975, - "grad_norm": 0.7837120294570923, + "grad_norm": 0.7639760375022888, "learning_rate": 4.428420981111564e-05, - "loss": 1.3971, + "loss": 1.3972, "step": 13460 }, { "epoch": 0.9148661502921592, - "grad_norm": 0.6214168667793274, + "grad_norm": 0.6223245859146118, "learning_rate": 4.428208656067401e-05, "loss": 1.3746, "step": 13465 }, { "epoch": 0.9152058703628211, - "grad_norm": 0.6817909479141235, + "grad_norm": 0.6772913932800293, "learning_rate": 4.427996331023237e-05, - "loss": 1.2682, + "loss": 1.2679, "step": 13470 }, { "epoch": 0.9155455904334828, - "grad_norm": 0.6014137864112854, + "grad_norm": 0.6019927859306335, "learning_rate": 4.4277840059790735e-05, - "loss": 1.3443, + "loss": 1.3442, "step": 13475 }, { "epoch": 0.9158853105041446, - "grad_norm": 0.7761210799217224, + "grad_norm": 0.7702804803848267, "learning_rate": 4.42757168093491e-05, - "loss": 1.4239, + "loss": 1.4242, "step": 13480 }, { "epoch": 0.9162250305748063, - "grad_norm": 0.7059696316719055, + "grad_norm": 0.7069201469421387, "learning_rate": 4.427359355890746e-05, - "loss": 1.3228, + "loss": 1.3229, "step": 13485 }, { "epoch": 0.9165647506454682, - "grad_norm": 0.7353407740592957, + "grad_norm": 0.7335325479507446, "learning_rate": 4.427147030846583e-05, "loss": 1.3071, "step": 13490 }, { "epoch": 0.9169044707161299, - "grad_norm": 0.6706827878952026, + "grad_norm": 0.6628076434135437, "learning_rate": 4.426934705802419e-05, - "loss": 1.3291, + "loss": 1.3293, "step": 13495 }, { "epoch": 0.9172441907867916, - "grad_norm": 0.725422739982605, + "grad_norm": 0.7261803150177002, "learning_rate": 4.4267223807582555e-05, - "loss": 1.3362, + "loss": 1.336, "step": 13500 }, { "epoch": 0.9175839108574535, - "grad_norm": 0.6997534036636353, + "grad_norm": 0.6965487599372864, "learning_rate": 4.426510055714092e-05, - "loss": 1.3418, + "loss": 1.3417, "step": 13505 }, { "epoch": 0.9179236309281152, - "grad_norm": 0.659503161907196, + "grad_norm": 0.6563084125518799, "learning_rate": 4.426297730669928e-05, "loss": 1.4357, "step": 13510 }, { "epoch": 0.918263350998777, - "grad_norm": 0.6595714092254639, + "grad_norm": 0.6626461148262024, "learning_rate": 4.426085405625765e-05, - "loss": 1.3532, + "loss": 1.3525, "step": 13515 }, { "epoch": 0.9186030710694388, - "grad_norm": 0.6540241837501526, + "grad_norm": 0.673109769821167, "learning_rate": 4.425873080581601e-05, - "loss": 1.2698, + "loss": 1.2688, "step": 13520 }, { "epoch": 0.9189427911401006, - "grad_norm": 0.6801666021347046, + "grad_norm": 0.6807073950767517, "learning_rate": 4.4256607555374375e-05, - "loss": 1.3823, + "loss": 1.383, "step": 13525 }, { "epoch": 0.9192825112107623, - "grad_norm": 0.7063329815864563, + "grad_norm": 0.7095193266868591, "learning_rate": 4.425448430493274e-05, - "loss": 1.269, + "loss": 1.2692, "step": 13530 }, { "epoch": 0.9196222312814241, - "grad_norm": 0.6541804671287537, + "grad_norm": 0.6564123034477234, "learning_rate": 4.42523610544911e-05, - "loss": 1.4347, + "loss": 1.4346, "step": 13535 }, { "epoch": 0.9199619513520859, - "grad_norm": 0.7060062885284424, + "grad_norm": 0.7083345651626587, "learning_rate": 4.425023780404947e-05, - "loss": 1.4064, + "loss": 1.4059, "step": 13540 }, { "epoch": 0.9203016714227477, - "grad_norm": 0.7766232490539551, + "grad_norm": 0.7763247489929199, "learning_rate": 4.4248114553607825e-05, - "loss": 1.4848, + "loss": 1.4849, "step": 13545 }, { "epoch": 0.9206413914934094, - "grad_norm": 0.6796672940254211, + "grad_norm": 0.6772543787956238, "learning_rate": 4.4245991303166195e-05, - "loss": 1.4842, + "loss": 1.4846, "step": 13550 }, { "epoch": 0.9209811115640713, - "grad_norm": 0.6018598675727844, + "grad_norm": 0.6034160852432251, "learning_rate": 4.424386805272456e-05, - "loss": 1.3176, + "loss": 1.3184, "step": 13555 }, { "epoch": 0.921320831634733, - "grad_norm": 0.6565004587173462, + "grad_norm": 0.6573268175125122, "learning_rate": 4.4241744802282917e-05, "loss": 1.3705, "step": 13560 }, { "epoch": 0.9216605517053947, - "grad_norm": 0.6929068565368652, + "grad_norm": 0.693510115146637, "learning_rate": 4.423962155184129e-05, - "loss": 1.3362, + "loss": 1.3358, "step": 13565 }, { "epoch": 0.9220002717760565, - "grad_norm": 0.6933250427246094, + "grad_norm": 0.690349817276001, "learning_rate": 4.423749830139965e-05, "loss": 1.3811, "step": 13570 }, { "epoch": 0.9223399918467183, - "grad_norm": 0.7431418299674988, + "grad_norm": 0.7189193367958069, "learning_rate": 4.423537505095801e-05, - "loss": 1.3066, + "loss": 1.3059, "step": 13575 }, { "epoch": 0.9226797119173801, - "grad_norm": 0.686419665813446, + "grad_norm": 0.6866820454597473, "learning_rate": 4.423325180051638e-05, "loss": 1.3276, "step": 13580 }, { "epoch": 0.9230194319880418, - "grad_norm": 0.6179741024971008, + "grad_norm": 0.6184611916542053, "learning_rate": 4.423112855007474e-05, - "loss": 1.3087, + "loss": 1.3085, "step": 13585 }, { "epoch": 0.9233591520587037, - "grad_norm": 0.6031321287155151, + "grad_norm": 0.6048295497894287, "learning_rate": 4.42290052996331e-05, - "loss": 1.3237, + "loss": 1.324, "step": 13590 }, { "epoch": 0.9236988721293654, - "grad_norm": 0.7255411744117737, + "grad_norm": 0.726370632648468, "learning_rate": 4.422688204919147e-05, - "loss": 1.3777, + "loss": 1.3776, "step": 13595 }, { "epoch": 0.9240385922000272, - "grad_norm": 0.6925275325775146, + "grad_norm": 0.6939946413040161, "learning_rate": 4.4224758798749835e-05, - "loss": 1.3086, + "loss": 1.3083, "step": 13600 }, { "epoch": 0.924378312270689, - "grad_norm": 0.7831964492797852, + "grad_norm": 0.7890504598617554, "learning_rate": 4.422263554830819e-05, - "loss": 1.4558, + "loss": 1.4553, "step": 13605 }, { "epoch": 0.9247180323413507, - "grad_norm": 0.6948872208595276, + "grad_norm": 0.6937683820724487, "learning_rate": 4.4220512297866563e-05, - "loss": 1.3506, + "loss": 1.3509, "step": 13610 }, { "epoch": 0.9250577524120125, - "grad_norm": 0.5872417092323303, + "grad_norm": 0.5902190208435059, "learning_rate": 4.421838904742493e-05, - "loss": 1.3226, + "loss": 1.3233, "step": 13615 }, { "epoch": 0.9253974724826742, - "grad_norm": 0.6647357940673828, + "grad_norm": 0.665254533290863, "learning_rate": 4.4216265796983285e-05, - "loss": 1.4053, + "loss": 1.4052, "step": 13620 }, { "epoch": 0.9257371925533361, - "grad_norm": 0.7161340713500977, + "grad_norm": 0.7176029086112976, "learning_rate": 4.4214142546541655e-05, - "loss": 1.3165, + "loss": 1.3161, "step": 13625 }, { "epoch": 0.9260769126239978, - "grad_norm": 0.6334004402160645, + "grad_norm": 0.6363087296485901, "learning_rate": 4.421201929610001e-05, - "loss": 1.2966, + "loss": 1.2964, "step": 13630 }, { "epoch": 0.9264166326946596, - "grad_norm": 0.6781123876571655, + "grad_norm": 0.6788815855979919, "learning_rate": 4.420989604565838e-05, - "loss": 1.3496, + "loss": 1.3493, "step": 13635 }, { "epoch": 0.9267563527653214, - "grad_norm": 0.7244052290916443, + "grad_norm": 0.7237371802330017, "learning_rate": 4.420777279521675e-05, - "loss": 1.3406, + "loss": 1.3404, "step": 13640 }, { "epoch": 0.9270960728359832, - "grad_norm": 0.5577734708786011, + "grad_norm": 0.558638870716095, "learning_rate": 4.4205649544775105e-05, - "loss": 1.2553, + "loss": 1.2547, "step": 13645 }, { "epoch": 0.9274357929066449, - "grad_norm": 0.6536522507667542, + "grad_norm": 0.6577743887901306, "learning_rate": 4.420352629433347e-05, - "loss": 1.3195, + "loss": 1.3197, "step": 13650 }, { "epoch": 0.9277755129773066, - "grad_norm": 0.7267488241195679, + "grad_norm": 0.7234688401222229, "learning_rate": 4.420140304389184e-05, - "loss": 1.3983, + "loss": 1.3982, "step": 13655 }, { "epoch": 0.9281152330479685, - "grad_norm": 0.7505548596382141, + "grad_norm": 0.749758243560791, "learning_rate": 4.41992797934502e-05, - "loss": 1.4297, + "loss": 1.4293, "step": 13660 }, { "epoch": 0.9284549531186302, - "grad_norm": 0.6807383894920349, + "grad_norm": 0.6882309317588806, "learning_rate": 4.419715654300856e-05, - "loss": 1.3618, + "loss": 1.362, "step": 13665 }, { "epoch": 0.928794673189292, - "grad_norm": 0.7637690901756287, + "grad_norm": 0.7843093276023865, "learning_rate": 4.419503329256693e-05, - "loss": 1.346, + "loss": 1.3454, "step": 13670 }, { "epoch": 0.9291343932599538, - "grad_norm": 0.6206974983215332, + "grad_norm": 0.6279749870300293, "learning_rate": 4.419291004212529e-05, - "loss": 1.3381, + "loss": 1.3377, "step": 13675 }, { "epoch": 0.9294741133306156, - "grad_norm": 0.7176368236541748, + "grad_norm": 0.7146983742713928, "learning_rate": 4.419078679168365e-05, - "loss": 1.3247, + "loss": 1.3242, "step": 13680 }, { "epoch": 0.9298138334012773, - "grad_norm": 0.6860157251358032, + "grad_norm": 0.6866699457168579, "learning_rate": 4.4188663541242023e-05, - "loss": 1.3047, + "loss": 1.3045, "step": 13685 }, { "epoch": 0.9301535534719392, - "grad_norm": 0.6906935572624207, + "grad_norm": 0.6906257271766663, "learning_rate": 4.418654029080038e-05, "loss": 1.3295, "step": 13690 }, { "epoch": 0.9304932735426009, - "grad_norm": 0.6833156943321228, + "grad_norm": 0.6818947792053223, "learning_rate": 4.4184417040358745e-05, - "loss": 1.3558, + "loss": 1.3561, "step": 13695 }, { "epoch": 0.9308329936132627, - "grad_norm": 0.7648639678955078, + "grad_norm": 0.7698661088943481, "learning_rate": 4.418229378991711e-05, - "loss": 1.3238, + "loss": 1.3243, "step": 13700 }, { "epoch": 0.9311727136839244, - "grad_norm": 0.7127102613449097, + "grad_norm": 0.7148460149765015, "learning_rate": 4.418017053947547e-05, - "loss": 1.2945, + "loss": 1.2942, "step": 13705 }, { "epoch": 0.9315124337545863, - "grad_norm": 0.7148470282554626, + "grad_norm": 0.7136948108673096, "learning_rate": 4.417804728903384e-05, - "loss": 1.3695, + "loss": 1.3699, "step": 13710 }, { "epoch": 0.931852153825248, - "grad_norm": 0.7143694162368774, + "grad_norm": 0.7156612873077393, "learning_rate": 4.41759240385922e-05, - "loss": 1.4847, + "loss": 1.4856, "step": 13715 }, { "epoch": 0.9321918738959097, - "grad_norm": 0.7163874506950378, + "grad_norm": 0.7176118493080139, "learning_rate": 4.4173800788150565e-05, - "loss": 1.3838, + "loss": 1.384, "step": 13720 }, { "epoch": 0.9325315939665716, - "grad_norm": 0.5652332305908203, + "grad_norm": 0.561378538608551, "learning_rate": 4.417167753770893e-05, - "loss": 1.3025, + "loss": 1.3026, "step": 13725 }, { "epoch": 0.9328713140372333, - "grad_norm": 0.7008733153343201, + "grad_norm": 0.7014034986495972, "learning_rate": 4.416955428726729e-05, - "loss": 1.3852, + "loss": 1.385, "step": 13730 }, { "epoch": 0.9332110341078951, - "grad_norm": 0.7026612162590027, + "grad_norm": 0.7061130404472351, "learning_rate": 4.416743103682566e-05, - "loss": 1.4156, + "loss": 1.4166, "step": 13735 }, { "epoch": 0.9335507541785568, - "grad_norm": 0.7290593981742859, + "grad_norm": 0.7243119478225708, "learning_rate": 4.416530778638402e-05, - "loss": 1.3394, + "loss": 1.3405, "step": 13740 }, { "epoch": 0.9338904742492187, - "grad_norm": 0.656147301197052, + "grad_norm": 0.6528512239456177, "learning_rate": 4.4163184535942385e-05, - "loss": 1.3438, + "loss": 1.3433, "step": 13745 }, { "epoch": 0.9342301943198804, - "grad_norm": 0.6537467837333679, + "grad_norm": 0.6529529094696045, "learning_rate": 4.416106128550075e-05, - "loss": 1.2681, + "loss": 1.2682, "step": 13750 }, { "epoch": 0.9345699143905422, - "grad_norm": 0.6275011301040649, + "grad_norm": 0.6290601491928101, "learning_rate": 4.415893803505911e-05, - "loss": 1.374, + "loss": 1.3735, "step": 13755 }, { "epoch": 0.934909634461204, - "grad_norm": 0.7148743271827698, + "grad_norm": 0.7196561098098755, "learning_rate": 4.415681478461748e-05, "loss": 1.3613, "step": 13760 }, { "epoch": 0.9352493545318658, - "grad_norm": 0.665732741355896, + "grad_norm": 0.6692147850990295, "learning_rate": 4.415469153417584e-05, - "loss": 1.3207, + "loss": 1.3201, "step": 13765 }, { "epoch": 0.9355890746025275, - "grad_norm": 0.7353973984718323, + "grad_norm": 0.7392304539680481, "learning_rate": 4.4152568283734205e-05, - "loss": 1.3649, + "loss": 1.3653, "step": 13770 }, { "epoch": 0.9359287946731893, - "grad_norm": 0.7345528602600098, + "grad_norm": 0.730678915977478, "learning_rate": 4.415044503329257e-05, - "loss": 1.3052, + "loss": 1.3056, "step": 13775 }, { "epoch": 0.9362685147438511, - "grad_norm": 0.7020397782325745, + "grad_norm": 0.7026821374893188, "learning_rate": 4.414832178285093e-05, - "loss": 1.36, + "loss": 1.3594, "step": 13780 }, { "epoch": 0.9366082348145128, - "grad_norm": 0.6661944389343262, + "grad_norm": 0.6653341054916382, "learning_rate": 4.41461985324093e-05, - "loss": 1.3425, + "loss": 1.3432, "step": 13785 }, { "epoch": 0.9369479548851746, - "grad_norm": 0.6353135704994202, + "grad_norm": 0.6343628764152527, "learning_rate": 4.414407528196766e-05, - "loss": 1.3545, + "loss": 1.3544, "step": 13790 }, { "epoch": 0.9372876749558364, - "grad_norm": 0.6998313665390015, + "grad_norm": 0.6989215612411499, "learning_rate": 4.4141952031526025e-05, - "loss": 1.4537, + "loss": 1.4536, "step": 13795 }, { "epoch": 0.9376273950264982, - "grad_norm": 0.6690735816955566, + "grad_norm": 0.6695895791053772, "learning_rate": 4.413982878108439e-05, - "loss": 1.3284, + "loss": 1.3282, "step": 13800 }, { "epoch": 0.9379671150971599, - "grad_norm": 0.7317988276481628, + "grad_norm": 0.7325136065483093, "learning_rate": 4.413770553064275e-05, - "loss": 1.3734, + "loss": 1.373, "step": 13805 }, { "epoch": 0.9383068351678218, - "grad_norm": 0.7345866560935974, + "grad_norm": 0.7343001365661621, "learning_rate": 4.413558228020112e-05, - "loss": 1.2194, + "loss": 1.2185, "step": 13810 }, { "epoch": 0.9386465552384835, - "grad_norm": 0.6236724257469177, + "grad_norm": 0.6228654980659485, "learning_rate": 4.413345902975948e-05, - "loss": 1.4252, + "loss": 1.425, "step": 13815 }, { "epoch": 0.9389862753091452, - "grad_norm": 0.8949745297431946, + "grad_norm": 0.9206582903862, "learning_rate": 4.4131335779317845e-05, - "loss": 1.3196, + "loss": 1.3202, "step": 13820 }, { "epoch": 0.939325995379807, - "grad_norm": 0.6839173436164856, + "grad_norm": 0.6861138939857483, "learning_rate": 4.412921252887621e-05, - "loss": 1.2955, + "loss": 1.2959, "step": 13825 }, { "epoch": 0.9396657154504688, - "grad_norm": 0.7415878176689148, + "grad_norm": 0.7384356260299683, "learning_rate": 4.412708927843457e-05, - "loss": 1.331, + "loss": 1.3311, "step": 13830 }, { "epoch": 0.9400054355211306, - "grad_norm": 0.7342458963394165, + "grad_norm": 0.73335862159729, "learning_rate": 4.412496602799294e-05, - "loss": 1.4263, + "loss": 1.4264, "step": 13835 }, { "epoch": 0.9403451555917923, - "grad_norm": 0.6380994319915771, + "grad_norm": 0.6403820514678955, "learning_rate": 4.41228427775513e-05, - "loss": 1.3935, + "loss": 1.3932, "step": 13840 }, { "epoch": 0.9406848756624542, - "grad_norm": 0.7457711100578308, + "grad_norm": 0.7466438412666321, "learning_rate": 4.4120719527109665e-05, - "loss": 1.3192, + "loss": 1.3197, "step": 13845 }, { "epoch": 0.9410245957331159, - "grad_norm": 0.7554078102111816, + "grad_norm": 0.7519053816795349, "learning_rate": 4.411859627666803e-05, "loss": 1.3497, "step": 13850 }, { "epoch": 0.9413643158037777, - "grad_norm": 0.6834602952003479, + "grad_norm": 0.6925027966499329, "learning_rate": 4.411647302622639e-05, - "loss": 1.5496, + "loss": 1.5491, "step": 13855 }, { "epoch": 0.9417040358744395, - "grad_norm": 0.6930656433105469, + "grad_norm": 0.6961853504180908, "learning_rate": 4.411434977578476e-05, "loss": 1.5183, "step": 13860 }, { "epoch": 0.9420437559451013, - "grad_norm": 0.6998500227928162, + "grad_norm": 0.6967265009880066, "learning_rate": 4.411222652534312e-05, - "loss": 1.3476, + "loss": 1.3481, "step": 13865 }, { "epoch": 0.942383476015763, - "grad_norm": 0.6948878765106201, + "grad_norm": 0.7001057267189026, "learning_rate": 4.4110103274901485e-05, - "loss": 1.3836, + "loss": 1.3843, "step": 13870 }, { "epoch": 0.9427231960864247, - "grad_norm": 0.8128736615180969, + "grad_norm": 0.8108313679695129, "learning_rate": 4.410798002445985e-05, - "loss": 1.3481, + "loss": 1.3484, "step": 13875 }, { "epoch": 0.9430629161570866, - "grad_norm": 0.6872991323471069, + "grad_norm": 0.6868679523468018, "learning_rate": 4.410585677401821e-05, - "loss": 1.4402, + "loss": 1.4404, "step": 13880 }, { "epoch": 0.9434026362277483, - "grad_norm": 0.6555947661399841, + "grad_norm": 0.6461057066917419, "learning_rate": 4.410373352357658e-05, - "loss": 1.2432, + "loss": 1.243, "step": 13885 }, { "epoch": 0.9437423562984101, - "grad_norm": 0.6612246036529541, + "grad_norm": 0.658891499042511, "learning_rate": 4.410161027313494e-05, - "loss": 1.448, + "loss": 1.4487, "step": 13890 }, { "epoch": 0.9440820763690719, - "grad_norm": 0.673306405544281, + "grad_norm": 0.6795490384101868, "learning_rate": 4.4099487022693305e-05, - "loss": 1.3303, + "loss": 1.33, "step": 13895 }, { "epoch": 0.9444217964397337, - "grad_norm": 0.6873499155044556, + "grad_norm": 0.6852904558181763, "learning_rate": 4.409736377225166e-05, - "loss": 1.2458, + "loss": 1.2463, "step": 13900 }, { "epoch": 0.9447615165103954, - "grad_norm": 0.7184569239616394, + "grad_norm": 0.7164695262908936, "learning_rate": 4.409524052181003e-05, - "loss": 1.3367, + "loss": 1.3369, "step": 13905 }, { "epoch": 0.9451012365810572, - "grad_norm": 0.7577341198921204, + "grad_norm": 0.7611421346664429, "learning_rate": 4.40931172713684e-05, - "loss": 1.4501, + "loss": 1.4498, "step": 13910 }, { "epoch": 0.945440956651719, - "grad_norm": 0.7062777280807495, + "grad_norm": 0.7066428661346436, "learning_rate": 4.4090994020926754e-05, - "loss": 1.4258, + "loss": 1.4262, "step": 13915 }, { "epoch": 0.9457806767223808, - "grad_norm": 0.638261079788208, + "grad_norm": 0.6390190720558167, "learning_rate": 4.4088870770485125e-05, - "loss": 1.311, + "loss": 1.3111, "step": 13920 }, { "epoch": 0.9461203967930425, - "grad_norm": 0.5948472023010254, + "grad_norm": 0.595580518245697, "learning_rate": 4.408674752004349e-05, "loss": 1.3772, "step": 13925 }, { "epoch": 0.9464601168637043, - "grad_norm": 0.643025279045105, + "grad_norm": 0.6421446800231934, "learning_rate": 4.4084624269601846e-05, - "loss": 1.2932, + "loss": 1.2935, "step": 13930 }, { "epoch": 0.9467998369343661, - "grad_norm": 0.6830620169639587, + "grad_norm": 0.6884860992431641, "learning_rate": 4.408250101916022e-05, - "loss": 1.3575, + "loss": 1.3578, "step": 13935 }, { "epoch": 0.9471395570050278, - "grad_norm": 0.6490104794502258, + "grad_norm": 0.6483439803123474, "learning_rate": 4.408037776871858e-05, - "loss": 1.4628, + "loss": 1.463, "step": 13940 }, { "epoch": 0.9474792770756897, - "grad_norm": 0.6798790693283081, + "grad_norm": 0.6801953315734863, "learning_rate": 4.407825451827694e-05, - "loss": 1.3162, + "loss": 1.3165, "step": 13945 }, { "epoch": 0.9478189971463514, - "grad_norm": 0.6297010779380798, + "grad_norm": 0.6297950744628906, "learning_rate": 4.407613126783531e-05, - "loss": 1.3067, + "loss": 1.3074, "step": 13950 }, { "epoch": 0.9481587172170132, - "grad_norm": 0.6597910523414612, + "grad_norm": 0.6601901650428772, "learning_rate": 4.407400801739367e-05, - "loss": 1.3153, + "loss": 1.3149, "step": 13955 }, { "epoch": 0.9484984372876749, - "grad_norm": 0.6417344212532043, + "grad_norm": 0.641334056854248, "learning_rate": 4.407188476695203e-05, - "loss": 1.3862, + "loss": 1.3863, "step": 13960 }, { "epoch": 0.9488381573583368, - "grad_norm": 0.7095355987548828, + "grad_norm": 0.7074373364448547, "learning_rate": 4.40697615165104e-05, - "loss": 1.3388, + "loss": 1.3387, "step": 13965 }, { "epoch": 0.9491778774289985, - "grad_norm": 0.6944183707237244, + "grad_norm": 0.6944468021392822, "learning_rate": 4.406763826606876e-05, - "loss": 1.3299, + "loss": 1.33, "step": 13970 }, { "epoch": 0.9495175974996602, - "grad_norm": 0.6939918398857117, + "grad_norm": 0.6922484636306763, "learning_rate": 4.406551501562712e-05, - "loss": 1.3929, + "loss": 1.3933, "step": 13975 }, { "epoch": 0.9498573175703221, - "grad_norm": 0.7122329473495483, + "grad_norm": 0.7108091115951538, "learning_rate": 4.406339176518549e-05, - "loss": 1.5842, + "loss": 1.5845, "step": 13980 }, { "epoch": 0.9501970376409838, - "grad_norm": 0.5930275321006775, + "grad_norm": 0.5946042537689209, "learning_rate": 4.406126851474385e-05, - "loss": 1.2767, + "loss": 1.2766, "step": 13985 }, { "epoch": 0.9505367577116456, - "grad_norm": 0.6418907642364502, + "grad_norm": 0.643783688545227, "learning_rate": 4.4059145264302214e-05, - "loss": 1.241, + "loss": 1.2415, "step": 13990 }, { "epoch": 0.9508764777823073, - "grad_norm": 0.6851585507392883, + "grad_norm": 0.6817734837532043, "learning_rate": 4.4057022013860585e-05, - "loss": 1.2957, + "loss": 1.2956, "step": 13995 }, { "epoch": 0.9512161978529692, - "grad_norm": 0.6519604325294495, + "grad_norm": 0.6520854830741882, "learning_rate": 4.405489876341894e-05, - "loss": 1.3293, + "loss": 1.3299, "step": 14000 }, { "epoch": 0.9515559179236309, - "grad_norm": 0.6667711734771729, + "grad_norm": 0.6681818962097168, "learning_rate": 4.4052775512977306e-05, "loss": 1.3517, "step": 14005 }, { "epoch": 0.9518956379942927, - "grad_norm": 0.7155119180679321, + "grad_norm": 0.7166634202003479, "learning_rate": 4.405065226253568e-05, - "loss": 1.2733, + "loss": 1.2732, "step": 14010 }, { "epoch": 0.9522353580649545, - "grad_norm": 0.6329841017723083, + "grad_norm": 0.6355793476104736, "learning_rate": 4.4048529012094034e-05, - "loss": 1.3746, + "loss": 1.375, "step": 14015 }, { "epoch": 0.9525750781356163, - "grad_norm": 0.7616704106330872, + "grad_norm": 0.7557085156440735, "learning_rate": 4.40464057616524e-05, - "loss": 1.3645, + "loss": 1.3643, "step": 14020 }, { "epoch": 0.952914798206278, - "grad_norm": 0.6776059865951538, + "grad_norm": 0.6792165040969849, "learning_rate": 4.404428251121077e-05, - "loss": 1.4592, + "loss": 1.4588, "step": 14025 }, { "epoch": 0.9532545182769399, - "grad_norm": 0.613772451877594, + "grad_norm": 0.6113372445106506, "learning_rate": 4.4042159260769126e-05, - "loss": 1.2752, + "loss": 1.2748, "step": 14030 }, { "epoch": 0.9535942383476016, - "grad_norm": 0.5922441482543945, + "grad_norm": 0.5934929847717285, "learning_rate": 4.404003601032749e-05, - "loss": 1.2123, + "loss": 1.2129, "step": 14035 }, { "epoch": 0.9539339584182633, - "grad_norm": 0.6554942727088928, + "grad_norm": 0.6556838154792786, "learning_rate": 4.403791275988586e-05, - "loss": 1.3778, + "loss": 1.3779, "step": 14040 }, { "epoch": 0.9542736784889251, - "grad_norm": 0.7736828923225403, + "grad_norm": 0.7771568298339844, "learning_rate": 4.403578950944422e-05, "loss": 1.4264, "step": 14045 }, { "epoch": 0.9546133985595869, - "grad_norm": 0.6744146347045898, + "grad_norm": 0.6740558743476868, "learning_rate": 4.403366625900258e-05, - "loss": 1.3186, + "loss": 1.319, "step": 14050 }, { "epoch": 0.9549531186302487, - "grad_norm": 0.7017351984977722, + "grad_norm": 0.7029985785484314, "learning_rate": 4.4031543008560946e-05, "loss": 1.3742, "step": 14055 }, { "epoch": 0.9552928387009104, - "grad_norm": 0.6876327395439148, + "grad_norm": 0.6876945495605469, "learning_rate": 4.402941975811931e-05, - "loss": 1.3365, + "loss": 1.3358, "step": 14060 }, { "epoch": 0.9556325587715723, - "grad_norm": 0.6205369234085083, + "grad_norm": 0.6214742064476013, "learning_rate": 4.4027296507677674e-05, - "loss": 1.3912, + "loss": 1.3922, "step": 14065 }, { "epoch": 0.955972278842234, - "grad_norm": 0.6234815120697021, + "grad_norm": 0.6230260729789734, "learning_rate": 4.402517325723604e-05, - "loss": 1.3267, + "loss": 1.3264, "step": 14070 }, { "epoch": 0.9563119989128958, - "grad_norm": 0.6214469075202942, + "grad_norm": 0.6212023496627808, "learning_rate": 4.40230500067944e-05, - "loss": 1.3449, + "loss": 1.345, "step": 14075 }, { "epoch": 0.9566517189835575, - "grad_norm": 0.622333824634552, + "grad_norm": 0.6221936345100403, "learning_rate": 4.4020926756352766e-05, - "loss": 1.3436, + "loss": 1.3431, "step": 14080 }, { "epoch": 0.9569914390542194, - "grad_norm": 0.7575642466545105, + "grad_norm": 0.7613353133201599, "learning_rate": 4.401880350591113e-05, - "loss": 1.3706, + "loss": 1.371, "step": 14085 }, { "epoch": 0.9573311591248811, - "grad_norm": 0.7207659482955933, + "grad_norm": 0.7232246994972229, "learning_rate": 4.4016680255469494e-05, - "loss": 1.293, + "loss": 1.2923, "step": 14090 }, { "epoch": 0.9576708791955428, - "grad_norm": 0.6630386114120483, + "grad_norm": 0.6622391939163208, "learning_rate": 4.401455700502786e-05, - "loss": 1.3204, + "loss": 1.3199, "step": 14095 }, { "epoch": 0.9580105992662047, - "grad_norm": 0.7721298933029175, + "grad_norm": 0.7743175029754639, "learning_rate": 4.401243375458622e-05, - "loss": 1.3292, + "loss": 1.3298, "step": 14100 }, { "epoch": 0.9583503193368664, - "grad_norm": 0.7689454555511475, + "grad_norm": 0.7502080202102661, "learning_rate": 4.4010310504144586e-05, - "loss": 1.3164, + "loss": 1.3166, "step": 14105 }, { "epoch": 0.9586900394075282, - "grad_norm": 0.7109953761100769, + "grad_norm": 0.708746075630188, "learning_rate": 4.400818725370295e-05, - "loss": 1.2628, + "loss": 1.2621, "step": 14110 }, { "epoch": 0.95902975947819, - "grad_norm": 0.7149671316146851, + "grad_norm": 0.716349720954895, "learning_rate": 4.4006064003261314e-05, - "loss": 1.3386, + "loss": 1.3387, "step": 14115 }, { "epoch": 0.9593694795488518, - "grad_norm": 0.7511143088340759, + "grad_norm": 0.7411032319068909, "learning_rate": 4.400394075281968e-05, - "loss": 1.4523, + "loss": 1.4518, "step": 14120 }, { "epoch": 0.9597091996195135, - "grad_norm": 0.70169997215271, + "grad_norm": 0.7053865790367126, "learning_rate": 4.400181750237804e-05, - "loss": 1.3615, + "loss": 1.3612, "step": 14125 }, { "epoch": 0.9600489196901753, - "grad_norm": 0.7001254558563232, + "grad_norm": 0.6988649964332581, "learning_rate": 4.3999694251936406e-05, - "loss": 1.38, + "loss": 1.3803, "step": 14130 }, { "epoch": 0.9603886397608371, - "grad_norm": 0.6793377995491028, + "grad_norm": 0.6788693070411682, "learning_rate": 4.399757100149477e-05, - "loss": 1.4265, + "loss": 1.4263, "step": 14135 }, { "epoch": 0.9607283598314988, - "grad_norm": 0.6415596008300781, + "grad_norm": 0.653241753578186, "learning_rate": 4.3995447751053134e-05, - "loss": 1.3525, + "loss": 1.353, "step": 14140 }, { "epoch": 0.9610680799021606, - "grad_norm": 0.6914222240447998, + "grad_norm": 0.6926454901695251, "learning_rate": 4.39933245006115e-05, - "loss": 1.2615, + "loss": 1.2613, "step": 14145 }, { "epoch": 0.9614077999728224, - "grad_norm": 0.6529057025909424, + "grad_norm": 0.652787446975708, "learning_rate": 4.399120125016986e-05, - "loss": 1.4715, + "loss": 1.4713, "step": 14150 }, { "epoch": 0.9617475200434842, - "grad_norm": 0.6536417603492737, + "grad_norm": 0.6552249789237976, "learning_rate": 4.3989077999728226e-05, "loss": 1.2823, "step": 14155 }, { "epoch": 0.9620872401141459, - "grad_norm": 0.733706533908844, + "grad_norm": 0.7365493774414062, "learning_rate": 4.398695474928659e-05, - "loss": 1.4046, + "loss": 1.4041, "step": 14160 }, { "epoch": 0.9624269601848077, - "grad_norm": 0.757579505443573, + "grad_norm": 0.7602401971817017, "learning_rate": 4.3984831498844954e-05, - "loss": 1.4021, + "loss": 1.402, "step": 14165 }, { "epoch": 0.9627666802554695, - "grad_norm": 0.7217411994934082, + "grad_norm": 0.7281230688095093, "learning_rate": 4.398270824840332e-05, - "loss": 1.3788, + "loss": 1.3784, "step": 14170 }, { "epoch": 0.9631064003261313, - "grad_norm": 0.6954963207244873, + "grad_norm": 0.6983329057693481, "learning_rate": 4.398058499796168e-05, - "loss": 1.3508, + "loss": 1.3514, "step": 14175 }, { "epoch": 0.963446120396793, - "grad_norm": 0.6648340225219727, + "grad_norm": 0.666387140750885, "learning_rate": 4.3978461747520046e-05, - "loss": 1.3537, + "loss": 1.3534, "step": 14180 }, { "epoch": 0.9637858404674549, - "grad_norm": 0.7050533294677734, + "grad_norm": 0.7032514214515686, "learning_rate": 4.397633849707841e-05, - "loss": 1.4402, + "loss": 1.4397, "step": 14185 }, { "epoch": 0.9641255605381166, - "grad_norm": 0.7376846671104431, + "grad_norm": 0.738956868648529, "learning_rate": 4.3974215246636774e-05, - "loss": 1.3408, + "loss": 1.3405, "step": 14190 }, { "epoch": 0.9644652806087783, - "grad_norm": 0.6715635657310486, + "grad_norm": 0.6714094877243042, "learning_rate": 4.397209199619514e-05, - "loss": 1.3493, + "loss": 1.3501, "step": 14195 }, { "epoch": 0.9648050006794402, - "grad_norm": 0.708532452583313, + "grad_norm": 0.7061524987220764, "learning_rate": 4.39699687457535e-05, - "loss": 1.3779, + "loss": 1.3778, "step": 14200 }, { "epoch": 0.9651447207501019, - "grad_norm": 0.6870357394218445, + "grad_norm": 0.6860175728797913, "learning_rate": 4.3967845495311866e-05, - "loss": 1.3674, + "loss": 1.3685, "step": 14205 }, { "epoch": 0.9654844408207637, - "grad_norm": 0.7117043137550354, + "grad_norm": 0.7069896459579468, "learning_rate": 4.396572224487023e-05, "loss": 1.3592, "step": 14210 }, { "epoch": 0.9658241608914254, - "grad_norm": 0.8228017091751099, + "grad_norm": 0.818370521068573, "learning_rate": 4.3963598994428594e-05, "loss": 1.3517, "step": 14215 }, { "epoch": 0.9661638809620873, - "grad_norm": 0.6799255609512329, + "grad_norm": 0.6804168820381165, "learning_rate": 4.396147574398696e-05, - "loss": 1.3451, + "loss": 1.3453, "step": 14220 }, { "epoch": 0.966503601032749, - "grad_norm": 0.7322139739990234, + "grad_norm": 0.735768735408783, "learning_rate": 4.395935249354532e-05, - "loss": 1.3108, + "loss": 1.3106, "step": 14225 }, { "epoch": 0.9668433211034108, - "grad_norm": 0.7069768309593201, + "grad_norm": 0.7039186358451843, "learning_rate": 4.3957229243103686e-05, - "loss": 1.3206, + "loss": 1.3212, "step": 14230 }, { "epoch": 0.9671830411740726, - "grad_norm": 0.6934696435928345, + "grad_norm": 0.6944984197616577, "learning_rate": 4.395510599266205e-05, - "loss": 1.4205, + "loss": 1.4208, "step": 14235 }, { "epoch": 0.9675227612447344, - "grad_norm": 0.6686298251152039, + "grad_norm": 0.6691943407058716, "learning_rate": 4.3952982742220414e-05, - "loss": 1.3191, + "loss": 1.3193, "step": 14240 }, { "epoch": 0.9678624813153961, - "grad_norm": 0.713447093963623, + "grad_norm": 0.7164919376373291, "learning_rate": 4.395085949177878e-05, - "loss": 1.3402, + "loss": 1.3401, "step": 14245 }, { "epoch": 0.9682022013860578, - "grad_norm": 0.660870373249054, + "grad_norm": 0.6611899733543396, "learning_rate": 4.394873624133714e-05, - "loss": 1.3791, + "loss": 1.3787, "step": 14250 }, { "epoch": 0.9685419214567197, - "grad_norm": 0.6390849947929382, + "grad_norm": 0.6390730142593384, "learning_rate": 4.39466129908955e-05, - "loss": 1.3189, + "loss": 1.3187, "step": 14255 }, { "epoch": 0.9688816415273814, - "grad_norm": 0.7503184080123901, + "grad_norm": 0.7476893663406372, "learning_rate": 4.394448974045387e-05, - "loss": 1.3094, + "loss": 1.3095, "step": 14260 }, { "epoch": 0.9692213615980432, - "grad_norm": 0.714127779006958, + "grad_norm": 0.7101805806159973, "learning_rate": 4.3942366490012234e-05, - "loss": 1.378, + "loss": 1.3776, "step": 14265 }, { "epoch": 0.969561081668705, - "grad_norm": 0.6611217856407166, + "grad_norm": 0.6656824946403503, "learning_rate": 4.394024323957059e-05, - "loss": 1.1974, + "loss": 1.1973, "step": 14270 }, { "epoch": 0.9699008017393668, - "grad_norm": 0.8122195601463318, + "grad_norm": 0.8128830194473267, "learning_rate": 4.393811998912896e-05, - "loss": 1.3743, + "loss": 1.3738, "step": 14275 }, { "epoch": 0.9702405218100285, - "grad_norm": 0.6840839982032776, + "grad_norm": 0.6823077201843262, "learning_rate": 4.3935996738687326e-05, - "loss": 1.2965, + "loss": 1.2968, "step": 14280 }, { "epoch": 0.9705802418806904, - "grad_norm": 0.5934416055679321, + "grad_norm": 0.5937351584434509, "learning_rate": 4.3933873488245684e-05, - "loss": 1.2991, + "loss": 1.2996, "step": 14285 }, { "epoch": 0.9709199619513521, - "grad_norm": 0.6726582646369934, + "grad_norm": 0.6720177531242371, "learning_rate": 4.3931750237804054e-05, - "loss": 1.3566, + "loss": 1.3561, "step": 14290 }, { "epoch": 0.9712596820220138, - "grad_norm": 0.6945735812187195, + "grad_norm": 0.7040796279907227, "learning_rate": 4.392962698736242e-05, - "loss": 1.316, + "loss": 1.317, "step": 14295 }, { "epoch": 0.9715994020926756, - "grad_norm": 0.676308810710907, + "grad_norm": 0.6776344776153564, "learning_rate": 4.3927503736920776e-05, - "loss": 1.3917, + "loss": 1.3919, "step": 14300 }, { "epoch": 0.9719391221633374, - "grad_norm": 0.6670910716056824, + "grad_norm": 0.6695975065231323, "learning_rate": 4.3925380486479146e-05, - "loss": 1.3191, + "loss": 1.3197, "step": 14305 }, { "epoch": 0.9722788422339992, - "grad_norm": 0.575186550617218, + "grad_norm": 0.5713580846786499, "learning_rate": 4.392325723603751e-05, - "loss": 1.3523, + "loss": 1.353, "step": 14310 }, { "epoch": 0.9726185623046609, - "grad_norm": 0.6964967250823975, + "grad_norm": 0.7000569105148315, "learning_rate": 4.392113398559587e-05, - "loss": 1.2912, + "loss": 1.2917, "step": 14315 }, { "epoch": 0.9729582823753228, - "grad_norm": 0.7684643268585205, + "grad_norm": 0.7647229433059692, "learning_rate": 4.391901073515424e-05, - "loss": 1.4089, + "loss": 1.4079, "step": 14320 }, { "epoch": 0.9732980024459845, - "grad_norm": 0.7359722852706909, + "grad_norm": 0.7383917570114136, "learning_rate": 4.3916887484712596e-05, - "loss": 1.3103, + "loss": 1.3105, "step": 14325 }, { "epoch": 0.9736377225166463, - "grad_norm": 0.7156456112861633, + "grad_norm": 0.7147855758666992, "learning_rate": 4.391476423427096e-05, - "loss": 1.4543, + "loss": 1.4544, "step": 14330 }, { "epoch": 0.973977442587308, - "grad_norm": 0.6346716284751892, + "grad_norm": 0.6359826326370239, "learning_rate": 4.391264098382933e-05, - "loss": 1.505, + "loss": 1.5057, "step": 14335 }, { "epoch": 0.9743171626579699, - "grad_norm": 0.6405880451202393, + "grad_norm": 0.6400545835494995, "learning_rate": 4.391051773338769e-05, "loss": 1.3251, "step": 14340 }, { "epoch": 0.9746568827286316, - "grad_norm": 0.5706714987754822, + "grad_norm": 0.5649635791778564, "learning_rate": 4.390839448294605e-05, - "loss": 1.3277, + "loss": 1.3275, "step": 14345 }, { "epoch": 0.9749966027992933, - "grad_norm": 0.7190221548080444, + "grad_norm": 0.7162689566612244, "learning_rate": 4.390627123250442e-05, - "loss": 1.3276, + "loss": 1.3286, "step": 14350 }, { "epoch": 0.9753363228699552, - "grad_norm": 0.6241934895515442, + "grad_norm": 0.6249674558639526, "learning_rate": 4.390414798206278e-05, - "loss": 1.3346, + "loss": 1.335, "step": 14355 }, { "epoch": 0.9756760429406169, - "grad_norm": 0.7271825075149536, + "grad_norm": 0.725691020488739, "learning_rate": 4.3902024731621144e-05, - "loss": 1.3684, + "loss": 1.3686, "step": 14360 }, { "epoch": 0.9760157630112787, - "grad_norm": 0.8547173738479614, + "grad_norm": 0.8618035316467285, "learning_rate": 4.3899901481179514e-05, - "loss": 1.4018, + "loss": 1.4013, "step": 14365 }, { "epoch": 0.9763554830819405, - "grad_norm": 0.7631275057792664, + "grad_norm": 0.7599149942398071, "learning_rate": 4.389777823073787e-05, - "loss": 1.3373, + "loss": 1.3365, "step": 14370 }, { "epoch": 0.9766952031526023, - "grad_norm": 0.7032694220542908, + "grad_norm": 0.702073872089386, "learning_rate": 4.3895654980296236e-05, - "loss": 1.3935, + "loss": 1.3934, "step": 14375 }, { "epoch": 0.977034923223264, - "grad_norm": 0.6929472088813782, + "grad_norm": 0.6936185956001282, "learning_rate": 4.3893531729854606e-05, - "loss": 1.3589, + "loss": 1.36, "step": 14380 }, { "epoch": 0.9773746432939258, - "grad_norm": 0.6832548975944519, + "grad_norm": 0.6806415319442749, "learning_rate": 4.3891408479412964e-05, - "loss": 1.4363, + "loss": 1.436, "step": 14385 }, { "epoch": 0.9777143633645876, - "grad_norm": 0.6320139765739441, + "grad_norm": 0.6335381269454956, "learning_rate": 4.388928522897133e-05, - "loss": 1.2939, + "loss": 1.2935, "step": 14390 }, { "epoch": 0.9780540834352494, - "grad_norm": 0.6729910969734192, + "grad_norm": 0.6737633943557739, "learning_rate": 4.388716197852969e-05, - "loss": 1.3276, + "loss": 1.3278, "step": 14395 }, { "epoch": 0.9783938035059111, - "grad_norm": 0.7248609066009521, + "grad_norm": 0.7315681576728821, "learning_rate": 4.3885038728088056e-05, - "loss": 1.3013, + "loss": 1.3011, "step": 14400 }, { "epoch": 0.978733523576573, - "grad_norm": 0.7062149047851562, + "grad_norm": 0.7089962959289551, "learning_rate": 4.388291547764642e-05, - "loss": 1.4143, + "loss": 1.4147, "step": 14405 }, { "epoch": 0.9790732436472347, - "grad_norm": 0.7126065492630005, + "grad_norm": 0.7217907905578613, "learning_rate": 4.3880792227204784e-05, - "loss": 1.3011, + "loss": 1.3013, "step": 14410 }, { "epoch": 0.9794129637178964, - "grad_norm": 0.6709874272346497, + "grad_norm": 0.6686938405036926, "learning_rate": 4.387866897676315e-05, - "loss": 1.3646, + "loss": 1.3644, "step": 14415 }, { "epoch": 0.9797526837885582, - "grad_norm": 0.7767813205718994, + "grad_norm": 0.7761024236679077, "learning_rate": 4.387654572632151e-05, - "loss": 1.2983, + "loss": 1.2987, "step": 14420 }, { "epoch": 0.98009240385922, - "grad_norm": 0.7139157056808472, + "grad_norm": 0.7186993360519409, "learning_rate": 4.3874422475879876e-05, - "loss": 1.4078, + "loss": 1.4072, "step": 14425 }, { "epoch": 0.9804321239298818, - "grad_norm": 0.6205175518989563, + "grad_norm": 0.6272226572036743, "learning_rate": 4.387229922543824e-05, - "loss": 1.4646, + "loss": 1.465, "step": 14430 }, { "epoch": 0.9807718440005435, - "grad_norm": 0.7783640623092651, + "grad_norm": 0.7693725824356079, "learning_rate": 4.3870175974996604e-05, "loss": 1.3237, "step": 14435 }, { "epoch": 0.9811115640712054, - "grad_norm": 0.6792538166046143, + "grad_norm": 0.6802535653114319, "learning_rate": 4.386805272455497e-05, - "loss": 1.4417, + "loss": 1.4421, "step": 14440 }, { "epoch": 0.9814512841418671, - "grad_norm": 0.6064441204071045, + "grad_norm": 0.6068971753120422, "learning_rate": 4.386592947411333e-05, - "loss": 1.3311, + "loss": 1.3308, "step": 14445 }, { "epoch": 0.9817910042125289, - "grad_norm": 0.7223596572875977, + "grad_norm": 0.7233188152313232, "learning_rate": 4.3863806223671696e-05, - "loss": 1.3375, + "loss": 1.3373, "step": 14450 }, { "epoch": 0.9821307242831907, - "grad_norm": 0.6360611319541931, + "grad_norm": 0.6350387930870056, "learning_rate": 4.386168297323006e-05, - "loss": 1.2944, + "loss": 1.2942, "step": 14455 }, { "epoch": 0.9824704443538524, - "grad_norm": 0.6797980666160583, + "grad_norm": 0.685522735118866, "learning_rate": 4.3859559722788424e-05, - "loss": 1.3256, + "loss": 1.3255, "step": 14460 }, { "epoch": 0.9828101644245142, - "grad_norm": 0.7019459009170532, + "grad_norm": 0.6991590857505798, "learning_rate": 4.385743647234679e-05, - "loss": 1.3465, + "loss": 1.3467, "step": 14465 }, { "epoch": 0.9831498844951759, - "grad_norm": 0.6758162379264832, + "grad_norm": 0.6767458915710449, "learning_rate": 4.385531322190515e-05, - "loss": 1.2201, + "loss": 1.22, "step": 14470 }, { "epoch": 0.9834896045658378, - "grad_norm": 0.722413957118988, + "grad_norm": 0.7205209136009216, "learning_rate": 4.3853189971463516e-05, - "loss": 1.3084, + "loss": 1.3083, "step": 14475 }, { "epoch": 0.9838293246364995, - "grad_norm": 0.6451125144958496, + "grad_norm": 0.6451387405395508, "learning_rate": 4.385106672102188e-05, - "loss": 1.3079, + "loss": 1.3078, "step": 14480 }, { "epoch": 0.9841690447071613, - "grad_norm": 0.6718553900718689, + "grad_norm": 0.6701124310493469, "learning_rate": 4.3848943470580244e-05, - "loss": 1.4237, + "loss": 1.4242, "step": 14485 }, { "epoch": 0.9845087647778231, - "grad_norm": 0.623634397983551, + "grad_norm": 0.6223036050796509, "learning_rate": 4.384682022013861e-05, - "loss": 1.3196, + "loss": 1.32, "step": 14490 }, { "epoch": 0.9848484848484849, - "grad_norm": 0.7772073149681091, + "grad_norm": 0.7742608189582825, "learning_rate": 4.384469696969697e-05, - "loss": 1.4259, + "loss": 1.4257, "step": 14495 }, { "epoch": 0.9851882049191466, - "grad_norm": 0.7545345425605774, + "grad_norm": 0.7558873891830444, "learning_rate": 4.3842573719255336e-05, "loss": 1.3736, "step": 14500 }, { "epoch": 0.9855279249898083, - "grad_norm": 0.6576807498931885, + "grad_norm": 0.6581639647483826, "learning_rate": 4.38404504688137e-05, - "loss": 1.3358, + "loss": 1.3364, "step": 14505 }, { "epoch": 0.9858676450604702, - "grad_norm": 0.7687963247299194, + "grad_norm": 0.7694488167762756, "learning_rate": 4.3838327218372064e-05, - "loss": 1.3047, + "loss": 1.3048, "step": 14510 }, { "epoch": 0.9862073651311319, - "grad_norm": 0.6813951134681702, + "grad_norm": 0.679908275604248, "learning_rate": 4.383620396793043e-05, - "loss": 1.3818, + "loss": 1.3816, "step": 14515 }, { "epoch": 0.9865470852017937, - "grad_norm": 0.6597345471382141, + "grad_norm": 0.6565776467323303, "learning_rate": 4.383408071748879e-05, - "loss": 1.3778, + "loss": 1.3777, "step": 14520 }, { "epoch": 0.9868868052724555, - "grad_norm": 0.6556777954101562, + "grad_norm": 0.6553031802177429, "learning_rate": 4.3831957467047156e-05, - "loss": 1.3506, + "loss": 1.3508, "step": 14525 }, { "epoch": 0.9872265253431173, - "grad_norm": 0.6998063921928406, + "grad_norm": 0.7052482962608337, "learning_rate": 4.382983421660552e-05, - "loss": 1.3793, + "loss": 1.3798, "step": 14530 }, { "epoch": 0.987566245413779, - "grad_norm": 0.7922748327255249, + "grad_norm": 0.7921281456947327, "learning_rate": 4.3827710966163884e-05, - "loss": 1.3357, + "loss": 1.3353, "step": 14535 }, { "epoch": 0.9879059654844409, - "grad_norm": 0.6029360294342041, + "grad_norm": 0.6047056913375854, "learning_rate": 4.382558771572225e-05, - "loss": 1.3205, + "loss": 1.3207, "step": 14540 }, { "epoch": 0.9882456855551026, - "grad_norm": 0.6841976642608643, + "grad_norm": 0.684962272644043, "learning_rate": 4.382346446528061e-05, "loss": 1.3609, "step": 14545 }, { "epoch": 0.9885854056257644, - "grad_norm": 0.729258120059967, + "grad_norm": 0.7269608974456787, "learning_rate": 4.3821341214838976e-05, - "loss": 1.4015, + "loss": 1.4007, "step": 14550 }, { "epoch": 0.9889251256964261, - "grad_norm": 0.6758211851119995, + "grad_norm": 0.671068012714386, "learning_rate": 4.381921796439734e-05, - "loss": 1.3338, + "loss": 1.334, "step": 14555 }, { "epoch": 0.989264845767088, - "grad_norm": 0.6893669366836548, + "grad_norm": 0.6994994878768921, "learning_rate": 4.3817094713955704e-05, - "loss": 1.3181, + "loss": 1.3183, "step": 14560 }, { "epoch": 0.9896045658377497, - "grad_norm": 0.6467033624649048, + "grad_norm": 0.6453031301498413, "learning_rate": 4.381497146351407e-05, - "loss": 1.3362, + "loss": 1.3357, "step": 14565 }, { "epoch": 0.9899442859084114, - "grad_norm": 0.6745814681053162, + "grad_norm": 0.674057126045227, "learning_rate": 4.381284821307243e-05, - "loss": 1.3821, + "loss": 1.382, "step": 14570 }, { "epoch": 0.9902840059790733, - "grad_norm": 0.6461717486381531, + "grad_norm": 0.6448425650596619, "learning_rate": 4.3810724962630796e-05, - "loss": 1.339, + "loss": 1.3391, "step": 14575 }, { "epoch": 0.990623726049735, - "grad_norm": 0.7416273951530457, + "grad_norm": 0.7417392134666443, "learning_rate": 4.380860171218916e-05, "loss": 1.3768, "step": 14580 }, { "epoch": 0.9909634461203968, - "grad_norm": 0.6382895708084106, + "grad_norm": 0.6393870711326599, "learning_rate": 4.3806478461747524e-05, - "loss": 1.4391, + "loss": 1.439, "step": 14585 }, { "epoch": 0.9913031661910585, - "grad_norm": 0.6742331981658936, + "grad_norm": 0.6748619079589844, "learning_rate": 4.380435521130589e-05, - "loss": 1.3704, + "loss": 1.371, "step": 14590 }, { "epoch": 0.9916428862617204, - "grad_norm": 0.7783130407333374, + "grad_norm": 0.7761855721473694, "learning_rate": 4.3802231960864245e-05, - "loss": 1.3181, + "loss": 1.3185, "step": 14595 }, { "epoch": 0.9919826063323821, - "grad_norm": 0.766089916229248, + "grad_norm": 0.767564058303833, "learning_rate": 4.3800108710422616e-05, - "loss": 1.3837, + "loss": 1.3829, "step": 14600 }, { "epoch": 0.9923223264030439, - "grad_norm": 0.6705353856086731, + "grad_norm": 0.6691464185714722, "learning_rate": 4.379798545998098e-05, - "loss": 1.3804, + "loss": 1.3806, "step": 14605 }, { "epoch": 0.9926620464737057, - "grad_norm": 0.7190149426460266, + "grad_norm": 0.7189333438873291, "learning_rate": 4.379586220953934e-05, - "loss": 1.3942, + "loss": 1.3948, "step": 14610 }, { "epoch": 0.9930017665443674, - "grad_norm": 0.6647737622261047, + "grad_norm": 0.6627060770988464, "learning_rate": 4.379373895909771e-05, - "loss": 1.3856, + "loss": 1.3854, "step": 14615 }, { "epoch": 0.9933414866150292, - "grad_norm": 0.6353852152824402, + "grad_norm": 0.6358307600021362, "learning_rate": 4.379161570865607e-05, - "loss": 1.3071, + "loss": 1.3068, "step": 14620 }, { "epoch": 0.993681206685691, - "grad_norm": 0.6535605192184448, + "grad_norm": 0.6550939679145813, "learning_rate": 4.378949245821443e-05, "loss": 1.3018, "step": 14625 }, { "epoch": 0.9940209267563528, - "grad_norm": 0.6865241527557373, + "grad_norm": 0.6847389936447144, "learning_rate": 4.37873692077728e-05, - "loss": 1.4223, + "loss": 1.4216, "step": 14630 }, { "epoch": 0.9943606468270145, - "grad_norm": 0.7610270977020264, + "grad_norm": 0.7671051025390625, "learning_rate": 4.3785245957331164e-05, - "loss": 1.3702, + "loss": 1.37, "step": 14635 }, { "epoch": 0.9947003668976763, - "grad_norm": 0.696793794631958, + "grad_norm": 0.6948806643486023, "learning_rate": 4.378312270688952e-05, - "loss": 1.3446, + "loss": 1.3445, "step": 14640 }, { "epoch": 0.9950400869683381, - "grad_norm": 0.7753585577011108, + "grad_norm": 0.7738326787948608, "learning_rate": 4.378099945644789e-05, - "loss": 1.2673, + "loss": 1.2675, "step": 14645 }, { "epoch": 0.9953798070389999, - "grad_norm": 0.7199378609657288, + "grad_norm": 0.7190296649932861, "learning_rate": 4.3778876206006256e-05, - "loss": 1.4074, + "loss": 1.407, "step": 14650 }, { "epoch": 0.9957195271096616, - "grad_norm": 0.7430431842803955, + "grad_norm": 0.7412502765655518, "learning_rate": 4.377675295556461e-05, - "loss": 1.344, + "loss": 1.343, "step": 14655 }, { "epoch": 0.9960592471803235, - "grad_norm": 0.7380971312522888, + "grad_norm": 0.7375914454460144, "learning_rate": 4.3774629705122984e-05, "loss": 1.4623, "step": 14660 }, { "epoch": 0.9963989672509852, - "grad_norm": 0.7530075907707214, + "grad_norm": 0.7533898949623108, "learning_rate": 4.377250645468135e-05, - "loss": 1.3504, + "loss": 1.3503, "step": 14665 }, { "epoch": 0.9967386873216469, - "grad_norm": 0.7144792079925537, + "grad_norm": 0.7184464335441589, "learning_rate": 4.3770383204239705e-05, - "loss": 1.3724, + "loss": 1.3722, "step": 14670 }, { "epoch": 0.9970784073923087, - "grad_norm": 0.7846295833587646, + "grad_norm": 0.7925822734832764, "learning_rate": 4.3768259953798076e-05, - "loss": 1.4125, + "loss": 1.4126, "step": 14675 }, { "epoch": 0.9974181274629705, - "grad_norm": 0.6689630150794983, + "grad_norm": 0.6686549782752991, "learning_rate": 4.376613670335643e-05, - "loss": 1.3159, + "loss": 1.3167, "step": 14680 }, { "epoch": 0.9977578475336323, - "grad_norm": 0.6798150539398193, + "grad_norm": 0.6799967288970947, "learning_rate": 4.37640134529148e-05, - "loss": 1.2581, + "loss": 1.2582, "step": 14685 }, { "epoch": 0.998097567604294, - "grad_norm": 0.6772509217262268, + "grad_norm": 0.6778358221054077, "learning_rate": 4.376189020247317e-05, - "loss": 1.4554, + "loss": 1.4548, "step": 14690 }, { "epoch": 0.9984372876749559, - "grad_norm": 0.6583994030952454, + "grad_norm": 0.6608484983444214, "learning_rate": 4.3759766952031525e-05, - "loss": 1.4909, + "loss": 1.4911, "step": 14695 }, { "epoch": 0.9987770077456176, - "grad_norm": 0.648712158203125, + "grad_norm": 0.6505342125892639, "learning_rate": 4.375764370158989e-05, - "loss": 1.3482, + "loss": 1.349, "step": 14700 }, { "epoch": 0.9991167278162794, - "grad_norm": 0.6531977653503418, + "grad_norm": 0.6502706408500671, "learning_rate": 4.375552045114826e-05, - "loss": 1.3399, + "loss": 1.3401, "step": 14705 }, { "epoch": 0.9994564478869412, - "grad_norm": 0.7878149747848511, + "grad_norm": 0.7920843362808228, "learning_rate": 4.375339720070662e-05, - "loss": 1.3877, + "loss": 1.3881, "step": 14710 }, { "epoch": 0.999796167957603, - "grad_norm": 0.6606357097625732, + "grad_norm": 0.6571126580238342, "learning_rate": 4.375127395026498e-05, - "loss": 1.3811, + "loss": 1.3813, "step": 14715 }, { "epoch": 1.0, - "eval_loss": 1.493106484413147, - "eval_runtime": 189.937, - "eval_samples_per_second": 54.329, - "eval_steps_per_second": 6.792, + "eval_loss": 1.4914416074752808, + "eval_runtime": 188.386, + "eval_samples_per_second": 54.776, + "eval_steps_per_second": 6.848, "step": 14718 } ],