diff --git "a/qwen25_0.5b_lora_official_5e-05/checkpoint-14718/trainer_state.json" "b/qwen25_0.5b_lora_official_5e-05/checkpoint-14718/trainer_state.json" new file mode 100644--- /dev/null +++ "b/qwen25_0.5b_lora_official_5e-05/checkpoint-14718/trainer_state.json" @@ -0,0 +1,20642 @@ +{ + "best_metric": 0.002085434390429638, + "best_model_checkpoint": "./results-cc/qwen25-0.5b-instruct/qwen25_0.5b_lora_official_5e-05/checkpoint-14718", + "epoch": 1.0, + "eval_steps": 500, + "global_step": 14718, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003397200706617747, + "grad_norm": 2.2115156650543213, + "learning_rate": 4.9997876749558366e-05, + "loss": 3.2474, + "step": 5 + }, + { + "epoch": 0.0006794401413235494, + "grad_norm": 1.9220579862594604, + "learning_rate": 4.999575349911673e-05, + "loss": 2.8647, + "step": 10 + }, + { + "epoch": 0.0010191602119853241, + "grad_norm": 1.5465667247772217, + "learning_rate": 4.9993630248675094e-05, + "loss": 2.6643, + "step": 15 + }, + { + "epoch": 0.001358880282647099, + "grad_norm": 1.5638021230697632, + "learning_rate": 4.999150699823346e-05, + "loss": 2.3563, + "step": 20 + }, + { + "epoch": 0.0016986003533088735, + "grad_norm": 1.5559604167938232, + "learning_rate": 4.998938374779182e-05, + "loss": 2.2025, + "step": 25 + }, + { + "epoch": 0.0020383204239706482, + "grad_norm": 1.4563536643981934, + "learning_rate": 4.9987260497350186e-05, + "loss": 1.9061, + "step": 30 + }, + { + "epoch": 0.002378040494632423, + "grad_norm": 1.561199426651001, + "learning_rate": 4.998513724690855e-05, + "loss": 1.6777, + "step": 35 + }, + { + "epoch": 0.002717760565294198, + "grad_norm": 0.7528368234634399, + "learning_rate": 4.9983013996466914e-05, + "loss": 1.522, + "step": 40 + }, + { + "epoch": 0.0030574806359559724, + "grad_norm": 0.5895223021507263, + "learning_rate": 4.998089074602528e-05, + "loss": 1.5657, + "step": 45 + }, + { + "epoch": 0.003397200706617747, + "grad_norm": 0.6270008087158203, + "learning_rate": 4.997876749558364e-05, + "loss": 1.5043, + "step": 50 + }, + { + "epoch": 0.0037369207772795215, + "grad_norm": 0.5577742457389832, + "learning_rate": 4.9976644245142006e-05, + "loss": 1.5139, + "step": 55 + }, + { + "epoch": 0.0040766408479412965, + "grad_norm": 0.5614880323410034, + "learning_rate": 4.997452099470037e-05, + "loss": 1.5069, + "step": 60 + }, + { + "epoch": 0.0044163609186030715, + "grad_norm": 0.4737943410873413, + "learning_rate": 4.997239774425873e-05, + "loss": 1.5826, + "step": 65 + }, + { + "epoch": 0.004756080989264846, + "grad_norm": 0.5027161240577698, + "learning_rate": 4.99702744938171e-05, + "loss": 1.3716, + "step": 70 + }, + { + "epoch": 0.005095801059926621, + "grad_norm": 0.6184471845626831, + "learning_rate": 4.996815124337546e-05, + "loss": 1.4837, + "step": 75 + }, + { + "epoch": 0.005435521130588396, + "grad_norm": 0.507949948310852, + "learning_rate": 4.996602799293382e-05, + "loss": 1.5033, + "step": 80 + }, + { + "epoch": 0.00577524120125017, + "grad_norm": 0.43082889914512634, + "learning_rate": 4.996390474249219e-05, + "loss": 1.5037, + "step": 85 + }, + { + "epoch": 0.006114961271911945, + "grad_norm": 0.5485135912895203, + "learning_rate": 4.9961781492050554e-05, + "loss": 1.5205, + "step": 90 + }, + { + "epoch": 0.006454681342573719, + "grad_norm": 0.6378966569900513, + "learning_rate": 4.995965824160891e-05, + "loss": 1.505, + "step": 95 + }, + { + "epoch": 0.006794401413235494, + "grad_norm": 0.545911431312561, + "learning_rate": 4.995753499116728e-05, + "loss": 1.5321, + "step": 100 + }, + { + "epoch": 0.007134121483897269, + "grad_norm": 0.5056548714637756, + "learning_rate": 4.9955411740725646e-05, + "loss": 1.4546, + "step": 105 + }, + { + "epoch": 0.007473841554559043, + "grad_norm": 0.47536423802375793, + "learning_rate": 4.9953288490284004e-05, + "loss": 1.465, + "step": 110 + }, + { + "epoch": 0.007813561625220818, + "grad_norm": 0.5124016404151917, + "learning_rate": 4.9951165239842374e-05, + "loss": 1.5854, + "step": 115 + }, + { + "epoch": 0.008153281695882593, + "grad_norm": 0.4447226822376251, + "learning_rate": 4.994904198940074e-05, + "loss": 1.4954, + "step": 120 + }, + { + "epoch": 0.008493001766544368, + "grad_norm": 0.5351713299751282, + "learning_rate": 4.9946918738959096e-05, + "loss": 1.512, + "step": 125 + }, + { + "epoch": 0.008832721837206143, + "grad_norm": 0.49306538701057434, + "learning_rate": 4.9944795488517466e-05, + "loss": 1.4245, + "step": 130 + }, + { + "epoch": 0.009172441907867916, + "grad_norm": 0.5303800702095032, + "learning_rate": 4.994267223807583e-05, + "loss": 1.4689, + "step": 135 + }, + { + "epoch": 0.009512161978529691, + "grad_norm": 0.5413838624954224, + "learning_rate": 4.994054898763419e-05, + "loss": 1.4714, + "step": 140 + }, + { + "epoch": 0.009851882049191466, + "grad_norm": 0.5445604920387268, + "learning_rate": 4.993842573719256e-05, + "loss": 1.4894, + "step": 145 + }, + { + "epoch": 0.010191602119853241, + "grad_norm": 0.5994873642921448, + "learning_rate": 4.9936302486750916e-05, + "loss": 1.5265, + "step": 150 + }, + { + "epoch": 0.010531322190515016, + "grad_norm": 0.5787465572357178, + "learning_rate": 4.993417923630928e-05, + "loss": 1.546, + "step": 155 + }, + { + "epoch": 0.010871042261176791, + "grad_norm": 0.5157501101493835, + "learning_rate": 4.993205598586765e-05, + "loss": 1.4844, + "step": 160 + }, + { + "epoch": 0.011210762331838564, + "grad_norm": 0.5613402724266052, + "learning_rate": 4.992993273542601e-05, + "loss": 1.5223, + "step": 165 + }, + { + "epoch": 0.01155048240250034, + "grad_norm": 0.5664464235305786, + "learning_rate": 4.992780948498437e-05, + "loss": 1.4891, + "step": 170 + }, + { + "epoch": 0.011890202473162114, + "grad_norm": 0.5007671117782593, + "learning_rate": 4.992568623454274e-05, + "loss": 1.4462, + "step": 175 + }, + { + "epoch": 0.01222992254382389, + "grad_norm": 0.5545527935028076, + "learning_rate": 4.99235629841011e-05, + "loss": 1.5052, + "step": 180 + }, + { + "epoch": 0.012569642614485664, + "grad_norm": 0.48418471217155457, + "learning_rate": 4.992143973365947e-05, + "loss": 1.5056, + "step": 185 + }, + { + "epoch": 0.012909362685147438, + "grad_norm": 0.5149874687194824, + "learning_rate": 4.9919316483217835e-05, + "loss": 1.5157, + "step": 190 + }, + { + "epoch": 0.013249082755809213, + "grad_norm": 0.41576215624809265, + "learning_rate": 4.991719323277619e-05, + "loss": 1.5507, + "step": 195 + }, + { + "epoch": 0.013588802826470988, + "grad_norm": 0.6348430514335632, + "learning_rate": 4.991506998233456e-05, + "loss": 1.4514, + "step": 200 + }, + { + "epoch": 0.013928522897132763, + "grad_norm": 0.43560290336608887, + "learning_rate": 4.9912946731892927e-05, + "loss": 1.4208, + "step": 205 + }, + { + "epoch": 0.014268242967794538, + "grad_norm": 0.5400347709655762, + "learning_rate": 4.9910823481451284e-05, + "loss": 1.4863, + "step": 210 + }, + { + "epoch": 0.014607963038456313, + "grad_norm": 0.49760323762893677, + "learning_rate": 4.9908700231009655e-05, + "loss": 1.5158, + "step": 215 + }, + { + "epoch": 0.014947683109118086, + "grad_norm": 0.5227531790733337, + "learning_rate": 4.990657698056801e-05, + "loss": 1.5481, + "step": 220 + }, + { + "epoch": 0.015287403179779861, + "grad_norm": 0.6145650148391724, + "learning_rate": 4.9904453730126376e-05, + "loss": 1.5456, + "step": 225 + }, + { + "epoch": 0.015627123250441636, + "grad_norm": 0.469058096408844, + "learning_rate": 4.9902330479684747e-05, + "loss": 1.4884, + "step": 230 + }, + { + "epoch": 0.01596684332110341, + "grad_norm": 0.5135950446128845, + "learning_rate": 4.9900207229243104e-05, + "loss": 1.4663, + "step": 235 + }, + { + "epoch": 0.016306563391765186, + "grad_norm": 0.5521801710128784, + "learning_rate": 4.989808397880147e-05, + "loss": 1.423, + "step": 240 + }, + { + "epoch": 0.01664628346242696, + "grad_norm": 0.5495691895484924, + "learning_rate": 4.989596072835984e-05, + "loss": 1.4743, + "step": 245 + }, + { + "epoch": 0.016986003533088736, + "grad_norm": 0.444443017244339, + "learning_rate": 4.9893837477918196e-05, + "loss": 1.477, + "step": 250 + }, + { + "epoch": 0.01732572360375051, + "grad_norm": 0.5638461709022522, + "learning_rate": 4.989171422747656e-05, + "loss": 1.3996, + "step": 255 + }, + { + "epoch": 0.017665443674412286, + "grad_norm": 0.6463044881820679, + "learning_rate": 4.988959097703493e-05, + "loss": 1.4509, + "step": 260 + }, + { + "epoch": 0.01800516374507406, + "grad_norm": 0.489048033952713, + "learning_rate": 4.988746772659329e-05, + "loss": 1.5476, + "step": 265 + }, + { + "epoch": 0.018344883815735832, + "grad_norm": 0.5454437732696533, + "learning_rate": 4.988534447615165e-05, + "loss": 1.3992, + "step": 270 + }, + { + "epoch": 0.01868460388639761, + "grad_norm": 0.47111111879348755, + "learning_rate": 4.988322122571002e-05, + "loss": 1.5034, + "step": 275 + }, + { + "epoch": 0.019024323957059382, + "grad_norm": 0.5251321792602539, + "learning_rate": 4.988109797526838e-05, + "loss": 1.4966, + "step": 280 + }, + { + "epoch": 0.01936404402772116, + "grad_norm": 0.5076407194137573, + "learning_rate": 4.9878974724826744e-05, + "loss": 1.423, + "step": 285 + }, + { + "epoch": 0.019703764098382932, + "grad_norm": 0.4797162413597107, + "learning_rate": 4.987685147438511e-05, + "loss": 1.4852, + "step": 290 + }, + { + "epoch": 0.020043484169044706, + "grad_norm": 0.5633953213691711, + "learning_rate": 4.987472822394347e-05, + "loss": 1.5244, + "step": 295 + }, + { + "epoch": 0.020383204239706482, + "grad_norm": 0.5671972632408142, + "learning_rate": 4.9872604973501836e-05, + "loss": 1.4672, + "step": 300 + }, + { + "epoch": 0.020722924310368256, + "grad_norm": 0.5827104449272156, + "learning_rate": 4.98704817230602e-05, + "loss": 1.3738, + "step": 305 + }, + { + "epoch": 0.021062644381030032, + "grad_norm": 0.5388350486755371, + "learning_rate": 4.9868358472618564e-05, + "loss": 1.4657, + "step": 310 + }, + { + "epoch": 0.021402364451691806, + "grad_norm": 0.5764205455780029, + "learning_rate": 4.986623522217693e-05, + "loss": 1.4862, + "step": 315 + }, + { + "epoch": 0.021742084522353582, + "grad_norm": 0.5410416722297668, + "learning_rate": 4.986411197173529e-05, + "loss": 1.4702, + "step": 320 + }, + { + "epoch": 0.022081804593015356, + "grad_norm": 0.5311737656593323, + "learning_rate": 4.9861988721293656e-05, + "loss": 1.5223, + "step": 325 + }, + { + "epoch": 0.02242152466367713, + "grad_norm": 0.4751683473587036, + "learning_rate": 4.985986547085202e-05, + "loss": 1.4521, + "step": 330 + }, + { + "epoch": 0.022761244734338906, + "grad_norm": 0.5192616581916809, + "learning_rate": 4.9857742220410384e-05, + "loss": 1.5345, + "step": 335 + }, + { + "epoch": 0.02310096480500068, + "grad_norm": 0.5677399039268494, + "learning_rate": 4.985561896996875e-05, + "loss": 1.4637, + "step": 340 + }, + { + "epoch": 0.023440684875662456, + "grad_norm": 0.5919628143310547, + "learning_rate": 4.985349571952711e-05, + "loss": 1.4891, + "step": 345 + }, + { + "epoch": 0.02378040494632423, + "grad_norm": 0.5812225937843323, + "learning_rate": 4.9851372469085476e-05, + "loss": 1.3587, + "step": 350 + }, + { + "epoch": 0.024120125016986002, + "grad_norm": 0.5220487713813782, + "learning_rate": 4.984924921864384e-05, + "loss": 1.4029, + "step": 355 + }, + { + "epoch": 0.02445984508764778, + "grad_norm": 0.5572966933250427, + "learning_rate": 4.9847125968202204e-05, + "loss": 1.4791, + "step": 360 + }, + { + "epoch": 0.024799565158309552, + "grad_norm": 0.49765080213546753, + "learning_rate": 4.984500271776057e-05, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.02513928522897133, + "grad_norm": 0.5231608152389526, + "learning_rate": 4.984287946731893e-05, + "loss": 1.5089, + "step": 370 + }, + { + "epoch": 0.025479005299633102, + "grad_norm": 0.5667971968650818, + "learning_rate": 4.9840756216877296e-05, + "loss": 1.5159, + "step": 375 + }, + { + "epoch": 0.025818725370294875, + "grad_norm": 0.5797260999679565, + "learning_rate": 4.983863296643566e-05, + "loss": 1.5268, + "step": 380 + }, + { + "epoch": 0.026158445440956652, + "grad_norm": 0.512911856174469, + "learning_rate": 4.9836509715994024e-05, + "loss": 1.5491, + "step": 385 + }, + { + "epoch": 0.026498165511618425, + "grad_norm": 0.5526360273361206, + "learning_rate": 4.983438646555239e-05, + "loss": 1.5025, + "step": 390 + }, + { + "epoch": 0.026837885582280202, + "grad_norm": 0.5525634288787842, + "learning_rate": 4.983226321511075e-05, + "loss": 1.4455, + "step": 395 + }, + { + "epoch": 0.027177605652941975, + "grad_norm": 0.5490344762802124, + "learning_rate": 4.9830139964669116e-05, + "loss": 1.4632, + "step": 400 + }, + { + "epoch": 0.027517325723603752, + "grad_norm": 0.5567771792411804, + "learning_rate": 4.982801671422748e-05, + "loss": 1.4736, + "step": 405 + }, + { + "epoch": 0.027857045794265525, + "grad_norm": 0.5474903583526611, + "learning_rate": 4.9825893463785844e-05, + "loss": 1.4769, + "step": 410 + }, + { + "epoch": 0.0281967658649273, + "grad_norm": 0.6983373761177063, + "learning_rate": 4.982377021334421e-05, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.028536485935589075, + "grad_norm": 0.5765412449836731, + "learning_rate": 4.9821646962902565e-05, + "loss": 1.4371, + "step": 420 + }, + { + "epoch": 0.02887620600625085, + "grad_norm": 0.5680785775184631, + "learning_rate": 4.9819523712460936e-05, + "loss": 1.4653, + "step": 425 + }, + { + "epoch": 0.029215926076912625, + "grad_norm": 0.5412982106208801, + "learning_rate": 4.98174004620193e-05, + "loss": 1.4598, + "step": 430 + }, + { + "epoch": 0.0295556461475744, + "grad_norm": 0.5168296694755554, + "learning_rate": 4.981527721157766e-05, + "loss": 1.4298, + "step": 435 + }, + { + "epoch": 0.029895366218236172, + "grad_norm": 0.4910033643245697, + "learning_rate": 4.981315396113603e-05, + "loss": 1.4402, + "step": 440 + }, + { + "epoch": 0.03023508628889795, + "grad_norm": 0.5194764137268066, + "learning_rate": 4.981103071069439e-05, + "loss": 1.4435, + "step": 445 + }, + { + "epoch": 0.030574806359559722, + "grad_norm": 0.45196419954299927, + "learning_rate": 4.980890746025275e-05, + "loss": 1.3709, + "step": 450 + }, + { + "epoch": 0.0309145264302215, + "grad_norm": 0.5187885761260986, + "learning_rate": 4.980678420981112e-05, + "loss": 1.5127, + "step": 455 + }, + { + "epoch": 0.03125424650088327, + "grad_norm": 0.6062313914299011, + "learning_rate": 4.9804660959369484e-05, + "loss": 1.4292, + "step": 460 + }, + { + "epoch": 0.03159396657154505, + "grad_norm": 0.5382944345474243, + "learning_rate": 4.980253770892784e-05, + "loss": 1.4779, + "step": 465 + }, + { + "epoch": 0.03193368664220682, + "grad_norm": 0.5015461444854736, + "learning_rate": 4.980041445848621e-05, + "loss": 1.4652, + "step": 470 + }, + { + "epoch": 0.032273406712868595, + "grad_norm": 0.5272130370140076, + "learning_rate": 4.9798291208044576e-05, + "loss": 1.4272, + "step": 475 + }, + { + "epoch": 0.03261312678353037, + "grad_norm": 0.5345466732978821, + "learning_rate": 4.979616795760293e-05, + "loss": 1.435, + "step": 480 + }, + { + "epoch": 0.03295284685419215, + "grad_norm": 0.5273901224136353, + "learning_rate": 4.9794044707161304e-05, + "loss": 1.4544, + "step": 485 + }, + { + "epoch": 0.03329256692485392, + "grad_norm": 0.6292443871498108, + "learning_rate": 4.979192145671966e-05, + "loss": 1.4516, + "step": 490 + }, + { + "epoch": 0.033632286995515695, + "grad_norm": 0.656648576259613, + "learning_rate": 4.9789798206278025e-05, + "loss": 1.4173, + "step": 495 + }, + { + "epoch": 0.03397200706617747, + "grad_norm": 0.5808439254760742, + "learning_rate": 4.9787674955836396e-05, + "loss": 1.4431, + "step": 500 + }, + { + "epoch": 0.03431172713683924, + "grad_norm": 0.502206027507782, + "learning_rate": 4.978555170539475e-05, + "loss": 1.4618, + "step": 505 + }, + { + "epoch": 0.03465144720750102, + "grad_norm": 0.5035110712051392, + "learning_rate": 4.978342845495312e-05, + "loss": 1.3757, + "step": 510 + }, + { + "epoch": 0.034991167278162795, + "grad_norm": 0.5452114939689636, + "learning_rate": 4.978130520451149e-05, + "loss": 1.4199, + "step": 515 + }, + { + "epoch": 0.03533088734882457, + "grad_norm": 0.5839388370513916, + "learning_rate": 4.9779181954069845e-05, + "loss": 1.5315, + "step": 520 + }, + { + "epoch": 0.03567060741948634, + "grad_norm": 0.5461863875389099, + "learning_rate": 4.9777058703628216e-05, + "loss": 1.5461, + "step": 525 + }, + { + "epoch": 0.03601032749014812, + "grad_norm": 0.5233834385871887, + "learning_rate": 4.977493545318658e-05, + "loss": 1.4343, + "step": 530 + }, + { + "epoch": 0.036350047560809895, + "grad_norm": 0.5493288040161133, + "learning_rate": 4.977281220274494e-05, + "loss": 1.5356, + "step": 535 + }, + { + "epoch": 0.036689767631471665, + "grad_norm": 0.5625514984130859, + "learning_rate": 4.977068895230331e-05, + "loss": 1.6131, + "step": 540 + }, + { + "epoch": 0.03702948770213344, + "grad_norm": 0.5404923558235168, + "learning_rate": 4.976856570186167e-05, + "loss": 1.4323, + "step": 545 + }, + { + "epoch": 0.03736920777279522, + "grad_norm": 0.4866144359111786, + "learning_rate": 4.976644245142003e-05, + "loss": 1.4887, + "step": 550 + }, + { + "epoch": 0.03770892784345699, + "grad_norm": 0.5609140396118164, + "learning_rate": 4.97643192009784e-05, + "loss": 1.4837, + "step": 555 + }, + { + "epoch": 0.038048647914118765, + "grad_norm": 0.5729807019233704, + "learning_rate": 4.9762195950536764e-05, + "loss": 1.5659, + "step": 560 + }, + { + "epoch": 0.03838836798478054, + "grad_norm": 0.5228670239448547, + "learning_rate": 4.976007270009512e-05, + "loss": 1.4805, + "step": 565 + }, + { + "epoch": 0.03872808805544232, + "grad_norm": 0.5944375991821289, + "learning_rate": 4.975794944965349e-05, + "loss": 1.4658, + "step": 570 + }, + { + "epoch": 0.03906780812610409, + "grad_norm": 0.5736159682273865, + "learning_rate": 4.975582619921185e-05, + "loss": 1.4871, + "step": 575 + }, + { + "epoch": 0.039407528196765865, + "grad_norm": 0.5262963175773621, + "learning_rate": 4.975370294877021e-05, + "loss": 1.4684, + "step": 580 + }, + { + "epoch": 0.03974724826742764, + "grad_norm": 0.5111757516860962, + "learning_rate": 4.9751579698328584e-05, + "loss": 1.46, + "step": 585 + }, + { + "epoch": 0.04008696833808941, + "grad_norm": 0.575961172580719, + "learning_rate": 4.974945644788694e-05, + "loss": 1.4456, + "step": 590 + }, + { + "epoch": 0.04042668840875119, + "grad_norm": 0.5009347200393677, + "learning_rate": 4.9747333197445305e-05, + "loss": 1.4215, + "step": 595 + }, + { + "epoch": 0.040766408479412965, + "grad_norm": 0.566643476486206, + "learning_rate": 4.9745209947003676e-05, + "loss": 1.3709, + "step": 600 + }, + { + "epoch": 0.04110612855007474, + "grad_norm": 0.5933975577354431, + "learning_rate": 4.974308669656203e-05, + "loss": 1.4551, + "step": 605 + }, + { + "epoch": 0.04144584862073651, + "grad_norm": 0.6027320623397827, + "learning_rate": 4.97409634461204e-05, + "loss": 1.4568, + "step": 610 + }, + { + "epoch": 0.04178556869139829, + "grad_norm": 0.6281547546386719, + "learning_rate": 4.973884019567877e-05, + "loss": 1.4873, + "step": 615 + }, + { + "epoch": 0.042125288762060065, + "grad_norm": 0.5826748609542847, + "learning_rate": 4.9736716945237125e-05, + "loss": 1.4697, + "step": 620 + }, + { + "epoch": 0.042465008832721834, + "grad_norm": 0.5156673789024353, + "learning_rate": 4.973459369479549e-05, + "loss": 1.4339, + "step": 625 + }, + { + "epoch": 0.04280472890338361, + "grad_norm": 0.5677419900894165, + "learning_rate": 4.973247044435386e-05, + "loss": 1.3631, + "step": 630 + }, + { + "epoch": 0.04314444897404539, + "grad_norm": 0.5827733278274536, + "learning_rate": 4.973034719391222e-05, + "loss": 1.4643, + "step": 635 + }, + { + "epoch": 0.043484169044707165, + "grad_norm": 0.49551665782928467, + "learning_rate": 4.972822394347058e-05, + "loss": 1.3798, + "step": 640 + }, + { + "epoch": 0.043823889115368934, + "grad_norm": 0.5372036099433899, + "learning_rate": 4.9726100693028945e-05, + "loss": 1.4887, + "step": 645 + }, + { + "epoch": 0.04416360918603071, + "grad_norm": 0.5611512660980225, + "learning_rate": 4.972397744258731e-05, + "loss": 1.4698, + "step": 650 + }, + { + "epoch": 0.04450332925669249, + "grad_norm": 0.6036912202835083, + "learning_rate": 4.972185419214567e-05, + "loss": 1.4582, + "step": 655 + }, + { + "epoch": 0.04484304932735426, + "grad_norm": 0.6422983407974243, + "learning_rate": 4.971973094170404e-05, + "loss": 1.3964, + "step": 660 + }, + { + "epoch": 0.045182769398016034, + "grad_norm": 0.4992437958717346, + "learning_rate": 4.97176076912624e-05, + "loss": 1.4224, + "step": 665 + }, + { + "epoch": 0.04552248946867781, + "grad_norm": 0.5885098576545715, + "learning_rate": 4.9715484440820765e-05, + "loss": 1.4885, + "step": 670 + }, + { + "epoch": 0.04586220953933958, + "grad_norm": 0.6584553718566895, + "learning_rate": 4.971336119037913e-05, + "loss": 1.3926, + "step": 675 + }, + { + "epoch": 0.04620192961000136, + "grad_norm": 0.5546813607215881, + "learning_rate": 4.971123793993749e-05, + "loss": 1.4001, + "step": 680 + }, + { + "epoch": 0.046541649680663134, + "grad_norm": 0.5703087449073792, + "learning_rate": 4.970911468949586e-05, + "loss": 1.4585, + "step": 685 + }, + { + "epoch": 0.04688136975132491, + "grad_norm": 0.5438553690910339, + "learning_rate": 4.970699143905422e-05, + "loss": 1.5377, + "step": 690 + }, + { + "epoch": 0.04722108982198668, + "grad_norm": 0.5606414675712585, + "learning_rate": 4.9704868188612585e-05, + "loss": 1.4817, + "step": 695 + }, + { + "epoch": 0.04756080989264846, + "grad_norm": 0.5491502285003662, + "learning_rate": 4.970274493817095e-05, + "loss": 1.504, + "step": 700 + }, + { + "epoch": 0.047900529963310234, + "grad_norm": 0.6139447093009949, + "learning_rate": 4.970062168772931e-05, + "loss": 1.5721, + "step": 705 + }, + { + "epoch": 0.048240250033972004, + "grad_norm": 0.5045318603515625, + "learning_rate": 4.969849843728768e-05, + "loss": 1.5299, + "step": 710 + }, + { + "epoch": 0.04857997010463378, + "grad_norm": 0.5747368931770325, + "learning_rate": 4.969637518684604e-05, + "loss": 1.5473, + "step": 715 + }, + { + "epoch": 0.04891969017529556, + "grad_norm": 0.5617840886116028, + "learning_rate": 4.9694251936404405e-05, + "loss": 1.5111, + "step": 720 + }, + { + "epoch": 0.049259410245957334, + "grad_norm": 0.5222535729408264, + "learning_rate": 4.969212868596277e-05, + "loss": 1.3683, + "step": 725 + }, + { + "epoch": 0.049599130316619104, + "grad_norm": 0.4956512749195099, + "learning_rate": 4.9690005435521133e-05, + "loss": 1.4828, + "step": 730 + }, + { + "epoch": 0.04993885038728088, + "grad_norm": 0.6255701780319214, + "learning_rate": 4.96878821850795e-05, + "loss": 1.4093, + "step": 735 + }, + { + "epoch": 0.05027857045794266, + "grad_norm": 0.5820271968841553, + "learning_rate": 4.968575893463786e-05, + "loss": 1.4821, + "step": 740 + }, + { + "epoch": 0.05061829052860443, + "grad_norm": 0.5157151818275452, + "learning_rate": 4.9683635684196225e-05, + "loss": 1.4265, + "step": 745 + }, + { + "epoch": 0.050958010599266204, + "grad_norm": 0.5513901710510254, + "learning_rate": 4.968151243375459e-05, + "loss": 1.5006, + "step": 750 + }, + { + "epoch": 0.05129773066992798, + "grad_norm": 0.5603316426277161, + "learning_rate": 4.9679389183312953e-05, + "loss": 1.5731, + "step": 755 + }, + { + "epoch": 0.05163745074058975, + "grad_norm": 0.52471524477005, + "learning_rate": 4.967726593287132e-05, + "loss": 1.474, + "step": 760 + }, + { + "epoch": 0.05197717081125153, + "grad_norm": 0.588664174079895, + "learning_rate": 4.967514268242968e-05, + "loss": 1.4958, + "step": 765 + }, + { + "epoch": 0.052316890881913304, + "grad_norm": 0.5561087131500244, + "learning_rate": 4.9673019431988045e-05, + "loss": 1.4695, + "step": 770 + }, + { + "epoch": 0.05265661095257508, + "grad_norm": 0.5775383710861206, + "learning_rate": 4.96708961815464e-05, + "loss": 1.6117, + "step": 775 + }, + { + "epoch": 0.05299633102323685, + "grad_norm": 0.5280258655548096, + "learning_rate": 4.9668772931104773e-05, + "loss": 1.5171, + "step": 780 + }, + { + "epoch": 0.05333605109389863, + "grad_norm": 0.589007556438446, + "learning_rate": 4.966664968066314e-05, + "loss": 1.388, + "step": 785 + }, + { + "epoch": 0.053675771164560404, + "grad_norm": 0.6063833236694336, + "learning_rate": 4.9664526430221495e-05, + "loss": 1.5372, + "step": 790 + }, + { + "epoch": 0.054015491235222174, + "grad_norm": 0.5418073534965515, + "learning_rate": 4.9662403179779865e-05, + "loss": 1.4476, + "step": 795 + }, + { + "epoch": 0.05435521130588395, + "grad_norm": 0.5852705240249634, + "learning_rate": 4.966027992933823e-05, + "loss": 1.5328, + "step": 800 + }, + { + "epoch": 0.05469493137654573, + "grad_norm": 0.6038408875465393, + "learning_rate": 4.965815667889659e-05, + "loss": 1.4551, + "step": 805 + }, + { + "epoch": 0.055034651447207504, + "grad_norm": 0.5931604504585266, + "learning_rate": 4.965603342845496e-05, + "loss": 1.5199, + "step": 810 + }, + { + "epoch": 0.055374371517869274, + "grad_norm": 0.5286008715629578, + "learning_rate": 4.965391017801332e-05, + "loss": 1.3396, + "step": 815 + }, + { + "epoch": 0.05571409158853105, + "grad_norm": 0.6526750326156616, + "learning_rate": 4.965178692757168e-05, + "loss": 1.4556, + "step": 820 + }, + { + "epoch": 0.05605381165919283, + "grad_norm": 0.6939230561256409, + "learning_rate": 4.964966367713005e-05, + "loss": 1.4602, + "step": 825 + }, + { + "epoch": 0.0563935317298546, + "grad_norm": 0.5691767334938049, + "learning_rate": 4.9647540426688413e-05, + "loss": 1.5759, + "step": 830 + }, + { + "epoch": 0.056733251800516374, + "grad_norm": 0.6197113394737244, + "learning_rate": 4.964541717624677e-05, + "loss": 1.4188, + "step": 835 + }, + { + "epoch": 0.05707297187117815, + "grad_norm": 0.5608623027801514, + "learning_rate": 4.964329392580514e-05, + "loss": 1.4783, + "step": 840 + }, + { + "epoch": 0.05741269194183993, + "grad_norm": 0.6069940328598022, + "learning_rate": 4.96411706753635e-05, + "loss": 1.5059, + "step": 845 + }, + { + "epoch": 0.0577524120125017, + "grad_norm": 0.5475958585739136, + "learning_rate": 4.963904742492186e-05, + "loss": 1.4701, + "step": 850 + }, + { + "epoch": 0.058092132083163474, + "grad_norm": 0.580152690410614, + "learning_rate": 4.9636924174480234e-05, + "loss": 1.3949, + "step": 855 + }, + { + "epoch": 0.05843185215382525, + "grad_norm": 0.6389477849006653, + "learning_rate": 4.963480092403859e-05, + "loss": 1.5599, + "step": 860 + }, + { + "epoch": 0.05877157222448702, + "grad_norm": 0.6286891102790833, + "learning_rate": 4.963267767359696e-05, + "loss": 1.5362, + "step": 865 + }, + { + "epoch": 0.0591112922951488, + "grad_norm": 0.6547082662582397, + "learning_rate": 4.9630554423155326e-05, + "loss": 1.4031, + "step": 870 + }, + { + "epoch": 0.059451012365810574, + "grad_norm": 0.4733664095401764, + "learning_rate": 4.962843117271368e-05, + "loss": 1.5581, + "step": 875 + }, + { + "epoch": 0.059790732436472344, + "grad_norm": 0.515631377696991, + "learning_rate": 4.9626307922272054e-05, + "loss": 1.4553, + "step": 880 + }, + { + "epoch": 0.06013045250713412, + "grad_norm": 0.6194367408752441, + "learning_rate": 4.962418467183042e-05, + "loss": 1.4622, + "step": 885 + }, + { + "epoch": 0.0604701725777959, + "grad_norm": 0.5636453032493591, + "learning_rate": 4.9622061421388775e-05, + "loss": 1.4836, + "step": 890 + }, + { + "epoch": 0.060809892648457674, + "grad_norm": 0.519422173500061, + "learning_rate": 4.9619938170947146e-05, + "loss": 1.4085, + "step": 895 + }, + { + "epoch": 0.061149612719119444, + "grad_norm": 0.5942779779434204, + "learning_rate": 4.961781492050551e-05, + "loss": 1.4996, + "step": 900 + }, + { + "epoch": 0.06148933278978122, + "grad_norm": 0.6055877208709717, + "learning_rate": 4.961569167006387e-05, + "loss": 1.4698, + "step": 905 + }, + { + "epoch": 0.061829052860443, + "grad_norm": 0.5699920058250427, + "learning_rate": 4.961356841962224e-05, + "loss": 1.4858, + "step": 910 + }, + { + "epoch": 0.06216877293110477, + "grad_norm": 0.5864441394805908, + "learning_rate": 4.9611445169180595e-05, + "loss": 1.4402, + "step": 915 + }, + { + "epoch": 0.06250849300176654, + "grad_norm": 0.44144207239151, + "learning_rate": 4.960932191873896e-05, + "loss": 1.4507, + "step": 920 + }, + { + "epoch": 0.06284821307242831, + "grad_norm": 0.6052708625793457, + "learning_rate": 4.960719866829733e-05, + "loss": 1.4825, + "step": 925 + }, + { + "epoch": 0.0631879331430901, + "grad_norm": 0.5513001680374146, + "learning_rate": 4.960507541785569e-05, + "loss": 1.4192, + "step": 930 + }, + { + "epoch": 0.06352765321375187, + "grad_norm": 0.5435033440589905, + "learning_rate": 4.960295216741405e-05, + "loss": 1.45, + "step": 935 + }, + { + "epoch": 0.06386737328441364, + "grad_norm": 0.5514296889305115, + "learning_rate": 4.960082891697242e-05, + "loss": 1.4641, + "step": 940 + }, + { + "epoch": 0.06420709335507542, + "grad_norm": 0.6034606695175171, + "learning_rate": 4.959870566653078e-05, + "loss": 1.4519, + "step": 945 + }, + { + "epoch": 0.06454681342573719, + "grad_norm": 0.5675321221351624, + "learning_rate": 4.959658241608914e-05, + "loss": 1.4714, + "step": 950 + }, + { + "epoch": 0.06488653349639897, + "grad_norm": 0.6714404225349426, + "learning_rate": 4.9594459165647514e-05, + "loss": 1.4705, + "step": 955 + }, + { + "epoch": 0.06522625356706074, + "grad_norm": 0.552984356880188, + "learning_rate": 4.959233591520587e-05, + "loss": 1.3832, + "step": 960 + }, + { + "epoch": 0.06556597363772251, + "grad_norm": 0.6472477316856384, + "learning_rate": 4.9590212664764235e-05, + "loss": 1.5058, + "step": 965 + }, + { + "epoch": 0.0659056937083843, + "grad_norm": 0.6469607353210449, + "learning_rate": 4.9588089414322606e-05, + "loss": 1.464, + "step": 970 + }, + { + "epoch": 0.06624541377904607, + "grad_norm": 0.5577409863471985, + "learning_rate": 4.958596616388096e-05, + "loss": 1.4982, + "step": 975 + }, + { + "epoch": 0.06658513384970784, + "grad_norm": 0.5370805263519287, + "learning_rate": 4.958384291343933e-05, + "loss": 1.5008, + "step": 980 + }, + { + "epoch": 0.06692485392036962, + "grad_norm": 0.48894742131233215, + "learning_rate": 4.95817196629977e-05, + "loss": 1.3468, + "step": 985 + }, + { + "epoch": 0.06726457399103139, + "grad_norm": 0.6260393261909485, + "learning_rate": 4.9579596412556055e-05, + "loss": 1.5571, + "step": 990 + }, + { + "epoch": 0.06760429406169316, + "grad_norm": 0.5318644642829895, + "learning_rate": 4.957747316211442e-05, + "loss": 1.4583, + "step": 995 + }, + { + "epoch": 0.06794401413235494, + "grad_norm": 0.5583365559577942, + "learning_rate": 4.957534991167278e-05, + "loss": 1.3278, + "step": 1000 + }, + { + "epoch": 0.06828373420301671, + "grad_norm": 0.5711193084716797, + "learning_rate": 4.957322666123115e-05, + "loss": 1.4424, + "step": 1005 + }, + { + "epoch": 0.06862345427367848, + "grad_norm": 0.5027225017547607, + "learning_rate": 4.957110341078951e-05, + "loss": 1.4588, + "step": 1010 + }, + { + "epoch": 0.06896317434434027, + "grad_norm": 0.5902908444404602, + "learning_rate": 4.9568980160347875e-05, + "loss": 1.5481, + "step": 1015 + }, + { + "epoch": 0.06930289441500204, + "grad_norm": 0.5455006957054138, + "learning_rate": 4.956685690990624e-05, + "loss": 1.4043, + "step": 1020 + }, + { + "epoch": 0.0696426144856638, + "grad_norm": 0.5535411834716797, + "learning_rate": 4.95647336594646e-05, + "loss": 1.4274, + "step": 1025 + }, + { + "epoch": 0.06998233455632559, + "grad_norm": 0.5849533677101135, + "learning_rate": 4.956261040902297e-05, + "loss": 1.4464, + "step": 1030 + }, + { + "epoch": 0.07032205462698736, + "grad_norm": 0.5384759902954102, + "learning_rate": 4.956048715858133e-05, + "loss": 1.4941, + "step": 1035 + }, + { + "epoch": 0.07066177469764914, + "grad_norm": 0.5651819109916687, + "learning_rate": 4.9558363908139695e-05, + "loss": 1.4649, + "step": 1040 + }, + { + "epoch": 0.07100149476831091, + "grad_norm": 0.6011462211608887, + "learning_rate": 4.955624065769806e-05, + "loss": 1.4573, + "step": 1045 + }, + { + "epoch": 0.07134121483897268, + "grad_norm": 0.6316829323768616, + "learning_rate": 4.955411740725642e-05, + "loss": 1.456, + "step": 1050 + }, + { + "epoch": 0.07168093490963447, + "grad_norm": 0.593286395072937, + "learning_rate": 4.955199415681479e-05, + "loss": 1.3989, + "step": 1055 + }, + { + "epoch": 0.07202065498029624, + "grad_norm": 0.5714511871337891, + "learning_rate": 4.954987090637315e-05, + "loss": 1.3714, + "step": 1060 + }, + { + "epoch": 0.072360375050958, + "grad_norm": 0.5762469172477722, + "learning_rate": 4.9547747655931515e-05, + "loss": 1.3775, + "step": 1065 + }, + { + "epoch": 0.07270009512161979, + "grad_norm": 0.5935518741607666, + "learning_rate": 4.954562440548988e-05, + "loss": 1.4325, + "step": 1070 + }, + { + "epoch": 0.07303981519228156, + "grad_norm": 0.4853309392929077, + "learning_rate": 4.954350115504824e-05, + "loss": 1.3883, + "step": 1075 + }, + { + "epoch": 0.07337953526294333, + "grad_norm": 0.5568351149559021, + "learning_rate": 4.954137790460661e-05, + "loss": 1.454, + "step": 1080 + }, + { + "epoch": 0.07371925533360511, + "grad_norm": 0.5317642688751221, + "learning_rate": 4.953925465416497e-05, + "loss": 1.4239, + "step": 1085 + }, + { + "epoch": 0.07405897540426688, + "grad_norm": 0.5457822680473328, + "learning_rate": 4.9537131403723335e-05, + "loss": 1.4276, + "step": 1090 + }, + { + "epoch": 0.07439869547492865, + "grad_norm": 0.4952144920825958, + "learning_rate": 4.95350081532817e-05, + "loss": 1.4777, + "step": 1095 + }, + { + "epoch": 0.07473841554559044, + "grad_norm": 0.6003117561340332, + "learning_rate": 4.953288490284006e-05, + "loss": 1.3981, + "step": 1100 + }, + { + "epoch": 0.0750781356162522, + "grad_norm": 0.5776582360267639, + "learning_rate": 4.953076165239843e-05, + "loss": 1.456, + "step": 1105 + }, + { + "epoch": 0.07541785568691398, + "grad_norm": 0.58598792552948, + "learning_rate": 4.952863840195679e-05, + "loss": 1.4497, + "step": 1110 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 0.5512722730636597, + "learning_rate": 4.952651515151515e-05, + "loss": 1.491, + "step": 1115 + }, + { + "epoch": 0.07609729582823753, + "grad_norm": 0.5397170782089233, + "learning_rate": 4.952439190107352e-05, + "loss": 1.4648, + "step": 1120 + }, + { + "epoch": 0.07643701589889931, + "grad_norm": 0.5717841386795044, + "learning_rate": 4.952226865063188e-05, + "loss": 1.4529, + "step": 1125 + }, + { + "epoch": 0.07677673596956108, + "grad_norm": 0.5555099248886108, + "learning_rate": 4.952014540019024e-05, + "loss": 1.379, + "step": 1130 + }, + { + "epoch": 0.07711645604022285, + "grad_norm": 0.6435902714729309, + "learning_rate": 4.951802214974861e-05, + "loss": 1.427, + "step": 1135 + }, + { + "epoch": 0.07745617611088464, + "grad_norm": 0.6450826525688171, + "learning_rate": 4.9515898899306975e-05, + "loss": 1.3249, + "step": 1140 + }, + { + "epoch": 0.0777958961815464, + "grad_norm": 0.6193044781684875, + "learning_rate": 4.951377564886533e-05, + "loss": 1.5011, + "step": 1145 + }, + { + "epoch": 0.07813561625220818, + "grad_norm": 0.6083833575248718, + "learning_rate": 4.95116523984237e-05, + "loss": 1.3717, + "step": 1150 + }, + { + "epoch": 0.07847533632286996, + "grad_norm": 0.5254198908805847, + "learning_rate": 4.950952914798207e-05, + "loss": 1.4496, + "step": 1155 + }, + { + "epoch": 0.07881505639353173, + "grad_norm": 0.5539206862449646, + "learning_rate": 4.9507405897540424e-05, + "loss": 1.4013, + "step": 1160 + }, + { + "epoch": 0.0791547764641935, + "grad_norm": 0.597885012626648, + "learning_rate": 4.9505282647098795e-05, + "loss": 1.4537, + "step": 1165 + }, + { + "epoch": 0.07949449653485528, + "grad_norm": 0.5942360758781433, + "learning_rate": 4.950315939665716e-05, + "loss": 1.4627, + "step": 1170 + }, + { + "epoch": 0.07983421660551705, + "grad_norm": 0.5833660364151001, + "learning_rate": 4.9501036146215516e-05, + "loss": 1.4865, + "step": 1175 + }, + { + "epoch": 0.08017393667617882, + "grad_norm": 0.5241972208023071, + "learning_rate": 4.949891289577389e-05, + "loss": 1.4778, + "step": 1180 + }, + { + "epoch": 0.0805136567468406, + "grad_norm": 0.48506563901901245, + "learning_rate": 4.949678964533225e-05, + "loss": 1.5036, + "step": 1185 + }, + { + "epoch": 0.08085337681750238, + "grad_norm": 0.5333223342895508, + "learning_rate": 4.949466639489061e-05, + "loss": 1.428, + "step": 1190 + }, + { + "epoch": 0.08119309688816416, + "grad_norm": 0.6433978080749512, + "learning_rate": 4.949254314444898e-05, + "loss": 1.3285, + "step": 1195 + }, + { + "epoch": 0.08153281695882593, + "grad_norm": 0.6657341122627258, + "learning_rate": 4.9490419894007336e-05, + "loss": 1.4037, + "step": 1200 + }, + { + "epoch": 0.0818725370294877, + "grad_norm": 0.5360137224197388, + "learning_rate": 4.948829664356571e-05, + "loss": 1.5553, + "step": 1205 + }, + { + "epoch": 0.08221225710014948, + "grad_norm": 0.5867024064064026, + "learning_rate": 4.948617339312407e-05, + "loss": 1.4141, + "step": 1210 + }, + { + "epoch": 0.08255197717081125, + "grad_norm": 0.5557378530502319, + "learning_rate": 4.948405014268243e-05, + "loss": 1.4631, + "step": 1215 + }, + { + "epoch": 0.08289169724147302, + "grad_norm": 0.6246277689933777, + "learning_rate": 4.94819268922408e-05, + "loss": 1.3083, + "step": 1220 + }, + { + "epoch": 0.0832314173121348, + "grad_norm": 0.5651764869689941, + "learning_rate": 4.947980364179916e-05, + "loss": 1.4385, + "step": 1225 + }, + { + "epoch": 0.08357113738279658, + "grad_norm": 0.619651198387146, + "learning_rate": 4.947768039135752e-05, + "loss": 1.4379, + "step": 1230 + }, + { + "epoch": 0.08391085745345835, + "grad_norm": 0.5746042728424072, + "learning_rate": 4.947555714091589e-05, + "loss": 1.534, + "step": 1235 + }, + { + "epoch": 0.08425057752412013, + "grad_norm": 0.6215890049934387, + "learning_rate": 4.9473433890474255e-05, + "loss": 1.3487, + "step": 1240 + }, + { + "epoch": 0.0845902975947819, + "grad_norm": 0.5779523849487305, + "learning_rate": 4.947131064003261e-05, + "loss": 1.3078, + "step": 1245 + }, + { + "epoch": 0.08493001766544367, + "grad_norm": 0.6031992435455322, + "learning_rate": 4.946918738959098e-05, + "loss": 1.387, + "step": 1250 + }, + { + "epoch": 0.08526973773610545, + "grad_norm": 0.5986976027488708, + "learning_rate": 4.946706413914935e-05, + "loss": 1.4365, + "step": 1255 + }, + { + "epoch": 0.08560945780676722, + "grad_norm": 0.5761814713478088, + "learning_rate": 4.9464940888707704e-05, + "loss": 1.4215, + "step": 1260 + }, + { + "epoch": 0.08594917787742899, + "grad_norm": 0.5750184655189514, + "learning_rate": 4.9462817638266075e-05, + "loss": 1.4771, + "step": 1265 + }, + { + "epoch": 0.08628889794809078, + "grad_norm": 0.608689546585083, + "learning_rate": 4.946069438782443e-05, + "loss": 1.4386, + "step": 1270 + }, + { + "epoch": 0.08662861801875255, + "grad_norm": 0.609923779964447, + "learning_rate": 4.9458571137382796e-05, + "loss": 1.4066, + "step": 1275 + }, + { + "epoch": 0.08696833808941433, + "grad_norm": 0.5635353922843933, + "learning_rate": 4.945644788694117e-05, + "loss": 1.4556, + "step": 1280 + }, + { + "epoch": 0.0873080581600761, + "grad_norm": 0.6076200604438782, + "learning_rate": 4.9454324636499524e-05, + "loss": 1.5373, + "step": 1285 + }, + { + "epoch": 0.08764777823073787, + "grad_norm": 0.5832569599151611, + "learning_rate": 4.945220138605789e-05, + "loss": 1.4948, + "step": 1290 + }, + { + "epoch": 0.08798749830139965, + "grad_norm": 0.5482774972915649, + "learning_rate": 4.945007813561626e-05, + "loss": 1.4732, + "step": 1295 + }, + { + "epoch": 0.08832721837206142, + "grad_norm": 0.567272961139679, + "learning_rate": 4.9447954885174616e-05, + "loss": 1.4416, + "step": 1300 + }, + { + "epoch": 0.08866693844272319, + "grad_norm": 0.6006086468696594, + "learning_rate": 4.944583163473298e-05, + "loss": 1.431, + "step": 1305 + }, + { + "epoch": 0.08900665851338498, + "grad_norm": 0.5554898381233215, + "learning_rate": 4.944370838429135e-05, + "loss": 1.4923, + "step": 1310 + }, + { + "epoch": 0.08934637858404675, + "grad_norm": 0.7005094289779663, + "learning_rate": 4.944158513384971e-05, + "loss": 1.3814, + "step": 1315 + }, + { + "epoch": 0.08968609865470852, + "grad_norm": 0.5457586050033569, + "learning_rate": 4.943946188340807e-05, + "loss": 1.3287, + "step": 1320 + }, + { + "epoch": 0.0900258187253703, + "grad_norm": 0.5568594932556152, + "learning_rate": 4.943733863296644e-05, + "loss": 1.4231, + "step": 1325 + }, + { + "epoch": 0.09036553879603207, + "grad_norm": 0.564529538154602, + "learning_rate": 4.94352153825248e-05, + "loss": 1.4203, + "step": 1330 + }, + { + "epoch": 0.09070525886669384, + "grad_norm": 0.7224603891372681, + "learning_rate": 4.9433092132083164e-05, + "loss": 1.4963, + "step": 1335 + }, + { + "epoch": 0.09104497893735562, + "grad_norm": 0.5889787077903748, + "learning_rate": 4.9430968881641535e-05, + "loss": 1.5203, + "step": 1340 + }, + { + "epoch": 0.09138469900801739, + "grad_norm": 0.6516150236129761, + "learning_rate": 4.942884563119989e-05, + "loss": 1.3814, + "step": 1345 + }, + { + "epoch": 0.09172441907867916, + "grad_norm": 0.5434114336967468, + "learning_rate": 4.9426722380758256e-05, + "loss": 1.4401, + "step": 1350 + }, + { + "epoch": 0.09206413914934095, + "grad_norm": 0.5956789255142212, + "learning_rate": 4.942459913031662e-05, + "loss": 1.435, + "step": 1355 + }, + { + "epoch": 0.09240385922000272, + "grad_norm": 0.5733022093772888, + "learning_rate": 4.9422475879874984e-05, + "loss": 1.3873, + "step": 1360 + }, + { + "epoch": 0.0927435792906645, + "grad_norm": 0.591184139251709, + "learning_rate": 4.942035262943335e-05, + "loss": 1.3833, + "step": 1365 + }, + { + "epoch": 0.09308329936132627, + "grad_norm": 0.5225968360900879, + "learning_rate": 4.941822937899171e-05, + "loss": 1.4241, + "step": 1370 + }, + { + "epoch": 0.09342301943198804, + "grad_norm": 0.46089863777160645, + "learning_rate": 4.9416106128550076e-05, + "loss": 1.3174, + "step": 1375 + }, + { + "epoch": 0.09376273950264982, + "grad_norm": 0.5939785838127136, + "learning_rate": 4.941398287810844e-05, + "loss": 1.5062, + "step": 1380 + }, + { + "epoch": 0.09410245957331159, + "grad_norm": 0.5519104599952698, + "learning_rate": 4.9411859627666804e-05, + "loss": 1.3897, + "step": 1385 + }, + { + "epoch": 0.09444217964397336, + "grad_norm": 0.5778692960739136, + "learning_rate": 4.940973637722517e-05, + "loss": 1.4834, + "step": 1390 + }, + { + "epoch": 0.09478189971463515, + "grad_norm": 0.5841971039772034, + "learning_rate": 4.940761312678353e-05, + "loss": 1.3967, + "step": 1395 + }, + { + "epoch": 0.09512161978529692, + "grad_norm": 0.5745139122009277, + "learning_rate": 4.9405489876341896e-05, + "loss": 1.528, + "step": 1400 + }, + { + "epoch": 0.09546133985595869, + "grad_norm": 0.6057448983192444, + "learning_rate": 4.940336662590026e-05, + "loss": 1.5101, + "step": 1405 + }, + { + "epoch": 0.09580105992662047, + "grad_norm": 0.5619961023330688, + "learning_rate": 4.9401243375458624e-05, + "loss": 1.4567, + "step": 1410 + }, + { + "epoch": 0.09614077999728224, + "grad_norm": 0.5457148551940918, + "learning_rate": 4.939912012501699e-05, + "loss": 1.4855, + "step": 1415 + }, + { + "epoch": 0.09648050006794401, + "grad_norm": 0.5889705419540405, + "learning_rate": 4.939699687457535e-05, + "loss": 1.5395, + "step": 1420 + }, + { + "epoch": 0.09682022013860579, + "grad_norm": 0.6001344323158264, + "learning_rate": 4.9394873624133716e-05, + "loss": 1.5335, + "step": 1425 + }, + { + "epoch": 0.09715994020926756, + "grad_norm": 0.5506335496902466, + "learning_rate": 4.939275037369208e-05, + "loss": 1.4368, + "step": 1430 + }, + { + "epoch": 0.09749966027992933, + "grad_norm": 0.6067984104156494, + "learning_rate": 4.9390627123250444e-05, + "loss": 1.4778, + "step": 1435 + }, + { + "epoch": 0.09783938035059112, + "grad_norm": 0.5627113580703735, + "learning_rate": 4.938850387280881e-05, + "loss": 1.3986, + "step": 1440 + }, + { + "epoch": 0.09817910042125289, + "grad_norm": 0.5547177195549011, + "learning_rate": 4.938638062236717e-05, + "loss": 1.3866, + "step": 1445 + }, + { + "epoch": 0.09851882049191467, + "grad_norm": 0.5425217151641846, + "learning_rate": 4.9384257371925536e-05, + "loss": 1.52, + "step": 1450 + }, + { + "epoch": 0.09885854056257644, + "grad_norm": 0.5709924697875977, + "learning_rate": 4.93821341214839e-05, + "loss": 1.4488, + "step": 1455 + }, + { + "epoch": 0.09919826063323821, + "grad_norm": 0.6176159977912903, + "learning_rate": 4.9380010871042264e-05, + "loss": 1.5285, + "step": 1460 + }, + { + "epoch": 0.09953798070389999, + "grad_norm": 0.6142645478248596, + "learning_rate": 4.937788762060063e-05, + "loss": 1.4332, + "step": 1465 + }, + { + "epoch": 0.09987770077456176, + "grad_norm": 0.6106401681900024, + "learning_rate": 4.9375764370158986e-05, + "loss": 1.4246, + "step": 1470 + }, + { + "epoch": 0.10021742084522353, + "grad_norm": 0.5476092100143433, + "learning_rate": 4.9373641119717356e-05, + "loss": 1.4671, + "step": 1475 + }, + { + "epoch": 0.10055714091588532, + "grad_norm": 0.705704391002655, + "learning_rate": 4.937151786927572e-05, + "loss": 1.4213, + "step": 1480 + }, + { + "epoch": 0.10089686098654709, + "grad_norm": 0.5743557810783386, + "learning_rate": 4.936939461883408e-05, + "loss": 1.4137, + "step": 1485 + }, + { + "epoch": 0.10123658105720885, + "grad_norm": 0.5322557687759399, + "learning_rate": 4.936727136839245e-05, + "loss": 1.4304, + "step": 1490 + }, + { + "epoch": 0.10157630112787064, + "grad_norm": 0.6251279711723328, + "learning_rate": 4.936514811795081e-05, + "loss": 1.4182, + "step": 1495 + }, + { + "epoch": 0.10191602119853241, + "grad_norm": 0.654549241065979, + "learning_rate": 4.936302486750917e-05, + "loss": 1.3279, + "step": 1500 + }, + { + "epoch": 0.10225574126919418, + "grad_norm": 0.6309332251548767, + "learning_rate": 4.936090161706754e-05, + "loss": 1.3739, + "step": 1505 + }, + { + "epoch": 0.10259546133985596, + "grad_norm": 0.547921895980835, + "learning_rate": 4.9358778366625904e-05, + "loss": 1.346, + "step": 1510 + }, + { + "epoch": 0.10293518141051773, + "grad_norm": 0.5620610117912292, + "learning_rate": 4.935665511618426e-05, + "loss": 1.4221, + "step": 1515 + }, + { + "epoch": 0.1032749014811795, + "grad_norm": 0.5969321727752686, + "learning_rate": 4.935453186574263e-05, + "loss": 1.462, + "step": 1520 + }, + { + "epoch": 0.10361462155184128, + "grad_norm": 0.6860018968582153, + "learning_rate": 4.9352408615300996e-05, + "loss": 1.4466, + "step": 1525 + }, + { + "epoch": 0.10395434162250305, + "grad_norm": 0.5293998718261719, + "learning_rate": 4.9350285364859354e-05, + "loss": 1.4329, + "step": 1530 + }, + { + "epoch": 0.10429406169316484, + "grad_norm": 0.5381416082382202, + "learning_rate": 4.9348162114417724e-05, + "loss": 1.3761, + "step": 1535 + }, + { + "epoch": 0.10463378176382661, + "grad_norm": 0.5802014470100403, + "learning_rate": 4.934603886397609e-05, + "loss": 1.4226, + "step": 1540 + }, + { + "epoch": 0.10497350183448838, + "grad_norm": 0.5551945567131042, + "learning_rate": 4.934391561353445e-05, + "loss": 1.4017, + "step": 1545 + }, + { + "epoch": 0.10531322190515016, + "grad_norm": 0.6843209862709045, + "learning_rate": 4.9341792363092816e-05, + "loss": 1.424, + "step": 1550 + }, + { + "epoch": 0.10565294197581193, + "grad_norm": 0.6528891921043396, + "learning_rate": 4.9339669112651174e-05, + "loss": 1.3912, + "step": 1555 + }, + { + "epoch": 0.1059926620464737, + "grad_norm": 0.5132463574409485, + "learning_rate": 4.9337545862209544e-05, + "loss": 1.3568, + "step": 1560 + }, + { + "epoch": 0.10633238211713548, + "grad_norm": 0.6335332989692688, + "learning_rate": 4.933542261176791e-05, + "loss": 1.3855, + "step": 1565 + }, + { + "epoch": 0.10667210218779725, + "grad_norm": 0.6554496884346008, + "learning_rate": 4.9333299361326266e-05, + "loss": 1.4204, + "step": 1570 + }, + { + "epoch": 0.10701182225845902, + "grad_norm": 0.5501953959465027, + "learning_rate": 4.9331176110884637e-05, + "loss": 1.5018, + "step": 1575 + }, + { + "epoch": 0.10735154232912081, + "grad_norm": 0.5601421594619751, + "learning_rate": 4.9329052860443e-05, + "loss": 1.4294, + "step": 1580 + }, + { + "epoch": 0.10769126239978258, + "grad_norm": 0.5142511129379272, + "learning_rate": 4.932692961000136e-05, + "loss": 1.36, + "step": 1585 + }, + { + "epoch": 0.10803098247044435, + "grad_norm": 0.7431572675704956, + "learning_rate": 4.932480635955973e-05, + "loss": 1.4361, + "step": 1590 + }, + { + "epoch": 0.10837070254110613, + "grad_norm": 0.6073015332221985, + "learning_rate": 4.932268310911809e-05, + "loss": 1.4543, + "step": 1595 + }, + { + "epoch": 0.1087104226117679, + "grad_norm": 0.5889863967895508, + "learning_rate": 4.932055985867645e-05, + "loss": 1.4554, + "step": 1600 + }, + { + "epoch": 0.10905014268242967, + "grad_norm": 0.5507410764694214, + "learning_rate": 4.931843660823482e-05, + "loss": 1.4994, + "step": 1605 + }, + { + "epoch": 0.10938986275309145, + "grad_norm": 0.6453044414520264, + "learning_rate": 4.9316313357793185e-05, + "loss": 1.4872, + "step": 1610 + }, + { + "epoch": 0.10972958282375322, + "grad_norm": 0.5991227030754089, + "learning_rate": 4.931419010735154e-05, + "loss": 1.3944, + "step": 1615 + }, + { + "epoch": 0.11006930289441501, + "grad_norm": 0.5487539768218994, + "learning_rate": 4.931206685690991e-05, + "loss": 1.4355, + "step": 1620 + }, + { + "epoch": 0.11040902296507678, + "grad_norm": 0.5705694556236267, + "learning_rate": 4.930994360646827e-05, + "loss": 1.3654, + "step": 1625 + }, + { + "epoch": 0.11074874303573855, + "grad_norm": 0.5932425260543823, + "learning_rate": 4.9307820356026634e-05, + "loss": 1.4701, + "step": 1630 + }, + { + "epoch": 0.11108846310640033, + "grad_norm": 0.6856851577758789, + "learning_rate": 4.9305697105585005e-05, + "loss": 1.4945, + "step": 1635 + }, + { + "epoch": 0.1114281831770621, + "grad_norm": 0.5697663426399231, + "learning_rate": 4.930357385514336e-05, + "loss": 1.4453, + "step": 1640 + }, + { + "epoch": 0.11176790324772387, + "grad_norm": 0.4906790554523468, + "learning_rate": 4.9301450604701726e-05, + "loss": 1.4755, + "step": 1645 + }, + { + "epoch": 0.11210762331838565, + "grad_norm": 0.5060974359512329, + "learning_rate": 4.9299327354260097e-05, + "loss": 1.4535, + "step": 1650 + }, + { + "epoch": 0.11244734338904742, + "grad_norm": 0.5901421308517456, + "learning_rate": 4.9297204103818454e-05, + "loss": 1.5331, + "step": 1655 + }, + { + "epoch": 0.1127870634597092, + "grad_norm": 0.5493902564048767, + "learning_rate": 4.929508085337682e-05, + "loss": 1.4116, + "step": 1660 + }, + { + "epoch": 0.11312678353037098, + "grad_norm": 0.6108385324478149, + "learning_rate": 4.929295760293519e-05, + "loss": 1.385, + "step": 1665 + }, + { + "epoch": 0.11346650360103275, + "grad_norm": 0.5650813579559326, + "learning_rate": 4.9290834352493546e-05, + "loss": 1.4503, + "step": 1670 + }, + { + "epoch": 0.11380622367169452, + "grad_norm": 0.5575889945030212, + "learning_rate": 4.928871110205191e-05, + "loss": 1.2971, + "step": 1675 + }, + { + "epoch": 0.1141459437423563, + "grad_norm": 0.5342539548873901, + "learning_rate": 4.928658785161028e-05, + "loss": 1.4955, + "step": 1680 + }, + { + "epoch": 0.11448566381301807, + "grad_norm": 0.577377438545227, + "learning_rate": 4.928446460116864e-05, + "loss": 1.3362, + "step": 1685 + }, + { + "epoch": 0.11482538388367985, + "grad_norm": 0.5332703590393066, + "learning_rate": 4.9282341350727e-05, + "loss": 1.3495, + "step": 1690 + }, + { + "epoch": 0.11516510395434162, + "grad_norm": 0.560100257396698, + "learning_rate": 4.9280218100285366e-05, + "loss": 1.4691, + "step": 1695 + }, + { + "epoch": 0.1155048240250034, + "grad_norm": 0.5314947366714478, + "learning_rate": 4.927809484984373e-05, + "loss": 1.4083, + "step": 1700 + }, + { + "epoch": 0.11584454409566518, + "grad_norm": 0.6132587790489197, + "learning_rate": 4.9275971599402094e-05, + "loss": 1.5005, + "step": 1705 + }, + { + "epoch": 0.11618426416632695, + "grad_norm": 0.6340333223342896, + "learning_rate": 4.927384834896046e-05, + "loss": 1.4116, + "step": 1710 + }, + { + "epoch": 0.11652398423698872, + "grad_norm": 0.6274508237838745, + "learning_rate": 4.927172509851882e-05, + "loss": 1.3378, + "step": 1715 + }, + { + "epoch": 0.1168637043076505, + "grad_norm": 0.6226668953895569, + "learning_rate": 4.9269601848077186e-05, + "loss": 1.3891, + "step": 1720 + }, + { + "epoch": 0.11720342437831227, + "grad_norm": 0.6084844470024109, + "learning_rate": 4.926747859763555e-05, + "loss": 1.3585, + "step": 1725 + }, + { + "epoch": 0.11754314444897404, + "grad_norm": 0.48321905732154846, + "learning_rate": 4.9265355347193914e-05, + "loss": 1.4552, + "step": 1730 + }, + { + "epoch": 0.11788286451963582, + "grad_norm": 0.5236528515815735, + "learning_rate": 4.926323209675228e-05, + "loss": 1.4124, + "step": 1735 + }, + { + "epoch": 0.1182225845902976, + "grad_norm": 0.593169629573822, + "learning_rate": 4.926110884631064e-05, + "loss": 1.474, + "step": 1740 + }, + { + "epoch": 0.11856230466095936, + "grad_norm": 0.6106769442558289, + "learning_rate": 4.9258985595869006e-05, + "loss": 1.3808, + "step": 1745 + }, + { + "epoch": 0.11890202473162115, + "grad_norm": 0.5643929839134216, + "learning_rate": 4.925686234542737e-05, + "loss": 1.4238, + "step": 1750 + }, + { + "epoch": 0.11924174480228292, + "grad_norm": 0.5895077586174011, + "learning_rate": 4.9254739094985734e-05, + "loss": 1.4292, + "step": 1755 + }, + { + "epoch": 0.11958146487294469, + "grad_norm": 0.6480593681335449, + "learning_rate": 4.92526158445441e-05, + "loss": 1.5694, + "step": 1760 + }, + { + "epoch": 0.11992118494360647, + "grad_norm": 0.5908152461051941, + "learning_rate": 4.925049259410246e-05, + "loss": 1.4193, + "step": 1765 + }, + { + "epoch": 0.12026090501426824, + "grad_norm": 0.5783700942993164, + "learning_rate": 4.9248369343660826e-05, + "loss": 1.3689, + "step": 1770 + }, + { + "epoch": 0.12060062508493002, + "grad_norm": 0.5613039135932922, + "learning_rate": 4.924624609321919e-05, + "loss": 1.4366, + "step": 1775 + }, + { + "epoch": 0.1209403451555918, + "grad_norm": 0.5893939733505249, + "learning_rate": 4.9244122842777554e-05, + "loss": 1.4309, + "step": 1780 + }, + { + "epoch": 0.12128006522625356, + "grad_norm": 0.5168840289115906, + "learning_rate": 4.924199959233592e-05, + "loss": 1.4472, + "step": 1785 + }, + { + "epoch": 0.12161978529691535, + "grad_norm": 0.5954933166503906, + "learning_rate": 4.923987634189428e-05, + "loss": 1.3857, + "step": 1790 + }, + { + "epoch": 0.12195950536757712, + "grad_norm": 0.6200373768806458, + "learning_rate": 4.9237753091452646e-05, + "loss": 1.3801, + "step": 1795 + }, + { + "epoch": 0.12229922543823889, + "grad_norm": 0.5800577402114868, + "learning_rate": 4.923562984101101e-05, + "loss": 1.4184, + "step": 1800 + }, + { + "epoch": 0.12263894550890067, + "grad_norm": 0.5880134105682373, + "learning_rate": 4.9233506590569374e-05, + "loss": 1.3974, + "step": 1805 + }, + { + "epoch": 0.12297866557956244, + "grad_norm": 0.5568594932556152, + "learning_rate": 4.923138334012774e-05, + "loss": 1.4636, + "step": 1810 + }, + { + "epoch": 0.12331838565022421, + "grad_norm": 0.5803167819976807, + "learning_rate": 4.92292600896861e-05, + "loss": 1.4275, + "step": 1815 + }, + { + "epoch": 0.123658105720886, + "grad_norm": 0.6611342430114746, + "learning_rate": 4.9227136839244466e-05, + "loss": 1.4995, + "step": 1820 + }, + { + "epoch": 0.12399782579154776, + "grad_norm": 0.5750046372413635, + "learning_rate": 4.922501358880282e-05, + "loss": 1.4043, + "step": 1825 + }, + { + "epoch": 0.12433754586220953, + "grad_norm": 0.5646703839302063, + "learning_rate": 4.9222890338361194e-05, + "loss": 1.3777, + "step": 1830 + }, + { + "epoch": 0.12467726593287132, + "grad_norm": 0.46412211656570435, + "learning_rate": 4.922076708791956e-05, + "loss": 1.427, + "step": 1835 + }, + { + "epoch": 0.1250169860035331, + "grad_norm": 0.6101703643798828, + "learning_rate": 4.9218643837477915e-05, + "loss": 1.4434, + "step": 1840 + }, + { + "epoch": 0.12535670607419486, + "grad_norm": 0.5917672514915466, + "learning_rate": 4.9216520587036286e-05, + "loss": 1.435, + "step": 1845 + }, + { + "epoch": 0.12569642614485663, + "grad_norm": 0.6468296647071838, + "learning_rate": 4.921439733659465e-05, + "loss": 1.4413, + "step": 1850 + }, + { + "epoch": 0.12603614621551842, + "grad_norm": 0.6298085451126099, + "learning_rate": 4.921227408615301e-05, + "loss": 1.4177, + "step": 1855 + }, + { + "epoch": 0.1263758662861802, + "grad_norm": 0.6000416874885559, + "learning_rate": 4.921015083571138e-05, + "loss": 1.4596, + "step": 1860 + }, + { + "epoch": 0.12671558635684196, + "grad_norm": 0.5701744556427002, + "learning_rate": 4.920802758526974e-05, + "loss": 1.2559, + "step": 1865 + }, + { + "epoch": 0.12705530642750373, + "grad_norm": 0.6048057675361633, + "learning_rate": 4.92059043348281e-05, + "loss": 1.4887, + "step": 1870 + }, + { + "epoch": 0.1273950264981655, + "grad_norm": 0.6061444878578186, + "learning_rate": 4.920378108438647e-05, + "loss": 1.4442, + "step": 1875 + }, + { + "epoch": 0.12773474656882727, + "grad_norm": 0.6482565402984619, + "learning_rate": 4.9201657833944834e-05, + "loss": 1.5101, + "step": 1880 + }, + { + "epoch": 0.12807446663948907, + "grad_norm": 0.6278590559959412, + "learning_rate": 4.91995345835032e-05, + "loss": 1.4861, + "step": 1885 + }, + { + "epoch": 0.12841418671015084, + "grad_norm": 0.5761988759040833, + "learning_rate": 4.919741133306156e-05, + "loss": 1.4632, + "step": 1890 + }, + { + "epoch": 0.1287539067808126, + "grad_norm": 0.5002067685127258, + "learning_rate": 4.919528808261992e-05, + "loss": 1.3721, + "step": 1895 + }, + { + "epoch": 0.12909362685147438, + "grad_norm": 0.5973879098892212, + "learning_rate": 4.919316483217829e-05, + "loss": 1.4422, + "step": 1900 + }, + { + "epoch": 0.12943334692213615, + "grad_norm": 0.6070477366447449, + "learning_rate": 4.9191041581736654e-05, + "loss": 1.446, + "step": 1905 + }, + { + "epoch": 0.12977306699279795, + "grad_norm": 0.562765896320343, + "learning_rate": 4.918891833129501e-05, + "loss": 1.3384, + "step": 1910 + }, + { + "epoch": 0.13011278706345972, + "grad_norm": 0.5640690922737122, + "learning_rate": 4.918679508085338e-05, + "loss": 1.4031, + "step": 1915 + }, + { + "epoch": 0.1304525071341215, + "grad_norm": 0.639013946056366, + "learning_rate": 4.9184671830411746e-05, + "loss": 1.3823, + "step": 1920 + }, + { + "epoch": 0.13079222720478326, + "grad_norm": 0.5957516431808472, + "learning_rate": 4.91825485799701e-05, + "loss": 1.4052, + "step": 1925 + }, + { + "epoch": 0.13113194727544503, + "grad_norm": 0.6282196044921875, + "learning_rate": 4.9180425329528474e-05, + "loss": 1.371, + "step": 1930 + }, + { + "epoch": 0.1314716673461068, + "grad_norm": 0.5092763304710388, + "learning_rate": 4.917830207908684e-05, + "loss": 1.4383, + "step": 1935 + }, + { + "epoch": 0.1318113874167686, + "grad_norm": 0.539523720741272, + "learning_rate": 4.9176178828645195e-05, + "loss": 1.3717, + "step": 1940 + }, + { + "epoch": 0.13215110748743036, + "grad_norm": 0.6316164135932922, + "learning_rate": 4.9174055578203566e-05, + "loss": 1.3851, + "step": 1945 + }, + { + "epoch": 0.13249082755809213, + "grad_norm": 0.5689708590507507, + "learning_rate": 4.917193232776193e-05, + "loss": 1.4547, + "step": 1950 + }, + { + "epoch": 0.1328305476287539, + "grad_norm": 0.5140627026557922, + "learning_rate": 4.916980907732029e-05, + "loss": 1.4956, + "step": 1955 + }, + { + "epoch": 0.13317026769941567, + "grad_norm": 0.6105151772499084, + "learning_rate": 4.916768582687866e-05, + "loss": 1.4766, + "step": 1960 + }, + { + "epoch": 0.13350998777007744, + "grad_norm": 0.541977047920227, + "learning_rate": 4.916556257643702e-05, + "loss": 1.3756, + "step": 1965 + }, + { + "epoch": 0.13384970784073924, + "grad_norm": 0.6562601923942566, + "learning_rate": 4.916343932599538e-05, + "loss": 1.4917, + "step": 1970 + }, + { + "epoch": 0.134189427911401, + "grad_norm": 0.5527669787406921, + "learning_rate": 4.916131607555375e-05, + "loss": 1.3802, + "step": 1975 + }, + { + "epoch": 0.13452914798206278, + "grad_norm": 0.6226165294647217, + "learning_rate": 4.915919282511211e-05, + "loss": 1.3691, + "step": 1980 + }, + { + "epoch": 0.13486886805272455, + "grad_norm": 0.56634521484375, + "learning_rate": 4.915706957467047e-05, + "loss": 1.4878, + "step": 1985 + }, + { + "epoch": 0.13520858812338632, + "grad_norm": 0.5585847496986389, + "learning_rate": 4.915494632422884e-05, + "loss": 1.3215, + "step": 1990 + }, + { + "epoch": 0.13554830819404812, + "grad_norm": 0.5256451368331909, + "learning_rate": 4.91528230737872e-05, + "loss": 1.4867, + "step": 1995 + }, + { + "epoch": 0.1358880282647099, + "grad_norm": 0.6245139837265015, + "learning_rate": 4.915069982334556e-05, + "loss": 1.4282, + "step": 2000 + }, + { + "epoch": 0.13622774833537166, + "grad_norm": 0.5906956195831299, + "learning_rate": 4.9148576572903934e-05, + "loss": 1.4018, + "step": 2005 + }, + { + "epoch": 0.13656746840603343, + "grad_norm": 0.6229907870292664, + "learning_rate": 4.914645332246229e-05, + "loss": 1.531, + "step": 2010 + }, + { + "epoch": 0.1369071884766952, + "grad_norm": 0.5922444462776184, + "learning_rate": 4.9144330072020655e-05, + "loss": 1.5008, + "step": 2015 + }, + { + "epoch": 0.13724690854735697, + "grad_norm": 0.6321733593940735, + "learning_rate": 4.9142206821579026e-05, + "loss": 1.5288, + "step": 2020 + }, + { + "epoch": 0.13758662861801876, + "grad_norm": 0.6001906394958496, + "learning_rate": 4.914008357113738e-05, + "loss": 1.4587, + "step": 2025 + }, + { + "epoch": 0.13792634868868053, + "grad_norm": 0.6095107793807983, + "learning_rate": 4.913796032069575e-05, + "loss": 1.3526, + "step": 2030 + }, + { + "epoch": 0.1382660687593423, + "grad_norm": 0.6127040386199951, + "learning_rate": 4.913583707025412e-05, + "loss": 1.4356, + "step": 2035 + }, + { + "epoch": 0.13860578883000407, + "grad_norm": 0.5953993201255798, + "learning_rate": 4.9133713819812475e-05, + "loss": 1.4753, + "step": 2040 + }, + { + "epoch": 0.13894550890066584, + "grad_norm": 0.5859862565994263, + "learning_rate": 4.913159056937084e-05, + "loss": 1.5111, + "step": 2045 + }, + { + "epoch": 0.1392852289713276, + "grad_norm": 0.642520546913147, + "learning_rate": 4.91294673189292e-05, + "loss": 1.4449, + "step": 2050 + }, + { + "epoch": 0.1396249490419894, + "grad_norm": 0.6965727806091309, + "learning_rate": 4.912734406848757e-05, + "loss": 1.3786, + "step": 2055 + }, + { + "epoch": 0.13996466911265118, + "grad_norm": 0.562048077583313, + "learning_rate": 4.912522081804593e-05, + "loss": 1.3695, + "step": 2060 + }, + { + "epoch": 0.14030438918331295, + "grad_norm": 0.7077406644821167, + "learning_rate": 4.9123097567604295e-05, + "loss": 1.4872, + "step": 2065 + }, + { + "epoch": 0.14064410925397472, + "grad_norm": 0.5612382292747498, + "learning_rate": 4.912097431716266e-05, + "loss": 1.4583, + "step": 2070 + }, + { + "epoch": 0.1409838293246365, + "grad_norm": 0.618161141872406, + "learning_rate": 4.911885106672102e-05, + "loss": 1.4502, + "step": 2075 + }, + { + "epoch": 0.1413235493952983, + "grad_norm": 0.6133080720901489, + "learning_rate": 4.911672781627939e-05, + "loss": 1.4537, + "step": 2080 + }, + { + "epoch": 0.14166326946596006, + "grad_norm": 0.6027617454528809, + "learning_rate": 4.911460456583775e-05, + "loss": 1.4806, + "step": 2085 + }, + { + "epoch": 0.14200298953662183, + "grad_norm": 0.5253147482872009, + "learning_rate": 4.9112481315396115e-05, + "loss": 1.409, + "step": 2090 + }, + { + "epoch": 0.1423427096072836, + "grad_norm": 0.6017151474952698, + "learning_rate": 4.911035806495448e-05, + "loss": 1.4645, + "step": 2095 + }, + { + "epoch": 0.14268242967794537, + "grad_norm": 0.5726915597915649, + "learning_rate": 4.910823481451284e-05, + "loss": 1.3991, + "step": 2100 + }, + { + "epoch": 0.14302214974860714, + "grad_norm": 0.5525890588760376, + "learning_rate": 4.910611156407121e-05, + "loss": 1.4447, + "step": 2105 + }, + { + "epoch": 0.14336186981926893, + "grad_norm": 0.5599091649055481, + "learning_rate": 4.910398831362957e-05, + "loss": 1.4747, + "step": 2110 + }, + { + "epoch": 0.1437015898899307, + "grad_norm": 0.5579841136932373, + "learning_rate": 4.9101865063187935e-05, + "loss": 1.4366, + "step": 2115 + }, + { + "epoch": 0.14404130996059247, + "grad_norm": 0.6631743907928467, + "learning_rate": 4.90997418127463e-05, + "loss": 1.4922, + "step": 2120 + }, + { + "epoch": 0.14438103003125424, + "grad_norm": 0.6793774366378784, + "learning_rate": 4.909761856230466e-05, + "loss": 1.4246, + "step": 2125 + }, + { + "epoch": 0.144720750101916, + "grad_norm": 0.622520923614502, + "learning_rate": 4.909549531186303e-05, + "loss": 1.3589, + "step": 2130 + }, + { + "epoch": 0.14506047017257778, + "grad_norm": 0.5355719327926636, + "learning_rate": 4.909337206142139e-05, + "loss": 1.3717, + "step": 2135 + }, + { + "epoch": 0.14540019024323958, + "grad_norm": 0.6087984442710876, + "learning_rate": 4.9091248810979755e-05, + "loss": 1.4251, + "step": 2140 + }, + { + "epoch": 0.14573991031390135, + "grad_norm": 0.5506510734558105, + "learning_rate": 4.908912556053812e-05, + "loss": 1.4563, + "step": 2145 + }, + { + "epoch": 0.14607963038456312, + "grad_norm": 0.5640601515769958, + "learning_rate": 4.9087002310096483e-05, + "loss": 1.2312, + "step": 2150 + }, + { + "epoch": 0.1464193504552249, + "grad_norm": 0.6096499562263489, + "learning_rate": 4.908487905965485e-05, + "loss": 1.571, + "step": 2155 + }, + { + "epoch": 0.14675907052588666, + "grad_norm": 0.5384127497673035, + "learning_rate": 4.908275580921321e-05, + "loss": 1.3816, + "step": 2160 + }, + { + "epoch": 0.14709879059654846, + "grad_norm": 0.6377132534980774, + "learning_rate": 4.9080632558771575e-05, + "loss": 1.3667, + "step": 2165 + }, + { + "epoch": 0.14743851066721023, + "grad_norm": 0.5406596064567566, + "learning_rate": 4.907850930832994e-05, + "loss": 1.3812, + "step": 2170 + }, + { + "epoch": 0.147778230737872, + "grad_norm": 0.6148573160171509, + "learning_rate": 4.9076386057888303e-05, + "loss": 1.5092, + "step": 2175 + }, + { + "epoch": 0.14811795080853377, + "grad_norm": 0.5781053900718689, + "learning_rate": 4.907426280744666e-05, + "loss": 1.3639, + "step": 2180 + }, + { + "epoch": 0.14845767087919554, + "grad_norm": 0.5883496403694153, + "learning_rate": 4.907213955700503e-05, + "loss": 1.531, + "step": 2185 + }, + { + "epoch": 0.1487973909498573, + "grad_norm": 0.5491074323654175, + "learning_rate": 4.9070016306563395e-05, + "loss": 1.4468, + "step": 2190 + }, + { + "epoch": 0.1491371110205191, + "grad_norm": 0.5220900177955627, + "learning_rate": 4.906789305612175e-05, + "loss": 1.3442, + "step": 2195 + }, + { + "epoch": 0.14947683109118087, + "grad_norm": 0.5424416065216064, + "learning_rate": 4.9065769805680123e-05, + "loss": 1.4565, + "step": 2200 + }, + { + "epoch": 0.14981655116184264, + "grad_norm": 0.5749632716178894, + "learning_rate": 4.906364655523849e-05, + "loss": 1.3293, + "step": 2205 + }, + { + "epoch": 0.1501562712325044, + "grad_norm": 0.581767201423645, + "learning_rate": 4.9061523304796845e-05, + "loss": 1.4812, + "step": 2210 + }, + { + "epoch": 0.15049599130316618, + "grad_norm": 0.6531165838241577, + "learning_rate": 4.9059400054355215e-05, + "loss": 1.4215, + "step": 2215 + }, + { + "epoch": 0.15083571137382795, + "grad_norm": 0.5991101264953613, + "learning_rate": 4.905727680391358e-05, + "loss": 1.4545, + "step": 2220 + }, + { + "epoch": 0.15117543144448975, + "grad_norm": 0.660378098487854, + "learning_rate": 4.9055153553471943e-05, + "loss": 1.4995, + "step": 2225 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 0.5730030536651611, + "learning_rate": 4.905303030303031e-05, + "loss": 1.3848, + "step": 2230 + }, + { + "epoch": 0.1518548715858133, + "grad_norm": 0.5338610410690308, + "learning_rate": 4.905090705258867e-05, + "loss": 1.437, + "step": 2235 + }, + { + "epoch": 0.15219459165647506, + "grad_norm": 0.5345319509506226, + "learning_rate": 4.9048783802147035e-05, + "loss": 1.501, + "step": 2240 + }, + { + "epoch": 0.15253431172713683, + "grad_norm": 0.5088982582092285, + "learning_rate": 4.90466605517054e-05, + "loss": 1.4801, + "step": 2245 + }, + { + "epoch": 0.15287403179779863, + "grad_norm": 0.5582383871078491, + "learning_rate": 4.904453730126376e-05, + "loss": 1.3736, + "step": 2250 + }, + { + "epoch": 0.1532137518684604, + "grad_norm": 0.5619956254959106, + "learning_rate": 4.904241405082213e-05, + "loss": 1.325, + "step": 2255 + }, + { + "epoch": 0.15355347193912217, + "grad_norm": 0.5197911858558655, + "learning_rate": 4.904029080038049e-05, + "loss": 1.4212, + "step": 2260 + }, + { + "epoch": 0.15389319200978394, + "grad_norm": 0.6320146322250366, + "learning_rate": 4.903816754993885e-05, + "loss": 1.4784, + "step": 2265 + }, + { + "epoch": 0.1542329120804457, + "grad_norm": 0.6770452260971069, + "learning_rate": 4.903604429949722e-05, + "loss": 1.3964, + "step": 2270 + }, + { + "epoch": 0.15457263215110748, + "grad_norm": 0.6808265447616577, + "learning_rate": 4.9033921049055583e-05, + "loss": 1.4104, + "step": 2275 + }, + { + "epoch": 0.15491235222176927, + "grad_norm": 0.5745781660079956, + "learning_rate": 4.903179779861394e-05, + "loss": 1.4284, + "step": 2280 + }, + { + "epoch": 0.15525207229243104, + "grad_norm": 0.6451144218444824, + "learning_rate": 4.902967454817231e-05, + "loss": 1.3964, + "step": 2285 + }, + { + "epoch": 0.1555917923630928, + "grad_norm": 0.7113416790962219, + "learning_rate": 4.9027551297730676e-05, + "loss": 1.4505, + "step": 2290 + }, + { + "epoch": 0.15593151243375458, + "grad_norm": 0.5574209094047546, + "learning_rate": 4.902542804728903e-05, + "loss": 1.4106, + "step": 2295 + }, + { + "epoch": 0.15627123250441635, + "grad_norm": 0.5501256585121155, + "learning_rate": 4.9023304796847404e-05, + "loss": 1.473, + "step": 2300 + }, + { + "epoch": 0.15661095257507815, + "grad_norm": 0.5758779048919678, + "learning_rate": 4.902118154640577e-05, + "loss": 1.4395, + "step": 2305 + }, + { + "epoch": 0.15695067264573992, + "grad_norm": 0.561896562576294, + "learning_rate": 4.9019058295964125e-05, + "loss": 1.427, + "step": 2310 + }, + { + "epoch": 0.1572903927164017, + "grad_norm": 0.6061218976974487, + "learning_rate": 4.9016935045522496e-05, + "loss": 1.384, + "step": 2315 + }, + { + "epoch": 0.15763011278706346, + "grad_norm": 0.5345364809036255, + "learning_rate": 4.901481179508085e-05, + "loss": 1.4105, + "step": 2320 + }, + { + "epoch": 0.15796983285772523, + "grad_norm": 0.6396936774253845, + "learning_rate": 4.901268854463922e-05, + "loss": 1.4066, + "step": 2325 + }, + { + "epoch": 0.158309552928387, + "grad_norm": 0.5536668300628662, + "learning_rate": 4.901056529419759e-05, + "loss": 1.4842, + "step": 2330 + }, + { + "epoch": 0.1586492729990488, + "grad_norm": 0.5542172193527222, + "learning_rate": 4.9008442043755945e-05, + "loss": 1.4228, + "step": 2335 + }, + { + "epoch": 0.15898899306971057, + "grad_norm": 0.5791119337081909, + "learning_rate": 4.900631879331431e-05, + "loss": 1.5023, + "step": 2340 + }, + { + "epoch": 0.15932871314037234, + "grad_norm": 0.5835673213005066, + "learning_rate": 4.900419554287268e-05, + "loss": 1.4371, + "step": 2345 + }, + { + "epoch": 0.1596684332110341, + "grad_norm": 0.6104651689529419, + "learning_rate": 4.900207229243104e-05, + "loss": 1.3615, + "step": 2350 + }, + { + "epoch": 0.16000815328169588, + "grad_norm": 0.5057826042175293, + "learning_rate": 4.89999490419894e-05, + "loss": 1.4277, + "step": 2355 + }, + { + "epoch": 0.16034787335235764, + "grad_norm": 0.506723165512085, + "learning_rate": 4.899782579154777e-05, + "loss": 1.4386, + "step": 2360 + }, + { + "epoch": 0.16068759342301944, + "grad_norm": 0.592574954032898, + "learning_rate": 4.899570254110613e-05, + "loss": 1.465, + "step": 2365 + }, + { + "epoch": 0.1610273134936812, + "grad_norm": 0.6453089714050293, + "learning_rate": 4.899357929066449e-05, + "loss": 1.3677, + "step": 2370 + }, + { + "epoch": 0.16136703356434298, + "grad_norm": 0.6195045113563538, + "learning_rate": 4.8991456040222864e-05, + "loss": 1.3894, + "step": 2375 + }, + { + "epoch": 0.16170675363500475, + "grad_norm": 0.5778237581253052, + "learning_rate": 4.898933278978122e-05, + "loss": 1.3848, + "step": 2380 + }, + { + "epoch": 0.16204647370566652, + "grad_norm": 0.5943452715873718, + "learning_rate": 4.8987209539339585e-05, + "loss": 1.4076, + "step": 2385 + }, + { + "epoch": 0.16238619377632832, + "grad_norm": 0.6236830949783325, + "learning_rate": 4.8985086288897956e-05, + "loss": 1.4971, + "step": 2390 + }, + { + "epoch": 0.1627259138469901, + "grad_norm": 0.45929020643234253, + "learning_rate": 4.898296303845631e-05, + "loss": 1.4483, + "step": 2395 + }, + { + "epoch": 0.16306563391765186, + "grad_norm": 0.6343552470207214, + "learning_rate": 4.898083978801468e-05, + "loss": 1.4013, + "step": 2400 + }, + { + "epoch": 0.16340535398831363, + "grad_norm": 0.5927261710166931, + "learning_rate": 4.897871653757304e-05, + "loss": 1.3257, + "step": 2405 + }, + { + "epoch": 0.1637450740589754, + "grad_norm": 0.5226753354072571, + "learning_rate": 4.8976593287131405e-05, + "loss": 1.4674, + "step": 2410 + }, + { + "epoch": 0.16408479412963717, + "grad_norm": 0.5839881300926208, + "learning_rate": 4.897447003668977e-05, + "loss": 1.5248, + "step": 2415 + }, + { + "epoch": 0.16442451420029897, + "grad_norm": 0.6374481320381165, + "learning_rate": 4.897234678624813e-05, + "loss": 1.4081, + "step": 2420 + }, + { + "epoch": 0.16476423427096074, + "grad_norm": 0.5726152658462524, + "learning_rate": 4.89702235358065e-05, + "loss": 1.4522, + "step": 2425 + }, + { + "epoch": 0.1651039543416225, + "grad_norm": 0.6431419253349304, + "learning_rate": 4.896810028536486e-05, + "loss": 1.3736, + "step": 2430 + }, + { + "epoch": 0.16544367441228428, + "grad_norm": 0.5664803981781006, + "learning_rate": 4.8965977034923225e-05, + "loss": 1.6692, + "step": 2435 + }, + { + "epoch": 0.16578339448294604, + "grad_norm": 0.6327283978462219, + "learning_rate": 4.896385378448159e-05, + "loss": 1.4597, + "step": 2440 + }, + { + "epoch": 0.16612311455360781, + "grad_norm": 0.6446481943130493, + "learning_rate": 4.896173053403995e-05, + "loss": 1.5186, + "step": 2445 + }, + { + "epoch": 0.1664628346242696, + "grad_norm": 0.6424679160118103, + "learning_rate": 4.895960728359832e-05, + "loss": 1.4282, + "step": 2450 + }, + { + "epoch": 0.16680255469493138, + "grad_norm": 0.6918651461601257, + "learning_rate": 4.895748403315668e-05, + "loss": 1.3549, + "step": 2455 + }, + { + "epoch": 0.16714227476559315, + "grad_norm": 0.6785680651664734, + "learning_rate": 4.8955360782715045e-05, + "loss": 1.4485, + "step": 2460 + }, + { + "epoch": 0.16748199483625492, + "grad_norm": 0.5688127875328064, + "learning_rate": 4.895323753227341e-05, + "loss": 1.3694, + "step": 2465 + }, + { + "epoch": 0.1678217149069167, + "grad_norm": 0.5057792067527771, + "learning_rate": 4.895111428183177e-05, + "loss": 1.4069, + "step": 2470 + }, + { + "epoch": 0.1681614349775785, + "grad_norm": 0.6678634881973267, + "learning_rate": 4.894899103139014e-05, + "loss": 1.4047, + "step": 2475 + }, + { + "epoch": 0.16850115504824026, + "grad_norm": 0.6946313977241516, + "learning_rate": 4.89468677809485e-05, + "loss": 1.4301, + "step": 2480 + }, + { + "epoch": 0.16884087511890203, + "grad_norm": 0.5884628295898438, + "learning_rate": 4.8944744530506865e-05, + "loss": 1.5095, + "step": 2485 + }, + { + "epoch": 0.1691805951895638, + "grad_norm": 0.5525778532028198, + "learning_rate": 4.894262128006523e-05, + "loss": 1.4648, + "step": 2490 + }, + { + "epoch": 0.16952031526022557, + "grad_norm": 0.6558218002319336, + "learning_rate": 4.894049802962359e-05, + "loss": 1.4296, + "step": 2495 + }, + { + "epoch": 0.16986003533088734, + "grad_norm": 0.5888893604278564, + "learning_rate": 4.893837477918196e-05, + "loss": 1.4795, + "step": 2500 + }, + { + "epoch": 0.17019975540154914, + "grad_norm": 0.6494855284690857, + "learning_rate": 4.893625152874032e-05, + "loss": 1.4289, + "step": 2505 + }, + { + "epoch": 0.1705394754722109, + "grad_norm": 0.5585719347000122, + "learning_rate": 4.8934128278298685e-05, + "loss": 1.3688, + "step": 2510 + }, + { + "epoch": 0.17087919554287267, + "grad_norm": 0.566693902015686, + "learning_rate": 4.893200502785705e-05, + "loss": 1.4202, + "step": 2515 + }, + { + "epoch": 0.17121891561353444, + "grad_norm": 0.5679460167884827, + "learning_rate": 4.8929881777415406e-05, + "loss": 1.4117, + "step": 2520 + }, + { + "epoch": 0.17155863568419621, + "grad_norm": 0.5545666217803955, + "learning_rate": 4.892775852697378e-05, + "loss": 1.4807, + "step": 2525 + }, + { + "epoch": 0.17189835575485798, + "grad_norm": 0.650221049785614, + "learning_rate": 4.892563527653214e-05, + "loss": 1.5084, + "step": 2530 + }, + { + "epoch": 0.17223807582551978, + "grad_norm": 0.5717597603797913, + "learning_rate": 4.89235120260905e-05, + "loss": 1.4395, + "step": 2535 + }, + { + "epoch": 0.17257779589618155, + "grad_norm": 0.7092136144638062, + "learning_rate": 4.892138877564887e-05, + "loss": 1.4863, + "step": 2540 + }, + { + "epoch": 0.17291751596684332, + "grad_norm": 0.5754497051239014, + "learning_rate": 4.891926552520723e-05, + "loss": 1.2872, + "step": 2545 + }, + { + "epoch": 0.1732572360375051, + "grad_norm": 0.6163140535354614, + "learning_rate": 4.891714227476559e-05, + "loss": 1.4011, + "step": 2550 + }, + { + "epoch": 0.17359695610816686, + "grad_norm": 0.5960626006126404, + "learning_rate": 4.891501902432396e-05, + "loss": 1.4145, + "step": 2555 + }, + { + "epoch": 0.17393667617882866, + "grad_norm": 0.5820543169975281, + "learning_rate": 4.8912895773882325e-05, + "loss": 1.3803, + "step": 2560 + }, + { + "epoch": 0.17427639624949043, + "grad_norm": 0.6925805807113647, + "learning_rate": 4.891077252344069e-05, + "loss": 1.3168, + "step": 2565 + }, + { + "epoch": 0.1746161163201522, + "grad_norm": 0.5974335670471191, + "learning_rate": 4.890864927299905e-05, + "loss": 1.4439, + "step": 2570 + }, + { + "epoch": 0.17495583639081397, + "grad_norm": 0.6052380204200745, + "learning_rate": 4.890652602255742e-05, + "loss": 1.3726, + "step": 2575 + }, + { + "epoch": 0.17529555646147574, + "grad_norm": 0.6102190017700195, + "learning_rate": 4.890440277211578e-05, + "loss": 1.379, + "step": 2580 + }, + { + "epoch": 0.1756352765321375, + "grad_norm": 0.5870862603187561, + "learning_rate": 4.8902279521674145e-05, + "loss": 1.3191, + "step": 2585 + }, + { + "epoch": 0.1759749966027993, + "grad_norm": 0.6346774697303772, + "learning_rate": 4.890015627123251e-05, + "loss": 1.4171, + "step": 2590 + }, + { + "epoch": 0.17631471667346107, + "grad_norm": 0.5704734325408936, + "learning_rate": 4.889803302079087e-05, + "loss": 1.4367, + "step": 2595 + }, + { + "epoch": 0.17665443674412284, + "grad_norm": 0.6515428423881531, + "learning_rate": 4.889590977034924e-05, + "loss": 1.4435, + "step": 2600 + }, + { + "epoch": 0.17699415681478461, + "grad_norm": 0.6457876563072205, + "learning_rate": 4.8893786519907594e-05, + "loss": 1.4706, + "step": 2605 + }, + { + "epoch": 0.17733387688544638, + "grad_norm": 0.6365540623664856, + "learning_rate": 4.8891663269465965e-05, + "loss": 1.3534, + "step": 2610 + }, + { + "epoch": 0.17767359695610815, + "grad_norm": 0.5984424352645874, + "learning_rate": 4.888954001902433e-05, + "loss": 1.4365, + "step": 2615 + }, + { + "epoch": 0.17801331702676995, + "grad_norm": 0.7218442559242249, + "learning_rate": 4.8887416768582686e-05, + "loss": 1.4468, + "step": 2620 + }, + { + "epoch": 0.17835303709743172, + "grad_norm": 0.5970467925071716, + "learning_rate": 4.888529351814106e-05, + "loss": 1.3557, + "step": 2625 + }, + { + "epoch": 0.1786927571680935, + "grad_norm": 0.6055663228034973, + "learning_rate": 4.888317026769942e-05, + "loss": 1.3872, + "step": 2630 + }, + { + "epoch": 0.17903247723875526, + "grad_norm": 0.5495836138725281, + "learning_rate": 4.888104701725778e-05, + "loss": 1.5108, + "step": 2635 + }, + { + "epoch": 0.17937219730941703, + "grad_norm": 0.6123842597007751, + "learning_rate": 4.887892376681615e-05, + "loss": 1.4518, + "step": 2640 + }, + { + "epoch": 0.17971191738007883, + "grad_norm": 0.6095494031906128, + "learning_rate": 4.887680051637451e-05, + "loss": 1.4356, + "step": 2645 + }, + { + "epoch": 0.1800516374507406, + "grad_norm": 0.6432989239692688, + "learning_rate": 4.887467726593287e-05, + "loss": 1.546, + "step": 2650 + }, + { + "epoch": 0.18039135752140237, + "grad_norm": 0.5585070848464966, + "learning_rate": 4.887255401549124e-05, + "loss": 1.4513, + "step": 2655 + }, + { + "epoch": 0.18073107759206414, + "grad_norm": 0.6016882658004761, + "learning_rate": 4.8870430765049605e-05, + "loss": 1.3715, + "step": 2660 + }, + { + "epoch": 0.1810707976627259, + "grad_norm": 0.6021006107330322, + "learning_rate": 4.886830751460796e-05, + "loss": 1.4023, + "step": 2665 + }, + { + "epoch": 0.18141051773338768, + "grad_norm": 0.6008695363998413, + "learning_rate": 4.886618426416633e-05, + "loss": 1.3748, + "step": 2670 + }, + { + "epoch": 0.18175023780404947, + "grad_norm": 0.5484296083450317, + "learning_rate": 4.886406101372469e-05, + "loss": 1.4534, + "step": 2675 + }, + { + "epoch": 0.18208995787471124, + "grad_norm": 0.6692857146263123, + "learning_rate": 4.8861937763283054e-05, + "loss": 1.4976, + "step": 2680 + }, + { + "epoch": 0.18242967794537301, + "grad_norm": 0.6512504816055298, + "learning_rate": 4.8859814512841425e-05, + "loss": 1.5091, + "step": 2685 + }, + { + "epoch": 0.18276939801603478, + "grad_norm": 0.574049174785614, + "learning_rate": 4.885769126239978e-05, + "loss": 1.4236, + "step": 2690 + }, + { + "epoch": 0.18310911808669655, + "grad_norm": 0.6614393591880798, + "learning_rate": 4.8855568011958146e-05, + "loss": 1.4393, + "step": 2695 + }, + { + "epoch": 0.18344883815735832, + "grad_norm": 0.5550003051757812, + "learning_rate": 4.885344476151652e-05, + "loss": 1.4089, + "step": 2700 + }, + { + "epoch": 0.18378855822802012, + "grad_norm": 0.5351262092590332, + "learning_rate": 4.8851321511074874e-05, + "loss": 1.3451, + "step": 2705 + }, + { + "epoch": 0.1841282782986819, + "grad_norm": 0.63856440782547, + "learning_rate": 4.884919826063324e-05, + "loss": 1.3656, + "step": 2710 + }, + { + "epoch": 0.18446799836934366, + "grad_norm": 0.6448277831077576, + "learning_rate": 4.884707501019161e-05, + "loss": 1.4136, + "step": 2715 + }, + { + "epoch": 0.18480771844000543, + "grad_norm": 0.6709998250007629, + "learning_rate": 4.8844951759749966e-05, + "loss": 1.3612, + "step": 2720 + }, + { + "epoch": 0.1851474385106672, + "grad_norm": 0.6180108785629272, + "learning_rate": 4.884282850930833e-05, + "loss": 1.4253, + "step": 2725 + }, + { + "epoch": 0.185487158581329, + "grad_norm": 0.6439008712768555, + "learning_rate": 4.88407052588667e-05, + "loss": 1.4858, + "step": 2730 + }, + { + "epoch": 0.18582687865199077, + "grad_norm": 0.5395865440368652, + "learning_rate": 4.883858200842506e-05, + "loss": 1.4399, + "step": 2735 + }, + { + "epoch": 0.18616659872265254, + "grad_norm": 0.607215940952301, + "learning_rate": 4.883645875798342e-05, + "loss": 1.5088, + "step": 2740 + }, + { + "epoch": 0.1865063187933143, + "grad_norm": 0.6101176142692566, + "learning_rate": 4.8834335507541786e-05, + "loss": 1.3556, + "step": 2745 + }, + { + "epoch": 0.18684603886397608, + "grad_norm": 0.6299176216125488, + "learning_rate": 4.883221225710015e-05, + "loss": 1.3241, + "step": 2750 + }, + { + "epoch": 0.18718575893463785, + "grad_norm": 0.5913377404212952, + "learning_rate": 4.8830089006658514e-05, + "loss": 1.5029, + "step": 2755 + }, + { + "epoch": 0.18752547900529964, + "grad_norm": 0.6693141460418701, + "learning_rate": 4.882796575621688e-05, + "loss": 1.5591, + "step": 2760 + }, + { + "epoch": 0.18786519907596141, + "grad_norm": 0.6031198501586914, + "learning_rate": 4.882584250577524e-05, + "loss": 1.4366, + "step": 2765 + }, + { + "epoch": 0.18820491914662318, + "grad_norm": 0.5741726160049438, + "learning_rate": 4.8823719255333606e-05, + "loss": 1.4799, + "step": 2770 + }, + { + "epoch": 0.18854463921728495, + "grad_norm": 0.6373092532157898, + "learning_rate": 4.882159600489197e-05, + "loss": 1.5948, + "step": 2775 + }, + { + "epoch": 0.18888435928794672, + "grad_norm": 0.6143050789833069, + "learning_rate": 4.8819472754450334e-05, + "loss": 1.4609, + "step": 2780 + }, + { + "epoch": 0.1892240793586085, + "grad_norm": 0.6247331500053406, + "learning_rate": 4.88173495040087e-05, + "loss": 1.4965, + "step": 2785 + }, + { + "epoch": 0.1895637994292703, + "grad_norm": 0.6008833050727844, + "learning_rate": 4.881522625356706e-05, + "loss": 1.4772, + "step": 2790 + }, + { + "epoch": 0.18990351949993206, + "grad_norm": 0.5636752843856812, + "learning_rate": 4.8813103003125426e-05, + "loss": 1.3657, + "step": 2795 + }, + { + "epoch": 0.19024323957059383, + "grad_norm": 0.6358240842819214, + "learning_rate": 4.881097975268379e-05, + "loss": 1.337, + "step": 2800 + }, + { + "epoch": 0.1905829596412556, + "grad_norm": 0.565941333770752, + "learning_rate": 4.8808856502242154e-05, + "loss": 1.3339, + "step": 2805 + }, + { + "epoch": 0.19092267971191737, + "grad_norm": 0.5850551128387451, + "learning_rate": 4.880673325180052e-05, + "loss": 1.352, + "step": 2810 + }, + { + "epoch": 0.19126239978257917, + "grad_norm": 0.5507126450538635, + "learning_rate": 4.880461000135888e-05, + "loss": 1.4257, + "step": 2815 + }, + { + "epoch": 0.19160211985324094, + "grad_norm": 0.6068183183670044, + "learning_rate": 4.8802486750917246e-05, + "loss": 1.5025, + "step": 2820 + }, + { + "epoch": 0.1919418399239027, + "grad_norm": 0.5308849215507507, + "learning_rate": 4.880036350047561e-05, + "loss": 1.4407, + "step": 2825 + }, + { + "epoch": 0.19228155999456448, + "grad_norm": 0.587657630443573, + "learning_rate": 4.8798240250033974e-05, + "loss": 1.4402, + "step": 2830 + }, + { + "epoch": 0.19262128006522625, + "grad_norm": 0.6322230100631714, + "learning_rate": 4.879611699959234e-05, + "loss": 1.3885, + "step": 2835 + }, + { + "epoch": 0.19296100013588802, + "grad_norm": 0.4599972069263458, + "learning_rate": 4.87939937491507e-05, + "loss": 1.3561, + "step": 2840 + }, + { + "epoch": 0.19330072020654981, + "grad_norm": 0.5593650937080383, + "learning_rate": 4.8791870498709066e-05, + "loss": 1.4226, + "step": 2845 + }, + { + "epoch": 0.19364044027721158, + "grad_norm": 0.6431747674942017, + "learning_rate": 4.878974724826743e-05, + "loss": 1.4919, + "step": 2850 + }, + { + "epoch": 0.19398016034787335, + "grad_norm": 0.553047776222229, + "learning_rate": 4.8787623997825794e-05, + "loss": 1.4808, + "step": 2855 + }, + { + "epoch": 0.19431988041853512, + "grad_norm": 0.574626624584198, + "learning_rate": 4.878550074738416e-05, + "loss": 1.4788, + "step": 2860 + }, + { + "epoch": 0.1946596004891969, + "grad_norm": 0.498710960149765, + "learning_rate": 4.878337749694252e-05, + "loss": 1.3874, + "step": 2865 + }, + { + "epoch": 0.19499932055985866, + "grad_norm": 0.5680547952651978, + "learning_rate": 4.8781254246500886e-05, + "loss": 1.4522, + "step": 2870 + }, + { + "epoch": 0.19533904063052046, + "grad_norm": 0.6454386115074158, + "learning_rate": 4.8779130996059244e-05, + "loss": 1.4065, + "step": 2875 + }, + { + "epoch": 0.19567876070118223, + "grad_norm": 0.6895508170127869, + "learning_rate": 4.8777007745617614e-05, + "loss": 1.4277, + "step": 2880 + }, + { + "epoch": 0.196018480771844, + "grad_norm": 0.6072851419448853, + "learning_rate": 4.877488449517598e-05, + "loss": 1.4315, + "step": 2885 + }, + { + "epoch": 0.19635820084250577, + "grad_norm": 0.5200562477111816, + "learning_rate": 4.8772761244734336e-05, + "loss": 1.3517, + "step": 2890 + }, + { + "epoch": 0.19669792091316754, + "grad_norm": 0.547443151473999, + "learning_rate": 4.8770637994292706e-05, + "loss": 1.324, + "step": 2895 + }, + { + "epoch": 0.19703764098382934, + "grad_norm": 0.7084436416625977, + "learning_rate": 4.876851474385107e-05, + "loss": 1.3544, + "step": 2900 + }, + { + "epoch": 0.1973773610544911, + "grad_norm": 0.5730024576187134, + "learning_rate": 4.8766391493409434e-05, + "loss": 1.4499, + "step": 2905 + }, + { + "epoch": 0.19771708112515288, + "grad_norm": 0.6365293860435486, + "learning_rate": 4.87642682429678e-05, + "loss": 1.4875, + "step": 2910 + }, + { + "epoch": 0.19805680119581465, + "grad_norm": 0.7301263809204102, + "learning_rate": 4.876214499252616e-05, + "loss": 1.4084, + "step": 2915 + }, + { + "epoch": 0.19839652126647642, + "grad_norm": 0.6117473840713501, + "learning_rate": 4.8760021742084526e-05, + "loss": 1.4031, + "step": 2920 + }, + { + "epoch": 0.1987362413371382, + "grad_norm": 0.5654906034469604, + "learning_rate": 4.875789849164289e-05, + "loss": 1.3552, + "step": 2925 + }, + { + "epoch": 0.19907596140779998, + "grad_norm": 0.5986347794532776, + "learning_rate": 4.8755775241201254e-05, + "loss": 1.3089, + "step": 2930 + }, + { + "epoch": 0.19941568147846175, + "grad_norm": 0.5379148125648499, + "learning_rate": 4.875365199075962e-05, + "loss": 1.4151, + "step": 2935 + }, + { + "epoch": 0.19975540154912352, + "grad_norm": 0.6762790679931641, + "learning_rate": 4.875152874031798e-05, + "loss": 1.3418, + "step": 2940 + }, + { + "epoch": 0.2000951216197853, + "grad_norm": 0.6298921704292297, + "learning_rate": 4.874940548987634e-05, + "loss": 1.4135, + "step": 2945 + }, + { + "epoch": 0.20043484169044706, + "grad_norm": 0.6401534080505371, + "learning_rate": 4.874728223943471e-05, + "loss": 1.4582, + "step": 2950 + }, + { + "epoch": 0.20077456176110883, + "grad_norm": 0.66017085313797, + "learning_rate": 4.8745158988993074e-05, + "loss": 1.4479, + "step": 2955 + }, + { + "epoch": 0.20111428183177063, + "grad_norm": 0.5370712280273438, + "learning_rate": 4.874303573855143e-05, + "loss": 1.4445, + "step": 2960 + }, + { + "epoch": 0.2014540019024324, + "grad_norm": 0.5681670904159546, + "learning_rate": 4.87409124881098e-05, + "loss": 1.4424, + "step": 2965 + }, + { + "epoch": 0.20179372197309417, + "grad_norm": 0.564467191696167, + "learning_rate": 4.8738789237668166e-05, + "loss": 1.4026, + "step": 2970 + }, + { + "epoch": 0.20213344204375594, + "grad_norm": 0.5779341459274292, + "learning_rate": 4.8736665987226524e-05, + "loss": 1.4891, + "step": 2975 + }, + { + "epoch": 0.2024731621144177, + "grad_norm": 0.5849860310554504, + "learning_rate": 4.8734542736784894e-05, + "loss": 1.4685, + "step": 2980 + }, + { + "epoch": 0.2028128821850795, + "grad_norm": 0.5843042135238647, + "learning_rate": 4.873241948634326e-05, + "loss": 1.4741, + "step": 2985 + }, + { + "epoch": 0.20315260225574128, + "grad_norm": 0.6117401719093323, + "learning_rate": 4.8730296235901616e-05, + "loss": 1.4677, + "step": 2990 + }, + { + "epoch": 0.20349232232640305, + "grad_norm": 0.5531707406044006, + "learning_rate": 4.8728172985459986e-05, + "loss": 1.4241, + "step": 2995 + }, + { + "epoch": 0.20383204239706482, + "grad_norm": 0.5719231963157654, + "learning_rate": 4.872604973501835e-05, + "loss": 1.3577, + "step": 3000 + }, + { + "epoch": 0.2041717624677266, + "grad_norm": 0.6049824953079224, + "learning_rate": 4.872392648457671e-05, + "loss": 1.3973, + "step": 3005 + }, + { + "epoch": 0.20451148253838836, + "grad_norm": 0.6646301746368408, + "learning_rate": 4.872180323413508e-05, + "loss": 1.3812, + "step": 3010 + }, + { + "epoch": 0.20485120260905015, + "grad_norm": 0.6004433631896973, + "learning_rate": 4.871967998369344e-05, + "loss": 1.4524, + "step": 3015 + }, + { + "epoch": 0.20519092267971192, + "grad_norm": 0.6404062509536743, + "learning_rate": 4.87175567332518e-05, + "loss": 1.424, + "step": 3020 + }, + { + "epoch": 0.2055306427503737, + "grad_norm": 0.6715559363365173, + "learning_rate": 4.871543348281017e-05, + "loss": 1.3809, + "step": 3025 + }, + { + "epoch": 0.20587036282103546, + "grad_norm": 0.655387818813324, + "learning_rate": 4.871331023236853e-05, + "loss": 1.4434, + "step": 3030 + }, + { + "epoch": 0.20621008289169723, + "grad_norm": 0.5823382139205933, + "learning_rate": 4.871118698192689e-05, + "loss": 1.5534, + "step": 3035 + }, + { + "epoch": 0.206549802962359, + "grad_norm": 0.5753922462463379, + "learning_rate": 4.870906373148526e-05, + "loss": 1.3642, + "step": 3040 + }, + { + "epoch": 0.2068895230330208, + "grad_norm": 0.6257250905036926, + "learning_rate": 4.870694048104362e-05, + "loss": 1.486, + "step": 3045 + }, + { + "epoch": 0.20722924310368257, + "grad_norm": 0.5532388687133789, + "learning_rate": 4.8704817230601984e-05, + "loss": 1.338, + "step": 3050 + }, + { + "epoch": 0.20756896317434434, + "grad_norm": 0.6227984428405762, + "learning_rate": 4.8702693980160355e-05, + "loss": 1.3807, + "step": 3055 + }, + { + "epoch": 0.2079086832450061, + "grad_norm": 0.5318966507911682, + "learning_rate": 4.870057072971871e-05, + "loss": 1.406, + "step": 3060 + }, + { + "epoch": 0.20824840331566788, + "grad_norm": 0.5637968182563782, + "learning_rate": 4.8698447479277076e-05, + "loss": 1.3692, + "step": 3065 + }, + { + "epoch": 0.20858812338632968, + "grad_norm": 0.5600734353065491, + "learning_rate": 4.8696324228835447e-05, + "loss": 1.4261, + "step": 3070 + }, + { + "epoch": 0.20892784345699145, + "grad_norm": 0.5506009459495544, + "learning_rate": 4.8694200978393804e-05, + "loss": 1.392, + "step": 3075 + }, + { + "epoch": 0.20926756352765322, + "grad_norm": 0.5840604305267334, + "learning_rate": 4.869207772795217e-05, + "loss": 1.4166, + "step": 3080 + }, + { + "epoch": 0.209607283598315, + "grad_norm": 0.5566538572311401, + "learning_rate": 4.868995447751054e-05, + "loss": 1.5199, + "step": 3085 + }, + { + "epoch": 0.20994700366897676, + "grad_norm": 0.6005268692970276, + "learning_rate": 4.8687831227068896e-05, + "loss": 1.4194, + "step": 3090 + }, + { + "epoch": 0.21028672373963853, + "grad_norm": 0.5995813608169556, + "learning_rate": 4.868570797662726e-05, + "loss": 1.4782, + "step": 3095 + }, + { + "epoch": 0.21062644381030032, + "grad_norm": 0.6140902042388916, + "learning_rate": 4.8683584726185624e-05, + "loss": 1.4995, + "step": 3100 + }, + { + "epoch": 0.2109661638809621, + "grad_norm": 0.8333086967468262, + "learning_rate": 4.868146147574399e-05, + "loss": 1.2436, + "step": 3105 + }, + { + "epoch": 0.21130588395162386, + "grad_norm": 0.6089223027229309, + "learning_rate": 4.867933822530235e-05, + "loss": 1.4624, + "step": 3110 + }, + { + "epoch": 0.21164560402228563, + "grad_norm": 0.6053603291511536, + "learning_rate": 4.8677214974860716e-05, + "loss": 1.437, + "step": 3115 + }, + { + "epoch": 0.2119853240929474, + "grad_norm": 0.6245133876800537, + "learning_rate": 4.867509172441908e-05, + "loss": 1.379, + "step": 3120 + }, + { + "epoch": 0.21232504416360917, + "grad_norm": 0.5423717498779297, + "learning_rate": 4.8672968473977444e-05, + "loss": 1.4302, + "step": 3125 + }, + { + "epoch": 0.21266476423427097, + "grad_norm": 0.550070583820343, + "learning_rate": 4.867084522353581e-05, + "loss": 1.3904, + "step": 3130 + }, + { + "epoch": 0.21300448430493274, + "grad_norm": 0.5501512289047241, + "learning_rate": 4.866872197309417e-05, + "loss": 1.4069, + "step": 3135 + }, + { + "epoch": 0.2133442043755945, + "grad_norm": 0.6174147725105286, + "learning_rate": 4.8666598722652536e-05, + "loss": 1.3189, + "step": 3140 + }, + { + "epoch": 0.21368392444625628, + "grad_norm": 0.6710749268531799, + "learning_rate": 4.86644754722109e-05, + "loss": 1.372, + "step": 3145 + }, + { + "epoch": 0.21402364451691805, + "grad_norm": 0.7570081949234009, + "learning_rate": 4.8662352221769264e-05, + "loss": 1.421, + "step": 3150 + }, + { + "epoch": 0.21436336458757985, + "grad_norm": 0.6627448201179504, + "learning_rate": 4.866022897132763e-05, + "loss": 1.3698, + "step": 3155 + }, + { + "epoch": 0.21470308465824162, + "grad_norm": 0.5601295232772827, + "learning_rate": 4.865810572088599e-05, + "loss": 1.4537, + "step": 3160 + }, + { + "epoch": 0.2150428047289034, + "grad_norm": 0.577513575553894, + "learning_rate": 4.8655982470444356e-05, + "loss": 1.5005, + "step": 3165 + }, + { + "epoch": 0.21538252479956516, + "grad_norm": 0.5896574854850769, + "learning_rate": 4.865385922000272e-05, + "loss": 1.2676, + "step": 3170 + }, + { + "epoch": 0.21572224487022693, + "grad_norm": 0.536129355430603, + "learning_rate": 4.8651735969561084e-05, + "loss": 1.4556, + "step": 3175 + }, + { + "epoch": 0.2160619649408887, + "grad_norm": 0.5639377236366272, + "learning_rate": 4.864961271911945e-05, + "loss": 1.3672, + "step": 3180 + }, + { + "epoch": 0.2164016850115505, + "grad_norm": 0.564048171043396, + "learning_rate": 4.864748946867781e-05, + "loss": 1.3609, + "step": 3185 + }, + { + "epoch": 0.21674140508221226, + "grad_norm": 0.604621946811676, + "learning_rate": 4.8645366218236176e-05, + "loss": 1.3545, + "step": 3190 + }, + { + "epoch": 0.21708112515287403, + "grad_norm": 0.5847653150558472, + "learning_rate": 4.864324296779454e-05, + "loss": 1.4451, + "step": 3195 + }, + { + "epoch": 0.2174208452235358, + "grad_norm": 0.643519937992096, + "learning_rate": 4.8641119717352904e-05, + "loss": 1.5368, + "step": 3200 + }, + { + "epoch": 0.21776056529419757, + "grad_norm": 0.6260031461715698, + "learning_rate": 4.863899646691127e-05, + "loss": 1.359, + "step": 3205 + }, + { + "epoch": 0.21810028536485934, + "grad_norm": 0.7368310689926147, + "learning_rate": 4.863687321646963e-05, + "loss": 1.4048, + "step": 3210 + }, + { + "epoch": 0.21844000543552114, + "grad_norm": 0.5465880036354065, + "learning_rate": 4.8634749966027996e-05, + "loss": 1.4155, + "step": 3215 + }, + { + "epoch": 0.2187797255061829, + "grad_norm": 0.691990852355957, + "learning_rate": 4.863262671558636e-05, + "loss": 1.4225, + "step": 3220 + }, + { + "epoch": 0.21911944557684468, + "grad_norm": 0.5825974941253662, + "learning_rate": 4.8630503465144724e-05, + "loss": 1.3682, + "step": 3225 + }, + { + "epoch": 0.21945916564750645, + "grad_norm": 0.5493308305740356, + "learning_rate": 4.862838021470308e-05, + "loss": 1.3828, + "step": 3230 + }, + { + "epoch": 0.21979888571816822, + "grad_norm": 0.6052065491676331, + "learning_rate": 4.862625696426145e-05, + "loss": 1.4233, + "step": 3235 + }, + { + "epoch": 0.22013860578883002, + "grad_norm": 0.6335605978965759, + "learning_rate": 4.8624133713819816e-05, + "loss": 1.4469, + "step": 3240 + }, + { + "epoch": 0.22047832585949179, + "grad_norm": 0.612486720085144, + "learning_rate": 4.862201046337818e-05, + "loss": 1.4722, + "step": 3245 + }, + { + "epoch": 0.22081804593015356, + "grad_norm": 0.6906188130378723, + "learning_rate": 4.8619887212936544e-05, + "loss": 1.4672, + "step": 3250 + }, + { + "epoch": 0.22115776600081533, + "grad_norm": 0.5844310522079468, + "learning_rate": 4.861776396249491e-05, + "loss": 1.4467, + "step": 3255 + }, + { + "epoch": 0.2214974860714771, + "grad_norm": 0.6048876643180847, + "learning_rate": 4.861564071205327e-05, + "loss": 1.4849, + "step": 3260 + }, + { + "epoch": 0.22183720614213887, + "grad_norm": 0.6119795441627502, + "learning_rate": 4.8613517461611636e-05, + "loss": 1.4486, + "step": 3265 + }, + { + "epoch": 0.22217692621280066, + "grad_norm": 0.6853047013282776, + "learning_rate": 4.861139421117e-05, + "loss": 1.3078, + "step": 3270 + }, + { + "epoch": 0.22251664628346243, + "grad_norm": 0.5909206867218018, + "learning_rate": 4.8609270960728364e-05, + "loss": 1.4173, + "step": 3275 + }, + { + "epoch": 0.2228563663541242, + "grad_norm": 0.5987277626991272, + "learning_rate": 4.860714771028673e-05, + "loss": 1.4653, + "step": 3280 + }, + { + "epoch": 0.22319608642478597, + "grad_norm": 0.6398468613624573, + "learning_rate": 4.860502445984509e-05, + "loss": 1.4372, + "step": 3285 + }, + { + "epoch": 0.22353580649544774, + "grad_norm": 0.6329526901245117, + "learning_rate": 4.8602901209403456e-05, + "loss": 1.3694, + "step": 3290 + }, + { + "epoch": 0.22387552656610954, + "grad_norm": 0.5358933806419373, + "learning_rate": 4.860077795896182e-05, + "loss": 1.3054, + "step": 3295 + }, + { + "epoch": 0.2242152466367713, + "grad_norm": 0.5731387734413147, + "learning_rate": 4.859865470852018e-05, + "loss": 1.4455, + "step": 3300 + }, + { + "epoch": 0.22455496670743308, + "grad_norm": 0.5447611212730408, + "learning_rate": 4.859653145807855e-05, + "loss": 1.5764, + "step": 3305 + }, + { + "epoch": 0.22489468677809485, + "grad_norm": 0.5443009734153748, + "learning_rate": 4.859440820763691e-05, + "loss": 1.3469, + "step": 3310 + }, + { + "epoch": 0.22523440684875662, + "grad_norm": 0.6105886697769165, + "learning_rate": 4.859228495719527e-05, + "loss": 1.3663, + "step": 3315 + }, + { + "epoch": 0.2255741269194184, + "grad_norm": 0.5600051879882812, + "learning_rate": 4.859016170675364e-05, + "loss": 1.325, + "step": 3320 + }, + { + "epoch": 0.22591384699008019, + "grad_norm": 0.5838090777397156, + "learning_rate": 4.8588038456312004e-05, + "loss": 1.3707, + "step": 3325 + }, + { + "epoch": 0.22625356706074196, + "grad_norm": 0.5875625014305115, + "learning_rate": 4.858591520587036e-05, + "loss": 1.5171, + "step": 3330 + }, + { + "epoch": 0.22659328713140373, + "grad_norm": 0.5414556860923767, + "learning_rate": 4.858379195542873e-05, + "loss": 1.3891, + "step": 3335 + }, + { + "epoch": 0.2269330072020655, + "grad_norm": 0.6923336982727051, + "learning_rate": 4.8581668704987096e-05, + "loss": 1.4551, + "step": 3340 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.5776564478874207, + "learning_rate": 4.857954545454545e-05, + "loss": 1.3825, + "step": 3345 + }, + { + "epoch": 0.22761244734338903, + "grad_norm": 0.6969068050384521, + "learning_rate": 4.8577422204103824e-05, + "loss": 1.4429, + "step": 3350 + }, + { + "epoch": 0.22795216741405083, + "grad_norm": 0.6452533006668091, + "learning_rate": 4.857529895366219e-05, + "loss": 1.356, + "step": 3355 + }, + { + "epoch": 0.2282918874847126, + "grad_norm": 0.5599813461303711, + "learning_rate": 4.8573175703220545e-05, + "loss": 1.4299, + "step": 3360 + }, + { + "epoch": 0.22863160755537437, + "grad_norm": 0.6101433038711548, + "learning_rate": 4.8571052452778916e-05, + "loss": 1.48, + "step": 3365 + }, + { + "epoch": 0.22897132762603614, + "grad_norm": 0.517551064491272, + "learning_rate": 4.856892920233727e-05, + "loss": 1.3787, + "step": 3370 + }, + { + "epoch": 0.2293110476966979, + "grad_norm": 0.59132319688797, + "learning_rate": 4.856680595189564e-05, + "loss": 1.3269, + "step": 3375 + }, + { + "epoch": 0.2296507677673597, + "grad_norm": 0.6300364136695862, + "learning_rate": 4.856468270145401e-05, + "loss": 1.4273, + "step": 3380 + }, + { + "epoch": 0.22999048783802148, + "grad_norm": 0.5361374616622925, + "learning_rate": 4.8562559451012365e-05, + "loss": 1.4379, + "step": 3385 + }, + { + "epoch": 0.23033020790868325, + "grad_norm": 0.5496021509170532, + "learning_rate": 4.856043620057073e-05, + "loss": 1.4591, + "step": 3390 + }, + { + "epoch": 0.23066992797934502, + "grad_norm": 0.5989214181900024, + "learning_rate": 4.85583129501291e-05, + "loss": 1.3554, + "step": 3395 + }, + { + "epoch": 0.2310096480500068, + "grad_norm": 0.6240394115447998, + "learning_rate": 4.855618969968746e-05, + "loss": 1.4081, + "step": 3400 + }, + { + "epoch": 0.23134936812066856, + "grad_norm": 0.6648991703987122, + "learning_rate": 4.855406644924582e-05, + "loss": 1.4669, + "step": 3405 + }, + { + "epoch": 0.23168908819133036, + "grad_norm": 0.5599499344825745, + "learning_rate": 4.855194319880419e-05, + "loss": 1.309, + "step": 3410 + }, + { + "epoch": 0.23202880826199213, + "grad_norm": 0.6417104601860046, + "learning_rate": 4.854981994836255e-05, + "loss": 1.4025, + "step": 3415 + }, + { + "epoch": 0.2323685283326539, + "grad_norm": 0.5643985271453857, + "learning_rate": 4.854769669792091e-05, + "loss": 1.384, + "step": 3420 + }, + { + "epoch": 0.23270824840331567, + "grad_norm": 0.5993593335151672, + "learning_rate": 4.8545573447479284e-05, + "loss": 1.4265, + "step": 3425 + }, + { + "epoch": 0.23304796847397743, + "grad_norm": 0.612234354019165, + "learning_rate": 4.854345019703764e-05, + "loss": 1.461, + "step": 3430 + }, + { + "epoch": 0.2333876885446392, + "grad_norm": 0.5936884880065918, + "learning_rate": 4.8541326946596005e-05, + "loss": 1.4586, + "step": 3435 + }, + { + "epoch": 0.233727408615301, + "grad_norm": 0.7210749983787537, + "learning_rate": 4.8539203696154376e-05, + "loss": 1.3428, + "step": 3440 + }, + { + "epoch": 0.23406712868596277, + "grad_norm": 0.6005198359489441, + "learning_rate": 4.853708044571273e-05, + "loss": 1.3284, + "step": 3445 + }, + { + "epoch": 0.23440684875662454, + "grad_norm": 0.6938565373420715, + "learning_rate": 4.85349571952711e-05, + "loss": 1.4068, + "step": 3450 + }, + { + "epoch": 0.2347465688272863, + "grad_norm": 0.563916027545929, + "learning_rate": 4.853283394482946e-05, + "loss": 1.478, + "step": 3455 + }, + { + "epoch": 0.23508628889794808, + "grad_norm": 0.6281059384346008, + "learning_rate": 4.8530710694387825e-05, + "loss": 1.3657, + "step": 3460 + }, + { + "epoch": 0.23542600896860988, + "grad_norm": 0.6494596600532532, + "learning_rate": 4.852858744394619e-05, + "loss": 1.4288, + "step": 3465 + }, + { + "epoch": 0.23576572903927165, + "grad_norm": 0.5783873200416565, + "learning_rate": 4.852646419350455e-05, + "loss": 1.3344, + "step": 3470 + }, + { + "epoch": 0.23610544910993342, + "grad_norm": 0.6288142800331116, + "learning_rate": 4.852434094306292e-05, + "loss": 1.3888, + "step": 3475 + }, + { + "epoch": 0.2364451691805952, + "grad_norm": 0.7065832018852234, + "learning_rate": 4.852221769262128e-05, + "loss": 1.4344, + "step": 3480 + }, + { + "epoch": 0.23678488925125696, + "grad_norm": 0.48714709281921387, + "learning_rate": 4.8520094442179645e-05, + "loss": 1.3114, + "step": 3485 + }, + { + "epoch": 0.23712460932191873, + "grad_norm": 0.5135933756828308, + "learning_rate": 4.851797119173801e-05, + "loss": 1.3019, + "step": 3490 + }, + { + "epoch": 0.23746432939258053, + "grad_norm": 0.5906147360801697, + "learning_rate": 4.851584794129637e-05, + "loss": 1.3449, + "step": 3495 + }, + { + "epoch": 0.2378040494632423, + "grad_norm": 0.5758570432662964, + "learning_rate": 4.851372469085474e-05, + "loss": 1.3763, + "step": 3500 + }, + { + "epoch": 0.23814376953390406, + "grad_norm": 0.6372262239456177, + "learning_rate": 4.85116014404131e-05, + "loss": 1.4366, + "step": 3505 + }, + { + "epoch": 0.23848348960456583, + "grad_norm": 0.6069319844245911, + "learning_rate": 4.8509478189971465e-05, + "loss": 1.3901, + "step": 3510 + }, + { + "epoch": 0.2388232096752276, + "grad_norm": 0.5084763765335083, + "learning_rate": 4.850735493952983e-05, + "loss": 1.4231, + "step": 3515 + }, + { + "epoch": 0.23916292974588937, + "grad_norm": 0.6538741588592529, + "learning_rate": 4.850523168908819e-05, + "loss": 1.446, + "step": 3520 + }, + { + "epoch": 0.23950264981655117, + "grad_norm": 0.5911784172058105, + "learning_rate": 4.850310843864656e-05, + "loss": 1.4551, + "step": 3525 + }, + { + "epoch": 0.23984236988721294, + "grad_norm": 0.5534785389900208, + "learning_rate": 4.850098518820492e-05, + "loss": 1.3407, + "step": 3530 + }, + { + "epoch": 0.2401820899578747, + "grad_norm": 0.6726526618003845, + "learning_rate": 4.8498861937763285e-05, + "loss": 1.4309, + "step": 3535 + }, + { + "epoch": 0.24052181002853648, + "grad_norm": 0.5891156792640686, + "learning_rate": 4.849673868732165e-05, + "loss": 1.4469, + "step": 3540 + }, + { + "epoch": 0.24086153009919825, + "grad_norm": 0.7022532820701599, + "learning_rate": 4.849461543688001e-05, + "loss": 1.3612, + "step": 3545 + }, + { + "epoch": 0.24120125016986005, + "grad_norm": 0.6173405051231384, + "learning_rate": 4.849249218643838e-05, + "loss": 1.4501, + "step": 3550 + }, + { + "epoch": 0.24154097024052182, + "grad_norm": 0.573545515537262, + "learning_rate": 4.849036893599674e-05, + "loss": 1.4623, + "step": 3555 + }, + { + "epoch": 0.2418806903111836, + "grad_norm": 0.5794370174407959, + "learning_rate": 4.8488245685555105e-05, + "loss": 1.3815, + "step": 3560 + }, + { + "epoch": 0.24222041038184536, + "grad_norm": 0.6289984583854675, + "learning_rate": 4.848612243511347e-05, + "loss": 1.392, + "step": 3565 + }, + { + "epoch": 0.24256013045250713, + "grad_norm": 0.6021126508712769, + "learning_rate": 4.848399918467183e-05, + "loss": 1.4036, + "step": 3570 + }, + { + "epoch": 0.2428998505231689, + "grad_norm": 0.6499696373939514, + "learning_rate": 4.84818759342302e-05, + "loss": 1.4753, + "step": 3575 + }, + { + "epoch": 0.2432395705938307, + "grad_norm": 0.5753048062324524, + "learning_rate": 4.847975268378856e-05, + "loss": 1.6739, + "step": 3580 + }, + { + "epoch": 0.24357929066449246, + "grad_norm": 0.49970388412475586, + "learning_rate": 4.8477629433346925e-05, + "loss": 1.496, + "step": 3585 + }, + { + "epoch": 0.24391901073515423, + "grad_norm": 0.6353665590286255, + "learning_rate": 4.847550618290529e-05, + "loss": 1.3704, + "step": 3590 + }, + { + "epoch": 0.244258730805816, + "grad_norm": 0.6091659665107727, + "learning_rate": 4.8473382932463653e-05, + "loss": 1.4259, + "step": 3595 + }, + { + "epoch": 0.24459845087647777, + "grad_norm": 0.5981108546257019, + "learning_rate": 4.847125968202202e-05, + "loss": 1.4009, + "step": 3600 + }, + { + "epoch": 0.24493817094713954, + "grad_norm": 0.5597249269485474, + "learning_rate": 4.846913643158038e-05, + "loss": 1.4112, + "step": 3605 + }, + { + "epoch": 0.24527789101780134, + "grad_norm": 0.6595554947853088, + "learning_rate": 4.8467013181138745e-05, + "loss": 1.4225, + "step": 3610 + }, + { + "epoch": 0.2456176110884631, + "grad_norm": 0.6585940718650818, + "learning_rate": 4.846488993069711e-05, + "loss": 1.3301, + "step": 3615 + }, + { + "epoch": 0.24595733115912488, + "grad_norm": 0.7183786034584045, + "learning_rate": 4.8462766680255473e-05, + "loss": 1.3734, + "step": 3620 + }, + { + "epoch": 0.24629705122978665, + "grad_norm": 0.584247887134552, + "learning_rate": 4.846064342981384e-05, + "loss": 1.3636, + "step": 3625 + }, + { + "epoch": 0.24663677130044842, + "grad_norm": 0.56235671043396, + "learning_rate": 4.84585201793722e-05, + "loss": 1.3583, + "step": 3630 + }, + { + "epoch": 0.24697649137111022, + "grad_norm": 0.614459753036499, + "learning_rate": 4.8456396928930565e-05, + "loss": 1.4291, + "step": 3635 + }, + { + "epoch": 0.247316211441772, + "grad_norm": 0.6260278820991516, + "learning_rate": 4.845427367848893e-05, + "loss": 1.4347, + "step": 3640 + }, + { + "epoch": 0.24765593151243376, + "grad_norm": 0.5813421607017517, + "learning_rate": 4.8452150428047293e-05, + "loss": 1.3232, + "step": 3645 + }, + { + "epoch": 0.24799565158309553, + "grad_norm": 0.5234394669532776, + "learning_rate": 4.845002717760566e-05, + "loss": 1.375, + "step": 3650 + }, + { + "epoch": 0.2483353716537573, + "grad_norm": 0.5626548528671265, + "learning_rate": 4.8447903927164015e-05, + "loss": 1.2804, + "step": 3655 + }, + { + "epoch": 0.24867509172441907, + "grad_norm": 0.5892276167869568, + "learning_rate": 4.8445780676722385e-05, + "loss": 1.3753, + "step": 3660 + }, + { + "epoch": 0.24901481179508086, + "grad_norm": 0.5537261962890625, + "learning_rate": 4.844365742628075e-05, + "loss": 1.3937, + "step": 3665 + }, + { + "epoch": 0.24935453186574263, + "grad_norm": 0.6903495788574219, + "learning_rate": 4.844153417583911e-05, + "loss": 1.4007, + "step": 3670 + }, + { + "epoch": 0.2496942519364044, + "grad_norm": 0.6384214162826538, + "learning_rate": 4.843941092539748e-05, + "loss": 1.4474, + "step": 3675 + }, + { + "epoch": 0.2500339720070662, + "grad_norm": 0.5833981037139893, + "learning_rate": 4.843728767495584e-05, + "loss": 1.4378, + "step": 3680 + }, + { + "epoch": 0.25037369207772797, + "grad_norm": 0.7025521993637085, + "learning_rate": 4.84351644245142e-05, + "loss": 1.3458, + "step": 3685 + }, + { + "epoch": 0.2507134121483897, + "grad_norm": 0.5626585483551025, + "learning_rate": 4.843304117407257e-05, + "loss": 1.3712, + "step": 3690 + }, + { + "epoch": 0.2510531322190515, + "grad_norm": 0.6970829367637634, + "learning_rate": 4.8430917923630933e-05, + "loss": 1.4608, + "step": 3695 + }, + { + "epoch": 0.25139285228971325, + "grad_norm": 0.764566957950592, + "learning_rate": 4.842879467318929e-05, + "loss": 1.3965, + "step": 3700 + }, + { + "epoch": 0.25173257236037505, + "grad_norm": 0.5882022976875305, + "learning_rate": 4.842667142274766e-05, + "loss": 1.3826, + "step": 3705 + }, + { + "epoch": 0.25207229243103685, + "grad_norm": 0.559703528881073, + "learning_rate": 4.8424548172306025e-05, + "loss": 1.4295, + "step": 3710 + }, + { + "epoch": 0.2524120125016986, + "grad_norm": 0.6943153142929077, + "learning_rate": 4.842242492186438e-05, + "loss": 1.4013, + "step": 3715 + }, + { + "epoch": 0.2527517325723604, + "grad_norm": 0.6310819387435913, + "learning_rate": 4.8420301671422754e-05, + "loss": 1.3209, + "step": 3720 + }, + { + "epoch": 0.25309145264302213, + "grad_norm": 0.4891515076160431, + "learning_rate": 4.841817842098111e-05, + "loss": 1.3769, + "step": 3725 + }, + { + "epoch": 0.2534311727136839, + "grad_norm": 0.5766716003417969, + "learning_rate": 4.8416055170539475e-05, + "loss": 1.2176, + "step": 3730 + }, + { + "epoch": 0.2537708927843457, + "grad_norm": 0.7314422130584717, + "learning_rate": 4.8413931920097846e-05, + "loss": 1.495, + "step": 3735 + }, + { + "epoch": 0.25411061285500747, + "grad_norm": 0.6508923172950745, + "learning_rate": 4.84118086696562e-05, + "loss": 1.3688, + "step": 3740 + }, + { + "epoch": 0.25445033292566926, + "grad_norm": 0.6452445983886719, + "learning_rate": 4.840968541921457e-05, + "loss": 1.3935, + "step": 3745 + }, + { + "epoch": 0.254790052996331, + "grad_norm": 0.6243131756782532, + "learning_rate": 4.840756216877294e-05, + "loss": 1.497, + "step": 3750 + }, + { + "epoch": 0.2551297730669928, + "grad_norm": 0.6898776888847351, + "learning_rate": 4.8405438918331295e-05, + "loss": 1.5104, + "step": 3755 + }, + { + "epoch": 0.25546949313765455, + "grad_norm": 0.5548217296600342, + "learning_rate": 4.840331566788966e-05, + "loss": 1.385, + "step": 3760 + }, + { + "epoch": 0.25580921320831634, + "grad_norm": 0.5216907858848572, + "learning_rate": 4.840119241744803e-05, + "loss": 1.4015, + "step": 3765 + }, + { + "epoch": 0.25614893327897814, + "grad_norm": 0.5976521372795105, + "learning_rate": 4.839906916700639e-05, + "loss": 1.3784, + "step": 3770 + }, + { + "epoch": 0.2564886533496399, + "grad_norm": 0.5819798111915588, + "learning_rate": 4.839694591656475e-05, + "loss": 1.4777, + "step": 3775 + }, + { + "epoch": 0.2568283734203017, + "grad_norm": 0.5949716567993164, + "learning_rate": 4.839482266612312e-05, + "loss": 1.3815, + "step": 3780 + }, + { + "epoch": 0.2571680934909634, + "grad_norm": 0.55443274974823, + "learning_rate": 4.839269941568148e-05, + "loss": 1.4397, + "step": 3785 + }, + { + "epoch": 0.2575078135616252, + "grad_norm": 0.581902801990509, + "learning_rate": 4.839057616523984e-05, + "loss": 1.4122, + "step": 3790 + }, + { + "epoch": 0.257847533632287, + "grad_norm": 0.5934801697731018, + "learning_rate": 4.8388452914798214e-05, + "loss": 1.3265, + "step": 3795 + }, + { + "epoch": 0.25818725370294876, + "grad_norm": 0.5715326070785522, + "learning_rate": 4.838632966435657e-05, + "loss": 1.3898, + "step": 3800 + }, + { + "epoch": 0.25852697377361056, + "grad_norm": 0.5871545672416687, + "learning_rate": 4.8384206413914935e-05, + "loss": 1.3728, + "step": 3805 + }, + { + "epoch": 0.2588666938442723, + "grad_norm": 0.6707566976547241, + "learning_rate": 4.83820831634733e-05, + "loss": 1.3844, + "step": 3810 + }, + { + "epoch": 0.2592064139149341, + "grad_norm": 0.6047499179840088, + "learning_rate": 4.837995991303166e-05, + "loss": 1.3889, + "step": 3815 + }, + { + "epoch": 0.2595461339855959, + "grad_norm": 0.62021404504776, + "learning_rate": 4.837783666259003e-05, + "loss": 1.4028, + "step": 3820 + }, + { + "epoch": 0.25988585405625764, + "grad_norm": 0.6598581075668335, + "learning_rate": 4.837571341214839e-05, + "loss": 1.4629, + "step": 3825 + }, + { + "epoch": 0.26022557412691943, + "grad_norm": 0.6117517352104187, + "learning_rate": 4.8373590161706755e-05, + "loss": 1.325, + "step": 3830 + }, + { + "epoch": 0.2605652941975812, + "grad_norm": 0.49917417764663696, + "learning_rate": 4.837146691126512e-05, + "loss": 1.3048, + "step": 3835 + }, + { + "epoch": 0.260905014268243, + "grad_norm": 0.6131162643432617, + "learning_rate": 4.836934366082348e-05, + "loss": 1.4287, + "step": 3840 + }, + { + "epoch": 0.2612447343389047, + "grad_norm": 0.6981179118156433, + "learning_rate": 4.836722041038185e-05, + "loss": 1.4482, + "step": 3845 + }, + { + "epoch": 0.2615844544095665, + "grad_norm": 0.5723700523376465, + "learning_rate": 4.836509715994021e-05, + "loss": 1.4416, + "step": 3850 + }, + { + "epoch": 0.2619241744802283, + "grad_norm": 0.6441469192504883, + "learning_rate": 4.8362973909498575e-05, + "loss": 1.4, + "step": 3855 + }, + { + "epoch": 0.26226389455089005, + "grad_norm": 0.6017731428146362, + "learning_rate": 4.836085065905694e-05, + "loss": 1.4324, + "step": 3860 + }, + { + "epoch": 0.26260361462155185, + "grad_norm": 0.6027065515518188, + "learning_rate": 4.83587274086153e-05, + "loss": 1.405, + "step": 3865 + }, + { + "epoch": 0.2629433346922136, + "grad_norm": 0.573409378528595, + "learning_rate": 4.835660415817367e-05, + "loss": 1.3482, + "step": 3870 + }, + { + "epoch": 0.2632830547628754, + "grad_norm": 0.6863682270050049, + "learning_rate": 4.835448090773203e-05, + "loss": 1.3839, + "step": 3875 + }, + { + "epoch": 0.2636227748335372, + "grad_norm": 0.6273049116134644, + "learning_rate": 4.8352357657290395e-05, + "loss": 1.3573, + "step": 3880 + }, + { + "epoch": 0.26396249490419893, + "grad_norm": 0.593453586101532, + "learning_rate": 4.835023440684876e-05, + "loss": 1.4102, + "step": 3885 + }, + { + "epoch": 0.2643022149748607, + "grad_norm": 0.6387911438941956, + "learning_rate": 4.834811115640712e-05, + "loss": 1.4568, + "step": 3890 + }, + { + "epoch": 0.26464193504552247, + "grad_norm": 0.6193586587905884, + "learning_rate": 4.834598790596549e-05, + "loss": 1.3945, + "step": 3895 + }, + { + "epoch": 0.26498165511618427, + "grad_norm": 0.6583086252212524, + "learning_rate": 4.834386465552385e-05, + "loss": 1.3465, + "step": 3900 + }, + { + "epoch": 0.26532137518684606, + "grad_norm": 0.6301378011703491, + "learning_rate": 4.8341741405082215e-05, + "loss": 1.4004, + "step": 3905 + }, + { + "epoch": 0.2656610952575078, + "grad_norm": 0.577540934085846, + "learning_rate": 4.833961815464058e-05, + "loss": 1.4346, + "step": 3910 + }, + { + "epoch": 0.2660008153281696, + "grad_norm": 0.6303648948669434, + "learning_rate": 4.833749490419894e-05, + "loss": 1.3866, + "step": 3915 + }, + { + "epoch": 0.26634053539883135, + "grad_norm": 0.6956437230110168, + "learning_rate": 4.833537165375731e-05, + "loss": 1.4491, + "step": 3920 + }, + { + "epoch": 0.26668025546949314, + "grad_norm": 0.656446635723114, + "learning_rate": 4.833324840331567e-05, + "loss": 1.4148, + "step": 3925 + }, + { + "epoch": 0.2670199755401549, + "grad_norm": 0.5804231762886047, + "learning_rate": 4.8331125152874035e-05, + "loss": 1.3895, + "step": 3930 + }, + { + "epoch": 0.2673596956108167, + "grad_norm": 0.5970335602760315, + "learning_rate": 4.83290019024324e-05, + "loss": 1.3782, + "step": 3935 + }, + { + "epoch": 0.2676994156814785, + "grad_norm": 0.6738160848617554, + "learning_rate": 4.832687865199076e-05, + "loss": 1.4232, + "step": 3940 + }, + { + "epoch": 0.2680391357521402, + "grad_norm": 0.5744298696517944, + "learning_rate": 4.832475540154913e-05, + "loss": 1.4554, + "step": 3945 + }, + { + "epoch": 0.268378855822802, + "grad_norm": 0.6450674533843994, + "learning_rate": 4.832263215110749e-05, + "loss": 1.4713, + "step": 3950 + }, + { + "epoch": 0.26871857589346376, + "grad_norm": 0.6713495850563049, + "learning_rate": 4.8320508900665855e-05, + "loss": 1.3377, + "step": 3955 + }, + { + "epoch": 0.26905829596412556, + "grad_norm": 0.6296396255493164, + "learning_rate": 4.831838565022422e-05, + "loss": 1.4575, + "step": 3960 + }, + { + "epoch": 0.26939801603478736, + "grad_norm": 0.6520622968673706, + "learning_rate": 4.831626239978258e-05, + "loss": 1.4549, + "step": 3965 + }, + { + "epoch": 0.2697377361054491, + "grad_norm": 0.6124261617660522, + "learning_rate": 4.831413914934095e-05, + "loss": 1.3898, + "step": 3970 + }, + { + "epoch": 0.2700774561761109, + "grad_norm": 0.6756829023361206, + "learning_rate": 4.831201589889931e-05, + "loss": 1.5082, + "step": 3975 + }, + { + "epoch": 0.27041717624677264, + "grad_norm": 0.6021645069122314, + "learning_rate": 4.8309892648457675e-05, + "loss": 1.4192, + "step": 3980 + }, + { + "epoch": 0.27075689631743444, + "grad_norm": 0.6154415011405945, + "learning_rate": 4.830776939801604e-05, + "loss": 1.4664, + "step": 3985 + }, + { + "epoch": 0.27109661638809623, + "grad_norm": 0.6155113577842712, + "learning_rate": 4.83056461475744e-05, + "loss": 1.4474, + "step": 3990 + }, + { + "epoch": 0.271436336458758, + "grad_norm": 0.565564751625061, + "learning_rate": 4.830352289713277e-05, + "loss": 1.4619, + "step": 3995 + }, + { + "epoch": 0.2717760565294198, + "grad_norm": 0.699712336063385, + "learning_rate": 4.830139964669113e-05, + "loss": 1.5082, + "step": 4000 + }, + { + "epoch": 0.2721157766000815, + "grad_norm": 0.5771880149841309, + "learning_rate": 4.8299276396249495e-05, + "loss": 1.4625, + "step": 4005 + }, + { + "epoch": 0.2724554966707433, + "grad_norm": 0.6200066804885864, + "learning_rate": 4.829715314580785e-05, + "loss": 1.4102, + "step": 4010 + }, + { + "epoch": 0.27279521674140506, + "grad_norm": 0.5667638182640076, + "learning_rate": 4.829502989536622e-05, + "loss": 1.4367, + "step": 4015 + }, + { + "epoch": 0.27313493681206685, + "grad_norm": 0.6307473182678223, + "learning_rate": 4.829290664492459e-05, + "loss": 1.4044, + "step": 4020 + }, + { + "epoch": 0.27347465688272865, + "grad_norm": 0.5659202933311462, + "learning_rate": 4.8290783394482944e-05, + "loss": 1.4817, + "step": 4025 + }, + { + "epoch": 0.2738143769533904, + "grad_norm": 0.6943925619125366, + "learning_rate": 4.8288660144041315e-05, + "loss": 1.3818, + "step": 4030 + }, + { + "epoch": 0.2741540970240522, + "grad_norm": 0.9133588075637817, + "learning_rate": 4.828653689359968e-05, + "loss": 1.4241, + "step": 4035 + }, + { + "epoch": 0.27449381709471393, + "grad_norm": 0.5766347050666809, + "learning_rate": 4.8284413643158036e-05, + "loss": 1.3824, + "step": 4040 + }, + { + "epoch": 0.27483353716537573, + "grad_norm": 0.5434534549713135, + "learning_rate": 4.828229039271641e-05, + "loss": 1.5095, + "step": 4045 + }, + { + "epoch": 0.2751732572360375, + "grad_norm": 0.6625797152519226, + "learning_rate": 4.828016714227477e-05, + "loss": 1.4583, + "step": 4050 + }, + { + "epoch": 0.27551297730669927, + "grad_norm": 0.5697526335716248, + "learning_rate": 4.827804389183313e-05, + "loss": 1.4301, + "step": 4055 + }, + { + "epoch": 0.27585269737736107, + "grad_norm": 0.6194468140602112, + "learning_rate": 4.82759206413915e-05, + "loss": 1.3168, + "step": 4060 + }, + { + "epoch": 0.2761924174480228, + "grad_norm": 0.6437966823577881, + "learning_rate": 4.827379739094986e-05, + "loss": 1.3462, + "step": 4065 + }, + { + "epoch": 0.2765321375186846, + "grad_norm": 0.6066455841064453, + "learning_rate": 4.827167414050822e-05, + "loss": 1.3853, + "step": 4070 + }, + { + "epoch": 0.2768718575893464, + "grad_norm": 0.6828492879867554, + "learning_rate": 4.826955089006659e-05, + "loss": 1.4077, + "step": 4075 + }, + { + "epoch": 0.27721157766000815, + "grad_norm": 0.5618029236793518, + "learning_rate": 4.826742763962495e-05, + "loss": 1.4275, + "step": 4080 + }, + { + "epoch": 0.27755129773066994, + "grad_norm": 0.5699742436408997, + "learning_rate": 4.826530438918331e-05, + "loss": 1.3868, + "step": 4085 + }, + { + "epoch": 0.2778910178013317, + "grad_norm": 0.600041925907135, + "learning_rate": 4.826318113874168e-05, + "loss": 1.4206, + "step": 4090 + }, + { + "epoch": 0.2782307378719935, + "grad_norm": 0.6188768148422241, + "learning_rate": 4.826105788830004e-05, + "loss": 1.4262, + "step": 4095 + }, + { + "epoch": 0.2785704579426552, + "grad_norm": 0.5706759691238403, + "learning_rate": 4.8258934637858404e-05, + "loss": 1.4351, + "step": 4100 + }, + { + "epoch": 0.278910178013317, + "grad_norm": 0.5694918036460876, + "learning_rate": 4.8256811387416775e-05, + "loss": 1.4526, + "step": 4105 + }, + { + "epoch": 0.2792498980839788, + "grad_norm": 0.5923181772232056, + "learning_rate": 4.825468813697513e-05, + "loss": 1.3486, + "step": 4110 + }, + { + "epoch": 0.27958961815464056, + "grad_norm": 0.5981913805007935, + "learning_rate": 4.8252564886533496e-05, + "loss": 1.3979, + "step": 4115 + }, + { + "epoch": 0.27992933822530236, + "grad_norm": 0.6309303045272827, + "learning_rate": 4.825044163609187e-05, + "loss": 1.4769, + "step": 4120 + }, + { + "epoch": 0.2802690582959641, + "grad_norm": 0.6503332853317261, + "learning_rate": 4.8248318385650224e-05, + "loss": 1.3807, + "step": 4125 + }, + { + "epoch": 0.2806087783666259, + "grad_norm": 0.5759245157241821, + "learning_rate": 4.824619513520859e-05, + "loss": 1.3595, + "step": 4130 + }, + { + "epoch": 0.2809484984372877, + "grad_norm": 0.5755437016487122, + "learning_rate": 4.824407188476696e-05, + "loss": 1.425, + "step": 4135 + }, + { + "epoch": 0.28128821850794944, + "grad_norm": 0.5048924684524536, + "learning_rate": 4.8241948634325316e-05, + "loss": 1.3901, + "step": 4140 + }, + { + "epoch": 0.28162793857861124, + "grad_norm": 0.694521427154541, + "learning_rate": 4.823982538388368e-05, + "loss": 1.3302, + "step": 4145 + }, + { + "epoch": 0.281967658649273, + "grad_norm": 0.6683419346809387, + "learning_rate": 4.8237702133442044e-05, + "loss": 1.3654, + "step": 4150 + }, + { + "epoch": 0.2823073787199348, + "grad_norm": 0.6541835069656372, + "learning_rate": 4.823557888300041e-05, + "loss": 1.463, + "step": 4155 + }, + { + "epoch": 0.2826470987905966, + "grad_norm": 0.6378797292709351, + "learning_rate": 4.823345563255877e-05, + "loss": 1.4953, + "step": 4160 + }, + { + "epoch": 0.2829868188612583, + "grad_norm": 0.6032472252845764, + "learning_rate": 4.8231332382117136e-05, + "loss": 1.3275, + "step": 4165 + }, + { + "epoch": 0.2833265389319201, + "grad_norm": 0.5532385110855103, + "learning_rate": 4.82292091316755e-05, + "loss": 1.3732, + "step": 4170 + }, + { + "epoch": 0.28366625900258186, + "grad_norm": 0.6906827092170715, + "learning_rate": 4.8227085881233864e-05, + "loss": 1.4511, + "step": 4175 + }, + { + "epoch": 0.28400597907324365, + "grad_norm": 0.5115543603897095, + "learning_rate": 4.822496263079223e-05, + "loss": 1.2993, + "step": 4180 + }, + { + "epoch": 0.2843456991439054, + "grad_norm": 0.6229546070098877, + "learning_rate": 4.822283938035059e-05, + "loss": 1.4264, + "step": 4185 + }, + { + "epoch": 0.2846854192145672, + "grad_norm": 0.610550045967102, + "learning_rate": 4.8220716129908956e-05, + "loss": 1.3518, + "step": 4190 + }, + { + "epoch": 0.285025139285229, + "grad_norm": 0.6253818273544312, + "learning_rate": 4.821859287946732e-05, + "loss": 1.3582, + "step": 4195 + }, + { + "epoch": 0.28536485935589073, + "grad_norm": 0.6032271981239319, + "learning_rate": 4.8216469629025684e-05, + "loss": 1.3786, + "step": 4200 + }, + { + "epoch": 0.28570457942655253, + "grad_norm": 0.5876577496528625, + "learning_rate": 4.821434637858405e-05, + "loss": 1.432, + "step": 4205 + }, + { + "epoch": 0.28604429949721427, + "grad_norm": 0.5510315299034119, + "learning_rate": 4.821222312814241e-05, + "loss": 1.4849, + "step": 4210 + }, + { + "epoch": 0.28638401956787607, + "grad_norm": 0.6315826773643494, + "learning_rate": 4.8210099877700776e-05, + "loss": 1.3832, + "step": 4215 + }, + { + "epoch": 0.28672373963853787, + "grad_norm": 0.5919789671897888, + "learning_rate": 4.820797662725914e-05, + "loss": 1.4017, + "step": 4220 + }, + { + "epoch": 0.2870634597091996, + "grad_norm": 0.578884482383728, + "learning_rate": 4.8205853376817504e-05, + "loss": 1.5031, + "step": 4225 + }, + { + "epoch": 0.2874031797798614, + "grad_norm": 0.5814526677131653, + "learning_rate": 4.820373012637587e-05, + "loss": 1.359, + "step": 4230 + }, + { + "epoch": 0.28774289985052315, + "grad_norm": 0.5195465683937073, + "learning_rate": 4.820160687593423e-05, + "loss": 1.3718, + "step": 4235 + }, + { + "epoch": 0.28808261992118495, + "grad_norm": 0.6810558438301086, + "learning_rate": 4.8199483625492596e-05, + "loss": 1.3671, + "step": 4240 + }, + { + "epoch": 0.28842233999184674, + "grad_norm": 0.622515857219696, + "learning_rate": 4.819736037505096e-05, + "loss": 1.4952, + "step": 4245 + }, + { + "epoch": 0.2887620600625085, + "grad_norm": 0.6229998469352722, + "learning_rate": 4.8195237124609324e-05, + "loss": 1.3636, + "step": 4250 + }, + { + "epoch": 0.2891017801331703, + "grad_norm": 0.5896877646446228, + "learning_rate": 4.819311387416769e-05, + "loss": 1.4003, + "step": 4255 + }, + { + "epoch": 0.289441500203832, + "grad_norm": 0.676675021648407, + "learning_rate": 4.819099062372605e-05, + "loss": 1.5011, + "step": 4260 + }, + { + "epoch": 0.2897812202744938, + "grad_norm": 0.6762051582336426, + "learning_rate": 4.8188867373284416e-05, + "loss": 1.4601, + "step": 4265 + }, + { + "epoch": 0.29012094034515556, + "grad_norm": 0.5926623940467834, + "learning_rate": 4.818674412284278e-05, + "loss": 1.3434, + "step": 4270 + }, + { + "epoch": 0.29046066041581736, + "grad_norm": 0.49280211329460144, + "learning_rate": 4.8184620872401144e-05, + "loss": 1.3949, + "step": 4275 + }, + { + "epoch": 0.29080038048647916, + "grad_norm": 0.6341902613639832, + "learning_rate": 4.818249762195951e-05, + "loss": 1.3213, + "step": 4280 + }, + { + "epoch": 0.2911401005571409, + "grad_norm": 0.6441182494163513, + "learning_rate": 4.818037437151787e-05, + "loss": 1.3687, + "step": 4285 + }, + { + "epoch": 0.2914798206278027, + "grad_norm": 0.6745972037315369, + "learning_rate": 4.8178251121076236e-05, + "loss": 1.3677, + "step": 4290 + }, + { + "epoch": 0.29181954069846444, + "grad_norm": 0.6068249940872192, + "learning_rate": 4.81761278706346e-05, + "loss": 1.4581, + "step": 4295 + }, + { + "epoch": 0.29215926076912624, + "grad_norm": 0.6574903130531311, + "learning_rate": 4.8174004620192964e-05, + "loss": 1.4814, + "step": 4300 + }, + { + "epoch": 0.29249898083978804, + "grad_norm": 0.5898470878601074, + "learning_rate": 4.817188136975133e-05, + "loss": 1.3846, + "step": 4305 + }, + { + "epoch": 0.2928387009104498, + "grad_norm": 0.7001326084136963, + "learning_rate": 4.816975811930969e-05, + "loss": 1.4393, + "step": 4310 + }, + { + "epoch": 0.2931784209811116, + "grad_norm": 0.6270405054092407, + "learning_rate": 4.8167634868868056e-05, + "loss": 1.3712, + "step": 4315 + }, + { + "epoch": 0.2935181410517733, + "grad_norm": 0.597356379032135, + "learning_rate": 4.816551161842642e-05, + "loss": 1.3049, + "step": 4320 + }, + { + "epoch": 0.2938578611224351, + "grad_norm": 0.6218376755714417, + "learning_rate": 4.8163388367984784e-05, + "loss": 1.3632, + "step": 4325 + }, + { + "epoch": 0.2941975811930969, + "grad_norm": 0.7287354469299316, + "learning_rate": 4.816126511754315e-05, + "loss": 1.4058, + "step": 4330 + }, + { + "epoch": 0.29453730126375866, + "grad_norm": 0.6104965209960938, + "learning_rate": 4.815914186710151e-05, + "loss": 1.5259, + "step": 4335 + }, + { + "epoch": 0.29487702133442045, + "grad_norm": 0.6057714223861694, + "learning_rate": 4.8157018616659876e-05, + "loss": 1.3611, + "step": 4340 + }, + { + "epoch": 0.2952167414050822, + "grad_norm": 0.6120069622993469, + "learning_rate": 4.815489536621824e-05, + "loss": 1.3453, + "step": 4345 + }, + { + "epoch": 0.295556461475744, + "grad_norm": 0.6051473617553711, + "learning_rate": 4.81527721157766e-05, + "loss": 1.4418, + "step": 4350 + }, + { + "epoch": 0.29589618154640573, + "grad_norm": 0.6352353096008301, + "learning_rate": 4.815064886533497e-05, + "loss": 1.3684, + "step": 4355 + }, + { + "epoch": 0.29623590161706753, + "grad_norm": 0.5874910950660706, + "learning_rate": 4.814852561489333e-05, + "loss": 1.3508, + "step": 4360 + }, + { + "epoch": 0.29657562168772933, + "grad_norm": 0.6882878541946411, + "learning_rate": 4.814640236445169e-05, + "loss": 1.4451, + "step": 4365 + }, + { + "epoch": 0.29691534175839107, + "grad_norm": 0.6234210133552551, + "learning_rate": 4.814427911401006e-05, + "loss": 1.4546, + "step": 4370 + }, + { + "epoch": 0.29725506182905287, + "grad_norm": 0.544707179069519, + "learning_rate": 4.8142155863568424e-05, + "loss": 1.265, + "step": 4375 + }, + { + "epoch": 0.2975947818997146, + "grad_norm": 0.6247130632400513, + "learning_rate": 4.814003261312678e-05, + "loss": 1.3925, + "step": 4380 + }, + { + "epoch": 0.2979345019703764, + "grad_norm": 0.5478698015213013, + "learning_rate": 4.813790936268515e-05, + "loss": 1.3705, + "step": 4385 + }, + { + "epoch": 0.2982742220410382, + "grad_norm": 0.628241777420044, + "learning_rate": 4.8135786112243516e-05, + "loss": 1.4246, + "step": 4390 + }, + { + "epoch": 0.29861394211169995, + "grad_norm": 0.5739772915840149, + "learning_rate": 4.8133662861801874e-05, + "loss": 1.4448, + "step": 4395 + }, + { + "epoch": 0.29895366218236175, + "grad_norm": 0.6296387314796448, + "learning_rate": 4.8131539611360244e-05, + "loss": 1.4738, + "step": 4400 + }, + { + "epoch": 0.2992933822530235, + "grad_norm": 0.5494669675827026, + "learning_rate": 4.812941636091861e-05, + "loss": 1.4006, + "step": 4405 + }, + { + "epoch": 0.2996331023236853, + "grad_norm": 0.5814257860183716, + "learning_rate": 4.8127293110476966e-05, + "loss": 1.2864, + "step": 4410 + }, + { + "epoch": 0.2999728223943471, + "grad_norm": 0.656559944152832, + "learning_rate": 4.8125169860035336e-05, + "loss": 1.3349, + "step": 4415 + }, + { + "epoch": 0.3003125424650088, + "grad_norm": 0.6489969491958618, + "learning_rate": 4.81230466095937e-05, + "loss": 1.4294, + "step": 4420 + }, + { + "epoch": 0.3006522625356706, + "grad_norm": 0.6143659949302673, + "learning_rate": 4.812092335915206e-05, + "loss": 1.4744, + "step": 4425 + }, + { + "epoch": 0.30099198260633236, + "grad_norm": 0.6614595055580139, + "learning_rate": 4.811880010871043e-05, + "loss": 1.5482, + "step": 4430 + }, + { + "epoch": 0.30133170267699416, + "grad_norm": 0.5838325619697571, + "learning_rate": 4.8116676858268786e-05, + "loss": 1.4307, + "step": 4435 + }, + { + "epoch": 0.3016714227476559, + "grad_norm": 0.7493719458580017, + "learning_rate": 4.811455360782715e-05, + "loss": 1.3723, + "step": 4440 + }, + { + "epoch": 0.3020111428183177, + "grad_norm": 0.5825745463371277, + "learning_rate": 4.811243035738552e-05, + "loss": 1.4164, + "step": 4445 + }, + { + "epoch": 0.3023508628889795, + "grad_norm": 0.6245297789573669, + "learning_rate": 4.811030710694388e-05, + "loss": 1.3359, + "step": 4450 + }, + { + "epoch": 0.30269058295964124, + "grad_norm": 0.5819307565689087, + "learning_rate": 4.810818385650224e-05, + "loss": 1.3274, + "step": 4455 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.6597985625267029, + "learning_rate": 4.810606060606061e-05, + "loss": 1.3779, + "step": 4460 + }, + { + "epoch": 0.3033700231009648, + "grad_norm": 0.6612825989723206, + "learning_rate": 4.810393735561897e-05, + "loss": 1.4092, + "step": 4465 + }, + { + "epoch": 0.3037097431716266, + "grad_norm": 0.5472775101661682, + "learning_rate": 4.8101814105177334e-05, + "loss": 1.3949, + "step": 4470 + }, + { + "epoch": 0.3040494632422884, + "grad_norm": 0.603412389755249, + "learning_rate": 4.8099690854735705e-05, + "loss": 1.4901, + "step": 4475 + }, + { + "epoch": 0.3043891833129501, + "grad_norm": 0.878606379032135, + "learning_rate": 4.809756760429406e-05, + "loss": 1.3746, + "step": 4480 + }, + { + "epoch": 0.3047289033836119, + "grad_norm": 0.5390682220458984, + "learning_rate": 4.8095444353852426e-05, + "loss": 1.4478, + "step": 4485 + }, + { + "epoch": 0.30506862345427366, + "grad_norm": 0.6324414014816284, + "learning_rate": 4.8093321103410797e-05, + "loss": 1.5412, + "step": 4490 + }, + { + "epoch": 0.30540834352493546, + "grad_norm": 0.6175958514213562, + "learning_rate": 4.8091197852969154e-05, + "loss": 1.3482, + "step": 4495 + }, + { + "epoch": 0.30574806359559725, + "grad_norm": 0.6746761798858643, + "learning_rate": 4.808907460252752e-05, + "loss": 1.4718, + "step": 4500 + }, + { + "epoch": 0.306087783666259, + "grad_norm": 0.6800394058227539, + "learning_rate": 4.808695135208588e-05, + "loss": 1.34, + "step": 4505 + }, + { + "epoch": 0.3064275037369208, + "grad_norm": 0.5797684788703918, + "learning_rate": 4.8084828101644246e-05, + "loss": 1.433, + "step": 4510 + }, + { + "epoch": 0.30676722380758253, + "grad_norm": 0.5536337494850159, + "learning_rate": 4.808270485120261e-05, + "loss": 1.3905, + "step": 4515 + }, + { + "epoch": 0.30710694387824433, + "grad_norm": 0.6156224608421326, + "learning_rate": 4.8080581600760974e-05, + "loss": 1.3857, + "step": 4520 + }, + { + "epoch": 0.3074466639489061, + "grad_norm": 0.5743491649627686, + "learning_rate": 4.807845835031934e-05, + "loss": 1.329, + "step": 4525 + }, + { + "epoch": 0.30778638401956787, + "grad_norm": 0.46847978234291077, + "learning_rate": 4.80763350998777e-05, + "loss": 1.4378, + "step": 4530 + }, + { + "epoch": 0.30812610409022967, + "grad_norm": 0.7120422124862671, + "learning_rate": 4.8074211849436066e-05, + "loss": 1.3574, + "step": 4535 + }, + { + "epoch": 0.3084658241608914, + "grad_norm": 0.6666555404663086, + "learning_rate": 4.807208859899443e-05, + "loss": 1.4085, + "step": 4540 + }, + { + "epoch": 0.3088055442315532, + "grad_norm": 0.5958794355392456, + "learning_rate": 4.8069965348552794e-05, + "loss": 1.3687, + "step": 4545 + }, + { + "epoch": 0.30914526430221495, + "grad_norm": 0.6510292291641235, + "learning_rate": 4.806784209811116e-05, + "loss": 1.4446, + "step": 4550 + }, + { + "epoch": 0.30948498437287675, + "grad_norm": 0.5929175019264221, + "learning_rate": 4.806571884766952e-05, + "loss": 1.3678, + "step": 4555 + }, + { + "epoch": 0.30982470444353855, + "grad_norm": 0.6229990720748901, + "learning_rate": 4.8063595597227886e-05, + "loss": 1.3999, + "step": 4560 + }, + { + "epoch": 0.3101644245142003, + "grad_norm": 0.6406978368759155, + "learning_rate": 4.806147234678625e-05, + "loss": 1.4608, + "step": 4565 + }, + { + "epoch": 0.3105041445848621, + "grad_norm": 0.596836268901825, + "learning_rate": 4.8059349096344614e-05, + "loss": 1.3946, + "step": 4570 + }, + { + "epoch": 0.3108438646555238, + "grad_norm": 0.6312773823738098, + "learning_rate": 4.805722584590298e-05, + "loss": 1.3129, + "step": 4575 + }, + { + "epoch": 0.3111835847261856, + "grad_norm": 0.5499838590621948, + "learning_rate": 4.805510259546134e-05, + "loss": 1.4556, + "step": 4580 + }, + { + "epoch": 0.3115233047968474, + "grad_norm": 0.5884018540382385, + "learning_rate": 4.8052979345019706e-05, + "loss": 1.4935, + "step": 4585 + }, + { + "epoch": 0.31186302486750916, + "grad_norm": 0.5463150143623352, + "learning_rate": 4.805085609457807e-05, + "loss": 1.373, + "step": 4590 + }, + { + "epoch": 0.31220274493817096, + "grad_norm": 0.6012735366821289, + "learning_rate": 4.8048732844136434e-05, + "loss": 1.4114, + "step": 4595 + }, + { + "epoch": 0.3125424650088327, + "grad_norm": 0.6352843046188354, + "learning_rate": 4.80466095936948e-05, + "loss": 1.3933, + "step": 4600 + }, + { + "epoch": 0.3128821850794945, + "grad_norm": 0.5974364280700684, + "learning_rate": 4.804448634325316e-05, + "loss": 1.3988, + "step": 4605 + }, + { + "epoch": 0.3132219051501563, + "grad_norm": 0.6205219626426697, + "learning_rate": 4.8042363092811526e-05, + "loss": 1.3811, + "step": 4610 + }, + { + "epoch": 0.31356162522081804, + "grad_norm": 0.5919428467750549, + "learning_rate": 4.804023984236989e-05, + "loss": 1.3911, + "step": 4615 + }, + { + "epoch": 0.31390134529147984, + "grad_norm": 0.59086674451828, + "learning_rate": 4.8038116591928254e-05, + "loss": 1.336, + "step": 4620 + }, + { + "epoch": 0.3142410653621416, + "grad_norm": 0.5678482055664062, + "learning_rate": 4.803599334148662e-05, + "loss": 1.3585, + "step": 4625 + }, + { + "epoch": 0.3145807854328034, + "grad_norm": 0.6020316481590271, + "learning_rate": 4.803387009104498e-05, + "loss": 1.3429, + "step": 4630 + }, + { + "epoch": 0.3149205055034651, + "grad_norm": 0.6483088731765747, + "learning_rate": 4.8031746840603346e-05, + "loss": 1.3824, + "step": 4635 + }, + { + "epoch": 0.3152602255741269, + "grad_norm": 0.5800559520721436, + "learning_rate": 4.802962359016171e-05, + "loss": 1.3792, + "step": 4640 + }, + { + "epoch": 0.3155999456447887, + "grad_norm": 0.529157280921936, + "learning_rate": 4.8027500339720074e-05, + "loss": 1.454, + "step": 4645 + }, + { + "epoch": 0.31593966571545046, + "grad_norm": 0.673725426197052, + "learning_rate": 4.802537708927844e-05, + "loss": 1.3433, + "step": 4650 + }, + { + "epoch": 0.31627938578611225, + "grad_norm": 0.5634777545928955, + "learning_rate": 4.80232538388368e-05, + "loss": 1.408, + "step": 4655 + }, + { + "epoch": 0.316619105856774, + "grad_norm": 0.6213387250900269, + "learning_rate": 4.8021130588395166e-05, + "loss": 1.4259, + "step": 4660 + }, + { + "epoch": 0.3169588259274358, + "grad_norm": 0.6602832078933716, + "learning_rate": 4.801900733795353e-05, + "loss": 1.3895, + "step": 4665 + }, + { + "epoch": 0.3172985459980976, + "grad_norm": 0.6077116131782532, + "learning_rate": 4.8016884087511894e-05, + "loss": 1.3893, + "step": 4670 + }, + { + "epoch": 0.31763826606875933, + "grad_norm": 0.6477828621864319, + "learning_rate": 4.801476083707026e-05, + "loss": 1.5709, + "step": 4675 + }, + { + "epoch": 0.31797798613942113, + "grad_norm": 0.6097428798675537, + "learning_rate": 4.801263758662862e-05, + "loss": 1.3114, + "step": 4680 + }, + { + "epoch": 0.3183177062100829, + "grad_norm": 0.5983066558837891, + "learning_rate": 4.8010514336186986e-05, + "loss": 1.5342, + "step": 4685 + }, + { + "epoch": 0.31865742628074467, + "grad_norm": 0.5761337280273438, + "learning_rate": 4.800839108574535e-05, + "loss": 1.4056, + "step": 4690 + }, + { + "epoch": 0.31899714635140647, + "grad_norm": 0.5824152827262878, + "learning_rate": 4.8006267835303714e-05, + "loss": 1.3726, + "step": 4695 + }, + { + "epoch": 0.3193368664220682, + "grad_norm": 0.5764522552490234, + "learning_rate": 4.800414458486208e-05, + "loss": 1.3504, + "step": 4700 + }, + { + "epoch": 0.31967658649273, + "grad_norm": 0.6585021615028381, + "learning_rate": 4.8002021334420435e-05, + "loss": 1.3808, + "step": 4705 + }, + { + "epoch": 0.32001630656339175, + "grad_norm": 0.6844765543937683, + "learning_rate": 4.7999898083978806e-05, + "loss": 1.3319, + "step": 4710 + }, + { + "epoch": 0.32035602663405355, + "grad_norm": 0.6688312292098999, + "learning_rate": 4.799777483353717e-05, + "loss": 1.3902, + "step": 4715 + }, + { + "epoch": 0.3206957467047153, + "grad_norm": 0.6278951168060303, + "learning_rate": 4.799565158309553e-05, + "loss": 1.4044, + "step": 4720 + }, + { + "epoch": 0.3210354667753771, + "grad_norm": 0.7092724442481995, + "learning_rate": 4.79935283326539e-05, + "loss": 1.4092, + "step": 4725 + }, + { + "epoch": 0.3213751868460389, + "grad_norm": 0.5436230301856995, + "learning_rate": 4.799140508221226e-05, + "loss": 1.3973, + "step": 4730 + }, + { + "epoch": 0.3217149069167006, + "grad_norm": 0.6359293460845947, + "learning_rate": 4.798928183177062e-05, + "loss": 1.3636, + "step": 4735 + }, + { + "epoch": 0.3220546269873624, + "grad_norm": 0.6571273803710938, + "learning_rate": 4.798715858132899e-05, + "loss": 1.4008, + "step": 4740 + }, + { + "epoch": 0.32239434705802417, + "grad_norm": 0.6457809209823608, + "learning_rate": 4.7985035330887354e-05, + "loss": 1.3815, + "step": 4745 + }, + { + "epoch": 0.32273406712868596, + "grad_norm": 0.7295517325401306, + "learning_rate": 4.798291208044571e-05, + "loss": 1.4018, + "step": 4750 + }, + { + "epoch": 0.32307378719934776, + "grad_norm": 0.598146378993988, + "learning_rate": 4.798078883000408e-05, + "loss": 1.4444, + "step": 4755 + }, + { + "epoch": 0.3234135072700095, + "grad_norm": 0.5251185894012451, + "learning_rate": 4.7978665579562446e-05, + "loss": 1.3763, + "step": 4760 + }, + { + "epoch": 0.3237532273406713, + "grad_norm": 0.5642194747924805, + "learning_rate": 4.79765423291208e-05, + "loss": 1.4491, + "step": 4765 + }, + { + "epoch": 0.32409294741133304, + "grad_norm": 0.6763331890106201, + "learning_rate": 4.7974419078679174e-05, + "loss": 1.393, + "step": 4770 + }, + { + "epoch": 0.32443266748199484, + "grad_norm": 0.6245210766792297, + "learning_rate": 4.797229582823753e-05, + "loss": 1.3899, + "step": 4775 + }, + { + "epoch": 0.32477238755265664, + "grad_norm": 0.6132177114486694, + "learning_rate": 4.7970172577795895e-05, + "loss": 1.4692, + "step": 4780 + }, + { + "epoch": 0.3251121076233184, + "grad_norm": 0.6960394382476807, + "learning_rate": 4.7968049327354266e-05, + "loss": 1.3672, + "step": 4785 + }, + { + "epoch": 0.3254518276939802, + "grad_norm": 0.6208332777023315, + "learning_rate": 4.796592607691262e-05, + "loss": 1.3814, + "step": 4790 + }, + { + "epoch": 0.3257915477646419, + "grad_norm": 0.5507853627204895, + "learning_rate": 4.796380282647099e-05, + "loss": 1.3566, + "step": 4795 + }, + { + "epoch": 0.3261312678353037, + "grad_norm": 0.6600543856620789, + "learning_rate": 4.796167957602936e-05, + "loss": 1.44, + "step": 4800 + }, + { + "epoch": 0.32647098790596546, + "grad_norm": 0.638710081577301, + "learning_rate": 4.7959556325587715e-05, + "loss": 1.3183, + "step": 4805 + }, + { + "epoch": 0.32681070797662726, + "grad_norm": 0.6161012053489685, + "learning_rate": 4.795743307514608e-05, + "loss": 1.4818, + "step": 4810 + }, + { + "epoch": 0.32715042804728905, + "grad_norm": 0.6707820296287537, + "learning_rate": 4.795530982470445e-05, + "loss": 1.397, + "step": 4815 + }, + { + "epoch": 0.3274901481179508, + "grad_norm": 0.5956471562385559, + "learning_rate": 4.795318657426281e-05, + "loss": 1.3902, + "step": 4820 + }, + { + "epoch": 0.3278298681886126, + "grad_norm": 0.4893324375152588, + "learning_rate": 4.795106332382117e-05, + "loss": 1.4835, + "step": 4825 + }, + { + "epoch": 0.32816958825927434, + "grad_norm": 0.6486138701438904, + "learning_rate": 4.794894007337954e-05, + "loss": 1.2775, + "step": 4830 + }, + { + "epoch": 0.32850930832993613, + "grad_norm": 0.6149250864982605, + "learning_rate": 4.79468168229379e-05, + "loss": 1.3381, + "step": 4835 + }, + { + "epoch": 0.32884902840059793, + "grad_norm": 0.6293796300888062, + "learning_rate": 4.794469357249626e-05, + "loss": 1.4361, + "step": 4840 + }, + { + "epoch": 0.3291887484712597, + "grad_norm": 0.5803507566452026, + "learning_rate": 4.7942570322054634e-05, + "loss": 1.4186, + "step": 4845 + }, + { + "epoch": 0.32952846854192147, + "grad_norm": 0.6761592030525208, + "learning_rate": 4.794044707161299e-05, + "loss": 1.371, + "step": 4850 + }, + { + "epoch": 0.3298681886125832, + "grad_norm": 0.7355863451957703, + "learning_rate": 4.7938323821171355e-05, + "loss": 1.2935, + "step": 4855 + }, + { + "epoch": 0.330207908683245, + "grad_norm": 0.7543905377388, + "learning_rate": 4.793620057072972e-05, + "loss": 1.3664, + "step": 4860 + }, + { + "epoch": 0.3305476287539068, + "grad_norm": 0.6539553999900818, + "learning_rate": 4.793407732028808e-05, + "loss": 1.3888, + "step": 4865 + }, + { + "epoch": 0.33088734882456855, + "grad_norm": 0.5584560036659241, + "learning_rate": 4.793195406984645e-05, + "loss": 1.4666, + "step": 4870 + }, + { + "epoch": 0.33122706889523035, + "grad_norm": 0.5838732719421387, + "learning_rate": 4.792983081940481e-05, + "loss": 1.2935, + "step": 4875 + }, + { + "epoch": 0.3315667889658921, + "grad_norm": 0.6014986634254456, + "learning_rate": 4.7927707568963175e-05, + "loss": 1.3325, + "step": 4880 + }, + { + "epoch": 0.3319065090365539, + "grad_norm": 0.571640133857727, + "learning_rate": 4.792558431852154e-05, + "loss": 1.4879, + "step": 4885 + }, + { + "epoch": 0.33224622910721563, + "grad_norm": 0.628282368183136, + "learning_rate": 4.79234610680799e-05, + "loss": 1.3761, + "step": 4890 + }, + { + "epoch": 0.3325859491778774, + "grad_norm": 0.6059534549713135, + "learning_rate": 4.792133781763827e-05, + "loss": 1.3459, + "step": 4895 + }, + { + "epoch": 0.3329256692485392, + "grad_norm": 0.588287353515625, + "learning_rate": 4.791921456719663e-05, + "loss": 1.4798, + "step": 4900 + }, + { + "epoch": 0.33326538931920097, + "grad_norm": 0.5479865670204163, + "learning_rate": 4.7917091316754995e-05, + "loss": 1.3439, + "step": 4905 + }, + { + "epoch": 0.33360510938986276, + "grad_norm": 0.6028078198432922, + "learning_rate": 4.791496806631336e-05, + "loss": 1.3949, + "step": 4910 + }, + { + "epoch": 0.3339448294605245, + "grad_norm": 0.6484723091125488, + "learning_rate": 4.791284481587172e-05, + "loss": 1.5038, + "step": 4915 + }, + { + "epoch": 0.3342845495311863, + "grad_norm": 0.629385769367218, + "learning_rate": 4.791072156543009e-05, + "loss": 1.3792, + "step": 4920 + }, + { + "epoch": 0.3346242696018481, + "grad_norm": 0.7283436059951782, + "learning_rate": 4.790859831498845e-05, + "loss": 1.4609, + "step": 4925 + }, + { + "epoch": 0.33496398967250984, + "grad_norm": 0.6129261255264282, + "learning_rate": 4.7906475064546815e-05, + "loss": 1.5769, + "step": 4930 + }, + { + "epoch": 0.33530370974317164, + "grad_norm": 0.7182016372680664, + "learning_rate": 4.790435181410518e-05, + "loss": 1.413, + "step": 4935 + }, + { + "epoch": 0.3356434298138334, + "grad_norm": 0.5703354477882385, + "learning_rate": 4.790222856366354e-05, + "loss": 1.4701, + "step": 4940 + }, + { + "epoch": 0.3359831498844952, + "grad_norm": 0.5589520335197449, + "learning_rate": 4.790010531322191e-05, + "loss": 1.4668, + "step": 4945 + }, + { + "epoch": 0.336322869955157, + "grad_norm": 0.6707218885421753, + "learning_rate": 4.789798206278027e-05, + "loss": 1.3746, + "step": 4950 + }, + { + "epoch": 0.3366625900258187, + "grad_norm": 0.5754794478416443, + "learning_rate": 4.7895858812338635e-05, + "loss": 1.4444, + "step": 4955 + }, + { + "epoch": 0.3370023100964805, + "grad_norm": 0.5749678611755371, + "learning_rate": 4.7893735561897e-05, + "loss": 1.3014, + "step": 4960 + }, + { + "epoch": 0.33734203016714226, + "grad_norm": 0.5664737224578857, + "learning_rate": 4.789161231145536e-05, + "loss": 1.4334, + "step": 4965 + }, + { + "epoch": 0.33768175023780406, + "grad_norm": 0.632757842540741, + "learning_rate": 4.788948906101373e-05, + "loss": 1.4134, + "step": 4970 + }, + { + "epoch": 0.3380214703084658, + "grad_norm": 0.5536924600601196, + "learning_rate": 4.788736581057209e-05, + "loss": 1.4924, + "step": 4975 + }, + { + "epoch": 0.3383611903791276, + "grad_norm": 0.5794201493263245, + "learning_rate": 4.7885242560130455e-05, + "loss": 1.38, + "step": 4980 + }, + { + "epoch": 0.3387009104497894, + "grad_norm": 0.6089941263198853, + "learning_rate": 4.788311930968882e-05, + "loss": 1.3928, + "step": 4985 + }, + { + "epoch": 0.33904063052045114, + "grad_norm": 0.6675647497177124, + "learning_rate": 4.788099605924718e-05, + "loss": 1.3697, + "step": 4990 + }, + { + "epoch": 0.33938035059111293, + "grad_norm": 0.6070053577423096, + "learning_rate": 4.787887280880555e-05, + "loss": 1.4913, + "step": 4995 + }, + { + "epoch": 0.3397200706617747, + "grad_norm": 0.538594126701355, + "learning_rate": 4.787674955836391e-05, + "loss": 1.3906, + "step": 5000 + }, + { + "epoch": 0.3400597907324365, + "grad_norm": 0.5597215294837952, + "learning_rate": 4.7874626307922275e-05, + "loss": 1.4269, + "step": 5005 + }, + { + "epoch": 0.34039951080309827, + "grad_norm": 0.6286970973014832, + "learning_rate": 4.787250305748064e-05, + "loss": 1.39, + "step": 5010 + }, + { + "epoch": 0.34073923087376, + "grad_norm": 0.5263593792915344, + "learning_rate": 4.7870379807039003e-05, + "loss": 1.356, + "step": 5015 + }, + { + "epoch": 0.3410789509444218, + "grad_norm": 0.5326294898986816, + "learning_rate": 4.786825655659737e-05, + "loss": 1.3172, + "step": 5020 + }, + { + "epoch": 0.34141867101508355, + "grad_norm": 0.6011534929275513, + "learning_rate": 4.786613330615573e-05, + "loss": 1.3477, + "step": 5025 + }, + { + "epoch": 0.34175839108574535, + "grad_norm": 0.6430546641349792, + "learning_rate": 4.7864010055714095e-05, + "loss": 1.3919, + "step": 5030 + }, + { + "epoch": 0.34209811115640715, + "grad_norm": 0.6264625787734985, + "learning_rate": 4.786188680527246e-05, + "loss": 1.3747, + "step": 5035 + }, + { + "epoch": 0.3424378312270689, + "grad_norm": 0.6695388555526733, + "learning_rate": 4.7859763554830823e-05, + "loss": 1.4466, + "step": 5040 + }, + { + "epoch": 0.3427775512977307, + "grad_norm": 0.694168746471405, + "learning_rate": 4.785764030438919e-05, + "loss": 1.4146, + "step": 5045 + }, + { + "epoch": 0.34311727136839243, + "grad_norm": 0.616524338722229, + "learning_rate": 4.785551705394755e-05, + "loss": 1.3124, + "step": 5050 + }, + { + "epoch": 0.3434569914390542, + "grad_norm": 0.5950208902359009, + "learning_rate": 4.7853393803505915e-05, + "loss": 1.4781, + "step": 5055 + }, + { + "epoch": 0.34379671150971597, + "grad_norm": 0.5522637963294983, + "learning_rate": 4.785127055306427e-05, + "loss": 1.2603, + "step": 5060 + }, + { + "epoch": 0.34413643158037777, + "grad_norm": 0.6136479377746582, + "learning_rate": 4.7849147302622643e-05, + "loss": 1.2652, + "step": 5065 + }, + { + "epoch": 0.34447615165103956, + "grad_norm": 0.603760302066803, + "learning_rate": 4.784702405218101e-05, + "loss": 1.4905, + "step": 5070 + }, + { + "epoch": 0.3448158717217013, + "grad_norm": 0.6027374863624573, + "learning_rate": 4.7844900801739365e-05, + "loss": 1.357, + "step": 5075 + }, + { + "epoch": 0.3451555917923631, + "grad_norm": 0.6933466792106628, + "learning_rate": 4.7842777551297735e-05, + "loss": 1.4252, + "step": 5080 + }, + { + "epoch": 0.34549531186302485, + "grad_norm": 0.5912922024726868, + "learning_rate": 4.78406543008561e-05, + "loss": 1.3562, + "step": 5085 + }, + { + "epoch": 0.34583503193368664, + "grad_norm": 0.8580370545387268, + "learning_rate": 4.783853105041446e-05, + "loss": 1.3646, + "step": 5090 + }, + { + "epoch": 0.34617475200434844, + "grad_norm": 0.5268974304199219, + "learning_rate": 4.783640779997283e-05, + "loss": 1.4347, + "step": 5095 + }, + { + "epoch": 0.3465144720750102, + "grad_norm": 0.6514886617660522, + "learning_rate": 4.783428454953119e-05, + "loss": 1.3731, + "step": 5100 + }, + { + "epoch": 0.346854192145672, + "grad_norm": 0.6381573677062988, + "learning_rate": 4.783216129908955e-05, + "loss": 1.4298, + "step": 5105 + }, + { + "epoch": 0.3471939122163337, + "grad_norm": 0.5856737494468689, + "learning_rate": 4.783003804864792e-05, + "loss": 1.3209, + "step": 5110 + }, + { + "epoch": 0.3475336322869955, + "grad_norm": 0.670164167881012, + "learning_rate": 4.7827914798206283e-05, + "loss": 1.3555, + "step": 5115 + }, + { + "epoch": 0.3478733523576573, + "grad_norm": 0.6281108260154724, + "learning_rate": 4.782579154776464e-05, + "loss": 1.4509, + "step": 5120 + }, + { + "epoch": 0.34821307242831906, + "grad_norm": 0.5642656683921814, + "learning_rate": 4.782366829732301e-05, + "loss": 1.5178, + "step": 5125 + }, + { + "epoch": 0.34855279249898086, + "grad_norm": 0.543973445892334, + "learning_rate": 4.782154504688137e-05, + "loss": 1.3711, + "step": 5130 + }, + { + "epoch": 0.3488925125696426, + "grad_norm": 0.5834305882453918, + "learning_rate": 4.781942179643973e-05, + "loss": 1.4175, + "step": 5135 + }, + { + "epoch": 0.3492322326403044, + "grad_norm": 0.5386638641357422, + "learning_rate": 4.7817298545998103e-05, + "loss": 1.3985, + "step": 5140 + }, + { + "epoch": 0.34957195271096614, + "grad_norm": 0.5610645413398743, + "learning_rate": 4.781517529555646e-05, + "loss": 1.3601, + "step": 5145 + }, + { + "epoch": 0.34991167278162794, + "grad_norm": 0.5059294700622559, + "learning_rate": 4.7813052045114825e-05, + "loss": 1.3197, + "step": 5150 + }, + { + "epoch": 0.35025139285228973, + "grad_norm": 0.588741660118103, + "learning_rate": 4.7810928794673196e-05, + "loss": 1.4182, + "step": 5155 + }, + { + "epoch": 0.3505911129229515, + "grad_norm": 0.6061328649520874, + "learning_rate": 4.780880554423155e-05, + "loss": 1.4193, + "step": 5160 + }, + { + "epoch": 0.3509308329936133, + "grad_norm": 0.6105550527572632, + "learning_rate": 4.780668229378992e-05, + "loss": 1.3509, + "step": 5165 + }, + { + "epoch": 0.351270553064275, + "grad_norm": 0.5791309475898743, + "learning_rate": 4.780455904334829e-05, + "loss": 1.4309, + "step": 5170 + }, + { + "epoch": 0.3516102731349368, + "grad_norm": 0.6164483428001404, + "learning_rate": 4.7802435792906645e-05, + "loss": 1.3154, + "step": 5175 + }, + { + "epoch": 0.3519499932055986, + "grad_norm": 0.5944452881813049, + "learning_rate": 4.780031254246501e-05, + "loss": 1.3854, + "step": 5180 + }, + { + "epoch": 0.35228971327626035, + "grad_norm": 0.624857485294342, + "learning_rate": 4.779818929202338e-05, + "loss": 1.386, + "step": 5185 + }, + { + "epoch": 0.35262943334692215, + "grad_norm": 0.6018693447113037, + "learning_rate": 4.779606604158174e-05, + "loss": 1.4586, + "step": 5190 + }, + { + "epoch": 0.3529691534175839, + "grad_norm": 0.6194534301757812, + "learning_rate": 4.77939427911401e-05, + "loss": 1.335, + "step": 5195 + }, + { + "epoch": 0.3533088734882457, + "grad_norm": 0.701894998550415, + "learning_rate": 4.7791819540698465e-05, + "loss": 1.4342, + "step": 5200 + }, + { + "epoch": 0.3536485935589075, + "grad_norm": 0.607342541217804, + "learning_rate": 4.778969629025683e-05, + "loss": 1.3867, + "step": 5205 + }, + { + "epoch": 0.35398831362956923, + "grad_norm": 0.6307671666145325, + "learning_rate": 4.778757303981519e-05, + "loss": 1.3592, + "step": 5210 + }, + { + "epoch": 0.354328033700231, + "grad_norm": 0.6012018322944641, + "learning_rate": 4.778544978937356e-05, + "loss": 1.4269, + "step": 5215 + }, + { + "epoch": 0.35466775377089277, + "grad_norm": 0.5351936221122742, + "learning_rate": 4.778332653893192e-05, + "loss": 1.5335, + "step": 5220 + }, + { + "epoch": 0.35500747384155457, + "grad_norm": 0.6091983914375305, + "learning_rate": 4.7781203288490285e-05, + "loss": 1.3533, + "step": 5225 + }, + { + "epoch": 0.3553471939122163, + "grad_norm": 0.6138412952423096, + "learning_rate": 4.777908003804865e-05, + "loss": 1.4093, + "step": 5230 + }, + { + "epoch": 0.3556869139828781, + "grad_norm": 0.6481094360351562, + "learning_rate": 4.777695678760701e-05, + "loss": 1.434, + "step": 5235 + }, + { + "epoch": 0.3560266340535399, + "grad_norm": 0.5672658085823059, + "learning_rate": 4.777483353716538e-05, + "loss": 1.4019, + "step": 5240 + }, + { + "epoch": 0.35636635412420165, + "grad_norm": 0.6443695425987244, + "learning_rate": 4.777271028672374e-05, + "loss": 1.3913, + "step": 5245 + }, + { + "epoch": 0.35670607419486344, + "grad_norm": 0.6255459189414978, + "learning_rate": 4.7770587036282105e-05, + "loss": 1.5109, + "step": 5250 + }, + { + "epoch": 0.3570457942655252, + "grad_norm": 0.48603206872940063, + "learning_rate": 4.776846378584047e-05, + "loss": 1.361, + "step": 5255 + }, + { + "epoch": 0.357385514336187, + "grad_norm": 0.5364346504211426, + "learning_rate": 4.776634053539883e-05, + "loss": 1.4004, + "step": 5260 + }, + { + "epoch": 0.3577252344068488, + "grad_norm": 0.6334765553474426, + "learning_rate": 4.77642172849572e-05, + "loss": 1.2438, + "step": 5265 + }, + { + "epoch": 0.3580649544775105, + "grad_norm": 0.6408407092094421, + "learning_rate": 4.776209403451556e-05, + "loss": 1.4691, + "step": 5270 + }, + { + "epoch": 0.3584046745481723, + "grad_norm": 0.6182148456573486, + "learning_rate": 4.7759970784073925e-05, + "loss": 1.5037, + "step": 5275 + }, + { + "epoch": 0.35874439461883406, + "grad_norm": 0.5836244821548462, + "learning_rate": 4.775784753363229e-05, + "loss": 1.4265, + "step": 5280 + }, + { + "epoch": 0.35908411468949586, + "grad_norm": 0.6044645309448242, + "learning_rate": 4.775572428319065e-05, + "loss": 1.2618, + "step": 5285 + }, + { + "epoch": 0.35942383476015766, + "grad_norm": 0.61807781457901, + "learning_rate": 4.775360103274902e-05, + "loss": 1.3845, + "step": 5290 + }, + { + "epoch": 0.3597635548308194, + "grad_norm": 0.554994523525238, + "learning_rate": 4.775147778230738e-05, + "loss": 1.4331, + "step": 5295 + }, + { + "epoch": 0.3601032749014812, + "grad_norm": 0.645104706287384, + "learning_rate": 4.7749354531865745e-05, + "loss": 1.3954, + "step": 5300 + }, + { + "epoch": 0.36044299497214294, + "grad_norm": 0.6977760791778564, + "learning_rate": 4.774723128142411e-05, + "loss": 1.4616, + "step": 5305 + }, + { + "epoch": 0.36078271504280474, + "grad_norm": 0.6513646841049194, + "learning_rate": 4.774510803098247e-05, + "loss": 1.3543, + "step": 5310 + }, + { + "epoch": 0.3611224351134665, + "grad_norm": 0.5543293356895447, + "learning_rate": 4.774298478054084e-05, + "loss": 1.4529, + "step": 5315 + }, + { + "epoch": 0.3614621551841283, + "grad_norm": 0.6506486535072327, + "learning_rate": 4.77408615300992e-05, + "loss": 1.3163, + "step": 5320 + }, + { + "epoch": 0.3618018752547901, + "grad_norm": 0.6141875982284546, + "learning_rate": 4.7738738279657565e-05, + "loss": 1.4583, + "step": 5325 + }, + { + "epoch": 0.3621415953254518, + "grad_norm": 0.6069737672805786, + "learning_rate": 4.773661502921593e-05, + "loss": 1.3349, + "step": 5330 + }, + { + "epoch": 0.3624813153961136, + "grad_norm": 0.6170814037322998, + "learning_rate": 4.773449177877429e-05, + "loss": 1.4384, + "step": 5335 + }, + { + "epoch": 0.36282103546677535, + "grad_norm": 0.6338891386985779, + "learning_rate": 4.773236852833266e-05, + "loss": 1.4742, + "step": 5340 + }, + { + "epoch": 0.36316075553743715, + "grad_norm": 0.5873185396194458, + "learning_rate": 4.773024527789102e-05, + "loss": 1.3871, + "step": 5345 + }, + { + "epoch": 0.36350047560809895, + "grad_norm": 0.6128943562507629, + "learning_rate": 4.7728122027449385e-05, + "loss": 1.364, + "step": 5350 + }, + { + "epoch": 0.3638401956787607, + "grad_norm": 0.6030228734016418, + "learning_rate": 4.772599877700775e-05, + "loss": 1.4258, + "step": 5355 + }, + { + "epoch": 0.3641799157494225, + "grad_norm": 0.6233001947402954, + "learning_rate": 4.772387552656611e-05, + "loss": 1.2788, + "step": 5360 + }, + { + "epoch": 0.36451963582008423, + "grad_norm": 0.6130266785621643, + "learning_rate": 4.772175227612448e-05, + "loss": 1.3763, + "step": 5365 + }, + { + "epoch": 0.36485935589074603, + "grad_norm": 0.6724543571472168, + "learning_rate": 4.771962902568284e-05, + "loss": 1.3895, + "step": 5370 + }, + { + "epoch": 0.3651990759614078, + "grad_norm": 0.57244473695755, + "learning_rate": 4.7717505775241205e-05, + "loss": 1.4469, + "step": 5375 + }, + { + "epoch": 0.36553879603206957, + "grad_norm": 0.619618833065033, + "learning_rate": 4.771538252479957e-05, + "loss": 1.413, + "step": 5380 + }, + { + "epoch": 0.36587851610273137, + "grad_norm": 0.5769147276878357, + "learning_rate": 4.771325927435793e-05, + "loss": 1.3105, + "step": 5385 + }, + { + "epoch": 0.3662182361733931, + "grad_norm": 0.5992787480354309, + "learning_rate": 4.77111360239163e-05, + "loss": 1.328, + "step": 5390 + }, + { + "epoch": 0.3665579562440549, + "grad_norm": 0.528085470199585, + "learning_rate": 4.770901277347466e-05, + "loss": 1.3521, + "step": 5395 + }, + { + "epoch": 0.36689767631471665, + "grad_norm": 0.6295778751373291, + "learning_rate": 4.770688952303302e-05, + "loss": 1.4191, + "step": 5400 + }, + { + "epoch": 0.36723739638537845, + "grad_norm": 0.7183342576026917, + "learning_rate": 4.770476627259139e-05, + "loss": 1.3113, + "step": 5405 + }, + { + "epoch": 0.36757711645604024, + "grad_norm": 0.6314020752906799, + "learning_rate": 4.770264302214975e-05, + "loss": 1.3726, + "step": 5410 + }, + { + "epoch": 0.367916836526702, + "grad_norm": 0.6158671975135803, + "learning_rate": 4.770051977170811e-05, + "loss": 1.4362, + "step": 5415 + }, + { + "epoch": 0.3682565565973638, + "grad_norm": 0.5616465210914612, + "learning_rate": 4.769839652126648e-05, + "loss": 1.3334, + "step": 5420 + }, + { + "epoch": 0.3685962766680255, + "grad_norm": 0.585128664970398, + "learning_rate": 4.7696273270824845e-05, + "loss": 1.4112, + "step": 5425 + }, + { + "epoch": 0.3689359967386873, + "grad_norm": 0.6653167009353638, + "learning_rate": 4.76941500203832e-05, + "loss": 1.3972, + "step": 5430 + }, + { + "epoch": 0.3692757168093491, + "grad_norm": 0.6336387395858765, + "learning_rate": 4.769202676994157e-05, + "loss": 1.3333, + "step": 5435 + }, + { + "epoch": 0.36961543688001086, + "grad_norm": 0.5875614881515503, + "learning_rate": 4.768990351949994e-05, + "loss": 1.3727, + "step": 5440 + }, + { + "epoch": 0.36995515695067266, + "grad_norm": 0.650518000125885, + "learning_rate": 4.7687780269058294e-05, + "loss": 1.4229, + "step": 5445 + }, + { + "epoch": 0.3702948770213344, + "grad_norm": 0.6364461779594421, + "learning_rate": 4.7685657018616665e-05, + "loss": 1.365, + "step": 5450 + }, + { + "epoch": 0.3706345970919962, + "grad_norm": 0.5882337689399719, + "learning_rate": 4.768353376817503e-05, + "loss": 1.4432, + "step": 5455 + }, + { + "epoch": 0.370974317162658, + "grad_norm": 0.627549409866333, + "learning_rate": 4.7681410517733386e-05, + "loss": 1.4261, + "step": 5460 + }, + { + "epoch": 0.37131403723331974, + "grad_norm": 0.6792019605636597, + "learning_rate": 4.767928726729176e-05, + "loss": 1.4128, + "step": 5465 + }, + { + "epoch": 0.37165375730398154, + "grad_norm": 0.6191064715385437, + "learning_rate": 4.767716401685012e-05, + "loss": 1.4145, + "step": 5470 + }, + { + "epoch": 0.3719934773746433, + "grad_norm": 0.5747395753860474, + "learning_rate": 4.767504076640848e-05, + "loss": 1.339, + "step": 5475 + }, + { + "epoch": 0.3723331974453051, + "grad_norm": 0.546324610710144, + "learning_rate": 4.767291751596685e-05, + "loss": 1.3541, + "step": 5480 + }, + { + "epoch": 0.3726729175159668, + "grad_norm": 0.6170159578323364, + "learning_rate": 4.7670794265525206e-05, + "loss": 1.4755, + "step": 5485 + }, + { + "epoch": 0.3730126375866286, + "grad_norm": 0.5728192925453186, + "learning_rate": 4.766867101508357e-05, + "loss": 1.388, + "step": 5490 + }, + { + "epoch": 0.3733523576572904, + "grad_norm": 0.6069906949996948, + "learning_rate": 4.766654776464194e-05, + "loss": 1.3868, + "step": 5495 + }, + { + "epoch": 0.37369207772795215, + "grad_norm": 0.6560340523719788, + "learning_rate": 4.76644245142003e-05, + "loss": 1.3731, + "step": 5500 + }, + { + "epoch": 0.37403179779861395, + "grad_norm": 0.5605098605155945, + "learning_rate": 4.766230126375866e-05, + "loss": 1.4622, + "step": 5505 + }, + { + "epoch": 0.3743715178692757, + "grad_norm": 0.5942585468292236, + "learning_rate": 4.766017801331703e-05, + "loss": 1.3181, + "step": 5510 + }, + { + "epoch": 0.3747112379399375, + "grad_norm": 0.6005749702453613, + "learning_rate": 4.765805476287539e-05, + "loss": 1.463, + "step": 5515 + }, + { + "epoch": 0.3750509580105993, + "grad_norm": 0.5901667475700378, + "learning_rate": 4.7655931512433754e-05, + "loss": 1.3492, + "step": 5520 + }, + { + "epoch": 0.37539067808126103, + "grad_norm": 0.6109676361083984, + "learning_rate": 4.7653808261992125e-05, + "loss": 1.3842, + "step": 5525 + }, + { + "epoch": 0.37573039815192283, + "grad_norm": 0.5087992548942566, + "learning_rate": 4.765168501155048e-05, + "loss": 1.4801, + "step": 5530 + }, + { + "epoch": 0.37607011822258457, + "grad_norm": 0.6232153177261353, + "learning_rate": 4.7649561761108846e-05, + "loss": 1.4592, + "step": 5535 + }, + { + "epoch": 0.37640983829324637, + "grad_norm": 0.5623194575309753, + "learning_rate": 4.764743851066722e-05, + "loss": 1.324, + "step": 5540 + }, + { + "epoch": 0.37674955836390817, + "grad_norm": 0.5809667706489563, + "learning_rate": 4.7645315260225574e-05, + "loss": 1.4363, + "step": 5545 + }, + { + "epoch": 0.3770892784345699, + "grad_norm": 0.6279406547546387, + "learning_rate": 4.764319200978394e-05, + "loss": 1.4373, + "step": 5550 + }, + { + "epoch": 0.3774289985052317, + "grad_norm": 0.5912439823150635, + "learning_rate": 4.76410687593423e-05, + "loss": 1.3755, + "step": 5555 + }, + { + "epoch": 0.37776871857589345, + "grad_norm": 0.667495608329773, + "learning_rate": 4.7638945508900666e-05, + "loss": 1.3669, + "step": 5560 + }, + { + "epoch": 0.37810843864655524, + "grad_norm": 0.6271250247955322, + "learning_rate": 4.763682225845903e-05, + "loss": 1.3308, + "step": 5565 + }, + { + "epoch": 0.378448158717217, + "grad_norm": 0.6690099835395813, + "learning_rate": 4.7634699008017394e-05, + "loss": 1.4808, + "step": 5570 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.6592546701431274, + "learning_rate": 4.763257575757576e-05, + "loss": 1.4697, + "step": 5575 + }, + { + "epoch": 0.3791275988585406, + "grad_norm": 0.6392249464988708, + "learning_rate": 4.763045250713412e-05, + "loss": 1.4297, + "step": 5580 + }, + { + "epoch": 0.3794673189292023, + "grad_norm": 0.6348369717597961, + "learning_rate": 4.7628329256692486e-05, + "loss": 1.4131, + "step": 5585 + }, + { + "epoch": 0.3798070389998641, + "grad_norm": 0.5649213790893555, + "learning_rate": 4.762620600625085e-05, + "loss": 1.3963, + "step": 5590 + }, + { + "epoch": 0.38014675907052586, + "grad_norm": 0.5986794233322144, + "learning_rate": 4.7624082755809214e-05, + "loss": 1.3051, + "step": 5595 + }, + { + "epoch": 0.38048647914118766, + "grad_norm": 0.6087474822998047, + "learning_rate": 4.762195950536758e-05, + "loss": 1.5329, + "step": 5600 + }, + { + "epoch": 0.38082619921184946, + "grad_norm": 0.6305631399154663, + "learning_rate": 4.761983625492594e-05, + "loss": 1.4441, + "step": 5605 + }, + { + "epoch": 0.3811659192825112, + "grad_norm": 0.6152334213256836, + "learning_rate": 4.7617713004484306e-05, + "loss": 1.42, + "step": 5610 + }, + { + "epoch": 0.381505639353173, + "grad_norm": 0.5399892926216125, + "learning_rate": 4.761558975404267e-05, + "loss": 1.366, + "step": 5615 + }, + { + "epoch": 0.38184535942383474, + "grad_norm": 0.6757349371910095, + "learning_rate": 4.7613466503601034e-05, + "loss": 1.4201, + "step": 5620 + }, + { + "epoch": 0.38218507949449654, + "grad_norm": 0.6649751663208008, + "learning_rate": 4.76113432531594e-05, + "loss": 1.297, + "step": 5625 + }, + { + "epoch": 0.38252479956515834, + "grad_norm": 0.6459630131721497, + "learning_rate": 4.760922000271776e-05, + "loss": 1.4591, + "step": 5630 + }, + { + "epoch": 0.3828645196358201, + "grad_norm": 0.6597029566764832, + "learning_rate": 4.7607096752276126e-05, + "loss": 1.3731, + "step": 5635 + }, + { + "epoch": 0.3832042397064819, + "grad_norm": 0.610629141330719, + "learning_rate": 4.760497350183449e-05, + "loss": 1.4348, + "step": 5640 + }, + { + "epoch": 0.3835439597771436, + "grad_norm": 0.6628695130348206, + "learning_rate": 4.7602850251392854e-05, + "loss": 1.4356, + "step": 5645 + }, + { + "epoch": 0.3838836798478054, + "grad_norm": 0.5976195931434631, + "learning_rate": 4.760072700095122e-05, + "loss": 1.3779, + "step": 5650 + }, + { + "epoch": 0.38422339991846716, + "grad_norm": 0.5131832361221313, + "learning_rate": 4.759860375050958e-05, + "loss": 1.2832, + "step": 5655 + }, + { + "epoch": 0.38456311998912895, + "grad_norm": 0.5103899836540222, + "learning_rate": 4.7596480500067946e-05, + "loss": 1.3908, + "step": 5660 + }, + { + "epoch": 0.38490284005979075, + "grad_norm": 0.5472924113273621, + "learning_rate": 4.759435724962631e-05, + "loss": 1.2945, + "step": 5665 + }, + { + "epoch": 0.3852425601304525, + "grad_norm": 0.5743058323860168, + "learning_rate": 4.7592233999184674e-05, + "loss": 1.3783, + "step": 5670 + }, + { + "epoch": 0.3855822802011143, + "grad_norm": 0.6685713529586792, + "learning_rate": 4.759011074874304e-05, + "loss": 1.404, + "step": 5675 + }, + { + "epoch": 0.38592200027177603, + "grad_norm": 0.6060322523117065, + "learning_rate": 4.75879874983014e-05, + "loss": 1.3298, + "step": 5680 + }, + { + "epoch": 0.38626172034243783, + "grad_norm": 0.5998486876487732, + "learning_rate": 4.7585864247859766e-05, + "loss": 1.4065, + "step": 5685 + }, + { + "epoch": 0.38660144041309963, + "grad_norm": 0.6170439124107361, + "learning_rate": 4.758374099741813e-05, + "loss": 1.5164, + "step": 5690 + }, + { + "epoch": 0.38694116048376137, + "grad_norm": 0.5666913390159607, + "learning_rate": 4.7581617746976494e-05, + "loss": 1.3716, + "step": 5695 + }, + { + "epoch": 0.38728088055442317, + "grad_norm": 0.6925976872444153, + "learning_rate": 4.757949449653486e-05, + "loss": 1.3122, + "step": 5700 + }, + { + "epoch": 0.3876206006250849, + "grad_norm": 0.5945137739181519, + "learning_rate": 4.757737124609322e-05, + "loss": 1.4064, + "step": 5705 + }, + { + "epoch": 0.3879603206957467, + "grad_norm": 0.644679844379425, + "learning_rate": 4.7575247995651586e-05, + "loss": 1.3737, + "step": 5710 + }, + { + "epoch": 0.3883000407664085, + "grad_norm": 0.5794373750686646, + "learning_rate": 4.757312474520995e-05, + "loss": 1.3891, + "step": 5715 + }, + { + "epoch": 0.38863976083707025, + "grad_norm": 0.6075679659843445, + "learning_rate": 4.7571001494768314e-05, + "loss": 1.3963, + "step": 5720 + }, + { + "epoch": 0.38897948090773204, + "grad_norm": 0.5855366587638855, + "learning_rate": 4.756887824432668e-05, + "loss": 1.3753, + "step": 5725 + }, + { + "epoch": 0.3893192009783938, + "grad_norm": 0.598223090171814, + "learning_rate": 4.756675499388504e-05, + "loss": 1.3687, + "step": 5730 + }, + { + "epoch": 0.3896589210490556, + "grad_norm": 0.6207942366600037, + "learning_rate": 4.7564631743443406e-05, + "loss": 1.3976, + "step": 5735 + }, + { + "epoch": 0.3899986411197173, + "grad_norm": 0.5999290943145752, + "learning_rate": 4.756250849300177e-05, + "loss": 1.4766, + "step": 5740 + }, + { + "epoch": 0.3903383611903791, + "grad_norm": 0.6329728960990906, + "learning_rate": 4.7560385242560134e-05, + "loss": 1.4594, + "step": 5745 + }, + { + "epoch": 0.3906780812610409, + "grad_norm": 0.7645265460014343, + "learning_rate": 4.75582619921185e-05, + "loss": 1.3919, + "step": 5750 + }, + { + "epoch": 0.39101780133170266, + "grad_norm": 0.6652759313583374, + "learning_rate": 4.7556138741676856e-05, + "loss": 1.4261, + "step": 5755 + }, + { + "epoch": 0.39135752140236446, + "grad_norm": 0.6099786162376404, + "learning_rate": 4.7554015491235226e-05, + "loss": 1.3679, + "step": 5760 + }, + { + "epoch": 0.3916972414730262, + "grad_norm": 0.6915930509567261, + "learning_rate": 4.755189224079359e-05, + "loss": 1.3873, + "step": 5765 + }, + { + "epoch": 0.392036961543688, + "grad_norm": 0.5593711137771606, + "learning_rate": 4.754976899035195e-05, + "loss": 1.3472, + "step": 5770 + }, + { + "epoch": 0.3923766816143498, + "grad_norm": 0.5792405605316162, + "learning_rate": 4.754764573991032e-05, + "loss": 1.4887, + "step": 5775 + }, + { + "epoch": 0.39271640168501154, + "grad_norm": 0.5475907325744629, + "learning_rate": 4.754552248946868e-05, + "loss": 1.2726, + "step": 5780 + }, + { + "epoch": 0.39305612175567334, + "grad_norm": 0.593908429145813, + "learning_rate": 4.754339923902704e-05, + "loss": 1.4216, + "step": 5785 + }, + { + "epoch": 0.3933958418263351, + "grad_norm": 0.6711115837097168, + "learning_rate": 4.754127598858541e-05, + "loss": 1.3659, + "step": 5790 + }, + { + "epoch": 0.3937355618969969, + "grad_norm": 0.610977292060852, + "learning_rate": 4.7539152738143774e-05, + "loss": 1.4279, + "step": 5795 + }, + { + "epoch": 0.3940752819676587, + "grad_norm": 0.602996289730072, + "learning_rate": 4.753702948770213e-05, + "loss": 1.3001, + "step": 5800 + }, + { + "epoch": 0.3944150020383204, + "grad_norm": 0.5804190039634705, + "learning_rate": 4.75349062372605e-05, + "loss": 1.4123, + "step": 5805 + }, + { + "epoch": 0.3947547221089822, + "grad_norm": 0.7110206484794617, + "learning_rate": 4.7532782986818866e-05, + "loss": 1.4464, + "step": 5810 + }, + { + "epoch": 0.39509444217964396, + "grad_norm": 0.5763588547706604, + "learning_rate": 4.7530659736377224e-05, + "loss": 1.2695, + "step": 5815 + }, + { + "epoch": 0.39543416225030575, + "grad_norm": 0.5757023096084595, + "learning_rate": 4.7528536485935594e-05, + "loss": 1.4458, + "step": 5820 + }, + { + "epoch": 0.3957738823209675, + "grad_norm": 0.6958516836166382, + "learning_rate": 4.752641323549395e-05, + "loss": 1.4825, + "step": 5825 + }, + { + "epoch": 0.3961136023916293, + "grad_norm": 0.6314806938171387, + "learning_rate": 4.7524289985052316e-05, + "loss": 1.3996, + "step": 5830 + }, + { + "epoch": 0.3964533224622911, + "grad_norm": 0.6348795890808105, + "learning_rate": 4.7522166734610686e-05, + "loss": 1.4085, + "step": 5835 + }, + { + "epoch": 0.39679304253295283, + "grad_norm": 0.6376756429672241, + "learning_rate": 4.7520043484169044e-05, + "loss": 1.3745, + "step": 5840 + }, + { + "epoch": 0.39713276260361463, + "grad_norm": 0.6278203725814819, + "learning_rate": 4.751792023372741e-05, + "loss": 1.5009, + "step": 5845 + }, + { + "epoch": 0.3974724826742764, + "grad_norm": 0.5627636313438416, + "learning_rate": 4.751579698328578e-05, + "loss": 1.3236, + "step": 5850 + }, + { + "epoch": 0.39781220274493817, + "grad_norm": 0.6148563623428345, + "learning_rate": 4.7513673732844136e-05, + "loss": 1.3983, + "step": 5855 + }, + { + "epoch": 0.39815192281559997, + "grad_norm": 0.6606502532958984, + "learning_rate": 4.75115504824025e-05, + "loss": 1.4323, + "step": 5860 + }, + { + "epoch": 0.3984916428862617, + "grad_norm": 0.6654682159423828, + "learning_rate": 4.750942723196087e-05, + "loss": 1.4555, + "step": 5865 + }, + { + "epoch": 0.3988313629569235, + "grad_norm": 0.6830945611000061, + "learning_rate": 4.750730398151923e-05, + "loss": 1.3652, + "step": 5870 + }, + { + "epoch": 0.39917108302758525, + "grad_norm": 0.5865166783332825, + "learning_rate": 4.750518073107759e-05, + "loss": 1.3804, + "step": 5875 + }, + { + "epoch": 0.39951080309824705, + "grad_norm": 0.6635303497314453, + "learning_rate": 4.750305748063596e-05, + "loss": 1.3655, + "step": 5880 + }, + { + "epoch": 0.39985052316890884, + "grad_norm": 0.5407352447509766, + "learning_rate": 4.750093423019432e-05, + "loss": 1.4497, + "step": 5885 + }, + { + "epoch": 0.4001902432395706, + "grad_norm": 0.5308975577354431, + "learning_rate": 4.7498810979752684e-05, + "loss": 1.4051, + "step": 5890 + }, + { + "epoch": 0.4005299633102324, + "grad_norm": 0.7656110525131226, + "learning_rate": 4.7496687729311055e-05, + "loss": 1.2989, + "step": 5895 + }, + { + "epoch": 0.4008696833808941, + "grad_norm": 0.5741212964057922, + "learning_rate": 4.749456447886941e-05, + "loss": 1.327, + "step": 5900 + }, + { + "epoch": 0.4012094034515559, + "grad_norm": 0.6385553479194641, + "learning_rate": 4.7492441228427776e-05, + "loss": 1.3209, + "step": 5905 + }, + { + "epoch": 0.40154912352221767, + "grad_norm": 0.5651522874832153, + "learning_rate": 4.749031797798614e-05, + "loss": 1.3567, + "step": 5910 + }, + { + "epoch": 0.40188884359287946, + "grad_norm": 0.675216019153595, + "learning_rate": 4.7488194727544504e-05, + "loss": 1.4391, + "step": 5915 + }, + { + "epoch": 0.40222856366354126, + "grad_norm": 0.6836060881614685, + "learning_rate": 4.748607147710287e-05, + "loss": 1.4123, + "step": 5920 + }, + { + "epoch": 0.402568283734203, + "grad_norm": 0.5851414799690247, + "learning_rate": 4.748394822666123e-05, + "loss": 1.3718, + "step": 5925 + }, + { + "epoch": 0.4029080038048648, + "grad_norm": 0.6235272884368896, + "learning_rate": 4.7481824976219596e-05, + "loss": 1.4089, + "step": 5930 + }, + { + "epoch": 0.40324772387552654, + "grad_norm": 0.6178550720214844, + "learning_rate": 4.747970172577796e-05, + "loss": 1.4801, + "step": 5935 + }, + { + "epoch": 0.40358744394618834, + "grad_norm": 0.6094355583190918, + "learning_rate": 4.7477578475336324e-05, + "loss": 1.508, + "step": 5940 + }, + { + "epoch": 0.40392716401685014, + "grad_norm": 0.6139050126075745, + "learning_rate": 4.747545522489469e-05, + "loss": 1.2982, + "step": 5945 + }, + { + "epoch": 0.4042668840875119, + "grad_norm": 0.6272382736206055, + "learning_rate": 4.747333197445305e-05, + "loss": 1.4177, + "step": 5950 + }, + { + "epoch": 0.4046066041581737, + "grad_norm": 0.6022531390190125, + "learning_rate": 4.7471208724011416e-05, + "loss": 1.4028, + "step": 5955 + }, + { + "epoch": 0.4049463242288354, + "grad_norm": 0.734524667263031, + "learning_rate": 4.746908547356978e-05, + "loss": 1.4994, + "step": 5960 + }, + { + "epoch": 0.4052860442994972, + "grad_norm": 0.5173041224479675, + "learning_rate": 4.746696222312815e-05, + "loss": 1.3659, + "step": 5965 + }, + { + "epoch": 0.405625764370159, + "grad_norm": 0.5936630964279175, + "learning_rate": 4.746483897268651e-05, + "loss": 1.443, + "step": 5970 + }, + { + "epoch": 0.40596548444082076, + "grad_norm": 0.6323860287666321, + "learning_rate": 4.746271572224487e-05, + "loss": 1.318, + "step": 5975 + }, + { + "epoch": 0.40630520451148255, + "grad_norm": 0.6829825639724731, + "learning_rate": 4.7460592471803236e-05, + "loss": 1.4725, + "step": 5980 + }, + { + "epoch": 0.4066449245821443, + "grad_norm": 0.5354074835777283, + "learning_rate": 4.74584692213616e-05, + "loss": 1.3214, + "step": 5985 + }, + { + "epoch": 0.4069846446528061, + "grad_norm": 0.6558411717414856, + "learning_rate": 4.7456345970919964e-05, + "loss": 1.3154, + "step": 5990 + }, + { + "epoch": 0.40732436472346784, + "grad_norm": 0.5801601409912109, + "learning_rate": 4.745422272047833e-05, + "loss": 1.3867, + "step": 5995 + }, + { + "epoch": 0.40766408479412963, + "grad_norm": 0.5719489455223083, + "learning_rate": 4.745209947003669e-05, + "loss": 1.2751, + "step": 6000 + }, + { + "epoch": 0.40800380486479143, + "grad_norm": 0.5202120542526245, + "learning_rate": 4.7449976219595056e-05, + "loss": 1.3704, + "step": 6005 + }, + { + "epoch": 0.4083435249354532, + "grad_norm": 0.5829079151153564, + "learning_rate": 4.744785296915342e-05, + "loss": 1.3562, + "step": 6010 + }, + { + "epoch": 0.40868324500611497, + "grad_norm": 0.6678234934806824, + "learning_rate": 4.7445729718711784e-05, + "loss": 1.3721, + "step": 6015 + }, + { + "epoch": 0.4090229650767767, + "grad_norm": 0.6354725956916809, + "learning_rate": 4.744360646827015e-05, + "loss": 1.4208, + "step": 6020 + }, + { + "epoch": 0.4093626851474385, + "grad_norm": 0.6208111047744751, + "learning_rate": 4.744148321782851e-05, + "loss": 1.3284, + "step": 6025 + }, + { + "epoch": 0.4097024052181003, + "grad_norm": 0.6854109764099121, + "learning_rate": 4.7439359967386876e-05, + "loss": 1.386, + "step": 6030 + }, + { + "epoch": 0.41004212528876205, + "grad_norm": 0.6217085123062134, + "learning_rate": 4.743723671694524e-05, + "loss": 1.2589, + "step": 6035 + }, + { + "epoch": 0.41038184535942385, + "grad_norm": 0.6386720538139343, + "learning_rate": 4.7435113466503604e-05, + "loss": 1.3122, + "step": 6040 + }, + { + "epoch": 0.4107215654300856, + "grad_norm": 0.630185067653656, + "learning_rate": 4.743299021606197e-05, + "loss": 1.3928, + "step": 6045 + }, + { + "epoch": 0.4110612855007474, + "grad_norm": 0.756481945514679, + "learning_rate": 4.743086696562033e-05, + "loss": 1.3392, + "step": 6050 + }, + { + "epoch": 0.4114010055714092, + "grad_norm": 0.6218680739402771, + "learning_rate": 4.7428743715178696e-05, + "loss": 1.365, + "step": 6055 + }, + { + "epoch": 0.4117407256420709, + "grad_norm": 0.6535066366195679, + "learning_rate": 4.742662046473706e-05, + "loss": 1.3425, + "step": 6060 + }, + { + "epoch": 0.4120804457127327, + "grad_norm": 0.6885753870010376, + "learning_rate": 4.7424497214295424e-05, + "loss": 1.4964, + "step": 6065 + }, + { + "epoch": 0.41242016578339447, + "grad_norm": 0.6080855131149292, + "learning_rate": 4.742237396385379e-05, + "loss": 1.411, + "step": 6070 + }, + { + "epoch": 0.41275988585405626, + "grad_norm": 0.6426699757575989, + "learning_rate": 4.742025071341215e-05, + "loss": 1.2876, + "step": 6075 + }, + { + "epoch": 0.413099605924718, + "grad_norm": 0.6667383909225464, + "learning_rate": 4.7418127462970516e-05, + "loss": 1.3941, + "step": 6080 + }, + { + "epoch": 0.4134393259953798, + "grad_norm": 0.7255922555923462, + "learning_rate": 4.741600421252888e-05, + "loss": 1.4121, + "step": 6085 + }, + { + "epoch": 0.4137790460660416, + "grad_norm": 0.6203028559684753, + "learning_rate": 4.7413880962087244e-05, + "loss": 1.4324, + "step": 6090 + }, + { + "epoch": 0.41411876613670334, + "grad_norm": 0.784636378288269, + "learning_rate": 4.741175771164561e-05, + "loss": 1.3818, + "step": 6095 + }, + { + "epoch": 0.41445848620736514, + "grad_norm": 0.5628244280815125, + "learning_rate": 4.740963446120397e-05, + "loss": 1.3401, + "step": 6100 + }, + { + "epoch": 0.4147982062780269, + "grad_norm": 0.6107689738273621, + "learning_rate": 4.7407511210762336e-05, + "loss": 1.4518, + "step": 6105 + }, + { + "epoch": 0.4151379263486887, + "grad_norm": 0.6618290543556213, + "learning_rate": 4.740538796032069e-05, + "loss": 1.4626, + "step": 6110 + }, + { + "epoch": 0.4154776464193505, + "grad_norm": 0.7461044192314148, + "learning_rate": 4.7403264709879064e-05, + "loss": 1.4806, + "step": 6115 + }, + { + "epoch": 0.4158173664900122, + "grad_norm": 0.6222243309020996, + "learning_rate": 4.740114145943743e-05, + "loss": 1.4335, + "step": 6120 + }, + { + "epoch": 0.416157086560674, + "grad_norm": 0.60787034034729, + "learning_rate": 4.7399018208995785e-05, + "loss": 1.4104, + "step": 6125 + }, + { + "epoch": 0.41649680663133576, + "grad_norm": 0.6704518795013428, + "learning_rate": 4.7396894958554156e-05, + "loss": 1.4775, + "step": 6130 + }, + { + "epoch": 0.41683652670199756, + "grad_norm": 0.589547872543335, + "learning_rate": 4.739477170811252e-05, + "loss": 1.385, + "step": 6135 + }, + { + "epoch": 0.41717624677265935, + "grad_norm": 0.6546774506568909, + "learning_rate": 4.739264845767088e-05, + "loss": 1.3662, + "step": 6140 + }, + { + "epoch": 0.4175159668433211, + "grad_norm": 0.609470546245575, + "learning_rate": 4.739052520722925e-05, + "loss": 1.387, + "step": 6145 + }, + { + "epoch": 0.4178556869139829, + "grad_norm": 0.6994268894195557, + "learning_rate": 4.738840195678761e-05, + "loss": 1.4109, + "step": 6150 + }, + { + "epoch": 0.41819540698464464, + "grad_norm": 0.6468284726142883, + "learning_rate": 4.738627870634597e-05, + "loss": 1.314, + "step": 6155 + }, + { + "epoch": 0.41853512705530643, + "grad_norm": 0.5958006978034973, + "learning_rate": 4.738415545590434e-05, + "loss": 1.3969, + "step": 6160 + }, + { + "epoch": 0.4188748471259682, + "grad_norm": 0.6362236142158508, + "learning_rate": 4.7382032205462704e-05, + "loss": 1.4263, + "step": 6165 + }, + { + "epoch": 0.41921456719663, + "grad_norm": 0.7022944092750549, + "learning_rate": 4.737990895502106e-05, + "loss": 1.3434, + "step": 6170 + }, + { + "epoch": 0.41955428726729177, + "grad_norm": 0.6964726448059082, + "learning_rate": 4.737778570457943e-05, + "loss": 1.3334, + "step": 6175 + }, + { + "epoch": 0.4198940073379535, + "grad_norm": 0.553797721862793, + "learning_rate": 4.737566245413779e-05, + "loss": 1.3971, + "step": 6180 + }, + { + "epoch": 0.4202337274086153, + "grad_norm": 0.6053094267845154, + "learning_rate": 4.737353920369615e-05, + "loss": 1.3486, + "step": 6185 + }, + { + "epoch": 0.42057344747927705, + "grad_norm": 0.6094326972961426, + "learning_rate": 4.7371415953254524e-05, + "loss": 1.4286, + "step": 6190 + }, + { + "epoch": 0.42091316754993885, + "grad_norm": 0.557897686958313, + "learning_rate": 4.736929270281288e-05, + "loss": 1.4682, + "step": 6195 + }, + { + "epoch": 0.42125288762060065, + "grad_norm": 0.6170535683631897, + "learning_rate": 4.7367169452371245e-05, + "loss": 1.3753, + "step": 6200 + }, + { + "epoch": 0.4215926076912624, + "grad_norm": 0.600373387336731, + "learning_rate": 4.7365046201929616e-05, + "loss": 1.3378, + "step": 6205 + }, + { + "epoch": 0.4219323277619242, + "grad_norm": 0.594676673412323, + "learning_rate": 4.736292295148797e-05, + "loss": 1.3615, + "step": 6210 + }, + { + "epoch": 0.42227204783258593, + "grad_norm": 0.6378969550132751, + "learning_rate": 4.736079970104634e-05, + "loss": 1.3388, + "step": 6215 + }, + { + "epoch": 0.4226117679032477, + "grad_norm": 0.5239458680152893, + "learning_rate": 4.735867645060471e-05, + "loss": 1.3513, + "step": 6220 + }, + { + "epoch": 0.4229514879739095, + "grad_norm": 0.6183655261993408, + "learning_rate": 4.7356553200163065e-05, + "loss": 1.4161, + "step": 6225 + }, + { + "epoch": 0.42329120804457127, + "grad_norm": 0.6560379862785339, + "learning_rate": 4.735442994972143e-05, + "loss": 1.505, + "step": 6230 + }, + { + "epoch": 0.42363092811523306, + "grad_norm": 0.7305738925933838, + "learning_rate": 4.73523066992798e-05, + "loss": 1.4154, + "step": 6235 + }, + { + "epoch": 0.4239706481858948, + "grad_norm": 0.6894407868385315, + "learning_rate": 4.735018344883816e-05, + "loss": 1.444, + "step": 6240 + }, + { + "epoch": 0.4243103682565566, + "grad_norm": 0.6726522445678711, + "learning_rate": 4.734806019839652e-05, + "loss": 1.3739, + "step": 6245 + }, + { + "epoch": 0.42465008832721834, + "grad_norm": 0.6662222743034363, + "learning_rate": 4.734593694795489e-05, + "loss": 1.3381, + "step": 6250 + }, + { + "epoch": 0.42498980839788014, + "grad_norm": 0.6684558391571045, + "learning_rate": 4.734381369751325e-05, + "loss": 1.3474, + "step": 6255 + }, + { + "epoch": 0.42532952846854194, + "grad_norm": 0.6196736693382263, + "learning_rate": 4.734169044707161e-05, + "loss": 1.3213, + "step": 6260 + }, + { + "epoch": 0.4256692485392037, + "grad_norm": 0.6730858683586121, + "learning_rate": 4.733956719662998e-05, + "loss": 1.3643, + "step": 6265 + }, + { + "epoch": 0.4260089686098655, + "grad_norm": 0.8188268542289734, + "learning_rate": 4.733744394618834e-05, + "loss": 1.4328, + "step": 6270 + }, + { + "epoch": 0.4263486886805272, + "grad_norm": 0.6468964219093323, + "learning_rate": 4.7335320695746705e-05, + "loss": 1.3889, + "step": 6275 + }, + { + "epoch": 0.426688408751189, + "grad_norm": 0.6821523308753967, + "learning_rate": 4.733319744530507e-05, + "loss": 1.4177, + "step": 6280 + }, + { + "epoch": 0.4270281288218508, + "grad_norm": 0.6204740405082703, + "learning_rate": 4.733107419486343e-05, + "loss": 1.4181, + "step": 6285 + }, + { + "epoch": 0.42736784889251256, + "grad_norm": 0.688260018825531, + "learning_rate": 4.73289509444218e-05, + "loss": 1.3448, + "step": 6290 + }, + { + "epoch": 0.42770756896317436, + "grad_norm": 0.5745126008987427, + "learning_rate": 4.732682769398016e-05, + "loss": 1.4101, + "step": 6295 + }, + { + "epoch": 0.4280472890338361, + "grad_norm": 0.7786775827407837, + "learning_rate": 4.7324704443538525e-05, + "loss": 1.514, + "step": 6300 + }, + { + "epoch": 0.4283870091044979, + "grad_norm": 0.6279415488243103, + "learning_rate": 4.7322581193096896e-05, + "loss": 1.3362, + "step": 6305 + }, + { + "epoch": 0.4287267291751597, + "grad_norm": 0.5530821084976196, + "learning_rate": 4.732045794265525e-05, + "loss": 1.3756, + "step": 6310 + }, + { + "epoch": 0.42906644924582144, + "grad_norm": 0.7006688714027405, + "learning_rate": 4.731833469221362e-05, + "loss": 1.4522, + "step": 6315 + }, + { + "epoch": 0.42940616931648323, + "grad_norm": 0.6122555136680603, + "learning_rate": 4.731621144177199e-05, + "loss": 1.3682, + "step": 6320 + }, + { + "epoch": 0.429745889387145, + "grad_norm": 0.618565559387207, + "learning_rate": 4.7314088191330345e-05, + "loss": 1.3329, + "step": 6325 + }, + { + "epoch": 0.4300856094578068, + "grad_norm": 0.6874955296516418, + "learning_rate": 4.731196494088871e-05, + "loss": 1.3354, + "step": 6330 + }, + { + "epoch": 0.4304253295284685, + "grad_norm": 0.6150823831558228, + "learning_rate": 4.730984169044707e-05, + "loss": 1.4378, + "step": 6335 + }, + { + "epoch": 0.4307650495991303, + "grad_norm": 0.6561357975006104, + "learning_rate": 4.730771844000544e-05, + "loss": 1.3912, + "step": 6340 + }, + { + "epoch": 0.4311047696697921, + "grad_norm": 0.579881489276886, + "learning_rate": 4.73055951895638e-05, + "loss": 1.4251, + "step": 6345 + }, + { + "epoch": 0.43144448974045385, + "grad_norm": 0.6598808765411377, + "learning_rate": 4.7303471939122165e-05, + "loss": 1.4157, + "step": 6350 + }, + { + "epoch": 0.43178420981111565, + "grad_norm": 0.6837253570556641, + "learning_rate": 4.730134868868053e-05, + "loss": 1.401, + "step": 6355 + }, + { + "epoch": 0.4321239298817774, + "grad_norm": 0.6009359359741211, + "learning_rate": 4.729922543823889e-05, + "loss": 1.3655, + "step": 6360 + }, + { + "epoch": 0.4324636499524392, + "grad_norm": 0.6403203010559082, + "learning_rate": 4.729710218779726e-05, + "loss": 1.4184, + "step": 6365 + }, + { + "epoch": 0.432803370023101, + "grad_norm": 0.6336822509765625, + "learning_rate": 4.729497893735562e-05, + "loss": 1.3008, + "step": 6370 + }, + { + "epoch": 0.43314309009376273, + "grad_norm": 0.6576248407363892, + "learning_rate": 4.7292855686913985e-05, + "loss": 1.4843, + "step": 6375 + }, + { + "epoch": 0.4334828101644245, + "grad_norm": 0.5695661306381226, + "learning_rate": 4.729073243647235e-05, + "loss": 1.4633, + "step": 6380 + }, + { + "epoch": 0.43382253023508627, + "grad_norm": 0.678441047668457, + "learning_rate": 4.728860918603071e-05, + "loss": 1.4119, + "step": 6385 + }, + { + "epoch": 0.43416225030574807, + "grad_norm": 0.6367850303649902, + "learning_rate": 4.728648593558908e-05, + "loss": 1.3968, + "step": 6390 + }, + { + "epoch": 0.43450197037640986, + "grad_norm": 0.6021522283554077, + "learning_rate": 4.728436268514744e-05, + "loss": 1.4134, + "step": 6395 + }, + { + "epoch": 0.4348416904470716, + "grad_norm": 0.592147707939148, + "learning_rate": 4.7282239434705805e-05, + "loss": 1.3061, + "step": 6400 + }, + { + "epoch": 0.4351814105177334, + "grad_norm": 0.6148473620414734, + "learning_rate": 4.728011618426417e-05, + "loss": 1.4722, + "step": 6405 + }, + { + "epoch": 0.43552113058839514, + "grad_norm": 0.5737019777297974, + "learning_rate": 4.727799293382253e-05, + "loss": 1.3992, + "step": 6410 + }, + { + "epoch": 0.43586085065905694, + "grad_norm": 0.6439921259880066, + "learning_rate": 4.72758696833809e-05, + "loss": 1.3698, + "step": 6415 + }, + { + "epoch": 0.4362005707297187, + "grad_norm": 0.553864598274231, + "learning_rate": 4.727374643293926e-05, + "loss": 1.291, + "step": 6420 + }, + { + "epoch": 0.4365402908003805, + "grad_norm": 0.5933894515037537, + "learning_rate": 4.7271623182497625e-05, + "loss": 1.3912, + "step": 6425 + }, + { + "epoch": 0.4368800108710423, + "grad_norm": 0.6752405762672424, + "learning_rate": 4.726949993205599e-05, + "loss": 1.4306, + "step": 6430 + }, + { + "epoch": 0.437219730941704, + "grad_norm": 0.6308922171592712, + "learning_rate": 4.7267376681614353e-05, + "loss": 1.3051, + "step": 6435 + }, + { + "epoch": 0.4375594510123658, + "grad_norm": 0.6437475681304932, + "learning_rate": 4.726525343117272e-05, + "loss": 1.3897, + "step": 6440 + }, + { + "epoch": 0.43789917108302756, + "grad_norm": 0.7018872499465942, + "learning_rate": 4.726313018073108e-05, + "loss": 1.3172, + "step": 6445 + }, + { + "epoch": 0.43823889115368936, + "grad_norm": 0.662291944026947, + "learning_rate": 4.7261006930289445e-05, + "loss": 1.4606, + "step": 6450 + }, + { + "epoch": 0.43857861122435116, + "grad_norm": 0.5564162731170654, + "learning_rate": 4.725888367984781e-05, + "loss": 1.4022, + "step": 6455 + }, + { + "epoch": 0.4389183312950129, + "grad_norm": 0.677634060382843, + "learning_rate": 4.7256760429406173e-05, + "loss": 1.3791, + "step": 6460 + }, + { + "epoch": 0.4392580513656747, + "grad_norm": 0.5525080561637878, + "learning_rate": 4.725463717896453e-05, + "loss": 1.4171, + "step": 6465 + }, + { + "epoch": 0.43959777143633644, + "grad_norm": 0.5328442454338074, + "learning_rate": 4.72525139285229e-05, + "loss": 1.5618, + "step": 6470 + }, + { + "epoch": 0.43993749150699824, + "grad_norm": 0.6505507826805115, + "learning_rate": 4.7250390678081265e-05, + "loss": 1.3914, + "step": 6475 + }, + { + "epoch": 0.44027721157766003, + "grad_norm": 0.6275343894958496, + "learning_rate": 4.724826742763962e-05, + "loss": 1.4201, + "step": 6480 + }, + { + "epoch": 0.4406169316483218, + "grad_norm": 0.670893669128418, + "learning_rate": 4.7246144177197993e-05, + "loss": 1.426, + "step": 6485 + }, + { + "epoch": 0.44095665171898357, + "grad_norm": 0.6744541525840759, + "learning_rate": 4.724402092675636e-05, + "loss": 1.4765, + "step": 6490 + }, + { + "epoch": 0.4412963717896453, + "grad_norm": 0.620466411113739, + "learning_rate": 4.7241897676314715e-05, + "loss": 1.3884, + "step": 6495 + }, + { + "epoch": 0.4416360918603071, + "grad_norm": 0.6874172687530518, + "learning_rate": 4.7239774425873085e-05, + "loss": 1.3943, + "step": 6500 + }, + { + "epoch": 0.4419758119309689, + "grad_norm": 0.6267426013946533, + "learning_rate": 4.723765117543145e-05, + "loss": 1.3539, + "step": 6505 + }, + { + "epoch": 0.44231553200163065, + "grad_norm": 0.5028911828994751, + "learning_rate": 4.723552792498981e-05, + "loss": 1.4493, + "step": 6510 + }, + { + "epoch": 0.44265525207229245, + "grad_norm": 0.5759350061416626, + "learning_rate": 4.723340467454818e-05, + "loss": 1.3223, + "step": 6515 + }, + { + "epoch": 0.4429949721429542, + "grad_norm": 0.5979635119438171, + "learning_rate": 4.723128142410654e-05, + "loss": 1.3262, + "step": 6520 + }, + { + "epoch": 0.443334692213616, + "grad_norm": 0.6062570810317993, + "learning_rate": 4.72291581736649e-05, + "loss": 1.3175, + "step": 6525 + }, + { + "epoch": 0.44367441228427773, + "grad_norm": 0.6158884167671204, + "learning_rate": 4.722703492322327e-05, + "loss": 1.311, + "step": 6530 + }, + { + "epoch": 0.44401413235493953, + "grad_norm": 0.7551906108856201, + "learning_rate": 4.722491167278163e-05, + "loss": 1.3355, + "step": 6535 + }, + { + "epoch": 0.4443538524256013, + "grad_norm": 0.6330267190933228, + "learning_rate": 4.722278842233999e-05, + "loss": 1.3874, + "step": 6540 + }, + { + "epoch": 0.44469357249626307, + "grad_norm": 0.5953761339187622, + "learning_rate": 4.722066517189836e-05, + "loss": 1.4243, + "step": 6545 + }, + { + "epoch": 0.44503329256692487, + "grad_norm": 0.6187408566474915, + "learning_rate": 4.721854192145672e-05, + "loss": 1.3054, + "step": 6550 + }, + { + "epoch": 0.4453730126375866, + "grad_norm": 0.607340395450592, + "learning_rate": 4.721641867101508e-05, + "loss": 1.3926, + "step": 6555 + }, + { + "epoch": 0.4457127327082484, + "grad_norm": 0.5794751644134521, + "learning_rate": 4.7214295420573453e-05, + "loss": 1.3777, + "step": 6560 + }, + { + "epoch": 0.4460524527789102, + "grad_norm": 0.6140636205673218, + "learning_rate": 4.721217217013181e-05, + "loss": 1.3956, + "step": 6565 + }, + { + "epoch": 0.44639217284957194, + "grad_norm": 0.5925490856170654, + "learning_rate": 4.7210048919690175e-05, + "loss": 1.3495, + "step": 6570 + }, + { + "epoch": 0.44673189292023374, + "grad_norm": 0.6611554026603699, + "learning_rate": 4.7207925669248545e-05, + "loss": 1.43, + "step": 6575 + }, + { + "epoch": 0.4470716129908955, + "grad_norm": 0.588813841342926, + "learning_rate": 4.72058024188069e-05, + "loss": 1.4257, + "step": 6580 + }, + { + "epoch": 0.4474113330615573, + "grad_norm": 0.6727645397186279, + "learning_rate": 4.720367916836527e-05, + "loss": 1.3628, + "step": 6585 + }, + { + "epoch": 0.4477510531322191, + "grad_norm": 0.6347823739051819, + "learning_rate": 4.720155591792364e-05, + "loss": 1.387, + "step": 6590 + }, + { + "epoch": 0.4480907732028808, + "grad_norm": 0.624548077583313, + "learning_rate": 4.7199432667481995e-05, + "loss": 1.4544, + "step": 6595 + }, + { + "epoch": 0.4484304932735426, + "grad_norm": 0.6414671540260315, + "learning_rate": 4.719730941704036e-05, + "loss": 1.3541, + "step": 6600 + }, + { + "epoch": 0.44877021334420436, + "grad_norm": 0.6082646250724792, + "learning_rate": 4.719518616659872e-05, + "loss": 1.394, + "step": 6605 + }, + { + "epoch": 0.44910993341486616, + "grad_norm": 0.6644629836082458, + "learning_rate": 4.719306291615709e-05, + "loss": 1.5086, + "step": 6610 + }, + { + "epoch": 0.4494496534855279, + "grad_norm": 0.6724450588226318, + "learning_rate": 4.719093966571545e-05, + "loss": 1.3885, + "step": 6615 + }, + { + "epoch": 0.4497893735561897, + "grad_norm": 0.5682170987129211, + "learning_rate": 4.7188816415273815e-05, + "loss": 1.5027, + "step": 6620 + }, + { + "epoch": 0.4501290936268515, + "grad_norm": 0.6639217138290405, + "learning_rate": 4.718669316483218e-05, + "loss": 1.3845, + "step": 6625 + }, + { + "epoch": 0.45046881369751324, + "grad_norm": 0.6022807955741882, + "learning_rate": 4.718456991439054e-05, + "loss": 1.331, + "step": 6630 + }, + { + "epoch": 0.45080853376817503, + "grad_norm": 0.6910311579704285, + "learning_rate": 4.718244666394891e-05, + "loss": 1.4313, + "step": 6635 + }, + { + "epoch": 0.4511482538388368, + "grad_norm": 0.5989499688148499, + "learning_rate": 4.718032341350727e-05, + "loss": 1.4147, + "step": 6640 + }, + { + "epoch": 0.4514879739094986, + "grad_norm": 0.6600388288497925, + "learning_rate": 4.717820016306564e-05, + "loss": 1.376, + "step": 6645 + }, + { + "epoch": 0.45182769398016037, + "grad_norm": 0.5953102111816406, + "learning_rate": 4.7176076912624e-05, + "loss": 1.2933, + "step": 6650 + }, + { + "epoch": 0.4521674140508221, + "grad_norm": 0.6237321496009827, + "learning_rate": 4.717395366218236e-05, + "loss": 1.3621, + "step": 6655 + }, + { + "epoch": 0.4525071341214839, + "grad_norm": 0.6195266246795654, + "learning_rate": 4.7171830411740734e-05, + "loss": 1.4022, + "step": 6660 + }, + { + "epoch": 0.45284685419214565, + "grad_norm": 0.6216328740119934, + "learning_rate": 4.716970716129909e-05, + "loss": 1.2973, + "step": 6665 + }, + { + "epoch": 0.45318657426280745, + "grad_norm": 0.5761063694953918, + "learning_rate": 4.7167583910857455e-05, + "loss": 1.3438, + "step": 6670 + }, + { + "epoch": 0.45352629433346925, + "grad_norm": 0.5767399668693542, + "learning_rate": 4.7165460660415826e-05, + "loss": 1.4085, + "step": 6675 + }, + { + "epoch": 0.453866014404131, + "grad_norm": 0.7233198285102844, + "learning_rate": 4.716333740997418e-05, + "loss": 1.4204, + "step": 6680 + }, + { + "epoch": 0.4542057344747928, + "grad_norm": 0.5569356679916382, + "learning_rate": 4.716121415953255e-05, + "loss": 1.3326, + "step": 6685 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.6534759998321533, + "learning_rate": 4.715909090909091e-05, + "loss": 1.2859, + "step": 6690 + }, + { + "epoch": 0.45488517461611633, + "grad_norm": 0.7906098365783691, + "learning_rate": 4.7156967658649275e-05, + "loss": 1.4885, + "step": 6695 + }, + { + "epoch": 0.45522489468677807, + "grad_norm": 0.6368966102600098, + "learning_rate": 4.715484440820764e-05, + "loss": 1.2917, + "step": 6700 + }, + { + "epoch": 0.45556461475743987, + "grad_norm": 0.6911798119544983, + "learning_rate": 4.7152721157766e-05, + "loss": 1.3531, + "step": 6705 + }, + { + "epoch": 0.45590433482810166, + "grad_norm": 0.6642591953277588, + "learning_rate": 4.715059790732437e-05, + "loss": 1.4106, + "step": 6710 + }, + { + "epoch": 0.4562440548987634, + "grad_norm": 0.6652267575263977, + "learning_rate": 4.714847465688273e-05, + "loss": 1.3623, + "step": 6715 + }, + { + "epoch": 0.4565837749694252, + "grad_norm": 0.7334158420562744, + "learning_rate": 4.7146351406441095e-05, + "loss": 1.4278, + "step": 6720 + }, + { + "epoch": 0.45692349504008695, + "grad_norm": 0.6598769426345825, + "learning_rate": 4.714422815599946e-05, + "loss": 1.3, + "step": 6725 + }, + { + "epoch": 0.45726321511074874, + "grad_norm": 0.6216796636581421, + "learning_rate": 4.714210490555782e-05, + "loss": 1.4718, + "step": 6730 + }, + { + "epoch": 0.45760293518141054, + "grad_norm": 0.7299495339393616, + "learning_rate": 4.713998165511619e-05, + "loss": 1.3517, + "step": 6735 + }, + { + "epoch": 0.4579426552520723, + "grad_norm": 0.6052029132843018, + "learning_rate": 4.713785840467455e-05, + "loss": 1.3797, + "step": 6740 + }, + { + "epoch": 0.4582823753227341, + "grad_norm": 0.7223475575447083, + "learning_rate": 4.7135735154232915e-05, + "loss": 1.4449, + "step": 6745 + }, + { + "epoch": 0.4586220953933958, + "grad_norm": 0.6296783685684204, + "learning_rate": 4.713361190379128e-05, + "loss": 1.3496, + "step": 6750 + }, + { + "epoch": 0.4589618154640576, + "grad_norm": 0.7194005846977234, + "learning_rate": 4.713148865334964e-05, + "loss": 1.5065, + "step": 6755 + }, + { + "epoch": 0.4593015355347194, + "grad_norm": 0.579858124256134, + "learning_rate": 4.712936540290801e-05, + "loss": 1.3154, + "step": 6760 + }, + { + "epoch": 0.45964125560538116, + "grad_norm": 0.6932454705238342, + "learning_rate": 4.712724215246637e-05, + "loss": 1.3644, + "step": 6765 + }, + { + "epoch": 0.45998097567604296, + "grad_norm": 0.6134516596794128, + "learning_rate": 4.7125118902024735e-05, + "loss": 1.4632, + "step": 6770 + }, + { + "epoch": 0.4603206957467047, + "grad_norm": 0.647254467010498, + "learning_rate": 4.71229956515831e-05, + "loss": 1.4343, + "step": 6775 + }, + { + "epoch": 0.4606604158173665, + "grad_norm": 0.6321381330490112, + "learning_rate": 4.712087240114146e-05, + "loss": 1.4337, + "step": 6780 + }, + { + "epoch": 0.46100013588802824, + "grad_norm": 0.6225603222846985, + "learning_rate": 4.711874915069983e-05, + "loss": 1.3577, + "step": 6785 + }, + { + "epoch": 0.46133985595869004, + "grad_norm": 0.6371875405311584, + "learning_rate": 4.711662590025819e-05, + "loss": 1.4572, + "step": 6790 + }, + { + "epoch": 0.46167957602935183, + "grad_norm": 0.5947543382644653, + "learning_rate": 4.7114502649816555e-05, + "loss": 1.3498, + "step": 6795 + }, + { + "epoch": 0.4620192961000136, + "grad_norm": 0.6039636731147766, + "learning_rate": 4.711237939937492e-05, + "loss": 1.4372, + "step": 6800 + }, + { + "epoch": 0.4623590161706754, + "grad_norm": 0.6076428294181824, + "learning_rate": 4.7110256148933276e-05, + "loss": 1.3423, + "step": 6805 + }, + { + "epoch": 0.4626987362413371, + "grad_norm": 0.623203456401825, + "learning_rate": 4.710813289849165e-05, + "loss": 1.3663, + "step": 6810 + }, + { + "epoch": 0.4630384563119989, + "grad_norm": 0.5835606455802917, + "learning_rate": 4.710600964805001e-05, + "loss": 1.3292, + "step": 6815 + }, + { + "epoch": 0.4633781763826607, + "grad_norm": 0.6508517861366272, + "learning_rate": 4.710388639760837e-05, + "loss": 1.3595, + "step": 6820 + }, + { + "epoch": 0.46371789645332245, + "grad_norm": 0.5497671961784363, + "learning_rate": 4.710176314716674e-05, + "loss": 1.3624, + "step": 6825 + }, + { + "epoch": 0.46405761652398425, + "grad_norm": 0.6289246082305908, + "learning_rate": 4.70996398967251e-05, + "loss": 1.3744, + "step": 6830 + }, + { + "epoch": 0.464397336594646, + "grad_norm": 0.7137528657913208, + "learning_rate": 4.709751664628346e-05, + "loss": 1.3934, + "step": 6835 + }, + { + "epoch": 0.4647370566653078, + "grad_norm": 0.638431191444397, + "learning_rate": 4.709539339584183e-05, + "loss": 1.4849, + "step": 6840 + }, + { + "epoch": 0.4650767767359696, + "grad_norm": 0.5792069435119629, + "learning_rate": 4.7093270145400195e-05, + "loss": 1.3843, + "step": 6845 + }, + { + "epoch": 0.46541649680663133, + "grad_norm": 0.7117582559585571, + "learning_rate": 4.709114689495855e-05, + "loss": 1.5094, + "step": 6850 + }, + { + "epoch": 0.4657562168772931, + "grad_norm": 0.5959650278091431, + "learning_rate": 4.708902364451692e-05, + "loss": 1.3869, + "step": 6855 + }, + { + "epoch": 0.46609593694795487, + "grad_norm": 0.6324079632759094, + "learning_rate": 4.708690039407529e-05, + "loss": 1.3351, + "step": 6860 + }, + { + "epoch": 0.46643565701861667, + "grad_norm": 0.6711081266403198, + "learning_rate": 4.7084777143633644e-05, + "loss": 1.4543, + "step": 6865 + }, + { + "epoch": 0.4667753770892784, + "grad_norm": 0.6800785660743713, + "learning_rate": 4.7082653893192015e-05, + "loss": 1.302, + "step": 6870 + }, + { + "epoch": 0.4671150971599402, + "grad_norm": 0.6625350117683411, + "learning_rate": 4.708053064275038e-05, + "loss": 1.2314, + "step": 6875 + }, + { + "epoch": 0.467454817230602, + "grad_norm": 0.7001163959503174, + "learning_rate": 4.7078407392308736e-05, + "loss": 1.4239, + "step": 6880 + }, + { + "epoch": 0.46779453730126375, + "grad_norm": 0.6386942267417908, + "learning_rate": 4.707628414186711e-05, + "loss": 1.4019, + "step": 6885 + }, + { + "epoch": 0.46813425737192554, + "grad_norm": 0.6393757462501526, + "learning_rate": 4.7074160891425464e-05, + "loss": 1.4489, + "step": 6890 + }, + { + "epoch": 0.4684739774425873, + "grad_norm": 0.6970923542976379, + "learning_rate": 4.707203764098383e-05, + "loss": 1.368, + "step": 6895 + }, + { + "epoch": 0.4688136975132491, + "grad_norm": 0.599149227142334, + "learning_rate": 4.70699143905422e-05, + "loss": 1.4188, + "step": 6900 + }, + { + "epoch": 0.4691534175839109, + "grad_norm": 0.6489973664283752, + "learning_rate": 4.7067791140100556e-05, + "loss": 1.3446, + "step": 6905 + }, + { + "epoch": 0.4694931376545726, + "grad_norm": 0.6070247888565063, + "learning_rate": 4.706566788965892e-05, + "loss": 1.3729, + "step": 6910 + }, + { + "epoch": 0.4698328577252344, + "grad_norm": 0.6290463805198669, + "learning_rate": 4.706354463921729e-05, + "loss": 1.4297, + "step": 6915 + }, + { + "epoch": 0.47017257779589616, + "grad_norm": 0.6896281242370605, + "learning_rate": 4.706142138877565e-05, + "loss": 1.3642, + "step": 6920 + }, + { + "epoch": 0.47051229786655796, + "grad_norm": 0.6045809984207153, + "learning_rate": 4.705929813833401e-05, + "loss": 1.4051, + "step": 6925 + }, + { + "epoch": 0.47085201793721976, + "grad_norm": 0.5476675033569336, + "learning_rate": 4.705717488789238e-05, + "loss": 1.3467, + "step": 6930 + }, + { + "epoch": 0.4711917380078815, + "grad_norm": 0.6183815598487854, + "learning_rate": 4.705505163745074e-05, + "loss": 1.4159, + "step": 6935 + }, + { + "epoch": 0.4715314580785433, + "grad_norm": 0.6676488518714905, + "learning_rate": 4.7052928387009104e-05, + "loss": 1.4024, + "step": 6940 + }, + { + "epoch": 0.47187117814920504, + "grad_norm": 0.6127513647079468, + "learning_rate": 4.7050805136567475e-05, + "loss": 1.4613, + "step": 6945 + }, + { + "epoch": 0.47221089821986684, + "grad_norm": 0.6065813899040222, + "learning_rate": 4.704868188612583e-05, + "loss": 1.2956, + "step": 6950 + }, + { + "epoch": 0.4725506182905286, + "grad_norm": 0.6002477407455444, + "learning_rate": 4.7046558635684196e-05, + "loss": 1.5058, + "step": 6955 + }, + { + "epoch": 0.4728903383611904, + "grad_norm": 0.7777348756790161, + "learning_rate": 4.704443538524256e-05, + "loss": 1.4562, + "step": 6960 + }, + { + "epoch": 0.4732300584318522, + "grad_norm": 0.6998386979103088, + "learning_rate": 4.7042312134800924e-05, + "loss": 1.4011, + "step": 6965 + }, + { + "epoch": 0.4735697785025139, + "grad_norm": 0.5755128264427185, + "learning_rate": 4.704018888435929e-05, + "loss": 1.4089, + "step": 6970 + }, + { + "epoch": 0.4739094985731757, + "grad_norm": 0.5363697409629822, + "learning_rate": 4.703806563391765e-05, + "loss": 1.4084, + "step": 6975 + }, + { + "epoch": 0.47424921864383746, + "grad_norm": 0.6444015502929688, + "learning_rate": 4.7035942383476016e-05, + "loss": 1.3412, + "step": 6980 + }, + { + "epoch": 0.47458893871449925, + "grad_norm": 0.6073984503746033, + "learning_rate": 4.703381913303439e-05, + "loss": 1.3084, + "step": 6985 + }, + { + "epoch": 0.47492865878516105, + "grad_norm": 0.6416052579879761, + "learning_rate": 4.7031695882592744e-05, + "loss": 1.393, + "step": 6990 + }, + { + "epoch": 0.4752683788558228, + "grad_norm": 0.7063747644424438, + "learning_rate": 4.702957263215111e-05, + "loss": 1.3068, + "step": 6995 + }, + { + "epoch": 0.4756080989264846, + "grad_norm": 0.5851465463638306, + "learning_rate": 4.702744938170948e-05, + "loss": 1.3124, + "step": 7000 + }, + { + "epoch": 0.47594781899714633, + "grad_norm": 0.625334620475769, + "learning_rate": 4.7025326131267836e-05, + "loss": 1.3969, + "step": 7005 + }, + { + "epoch": 0.47628753906780813, + "grad_norm": 0.7166358828544617, + "learning_rate": 4.70232028808262e-05, + "loss": 1.4506, + "step": 7010 + }, + { + "epoch": 0.4766272591384699, + "grad_norm": 0.663744330406189, + "learning_rate": 4.702107963038457e-05, + "loss": 1.4552, + "step": 7015 + }, + { + "epoch": 0.47696697920913167, + "grad_norm": 0.6093037724494934, + "learning_rate": 4.701895637994293e-05, + "loss": 1.3591, + "step": 7020 + }, + { + "epoch": 0.47730669927979347, + "grad_norm": 0.5953529477119446, + "learning_rate": 4.701683312950129e-05, + "loss": 1.2685, + "step": 7025 + }, + { + "epoch": 0.4776464193504552, + "grad_norm": 0.7081599235534668, + "learning_rate": 4.7014709879059656e-05, + "loss": 1.4126, + "step": 7030 + }, + { + "epoch": 0.477986139421117, + "grad_norm": 0.6899991035461426, + "learning_rate": 4.701258662861802e-05, + "loss": 1.3675, + "step": 7035 + }, + { + "epoch": 0.47832585949177875, + "grad_norm": 0.6287958025932312, + "learning_rate": 4.7010463378176384e-05, + "loss": 1.3333, + "step": 7040 + }, + { + "epoch": 0.47866557956244055, + "grad_norm": 0.5969959497451782, + "learning_rate": 4.700834012773475e-05, + "loss": 1.3552, + "step": 7045 + }, + { + "epoch": 0.47900529963310234, + "grad_norm": 0.5409806370735168, + "learning_rate": 4.700621687729311e-05, + "loss": 1.3008, + "step": 7050 + }, + { + "epoch": 0.4793450197037641, + "grad_norm": 0.6205386519432068, + "learning_rate": 4.7004093626851476e-05, + "loss": 1.3809, + "step": 7055 + }, + { + "epoch": 0.4796847397744259, + "grad_norm": 0.5811137557029724, + "learning_rate": 4.700197037640984e-05, + "loss": 1.4104, + "step": 7060 + }, + { + "epoch": 0.4800244598450876, + "grad_norm": 0.6639488339424133, + "learning_rate": 4.6999847125968204e-05, + "loss": 1.3618, + "step": 7065 + }, + { + "epoch": 0.4803641799157494, + "grad_norm": 0.6906101107597351, + "learning_rate": 4.699772387552657e-05, + "loss": 1.3211, + "step": 7070 + }, + { + "epoch": 0.4807038999864112, + "grad_norm": 0.6427626609802246, + "learning_rate": 4.699560062508493e-05, + "loss": 1.4017, + "step": 7075 + }, + { + "epoch": 0.48104362005707296, + "grad_norm": 0.6597930788993835, + "learning_rate": 4.6993477374643296e-05, + "loss": 1.391, + "step": 7080 + }, + { + "epoch": 0.48138334012773476, + "grad_norm": 0.5214569568634033, + "learning_rate": 4.699135412420166e-05, + "loss": 1.4231, + "step": 7085 + }, + { + "epoch": 0.4817230601983965, + "grad_norm": 0.6299998760223389, + "learning_rate": 4.6989230873760024e-05, + "loss": 1.4031, + "step": 7090 + }, + { + "epoch": 0.4820627802690583, + "grad_norm": 0.6915963292121887, + "learning_rate": 4.698710762331839e-05, + "loss": 1.3353, + "step": 7095 + }, + { + "epoch": 0.4824025003397201, + "grad_norm": 0.5781539082527161, + "learning_rate": 4.698498437287675e-05, + "loss": 1.3934, + "step": 7100 + }, + { + "epoch": 0.48274222041038184, + "grad_norm": 0.8143801689147949, + "learning_rate": 4.6982861122435116e-05, + "loss": 1.3657, + "step": 7105 + }, + { + "epoch": 0.48308194048104364, + "grad_norm": 0.5864207744598389, + "learning_rate": 4.698073787199348e-05, + "loss": 1.4226, + "step": 7110 + }, + { + "epoch": 0.4834216605517054, + "grad_norm": 0.6527549028396606, + "learning_rate": 4.6978614621551844e-05, + "loss": 1.2369, + "step": 7115 + }, + { + "epoch": 0.4837613806223672, + "grad_norm": 0.6397340893745422, + "learning_rate": 4.697649137111021e-05, + "loss": 1.4503, + "step": 7120 + }, + { + "epoch": 0.4841011006930289, + "grad_norm": 0.7032833099365234, + "learning_rate": 4.697436812066857e-05, + "loss": 1.4234, + "step": 7125 + }, + { + "epoch": 0.4844408207636907, + "grad_norm": 0.6597045063972473, + "learning_rate": 4.6972244870226936e-05, + "loss": 1.3776, + "step": 7130 + }, + { + "epoch": 0.4847805408343525, + "grad_norm": 0.6307904124259949, + "learning_rate": 4.69701216197853e-05, + "loss": 1.3324, + "step": 7135 + }, + { + "epoch": 0.48512026090501426, + "grad_norm": 0.597841203212738, + "learning_rate": 4.6967998369343664e-05, + "loss": 1.3702, + "step": 7140 + }, + { + "epoch": 0.48545998097567605, + "grad_norm": 0.6850742101669312, + "learning_rate": 4.696587511890203e-05, + "loss": 1.3493, + "step": 7145 + }, + { + "epoch": 0.4857997010463378, + "grad_norm": 0.6079716682434082, + "learning_rate": 4.696375186846039e-05, + "loss": 1.3664, + "step": 7150 + }, + { + "epoch": 0.4861394211169996, + "grad_norm": 0.6594722270965576, + "learning_rate": 4.6961628618018756e-05, + "loss": 1.3821, + "step": 7155 + }, + { + "epoch": 0.4864791411876614, + "grad_norm": 0.6235490441322327, + "learning_rate": 4.6959505367577114e-05, + "loss": 1.3944, + "step": 7160 + }, + { + "epoch": 0.48681886125832313, + "grad_norm": 0.6533124446868896, + "learning_rate": 4.6957382117135484e-05, + "loss": 1.314, + "step": 7165 + }, + { + "epoch": 0.48715858132898493, + "grad_norm": 0.6049984693527222, + "learning_rate": 4.695525886669385e-05, + "loss": 1.4004, + "step": 7170 + }, + { + "epoch": 0.48749830139964667, + "grad_norm": 0.6813513040542603, + "learning_rate": 4.6953135616252206e-05, + "loss": 1.4451, + "step": 7175 + }, + { + "epoch": 0.48783802147030847, + "grad_norm": 0.707304060459137, + "learning_rate": 4.6951012365810576e-05, + "loss": 1.331, + "step": 7180 + }, + { + "epoch": 0.48817774154097027, + "grad_norm": 0.596437394618988, + "learning_rate": 4.694888911536894e-05, + "loss": 1.3058, + "step": 7185 + }, + { + "epoch": 0.488517461611632, + "grad_norm": 0.6496755480766296, + "learning_rate": 4.69467658649273e-05, + "loss": 1.3408, + "step": 7190 + }, + { + "epoch": 0.4888571816822938, + "grad_norm": 0.6542823910713196, + "learning_rate": 4.694464261448567e-05, + "loss": 1.4765, + "step": 7195 + }, + { + "epoch": 0.48919690175295555, + "grad_norm": 0.5419815182685852, + "learning_rate": 4.694251936404403e-05, + "loss": 1.4465, + "step": 7200 + }, + { + "epoch": 0.48953662182361735, + "grad_norm": 0.6572680473327637, + "learning_rate": 4.694039611360239e-05, + "loss": 1.376, + "step": 7205 + }, + { + "epoch": 0.4898763418942791, + "grad_norm": 0.5701210498809814, + "learning_rate": 4.693827286316076e-05, + "loss": 1.3777, + "step": 7210 + }, + { + "epoch": 0.4902160619649409, + "grad_norm": 0.7354646325111389, + "learning_rate": 4.6936149612719124e-05, + "loss": 1.3926, + "step": 7215 + }, + { + "epoch": 0.4905557820356027, + "grad_norm": 0.6235008239746094, + "learning_rate": 4.693402636227748e-05, + "loss": 1.3472, + "step": 7220 + }, + { + "epoch": 0.4908955021062644, + "grad_norm": 0.5395787954330444, + "learning_rate": 4.693190311183585e-05, + "loss": 1.4022, + "step": 7225 + }, + { + "epoch": 0.4912352221769262, + "grad_norm": 0.6403634548187256, + "learning_rate": 4.692977986139421e-05, + "loss": 1.3633, + "step": 7230 + }, + { + "epoch": 0.49157494224758796, + "grad_norm": 0.724759578704834, + "learning_rate": 4.6927656610952574e-05, + "loss": 1.3742, + "step": 7235 + }, + { + "epoch": 0.49191466231824976, + "grad_norm": 0.6368885636329651, + "learning_rate": 4.6925533360510944e-05, + "loss": 1.2781, + "step": 7240 + }, + { + "epoch": 0.49225438238891156, + "grad_norm": 0.635615348815918, + "learning_rate": 4.69234101100693e-05, + "loss": 1.328, + "step": 7245 + }, + { + "epoch": 0.4925941024595733, + "grad_norm": 0.669722855091095, + "learning_rate": 4.6921286859627666e-05, + "loss": 1.4532, + "step": 7250 + }, + { + "epoch": 0.4929338225302351, + "grad_norm": 0.584342360496521, + "learning_rate": 4.6919163609186036e-05, + "loss": 1.3522, + "step": 7255 + }, + { + "epoch": 0.49327354260089684, + "grad_norm": 0.6784916520118713, + "learning_rate": 4.6917040358744394e-05, + "loss": 1.4906, + "step": 7260 + }, + { + "epoch": 0.49361326267155864, + "grad_norm": 0.6705710291862488, + "learning_rate": 4.691491710830276e-05, + "loss": 1.3883, + "step": 7265 + }, + { + "epoch": 0.49395298274222044, + "grad_norm": 0.65829998254776, + "learning_rate": 4.691279385786113e-05, + "loss": 1.3488, + "step": 7270 + }, + { + "epoch": 0.4942927028128822, + "grad_norm": 0.6567642688751221, + "learning_rate": 4.6910670607419486e-05, + "loss": 1.4068, + "step": 7275 + }, + { + "epoch": 0.494632422883544, + "grad_norm": 0.7130700349807739, + "learning_rate": 4.690854735697785e-05, + "loss": 1.3641, + "step": 7280 + }, + { + "epoch": 0.4949721429542057, + "grad_norm": 0.6480490565299988, + "learning_rate": 4.690642410653622e-05, + "loss": 1.4546, + "step": 7285 + }, + { + "epoch": 0.4953118630248675, + "grad_norm": 0.6228196620941162, + "learning_rate": 4.690430085609458e-05, + "loss": 1.3729, + "step": 7290 + }, + { + "epoch": 0.49565158309552926, + "grad_norm": 0.5975043773651123, + "learning_rate": 4.690217760565294e-05, + "loss": 1.3468, + "step": 7295 + }, + { + "epoch": 0.49599130316619106, + "grad_norm": 0.6154775023460388, + "learning_rate": 4.690005435521131e-05, + "loss": 1.4006, + "step": 7300 + }, + { + "epoch": 0.49633102323685285, + "grad_norm": 0.6062136888504028, + "learning_rate": 4.689793110476967e-05, + "loss": 1.3717, + "step": 7305 + }, + { + "epoch": 0.4966707433075146, + "grad_norm": 0.6884073615074158, + "learning_rate": 4.6895807854328034e-05, + "loss": 1.3792, + "step": 7310 + }, + { + "epoch": 0.4970104633781764, + "grad_norm": 0.6220802068710327, + "learning_rate": 4.68936846038864e-05, + "loss": 1.3939, + "step": 7315 + }, + { + "epoch": 0.49735018344883813, + "grad_norm": 0.6321655511856079, + "learning_rate": 4.689156135344476e-05, + "loss": 1.4618, + "step": 7320 + }, + { + "epoch": 0.49768990351949993, + "grad_norm": 0.6812154650688171, + "learning_rate": 4.688943810300313e-05, + "loss": 1.4334, + "step": 7325 + }, + { + "epoch": 0.49802962359016173, + "grad_norm": 0.5642015337944031, + "learning_rate": 4.688731485256149e-05, + "loss": 1.4009, + "step": 7330 + }, + { + "epoch": 0.49836934366082347, + "grad_norm": 0.6238872408866882, + "learning_rate": 4.6885191602119854e-05, + "loss": 1.4417, + "step": 7335 + }, + { + "epoch": 0.49870906373148527, + "grad_norm": 0.575141191482544, + "learning_rate": 4.6883068351678225e-05, + "loss": 1.3855, + "step": 7340 + }, + { + "epoch": 0.499048783802147, + "grad_norm": 0.5635263919830322, + "learning_rate": 4.688094510123658e-05, + "loss": 1.3929, + "step": 7345 + }, + { + "epoch": 0.4993885038728088, + "grad_norm": 0.6585545539855957, + "learning_rate": 4.6878821850794946e-05, + "loss": 1.2866, + "step": 7350 + }, + { + "epoch": 0.4997282239434706, + "grad_norm": 0.6060187220573425, + "learning_rate": 4.6876698600353317e-05, + "loss": 1.3086, + "step": 7355 + }, + { + "epoch": 0.5000679440141323, + "grad_norm": 0.7069248557090759, + "learning_rate": 4.6874575349911674e-05, + "loss": 1.4091, + "step": 7360 + }, + { + "epoch": 0.5004076640847941, + "grad_norm": 0.5305033922195435, + "learning_rate": 4.687245209947004e-05, + "loss": 1.3546, + "step": 7365 + }, + { + "epoch": 0.5007473841554559, + "grad_norm": 0.5994262099266052, + "learning_rate": 4.687032884902841e-05, + "loss": 1.3083, + "step": 7370 + }, + { + "epoch": 0.5010871042261177, + "grad_norm": 0.5297023057937622, + "learning_rate": 4.6868205598586766e-05, + "loss": 1.2246, + "step": 7375 + }, + { + "epoch": 0.5014268242967794, + "grad_norm": 0.5558198690414429, + "learning_rate": 4.686608234814513e-05, + "loss": 1.4296, + "step": 7380 + }, + { + "epoch": 0.5017665443674413, + "grad_norm": 0.6724082827568054, + "learning_rate": 4.6863959097703494e-05, + "loss": 1.4768, + "step": 7385 + }, + { + "epoch": 0.502106264438103, + "grad_norm": 0.6454963684082031, + "learning_rate": 4.686183584726186e-05, + "loss": 1.3367, + "step": 7390 + }, + { + "epoch": 0.5024459845087648, + "grad_norm": 0.6249825358390808, + "learning_rate": 4.685971259682022e-05, + "loss": 1.3552, + "step": 7395 + }, + { + "epoch": 0.5027857045794265, + "grad_norm": 0.6613280773162842, + "learning_rate": 4.6857589346378586e-05, + "loss": 1.4176, + "step": 7400 + }, + { + "epoch": 0.5031254246500884, + "grad_norm": 0.6486158967018127, + "learning_rate": 4.685546609593695e-05, + "loss": 1.3796, + "step": 7405 + }, + { + "epoch": 0.5034651447207501, + "grad_norm": 0.5767577886581421, + "learning_rate": 4.6853342845495314e-05, + "loss": 1.3338, + "step": 7410 + }, + { + "epoch": 0.5038048647914118, + "grad_norm": 0.6602384448051453, + "learning_rate": 4.685121959505368e-05, + "loss": 1.4648, + "step": 7415 + }, + { + "epoch": 0.5041445848620737, + "grad_norm": 0.725269079208374, + "learning_rate": 4.684909634461204e-05, + "loss": 1.4488, + "step": 7420 + }, + { + "epoch": 0.5044843049327354, + "grad_norm": 0.6460714936256409, + "learning_rate": 4.6846973094170406e-05, + "loss": 1.3039, + "step": 7425 + }, + { + "epoch": 0.5048240250033972, + "grad_norm": 0.6101348996162415, + "learning_rate": 4.684484984372877e-05, + "loss": 1.3841, + "step": 7430 + }, + { + "epoch": 0.5051637450740589, + "grad_norm": 0.5442931652069092, + "learning_rate": 4.6842726593287134e-05, + "loss": 1.4632, + "step": 7435 + }, + { + "epoch": 0.5055034651447208, + "grad_norm": 0.6163715124130249, + "learning_rate": 4.68406033428455e-05, + "loss": 1.4037, + "step": 7440 + }, + { + "epoch": 0.5058431852153825, + "grad_norm": 0.5447462201118469, + "learning_rate": 4.683848009240386e-05, + "loss": 1.3181, + "step": 7445 + }, + { + "epoch": 0.5061829052860443, + "grad_norm": 0.5884779691696167, + "learning_rate": 4.6836356841962226e-05, + "loss": 1.3845, + "step": 7450 + }, + { + "epoch": 0.5065226253567061, + "grad_norm": 0.675547182559967, + "learning_rate": 4.683423359152059e-05, + "loss": 1.3882, + "step": 7455 + }, + { + "epoch": 0.5068623454273679, + "grad_norm": 0.5374890565872192, + "learning_rate": 4.6832110341078954e-05, + "loss": 1.4428, + "step": 7460 + }, + { + "epoch": 0.5072020654980296, + "grad_norm": 0.662009596824646, + "learning_rate": 4.682998709063732e-05, + "loss": 1.4613, + "step": 7465 + }, + { + "epoch": 0.5075417855686915, + "grad_norm": 0.5851577520370483, + "learning_rate": 4.682786384019568e-05, + "loss": 1.3581, + "step": 7470 + }, + { + "epoch": 0.5078815056393532, + "grad_norm": 0.6344434022903442, + "learning_rate": 4.6825740589754046e-05, + "loss": 1.4882, + "step": 7475 + }, + { + "epoch": 0.5082212257100149, + "grad_norm": 0.6997050642967224, + "learning_rate": 4.682361733931241e-05, + "loss": 1.3075, + "step": 7480 + }, + { + "epoch": 0.5085609457806767, + "grad_norm": 0.6474372744560242, + "learning_rate": 4.6821494088870774e-05, + "loss": 1.4286, + "step": 7485 + }, + { + "epoch": 0.5089006658513385, + "grad_norm": 0.5623887181282043, + "learning_rate": 4.681937083842914e-05, + "loss": 1.3027, + "step": 7490 + }, + { + "epoch": 0.5092403859220003, + "grad_norm": 0.6566697359085083, + "learning_rate": 4.68172475879875e-05, + "loss": 1.4586, + "step": 7495 + }, + { + "epoch": 0.509580105992662, + "grad_norm": 0.6457698345184326, + "learning_rate": 4.6815124337545866e-05, + "loss": 1.3691, + "step": 7500 + }, + { + "epoch": 0.5099198260633239, + "grad_norm": 0.6728169918060303, + "learning_rate": 4.681300108710423e-05, + "loss": 1.4661, + "step": 7505 + }, + { + "epoch": 0.5102595461339856, + "grad_norm": 0.5606579780578613, + "learning_rate": 4.6810877836662594e-05, + "loss": 1.4124, + "step": 7510 + }, + { + "epoch": 0.5105992662046474, + "grad_norm": 0.5920879244804382, + "learning_rate": 4.680875458622095e-05, + "loss": 1.4345, + "step": 7515 + }, + { + "epoch": 0.5109389862753091, + "grad_norm": 0.7026877403259277, + "learning_rate": 4.680663133577932e-05, + "loss": 1.4688, + "step": 7520 + }, + { + "epoch": 0.511278706345971, + "grad_norm": 0.6280520558357239, + "learning_rate": 4.6804508085337686e-05, + "loss": 1.4535, + "step": 7525 + }, + { + "epoch": 0.5116184264166327, + "grad_norm": 0.6298320889472961, + "learning_rate": 4.680238483489604e-05, + "loss": 1.3375, + "step": 7530 + }, + { + "epoch": 0.5119581464872944, + "grad_norm": 0.6901682019233704, + "learning_rate": 4.6800261584454414e-05, + "loss": 1.3174, + "step": 7535 + }, + { + "epoch": 0.5122978665579563, + "grad_norm": 0.6426547765731812, + "learning_rate": 4.679813833401278e-05, + "loss": 1.4635, + "step": 7540 + }, + { + "epoch": 0.512637586628618, + "grad_norm": 0.6120851039886475, + "learning_rate": 4.6796015083571135e-05, + "loss": 1.3263, + "step": 7545 + }, + { + "epoch": 0.5129773066992798, + "grad_norm": 0.6297520995140076, + "learning_rate": 4.6793891833129506e-05, + "loss": 1.4802, + "step": 7550 + }, + { + "epoch": 0.5133170267699416, + "grad_norm": 0.6352310180664062, + "learning_rate": 4.679176858268787e-05, + "loss": 1.3879, + "step": 7555 + }, + { + "epoch": 0.5136567468406034, + "grad_norm": 0.633722186088562, + "learning_rate": 4.678964533224623e-05, + "loss": 1.4923, + "step": 7560 + }, + { + "epoch": 0.5139964669112651, + "grad_norm": 0.7394413948059082, + "learning_rate": 4.67875220818046e-05, + "loss": 1.4159, + "step": 7565 + }, + { + "epoch": 0.5143361869819268, + "grad_norm": 0.631238579750061, + "learning_rate": 4.678539883136296e-05, + "loss": 1.3724, + "step": 7570 + }, + { + "epoch": 0.5146759070525887, + "grad_norm": 0.7046985626220703, + "learning_rate": 4.678327558092132e-05, + "loss": 1.2557, + "step": 7575 + }, + { + "epoch": 0.5150156271232504, + "grad_norm": 0.5655717253684998, + "learning_rate": 4.678115233047969e-05, + "loss": 1.3346, + "step": 7580 + }, + { + "epoch": 0.5153553471939122, + "grad_norm": 0.620157778263092, + "learning_rate": 4.677902908003805e-05, + "loss": 1.4225, + "step": 7585 + }, + { + "epoch": 0.515695067264574, + "grad_norm": 0.6900854706764221, + "learning_rate": 4.677690582959641e-05, + "loss": 1.2892, + "step": 7590 + }, + { + "epoch": 0.5160347873352358, + "grad_norm": 0.6468934416770935, + "learning_rate": 4.677478257915478e-05, + "loss": 1.3578, + "step": 7595 + }, + { + "epoch": 0.5163745074058975, + "grad_norm": 0.6362219452857971, + "learning_rate": 4.677265932871314e-05, + "loss": 1.3582, + "step": 7600 + }, + { + "epoch": 0.5167142274765593, + "grad_norm": 0.5996149182319641, + "learning_rate": 4.67705360782715e-05, + "loss": 1.3891, + "step": 7605 + }, + { + "epoch": 0.5170539475472211, + "grad_norm": 0.6514232754707336, + "learning_rate": 4.6768412827829874e-05, + "loss": 1.3815, + "step": 7610 + }, + { + "epoch": 0.5173936676178829, + "grad_norm": 0.7216064929962158, + "learning_rate": 4.676628957738823e-05, + "loss": 1.2489, + "step": 7615 + }, + { + "epoch": 0.5177333876885446, + "grad_norm": 0.6348338723182678, + "learning_rate": 4.6764166326946595e-05, + "loss": 1.4793, + "step": 7620 + }, + { + "epoch": 0.5180731077592065, + "grad_norm": 0.5960789918899536, + "learning_rate": 4.6762043076504966e-05, + "loss": 1.4501, + "step": 7625 + }, + { + "epoch": 0.5184128278298682, + "grad_norm": 0.6377695798873901, + "learning_rate": 4.675991982606332e-05, + "loss": 1.3961, + "step": 7630 + }, + { + "epoch": 0.5187525479005299, + "grad_norm": 0.6256961822509766, + "learning_rate": 4.675779657562169e-05, + "loss": 1.3389, + "step": 7635 + }, + { + "epoch": 0.5190922679711918, + "grad_norm": 0.5948920249938965, + "learning_rate": 4.675567332518006e-05, + "loss": 1.4922, + "step": 7640 + }, + { + "epoch": 0.5194319880418535, + "grad_norm": 0.6825036406517029, + "learning_rate": 4.6753550074738415e-05, + "loss": 1.3309, + "step": 7645 + }, + { + "epoch": 0.5197717081125153, + "grad_norm": 0.6182436943054199, + "learning_rate": 4.675142682429678e-05, + "loss": 1.3435, + "step": 7650 + }, + { + "epoch": 0.520111428183177, + "grad_norm": 0.6601023077964783, + "learning_rate": 4.674930357385514e-05, + "loss": 1.3334, + "step": 7655 + }, + { + "epoch": 0.5204511482538389, + "grad_norm": 0.5339378714561462, + "learning_rate": 4.674718032341351e-05, + "loss": 1.3559, + "step": 7660 + }, + { + "epoch": 0.5207908683245006, + "grad_norm": 0.6746454834938049, + "learning_rate": 4.674505707297187e-05, + "loss": 1.3395, + "step": 7665 + }, + { + "epoch": 0.5211305883951624, + "grad_norm": 0.6270281076431274, + "learning_rate": 4.6742933822530235e-05, + "loss": 1.4089, + "step": 7670 + }, + { + "epoch": 0.5214703084658242, + "grad_norm": 0.7043746709823608, + "learning_rate": 4.67408105720886e-05, + "loss": 1.3482, + "step": 7675 + }, + { + "epoch": 0.521810028536486, + "grad_norm": 0.6281951069831848, + "learning_rate": 4.673868732164697e-05, + "loss": 1.2795, + "step": 7680 + }, + { + "epoch": 0.5221497486071477, + "grad_norm": 0.7626495361328125, + "learning_rate": 4.673656407120533e-05, + "loss": 1.4372, + "step": 7685 + }, + { + "epoch": 0.5224894686778094, + "grad_norm": 0.672217607498169, + "learning_rate": 4.673444082076369e-05, + "loss": 1.4279, + "step": 7690 + }, + { + "epoch": 0.5228291887484713, + "grad_norm": 0.6980194449424744, + "learning_rate": 4.673231757032206e-05, + "loss": 1.3344, + "step": 7695 + }, + { + "epoch": 0.523168908819133, + "grad_norm": 0.6135451793670654, + "learning_rate": 4.673019431988042e-05, + "loss": 1.399, + "step": 7700 + }, + { + "epoch": 0.5235086288897948, + "grad_norm": 0.5958032608032227, + "learning_rate": 4.672807106943878e-05, + "loss": 1.4593, + "step": 7705 + }, + { + "epoch": 0.5238483489604566, + "grad_norm": 0.6722570657730103, + "learning_rate": 4.6725947818997154e-05, + "loss": 1.2588, + "step": 7710 + }, + { + "epoch": 0.5241880690311184, + "grad_norm": 0.6172633767127991, + "learning_rate": 4.672382456855551e-05, + "loss": 1.3507, + "step": 7715 + }, + { + "epoch": 0.5245277891017801, + "grad_norm": 0.6627243757247925, + "learning_rate": 4.6721701318113875e-05, + "loss": 1.4994, + "step": 7720 + }, + { + "epoch": 0.524867509172442, + "grad_norm": 0.7068824172019958, + "learning_rate": 4.6719578067672246e-05, + "loss": 1.3198, + "step": 7725 + }, + { + "epoch": 0.5252072292431037, + "grad_norm": 0.6820070743560791, + "learning_rate": 4.67174548172306e-05, + "loss": 1.4842, + "step": 7730 + }, + { + "epoch": 0.5255469493137654, + "grad_norm": 0.6187071204185486, + "learning_rate": 4.671533156678897e-05, + "loss": 1.3429, + "step": 7735 + }, + { + "epoch": 0.5258866693844272, + "grad_norm": 0.5970851182937622, + "learning_rate": 4.671320831634733e-05, + "loss": 1.3337, + "step": 7740 + }, + { + "epoch": 0.526226389455089, + "grad_norm": 0.6725092530250549, + "learning_rate": 4.6711085065905695e-05, + "loss": 1.3842, + "step": 7745 + }, + { + "epoch": 0.5265661095257508, + "grad_norm": 0.6618882417678833, + "learning_rate": 4.670896181546406e-05, + "loss": 1.4507, + "step": 7750 + }, + { + "epoch": 0.5269058295964125, + "grad_norm": 0.670928955078125, + "learning_rate": 4.670683856502242e-05, + "loss": 1.5447, + "step": 7755 + }, + { + "epoch": 0.5272455496670744, + "grad_norm": 0.7112619280815125, + "learning_rate": 4.670471531458079e-05, + "loss": 1.3642, + "step": 7760 + }, + { + "epoch": 0.5275852697377361, + "grad_norm": 0.6146485805511475, + "learning_rate": 4.670259206413915e-05, + "loss": 1.2933, + "step": 7765 + }, + { + "epoch": 0.5279249898083979, + "grad_norm": 0.8029811382293701, + "learning_rate": 4.6700468813697515e-05, + "loss": 1.2899, + "step": 7770 + }, + { + "epoch": 0.5282647098790596, + "grad_norm": 0.6654508113861084, + "learning_rate": 4.669834556325588e-05, + "loss": 1.4213, + "step": 7775 + }, + { + "epoch": 0.5286044299497215, + "grad_norm": 0.5974158644676208, + "learning_rate": 4.669622231281424e-05, + "loss": 1.2864, + "step": 7780 + }, + { + "epoch": 0.5289441500203832, + "grad_norm": 0.572313666343689, + "learning_rate": 4.669409906237261e-05, + "loss": 1.4704, + "step": 7785 + }, + { + "epoch": 0.5292838700910449, + "grad_norm": 0.6661244034767151, + "learning_rate": 4.669197581193097e-05, + "loss": 1.4355, + "step": 7790 + }, + { + "epoch": 0.5296235901617068, + "grad_norm": 0.6460347175598145, + "learning_rate": 4.6689852561489335e-05, + "loss": 1.468, + "step": 7795 + }, + { + "epoch": 0.5299633102323685, + "grad_norm": 0.6542794704437256, + "learning_rate": 4.66877293110477e-05, + "loss": 1.3652, + "step": 7800 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.7438021302223206, + "learning_rate": 4.668560606060606e-05, + "loss": 1.3834, + "step": 7805 + }, + { + "epoch": 0.5306427503736921, + "grad_norm": 0.6667519807815552, + "learning_rate": 4.668348281016443e-05, + "loss": 1.2816, + "step": 7810 + }, + { + "epoch": 0.5309824704443539, + "grad_norm": 0.6257959008216858, + "learning_rate": 4.668135955972279e-05, + "loss": 1.3173, + "step": 7815 + }, + { + "epoch": 0.5313221905150156, + "grad_norm": 0.6496632695198059, + "learning_rate": 4.6679236309281155e-05, + "loss": 1.3937, + "step": 7820 + }, + { + "epoch": 0.5316619105856774, + "grad_norm": 0.6605326533317566, + "learning_rate": 4.667711305883952e-05, + "loss": 1.3761, + "step": 7825 + }, + { + "epoch": 0.5320016306563392, + "grad_norm": 0.6309987306594849, + "learning_rate": 4.667498980839788e-05, + "loss": 1.3127, + "step": 7830 + }, + { + "epoch": 0.532341350727001, + "grad_norm": 0.6613396406173706, + "learning_rate": 4.667286655795625e-05, + "loss": 1.3807, + "step": 7835 + }, + { + "epoch": 0.5326810707976627, + "grad_norm": 0.6253994107246399, + "learning_rate": 4.667074330751461e-05, + "loss": 1.2943, + "step": 7840 + }, + { + "epoch": 0.5330207908683245, + "grad_norm": 0.6277719140052795, + "learning_rate": 4.6668620057072975e-05, + "loss": 1.3851, + "step": 7845 + }, + { + "epoch": 0.5333605109389863, + "grad_norm": 0.6118496060371399, + "learning_rate": 4.666649680663134e-05, + "loss": 1.4124, + "step": 7850 + }, + { + "epoch": 0.533700231009648, + "grad_norm": 0.6762769818305969, + "learning_rate": 4.6664373556189697e-05, + "loss": 1.3867, + "step": 7855 + }, + { + "epoch": 0.5340399510803098, + "grad_norm": 0.5931509137153625, + "learning_rate": 4.666225030574807e-05, + "loss": 1.3406, + "step": 7860 + }, + { + "epoch": 0.5343796711509716, + "grad_norm": 0.5516414046287537, + "learning_rate": 4.666012705530643e-05, + "loss": 1.426, + "step": 7865 + }, + { + "epoch": 0.5347193912216334, + "grad_norm": 0.539910078048706, + "learning_rate": 4.665800380486479e-05, + "loss": 1.4105, + "step": 7870 + }, + { + "epoch": 0.5350591112922951, + "grad_norm": 0.6131874322891235, + "learning_rate": 4.665588055442316e-05, + "loss": 1.3148, + "step": 7875 + }, + { + "epoch": 0.535398831362957, + "grad_norm": 0.6400133967399597, + "learning_rate": 4.6653757303981523e-05, + "loss": 1.3658, + "step": 7880 + }, + { + "epoch": 0.5357385514336187, + "grad_norm": 0.5909250974655151, + "learning_rate": 4.665163405353988e-05, + "loss": 1.3665, + "step": 7885 + }, + { + "epoch": 0.5360782715042804, + "grad_norm": 0.6109614372253418, + "learning_rate": 4.664951080309825e-05, + "loss": 1.4881, + "step": 7890 + }, + { + "epoch": 0.5364179915749423, + "grad_norm": 0.6652154922485352, + "learning_rate": 4.6647387552656615e-05, + "loss": 1.4135, + "step": 7895 + }, + { + "epoch": 0.536757711645604, + "grad_norm": 0.5965635180473328, + "learning_rate": 4.664526430221497e-05, + "loss": 1.2594, + "step": 7900 + }, + { + "epoch": 0.5370974317162658, + "grad_norm": 0.6420058608055115, + "learning_rate": 4.6643141051773343e-05, + "loss": 1.411, + "step": 7905 + }, + { + "epoch": 0.5374371517869275, + "grad_norm": 0.7349154353141785, + "learning_rate": 4.664101780133171e-05, + "loss": 1.5397, + "step": 7910 + }, + { + "epoch": 0.5377768718575894, + "grad_norm": 0.6341392397880554, + "learning_rate": 4.6638894550890065e-05, + "loss": 1.5346, + "step": 7915 + }, + { + "epoch": 0.5381165919282511, + "grad_norm": 0.5322096943855286, + "learning_rate": 4.6636771300448435e-05, + "loss": 1.4165, + "step": 7920 + }, + { + "epoch": 0.5384563119989129, + "grad_norm": 0.6493896245956421, + "learning_rate": 4.66346480500068e-05, + "loss": 1.3916, + "step": 7925 + }, + { + "epoch": 0.5387960320695747, + "grad_norm": 0.6232993602752686, + "learning_rate": 4.663252479956516e-05, + "loss": 1.3467, + "step": 7930 + }, + { + "epoch": 0.5391357521402365, + "grad_norm": 0.6491148471832275, + "learning_rate": 4.663040154912353e-05, + "loss": 1.4513, + "step": 7935 + }, + { + "epoch": 0.5394754722108982, + "grad_norm": 0.6049793362617493, + "learning_rate": 4.6628278298681885e-05, + "loss": 1.3655, + "step": 7940 + }, + { + "epoch": 0.5398151922815599, + "grad_norm": 0.6911430954933167, + "learning_rate": 4.662615504824025e-05, + "loss": 1.3749, + "step": 7945 + }, + { + "epoch": 0.5401549123522218, + "grad_norm": 0.7091353535652161, + "learning_rate": 4.662403179779862e-05, + "loss": 1.4294, + "step": 7950 + }, + { + "epoch": 0.5404946324228835, + "grad_norm": 0.6359396576881409, + "learning_rate": 4.662190854735698e-05, + "loss": 1.3657, + "step": 7955 + }, + { + "epoch": 0.5408343524935453, + "grad_norm": 0.6963664293289185, + "learning_rate": 4.661978529691534e-05, + "loss": 1.3415, + "step": 7960 + }, + { + "epoch": 0.5411740725642071, + "grad_norm": 0.6002401113510132, + "learning_rate": 4.661766204647371e-05, + "loss": 1.4231, + "step": 7965 + }, + { + "epoch": 0.5415137926348689, + "grad_norm": 0.6737387776374817, + "learning_rate": 4.661553879603207e-05, + "loss": 1.4223, + "step": 7970 + }, + { + "epoch": 0.5418535127055306, + "grad_norm": 0.7201437950134277, + "learning_rate": 4.661341554559043e-05, + "loss": 1.4063, + "step": 7975 + }, + { + "epoch": 0.5421932327761925, + "grad_norm": 0.5882259607315063, + "learning_rate": 4.6611292295148803e-05, + "loss": 1.3551, + "step": 7980 + }, + { + "epoch": 0.5425329528468542, + "grad_norm": 0.5975483059883118, + "learning_rate": 4.660916904470716e-05, + "loss": 1.4396, + "step": 7985 + }, + { + "epoch": 0.542872672917516, + "grad_norm": 0.4384988844394684, + "learning_rate": 4.6607045794265525e-05, + "loss": 1.3125, + "step": 7990 + }, + { + "epoch": 0.5432123929881777, + "grad_norm": 0.7761228680610657, + "learning_rate": 4.6604922543823895e-05, + "loss": 1.3367, + "step": 7995 + }, + { + "epoch": 0.5435521130588395, + "grad_norm": 0.6241304278373718, + "learning_rate": 4.660279929338225e-05, + "loss": 1.4024, + "step": 8000 + }, + { + "epoch": 0.5438918331295013, + "grad_norm": 0.649651288986206, + "learning_rate": 4.660067604294062e-05, + "loss": 1.3397, + "step": 8005 + }, + { + "epoch": 0.544231553200163, + "grad_norm": 0.6383991241455078, + "learning_rate": 4.659855279249898e-05, + "loss": 1.4675, + "step": 8010 + }, + { + "epoch": 0.5445712732708249, + "grad_norm": 0.6509715914726257, + "learning_rate": 4.6596429542057345e-05, + "loss": 1.3761, + "step": 8015 + }, + { + "epoch": 0.5449109933414866, + "grad_norm": 0.685106098651886, + "learning_rate": 4.6594306291615716e-05, + "loss": 1.3551, + "step": 8020 + }, + { + "epoch": 0.5452507134121484, + "grad_norm": 0.6405821442604065, + "learning_rate": 4.659218304117407e-05, + "loss": 1.3918, + "step": 8025 + }, + { + "epoch": 0.5455904334828101, + "grad_norm": 0.6775296926498413, + "learning_rate": 4.659005979073244e-05, + "loss": 1.3949, + "step": 8030 + }, + { + "epoch": 0.545930153553472, + "grad_norm": 0.6392462849617004, + "learning_rate": 4.658793654029081e-05, + "loss": 1.4426, + "step": 8035 + }, + { + "epoch": 0.5462698736241337, + "grad_norm": 0.5327250957489014, + "learning_rate": 4.6585813289849165e-05, + "loss": 1.322, + "step": 8040 + }, + { + "epoch": 0.5466095936947954, + "grad_norm": 1.0725865364074707, + "learning_rate": 4.658369003940753e-05, + "loss": 1.3749, + "step": 8045 + }, + { + "epoch": 0.5469493137654573, + "grad_norm": 0.6358645558357239, + "learning_rate": 4.65815667889659e-05, + "loss": 1.4291, + "step": 8050 + }, + { + "epoch": 0.547289033836119, + "grad_norm": 0.6378662586212158, + "learning_rate": 4.657944353852426e-05, + "loss": 1.3221, + "step": 8055 + }, + { + "epoch": 0.5476287539067808, + "grad_norm": 0.607462465763092, + "learning_rate": 4.657732028808262e-05, + "loss": 1.3914, + "step": 8060 + }, + { + "epoch": 0.5479684739774426, + "grad_norm": 0.6590172052383423, + "learning_rate": 4.657519703764099e-05, + "loss": 1.2552, + "step": 8065 + }, + { + "epoch": 0.5483081940481044, + "grad_norm": 0.6041439771652222, + "learning_rate": 4.657307378719935e-05, + "loss": 1.4059, + "step": 8070 + }, + { + "epoch": 0.5486479141187661, + "grad_norm": 0.5647345185279846, + "learning_rate": 4.657095053675771e-05, + "loss": 1.4185, + "step": 8075 + }, + { + "epoch": 0.5489876341894279, + "grad_norm": 0.5834894776344299, + "learning_rate": 4.656882728631608e-05, + "loss": 1.3359, + "step": 8080 + }, + { + "epoch": 0.5493273542600897, + "grad_norm": 0.637019693851471, + "learning_rate": 4.656670403587444e-05, + "loss": 1.385, + "step": 8085 + }, + { + "epoch": 0.5496670743307515, + "grad_norm": 0.635460376739502, + "learning_rate": 4.6564580785432805e-05, + "loss": 1.3736, + "step": 8090 + }, + { + "epoch": 0.5500067944014132, + "grad_norm": 0.6867489814758301, + "learning_rate": 4.656245753499117e-05, + "loss": 1.3258, + "step": 8095 + }, + { + "epoch": 0.550346514472075, + "grad_norm": 0.7199000120162964, + "learning_rate": 4.656033428454953e-05, + "loss": 1.3913, + "step": 8100 + }, + { + "epoch": 0.5506862345427368, + "grad_norm": 0.6548787951469421, + "learning_rate": 4.65582110341079e-05, + "loss": 1.3841, + "step": 8105 + }, + { + "epoch": 0.5510259546133985, + "grad_norm": 0.6761186122894287, + "learning_rate": 4.655608778366626e-05, + "loss": 1.3442, + "step": 8110 + }, + { + "epoch": 0.5513656746840603, + "grad_norm": 0.592474639415741, + "learning_rate": 4.6553964533224625e-05, + "loss": 1.38, + "step": 8115 + }, + { + "epoch": 0.5517053947547221, + "grad_norm": 0.66417396068573, + "learning_rate": 4.655184128278299e-05, + "loss": 1.5308, + "step": 8120 + }, + { + "epoch": 0.5520451148253839, + "grad_norm": 0.623815655708313, + "learning_rate": 4.654971803234135e-05, + "loss": 1.4102, + "step": 8125 + }, + { + "epoch": 0.5523848348960456, + "grad_norm": 0.6209930181503296, + "learning_rate": 4.654759478189972e-05, + "loss": 1.3864, + "step": 8130 + }, + { + "epoch": 0.5527245549667075, + "grad_norm": 0.6548243165016174, + "learning_rate": 4.654547153145808e-05, + "loss": 1.4629, + "step": 8135 + }, + { + "epoch": 0.5530642750373692, + "grad_norm": 0.6529601812362671, + "learning_rate": 4.6543348281016445e-05, + "loss": 1.4546, + "step": 8140 + }, + { + "epoch": 0.553403995108031, + "grad_norm": 0.667816162109375, + "learning_rate": 4.654122503057481e-05, + "loss": 1.4255, + "step": 8145 + }, + { + "epoch": 0.5537437151786928, + "grad_norm": 0.6340169906616211, + "learning_rate": 4.653910178013317e-05, + "loss": 1.343, + "step": 8150 + }, + { + "epoch": 0.5540834352493546, + "grad_norm": 0.6256027221679688, + "learning_rate": 4.653697852969154e-05, + "loss": 1.3906, + "step": 8155 + }, + { + "epoch": 0.5544231553200163, + "grad_norm": 0.648058295249939, + "learning_rate": 4.65348552792499e-05, + "loss": 1.3785, + "step": 8160 + }, + { + "epoch": 0.554762875390678, + "grad_norm": 0.6203805208206177, + "learning_rate": 4.6532732028808265e-05, + "loss": 1.4438, + "step": 8165 + }, + { + "epoch": 0.5551025954613399, + "grad_norm": 0.6117679476737976, + "learning_rate": 4.653060877836663e-05, + "loss": 1.4236, + "step": 8170 + }, + { + "epoch": 0.5554423155320016, + "grad_norm": 0.6029890775680542, + "learning_rate": 4.652848552792499e-05, + "loss": 1.3134, + "step": 8175 + }, + { + "epoch": 0.5557820356026634, + "grad_norm": 0.6157516241073608, + "learning_rate": 4.652636227748336e-05, + "loss": 1.3097, + "step": 8180 + }, + { + "epoch": 0.5561217556733252, + "grad_norm": 0.6411975622177124, + "learning_rate": 4.652423902704172e-05, + "loss": 1.2708, + "step": 8185 + }, + { + "epoch": 0.556461475743987, + "grad_norm": 0.5759761929512024, + "learning_rate": 4.6522115776600085e-05, + "loss": 1.3998, + "step": 8190 + }, + { + "epoch": 0.5568011958146487, + "grad_norm": 0.7343934774398804, + "learning_rate": 4.651999252615845e-05, + "loss": 1.3454, + "step": 8195 + }, + { + "epoch": 0.5571409158853105, + "grad_norm": 0.6397271752357483, + "learning_rate": 4.651786927571681e-05, + "loss": 1.3922, + "step": 8200 + }, + { + "epoch": 0.5574806359559723, + "grad_norm": 0.6115093231201172, + "learning_rate": 4.651574602527518e-05, + "loss": 1.3425, + "step": 8205 + }, + { + "epoch": 0.557820356026634, + "grad_norm": 0.6679167151451111, + "learning_rate": 4.6513622774833534e-05, + "loss": 1.3271, + "step": 8210 + }, + { + "epoch": 0.5581600760972958, + "grad_norm": 0.5975149273872375, + "learning_rate": 4.6511499524391905e-05, + "loss": 1.2803, + "step": 8215 + }, + { + "epoch": 0.5584997961679576, + "grad_norm": 0.5187572240829468, + "learning_rate": 4.650937627395027e-05, + "loss": 1.2844, + "step": 8220 + }, + { + "epoch": 0.5588395162386194, + "grad_norm": 0.6356489062309265, + "learning_rate": 4.6507253023508626e-05, + "loss": 1.3259, + "step": 8225 + }, + { + "epoch": 0.5591792363092811, + "grad_norm": 0.7586395740509033, + "learning_rate": 4.6505129773067e-05, + "loss": 1.4771, + "step": 8230 + }, + { + "epoch": 0.559518956379943, + "grad_norm": 0.5931305289268494, + "learning_rate": 4.650300652262536e-05, + "loss": 1.3238, + "step": 8235 + }, + { + "epoch": 0.5598586764506047, + "grad_norm": 0.6305395364761353, + "learning_rate": 4.650088327218372e-05, + "loss": 1.4735, + "step": 8240 + }, + { + "epoch": 0.5601983965212665, + "grad_norm": 0.6581220626831055, + "learning_rate": 4.649876002174209e-05, + "loss": 1.4475, + "step": 8245 + }, + { + "epoch": 0.5605381165919282, + "grad_norm": 0.6286765933036804, + "learning_rate": 4.649663677130045e-05, + "loss": 1.2987, + "step": 8250 + }, + { + "epoch": 0.5608778366625901, + "grad_norm": 0.6141452193260193, + "learning_rate": 4.649451352085881e-05, + "loss": 1.2827, + "step": 8255 + }, + { + "epoch": 0.5612175567332518, + "grad_norm": 0.6577660441398621, + "learning_rate": 4.649239027041718e-05, + "loss": 1.3723, + "step": 8260 + }, + { + "epoch": 0.5615572768039135, + "grad_norm": 0.6323842406272888, + "learning_rate": 4.6490267019975545e-05, + "loss": 1.3636, + "step": 8265 + }, + { + "epoch": 0.5618969968745754, + "grad_norm": 0.5749620199203491, + "learning_rate": 4.64881437695339e-05, + "loss": 1.3261, + "step": 8270 + }, + { + "epoch": 0.5622367169452371, + "grad_norm": 0.6635522246360779, + "learning_rate": 4.648602051909227e-05, + "loss": 1.3411, + "step": 8275 + }, + { + "epoch": 0.5625764370158989, + "grad_norm": 0.7397140264511108, + "learning_rate": 4.648389726865063e-05, + "loss": 1.3515, + "step": 8280 + }, + { + "epoch": 0.5629161570865606, + "grad_norm": 0.5669236779212952, + "learning_rate": 4.6481774018208994e-05, + "loss": 1.3943, + "step": 8285 + }, + { + "epoch": 0.5632558771572225, + "grad_norm": 0.5314184427261353, + "learning_rate": 4.6479650767767365e-05, + "loss": 1.4635, + "step": 8290 + }, + { + "epoch": 0.5635955972278842, + "grad_norm": 0.6521828174591064, + "learning_rate": 4.647752751732572e-05, + "loss": 1.3649, + "step": 8295 + }, + { + "epoch": 0.563935317298546, + "grad_norm": 0.6266273260116577, + "learning_rate": 4.6475404266884086e-05, + "loss": 1.4524, + "step": 8300 + }, + { + "epoch": 0.5642750373692078, + "grad_norm": 0.5829536318778992, + "learning_rate": 4.647328101644246e-05, + "loss": 1.354, + "step": 8305 + }, + { + "epoch": 0.5646147574398696, + "grad_norm": 0.6950231194496155, + "learning_rate": 4.6471157766000814e-05, + "loss": 1.3899, + "step": 8310 + }, + { + "epoch": 0.5649544775105313, + "grad_norm": 0.6778619885444641, + "learning_rate": 4.646903451555918e-05, + "loss": 1.3522, + "step": 8315 + }, + { + "epoch": 0.5652941975811931, + "grad_norm": 0.6076153516769409, + "learning_rate": 4.646691126511755e-05, + "loss": 1.3722, + "step": 8320 + }, + { + "epoch": 0.5656339176518549, + "grad_norm": 0.6762505769729614, + "learning_rate": 4.6464788014675906e-05, + "loss": 1.4833, + "step": 8325 + }, + { + "epoch": 0.5659736377225166, + "grad_norm": 0.7496538758277893, + "learning_rate": 4.646266476423427e-05, + "loss": 1.433, + "step": 8330 + }, + { + "epoch": 0.5663133577931784, + "grad_norm": 0.692680299282074, + "learning_rate": 4.646054151379264e-05, + "loss": 1.5166, + "step": 8335 + }, + { + "epoch": 0.5666530778638402, + "grad_norm": 0.5774497985839844, + "learning_rate": 4.6458418263351e-05, + "loss": 1.3301, + "step": 8340 + }, + { + "epoch": 0.566992797934502, + "grad_norm": 0.6338465213775635, + "learning_rate": 4.645629501290936e-05, + "loss": 1.463, + "step": 8345 + }, + { + "epoch": 0.5673325180051637, + "grad_norm": 0.6851441860198975, + "learning_rate": 4.645417176246773e-05, + "loss": 1.3758, + "step": 8350 + }, + { + "epoch": 0.5676722380758256, + "grad_norm": 0.7029812335968018, + "learning_rate": 4.645204851202609e-05, + "loss": 1.4005, + "step": 8355 + }, + { + "epoch": 0.5680119581464873, + "grad_norm": 0.5923866033554077, + "learning_rate": 4.644992526158446e-05, + "loss": 1.4234, + "step": 8360 + }, + { + "epoch": 0.568351678217149, + "grad_norm": 0.6720953583717346, + "learning_rate": 4.644780201114282e-05, + "loss": 1.4136, + "step": 8365 + }, + { + "epoch": 0.5686913982878108, + "grad_norm": 0.6519859433174133, + "learning_rate": 4.644567876070118e-05, + "loss": 1.346, + "step": 8370 + }, + { + "epoch": 0.5690311183584726, + "grad_norm": 0.6643994450569153, + "learning_rate": 4.644355551025955e-05, + "loss": 1.3092, + "step": 8375 + }, + { + "epoch": 0.5693708384291344, + "grad_norm": 0.6579692363739014, + "learning_rate": 4.644143225981791e-05, + "loss": 1.4446, + "step": 8380 + }, + { + "epoch": 0.5697105584997961, + "grad_norm": 0.6603977084159851, + "learning_rate": 4.6439309009376274e-05, + "loss": 1.4177, + "step": 8385 + }, + { + "epoch": 0.570050278570458, + "grad_norm": 0.6441840529441833, + "learning_rate": 4.6437185758934645e-05, + "loss": 1.3774, + "step": 8390 + }, + { + "epoch": 0.5703899986411197, + "grad_norm": 0.62338787317276, + "learning_rate": 4.6435062508493e-05, + "loss": 1.4207, + "step": 8395 + }, + { + "epoch": 0.5707297187117815, + "grad_norm": 0.6295756697654724, + "learning_rate": 4.6432939258051366e-05, + "loss": 1.3859, + "step": 8400 + }, + { + "epoch": 0.5710694387824433, + "grad_norm": 0.6080472469329834, + "learning_rate": 4.643081600760974e-05, + "loss": 1.4107, + "step": 8405 + }, + { + "epoch": 0.5714091588531051, + "grad_norm": 0.6909534931182861, + "learning_rate": 4.6428692757168094e-05, + "loss": 1.3254, + "step": 8410 + }, + { + "epoch": 0.5717488789237668, + "grad_norm": 0.7021681666374207, + "learning_rate": 4.642656950672646e-05, + "loss": 1.3105, + "step": 8415 + }, + { + "epoch": 0.5720885989944285, + "grad_norm": 0.6477012634277344, + "learning_rate": 4.642444625628483e-05, + "loss": 1.4423, + "step": 8420 + }, + { + "epoch": 0.5724283190650904, + "grad_norm": 0.6198950409889221, + "learning_rate": 4.6422323005843186e-05, + "loss": 1.4801, + "step": 8425 + }, + { + "epoch": 0.5727680391357521, + "grad_norm": 0.6134934425354004, + "learning_rate": 4.642019975540155e-05, + "loss": 1.3673, + "step": 8430 + }, + { + "epoch": 0.5731077592064139, + "grad_norm": 0.8099306225776672, + "learning_rate": 4.6418076504959914e-05, + "loss": 1.3798, + "step": 8435 + }, + { + "epoch": 0.5734474792770757, + "grad_norm": 0.6538095474243164, + "learning_rate": 4.641595325451828e-05, + "loss": 1.3588, + "step": 8440 + }, + { + "epoch": 0.5737871993477375, + "grad_norm": 0.7463709712028503, + "learning_rate": 4.641383000407664e-05, + "loss": 1.3673, + "step": 8445 + }, + { + "epoch": 0.5741269194183992, + "grad_norm": 0.6359599232673645, + "learning_rate": 4.6411706753635006e-05, + "loss": 1.3445, + "step": 8450 + }, + { + "epoch": 0.574466639489061, + "grad_norm": 1.1299289464950562, + "learning_rate": 4.640958350319337e-05, + "loss": 1.3005, + "step": 8455 + }, + { + "epoch": 0.5748063595597228, + "grad_norm": 0.6729915142059326, + "learning_rate": 4.6407460252751734e-05, + "loss": 1.3593, + "step": 8460 + }, + { + "epoch": 0.5751460796303846, + "grad_norm": 0.6672472953796387, + "learning_rate": 4.64053370023101e-05, + "loss": 1.3796, + "step": 8465 + }, + { + "epoch": 0.5754857997010463, + "grad_norm": 0.6459980607032776, + "learning_rate": 4.640321375186846e-05, + "loss": 1.3767, + "step": 8470 + }, + { + "epoch": 0.5758255197717081, + "grad_norm": 0.7157725095748901, + "learning_rate": 4.6401090501426826e-05, + "loss": 1.3143, + "step": 8475 + }, + { + "epoch": 0.5761652398423699, + "grad_norm": 0.6390661597251892, + "learning_rate": 4.639896725098519e-05, + "loss": 1.3659, + "step": 8480 + }, + { + "epoch": 0.5765049599130316, + "grad_norm": 0.6072826981544495, + "learning_rate": 4.6396844000543554e-05, + "loss": 1.3133, + "step": 8485 + }, + { + "epoch": 0.5768446799836935, + "grad_norm": 0.6549344658851624, + "learning_rate": 4.639472075010192e-05, + "loss": 1.3423, + "step": 8490 + }, + { + "epoch": 0.5771844000543552, + "grad_norm": 0.6404393315315247, + "learning_rate": 4.639259749966028e-05, + "loss": 1.3167, + "step": 8495 + }, + { + "epoch": 0.577524120125017, + "grad_norm": 0.6117603182792664, + "learning_rate": 4.6390474249218646e-05, + "loss": 1.4581, + "step": 8500 + }, + { + "epoch": 0.5778638401956787, + "grad_norm": 0.6287073493003845, + "learning_rate": 4.638835099877701e-05, + "loss": 1.3583, + "step": 8505 + }, + { + "epoch": 0.5782035602663406, + "grad_norm": 0.6488497257232666, + "learning_rate": 4.6386227748335374e-05, + "loss": 1.3389, + "step": 8510 + }, + { + "epoch": 0.5785432803370023, + "grad_norm": 0.6550486087799072, + "learning_rate": 4.638410449789374e-05, + "loss": 1.3674, + "step": 8515 + }, + { + "epoch": 0.578883000407664, + "grad_norm": 0.6923273205757141, + "learning_rate": 4.63819812474521e-05, + "loss": 1.3788, + "step": 8520 + }, + { + "epoch": 0.5792227204783259, + "grad_norm": 0.6016242504119873, + "learning_rate": 4.6379857997010466e-05, + "loss": 1.3521, + "step": 8525 + }, + { + "epoch": 0.5795624405489876, + "grad_norm": 0.5966993570327759, + "learning_rate": 4.637773474656883e-05, + "loss": 1.3855, + "step": 8530 + }, + { + "epoch": 0.5799021606196494, + "grad_norm": 0.7263109683990479, + "learning_rate": 4.6375611496127194e-05, + "loss": 1.3474, + "step": 8535 + }, + { + "epoch": 0.5802418806903111, + "grad_norm": 0.6226108074188232, + "learning_rate": 4.637348824568556e-05, + "loss": 1.3671, + "step": 8540 + }, + { + "epoch": 0.580581600760973, + "grad_norm": 0.5628734230995178, + "learning_rate": 4.637136499524392e-05, + "loss": 1.4494, + "step": 8545 + }, + { + "epoch": 0.5809213208316347, + "grad_norm": 0.5334903001785278, + "learning_rate": 4.6369241744802286e-05, + "loss": 1.3946, + "step": 8550 + }, + { + "epoch": 0.5812610409022965, + "grad_norm": 0.6473321318626404, + "learning_rate": 4.636711849436065e-05, + "loss": 1.3325, + "step": 8555 + }, + { + "epoch": 0.5816007609729583, + "grad_norm": 0.6834282875061035, + "learning_rate": 4.6364995243919014e-05, + "loss": 1.3812, + "step": 8560 + }, + { + "epoch": 0.5819404810436201, + "grad_norm": 0.6114623546600342, + "learning_rate": 4.636287199347737e-05, + "loss": 1.3266, + "step": 8565 + }, + { + "epoch": 0.5822802011142818, + "grad_norm": 0.6906845569610596, + "learning_rate": 4.636074874303574e-05, + "loss": 1.4233, + "step": 8570 + }, + { + "epoch": 0.5826199211849437, + "grad_norm": 0.5607256889343262, + "learning_rate": 4.6358625492594106e-05, + "loss": 1.4599, + "step": 8575 + }, + { + "epoch": 0.5829596412556054, + "grad_norm": 0.6390836238861084, + "learning_rate": 4.6356502242152464e-05, + "loss": 1.426, + "step": 8580 + }, + { + "epoch": 0.5832993613262671, + "grad_norm": 0.5814246535301208, + "learning_rate": 4.6354378991710834e-05, + "loss": 1.3193, + "step": 8585 + }, + { + "epoch": 0.5836390813969289, + "grad_norm": 0.6813691258430481, + "learning_rate": 4.63522557412692e-05, + "loss": 1.4357, + "step": 8590 + }, + { + "epoch": 0.5839788014675907, + "grad_norm": 0.6934071183204651, + "learning_rate": 4.6350132490827556e-05, + "loss": 1.3752, + "step": 8595 + }, + { + "epoch": 0.5843185215382525, + "grad_norm": 0.6603383421897888, + "learning_rate": 4.6348009240385926e-05, + "loss": 1.2453, + "step": 8600 + }, + { + "epoch": 0.5846582416089142, + "grad_norm": 0.71683669090271, + "learning_rate": 4.634588598994429e-05, + "loss": 1.5543, + "step": 8605 + }, + { + "epoch": 0.5849979616795761, + "grad_norm": 0.6986023783683777, + "learning_rate": 4.634376273950265e-05, + "loss": 1.4098, + "step": 8610 + }, + { + "epoch": 0.5853376817502378, + "grad_norm": 0.6881933212280273, + "learning_rate": 4.634163948906102e-05, + "loss": 1.4021, + "step": 8615 + }, + { + "epoch": 0.5856774018208996, + "grad_norm": 0.6830471158027649, + "learning_rate": 4.633951623861938e-05, + "loss": 1.3863, + "step": 8620 + }, + { + "epoch": 0.5860171218915613, + "grad_norm": 0.5090907216072083, + "learning_rate": 4.633739298817774e-05, + "loss": 1.4505, + "step": 8625 + }, + { + "epoch": 0.5863568419622232, + "grad_norm": 0.6305417418479919, + "learning_rate": 4.633526973773611e-05, + "loss": 1.3958, + "step": 8630 + }, + { + "epoch": 0.5866965620328849, + "grad_norm": 0.627467691898346, + "learning_rate": 4.633314648729447e-05, + "loss": 1.4374, + "step": 8635 + }, + { + "epoch": 0.5870362821035466, + "grad_norm": 0.6345938444137573, + "learning_rate": 4.633102323685283e-05, + "loss": 1.2474, + "step": 8640 + }, + { + "epoch": 0.5873760021742085, + "grad_norm": 0.6117781400680542, + "learning_rate": 4.63288999864112e-05, + "loss": 1.3993, + "step": 8645 + }, + { + "epoch": 0.5877157222448702, + "grad_norm": 0.6087744235992432, + "learning_rate": 4.632677673596956e-05, + "loss": 1.5465, + "step": 8650 + }, + { + "epoch": 0.588055442315532, + "grad_norm": 0.6593524217605591, + "learning_rate": 4.6324653485527924e-05, + "loss": 1.3562, + "step": 8655 + }, + { + "epoch": 0.5883951623861938, + "grad_norm": 0.6665415167808533, + "learning_rate": 4.6322530235086294e-05, + "loss": 1.4428, + "step": 8660 + }, + { + "epoch": 0.5887348824568556, + "grad_norm": 0.6231390237808228, + "learning_rate": 4.632040698464465e-05, + "loss": 1.3792, + "step": 8665 + }, + { + "epoch": 0.5890746025275173, + "grad_norm": 0.5803211331367493, + "learning_rate": 4.6318283734203016e-05, + "loss": 1.3635, + "step": 8670 + }, + { + "epoch": 0.589414322598179, + "grad_norm": 0.6013233065605164, + "learning_rate": 4.6316160483761386e-05, + "loss": 1.515, + "step": 8675 + }, + { + "epoch": 0.5897540426688409, + "grad_norm": 0.6222535967826843, + "learning_rate": 4.6314037233319744e-05, + "loss": 1.4195, + "step": 8680 + }, + { + "epoch": 0.5900937627395026, + "grad_norm": 0.6328880786895752, + "learning_rate": 4.631191398287811e-05, + "loss": 1.3758, + "step": 8685 + }, + { + "epoch": 0.5904334828101644, + "grad_norm": 0.6459696292877197, + "learning_rate": 4.630979073243648e-05, + "loss": 1.2561, + "step": 8690 + }, + { + "epoch": 0.5907732028808262, + "grad_norm": 0.5639860033988953, + "learning_rate": 4.6307667481994836e-05, + "loss": 1.267, + "step": 8695 + }, + { + "epoch": 0.591112922951488, + "grad_norm": 0.6313180327415466, + "learning_rate": 4.6305544231553206e-05, + "loss": 1.3963, + "step": 8700 + }, + { + "epoch": 0.5914526430221497, + "grad_norm": 0.6009690761566162, + "learning_rate": 4.630342098111157e-05, + "loss": 1.3293, + "step": 8705 + }, + { + "epoch": 0.5917923630928115, + "grad_norm": 0.5952971577644348, + "learning_rate": 4.630129773066993e-05, + "loss": 1.4078, + "step": 8710 + }, + { + "epoch": 0.5921320831634733, + "grad_norm": 0.6673296689987183, + "learning_rate": 4.62991744802283e-05, + "loss": 1.2694, + "step": 8715 + }, + { + "epoch": 0.5924718032341351, + "grad_norm": 0.5815384984016418, + "learning_rate": 4.6297051229786656e-05, + "loss": 1.2968, + "step": 8720 + }, + { + "epoch": 0.5928115233047968, + "grad_norm": 0.6946033239364624, + "learning_rate": 4.629492797934502e-05, + "loss": 1.3911, + "step": 8725 + }, + { + "epoch": 0.5931512433754587, + "grad_norm": 0.6673735976219177, + "learning_rate": 4.629280472890339e-05, + "loss": 1.3914, + "step": 8730 + }, + { + "epoch": 0.5934909634461204, + "grad_norm": 0.7089139819145203, + "learning_rate": 4.629068147846175e-05, + "loss": 1.428, + "step": 8735 + }, + { + "epoch": 0.5938306835167821, + "grad_norm": 0.6138942837715149, + "learning_rate": 4.628855822802011e-05, + "loss": 1.3783, + "step": 8740 + }, + { + "epoch": 0.594170403587444, + "grad_norm": 0.6514503359794617, + "learning_rate": 4.628643497757848e-05, + "loss": 1.3654, + "step": 8745 + }, + { + "epoch": 0.5945101236581057, + "grad_norm": 0.6299140453338623, + "learning_rate": 4.628431172713684e-05, + "loss": 1.4794, + "step": 8750 + }, + { + "epoch": 0.5948498437287675, + "grad_norm": 0.5932992696762085, + "learning_rate": 4.6282188476695204e-05, + "loss": 1.342, + "step": 8755 + }, + { + "epoch": 0.5951895637994292, + "grad_norm": 0.9953003525733948, + "learning_rate": 4.6280065226253575e-05, + "loss": 1.4064, + "step": 8760 + }, + { + "epoch": 0.5955292838700911, + "grad_norm": 0.6479867696762085, + "learning_rate": 4.627794197581193e-05, + "loss": 1.3773, + "step": 8765 + }, + { + "epoch": 0.5958690039407528, + "grad_norm": 0.6373013854026794, + "learning_rate": 4.6275818725370296e-05, + "loss": 1.3575, + "step": 8770 + }, + { + "epoch": 0.5962087240114146, + "grad_norm": 0.6302305459976196, + "learning_rate": 4.6273695474928667e-05, + "loss": 1.3272, + "step": 8775 + }, + { + "epoch": 0.5965484440820764, + "grad_norm": 0.676969051361084, + "learning_rate": 4.6271572224487024e-05, + "loss": 1.3772, + "step": 8780 + }, + { + "epoch": 0.5968881641527382, + "grad_norm": 0.5809783339500427, + "learning_rate": 4.626944897404539e-05, + "loss": 1.322, + "step": 8785 + }, + { + "epoch": 0.5972278842233999, + "grad_norm": 0.5713885426521301, + "learning_rate": 4.626732572360375e-05, + "loss": 1.4192, + "step": 8790 + }, + { + "epoch": 0.5975676042940616, + "grad_norm": 0.6405071020126343, + "learning_rate": 4.6265202473162116e-05, + "loss": 1.4343, + "step": 8795 + }, + { + "epoch": 0.5979073243647235, + "grad_norm": 0.6022502183914185, + "learning_rate": 4.626307922272048e-05, + "loss": 1.3822, + "step": 8800 + }, + { + "epoch": 0.5982470444353852, + "grad_norm": 0.7166122198104858, + "learning_rate": 4.6260955972278844e-05, + "loss": 1.3959, + "step": 8805 + }, + { + "epoch": 0.598586764506047, + "grad_norm": 0.5999231934547424, + "learning_rate": 4.625883272183721e-05, + "loss": 1.3961, + "step": 8810 + }, + { + "epoch": 0.5989264845767088, + "grad_norm": 0.6405361294746399, + "learning_rate": 4.625670947139557e-05, + "loss": 1.3848, + "step": 8815 + }, + { + "epoch": 0.5992662046473706, + "grad_norm": 0.6125131845474243, + "learning_rate": 4.6254586220953936e-05, + "loss": 1.2984, + "step": 8820 + }, + { + "epoch": 0.5996059247180323, + "grad_norm": 0.555432915687561, + "learning_rate": 4.62524629705123e-05, + "loss": 1.3137, + "step": 8825 + }, + { + "epoch": 0.5999456447886942, + "grad_norm": 0.6365654468536377, + "learning_rate": 4.6250339720070664e-05, + "loss": 1.294, + "step": 8830 + }, + { + "epoch": 0.6002853648593559, + "grad_norm": 0.5567549467086792, + "learning_rate": 4.624821646962903e-05, + "loss": 1.4571, + "step": 8835 + }, + { + "epoch": 0.6006250849300176, + "grad_norm": 0.6197007298469543, + "learning_rate": 4.624609321918739e-05, + "loss": 1.4303, + "step": 8840 + }, + { + "epoch": 0.6009648050006794, + "grad_norm": 0.6491262912750244, + "learning_rate": 4.6243969968745756e-05, + "loss": 1.3946, + "step": 8845 + }, + { + "epoch": 0.6013045250713412, + "grad_norm": 0.6252356171607971, + "learning_rate": 4.624184671830412e-05, + "loss": 1.4354, + "step": 8850 + }, + { + "epoch": 0.601644245142003, + "grad_norm": 0.6818503141403198, + "learning_rate": 4.6239723467862484e-05, + "loss": 1.3727, + "step": 8855 + }, + { + "epoch": 0.6019839652126647, + "grad_norm": 0.5524508357048035, + "learning_rate": 4.623760021742085e-05, + "loss": 1.3857, + "step": 8860 + }, + { + "epoch": 0.6023236852833266, + "grad_norm": 0.6112775206565857, + "learning_rate": 4.623547696697921e-05, + "loss": 1.3369, + "step": 8865 + }, + { + "epoch": 0.6026634053539883, + "grad_norm": 0.6917428374290466, + "learning_rate": 4.6233353716537576e-05, + "loss": 1.4105, + "step": 8870 + }, + { + "epoch": 0.6030031254246501, + "grad_norm": 0.6560357213020325, + "learning_rate": 4.623123046609594e-05, + "loss": 1.3502, + "step": 8875 + }, + { + "epoch": 0.6033428454953118, + "grad_norm": 0.6382858753204346, + "learning_rate": 4.6229107215654304e-05, + "loss": 1.2918, + "step": 8880 + }, + { + "epoch": 0.6036825655659737, + "grad_norm": 0.7183751463890076, + "learning_rate": 4.622698396521267e-05, + "loss": 1.3846, + "step": 8885 + }, + { + "epoch": 0.6040222856366354, + "grad_norm": 0.627293050289154, + "learning_rate": 4.622486071477103e-05, + "loss": 1.4626, + "step": 8890 + }, + { + "epoch": 0.6043620057072971, + "grad_norm": 0.6370736360549927, + "learning_rate": 4.6222737464329396e-05, + "loss": 1.31, + "step": 8895 + }, + { + "epoch": 0.604701725777959, + "grad_norm": 0.6612152457237244, + "learning_rate": 4.622061421388776e-05, + "loss": 1.4417, + "step": 8900 + }, + { + "epoch": 0.6050414458486207, + "grad_norm": 0.6770370006561279, + "learning_rate": 4.6218490963446124e-05, + "loss": 1.3413, + "step": 8905 + }, + { + "epoch": 0.6053811659192825, + "grad_norm": 0.6521413326263428, + "learning_rate": 4.621636771300449e-05, + "loss": 1.4035, + "step": 8910 + }, + { + "epoch": 0.6057208859899443, + "grad_norm": 0.6531280279159546, + "learning_rate": 4.621424446256285e-05, + "loss": 1.2742, + "step": 8915 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.6770360469818115, + "learning_rate": 4.621212121212121e-05, + "loss": 1.4134, + "step": 8920 + }, + { + "epoch": 0.6064003261312678, + "grad_norm": 0.6506828665733337, + "learning_rate": 4.620999796167958e-05, + "loss": 1.3772, + "step": 8925 + }, + { + "epoch": 0.6067400462019296, + "grad_norm": 0.6468803882598877, + "learning_rate": 4.6207874711237944e-05, + "loss": 1.3124, + "step": 8930 + }, + { + "epoch": 0.6070797662725914, + "grad_norm": 0.6341661810874939, + "learning_rate": 4.62057514607963e-05, + "loss": 1.4299, + "step": 8935 + }, + { + "epoch": 0.6074194863432532, + "grad_norm": 0.6457089781761169, + "learning_rate": 4.620362821035467e-05, + "loss": 1.4044, + "step": 8940 + }, + { + "epoch": 0.6077592064139149, + "grad_norm": 0.7401508092880249, + "learning_rate": 4.6201504959913036e-05, + "loss": 1.3108, + "step": 8945 + }, + { + "epoch": 0.6080989264845768, + "grad_norm": 0.6043117046356201, + "learning_rate": 4.619938170947139e-05, + "loss": 1.3466, + "step": 8950 + }, + { + "epoch": 0.6084386465552385, + "grad_norm": 0.6644378900527954, + "learning_rate": 4.6197258459029764e-05, + "loss": 1.3746, + "step": 8955 + }, + { + "epoch": 0.6087783666259002, + "grad_norm": 0.6129704713821411, + "learning_rate": 4.619513520858813e-05, + "loss": 1.376, + "step": 8960 + }, + { + "epoch": 0.609118086696562, + "grad_norm": 0.7116718888282776, + "learning_rate": 4.6193011958146485e-05, + "loss": 1.3112, + "step": 8965 + }, + { + "epoch": 0.6094578067672238, + "grad_norm": 0.6450315713882446, + "learning_rate": 4.6190888707704856e-05, + "loss": 1.4244, + "step": 8970 + }, + { + "epoch": 0.6097975268378856, + "grad_norm": 0.708161473274231, + "learning_rate": 4.618876545726322e-05, + "loss": 1.3267, + "step": 8975 + }, + { + "epoch": 0.6101372469085473, + "grad_norm": 0.5978407859802246, + "learning_rate": 4.618664220682158e-05, + "loss": 1.3002, + "step": 8980 + }, + { + "epoch": 0.6104769669792092, + "grad_norm": 0.7121630907058716, + "learning_rate": 4.618451895637995e-05, + "loss": 1.364, + "step": 8985 + }, + { + "epoch": 0.6108166870498709, + "grad_norm": 0.6190058588981628, + "learning_rate": 4.6182395705938305e-05, + "loss": 1.3812, + "step": 8990 + }, + { + "epoch": 0.6111564071205327, + "grad_norm": 0.621415913105011, + "learning_rate": 4.618027245549667e-05, + "loss": 1.3124, + "step": 8995 + }, + { + "epoch": 0.6114961271911945, + "grad_norm": 0.6940446496009827, + "learning_rate": 4.617814920505504e-05, + "loss": 1.3833, + "step": 9000 + }, + { + "epoch": 0.6118358472618562, + "grad_norm": 0.6798141002655029, + "learning_rate": 4.61760259546134e-05, + "loss": 1.3791, + "step": 9005 + }, + { + "epoch": 0.612175567332518, + "grad_norm": 0.6257941126823425, + "learning_rate": 4.617390270417176e-05, + "loss": 1.3625, + "step": 9010 + }, + { + "epoch": 0.6125152874031797, + "grad_norm": 0.7014443278312683, + "learning_rate": 4.617177945373013e-05, + "loss": 1.3498, + "step": 9015 + }, + { + "epoch": 0.6128550074738416, + "grad_norm": 0.6132319569587708, + "learning_rate": 4.616965620328849e-05, + "loss": 1.2609, + "step": 9020 + }, + { + "epoch": 0.6131947275445033, + "grad_norm": 0.665908694267273, + "learning_rate": 4.616753295284685e-05, + "loss": 1.2704, + "step": 9025 + }, + { + "epoch": 0.6135344476151651, + "grad_norm": 0.4162493050098419, + "learning_rate": 4.6165409702405224e-05, + "loss": 1.3329, + "step": 9030 + }, + { + "epoch": 0.6138741676858269, + "grad_norm": 0.6778438091278076, + "learning_rate": 4.616328645196358e-05, + "loss": 1.4025, + "step": 9035 + }, + { + "epoch": 0.6142138877564887, + "grad_norm": 0.6079490780830383, + "learning_rate": 4.616116320152195e-05, + "loss": 1.4282, + "step": 9040 + }, + { + "epoch": 0.6145536078271504, + "grad_norm": 0.6796638369560242, + "learning_rate": 4.6159039951080316e-05, + "loss": 1.3924, + "step": 9045 + }, + { + "epoch": 0.6148933278978121, + "grad_norm": 0.6430286765098572, + "learning_rate": 4.615691670063867e-05, + "loss": 1.3837, + "step": 9050 + }, + { + "epoch": 0.615233047968474, + "grad_norm": 0.6890765428543091, + "learning_rate": 4.6154793450197044e-05, + "loss": 1.5043, + "step": 9055 + }, + { + "epoch": 0.6155727680391357, + "grad_norm": 0.634694516658783, + "learning_rate": 4.61526701997554e-05, + "loss": 1.3749, + "step": 9060 + }, + { + "epoch": 0.6159124881097975, + "grad_norm": 0.6552861928939819, + "learning_rate": 4.6150546949313765e-05, + "loss": 1.4356, + "step": 9065 + }, + { + "epoch": 0.6162522081804593, + "grad_norm": 0.6340364217758179, + "learning_rate": 4.6148423698872136e-05, + "loss": 1.3417, + "step": 9070 + }, + { + "epoch": 0.6165919282511211, + "grad_norm": 0.6809887886047363, + "learning_rate": 4.614630044843049e-05, + "loss": 1.3178, + "step": 9075 + }, + { + "epoch": 0.6169316483217828, + "grad_norm": 0.6862459182739258, + "learning_rate": 4.614417719798886e-05, + "loss": 1.334, + "step": 9080 + }, + { + "epoch": 0.6172713683924447, + "grad_norm": 0.6462175250053406, + "learning_rate": 4.614205394754723e-05, + "loss": 1.3098, + "step": 9085 + }, + { + "epoch": 0.6176110884631064, + "grad_norm": 0.6594539284706116, + "learning_rate": 4.6139930697105585e-05, + "loss": 1.3112, + "step": 9090 + }, + { + "epoch": 0.6179508085337682, + "grad_norm": 0.6995545625686646, + "learning_rate": 4.613780744666395e-05, + "loss": 1.3699, + "step": 9095 + }, + { + "epoch": 0.6182905286044299, + "grad_norm": 0.6023643016815186, + "learning_rate": 4.613568419622232e-05, + "loss": 1.446, + "step": 9100 + }, + { + "epoch": 0.6186302486750918, + "grad_norm": 0.6412670016288757, + "learning_rate": 4.613356094578068e-05, + "loss": 1.3672, + "step": 9105 + }, + { + "epoch": 0.6189699687457535, + "grad_norm": 0.6643955707550049, + "learning_rate": 4.613143769533904e-05, + "loss": 1.2998, + "step": 9110 + }, + { + "epoch": 0.6193096888164152, + "grad_norm": 0.7588699460029602, + "learning_rate": 4.612931444489741e-05, + "loss": 1.4095, + "step": 9115 + }, + { + "epoch": 0.6196494088870771, + "grad_norm": 0.5423884987831116, + "learning_rate": 4.612719119445577e-05, + "loss": 1.3938, + "step": 9120 + }, + { + "epoch": 0.6199891289577388, + "grad_norm": 0.5800355672836304, + "learning_rate": 4.612506794401413e-05, + "loss": 1.358, + "step": 9125 + }, + { + "epoch": 0.6203288490284006, + "grad_norm": 0.7485507130622864, + "learning_rate": 4.6122944693572504e-05, + "loss": 1.3613, + "step": 9130 + }, + { + "epoch": 0.6206685690990623, + "grad_norm": 0.6042966842651367, + "learning_rate": 4.612082144313086e-05, + "loss": 1.4026, + "step": 9135 + }, + { + "epoch": 0.6210082891697242, + "grad_norm": 0.6381845474243164, + "learning_rate": 4.6118698192689225e-05, + "loss": 1.3385, + "step": 9140 + }, + { + "epoch": 0.6213480092403859, + "grad_norm": 0.6302849650382996, + "learning_rate": 4.611657494224759e-05, + "loss": 1.342, + "step": 9145 + }, + { + "epoch": 0.6216877293110477, + "grad_norm": 0.6172165870666504, + "learning_rate": 4.611445169180595e-05, + "loss": 1.4811, + "step": 9150 + }, + { + "epoch": 0.6220274493817095, + "grad_norm": 0.641302764415741, + "learning_rate": 4.611232844136432e-05, + "loss": 1.3648, + "step": 9155 + }, + { + "epoch": 0.6223671694523712, + "grad_norm": 0.6315203905105591, + "learning_rate": 4.611020519092268e-05, + "loss": 1.4134, + "step": 9160 + }, + { + "epoch": 0.622706889523033, + "grad_norm": 0.67299485206604, + "learning_rate": 4.6108081940481045e-05, + "loss": 1.3513, + "step": 9165 + }, + { + "epoch": 0.6230466095936948, + "grad_norm": 0.6160862445831299, + "learning_rate": 4.610595869003941e-05, + "loss": 1.3066, + "step": 9170 + }, + { + "epoch": 0.6233863296643566, + "grad_norm": 0.7062851190567017, + "learning_rate": 4.610383543959777e-05, + "loss": 1.3928, + "step": 9175 + }, + { + "epoch": 0.6237260497350183, + "grad_norm": 0.8130349516868591, + "learning_rate": 4.610171218915614e-05, + "loss": 1.345, + "step": 9180 + }, + { + "epoch": 0.6240657698056801, + "grad_norm": 0.739722490310669, + "learning_rate": 4.60995889387145e-05, + "loss": 1.4844, + "step": 9185 + }, + { + "epoch": 0.6244054898763419, + "grad_norm": 0.6197490692138672, + "learning_rate": 4.6097465688272865e-05, + "loss": 1.417, + "step": 9190 + }, + { + "epoch": 0.6247452099470037, + "grad_norm": 0.642519474029541, + "learning_rate": 4.609534243783123e-05, + "loss": 1.4504, + "step": 9195 + }, + { + "epoch": 0.6250849300176654, + "grad_norm": 0.6041985750198364, + "learning_rate": 4.609321918738959e-05, + "loss": 1.3105, + "step": 9200 + }, + { + "epoch": 0.6254246500883273, + "grad_norm": 0.7667335867881775, + "learning_rate": 4.609109593694796e-05, + "loss": 1.407, + "step": 9205 + }, + { + "epoch": 0.625764370158989, + "grad_norm": 0.6304047703742981, + "learning_rate": 4.608897268650632e-05, + "loss": 1.4449, + "step": 9210 + }, + { + "epoch": 0.6261040902296507, + "grad_norm": 0.6934990882873535, + "learning_rate": 4.6086849436064685e-05, + "loss": 1.344, + "step": 9215 + }, + { + "epoch": 0.6264438103003126, + "grad_norm": 0.6668086051940918, + "learning_rate": 4.608472618562305e-05, + "loss": 1.4584, + "step": 9220 + }, + { + "epoch": 0.6267835303709743, + "grad_norm": 0.711803138256073, + "learning_rate": 4.608260293518141e-05, + "loss": 1.4036, + "step": 9225 + }, + { + "epoch": 0.6271232504416361, + "grad_norm": 0.6676630973815918, + "learning_rate": 4.608047968473978e-05, + "loss": 1.3854, + "step": 9230 + }, + { + "epoch": 0.6274629705122978, + "grad_norm": 0.5944895148277283, + "learning_rate": 4.607835643429814e-05, + "loss": 1.3905, + "step": 9235 + }, + { + "epoch": 0.6278026905829597, + "grad_norm": 0.5850406289100647, + "learning_rate": 4.6076233183856505e-05, + "loss": 1.3627, + "step": 9240 + }, + { + "epoch": 0.6281424106536214, + "grad_norm": 0.7356735467910767, + "learning_rate": 4.607410993341487e-05, + "loss": 1.436, + "step": 9245 + }, + { + "epoch": 0.6284821307242832, + "grad_norm": 0.7528555989265442, + "learning_rate": 4.607198668297323e-05, + "loss": 1.3113, + "step": 9250 + }, + { + "epoch": 0.628821850794945, + "grad_norm": 0.6859064698219299, + "learning_rate": 4.60698634325316e-05, + "loss": 1.3161, + "step": 9255 + }, + { + "epoch": 0.6291615708656068, + "grad_norm": 0.642941415309906, + "learning_rate": 4.6067740182089955e-05, + "loss": 1.4547, + "step": 9260 + }, + { + "epoch": 0.6295012909362685, + "grad_norm": 0.6627421379089355, + "learning_rate": 4.6065616931648325e-05, + "loss": 1.2929, + "step": 9265 + }, + { + "epoch": 0.6298410110069302, + "grad_norm": 0.6617950201034546, + "learning_rate": 4.606349368120669e-05, + "loss": 1.3382, + "step": 9270 + }, + { + "epoch": 0.6301807310775921, + "grad_norm": 0.6738972663879395, + "learning_rate": 4.6061370430765047e-05, + "loss": 1.3662, + "step": 9275 + }, + { + "epoch": 0.6305204511482538, + "grad_norm": 0.5935032963752747, + "learning_rate": 4.605924718032342e-05, + "loss": 1.3452, + "step": 9280 + }, + { + "epoch": 0.6308601712189156, + "grad_norm": 0.641117513179779, + "learning_rate": 4.605712392988178e-05, + "loss": 1.4126, + "step": 9285 + }, + { + "epoch": 0.6311998912895774, + "grad_norm": 0.6209895014762878, + "learning_rate": 4.605500067944014e-05, + "loss": 1.3441, + "step": 9290 + }, + { + "epoch": 0.6315396113602392, + "grad_norm": 0.6649410724639893, + "learning_rate": 4.605287742899851e-05, + "loss": 1.4278, + "step": 9295 + }, + { + "epoch": 0.6318793314309009, + "grad_norm": 0.6906175017356873, + "learning_rate": 4.6050754178556873e-05, + "loss": 1.3647, + "step": 9300 + }, + { + "epoch": 0.6322190515015628, + "grad_norm": 0.5536985993385315, + "learning_rate": 4.604863092811523e-05, + "loss": 1.3098, + "step": 9305 + }, + { + "epoch": 0.6325587715722245, + "grad_norm": 0.6450605988502502, + "learning_rate": 4.60465076776736e-05, + "loss": 1.322, + "step": 9310 + }, + { + "epoch": 0.6328984916428863, + "grad_norm": 0.6113890409469604, + "learning_rate": 4.6044384427231965e-05, + "loss": 1.2808, + "step": 9315 + }, + { + "epoch": 0.633238211713548, + "grad_norm": 0.6422805190086365, + "learning_rate": 4.604226117679032e-05, + "loss": 1.2798, + "step": 9320 + }, + { + "epoch": 0.6335779317842098, + "grad_norm": 0.6318135857582092, + "learning_rate": 4.6040137926348693e-05, + "loss": 1.3633, + "step": 9325 + }, + { + "epoch": 0.6339176518548716, + "grad_norm": 0.62293940782547, + "learning_rate": 4.603801467590706e-05, + "loss": 1.4161, + "step": 9330 + }, + { + "epoch": 0.6342573719255333, + "grad_norm": 0.7394024729728699, + "learning_rate": 4.6035891425465415e-05, + "loss": 1.3523, + "step": 9335 + }, + { + "epoch": 0.6345970919961952, + "grad_norm": 0.6278140544891357, + "learning_rate": 4.6033768175023785e-05, + "loss": 1.3903, + "step": 9340 + }, + { + "epoch": 0.6349368120668569, + "grad_norm": 0.702151358127594, + "learning_rate": 4.603164492458214e-05, + "loss": 1.3482, + "step": 9345 + }, + { + "epoch": 0.6352765321375187, + "grad_norm": 0.5546267032623291, + "learning_rate": 4.602952167414051e-05, + "loss": 1.3292, + "step": 9350 + }, + { + "epoch": 0.6356162522081804, + "grad_norm": 0.6712154150009155, + "learning_rate": 4.602739842369888e-05, + "loss": 1.3712, + "step": 9355 + }, + { + "epoch": 0.6359559722788423, + "grad_norm": 0.6703802943229675, + "learning_rate": 4.6025275173257235e-05, + "loss": 1.3229, + "step": 9360 + }, + { + "epoch": 0.636295692349504, + "grad_norm": 0.6230586767196655, + "learning_rate": 4.60231519228156e-05, + "loss": 1.2958, + "step": 9365 + }, + { + "epoch": 0.6366354124201657, + "grad_norm": 0.5519856214523315, + "learning_rate": 4.602102867237397e-05, + "loss": 1.3783, + "step": 9370 + }, + { + "epoch": 0.6369751324908276, + "grad_norm": 0.6753631234169006, + "learning_rate": 4.601890542193233e-05, + "loss": 1.3171, + "step": 9375 + }, + { + "epoch": 0.6373148525614893, + "grad_norm": 0.7067842483520508, + "learning_rate": 4.60167821714907e-05, + "loss": 1.3057, + "step": 9380 + }, + { + "epoch": 0.6376545726321511, + "grad_norm": 0.6295456290245056, + "learning_rate": 4.601465892104906e-05, + "loss": 1.3678, + "step": 9385 + }, + { + "epoch": 0.6379942927028129, + "grad_norm": 0.6710845828056335, + "learning_rate": 4.601253567060742e-05, + "loss": 1.3506, + "step": 9390 + }, + { + "epoch": 0.6383340127734747, + "grad_norm": 0.5668164491653442, + "learning_rate": 4.601041242016579e-05, + "loss": 1.2924, + "step": 9395 + }, + { + "epoch": 0.6386737328441364, + "grad_norm": 0.5902765989303589, + "learning_rate": 4.6008289169724153e-05, + "loss": 1.3587, + "step": 9400 + }, + { + "epoch": 0.6390134529147982, + "grad_norm": 0.6761592626571655, + "learning_rate": 4.600616591928251e-05, + "loss": 1.3528, + "step": 9405 + }, + { + "epoch": 0.63935317298546, + "grad_norm": 0.6213817000389099, + "learning_rate": 4.600404266884088e-05, + "loss": 1.3861, + "step": 9410 + }, + { + "epoch": 0.6396928930561218, + "grad_norm": 0.6939463019371033, + "learning_rate": 4.600191941839924e-05, + "loss": 1.3352, + "step": 9415 + }, + { + "epoch": 0.6400326131267835, + "grad_norm": 0.6145976781845093, + "learning_rate": 4.59997961679576e-05, + "loss": 1.3684, + "step": 9420 + }, + { + "epoch": 0.6403723331974454, + "grad_norm": 0.5914034247398376, + "learning_rate": 4.5997672917515973e-05, + "loss": 1.4238, + "step": 9425 + }, + { + "epoch": 0.6407120532681071, + "grad_norm": 0.6548230051994324, + "learning_rate": 4.599554966707433e-05, + "loss": 1.3183, + "step": 9430 + }, + { + "epoch": 0.6410517733387688, + "grad_norm": 0.6701985001564026, + "learning_rate": 4.5993426416632695e-05, + "loss": 1.2767, + "step": 9435 + }, + { + "epoch": 0.6413914934094306, + "grad_norm": 0.6515761613845825, + "learning_rate": 4.5991303166191066e-05, + "loss": 1.2968, + "step": 9440 + }, + { + "epoch": 0.6417312134800924, + "grad_norm": 0.6450496315956116, + "learning_rate": 4.598917991574942e-05, + "loss": 1.4845, + "step": 9445 + }, + { + "epoch": 0.6420709335507542, + "grad_norm": 0.6368112564086914, + "learning_rate": 4.598705666530779e-05, + "loss": 1.3298, + "step": 9450 + }, + { + "epoch": 0.6424106536214159, + "grad_norm": 0.7002708315849304, + "learning_rate": 4.598493341486616e-05, + "loss": 1.3902, + "step": 9455 + }, + { + "epoch": 0.6427503736920778, + "grad_norm": 0.6493120193481445, + "learning_rate": 4.5982810164424515e-05, + "loss": 1.4037, + "step": 9460 + }, + { + "epoch": 0.6430900937627395, + "grad_norm": 0.6865414381027222, + "learning_rate": 4.598068691398288e-05, + "loss": 1.2946, + "step": 9465 + }, + { + "epoch": 0.6434298138334013, + "grad_norm": 0.6673260927200317, + "learning_rate": 4.597856366354125e-05, + "loss": 1.3747, + "step": 9470 + }, + { + "epoch": 0.6437695339040631, + "grad_norm": 0.6680063009262085, + "learning_rate": 4.597644041309961e-05, + "loss": 1.3585, + "step": 9475 + }, + { + "epoch": 0.6441092539747248, + "grad_norm": 0.6357535123825073, + "learning_rate": 4.597431716265797e-05, + "loss": 1.3602, + "step": 9480 + }, + { + "epoch": 0.6444489740453866, + "grad_norm": 0.7351832985877991, + "learning_rate": 4.5972193912216335e-05, + "loss": 1.4211, + "step": 9485 + }, + { + "epoch": 0.6447886941160483, + "grad_norm": 0.6761483550071716, + "learning_rate": 4.59700706617747e-05, + "loss": 1.4096, + "step": 9490 + }, + { + "epoch": 0.6451284141867102, + "grad_norm": 0.6655097007751465, + "learning_rate": 4.596794741133306e-05, + "loss": 1.373, + "step": 9495 + }, + { + "epoch": 0.6454681342573719, + "grad_norm": 0.6212973594665527, + "learning_rate": 4.596582416089143e-05, + "loss": 1.388, + "step": 9500 + }, + { + "epoch": 0.6458078543280337, + "grad_norm": 0.6282244920730591, + "learning_rate": 4.596370091044979e-05, + "loss": 1.3183, + "step": 9505 + }, + { + "epoch": 0.6461475743986955, + "grad_norm": 0.6414934396743774, + "learning_rate": 4.5961577660008155e-05, + "loss": 1.3416, + "step": 9510 + }, + { + "epoch": 0.6464872944693573, + "grad_norm": 0.6590577960014343, + "learning_rate": 4.595945440956652e-05, + "loss": 1.4681, + "step": 9515 + }, + { + "epoch": 0.646827014540019, + "grad_norm": 0.6200308203697205, + "learning_rate": 4.595733115912488e-05, + "loss": 1.3792, + "step": 9520 + }, + { + "epoch": 0.6471667346106807, + "grad_norm": 0.7878032326698303, + "learning_rate": 4.595520790868325e-05, + "loss": 1.366, + "step": 9525 + }, + { + "epoch": 0.6475064546813426, + "grad_norm": 0.5594908595085144, + "learning_rate": 4.595308465824161e-05, + "loss": 1.3368, + "step": 9530 + }, + { + "epoch": 0.6478461747520043, + "grad_norm": 0.6782909035682678, + "learning_rate": 4.5950961407799975e-05, + "loss": 1.3882, + "step": 9535 + }, + { + "epoch": 0.6481858948226661, + "grad_norm": 0.6730229258537292, + "learning_rate": 4.594883815735834e-05, + "loss": 1.4006, + "step": 9540 + }, + { + "epoch": 0.6485256148933279, + "grad_norm": 0.6307126879692078, + "learning_rate": 4.59467149069167e-05, + "loss": 1.3597, + "step": 9545 + }, + { + "epoch": 0.6488653349639897, + "grad_norm": 0.7037644982337952, + "learning_rate": 4.594459165647507e-05, + "loss": 1.4252, + "step": 9550 + }, + { + "epoch": 0.6492050550346514, + "grad_norm": 0.5555718541145325, + "learning_rate": 4.594246840603343e-05, + "loss": 1.3795, + "step": 9555 + }, + { + "epoch": 0.6495447751053133, + "grad_norm": 0.6250681281089783, + "learning_rate": 4.5940345155591795e-05, + "loss": 1.2769, + "step": 9560 + }, + { + "epoch": 0.649884495175975, + "grad_norm": 0.6747601628303528, + "learning_rate": 4.593822190515016e-05, + "loss": 1.4018, + "step": 9565 + }, + { + "epoch": 0.6502242152466368, + "grad_norm": 0.6627749800682068, + "learning_rate": 4.593609865470852e-05, + "loss": 1.3658, + "step": 9570 + }, + { + "epoch": 0.6505639353172985, + "grad_norm": 0.6767715811729431, + "learning_rate": 4.593397540426689e-05, + "loss": 1.3927, + "step": 9575 + }, + { + "epoch": 0.6509036553879604, + "grad_norm": 0.6625750660896301, + "learning_rate": 4.593185215382525e-05, + "loss": 1.3192, + "step": 9580 + }, + { + "epoch": 0.6512433754586221, + "grad_norm": 0.6238129734992981, + "learning_rate": 4.5929728903383615e-05, + "loss": 1.3058, + "step": 9585 + }, + { + "epoch": 0.6515830955292838, + "grad_norm": 0.5913823843002319, + "learning_rate": 4.592760565294198e-05, + "loss": 1.3467, + "step": 9590 + }, + { + "epoch": 0.6519228155999457, + "grad_norm": 0.6385979652404785, + "learning_rate": 4.592548240250034e-05, + "loss": 1.4386, + "step": 9595 + }, + { + "epoch": 0.6522625356706074, + "grad_norm": 0.6688828468322754, + "learning_rate": 4.592335915205871e-05, + "loss": 1.43, + "step": 9600 + }, + { + "epoch": 0.6526022557412692, + "grad_norm": 0.662375271320343, + "learning_rate": 4.592123590161707e-05, + "loss": 1.3591, + "step": 9605 + }, + { + "epoch": 0.6529419758119309, + "grad_norm": 0.6747382283210754, + "learning_rate": 4.5919112651175435e-05, + "loss": 1.362, + "step": 9610 + }, + { + "epoch": 0.6532816958825928, + "grad_norm": 0.6197258830070496, + "learning_rate": 4.591698940073379e-05, + "loss": 1.3253, + "step": 9615 + }, + { + "epoch": 0.6536214159532545, + "grad_norm": 0.7278828024864197, + "learning_rate": 4.591486615029216e-05, + "loss": 1.3492, + "step": 9620 + }, + { + "epoch": 0.6539611360239163, + "grad_norm": 0.6371850967407227, + "learning_rate": 4.591274289985053e-05, + "loss": 1.4012, + "step": 9625 + }, + { + "epoch": 0.6543008560945781, + "grad_norm": 0.6267068386077881, + "learning_rate": 4.5910619649408884e-05, + "loss": 1.3798, + "step": 9630 + }, + { + "epoch": 0.6546405761652399, + "grad_norm": 0.666438102722168, + "learning_rate": 4.5908496398967255e-05, + "loss": 1.3579, + "step": 9635 + }, + { + "epoch": 0.6549802962359016, + "grad_norm": 0.7566459774971008, + "learning_rate": 4.590637314852562e-05, + "loss": 1.3679, + "step": 9640 + }, + { + "epoch": 0.6553200163065634, + "grad_norm": 0.6004300713539124, + "learning_rate": 4.5904249898083976e-05, + "loss": 1.3402, + "step": 9645 + }, + { + "epoch": 0.6556597363772252, + "grad_norm": 0.6538270115852356, + "learning_rate": 4.590212664764235e-05, + "loss": 1.3925, + "step": 9650 + }, + { + "epoch": 0.6559994564478869, + "grad_norm": 0.6357282400131226, + "learning_rate": 4.590000339720071e-05, + "loss": 1.3934, + "step": 9655 + }, + { + "epoch": 0.6563391765185487, + "grad_norm": 0.6436564326286316, + "learning_rate": 4.589788014675907e-05, + "loss": 1.439, + "step": 9660 + }, + { + "epoch": 0.6566788965892105, + "grad_norm": 0.6942059993743896, + "learning_rate": 4.589575689631744e-05, + "loss": 1.3961, + "step": 9665 + }, + { + "epoch": 0.6570186166598723, + "grad_norm": 0.6094388961791992, + "learning_rate": 4.58936336458758e-05, + "loss": 1.3108, + "step": 9670 + }, + { + "epoch": 0.657358336730534, + "grad_norm": 0.8097217679023743, + "learning_rate": 4.589151039543416e-05, + "loss": 1.2711, + "step": 9675 + }, + { + "epoch": 0.6576980568011959, + "grad_norm": 0.6026155948638916, + "learning_rate": 4.588938714499253e-05, + "loss": 1.3313, + "step": 9680 + }, + { + "epoch": 0.6580377768718576, + "grad_norm": 0.7359176874160767, + "learning_rate": 4.588726389455089e-05, + "loss": 1.2888, + "step": 9685 + }, + { + "epoch": 0.6583774969425193, + "grad_norm": 0.6421589255332947, + "learning_rate": 4.588514064410925e-05, + "loss": 1.3364, + "step": 9690 + }, + { + "epoch": 0.6587172170131811, + "grad_norm": 0.6737176775932312, + "learning_rate": 4.588301739366762e-05, + "loss": 1.3345, + "step": 9695 + }, + { + "epoch": 0.6590569370838429, + "grad_norm": 0.6418071389198303, + "learning_rate": 4.588089414322598e-05, + "loss": 1.3952, + "step": 9700 + }, + { + "epoch": 0.6593966571545047, + "grad_norm": 0.7056443095207214, + "learning_rate": 4.5878770892784344e-05, + "loss": 1.3663, + "step": 9705 + }, + { + "epoch": 0.6597363772251664, + "grad_norm": 0.6683934926986694, + "learning_rate": 4.5876647642342715e-05, + "loss": 1.3471, + "step": 9710 + }, + { + "epoch": 0.6600760972958283, + "grad_norm": 0.6746886372566223, + "learning_rate": 4.587452439190107e-05, + "loss": 1.3707, + "step": 9715 + }, + { + "epoch": 0.66041581736649, + "grad_norm": 0.634894073009491, + "learning_rate": 4.587240114145944e-05, + "loss": 1.4886, + "step": 9720 + }, + { + "epoch": 0.6607555374371518, + "grad_norm": 0.6497269868850708, + "learning_rate": 4.587027789101781e-05, + "loss": 1.3776, + "step": 9725 + }, + { + "epoch": 0.6610952575078136, + "grad_norm": 0.577656090259552, + "learning_rate": 4.5868154640576164e-05, + "loss": 1.2446, + "step": 9730 + }, + { + "epoch": 0.6614349775784754, + "grad_norm": 0.5996268391609192, + "learning_rate": 4.5866031390134535e-05, + "loss": 1.3608, + "step": 9735 + }, + { + "epoch": 0.6617746976491371, + "grad_norm": 0.602403461933136, + "learning_rate": 4.58639081396929e-05, + "loss": 1.4305, + "step": 9740 + }, + { + "epoch": 0.6621144177197988, + "grad_norm": 0.5692216157913208, + "learning_rate": 4.5861784889251256e-05, + "loss": 1.376, + "step": 9745 + }, + { + "epoch": 0.6624541377904607, + "grad_norm": 0.6115549206733704, + "learning_rate": 4.585966163880963e-05, + "loss": 1.3595, + "step": 9750 + }, + { + "epoch": 0.6627938578611224, + "grad_norm": 0.6958981156349182, + "learning_rate": 4.585753838836799e-05, + "loss": 1.3745, + "step": 9755 + }, + { + "epoch": 0.6631335779317842, + "grad_norm": 0.6716538667678833, + "learning_rate": 4.585541513792635e-05, + "loss": 1.3938, + "step": 9760 + }, + { + "epoch": 0.663473298002446, + "grad_norm": 0.641048014163971, + "learning_rate": 4.585329188748472e-05, + "loss": 1.2594, + "step": 9765 + }, + { + "epoch": 0.6638130180731078, + "grad_norm": 0.6822525858879089, + "learning_rate": 4.5851168637043076e-05, + "loss": 1.4685, + "step": 9770 + }, + { + "epoch": 0.6641527381437695, + "grad_norm": 0.617434024810791, + "learning_rate": 4.584904538660144e-05, + "loss": 1.3618, + "step": 9775 + }, + { + "epoch": 0.6644924582144313, + "grad_norm": 0.6560618281364441, + "learning_rate": 4.584692213615981e-05, + "loss": 1.4008, + "step": 9780 + }, + { + "epoch": 0.6648321782850931, + "grad_norm": 0.7547979950904846, + "learning_rate": 4.584479888571817e-05, + "loss": 1.4516, + "step": 9785 + }, + { + "epoch": 0.6651718983557549, + "grad_norm": 0.615734338760376, + "learning_rate": 4.584267563527653e-05, + "loss": 1.4049, + "step": 9790 + }, + { + "epoch": 0.6655116184264166, + "grad_norm": 0.741465151309967, + "learning_rate": 4.58405523848349e-05, + "loss": 1.2591, + "step": 9795 + }, + { + "epoch": 0.6658513384970784, + "grad_norm": 0.5768656730651855, + "learning_rate": 4.583842913439326e-05, + "loss": 1.2828, + "step": 9800 + }, + { + "epoch": 0.6661910585677402, + "grad_norm": 0.6337997317314148, + "learning_rate": 4.5836305883951624e-05, + "loss": 1.3354, + "step": 9805 + }, + { + "epoch": 0.6665307786384019, + "grad_norm": 0.6725940704345703, + "learning_rate": 4.5834182633509995e-05, + "loss": 1.3705, + "step": 9810 + }, + { + "epoch": 0.6668704987090638, + "grad_norm": 0.750545084476471, + "learning_rate": 4.583205938306835e-05, + "loss": 1.3244, + "step": 9815 + }, + { + "epoch": 0.6672102187797255, + "grad_norm": 0.7528793215751648, + "learning_rate": 4.5829936132626716e-05, + "loss": 1.339, + "step": 9820 + }, + { + "epoch": 0.6675499388503873, + "grad_norm": 0.7245656251907349, + "learning_rate": 4.582781288218509e-05, + "loss": 1.3719, + "step": 9825 + }, + { + "epoch": 0.667889658921049, + "grad_norm": 0.6033870577812195, + "learning_rate": 4.5825689631743444e-05, + "loss": 1.3602, + "step": 9830 + }, + { + "epoch": 0.6682293789917109, + "grad_norm": 0.6469632983207703, + "learning_rate": 4.582356638130181e-05, + "loss": 1.3224, + "step": 9835 + }, + { + "epoch": 0.6685690990623726, + "grad_norm": 0.6322104930877686, + "learning_rate": 4.582144313086017e-05, + "loss": 1.4193, + "step": 9840 + }, + { + "epoch": 0.6689088191330343, + "grad_norm": 0.7348239421844482, + "learning_rate": 4.5819319880418536e-05, + "loss": 1.2968, + "step": 9845 + }, + { + "epoch": 0.6692485392036962, + "grad_norm": 0.6168177127838135, + "learning_rate": 4.58171966299769e-05, + "loss": 1.3212, + "step": 9850 + }, + { + "epoch": 0.6695882592743579, + "grad_norm": 0.7116106748580933, + "learning_rate": 4.5815073379535264e-05, + "loss": 1.3936, + "step": 9855 + }, + { + "epoch": 0.6699279793450197, + "grad_norm": 0.6527019143104553, + "learning_rate": 4.581295012909363e-05, + "loss": 1.4694, + "step": 9860 + }, + { + "epoch": 0.6702676994156814, + "grad_norm": 0.5700908899307251, + "learning_rate": 4.581082687865199e-05, + "loss": 1.3862, + "step": 9865 + }, + { + "epoch": 0.6706074194863433, + "grad_norm": 0.6820536851882935, + "learning_rate": 4.5808703628210356e-05, + "loss": 1.4069, + "step": 9870 + }, + { + "epoch": 0.670947139557005, + "grad_norm": 0.6084043383598328, + "learning_rate": 4.580658037776872e-05, + "loss": 1.4985, + "step": 9875 + }, + { + "epoch": 0.6712868596276668, + "grad_norm": 0.6403409242630005, + "learning_rate": 4.5804457127327084e-05, + "loss": 1.2438, + "step": 9880 + }, + { + "epoch": 0.6716265796983286, + "grad_norm": 0.7354403138160706, + "learning_rate": 4.580233387688545e-05, + "loss": 1.4273, + "step": 9885 + }, + { + "epoch": 0.6719662997689904, + "grad_norm": 0.6923107504844666, + "learning_rate": 4.580021062644381e-05, + "loss": 1.3991, + "step": 9890 + }, + { + "epoch": 0.6723060198396521, + "grad_norm": 0.65451580286026, + "learning_rate": 4.5798087376002176e-05, + "loss": 1.2395, + "step": 9895 + }, + { + "epoch": 0.672645739910314, + "grad_norm": 0.6222007274627686, + "learning_rate": 4.579596412556054e-05, + "loss": 1.414, + "step": 9900 + }, + { + "epoch": 0.6729854599809757, + "grad_norm": 0.6794261932373047, + "learning_rate": 4.5793840875118904e-05, + "loss": 1.3881, + "step": 9905 + }, + { + "epoch": 0.6733251800516374, + "grad_norm": 0.6541102528572083, + "learning_rate": 4.579171762467727e-05, + "loss": 1.4314, + "step": 9910 + }, + { + "epoch": 0.6736649001222992, + "grad_norm": 0.6328390836715698, + "learning_rate": 4.578959437423563e-05, + "loss": 1.5039, + "step": 9915 + }, + { + "epoch": 0.674004620192961, + "grad_norm": 0.6314879059791565, + "learning_rate": 4.5787471123793996e-05, + "loss": 1.3484, + "step": 9920 + }, + { + "epoch": 0.6743443402636228, + "grad_norm": 0.6886136531829834, + "learning_rate": 4.578534787335236e-05, + "loss": 1.3884, + "step": 9925 + }, + { + "epoch": 0.6746840603342845, + "grad_norm": 0.6789883971214294, + "learning_rate": 4.5783224622910724e-05, + "loss": 1.4977, + "step": 9930 + }, + { + "epoch": 0.6750237804049464, + "grad_norm": 0.6783531904220581, + "learning_rate": 4.578110137246909e-05, + "loss": 1.3998, + "step": 9935 + }, + { + "epoch": 0.6753635004756081, + "grad_norm": 0.6744592785835266, + "learning_rate": 4.577897812202745e-05, + "loss": 1.4035, + "step": 9940 + }, + { + "epoch": 0.6757032205462699, + "grad_norm": 0.64136803150177, + "learning_rate": 4.5776854871585816e-05, + "loss": 1.333, + "step": 9945 + }, + { + "epoch": 0.6760429406169316, + "grad_norm": 0.6484505534172058, + "learning_rate": 4.577473162114418e-05, + "loss": 1.4407, + "step": 9950 + }, + { + "epoch": 0.6763826606875935, + "grad_norm": 0.6182332634925842, + "learning_rate": 4.5772608370702544e-05, + "loss": 1.293, + "step": 9955 + }, + { + "epoch": 0.6767223807582552, + "grad_norm": 0.6932333111763, + "learning_rate": 4.577048512026091e-05, + "loss": 1.273, + "step": 9960 + }, + { + "epoch": 0.6770621008289169, + "grad_norm": 0.646960437297821, + "learning_rate": 4.576836186981927e-05, + "loss": 1.3672, + "step": 9965 + }, + { + "epoch": 0.6774018208995788, + "grad_norm": 0.691676139831543, + "learning_rate": 4.576623861937763e-05, + "loss": 1.318, + "step": 9970 + }, + { + "epoch": 0.6777415409702405, + "grad_norm": 0.6424114108085632, + "learning_rate": 4.5764115368936e-05, + "loss": 1.3387, + "step": 9975 + }, + { + "epoch": 0.6780812610409023, + "grad_norm": 0.6947415471076965, + "learning_rate": 4.5761992118494364e-05, + "loss": 1.3571, + "step": 9980 + }, + { + "epoch": 0.6784209811115641, + "grad_norm": 0.6696570515632629, + "learning_rate": 4.575986886805272e-05, + "loss": 1.3673, + "step": 9985 + }, + { + "epoch": 0.6787607011822259, + "grad_norm": 0.6752480864524841, + "learning_rate": 4.575774561761109e-05, + "loss": 1.3672, + "step": 9990 + }, + { + "epoch": 0.6791004212528876, + "grad_norm": 0.6025106310844421, + "learning_rate": 4.5755622367169456e-05, + "loss": 1.3795, + "step": 9995 + }, + { + "epoch": 0.6794401413235494, + "grad_norm": 0.6306586861610413, + "learning_rate": 4.5753499116727814e-05, + "loss": 1.2824, + "step": 10000 + }, + { + "epoch": 0.6797798613942112, + "grad_norm": 0.7083040475845337, + "learning_rate": 4.5751375866286184e-05, + "loss": 1.3134, + "step": 10005 + }, + { + "epoch": 0.680119581464873, + "grad_norm": 0.6891843676567078, + "learning_rate": 4.574925261584455e-05, + "loss": 1.2929, + "step": 10010 + }, + { + "epoch": 0.6804593015355347, + "grad_norm": 0.6544457077980042, + "learning_rate": 4.5747129365402906e-05, + "loss": 1.4114, + "step": 10015 + }, + { + "epoch": 0.6807990216061965, + "grad_norm": 0.607488751411438, + "learning_rate": 4.5745006114961276e-05, + "loss": 1.3442, + "step": 10020 + }, + { + "epoch": 0.6811387416768583, + "grad_norm": 0.6513368487358093, + "learning_rate": 4.574288286451964e-05, + "loss": 1.3319, + "step": 10025 + }, + { + "epoch": 0.68147846174752, + "grad_norm": 0.5297539234161377, + "learning_rate": 4.5740759614078e-05, + "loss": 1.3075, + "step": 10030 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.6227191686630249, + "learning_rate": 4.573863636363637e-05, + "loss": 1.3797, + "step": 10035 + }, + { + "epoch": 0.6821579018888436, + "grad_norm": 0.6473566293716431, + "learning_rate": 4.5736513113194726e-05, + "loss": 1.3847, + "step": 10040 + }, + { + "epoch": 0.6824976219595054, + "grad_norm": 0.6239564418792725, + "learning_rate": 4.573438986275309e-05, + "loss": 1.343, + "step": 10045 + }, + { + "epoch": 0.6828373420301671, + "grad_norm": 0.6086986660957336, + "learning_rate": 4.573226661231146e-05, + "loss": 1.2983, + "step": 10050 + }, + { + "epoch": 0.683177062100829, + "grad_norm": 0.6269145011901855, + "learning_rate": 4.573014336186982e-05, + "loss": 1.2954, + "step": 10055 + }, + { + "epoch": 0.6835167821714907, + "grad_norm": 0.6437265276908875, + "learning_rate": 4.572802011142819e-05, + "loss": 1.3948, + "step": 10060 + }, + { + "epoch": 0.6838565022421524, + "grad_norm": 0.5740530490875244, + "learning_rate": 4.572589686098655e-05, + "loss": 1.4117, + "step": 10065 + }, + { + "epoch": 0.6841962223128143, + "grad_norm": 0.6062965393066406, + "learning_rate": 4.572377361054491e-05, + "loss": 1.378, + "step": 10070 + }, + { + "epoch": 0.684535942383476, + "grad_norm": 0.6366291046142578, + "learning_rate": 4.572165036010328e-05, + "loss": 1.3278, + "step": 10075 + }, + { + "epoch": 0.6848756624541378, + "grad_norm": 0.6265395283699036, + "learning_rate": 4.5719527109661644e-05, + "loss": 1.4207, + "step": 10080 + }, + { + "epoch": 0.6852153825247995, + "grad_norm": 0.5947545766830444, + "learning_rate": 4.571740385922e-05, + "loss": 1.4252, + "step": 10085 + }, + { + "epoch": 0.6855551025954614, + "grad_norm": 0.6057619452476501, + "learning_rate": 4.571528060877837e-05, + "loss": 1.3236, + "step": 10090 + }, + { + "epoch": 0.6858948226661231, + "grad_norm": 0.698820173740387, + "learning_rate": 4.5713157358336736e-05, + "loss": 1.2652, + "step": 10095 + }, + { + "epoch": 0.6862345427367849, + "grad_norm": 0.6159490942955017, + "learning_rate": 4.5711034107895094e-05, + "loss": 1.3296, + "step": 10100 + }, + { + "epoch": 0.6865742628074467, + "grad_norm": 0.6031413078308105, + "learning_rate": 4.5708910857453464e-05, + "loss": 1.3923, + "step": 10105 + }, + { + "epoch": 0.6869139828781085, + "grad_norm": 0.6347100734710693, + "learning_rate": 4.570678760701182e-05, + "loss": 1.2772, + "step": 10110 + }, + { + "epoch": 0.6872537029487702, + "grad_norm": 0.6727352142333984, + "learning_rate": 4.5704664356570186e-05, + "loss": 1.3894, + "step": 10115 + }, + { + "epoch": 0.6875934230194319, + "grad_norm": 0.6287788152694702, + "learning_rate": 4.5702541106128556e-05, + "loss": 1.3129, + "step": 10120 + }, + { + "epoch": 0.6879331430900938, + "grad_norm": 0.7411888837814331, + "learning_rate": 4.5700417855686914e-05, + "loss": 1.4734, + "step": 10125 + }, + { + "epoch": 0.6882728631607555, + "grad_norm": 0.6954744458198547, + "learning_rate": 4.569829460524528e-05, + "loss": 1.3616, + "step": 10130 + }, + { + "epoch": 0.6886125832314173, + "grad_norm": 0.6447896957397461, + "learning_rate": 4.569617135480365e-05, + "loss": 1.4141, + "step": 10135 + }, + { + "epoch": 0.6889523033020791, + "grad_norm": 0.6385108232498169, + "learning_rate": 4.5694048104362006e-05, + "loss": 1.4046, + "step": 10140 + }, + { + "epoch": 0.6892920233727409, + "grad_norm": 0.7570716142654419, + "learning_rate": 4.569192485392037e-05, + "loss": 1.4226, + "step": 10145 + }, + { + "epoch": 0.6896317434434026, + "grad_norm": 0.6235305070877075, + "learning_rate": 4.568980160347874e-05, + "loss": 1.2798, + "step": 10150 + }, + { + "epoch": 0.6899714635140645, + "grad_norm": 0.6237177848815918, + "learning_rate": 4.56876783530371e-05, + "loss": 1.3472, + "step": 10155 + }, + { + "epoch": 0.6903111835847262, + "grad_norm": 0.6608687043190002, + "learning_rate": 4.568555510259546e-05, + "loss": 1.3718, + "step": 10160 + }, + { + "epoch": 0.690650903655388, + "grad_norm": 0.6903308033943176, + "learning_rate": 4.568343185215383e-05, + "loss": 1.3937, + "step": 10165 + }, + { + "epoch": 0.6909906237260497, + "grad_norm": 0.6505640149116516, + "learning_rate": 4.568130860171219e-05, + "loss": 1.4443, + "step": 10170 + }, + { + "epoch": 0.6913303437967115, + "grad_norm": 0.7226462364196777, + "learning_rate": 4.5679185351270554e-05, + "loss": 1.3357, + "step": 10175 + }, + { + "epoch": 0.6916700638673733, + "grad_norm": 0.6706532835960388, + "learning_rate": 4.5677062100828925e-05, + "loss": 1.4377, + "step": 10180 + }, + { + "epoch": 0.692009783938035, + "grad_norm": 0.5964548587799072, + "learning_rate": 4.567493885038728e-05, + "loss": 1.3318, + "step": 10185 + }, + { + "epoch": 0.6923495040086969, + "grad_norm": 0.4633779525756836, + "learning_rate": 4.5672815599945646e-05, + "loss": 1.3428, + "step": 10190 + }, + { + "epoch": 0.6926892240793586, + "grad_norm": 0.7144092917442322, + "learning_rate": 4.567069234950401e-05, + "loss": 1.4012, + "step": 10195 + }, + { + "epoch": 0.6930289441500204, + "grad_norm": 0.6145314574241638, + "learning_rate": 4.5668569099062374e-05, + "loss": 1.3638, + "step": 10200 + }, + { + "epoch": 0.6933686642206821, + "grad_norm": 0.699490487575531, + "learning_rate": 4.566644584862074e-05, + "loss": 1.2711, + "step": 10205 + }, + { + "epoch": 0.693708384291344, + "grad_norm": 0.6997410655021667, + "learning_rate": 4.56643225981791e-05, + "loss": 1.3239, + "step": 10210 + }, + { + "epoch": 0.6940481043620057, + "grad_norm": 0.63850998878479, + "learning_rate": 4.5662199347737466e-05, + "loss": 1.3865, + "step": 10215 + }, + { + "epoch": 0.6943878244326674, + "grad_norm": 0.6392037272453308, + "learning_rate": 4.566007609729583e-05, + "loss": 1.4092, + "step": 10220 + }, + { + "epoch": 0.6947275445033293, + "grad_norm": 0.7251085638999939, + "learning_rate": 4.5657952846854194e-05, + "loss": 1.35, + "step": 10225 + }, + { + "epoch": 0.695067264573991, + "grad_norm": 0.70627361536026, + "learning_rate": 4.565582959641256e-05, + "loss": 1.3771, + "step": 10230 + }, + { + "epoch": 0.6954069846446528, + "grad_norm": 0.5996231436729431, + "learning_rate": 4.565370634597092e-05, + "loss": 1.3957, + "step": 10235 + }, + { + "epoch": 0.6957467047153146, + "grad_norm": 0.6541487574577332, + "learning_rate": 4.5651583095529286e-05, + "loss": 1.294, + "step": 10240 + }, + { + "epoch": 0.6960864247859764, + "grad_norm": 0.6588912010192871, + "learning_rate": 4.564945984508765e-05, + "loss": 1.4421, + "step": 10245 + }, + { + "epoch": 0.6964261448566381, + "grad_norm": 0.7020395994186401, + "learning_rate": 4.5647336594646014e-05, + "loss": 1.3956, + "step": 10250 + }, + { + "epoch": 0.6967658649272999, + "grad_norm": 0.6050479412078857, + "learning_rate": 4.564521334420438e-05, + "loss": 1.3626, + "step": 10255 + }, + { + "epoch": 0.6971055849979617, + "grad_norm": 0.6568503379821777, + "learning_rate": 4.564309009376274e-05, + "loss": 1.3887, + "step": 10260 + }, + { + "epoch": 0.6974453050686235, + "grad_norm": 0.6809664368629456, + "learning_rate": 4.5640966843321106e-05, + "loss": 1.327, + "step": 10265 + }, + { + "epoch": 0.6977850251392852, + "grad_norm": 0.6142020225524902, + "learning_rate": 4.563884359287947e-05, + "loss": 1.388, + "step": 10270 + }, + { + "epoch": 0.698124745209947, + "grad_norm": 0.5905758142471313, + "learning_rate": 4.5636720342437834e-05, + "loss": 1.2479, + "step": 10275 + }, + { + "epoch": 0.6984644652806088, + "grad_norm": 0.7039254307746887, + "learning_rate": 4.56345970919962e-05, + "loss": 1.3374, + "step": 10280 + }, + { + "epoch": 0.6988041853512705, + "grad_norm": 0.6263798475265503, + "learning_rate": 4.563247384155456e-05, + "loss": 1.4462, + "step": 10285 + }, + { + "epoch": 0.6991439054219323, + "grad_norm": 0.6036354899406433, + "learning_rate": 4.5630350591112926e-05, + "loss": 1.4544, + "step": 10290 + }, + { + "epoch": 0.6994836254925941, + "grad_norm": 0.7210366725921631, + "learning_rate": 4.562822734067129e-05, + "loss": 1.253, + "step": 10295 + }, + { + "epoch": 0.6998233455632559, + "grad_norm": 0.5882965326309204, + "learning_rate": 4.5626104090229654e-05, + "loss": 1.3674, + "step": 10300 + }, + { + "epoch": 0.7001630656339176, + "grad_norm": 0.6516746878623962, + "learning_rate": 4.562398083978802e-05, + "loss": 1.3727, + "step": 10305 + }, + { + "epoch": 0.7005027857045795, + "grad_norm": 0.6511339545249939, + "learning_rate": 4.5621857589346375e-05, + "loss": 1.337, + "step": 10310 + }, + { + "epoch": 0.7008425057752412, + "grad_norm": 0.7368311882019043, + "learning_rate": 4.5619734338904746e-05, + "loss": 1.3485, + "step": 10315 + }, + { + "epoch": 0.701182225845903, + "grad_norm": 0.6463493704795837, + "learning_rate": 4.561761108846311e-05, + "loss": 1.3034, + "step": 10320 + }, + { + "epoch": 0.7015219459165648, + "grad_norm": 0.6115689873695374, + "learning_rate": 4.561548783802147e-05, + "loss": 1.3668, + "step": 10325 + }, + { + "epoch": 0.7018616659872265, + "grad_norm": 0.6443473100662231, + "learning_rate": 4.561336458757984e-05, + "loss": 1.3558, + "step": 10330 + }, + { + "epoch": 0.7022013860578883, + "grad_norm": 0.677765429019928, + "learning_rate": 4.56112413371382e-05, + "loss": 1.3628, + "step": 10335 + }, + { + "epoch": 0.70254110612855, + "grad_norm": 0.6149821877479553, + "learning_rate": 4.560911808669656e-05, + "loss": 1.3836, + "step": 10340 + }, + { + "epoch": 0.7028808261992119, + "grad_norm": 0.6386359333992004, + "learning_rate": 4.560699483625493e-05, + "loss": 1.395, + "step": 10345 + }, + { + "epoch": 0.7032205462698736, + "grad_norm": 0.605919361114502, + "learning_rate": 4.5604871585813294e-05, + "loss": 1.462, + "step": 10350 + }, + { + "epoch": 0.7035602663405354, + "grad_norm": 0.6383451819419861, + "learning_rate": 4.560274833537165e-05, + "loss": 1.2691, + "step": 10355 + }, + { + "epoch": 0.7038999864111972, + "grad_norm": 0.7361869215965271, + "learning_rate": 4.560062508493002e-05, + "loss": 1.3405, + "step": 10360 + }, + { + "epoch": 0.704239706481859, + "grad_norm": 0.8459182977676392, + "learning_rate": 4.5598501834488386e-05, + "loss": 1.4351, + "step": 10365 + }, + { + "epoch": 0.7045794265525207, + "grad_norm": 0.6890857219696045, + "learning_rate": 4.559637858404674e-05, + "loss": 1.3238, + "step": 10370 + }, + { + "epoch": 0.7049191466231824, + "grad_norm": 0.6507689356803894, + "learning_rate": 4.5594255333605114e-05, + "loss": 1.3762, + "step": 10375 + }, + { + "epoch": 0.7052588666938443, + "grad_norm": 0.7193464040756226, + "learning_rate": 4.559213208316348e-05, + "loss": 1.3918, + "step": 10380 + }, + { + "epoch": 0.705598586764506, + "grad_norm": 0.6927496194839478, + "learning_rate": 4.5590008832721835e-05, + "loss": 1.4717, + "step": 10385 + }, + { + "epoch": 0.7059383068351678, + "grad_norm": 0.6915751695632935, + "learning_rate": 4.5587885582280206e-05, + "loss": 1.3379, + "step": 10390 + }, + { + "epoch": 0.7062780269058296, + "grad_norm": 0.7506421208381653, + "learning_rate": 4.558576233183856e-05, + "loss": 1.3583, + "step": 10395 + }, + { + "epoch": 0.7066177469764914, + "grad_norm": 0.6764235496520996, + "learning_rate": 4.5583639081396934e-05, + "loss": 1.3166, + "step": 10400 + }, + { + "epoch": 0.7069574670471531, + "grad_norm": 0.6450632810592651, + "learning_rate": 4.55815158309553e-05, + "loss": 1.2878, + "step": 10405 + }, + { + "epoch": 0.707297187117815, + "grad_norm": 0.6657335758209229, + "learning_rate": 4.5579392580513655e-05, + "loss": 1.4115, + "step": 10410 + }, + { + "epoch": 0.7076369071884767, + "grad_norm": 0.6178698539733887, + "learning_rate": 4.5577269330072026e-05, + "loss": 1.3599, + "step": 10415 + }, + { + "epoch": 0.7079766272591385, + "grad_norm": 0.6750048398971558, + "learning_rate": 4.557514607963039e-05, + "loss": 1.3979, + "step": 10420 + }, + { + "epoch": 0.7083163473298002, + "grad_norm": 0.6003528237342834, + "learning_rate": 4.557302282918875e-05, + "loss": 1.4673, + "step": 10425 + }, + { + "epoch": 0.708656067400462, + "grad_norm": 0.6799806356430054, + "learning_rate": 4.557089957874712e-05, + "loss": 1.3712, + "step": 10430 + }, + { + "epoch": 0.7089957874711238, + "grad_norm": 0.6539444327354431, + "learning_rate": 4.556877632830548e-05, + "loss": 1.4233, + "step": 10435 + }, + { + "epoch": 0.7093355075417855, + "grad_norm": 0.7056052684783936, + "learning_rate": 4.556665307786384e-05, + "loss": 1.4099, + "step": 10440 + }, + { + "epoch": 0.7096752276124474, + "grad_norm": 0.6594746112823486, + "learning_rate": 4.556452982742221e-05, + "loss": 1.4822, + "step": 10445 + }, + { + "epoch": 0.7100149476831091, + "grad_norm": 0.636754035949707, + "learning_rate": 4.5562406576980574e-05, + "loss": 1.3653, + "step": 10450 + }, + { + "epoch": 0.7103546677537709, + "grad_norm": 0.6520203948020935, + "learning_rate": 4.556028332653893e-05, + "loss": 1.2665, + "step": 10455 + }, + { + "epoch": 0.7106943878244326, + "grad_norm": 0.6684704422950745, + "learning_rate": 4.55581600760973e-05, + "loss": 1.4546, + "step": 10460 + }, + { + "epoch": 0.7110341078950945, + "grad_norm": 0.7014590501785278, + "learning_rate": 4.555603682565566e-05, + "loss": 1.3714, + "step": 10465 + }, + { + "epoch": 0.7113738279657562, + "grad_norm": 0.677331805229187, + "learning_rate": 4.555391357521402e-05, + "loss": 1.4425, + "step": 10470 + }, + { + "epoch": 0.711713548036418, + "grad_norm": 0.6193029880523682, + "learning_rate": 4.5551790324772394e-05, + "loss": 1.4065, + "step": 10475 + }, + { + "epoch": 0.7120532681070798, + "grad_norm": 0.8266158699989319, + "learning_rate": 4.554966707433075e-05, + "loss": 1.3732, + "step": 10480 + }, + { + "epoch": 0.7123929881777415, + "grad_norm": 0.6649559140205383, + "learning_rate": 4.5547543823889115e-05, + "loss": 1.2375, + "step": 10485 + }, + { + "epoch": 0.7127327082484033, + "grad_norm": 0.6089277863502502, + "learning_rate": 4.5545420573447486e-05, + "loss": 1.3748, + "step": 10490 + }, + { + "epoch": 0.7130724283190651, + "grad_norm": 0.7148370146751404, + "learning_rate": 4.554329732300584e-05, + "loss": 1.2781, + "step": 10495 + }, + { + "epoch": 0.7134121483897269, + "grad_norm": 0.6397234201431274, + "learning_rate": 4.554117407256421e-05, + "loss": 1.305, + "step": 10500 + }, + { + "epoch": 0.7137518684603886, + "grad_norm": 0.6898680925369263, + "learning_rate": 4.553905082212258e-05, + "loss": 1.4395, + "step": 10505 + }, + { + "epoch": 0.7140915885310504, + "grad_norm": 0.6065067648887634, + "learning_rate": 4.5536927571680935e-05, + "loss": 1.4486, + "step": 10510 + }, + { + "epoch": 0.7144313086017122, + "grad_norm": 0.6090561151504517, + "learning_rate": 4.55348043212393e-05, + "loss": 1.4499, + "step": 10515 + }, + { + "epoch": 0.714771028672374, + "grad_norm": 0.7384569048881531, + "learning_rate": 4.553268107079767e-05, + "loss": 1.3616, + "step": 10520 + }, + { + "epoch": 0.7151107487430357, + "grad_norm": 0.6976181268692017, + "learning_rate": 4.553055782035603e-05, + "loss": 1.3167, + "step": 10525 + }, + { + "epoch": 0.7154504688136976, + "grad_norm": 0.6307021975517273, + "learning_rate": 4.552843456991439e-05, + "loss": 1.388, + "step": 10530 + }, + { + "epoch": 0.7157901888843593, + "grad_norm": 0.4543355703353882, + "learning_rate": 4.5526311319472755e-05, + "loss": 1.2206, + "step": 10535 + }, + { + "epoch": 0.716129908955021, + "grad_norm": 0.6298014521598816, + "learning_rate": 4.552418806903112e-05, + "loss": 1.3189, + "step": 10540 + }, + { + "epoch": 0.7164696290256828, + "grad_norm": 0.7497329115867615, + "learning_rate": 4.552206481858948e-05, + "loss": 1.3096, + "step": 10545 + }, + { + "epoch": 0.7168093490963446, + "grad_norm": 0.7202334403991699, + "learning_rate": 4.551994156814785e-05, + "loss": 1.3713, + "step": 10550 + }, + { + "epoch": 0.7171490691670064, + "grad_norm": 0.6543573141098022, + "learning_rate": 4.551781831770621e-05, + "loss": 1.3518, + "step": 10555 + }, + { + "epoch": 0.7174887892376681, + "grad_norm": 0.6194443106651306, + "learning_rate": 4.5515695067264575e-05, + "loss": 1.4825, + "step": 10560 + }, + { + "epoch": 0.71782850930833, + "grad_norm": 0.6676272749900818, + "learning_rate": 4.551357181682294e-05, + "loss": 1.4397, + "step": 10565 + }, + { + "epoch": 0.7181682293789917, + "grad_norm": 0.6669387221336365, + "learning_rate": 4.55114485663813e-05, + "loss": 1.4975, + "step": 10570 + }, + { + "epoch": 0.7185079494496535, + "grad_norm": 0.7604942321777344, + "learning_rate": 4.550932531593967e-05, + "loss": 1.4554, + "step": 10575 + }, + { + "epoch": 0.7188476695203153, + "grad_norm": 0.7051596641540527, + "learning_rate": 4.550720206549803e-05, + "loss": 1.4082, + "step": 10580 + }, + { + "epoch": 0.719187389590977, + "grad_norm": 0.7017998695373535, + "learning_rate": 4.5505078815056395e-05, + "loss": 1.4139, + "step": 10585 + }, + { + "epoch": 0.7195271096616388, + "grad_norm": 0.6412633657455444, + "learning_rate": 4.550295556461476e-05, + "loss": 1.3885, + "step": 10590 + }, + { + "epoch": 0.7198668297323005, + "grad_norm": 0.6595339179039001, + "learning_rate": 4.550083231417312e-05, + "loss": 1.2869, + "step": 10595 + }, + { + "epoch": 0.7202065498029624, + "grad_norm": 0.6425991654396057, + "learning_rate": 4.549870906373149e-05, + "loss": 1.3251, + "step": 10600 + }, + { + "epoch": 0.7205462698736241, + "grad_norm": 0.6715484261512756, + "learning_rate": 4.549658581328985e-05, + "loss": 1.3397, + "step": 10605 + }, + { + "epoch": 0.7208859899442859, + "grad_norm": 0.6597923636436462, + "learning_rate": 4.5494462562848215e-05, + "loss": 1.3952, + "step": 10610 + }, + { + "epoch": 0.7212257100149477, + "grad_norm": 0.704318106174469, + "learning_rate": 4.549233931240658e-05, + "loss": 1.3744, + "step": 10615 + }, + { + "epoch": 0.7215654300856095, + "grad_norm": 0.7401502132415771, + "learning_rate": 4.549021606196494e-05, + "loss": 1.417, + "step": 10620 + }, + { + "epoch": 0.7219051501562712, + "grad_norm": 0.661598801612854, + "learning_rate": 4.548809281152331e-05, + "loss": 1.426, + "step": 10625 + }, + { + "epoch": 0.722244870226933, + "grad_norm": 0.5903460383415222, + "learning_rate": 4.548596956108167e-05, + "loss": 1.4073, + "step": 10630 + }, + { + "epoch": 0.7225845902975948, + "grad_norm": 0.628123939037323, + "learning_rate": 4.5483846310640035e-05, + "loss": 1.3774, + "step": 10635 + }, + { + "epoch": 0.7229243103682566, + "grad_norm": 0.5890015363693237, + "learning_rate": 4.54817230601984e-05, + "loss": 1.2802, + "step": 10640 + }, + { + "epoch": 0.7232640304389183, + "grad_norm": 0.7097578048706055, + "learning_rate": 4.547959980975676e-05, + "loss": 1.2938, + "step": 10645 + }, + { + "epoch": 0.7236037505095801, + "grad_norm": 0.6969096064567566, + "learning_rate": 4.547747655931513e-05, + "loss": 1.3474, + "step": 10650 + }, + { + "epoch": 0.7239434705802419, + "grad_norm": 0.6347588300704956, + "learning_rate": 4.547535330887349e-05, + "loss": 1.3791, + "step": 10655 + }, + { + "epoch": 0.7242831906509036, + "grad_norm": 0.7243689894676208, + "learning_rate": 4.5473230058431855e-05, + "loss": 1.3636, + "step": 10660 + }, + { + "epoch": 0.7246229107215655, + "grad_norm": 0.6821337342262268, + "learning_rate": 4.547110680799021e-05, + "loss": 1.4354, + "step": 10665 + }, + { + "epoch": 0.7249626307922272, + "grad_norm": 0.6853811144828796, + "learning_rate": 4.546898355754858e-05, + "loss": 1.31, + "step": 10670 + }, + { + "epoch": 0.725302350862889, + "grad_norm": 0.6670143604278564, + "learning_rate": 4.546686030710695e-05, + "loss": 1.4195, + "step": 10675 + }, + { + "epoch": 0.7256420709335507, + "grad_norm": 0.711848258972168, + "learning_rate": 4.5464737056665305e-05, + "loss": 1.2501, + "step": 10680 + }, + { + "epoch": 0.7259817910042126, + "grad_norm": 0.7048289775848389, + "learning_rate": 4.5462613806223675e-05, + "loss": 1.3292, + "step": 10685 + }, + { + "epoch": 0.7263215110748743, + "grad_norm": 0.6657536029815674, + "learning_rate": 4.546049055578204e-05, + "loss": 1.2919, + "step": 10690 + }, + { + "epoch": 0.726661231145536, + "grad_norm": 0.6005533933639526, + "learning_rate": 4.5458367305340397e-05, + "loss": 1.3737, + "step": 10695 + }, + { + "epoch": 0.7270009512161979, + "grad_norm": 0.6253713965415955, + "learning_rate": 4.545624405489877e-05, + "loss": 1.4649, + "step": 10700 + }, + { + "epoch": 0.7273406712868596, + "grad_norm": 0.7332903146743774, + "learning_rate": 4.545412080445713e-05, + "loss": 1.4178, + "step": 10705 + }, + { + "epoch": 0.7276803913575214, + "grad_norm": 0.7133682370185852, + "learning_rate": 4.545199755401549e-05, + "loss": 1.3826, + "step": 10710 + }, + { + "epoch": 0.7280201114281831, + "grad_norm": 0.6334335803985596, + "learning_rate": 4.544987430357386e-05, + "loss": 1.3523, + "step": 10715 + }, + { + "epoch": 0.728359831498845, + "grad_norm": 0.6501543521881104, + "learning_rate": 4.544775105313222e-05, + "loss": 1.3774, + "step": 10720 + }, + { + "epoch": 0.7286995515695067, + "grad_norm": 0.6430721282958984, + "learning_rate": 4.544562780269058e-05, + "loss": 1.3669, + "step": 10725 + }, + { + "epoch": 0.7290392716401685, + "grad_norm": 0.6951696872711182, + "learning_rate": 4.544350455224895e-05, + "loss": 1.3315, + "step": 10730 + }, + { + "epoch": 0.7293789917108303, + "grad_norm": 0.6585467457771301, + "learning_rate": 4.5441381301807315e-05, + "loss": 1.3278, + "step": 10735 + }, + { + "epoch": 0.7297187117814921, + "grad_norm": 0.6286793351173401, + "learning_rate": 4.543925805136568e-05, + "loss": 1.442, + "step": 10740 + }, + { + "epoch": 0.7300584318521538, + "grad_norm": 0.6113405227661133, + "learning_rate": 4.5437134800924043e-05, + "loss": 1.3904, + "step": 10745 + }, + { + "epoch": 0.7303981519228157, + "grad_norm": 0.6477635502815247, + "learning_rate": 4.54350115504824e-05, + "loss": 1.285, + "step": 10750 + }, + { + "epoch": 0.7307378719934774, + "grad_norm": 0.6457996368408203, + "learning_rate": 4.543288830004077e-05, + "loss": 1.4113, + "step": 10755 + }, + { + "epoch": 0.7310775920641391, + "grad_norm": 0.6372414827346802, + "learning_rate": 4.5430765049599135e-05, + "loss": 1.384, + "step": 10760 + }, + { + "epoch": 0.7314173121348009, + "grad_norm": 0.6154240369796753, + "learning_rate": 4.542864179915749e-05, + "loss": 1.3949, + "step": 10765 + }, + { + "epoch": 0.7317570322054627, + "grad_norm": 0.670173168182373, + "learning_rate": 4.5426518548715863e-05, + "loss": 1.2533, + "step": 10770 + }, + { + "epoch": 0.7320967522761245, + "grad_norm": 0.6684125661849976, + "learning_rate": 4.542439529827423e-05, + "loss": 1.4462, + "step": 10775 + }, + { + "epoch": 0.7324364723467862, + "grad_norm": 0.5796847343444824, + "learning_rate": 4.5422272047832585e-05, + "loss": 1.386, + "step": 10780 + }, + { + "epoch": 0.7327761924174481, + "grad_norm": 0.7200872898101807, + "learning_rate": 4.5420148797390955e-05, + "loss": 1.3339, + "step": 10785 + }, + { + "epoch": 0.7331159124881098, + "grad_norm": 0.6216338276863098, + "learning_rate": 4.541802554694932e-05, + "loss": 1.3574, + "step": 10790 + }, + { + "epoch": 0.7334556325587716, + "grad_norm": 0.6423646807670593, + "learning_rate": 4.541590229650768e-05, + "loss": 1.3184, + "step": 10795 + }, + { + "epoch": 0.7337953526294333, + "grad_norm": 0.7440258860588074, + "learning_rate": 4.541377904606605e-05, + "loss": 1.3108, + "step": 10800 + }, + { + "epoch": 0.7341350727000951, + "grad_norm": 0.6222683191299438, + "learning_rate": 4.541165579562441e-05, + "loss": 1.3767, + "step": 10805 + }, + { + "epoch": 0.7344747927707569, + "grad_norm": 0.6356257796287537, + "learning_rate": 4.540953254518277e-05, + "loss": 1.3499, + "step": 10810 + }, + { + "epoch": 0.7348145128414186, + "grad_norm": 0.6644449830055237, + "learning_rate": 4.540740929474114e-05, + "loss": 1.3655, + "step": 10815 + }, + { + "epoch": 0.7351542329120805, + "grad_norm": 0.6641932129859924, + "learning_rate": 4.54052860442995e-05, + "loss": 1.394, + "step": 10820 + }, + { + "epoch": 0.7354939529827422, + "grad_norm": 0.6184588074684143, + "learning_rate": 4.540316279385786e-05, + "loss": 1.4549, + "step": 10825 + }, + { + "epoch": 0.735833673053404, + "grad_norm": 0.5978854894638062, + "learning_rate": 4.540103954341623e-05, + "loss": 1.4342, + "step": 10830 + }, + { + "epoch": 0.7361733931240658, + "grad_norm": 0.6878186464309692, + "learning_rate": 4.539891629297459e-05, + "loss": 1.3338, + "step": 10835 + }, + { + "epoch": 0.7365131131947276, + "grad_norm": 0.5620693564414978, + "learning_rate": 4.539679304253295e-05, + "loss": 1.3268, + "step": 10840 + }, + { + "epoch": 0.7368528332653893, + "grad_norm": 0.8356578946113586, + "learning_rate": 4.5394669792091323e-05, + "loss": 1.3955, + "step": 10845 + }, + { + "epoch": 0.737192553336051, + "grad_norm": 0.7349069118499756, + "learning_rate": 4.539254654164968e-05, + "loss": 1.4559, + "step": 10850 + }, + { + "epoch": 0.7375322734067129, + "grad_norm": 0.6967862248420715, + "learning_rate": 4.5390423291208045e-05, + "loss": 1.2999, + "step": 10855 + }, + { + "epoch": 0.7378719934773746, + "grad_norm": 0.6220554709434509, + "learning_rate": 4.5388300040766415e-05, + "loss": 1.4476, + "step": 10860 + }, + { + "epoch": 0.7382117135480364, + "grad_norm": 0.5988366007804871, + "learning_rate": 4.538617679032477e-05, + "loss": 1.3814, + "step": 10865 + }, + { + "epoch": 0.7385514336186982, + "grad_norm": 0.667321503162384, + "learning_rate": 4.538405353988314e-05, + "loss": 1.3919, + "step": 10870 + }, + { + "epoch": 0.73889115368936, + "grad_norm": 0.7849789261817932, + "learning_rate": 4.538193028944151e-05, + "loss": 1.3866, + "step": 10875 + }, + { + "epoch": 0.7392308737600217, + "grad_norm": 0.6790648102760315, + "learning_rate": 4.5379807038999865e-05, + "loss": 1.4061, + "step": 10880 + }, + { + "epoch": 0.7395705938306835, + "grad_norm": 0.7118006348609924, + "learning_rate": 4.537768378855823e-05, + "loss": 1.3913, + "step": 10885 + }, + { + "epoch": 0.7399103139013453, + "grad_norm": 0.6077026724815369, + "learning_rate": 4.537556053811659e-05, + "loss": 1.3604, + "step": 10890 + }, + { + "epoch": 0.7402500339720071, + "grad_norm": 0.641882061958313, + "learning_rate": 4.537343728767496e-05, + "loss": 1.3833, + "step": 10895 + }, + { + "epoch": 0.7405897540426688, + "grad_norm": 0.6631798148155212, + "learning_rate": 4.537131403723332e-05, + "loss": 1.3602, + "step": 10900 + }, + { + "epoch": 0.7409294741133307, + "grad_norm": 0.6099927425384521, + "learning_rate": 4.5369190786791685e-05, + "loss": 1.2839, + "step": 10905 + }, + { + "epoch": 0.7412691941839924, + "grad_norm": 0.6890628933906555, + "learning_rate": 4.536706753635005e-05, + "loss": 1.3124, + "step": 10910 + }, + { + "epoch": 0.7416089142546541, + "grad_norm": 0.678059995174408, + "learning_rate": 4.536494428590841e-05, + "loss": 1.3665, + "step": 10915 + }, + { + "epoch": 0.741948634325316, + "grad_norm": 0.5877015590667725, + "learning_rate": 4.536282103546678e-05, + "loss": 1.3879, + "step": 10920 + }, + { + "epoch": 0.7422883543959777, + "grad_norm": 0.7351785898208618, + "learning_rate": 4.536069778502514e-05, + "loss": 1.4093, + "step": 10925 + }, + { + "epoch": 0.7426280744666395, + "grad_norm": 0.5812264680862427, + "learning_rate": 4.5358574534583505e-05, + "loss": 1.3514, + "step": 10930 + }, + { + "epoch": 0.7429677945373012, + "grad_norm": 0.7274196743965149, + "learning_rate": 4.535645128414187e-05, + "loss": 1.4067, + "step": 10935 + }, + { + "epoch": 0.7433075146079631, + "grad_norm": 0.7358936071395874, + "learning_rate": 4.535432803370023e-05, + "loss": 1.3441, + "step": 10940 + }, + { + "epoch": 0.7436472346786248, + "grad_norm": 0.5588825941085815, + "learning_rate": 4.53522047832586e-05, + "loss": 1.3177, + "step": 10945 + }, + { + "epoch": 0.7439869547492866, + "grad_norm": 0.6864970922470093, + "learning_rate": 4.535008153281696e-05, + "loss": 1.4194, + "step": 10950 + }, + { + "epoch": 0.7443266748199484, + "grad_norm": 0.6844140887260437, + "learning_rate": 4.5347958282375325e-05, + "loss": 1.3596, + "step": 10955 + }, + { + "epoch": 0.7446663948906102, + "grad_norm": 0.628604531288147, + "learning_rate": 4.534583503193369e-05, + "loss": 1.3307, + "step": 10960 + }, + { + "epoch": 0.7450061149612719, + "grad_norm": 0.6032277941703796, + "learning_rate": 4.534371178149205e-05, + "loss": 1.3565, + "step": 10965 + }, + { + "epoch": 0.7453458350319336, + "grad_norm": 0.6987267136573792, + "learning_rate": 4.534158853105042e-05, + "loss": 1.3718, + "step": 10970 + }, + { + "epoch": 0.7456855551025955, + "grad_norm": 0.6521071791648865, + "learning_rate": 4.533946528060878e-05, + "loss": 1.2881, + "step": 10975 + }, + { + "epoch": 0.7460252751732572, + "grad_norm": 0.7675398588180542, + "learning_rate": 4.5337342030167145e-05, + "loss": 1.3546, + "step": 10980 + }, + { + "epoch": 0.746364995243919, + "grad_norm": 0.697261393070221, + "learning_rate": 4.533521877972551e-05, + "loss": 1.4466, + "step": 10985 + }, + { + "epoch": 0.7467047153145808, + "grad_norm": 0.6618101000785828, + "learning_rate": 4.533309552928387e-05, + "loss": 1.3422, + "step": 10990 + }, + { + "epoch": 0.7470444353852426, + "grad_norm": 0.6806418299674988, + "learning_rate": 4.533097227884224e-05, + "loss": 1.2398, + "step": 10995 + }, + { + "epoch": 0.7473841554559043, + "grad_norm": 0.6811333894729614, + "learning_rate": 4.53288490284006e-05, + "loss": 1.3156, + "step": 11000 + }, + { + "epoch": 0.7477238755265662, + "grad_norm": 0.6440121531486511, + "learning_rate": 4.5326725777958965e-05, + "loss": 1.3333, + "step": 11005 + }, + { + "epoch": 0.7480635955972279, + "grad_norm": 0.6451594233512878, + "learning_rate": 4.532460252751733e-05, + "loss": 1.3979, + "step": 11010 + }, + { + "epoch": 0.7484033156678896, + "grad_norm": 0.6549493670463562, + "learning_rate": 4.532247927707569e-05, + "loss": 1.2734, + "step": 11015 + }, + { + "epoch": 0.7487430357385514, + "grad_norm": 0.6674458384513855, + "learning_rate": 4.532035602663405e-05, + "loss": 1.3546, + "step": 11020 + }, + { + "epoch": 0.7490827558092132, + "grad_norm": 0.645881712436676, + "learning_rate": 4.531823277619242e-05, + "loss": 1.3222, + "step": 11025 + }, + { + "epoch": 0.749422475879875, + "grad_norm": 0.7385879158973694, + "learning_rate": 4.5316109525750785e-05, + "loss": 1.3321, + "step": 11030 + }, + { + "epoch": 0.7497621959505367, + "grad_norm": 0.6517509818077087, + "learning_rate": 4.531398627530914e-05, + "loss": 1.3645, + "step": 11035 + }, + { + "epoch": 0.7501019160211986, + "grad_norm": 0.7039206624031067, + "learning_rate": 4.531186302486751e-05, + "loss": 1.4079, + "step": 11040 + }, + { + "epoch": 0.7504416360918603, + "grad_norm": 0.6504554748535156, + "learning_rate": 4.530973977442588e-05, + "loss": 1.2961, + "step": 11045 + }, + { + "epoch": 0.7507813561625221, + "grad_norm": 0.6780588626861572, + "learning_rate": 4.5307616523984234e-05, + "loss": 1.428, + "step": 11050 + }, + { + "epoch": 0.7511210762331838, + "grad_norm": 0.718646764755249, + "learning_rate": 4.5305493273542605e-05, + "loss": 1.2856, + "step": 11055 + }, + { + "epoch": 0.7514607963038457, + "grad_norm": 0.7259412407875061, + "learning_rate": 4.530337002310097e-05, + "loss": 1.4011, + "step": 11060 + }, + { + "epoch": 0.7518005163745074, + "grad_norm": 0.6184737682342529, + "learning_rate": 4.5301246772659326e-05, + "loss": 1.3844, + "step": 11065 + }, + { + "epoch": 0.7521402364451691, + "grad_norm": 0.7287435531616211, + "learning_rate": 4.52991235222177e-05, + "loss": 1.268, + "step": 11070 + }, + { + "epoch": 0.752479956515831, + "grad_norm": 0.5963881611824036, + "learning_rate": 4.529700027177606e-05, + "loss": 1.3586, + "step": 11075 + }, + { + "epoch": 0.7528196765864927, + "grad_norm": 0.6301303505897522, + "learning_rate": 4.5294877021334425e-05, + "loss": 1.4247, + "step": 11080 + }, + { + "epoch": 0.7531593966571545, + "grad_norm": 0.6454952955245972, + "learning_rate": 4.529275377089279e-05, + "loss": 1.381, + "step": 11085 + }, + { + "epoch": 0.7534991167278163, + "grad_norm": 0.6887718439102173, + "learning_rate": 4.5290630520451146e-05, + "loss": 1.3288, + "step": 11090 + }, + { + "epoch": 0.7538388367984781, + "grad_norm": 0.698443591594696, + "learning_rate": 4.528850727000952e-05, + "loss": 1.3923, + "step": 11095 + }, + { + "epoch": 0.7541785568691398, + "grad_norm": 0.6156599521636963, + "learning_rate": 4.528638401956788e-05, + "loss": 1.3565, + "step": 11100 + }, + { + "epoch": 0.7545182769398016, + "grad_norm": 0.6258707642555237, + "learning_rate": 4.528426076912624e-05, + "loss": 1.4253, + "step": 11105 + }, + { + "epoch": 0.7548579970104634, + "grad_norm": 0.748116135597229, + "learning_rate": 4.528213751868461e-05, + "loss": 1.3582, + "step": 11110 + }, + { + "epoch": 0.7551977170811252, + "grad_norm": 0.6185370683670044, + "learning_rate": 4.528001426824297e-05, + "loss": 1.3557, + "step": 11115 + }, + { + "epoch": 0.7555374371517869, + "grad_norm": 0.6756013631820679, + "learning_rate": 4.527789101780133e-05, + "loss": 1.3789, + "step": 11120 + }, + { + "epoch": 0.7558771572224487, + "grad_norm": 0.7011415958404541, + "learning_rate": 4.52757677673597e-05, + "loss": 1.4337, + "step": 11125 + }, + { + "epoch": 0.7562168772931105, + "grad_norm": 0.6777071356773376, + "learning_rate": 4.5273644516918065e-05, + "loss": 1.3069, + "step": 11130 + }, + { + "epoch": 0.7565565973637722, + "grad_norm": 0.5916514992713928, + "learning_rate": 4.527152126647642e-05, + "loss": 1.4871, + "step": 11135 + }, + { + "epoch": 0.756896317434434, + "grad_norm": 0.6710848808288574, + "learning_rate": 4.526939801603479e-05, + "loss": 1.3507, + "step": 11140 + }, + { + "epoch": 0.7572360375050958, + "grad_norm": 0.633577287197113, + "learning_rate": 4.526727476559316e-05, + "loss": 1.3479, + "step": 11145 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.6314858794212341, + "learning_rate": 4.5265151515151514e-05, + "loss": 1.4401, + "step": 11150 + }, + { + "epoch": 0.7579154776464193, + "grad_norm": 0.6551578044891357, + "learning_rate": 4.5263028264709885e-05, + "loss": 1.2934, + "step": 11155 + }, + { + "epoch": 0.7582551977170812, + "grad_norm": 0.7416176795959473, + "learning_rate": 4.526090501426825e-05, + "loss": 1.4084, + "step": 11160 + }, + { + "epoch": 0.7585949177877429, + "grad_norm": 0.6715858578681946, + "learning_rate": 4.5258781763826606e-05, + "loss": 1.269, + "step": 11165 + }, + { + "epoch": 0.7589346378584046, + "grad_norm": 0.6637235879898071, + "learning_rate": 4.525665851338498e-05, + "loss": 1.3334, + "step": 11170 + }, + { + "epoch": 0.7592743579290665, + "grad_norm": 0.6463482975959778, + "learning_rate": 4.5254535262943334e-05, + "loss": 1.4485, + "step": 11175 + }, + { + "epoch": 0.7596140779997282, + "grad_norm": 0.7097727656364441, + "learning_rate": 4.52524120125017e-05, + "loss": 1.3639, + "step": 11180 + }, + { + "epoch": 0.75995379807039, + "grad_norm": 0.6772062182426453, + "learning_rate": 4.525028876206007e-05, + "loss": 1.3756, + "step": 11185 + }, + { + "epoch": 0.7602935181410517, + "grad_norm": 0.6018824577331543, + "learning_rate": 4.5248165511618426e-05, + "loss": 1.3822, + "step": 11190 + }, + { + "epoch": 0.7606332382117136, + "grad_norm": 0.610094428062439, + "learning_rate": 4.524604226117679e-05, + "loss": 1.4147, + "step": 11195 + }, + { + "epoch": 0.7609729582823753, + "grad_norm": 0.6564494967460632, + "learning_rate": 4.524391901073516e-05, + "loss": 1.3623, + "step": 11200 + }, + { + "epoch": 0.7613126783530371, + "grad_norm": 0.6258195638656616, + "learning_rate": 4.524179576029352e-05, + "loss": 1.3278, + "step": 11205 + }, + { + "epoch": 0.7616523984236989, + "grad_norm": 0.5058313012123108, + "learning_rate": 4.523967250985188e-05, + "loss": 1.2296, + "step": 11210 + }, + { + "epoch": 0.7619921184943607, + "grad_norm": 0.6882098317146301, + "learning_rate": 4.523754925941025e-05, + "loss": 1.283, + "step": 11215 + }, + { + "epoch": 0.7623318385650224, + "grad_norm": 0.6522939801216125, + "learning_rate": 4.523542600896861e-05, + "loss": 1.3471, + "step": 11220 + }, + { + "epoch": 0.7626715586356841, + "grad_norm": 0.7013705968856812, + "learning_rate": 4.5233302758526974e-05, + "loss": 1.3625, + "step": 11225 + }, + { + "epoch": 0.763011278706346, + "grad_norm": 0.669908881187439, + "learning_rate": 4.5231179508085345e-05, + "loss": 1.4546, + "step": 11230 + }, + { + "epoch": 0.7633509987770077, + "grad_norm": 0.6184269785881042, + "learning_rate": 4.52290562576437e-05, + "loss": 1.4344, + "step": 11235 + }, + { + "epoch": 0.7636907188476695, + "grad_norm": 0.6487805247306824, + "learning_rate": 4.5226933007202066e-05, + "loss": 1.4197, + "step": 11240 + }, + { + "epoch": 0.7640304389183313, + "grad_norm": 0.7183907628059387, + "learning_rate": 4.522480975676043e-05, + "loss": 1.268, + "step": 11245 + }, + { + "epoch": 0.7643701589889931, + "grad_norm": 0.6929051876068115, + "learning_rate": 4.5222686506318794e-05, + "loss": 1.3469, + "step": 11250 + }, + { + "epoch": 0.7647098790596548, + "grad_norm": 0.6434996724128723, + "learning_rate": 4.522056325587716e-05, + "loss": 1.3583, + "step": 11255 + }, + { + "epoch": 0.7650495991303167, + "grad_norm": 0.7112532258033752, + "learning_rate": 4.521844000543552e-05, + "loss": 1.3904, + "step": 11260 + }, + { + "epoch": 0.7653893192009784, + "grad_norm": 0.6957665085792542, + "learning_rate": 4.5216316754993886e-05, + "loss": 1.4094, + "step": 11265 + }, + { + "epoch": 0.7657290392716402, + "grad_norm": 0.6419957280158997, + "learning_rate": 4.521419350455225e-05, + "loss": 1.2995, + "step": 11270 + }, + { + "epoch": 0.7660687593423019, + "grad_norm": 0.5854098796844482, + "learning_rate": 4.5212070254110614e-05, + "loss": 1.2422, + "step": 11275 + }, + { + "epoch": 0.7664084794129638, + "grad_norm": 0.6475620269775391, + "learning_rate": 4.520994700366898e-05, + "loss": 1.3331, + "step": 11280 + }, + { + "epoch": 0.7667481994836255, + "grad_norm": 0.6581407785415649, + "learning_rate": 4.520782375322734e-05, + "loss": 1.2795, + "step": 11285 + }, + { + "epoch": 0.7670879195542872, + "grad_norm": 0.6953970789909363, + "learning_rate": 4.5205700502785706e-05, + "loss": 1.4028, + "step": 11290 + }, + { + "epoch": 0.7674276396249491, + "grad_norm": 0.6748022437095642, + "learning_rate": 4.520357725234407e-05, + "loss": 1.3456, + "step": 11295 + }, + { + "epoch": 0.7677673596956108, + "grad_norm": 0.6086054444313049, + "learning_rate": 4.5201454001902434e-05, + "loss": 1.4028, + "step": 11300 + }, + { + "epoch": 0.7681070797662726, + "grad_norm": 0.5455803871154785, + "learning_rate": 4.51993307514608e-05, + "loss": 1.3613, + "step": 11305 + }, + { + "epoch": 0.7684467998369343, + "grad_norm": 0.7180493474006653, + "learning_rate": 4.519720750101916e-05, + "loss": 1.357, + "step": 11310 + }, + { + "epoch": 0.7687865199075962, + "grad_norm": 0.6505820751190186, + "learning_rate": 4.5195084250577526e-05, + "loss": 1.2822, + "step": 11315 + }, + { + "epoch": 0.7691262399782579, + "grad_norm": 0.7178351879119873, + "learning_rate": 4.519296100013589e-05, + "loss": 1.3484, + "step": 11320 + }, + { + "epoch": 0.7694659600489197, + "grad_norm": 0.7940161228179932, + "learning_rate": 4.5190837749694254e-05, + "loss": 1.2942, + "step": 11325 + }, + { + "epoch": 0.7698056801195815, + "grad_norm": 0.693401575088501, + "learning_rate": 4.518871449925262e-05, + "loss": 1.3506, + "step": 11330 + }, + { + "epoch": 0.7701454001902432, + "grad_norm": 0.7177097201347351, + "learning_rate": 4.518659124881098e-05, + "loss": 1.2475, + "step": 11335 + }, + { + "epoch": 0.770485120260905, + "grad_norm": 0.7371872067451477, + "learning_rate": 4.5184467998369346e-05, + "loss": 1.3756, + "step": 11340 + }, + { + "epoch": 0.7708248403315668, + "grad_norm": 0.7532777190208435, + "learning_rate": 4.518234474792771e-05, + "loss": 1.3471, + "step": 11345 + }, + { + "epoch": 0.7711645604022286, + "grad_norm": 0.8056697845458984, + "learning_rate": 4.5180221497486074e-05, + "loss": 1.4821, + "step": 11350 + }, + { + "epoch": 0.7715042804728903, + "grad_norm": 0.6422814726829529, + "learning_rate": 4.517809824704444e-05, + "loss": 1.3969, + "step": 11355 + }, + { + "epoch": 0.7718440005435521, + "grad_norm": 0.6496434807777405, + "learning_rate": 4.51759749966028e-05, + "loss": 1.395, + "step": 11360 + }, + { + "epoch": 0.7721837206142139, + "grad_norm": 0.6860586404800415, + "learning_rate": 4.5173851746161166e-05, + "loss": 1.2887, + "step": 11365 + }, + { + "epoch": 0.7725234406848757, + "grad_norm": 0.6781855821609497, + "learning_rate": 4.517172849571953e-05, + "loss": 1.4128, + "step": 11370 + }, + { + "epoch": 0.7728631607555374, + "grad_norm": 0.6779969930648804, + "learning_rate": 4.516960524527789e-05, + "loss": 1.2631, + "step": 11375 + }, + { + "epoch": 0.7732028808261993, + "grad_norm": 0.7600628137588501, + "learning_rate": 4.516748199483626e-05, + "loss": 1.3465, + "step": 11380 + }, + { + "epoch": 0.773542600896861, + "grad_norm": 0.6441403031349182, + "learning_rate": 4.516535874439462e-05, + "loss": 1.446, + "step": 11385 + }, + { + "epoch": 0.7738823209675227, + "grad_norm": 0.6897464990615845, + "learning_rate": 4.516323549395298e-05, + "loss": 1.3506, + "step": 11390 + }, + { + "epoch": 0.7742220410381845, + "grad_norm": 0.7222656607627869, + "learning_rate": 4.516111224351135e-05, + "loss": 1.3308, + "step": 11395 + }, + { + "epoch": 0.7745617611088463, + "grad_norm": 0.6664167642593384, + "learning_rate": 4.5158988993069714e-05, + "loss": 1.3773, + "step": 11400 + }, + { + "epoch": 0.7749014811795081, + "grad_norm": 0.690582275390625, + "learning_rate": 4.515686574262807e-05, + "loss": 1.4416, + "step": 11405 + }, + { + "epoch": 0.7752412012501698, + "grad_norm": 0.6101199984550476, + "learning_rate": 4.515474249218644e-05, + "loss": 1.3748, + "step": 11410 + }, + { + "epoch": 0.7755809213208317, + "grad_norm": 0.8245213627815247, + "learning_rate": 4.5152619241744806e-05, + "loss": 1.3361, + "step": 11415 + }, + { + "epoch": 0.7759206413914934, + "grad_norm": 0.7068120241165161, + "learning_rate": 4.515049599130317e-05, + "loss": 1.3667, + "step": 11420 + }, + { + "epoch": 0.7762603614621552, + "grad_norm": 0.6961685419082642, + "learning_rate": 4.5148372740861534e-05, + "loss": 1.4166, + "step": 11425 + }, + { + "epoch": 0.776600081532817, + "grad_norm": 0.6665591597557068, + "learning_rate": 4.51462494904199e-05, + "loss": 1.3002, + "step": 11430 + }, + { + "epoch": 0.7769398016034788, + "grad_norm": 0.6144057512283325, + "learning_rate": 4.514412623997826e-05, + "loss": 1.4215, + "step": 11435 + }, + { + "epoch": 0.7772795216741405, + "grad_norm": 0.6899815797805786, + "learning_rate": 4.5142002989536626e-05, + "loss": 1.4774, + "step": 11440 + }, + { + "epoch": 0.7776192417448022, + "grad_norm": 0.6336473226547241, + "learning_rate": 4.5139879739094984e-05, + "loss": 1.4458, + "step": 11445 + }, + { + "epoch": 0.7779589618154641, + "grad_norm": 0.734282374382019, + "learning_rate": 4.5137756488653354e-05, + "loss": 1.3422, + "step": 11450 + }, + { + "epoch": 0.7782986818861258, + "grad_norm": 0.6325424313545227, + "learning_rate": 4.513563323821172e-05, + "loss": 1.6687, + "step": 11455 + }, + { + "epoch": 0.7786384019567876, + "grad_norm": 0.7124289274215698, + "learning_rate": 4.5133509987770076e-05, + "loss": 1.3919, + "step": 11460 + }, + { + "epoch": 0.7789781220274494, + "grad_norm": 0.6993489861488342, + "learning_rate": 4.5131386737328446e-05, + "loss": 1.3735, + "step": 11465 + }, + { + "epoch": 0.7793178420981112, + "grad_norm": 0.6520628929138184, + "learning_rate": 4.512926348688681e-05, + "loss": 1.3458, + "step": 11470 + }, + { + "epoch": 0.7796575621687729, + "grad_norm": 0.6258121132850647, + "learning_rate": 4.512714023644517e-05, + "loss": 1.3438, + "step": 11475 + }, + { + "epoch": 0.7799972822394347, + "grad_norm": 0.6752614378929138, + "learning_rate": 4.512501698600354e-05, + "loss": 1.4234, + "step": 11480 + }, + { + "epoch": 0.7803370023100965, + "grad_norm": 0.7024703621864319, + "learning_rate": 4.51228937355619e-05, + "loss": 1.4341, + "step": 11485 + }, + { + "epoch": 0.7806767223807582, + "grad_norm": 0.65691739320755, + "learning_rate": 4.512077048512026e-05, + "loss": 1.319, + "step": 11490 + }, + { + "epoch": 0.78101644245142, + "grad_norm": 0.7620543837547302, + "learning_rate": 4.511864723467863e-05, + "loss": 1.3211, + "step": 11495 + }, + { + "epoch": 0.7813561625220818, + "grad_norm": 0.6572378873825073, + "learning_rate": 4.5116523984236994e-05, + "loss": 1.3941, + "step": 11500 + }, + { + "epoch": 0.7816958825927436, + "grad_norm": 0.6216370463371277, + "learning_rate": 4.511440073379535e-05, + "loss": 1.2718, + "step": 11505 + }, + { + "epoch": 0.7820356026634053, + "grad_norm": 0.6395732164382935, + "learning_rate": 4.511227748335372e-05, + "loss": 1.3318, + "step": 11510 + }, + { + "epoch": 0.7823753227340672, + "grad_norm": 0.6153049468994141, + "learning_rate": 4.511015423291208e-05, + "loss": 1.33, + "step": 11515 + }, + { + "epoch": 0.7827150428047289, + "grad_norm": 0.6759575605392456, + "learning_rate": 4.5108030982470444e-05, + "loss": 1.4836, + "step": 11520 + }, + { + "epoch": 0.7830547628753907, + "grad_norm": 0.6762524247169495, + "learning_rate": 4.5105907732028814e-05, + "loss": 1.427, + "step": 11525 + }, + { + "epoch": 0.7833944829460524, + "grad_norm": 0.6496263146400452, + "learning_rate": 4.510378448158717e-05, + "loss": 1.3195, + "step": 11530 + }, + { + "epoch": 0.7837342030167143, + "grad_norm": 0.6128015518188477, + "learning_rate": 4.5101661231145536e-05, + "loss": 1.3555, + "step": 11535 + }, + { + "epoch": 0.784073923087376, + "grad_norm": 0.6299436092376709, + "learning_rate": 4.5099537980703906e-05, + "loss": 1.4056, + "step": 11540 + }, + { + "epoch": 0.7844136431580377, + "grad_norm": 0.6537281274795532, + "learning_rate": 4.5097414730262264e-05, + "loss": 1.3819, + "step": 11545 + }, + { + "epoch": 0.7847533632286996, + "grad_norm": 0.6531508564949036, + "learning_rate": 4.509529147982063e-05, + "loss": 1.3104, + "step": 11550 + }, + { + "epoch": 0.7850930832993613, + "grad_norm": 0.6828337907791138, + "learning_rate": 4.5093168229379e-05, + "loss": 1.4078, + "step": 11555 + }, + { + "epoch": 0.7854328033700231, + "grad_norm": 0.6737443804740906, + "learning_rate": 4.5091044978937356e-05, + "loss": 1.4144, + "step": 11560 + }, + { + "epoch": 0.7857725234406848, + "grad_norm": 0.7346624732017517, + "learning_rate": 4.508892172849572e-05, + "loss": 1.4487, + "step": 11565 + }, + { + "epoch": 0.7861122435113467, + "grad_norm": 0.6856018304824829, + "learning_rate": 4.508679847805409e-05, + "loss": 1.2994, + "step": 11570 + }, + { + "epoch": 0.7864519635820084, + "grad_norm": 0.6291006207466125, + "learning_rate": 4.508467522761245e-05, + "loss": 1.1959, + "step": 11575 + }, + { + "epoch": 0.7867916836526702, + "grad_norm": 0.6951645612716675, + "learning_rate": 4.508255197717081e-05, + "loss": 1.3675, + "step": 11580 + }, + { + "epoch": 0.787131403723332, + "grad_norm": 0.8353897333145142, + "learning_rate": 4.508042872672918e-05, + "loss": 1.3176, + "step": 11585 + }, + { + "epoch": 0.7874711237939938, + "grad_norm": 0.6127036809921265, + "learning_rate": 4.507830547628754e-05, + "loss": 1.3862, + "step": 11590 + }, + { + "epoch": 0.7878108438646555, + "grad_norm": 0.6927005648612976, + "learning_rate": 4.5076182225845904e-05, + "loss": 1.362, + "step": 11595 + }, + { + "epoch": 0.7881505639353173, + "grad_norm": 0.6582446098327637, + "learning_rate": 4.507405897540427e-05, + "loss": 1.3313, + "step": 11600 + }, + { + "epoch": 0.7884902840059791, + "grad_norm": 0.6648750901222229, + "learning_rate": 4.507193572496263e-05, + "loss": 1.411, + "step": 11605 + }, + { + "epoch": 0.7888300040766408, + "grad_norm": 0.6673555374145508, + "learning_rate": 4.5069812474520996e-05, + "loss": 1.4298, + "step": 11610 + }, + { + "epoch": 0.7891697241473026, + "grad_norm": 0.6484569907188416, + "learning_rate": 4.506768922407936e-05, + "loss": 1.3281, + "step": 11615 + }, + { + "epoch": 0.7895094442179644, + "grad_norm": 0.7276854515075684, + "learning_rate": 4.5065565973637724e-05, + "loss": 1.3447, + "step": 11620 + }, + { + "epoch": 0.7898491642886262, + "grad_norm": 0.7185012698173523, + "learning_rate": 4.506344272319609e-05, + "loss": 1.3579, + "step": 11625 + }, + { + "epoch": 0.7901888843592879, + "grad_norm": 0.6514365077018738, + "learning_rate": 4.506131947275445e-05, + "loss": 1.4761, + "step": 11630 + }, + { + "epoch": 0.7905286044299498, + "grad_norm": 0.6557944416999817, + "learning_rate": 4.5059196222312816e-05, + "loss": 1.4036, + "step": 11635 + }, + { + "epoch": 0.7908683245006115, + "grad_norm": 0.648170530796051, + "learning_rate": 4.505707297187118e-05, + "loss": 1.385, + "step": 11640 + }, + { + "epoch": 0.7912080445712733, + "grad_norm": 0.6744073629379272, + "learning_rate": 4.5054949721429544e-05, + "loss": 1.3894, + "step": 11645 + }, + { + "epoch": 0.791547764641935, + "grad_norm": 0.6395312547683716, + "learning_rate": 4.505282647098791e-05, + "loss": 1.4086, + "step": 11650 + }, + { + "epoch": 0.7918874847125968, + "grad_norm": 0.7469674944877625, + "learning_rate": 4.505070322054627e-05, + "loss": 1.3691, + "step": 11655 + }, + { + "epoch": 0.7922272047832586, + "grad_norm": 0.676816999912262, + "learning_rate": 4.5048579970104636e-05, + "loss": 1.4049, + "step": 11660 + }, + { + "epoch": 0.7925669248539203, + "grad_norm": 0.6831178069114685, + "learning_rate": 4.5046456719663e-05, + "loss": 1.4065, + "step": 11665 + }, + { + "epoch": 0.7929066449245822, + "grad_norm": 0.6235385537147522, + "learning_rate": 4.5044333469221364e-05, + "loss": 1.3501, + "step": 11670 + }, + { + "epoch": 0.7932463649952439, + "grad_norm": 0.7086701393127441, + "learning_rate": 4.504221021877973e-05, + "loss": 1.3325, + "step": 11675 + }, + { + "epoch": 0.7935860850659057, + "grad_norm": 0.629204511642456, + "learning_rate": 4.504008696833809e-05, + "loss": 1.309, + "step": 11680 + }, + { + "epoch": 0.7939258051365675, + "grad_norm": 0.6631782650947571, + "learning_rate": 4.5037963717896456e-05, + "loss": 1.4683, + "step": 11685 + }, + { + "epoch": 0.7942655252072293, + "grad_norm": 0.6834977865219116, + "learning_rate": 4.503584046745482e-05, + "loss": 1.2518, + "step": 11690 + }, + { + "epoch": 0.794605245277891, + "grad_norm": 0.7612637281417847, + "learning_rate": 4.5033717217013184e-05, + "loss": 1.4312, + "step": 11695 + }, + { + "epoch": 0.7949449653485527, + "grad_norm": 0.7162277698516846, + "learning_rate": 4.503159396657155e-05, + "loss": 1.4017, + "step": 11700 + }, + { + "epoch": 0.7952846854192146, + "grad_norm": 0.7021288275718689, + "learning_rate": 4.502947071612991e-05, + "loss": 1.2941, + "step": 11705 + }, + { + "epoch": 0.7956244054898763, + "grad_norm": 0.6390158534049988, + "learning_rate": 4.5027347465688276e-05, + "loss": 1.3193, + "step": 11710 + }, + { + "epoch": 0.7959641255605381, + "grad_norm": 0.7118582129478455, + "learning_rate": 4.502522421524663e-05, + "loss": 1.4165, + "step": 11715 + }, + { + "epoch": 0.7963038456311999, + "grad_norm": 0.6698769927024841, + "learning_rate": 4.5023100964805004e-05, + "loss": 1.3383, + "step": 11720 + }, + { + "epoch": 0.7966435657018617, + "grad_norm": 0.6821621656417847, + "learning_rate": 4.502097771436337e-05, + "loss": 1.3457, + "step": 11725 + }, + { + "epoch": 0.7969832857725234, + "grad_norm": 0.6107878088951111, + "learning_rate": 4.5018854463921725e-05, + "loss": 1.2819, + "step": 11730 + }, + { + "epoch": 0.7973230058431852, + "grad_norm": 0.783470630645752, + "learning_rate": 4.5016731213480096e-05, + "loss": 1.3601, + "step": 11735 + }, + { + "epoch": 0.797662725913847, + "grad_norm": 0.5205401182174683, + "learning_rate": 4.501460796303846e-05, + "loss": 1.2042, + "step": 11740 + }, + { + "epoch": 0.7980024459845088, + "grad_norm": 0.6711729168891907, + "learning_rate": 4.501248471259682e-05, + "loss": 1.2931, + "step": 11745 + }, + { + "epoch": 0.7983421660551705, + "grad_norm": 0.7575369477272034, + "learning_rate": 4.501036146215519e-05, + "loss": 1.32, + "step": 11750 + }, + { + "epoch": 0.7986818861258324, + "grad_norm": 0.7468956708908081, + "learning_rate": 4.500823821171355e-05, + "loss": 1.3422, + "step": 11755 + }, + { + "epoch": 0.7990216061964941, + "grad_norm": 0.6427664756774902, + "learning_rate": 4.5006114961271916e-05, + "loss": 1.3344, + "step": 11760 + }, + { + "epoch": 0.7993613262671558, + "grad_norm": 0.6748515963554382, + "learning_rate": 4.500399171083028e-05, + "loss": 1.3656, + "step": 11765 + }, + { + "epoch": 0.7997010463378177, + "grad_norm": 0.6909468173980713, + "learning_rate": 4.5001868460388644e-05, + "loss": 1.5675, + "step": 11770 + }, + { + "epoch": 0.8000407664084794, + "grad_norm": 0.6876988410949707, + "learning_rate": 4.499974520994701e-05, + "loss": 1.3812, + "step": 11775 + }, + { + "epoch": 0.8003804864791412, + "grad_norm": 0.698937714099884, + "learning_rate": 4.499762195950537e-05, + "loss": 1.3926, + "step": 11780 + }, + { + "epoch": 0.8007202065498029, + "grad_norm": 0.5930280089378357, + "learning_rate": 4.4995498709063736e-05, + "loss": 1.4472, + "step": 11785 + }, + { + "epoch": 0.8010599266204648, + "grad_norm": 0.6637153625488281, + "learning_rate": 4.49933754586221e-05, + "loss": 1.4034, + "step": 11790 + }, + { + "epoch": 0.8013996466911265, + "grad_norm": 0.665380597114563, + "learning_rate": 4.4991252208180464e-05, + "loss": 1.4277, + "step": 11795 + }, + { + "epoch": 0.8017393667617883, + "grad_norm": 0.7033234238624573, + "learning_rate": 4.498912895773882e-05, + "loss": 1.4312, + "step": 11800 + }, + { + "epoch": 0.8020790868324501, + "grad_norm": 0.7284811735153198, + "learning_rate": 4.498700570729719e-05, + "loss": 1.4031, + "step": 11805 + }, + { + "epoch": 0.8024188069031118, + "grad_norm": 0.7314227819442749, + "learning_rate": 4.4984882456855556e-05, + "loss": 1.3191, + "step": 11810 + }, + { + "epoch": 0.8027585269737736, + "grad_norm": 0.8094779253005981, + "learning_rate": 4.498275920641391e-05, + "loss": 1.3541, + "step": 11815 + }, + { + "epoch": 0.8030982470444353, + "grad_norm": 0.6797764301300049, + "learning_rate": 4.4980635955972284e-05, + "loss": 1.3905, + "step": 11820 + }, + { + "epoch": 0.8034379671150972, + "grad_norm": 0.7100040912628174, + "learning_rate": 4.497851270553065e-05, + "loss": 1.4634, + "step": 11825 + }, + { + "epoch": 0.8037776871857589, + "grad_norm": 0.730214536190033, + "learning_rate": 4.4976389455089005e-05, + "loss": 1.3458, + "step": 11830 + }, + { + "epoch": 0.8041174072564207, + "grad_norm": 0.6728085875511169, + "learning_rate": 4.4974266204647376e-05, + "loss": 1.4842, + "step": 11835 + }, + { + "epoch": 0.8044571273270825, + "grad_norm": 0.7188294529914856, + "learning_rate": 4.497214295420574e-05, + "loss": 1.3956, + "step": 11840 + }, + { + "epoch": 0.8047968473977443, + "grad_norm": 0.7234100699424744, + "learning_rate": 4.49700197037641e-05, + "loss": 1.3436, + "step": 11845 + }, + { + "epoch": 0.805136567468406, + "grad_norm": 0.6598080992698669, + "learning_rate": 4.496789645332247e-05, + "loss": 1.3716, + "step": 11850 + }, + { + "epoch": 0.8054762875390679, + "grad_norm": 0.697017252445221, + "learning_rate": 4.496577320288083e-05, + "loss": 1.2724, + "step": 11855 + }, + { + "epoch": 0.8058160076097296, + "grad_norm": 0.7265612483024597, + "learning_rate": 4.496364995243919e-05, + "loss": 1.3161, + "step": 11860 + }, + { + "epoch": 0.8061557276803913, + "grad_norm": 0.7386727333068848, + "learning_rate": 4.496152670199756e-05, + "loss": 1.312, + "step": 11865 + }, + { + "epoch": 0.8064954477510531, + "grad_norm": 0.6545016169548035, + "learning_rate": 4.495940345155592e-05, + "loss": 1.3828, + "step": 11870 + }, + { + "epoch": 0.8068351678217149, + "grad_norm": 0.6210530400276184, + "learning_rate": 4.495728020111428e-05, + "loss": 1.3055, + "step": 11875 + }, + { + "epoch": 0.8071748878923767, + "grad_norm": 0.6984265446662903, + "learning_rate": 4.495515695067265e-05, + "loss": 1.315, + "step": 11880 + }, + { + "epoch": 0.8075146079630384, + "grad_norm": 0.8059458136558533, + "learning_rate": 4.495303370023101e-05, + "loss": 1.412, + "step": 11885 + }, + { + "epoch": 0.8078543280337003, + "grad_norm": 0.6970521211624146, + "learning_rate": 4.495091044978937e-05, + "loss": 1.3041, + "step": 11890 + }, + { + "epoch": 0.808194048104362, + "grad_norm": 0.680242657661438, + "learning_rate": 4.4948787199347744e-05, + "loss": 1.3809, + "step": 11895 + }, + { + "epoch": 0.8085337681750238, + "grad_norm": 0.6085513234138489, + "learning_rate": 4.49466639489061e-05, + "loss": 1.2587, + "step": 11900 + }, + { + "epoch": 0.8088734882456855, + "grad_norm": 0.7571601867675781, + "learning_rate": 4.4944540698464465e-05, + "loss": 1.4282, + "step": 11905 + }, + { + "epoch": 0.8092132083163474, + "grad_norm": 0.7318002581596375, + "learning_rate": 4.4942417448022836e-05, + "loss": 1.5599, + "step": 11910 + }, + { + "epoch": 0.8095529283870091, + "grad_norm": 0.6579176187515259, + "learning_rate": 4.494029419758119e-05, + "loss": 1.3842, + "step": 11915 + }, + { + "epoch": 0.8098926484576708, + "grad_norm": 0.6124897003173828, + "learning_rate": 4.493817094713956e-05, + "loss": 1.3003, + "step": 11920 + }, + { + "epoch": 0.8102323685283327, + "grad_norm": 0.6390940546989441, + "learning_rate": 4.493604769669793e-05, + "loss": 1.4045, + "step": 11925 + }, + { + "epoch": 0.8105720885989944, + "grad_norm": 0.7170646786689758, + "learning_rate": 4.4933924446256285e-05, + "loss": 1.397, + "step": 11930 + }, + { + "epoch": 0.8109118086696562, + "grad_norm": 0.6255606412887573, + "learning_rate": 4.493180119581465e-05, + "loss": 1.3251, + "step": 11935 + }, + { + "epoch": 0.811251528740318, + "grad_norm": 0.5758702158927917, + "learning_rate": 4.492967794537301e-05, + "loss": 1.4673, + "step": 11940 + }, + { + "epoch": 0.8115912488109798, + "grad_norm": 0.6808621287345886, + "learning_rate": 4.492755469493138e-05, + "loss": 1.3583, + "step": 11945 + }, + { + "epoch": 0.8119309688816415, + "grad_norm": 0.5954741835594177, + "learning_rate": 4.492543144448974e-05, + "loss": 1.4841, + "step": 11950 + }, + { + "epoch": 0.8122706889523033, + "grad_norm": 0.7672919034957886, + "learning_rate": 4.4923308194048105e-05, + "loss": 1.3046, + "step": 11955 + }, + { + "epoch": 0.8126104090229651, + "grad_norm": 0.6502673625946045, + "learning_rate": 4.492118494360647e-05, + "loss": 1.4264, + "step": 11960 + }, + { + "epoch": 0.8129501290936268, + "grad_norm": 0.6518172025680542, + "learning_rate": 4.491906169316483e-05, + "loss": 1.3076, + "step": 11965 + }, + { + "epoch": 0.8132898491642886, + "grad_norm": 0.6700319051742554, + "learning_rate": 4.49169384427232e-05, + "loss": 1.4476, + "step": 11970 + }, + { + "epoch": 0.8136295692349504, + "grad_norm": 0.7004209160804749, + "learning_rate": 4.491481519228156e-05, + "loss": 1.3513, + "step": 11975 + }, + { + "epoch": 0.8139692893056122, + "grad_norm": 0.6354067921638489, + "learning_rate": 4.4912691941839925e-05, + "loss": 1.3639, + "step": 11980 + }, + { + "epoch": 0.8143090093762739, + "grad_norm": 0.6545446515083313, + "learning_rate": 4.491056869139829e-05, + "loss": 1.3883, + "step": 11985 + }, + { + "epoch": 0.8146487294469357, + "grad_norm": 0.6701433062553406, + "learning_rate": 4.490844544095665e-05, + "loss": 1.3951, + "step": 11990 + }, + { + "epoch": 0.8149884495175975, + "grad_norm": 0.666741669178009, + "learning_rate": 4.490632219051502e-05, + "loss": 1.2614, + "step": 11995 + }, + { + "epoch": 0.8153281695882593, + "grad_norm": 0.6113370656967163, + "learning_rate": 4.490419894007338e-05, + "loss": 1.3702, + "step": 12000 + }, + { + "epoch": 0.815667889658921, + "grad_norm": 0.9769265651702881, + "learning_rate": 4.4902075689631745e-05, + "loss": 1.3561, + "step": 12005 + }, + { + "epoch": 0.8160076097295829, + "grad_norm": 0.6849420070648193, + "learning_rate": 4.489995243919011e-05, + "loss": 1.3951, + "step": 12010 + }, + { + "epoch": 0.8163473298002446, + "grad_norm": 0.6160445809364319, + "learning_rate": 4.489782918874847e-05, + "loss": 1.3413, + "step": 12015 + }, + { + "epoch": 0.8166870498709063, + "grad_norm": 0.7271686792373657, + "learning_rate": 4.489570593830684e-05, + "loss": 1.3699, + "step": 12020 + }, + { + "epoch": 0.8170267699415682, + "grad_norm": 0.6870571374893188, + "learning_rate": 4.48935826878652e-05, + "loss": 1.3483, + "step": 12025 + }, + { + "epoch": 0.8173664900122299, + "grad_norm": 0.7074158191680908, + "learning_rate": 4.4891459437423565e-05, + "loss": 1.3876, + "step": 12030 + }, + { + "epoch": 0.8177062100828917, + "grad_norm": 0.7424540519714355, + "learning_rate": 4.488933618698193e-05, + "loss": 1.3534, + "step": 12035 + }, + { + "epoch": 0.8180459301535534, + "grad_norm": 0.6281335353851318, + "learning_rate": 4.488721293654029e-05, + "loss": 1.3199, + "step": 12040 + }, + { + "epoch": 0.8183856502242153, + "grad_norm": 0.7014420628547668, + "learning_rate": 4.488508968609866e-05, + "loss": 1.4242, + "step": 12045 + }, + { + "epoch": 0.818725370294877, + "grad_norm": 0.5700911283493042, + "learning_rate": 4.488296643565702e-05, + "loss": 1.3779, + "step": 12050 + }, + { + "epoch": 0.8190650903655388, + "grad_norm": 0.7599695920944214, + "learning_rate": 4.4880843185215385e-05, + "loss": 1.391, + "step": 12055 + }, + { + "epoch": 0.8194048104362006, + "grad_norm": 0.6629148721694946, + "learning_rate": 4.487871993477375e-05, + "loss": 1.4024, + "step": 12060 + }, + { + "epoch": 0.8197445305068624, + "grad_norm": 0.6432792544364929, + "learning_rate": 4.487659668433211e-05, + "loss": 1.3843, + "step": 12065 + }, + { + "epoch": 0.8200842505775241, + "grad_norm": 0.6555769443511963, + "learning_rate": 4.487447343389047e-05, + "loss": 1.3477, + "step": 12070 + }, + { + "epoch": 0.8204239706481858, + "grad_norm": 0.7444199919700623, + "learning_rate": 4.487235018344884e-05, + "loss": 1.4641, + "step": 12075 + }, + { + "epoch": 0.8207636907188477, + "grad_norm": 0.6874467730522156, + "learning_rate": 4.4870226933007205e-05, + "loss": 1.3105, + "step": 12080 + }, + { + "epoch": 0.8211034107895094, + "grad_norm": 0.6294021010398865, + "learning_rate": 4.486810368256556e-05, + "loss": 1.2731, + "step": 12085 + }, + { + "epoch": 0.8214431308601712, + "grad_norm": 0.6811396479606628, + "learning_rate": 4.486598043212393e-05, + "loss": 1.4021, + "step": 12090 + }, + { + "epoch": 0.821782850930833, + "grad_norm": 0.6806802153587341, + "learning_rate": 4.48638571816823e-05, + "loss": 1.4288, + "step": 12095 + }, + { + "epoch": 0.8221225710014948, + "grad_norm": 0.7208311557769775, + "learning_rate": 4.486173393124066e-05, + "loss": 1.4121, + "step": 12100 + }, + { + "epoch": 0.8224622910721565, + "grad_norm": 0.7082199454307556, + "learning_rate": 4.4859610680799025e-05, + "loss": 1.3365, + "step": 12105 + }, + { + "epoch": 0.8228020111428184, + "grad_norm": 0.6614202260971069, + "learning_rate": 4.485748743035739e-05, + "loss": 1.2902, + "step": 12110 + }, + { + "epoch": 0.8231417312134801, + "grad_norm": 0.6267545819282532, + "learning_rate": 4.485536417991575e-05, + "loss": 1.355, + "step": 12115 + }, + { + "epoch": 0.8234814512841419, + "grad_norm": 0.6980351209640503, + "learning_rate": 4.485324092947412e-05, + "loss": 1.3563, + "step": 12120 + }, + { + "epoch": 0.8238211713548036, + "grad_norm": 0.690074622631073, + "learning_rate": 4.485111767903248e-05, + "loss": 1.3108, + "step": 12125 + }, + { + "epoch": 0.8241608914254654, + "grad_norm": 0.5915892720222473, + "learning_rate": 4.4848994428590845e-05, + "loss": 1.4385, + "step": 12130 + }, + { + "epoch": 0.8245006114961272, + "grad_norm": 0.716480016708374, + "learning_rate": 4.484687117814921e-05, + "loss": 1.4252, + "step": 12135 + }, + { + "epoch": 0.8248403315667889, + "grad_norm": 0.6369530558586121, + "learning_rate": 4.4844747927707567e-05, + "loss": 1.3827, + "step": 12140 + }, + { + "epoch": 0.8251800516374508, + "grad_norm": 0.650497555732727, + "learning_rate": 4.484262467726594e-05, + "loss": 1.4093, + "step": 12145 + }, + { + "epoch": 0.8255197717081125, + "grad_norm": 0.7560164928436279, + "learning_rate": 4.48405014268243e-05, + "loss": 1.3747, + "step": 12150 + }, + { + "epoch": 0.8258594917787743, + "grad_norm": 0.6294389963150024, + "learning_rate": 4.483837817638266e-05, + "loss": 1.4042, + "step": 12155 + }, + { + "epoch": 0.826199211849436, + "grad_norm": 0.66148442029953, + "learning_rate": 4.483625492594103e-05, + "loss": 1.2952, + "step": 12160 + }, + { + "epoch": 0.8265389319200979, + "grad_norm": 0.7001969218254089, + "learning_rate": 4.4834131675499393e-05, + "loss": 1.3641, + "step": 12165 + }, + { + "epoch": 0.8268786519907596, + "grad_norm": 0.6744056940078735, + "learning_rate": 4.483200842505775e-05, + "loss": 1.3479, + "step": 12170 + }, + { + "epoch": 0.8272183720614213, + "grad_norm": 0.7539799809455872, + "learning_rate": 4.482988517461612e-05, + "loss": 1.3364, + "step": 12175 + }, + { + "epoch": 0.8275580921320832, + "grad_norm": 0.6546370983123779, + "learning_rate": 4.4827761924174485e-05, + "loss": 1.4197, + "step": 12180 + }, + { + "epoch": 0.8278978122027449, + "grad_norm": 0.6741619110107422, + "learning_rate": 4.482563867373284e-05, + "loss": 1.3835, + "step": 12185 + }, + { + "epoch": 0.8282375322734067, + "grad_norm": 0.6156356334686279, + "learning_rate": 4.4823515423291213e-05, + "loss": 1.2733, + "step": 12190 + }, + { + "epoch": 0.8285772523440685, + "grad_norm": 0.6626706123352051, + "learning_rate": 4.482139217284958e-05, + "loss": 1.3214, + "step": 12195 + }, + { + "epoch": 0.8289169724147303, + "grad_norm": 0.6759384870529175, + "learning_rate": 4.4819268922407935e-05, + "loss": 1.3404, + "step": 12200 + }, + { + "epoch": 0.829256692485392, + "grad_norm": 0.6920154094696045, + "learning_rate": 4.4817145671966305e-05, + "loss": 1.419, + "step": 12205 + }, + { + "epoch": 0.8295964125560538, + "grad_norm": 0.6718053817749023, + "learning_rate": 4.481502242152467e-05, + "loss": 1.3467, + "step": 12210 + }, + { + "epoch": 0.8299361326267156, + "grad_norm": 0.7080114483833313, + "learning_rate": 4.481289917108303e-05, + "loss": 1.4269, + "step": 12215 + }, + { + "epoch": 0.8302758526973774, + "grad_norm": 0.6120716333389282, + "learning_rate": 4.48107759206414e-05, + "loss": 1.2772, + "step": 12220 + }, + { + "epoch": 0.8306155727680391, + "grad_norm": 0.6118149757385254, + "learning_rate": 4.4808652670199755e-05, + "loss": 1.4172, + "step": 12225 + }, + { + "epoch": 0.830955292838701, + "grad_norm": 0.6838310956954956, + "learning_rate": 4.480652941975812e-05, + "loss": 1.3848, + "step": 12230 + }, + { + "epoch": 0.8312950129093627, + "grad_norm": 0.7534611225128174, + "learning_rate": 4.480440616931649e-05, + "loss": 1.2555, + "step": 12235 + }, + { + "epoch": 0.8316347329800244, + "grad_norm": 0.6162254214286804, + "learning_rate": 4.480228291887485e-05, + "loss": 1.3061, + "step": 12240 + }, + { + "epoch": 0.8319744530506862, + "grad_norm": 0.6374291181564331, + "learning_rate": 4.480015966843321e-05, + "loss": 1.3659, + "step": 12245 + }, + { + "epoch": 0.832314173121348, + "grad_norm": 0.69317227602005, + "learning_rate": 4.479803641799158e-05, + "loss": 1.3199, + "step": 12250 + }, + { + "epoch": 0.8326538931920098, + "grad_norm": 0.5970177054405212, + "learning_rate": 4.479591316754994e-05, + "loss": 1.3317, + "step": 12255 + }, + { + "epoch": 0.8329936132626715, + "grad_norm": 0.6941751837730408, + "learning_rate": 4.47937899171083e-05, + "loss": 1.2643, + "step": 12260 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.651773989200592, + "learning_rate": 4.4791666666666673e-05, + "loss": 1.3537, + "step": 12265 + }, + { + "epoch": 0.8336730534039951, + "grad_norm": 0.7591174840927124, + "learning_rate": 4.478954341622503e-05, + "loss": 1.203, + "step": 12270 + }, + { + "epoch": 0.8340127734746569, + "grad_norm": 0.666493833065033, + "learning_rate": 4.4787420165783395e-05, + "loss": 1.3851, + "step": 12275 + }, + { + "epoch": 0.8343524935453187, + "grad_norm": 0.6488901376724243, + "learning_rate": 4.4785296915341765e-05, + "loss": 1.3747, + "step": 12280 + }, + { + "epoch": 0.8346922136159804, + "grad_norm": 0.6691904067993164, + "learning_rate": 4.478317366490012e-05, + "loss": 1.4109, + "step": 12285 + }, + { + "epoch": 0.8350319336866422, + "grad_norm": 0.6803374290466309, + "learning_rate": 4.478105041445849e-05, + "loss": 1.3148, + "step": 12290 + }, + { + "epoch": 0.8353716537573039, + "grad_norm": 0.770541787147522, + "learning_rate": 4.477892716401685e-05, + "loss": 1.3877, + "step": 12295 + }, + { + "epoch": 0.8357113738279658, + "grad_norm": 0.6622931361198425, + "learning_rate": 4.4776803913575215e-05, + "loss": 1.3414, + "step": 12300 + }, + { + "epoch": 0.8360510938986275, + "grad_norm": 0.7337367534637451, + "learning_rate": 4.477468066313358e-05, + "loss": 1.312, + "step": 12305 + }, + { + "epoch": 0.8363908139692893, + "grad_norm": 0.7819303870201111, + "learning_rate": 4.477255741269194e-05, + "loss": 1.337, + "step": 12310 + }, + { + "epoch": 0.8367305340399511, + "grad_norm": 0.6312582492828369, + "learning_rate": 4.477043416225031e-05, + "loss": 1.3172, + "step": 12315 + }, + { + "epoch": 0.8370702541106129, + "grad_norm": 0.6198135018348694, + "learning_rate": 4.476831091180867e-05, + "loss": 1.3957, + "step": 12320 + }, + { + "epoch": 0.8374099741812746, + "grad_norm": 0.7267417311668396, + "learning_rate": 4.4766187661367035e-05, + "loss": 1.3964, + "step": 12325 + }, + { + "epoch": 0.8377496942519363, + "grad_norm": 0.524419903755188, + "learning_rate": 4.47640644109254e-05, + "loss": 1.2128, + "step": 12330 + }, + { + "epoch": 0.8380894143225982, + "grad_norm": 0.6941156387329102, + "learning_rate": 4.476194116048376e-05, + "loss": 1.4847, + "step": 12335 + }, + { + "epoch": 0.83842913439326, + "grad_norm": 0.7097057700157166, + "learning_rate": 4.475981791004213e-05, + "loss": 1.3109, + "step": 12340 + }, + { + "epoch": 0.8387688544639217, + "grad_norm": 0.6544502377510071, + "learning_rate": 4.475769465960049e-05, + "loss": 1.3919, + "step": 12345 + }, + { + "epoch": 0.8391085745345835, + "grad_norm": 0.7121051549911499, + "learning_rate": 4.4755571409158855e-05, + "loss": 1.2416, + "step": 12350 + }, + { + "epoch": 0.8394482946052453, + "grad_norm": 0.6496121883392334, + "learning_rate": 4.475344815871722e-05, + "loss": 1.3091, + "step": 12355 + }, + { + "epoch": 0.839788014675907, + "grad_norm": 0.6753186583518982, + "learning_rate": 4.475132490827558e-05, + "loss": 1.4142, + "step": 12360 + }, + { + "epoch": 0.8401277347465689, + "grad_norm": 0.6260736584663391, + "learning_rate": 4.474920165783395e-05, + "loss": 1.4386, + "step": 12365 + }, + { + "epoch": 0.8404674548172306, + "grad_norm": 0.7074244022369385, + "learning_rate": 4.474707840739231e-05, + "loss": 1.2946, + "step": 12370 + }, + { + "epoch": 0.8408071748878924, + "grad_norm": 0.7217336297035217, + "learning_rate": 4.4744955156950675e-05, + "loss": 1.3625, + "step": 12375 + }, + { + "epoch": 0.8411468949585541, + "grad_norm": 0.6912894248962402, + "learning_rate": 4.474283190650904e-05, + "loss": 1.3473, + "step": 12380 + }, + { + "epoch": 0.841486615029216, + "grad_norm": 0.7598395347595215, + "learning_rate": 4.47407086560674e-05, + "loss": 1.3052, + "step": 12385 + }, + { + "epoch": 0.8418263350998777, + "grad_norm": 0.5946134924888611, + "learning_rate": 4.473858540562577e-05, + "loss": 1.3984, + "step": 12390 + }, + { + "epoch": 0.8421660551705394, + "grad_norm": 0.646136462688446, + "learning_rate": 4.473646215518413e-05, + "loss": 1.3275, + "step": 12395 + }, + { + "epoch": 0.8425057752412013, + "grad_norm": 0.6367974281311035, + "learning_rate": 4.4734338904742495e-05, + "loss": 1.3249, + "step": 12400 + }, + { + "epoch": 0.842845495311863, + "grad_norm": 0.5981682538986206, + "learning_rate": 4.473221565430086e-05, + "loss": 1.3666, + "step": 12405 + }, + { + "epoch": 0.8431852153825248, + "grad_norm": 0.7241153717041016, + "learning_rate": 4.473009240385922e-05, + "loss": 1.3762, + "step": 12410 + }, + { + "epoch": 0.8435249354531865, + "grad_norm": 0.5798956751823425, + "learning_rate": 4.472796915341759e-05, + "loss": 1.2933, + "step": 12415 + }, + { + "epoch": 0.8438646555238484, + "grad_norm": 0.6971281170845032, + "learning_rate": 4.472584590297595e-05, + "loss": 1.3748, + "step": 12420 + }, + { + "epoch": 0.8442043755945101, + "grad_norm": 0.8242133855819702, + "learning_rate": 4.472372265253431e-05, + "loss": 1.3884, + "step": 12425 + }, + { + "epoch": 0.8445440956651719, + "grad_norm": 0.7227784991264343, + "learning_rate": 4.472159940209268e-05, + "loss": 1.362, + "step": 12430 + }, + { + "epoch": 0.8448838157358337, + "grad_norm": 0.7591477632522583, + "learning_rate": 4.471947615165104e-05, + "loss": 1.3163, + "step": 12435 + }, + { + "epoch": 0.8452235358064955, + "grad_norm": 0.7205374240875244, + "learning_rate": 4.471735290120941e-05, + "loss": 1.2853, + "step": 12440 + }, + { + "epoch": 0.8455632558771572, + "grad_norm": 0.6931034922599792, + "learning_rate": 4.471522965076777e-05, + "loss": 1.4047, + "step": 12445 + }, + { + "epoch": 0.845902975947819, + "grad_norm": 0.5156599283218384, + "learning_rate": 4.4713106400326135e-05, + "loss": 1.4269, + "step": 12450 + }, + { + "epoch": 0.8462426960184808, + "grad_norm": 0.7908781170845032, + "learning_rate": 4.47109831498845e-05, + "loss": 1.4037, + "step": 12455 + }, + { + "epoch": 0.8465824160891425, + "grad_norm": 0.6748826503753662, + "learning_rate": 4.470885989944286e-05, + "loss": 1.3054, + "step": 12460 + }, + { + "epoch": 0.8469221361598043, + "grad_norm": 0.7158135771751404, + "learning_rate": 4.470673664900123e-05, + "loss": 1.3301, + "step": 12465 + }, + { + "epoch": 0.8472618562304661, + "grad_norm": 0.6487123370170593, + "learning_rate": 4.470461339855959e-05, + "loss": 1.2662, + "step": 12470 + }, + { + "epoch": 0.8476015763011279, + "grad_norm": 0.7046877145767212, + "learning_rate": 4.4702490148117955e-05, + "loss": 1.339, + "step": 12475 + }, + { + "epoch": 0.8479412963717896, + "grad_norm": 0.673567533493042, + "learning_rate": 4.470036689767632e-05, + "loss": 1.3771, + "step": 12480 + }, + { + "epoch": 0.8482810164424515, + "grad_norm": 0.7173770070075989, + "learning_rate": 4.469824364723468e-05, + "loss": 1.4022, + "step": 12485 + }, + { + "epoch": 0.8486207365131132, + "grad_norm": 0.7046067714691162, + "learning_rate": 4.469612039679305e-05, + "loss": 1.4082, + "step": 12490 + }, + { + "epoch": 0.848960456583775, + "grad_norm": 0.7364188432693481, + "learning_rate": 4.4693997146351404e-05, + "loss": 1.3989, + "step": 12495 + }, + { + "epoch": 0.8493001766544367, + "grad_norm": 0.7246367931365967, + "learning_rate": 4.4691873895909775e-05, + "loss": 1.4032, + "step": 12500 + }, + { + "epoch": 0.8496398967250985, + "grad_norm": 0.7582381963729858, + "learning_rate": 4.468975064546814e-05, + "loss": 1.2776, + "step": 12505 + }, + { + "epoch": 0.8499796167957603, + "grad_norm": 0.7966914772987366, + "learning_rate": 4.4687627395026496e-05, + "loss": 1.3539, + "step": 12510 + }, + { + "epoch": 0.850319336866422, + "grad_norm": 0.5626029372215271, + "learning_rate": 4.468550414458487e-05, + "loss": 1.3589, + "step": 12515 + }, + { + "epoch": 0.8506590569370839, + "grad_norm": 0.5940312147140503, + "learning_rate": 4.468338089414323e-05, + "loss": 1.298, + "step": 12520 + }, + { + "epoch": 0.8509987770077456, + "grad_norm": 0.7309895753860474, + "learning_rate": 4.468125764370159e-05, + "loss": 1.4446, + "step": 12525 + }, + { + "epoch": 0.8513384970784074, + "grad_norm": 0.7690249681472778, + "learning_rate": 4.467913439325996e-05, + "loss": 1.3459, + "step": 12530 + }, + { + "epoch": 0.8516782171490692, + "grad_norm": 0.6707097887992859, + "learning_rate": 4.467701114281832e-05, + "loss": 1.3827, + "step": 12535 + }, + { + "epoch": 0.852017937219731, + "grad_norm": 0.7316380739212036, + "learning_rate": 4.467488789237668e-05, + "loss": 1.3802, + "step": 12540 + }, + { + "epoch": 0.8523576572903927, + "grad_norm": 0.6959583163261414, + "learning_rate": 4.467276464193505e-05, + "loss": 1.3125, + "step": 12545 + }, + { + "epoch": 0.8526973773610544, + "grad_norm": 0.6206613183021545, + "learning_rate": 4.4670641391493415e-05, + "loss": 1.3474, + "step": 12550 + }, + { + "epoch": 0.8530370974317163, + "grad_norm": 0.683360755443573, + "learning_rate": 4.466851814105177e-05, + "loss": 1.3881, + "step": 12555 + }, + { + "epoch": 0.853376817502378, + "grad_norm": 0.5920791029930115, + "learning_rate": 4.466639489061014e-05, + "loss": 1.3983, + "step": 12560 + }, + { + "epoch": 0.8537165375730398, + "grad_norm": 0.7246683239936829, + "learning_rate": 4.46642716401685e-05, + "loss": 1.3337, + "step": 12565 + }, + { + "epoch": 0.8540562576437016, + "grad_norm": 0.6815592646598816, + "learning_rate": 4.4662148389726864e-05, + "loss": 1.4058, + "step": 12570 + }, + { + "epoch": 0.8543959777143634, + "grad_norm": 0.6942914724349976, + "learning_rate": 4.4660025139285235e-05, + "loss": 1.3565, + "step": 12575 + }, + { + "epoch": 0.8547356977850251, + "grad_norm": 0.7076385021209717, + "learning_rate": 4.465790188884359e-05, + "loss": 1.214, + "step": 12580 + }, + { + "epoch": 0.8550754178556869, + "grad_norm": 0.714252769947052, + "learning_rate": 4.4655778638401956e-05, + "loss": 1.3316, + "step": 12585 + }, + { + "epoch": 0.8554151379263487, + "grad_norm": 0.629565417766571, + "learning_rate": 4.465365538796033e-05, + "loss": 1.3956, + "step": 12590 + }, + { + "epoch": 0.8557548579970105, + "grad_norm": 0.7133734822273254, + "learning_rate": 4.4651532137518684e-05, + "loss": 1.4328, + "step": 12595 + }, + { + "epoch": 0.8560945780676722, + "grad_norm": 0.556172788143158, + "learning_rate": 4.464940888707705e-05, + "loss": 1.3339, + "step": 12600 + }, + { + "epoch": 0.856434298138334, + "grad_norm": 0.7105357050895691, + "learning_rate": 4.464728563663542e-05, + "loss": 1.3449, + "step": 12605 + }, + { + "epoch": 0.8567740182089958, + "grad_norm": 0.6951549649238586, + "learning_rate": 4.4645162386193776e-05, + "loss": 1.3716, + "step": 12610 + }, + { + "epoch": 0.8571137382796575, + "grad_norm": 0.6994973421096802, + "learning_rate": 4.464303913575214e-05, + "loss": 1.3174, + "step": 12615 + }, + { + "epoch": 0.8574534583503194, + "grad_norm": 0.6157001256942749, + "learning_rate": 4.464091588531051e-05, + "loss": 1.3073, + "step": 12620 + }, + { + "epoch": 0.8577931784209811, + "grad_norm": 0.7500584721565247, + "learning_rate": 4.463879263486887e-05, + "loss": 1.3058, + "step": 12625 + }, + { + "epoch": 0.8581328984916429, + "grad_norm": 0.6024318337440491, + "learning_rate": 4.463666938442723e-05, + "loss": 1.4033, + "step": 12630 + }, + { + "epoch": 0.8584726185623046, + "grad_norm": 0.619443953037262, + "learning_rate": 4.46345461339856e-05, + "loss": 1.4824, + "step": 12635 + }, + { + "epoch": 0.8588123386329665, + "grad_norm": 0.7248201966285706, + "learning_rate": 4.463242288354396e-05, + "loss": 1.4398, + "step": 12640 + }, + { + "epoch": 0.8591520587036282, + "grad_norm": 0.631622850894928, + "learning_rate": 4.4630299633102324e-05, + "loss": 1.4092, + "step": 12645 + }, + { + "epoch": 0.85949177877429, + "grad_norm": 0.6558489799499512, + "learning_rate": 4.462817638266069e-05, + "loss": 1.3589, + "step": 12650 + }, + { + "epoch": 0.8598314988449518, + "grad_norm": 0.7223998308181763, + "learning_rate": 4.462605313221905e-05, + "loss": 1.4554, + "step": 12655 + }, + { + "epoch": 0.8601712189156135, + "grad_norm": 0.7063769102096558, + "learning_rate": 4.4623929881777416e-05, + "loss": 1.403, + "step": 12660 + }, + { + "epoch": 0.8605109389862753, + "grad_norm": 0.6834850311279297, + "learning_rate": 4.462180663133578e-05, + "loss": 1.34, + "step": 12665 + }, + { + "epoch": 0.860850659056937, + "grad_norm": 0.6297242641448975, + "learning_rate": 4.4619683380894144e-05, + "loss": 1.2888, + "step": 12670 + }, + { + "epoch": 0.8611903791275989, + "grad_norm": 0.6202226877212524, + "learning_rate": 4.461756013045251e-05, + "loss": 1.433, + "step": 12675 + }, + { + "epoch": 0.8615300991982606, + "grad_norm": 0.7646043300628662, + "learning_rate": 4.461543688001087e-05, + "loss": 1.3271, + "step": 12680 + }, + { + "epoch": 0.8618698192689224, + "grad_norm": 0.6283760070800781, + "learning_rate": 4.4613313629569236e-05, + "loss": 1.4297, + "step": 12685 + }, + { + "epoch": 0.8622095393395842, + "grad_norm": 0.7506958842277527, + "learning_rate": 4.46111903791276e-05, + "loss": 1.3527, + "step": 12690 + }, + { + "epoch": 0.862549259410246, + "grad_norm": 0.709775984287262, + "learning_rate": 4.4609067128685964e-05, + "loss": 1.3299, + "step": 12695 + }, + { + "epoch": 0.8628889794809077, + "grad_norm": 0.8392066359519958, + "learning_rate": 4.460694387824433e-05, + "loss": 1.3447, + "step": 12700 + }, + { + "epoch": 0.8632286995515696, + "grad_norm": 0.7184649109840393, + "learning_rate": 4.460482062780269e-05, + "loss": 1.3057, + "step": 12705 + }, + { + "epoch": 0.8635684196222313, + "grad_norm": 0.7541151642799377, + "learning_rate": 4.4602697377361056e-05, + "loss": 1.3778, + "step": 12710 + }, + { + "epoch": 0.863908139692893, + "grad_norm": 0.6973274350166321, + "learning_rate": 4.460057412691942e-05, + "loss": 1.3023, + "step": 12715 + }, + { + "epoch": 0.8642478597635548, + "grad_norm": 0.5811592936515808, + "learning_rate": 4.4598450876477784e-05, + "loss": 1.4126, + "step": 12720 + }, + { + "epoch": 0.8645875798342166, + "grad_norm": 0.8044971823692322, + "learning_rate": 4.459632762603615e-05, + "loss": 1.3891, + "step": 12725 + }, + { + "epoch": 0.8649272999048784, + "grad_norm": 0.6741278171539307, + "learning_rate": 4.459420437559451e-05, + "loss": 1.4562, + "step": 12730 + }, + { + "epoch": 0.8652670199755401, + "grad_norm": 0.6165005564689636, + "learning_rate": 4.4592081125152876e-05, + "loss": 1.4003, + "step": 12735 + }, + { + "epoch": 0.865606740046202, + "grad_norm": 0.7122872471809387, + "learning_rate": 4.458995787471124e-05, + "loss": 1.4728, + "step": 12740 + }, + { + "epoch": 0.8659464601168637, + "grad_norm": 0.7181335687637329, + "learning_rate": 4.4587834624269604e-05, + "loss": 1.4214, + "step": 12745 + }, + { + "epoch": 0.8662861801875255, + "grad_norm": 0.5936729907989502, + "learning_rate": 4.458571137382797e-05, + "loss": 1.3632, + "step": 12750 + }, + { + "epoch": 0.8666259002581872, + "grad_norm": 0.6827543377876282, + "learning_rate": 4.458358812338633e-05, + "loss": 1.3793, + "step": 12755 + }, + { + "epoch": 0.866965620328849, + "grad_norm": 0.7110022306442261, + "learning_rate": 4.4581464872944696e-05, + "loss": 1.381, + "step": 12760 + }, + { + "epoch": 0.8673053403995108, + "grad_norm": 0.6814020872116089, + "learning_rate": 4.4579341622503054e-05, + "loss": 1.3126, + "step": 12765 + }, + { + "epoch": 0.8676450604701725, + "grad_norm": 0.7111496925354004, + "learning_rate": 4.4577218372061424e-05, + "loss": 1.2872, + "step": 12770 + }, + { + "epoch": 0.8679847805408344, + "grad_norm": 0.6328551173210144, + "learning_rate": 4.457509512161979e-05, + "loss": 1.3097, + "step": 12775 + }, + { + "epoch": 0.8683245006114961, + "grad_norm": 0.7361866235733032, + "learning_rate": 4.457297187117815e-05, + "loss": 1.3533, + "step": 12780 + }, + { + "epoch": 0.8686642206821579, + "grad_norm": 0.6520326733589172, + "learning_rate": 4.4570848620736516e-05, + "loss": 1.4271, + "step": 12785 + }, + { + "epoch": 0.8690039407528197, + "grad_norm": 0.7593867182731628, + "learning_rate": 4.456872537029488e-05, + "loss": 1.3221, + "step": 12790 + }, + { + "epoch": 0.8693436608234815, + "grad_norm": 0.6264833211898804, + "learning_rate": 4.4566602119853244e-05, + "loss": 1.3859, + "step": 12795 + }, + { + "epoch": 0.8696833808941432, + "grad_norm": 0.5434083938598633, + "learning_rate": 4.456447886941161e-05, + "loss": 1.3825, + "step": 12800 + }, + { + "epoch": 0.870023100964805, + "grad_norm": 0.583276629447937, + "learning_rate": 4.456235561896997e-05, + "loss": 1.3597, + "step": 12805 + }, + { + "epoch": 0.8703628210354668, + "grad_norm": 0.65021812915802, + "learning_rate": 4.4560232368528336e-05, + "loss": 1.4308, + "step": 12810 + }, + { + "epoch": 0.8707025411061285, + "grad_norm": 0.7106450200080872, + "learning_rate": 4.45581091180867e-05, + "loss": 1.4236, + "step": 12815 + }, + { + "epoch": 0.8710422611767903, + "grad_norm": 0.6209539771080017, + "learning_rate": 4.4555985867645064e-05, + "loss": 1.3181, + "step": 12820 + }, + { + "epoch": 0.8713819812474521, + "grad_norm": 0.6393656730651855, + "learning_rate": 4.455386261720343e-05, + "loss": 1.4685, + "step": 12825 + }, + { + "epoch": 0.8717217013181139, + "grad_norm": 0.7080166935920715, + "learning_rate": 4.455173936676179e-05, + "loss": 1.2752, + "step": 12830 + }, + { + "epoch": 0.8720614213887756, + "grad_norm": 0.6802095770835876, + "learning_rate": 4.4549616116320156e-05, + "loss": 1.3952, + "step": 12835 + }, + { + "epoch": 0.8724011414594374, + "grad_norm": 0.6535228490829468, + "learning_rate": 4.454749286587852e-05, + "loss": 1.3435, + "step": 12840 + }, + { + "epoch": 0.8727408615300992, + "grad_norm": 0.8674468398094177, + "learning_rate": 4.4545369615436884e-05, + "loss": 1.3229, + "step": 12845 + }, + { + "epoch": 0.873080581600761, + "grad_norm": 0.6909651756286621, + "learning_rate": 4.454324636499524e-05, + "loss": 1.4718, + "step": 12850 + }, + { + "epoch": 0.8734203016714227, + "grad_norm": 0.6655237078666687, + "learning_rate": 4.454112311455361e-05, + "loss": 1.3768, + "step": 12855 + }, + { + "epoch": 0.8737600217420846, + "grad_norm": 0.6951791048049927, + "learning_rate": 4.4538999864111976e-05, + "loss": 1.2887, + "step": 12860 + }, + { + "epoch": 0.8740997418127463, + "grad_norm": 0.676295816898346, + "learning_rate": 4.4536876613670334e-05, + "loss": 1.3273, + "step": 12865 + }, + { + "epoch": 0.874439461883408, + "grad_norm": 0.7393082976341248, + "learning_rate": 4.4534753363228704e-05, + "loss": 1.1891, + "step": 12870 + }, + { + "epoch": 0.8747791819540699, + "grad_norm": 0.6166176199913025, + "learning_rate": 4.453263011278707e-05, + "loss": 1.3926, + "step": 12875 + }, + { + "epoch": 0.8751189020247316, + "grad_norm": 0.7175416350364685, + "learning_rate": 4.4530506862345426e-05, + "loss": 1.3168, + "step": 12880 + }, + { + "epoch": 0.8754586220953934, + "grad_norm": 0.665534496307373, + "learning_rate": 4.4528383611903796e-05, + "loss": 1.3882, + "step": 12885 + }, + { + "epoch": 0.8757983421660551, + "grad_norm": 0.6802185773849487, + "learning_rate": 4.452626036146216e-05, + "loss": 1.366, + "step": 12890 + }, + { + "epoch": 0.876138062236717, + "grad_norm": 0.6779190897941589, + "learning_rate": 4.452413711102052e-05, + "loss": 1.3629, + "step": 12895 + }, + { + "epoch": 0.8764777823073787, + "grad_norm": 0.622493326663971, + "learning_rate": 4.452201386057889e-05, + "loss": 1.4485, + "step": 12900 + }, + { + "epoch": 0.8768175023780405, + "grad_norm": 0.7519457340240479, + "learning_rate": 4.451989061013725e-05, + "loss": 1.3622, + "step": 12905 + }, + { + "epoch": 0.8771572224487023, + "grad_norm": 0.6253534555435181, + "learning_rate": 4.451776735969561e-05, + "loss": 1.3836, + "step": 12910 + }, + { + "epoch": 0.877496942519364, + "grad_norm": 0.6690431833267212, + "learning_rate": 4.451564410925398e-05, + "loss": 1.3331, + "step": 12915 + }, + { + "epoch": 0.8778366625900258, + "grad_norm": 0.6430015563964844, + "learning_rate": 4.451352085881234e-05, + "loss": 1.3918, + "step": 12920 + }, + { + "epoch": 0.8781763826606876, + "grad_norm": 0.6269593834877014, + "learning_rate": 4.45113976083707e-05, + "loss": 1.2831, + "step": 12925 + }, + { + "epoch": 0.8785161027313494, + "grad_norm": 0.6534284353256226, + "learning_rate": 4.450927435792907e-05, + "loss": 1.329, + "step": 12930 + }, + { + "epoch": 0.8788558228020111, + "grad_norm": 0.685684859752655, + "learning_rate": 4.450715110748743e-05, + "loss": 1.3478, + "step": 12935 + }, + { + "epoch": 0.8791955428726729, + "grad_norm": 0.7119500041007996, + "learning_rate": 4.4505027857045794e-05, + "loss": 1.2784, + "step": 12940 + }, + { + "epoch": 0.8795352629433347, + "grad_norm": 0.6209441423416138, + "learning_rate": 4.4502904606604164e-05, + "loss": 1.3552, + "step": 12945 + }, + { + "epoch": 0.8798749830139965, + "grad_norm": 0.7009634375572205, + "learning_rate": 4.450078135616252e-05, + "loss": 1.305, + "step": 12950 + }, + { + "epoch": 0.8802147030846582, + "grad_norm": 0.6846415996551514, + "learning_rate": 4.4498658105720886e-05, + "loss": 1.343, + "step": 12955 + }, + { + "epoch": 0.8805544231553201, + "grad_norm": 0.7048271894454956, + "learning_rate": 4.4496534855279256e-05, + "loss": 1.3897, + "step": 12960 + }, + { + "epoch": 0.8808941432259818, + "grad_norm": 0.6972647905349731, + "learning_rate": 4.4494411604837614e-05, + "loss": 1.3743, + "step": 12965 + }, + { + "epoch": 0.8812338632966435, + "grad_norm": 0.6757969856262207, + "learning_rate": 4.449228835439598e-05, + "loss": 1.3773, + "step": 12970 + }, + { + "epoch": 0.8815735833673053, + "grad_norm": 0.6842308044433594, + "learning_rate": 4.449016510395435e-05, + "loss": 1.3798, + "step": 12975 + }, + { + "epoch": 0.8819133034379671, + "grad_norm": 0.6644297242164612, + "learning_rate": 4.4488041853512706e-05, + "loss": 1.3698, + "step": 12980 + }, + { + "epoch": 0.8822530235086289, + "grad_norm": 0.7060693502426147, + "learning_rate": 4.448591860307107e-05, + "loss": 1.3364, + "step": 12985 + }, + { + "epoch": 0.8825927435792906, + "grad_norm": 0.767492413520813, + "learning_rate": 4.448379535262944e-05, + "loss": 1.4915, + "step": 12990 + }, + { + "epoch": 0.8829324636499525, + "grad_norm": 0.6986490488052368, + "learning_rate": 4.44816721021878e-05, + "loss": 1.356, + "step": 12995 + }, + { + "epoch": 0.8832721837206142, + "grad_norm": 0.8189640641212463, + "learning_rate": 4.447954885174616e-05, + "loss": 1.3555, + "step": 13000 + }, + { + "epoch": 0.883611903791276, + "grad_norm": 0.822814404964447, + "learning_rate": 4.4477425601304526e-05, + "loss": 1.4497, + "step": 13005 + }, + { + "epoch": 0.8839516238619378, + "grad_norm": 0.7144615650177002, + "learning_rate": 4.447530235086289e-05, + "loss": 1.3439, + "step": 13010 + }, + { + "epoch": 0.8842913439325996, + "grad_norm": 0.6712161302566528, + "learning_rate": 4.4473179100421254e-05, + "loss": 1.4226, + "step": 13015 + }, + { + "epoch": 0.8846310640032613, + "grad_norm": 0.6807404160499573, + "learning_rate": 4.447105584997962e-05, + "loss": 1.3964, + "step": 13020 + }, + { + "epoch": 0.884970784073923, + "grad_norm": 0.6526500582695007, + "learning_rate": 4.446893259953798e-05, + "loss": 1.257, + "step": 13025 + }, + { + "epoch": 0.8853105041445849, + "grad_norm": 0.791623055934906, + "learning_rate": 4.4466809349096346e-05, + "loss": 1.3861, + "step": 13030 + }, + { + "epoch": 0.8856502242152466, + "grad_norm": 0.7261638045310974, + "learning_rate": 4.446468609865471e-05, + "loss": 1.3642, + "step": 13035 + }, + { + "epoch": 0.8859899442859084, + "grad_norm": 0.7141435742378235, + "learning_rate": 4.4462562848213074e-05, + "loss": 1.3801, + "step": 13040 + }, + { + "epoch": 0.8863296643565702, + "grad_norm": 0.6886386275291443, + "learning_rate": 4.446043959777144e-05, + "loss": 1.441, + "step": 13045 + }, + { + "epoch": 0.886669384427232, + "grad_norm": 0.6873751282691956, + "learning_rate": 4.44583163473298e-05, + "loss": 1.3655, + "step": 13050 + }, + { + "epoch": 0.8870091044978937, + "grad_norm": 0.7220982909202576, + "learning_rate": 4.4456193096888166e-05, + "loss": 1.3556, + "step": 13055 + }, + { + "epoch": 0.8873488245685555, + "grad_norm": 0.6405097842216492, + "learning_rate": 4.445406984644653e-05, + "loss": 1.3033, + "step": 13060 + }, + { + "epoch": 0.8876885446392173, + "grad_norm": 0.6902096271514893, + "learning_rate": 4.4451946596004894e-05, + "loss": 1.6509, + "step": 13065 + }, + { + "epoch": 0.8880282647098791, + "grad_norm": 0.8238720893859863, + "learning_rate": 4.444982334556326e-05, + "loss": 1.3401, + "step": 13070 + }, + { + "epoch": 0.8883679847805408, + "grad_norm": 0.7122278809547424, + "learning_rate": 4.444770009512162e-05, + "loss": 1.4091, + "step": 13075 + }, + { + "epoch": 0.8887077048512027, + "grad_norm": 0.6530733108520508, + "learning_rate": 4.4445576844679986e-05, + "loss": 1.3942, + "step": 13080 + }, + { + "epoch": 0.8890474249218644, + "grad_norm": 0.7043144702911377, + "learning_rate": 4.444345359423835e-05, + "loss": 1.4006, + "step": 13085 + }, + { + "epoch": 0.8893871449925261, + "grad_norm": 0.6719483137130737, + "learning_rate": 4.4441330343796714e-05, + "loss": 1.3808, + "step": 13090 + }, + { + "epoch": 0.889726865063188, + "grad_norm": 0.6334452033042908, + "learning_rate": 4.443920709335508e-05, + "loss": 1.4171, + "step": 13095 + }, + { + "epoch": 0.8900665851338497, + "grad_norm": 0.686010479927063, + "learning_rate": 4.443708384291344e-05, + "loss": 1.3543, + "step": 13100 + }, + { + "epoch": 0.8904063052045115, + "grad_norm": 0.6849266290664673, + "learning_rate": 4.4434960592471806e-05, + "loss": 1.3469, + "step": 13105 + }, + { + "epoch": 0.8907460252751732, + "grad_norm": 0.6323592066764832, + "learning_rate": 4.443283734203017e-05, + "loss": 1.3562, + "step": 13110 + }, + { + "epoch": 0.8910857453458351, + "grad_norm": 0.7662525773048401, + "learning_rate": 4.4430714091588534e-05, + "loss": 1.3266, + "step": 13115 + }, + { + "epoch": 0.8914254654164968, + "grad_norm": 0.6142858266830444, + "learning_rate": 4.44285908411469e-05, + "loss": 1.3438, + "step": 13120 + }, + { + "epoch": 0.8917651854871586, + "grad_norm": 0.6289658546447754, + "learning_rate": 4.442646759070526e-05, + "loss": 1.2906, + "step": 13125 + }, + { + "epoch": 0.8921049055578204, + "grad_norm": 0.7273269295692444, + "learning_rate": 4.4424344340263626e-05, + "loss": 1.3285, + "step": 13130 + }, + { + "epoch": 0.8924446256284821, + "grad_norm": 0.7088006734848022, + "learning_rate": 4.442222108982199e-05, + "loss": 1.2829, + "step": 13135 + }, + { + "epoch": 0.8927843456991439, + "grad_norm": 0.663790762424469, + "learning_rate": 4.4420097839380354e-05, + "loss": 1.3886, + "step": 13140 + }, + { + "epoch": 0.8931240657698056, + "grad_norm": 0.5893279314041138, + "learning_rate": 4.441797458893872e-05, + "loss": 1.3726, + "step": 13145 + }, + { + "epoch": 0.8934637858404675, + "grad_norm": 0.6438887119293213, + "learning_rate": 4.441585133849708e-05, + "loss": 1.4389, + "step": 13150 + }, + { + "epoch": 0.8938035059111292, + "grad_norm": 0.6501761674880981, + "learning_rate": 4.4413728088055446e-05, + "loss": 1.4758, + "step": 13155 + }, + { + "epoch": 0.894143225981791, + "grad_norm": 0.693305253982544, + "learning_rate": 4.441160483761381e-05, + "loss": 1.336, + "step": 13160 + }, + { + "epoch": 0.8944829460524528, + "grad_norm": 0.641102135181427, + "learning_rate": 4.4409481587172174e-05, + "loss": 1.2956, + "step": 13165 + }, + { + "epoch": 0.8948226661231146, + "grad_norm": 0.6698890328407288, + "learning_rate": 4.440735833673054e-05, + "loss": 1.3542, + "step": 13170 + }, + { + "epoch": 0.8951623861937763, + "grad_norm": 0.6819259524345398, + "learning_rate": 4.44052350862889e-05, + "loss": 1.4348, + "step": 13175 + }, + { + "epoch": 0.8955021062644382, + "grad_norm": 0.7356864809989929, + "learning_rate": 4.4403111835847266e-05, + "loss": 1.4219, + "step": 13180 + }, + { + "epoch": 0.8958418263350999, + "grad_norm": 0.7078156471252441, + "learning_rate": 4.440098858540563e-05, + "loss": 1.4111, + "step": 13185 + }, + { + "epoch": 0.8961815464057616, + "grad_norm": 0.6777173280715942, + "learning_rate": 4.4398865334963994e-05, + "loss": 1.3411, + "step": 13190 + }, + { + "epoch": 0.8965212664764234, + "grad_norm": 0.6705132722854614, + "learning_rate": 4.439674208452236e-05, + "loss": 1.337, + "step": 13195 + }, + { + "epoch": 0.8968609865470852, + "grad_norm": 0.6225191950798035, + "learning_rate": 4.439461883408072e-05, + "loss": 1.3543, + "step": 13200 + }, + { + "epoch": 0.897200706617747, + "grad_norm": 0.7024900913238525, + "learning_rate": 4.439249558363908e-05, + "loss": 1.3405, + "step": 13205 + }, + { + "epoch": 0.8975404266884087, + "grad_norm": 0.6447491645812988, + "learning_rate": 4.439037233319745e-05, + "loss": 1.2934, + "step": 13210 + }, + { + "epoch": 0.8978801467590706, + "grad_norm": 0.6816796660423279, + "learning_rate": 4.4388249082755814e-05, + "loss": 1.2964, + "step": 13215 + }, + { + "epoch": 0.8982198668297323, + "grad_norm": 0.7214002013206482, + "learning_rate": 4.438612583231417e-05, + "loss": 1.3165, + "step": 13220 + }, + { + "epoch": 0.8985595869003941, + "grad_norm": 0.7090854048728943, + "learning_rate": 4.438400258187254e-05, + "loss": 1.3587, + "step": 13225 + }, + { + "epoch": 0.8988993069710558, + "grad_norm": 0.5795954465866089, + "learning_rate": 4.4381879331430906e-05, + "loss": 1.3832, + "step": 13230 + }, + { + "epoch": 0.8992390270417177, + "grad_norm": 0.7099621295928955, + "learning_rate": 4.437975608098926e-05, + "loss": 1.3428, + "step": 13235 + }, + { + "epoch": 0.8995787471123794, + "grad_norm": 0.6782974600791931, + "learning_rate": 4.4377632830547634e-05, + "loss": 1.369, + "step": 13240 + }, + { + "epoch": 0.8999184671830411, + "grad_norm": 0.6810418367385864, + "learning_rate": 4.4375509580106e-05, + "loss": 1.3566, + "step": 13245 + }, + { + "epoch": 0.900258187253703, + "grad_norm": 0.7035811543464661, + "learning_rate": 4.4373386329664355e-05, + "loss": 1.3677, + "step": 13250 + }, + { + "epoch": 0.9005979073243647, + "grad_norm": 0.7452874779701233, + "learning_rate": 4.4371263079222726e-05, + "loss": 1.3218, + "step": 13255 + }, + { + "epoch": 0.9009376273950265, + "grad_norm": 0.6274643540382385, + "learning_rate": 4.436913982878109e-05, + "loss": 1.3924, + "step": 13260 + }, + { + "epoch": 0.9012773474656883, + "grad_norm": 0.6658132672309875, + "learning_rate": 4.436701657833945e-05, + "loss": 1.3177, + "step": 13265 + }, + { + "epoch": 0.9016170675363501, + "grad_norm": 0.7131541967391968, + "learning_rate": 4.436489332789782e-05, + "loss": 1.3075, + "step": 13270 + }, + { + "epoch": 0.9019567876070118, + "grad_norm": 0.6533933877944946, + "learning_rate": 4.4362770077456175e-05, + "loss": 1.3699, + "step": 13275 + }, + { + "epoch": 0.9022965076776736, + "grad_norm": 0.7328100800514221, + "learning_rate": 4.436064682701454e-05, + "loss": 1.3425, + "step": 13280 + }, + { + "epoch": 0.9026362277483354, + "grad_norm": 0.6254872679710388, + "learning_rate": 4.435852357657291e-05, + "loss": 1.3822, + "step": 13285 + }, + { + "epoch": 0.9029759478189971, + "grad_norm": 0.6984084844589233, + "learning_rate": 4.435640032613127e-05, + "loss": 1.3947, + "step": 13290 + }, + { + "epoch": 0.9033156678896589, + "grad_norm": 0.5597203969955444, + "learning_rate": 4.435427707568963e-05, + "loss": 1.3589, + "step": 13295 + }, + { + "epoch": 0.9036553879603207, + "grad_norm": 0.668630838394165, + "learning_rate": 4.4352153825248e-05, + "loss": 1.4675, + "step": 13300 + }, + { + "epoch": 0.9039951080309825, + "grad_norm": 0.7078883647918701, + "learning_rate": 4.435003057480636e-05, + "loss": 1.3876, + "step": 13305 + }, + { + "epoch": 0.9043348281016442, + "grad_norm": 0.6608153581619263, + "learning_rate": 4.434790732436472e-05, + "loss": 1.4429, + "step": 13310 + }, + { + "epoch": 0.904674548172306, + "grad_norm": 0.6872571706771851, + "learning_rate": 4.4345784073923094e-05, + "loss": 1.4909, + "step": 13315 + }, + { + "epoch": 0.9050142682429678, + "grad_norm": 0.7475165724754333, + "learning_rate": 4.434366082348145e-05, + "loss": 1.3526, + "step": 13320 + }, + { + "epoch": 0.9053539883136296, + "grad_norm": 0.7566111087799072, + "learning_rate": 4.4341537573039815e-05, + "loss": 1.2801, + "step": 13325 + }, + { + "epoch": 0.9056937083842913, + "grad_norm": 0.6893516778945923, + "learning_rate": 4.4339414322598186e-05, + "loss": 1.3676, + "step": 13330 + }, + { + "epoch": 0.9060334284549532, + "grad_norm": 0.7658765912055969, + "learning_rate": 4.433729107215654e-05, + "loss": 1.3154, + "step": 13335 + }, + { + "epoch": 0.9063731485256149, + "grad_norm": 0.6320791840553284, + "learning_rate": 4.433516782171491e-05, + "loss": 1.3613, + "step": 13340 + }, + { + "epoch": 0.9067128685962766, + "grad_norm": 0.7179452180862427, + "learning_rate": 4.433304457127327e-05, + "loss": 1.4489, + "step": 13345 + }, + { + "epoch": 0.9070525886669385, + "grad_norm": 0.7113944292068481, + "learning_rate": 4.4330921320831635e-05, + "loss": 1.2941, + "step": 13350 + }, + { + "epoch": 0.9073923087376002, + "grad_norm": 0.7318709492683411, + "learning_rate": 4.432879807039e-05, + "loss": 1.3876, + "step": 13355 + }, + { + "epoch": 0.907732028808262, + "grad_norm": 0.7524285912513733, + "learning_rate": 4.432667481994836e-05, + "loss": 1.4184, + "step": 13360 + }, + { + "epoch": 0.9080717488789237, + "grad_norm": 0.6286381483078003, + "learning_rate": 4.432455156950673e-05, + "loss": 1.3671, + "step": 13365 + }, + { + "epoch": 0.9084114689495856, + "grad_norm": 0.6272299289703369, + "learning_rate": 4.432242831906509e-05, + "loss": 1.3946, + "step": 13370 + }, + { + "epoch": 0.9087511890202473, + "grad_norm": 0.5813237428665161, + "learning_rate": 4.4320305068623455e-05, + "loss": 1.2941, + "step": 13375 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.6842602491378784, + "learning_rate": 4.431818181818182e-05, + "loss": 1.4783, + "step": 13380 + }, + { + "epoch": 0.9094306291615709, + "grad_norm": 0.6329503655433655, + "learning_rate": 4.431605856774018e-05, + "loss": 1.374, + "step": 13385 + }, + { + "epoch": 0.9097703492322327, + "grad_norm": 0.7969860434532166, + "learning_rate": 4.431393531729855e-05, + "loss": 1.2046, + "step": 13390 + }, + { + "epoch": 0.9101100693028944, + "grad_norm": 0.7060311436653137, + "learning_rate": 4.431181206685691e-05, + "loss": 1.3383, + "step": 13395 + }, + { + "epoch": 0.9104497893735561, + "grad_norm": 0.5485876798629761, + "learning_rate": 4.4309688816415275e-05, + "loss": 1.2818, + "step": 13400 + }, + { + "epoch": 0.910789509444218, + "grad_norm": 0.6860634088516235, + "learning_rate": 4.430756556597364e-05, + "loss": 1.4044, + "step": 13405 + }, + { + "epoch": 0.9111292295148797, + "grad_norm": 0.6536875367164612, + "learning_rate": 4.4305442315532e-05, + "loss": 1.3108, + "step": 13410 + }, + { + "epoch": 0.9114689495855415, + "grad_norm": 0.689294159412384, + "learning_rate": 4.430331906509037e-05, + "loss": 1.39, + "step": 13415 + }, + { + "epoch": 0.9118086696562033, + "grad_norm": 0.6730312705039978, + "learning_rate": 4.430119581464873e-05, + "loss": 1.3091, + "step": 13420 + }, + { + "epoch": 0.9121483897268651, + "grad_norm": 0.6333032250404358, + "learning_rate": 4.4299072564207095e-05, + "loss": 1.4008, + "step": 13425 + }, + { + "epoch": 0.9124881097975268, + "grad_norm": 0.7166286706924438, + "learning_rate": 4.429694931376546e-05, + "loss": 1.222, + "step": 13430 + }, + { + "epoch": 0.9128278298681887, + "grad_norm": 0.7296689748764038, + "learning_rate": 4.429482606332382e-05, + "loss": 1.3895, + "step": 13435 + }, + { + "epoch": 0.9131675499388504, + "grad_norm": 0.6304849982261658, + "learning_rate": 4.429270281288219e-05, + "loss": 1.3599, + "step": 13440 + }, + { + "epoch": 0.9135072700095122, + "grad_norm": 0.6465513706207275, + "learning_rate": 4.429057956244055e-05, + "loss": 1.315, + "step": 13445 + }, + { + "epoch": 0.9138469900801739, + "grad_norm": 0.7055228352546692, + "learning_rate": 4.4288456311998915e-05, + "loss": 1.374, + "step": 13450 + }, + { + "epoch": 0.9141867101508357, + "grad_norm": 0.6719948053359985, + "learning_rate": 4.428633306155728e-05, + "loss": 1.4075, + "step": 13455 + }, + { + "epoch": 0.9145264302214975, + "grad_norm": 0.7837120294570923, + "learning_rate": 4.428420981111564e-05, + "loss": 1.3971, + "step": 13460 + }, + { + "epoch": 0.9148661502921592, + "grad_norm": 0.6214168667793274, + "learning_rate": 4.428208656067401e-05, + "loss": 1.3746, + "step": 13465 + }, + { + "epoch": 0.9152058703628211, + "grad_norm": 0.6817909479141235, + "learning_rate": 4.427996331023237e-05, + "loss": 1.2682, + "step": 13470 + }, + { + "epoch": 0.9155455904334828, + "grad_norm": 0.6014137864112854, + "learning_rate": 4.4277840059790735e-05, + "loss": 1.3443, + "step": 13475 + }, + { + "epoch": 0.9158853105041446, + "grad_norm": 0.7761210799217224, + "learning_rate": 4.42757168093491e-05, + "loss": 1.4239, + "step": 13480 + }, + { + "epoch": 0.9162250305748063, + "grad_norm": 0.7059696316719055, + "learning_rate": 4.427359355890746e-05, + "loss": 1.3228, + "step": 13485 + }, + { + "epoch": 0.9165647506454682, + "grad_norm": 0.7353407740592957, + "learning_rate": 4.427147030846583e-05, + "loss": 1.3071, + "step": 13490 + }, + { + "epoch": 0.9169044707161299, + "grad_norm": 0.6706827878952026, + "learning_rate": 4.426934705802419e-05, + "loss": 1.3291, + "step": 13495 + }, + { + "epoch": 0.9172441907867916, + "grad_norm": 0.725422739982605, + "learning_rate": 4.4267223807582555e-05, + "loss": 1.3362, + "step": 13500 + }, + { + "epoch": 0.9175839108574535, + "grad_norm": 0.6997534036636353, + "learning_rate": 4.426510055714092e-05, + "loss": 1.3418, + "step": 13505 + }, + { + "epoch": 0.9179236309281152, + "grad_norm": 0.659503161907196, + "learning_rate": 4.426297730669928e-05, + "loss": 1.4357, + "step": 13510 + }, + { + "epoch": 0.918263350998777, + "grad_norm": 0.6595714092254639, + "learning_rate": 4.426085405625765e-05, + "loss": 1.3532, + "step": 13515 + }, + { + "epoch": 0.9186030710694388, + "grad_norm": 0.6540241837501526, + "learning_rate": 4.425873080581601e-05, + "loss": 1.2698, + "step": 13520 + }, + { + "epoch": 0.9189427911401006, + "grad_norm": 0.6801666021347046, + "learning_rate": 4.4256607555374375e-05, + "loss": 1.3823, + "step": 13525 + }, + { + "epoch": 0.9192825112107623, + "grad_norm": 0.7063329815864563, + "learning_rate": 4.425448430493274e-05, + "loss": 1.269, + "step": 13530 + }, + { + "epoch": 0.9196222312814241, + "grad_norm": 0.6541804671287537, + "learning_rate": 4.42523610544911e-05, + "loss": 1.4347, + "step": 13535 + }, + { + "epoch": 0.9199619513520859, + "grad_norm": 0.7060062885284424, + "learning_rate": 4.425023780404947e-05, + "loss": 1.4064, + "step": 13540 + }, + { + "epoch": 0.9203016714227477, + "grad_norm": 0.7766232490539551, + "learning_rate": 4.4248114553607825e-05, + "loss": 1.4848, + "step": 13545 + }, + { + "epoch": 0.9206413914934094, + "grad_norm": 0.6796672940254211, + "learning_rate": 4.4245991303166195e-05, + "loss": 1.4842, + "step": 13550 + }, + { + "epoch": 0.9209811115640713, + "grad_norm": 0.6018598675727844, + "learning_rate": 4.424386805272456e-05, + "loss": 1.3176, + "step": 13555 + }, + { + "epoch": 0.921320831634733, + "grad_norm": 0.6565004587173462, + "learning_rate": 4.4241744802282917e-05, + "loss": 1.3705, + "step": 13560 + }, + { + "epoch": 0.9216605517053947, + "grad_norm": 0.6929068565368652, + "learning_rate": 4.423962155184129e-05, + "loss": 1.3362, + "step": 13565 + }, + { + "epoch": 0.9220002717760565, + "grad_norm": 0.6933250427246094, + "learning_rate": 4.423749830139965e-05, + "loss": 1.3811, + "step": 13570 + }, + { + "epoch": 0.9223399918467183, + "grad_norm": 0.7431418299674988, + "learning_rate": 4.423537505095801e-05, + "loss": 1.3066, + "step": 13575 + }, + { + "epoch": 0.9226797119173801, + "grad_norm": 0.686419665813446, + "learning_rate": 4.423325180051638e-05, + "loss": 1.3276, + "step": 13580 + }, + { + "epoch": 0.9230194319880418, + "grad_norm": 0.6179741024971008, + "learning_rate": 4.423112855007474e-05, + "loss": 1.3087, + "step": 13585 + }, + { + "epoch": 0.9233591520587037, + "grad_norm": 0.6031321287155151, + "learning_rate": 4.42290052996331e-05, + "loss": 1.3237, + "step": 13590 + }, + { + "epoch": 0.9236988721293654, + "grad_norm": 0.7255411744117737, + "learning_rate": 4.422688204919147e-05, + "loss": 1.3777, + "step": 13595 + }, + { + "epoch": 0.9240385922000272, + "grad_norm": 0.6925275325775146, + "learning_rate": 4.4224758798749835e-05, + "loss": 1.3086, + "step": 13600 + }, + { + "epoch": 0.924378312270689, + "grad_norm": 0.7831964492797852, + "learning_rate": 4.422263554830819e-05, + "loss": 1.4558, + "step": 13605 + }, + { + "epoch": 0.9247180323413507, + "grad_norm": 0.6948872208595276, + "learning_rate": 4.4220512297866563e-05, + "loss": 1.3506, + "step": 13610 + }, + { + "epoch": 0.9250577524120125, + "grad_norm": 0.5872417092323303, + "learning_rate": 4.421838904742493e-05, + "loss": 1.3226, + "step": 13615 + }, + { + "epoch": 0.9253974724826742, + "grad_norm": 0.6647357940673828, + "learning_rate": 4.4216265796983285e-05, + "loss": 1.4053, + "step": 13620 + }, + { + "epoch": 0.9257371925533361, + "grad_norm": 0.7161340713500977, + "learning_rate": 4.4214142546541655e-05, + "loss": 1.3165, + "step": 13625 + }, + { + "epoch": 0.9260769126239978, + "grad_norm": 0.6334004402160645, + "learning_rate": 4.421201929610001e-05, + "loss": 1.2966, + "step": 13630 + }, + { + "epoch": 0.9264166326946596, + "grad_norm": 0.6781123876571655, + "learning_rate": 4.420989604565838e-05, + "loss": 1.3496, + "step": 13635 + }, + { + "epoch": 0.9267563527653214, + "grad_norm": 0.7244052290916443, + "learning_rate": 4.420777279521675e-05, + "loss": 1.3406, + "step": 13640 + }, + { + "epoch": 0.9270960728359832, + "grad_norm": 0.5577734708786011, + "learning_rate": 4.4205649544775105e-05, + "loss": 1.2553, + "step": 13645 + }, + { + "epoch": 0.9274357929066449, + "grad_norm": 0.6536522507667542, + "learning_rate": 4.420352629433347e-05, + "loss": 1.3195, + "step": 13650 + }, + { + "epoch": 0.9277755129773066, + "grad_norm": 0.7267488241195679, + "learning_rate": 4.420140304389184e-05, + "loss": 1.3983, + "step": 13655 + }, + { + "epoch": 0.9281152330479685, + "grad_norm": 0.7505548596382141, + "learning_rate": 4.41992797934502e-05, + "loss": 1.4297, + "step": 13660 + }, + { + "epoch": 0.9284549531186302, + "grad_norm": 0.6807383894920349, + "learning_rate": 4.419715654300856e-05, + "loss": 1.3618, + "step": 13665 + }, + { + "epoch": 0.928794673189292, + "grad_norm": 0.7637690901756287, + "learning_rate": 4.419503329256693e-05, + "loss": 1.346, + "step": 13670 + }, + { + "epoch": 0.9291343932599538, + "grad_norm": 0.6206974983215332, + "learning_rate": 4.419291004212529e-05, + "loss": 1.3381, + "step": 13675 + }, + { + "epoch": 0.9294741133306156, + "grad_norm": 0.7176368236541748, + "learning_rate": 4.419078679168365e-05, + "loss": 1.3247, + "step": 13680 + }, + { + "epoch": 0.9298138334012773, + "grad_norm": 0.6860157251358032, + "learning_rate": 4.4188663541242023e-05, + "loss": 1.3047, + "step": 13685 + }, + { + "epoch": 0.9301535534719392, + "grad_norm": 0.6906935572624207, + "learning_rate": 4.418654029080038e-05, + "loss": 1.3295, + "step": 13690 + }, + { + "epoch": 0.9304932735426009, + "grad_norm": 0.6833156943321228, + "learning_rate": 4.4184417040358745e-05, + "loss": 1.3558, + "step": 13695 + }, + { + "epoch": 0.9308329936132627, + "grad_norm": 0.7648639678955078, + "learning_rate": 4.418229378991711e-05, + "loss": 1.3238, + "step": 13700 + }, + { + "epoch": 0.9311727136839244, + "grad_norm": 0.7127102613449097, + "learning_rate": 4.418017053947547e-05, + "loss": 1.2945, + "step": 13705 + }, + { + "epoch": 0.9315124337545863, + "grad_norm": 0.7148470282554626, + "learning_rate": 4.417804728903384e-05, + "loss": 1.3695, + "step": 13710 + }, + { + "epoch": 0.931852153825248, + "grad_norm": 0.7143694162368774, + "learning_rate": 4.41759240385922e-05, + "loss": 1.4847, + "step": 13715 + }, + { + "epoch": 0.9321918738959097, + "grad_norm": 0.7163874506950378, + "learning_rate": 4.4173800788150565e-05, + "loss": 1.3838, + "step": 13720 + }, + { + "epoch": 0.9325315939665716, + "grad_norm": 0.5652332305908203, + "learning_rate": 4.417167753770893e-05, + "loss": 1.3025, + "step": 13725 + }, + { + "epoch": 0.9328713140372333, + "grad_norm": 0.7008733153343201, + "learning_rate": 4.416955428726729e-05, + "loss": 1.3852, + "step": 13730 + }, + { + "epoch": 0.9332110341078951, + "grad_norm": 0.7026612162590027, + "learning_rate": 4.416743103682566e-05, + "loss": 1.4156, + "step": 13735 + }, + { + "epoch": 0.9335507541785568, + "grad_norm": 0.7290593981742859, + "learning_rate": 4.416530778638402e-05, + "loss": 1.3394, + "step": 13740 + }, + { + "epoch": 0.9338904742492187, + "grad_norm": 0.656147301197052, + "learning_rate": 4.4163184535942385e-05, + "loss": 1.3438, + "step": 13745 + }, + { + "epoch": 0.9342301943198804, + "grad_norm": 0.6537467837333679, + "learning_rate": 4.416106128550075e-05, + "loss": 1.2681, + "step": 13750 + }, + { + "epoch": 0.9345699143905422, + "grad_norm": 0.6275011301040649, + "learning_rate": 4.415893803505911e-05, + "loss": 1.374, + "step": 13755 + }, + { + "epoch": 0.934909634461204, + "grad_norm": 0.7148743271827698, + "learning_rate": 4.415681478461748e-05, + "loss": 1.3613, + "step": 13760 + }, + { + "epoch": 0.9352493545318658, + "grad_norm": 0.665732741355896, + "learning_rate": 4.415469153417584e-05, + "loss": 1.3207, + "step": 13765 + }, + { + "epoch": 0.9355890746025275, + "grad_norm": 0.7353973984718323, + "learning_rate": 4.4152568283734205e-05, + "loss": 1.3649, + "step": 13770 + }, + { + "epoch": 0.9359287946731893, + "grad_norm": 0.7345528602600098, + "learning_rate": 4.415044503329257e-05, + "loss": 1.3052, + "step": 13775 + }, + { + "epoch": 0.9362685147438511, + "grad_norm": 0.7020397782325745, + "learning_rate": 4.414832178285093e-05, + "loss": 1.36, + "step": 13780 + }, + { + "epoch": 0.9366082348145128, + "grad_norm": 0.6661944389343262, + "learning_rate": 4.41461985324093e-05, + "loss": 1.3425, + "step": 13785 + }, + { + "epoch": 0.9369479548851746, + "grad_norm": 0.6353135704994202, + "learning_rate": 4.414407528196766e-05, + "loss": 1.3545, + "step": 13790 + }, + { + "epoch": 0.9372876749558364, + "grad_norm": 0.6998313665390015, + "learning_rate": 4.4141952031526025e-05, + "loss": 1.4537, + "step": 13795 + }, + { + "epoch": 0.9376273950264982, + "grad_norm": 0.6690735816955566, + "learning_rate": 4.413982878108439e-05, + "loss": 1.3284, + "step": 13800 + }, + { + "epoch": 0.9379671150971599, + "grad_norm": 0.7317988276481628, + "learning_rate": 4.413770553064275e-05, + "loss": 1.3734, + "step": 13805 + }, + { + "epoch": 0.9383068351678218, + "grad_norm": 0.7345866560935974, + "learning_rate": 4.413558228020112e-05, + "loss": 1.2194, + "step": 13810 + }, + { + "epoch": 0.9386465552384835, + "grad_norm": 0.6236724257469177, + "learning_rate": 4.413345902975948e-05, + "loss": 1.4252, + "step": 13815 + }, + { + "epoch": 0.9389862753091452, + "grad_norm": 0.8949745297431946, + "learning_rate": 4.4131335779317845e-05, + "loss": 1.3196, + "step": 13820 + }, + { + "epoch": 0.939325995379807, + "grad_norm": 0.6839173436164856, + "learning_rate": 4.412921252887621e-05, + "loss": 1.2955, + "step": 13825 + }, + { + "epoch": 0.9396657154504688, + "grad_norm": 0.7415878176689148, + "learning_rate": 4.412708927843457e-05, + "loss": 1.331, + "step": 13830 + }, + { + "epoch": 0.9400054355211306, + "grad_norm": 0.7342458963394165, + "learning_rate": 4.412496602799294e-05, + "loss": 1.4263, + "step": 13835 + }, + { + "epoch": 0.9403451555917923, + "grad_norm": 0.6380994319915771, + "learning_rate": 4.41228427775513e-05, + "loss": 1.3935, + "step": 13840 + }, + { + "epoch": 0.9406848756624542, + "grad_norm": 0.7457711100578308, + "learning_rate": 4.4120719527109665e-05, + "loss": 1.3192, + "step": 13845 + }, + { + "epoch": 0.9410245957331159, + "grad_norm": 0.7554078102111816, + "learning_rate": 4.411859627666803e-05, + "loss": 1.3497, + "step": 13850 + }, + { + "epoch": 0.9413643158037777, + "grad_norm": 0.6834602952003479, + "learning_rate": 4.411647302622639e-05, + "loss": 1.5496, + "step": 13855 + }, + { + "epoch": 0.9417040358744395, + "grad_norm": 0.6930656433105469, + "learning_rate": 4.411434977578476e-05, + "loss": 1.5183, + "step": 13860 + }, + { + "epoch": 0.9420437559451013, + "grad_norm": 0.6998500227928162, + "learning_rate": 4.411222652534312e-05, + "loss": 1.3476, + "step": 13865 + }, + { + "epoch": 0.942383476015763, + "grad_norm": 0.6948878765106201, + "learning_rate": 4.4110103274901485e-05, + "loss": 1.3836, + "step": 13870 + }, + { + "epoch": 0.9427231960864247, + "grad_norm": 0.8128736615180969, + "learning_rate": 4.410798002445985e-05, + "loss": 1.3481, + "step": 13875 + }, + { + "epoch": 0.9430629161570866, + "grad_norm": 0.6872991323471069, + "learning_rate": 4.410585677401821e-05, + "loss": 1.4402, + "step": 13880 + }, + { + "epoch": 0.9434026362277483, + "grad_norm": 0.6555947661399841, + "learning_rate": 4.410373352357658e-05, + "loss": 1.2432, + "step": 13885 + }, + { + "epoch": 0.9437423562984101, + "grad_norm": 0.6612246036529541, + "learning_rate": 4.410161027313494e-05, + "loss": 1.448, + "step": 13890 + }, + { + "epoch": 0.9440820763690719, + "grad_norm": 0.673306405544281, + "learning_rate": 4.4099487022693305e-05, + "loss": 1.3303, + "step": 13895 + }, + { + "epoch": 0.9444217964397337, + "grad_norm": 0.6873499155044556, + "learning_rate": 4.409736377225166e-05, + "loss": 1.2458, + "step": 13900 + }, + { + "epoch": 0.9447615165103954, + "grad_norm": 0.7184569239616394, + "learning_rate": 4.409524052181003e-05, + "loss": 1.3367, + "step": 13905 + }, + { + "epoch": 0.9451012365810572, + "grad_norm": 0.7577341198921204, + "learning_rate": 4.40931172713684e-05, + "loss": 1.4501, + "step": 13910 + }, + { + "epoch": 0.945440956651719, + "grad_norm": 0.7062777280807495, + "learning_rate": 4.4090994020926754e-05, + "loss": 1.4258, + "step": 13915 + }, + { + "epoch": 0.9457806767223808, + "grad_norm": 0.638261079788208, + "learning_rate": 4.4088870770485125e-05, + "loss": 1.311, + "step": 13920 + }, + { + "epoch": 0.9461203967930425, + "grad_norm": 0.5948472023010254, + "learning_rate": 4.408674752004349e-05, + "loss": 1.3772, + "step": 13925 + }, + { + "epoch": 0.9464601168637043, + "grad_norm": 0.643025279045105, + "learning_rate": 4.4084624269601846e-05, + "loss": 1.2932, + "step": 13930 + }, + { + "epoch": 0.9467998369343661, + "grad_norm": 0.6830620169639587, + "learning_rate": 4.408250101916022e-05, + "loss": 1.3575, + "step": 13935 + }, + { + "epoch": 0.9471395570050278, + "grad_norm": 0.6490104794502258, + "learning_rate": 4.408037776871858e-05, + "loss": 1.4628, + "step": 13940 + }, + { + "epoch": 0.9474792770756897, + "grad_norm": 0.6798790693283081, + "learning_rate": 4.407825451827694e-05, + "loss": 1.3162, + "step": 13945 + }, + { + "epoch": 0.9478189971463514, + "grad_norm": 0.6297010779380798, + "learning_rate": 4.407613126783531e-05, + "loss": 1.3067, + "step": 13950 + }, + { + "epoch": 0.9481587172170132, + "grad_norm": 0.6597910523414612, + "learning_rate": 4.407400801739367e-05, + "loss": 1.3153, + "step": 13955 + }, + { + "epoch": 0.9484984372876749, + "grad_norm": 0.6417344212532043, + "learning_rate": 4.407188476695203e-05, + "loss": 1.3862, + "step": 13960 + }, + { + "epoch": 0.9488381573583368, + "grad_norm": 0.7095355987548828, + "learning_rate": 4.40697615165104e-05, + "loss": 1.3388, + "step": 13965 + }, + { + "epoch": 0.9491778774289985, + "grad_norm": 0.6944183707237244, + "learning_rate": 4.406763826606876e-05, + "loss": 1.3299, + "step": 13970 + }, + { + "epoch": 0.9495175974996602, + "grad_norm": 0.6939918398857117, + "learning_rate": 4.406551501562712e-05, + "loss": 1.3929, + "step": 13975 + }, + { + "epoch": 0.9498573175703221, + "grad_norm": 0.7122329473495483, + "learning_rate": 4.406339176518549e-05, + "loss": 1.5842, + "step": 13980 + }, + { + "epoch": 0.9501970376409838, + "grad_norm": 0.5930275321006775, + "learning_rate": 4.406126851474385e-05, + "loss": 1.2767, + "step": 13985 + }, + { + "epoch": 0.9505367577116456, + "grad_norm": 0.6418907642364502, + "learning_rate": 4.4059145264302214e-05, + "loss": 1.241, + "step": 13990 + }, + { + "epoch": 0.9508764777823073, + "grad_norm": 0.6851585507392883, + "learning_rate": 4.4057022013860585e-05, + "loss": 1.2957, + "step": 13995 + }, + { + "epoch": 0.9512161978529692, + "grad_norm": 0.6519604325294495, + "learning_rate": 4.405489876341894e-05, + "loss": 1.3293, + "step": 14000 + }, + { + "epoch": 0.9515559179236309, + "grad_norm": 0.6667711734771729, + "learning_rate": 4.4052775512977306e-05, + "loss": 1.3517, + "step": 14005 + }, + { + "epoch": 0.9518956379942927, + "grad_norm": 0.7155119180679321, + "learning_rate": 4.405065226253568e-05, + "loss": 1.2733, + "step": 14010 + }, + { + "epoch": 0.9522353580649545, + "grad_norm": 0.6329841017723083, + "learning_rate": 4.4048529012094034e-05, + "loss": 1.3746, + "step": 14015 + }, + { + "epoch": 0.9525750781356163, + "grad_norm": 0.7616704106330872, + "learning_rate": 4.40464057616524e-05, + "loss": 1.3645, + "step": 14020 + }, + { + "epoch": 0.952914798206278, + "grad_norm": 0.6776059865951538, + "learning_rate": 4.404428251121077e-05, + "loss": 1.4592, + "step": 14025 + }, + { + "epoch": 0.9532545182769399, + "grad_norm": 0.613772451877594, + "learning_rate": 4.4042159260769126e-05, + "loss": 1.2752, + "step": 14030 + }, + { + "epoch": 0.9535942383476016, + "grad_norm": 0.5922441482543945, + "learning_rate": 4.404003601032749e-05, + "loss": 1.2123, + "step": 14035 + }, + { + "epoch": 0.9539339584182633, + "grad_norm": 0.6554942727088928, + "learning_rate": 4.403791275988586e-05, + "loss": 1.3778, + "step": 14040 + }, + { + "epoch": 0.9542736784889251, + "grad_norm": 0.7736828923225403, + "learning_rate": 4.403578950944422e-05, + "loss": 1.4264, + "step": 14045 + }, + { + "epoch": 0.9546133985595869, + "grad_norm": 0.6744146347045898, + "learning_rate": 4.403366625900258e-05, + "loss": 1.3186, + "step": 14050 + }, + { + "epoch": 0.9549531186302487, + "grad_norm": 0.7017351984977722, + "learning_rate": 4.4031543008560946e-05, + "loss": 1.3742, + "step": 14055 + }, + { + "epoch": 0.9552928387009104, + "grad_norm": 0.6876327395439148, + "learning_rate": 4.402941975811931e-05, + "loss": 1.3365, + "step": 14060 + }, + { + "epoch": 0.9556325587715723, + "grad_norm": 0.6205369234085083, + "learning_rate": 4.4027296507677674e-05, + "loss": 1.3912, + "step": 14065 + }, + { + "epoch": 0.955972278842234, + "grad_norm": 0.6234815120697021, + "learning_rate": 4.402517325723604e-05, + "loss": 1.3267, + "step": 14070 + }, + { + "epoch": 0.9563119989128958, + "grad_norm": 0.6214469075202942, + "learning_rate": 4.40230500067944e-05, + "loss": 1.3449, + "step": 14075 + }, + { + "epoch": 0.9566517189835575, + "grad_norm": 0.622333824634552, + "learning_rate": 4.4020926756352766e-05, + "loss": 1.3436, + "step": 14080 + }, + { + "epoch": 0.9569914390542194, + "grad_norm": 0.7575642466545105, + "learning_rate": 4.401880350591113e-05, + "loss": 1.3706, + "step": 14085 + }, + { + "epoch": 0.9573311591248811, + "grad_norm": 0.7207659482955933, + "learning_rate": 4.4016680255469494e-05, + "loss": 1.293, + "step": 14090 + }, + { + "epoch": 0.9576708791955428, + "grad_norm": 0.6630386114120483, + "learning_rate": 4.401455700502786e-05, + "loss": 1.3204, + "step": 14095 + }, + { + "epoch": 0.9580105992662047, + "grad_norm": 0.7721298933029175, + "learning_rate": 4.401243375458622e-05, + "loss": 1.3292, + "step": 14100 + }, + { + "epoch": 0.9583503193368664, + "grad_norm": 0.7689454555511475, + "learning_rate": 4.4010310504144586e-05, + "loss": 1.3164, + "step": 14105 + }, + { + "epoch": 0.9586900394075282, + "grad_norm": 0.7109953761100769, + "learning_rate": 4.400818725370295e-05, + "loss": 1.2628, + "step": 14110 + }, + { + "epoch": 0.95902975947819, + "grad_norm": 0.7149671316146851, + "learning_rate": 4.4006064003261314e-05, + "loss": 1.3386, + "step": 14115 + }, + { + "epoch": 0.9593694795488518, + "grad_norm": 0.7511143088340759, + "learning_rate": 4.400394075281968e-05, + "loss": 1.4523, + "step": 14120 + }, + { + "epoch": 0.9597091996195135, + "grad_norm": 0.70169997215271, + "learning_rate": 4.400181750237804e-05, + "loss": 1.3615, + "step": 14125 + }, + { + "epoch": 0.9600489196901753, + "grad_norm": 0.7001254558563232, + "learning_rate": 4.3999694251936406e-05, + "loss": 1.38, + "step": 14130 + }, + { + "epoch": 0.9603886397608371, + "grad_norm": 0.6793377995491028, + "learning_rate": 4.399757100149477e-05, + "loss": 1.4265, + "step": 14135 + }, + { + "epoch": 0.9607283598314988, + "grad_norm": 0.6415596008300781, + "learning_rate": 4.3995447751053134e-05, + "loss": 1.3525, + "step": 14140 + }, + { + "epoch": 0.9610680799021606, + "grad_norm": 0.6914222240447998, + "learning_rate": 4.39933245006115e-05, + "loss": 1.2615, + "step": 14145 + }, + { + "epoch": 0.9614077999728224, + "grad_norm": 0.6529057025909424, + "learning_rate": 4.399120125016986e-05, + "loss": 1.4715, + "step": 14150 + }, + { + "epoch": 0.9617475200434842, + "grad_norm": 0.6536417603492737, + "learning_rate": 4.3989077999728226e-05, + "loss": 1.2823, + "step": 14155 + }, + { + "epoch": 0.9620872401141459, + "grad_norm": 0.733706533908844, + "learning_rate": 4.398695474928659e-05, + "loss": 1.4046, + "step": 14160 + }, + { + "epoch": 0.9624269601848077, + "grad_norm": 0.757579505443573, + "learning_rate": 4.3984831498844954e-05, + "loss": 1.4021, + "step": 14165 + }, + { + "epoch": 0.9627666802554695, + "grad_norm": 0.7217411994934082, + "learning_rate": 4.398270824840332e-05, + "loss": 1.3788, + "step": 14170 + }, + { + "epoch": 0.9631064003261313, + "grad_norm": 0.6954963207244873, + "learning_rate": 4.398058499796168e-05, + "loss": 1.3508, + "step": 14175 + }, + { + "epoch": 0.963446120396793, + "grad_norm": 0.6648340225219727, + "learning_rate": 4.3978461747520046e-05, + "loss": 1.3537, + "step": 14180 + }, + { + "epoch": 0.9637858404674549, + "grad_norm": 0.7050533294677734, + "learning_rate": 4.397633849707841e-05, + "loss": 1.4402, + "step": 14185 + }, + { + "epoch": 0.9641255605381166, + "grad_norm": 0.7376846671104431, + "learning_rate": 4.3974215246636774e-05, + "loss": 1.3408, + "step": 14190 + }, + { + "epoch": 0.9644652806087783, + "grad_norm": 0.6715635657310486, + "learning_rate": 4.397209199619514e-05, + "loss": 1.3493, + "step": 14195 + }, + { + "epoch": 0.9648050006794402, + "grad_norm": 0.708532452583313, + "learning_rate": 4.39699687457535e-05, + "loss": 1.3779, + "step": 14200 + }, + { + "epoch": 0.9651447207501019, + "grad_norm": 0.6870357394218445, + "learning_rate": 4.3967845495311866e-05, + "loss": 1.3674, + "step": 14205 + }, + { + "epoch": 0.9654844408207637, + "grad_norm": 0.7117043137550354, + "learning_rate": 4.396572224487023e-05, + "loss": 1.3592, + "step": 14210 + }, + { + "epoch": 0.9658241608914254, + "grad_norm": 0.8228017091751099, + "learning_rate": 4.3963598994428594e-05, + "loss": 1.3517, + "step": 14215 + }, + { + "epoch": 0.9661638809620873, + "grad_norm": 0.6799255609512329, + "learning_rate": 4.396147574398696e-05, + "loss": 1.3451, + "step": 14220 + }, + { + "epoch": 0.966503601032749, + "grad_norm": 0.7322139739990234, + "learning_rate": 4.395935249354532e-05, + "loss": 1.3108, + "step": 14225 + }, + { + "epoch": 0.9668433211034108, + "grad_norm": 0.7069768309593201, + "learning_rate": 4.3957229243103686e-05, + "loss": 1.3206, + "step": 14230 + }, + { + "epoch": 0.9671830411740726, + "grad_norm": 0.6934696435928345, + "learning_rate": 4.395510599266205e-05, + "loss": 1.4205, + "step": 14235 + }, + { + "epoch": 0.9675227612447344, + "grad_norm": 0.6686298251152039, + "learning_rate": 4.3952982742220414e-05, + "loss": 1.3191, + "step": 14240 + }, + { + "epoch": 0.9678624813153961, + "grad_norm": 0.713447093963623, + "learning_rate": 4.395085949177878e-05, + "loss": 1.3402, + "step": 14245 + }, + { + "epoch": 0.9682022013860578, + "grad_norm": 0.660870373249054, + "learning_rate": 4.394873624133714e-05, + "loss": 1.3791, + "step": 14250 + }, + { + "epoch": 0.9685419214567197, + "grad_norm": 0.6390849947929382, + "learning_rate": 4.39466129908955e-05, + "loss": 1.3189, + "step": 14255 + }, + { + "epoch": 0.9688816415273814, + "grad_norm": 0.7503184080123901, + "learning_rate": 4.394448974045387e-05, + "loss": 1.3094, + "step": 14260 + }, + { + "epoch": 0.9692213615980432, + "grad_norm": 0.714127779006958, + "learning_rate": 4.3942366490012234e-05, + "loss": 1.378, + "step": 14265 + }, + { + "epoch": 0.969561081668705, + "grad_norm": 0.6611217856407166, + "learning_rate": 4.394024323957059e-05, + "loss": 1.1974, + "step": 14270 + }, + { + "epoch": 0.9699008017393668, + "grad_norm": 0.8122195601463318, + "learning_rate": 4.393811998912896e-05, + "loss": 1.3743, + "step": 14275 + }, + { + "epoch": 0.9702405218100285, + "grad_norm": 0.6840839982032776, + "learning_rate": 4.3935996738687326e-05, + "loss": 1.2965, + "step": 14280 + }, + { + "epoch": 0.9705802418806904, + "grad_norm": 0.5934416055679321, + "learning_rate": 4.3933873488245684e-05, + "loss": 1.2991, + "step": 14285 + }, + { + "epoch": 0.9709199619513521, + "grad_norm": 0.6726582646369934, + "learning_rate": 4.3931750237804054e-05, + "loss": 1.3566, + "step": 14290 + }, + { + "epoch": 0.9712596820220138, + "grad_norm": 0.6945735812187195, + "learning_rate": 4.392962698736242e-05, + "loss": 1.316, + "step": 14295 + }, + { + "epoch": 0.9715994020926756, + "grad_norm": 0.676308810710907, + "learning_rate": 4.3927503736920776e-05, + "loss": 1.3917, + "step": 14300 + }, + { + "epoch": 0.9719391221633374, + "grad_norm": 0.6670910716056824, + "learning_rate": 4.3925380486479146e-05, + "loss": 1.3191, + "step": 14305 + }, + { + "epoch": 0.9722788422339992, + "grad_norm": 0.575186550617218, + "learning_rate": 4.392325723603751e-05, + "loss": 1.3523, + "step": 14310 + }, + { + "epoch": 0.9726185623046609, + "grad_norm": 0.6964967250823975, + "learning_rate": 4.392113398559587e-05, + "loss": 1.2912, + "step": 14315 + }, + { + "epoch": 0.9729582823753228, + "grad_norm": 0.7684643268585205, + "learning_rate": 4.391901073515424e-05, + "loss": 1.4089, + "step": 14320 + }, + { + "epoch": 0.9732980024459845, + "grad_norm": 0.7359722852706909, + "learning_rate": 4.3916887484712596e-05, + "loss": 1.3103, + "step": 14325 + }, + { + "epoch": 0.9736377225166463, + "grad_norm": 0.7156456112861633, + "learning_rate": 4.391476423427096e-05, + "loss": 1.4543, + "step": 14330 + }, + { + "epoch": 0.973977442587308, + "grad_norm": 0.6346716284751892, + "learning_rate": 4.391264098382933e-05, + "loss": 1.505, + "step": 14335 + }, + { + "epoch": 0.9743171626579699, + "grad_norm": 0.6405880451202393, + "learning_rate": 4.391051773338769e-05, + "loss": 1.3251, + "step": 14340 + }, + { + "epoch": 0.9746568827286316, + "grad_norm": 0.5706714987754822, + "learning_rate": 4.390839448294605e-05, + "loss": 1.3277, + "step": 14345 + }, + { + "epoch": 0.9749966027992933, + "grad_norm": 0.7190221548080444, + "learning_rate": 4.390627123250442e-05, + "loss": 1.3276, + "step": 14350 + }, + { + "epoch": 0.9753363228699552, + "grad_norm": 0.6241934895515442, + "learning_rate": 4.390414798206278e-05, + "loss": 1.3346, + "step": 14355 + }, + { + "epoch": 0.9756760429406169, + "grad_norm": 0.7271825075149536, + "learning_rate": 4.3902024731621144e-05, + "loss": 1.3684, + "step": 14360 + }, + { + "epoch": 0.9760157630112787, + "grad_norm": 0.8547173738479614, + "learning_rate": 4.3899901481179514e-05, + "loss": 1.4018, + "step": 14365 + }, + { + "epoch": 0.9763554830819405, + "grad_norm": 0.7631275057792664, + "learning_rate": 4.389777823073787e-05, + "loss": 1.3373, + "step": 14370 + }, + { + "epoch": 0.9766952031526023, + "grad_norm": 0.7032694220542908, + "learning_rate": 4.3895654980296236e-05, + "loss": 1.3935, + "step": 14375 + }, + { + "epoch": 0.977034923223264, + "grad_norm": 0.6929472088813782, + "learning_rate": 4.3893531729854606e-05, + "loss": 1.3589, + "step": 14380 + }, + { + "epoch": 0.9773746432939258, + "grad_norm": 0.6832548975944519, + "learning_rate": 4.3891408479412964e-05, + "loss": 1.4363, + "step": 14385 + }, + { + "epoch": 0.9777143633645876, + "grad_norm": 0.6320139765739441, + "learning_rate": 4.388928522897133e-05, + "loss": 1.2939, + "step": 14390 + }, + { + "epoch": 0.9780540834352494, + "grad_norm": 0.6729910969734192, + "learning_rate": 4.388716197852969e-05, + "loss": 1.3276, + "step": 14395 + }, + { + "epoch": 0.9783938035059111, + "grad_norm": 0.7248609066009521, + "learning_rate": 4.3885038728088056e-05, + "loss": 1.3013, + "step": 14400 + }, + { + "epoch": 0.978733523576573, + "grad_norm": 0.7062149047851562, + "learning_rate": 4.388291547764642e-05, + "loss": 1.4143, + "step": 14405 + }, + { + "epoch": 0.9790732436472347, + "grad_norm": 0.7126065492630005, + "learning_rate": 4.3880792227204784e-05, + "loss": 1.3011, + "step": 14410 + }, + { + "epoch": 0.9794129637178964, + "grad_norm": 0.6709874272346497, + "learning_rate": 4.387866897676315e-05, + "loss": 1.3646, + "step": 14415 + }, + { + "epoch": 0.9797526837885582, + "grad_norm": 0.7767813205718994, + "learning_rate": 4.387654572632151e-05, + "loss": 1.2983, + "step": 14420 + }, + { + "epoch": 0.98009240385922, + "grad_norm": 0.7139157056808472, + "learning_rate": 4.3874422475879876e-05, + "loss": 1.4078, + "step": 14425 + }, + { + "epoch": 0.9804321239298818, + "grad_norm": 0.6205175518989563, + "learning_rate": 4.387229922543824e-05, + "loss": 1.4646, + "step": 14430 + }, + { + "epoch": 0.9807718440005435, + "grad_norm": 0.7783640623092651, + "learning_rate": 4.3870175974996604e-05, + "loss": 1.3237, + "step": 14435 + }, + { + "epoch": 0.9811115640712054, + "grad_norm": 0.6792538166046143, + "learning_rate": 4.386805272455497e-05, + "loss": 1.4417, + "step": 14440 + }, + { + "epoch": 0.9814512841418671, + "grad_norm": 0.6064441204071045, + "learning_rate": 4.386592947411333e-05, + "loss": 1.3311, + "step": 14445 + }, + { + "epoch": 0.9817910042125289, + "grad_norm": 0.7223596572875977, + "learning_rate": 4.3863806223671696e-05, + "loss": 1.3375, + "step": 14450 + }, + { + "epoch": 0.9821307242831907, + "grad_norm": 0.6360611319541931, + "learning_rate": 4.386168297323006e-05, + "loss": 1.2944, + "step": 14455 + }, + { + "epoch": 0.9824704443538524, + "grad_norm": 0.6797980666160583, + "learning_rate": 4.3859559722788424e-05, + "loss": 1.3256, + "step": 14460 + }, + { + "epoch": 0.9828101644245142, + "grad_norm": 0.7019459009170532, + "learning_rate": 4.385743647234679e-05, + "loss": 1.3465, + "step": 14465 + }, + { + "epoch": 0.9831498844951759, + "grad_norm": 0.6758162379264832, + "learning_rate": 4.385531322190515e-05, + "loss": 1.2201, + "step": 14470 + }, + { + "epoch": 0.9834896045658378, + "grad_norm": 0.722413957118988, + "learning_rate": 4.3853189971463516e-05, + "loss": 1.3084, + "step": 14475 + }, + { + "epoch": 0.9838293246364995, + "grad_norm": 0.6451125144958496, + "learning_rate": 4.385106672102188e-05, + "loss": 1.3079, + "step": 14480 + }, + { + "epoch": 0.9841690447071613, + "grad_norm": 0.6718553900718689, + "learning_rate": 4.3848943470580244e-05, + "loss": 1.4237, + "step": 14485 + }, + { + "epoch": 0.9845087647778231, + "grad_norm": 0.623634397983551, + "learning_rate": 4.384682022013861e-05, + "loss": 1.3196, + "step": 14490 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.7772073149681091, + "learning_rate": 4.384469696969697e-05, + "loss": 1.4259, + "step": 14495 + }, + { + "epoch": 0.9851882049191466, + "grad_norm": 0.7545345425605774, + "learning_rate": 4.3842573719255336e-05, + "loss": 1.3736, + "step": 14500 + }, + { + "epoch": 0.9855279249898083, + "grad_norm": 0.6576807498931885, + "learning_rate": 4.38404504688137e-05, + "loss": 1.3358, + "step": 14505 + }, + { + "epoch": 0.9858676450604702, + "grad_norm": 0.7687963247299194, + "learning_rate": 4.3838327218372064e-05, + "loss": 1.3047, + "step": 14510 + }, + { + "epoch": 0.9862073651311319, + "grad_norm": 0.6813951134681702, + "learning_rate": 4.383620396793043e-05, + "loss": 1.3818, + "step": 14515 + }, + { + "epoch": 0.9865470852017937, + "grad_norm": 0.6597345471382141, + "learning_rate": 4.383408071748879e-05, + "loss": 1.3778, + "step": 14520 + }, + { + "epoch": 0.9868868052724555, + "grad_norm": 0.6556777954101562, + "learning_rate": 4.3831957467047156e-05, + "loss": 1.3506, + "step": 14525 + }, + { + "epoch": 0.9872265253431173, + "grad_norm": 0.6998063921928406, + "learning_rate": 4.382983421660552e-05, + "loss": 1.3793, + "step": 14530 + }, + { + "epoch": 0.987566245413779, + "grad_norm": 0.7922748327255249, + "learning_rate": 4.3827710966163884e-05, + "loss": 1.3357, + "step": 14535 + }, + { + "epoch": 0.9879059654844409, + "grad_norm": 0.6029360294342041, + "learning_rate": 4.382558771572225e-05, + "loss": 1.3205, + "step": 14540 + }, + { + "epoch": 0.9882456855551026, + "grad_norm": 0.6841976642608643, + "learning_rate": 4.382346446528061e-05, + "loss": 1.3609, + "step": 14545 + }, + { + "epoch": 0.9885854056257644, + "grad_norm": 0.729258120059967, + "learning_rate": 4.3821341214838976e-05, + "loss": 1.4015, + "step": 14550 + }, + { + "epoch": 0.9889251256964261, + "grad_norm": 0.6758211851119995, + "learning_rate": 4.381921796439734e-05, + "loss": 1.3338, + "step": 14555 + }, + { + "epoch": 0.989264845767088, + "grad_norm": 0.6893669366836548, + "learning_rate": 4.3817094713955704e-05, + "loss": 1.3181, + "step": 14560 + }, + { + "epoch": 0.9896045658377497, + "grad_norm": 0.6467033624649048, + "learning_rate": 4.381497146351407e-05, + "loss": 1.3362, + "step": 14565 + }, + { + "epoch": 0.9899442859084114, + "grad_norm": 0.6745814681053162, + "learning_rate": 4.381284821307243e-05, + "loss": 1.3821, + "step": 14570 + }, + { + "epoch": 0.9902840059790733, + "grad_norm": 0.6461717486381531, + "learning_rate": 4.3810724962630796e-05, + "loss": 1.339, + "step": 14575 + }, + { + "epoch": 0.990623726049735, + "grad_norm": 0.7416273951530457, + "learning_rate": 4.380860171218916e-05, + "loss": 1.3768, + "step": 14580 + }, + { + "epoch": 0.9909634461203968, + "grad_norm": 0.6382895708084106, + "learning_rate": 4.3806478461747524e-05, + "loss": 1.4391, + "step": 14585 + }, + { + "epoch": 0.9913031661910585, + "grad_norm": 0.6742331981658936, + "learning_rate": 4.380435521130589e-05, + "loss": 1.3704, + "step": 14590 + }, + { + "epoch": 0.9916428862617204, + "grad_norm": 0.7783130407333374, + "learning_rate": 4.3802231960864245e-05, + "loss": 1.3181, + "step": 14595 + }, + { + "epoch": 0.9919826063323821, + "grad_norm": 0.766089916229248, + "learning_rate": 4.3800108710422616e-05, + "loss": 1.3837, + "step": 14600 + }, + { + "epoch": 0.9923223264030439, + "grad_norm": 0.6705353856086731, + "learning_rate": 4.379798545998098e-05, + "loss": 1.3804, + "step": 14605 + }, + { + "epoch": 0.9926620464737057, + "grad_norm": 0.7190149426460266, + "learning_rate": 4.379586220953934e-05, + "loss": 1.3942, + "step": 14610 + }, + { + "epoch": 0.9930017665443674, + "grad_norm": 0.6647737622261047, + "learning_rate": 4.379373895909771e-05, + "loss": 1.3856, + "step": 14615 + }, + { + "epoch": 0.9933414866150292, + "grad_norm": 0.6353852152824402, + "learning_rate": 4.379161570865607e-05, + "loss": 1.3071, + "step": 14620 + }, + { + "epoch": 0.993681206685691, + "grad_norm": 0.6535605192184448, + "learning_rate": 4.378949245821443e-05, + "loss": 1.3018, + "step": 14625 + }, + { + "epoch": 0.9940209267563528, + "grad_norm": 0.6865241527557373, + "learning_rate": 4.37873692077728e-05, + "loss": 1.4223, + "step": 14630 + }, + { + "epoch": 0.9943606468270145, + "grad_norm": 0.7610270977020264, + "learning_rate": 4.3785245957331164e-05, + "loss": 1.3702, + "step": 14635 + }, + { + "epoch": 0.9947003668976763, + "grad_norm": 0.696793794631958, + "learning_rate": 4.378312270688952e-05, + "loss": 1.3446, + "step": 14640 + }, + { + "epoch": 0.9950400869683381, + "grad_norm": 0.7753585577011108, + "learning_rate": 4.378099945644789e-05, + "loss": 1.2673, + "step": 14645 + }, + { + "epoch": 0.9953798070389999, + "grad_norm": 0.7199378609657288, + "learning_rate": 4.3778876206006256e-05, + "loss": 1.4074, + "step": 14650 + }, + { + "epoch": 0.9957195271096616, + "grad_norm": 0.7430431842803955, + "learning_rate": 4.377675295556461e-05, + "loss": 1.344, + "step": 14655 + }, + { + "epoch": 0.9960592471803235, + "grad_norm": 0.7380971312522888, + "learning_rate": 4.3774629705122984e-05, + "loss": 1.4623, + "step": 14660 + }, + { + "epoch": 0.9963989672509852, + "grad_norm": 0.7530075907707214, + "learning_rate": 4.377250645468135e-05, + "loss": 1.3504, + "step": 14665 + }, + { + "epoch": 0.9967386873216469, + "grad_norm": 0.7144792079925537, + "learning_rate": 4.3770383204239705e-05, + "loss": 1.3724, + "step": 14670 + }, + { + "epoch": 0.9970784073923087, + "grad_norm": 0.7846295833587646, + "learning_rate": 4.3768259953798076e-05, + "loss": 1.4125, + "step": 14675 + }, + { + "epoch": 0.9974181274629705, + "grad_norm": 0.6689630150794983, + "learning_rate": 4.376613670335643e-05, + "loss": 1.3159, + "step": 14680 + }, + { + "epoch": 0.9977578475336323, + "grad_norm": 0.6798150539398193, + "learning_rate": 4.37640134529148e-05, + "loss": 1.2581, + "step": 14685 + }, + { + "epoch": 0.998097567604294, + "grad_norm": 0.6772509217262268, + "learning_rate": 4.376189020247317e-05, + "loss": 1.4554, + "step": 14690 + }, + { + "epoch": 0.9984372876749559, + "grad_norm": 0.6583994030952454, + "learning_rate": 4.3759766952031525e-05, + "loss": 1.4909, + "step": 14695 + }, + { + "epoch": 0.9987770077456176, + "grad_norm": 0.648712158203125, + "learning_rate": 4.375764370158989e-05, + "loss": 1.3482, + "step": 14700 + }, + { + "epoch": 0.9991167278162794, + "grad_norm": 0.6531977653503418, + "learning_rate": 4.375552045114826e-05, + "loss": 1.3399, + "step": 14705 + }, + { + "epoch": 0.9994564478869412, + "grad_norm": 0.7878149747848511, + "learning_rate": 4.375339720070662e-05, + "loss": 1.3877, + "step": 14710 + }, + { + "epoch": 0.999796167957603, + "grad_norm": 0.6606357097625732, + "learning_rate": 4.375127395026498e-05, + "loss": 1.3811, + "step": 14715 + }, + { + "epoch": 1.0, + "eval_loss": 1.493106484413147, + "eval_runtime": 189.937, + "eval_samples_per_second": 54.329, + "eval_steps_per_second": 6.792, + "step": 14718 + } + ], + "logging_steps": 5, + "max_steps": 117744, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.985745646904279e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}