{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 17388, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005751092707614447, "grad_norm": 378.4447326660156, "learning_rate": 1.7241379310344825e-08, "loss": 2.138, "step": 10 }, { "epoch": 0.0011502185415228894, "grad_norm": 287.0553894042969, "learning_rate": 3.639846743295019e-08, "loss": 2.049, "step": 20 }, { "epoch": 0.001725327812284334, "grad_norm": 350.13385009765625, "learning_rate": 5.555555555555555e-08, "loss": 1.9662, "step": 30 }, { "epoch": 0.002300437083045779, "grad_norm": 333.6873779296875, "learning_rate": 7.471264367816092e-08, "loss": 1.822, "step": 40 }, { "epoch": 0.002875546353807223, "grad_norm": 326.5357360839844, "learning_rate": 9.386973180076628e-08, "loss": 1.7705, "step": 50 }, { "epoch": 0.003450655624568668, "grad_norm": 280.16082763671875, "learning_rate": 1.1302681992337163e-07, "loss": 1.5354, "step": 60 }, { "epoch": 0.004025764895330112, "grad_norm": 151.53012084960938, "learning_rate": 1.32183908045977e-07, "loss": 1.2082, "step": 70 }, { "epoch": 0.004600874166091558, "grad_norm": 92.53370666503906, "learning_rate": 1.5134099616858236e-07, "loss": 0.913, "step": 80 }, { "epoch": 0.005175983436853002, "grad_norm": 78.08407592773438, "learning_rate": 1.7049808429118773e-07, "loss": 0.9203, "step": 90 }, { "epoch": 0.005751092707614446, "grad_norm": 46.01520538330078, "learning_rate": 1.896551724137931e-07, "loss": 0.8304, "step": 100 }, { "epoch": 0.006326201978375892, "grad_norm": 22.086585998535156, "learning_rate": 2.0881226053639848e-07, "loss": 0.8002, "step": 110 }, { "epoch": 0.006901311249137336, "grad_norm": 21.16158676147461, "learning_rate": 2.2796934865900382e-07, "loss": 0.7998, "step": 120 }, { "epoch": 0.00747642051989878, "grad_norm": 21.24992561340332, "learning_rate": 2.471264367816092e-07, "loss": 0.7121, "step": 130 }, { "epoch": 0.008051529790660225, "grad_norm": 18.597881317138672, "learning_rate": 2.6628352490421455e-07, "loss": 0.8002, "step": 140 }, { "epoch": 0.00862663906142167, "grad_norm": 17.09422492980957, "learning_rate": 2.854406130268199e-07, "loss": 0.6848, "step": 150 }, { "epoch": 0.009201748332183115, "grad_norm": 20.343952178955078, "learning_rate": 3.045977011494253e-07, "loss": 0.6602, "step": 160 }, { "epoch": 0.009776857602944559, "grad_norm": 13.714895248413086, "learning_rate": 3.2375478927203064e-07, "loss": 0.6903, "step": 170 }, { "epoch": 0.010351966873706004, "grad_norm": 13.499238967895508, "learning_rate": 3.42911877394636e-07, "loss": 0.6772, "step": 180 }, { "epoch": 0.01092707614446745, "grad_norm": 13.379589080810547, "learning_rate": 3.620689655172414e-07, "loss": 0.6421, "step": 190 }, { "epoch": 0.011502185415228893, "grad_norm": 13.906352996826172, "learning_rate": 3.8122605363984674e-07, "loss": 0.6483, "step": 200 }, { "epoch": 0.012077294685990338, "grad_norm": 16.421953201293945, "learning_rate": 4.003831417624521e-07, "loss": 0.6462, "step": 210 }, { "epoch": 0.012652403956751783, "grad_norm": 12.944228172302246, "learning_rate": 4.195402298850574e-07, "loss": 0.6658, "step": 220 }, { "epoch": 0.013227513227513227, "grad_norm": 14.112801551818848, "learning_rate": 4.386973180076628e-07, "loss": 0.7028, "step": 230 }, { "epoch": 0.013802622498274672, "grad_norm": 10.653109550476074, "learning_rate": 4.5785440613026814e-07, "loss": 0.6253, "step": 240 }, { "epoch": 0.014377731769036117, "grad_norm": 12.70252799987793, "learning_rate": 4.770114942528736e-07, "loss": 0.63, "step": 250 }, { "epoch": 0.01495284103979756, "grad_norm": 10.284605979919434, "learning_rate": 4.96168582375479e-07, "loss": 0.5631, "step": 260 }, { "epoch": 0.015527950310559006, "grad_norm": 10.665888786315918, "learning_rate": 5.153256704980843e-07, "loss": 0.5978, "step": 270 }, { "epoch": 0.01610305958132045, "grad_norm": 6.88971471786499, "learning_rate": 5.344827586206896e-07, "loss": 0.631, "step": 280 }, { "epoch": 0.016678168852081895, "grad_norm": 11.316570281982422, "learning_rate": 5.53639846743295e-07, "loss": 0.6217, "step": 290 }, { "epoch": 0.01725327812284334, "grad_norm": 9.504354476928711, "learning_rate": 5.727969348659003e-07, "loss": 0.6462, "step": 300 }, { "epoch": 0.017828387393604785, "grad_norm": 9.698684692382812, "learning_rate": 5.919540229885057e-07, "loss": 0.5933, "step": 310 }, { "epoch": 0.01840349666436623, "grad_norm": 8.215251922607422, "learning_rate": 6.111111111111112e-07, "loss": 0.633, "step": 320 }, { "epoch": 0.018978605935127676, "grad_norm": 6.503454208374023, "learning_rate": 6.302681992337165e-07, "loss": 0.5981, "step": 330 }, { "epoch": 0.019553715205889118, "grad_norm": 6.727804660797119, "learning_rate": 6.494252873563219e-07, "loss": 0.5943, "step": 340 }, { "epoch": 0.020128824476650563, "grad_norm": 9.203789710998535, "learning_rate": 6.685823754789271e-07, "loss": 0.6116, "step": 350 }, { "epoch": 0.020703933747412008, "grad_norm": 8.241390228271484, "learning_rate": 6.877394636015325e-07, "loss": 0.5599, "step": 360 }, { "epoch": 0.021279043018173453, "grad_norm": 7.991062164306641, "learning_rate": 7.068965517241378e-07, "loss": 0.5736, "step": 370 }, { "epoch": 0.0218541522889349, "grad_norm": 6.530081272125244, "learning_rate": 7.260536398467432e-07, "loss": 0.5643, "step": 380 }, { "epoch": 0.022429261559696344, "grad_norm": 6.640897750854492, "learning_rate": 7.452107279693486e-07, "loss": 0.5753, "step": 390 }, { "epoch": 0.023004370830457786, "grad_norm": 8.924554824829102, "learning_rate": 7.64367816091954e-07, "loss": 0.5617, "step": 400 }, { "epoch": 0.02357948010121923, "grad_norm": 8.912935256958008, "learning_rate": 7.835249042145593e-07, "loss": 0.5626, "step": 410 }, { "epoch": 0.024154589371980676, "grad_norm": 6.595212459564209, "learning_rate": 8.026819923371647e-07, "loss": 0.57, "step": 420 }, { "epoch": 0.02472969864274212, "grad_norm": 8.049797058105469, "learning_rate": 8.218390804597701e-07, "loss": 0.547, "step": 430 }, { "epoch": 0.025304807913503567, "grad_norm": 7.892799377441406, "learning_rate": 8.409961685823754e-07, "loss": 0.5377, "step": 440 }, { "epoch": 0.025879917184265012, "grad_norm": 8.175146102905273, "learning_rate": 8.601532567049809e-07, "loss": 0.5828, "step": 450 }, { "epoch": 0.026455026455026454, "grad_norm": 8.500373840332031, "learning_rate": 8.793103448275862e-07, "loss": 0.6026, "step": 460 }, { "epoch": 0.0270301357257879, "grad_norm": 8.72022533416748, "learning_rate": 8.984674329501916e-07, "loss": 0.5934, "step": 470 }, { "epoch": 0.027605244996549344, "grad_norm": 6.508808135986328, "learning_rate": 9.176245210727969e-07, "loss": 0.5758, "step": 480 }, { "epoch": 0.02818035426731079, "grad_norm": 9.332911491394043, "learning_rate": 9.367816091954023e-07, "loss": 0.5403, "step": 490 }, { "epoch": 0.028755463538072235, "grad_norm": 7.683056831359863, "learning_rate": 9.559386973180076e-07, "loss": 0.5733, "step": 500 }, { "epoch": 0.02933057280883368, "grad_norm": 7.228301048278809, "learning_rate": 9.75095785440613e-07, "loss": 0.5423, "step": 510 }, { "epoch": 0.02990568207959512, "grad_norm": 5.8469367027282715, "learning_rate": 9.942528735632182e-07, "loss": 0.5453, "step": 520 }, { "epoch": 0.030480791350356567, "grad_norm": 6.454069137573242, "learning_rate": 9.999995749778336e-07, "loss": 0.5401, "step": 530 }, { "epoch": 0.031055900621118012, "grad_norm": 5.703695774078369, "learning_rate": 9.9999749323835e-07, "loss": 0.5446, "step": 540 }, { "epoch": 0.031631009891879454, "grad_norm": 8.486488342285156, "learning_rate": 9.999936767234675e-07, "loss": 0.5431, "step": 550 }, { "epoch": 0.0322061191626409, "grad_norm": 6.780856609344482, "learning_rate": 9.99988125446427e-07, "loss": 0.549, "step": 560 }, { "epoch": 0.032781228433402344, "grad_norm": 6.282104015350342, "learning_rate": 9.999808394264896e-07, "loss": 0.5403, "step": 570 }, { "epoch": 0.03335633770416379, "grad_norm": 6.615207195281982, "learning_rate": 9.999718186889343e-07, "loss": 0.5647, "step": 580 }, { "epoch": 0.033931446974925235, "grad_norm": 8.089784622192383, "learning_rate": 9.999610632650593e-07, "loss": 0.5516, "step": 590 }, { "epoch": 0.03450655624568668, "grad_norm": 6.8586039543151855, "learning_rate": 9.999485731921815e-07, "loss": 0.5188, "step": 600 }, { "epoch": 0.035081665516448125, "grad_norm": 8.950047492980957, "learning_rate": 9.999343485136357e-07, "loss": 0.571, "step": 610 }, { "epoch": 0.03565677478720957, "grad_norm": 9.919050216674805, "learning_rate": 9.999183892787755e-07, "loss": 0.548, "step": 620 }, { "epoch": 0.036231884057971016, "grad_norm": 9.946770668029785, "learning_rate": 9.999006955429727e-07, "loss": 0.5802, "step": 630 }, { "epoch": 0.03680699332873246, "grad_norm": 6.635040283203125, "learning_rate": 9.998812673676168e-07, "loss": 0.5253, "step": 640 }, { "epoch": 0.037382102599493906, "grad_norm": 8.00068473815918, "learning_rate": 9.998601048201154e-07, "loss": 0.4953, "step": 650 }, { "epoch": 0.03795721187025535, "grad_norm": 8.169313430786133, "learning_rate": 9.99837207973893e-07, "loss": 0.5459, "step": 660 }, { "epoch": 0.03853232114101679, "grad_norm": 6.902243137359619, "learning_rate": 9.998125769083918e-07, "loss": 0.5453, "step": 670 }, { "epoch": 0.039107430411778235, "grad_norm": 5.566956043243408, "learning_rate": 9.997862117090714e-07, "loss": 0.4941, "step": 680 }, { "epoch": 0.03968253968253968, "grad_norm": 7.291991710662842, "learning_rate": 9.997581124674074e-07, "loss": 0.5327, "step": 690 }, { "epoch": 0.040257648953301126, "grad_norm": 7.69106912612915, "learning_rate": 9.99728279280892e-07, "loss": 0.5313, "step": 700 }, { "epoch": 0.04083275822406257, "grad_norm": 6.865135192871094, "learning_rate": 9.996967122530334e-07, "loss": 0.512, "step": 710 }, { "epoch": 0.041407867494824016, "grad_norm": 7.884315490722656, "learning_rate": 9.996634114933557e-07, "loss": 0.5401, "step": 720 }, { "epoch": 0.04198297676558546, "grad_norm": 6.865697860717773, "learning_rate": 9.996283771173982e-07, "loss": 0.4948, "step": 730 }, { "epoch": 0.04255808603634691, "grad_norm": 7.946505069732666, "learning_rate": 9.99591609246715e-07, "loss": 0.5229, "step": 740 }, { "epoch": 0.04313319530710835, "grad_norm": 9.0175142288208, "learning_rate": 9.995531080088744e-07, "loss": 0.5186, "step": 750 }, { "epoch": 0.0437083045778698, "grad_norm": 8.412775993347168, "learning_rate": 9.995128735374597e-07, "loss": 0.5102, "step": 760 }, { "epoch": 0.04428341384863124, "grad_norm": 6.2595534324646, "learning_rate": 9.994709059720666e-07, "loss": 0.5325, "step": 770 }, { "epoch": 0.04485852311939269, "grad_norm": 7.325301170349121, "learning_rate": 9.994272054583048e-07, "loss": 0.4926, "step": 780 }, { "epoch": 0.045433632390154126, "grad_norm": 7.66895055770874, "learning_rate": 9.993817721477959e-07, "loss": 0.5195, "step": 790 }, { "epoch": 0.04600874166091557, "grad_norm": 7.364901065826416, "learning_rate": 9.993346061981742e-07, "loss": 0.5434, "step": 800 }, { "epoch": 0.046583850931677016, "grad_norm": 6.321810245513916, "learning_rate": 9.99285707773085e-07, "loss": 0.5378, "step": 810 }, { "epoch": 0.04715896020243846, "grad_norm": 5.99085807800293, "learning_rate": 9.992350770421848e-07, "loss": 0.5053, "step": 820 }, { "epoch": 0.04773406947319991, "grad_norm": 6.933141708374023, "learning_rate": 9.9918271418114e-07, "loss": 0.5297, "step": 830 }, { "epoch": 0.04830917874396135, "grad_norm": 5.8328657150268555, "learning_rate": 9.991286193716277e-07, "loss": 0.5135, "step": 840 }, { "epoch": 0.0488842880147228, "grad_norm": 6.307294845581055, "learning_rate": 9.99072792801333e-07, "loss": 0.503, "step": 850 }, { "epoch": 0.04945939728548424, "grad_norm": 7.259733200073242, "learning_rate": 9.990152346639505e-07, "loss": 0.518, "step": 860 }, { "epoch": 0.05003450655624569, "grad_norm": 5.9123992919921875, "learning_rate": 9.98955945159182e-07, "loss": 0.5477, "step": 870 }, { "epoch": 0.05060961582700713, "grad_norm": 6.4600419998168945, "learning_rate": 9.988949244927359e-07, "loss": 0.5204, "step": 880 }, { "epoch": 0.05118472509776858, "grad_norm": 7.12496280670166, "learning_rate": 9.98832172876328e-07, "loss": 0.4995, "step": 890 }, { "epoch": 0.051759834368530024, "grad_norm": 6.391274929046631, "learning_rate": 9.987676905276795e-07, "loss": 0.5273, "step": 900 }, { "epoch": 0.05233494363929147, "grad_norm": 7.585725784301758, "learning_rate": 9.987014776705158e-07, "loss": 0.4659, "step": 910 }, { "epoch": 0.05291005291005291, "grad_norm": 6.9592366218566895, "learning_rate": 9.986335345345673e-07, "loss": 0.5034, "step": 920 }, { "epoch": 0.05348516218081435, "grad_norm": 8.959076881408691, "learning_rate": 9.985638613555674e-07, "loss": 0.4871, "step": 930 }, { "epoch": 0.0540602714515758, "grad_norm": 6.32243537902832, "learning_rate": 9.984924583752517e-07, "loss": 0.5356, "step": 940 }, { "epoch": 0.05463538072233724, "grad_norm": 7.113640785217285, "learning_rate": 9.984193258413578e-07, "loss": 0.5029, "step": 950 }, { "epoch": 0.05521048999309869, "grad_norm": 7.942057132720947, "learning_rate": 9.983444640076241e-07, "loss": 0.5076, "step": 960 }, { "epoch": 0.05578559926386013, "grad_norm": 7.458593845367432, "learning_rate": 9.982678731337887e-07, "loss": 0.5077, "step": 970 }, { "epoch": 0.05636070853462158, "grad_norm": 5.514812469482422, "learning_rate": 9.981895534855889e-07, "loss": 0.5069, "step": 980 }, { "epoch": 0.056935817805383024, "grad_norm": 7.074843406677246, "learning_rate": 9.981095053347602e-07, "loss": 0.4831, "step": 990 }, { "epoch": 0.05751092707614447, "grad_norm": 6.780029296875, "learning_rate": 9.980277289590349e-07, "loss": 0.5054, "step": 1000 }, { "epoch": 0.058086036346905914, "grad_norm": 5.824387073516846, "learning_rate": 9.97944224642142e-07, "loss": 0.5167, "step": 1010 }, { "epoch": 0.05866114561766736, "grad_norm": 7.242621898651123, "learning_rate": 9.978589926738049e-07, "loss": 0.495, "step": 1020 }, { "epoch": 0.059236254888428805, "grad_norm": 6.214984893798828, "learning_rate": 9.977720333497423e-07, "loss": 0.5373, "step": 1030 }, { "epoch": 0.05981136415919024, "grad_norm": 9.091554641723633, "learning_rate": 9.97683346971665e-07, "loss": 0.5155, "step": 1040 }, { "epoch": 0.06038647342995169, "grad_norm": 5.6391520500183105, "learning_rate": 9.97592933847277e-07, "loss": 0.5132, "step": 1050 }, { "epoch": 0.060961582700713134, "grad_norm": 8.69354248046875, "learning_rate": 9.975007942902724e-07, "loss": 0.5052, "step": 1060 }, { "epoch": 0.06153669197147458, "grad_norm": 6.963156700134277, "learning_rate": 9.974069286203357e-07, "loss": 0.4832, "step": 1070 }, { "epoch": 0.062111801242236024, "grad_norm": 6.7154459953308105, "learning_rate": 9.973113371631406e-07, "loss": 0.4964, "step": 1080 }, { "epoch": 0.06268691051299748, "grad_norm": 8.184334754943848, "learning_rate": 9.972140202503476e-07, "loss": 0.5047, "step": 1090 }, { "epoch": 0.06326201978375891, "grad_norm": 7.158923149108887, "learning_rate": 9.971149782196046e-07, "loss": 0.4896, "step": 1100 }, { "epoch": 0.06383712905452035, "grad_norm": 8.574070930480957, "learning_rate": 9.97014211414545e-07, "loss": 0.4938, "step": 1110 }, { "epoch": 0.0644122383252818, "grad_norm": 6.727128028869629, "learning_rate": 9.969117201847856e-07, "loss": 0.4978, "step": 1120 }, { "epoch": 0.06498734759604324, "grad_norm": 6.78040075302124, "learning_rate": 9.968075048859273e-07, "loss": 0.5397, "step": 1130 }, { "epoch": 0.06556245686680469, "grad_norm": 7.219523906707764, "learning_rate": 9.967015658795514e-07, "loss": 0.5263, "step": 1140 }, { "epoch": 0.06613756613756613, "grad_norm": 6.930751323699951, "learning_rate": 9.965939035332214e-07, "loss": 0.4971, "step": 1150 }, { "epoch": 0.06671267540832758, "grad_norm": 8.369160652160645, "learning_rate": 9.964845182204785e-07, "loss": 0.5006, "step": 1160 }, { "epoch": 0.06728778467908902, "grad_norm": 9.492759704589844, "learning_rate": 9.963734103208425e-07, "loss": 0.5201, "step": 1170 }, { "epoch": 0.06786289394985047, "grad_norm": 7.325068473815918, "learning_rate": 9.962605802198104e-07, "loss": 0.4862, "step": 1180 }, { "epoch": 0.06843800322061191, "grad_norm": 6.4751739501953125, "learning_rate": 9.961460283088534e-07, "loss": 0.4809, "step": 1190 }, { "epoch": 0.06901311249137336, "grad_norm": 7.028703212738037, "learning_rate": 9.960297549854174e-07, "loss": 0.5035, "step": 1200 }, { "epoch": 0.0695882217621348, "grad_norm": 7.2553205490112305, "learning_rate": 9.959117606529205e-07, "loss": 0.5073, "step": 1210 }, { "epoch": 0.07016333103289625, "grad_norm": 7.411725044250488, "learning_rate": 9.95792045720752e-07, "loss": 0.5099, "step": 1220 }, { "epoch": 0.0707384403036577, "grad_norm": 6.864030361175537, "learning_rate": 9.956706106042714e-07, "loss": 0.493, "step": 1230 }, { "epoch": 0.07131354957441914, "grad_norm": 7.2550272941589355, "learning_rate": 9.955474557248058e-07, "loss": 0.4863, "step": 1240 }, { "epoch": 0.07188865884518059, "grad_norm": 8.028414726257324, "learning_rate": 9.954225815096494e-07, "loss": 0.4895, "step": 1250 }, { "epoch": 0.07246376811594203, "grad_norm": 7.895452499389648, "learning_rate": 9.95295988392062e-07, "loss": 0.516, "step": 1260 }, { "epoch": 0.07303887738670348, "grad_norm": 7.028430461883545, "learning_rate": 9.95167676811267e-07, "loss": 0.5489, "step": 1270 }, { "epoch": 0.07361398665746492, "grad_norm": 6.802486896514893, "learning_rate": 9.950376472124504e-07, "loss": 0.5168, "step": 1280 }, { "epoch": 0.07418909592822637, "grad_norm": 7.639976501464844, "learning_rate": 9.949059000467583e-07, "loss": 0.5014, "step": 1290 }, { "epoch": 0.07476420519898781, "grad_norm": 8.238563537597656, "learning_rate": 9.947724357712968e-07, "loss": 0.5022, "step": 1300 }, { "epoch": 0.07533931446974926, "grad_norm": 7.058792591094971, "learning_rate": 9.946372548491292e-07, "loss": 0.5205, "step": 1310 }, { "epoch": 0.0759144237405107, "grad_norm": 6.692798614501953, "learning_rate": 9.94500357749275e-07, "loss": 0.5006, "step": 1320 }, { "epoch": 0.07648953301127215, "grad_norm": 6.259213447570801, "learning_rate": 9.943617449467079e-07, "loss": 0.4486, "step": 1330 }, { "epoch": 0.07706464228203358, "grad_norm": 7.863189697265625, "learning_rate": 9.942214169223544e-07, "loss": 0.5161, "step": 1340 }, { "epoch": 0.07763975155279502, "grad_norm": 6.202320575714111, "learning_rate": 9.940793741630925e-07, "loss": 0.4903, "step": 1350 }, { "epoch": 0.07821486082355647, "grad_norm": 6.850808143615723, "learning_rate": 9.939356171617488e-07, "loss": 0.5045, "step": 1360 }, { "epoch": 0.07878997009431792, "grad_norm": 7.875378608703613, "learning_rate": 9.937901464170987e-07, "loss": 0.485, "step": 1370 }, { "epoch": 0.07936507936507936, "grad_norm": 9.453932762145996, "learning_rate": 9.93642962433862e-07, "loss": 0.4926, "step": 1380 }, { "epoch": 0.0799401886358408, "grad_norm": 7.6340556144714355, "learning_rate": 9.93494065722704e-07, "loss": 0.4799, "step": 1390 }, { "epoch": 0.08051529790660225, "grad_norm": 10.074248313903809, "learning_rate": 9.933434568002322e-07, "loss": 0.5157, "step": 1400 }, { "epoch": 0.0810904071773637, "grad_norm": 9.18978500366211, "learning_rate": 9.93191136188994e-07, "loss": 0.4891, "step": 1410 }, { "epoch": 0.08166551644812514, "grad_norm": 9.167423248291016, "learning_rate": 9.930371044174765e-07, "loss": 0.4855, "step": 1420 }, { "epoch": 0.08224062571888659, "grad_norm": 5.81760311126709, "learning_rate": 9.928813620201037e-07, "loss": 0.4943, "step": 1430 }, { "epoch": 0.08281573498964803, "grad_norm": 6.762801647186279, "learning_rate": 9.927239095372339e-07, "loss": 0.4831, "step": 1440 }, { "epoch": 0.08339084426040948, "grad_norm": 8.259499549865723, "learning_rate": 9.925647475151595e-07, "loss": 0.5122, "step": 1450 }, { "epoch": 0.08396595353117092, "grad_norm": 7.878564357757568, "learning_rate": 9.92403876506104e-07, "loss": 0.4889, "step": 1460 }, { "epoch": 0.08454106280193237, "grad_norm": 8.114038467407227, "learning_rate": 9.922412970682204e-07, "loss": 0.4972, "step": 1470 }, { "epoch": 0.08511617207269381, "grad_norm": 7.688379287719727, "learning_rate": 9.920770097655887e-07, "loss": 0.4936, "step": 1480 }, { "epoch": 0.08569128134345526, "grad_norm": 8.700139045715332, "learning_rate": 9.919110151682157e-07, "loss": 0.485, "step": 1490 }, { "epoch": 0.0862663906142167, "grad_norm": 7.685487747192383, "learning_rate": 9.917433138520303e-07, "loss": 0.4907, "step": 1500 }, { "epoch": 0.08684149988497815, "grad_norm": 6.047664165496826, "learning_rate": 9.915739063988839e-07, "loss": 0.4933, "step": 1510 }, { "epoch": 0.0874166091557396, "grad_norm": 7.135039806365967, "learning_rate": 9.914027933965472e-07, "loss": 0.5228, "step": 1520 }, { "epoch": 0.08799171842650104, "grad_norm": 8.45931339263916, "learning_rate": 9.912299754387084e-07, "loss": 0.503, "step": 1530 }, { "epoch": 0.08856682769726248, "grad_norm": 6.646641731262207, "learning_rate": 9.910554531249712e-07, "loss": 0.5136, "step": 1540 }, { "epoch": 0.08914193696802393, "grad_norm": 7.235086441040039, "learning_rate": 9.908792270608528e-07, "loss": 0.4894, "step": 1550 }, { "epoch": 0.08971704623878538, "grad_norm": 6.455569267272949, "learning_rate": 9.907012978577814e-07, "loss": 0.5102, "step": 1560 }, { "epoch": 0.09029215550954682, "grad_norm": 7.360365390777588, "learning_rate": 9.905216661330944e-07, "loss": 0.4887, "step": 1570 }, { "epoch": 0.09086726478030825, "grad_norm": 6.838494777679443, "learning_rate": 9.903403325100365e-07, "loss": 0.4909, "step": 1580 }, { "epoch": 0.0914423740510697, "grad_norm": 7.338395118713379, "learning_rate": 9.90157297617757e-07, "loss": 0.4914, "step": 1590 }, { "epoch": 0.09201748332183114, "grad_norm": 7.555124759674072, "learning_rate": 9.899725620913079e-07, "loss": 0.475, "step": 1600 }, { "epoch": 0.09259259259259259, "grad_norm": 7.6270294189453125, "learning_rate": 9.897861265716419e-07, "loss": 0.4589, "step": 1610 }, { "epoch": 0.09316770186335403, "grad_norm": 7.4105329513549805, "learning_rate": 9.895979917056096e-07, "loss": 0.4764, "step": 1620 }, { "epoch": 0.09374281113411548, "grad_norm": 6.3092451095581055, "learning_rate": 9.894081581459577e-07, "loss": 0.5004, "step": 1630 }, { "epoch": 0.09431792040487692, "grad_norm": 6.808461666107178, "learning_rate": 9.89216626551327e-07, "loss": 0.458, "step": 1640 }, { "epoch": 0.09489302967563837, "grad_norm": 7.136944770812988, "learning_rate": 9.890233975862487e-07, "loss": 0.5183, "step": 1650 }, { "epoch": 0.09546813894639981, "grad_norm": 7.4427409172058105, "learning_rate": 9.888284719211447e-07, "loss": 0.4651, "step": 1660 }, { "epoch": 0.09604324821716126, "grad_norm": 9.500293731689453, "learning_rate": 9.886318502323224e-07, "loss": 0.4879, "step": 1670 }, { "epoch": 0.0966183574879227, "grad_norm": 7.445298194885254, "learning_rate": 9.884335332019745e-07, "loss": 0.4869, "step": 1680 }, { "epoch": 0.09719346675868415, "grad_norm": 7.293051719665527, "learning_rate": 9.88233521518175e-07, "loss": 0.4631, "step": 1690 }, { "epoch": 0.0977685760294456, "grad_norm": 9.078981399536133, "learning_rate": 9.880318158748783e-07, "loss": 0.4992, "step": 1700 }, { "epoch": 0.09834368530020704, "grad_norm": 6.5568952560424805, "learning_rate": 9.87828416971916e-07, "loss": 0.4792, "step": 1710 }, { "epoch": 0.09891879457096849, "grad_norm": 6.179459571838379, "learning_rate": 9.876233255149944e-07, "loss": 0.4638, "step": 1720 }, { "epoch": 0.09949390384172993, "grad_norm": 9.902800559997559, "learning_rate": 9.874165422156922e-07, "loss": 0.4913, "step": 1730 }, { "epoch": 0.10006901311249138, "grad_norm": 6.764627456665039, "learning_rate": 9.872080677914583e-07, "loss": 0.4628, "step": 1740 }, { "epoch": 0.10064412238325282, "grad_norm": 8.149230003356934, "learning_rate": 9.869979029656087e-07, "loss": 0.4543, "step": 1750 }, { "epoch": 0.10121923165401427, "grad_norm": 7.914442539215088, "learning_rate": 9.86786048467325e-07, "loss": 0.4738, "step": 1760 }, { "epoch": 0.10179434092477571, "grad_norm": 9.655527114868164, "learning_rate": 9.865725050316506e-07, "loss": 0.4671, "step": 1770 }, { "epoch": 0.10236945019553716, "grad_norm": 7.488994121551514, "learning_rate": 9.863572733994888e-07, "loss": 0.4723, "step": 1780 }, { "epoch": 0.1029445594662986, "grad_norm": 6.832953929901123, "learning_rate": 9.861403543176007e-07, "loss": 0.4691, "step": 1790 }, { "epoch": 0.10351966873706005, "grad_norm": 9.229466438293457, "learning_rate": 9.859217485386019e-07, "loss": 0.4737, "step": 1800 }, { "epoch": 0.10409477800782149, "grad_norm": 7.8232502937316895, "learning_rate": 9.857014568209597e-07, "loss": 0.4869, "step": 1810 }, { "epoch": 0.10466988727858294, "grad_norm": 7.68008279800415, "learning_rate": 9.854794799289916e-07, "loss": 0.4697, "step": 1820 }, { "epoch": 0.10524499654934437, "grad_norm": 7.456464767456055, "learning_rate": 9.852558186328612e-07, "loss": 0.4833, "step": 1830 }, { "epoch": 0.10582010582010581, "grad_norm": 8.517068862915039, "learning_rate": 9.850304737085771e-07, "loss": 0.4667, "step": 1840 }, { "epoch": 0.10639521509086726, "grad_norm": 7.837017059326172, "learning_rate": 9.848034459379886e-07, "loss": 0.5062, "step": 1850 }, { "epoch": 0.1069703243616287, "grad_norm": 7.960002422332764, "learning_rate": 9.84574736108784e-07, "loss": 0.5057, "step": 1860 }, { "epoch": 0.10754543363239015, "grad_norm": 8.457831382751465, "learning_rate": 9.843443450144877e-07, "loss": 0.4623, "step": 1870 }, { "epoch": 0.1081205429031516, "grad_norm": 8.666006088256836, "learning_rate": 9.841122734544573e-07, "loss": 0.4627, "step": 1880 }, { "epoch": 0.10869565217391304, "grad_norm": 8.465331077575684, "learning_rate": 9.83878522233881e-07, "loss": 0.4793, "step": 1890 }, { "epoch": 0.10927076144467449, "grad_norm": 6.455883979797363, "learning_rate": 9.836430921637745e-07, "loss": 0.4635, "step": 1900 }, { "epoch": 0.10984587071543593, "grad_norm": 8.273035049438477, "learning_rate": 9.834059840609786e-07, "loss": 0.5041, "step": 1910 }, { "epoch": 0.11042097998619738, "grad_norm": 7.623924732208252, "learning_rate": 9.831671987481556e-07, "loss": 0.452, "step": 1920 }, { "epoch": 0.11099608925695882, "grad_norm": 6.700584411621094, "learning_rate": 9.829267370537881e-07, "loss": 0.4823, "step": 1930 }, { "epoch": 0.11157119852772027, "grad_norm": 6.2908034324646, "learning_rate": 9.82684599812174e-07, "loss": 0.471, "step": 1940 }, { "epoch": 0.11214630779848171, "grad_norm": 9.044986724853516, "learning_rate": 9.824407878634249e-07, "loss": 0.5016, "step": 1950 }, { "epoch": 0.11272141706924316, "grad_norm": 9.634522438049316, "learning_rate": 9.82195302053463e-07, "loss": 0.4488, "step": 1960 }, { "epoch": 0.1132965263400046, "grad_norm": 7.235198974609375, "learning_rate": 9.819481432340182e-07, "loss": 0.4907, "step": 1970 }, { "epoch": 0.11387163561076605, "grad_norm": 6.926739692687988, "learning_rate": 9.816993122626248e-07, "loss": 0.471, "step": 1980 }, { "epoch": 0.11444674488152749, "grad_norm": 8.106722831726074, "learning_rate": 9.814488100026189e-07, "loss": 0.4851, "step": 1990 }, { "epoch": 0.11502185415228894, "grad_norm": 7.746631145477295, "learning_rate": 9.811966373231348e-07, "loss": 0.4646, "step": 2000 }, { "epoch": 0.11559696342305038, "grad_norm": 7.829645156860352, "learning_rate": 9.809427950991034e-07, "loss": 0.4713, "step": 2010 }, { "epoch": 0.11617207269381183, "grad_norm": 6.595952033996582, "learning_rate": 9.806872842112473e-07, "loss": 0.4571, "step": 2020 }, { "epoch": 0.11674718196457327, "grad_norm": 7.912197589874268, "learning_rate": 9.804301055460788e-07, "loss": 0.4643, "step": 2030 }, { "epoch": 0.11732229123533472, "grad_norm": 7.4515767097473145, "learning_rate": 9.801712599958973e-07, "loss": 0.4664, "step": 2040 }, { "epoch": 0.11789740050609616, "grad_norm": 8.590794563293457, "learning_rate": 9.799107484587845e-07, "loss": 0.4569, "step": 2050 }, { "epoch": 0.11847250977685761, "grad_norm": 8.468313217163086, "learning_rate": 9.796485718386033e-07, "loss": 0.4891, "step": 2060 }, { "epoch": 0.11904761904761904, "grad_norm": 6.699227333068848, "learning_rate": 9.793847310449932e-07, "loss": 0.5027, "step": 2070 }, { "epoch": 0.11962272831838049, "grad_norm": 7.846785068511963, "learning_rate": 9.79119226993368e-07, "loss": 0.5021, "step": 2080 }, { "epoch": 0.12019783758914193, "grad_norm": 6.45270299911499, "learning_rate": 9.78852060604912e-07, "loss": 0.483, "step": 2090 }, { "epoch": 0.12077294685990338, "grad_norm": 8.339261054992676, "learning_rate": 9.785832328065772e-07, "loss": 0.4794, "step": 2100 }, { "epoch": 0.12134805613066482, "grad_norm": 7.963165283203125, "learning_rate": 9.7831274453108e-07, "loss": 0.4756, "step": 2110 }, { "epoch": 0.12192316540142627, "grad_norm": 5.589999675750732, "learning_rate": 9.780405967168979e-07, "loss": 0.4552, "step": 2120 }, { "epoch": 0.12249827467218771, "grad_norm": 7.354642391204834, "learning_rate": 9.777667903082662e-07, "loss": 0.4681, "step": 2130 }, { "epoch": 0.12307338394294916, "grad_norm": 8.318604469299316, "learning_rate": 9.77491326255175e-07, "loss": 0.4672, "step": 2140 }, { "epoch": 0.1236484932137106, "grad_norm": 7.038641452789307, "learning_rate": 9.772142055133658e-07, "loss": 0.4582, "step": 2150 }, { "epoch": 0.12422360248447205, "grad_norm": 7.882426738739014, "learning_rate": 9.76935429044328e-07, "loss": 0.452, "step": 2160 }, { "epoch": 0.1247987117552335, "grad_norm": 6.205801010131836, "learning_rate": 9.766549978152949e-07, "loss": 0.4371, "step": 2170 }, { "epoch": 0.12537382102599495, "grad_norm": 10.060054779052734, "learning_rate": 9.763729127992424e-07, "loss": 0.4769, "step": 2180 }, { "epoch": 0.12594893029675638, "grad_norm": 8.294169425964355, "learning_rate": 9.760891749748834e-07, "loss": 0.4512, "step": 2190 }, { "epoch": 0.12652403956751782, "grad_norm": 7.491000175476074, "learning_rate": 9.758037853266656e-07, "loss": 0.4831, "step": 2200 }, { "epoch": 0.12709914883827927, "grad_norm": 6.7114338874816895, "learning_rate": 9.755167448447682e-07, "loss": 0.4597, "step": 2210 }, { "epoch": 0.1276742581090407, "grad_norm": 6.357844352722168, "learning_rate": 9.752280545250974e-07, "loss": 0.4587, "step": 2220 }, { "epoch": 0.12824936737980216, "grad_norm": 7.564320087432861, "learning_rate": 9.74937715369284e-07, "loss": 0.439, "step": 2230 }, { "epoch": 0.1288244766505636, "grad_norm": 7.965040683746338, "learning_rate": 9.746457283846796e-07, "loss": 0.4634, "step": 2240 }, { "epoch": 0.12939958592132506, "grad_norm": 7.674879550933838, "learning_rate": 9.743520945843528e-07, "loss": 0.4963, "step": 2250 }, { "epoch": 0.1299746951920865, "grad_norm": 7.220790863037109, "learning_rate": 9.740568149870864e-07, "loss": 0.4528, "step": 2260 }, { "epoch": 0.13054980446284795, "grad_norm": 7.454987049102783, "learning_rate": 9.737598906173728e-07, "loss": 0.4736, "step": 2270 }, { "epoch": 0.13112491373360938, "grad_norm": 6.603438377380371, "learning_rate": 9.734613225054114e-07, "loss": 0.4301, "step": 2280 }, { "epoch": 0.13170002300437084, "grad_norm": 8.584218978881836, "learning_rate": 9.731611116871046e-07, "loss": 0.5009, "step": 2290 }, { "epoch": 0.13227513227513227, "grad_norm": 7.783442974090576, "learning_rate": 9.728592592040546e-07, "loss": 0.4706, "step": 2300 }, { "epoch": 0.13285024154589373, "grad_norm": 7.947256088256836, "learning_rate": 9.725557661035587e-07, "loss": 0.4379, "step": 2310 }, { "epoch": 0.13342535081665516, "grad_norm": 7.300652980804443, "learning_rate": 9.722506334386072e-07, "loss": 0.4519, "step": 2320 }, { "epoch": 0.13400046008741662, "grad_norm": 6.790456295013428, "learning_rate": 9.719438622678787e-07, "loss": 0.4545, "step": 2330 }, { "epoch": 0.13457556935817805, "grad_norm": 7.528162002563477, "learning_rate": 9.716354536557368e-07, "loss": 0.4614, "step": 2340 }, { "epoch": 0.1351506786289395, "grad_norm": 7.22351598739624, "learning_rate": 9.713254086722259e-07, "loss": 0.4653, "step": 2350 }, { "epoch": 0.13572578789970094, "grad_norm": 6.963607311248779, "learning_rate": 9.710137283930681e-07, "loss": 0.473, "step": 2360 }, { "epoch": 0.1363008971704624, "grad_norm": 7.811157703399658, "learning_rate": 9.707004138996602e-07, "loss": 0.4918, "step": 2370 }, { "epoch": 0.13687600644122383, "grad_norm": 7.2192583084106445, "learning_rate": 9.703854662790671e-07, "loss": 0.4742, "step": 2380 }, { "epoch": 0.1374511157119853, "grad_norm": 6.321882247924805, "learning_rate": 9.70068886624022e-07, "loss": 0.4906, "step": 2390 }, { "epoch": 0.13802622498274672, "grad_norm": 8.579646110534668, "learning_rate": 9.697506760329191e-07, "loss": 0.4851, "step": 2400 }, { "epoch": 0.13860133425350818, "grad_norm": 6.281529426574707, "learning_rate": 9.694308356098118e-07, "loss": 0.4613, "step": 2410 }, { "epoch": 0.1391764435242696, "grad_norm": 8.344769477844238, "learning_rate": 9.69109366464408e-07, "loss": 0.4941, "step": 2420 }, { "epoch": 0.13975155279503104, "grad_norm": 6.060925483703613, "learning_rate": 9.687862697120678e-07, "loss": 0.4625, "step": 2430 }, { "epoch": 0.1403266620657925, "grad_norm": 7.197717666625977, "learning_rate": 9.684615464737961e-07, "loss": 0.4555, "step": 2440 }, { "epoch": 0.14090177133655393, "grad_norm": 8.615433692932129, "learning_rate": 9.681351978762433e-07, "loss": 0.4731, "step": 2450 }, { "epoch": 0.1414768806073154, "grad_norm": 7.4354987144470215, "learning_rate": 9.678072250516977e-07, "loss": 0.4467, "step": 2460 }, { "epoch": 0.14205198987807682, "grad_norm": 6.946926593780518, "learning_rate": 9.674776291380834e-07, "loss": 0.4533, "step": 2470 }, { "epoch": 0.14262709914883828, "grad_norm": 6.41255521774292, "learning_rate": 9.67146411278956e-07, "loss": 0.4391, "step": 2480 }, { "epoch": 0.1432022084195997, "grad_norm": 8.05007266998291, "learning_rate": 9.668135726234985e-07, "loss": 0.4544, "step": 2490 }, { "epoch": 0.14377731769036117, "grad_norm": 8.178547859191895, "learning_rate": 9.66479114326517e-07, "loss": 0.4744, "step": 2500 }, { "epoch": 0.1443524269611226, "grad_norm": 8.13766098022461, "learning_rate": 9.661430375484376e-07, "loss": 0.4795, "step": 2510 }, { "epoch": 0.14492753623188406, "grad_norm": 10.130875587463379, "learning_rate": 9.658053434553017e-07, "loss": 0.4655, "step": 2520 }, { "epoch": 0.1455026455026455, "grad_norm": 6.591521739959717, "learning_rate": 9.65466033218762e-07, "loss": 0.4747, "step": 2530 }, { "epoch": 0.14607775477340695, "grad_norm": 6.906799793243408, "learning_rate": 9.65125108016078e-07, "loss": 0.4284, "step": 2540 }, { "epoch": 0.14665286404416839, "grad_norm": 8.933794021606445, "learning_rate": 9.647825690301138e-07, "loss": 0.4668, "step": 2550 }, { "epoch": 0.14722797331492984, "grad_norm": 8.35267162322998, "learning_rate": 9.64438417449331e-07, "loss": 0.4517, "step": 2560 }, { "epoch": 0.14780308258569128, "grad_norm": 7.716118335723877, "learning_rate": 9.640926544677871e-07, "loss": 0.4638, "step": 2570 }, { "epoch": 0.14837819185645273, "grad_norm": 5.974536418914795, "learning_rate": 9.637452812851308e-07, "loss": 0.4375, "step": 2580 }, { "epoch": 0.14895330112721417, "grad_norm": 8.364542961120605, "learning_rate": 9.633962991065965e-07, "loss": 0.4531, "step": 2590 }, { "epoch": 0.14952841039797563, "grad_norm": 7.649923801422119, "learning_rate": 9.63045709143002e-07, "loss": 0.4778, "step": 2600 }, { "epoch": 0.15010351966873706, "grad_norm": 9.233725547790527, "learning_rate": 9.626935126107432e-07, "loss": 0.4735, "step": 2610 }, { "epoch": 0.15067862893949852, "grad_norm": 6.524007320404053, "learning_rate": 9.623397107317896e-07, "loss": 0.4788, "step": 2620 }, { "epoch": 0.15125373821025995, "grad_norm": 7.5222697257995605, "learning_rate": 9.619843047336812e-07, "loss": 0.4514, "step": 2630 }, { "epoch": 0.1518288474810214, "grad_norm": 7.617444038391113, "learning_rate": 9.616272958495233e-07, "loss": 0.4497, "step": 2640 }, { "epoch": 0.15240395675178284, "grad_norm": 7.2856831550598145, "learning_rate": 9.612686853179823e-07, "loss": 0.4468, "step": 2650 }, { "epoch": 0.1529790660225443, "grad_norm": 7.46363639831543, "learning_rate": 9.609084743832824e-07, "loss": 0.4492, "step": 2660 }, { "epoch": 0.15355417529330573, "grad_norm": 9.211614608764648, "learning_rate": 9.605466642951997e-07, "loss": 0.4216, "step": 2670 }, { "epoch": 0.15412928456406716, "grad_norm": 6.009915351867676, "learning_rate": 9.60183256309059e-07, "loss": 0.4547, "step": 2680 }, { "epoch": 0.15470439383482862, "grad_norm": 9.210302352905273, "learning_rate": 9.598182516857292e-07, "loss": 0.4576, "step": 2690 }, { "epoch": 0.15527950310559005, "grad_norm": 7.249082088470459, "learning_rate": 9.594516516916184e-07, "loss": 0.4578, "step": 2700 }, { "epoch": 0.1558546123763515, "grad_norm": 7.087738513946533, "learning_rate": 9.59083457598671e-07, "loss": 0.454, "step": 2710 }, { "epoch": 0.15642972164711294, "grad_norm": 9.38258171081543, "learning_rate": 9.587136706843605e-07, "loss": 0.4511, "step": 2720 }, { "epoch": 0.1570048309178744, "grad_norm": 8.41699504852295, "learning_rate": 9.583422922316886e-07, "loss": 0.5174, "step": 2730 }, { "epoch": 0.15757994018863583, "grad_norm": 7.991817474365234, "learning_rate": 9.579693235291779e-07, "loss": 0.4346, "step": 2740 }, { "epoch": 0.1581550494593973, "grad_norm": 6.90451192855835, "learning_rate": 9.575947658708686e-07, "loss": 0.4487, "step": 2750 }, { "epoch": 0.15873015873015872, "grad_norm": 6.438752174377441, "learning_rate": 9.572186205563144e-07, "loss": 0.449, "step": 2760 }, { "epoch": 0.15930526800092018, "grad_norm": 10.701172828674316, "learning_rate": 9.56840888890577e-07, "loss": 0.4705, "step": 2770 }, { "epoch": 0.1598803772716816, "grad_norm": 7.297903060913086, "learning_rate": 9.564615721842218e-07, "loss": 0.4605, "step": 2780 }, { "epoch": 0.16045548654244307, "grad_norm": 9.851449966430664, "learning_rate": 9.560806717533146e-07, "loss": 0.4177, "step": 2790 }, { "epoch": 0.1610305958132045, "grad_norm": 6.712676048278809, "learning_rate": 9.556981889194149e-07, "loss": 0.4514, "step": 2800 }, { "epoch": 0.16160570508396596, "grad_norm": 6.611025333404541, "learning_rate": 9.55314125009573e-07, "loss": 0.4566, "step": 2810 }, { "epoch": 0.1621808143547274, "grad_norm": 9.157703399658203, "learning_rate": 9.549284813563254e-07, "loss": 0.4483, "step": 2820 }, { "epoch": 0.16275592362548885, "grad_norm": 9.179961204528809, "learning_rate": 9.545412592976884e-07, "loss": 0.4608, "step": 2830 }, { "epoch": 0.16333103289625028, "grad_norm": 6.974459171295166, "learning_rate": 9.541524601771555e-07, "loss": 0.4578, "step": 2840 }, { "epoch": 0.16390614216701174, "grad_norm": 6.843561172485352, "learning_rate": 9.537620853436922e-07, "loss": 0.4608, "step": 2850 }, { "epoch": 0.16448125143777317, "grad_norm": 9.29882526397705, "learning_rate": 9.533701361517299e-07, "loss": 0.4408, "step": 2860 }, { "epoch": 0.16505636070853463, "grad_norm": 7.002045631408691, "learning_rate": 9.529766139611635e-07, "loss": 0.4697, "step": 2870 }, { "epoch": 0.16563146997929606, "grad_norm": 5.7329421043396, "learning_rate": 9.525815201373451e-07, "loss": 0.4726, "step": 2880 }, { "epoch": 0.16620657925005752, "grad_norm": 6.277803897857666, "learning_rate": 9.521848560510795e-07, "loss": 0.4544, "step": 2890 }, { "epoch": 0.16678168852081895, "grad_norm": 6.794547080993652, "learning_rate": 9.517866230786198e-07, "loss": 0.446, "step": 2900 }, { "epoch": 0.16735679779158041, "grad_norm": 6.249117851257324, "learning_rate": 9.513868226016625e-07, "loss": 0.4405, "step": 2910 }, { "epoch": 0.16793190706234185, "grad_norm": 6.457124710083008, "learning_rate": 9.509854560073427e-07, "loss": 0.4313, "step": 2920 }, { "epoch": 0.16850701633310328, "grad_norm": 7.078563213348389, "learning_rate": 9.505825246882289e-07, "loss": 0.4886, "step": 2930 }, { "epoch": 0.16908212560386474, "grad_norm": 6.063405990600586, "learning_rate": 9.501780300423191e-07, "loss": 0.4262, "step": 2940 }, { "epoch": 0.16965723487462617, "grad_norm": 7.255430698394775, "learning_rate": 9.497719734730349e-07, "loss": 0.4684, "step": 2950 }, { "epoch": 0.17023234414538763, "grad_norm": 7.689632415771484, "learning_rate": 9.493643563892173e-07, "loss": 0.4348, "step": 2960 }, { "epoch": 0.17080745341614906, "grad_norm": 6.128988265991211, "learning_rate": 9.489551802051217e-07, "loss": 0.4277, "step": 2970 }, { "epoch": 0.17138256268691052, "grad_norm": 7.355824947357178, "learning_rate": 9.485444463404125e-07, "loss": 0.4742, "step": 2980 }, { "epoch": 0.17195767195767195, "grad_norm": 7.306033611297607, "learning_rate": 9.481321562201589e-07, "loss": 0.4438, "step": 2990 }, { "epoch": 0.1725327812284334, "grad_norm": 6.289697647094727, "learning_rate": 9.477183112748297e-07, "loss": 0.4397, "step": 3000 }, { "epoch": 0.17310789049919484, "grad_norm": 7.582209587097168, "learning_rate": 9.473029129402883e-07, "loss": 0.4514, "step": 3010 }, { "epoch": 0.1736829997699563, "grad_norm": 8.875397682189941, "learning_rate": 9.468859626577871e-07, "loss": 0.4709, "step": 3020 }, { "epoch": 0.17425810904071773, "grad_norm": 8.873878479003906, "learning_rate": 9.464674618739641e-07, "loss": 0.4279, "step": 3030 }, { "epoch": 0.1748332183114792, "grad_norm": 7.123440265655518, "learning_rate": 9.46047412040836e-07, "loss": 0.4301, "step": 3040 }, { "epoch": 0.17540832758224062, "grad_norm": 9.689279556274414, "learning_rate": 9.456258146157943e-07, "loss": 0.4345, "step": 3050 }, { "epoch": 0.17598343685300208, "grad_norm": 7.508395195007324, "learning_rate": 9.452026710615999e-07, "loss": 0.4599, "step": 3060 }, { "epoch": 0.1765585461237635, "grad_norm": 7.05275297164917, "learning_rate": 9.447779828463786e-07, "loss": 0.4768, "step": 3070 }, { "epoch": 0.17713365539452497, "grad_norm": 6.077139377593994, "learning_rate": 9.44351751443615e-07, "loss": 0.4971, "step": 3080 }, { "epoch": 0.1777087646652864, "grad_norm": 9.290302276611328, "learning_rate": 9.439239783321478e-07, "loss": 0.4421, "step": 3090 }, { "epoch": 0.17828387393604786, "grad_norm": 7.146210670471191, "learning_rate": 9.434946649961651e-07, "loss": 0.4546, "step": 3100 }, { "epoch": 0.1788589832068093, "grad_norm": 6.620079517364502, "learning_rate": 9.430638129251989e-07, "loss": 0.4607, "step": 3110 }, { "epoch": 0.17943409247757075, "grad_norm": 6.333140850067139, "learning_rate": 9.426314236141198e-07, "loss": 0.4749, "step": 3120 }, { "epoch": 0.18000920174833218, "grad_norm": 9.721125602722168, "learning_rate": 9.421974985631319e-07, "loss": 0.463, "step": 3130 }, { "epoch": 0.18058431101909364, "grad_norm": 7.114312171936035, "learning_rate": 9.417620392777679e-07, "loss": 0.4045, "step": 3140 }, { "epoch": 0.18115942028985507, "grad_norm": 6.953790664672852, "learning_rate": 9.413250472688832e-07, "loss": 0.4476, "step": 3150 }, { "epoch": 0.1817345295606165, "grad_norm": 6.260456562042236, "learning_rate": 9.408865240526518e-07, "loss": 0.4813, "step": 3160 }, { "epoch": 0.18230963883137796, "grad_norm": 7.821824550628662, "learning_rate": 9.404464711505595e-07, "loss": 0.4597, "step": 3170 }, { "epoch": 0.1828847481021394, "grad_norm": 8.789093971252441, "learning_rate": 9.400048900894003e-07, "loss": 0.4504, "step": 3180 }, { "epoch": 0.18345985737290085, "grad_norm": 7.451201915740967, "learning_rate": 9.395617824012693e-07, "loss": 0.4545, "step": 3190 }, { "epoch": 0.18403496664366228, "grad_norm": 6.279775619506836, "learning_rate": 9.391171496235591e-07, "loss": 0.4529, "step": 3200 }, { "epoch": 0.18461007591442374, "grad_norm": 7.9017133712768555, "learning_rate": 9.386709932989535e-07, "loss": 0.4404, "step": 3210 }, { "epoch": 0.18518518518518517, "grad_norm": 6.8365654945373535, "learning_rate": 9.382233149754222e-07, "loss": 0.4466, "step": 3220 }, { "epoch": 0.18576029445594663, "grad_norm": 7.044270038604736, "learning_rate": 9.377741162062156e-07, "loss": 0.4663, "step": 3230 }, { "epoch": 0.18633540372670807, "grad_norm": 7.3222198486328125, "learning_rate": 9.373233985498597e-07, "loss": 0.4374, "step": 3240 }, { "epoch": 0.18691051299746952, "grad_norm": 7.342752456665039, "learning_rate": 9.368711635701498e-07, "loss": 0.4467, "step": 3250 }, { "epoch": 0.18748562226823096, "grad_norm": 6.475831985473633, "learning_rate": 9.364174128361463e-07, "loss": 0.4258, "step": 3260 }, { "epoch": 0.18806073153899242, "grad_norm": 8.114928245544434, "learning_rate": 9.359621479221683e-07, "loss": 0.449, "step": 3270 }, { "epoch": 0.18863584080975385, "grad_norm": 8.220779418945312, "learning_rate": 9.355053704077883e-07, "loss": 0.4832, "step": 3280 }, { "epoch": 0.1892109500805153, "grad_norm": 7.149385452270508, "learning_rate": 9.350470818778271e-07, "loss": 0.4478, "step": 3290 }, { "epoch": 0.18978605935127674, "grad_norm": 7.05686616897583, "learning_rate": 9.345872839223481e-07, "loss": 0.4425, "step": 3300 }, { "epoch": 0.1903611686220382, "grad_norm": 7.715849876403809, "learning_rate": 9.341259781366515e-07, "loss": 0.4417, "step": 3310 }, { "epoch": 0.19093627789279963, "grad_norm": 8.44191837310791, "learning_rate": 9.336631661212692e-07, "loss": 0.4899, "step": 3320 }, { "epoch": 0.1915113871635611, "grad_norm": 9.569662094116211, "learning_rate": 9.331988494819593e-07, "loss": 0.4407, "step": 3330 }, { "epoch": 0.19208649643432252, "grad_norm": 6.279257297515869, "learning_rate": 9.327330298296996e-07, "loss": 0.4438, "step": 3340 }, { "epoch": 0.19266160570508398, "grad_norm": 7.9307098388671875, "learning_rate": 9.322657087806836e-07, "loss": 0.4578, "step": 3350 }, { "epoch": 0.1932367149758454, "grad_norm": 7.901614665985107, "learning_rate": 9.317968879563133e-07, "loss": 0.4383, "step": 3360 }, { "epoch": 0.19381182424660687, "grad_norm": 6.400086879730225, "learning_rate": 9.313265689831947e-07, "loss": 0.4263, "step": 3370 }, { "epoch": 0.1943869335173683, "grad_norm": 8.782342910766602, "learning_rate": 9.308547534931312e-07, "loss": 0.4584, "step": 3380 }, { "epoch": 0.19496204278812976, "grad_norm": 7.300394058227539, "learning_rate": 9.303814431231189e-07, "loss": 0.4334, "step": 3390 }, { "epoch": 0.1955371520588912, "grad_norm": 9.415356636047363, "learning_rate": 9.299066395153409e-07, "loss": 0.4205, "step": 3400 }, { "epoch": 0.19611226132965262, "grad_norm": 7.652698040008545, "learning_rate": 9.294303443171603e-07, "loss": 0.4688, "step": 3410 }, { "epoch": 0.19668737060041408, "grad_norm": 6.977850437164307, "learning_rate": 9.289525591811156e-07, "loss": 0.4522, "step": 3420 }, { "epoch": 0.1972624798711755, "grad_norm": 8.344732284545898, "learning_rate": 9.284732857649153e-07, "loss": 0.4639, "step": 3430 }, { "epoch": 0.19783758914193697, "grad_norm": 7.731173515319824, "learning_rate": 9.27992525731431e-07, "loss": 0.4686, "step": 3440 }, { "epoch": 0.1984126984126984, "grad_norm": 6.032643795013428, "learning_rate": 9.275102807486924e-07, "loss": 0.4745, "step": 3450 }, { "epoch": 0.19898780768345986, "grad_norm": 6.80446195602417, "learning_rate": 9.270265524898812e-07, "loss": 0.4214, "step": 3460 }, { "epoch": 0.1995629169542213, "grad_norm": 10.586133003234863, "learning_rate": 9.26541342633326e-07, "loss": 0.4776, "step": 3470 }, { "epoch": 0.20013802622498275, "grad_norm": 7.73087215423584, "learning_rate": 9.260546528624953e-07, "loss": 0.4193, "step": 3480 }, { "epoch": 0.20071313549574418, "grad_norm": 7.399821758270264, "learning_rate": 9.255664848659924e-07, "loss": 0.4668, "step": 3490 }, { "epoch": 0.20128824476650564, "grad_norm": 7.147819995880127, "learning_rate": 9.250768403375493e-07, "loss": 0.4212, "step": 3500 }, { "epoch": 0.20186335403726707, "grad_norm": 8.024975776672363, "learning_rate": 9.245857209760213e-07, "loss": 0.456, "step": 3510 }, { "epoch": 0.20243846330802853, "grad_norm": 7.848642826080322, "learning_rate": 9.240931284853806e-07, "loss": 0.4326, "step": 3520 }, { "epoch": 0.20301357257878996, "grad_norm": 5.983529567718506, "learning_rate": 9.235990645747101e-07, "loss": 0.4411, "step": 3530 }, { "epoch": 0.20358868184955142, "grad_norm": 6.5141119956970215, "learning_rate": 9.231035309581985e-07, "loss": 0.4449, "step": 3540 }, { "epoch": 0.20416379112031285, "grad_norm": 6.487335681915283, "learning_rate": 9.226065293551335e-07, "loss": 0.4374, "step": 3550 }, { "epoch": 0.2047389003910743, "grad_norm": 8.524537086486816, "learning_rate": 9.22108061489896e-07, "loss": 0.4694, "step": 3560 }, { "epoch": 0.20531400966183574, "grad_norm": 7.094654560089111, "learning_rate": 9.216081290919539e-07, "loss": 0.4406, "step": 3570 }, { "epoch": 0.2058891189325972, "grad_norm": 7.105228424072266, "learning_rate": 9.211067338958573e-07, "loss": 0.4741, "step": 3580 }, { "epoch": 0.20646422820335864, "grad_norm": 7.7301225662231445, "learning_rate": 9.206038776412308e-07, "loss": 0.4984, "step": 3590 }, { "epoch": 0.2070393374741201, "grad_norm": 5.989398002624512, "learning_rate": 9.200995620727684e-07, "loss": 0.4631, "step": 3600 }, { "epoch": 0.20761444674488153, "grad_norm": 6.620017051696777, "learning_rate": 9.195937889402274e-07, "loss": 0.465, "step": 3610 }, { "epoch": 0.20818955601564298, "grad_norm": 6.503094673156738, "learning_rate": 9.190865599984222e-07, "loss": 0.4495, "step": 3620 }, { "epoch": 0.20876466528640442, "grad_norm": 7.4195556640625, "learning_rate": 9.185778770072181e-07, "loss": 0.4593, "step": 3630 }, { "epoch": 0.20933977455716588, "grad_norm": 8.530793190002441, "learning_rate": 9.180677417315255e-07, "loss": 0.4391, "step": 3640 }, { "epoch": 0.2099148838279273, "grad_norm": 7.999217987060547, "learning_rate": 9.175561559412936e-07, "loss": 0.4404, "step": 3650 }, { "epoch": 0.21048999309868874, "grad_norm": 8.838647842407227, "learning_rate": 9.170431214115042e-07, "loss": 0.4595, "step": 3660 }, { "epoch": 0.2110651023694502, "grad_norm": 5.631801605224609, "learning_rate": 9.165286399221655e-07, "loss": 0.4425, "step": 3670 }, { "epoch": 0.21164021164021163, "grad_norm": 7.343861103057861, "learning_rate": 9.160127132583059e-07, "loss": 0.4509, "step": 3680 }, { "epoch": 0.2122153209109731, "grad_norm": 11.188214302062988, "learning_rate": 9.154953432099686e-07, "loss": 0.4573, "step": 3690 }, { "epoch": 0.21279043018173452, "grad_norm": 6.560288906097412, "learning_rate": 9.149765315722038e-07, "loss": 0.4316, "step": 3700 }, { "epoch": 0.21336553945249598, "grad_norm": 6.623726844787598, "learning_rate": 9.144562801450642e-07, "loss": 0.4301, "step": 3710 }, { "epoch": 0.2139406487232574, "grad_norm": 6.603649139404297, "learning_rate": 9.139345907335976e-07, "loss": 0.4449, "step": 3720 }, { "epoch": 0.21451575799401887, "grad_norm": 7.99400520324707, "learning_rate": 9.134114651478406e-07, "loss": 0.4462, "step": 3730 }, { "epoch": 0.2150908672647803, "grad_norm": 5.738957405090332, "learning_rate": 9.128869052028137e-07, "loss": 0.4257, "step": 3740 }, { "epoch": 0.21566597653554176, "grad_norm": 7.310655117034912, "learning_rate": 9.123609127185128e-07, "loss": 0.4501, "step": 3750 }, { "epoch": 0.2162410858063032, "grad_norm": 6.6573805809021, "learning_rate": 9.118334895199052e-07, "loss": 0.4561, "step": 3760 }, { "epoch": 0.21681619507706465, "grad_norm": 6.993839263916016, "learning_rate": 9.113046374369211e-07, "loss": 0.4411, "step": 3770 }, { "epoch": 0.21739130434782608, "grad_norm": 7.0291056632995605, "learning_rate": 9.107743583044493e-07, "loss": 0.4634, "step": 3780 }, { "epoch": 0.21796641361858754, "grad_norm": 6.7845778465271, "learning_rate": 9.102426539623294e-07, "loss": 0.4345, "step": 3790 }, { "epoch": 0.21854152288934897, "grad_norm": 7.999422550201416, "learning_rate": 9.097095262553458e-07, "loss": 0.4663, "step": 3800 }, { "epoch": 0.21911663216011043, "grad_norm": 7.416889190673828, "learning_rate": 9.091749770332214e-07, "loss": 0.4713, "step": 3810 }, { "epoch": 0.21969174143087186, "grad_norm": 7.8123016357421875, "learning_rate": 9.08639008150611e-07, "loss": 0.4453, "step": 3820 }, { "epoch": 0.22026685070163332, "grad_norm": 7.918675899505615, "learning_rate": 9.081016214670955e-07, "loss": 0.4364, "step": 3830 }, { "epoch": 0.22084195997239475, "grad_norm": 7.056694030761719, "learning_rate": 9.075628188471744e-07, "loss": 0.419, "step": 3840 }, { "epoch": 0.2214170692431562, "grad_norm": 8.189493179321289, "learning_rate": 9.070226021602603e-07, "loss": 0.4457, "step": 3850 }, { "epoch": 0.22199217851391764, "grad_norm": 7.041081428527832, "learning_rate": 9.064809732806718e-07, "loss": 0.4505, "step": 3860 }, { "epoch": 0.2225672877846791, "grad_norm": 7.861651420593262, "learning_rate": 9.059379340876273e-07, "loss": 0.434, "step": 3870 }, { "epoch": 0.22314239705544053, "grad_norm": 7.338512897491455, "learning_rate": 9.053934864652382e-07, "loss": 0.445, "step": 3880 }, { "epoch": 0.22371750632620196, "grad_norm": 7.283870220184326, "learning_rate": 9.048476323025025e-07, "loss": 0.4196, "step": 3890 }, { "epoch": 0.22429261559696342, "grad_norm": 8.336540222167969, "learning_rate": 9.043003734932988e-07, "loss": 0.4643, "step": 3900 }, { "epoch": 0.22486772486772486, "grad_norm": 5.803771495819092, "learning_rate": 9.037517119363787e-07, "loss": 0.4548, "step": 3910 }, { "epoch": 0.22544283413848631, "grad_norm": 7.371987819671631, "learning_rate": 9.032016495353608e-07, "loss": 0.432, "step": 3920 }, { "epoch": 0.22601794340924775, "grad_norm": 6.515702724456787, "learning_rate": 9.026501881987241e-07, "loss": 0.4388, "step": 3930 }, { "epoch": 0.2265930526800092, "grad_norm": 6.624001979827881, "learning_rate": 9.020973298398015e-07, "loss": 0.4238, "step": 3940 }, { "epoch": 0.22716816195077064, "grad_norm": 8.837011337280273, "learning_rate": 9.015430763767726e-07, "loss": 0.4103, "step": 3950 }, { "epoch": 0.2277432712215321, "grad_norm": 7.257904529571533, "learning_rate": 9.009874297326576e-07, "loss": 0.4501, "step": 3960 }, { "epoch": 0.22831838049229353, "grad_norm": 5.916260242462158, "learning_rate": 9.004303918353107e-07, "loss": 0.431, "step": 3970 }, { "epoch": 0.22889348976305499, "grad_norm": 7.61647891998291, "learning_rate": 8.998719646174125e-07, "loss": 0.4449, "step": 3980 }, { "epoch": 0.22946859903381642, "grad_norm": 7.539576530456543, "learning_rate": 8.993121500164645e-07, "loss": 0.4226, "step": 3990 }, { "epoch": 0.23004370830457788, "grad_norm": 7.2787089347839355, "learning_rate": 8.987509499747814e-07, "loss": 0.4287, "step": 4000 }, { "epoch": 0.2306188175753393, "grad_norm": 7.724874496459961, "learning_rate": 8.981883664394853e-07, "loss": 0.4408, "step": 4010 }, { "epoch": 0.23119392684610077, "grad_norm": 6.672788619995117, "learning_rate": 8.976244013624983e-07, "loss": 0.438, "step": 4020 }, { "epoch": 0.2317690361168622, "grad_norm": 7.771937847137451, "learning_rate": 8.970590567005352e-07, "loss": 0.4217, "step": 4030 }, { "epoch": 0.23234414538762366, "grad_norm": 7.927281856536865, "learning_rate": 8.964923344150982e-07, "loss": 0.4378, "step": 4040 }, { "epoch": 0.2329192546583851, "grad_norm": 9.862508773803711, "learning_rate": 8.959242364724688e-07, "loss": 0.4716, "step": 4050 }, { "epoch": 0.23349436392914655, "grad_norm": 6.6189799308776855, "learning_rate": 8.953547648437015e-07, "loss": 0.45, "step": 4060 }, { "epoch": 0.23406947319990798, "grad_norm": 7.498517036437988, "learning_rate": 8.947839215046171e-07, "loss": 0.4222, "step": 4070 }, { "epoch": 0.23464458247066944, "grad_norm": 8.622501373291016, "learning_rate": 8.942117084357954e-07, "loss": 0.4269, "step": 4080 }, { "epoch": 0.23521969174143087, "grad_norm": 7.8676958084106445, "learning_rate": 8.936381276225686e-07, "loss": 0.4491, "step": 4090 }, { "epoch": 0.23579480101219233, "grad_norm": 7.693765163421631, "learning_rate": 8.930631810550146e-07, "loss": 0.4364, "step": 4100 }, { "epoch": 0.23636991028295376, "grad_norm": 8.006646156311035, "learning_rate": 8.924868707279496e-07, "loss": 0.4715, "step": 4110 }, { "epoch": 0.23694501955371522, "grad_norm": 7.287476539611816, "learning_rate": 8.919091986409218e-07, "loss": 0.4137, "step": 4120 }, { "epoch": 0.23752012882447665, "grad_norm": 7.429540634155273, "learning_rate": 8.913301667982035e-07, "loss": 0.4397, "step": 4130 }, { "epoch": 0.23809523809523808, "grad_norm": 8.005236625671387, "learning_rate": 8.907497772087854e-07, "loss": 0.4486, "step": 4140 }, { "epoch": 0.23867034736599954, "grad_norm": 6.451951026916504, "learning_rate": 8.901680318863689e-07, "loss": 0.4278, "step": 4150 }, { "epoch": 0.23924545663676097, "grad_norm": 9.805582046508789, "learning_rate": 8.895849328493588e-07, "loss": 0.4609, "step": 4160 }, { "epoch": 0.23982056590752243, "grad_norm": 6.742419242858887, "learning_rate": 8.890004821208571e-07, "loss": 0.4591, "step": 4170 }, { "epoch": 0.24039567517828386, "grad_norm": 6.634343147277832, "learning_rate": 8.884146817286555e-07, "loss": 0.4499, "step": 4180 }, { "epoch": 0.24097078444904532, "grad_norm": 7.307589530944824, "learning_rate": 8.878275337052282e-07, "loss": 0.4135, "step": 4190 }, { "epoch": 0.24154589371980675, "grad_norm": 7.264256000518799, "learning_rate": 8.872390400877256e-07, "loss": 0.4564, "step": 4200 }, { "epoch": 0.2421210029905682, "grad_norm": 8.509105682373047, "learning_rate": 8.866492029179663e-07, "loss": 0.4393, "step": 4210 }, { "epoch": 0.24269611226132964, "grad_norm": 6.191605567932129, "learning_rate": 8.860580242424306e-07, "loss": 0.4173, "step": 4220 }, { "epoch": 0.2432712215320911, "grad_norm": 6.38659143447876, "learning_rate": 8.854655061122531e-07, "loss": 0.4464, "step": 4230 }, { "epoch": 0.24384633080285253, "grad_norm": 7.845058441162109, "learning_rate": 8.848716505832161e-07, "loss": 0.4628, "step": 4240 }, { "epoch": 0.244421440073614, "grad_norm": 6.523630142211914, "learning_rate": 8.842764597157416e-07, "loss": 0.429, "step": 4250 }, { "epoch": 0.24499654934437542, "grad_norm": 8.246659278869629, "learning_rate": 8.83679935574885e-07, "loss": 0.4404, "step": 4260 }, { "epoch": 0.24557165861513688, "grad_norm": 7.586349964141846, "learning_rate": 8.830820802303277e-07, "loss": 0.4268, "step": 4270 }, { "epoch": 0.24614676788589832, "grad_norm": 6.173338890075684, "learning_rate": 8.824828957563692e-07, "loss": 0.4276, "step": 4280 }, { "epoch": 0.24672187715665977, "grad_norm": 8.178177833557129, "learning_rate": 8.818823842319212e-07, "loss": 0.4752, "step": 4290 }, { "epoch": 0.2472969864274212, "grad_norm": 8.361733436584473, "learning_rate": 8.812805477404993e-07, "loss": 0.4461, "step": 4300 }, { "epoch": 0.24787209569818267, "grad_norm": 6.543208599090576, "learning_rate": 8.806773883702163e-07, "loss": 0.4329, "step": 4310 }, { "epoch": 0.2484472049689441, "grad_norm": 6.091966152191162, "learning_rate": 8.800729082137745e-07, "loss": 0.4263, "step": 4320 }, { "epoch": 0.24902231423970556, "grad_norm": 7.65754508972168, "learning_rate": 8.794671093684594e-07, "loss": 0.4274, "step": 4330 }, { "epoch": 0.249597423510467, "grad_norm": 7.258798599243164, "learning_rate": 8.788599939361311e-07, "loss": 0.4593, "step": 4340 }, { "epoch": 0.2501725327812284, "grad_norm": 7.0468926429748535, "learning_rate": 8.782515640232182e-07, "loss": 0.434, "step": 4350 }, { "epoch": 0.2507476420519899, "grad_norm": 6.3189544677734375, "learning_rate": 8.776418217407095e-07, "loss": 0.427, "step": 4360 }, { "epoch": 0.25132275132275134, "grad_norm": 8.817581176757812, "learning_rate": 8.770307692041479e-07, "loss": 0.4119, "step": 4370 }, { "epoch": 0.25189786059351277, "grad_norm": 6.4999542236328125, "learning_rate": 8.764184085336213e-07, "loss": 0.4163, "step": 4380 }, { "epoch": 0.2524729698642742, "grad_norm": 8.368284225463867, "learning_rate": 8.758047418537573e-07, "loss": 0.4317, "step": 4390 }, { "epoch": 0.25304807913503563, "grad_norm": 9.475980758666992, "learning_rate": 8.751897712937138e-07, "loss": 0.44, "step": 4400 }, { "epoch": 0.2536231884057971, "grad_norm": 8.334697723388672, "learning_rate": 8.745734989871736e-07, "loss": 0.4004, "step": 4410 }, { "epoch": 0.25419829767655855, "grad_norm": 7.612847805023193, "learning_rate": 8.739559270723352e-07, "loss": 0.4316, "step": 4420 }, { "epoch": 0.25477340694732, "grad_norm": 6.775670051574707, "learning_rate": 8.733370576919065e-07, "loss": 0.4171, "step": 4430 }, { "epoch": 0.2553485162180814, "grad_norm": 7.52505350112915, "learning_rate": 8.72716892993097e-07, "loss": 0.4272, "step": 4440 }, { "epoch": 0.2559236254888429, "grad_norm": 6.064124584197998, "learning_rate": 8.720954351276104e-07, "loss": 0.4477, "step": 4450 }, { "epoch": 0.25649873475960433, "grad_norm": 9.010953903198242, "learning_rate": 8.714726862516372e-07, "loss": 0.4379, "step": 4460 }, { "epoch": 0.25707384403036576, "grad_norm": 6.36660623550415, "learning_rate": 8.708486485258474e-07, "loss": 0.422, "step": 4470 }, { "epoch": 0.2576489533011272, "grad_norm": 7.115530014038086, "learning_rate": 8.702233241153818e-07, "loss": 0.4308, "step": 4480 }, { "epoch": 0.2582240625718887, "grad_norm": 6.13870096206665, "learning_rate": 8.695967151898467e-07, "loss": 0.4116, "step": 4490 }, { "epoch": 0.2587991718426501, "grad_norm": 6.9618730545043945, "learning_rate": 8.68968823923304e-07, "loss": 0.4107, "step": 4500 }, { "epoch": 0.25937428111341154, "grad_norm": 7.7726287841796875, "learning_rate": 8.683396524942654e-07, "loss": 0.4452, "step": 4510 }, { "epoch": 0.259949390384173, "grad_norm": 7.81471061706543, "learning_rate": 8.677092030856845e-07, "loss": 0.432, "step": 4520 }, { "epoch": 0.26052449965493446, "grad_norm": 9.879206657409668, "learning_rate": 8.670774778849479e-07, "loss": 0.4298, "step": 4530 }, { "epoch": 0.2610996089256959, "grad_norm": 7.656399250030518, "learning_rate": 8.664444790838697e-07, "loss": 0.4458, "step": 4540 }, { "epoch": 0.2616747181964573, "grad_norm": 6.9384684562683105, "learning_rate": 8.658102088786821e-07, "loss": 0.4187, "step": 4550 }, { "epoch": 0.26224982746721875, "grad_norm": 8.309664726257324, "learning_rate": 8.651746694700292e-07, "loss": 0.4381, "step": 4560 }, { "epoch": 0.26282493673798024, "grad_norm": 11.52549934387207, "learning_rate": 8.645378630629579e-07, "loss": 0.4225, "step": 4570 }, { "epoch": 0.2634000460087417, "grad_norm": 7.769007682800293, "learning_rate": 8.638997918669117e-07, "loss": 0.443, "step": 4580 }, { "epoch": 0.2639751552795031, "grad_norm": 8.543374061584473, "learning_rate": 8.632604580957221e-07, "loss": 0.4547, "step": 4590 }, { "epoch": 0.26455026455026454, "grad_norm": 7.566020965576172, "learning_rate": 8.626198639676013e-07, "loss": 0.42, "step": 4600 }, { "epoch": 0.265125373821026, "grad_norm": 5.682227611541748, "learning_rate": 8.619780117051341e-07, "loss": 0.4459, "step": 4610 }, { "epoch": 0.26570048309178745, "grad_norm": 8.134650230407715, "learning_rate": 8.613349035352708e-07, "loss": 0.433, "step": 4620 }, { "epoch": 0.2662755923625489, "grad_norm": 9.267498970031738, "learning_rate": 8.606905416893191e-07, "loss": 0.445, "step": 4630 }, { "epoch": 0.2668507016333103, "grad_norm": 7.135440826416016, "learning_rate": 8.600449284029363e-07, "loss": 0.4299, "step": 4640 }, { "epoch": 0.26742581090407175, "grad_norm": 7.700968265533447, "learning_rate": 8.593980659161218e-07, "loss": 0.3996, "step": 4650 }, { "epoch": 0.26800092017483323, "grad_norm": 6.853052139282227, "learning_rate": 8.587499564732089e-07, "loss": 0.4209, "step": 4660 }, { "epoch": 0.26857602944559467, "grad_norm": 8.014687538146973, "learning_rate": 8.581006023228579e-07, "loss": 0.4288, "step": 4670 }, { "epoch": 0.2691511387163561, "grad_norm": 7.333747863769531, "learning_rate": 8.574500057180469e-07, "loss": 0.408, "step": 4680 }, { "epoch": 0.26972624798711753, "grad_norm": 6.5996294021606445, "learning_rate": 8.567981689160653e-07, "loss": 0.4227, "step": 4690 }, { "epoch": 0.270301357257879, "grad_norm": 8.393999099731445, "learning_rate": 8.561450941785056e-07, "loss": 0.4048, "step": 4700 }, { "epoch": 0.27087646652864045, "grad_norm": 7.693059921264648, "learning_rate": 8.554907837712548e-07, "loss": 0.4277, "step": 4710 }, { "epoch": 0.2714515757994019, "grad_norm": 9.395722389221191, "learning_rate": 8.548352399644876e-07, "loss": 0.4378, "step": 4720 }, { "epoch": 0.2720266850701633, "grad_norm": 6.264937877655029, "learning_rate": 8.54178465032658e-07, "loss": 0.4076, "step": 4730 }, { "epoch": 0.2726017943409248, "grad_norm": 7.825188636779785, "learning_rate": 8.535204612544913e-07, "loss": 0.4105, "step": 4740 }, { "epoch": 0.27317690361168623, "grad_norm": 6.873167991638184, "learning_rate": 8.528612309129766e-07, "loss": 0.3929, "step": 4750 }, { "epoch": 0.27375201288244766, "grad_norm": 7.05859899520874, "learning_rate": 8.522007762953585e-07, "loss": 0.4283, "step": 4760 }, { "epoch": 0.2743271221532091, "grad_norm": 8.042074203491211, "learning_rate": 8.51539099693129e-07, "loss": 0.4362, "step": 4770 }, { "epoch": 0.2749022314239706, "grad_norm": 7.675294399261475, "learning_rate": 8.508762034020209e-07, "loss": 0.4242, "step": 4780 }, { "epoch": 0.275477340694732, "grad_norm": 8.475811958312988, "learning_rate": 8.502120897219975e-07, "loss": 0.4612, "step": 4790 }, { "epoch": 0.27605244996549344, "grad_norm": 7.014756202697754, "learning_rate": 8.495467609572467e-07, "loss": 0.4185, "step": 4800 }, { "epoch": 0.27662755923625487, "grad_norm": 7.501617431640625, "learning_rate": 8.488802194161719e-07, "loss": 0.4347, "step": 4810 }, { "epoch": 0.27720266850701636, "grad_norm": 7.894189357757568, "learning_rate": 8.482124674113848e-07, "loss": 0.3896, "step": 4820 }, { "epoch": 0.2777777777777778, "grad_norm": 7.893086910247803, "learning_rate": 8.475435072596961e-07, "loss": 0.4562, "step": 4830 }, { "epoch": 0.2783528870485392, "grad_norm": 6.423941135406494, "learning_rate": 8.468733412821087e-07, "loss": 0.3893, "step": 4840 }, { "epoch": 0.27892799631930065, "grad_norm": 7.725098133087158, "learning_rate": 8.462019718038095e-07, "loss": 0.4171, "step": 4850 }, { "epoch": 0.2795031055900621, "grad_norm": 5.74798059463501, "learning_rate": 8.455294011541602e-07, "loss": 0.4126, "step": 4860 }, { "epoch": 0.28007821486082357, "grad_norm": 10.348775863647461, "learning_rate": 8.448556316666911e-07, "loss": 0.4327, "step": 4870 }, { "epoch": 0.280653324131585, "grad_norm": 8.3062105178833, "learning_rate": 8.441806656790908e-07, "loss": 0.4328, "step": 4880 }, { "epoch": 0.28122843340234643, "grad_norm": 7.227969169616699, "learning_rate": 8.435045055332005e-07, "loss": 0.4584, "step": 4890 }, { "epoch": 0.28180354267310787, "grad_norm": 7.532920837402344, "learning_rate": 8.428271535750032e-07, "loss": 0.4442, "step": 4900 }, { "epoch": 0.28237865194386935, "grad_norm": 17.655643463134766, "learning_rate": 8.421486121546179e-07, "loss": 0.418, "step": 4910 }, { "epoch": 0.2829537612146308, "grad_norm": 8.107803344726562, "learning_rate": 8.414688836262909e-07, "loss": 0.4202, "step": 4920 }, { "epoch": 0.2835288704853922, "grad_norm": 8.114568710327148, "learning_rate": 8.40787970348386e-07, "loss": 0.4421, "step": 4930 }, { "epoch": 0.28410397975615365, "grad_norm": 7.049617290496826, "learning_rate": 8.401058746833785e-07, "loss": 0.4331, "step": 4940 }, { "epoch": 0.28467908902691513, "grad_norm": 7.685065269470215, "learning_rate": 8.39422598997846e-07, "loss": 0.4344, "step": 4950 }, { "epoch": 0.28525419829767656, "grad_norm": 8.698698997497559, "learning_rate": 8.387381456624598e-07, "loss": 0.4421, "step": 4960 }, { "epoch": 0.285829307568438, "grad_norm": 7.022157192230225, "learning_rate": 8.380525170519775e-07, "loss": 0.41, "step": 4970 }, { "epoch": 0.2864044168391994, "grad_norm": 8.361230850219727, "learning_rate": 8.373657155452345e-07, "loss": 0.4471, "step": 4980 }, { "epoch": 0.2869795261099609, "grad_norm": 8.506590843200684, "learning_rate": 8.366777435251352e-07, "loss": 0.419, "step": 4990 }, { "epoch": 0.28755463538072235, "grad_norm": 6.984246730804443, "learning_rate": 8.359886033786456e-07, "loss": 0.407, "step": 5000 }, { "epoch": 0.2881297446514838, "grad_norm": 6.893902778625488, "learning_rate": 8.352982974967843e-07, "loss": 0.4252, "step": 5010 }, { "epoch": 0.2887048539222452, "grad_norm": 6.882055759429932, "learning_rate": 8.346068282746151e-07, "loss": 0.4216, "step": 5020 }, { "epoch": 0.2892799631930067, "grad_norm": 7.890246868133545, "learning_rate": 8.339141981112368e-07, "loss": 0.4416, "step": 5030 }, { "epoch": 0.2898550724637681, "grad_norm": 9.570390701293945, "learning_rate": 8.332204094097776e-07, "loss": 0.402, "step": 5040 }, { "epoch": 0.29043018173452956, "grad_norm": 8.155569076538086, "learning_rate": 8.325254645773847e-07, "loss": 0.4206, "step": 5050 }, { "epoch": 0.291005291005291, "grad_norm": 7.394750118255615, "learning_rate": 8.318293660252162e-07, "loss": 0.4497, "step": 5060 }, { "epoch": 0.2915804002760525, "grad_norm": 7.165675163269043, "learning_rate": 8.311321161684339e-07, "loss": 0.4085, "step": 5070 }, { "epoch": 0.2921555095468139, "grad_norm": 9.274934768676758, "learning_rate": 8.304337174261935e-07, "loss": 0.4264, "step": 5080 }, { "epoch": 0.29273061881757534, "grad_norm": 7.784249305725098, "learning_rate": 8.297341722216371e-07, "loss": 0.4381, "step": 5090 }, { "epoch": 0.29330572808833677, "grad_norm": 6.5512285232543945, "learning_rate": 8.290334829818845e-07, "loss": 0.437, "step": 5100 }, { "epoch": 0.2938808373590982, "grad_norm": 7.652458667755127, "learning_rate": 8.283316521380249e-07, "loss": 0.4593, "step": 5110 }, { "epoch": 0.2944559466298597, "grad_norm": 9.981424331665039, "learning_rate": 8.276286821251081e-07, "loss": 0.4239, "step": 5120 }, { "epoch": 0.2950310559006211, "grad_norm": 9.13131046295166, "learning_rate": 8.269245753821366e-07, "loss": 0.4259, "step": 5130 }, { "epoch": 0.29560616517138255, "grad_norm": 11.011831283569336, "learning_rate": 8.262193343520567e-07, "loss": 0.4605, "step": 5140 }, { "epoch": 0.296181274442144, "grad_norm": 7.364424705505371, "learning_rate": 8.255129614817503e-07, "loss": 0.4511, "step": 5150 }, { "epoch": 0.29675638371290547, "grad_norm": 6.667275428771973, "learning_rate": 8.248054592220263e-07, "loss": 0.4081, "step": 5160 }, { "epoch": 0.2973314929836669, "grad_norm": 9.481012344360352, "learning_rate": 8.240968300276119e-07, "loss": 0.4443, "step": 5170 }, { "epoch": 0.29790660225442833, "grad_norm": 9.473079681396484, "learning_rate": 8.233870763571444e-07, "loss": 0.4248, "step": 5180 }, { "epoch": 0.29848171152518976, "grad_norm": 7.815703392028809, "learning_rate": 8.226762006731627e-07, "loss": 0.4319, "step": 5190 }, { "epoch": 0.29905682079595125, "grad_norm": 7.1472907066345215, "learning_rate": 8.219642054420986e-07, "loss": 0.4496, "step": 5200 }, { "epoch": 0.2996319300667127, "grad_norm": 8.527371406555176, "learning_rate": 8.212510931342676e-07, "loss": 0.4216, "step": 5210 }, { "epoch": 0.3002070393374741, "grad_norm": 9.398560523986816, "learning_rate": 8.205368662238618e-07, "loss": 0.4177, "step": 5220 }, { "epoch": 0.30078214860823554, "grad_norm": 9.455587387084961, "learning_rate": 8.198215271889404e-07, "loss": 0.4305, "step": 5230 }, { "epoch": 0.30135725787899703, "grad_norm": 7.393576145172119, "learning_rate": 8.191050785114206e-07, "loss": 0.4105, "step": 5240 }, { "epoch": 0.30193236714975846, "grad_norm": 6.751893520355225, "learning_rate": 8.183875226770701e-07, "loss": 0.4116, "step": 5250 }, { "epoch": 0.3025074764205199, "grad_norm": 7.984627723693848, "learning_rate": 8.176688621754976e-07, "loss": 0.4055, "step": 5260 }, { "epoch": 0.3030825856912813, "grad_norm": 7.323383331298828, "learning_rate": 8.169490995001451e-07, "loss": 0.4176, "step": 5270 }, { "epoch": 0.3036576949620428, "grad_norm": 6.1765570640563965, "learning_rate": 8.16228237148278e-07, "loss": 0.4186, "step": 5280 }, { "epoch": 0.30423280423280424, "grad_norm": 8.252169609069824, "learning_rate": 8.155062776209776e-07, "loss": 0.4182, "step": 5290 }, { "epoch": 0.3048079135035657, "grad_norm": 8.166460990905762, "learning_rate": 8.147832234231313e-07, "loss": 0.4478, "step": 5300 }, { "epoch": 0.3053830227743271, "grad_norm": 7.824160099029541, "learning_rate": 8.140590770634255e-07, "loss": 0.462, "step": 5310 }, { "epoch": 0.3059581320450886, "grad_norm": 6.750142574310303, "learning_rate": 8.133338410543349e-07, "loss": 0.4114, "step": 5320 }, { "epoch": 0.30653324131585, "grad_norm": 7.4868364334106445, "learning_rate": 8.126075179121155e-07, "loss": 0.4098, "step": 5330 }, { "epoch": 0.30710835058661146, "grad_norm": 8.23891544342041, "learning_rate": 8.118801101567949e-07, "loss": 0.4315, "step": 5340 }, { "epoch": 0.3076834598573729, "grad_norm": 6.802544593811035, "learning_rate": 8.111516203121643e-07, "loss": 0.4076, "step": 5350 }, { "epoch": 0.3082585691281343, "grad_norm": 6.818166255950928, "learning_rate": 8.104220509057682e-07, "loss": 0.4083, "step": 5360 }, { "epoch": 0.3088336783988958, "grad_norm": 7.149864673614502, "learning_rate": 8.096914044688979e-07, "loss": 0.4112, "step": 5370 }, { "epoch": 0.30940878766965724, "grad_norm": 10.085858345031738, "learning_rate": 8.089596835365808e-07, "loss": 0.4169, "step": 5380 }, { "epoch": 0.30998389694041867, "grad_norm": 10.012545585632324, "learning_rate": 8.082268906475729e-07, "loss": 0.411, "step": 5390 }, { "epoch": 0.3105590062111801, "grad_norm": 6.078998565673828, "learning_rate": 8.074930283443488e-07, "loss": 0.4189, "step": 5400 }, { "epoch": 0.3111341154819416, "grad_norm": 8.843835830688477, "learning_rate": 8.067580991730938e-07, "loss": 0.4375, "step": 5410 }, { "epoch": 0.311709224752703, "grad_norm": 5.870880126953125, "learning_rate": 8.060221056836948e-07, "loss": 0.4082, "step": 5420 }, { "epoch": 0.31228433402346445, "grad_norm": 7.027002334594727, "learning_rate": 8.052850504297317e-07, "loss": 0.4247, "step": 5430 }, { "epoch": 0.3128594432942259, "grad_norm": 6.229483127593994, "learning_rate": 8.045469359684676e-07, "loss": 0.3907, "step": 5440 }, { "epoch": 0.31343455256498737, "grad_norm": 9.400445938110352, "learning_rate": 8.038077648608412e-07, "loss": 0.4112, "step": 5450 }, { "epoch": 0.3140096618357488, "grad_norm": 9.403877258300781, "learning_rate": 8.030675396714568e-07, "loss": 0.4146, "step": 5460 }, { "epoch": 0.31458477110651023, "grad_norm": 7.73607063293457, "learning_rate": 8.023262629685764e-07, "loss": 0.4219, "step": 5470 }, { "epoch": 0.31515988037727166, "grad_norm": 8.176775932312012, "learning_rate": 8.0158393732411e-07, "loss": 0.3975, "step": 5480 }, { "epoch": 0.31573498964803315, "grad_norm": 8.81912612915039, "learning_rate": 8.00840565313607e-07, "loss": 0.4395, "step": 5490 }, { "epoch": 0.3163100989187946, "grad_norm": 8.939974784851074, "learning_rate": 8.000961495162474e-07, "loss": 0.423, "step": 5500 }, { "epoch": 0.316885208189556, "grad_norm": 9.536949157714844, "learning_rate": 7.993506925148323e-07, "loss": 0.4216, "step": 5510 }, { "epoch": 0.31746031746031744, "grad_norm": 7.893071174621582, "learning_rate": 7.98604196895776e-07, "loss": 0.4005, "step": 5520 }, { "epoch": 0.31803542673107893, "grad_norm": 9.107876777648926, "learning_rate": 7.978566652490957e-07, "loss": 0.4303, "step": 5530 }, { "epoch": 0.31861053600184036, "grad_norm": 7.277716636657715, "learning_rate": 7.971081001684036e-07, "loss": 0.4388, "step": 5540 }, { "epoch": 0.3191856452726018, "grad_norm": 6.5604143142700195, "learning_rate": 7.963585042508971e-07, "loss": 0.4608, "step": 5550 }, { "epoch": 0.3197607545433632, "grad_norm": 6.244699478149414, "learning_rate": 7.956078800973504e-07, "loss": 0.387, "step": 5560 }, { "epoch": 0.3203358638141247, "grad_norm": 6.448412895202637, "learning_rate": 7.948562303121051e-07, "loss": 0.4128, "step": 5570 }, { "epoch": 0.32091097308488614, "grad_norm": 10.246508598327637, "learning_rate": 7.941035575030617e-07, "loss": 0.4086, "step": 5580 }, { "epoch": 0.3214860823556476, "grad_norm": 8.837204933166504, "learning_rate": 7.933498642816696e-07, "loss": 0.4114, "step": 5590 }, { "epoch": 0.322061191626409, "grad_norm": 6.853328704833984, "learning_rate": 7.92595153262919e-07, "loss": 0.425, "step": 5600 }, { "epoch": 0.32263630089717044, "grad_norm": 7.165212631225586, "learning_rate": 7.918394270653308e-07, "loss": 0.3992, "step": 5610 }, { "epoch": 0.3232114101679319, "grad_norm": 10.715254783630371, "learning_rate": 7.91082688310949e-07, "loss": 0.4304, "step": 5620 }, { "epoch": 0.32378651943869335, "grad_norm": 6.389898300170898, "learning_rate": 7.903249396253302e-07, "loss": 0.4311, "step": 5630 }, { "epoch": 0.3243616287094548, "grad_norm": 7.129954814910889, "learning_rate": 7.895661836375353e-07, "loss": 0.4242, "step": 5640 }, { "epoch": 0.3249367379802162, "grad_norm": 7.64763069152832, "learning_rate": 7.888064229801197e-07, "loss": 0.4398, "step": 5650 }, { "epoch": 0.3255118472509777, "grad_norm": 8.438844680786133, "learning_rate": 7.880456602891249e-07, "loss": 0.4047, "step": 5660 }, { "epoch": 0.32608695652173914, "grad_norm": 5.858060836791992, "learning_rate": 7.872838982040691e-07, "loss": 0.4296, "step": 5670 }, { "epoch": 0.32666206579250057, "grad_norm": 6.910191535949707, "learning_rate": 7.865211393679373e-07, "loss": 0.4517, "step": 5680 }, { "epoch": 0.327237175063262, "grad_norm": 9.14713191986084, "learning_rate": 7.857573864271738e-07, "loss": 0.4259, "step": 5690 }, { "epoch": 0.3278122843340235, "grad_norm": 6.044519424438477, "learning_rate": 7.849926420316715e-07, "loss": 0.3938, "step": 5700 }, { "epoch": 0.3283873936047849, "grad_norm": 7.824113368988037, "learning_rate": 7.842269088347629e-07, "loss": 0.4309, "step": 5710 }, { "epoch": 0.32896250287554635, "grad_norm": 9.161027908325195, "learning_rate": 7.834601894932117e-07, "loss": 0.4373, "step": 5720 }, { "epoch": 0.3295376121463078, "grad_norm": 6.452655792236328, "learning_rate": 7.82692486667203e-07, "loss": 0.4101, "step": 5730 }, { "epoch": 0.33011272141706927, "grad_norm": 7.519268035888672, "learning_rate": 7.819238030203341e-07, "loss": 0.4247, "step": 5740 }, { "epoch": 0.3306878306878307, "grad_norm": 10.807053565979004, "learning_rate": 7.811541412196052e-07, "loss": 0.3668, "step": 5750 }, { "epoch": 0.33126293995859213, "grad_norm": 8.221691131591797, "learning_rate": 7.803835039354105e-07, "loss": 0.4213, "step": 5760 }, { "epoch": 0.33183804922935356, "grad_norm": 8.100844383239746, "learning_rate": 7.796118938415288e-07, "loss": 0.4357, "step": 5770 }, { "epoch": 0.33241315850011505, "grad_norm": 7.710001468658447, "learning_rate": 7.788393136151137e-07, "loss": 0.4311, "step": 5780 }, { "epoch": 0.3329882677708765, "grad_norm": 8.31539249420166, "learning_rate": 7.78065765936685e-07, "loss": 0.4167, "step": 5790 }, { "epoch": 0.3335633770416379, "grad_norm": 8.466967582702637, "learning_rate": 7.772912534901191e-07, "loss": 0.458, "step": 5800 }, { "epoch": 0.33413848631239934, "grad_norm": 6.750044822692871, "learning_rate": 7.765157789626398e-07, "loss": 0.4403, "step": 5810 }, { "epoch": 0.33471359558316083, "grad_norm": 7.744177341461182, "learning_rate": 7.757393450448088e-07, "loss": 0.4232, "step": 5820 }, { "epoch": 0.33528870485392226, "grad_norm": 8.18170166015625, "learning_rate": 7.749619544305168e-07, "loss": 0.4593, "step": 5830 }, { "epoch": 0.3358638141246837, "grad_norm": 8.732441902160645, "learning_rate": 7.741836098169732e-07, "loss": 0.4144, "step": 5840 }, { "epoch": 0.3364389233954451, "grad_norm": 6.030580997467041, "learning_rate": 7.734043139046978e-07, "loss": 0.4226, "step": 5850 }, { "epoch": 0.33701403266620655, "grad_norm": 6.181154251098633, "learning_rate": 7.726240693975111e-07, "loss": 0.4277, "step": 5860 }, { "epoch": 0.33758914193696804, "grad_norm": 9.514756202697754, "learning_rate": 7.718428790025244e-07, "loss": 0.4299, "step": 5870 }, { "epoch": 0.33816425120772947, "grad_norm": 6.474589824676514, "learning_rate": 7.710607454301311e-07, "loss": 0.398, "step": 5880 }, { "epoch": 0.3387393604784909, "grad_norm": 7.582132816314697, "learning_rate": 7.702776713939971e-07, "loss": 0.4271, "step": 5890 }, { "epoch": 0.33931446974925233, "grad_norm": 7.031158924102783, "learning_rate": 7.694936596110509e-07, "loss": 0.4052, "step": 5900 }, { "epoch": 0.3398895790200138, "grad_norm": 6.694680213928223, "learning_rate": 7.687087128014752e-07, "loss": 0.4488, "step": 5910 }, { "epoch": 0.34046468829077525, "grad_norm": 6.710944652557373, "learning_rate": 7.679228336886962e-07, "loss": 0.4183, "step": 5920 }, { "epoch": 0.3410397975615367, "grad_norm": 7.997107982635498, "learning_rate": 7.671360249993752e-07, "loss": 0.4057, "step": 5930 }, { "epoch": 0.3416149068322981, "grad_norm": 8.980799674987793, "learning_rate": 7.663482894633986e-07, "loss": 0.4124, "step": 5940 }, { "epoch": 0.3421900161030596, "grad_norm": 6.871220111846924, "learning_rate": 7.655596298138682e-07, "loss": 0.4271, "step": 5950 }, { "epoch": 0.34276512537382103, "grad_norm": 5.300321102142334, "learning_rate": 7.647700487870931e-07, "loss": 0.406, "step": 5960 }, { "epoch": 0.34334023464458246, "grad_norm": 7.73538064956665, "learning_rate": 7.639795491225783e-07, "loss": 0.4102, "step": 5970 }, { "epoch": 0.3439153439153439, "grad_norm": 7.657395839691162, "learning_rate": 7.631881335630159e-07, "loss": 0.4096, "step": 5980 }, { "epoch": 0.3444904531861054, "grad_norm": 8.956995964050293, "learning_rate": 7.623958048542767e-07, "loss": 0.4132, "step": 5990 }, { "epoch": 0.3450655624568668, "grad_norm": 8.866388320922852, "learning_rate": 7.616025657453987e-07, "loss": 0.4187, "step": 6000 }, { "epoch": 0.34564067172762825, "grad_norm": 8.416523933410645, "learning_rate": 7.608084189885793e-07, "loss": 0.4128, "step": 6010 }, { "epoch": 0.3462157809983897, "grad_norm": 8.050093650817871, "learning_rate": 7.600133673391648e-07, "loss": 0.4129, "step": 6020 }, { "epoch": 0.34679089026915116, "grad_norm": 9.310523986816406, "learning_rate": 7.592174135556413e-07, "loss": 0.3988, "step": 6030 }, { "epoch": 0.3473659995399126, "grad_norm": 7.3867268562316895, "learning_rate": 7.584205603996246e-07, "loss": 0.4361, "step": 6040 }, { "epoch": 0.347941108810674, "grad_norm": 7.312601089477539, "learning_rate": 7.576228106358508e-07, "loss": 0.4197, "step": 6050 }, { "epoch": 0.34851621808143546, "grad_norm": 9.213817596435547, "learning_rate": 7.568241670321675e-07, "loss": 0.4266, "step": 6060 }, { "epoch": 0.34909132735219695, "grad_norm": 6.582385063171387, "learning_rate": 7.56024632359523e-07, "loss": 0.4098, "step": 6070 }, { "epoch": 0.3496664366229584, "grad_norm": 8.709901809692383, "learning_rate": 7.552242093919572e-07, "loss": 0.4328, "step": 6080 }, { "epoch": 0.3502415458937198, "grad_norm": 8.129631042480469, "learning_rate": 7.544229009065925e-07, "loss": 0.4403, "step": 6090 }, { "epoch": 0.35081665516448124, "grad_norm": 8.484320640563965, "learning_rate": 7.53620709683623e-07, "loss": 0.4179, "step": 6100 }, { "epoch": 0.35139176443524267, "grad_norm": 7.56968355178833, "learning_rate": 7.528176385063062e-07, "loss": 0.4431, "step": 6110 }, { "epoch": 0.35196687370600416, "grad_norm": 8.228879928588867, "learning_rate": 7.520136901609521e-07, "loss": 0.4213, "step": 6120 }, { "epoch": 0.3525419829767656, "grad_norm": 6.134531497955322, "learning_rate": 7.512088674369142e-07, "loss": 0.4181, "step": 6130 }, { "epoch": 0.353117092247527, "grad_norm": 7.709410190582275, "learning_rate": 7.504031731265801e-07, "loss": 0.4256, "step": 6140 }, { "epoch": 0.35369220151828845, "grad_norm": 7.92034387588501, "learning_rate": 7.495966100253608e-07, "loss": 0.3956, "step": 6150 }, { "epoch": 0.35426731078904994, "grad_norm": 6.0986528396606445, "learning_rate": 7.487891809316824e-07, "loss": 0.3937, "step": 6160 }, { "epoch": 0.35484242005981137, "grad_norm": 8.144950866699219, "learning_rate": 7.47980888646975e-07, "loss": 0.4101, "step": 6170 }, { "epoch": 0.3554175293305728, "grad_norm": 10.101218223571777, "learning_rate": 7.471717359756638e-07, "loss": 0.3931, "step": 6180 }, { "epoch": 0.35599263860133423, "grad_norm": 8.366281509399414, "learning_rate": 7.463617257251591e-07, "loss": 0.3883, "step": 6190 }, { "epoch": 0.3565677478720957, "grad_norm": 8.332032203674316, "learning_rate": 7.455508607058466e-07, "loss": 0.428, "step": 6200 }, { "epoch": 0.35714285714285715, "grad_norm": 8.367914199829102, "learning_rate": 7.447391437310781e-07, "loss": 0.4335, "step": 6210 }, { "epoch": 0.3577179664136186, "grad_norm": 6.952946186065674, "learning_rate": 7.439265776171611e-07, "loss": 0.4427, "step": 6220 }, { "epoch": 0.35829307568438, "grad_norm": 6.281846523284912, "learning_rate": 7.431131651833485e-07, "loss": 0.368, "step": 6230 }, { "epoch": 0.3588681849551415, "grad_norm": 7.413853645324707, "learning_rate": 7.422989092518308e-07, "loss": 0.4353, "step": 6240 }, { "epoch": 0.35944329422590293, "grad_norm": 10.959481239318848, "learning_rate": 7.414838126477244e-07, "loss": 0.3928, "step": 6250 }, { "epoch": 0.36001840349666436, "grad_norm": 9.352222442626953, "learning_rate": 7.406678781990622e-07, "loss": 0.409, "step": 6260 }, { "epoch": 0.3605935127674258, "grad_norm": 8.50424575805664, "learning_rate": 7.398511087367848e-07, "loss": 0.4183, "step": 6270 }, { "epoch": 0.3611686220381873, "grad_norm": 7.464046001434326, "learning_rate": 7.390335070947292e-07, "loss": 0.4149, "step": 6280 }, { "epoch": 0.3617437313089487, "grad_norm": 6.106412410736084, "learning_rate": 7.382150761096204e-07, "loss": 0.411, "step": 6290 }, { "epoch": 0.36231884057971014, "grad_norm": 6.958272933959961, "learning_rate": 7.373958186210602e-07, "loss": 0.4043, "step": 6300 }, { "epoch": 0.3628939498504716, "grad_norm": 8.150871276855469, "learning_rate": 7.365757374715187e-07, "loss": 0.4327, "step": 6310 }, { "epoch": 0.363469059121233, "grad_norm": 8.361721992492676, "learning_rate": 7.357548355063231e-07, "loss": 0.4281, "step": 6320 }, { "epoch": 0.3640441683919945, "grad_norm": 6.69493293762207, "learning_rate": 7.349331155736489e-07, "loss": 0.4207, "step": 6330 }, { "epoch": 0.3646192776627559, "grad_norm": 6.854587554931641, "learning_rate": 7.341105805245091e-07, "loss": 0.389, "step": 6340 }, { "epoch": 0.36519438693351736, "grad_norm": 7.23399019241333, "learning_rate": 7.332872332127456e-07, "loss": 0.3996, "step": 6350 }, { "epoch": 0.3657694962042788, "grad_norm": 7.066460132598877, "learning_rate": 7.324630764950177e-07, "loss": 0.4279, "step": 6360 }, { "epoch": 0.3663446054750403, "grad_norm": 8.809345245361328, "learning_rate": 7.316381132307936e-07, "loss": 0.3901, "step": 6370 }, { "epoch": 0.3669197147458017, "grad_norm": 9.097206115722656, "learning_rate": 7.308123462823393e-07, "loss": 0.4154, "step": 6380 }, { "epoch": 0.36749482401656314, "grad_norm": 7.902084827423096, "learning_rate": 7.299857785147099e-07, "loss": 0.4065, "step": 6390 }, { "epoch": 0.36806993328732457, "grad_norm": 8.47659683227539, "learning_rate": 7.291584127957383e-07, "loss": 0.3939, "step": 6400 }, { "epoch": 0.36864504255808606, "grad_norm": 8.699891090393066, "learning_rate": 7.283302519960266e-07, "loss": 0.4178, "step": 6410 }, { "epoch": 0.3692201518288475, "grad_norm": 6.71785831451416, "learning_rate": 7.275012989889349e-07, "loss": 0.3984, "step": 6420 }, { "epoch": 0.3697952610996089, "grad_norm": 7.791053295135498, "learning_rate": 7.266715566505722e-07, "loss": 0.4142, "step": 6430 }, { "epoch": 0.37037037037037035, "grad_norm": 6.147428035736084, "learning_rate": 7.258410278597863e-07, "loss": 0.3998, "step": 6440 }, { "epoch": 0.37094547964113184, "grad_norm": 7.686953067779541, "learning_rate": 7.250097154981533e-07, "loss": 0.4084, "step": 6450 }, { "epoch": 0.37152058891189327, "grad_norm": 7.331892013549805, "learning_rate": 7.241776224499684e-07, "loss": 0.4195, "step": 6460 }, { "epoch": 0.3720956981826547, "grad_norm": 8.164823532104492, "learning_rate": 7.233447516022349e-07, "loss": 0.4043, "step": 6470 }, { "epoch": 0.37267080745341613, "grad_norm": 7.713817596435547, "learning_rate": 7.225111058446551e-07, "loss": 0.4073, "step": 6480 }, { "epoch": 0.3732459167241776, "grad_norm": 8.074881553649902, "learning_rate": 7.216766880696198e-07, "loss": 0.4106, "step": 6490 }, { "epoch": 0.37382102599493905, "grad_norm": 10.047574996948242, "learning_rate": 7.208415011721983e-07, "loss": 0.3915, "step": 6500 }, { "epoch": 0.3743961352657005, "grad_norm": 6.859342098236084, "learning_rate": 7.200055480501287e-07, "loss": 0.3783, "step": 6510 }, { "epoch": 0.3749712445364619, "grad_norm": 8.105209350585938, "learning_rate": 7.191688316038072e-07, "loss": 0.3925, "step": 6520 }, { "epoch": 0.3755463538072234, "grad_norm": 8.792828559875488, "learning_rate": 7.183313547362784e-07, "loss": 0.394, "step": 6530 }, { "epoch": 0.37612146307798483, "grad_norm": 8.246500015258789, "learning_rate": 7.174931203532258e-07, "loss": 0.4078, "step": 6540 }, { "epoch": 0.37669657234874626, "grad_norm": 9.437535285949707, "learning_rate": 7.166541313629606e-07, "loss": 0.4194, "step": 6550 }, { "epoch": 0.3772716816195077, "grad_norm": 6.131165981292725, "learning_rate": 7.158143906764122e-07, "loss": 0.4085, "step": 6560 }, { "epoch": 0.3778467908902691, "grad_norm": 8.962398529052734, "learning_rate": 7.149739012071183e-07, "loss": 0.4183, "step": 6570 }, { "epoch": 0.3784219001610306, "grad_norm": 8.934048652648926, "learning_rate": 7.141326658712142e-07, "loss": 0.422, "step": 6580 }, { "epoch": 0.37899700943179204, "grad_norm": 10.42822265625, "learning_rate": 7.132906875874236e-07, "loss": 0.4148, "step": 6590 }, { "epoch": 0.3795721187025535, "grad_norm": 8.330751419067383, "learning_rate": 7.124479692770473e-07, "loss": 0.4328, "step": 6600 }, { "epoch": 0.3801472279733149, "grad_norm": 7.162095069885254, "learning_rate": 7.116045138639542e-07, "loss": 0.4191, "step": 6610 }, { "epoch": 0.3807223372440764, "grad_norm": 6.6673502922058105, "learning_rate": 7.107603242745701e-07, "loss": 0.4469, "step": 6620 }, { "epoch": 0.3812974465148378, "grad_norm": 10.797370910644531, "learning_rate": 7.099154034378686e-07, "loss": 0.3809, "step": 6630 }, { "epoch": 0.38187255578559925, "grad_norm": 9.02065658569336, "learning_rate": 7.0906975428536e-07, "loss": 0.3943, "step": 6640 }, { "epoch": 0.3824476650563607, "grad_norm": 8.190543174743652, "learning_rate": 7.082233797510817e-07, "loss": 0.4058, "step": 6650 }, { "epoch": 0.3830227743271222, "grad_norm": 11.366854667663574, "learning_rate": 7.073762827715879e-07, "loss": 0.4355, "step": 6660 }, { "epoch": 0.3835978835978836, "grad_norm": 8.739253044128418, "learning_rate": 7.065284662859394e-07, "loss": 0.4186, "step": 6670 }, { "epoch": 0.38417299286864504, "grad_norm": 10.615376472473145, "learning_rate": 7.056799332356932e-07, "loss": 0.4233, "step": 6680 }, { "epoch": 0.38474810213940647, "grad_norm": 8.456158638000488, "learning_rate": 7.048306865648928e-07, "loss": 0.4162, "step": 6690 }, { "epoch": 0.38532321141016795, "grad_norm": 10.032965660095215, "learning_rate": 7.039807292200572e-07, "loss": 0.4101, "step": 6700 }, { "epoch": 0.3858983206809294, "grad_norm": 6.33935022354126, "learning_rate": 7.031300641501714e-07, "loss": 0.42, "step": 6710 }, { "epoch": 0.3864734299516908, "grad_norm": 6.527713775634766, "learning_rate": 7.022786943066759e-07, "loss": 0.3903, "step": 6720 }, { "epoch": 0.38704853922245225, "grad_norm": 8.37816047668457, "learning_rate": 7.014266226434564e-07, "loss": 0.4073, "step": 6730 }, { "epoch": 0.38762364849321373, "grad_norm": 7.82682466506958, "learning_rate": 7.005738521168338e-07, "loss": 0.453, "step": 6740 }, { "epoch": 0.38819875776397517, "grad_norm": 8.559891700744629, "learning_rate": 6.997203856855534e-07, "loss": 0.385, "step": 6750 }, { "epoch": 0.3887738670347366, "grad_norm": 8.251172065734863, "learning_rate": 6.988662263107754e-07, "loss": 0.4087, "step": 6760 }, { "epoch": 0.38934897630549803, "grad_norm": 6.502833843231201, "learning_rate": 6.980113769560638e-07, "loss": 0.3961, "step": 6770 }, { "epoch": 0.3899240855762595, "grad_norm": 7.490931987762451, "learning_rate": 6.971558405873768e-07, "loss": 0.4443, "step": 6780 }, { "epoch": 0.39049919484702095, "grad_norm": 7.985132694244385, "learning_rate": 6.962996201730561e-07, "loss": 0.3899, "step": 6790 }, { "epoch": 0.3910743041177824, "grad_norm": 8.339662551879883, "learning_rate": 6.954427186838169e-07, "loss": 0.4116, "step": 6800 }, { "epoch": 0.3916494133885438, "grad_norm": 7.270541191101074, "learning_rate": 6.945851390927374e-07, "loss": 0.427, "step": 6810 }, { "epoch": 0.39222452265930524, "grad_norm": 8.568276405334473, "learning_rate": 6.937268843752484e-07, "loss": 0.4083, "step": 6820 }, { "epoch": 0.39279963193006673, "grad_norm": 6.397177219390869, "learning_rate": 6.928679575091231e-07, "loss": 0.3848, "step": 6830 }, { "epoch": 0.39337474120082816, "grad_norm": 9.292791366577148, "learning_rate": 6.92008361474467e-07, "loss": 0.4397, "step": 6840 }, { "epoch": 0.3939498504715896, "grad_norm": 8.376216888427734, "learning_rate": 6.91148099253707e-07, "loss": 0.3918, "step": 6850 }, { "epoch": 0.394524959742351, "grad_norm": 6.551718235015869, "learning_rate": 6.902871738315817e-07, "loss": 0.4113, "step": 6860 }, { "epoch": 0.3951000690131125, "grad_norm": 6.759249687194824, "learning_rate": 6.894255881951305e-07, "loss": 0.377, "step": 6870 }, { "epoch": 0.39567517828387394, "grad_norm": 6.7694902420043945, "learning_rate": 6.885633453336834e-07, "loss": 0.3768, "step": 6880 }, { "epoch": 0.39625028755463537, "grad_norm": 7.024812698364258, "learning_rate": 6.877004482388509e-07, "loss": 0.3902, "step": 6890 }, { "epoch": 0.3968253968253968, "grad_norm": 7.190764427185059, "learning_rate": 6.868368999045133e-07, "loss": 0.3795, "step": 6900 }, { "epoch": 0.3974005060961583, "grad_norm": 8.456724166870117, "learning_rate": 6.859727033268107e-07, "loss": 0.4007, "step": 6910 }, { "epoch": 0.3979756153669197, "grad_norm": 10.534616470336914, "learning_rate": 6.851078615041317e-07, "loss": 0.3885, "step": 6920 }, { "epoch": 0.39855072463768115, "grad_norm": 8.120814323425293, "learning_rate": 6.842423774371039e-07, "loss": 0.3854, "step": 6930 }, { "epoch": 0.3991258339084426, "grad_norm": 7.934427261352539, "learning_rate": 6.833762541285836e-07, "loss": 0.4156, "step": 6940 }, { "epoch": 0.39970094317920407, "grad_norm": 9.128240585327148, "learning_rate": 6.825094945836445e-07, "loss": 0.4258, "step": 6950 }, { "epoch": 0.4002760524499655, "grad_norm": 7.29741907119751, "learning_rate": 6.81642101809568e-07, "loss": 0.3992, "step": 6960 }, { "epoch": 0.40085116172072693, "grad_norm": 6.112662315368652, "learning_rate": 6.807740788158327e-07, "loss": 0.4203, "step": 6970 }, { "epoch": 0.40142627099148837, "grad_norm": 8.280617713928223, "learning_rate": 6.799054286141033e-07, "loss": 0.4054, "step": 6980 }, { "epoch": 0.40200138026224985, "grad_norm": 8.349048614501953, "learning_rate": 6.790361542182208e-07, "loss": 0.4138, "step": 6990 }, { "epoch": 0.4025764895330113, "grad_norm": 9.218025207519531, "learning_rate": 6.781662586441923e-07, "loss": 0.402, "step": 7000 }, { "epoch": 0.4031515988037727, "grad_norm": 7.454174518585205, "learning_rate": 6.772957449101798e-07, "loss": 0.4029, "step": 7010 }, { "epoch": 0.40372670807453415, "grad_norm": 7.731265544891357, "learning_rate": 6.764246160364897e-07, "loss": 0.4174, "step": 7020 }, { "epoch": 0.40430181734529563, "grad_norm": 7.531976222991943, "learning_rate": 6.755528750455633e-07, "loss": 0.3906, "step": 7030 }, { "epoch": 0.40487692661605706, "grad_norm": 9.454731941223145, "learning_rate": 6.746805249619655e-07, "loss": 0.4149, "step": 7040 }, { "epoch": 0.4054520358868185, "grad_norm": 8.5861177444458, "learning_rate": 6.738075688123743e-07, "loss": 0.4003, "step": 7050 }, { "epoch": 0.4060271451575799, "grad_norm": 5.850071907043457, "learning_rate": 6.729340096255706e-07, "loss": 0.4091, "step": 7060 }, { "epoch": 0.40660225442834136, "grad_norm": 7.688616752624512, "learning_rate": 6.720598504324275e-07, "loss": 0.4207, "step": 7070 }, { "epoch": 0.40717736369910285, "grad_norm": 9.393671989440918, "learning_rate": 6.711850942659e-07, "loss": 0.4016, "step": 7080 }, { "epoch": 0.4077524729698643, "grad_norm": 9.653244018554688, "learning_rate": 6.703097441610143e-07, "loss": 0.4047, "step": 7090 }, { "epoch": 0.4083275822406257, "grad_norm": 9.060815811157227, "learning_rate": 6.694338031548572e-07, "loss": 0.4173, "step": 7100 }, { "epoch": 0.40890269151138714, "grad_norm": 8.815903663635254, "learning_rate": 6.685572742865658e-07, "loss": 0.4123, "step": 7110 }, { "epoch": 0.4094778007821486, "grad_norm": 9.129491806030273, "learning_rate": 6.676801605973169e-07, "loss": 0.4164, "step": 7120 }, { "epoch": 0.41005291005291006, "grad_norm": 8.948802947998047, "learning_rate": 6.66802465130316e-07, "loss": 0.3897, "step": 7130 }, { "epoch": 0.4106280193236715, "grad_norm": 9.767799377441406, "learning_rate": 6.659241909307877e-07, "loss": 0.3913, "step": 7140 }, { "epoch": 0.4112031285944329, "grad_norm": 9.538566589355469, "learning_rate": 6.650453410459637e-07, "loss": 0.4145, "step": 7150 }, { "epoch": 0.4117782378651944, "grad_norm": 7.445476531982422, "learning_rate": 6.641659185250743e-07, "loss": 0.4036, "step": 7160 }, { "epoch": 0.41235334713595584, "grad_norm": 9.46981143951416, "learning_rate": 6.632859264193355e-07, "loss": 0.3991, "step": 7170 }, { "epoch": 0.41292845640671727, "grad_norm": 7.189068794250488, "learning_rate": 6.624053677819398e-07, "loss": 0.4086, "step": 7180 }, { "epoch": 0.4135035656774787, "grad_norm": 7.383470058441162, "learning_rate": 6.615242456680457e-07, "loss": 0.4041, "step": 7190 }, { "epoch": 0.4140786749482402, "grad_norm": 7.74267578125, "learning_rate": 6.606425631347664e-07, "loss": 0.4077, "step": 7200 }, { "epoch": 0.4146537842190016, "grad_norm": 7.086597919464111, "learning_rate": 6.597603232411596e-07, "loss": 0.4193, "step": 7210 }, { "epoch": 0.41522889348976305, "grad_norm": 9.035558700561523, "learning_rate": 6.588775290482167e-07, "loss": 0.4261, "step": 7220 }, { "epoch": 0.4158040027605245, "grad_norm": 9.277562141418457, "learning_rate": 6.579941836188523e-07, "loss": 0.4155, "step": 7230 }, { "epoch": 0.41637911203128597, "grad_norm": 6.792115211486816, "learning_rate": 6.571102900178938e-07, "loss": 0.384, "step": 7240 }, { "epoch": 0.4169542213020474, "grad_norm": 8.839338302612305, "learning_rate": 6.562258513120699e-07, "loss": 0.3844, "step": 7250 }, { "epoch": 0.41752933057280883, "grad_norm": 7.84649133682251, "learning_rate": 6.553408705700017e-07, "loss": 0.409, "step": 7260 }, { "epoch": 0.41810443984357026, "grad_norm": 8.922412872314453, "learning_rate": 6.544553508621897e-07, "loss": 0.412, "step": 7270 }, { "epoch": 0.41867954911433175, "grad_norm": 10.495413780212402, "learning_rate": 6.535692952610051e-07, "loss": 0.3986, "step": 7280 }, { "epoch": 0.4192546583850932, "grad_norm": 7.6890950202941895, "learning_rate": 6.526827068406783e-07, "loss": 0.4157, "step": 7290 }, { "epoch": 0.4198297676558546, "grad_norm": 12.510689735412598, "learning_rate": 6.517955886772879e-07, "loss": 0.4095, "step": 7300 }, { "epoch": 0.42040487692661604, "grad_norm": 7.5043816566467285, "learning_rate": 6.509079438487514e-07, "loss": 0.3667, "step": 7310 }, { "epoch": 0.4209799861973775, "grad_norm": 6.709923267364502, "learning_rate": 6.500197754348127e-07, "loss": 0.3706, "step": 7320 }, { "epoch": 0.42155509546813896, "grad_norm": 9.896550178527832, "learning_rate": 6.491310865170327e-07, "loss": 0.4454, "step": 7330 }, { "epoch": 0.4221302047389004, "grad_norm": 7.701712608337402, "learning_rate": 6.482418801787784e-07, "loss": 0.4075, "step": 7340 }, { "epoch": 0.4227053140096618, "grad_norm": 9.607941627502441, "learning_rate": 6.473521595052116e-07, "loss": 0.3972, "step": 7350 }, { "epoch": 0.42328042328042326, "grad_norm": 8.256829261779785, "learning_rate": 6.46461927583279e-07, "loss": 0.4101, "step": 7360 }, { "epoch": 0.42385553255118474, "grad_norm": 8.89173698425293, "learning_rate": 6.45571187501701e-07, "loss": 0.4079, "step": 7370 }, { "epoch": 0.4244306418219462, "grad_norm": 8.207355499267578, "learning_rate": 6.446799423509608e-07, "loss": 0.3879, "step": 7380 }, { "epoch": 0.4250057510927076, "grad_norm": 8.09197998046875, "learning_rate": 6.437881952232947e-07, "loss": 0.405, "step": 7390 }, { "epoch": 0.42558086036346904, "grad_norm": 9.39001750946045, "learning_rate": 6.428959492126795e-07, "loss": 0.4131, "step": 7400 }, { "epoch": 0.4261559696342305, "grad_norm": 8.552804946899414, "learning_rate": 6.420032074148243e-07, "loss": 0.4097, "step": 7410 }, { "epoch": 0.42673107890499196, "grad_norm": 8.793439865112305, "learning_rate": 6.411099729271571e-07, "loss": 0.3831, "step": 7420 }, { "epoch": 0.4273061881757534, "grad_norm": 12.047957420349121, "learning_rate": 6.402162488488162e-07, "loss": 0.4, "step": 7430 }, { "epoch": 0.4278812974465148, "grad_norm": 9.54802417755127, "learning_rate": 6.393220382806382e-07, "loss": 0.3909, "step": 7440 }, { "epoch": 0.4284564067172763, "grad_norm": 9.151510238647461, "learning_rate": 6.384273443251472e-07, "loss": 0.3888, "step": 7450 }, { "epoch": 0.42903151598803774, "grad_norm": 8.276642799377441, "learning_rate": 6.375321700865454e-07, "loss": 0.391, "step": 7460 }, { "epoch": 0.42960662525879917, "grad_norm": 6.7039947509765625, "learning_rate": 6.366365186707009e-07, "loss": 0.3942, "step": 7470 }, { "epoch": 0.4301817345295606, "grad_norm": 8.672518730163574, "learning_rate": 6.357403931851369e-07, "loss": 0.4017, "step": 7480 }, { "epoch": 0.4307568438003221, "grad_norm": 11.273883819580078, "learning_rate": 6.348437967390224e-07, "loss": 0.4076, "step": 7490 }, { "epoch": 0.4313319530710835, "grad_norm": 8.661757469177246, "learning_rate": 6.339467324431596e-07, "loss": 0.4227, "step": 7500 }, { "epoch": 0.43190706234184495, "grad_norm": 8.627997398376465, "learning_rate": 6.330492034099742e-07, "loss": 0.4024, "step": 7510 }, { "epoch": 0.4324821716126064, "grad_norm": 7.375794410705566, "learning_rate": 6.321512127535045e-07, "loss": 0.4066, "step": 7520 }, { "epoch": 0.4330572808833678, "grad_norm": 8.933667182922363, "learning_rate": 6.312527635893904e-07, "loss": 0.4068, "step": 7530 }, { "epoch": 0.4336323901541293, "grad_norm": 11.272631645202637, "learning_rate": 6.303538590348624e-07, "loss": 0.3951, "step": 7540 }, { "epoch": 0.43420749942489073, "grad_norm": 9.46224308013916, "learning_rate": 6.294545022087314e-07, "loss": 0.3805, "step": 7550 }, { "epoch": 0.43478260869565216, "grad_norm": 9.46367359161377, "learning_rate": 6.285546962313768e-07, "loss": 0.4001, "step": 7560 }, { "epoch": 0.4353577179664136, "grad_norm": 8.426703453063965, "learning_rate": 6.276544442247373e-07, "loss": 0.401, "step": 7570 }, { "epoch": 0.4359328272371751, "grad_norm": 8.663817405700684, "learning_rate": 6.267537493122983e-07, "loss": 0.3901, "step": 7580 }, { "epoch": 0.4365079365079365, "grad_norm": 8.724206924438477, "learning_rate": 6.258526146190823e-07, "loss": 0.3599, "step": 7590 }, { "epoch": 0.43708304577869794, "grad_norm": 10.957181930541992, "learning_rate": 6.249510432716373e-07, "loss": 0.3788, "step": 7600 }, { "epoch": 0.4376581550494594, "grad_norm": 9.13005256652832, "learning_rate": 6.240490383980271e-07, "loss": 0.4307, "step": 7610 }, { "epoch": 0.43823326432022086, "grad_norm": 7.380809307098389, "learning_rate": 6.231466031278189e-07, "loss": 0.4072, "step": 7620 }, { "epoch": 0.4388083735909823, "grad_norm": 8.344758033752441, "learning_rate": 6.222437405920733e-07, "loss": 0.3988, "step": 7630 }, { "epoch": 0.4393834828617437, "grad_norm": 6.267268657684326, "learning_rate": 6.213404539233335e-07, "loss": 0.3786, "step": 7640 }, { "epoch": 0.43995859213250516, "grad_norm": 8.896737098693848, "learning_rate": 6.204367462556142e-07, "loss": 0.389, "step": 7650 }, { "epoch": 0.44053370140326664, "grad_norm": 6.51915979385376, "learning_rate": 6.19532620724391e-07, "loss": 0.4044, "step": 7660 }, { "epoch": 0.4411088106740281, "grad_norm": 8.525016784667969, "learning_rate": 6.186280804665885e-07, "loss": 0.3825, "step": 7670 }, { "epoch": 0.4416839199447895, "grad_norm": 7.309818744659424, "learning_rate": 6.177231286205713e-07, "loss": 0.3869, "step": 7680 }, { "epoch": 0.44225902921555094, "grad_norm": 8.411114692687988, "learning_rate": 6.168177683261316e-07, "loss": 0.3836, "step": 7690 }, { "epoch": 0.4428341384863124, "grad_norm": 8.353103637695312, "learning_rate": 6.159120027244783e-07, "loss": 0.3868, "step": 7700 }, { "epoch": 0.44340924775707385, "grad_norm": 8.293525695800781, "learning_rate": 6.150058349582272e-07, "loss": 0.4136, "step": 7710 }, { "epoch": 0.4439843570278353, "grad_norm": 9.89013671875, "learning_rate": 6.140992681713892e-07, "loss": 0.3887, "step": 7720 }, { "epoch": 0.4445594662985967, "grad_norm": 7.847920894622803, "learning_rate": 6.131923055093593e-07, "loss": 0.3799, "step": 7730 }, { "epoch": 0.4451345755693582, "grad_norm": 7.311437129974365, "learning_rate": 6.122849501189065e-07, "loss": 0.3907, "step": 7740 }, { "epoch": 0.44570968484011964, "grad_norm": 9.555363655090332, "learning_rate": 6.113772051481621e-07, "loss": 0.384, "step": 7750 }, { "epoch": 0.44628479411088107, "grad_norm": 8.55188274383545, "learning_rate": 6.104690737466094e-07, "loss": 0.4144, "step": 7760 }, { "epoch": 0.4468599033816425, "grad_norm": 8.475045204162598, "learning_rate": 6.095605590650721e-07, "loss": 0.3946, "step": 7770 }, { "epoch": 0.44743501265240393, "grad_norm": 7.609455108642578, "learning_rate": 6.086516642557037e-07, "loss": 0.3792, "step": 7780 }, { "epoch": 0.4480101219231654, "grad_norm": 7.721874237060547, "learning_rate": 6.07742392471977e-07, "loss": 0.395, "step": 7790 }, { "epoch": 0.44858523119392685, "grad_norm": 8.212798118591309, "learning_rate": 6.068327468686721e-07, "loss": 0.3963, "step": 7800 }, { "epoch": 0.4491603404646883, "grad_norm": 7.176240921020508, "learning_rate": 6.059227306018668e-07, "loss": 0.3954, "step": 7810 }, { "epoch": 0.4497354497354497, "grad_norm": 9.372278213500977, "learning_rate": 6.050123468289246e-07, "loss": 0.3871, "step": 7820 }, { "epoch": 0.4503105590062112, "grad_norm": 10.033339500427246, "learning_rate": 6.041015987084836e-07, "loss": 0.3982, "step": 7830 }, { "epoch": 0.45088566827697263, "grad_norm": 8.569169998168945, "learning_rate": 6.03190489400447e-07, "loss": 0.3731, "step": 7840 }, { "epoch": 0.45146077754773406, "grad_norm": 7.567975997924805, "learning_rate": 6.022790220659705e-07, "loss": 0.379, "step": 7850 }, { "epoch": 0.4520358868184955, "grad_norm": 8.770020484924316, "learning_rate": 6.013671998674524e-07, "loss": 0.4134, "step": 7860 }, { "epoch": 0.452610996089257, "grad_norm": 7.939765453338623, "learning_rate": 6.004550259685216e-07, "loss": 0.4133, "step": 7870 }, { "epoch": 0.4531861053600184, "grad_norm": 8.965749740600586, "learning_rate": 5.995425035340278e-07, "loss": 0.415, "step": 7880 }, { "epoch": 0.45376121463077984, "grad_norm": 8.472042083740234, "learning_rate": 5.9862963573003e-07, "loss": 0.3969, "step": 7890 }, { "epoch": 0.4543363239015413, "grad_norm": 9.622425079345703, "learning_rate": 5.97716425723785e-07, "loss": 0.3945, "step": 7900 }, { "epoch": 0.45491143317230276, "grad_norm": 9.959004402160645, "learning_rate": 5.968028766837374e-07, "loss": 0.3863, "step": 7910 }, { "epoch": 0.4554865424430642, "grad_norm": 9.1616792678833, "learning_rate": 5.958889917795079e-07, "loss": 0.3869, "step": 7920 }, { "epoch": 0.4560616517138256, "grad_norm": 8.373703002929688, "learning_rate": 5.949747741818823e-07, "loss": 0.4276, "step": 7930 }, { "epoch": 0.45663676098458705, "grad_norm": 10.7528076171875, "learning_rate": 5.940602270628012e-07, "loss": 0.4122, "step": 7940 }, { "epoch": 0.45721187025534854, "grad_norm": 10.298049926757812, "learning_rate": 5.931453535953479e-07, "loss": 0.414, "step": 7950 }, { "epoch": 0.45778697952610997, "grad_norm": 9.817627906799316, "learning_rate": 5.922301569537385e-07, "loss": 0.4048, "step": 7960 }, { "epoch": 0.4583620887968714, "grad_norm": 5.534698009490967, "learning_rate": 5.9131464031331e-07, "loss": 0.4107, "step": 7970 }, { "epoch": 0.45893719806763283, "grad_norm": 7.931804656982422, "learning_rate": 5.903988068505099e-07, "loss": 0.3755, "step": 7980 }, { "epoch": 0.4595123073383943, "grad_norm": 6.034002780914307, "learning_rate": 5.894826597428848e-07, "loss": 0.436, "step": 7990 }, { "epoch": 0.46008741660915575, "grad_norm": 9.296198844909668, "learning_rate": 5.885662021690698e-07, "loss": 0.3808, "step": 8000 }, { "epoch": 0.4606625258799172, "grad_norm": 7.4406514167785645, "learning_rate": 5.876494373087765e-07, "loss": 0.3917, "step": 8010 }, { "epoch": 0.4612376351506786, "grad_norm": 8.423439025878906, "learning_rate": 5.867323683427835e-07, "loss": 0.4036, "step": 8020 }, { "epoch": 0.46181274442144005, "grad_norm": 9.315818786621094, "learning_rate": 5.858149984529238e-07, "loss": 0.3826, "step": 8030 }, { "epoch": 0.46238785369220153, "grad_norm": 9.049417495727539, "learning_rate": 5.848973308220752e-07, "loss": 0.3773, "step": 8040 }, { "epoch": 0.46296296296296297, "grad_norm": 7.655517578125, "learning_rate": 5.839793686341476e-07, "loss": 0.3894, "step": 8050 }, { "epoch": 0.4635380722337244, "grad_norm": 9.614046096801758, "learning_rate": 5.830611150740738e-07, "loss": 0.3798, "step": 8060 }, { "epoch": 0.4641131815044858, "grad_norm": 7.4124274253845215, "learning_rate": 5.821425733277972e-07, "loss": 0.3984, "step": 8070 }, { "epoch": 0.4646882907752473, "grad_norm": 12.54110050201416, "learning_rate": 5.812237465822607e-07, "loss": 0.3722, "step": 8080 }, { "epoch": 0.46526340004600875, "grad_norm": 7.983209133148193, "learning_rate": 5.803046380253967e-07, "loss": 0.3827, "step": 8090 }, { "epoch": 0.4658385093167702, "grad_norm": 9.480877876281738, "learning_rate": 5.793852508461145e-07, "loss": 0.3785, "step": 8100 }, { "epoch": 0.4664136185875316, "grad_norm": 7.611504077911377, "learning_rate": 5.784655882342912e-07, "loss": 0.3951, "step": 8110 }, { "epoch": 0.4669887278582931, "grad_norm": 7.177801132202148, "learning_rate": 5.775456533807587e-07, "loss": 0.3871, "step": 8120 }, { "epoch": 0.4675638371290545, "grad_norm": 8.323403358459473, "learning_rate": 5.766254494772935e-07, "loss": 0.4148, "step": 8130 }, { "epoch": 0.46813894639981596, "grad_norm": 7.47517204284668, "learning_rate": 5.757049797166062e-07, "loss": 0.4216, "step": 8140 }, { "epoch": 0.4687140556705774, "grad_norm": 8.74323558807373, "learning_rate": 5.747842472923291e-07, "loss": 0.4042, "step": 8150 }, { "epoch": 0.4692891649413389, "grad_norm": 7.652016639709473, "learning_rate": 5.738632553990062e-07, "loss": 0.388, "step": 8160 }, { "epoch": 0.4698642742121003, "grad_norm": 7.905698776245117, "learning_rate": 5.729420072320818e-07, "loss": 0.3801, "step": 8170 }, { "epoch": 0.47043938348286174, "grad_norm": 10.23285961151123, "learning_rate": 5.720205059878891e-07, "loss": 0.4109, "step": 8180 }, { "epoch": 0.47101449275362317, "grad_norm": 8.718255043029785, "learning_rate": 5.710987548636396e-07, "loss": 0.3994, "step": 8190 }, { "epoch": 0.47158960202438466, "grad_norm": 6.8897223472595215, "learning_rate": 5.701767570574115e-07, "loss": 0.4082, "step": 8200 }, { "epoch": 0.4721647112951461, "grad_norm": 8.031255722045898, "learning_rate": 5.692545157681393e-07, "loss": 0.3847, "step": 8210 }, { "epoch": 0.4727398205659075, "grad_norm": 6.121545791625977, "learning_rate": 5.68332034195602e-07, "loss": 0.3948, "step": 8220 }, { "epoch": 0.47331492983666895, "grad_norm": 6.5809502601623535, "learning_rate": 5.674093155404123e-07, "loss": 0.3649, "step": 8230 }, { "epoch": 0.47389003910743044, "grad_norm": 9.008655548095703, "learning_rate": 5.664863630040054e-07, "loss": 0.3948, "step": 8240 }, { "epoch": 0.47446514837819187, "grad_norm": 10.445096015930176, "learning_rate": 5.655631797886281e-07, "loss": 0.4024, "step": 8250 }, { "epoch": 0.4750402576489533, "grad_norm": 7.081674098968506, "learning_rate": 5.646397690973276e-07, "loss": 0.3697, "step": 8260 }, { "epoch": 0.47561536691971473, "grad_norm": 8.034218788146973, "learning_rate": 5.6371613413394e-07, "loss": 0.3936, "step": 8270 }, { "epoch": 0.47619047619047616, "grad_norm": 6.757801055908203, "learning_rate": 5.627922781030803e-07, "loss": 0.4169, "step": 8280 }, { "epoch": 0.47676558546123765, "grad_norm": 7.660385608673096, "learning_rate": 5.618682042101297e-07, "loss": 0.3708, "step": 8290 }, { "epoch": 0.4773406947319991, "grad_norm": 8.661242485046387, "learning_rate": 5.609439156612254e-07, "loss": 0.4001, "step": 8300 }, { "epoch": 0.4779158040027605, "grad_norm": 8.182344436645508, "learning_rate": 5.600194156632499e-07, "loss": 0.3877, "step": 8310 }, { "epoch": 0.47849091327352195, "grad_norm": 10.07321548461914, "learning_rate": 5.590947074238188e-07, "loss": 0.3942, "step": 8320 }, { "epoch": 0.47906602254428343, "grad_norm": 8.564963340759277, "learning_rate": 5.581697941512704e-07, "loss": 0.3887, "step": 8330 }, { "epoch": 0.47964113181504486, "grad_norm": 9.38545036315918, "learning_rate": 5.572446790546545e-07, "loss": 0.4143, "step": 8340 }, { "epoch": 0.4802162410858063, "grad_norm": 9.876385688781738, "learning_rate": 5.563193653437207e-07, "loss": 0.3871, "step": 8350 }, { "epoch": 0.4807913503565677, "grad_norm": 8.650846481323242, "learning_rate": 5.553938562289085e-07, "loss": 0.4126, "step": 8360 }, { "epoch": 0.4813664596273292, "grad_norm": 9.579472541809082, "learning_rate": 5.544681549213344e-07, "loss": 0.3749, "step": 8370 }, { "epoch": 0.48194156889809064, "grad_norm": 8.464648246765137, "learning_rate": 5.535422646327825e-07, "loss": 0.3922, "step": 8380 }, { "epoch": 0.4825166781688521, "grad_norm": 7.547084331512451, "learning_rate": 5.526161885756924e-07, "loss": 0.3811, "step": 8390 }, { "epoch": 0.4830917874396135, "grad_norm": 6.809798717498779, "learning_rate": 5.516899299631477e-07, "loss": 0.409, "step": 8400 }, { "epoch": 0.483666896710375, "grad_norm": 7.388305187225342, "learning_rate": 5.507634920088662e-07, "loss": 0.3819, "step": 8410 }, { "epoch": 0.4842420059811364, "grad_norm": 7.151841640472412, "learning_rate": 5.498368779271873e-07, "loss": 0.4046, "step": 8420 }, { "epoch": 0.48481711525189786, "grad_norm": 8.391980171203613, "learning_rate": 5.489100909330622e-07, "loss": 0.3818, "step": 8430 }, { "epoch": 0.4853922245226593, "grad_norm": 8.18085765838623, "learning_rate": 5.479831342420411e-07, "loss": 0.38, "step": 8440 }, { "epoch": 0.4859673337934208, "grad_norm": 9.31074047088623, "learning_rate": 5.470560110702636e-07, "loss": 0.3734, "step": 8450 }, { "epoch": 0.4865424430641822, "grad_norm": 11.207425117492676, "learning_rate": 5.461287246344468e-07, "loss": 0.4282, "step": 8460 }, { "epoch": 0.48711755233494364, "grad_norm": 8.129477500915527, "learning_rate": 5.452012781518742e-07, "loss": 0.3791, "step": 8470 }, { "epoch": 0.48769266160570507, "grad_norm": 9.150031089782715, "learning_rate": 5.442736748403846e-07, "loss": 0.3801, "step": 8480 }, { "epoch": 0.48826777087646656, "grad_norm": 7.559999942779541, "learning_rate": 5.433459179183611e-07, "loss": 0.3656, "step": 8490 }, { "epoch": 0.488842880147228, "grad_norm": 10.16154670715332, "learning_rate": 5.424180106047194e-07, "loss": 0.3844, "step": 8500 }, { "epoch": 0.4894179894179894, "grad_norm": 8.556404113769531, "learning_rate": 5.414899561188973e-07, "loss": 0.3638, "step": 8510 }, { "epoch": 0.48999309868875085, "grad_norm": 10.767192840576172, "learning_rate": 5.40561757680843e-07, "loss": 0.3718, "step": 8520 }, { "epoch": 0.4905682079595123, "grad_norm": 8.927427291870117, "learning_rate": 5.396334185110045e-07, "loss": 0.3713, "step": 8530 }, { "epoch": 0.49114331723027377, "grad_norm": 8.367263793945312, "learning_rate": 5.387049418303178e-07, "loss": 0.3776, "step": 8540 }, { "epoch": 0.4917184265010352, "grad_norm": 10.070588111877441, "learning_rate": 5.377763308601958e-07, "loss": 0.3932, "step": 8550 }, { "epoch": 0.49229353577179663, "grad_norm": 5.940333843231201, "learning_rate": 5.368475888225179e-07, "loss": 0.3722, "step": 8560 }, { "epoch": 0.49286864504255806, "grad_norm": 7.292755126953125, "learning_rate": 5.359187189396177e-07, "loss": 0.3992, "step": 8570 }, { "epoch": 0.49344375431331955, "grad_norm": 8.8858060836792, "learning_rate": 5.349897244342729e-07, "loss": 0.4182, "step": 8580 }, { "epoch": 0.494018863584081, "grad_norm": 7.709681034088135, "learning_rate": 5.340606085296933e-07, "loss": 0.3865, "step": 8590 }, { "epoch": 0.4945939728548424, "grad_norm": 11.981656074523926, "learning_rate": 5.331313744495093e-07, "loss": 0.3618, "step": 8600 }, { "epoch": 0.49516908212560384, "grad_norm": 9.989206314086914, "learning_rate": 5.322020254177625e-07, "loss": 0.4208, "step": 8610 }, { "epoch": 0.49574419139636533, "grad_norm": 10.450528144836426, "learning_rate": 5.312725646588923e-07, "loss": 0.3976, "step": 8620 }, { "epoch": 0.49631930066712676, "grad_norm": 8.344999313354492, "learning_rate": 5.303429953977268e-07, "loss": 0.3801, "step": 8630 }, { "epoch": 0.4968944099378882, "grad_norm": 9.25851058959961, "learning_rate": 5.294133208594694e-07, "loss": 0.3899, "step": 8640 }, { "epoch": 0.4974695192086496, "grad_norm": 9.457186698913574, "learning_rate": 5.284835442696895e-07, "loss": 0.4129, "step": 8650 }, { "epoch": 0.4980446284794111, "grad_norm": 8.049318313598633, "learning_rate": 5.275536688543105e-07, "loss": 0.3927, "step": 8660 }, { "epoch": 0.49861973775017254, "grad_norm": 8.051761627197266, "learning_rate": 5.266236978395982e-07, "loss": 0.3932, "step": 8670 }, { "epoch": 0.499194847020934, "grad_norm": 9.402393341064453, "learning_rate": 5.256936344521508e-07, "loss": 0.4086, "step": 8680 }, { "epoch": 0.4997699562916954, "grad_norm": 10.095841407775879, "learning_rate": 5.247634819188867e-07, "loss": 0.3769, "step": 8690 }, { "epoch": 0.5003450655624568, "grad_norm": 9.891613006591797, "learning_rate": 5.238332434670331e-07, "loss": 0.4031, "step": 8700 }, { "epoch": 0.5009201748332183, "grad_norm": 9.820318222045898, "learning_rate": 5.22902922324116e-07, "loss": 0.3864, "step": 8710 }, { "epoch": 0.5014952841039798, "grad_norm": 9.479342460632324, "learning_rate": 5.219725217179483e-07, "loss": 0.3882, "step": 8720 }, { "epoch": 0.5020703933747412, "grad_norm": 9.338078498840332, "learning_rate": 5.21042044876618e-07, "loss": 0.3715, "step": 8730 }, { "epoch": 0.5026455026455027, "grad_norm": 9.074222564697266, "learning_rate": 5.201114950284782e-07, "loss": 0.3667, "step": 8740 }, { "epoch": 0.5032206119162641, "grad_norm": 8.187487602233887, "learning_rate": 5.191808754021347e-07, "loss": 0.3853, "step": 8750 }, { "epoch": 0.5037957211870255, "grad_norm": 9.177314758300781, "learning_rate": 5.182501892264362e-07, "loss": 0.3972, "step": 8760 }, { "epoch": 0.504370830457787, "grad_norm": 10.114627838134766, "learning_rate": 5.173194397304614e-07, "loss": 0.3869, "step": 8770 }, { "epoch": 0.5049459397285484, "grad_norm": 20.078929901123047, "learning_rate": 5.163886301435096e-07, "loss": 0.3718, "step": 8780 }, { "epoch": 0.5055210489993098, "grad_norm": 8.296409606933594, "learning_rate": 5.154577636950879e-07, "loss": 0.364, "step": 8790 }, { "epoch": 0.5060961582700713, "grad_norm": 8.666600227355957, "learning_rate": 5.14526843614901e-07, "loss": 0.3941, "step": 8800 }, { "epoch": 0.5066712675408328, "grad_norm": 8.099319458007812, "learning_rate": 5.135958731328393e-07, "loss": 0.3685, "step": 8810 }, { "epoch": 0.5072463768115942, "grad_norm": 11.752509117126465, "learning_rate": 5.126648554789687e-07, "loss": 0.4249, "step": 8820 }, { "epoch": 0.5078214860823557, "grad_norm": 7.84968900680542, "learning_rate": 5.117337938835186e-07, "loss": 0.3911, "step": 8830 }, { "epoch": 0.5083965953531171, "grad_norm": 10.39072036743164, "learning_rate": 5.108026915768703e-07, "loss": 0.4048, "step": 8840 }, { "epoch": 0.5089717046238785, "grad_norm": 9.711438179016113, "learning_rate": 5.098715517895467e-07, "loss": 0.3983, "step": 8850 }, { "epoch": 0.50954681389464, "grad_norm": 8.864706993103027, "learning_rate": 5.089403777522012e-07, "loss": 0.3793, "step": 8860 }, { "epoch": 0.5101219231654014, "grad_norm": 8.130143165588379, "learning_rate": 5.080091726956053e-07, "loss": 0.3739, "step": 8870 }, { "epoch": 0.5106970324361628, "grad_norm": 9.084704399108887, "learning_rate": 5.070779398506389e-07, "loss": 0.3884, "step": 8880 }, { "epoch": 0.5112721417069244, "grad_norm": 7.390566825866699, "learning_rate": 5.061466824482772e-07, "loss": 0.4181, "step": 8890 }, { "epoch": 0.5118472509776858, "grad_norm": 9.379162788391113, "learning_rate": 5.052154037195816e-07, "loss": 0.3846, "step": 8900 }, { "epoch": 0.5124223602484472, "grad_norm": 7.616793155670166, "learning_rate": 5.042841068956871e-07, "loss": 0.3681, "step": 8910 }, { "epoch": 0.5129974695192087, "grad_norm": 10.172737121582031, "learning_rate": 5.033527952077916e-07, "loss": 0.3856, "step": 8920 }, { "epoch": 0.5135725787899701, "grad_norm": 8.443028450012207, "learning_rate": 5.024214718871446e-07, "loss": 0.3651, "step": 8930 }, { "epoch": 0.5141476880607315, "grad_norm": 8.353960037231445, "learning_rate": 5.014901401650358e-07, "loss": 0.3892, "step": 8940 }, { "epoch": 0.514722797331493, "grad_norm": 11.373422622680664, "learning_rate": 5.00558803272784e-07, "loss": 0.3933, "step": 8950 }, { "epoch": 0.5152979066022544, "grad_norm": 7.5027594566345215, "learning_rate": 4.996274644417261e-07, "loss": 0.3903, "step": 8960 }, { "epoch": 0.5158730158730159, "grad_norm": 13.179106712341309, "learning_rate": 4.986961269032059e-07, "loss": 0.3804, "step": 8970 }, { "epoch": 0.5164481251437774, "grad_norm": 8.801780700683594, "learning_rate": 4.977647938885621e-07, "loss": 0.3769, "step": 8980 }, { "epoch": 0.5170232344145388, "grad_norm": 8.873416900634766, "learning_rate": 4.968334686291183e-07, "loss": 0.3595, "step": 8990 }, { "epoch": 0.5175983436853002, "grad_norm": 9.038322448730469, "learning_rate": 4.959021543561711e-07, "loss": 0.3907, "step": 9000 }, { "epoch": 0.5181734529560617, "grad_norm": 12.6765775680542, "learning_rate": 4.949708543009789e-07, "loss": 0.3769, "step": 9010 }, { "epoch": 0.5187485622268231, "grad_norm": 8.318666458129883, "learning_rate": 4.940395716947506e-07, "loss": 0.4025, "step": 9020 }, { "epoch": 0.5193236714975845, "grad_norm": 7.41690731048584, "learning_rate": 4.931083097686348e-07, "loss": 0.4055, "step": 9030 }, { "epoch": 0.519898780768346, "grad_norm": 8.717617988586426, "learning_rate": 4.921770717537082e-07, "loss": 0.3888, "step": 9040 }, { "epoch": 0.5204738900391074, "grad_norm": 9.414563179016113, "learning_rate": 4.912458608809646e-07, "loss": 0.4092, "step": 9050 }, { "epoch": 0.5210489993098689, "grad_norm": 7.57497501373291, "learning_rate": 4.903146803813036e-07, "loss": 0.3838, "step": 9060 }, { "epoch": 0.5216241085806304, "grad_norm": 8.830973625183105, "learning_rate": 4.893835334855196e-07, "loss": 0.3938, "step": 9070 }, { "epoch": 0.5221992178513918, "grad_norm": 8.381980895996094, "learning_rate": 4.884524234242902e-07, "loss": 0.3958, "step": 9080 }, { "epoch": 0.5227743271221532, "grad_norm": 10.487943649291992, "learning_rate": 4.875213534281649e-07, "loss": 0.4013, "step": 9090 }, { "epoch": 0.5233494363929146, "grad_norm": 9.358920097351074, "learning_rate": 4.865903267275551e-07, "loss": 0.3547, "step": 9100 }, { "epoch": 0.5239245456636761, "grad_norm": 12.230690002441406, "learning_rate": 4.856593465527212e-07, "loss": 0.4002, "step": 9110 }, { "epoch": 0.5244996549344375, "grad_norm": 9.160329818725586, "learning_rate": 4.847284161337622e-07, "loss": 0.3885, "step": 9120 }, { "epoch": 0.5250747642051989, "grad_norm": 9.662772178649902, "learning_rate": 4.83797538700605e-07, "loss": 0.3767, "step": 9130 }, { "epoch": 0.5256498734759605, "grad_norm": 8.298624992370605, "learning_rate": 4.828667174829919e-07, "loss": 0.41, "step": 9140 }, { "epoch": 0.5262249827467219, "grad_norm": 12.033020973205566, "learning_rate": 4.819359557104711e-07, "loss": 0.396, "step": 9150 }, { "epoch": 0.5268000920174833, "grad_norm": 9.344993591308594, "learning_rate": 4.810052566123838e-07, "loss": 0.3834, "step": 9160 }, { "epoch": 0.5273752012882448, "grad_norm": 8.782896041870117, "learning_rate": 4.800746234178541e-07, "loss": 0.3784, "step": 9170 }, { "epoch": 0.5279503105590062, "grad_norm": 8.540607452392578, "learning_rate": 4.791440593557771e-07, "loss": 0.3622, "step": 9180 }, { "epoch": 0.5285254198297676, "grad_norm": 7.005080223083496, "learning_rate": 4.782135676548083e-07, "loss": 0.3507, "step": 9190 }, { "epoch": 0.5291005291005291, "grad_norm": 11.658373832702637, "learning_rate": 4.772831515433523e-07, "loss": 0.4023, "step": 9200 }, { "epoch": 0.5296756383712905, "grad_norm": 8.22269058227539, "learning_rate": 4.763528142495512e-07, "loss": 0.3777, "step": 9210 }, { "epoch": 0.530250747642052, "grad_norm": 10.951814651489258, "learning_rate": 4.7542255900127364e-07, "loss": 0.3848, "step": 9220 }, { "epoch": 0.5308258569128135, "grad_norm": 7.629526615142822, "learning_rate": 4.7449238902610364e-07, "loss": 0.4047, "step": 9230 }, { "epoch": 0.5314009661835749, "grad_norm": 9.275042533874512, "learning_rate": 4.735623075513293e-07, "loss": 0.3841, "step": 9240 }, { "epoch": 0.5319760754543363, "grad_norm": 10.380615234375, "learning_rate": 4.726323178039319e-07, "loss": 0.3838, "step": 9250 }, { "epoch": 0.5325511847250978, "grad_norm": 6.909326076507568, "learning_rate": 4.7170242301057406e-07, "loss": 0.3818, "step": 9260 }, { "epoch": 0.5331262939958592, "grad_norm": 9.363625526428223, "learning_rate": 4.7077262639758935e-07, "loss": 0.3801, "step": 9270 }, { "epoch": 0.5337014032666206, "grad_norm": 9.213701248168945, "learning_rate": 4.698429311909705e-07, "loss": 0.4103, "step": 9280 }, { "epoch": 0.5342765125373821, "grad_norm": 9.35634708404541, "learning_rate": 4.689133406163581e-07, "loss": 0.4272, "step": 9290 }, { "epoch": 0.5348516218081435, "grad_norm": 8.502677917480469, "learning_rate": 4.679838578990306e-07, "loss": 0.3742, "step": 9300 }, { "epoch": 0.535426731078905, "grad_norm": 10.744138717651367, "learning_rate": 4.6705448626389143e-07, "loss": 0.3952, "step": 9310 }, { "epoch": 0.5360018403496665, "grad_norm": 9.87901496887207, "learning_rate": 4.661252289354588e-07, "loss": 0.3898, "step": 9320 }, { "epoch": 0.5365769496204279, "grad_norm": 10.13231086730957, "learning_rate": 4.651960891378545e-07, "loss": 0.374, "step": 9330 }, { "epoch": 0.5371520588911893, "grad_norm": 12.246869087219238, "learning_rate": 4.6426707009479207e-07, "loss": 0.391, "step": 9340 }, { "epoch": 0.5377271681619508, "grad_norm": 8.236720085144043, "learning_rate": 4.6333817502956687e-07, "loss": 0.3993, "step": 9350 }, { "epoch": 0.5383022774327122, "grad_norm": 9.106837272644043, "learning_rate": 4.624094071650435e-07, "loss": 0.3851, "step": 9360 }, { "epoch": 0.5388773867034736, "grad_norm": 12.498353958129883, "learning_rate": 4.614807697236454e-07, "loss": 0.3668, "step": 9370 }, { "epoch": 0.5394524959742351, "grad_norm": 8.320091247558594, "learning_rate": 4.6055226592734346e-07, "loss": 0.3609, "step": 9380 }, { "epoch": 0.5400276052449966, "grad_norm": 9.980466842651367, "learning_rate": 4.596238989976449e-07, "loss": 0.371, "step": 9390 }, { "epoch": 0.540602714515758, "grad_norm": 7.928401947021484, "learning_rate": 4.5869567215558224e-07, "loss": 0.3426, "step": 9400 }, { "epoch": 0.5411778237865195, "grad_norm": 11.54423713684082, "learning_rate": 4.5776758862170174e-07, "loss": 0.3772, "step": 9410 }, { "epoch": 0.5417529330572809, "grad_norm": 10.193535804748535, "learning_rate": 4.568396516160524e-07, "loss": 0.3464, "step": 9420 }, { "epoch": 0.5423280423280423, "grad_norm": 10.121536254882812, "learning_rate": 4.5591186435817513e-07, "loss": 0.3743, "step": 9430 }, { "epoch": 0.5429031515988038, "grad_norm": 10.88989543914795, "learning_rate": 4.5498423006709073e-07, "loss": 0.3813, "step": 9440 }, { "epoch": 0.5434782608695652, "grad_norm": 9.790507316589355, "learning_rate": 4.540567519612901e-07, "loss": 0.3938, "step": 9450 }, { "epoch": 0.5440533701403266, "grad_norm": 9.20999813079834, "learning_rate": 4.5312943325872154e-07, "loss": 0.3676, "step": 9460 }, { "epoch": 0.5446284794110882, "grad_norm": 12.062773704528809, "learning_rate": 4.5220227717678074e-07, "loss": 0.3657, "step": 9470 }, { "epoch": 0.5452035886818496, "grad_norm": 7.550793647766113, "learning_rate": 4.512752869322983e-07, "loss": 0.3835, "step": 9480 }, { "epoch": 0.545778697952611, "grad_norm": 7.587610244750977, "learning_rate": 4.503484657415309e-07, "loss": 0.355, "step": 9490 }, { "epoch": 0.5463538072233725, "grad_norm": 11.069981575012207, "learning_rate": 4.4942181682014747e-07, "loss": 0.3838, "step": 9500 }, { "epoch": 0.5469289164941339, "grad_norm": 11.602307319641113, "learning_rate": 4.4849534338321964e-07, "loss": 0.3903, "step": 9510 }, { "epoch": 0.5475040257648953, "grad_norm": 10.172114372253418, "learning_rate": 4.475690486452102e-07, "loss": 0.3914, "step": 9520 }, { "epoch": 0.5480791350356568, "grad_norm": 8.332169532775879, "learning_rate": 4.4664293581996187e-07, "loss": 0.4103, "step": 9530 }, { "epoch": 0.5486542443064182, "grad_norm": 11.920575141906738, "learning_rate": 4.457170081206864e-07, "loss": 0.3769, "step": 9540 }, { "epoch": 0.5492293535771796, "grad_norm": 9.023862838745117, "learning_rate": 4.44791268759953e-07, "loss": 0.3918, "step": 9550 }, { "epoch": 0.5498044628479412, "grad_norm": 9.064888954162598, "learning_rate": 4.438657209496775e-07, "loss": 0.3601, "step": 9560 }, { "epoch": 0.5503795721187026, "grad_norm": 7.356508255004883, "learning_rate": 4.4294036790111116e-07, "loss": 0.3725, "step": 9570 }, { "epoch": 0.550954681389464, "grad_norm": 8.236316680908203, "learning_rate": 4.4201521282482923e-07, "loss": 0.3907, "step": 9580 }, { "epoch": 0.5515297906602254, "grad_norm": 9.039237976074219, "learning_rate": 4.4109025893072083e-07, "loss": 0.4162, "step": 9590 }, { "epoch": 0.5521048999309869, "grad_norm": 7.15929651260376, "learning_rate": 4.401655094279763e-07, "loss": 0.3718, "step": 9600 }, { "epoch": 0.5526800092017483, "grad_norm": 8.132118225097656, "learning_rate": 4.3924096752507723e-07, "loss": 0.3941, "step": 9610 }, { "epoch": 0.5532551184725097, "grad_norm": 11.471051216125488, "learning_rate": 4.3831663642978457e-07, "loss": 0.4192, "step": 9620 }, { "epoch": 0.5538302277432712, "grad_norm": 9.253182411193848, "learning_rate": 4.3739251934912804e-07, "loss": 0.3898, "step": 9630 }, { "epoch": 0.5544053370140327, "grad_norm": 9.275343894958496, "learning_rate": 4.3646861948939516e-07, "loss": 0.3771, "step": 9640 }, { "epoch": 0.5549804462847941, "grad_norm": 8.71243667602539, "learning_rate": 4.3554494005611945e-07, "loss": 0.3652, "step": 9650 }, { "epoch": 0.5555555555555556, "grad_norm": 7.886996269226074, "learning_rate": 4.3462148425406964e-07, "loss": 0.3859, "step": 9660 }, { "epoch": 0.556130664826317, "grad_norm": 9.061646461486816, "learning_rate": 4.3369825528723856e-07, "loss": 0.3666, "step": 9670 }, { "epoch": 0.5567057740970784, "grad_norm": 10.263635635375977, "learning_rate": 4.32775256358832e-07, "loss": 0.3818, "step": 9680 }, { "epoch": 0.5572808833678399, "grad_norm": 9.78042221069336, "learning_rate": 4.3185249067125786e-07, "loss": 0.4143, "step": 9690 }, { "epoch": 0.5578559926386013, "grad_norm": 8.842947006225586, "learning_rate": 4.309299614261146e-07, "loss": 0.3818, "step": 9700 }, { "epoch": 0.5584311019093627, "grad_norm": 10.982131958007812, "learning_rate": 4.3000767182418026e-07, "loss": 0.3869, "step": 9710 }, { "epoch": 0.5590062111801242, "grad_norm": 8.662030220031738, "learning_rate": 4.290856250654015e-07, "loss": 0.3805, "step": 9720 }, { "epoch": 0.5595813204508857, "grad_norm": 9.16728687286377, "learning_rate": 4.281638243488823e-07, "loss": 0.3913, "step": 9730 }, { "epoch": 0.5601564297216471, "grad_norm": 10.240887641906738, "learning_rate": 4.272422728728734e-07, "loss": 0.3631, "step": 9740 }, { "epoch": 0.5607315389924086, "grad_norm": 10.587800979614258, "learning_rate": 4.263209738347605e-07, "loss": 0.3937, "step": 9750 }, { "epoch": 0.56130664826317, "grad_norm": 8.276656150817871, "learning_rate": 4.2539993043105326e-07, "loss": 0.3634, "step": 9760 }, { "epoch": 0.5618817575339314, "grad_norm": 9.007804870605469, "learning_rate": 4.2447914585737455e-07, "loss": 0.4033, "step": 9770 }, { "epoch": 0.5624568668046929, "grad_norm": 9.236212730407715, "learning_rate": 4.2355862330844916e-07, "loss": 0.3852, "step": 9780 }, { "epoch": 0.5630319760754543, "grad_norm": 9.567317008972168, "learning_rate": 4.226383659780931e-07, "loss": 0.3756, "step": 9790 }, { "epoch": 0.5636070853462157, "grad_norm": 11.15540599822998, "learning_rate": 4.2171837705920187e-07, "loss": 0.4062, "step": 9800 }, { "epoch": 0.5641821946169773, "grad_norm": 13.722125053405762, "learning_rate": 4.207986597437398e-07, "loss": 0.3966, "step": 9810 }, { "epoch": 0.5647573038877387, "grad_norm": 8.244195938110352, "learning_rate": 4.198792172227287e-07, "loss": 0.354, "step": 9820 }, { "epoch": 0.5653324131585001, "grad_norm": 8.409849166870117, "learning_rate": 4.1896005268623694e-07, "loss": 0.3764, "step": 9830 }, { "epoch": 0.5659075224292616, "grad_norm": 9.555737495422363, "learning_rate": 4.1804116932336897e-07, "loss": 0.3903, "step": 9840 }, { "epoch": 0.566482631700023, "grad_norm": 7.941007137298584, "learning_rate": 4.1712257032225313e-07, "loss": 0.4035, "step": 9850 }, { "epoch": 0.5670577409707844, "grad_norm": 10.590397834777832, "learning_rate": 4.162042588700312e-07, "loss": 0.3695, "step": 9860 }, { "epoch": 0.5676328502415459, "grad_norm": 10.407230377197266, "learning_rate": 4.152862381528474e-07, "loss": 0.379, "step": 9870 }, { "epoch": 0.5682079595123073, "grad_norm": 9.772747993469238, "learning_rate": 4.1436851135583703e-07, "loss": 0.3695, "step": 9880 }, { "epoch": 0.5687830687830688, "grad_norm": 9.584944725036621, "learning_rate": 4.134510816631163e-07, "loss": 0.3876, "step": 9890 }, { "epoch": 0.5693581780538303, "grad_norm": 11.409784317016602, "learning_rate": 4.1253395225776975e-07, "loss": 0.3589, "step": 9900 }, { "epoch": 0.5699332873245917, "grad_norm": 9.611605644226074, "learning_rate": 4.116171263218404e-07, "loss": 0.3561, "step": 9910 }, { "epoch": 0.5705083965953531, "grad_norm": 14.128211975097656, "learning_rate": 4.107006070363184e-07, "loss": 0.3846, "step": 9920 }, { "epoch": 0.5710835058661146, "grad_norm": 10.094975471496582, "learning_rate": 4.0978439758112974e-07, "loss": 0.3701, "step": 9930 }, { "epoch": 0.571658615136876, "grad_norm": 9.51294231414795, "learning_rate": 4.08868501135126e-07, "loss": 0.3924, "step": 9940 }, { "epoch": 0.5722337244076374, "grad_norm": 7.5738396644592285, "learning_rate": 4.079529208760721e-07, "loss": 0.3775, "step": 9950 }, { "epoch": 0.5728088336783989, "grad_norm": 8.295430183410645, "learning_rate": 4.070376599806362e-07, "loss": 0.3895, "step": 9960 }, { "epoch": 0.5733839429491603, "grad_norm": 7.938765525817871, "learning_rate": 4.0612272162437844e-07, "loss": 0.3844, "step": 9970 }, { "epoch": 0.5739590522199218, "grad_norm": 8.9369478225708, "learning_rate": 4.0520810898173965e-07, "loss": 0.3598, "step": 9980 }, { "epoch": 0.5745341614906833, "grad_norm": 7.868127822875977, "learning_rate": 4.0429382522603104e-07, "loss": 0.3635, "step": 9990 }, { "epoch": 0.5751092707614447, "grad_norm": 10.051660537719727, "learning_rate": 4.033798735294224e-07, "loss": 0.4018, "step": 10000 }, { "epoch": 0.5756843800322061, "grad_norm": 8.911669731140137, "learning_rate": 4.024662570629313e-07, "loss": 0.368, "step": 10010 }, { "epoch": 0.5762594893029676, "grad_norm": 11.635683059692383, "learning_rate": 4.0155297899641255e-07, "loss": 0.369, "step": 10020 }, { "epoch": 0.576834598573729, "grad_norm": 9.292219161987305, "learning_rate": 4.0064004249854645e-07, "loss": 0.3739, "step": 10030 }, { "epoch": 0.5774097078444904, "grad_norm": 8.785758972167969, "learning_rate": 3.997274507368287e-07, "loss": 0.3857, "step": 10040 }, { "epoch": 0.5779848171152518, "grad_norm": 9.122446060180664, "learning_rate": 3.9881520687755853e-07, "loss": 0.3868, "step": 10050 }, { "epoch": 0.5785599263860134, "grad_norm": 8.800936698913574, "learning_rate": 3.979033140858283e-07, "loss": 0.3602, "step": 10060 }, { "epoch": 0.5791350356567748, "grad_norm": 8.303102493286133, "learning_rate": 3.9699177552551204e-07, "loss": 0.3962, "step": 10070 }, { "epoch": 0.5797101449275363, "grad_norm": 10.973525047302246, "learning_rate": 3.9608059435925496e-07, "loss": 0.3809, "step": 10080 }, { "epoch": 0.5802852541982977, "grad_norm": 8.03174877166748, "learning_rate": 3.951697737484625e-07, "loss": 0.3971, "step": 10090 }, { "epoch": 0.5808603634690591, "grad_norm": 11.021552085876465, "learning_rate": 3.942593168532888e-07, "loss": 0.378, "step": 10100 }, { "epoch": 0.5814354727398205, "grad_norm": 9.88539981842041, "learning_rate": 3.9334922683262604e-07, "loss": 0.3739, "step": 10110 }, { "epoch": 0.582010582010582, "grad_norm": 7.3574442863464355, "learning_rate": 3.924395068440937e-07, "loss": 0.3765, "step": 10120 }, { "epoch": 0.5825856912813434, "grad_norm": 9.142216682434082, "learning_rate": 3.915301600440271e-07, "loss": 0.3598, "step": 10130 }, { "epoch": 0.583160800552105, "grad_norm": 9.07522964477539, "learning_rate": 3.906211895874673e-07, "loss": 0.343, "step": 10140 }, { "epoch": 0.5837359098228664, "grad_norm": 9.05262279510498, "learning_rate": 3.897125986281491e-07, "loss": 0.3852, "step": 10150 }, { "epoch": 0.5843110190936278, "grad_norm": 8.661456108093262, "learning_rate": 3.888043903184909e-07, "loss": 0.3979, "step": 10160 }, { "epoch": 0.5848861283643892, "grad_norm": 6.661818981170654, "learning_rate": 3.878965678095832e-07, "loss": 0.3787, "step": 10170 }, { "epoch": 0.5854612376351507, "grad_norm": 10.352274894714355, "learning_rate": 3.869891342511781e-07, "loss": 0.3751, "step": 10180 }, { "epoch": 0.5860363469059121, "grad_norm": 11.959846496582031, "learning_rate": 3.8608209279167844e-07, "loss": 0.3722, "step": 10190 }, { "epoch": 0.5866114561766735, "grad_norm": 9.641359329223633, "learning_rate": 3.851754465781263e-07, "loss": 0.3831, "step": 10200 }, { "epoch": 0.587186565447435, "grad_norm": 10.447661399841309, "learning_rate": 3.8426919875619246e-07, "loss": 0.3886, "step": 10210 }, { "epoch": 0.5877616747181964, "grad_norm": 8.652332305908203, "learning_rate": 3.833633524701656e-07, "loss": 0.3735, "step": 10220 }, { "epoch": 0.588336783988958, "grad_norm": 9.70828914642334, "learning_rate": 3.8245791086294106e-07, "loss": 0.3375, "step": 10230 }, { "epoch": 0.5889118932597194, "grad_norm": 8.372089385986328, "learning_rate": 3.815528770760105e-07, "loss": 0.3741, "step": 10240 }, { "epoch": 0.5894870025304808, "grad_norm": 6.901708602905273, "learning_rate": 3.8064825424945023e-07, "loss": 0.3721, "step": 10250 }, { "epoch": 0.5900621118012422, "grad_norm": 6.712554931640625, "learning_rate": 3.797440455219109e-07, "loss": 0.3735, "step": 10260 }, { "epoch": 0.5906372210720037, "grad_norm": 9.683284759521484, "learning_rate": 3.788402540306063e-07, "loss": 0.3566, "step": 10270 }, { "epoch": 0.5912123303427651, "grad_norm": 7.25424861907959, "learning_rate": 3.779368829113024e-07, "loss": 0.3242, "step": 10280 }, { "epoch": 0.5917874396135265, "grad_norm": 9.811884880065918, "learning_rate": 3.770339352983073e-07, "loss": 0.3819, "step": 10290 }, { "epoch": 0.592362548884288, "grad_norm": 8.950639724731445, "learning_rate": 3.7613141432445916e-07, "loss": 0.3933, "step": 10300 }, { "epoch": 0.5929376581550495, "grad_norm": 9.117372512817383, "learning_rate": 3.75229323121116e-07, "loss": 0.3925, "step": 10310 }, { "epoch": 0.5935127674258109, "grad_norm": 9.603479385375977, "learning_rate": 3.743276648181448e-07, "loss": 0.3715, "step": 10320 }, { "epoch": 0.5940878766965724, "grad_norm": 10.83611011505127, "learning_rate": 3.7342644254391044e-07, "loss": 0.406, "step": 10330 }, { "epoch": 0.5946629859673338, "grad_norm": 10.441534042358398, "learning_rate": 3.7252565942526527e-07, "loss": 0.3943, "step": 10340 }, { "epoch": 0.5952380952380952, "grad_norm": 9.773183822631836, "learning_rate": 3.716253185875376e-07, "loss": 0.3679, "step": 10350 }, { "epoch": 0.5958132045088567, "grad_norm": 8.356080055236816, "learning_rate": 3.7072542315452137e-07, "loss": 0.3796, "step": 10360 }, { "epoch": 0.5963883137796181, "grad_norm": 9.368612289428711, "learning_rate": 3.6982597624846524e-07, "loss": 0.373, "step": 10370 }, { "epoch": 0.5969634230503795, "grad_norm": 9.359908103942871, "learning_rate": 3.689269809900612e-07, "loss": 0.3584, "step": 10380 }, { "epoch": 0.5975385323211411, "grad_norm": 9.303851127624512, "learning_rate": 3.6802844049843516e-07, "loss": 0.3761, "step": 10390 }, { "epoch": 0.5981136415919025, "grad_norm": 7.944705009460449, "learning_rate": 3.6713035789113446e-07, "loss": 0.3386, "step": 10400 }, { "epoch": 0.5986887508626639, "grad_norm": 11.734755516052246, "learning_rate": 3.6623273628411787e-07, "loss": 0.3794, "step": 10410 }, { "epoch": 0.5992638601334254, "grad_norm": 9.052252769470215, "learning_rate": 3.653355787917448e-07, "loss": 0.3765, "step": 10420 }, { "epoch": 0.5998389694041868, "grad_norm": 8.246960639953613, "learning_rate": 3.6443888852676406e-07, "loss": 0.3491, "step": 10430 }, { "epoch": 0.6004140786749482, "grad_norm": 12.871159553527832, "learning_rate": 3.6354266860030414e-07, "loss": 0.3642, "step": 10440 }, { "epoch": 0.6009891879457097, "grad_norm": 8.794795036315918, "learning_rate": 3.62646922121861e-07, "loss": 0.3971, "step": 10450 }, { "epoch": 0.6015642972164711, "grad_norm": 10.551728248596191, "learning_rate": 3.617516521992881e-07, "loss": 0.3764, "step": 10460 }, { "epoch": 0.6021394064872325, "grad_norm": 10.349842071533203, "learning_rate": 3.6085686193878553e-07, "loss": 0.3469, "step": 10470 }, { "epoch": 0.6027145157579941, "grad_norm": 8.66835880279541, "learning_rate": 3.5996255444488897e-07, "loss": 0.3547, "step": 10480 }, { "epoch": 0.6032896250287555, "grad_norm": 9.510896682739258, "learning_rate": 3.5906873282045946e-07, "loss": 0.3819, "step": 10490 }, { "epoch": 0.6038647342995169, "grad_norm": 8.172520637512207, "learning_rate": 3.581754001666721e-07, "loss": 0.3888, "step": 10500 }, { "epoch": 0.6044398435702784, "grad_norm": 9.389524459838867, "learning_rate": 3.572825595830053e-07, "loss": 0.3847, "step": 10510 }, { "epoch": 0.6050149528410398, "grad_norm": 10.090486526489258, "learning_rate": 3.5639021416723036e-07, "loss": 0.3721, "step": 10520 }, { "epoch": 0.6055900621118012, "grad_norm": 12.074410438537598, "learning_rate": 3.5549836701540047e-07, "loss": 0.4085, "step": 10530 }, { "epoch": 0.6061651713825627, "grad_norm": 10.663188934326172, "learning_rate": 3.546070212218404e-07, "loss": 0.3804, "step": 10540 }, { "epoch": 0.6067402806533241, "grad_norm": 10.813393592834473, "learning_rate": 3.5371617987913516e-07, "loss": 0.3558, "step": 10550 }, { "epoch": 0.6073153899240856, "grad_norm": 7.5838303565979, "learning_rate": 3.5282584607811934e-07, "loss": 0.3436, "step": 10560 }, { "epoch": 0.6078904991948471, "grad_norm": 9.14936637878418, "learning_rate": 3.5193602290786684e-07, "loss": 0.3796, "step": 10570 }, { "epoch": 0.6084656084656085, "grad_norm": 13.196319580078125, "learning_rate": 3.510467134556796e-07, "loss": 0.345, "step": 10580 }, { "epoch": 0.6090407177363699, "grad_norm": 11.366345405578613, "learning_rate": 3.5015792080707785e-07, "loss": 0.3916, "step": 10590 }, { "epoch": 0.6096158270071314, "grad_norm": 7.773449897766113, "learning_rate": 3.4926964804578806e-07, "loss": 0.3748, "step": 10600 }, { "epoch": 0.6101909362778928, "grad_norm": 12.788496971130371, "learning_rate": 3.483818982537333e-07, "loss": 0.3824, "step": 10610 }, { "epoch": 0.6107660455486542, "grad_norm": 7.7415971755981445, "learning_rate": 3.4749467451102176e-07, "loss": 0.3643, "step": 10620 }, { "epoch": 0.6113411548194156, "grad_norm": 9.731002807617188, "learning_rate": 3.466079798959368e-07, "loss": 0.3769, "step": 10630 }, { "epoch": 0.6119162640901772, "grad_norm": 8.176984786987305, "learning_rate": 3.45721817484926e-07, "loss": 0.3543, "step": 10640 }, { "epoch": 0.6124913733609386, "grad_norm": 10.30233383178711, "learning_rate": 3.448361903525902e-07, "loss": 0.3632, "step": 10650 }, { "epoch": 0.6130664826317, "grad_norm": 7.894333362579346, "learning_rate": 3.4395110157167305e-07, "loss": 0.3612, "step": 10660 }, { "epoch": 0.6136415919024615, "grad_norm": 12.018261909484863, "learning_rate": 3.4306655421305055e-07, "loss": 0.3572, "step": 10670 }, { "epoch": 0.6142167011732229, "grad_norm": 9.024635314941406, "learning_rate": 3.421825513457198e-07, "loss": 0.3816, "step": 10680 }, { "epoch": 0.6147918104439843, "grad_norm": 12.097403526306152, "learning_rate": 3.412990960367895e-07, "loss": 0.3623, "step": 10690 }, { "epoch": 0.6153669197147458, "grad_norm": 8.523496627807617, "learning_rate": 3.404161913514678e-07, "loss": 0.3522, "step": 10700 }, { "epoch": 0.6159420289855072, "grad_norm": 11.388949394226074, "learning_rate": 3.395338403530528e-07, "loss": 0.3883, "step": 10710 }, { "epoch": 0.6165171382562686, "grad_norm": 8.799589157104492, "learning_rate": 3.386520461029214e-07, "loss": 0.3968, "step": 10720 }, { "epoch": 0.6170922475270302, "grad_norm": 11.623543739318848, "learning_rate": 3.377708116605186e-07, "loss": 0.376, "step": 10730 }, { "epoch": 0.6176673567977916, "grad_norm": 11.008196830749512, "learning_rate": 3.368901400833479e-07, "loss": 0.3999, "step": 10740 }, { "epoch": 0.618242466068553, "grad_norm": 10.373955726623535, "learning_rate": 3.360100344269593e-07, "loss": 0.3953, "step": 10750 }, { "epoch": 0.6188175753393145, "grad_norm": 9.713980674743652, "learning_rate": 3.3513049774493923e-07, "loss": 0.4111, "step": 10760 }, { "epoch": 0.6193926846100759, "grad_norm": 11.72384262084961, "learning_rate": 3.342515330889002e-07, "loss": 0.3998, "step": 10770 }, { "epoch": 0.6199677938808373, "grad_norm": 8.974841117858887, "learning_rate": 3.3337314350847e-07, "loss": 0.3798, "step": 10780 }, { "epoch": 0.6205429031515988, "grad_norm": 11.002151489257812, "learning_rate": 3.3249533205128133e-07, "loss": 0.3665, "step": 10790 }, { "epoch": 0.6211180124223602, "grad_norm": 8.447975158691406, "learning_rate": 3.316181017629608e-07, "loss": 0.3603, "step": 10800 }, { "epoch": 0.6216931216931217, "grad_norm": 9.070975303649902, "learning_rate": 3.307414556871187e-07, "loss": 0.3706, "step": 10810 }, { "epoch": 0.6222682309638832, "grad_norm": 16.820966720581055, "learning_rate": 3.2986539686533844e-07, "loss": 0.392, "step": 10820 }, { "epoch": 0.6228433402346446, "grad_norm": 10.62855339050293, "learning_rate": 3.2898992833716563e-07, "loss": 0.4058, "step": 10830 }, { "epoch": 0.623418449505406, "grad_norm": 8.267560005187988, "learning_rate": 3.2811505314009835e-07, "loss": 0.3378, "step": 10840 }, { "epoch": 0.6239935587761675, "grad_norm": 11.870079040527344, "learning_rate": 3.2724077430957557e-07, "loss": 0.3392, "step": 10850 }, { "epoch": 0.6245686680469289, "grad_norm": 9.235787391662598, "learning_rate": 3.263670948789674e-07, "loss": 0.3779, "step": 10860 }, { "epoch": 0.6251437773176903, "grad_norm": 7.006470680236816, "learning_rate": 3.254940178795641e-07, "loss": 0.3787, "step": 10870 }, { "epoch": 0.6257188865884518, "grad_norm": 8.086721420288086, "learning_rate": 3.2462154634056574e-07, "loss": 0.375, "step": 10880 }, { "epoch": 0.6262939958592133, "grad_norm": 10.55824089050293, "learning_rate": 3.237496832890722e-07, "loss": 0.3779, "step": 10890 }, { "epoch": 0.6268691051299747, "grad_norm": 9.587925910949707, "learning_rate": 3.2287843175007157e-07, "loss": 0.3676, "step": 10900 }, { "epoch": 0.6274442144007362, "grad_norm": 9.798205375671387, "learning_rate": 3.2200779474643053e-07, "loss": 0.3871, "step": 10910 }, { "epoch": 0.6280193236714976, "grad_norm": 9.184255599975586, "learning_rate": 3.211377752988834e-07, "loss": 0.3645, "step": 10920 }, { "epoch": 0.628594432942259, "grad_norm": 10.591147422790527, "learning_rate": 3.2026837642602214e-07, "loss": 0.3854, "step": 10930 }, { "epoch": 0.6291695422130205, "grad_norm": 8.594269752502441, "learning_rate": 3.1939960114428534e-07, "loss": 0.3634, "step": 10940 }, { "epoch": 0.6297446514837819, "grad_norm": 9.28696060180664, "learning_rate": 3.185314524679481e-07, "loss": 0.3665, "step": 10950 }, { "epoch": 0.6303197607545433, "grad_norm": 11.588949203491211, "learning_rate": 3.176639334091116e-07, "loss": 0.3885, "step": 10960 }, { "epoch": 0.6308948700253048, "grad_norm": 10.453193664550781, "learning_rate": 3.167970469776921e-07, "loss": 0.3685, "step": 10970 }, { "epoch": 0.6314699792960663, "grad_norm": 9.438701629638672, "learning_rate": 3.159307961814113e-07, "loss": 0.3489, "step": 10980 }, { "epoch": 0.6320450885668277, "grad_norm": 10.433137893676758, "learning_rate": 3.1506518402578546e-07, "loss": 0.36, "step": 10990 }, { "epoch": 0.6326201978375892, "grad_norm": 11.62016773223877, "learning_rate": 3.1420021351411504e-07, "loss": 0.3536, "step": 11000 }, { "epoch": 0.6331953071083506, "grad_norm": 11.125321388244629, "learning_rate": 3.133358876474741e-07, "loss": 0.3906, "step": 11010 }, { "epoch": 0.633770416379112, "grad_norm": 13.302066802978516, "learning_rate": 3.124722094247002e-07, "loss": 0.4068, "step": 11020 }, { "epoch": 0.6343455256498735, "grad_norm": 10.023965835571289, "learning_rate": 3.116091818423837e-07, "loss": 0.3913, "step": 11030 }, { "epoch": 0.6349206349206349, "grad_norm": 11.6726713180542, "learning_rate": 3.10746807894858e-07, "loss": 0.3671, "step": 11040 }, { "epoch": 0.6354957441913963, "grad_norm": 10.859635353088379, "learning_rate": 3.09885090574188e-07, "loss": 0.3508, "step": 11050 }, { "epoch": 0.6360708534621579, "grad_norm": 10.164657592773438, "learning_rate": 3.09024032870161e-07, "loss": 0.3911, "step": 11060 }, { "epoch": 0.6366459627329193, "grad_norm": 8.010098457336426, "learning_rate": 3.0816363777027507e-07, "loss": 0.3742, "step": 11070 }, { "epoch": 0.6372210720036807, "grad_norm": 10.904577255249023, "learning_rate": 3.0730390825972984e-07, "loss": 0.3491, "step": 11080 }, { "epoch": 0.6377961812744422, "grad_norm": 9.330544471740723, "learning_rate": 3.0644484732141564e-07, "loss": 0.3604, "step": 11090 }, { "epoch": 0.6383712905452036, "grad_norm": 9.657205581665039, "learning_rate": 3.055864579359028e-07, "loss": 0.379, "step": 11100 }, { "epoch": 0.638946399815965, "grad_norm": 9.462019920349121, "learning_rate": 3.047287430814318e-07, "loss": 0.3665, "step": 11110 }, { "epoch": 0.6395215090867264, "grad_norm": 9.105822563171387, "learning_rate": 3.038717057339029e-07, "loss": 0.3701, "step": 11120 }, { "epoch": 0.6400966183574879, "grad_norm": 9.753474235534668, "learning_rate": 3.030153488668654e-07, "loss": 0.3784, "step": 11130 }, { "epoch": 0.6406717276282494, "grad_norm": 9.288264274597168, "learning_rate": 3.0215967545150825e-07, "loss": 0.3665, "step": 11140 }, { "epoch": 0.6412468368990109, "grad_norm": 8.721306800842285, "learning_rate": 3.0130468845664853e-07, "loss": 0.3483, "step": 11150 }, { "epoch": 0.6418219461697723, "grad_norm": 7.468956470489502, "learning_rate": 3.004503908487219e-07, "loss": 0.3459, "step": 11160 }, { "epoch": 0.6423970554405337, "grad_norm": 10.578388214111328, "learning_rate": 2.9959678559177215e-07, "loss": 0.3427, "step": 11170 }, { "epoch": 0.6429721647112951, "grad_norm": 10.7095365524292, "learning_rate": 2.9874387564744077e-07, "loss": 0.3567, "step": 11180 }, { "epoch": 0.6435472739820566, "grad_norm": 8.8557767868042, "learning_rate": 2.9789166397495724e-07, "loss": 0.3565, "step": 11190 }, { "epoch": 0.644122383252818, "grad_norm": 15.037124633789062, "learning_rate": 2.970401535311281e-07, "loss": 0.38, "step": 11200 }, { "epoch": 0.6446974925235794, "grad_norm": 9.793429374694824, "learning_rate": 2.961893472703268e-07, "loss": 0.3697, "step": 11210 }, { "epoch": 0.6452726017943409, "grad_norm": 8.69896125793457, "learning_rate": 2.953392481444837e-07, "loss": 0.36, "step": 11220 }, { "epoch": 0.6458477110651024, "grad_norm": 19.59389877319336, "learning_rate": 2.9448985910307544e-07, "loss": 0.3732, "step": 11230 }, { "epoch": 0.6464228203358638, "grad_norm": 10.323915481567383, "learning_rate": 2.936411830931156e-07, "loss": 0.3639, "step": 11240 }, { "epoch": 0.6469979296066253, "grad_norm": 7.897282600402832, "learning_rate": 2.9279322305914334e-07, "loss": 0.3709, "step": 11250 }, { "epoch": 0.6475730388773867, "grad_norm": 9.93266487121582, "learning_rate": 2.9194598194321375e-07, "loss": 0.3677, "step": 11260 }, { "epoch": 0.6481481481481481, "grad_norm": 11.581242561340332, "learning_rate": 2.910994626848878e-07, "loss": 0.3811, "step": 11270 }, { "epoch": 0.6487232574189096, "grad_norm": 12.288777351379395, "learning_rate": 2.9025366822122146e-07, "loss": 0.3379, "step": 11280 }, { "epoch": 0.649298366689671, "grad_norm": 12.624247550964355, "learning_rate": 2.894086014867566e-07, "loss": 0.363, "step": 11290 }, { "epoch": 0.6498734759604324, "grad_norm": 9.52424144744873, "learning_rate": 2.8856426541350994e-07, "loss": 0.4067, "step": 11300 }, { "epoch": 0.650448585231194, "grad_norm": 9.850395202636719, "learning_rate": 2.8772066293096285e-07, "loss": 0.3431, "step": 11310 }, { "epoch": 0.6510236945019554, "grad_norm": 9.17895793914795, "learning_rate": 2.868777969660518e-07, "loss": 0.3627, "step": 11320 }, { "epoch": 0.6515988037727168, "grad_norm": 8.563547134399414, "learning_rate": 2.860356704431575e-07, "loss": 0.344, "step": 11330 }, { "epoch": 0.6521739130434783, "grad_norm": 12.342236518859863, "learning_rate": 2.851942862840957e-07, "loss": 0.3425, "step": 11340 }, { "epoch": 0.6527490223142397, "grad_norm": 11.226378440856934, "learning_rate": 2.8435364740810595e-07, "loss": 0.3531, "step": 11350 }, { "epoch": 0.6533241315850011, "grad_norm": 12.589462280273438, "learning_rate": 2.835137567318422e-07, "loss": 0.366, "step": 11360 }, { "epoch": 0.6538992408557626, "grad_norm": 11.611923217773438, "learning_rate": 2.8267461716936233e-07, "loss": 0.3685, "step": 11370 }, { "epoch": 0.654474350126524, "grad_norm": 11.206465721130371, "learning_rate": 2.818362316321183e-07, "loss": 0.3623, "step": 11380 }, { "epoch": 0.6550494593972855, "grad_norm": 9.734159469604492, "learning_rate": 2.8099860302894603e-07, "loss": 0.38, "step": 11390 }, { "epoch": 0.655624568668047, "grad_norm": 9.573464393615723, "learning_rate": 2.8016173426605493e-07, "loss": 0.3658, "step": 11400 }, { "epoch": 0.6561996779388084, "grad_norm": 10.52890682220459, "learning_rate": 2.7932562824701834e-07, "loss": 0.3847, "step": 11410 }, { "epoch": 0.6567747872095698, "grad_norm": 14.41618824005127, "learning_rate": 2.7849028787276304e-07, "loss": 0.3422, "step": 11420 }, { "epoch": 0.6573498964803313, "grad_norm": 10.575922012329102, "learning_rate": 2.7765571604155923e-07, "loss": 0.3912, "step": 11430 }, { "epoch": 0.6579250057510927, "grad_norm": 10.838577270507812, "learning_rate": 2.7682191564901123e-07, "loss": 0.3684, "step": 11440 }, { "epoch": 0.6585001150218541, "grad_norm": 8.066337585449219, "learning_rate": 2.7598888958804613e-07, "loss": 0.3651, "step": 11450 }, { "epoch": 0.6590752242926156, "grad_norm": 8.986671447753906, "learning_rate": 2.751566407489047e-07, "loss": 0.3732, "step": 11460 }, { "epoch": 0.659650333563377, "grad_norm": 14.571503639221191, "learning_rate": 2.74325172019131e-07, "loss": 0.3748, "step": 11470 }, { "epoch": 0.6602254428341385, "grad_norm": 9.973418235778809, "learning_rate": 2.734944862835622e-07, "loss": 0.3649, "step": 11480 }, { "epoch": 0.6608005521049, "grad_norm": 11.035398483276367, "learning_rate": 2.7266458642431964e-07, "loss": 0.3663, "step": 11490 }, { "epoch": 0.6613756613756614, "grad_norm": 9.11816692352295, "learning_rate": 2.718354753207973e-07, "loss": 0.3886, "step": 11500 }, { "epoch": 0.6619507706464228, "grad_norm": 8.852313995361328, "learning_rate": 2.710071558496526e-07, "loss": 0.3722, "step": 11510 }, { "epoch": 0.6625258799171843, "grad_norm": 11.543720245361328, "learning_rate": 2.7017963088479657e-07, "loss": 0.3344, "step": 11520 }, { "epoch": 0.6631009891879457, "grad_norm": 9.127649307250977, "learning_rate": 2.6935290329738327e-07, "loss": 0.3841, "step": 11530 }, { "epoch": 0.6636760984587071, "grad_norm": 7.7858195304870605, "learning_rate": 2.685269759558006e-07, "loss": 0.3546, "step": 11540 }, { "epoch": 0.6642512077294686, "grad_norm": 8.366317749023438, "learning_rate": 2.6770185172565973e-07, "loss": 0.3577, "step": 11550 }, { "epoch": 0.6648263170002301, "grad_norm": 18.225082397460938, "learning_rate": 2.668775334697856e-07, "loss": 0.3777, "step": 11560 }, { "epoch": 0.6654014262709915, "grad_norm": 8.430078506469727, "learning_rate": 2.6605402404820635e-07, "loss": 0.3804, "step": 11570 }, { "epoch": 0.665976535541753, "grad_norm": 10.557585716247559, "learning_rate": 2.652313263181441e-07, "loss": 0.3794, "step": 11580 }, { "epoch": 0.6665516448125144, "grad_norm": 8.931429862976074, "learning_rate": 2.64409443134005e-07, "loss": 0.3635, "step": 11590 }, { "epoch": 0.6671267540832758, "grad_norm": 8.522780418395996, "learning_rate": 2.635883773473687e-07, "loss": 0.3708, "step": 11600 }, { "epoch": 0.6677018633540373, "grad_norm": 11.639320373535156, "learning_rate": 2.627681318069789e-07, "loss": 0.3632, "step": 11610 }, { "epoch": 0.6682769726247987, "grad_norm": 7.625691890716553, "learning_rate": 2.6194870935873334e-07, "loss": 0.4056, "step": 11620 }, { "epoch": 0.6688520818955601, "grad_norm": 11.023968696594238, "learning_rate": 2.6113011284567403e-07, "loss": 0.3416, "step": 11630 }, { "epoch": 0.6694271911663217, "grad_norm": 7.746280670166016, "learning_rate": 2.603123451079777e-07, "loss": 0.3575, "step": 11640 }, { "epoch": 0.6700023004370831, "grad_norm": 8.20147705078125, "learning_rate": 2.594954089829452e-07, "loss": 0.359, "step": 11650 }, { "epoch": 0.6705774097078445, "grad_norm": 9.55453109741211, "learning_rate": 2.58679307304992e-07, "loss": 0.3671, "step": 11660 }, { "epoch": 0.671152518978606, "grad_norm": 9.929685592651367, "learning_rate": 2.5786404290563853e-07, "loss": 0.3249, "step": 11670 }, { "epoch": 0.6717276282493674, "grad_norm": 11.354582786560059, "learning_rate": 2.570496186135003e-07, "loss": 0.3447, "step": 11680 }, { "epoch": 0.6723027375201288, "grad_norm": 13.547934532165527, "learning_rate": 2.562360372542778e-07, "loss": 0.386, "step": 11690 }, { "epoch": 0.6728778467908902, "grad_norm": 8.371198654174805, "learning_rate": 2.554233016507472e-07, "loss": 0.3554, "step": 11700 }, { "epoch": 0.6734529560616517, "grad_norm": 9.270309448242188, "learning_rate": 2.5461141462275e-07, "loss": 0.3518, "step": 11710 }, { "epoch": 0.6740280653324131, "grad_norm": 7.019067287445068, "learning_rate": 2.538003789871836e-07, "loss": 0.3732, "step": 11720 }, { "epoch": 0.6746031746031746, "grad_norm": 9.43471622467041, "learning_rate": 2.5299019755799134e-07, "loss": 0.3732, "step": 11730 }, { "epoch": 0.6751782838739361, "grad_norm": 8.293878555297852, "learning_rate": 2.521808731461532e-07, "loss": 0.3322, "step": 11740 }, { "epoch": 0.6757533931446975, "grad_norm": 10.619819641113281, "learning_rate": 2.5137240855967533e-07, "loss": 0.3765, "step": 11750 }, { "epoch": 0.6763285024154589, "grad_norm": 9.34313678741455, "learning_rate": 2.505648066035807e-07, "loss": 0.3511, "step": 11760 }, { "epoch": 0.6769036116862204, "grad_norm": 9.56104850769043, "learning_rate": 2.4975807007989937e-07, "loss": 0.3839, "step": 11770 }, { "epoch": 0.6774787209569818, "grad_norm": 8.995777130126953, "learning_rate": 2.4895220178765854e-07, "loss": 0.3669, "step": 11780 }, { "epoch": 0.6780538302277432, "grad_norm": 9.7854642868042, "learning_rate": 2.481472045228736e-07, "loss": 0.3595, "step": 11790 }, { "epoch": 0.6786289394985047, "grad_norm": 10.17132568359375, "learning_rate": 2.473430810785372e-07, "loss": 0.3687, "step": 11800 }, { "epoch": 0.6792040487692662, "grad_norm": 12.5444974899292, "learning_rate": 2.4653983424461053e-07, "loss": 0.3862, "step": 11810 }, { "epoch": 0.6797791580400276, "grad_norm": 11.558302879333496, "learning_rate": 2.4573746680801326e-07, "loss": 0.3495, "step": 11820 }, { "epoch": 0.6803542673107891, "grad_norm": 11.570616722106934, "learning_rate": 2.449359815526139e-07, "loss": 0.3427, "step": 11830 }, { "epoch": 0.6809293765815505, "grad_norm": 9.393718719482422, "learning_rate": 2.441353812592202e-07, "loss": 0.3438, "step": 11840 }, { "epoch": 0.6815044858523119, "grad_norm": 10.474637031555176, "learning_rate": 2.4333566870556946e-07, "loss": 0.3665, "step": 11850 }, { "epoch": 0.6820795951230734, "grad_norm": 10.463462829589844, "learning_rate": 2.4253684666631894e-07, "loss": 0.3862, "step": 11860 }, { "epoch": 0.6826547043938348, "grad_norm": 9.14112377166748, "learning_rate": 2.417389179130362e-07, "loss": 0.3357, "step": 11870 }, { "epoch": 0.6832298136645962, "grad_norm": 10.079310417175293, "learning_rate": 2.4094188521418927e-07, "loss": 0.357, "step": 11880 }, { "epoch": 0.6838049229353578, "grad_norm": 12.55269718170166, "learning_rate": 2.4014575133513783e-07, "loss": 0.3587, "step": 11890 }, { "epoch": 0.6843800322061192, "grad_norm": 10.12232494354248, "learning_rate": 2.393505190381224e-07, "loss": 0.3882, "step": 11900 }, { "epoch": 0.6849551414768806, "grad_norm": 8.600556373596191, "learning_rate": 2.3855619108225573e-07, "loss": 0.3597, "step": 11910 }, { "epoch": 0.6855302507476421, "grad_norm": 11.575599670410156, "learning_rate": 2.3776277022351288e-07, "loss": 0.3537, "step": 11920 }, { "epoch": 0.6861053600184035, "grad_norm": 14.244400978088379, "learning_rate": 2.3697025921472131e-07, "loss": 0.3677, "step": 11930 }, { "epoch": 0.6866804692891649, "grad_norm": 10.260430335998535, "learning_rate": 2.361786608055525e-07, "loss": 0.3713, "step": 11940 }, { "epoch": 0.6872555785599264, "grad_norm": 11.146723747253418, "learning_rate": 2.3538797774251084e-07, "loss": 0.3774, "step": 11950 }, { "epoch": 0.6878306878306878, "grad_norm": 8.774675369262695, "learning_rate": 2.3459821276892523e-07, "loss": 0.3589, "step": 11960 }, { "epoch": 0.6884057971014492, "grad_norm": 9.59105396270752, "learning_rate": 2.33809368624939e-07, "loss": 0.3767, "step": 11970 }, { "epoch": 0.6889809063722108, "grad_norm": 10.355560302734375, "learning_rate": 2.3302144804750072e-07, "loss": 0.3837, "step": 11980 }, { "epoch": 0.6895560156429722, "grad_norm": 9.18564224243164, "learning_rate": 2.3223445377035467e-07, "loss": 0.3515, "step": 11990 }, { "epoch": 0.6901311249137336, "grad_norm": 8.38479232788086, "learning_rate": 2.3144838852403104e-07, "loss": 0.3433, "step": 12000 }, { "epoch": 0.6907062341844951, "grad_norm": 10.972187995910645, "learning_rate": 2.3066325503583688e-07, "loss": 0.3448, "step": 12010 }, { "epoch": 0.6912813434552565, "grad_norm": 10.162338256835938, "learning_rate": 2.2987905602984635e-07, "loss": 0.4152, "step": 12020 }, { "epoch": 0.6918564527260179, "grad_norm": 10.592615127563477, "learning_rate": 2.290957942268912e-07, "loss": 0.3565, "step": 12030 }, { "epoch": 0.6924315619967794, "grad_norm": 7.207513809204102, "learning_rate": 2.2831347234455194e-07, "loss": 0.348, "step": 12040 }, { "epoch": 0.6930066712675408, "grad_norm": 10.998027801513672, "learning_rate": 2.2753209309714766e-07, "loss": 0.3711, "step": 12050 }, { "epoch": 0.6935817805383023, "grad_norm": 10.63157844543457, "learning_rate": 2.267516591957268e-07, "loss": 0.364, "step": 12060 }, { "epoch": 0.6941568898090638, "grad_norm": 10.574440956115723, "learning_rate": 2.2597217334805812e-07, "loss": 0.362, "step": 12070 }, { "epoch": 0.6947319990798252, "grad_norm": 9.121818542480469, "learning_rate": 2.251936382586207e-07, "loss": 0.3646, "step": 12080 }, { "epoch": 0.6953071083505866, "grad_norm": 10.715917587280273, "learning_rate": 2.2441605662859547e-07, "loss": 0.3582, "step": 12090 }, { "epoch": 0.695882217621348, "grad_norm": 13.156806945800781, "learning_rate": 2.2363943115585476e-07, "loss": 0.3528, "step": 12100 }, { "epoch": 0.6964573268921095, "grad_norm": 9.297057151794434, "learning_rate": 2.2286376453495366e-07, "loss": 0.3651, "step": 12110 }, { "epoch": 0.6970324361628709, "grad_norm": 9.173491477966309, "learning_rate": 2.2208905945712026e-07, "loss": 0.3273, "step": 12120 }, { "epoch": 0.6976075454336323, "grad_norm": 10.527074813842773, "learning_rate": 2.2131531861024678e-07, "loss": 0.3649, "step": 12130 }, { "epoch": 0.6981826547043939, "grad_norm": 9.947880744934082, "learning_rate": 2.2054254467887979e-07, "loss": 0.3557, "step": 12140 }, { "epoch": 0.6987577639751553, "grad_norm": 10.328466415405273, "learning_rate": 2.1977074034421121e-07, "loss": 0.3556, "step": 12150 }, { "epoch": 0.6993328732459168, "grad_norm": 13.296133995056152, "learning_rate": 2.1899990828406872e-07, "loss": 0.3816, "step": 12160 }, { "epoch": 0.6999079825166782, "grad_norm": 13.18120002746582, "learning_rate": 2.1823005117290684e-07, "loss": 0.3768, "step": 12170 }, { "epoch": 0.7004830917874396, "grad_norm": 7.442436695098877, "learning_rate": 2.1746117168179702e-07, "loss": 0.3485, "step": 12180 }, { "epoch": 0.701058201058201, "grad_norm": 11.364765167236328, "learning_rate": 2.1669327247841946e-07, "loss": 0.3701, "step": 12190 }, { "epoch": 0.7016333103289625, "grad_norm": 8.353933334350586, "learning_rate": 2.1592635622705268e-07, "loss": 0.3855, "step": 12200 }, { "epoch": 0.7022084195997239, "grad_norm": 9.093218803405762, "learning_rate": 2.1516042558856495e-07, "loss": 0.3554, "step": 12210 }, { "epoch": 0.7027835288704853, "grad_norm": 10.258438110351562, "learning_rate": 2.1439548322040474e-07, "loss": 0.3593, "step": 12220 }, { "epoch": 0.7033586381412469, "grad_norm": 11.912676811218262, "learning_rate": 2.1363153177659166e-07, "loss": 0.3937, "step": 12230 }, { "epoch": 0.7039337474120083, "grad_norm": 11.807969093322754, "learning_rate": 2.1286857390770768e-07, "loss": 0.3692, "step": 12240 }, { "epoch": 0.7045088566827697, "grad_norm": 10.06244945526123, "learning_rate": 2.12106612260887e-07, "loss": 0.3483, "step": 12250 }, { "epoch": 0.7050839659535312, "grad_norm": 12.934144020080566, "learning_rate": 2.1134564947980744e-07, "loss": 0.357, "step": 12260 }, { "epoch": 0.7056590752242926, "grad_norm": 11.410298347473145, "learning_rate": 2.1058568820468131e-07, "loss": 0.3683, "step": 12270 }, { "epoch": 0.706234184495054, "grad_norm": 10.504271507263184, "learning_rate": 2.0982673107224613e-07, "loss": 0.3518, "step": 12280 }, { "epoch": 0.7068092937658155, "grad_norm": 9.841277122497559, "learning_rate": 2.0906878071575533e-07, "loss": 0.3616, "step": 12290 }, { "epoch": 0.7073844030365769, "grad_norm": 10.716776847839355, "learning_rate": 2.0831183976496942e-07, "loss": 0.3562, "step": 12300 }, { "epoch": 0.7079595123073384, "grad_norm": 12.302347183227539, "learning_rate": 2.0755591084614666e-07, "loss": 0.3877, "step": 12310 }, { "epoch": 0.7085346215780999, "grad_norm": 10.450098037719727, "learning_rate": 2.0680099658203386e-07, "loss": 0.3689, "step": 12320 }, { "epoch": 0.7091097308488613, "grad_norm": 10.227410316467285, "learning_rate": 2.0604709959185756e-07, "loss": 0.3459, "step": 12330 }, { "epoch": 0.7096848401196227, "grad_norm": 9.508758544921875, "learning_rate": 2.0529422249131495e-07, "loss": 0.3817, "step": 12340 }, { "epoch": 0.7102599493903842, "grad_norm": 8.112154960632324, "learning_rate": 2.045423678925644e-07, "loss": 0.3648, "step": 12350 }, { "epoch": 0.7108350586611456, "grad_norm": 10.053050994873047, "learning_rate": 2.037915384042167e-07, "loss": 0.3842, "step": 12360 }, { "epoch": 0.711410167931907, "grad_norm": 7.712599754333496, "learning_rate": 2.0304173663132594e-07, "loss": 0.3882, "step": 12370 }, { "epoch": 0.7119852772026685, "grad_norm": 8.629646301269531, "learning_rate": 2.0229296517538037e-07, "loss": 0.3436, "step": 12380 }, { "epoch": 0.7125603864734299, "grad_norm": 8.83506965637207, "learning_rate": 2.0154522663429384e-07, "loss": 0.3381, "step": 12390 }, { "epoch": 0.7131354957441914, "grad_norm": 13.242164611816406, "learning_rate": 2.007985236023962e-07, "loss": 0.3503, "step": 12400 }, { "epoch": 0.7137106050149529, "grad_norm": 11.10744857788086, "learning_rate": 2.0005285867042442e-07, "loss": 0.3874, "step": 12410 }, { "epoch": 0.7142857142857143, "grad_norm": 9.249998092651367, "learning_rate": 1.993082344255139e-07, "loss": 0.3332, "step": 12420 }, { "epoch": 0.7148608235564757, "grad_norm": 11.591361045837402, "learning_rate": 1.9856465345118922e-07, "loss": 0.3933, "step": 12430 }, { "epoch": 0.7154359328272372, "grad_norm": 17.66505241394043, "learning_rate": 1.978221183273553e-07, "loss": 0.3666, "step": 12440 }, { "epoch": 0.7160110420979986, "grad_norm": 17.603321075439453, "learning_rate": 1.970806316302883e-07, "loss": 0.3587, "step": 12450 }, { "epoch": 0.71658615136876, "grad_norm": 14.320380210876465, "learning_rate": 1.9634019593262697e-07, "loss": 0.3352, "step": 12460 }, { "epoch": 0.7171612606395215, "grad_norm": 12.437124252319336, "learning_rate": 1.956008138033634e-07, "loss": 0.3475, "step": 12470 }, { "epoch": 0.717736369910283, "grad_norm": 11.1328706741333, "learning_rate": 1.9486248780783415e-07, "loss": 0.3735, "step": 12480 }, { "epoch": 0.7183114791810444, "grad_norm": 11.849899291992188, "learning_rate": 1.9412522050771203e-07, "loss": 0.3391, "step": 12490 }, { "epoch": 0.7188865884518059, "grad_norm": 22.29025650024414, "learning_rate": 1.9338901446099586e-07, "loss": 0.3136, "step": 12500 }, { "epoch": 0.7194616977225673, "grad_norm": 11.59174633026123, "learning_rate": 1.926538722220029e-07, "loss": 0.3662, "step": 12510 }, { "epoch": 0.7200368069933287, "grad_norm": 9.499114990234375, "learning_rate": 1.9191979634135924e-07, "loss": 0.3746, "step": 12520 }, { "epoch": 0.7206119162640902, "grad_norm": 9.597895622253418, "learning_rate": 1.9118678936599103e-07, "loss": 0.3802, "step": 12530 }, { "epoch": 0.7211870255348516, "grad_norm": 13.12649154663086, "learning_rate": 1.9045485383911625e-07, "loss": 0.3831, "step": 12540 }, { "epoch": 0.721762134805613, "grad_norm": 12.615784645080566, "learning_rate": 1.89723992300235e-07, "loss": 0.3615, "step": 12550 }, { "epoch": 0.7223372440763746, "grad_norm": 10.053581237792969, "learning_rate": 1.8899420728512123e-07, "loss": 0.3792, "step": 12560 }, { "epoch": 0.722912353347136, "grad_norm": 10.481292724609375, "learning_rate": 1.882655013258139e-07, "loss": 0.3599, "step": 12570 }, { "epoch": 0.7234874626178974, "grad_norm": 10.406064987182617, "learning_rate": 1.8753787695060768e-07, "loss": 0.3583, "step": 12580 }, { "epoch": 0.7240625718886589, "grad_norm": 12.189239501953125, "learning_rate": 1.8681133668404535e-07, "loss": 0.3435, "step": 12590 }, { "epoch": 0.7246376811594203, "grad_norm": 10.19714641571045, "learning_rate": 1.8608588304690775e-07, "loss": 0.3407, "step": 12600 }, { "epoch": 0.7252127904301817, "grad_norm": 7.726068019866943, "learning_rate": 1.853615185562058e-07, "loss": 0.3641, "step": 12610 }, { "epoch": 0.7257878997009432, "grad_norm": 15.442294120788574, "learning_rate": 1.8463824572517146e-07, "loss": 0.3521, "step": 12620 }, { "epoch": 0.7263630089717046, "grad_norm": 17.015697479248047, "learning_rate": 1.8391606706324898e-07, "loss": 0.3971, "step": 12630 }, { "epoch": 0.726938118242466, "grad_norm": 11.070687294006348, "learning_rate": 1.8319498507608676e-07, "loss": 0.3366, "step": 12640 }, { "epoch": 0.7275132275132276, "grad_norm": 8.336080551147461, "learning_rate": 1.8247500226552787e-07, "loss": 0.3661, "step": 12650 }, { "epoch": 0.728088336783989, "grad_norm": 12.03930377960205, "learning_rate": 1.8175612112960164e-07, "loss": 0.3669, "step": 12660 }, { "epoch": 0.7286634460547504, "grad_norm": 8.635086059570312, "learning_rate": 1.810383441625153e-07, "loss": 0.3372, "step": 12670 }, { "epoch": 0.7292385553255119, "grad_norm": 7.580562114715576, "learning_rate": 1.8032167385464475e-07, "loss": 0.3627, "step": 12680 }, { "epoch": 0.7298136645962733, "grad_norm": 10.577232360839844, "learning_rate": 1.7960611269252684e-07, "loss": 0.3664, "step": 12690 }, { "epoch": 0.7303887738670347, "grad_norm": 12.720074653625488, "learning_rate": 1.7889166315884973e-07, "loss": 0.3567, "step": 12700 }, { "epoch": 0.7309638831377961, "grad_norm": 9.77199935913086, "learning_rate": 1.7817832773244484e-07, "loss": 0.3734, "step": 12710 }, { "epoch": 0.7315389924085576, "grad_norm": 11.00805377960205, "learning_rate": 1.7746610888827784e-07, "loss": 0.361, "step": 12720 }, { "epoch": 0.7321141016793191, "grad_norm": 9.939645767211914, "learning_rate": 1.7675500909744056e-07, "loss": 0.3494, "step": 12730 }, { "epoch": 0.7326892109500805, "grad_norm": 13.502605438232422, "learning_rate": 1.760450308271425e-07, "loss": 0.348, "step": 12740 }, { "epoch": 0.733264320220842, "grad_norm": 7.523022651672363, "learning_rate": 1.753361765407016e-07, "loss": 0.3513, "step": 12750 }, { "epoch": 0.7338394294916034, "grad_norm": 10.321375846862793, "learning_rate": 1.74628448697536e-07, "loss": 0.366, "step": 12760 }, { "epoch": 0.7344145387623648, "grad_norm": 8.382083892822266, "learning_rate": 1.7392184975315588e-07, "loss": 0.3481, "step": 12770 }, { "epoch": 0.7349896480331263, "grad_norm": 13.211483001708984, "learning_rate": 1.7321638215915425e-07, "loss": 0.3752, "step": 12780 }, { "epoch": 0.7355647573038877, "grad_norm": 10.795145034790039, "learning_rate": 1.7251204836319933e-07, "loss": 0.3496, "step": 12790 }, { "epoch": 0.7361398665746491, "grad_norm": 8.237744331359863, "learning_rate": 1.7180885080902508e-07, "loss": 0.3447, "step": 12800 }, { "epoch": 0.7367149758454107, "grad_norm": 8.555767059326172, "learning_rate": 1.711067919364234e-07, "loss": 0.3724, "step": 12810 }, { "epoch": 0.7372900851161721, "grad_norm": 9.814056396484375, "learning_rate": 1.7040587418123542e-07, "loss": 0.3392, "step": 12820 }, { "epoch": 0.7378651943869335, "grad_norm": 9.928976058959961, "learning_rate": 1.6970609997534291e-07, "loss": 0.3355, "step": 12830 }, { "epoch": 0.738440303657695, "grad_norm": 9.701496124267578, "learning_rate": 1.6900747174666075e-07, "loss": 0.345, "step": 12840 }, { "epoch": 0.7390154129284564, "grad_norm": 10.921716690063477, "learning_rate": 1.6830999191912664e-07, "loss": 0.337, "step": 12850 }, { "epoch": 0.7395905221992178, "grad_norm": 10.669507026672363, "learning_rate": 1.6761366291269462e-07, "loss": 0.3831, "step": 12860 }, { "epoch": 0.7401656314699793, "grad_norm": 10.588301658630371, "learning_rate": 1.6691848714332563e-07, "loss": 0.3706, "step": 12870 }, { "epoch": 0.7407407407407407, "grad_norm": 11.753715515136719, "learning_rate": 1.662244670229793e-07, "loss": 0.3701, "step": 12880 }, { "epoch": 0.7413158500115021, "grad_norm": 14.984206199645996, "learning_rate": 1.6553160495960606e-07, "loss": 0.3632, "step": 12890 }, { "epoch": 0.7418909592822637, "grad_norm": 10.777162551879883, "learning_rate": 1.6483990335713792e-07, "loss": 0.3463, "step": 12900 }, { "epoch": 0.7424660685530251, "grad_norm": 8.32373332977295, "learning_rate": 1.641493646154808e-07, "loss": 0.3285, "step": 12910 }, { "epoch": 0.7430411778237865, "grad_norm": 10.207345962524414, "learning_rate": 1.6345999113050584e-07, "loss": 0.3671, "step": 12920 }, { "epoch": 0.743616287094548, "grad_norm": 10.594234466552734, "learning_rate": 1.6277178529404133e-07, "loss": 0.3574, "step": 12930 }, { "epoch": 0.7441913963653094, "grad_norm": 9.562149047851562, "learning_rate": 1.6208474949386457e-07, "loss": 0.3523, "step": 12940 }, { "epoch": 0.7447665056360708, "grad_norm": 10.264373779296875, "learning_rate": 1.6139888611369306e-07, "loss": 0.3414, "step": 12950 }, { "epoch": 0.7453416149068323, "grad_norm": 8.59309196472168, "learning_rate": 1.607141975331765e-07, "loss": 0.3547, "step": 12960 }, { "epoch": 0.7459167241775937, "grad_norm": 8.123299598693848, "learning_rate": 1.6003068612788862e-07, "loss": 0.3779, "step": 12970 }, { "epoch": 0.7464918334483552, "grad_norm": 9.963277816772461, "learning_rate": 1.593483542693188e-07, "loss": 0.3448, "step": 12980 }, { "epoch": 0.7470669427191167, "grad_norm": 10.955879211425781, "learning_rate": 1.5866720432486408e-07, "loss": 0.3739, "step": 12990 }, { "epoch": 0.7476420519898781, "grad_norm": 8.467711448669434, "learning_rate": 1.5798723865782054e-07, "loss": 0.3342, "step": 13000 }, { "epoch": 0.7482171612606395, "grad_norm": 11.882523536682129, "learning_rate": 1.5730845962737549e-07, "loss": 0.3764, "step": 13010 }, { "epoch": 0.748792270531401, "grad_norm": 11.149009704589844, "learning_rate": 1.56630869588599e-07, "loss": 0.3468, "step": 13020 }, { "epoch": 0.7493673798021624, "grad_norm": 8.852293968200684, "learning_rate": 1.5595447089243585e-07, "loss": 0.3743, "step": 13030 }, { "epoch": 0.7499424890729238, "grad_norm": 10.239322662353516, "learning_rate": 1.5527926588569768e-07, "loss": 0.355, "step": 13040 }, { "epoch": 0.7505175983436853, "grad_norm": 10.110032081604004, "learning_rate": 1.5460525691105414e-07, "loss": 0.3484, "step": 13050 }, { "epoch": 0.7510927076144468, "grad_norm": 10.117029190063477, "learning_rate": 1.539324463070254e-07, "loss": 0.3668, "step": 13060 }, { "epoch": 0.7516678168852082, "grad_norm": 7.320294380187988, "learning_rate": 1.532608364079737e-07, "loss": 0.3782, "step": 13070 }, { "epoch": 0.7522429261559697, "grad_norm": 10.726649284362793, "learning_rate": 1.5259042954409517e-07, "loss": 0.3804, "step": 13080 }, { "epoch": 0.7528180354267311, "grad_norm": 10.829776763916016, "learning_rate": 1.5192122804141256e-07, "loss": 0.3626, "step": 13090 }, { "epoch": 0.7533931446974925, "grad_norm": 11.959969520568848, "learning_rate": 1.512532342217659e-07, "loss": 0.3392, "step": 13100 }, { "epoch": 0.753968253968254, "grad_norm": 11.82127571105957, "learning_rate": 1.5058645040280531e-07, "loss": 0.3593, "step": 13110 }, { "epoch": 0.7545433632390154, "grad_norm": 14.269821166992188, "learning_rate": 1.499208788979827e-07, "loss": 0.3473, "step": 13120 }, { "epoch": 0.7551184725097768, "grad_norm": 8.863948822021484, "learning_rate": 1.492565220165438e-07, "loss": 0.3471, "step": 13130 }, { "epoch": 0.7556935817805382, "grad_norm": 9.357298851013184, "learning_rate": 1.485933820635202e-07, "loss": 0.3692, "step": 13140 }, { "epoch": 0.7562686910512998, "grad_norm": 11.794713020324707, "learning_rate": 1.4793146133972107e-07, "loss": 0.3688, "step": 13150 }, { "epoch": 0.7568438003220612, "grad_norm": 11.96875286102295, "learning_rate": 1.472707621417255e-07, "loss": 0.3677, "step": 13160 }, { "epoch": 0.7574189095928227, "grad_norm": 10.658202171325684, "learning_rate": 1.466112867618745e-07, "loss": 0.3598, "step": 13170 }, { "epoch": 0.7579940188635841, "grad_norm": 10.615762710571289, "learning_rate": 1.459530374882627e-07, "loss": 0.3574, "step": 13180 }, { "epoch": 0.7585691281343455, "grad_norm": 10.985566139221191, "learning_rate": 1.452960166047311e-07, "loss": 0.3726, "step": 13190 }, { "epoch": 0.759144237405107, "grad_norm": 9.561962127685547, "learning_rate": 1.4464022639085833e-07, "loss": 0.3508, "step": 13200 }, { "epoch": 0.7597193466758684, "grad_norm": 9.164432525634766, "learning_rate": 1.439856691219533e-07, "loss": 0.3477, "step": 13210 }, { "epoch": 0.7602944559466298, "grad_norm": 10.697032928466797, "learning_rate": 1.4333234706904702e-07, "loss": 0.3505, "step": 13220 }, { "epoch": 0.7608695652173914, "grad_norm": 10.152024269104004, "learning_rate": 1.4268026249888475e-07, "loss": 0.3862, "step": 13230 }, { "epoch": 0.7614446744881528, "grad_norm": 8.366179466247559, "learning_rate": 1.420294176739188e-07, "loss": 0.356, "step": 13240 }, { "epoch": 0.7620197837589142, "grad_norm": 12.960453987121582, "learning_rate": 1.4137981485229932e-07, "loss": 0.346, "step": 13250 }, { "epoch": 0.7625948930296756, "grad_norm": 11.22591781616211, "learning_rate": 1.407314562878678e-07, "loss": 0.3744, "step": 13260 }, { "epoch": 0.7631700023004371, "grad_norm": 8.118637084960938, "learning_rate": 1.4008434423014836e-07, "loss": 0.3282, "step": 13270 }, { "epoch": 0.7637451115711985, "grad_norm": 9.106740951538086, "learning_rate": 1.394384809243405e-07, "loss": 0.3479, "step": 13280 }, { "epoch": 0.7643202208419599, "grad_norm": 9.564493179321289, "learning_rate": 1.3879386861131092e-07, "loss": 0.3781, "step": 13290 }, { "epoch": 0.7648953301127214, "grad_norm": 10.076489448547363, "learning_rate": 1.3815050952758611e-07, "loss": 0.3618, "step": 13300 }, { "epoch": 0.7654704393834829, "grad_norm": 12.20744514465332, "learning_rate": 1.3750840590534413e-07, "loss": 0.3589, "step": 13310 }, { "epoch": 0.7660455486542443, "grad_norm": 12.875628471374512, "learning_rate": 1.368675599724074e-07, "loss": 0.3662, "step": 13320 }, { "epoch": 0.7666206579250058, "grad_norm": 10.139248847961426, "learning_rate": 1.362279739522344e-07, "loss": 0.3469, "step": 13330 }, { "epoch": 0.7671957671957672, "grad_norm": 12.559904098510742, "learning_rate": 1.3558965006391275e-07, "loss": 0.3187, "step": 13340 }, { "epoch": 0.7677708764665286, "grad_norm": 13.162323951721191, "learning_rate": 1.3495259052215057e-07, "loss": 0.3633, "step": 13350 }, { "epoch": 0.7683459857372901, "grad_norm": 8.369441986083984, "learning_rate": 1.3431679753726937e-07, "loss": 0.3657, "step": 13360 }, { "epoch": 0.7689210950080515, "grad_norm": 11.537557601928711, "learning_rate": 1.3368227331519626e-07, "loss": 0.3755, "step": 13370 }, { "epoch": 0.7694962042788129, "grad_norm": 11.373461723327637, "learning_rate": 1.3304902005745616e-07, "loss": 0.3495, "step": 13380 }, { "epoch": 0.7700713135495744, "grad_norm": 10.185256004333496, "learning_rate": 1.324170399611647e-07, "loss": 0.3542, "step": 13390 }, { "epoch": 0.7706464228203359, "grad_norm": 10.871580123901367, "learning_rate": 1.3178633521901983e-07, "loss": 0.3461, "step": 13400 }, { "epoch": 0.7712215320910973, "grad_norm": 11.034517288208008, "learning_rate": 1.3115690801929464e-07, "loss": 0.3556, "step": 13410 }, { "epoch": 0.7717966413618588, "grad_norm": 12.19965934753418, "learning_rate": 1.3052876054582967e-07, "loss": 0.3627, "step": 13420 }, { "epoch": 0.7723717506326202, "grad_norm": 9.713489532470703, "learning_rate": 1.2990189497802551e-07, "loss": 0.3573, "step": 13430 }, { "epoch": 0.7729468599033816, "grad_norm": 8.96914005279541, "learning_rate": 1.292763134908349e-07, "loss": 0.3606, "step": 13440 }, { "epoch": 0.7735219691741431, "grad_norm": 11.563376426696777, "learning_rate": 1.2865201825475553e-07, "loss": 0.3685, "step": 13450 }, { "epoch": 0.7740970784449045, "grad_norm": 12.596309661865234, "learning_rate": 1.2802901143582228e-07, "loss": 0.371, "step": 13460 }, { "epoch": 0.7746721877156659, "grad_norm": 11.115567207336426, "learning_rate": 1.274072951955998e-07, "loss": 0.3341, "step": 13470 }, { "epoch": 0.7752472969864275, "grad_norm": 13.427979469299316, "learning_rate": 1.267868716911748e-07, "loss": 0.3454, "step": 13480 }, { "epoch": 0.7758224062571889, "grad_norm": 12.673726081848145, "learning_rate": 1.2616774307514928e-07, "loss": 0.3523, "step": 13490 }, { "epoch": 0.7763975155279503, "grad_norm": 8.59458065032959, "learning_rate": 1.25549911495632e-07, "loss": 0.3395, "step": 13500 }, { "epoch": 0.7769726247987118, "grad_norm": 14.911060333251953, "learning_rate": 1.249333790962318e-07, "loss": 0.3342, "step": 13510 }, { "epoch": 0.7775477340694732, "grad_norm": 11.930248260498047, "learning_rate": 1.243181480160499e-07, "loss": 0.3753, "step": 13520 }, { "epoch": 0.7781228433402346, "grad_norm": 10.171967506408691, "learning_rate": 1.2370422038967228e-07, "loss": 0.3656, "step": 13530 }, { "epoch": 0.7786979526109961, "grad_norm": 12.881772994995117, "learning_rate": 1.23091598347163e-07, "loss": 0.3711, "step": 13540 }, { "epoch": 0.7792730618817575, "grad_norm": 8.411242485046387, "learning_rate": 1.2248028401405592e-07, "loss": 0.3418, "step": 13550 }, { "epoch": 0.779848171152519, "grad_norm": 11.161796569824219, "learning_rate": 1.2187027951134775e-07, "loss": 0.3343, "step": 13560 }, { "epoch": 0.7804232804232805, "grad_norm": 11.658342361450195, "learning_rate": 1.2126158695549078e-07, "loss": 0.348, "step": 13570 }, { "epoch": 0.7809983896940419, "grad_norm": 11.38686466217041, "learning_rate": 1.2065420845838525e-07, "loss": 0.3513, "step": 13580 }, { "epoch": 0.7815734989648033, "grad_norm": 10.265167236328125, "learning_rate": 1.2004814612737236e-07, "loss": 0.3561, "step": 13590 }, { "epoch": 0.7821486082355648, "grad_norm": 12.847384452819824, "learning_rate": 1.1944340206522664e-07, "loss": 0.371, "step": 13600 }, { "epoch": 0.7827237175063262, "grad_norm": 14.280150413513184, "learning_rate": 1.1883997837014887e-07, "loss": 0.3495, "step": 13610 }, { "epoch": 0.7832988267770876, "grad_norm": 10.117447853088379, "learning_rate": 1.1823787713575873e-07, "loss": 0.3591, "step": 13620 }, { "epoch": 0.783873936047849, "grad_norm": 10.571895599365234, "learning_rate": 1.1763710045108732e-07, "loss": 0.3746, "step": 13630 }, { "epoch": 0.7844490453186105, "grad_norm": 9.489627838134766, "learning_rate": 1.1703765040057068e-07, "loss": 0.3709, "step": 13640 }, { "epoch": 0.785024154589372, "grad_norm": 9.369521141052246, "learning_rate": 1.1643952906404142e-07, "loss": 0.3652, "step": 13650 }, { "epoch": 0.7855992638601335, "grad_norm": 11.9126558303833, "learning_rate": 1.1584273851672238e-07, "loss": 0.3359, "step": 13660 }, { "epoch": 0.7861743731308949, "grad_norm": 14.555208206176758, "learning_rate": 1.1524728082921897e-07, "loss": 0.3612, "step": 13670 }, { "epoch": 0.7867494824016563, "grad_norm": 10.765843391418457, "learning_rate": 1.1465315806751218e-07, "loss": 0.3741, "step": 13680 }, { "epoch": 0.7873245916724178, "grad_norm": 10.353617668151855, "learning_rate": 1.1406037229295168e-07, "loss": 0.367, "step": 13690 }, { "epoch": 0.7878997009431792, "grad_norm": 9.79420280456543, "learning_rate": 1.1346892556224802e-07, "loss": 0.357, "step": 13700 }, { "epoch": 0.7884748102139406, "grad_norm": 10.044177055358887, "learning_rate": 1.1287881992746594e-07, "loss": 0.3431, "step": 13710 }, { "epoch": 0.789049919484702, "grad_norm": 11.224512100219727, "learning_rate": 1.1229005743601721e-07, "loss": 0.3586, "step": 13720 }, { "epoch": 0.7896250287554636, "grad_norm": 9.292570114135742, "learning_rate": 1.1170264013065345e-07, "loss": 0.3597, "step": 13730 }, { "epoch": 0.790200138026225, "grad_norm": 10.773053169250488, "learning_rate": 1.1111657004945906e-07, "loss": 0.3541, "step": 13740 }, { "epoch": 0.7907752472969864, "grad_norm": 11.246283531188965, "learning_rate": 1.1053184922584412e-07, "loss": 0.3454, "step": 13750 }, { "epoch": 0.7913503565677479, "grad_norm": 9.876521110534668, "learning_rate": 1.0994847968853743e-07, "loss": 0.3314, "step": 13760 }, { "epoch": 0.7919254658385093, "grad_norm": 9.813785552978516, "learning_rate": 1.0936646346157936e-07, "loss": 0.3563, "step": 13770 }, { "epoch": 0.7925005751092707, "grad_norm": 11.603730201721191, "learning_rate": 1.0878580256431474e-07, "loss": 0.3565, "step": 13780 }, { "epoch": 0.7930756843800322, "grad_norm": 8.297478675842285, "learning_rate": 1.0820649901138651e-07, "loss": 0.35, "step": 13790 }, { "epoch": 0.7936507936507936, "grad_norm": 11.70054817199707, "learning_rate": 1.0762855481272765e-07, "loss": 0.3307, "step": 13800 }, { "epoch": 0.7942259029215551, "grad_norm": 11.532483100891113, "learning_rate": 1.0705197197355492e-07, "loss": 0.3807, "step": 13810 }, { "epoch": 0.7948010121923166, "grad_norm": 10.842299461364746, "learning_rate": 1.0647675249436195e-07, "loss": 0.3684, "step": 13820 }, { "epoch": 0.795376121463078, "grad_norm": 12.1455078125, "learning_rate": 1.0590289837091166e-07, "loss": 0.3708, "step": 13830 }, { "epoch": 0.7959512307338394, "grad_norm": 9.389388084411621, "learning_rate": 1.0533041159423039e-07, "loss": 0.3517, "step": 13840 }, { "epoch": 0.7965263400046009, "grad_norm": 12.699463844299316, "learning_rate": 1.0475929415059997e-07, "loss": 0.3489, "step": 13850 }, { "epoch": 0.7971014492753623, "grad_norm": 12.861173629760742, "learning_rate": 1.0418954802155128e-07, "loss": 0.3741, "step": 13860 }, { "epoch": 0.7976765585461237, "grad_norm": 8.38845157623291, "learning_rate": 1.0362117518385733e-07, "loss": 0.3565, "step": 13870 }, { "epoch": 0.7982516678168852, "grad_norm": 10.280447959899902, "learning_rate": 1.0305417760952646e-07, "loss": 0.3801, "step": 13880 }, { "epoch": 0.7988267770876466, "grad_norm": 11.884799003601074, "learning_rate": 1.0248855726579548e-07, "loss": 0.3727, "step": 13890 }, { "epoch": 0.7994018863584081, "grad_norm": 12.978116035461426, "learning_rate": 1.0192431611512264e-07, "loss": 0.374, "step": 13900 }, { "epoch": 0.7999769956291696, "grad_norm": 9.984334945678711, "learning_rate": 1.0136145611518116e-07, "loss": 0.3448, "step": 13910 }, { "epoch": 0.800552104899931, "grad_norm": 12.050146102905273, "learning_rate": 1.007999792188522e-07, "loss": 0.3441, "step": 13920 }, { "epoch": 0.8011272141706924, "grad_norm": 8.787823677062988, "learning_rate": 1.0023988737421796e-07, "loss": 0.3443, "step": 13930 }, { "epoch": 0.8017023234414539, "grad_norm": 10.595130920410156, "learning_rate": 9.968118252455554e-08, "loss": 0.3745, "step": 13940 }, { "epoch": 0.8022774327122153, "grad_norm": 10.575966835021973, "learning_rate": 9.912386660832944e-08, "loss": 0.3641, "step": 13950 }, { "epoch": 0.8028525419829767, "grad_norm": 14.217774391174316, "learning_rate": 9.856794155918524e-08, "loss": 0.3654, "step": 13960 }, { "epoch": 0.8034276512537382, "grad_norm": 12.97602367401123, "learning_rate": 9.801340930594281e-08, "loss": 0.3808, "step": 13970 }, { "epoch": 0.8040027605244997, "grad_norm": 10.63708782196045, "learning_rate": 9.746027177258942e-08, "loss": 0.3645, "step": 13980 }, { "epoch": 0.8045778697952611, "grad_norm": 9.457315444946289, "learning_rate": 9.690853087827383e-08, "loss": 0.3457, "step": 13990 }, { "epoch": 0.8051529790660226, "grad_norm": 13.015606880187988, "learning_rate": 9.635818853729844e-08, "loss": 0.3344, "step": 14000 }, { "epoch": 0.805728088336784, "grad_norm": 9.467044830322266, "learning_rate": 9.58092466591136e-08, "loss": 0.3493, "step": 14010 }, { "epoch": 0.8063031976075454, "grad_norm": 13.132433891296387, "learning_rate": 9.52617071483106e-08, "loss": 0.3643, "step": 14020 }, { "epoch": 0.8068783068783069, "grad_norm": 11.035394668579102, "learning_rate": 9.471557190461515e-08, "loss": 0.367, "step": 14030 }, { "epoch": 0.8074534161490683, "grad_norm": 10.35888385772705, "learning_rate": 9.417084282288062e-08, "loss": 0.3837, "step": 14040 }, { "epoch": 0.8080285254198297, "grad_norm": 8.863741874694824, "learning_rate": 9.362752179308176e-08, "loss": 0.3292, "step": 14050 }, { "epoch": 0.8086036346905913, "grad_norm": 11.647882461547852, "learning_rate": 9.308561070030791e-08, "loss": 0.3186, "step": 14060 }, { "epoch": 0.8091787439613527, "grad_norm": 10.325666427612305, "learning_rate": 9.254511142475657e-08, "loss": 0.3594, "step": 14070 }, { "epoch": 0.8097538532321141, "grad_norm": 9.519370079040527, "learning_rate": 9.200602584172678e-08, "loss": 0.3546, "step": 14080 }, { "epoch": 0.8103289625028756, "grad_norm": 12.68906307220459, "learning_rate": 9.146835582161289e-08, "loss": 0.3824, "step": 14090 }, { "epoch": 0.810904071773637, "grad_norm": 13.472137451171875, "learning_rate": 9.093210322989758e-08, "loss": 0.347, "step": 14100 }, { "epoch": 0.8114791810443984, "grad_norm": 9.139920234680176, "learning_rate": 9.039726992714586e-08, "loss": 0.3684, "step": 14110 }, { "epoch": 0.8120542903151599, "grad_norm": 10.475013732910156, "learning_rate": 8.986385776899829e-08, "loss": 0.3514, "step": 14120 }, { "epoch": 0.8126293995859213, "grad_norm": 10.607888221740723, "learning_rate": 8.933186860616459e-08, "loss": 0.3648, "step": 14130 }, { "epoch": 0.8132045088566827, "grad_norm": 13.211078643798828, "learning_rate": 8.880130428441774e-08, "loss": 0.3614, "step": 14140 }, { "epoch": 0.8137796181274443, "grad_norm": 11.186147689819336, "learning_rate": 8.827216664458664e-08, "loss": 0.3437, "step": 14150 }, { "epoch": 0.8143547273982057, "grad_norm": 9.046735763549805, "learning_rate": 8.774445752255049e-08, "loss": 0.3755, "step": 14160 }, { "epoch": 0.8149298366689671, "grad_norm": 9.569860458374023, "learning_rate": 8.721817874923204e-08, "loss": 0.3648, "step": 14170 }, { "epoch": 0.8155049459397286, "grad_norm": 12.560746192932129, "learning_rate": 8.669333215059137e-08, "loss": 0.358, "step": 14180 }, { "epoch": 0.81608005521049, "grad_norm": 8.761438369750977, "learning_rate": 8.616991954761954e-08, "loss": 0.3305, "step": 14190 }, { "epoch": 0.8166551644812514, "grad_norm": 16.295719146728516, "learning_rate": 8.564794275633236e-08, "loss": 0.3454, "step": 14200 }, { "epoch": 0.8172302737520128, "grad_norm": 14.543996810913086, "learning_rate": 8.512740358776377e-08, "loss": 0.3481, "step": 14210 }, { "epoch": 0.8178053830227743, "grad_norm": 11.397529602050781, "learning_rate": 8.460830384795997e-08, "loss": 0.3558, "step": 14220 }, { "epoch": 0.8183804922935358, "grad_norm": 11.829632759094238, "learning_rate": 8.409064533797283e-08, "loss": 0.3737, "step": 14230 }, { "epoch": 0.8189556015642973, "grad_norm": 10.892115592956543, "learning_rate": 8.357442985385398e-08, "loss": 0.3523, "step": 14240 }, { "epoch": 0.8195307108350587, "grad_norm": 14.383431434631348, "learning_rate": 8.305965918664826e-08, "loss": 0.3464, "step": 14250 }, { "epoch": 0.8201058201058201, "grad_norm": 9.2858304977417, "learning_rate": 8.254633512238757e-08, "loss": 0.3192, "step": 14260 }, { "epoch": 0.8206809293765815, "grad_norm": 11.794058799743652, "learning_rate": 8.203445944208464e-08, "loss": 0.3314, "step": 14270 }, { "epoch": 0.821256038647343, "grad_norm": 11.258898735046387, "learning_rate": 8.152403392172708e-08, "loss": 0.3572, "step": 14280 }, { "epoch": 0.8218311479181044, "grad_norm": 12.50719165802002, "learning_rate": 8.101506033227118e-08, "loss": 0.3742, "step": 14290 }, { "epoch": 0.8224062571888658, "grad_norm": 10.249022483825684, "learning_rate": 8.050754043963543e-08, "loss": 0.3315, "step": 14300 }, { "epoch": 0.8229813664596274, "grad_norm": 10.835911750793457, "learning_rate": 8.000147600469476e-08, "loss": 0.3452, "step": 14310 }, { "epoch": 0.8235564757303888, "grad_norm": 10.978577613830566, "learning_rate": 7.949686878327427e-08, "loss": 0.3731, "step": 14320 }, { "epoch": 0.8241315850011502, "grad_norm": 10.263269424438477, "learning_rate": 7.899372052614278e-08, "loss": 0.3774, "step": 14330 }, { "epoch": 0.8247066942719117, "grad_norm": 10.701677322387695, "learning_rate": 7.849203297900792e-08, "loss": 0.3432, "step": 14340 }, { "epoch": 0.8252818035426731, "grad_norm": 8.649724960327148, "learning_rate": 7.799180788250858e-08, "loss": 0.3698, "step": 14350 }, { "epoch": 0.8258569128134345, "grad_norm": 11.309865951538086, "learning_rate": 7.749304697221004e-08, "loss": 0.3513, "step": 14360 }, { "epoch": 0.826432022084196, "grad_norm": 10.158900260925293, "learning_rate": 7.699575197859709e-08, "loss": 0.3418, "step": 14370 }, { "epoch": 0.8270071313549574, "grad_norm": 12.188514709472656, "learning_rate": 7.649992462706867e-08, "loss": 0.3589, "step": 14380 }, { "epoch": 0.8275822406257188, "grad_norm": 11.690703392028809, "learning_rate": 7.600556663793173e-08, "loss": 0.3448, "step": 14390 }, { "epoch": 0.8281573498964804, "grad_norm": 10.629937171936035, "learning_rate": 7.551267972639491e-08, "loss": 0.3533, "step": 14400 }, { "epoch": 0.8287324591672418, "grad_norm": 11.284601211547852, "learning_rate": 7.5021265602563e-08, "loss": 0.3547, "step": 14410 }, { "epoch": 0.8293075684380032, "grad_norm": 14.455087661743164, "learning_rate": 7.453132597143064e-08, "loss": 0.3663, "step": 14420 }, { "epoch": 0.8298826777087647, "grad_norm": 8.814629554748535, "learning_rate": 7.404286253287712e-08, "loss": 0.3452, "step": 14430 }, { "epoch": 0.8304577869795261, "grad_norm": 13.938016891479492, "learning_rate": 7.355587698165944e-08, "loss": 0.3575, "step": 14440 }, { "epoch": 0.8310328962502875, "grad_norm": 10.003166198730469, "learning_rate": 7.307037100740721e-08, "loss": 0.3817, "step": 14450 }, { "epoch": 0.831608005521049, "grad_norm": 11.926239013671875, "learning_rate": 7.258634629461662e-08, "loss": 0.3259, "step": 14460 }, { "epoch": 0.8321831147918104, "grad_norm": 8.915820121765137, "learning_rate": 7.210380452264414e-08, "loss": 0.3894, "step": 14470 }, { "epoch": 0.8327582240625719, "grad_norm": 10.52724838256836, "learning_rate": 7.162274736570162e-08, "loss": 0.351, "step": 14480 }, { "epoch": 0.8333333333333334, "grad_norm": 8.204668998718262, "learning_rate": 7.114317649284957e-08, "loss": 0.3259, "step": 14490 }, { "epoch": 0.8339084426040948, "grad_norm": 11.467281341552734, "learning_rate": 7.066509356799189e-08, "loss": 0.3591, "step": 14500 }, { "epoch": 0.8344835518748562, "grad_norm": 11.008315086364746, "learning_rate": 7.018850024986983e-08, "loss": 0.3627, "step": 14510 }, { "epoch": 0.8350586611456177, "grad_norm": 10.458906173706055, "learning_rate": 6.971339819205629e-08, "loss": 0.3861, "step": 14520 }, { "epoch": 0.8356337704163791, "grad_norm": 9.528473854064941, "learning_rate": 6.923978904295052e-08, "loss": 0.3682, "step": 14530 }, { "epoch": 0.8362088796871405, "grad_norm": 9.396112442016602, "learning_rate": 6.876767444577164e-08, "loss": 0.3316, "step": 14540 }, { "epoch": 0.836783988957902, "grad_norm": 11.007659912109375, "learning_rate": 6.829705603855346e-08, "loss": 0.3497, "step": 14550 }, { "epoch": 0.8373590982286635, "grad_norm": 7.507017135620117, "learning_rate": 6.782793545413861e-08, "loss": 0.3596, "step": 14560 }, { "epoch": 0.8379342074994249, "grad_norm": 11.661662101745605, "learning_rate": 6.73603143201728e-08, "loss": 0.3482, "step": 14570 }, { "epoch": 0.8385093167701864, "grad_norm": 11.868661880493164, "learning_rate": 6.689419425909976e-08, "loss": 0.3509, "step": 14580 }, { "epoch": 0.8390844260409478, "grad_norm": 11.347305297851562, "learning_rate": 6.642957688815476e-08, "loss": 0.3475, "step": 14590 }, { "epoch": 0.8396595353117092, "grad_norm": 10.524690628051758, "learning_rate": 6.596646381935922e-08, "loss": 0.3419, "step": 14600 }, { "epoch": 0.8402346445824707, "grad_norm": 10.767640113830566, "learning_rate": 6.550485665951567e-08, "loss": 0.3447, "step": 14610 }, { "epoch": 0.8408097538532321, "grad_norm": 8.498409271240234, "learning_rate": 6.504475701020146e-08, "loss": 0.3407, "step": 14620 }, { "epoch": 0.8413848631239935, "grad_norm": 10.328351020812988, "learning_rate": 6.458616646776399e-08, "loss": 0.3339, "step": 14630 }, { "epoch": 0.841959972394755, "grad_norm": 11.048827171325684, "learning_rate": 6.412908662331423e-08, "loss": 0.359, "step": 14640 }, { "epoch": 0.8425350816655165, "grad_norm": 10.316198348999023, "learning_rate": 6.3673519062722e-08, "loss": 0.3595, "step": 14650 }, { "epoch": 0.8431101909362779, "grad_norm": 9.965385437011719, "learning_rate": 6.321946536660989e-08, "loss": 0.357, "step": 14660 }, { "epoch": 0.8436853002070394, "grad_norm": 11.32456111907959, "learning_rate": 6.276692711034809e-08, "loss": 0.3578, "step": 14670 }, { "epoch": 0.8442604094778008, "grad_norm": 10.42683219909668, "learning_rate": 6.231590586404916e-08, "loss": 0.3459, "step": 14680 }, { "epoch": 0.8448355187485622, "grad_norm": 10.229205131530762, "learning_rate": 6.186640319256198e-08, "loss": 0.3191, "step": 14690 }, { "epoch": 0.8454106280193237, "grad_norm": 13.010375022888184, "learning_rate": 6.141842065546671e-08, "loss": 0.3719, "step": 14700 }, { "epoch": 0.8459857372900851, "grad_norm": 16.174335479736328, "learning_rate": 6.097195980706932e-08, "loss": 0.3693, "step": 14710 }, { "epoch": 0.8465608465608465, "grad_norm": 9.866643905639648, "learning_rate": 6.052702219639605e-08, "loss": 0.3614, "step": 14720 }, { "epoch": 0.8471359558316081, "grad_norm": 8.864749908447266, "learning_rate": 6.008360936718859e-08, "loss": 0.3769, "step": 14730 }, { "epoch": 0.8477110651023695, "grad_norm": 7.970646858215332, "learning_rate": 5.96417228578977e-08, "loss": 0.3592, "step": 14740 }, { "epoch": 0.8482861743731309, "grad_norm": 10.511832237243652, "learning_rate": 5.9201364201678874e-08, "loss": 0.344, "step": 14750 }, { "epoch": 0.8488612836438924, "grad_norm": 9.681062698364258, "learning_rate": 5.876253492638644e-08, "loss": 0.3641, "step": 14760 }, { "epoch": 0.8494363929146538, "grad_norm": 11.771842002868652, "learning_rate": 5.832523655456845e-08, "loss": 0.3342, "step": 14770 }, { "epoch": 0.8500115021854152, "grad_norm": 10.87653923034668, "learning_rate": 5.7889470603461557e-08, "loss": 0.3571, "step": 14780 }, { "epoch": 0.8505866114561766, "grad_norm": 13.32516098022461, "learning_rate": 5.745523858498541e-08, "loss": 0.3561, "step": 14790 }, { "epoch": 0.8511617207269381, "grad_norm": 9.936367988586426, "learning_rate": 5.702254200573764e-08, "loss": 0.3597, "step": 14800 }, { "epoch": 0.8517368299976996, "grad_norm": 14.262615203857422, "learning_rate": 5.659138236698846e-08, "loss": 0.3613, "step": 14810 }, { "epoch": 0.852311939268461, "grad_norm": 8.628487586975098, "learning_rate": 5.6161761164675523e-08, "loss": 0.3611, "step": 14820 }, { "epoch": 0.8528870485392225, "grad_norm": 12.990911483764648, "learning_rate": 5.57336798893992e-08, "loss": 0.3651, "step": 14830 }, { "epoch": 0.8534621578099839, "grad_norm": 14.7277250289917, "learning_rate": 5.5307140026416455e-08, "loss": 0.3435, "step": 14840 }, { "epoch": 0.8540372670807453, "grad_norm": 10.506429672241211, "learning_rate": 5.488214305563649e-08, "loss": 0.355, "step": 14850 }, { "epoch": 0.8546123763515068, "grad_norm": 9.288267135620117, "learning_rate": 5.445869045161522e-08, "loss": 0.3641, "step": 14860 }, { "epoch": 0.8551874856222682, "grad_norm": 12.662288665771484, "learning_rate": 5.403678368355041e-08, "loss": 0.3172, "step": 14870 }, { "epoch": 0.8557625948930296, "grad_norm": 10.810647964477539, "learning_rate": 5.3616424215276294e-08, "loss": 0.3335, "step": 14880 }, { "epoch": 0.8563377041637911, "grad_norm": 9.307625770568848, "learning_rate": 5.3197613505258756e-08, "loss": 0.3354, "step": 14890 }, { "epoch": 0.8569128134345526, "grad_norm": 10.473979949951172, "learning_rate": 5.278035300659012e-08, "loss": 0.3573, "step": 14900 }, { "epoch": 0.857487922705314, "grad_norm": 10.22640323638916, "learning_rate": 5.236464416698411e-08, "loss": 0.3728, "step": 14910 }, { "epoch": 0.8580630319760755, "grad_norm": 12.327559471130371, "learning_rate": 5.195048842877081e-08, "loss": 0.3405, "step": 14920 }, { "epoch": 0.8586381412468369, "grad_norm": 9.54967975616455, "learning_rate": 5.153788722889202e-08, "loss": 0.3603, "step": 14930 }, { "epoch": 0.8592132505175983, "grad_norm": 10.21427059173584, "learning_rate": 5.1126841998895556e-08, "loss": 0.3577, "step": 14940 }, { "epoch": 0.8597883597883598, "grad_norm": 9.431503295898438, "learning_rate": 5.071735416493095e-08, "loss": 0.3589, "step": 14950 }, { "epoch": 0.8603634690591212, "grad_norm": 9.924721717834473, "learning_rate": 5.030942514774417e-08, "loss": 0.3325, "step": 14960 }, { "epoch": 0.8609385783298826, "grad_norm": 12.352200508117676, "learning_rate": 4.990305636267261e-08, "loss": 0.3876, "step": 14970 }, { "epoch": 0.8615136876006442, "grad_norm": 9.853137016296387, "learning_rate": 4.9498249219640687e-08, "loss": 0.3621, "step": 14980 }, { "epoch": 0.8620887968714056, "grad_norm": 11.632318496704102, "learning_rate": 4.9095005123154344e-08, "loss": 0.3725, "step": 14990 }, { "epoch": 0.862663906142167, "grad_norm": 12.55959415435791, "learning_rate": 4.869332547229643e-08, "loss": 0.3743, "step": 15000 }, { "epoch": 0.8632390154129285, "grad_norm": 11.447243690490723, "learning_rate": 4.829321166072187e-08, "loss": 0.3534, "step": 15010 }, { "epoch": 0.8638141246836899, "grad_norm": 12.634971618652344, "learning_rate": 4.789466507665285e-08, "loss": 0.3974, "step": 15020 }, { "epoch": 0.8643892339544513, "grad_norm": 10.0396146774292, "learning_rate": 4.749768710287394e-08, "loss": 0.3356, "step": 15030 }, { "epoch": 0.8649643432252128, "grad_norm": 11.967917442321777, "learning_rate": 4.710227911672721e-08, "loss": 0.3589, "step": 15040 }, { "epoch": 0.8655394524959742, "grad_norm": 8.657121658325195, "learning_rate": 4.670844249010775e-08, "loss": 0.3547, "step": 15050 }, { "epoch": 0.8661145617667356, "grad_norm": 9.126193046569824, "learning_rate": 4.6316178589458466e-08, "loss": 0.3472, "step": 15060 }, { "epoch": 0.8666896710374972, "grad_norm": 17.75835609436035, "learning_rate": 4.592548877576574e-08, "loss": 0.346, "step": 15070 }, { "epoch": 0.8672647803082586, "grad_norm": 9.86093807220459, "learning_rate": 4.55363744045546e-08, "loss": 0.3483, "step": 15080 }, { "epoch": 0.86783988957902, "grad_norm": 12.15440559387207, "learning_rate": 4.514883682588389e-08, "loss": 0.3634, "step": 15090 }, { "epoch": 0.8684149988497815, "grad_norm": 10.627328872680664, "learning_rate": 4.476287738434159e-08, "loss": 0.347, "step": 15100 }, { "epoch": 0.8689901081205429, "grad_norm": 10.921732902526855, "learning_rate": 4.437849741904037e-08, "loss": 0.3503, "step": 15110 }, { "epoch": 0.8695652173913043, "grad_norm": 10.761367797851562, "learning_rate": 4.39956982636126e-08, "loss": 0.3548, "step": 15120 }, { "epoch": 0.8701403266620658, "grad_norm": 10.053627014160156, "learning_rate": 4.36144812462062e-08, "loss": 0.3586, "step": 15130 }, { "epoch": 0.8707154359328272, "grad_norm": 10.254932403564453, "learning_rate": 4.32348476894796e-08, "loss": 0.3446, "step": 15140 }, { "epoch": 0.8712905452035887, "grad_norm": 12.502052307128906, "learning_rate": 4.285679891059729e-08, "loss": 0.3549, "step": 15150 }, { "epoch": 0.8718656544743502, "grad_norm": 9.361943244934082, "learning_rate": 4.248033622122527e-08, "loss": 0.3434, "step": 15160 }, { "epoch": 0.8724407637451116, "grad_norm": 10.865105628967285, "learning_rate": 4.210546092752648e-08, "loss": 0.3727, "step": 15170 }, { "epoch": 0.873015873015873, "grad_norm": 12.16160774230957, "learning_rate": 4.173217433015636e-08, "loss": 0.3316, "step": 15180 }, { "epoch": 0.8735909822866345, "grad_norm": 11.313312530517578, "learning_rate": 4.136047772425821e-08, "loss": 0.3612, "step": 15190 }, { "epoch": 0.8741660915573959, "grad_norm": 12.830582618713379, "learning_rate": 4.09903723994588e-08, "loss": 0.3397, "step": 15200 }, { "epoch": 0.8747412008281573, "grad_norm": 13.35426139831543, "learning_rate": 4.06218596398637e-08, "loss": 0.3535, "step": 15210 }, { "epoch": 0.8753163100989187, "grad_norm": 9.687232971191406, "learning_rate": 4.0254940724053e-08, "loss": 0.3287, "step": 15220 }, { "epoch": 0.8758914193696803, "grad_norm": 10.2943696975708, "learning_rate": 3.988961692507714e-08, "loss": 0.3166, "step": 15230 }, { "epoch": 0.8764665286404417, "grad_norm": 16.923683166503906, "learning_rate": 3.9525889510451773e-08, "loss": 0.3188, "step": 15240 }, { "epoch": 0.8770416379112032, "grad_norm": 11.177139282226562, "learning_rate": 3.916375974215402e-08, "loss": 0.327, "step": 15250 }, { "epoch": 0.8776167471819646, "grad_norm": 15.044921875, "learning_rate": 3.8803228876617796e-08, "loss": 0.3738, "step": 15260 }, { "epoch": 0.878191856452726, "grad_norm": 10.12452220916748, "learning_rate": 3.844429816472944e-08, "loss": 0.341, "step": 15270 }, { "epoch": 0.8787669657234874, "grad_norm": 13.836044311523438, "learning_rate": 3.808696885182372e-08, "loss": 0.3407, "step": 15280 }, { "epoch": 0.8793420749942489, "grad_norm": 17.48280143737793, "learning_rate": 3.7731242177678925e-08, "loss": 0.3561, "step": 15290 }, { "epoch": 0.8799171842650103, "grad_norm": 8.586662292480469, "learning_rate": 3.737711937651305e-08, "loss": 0.3502, "step": 15300 }, { "epoch": 0.8804922935357717, "grad_norm": 21.888261795043945, "learning_rate": 3.7024601676979295e-08, "loss": 0.4108, "step": 15310 }, { "epoch": 0.8810674028065333, "grad_norm": 9.716233253479004, "learning_rate": 3.6673690302161833e-08, "loss": 0.363, "step": 15320 }, { "epoch": 0.8816425120772947, "grad_norm": 16.969402313232422, "learning_rate": 3.6324386469571646e-08, "loss": 0.338, "step": 15330 }, { "epoch": 0.8822176213480561, "grad_norm": 9.240605354309082, "learning_rate": 3.5976691391142175e-08, "loss": 0.3372, "step": 15340 }, { "epoch": 0.8827927306188176, "grad_norm": 13.646862030029297, "learning_rate": 3.563060627322523e-08, "loss": 0.3581, "step": 15350 }, { "epoch": 0.883367839889579, "grad_norm": 11.374551773071289, "learning_rate": 3.528613231658673e-08, "loss": 0.3373, "step": 15360 }, { "epoch": 0.8839429491603404, "grad_norm": 10.868992805480957, "learning_rate": 3.4943270716402405e-08, "loss": 0.3491, "step": 15370 }, { "epoch": 0.8845180584311019, "grad_norm": 10.237171173095703, "learning_rate": 3.460202266225421e-08, "loss": 0.3553, "step": 15380 }, { "epoch": 0.8850931677018633, "grad_norm": 7.943317890167236, "learning_rate": 3.426238933812542e-08, "loss": 0.3401, "step": 15390 }, { "epoch": 0.8856682769726248, "grad_norm": 13.657381057739258, "learning_rate": 3.3924371922397e-08, "loss": 0.3559, "step": 15400 }, { "epoch": 0.8862433862433863, "grad_norm": 13.528092384338379, "learning_rate": 3.358797158784349e-08, "loss": 0.3464, "step": 15410 }, { "epoch": 0.8868184955141477, "grad_norm": 9.931370735168457, "learning_rate": 3.3253189501628586e-08, "loss": 0.345, "step": 15420 }, { "epoch": 0.8873936047849091, "grad_norm": 14.774806022644043, "learning_rate": 3.2920026825301875e-08, "loss": 0.3989, "step": 15430 }, { "epoch": 0.8879687140556706, "grad_norm": 11.493646621704102, "learning_rate": 3.258848471479381e-08, "loss": 0.379, "step": 15440 }, { "epoch": 0.888543823326432, "grad_norm": 10.184978485107422, "learning_rate": 3.225856432041235e-08, "loss": 0.3424, "step": 15450 }, { "epoch": 0.8891189325971934, "grad_norm": 10.041132926940918, "learning_rate": 3.1930266786838856e-08, "loss": 0.3524, "step": 15460 }, { "epoch": 0.8896940418679549, "grad_norm": 10.633180618286133, "learning_rate": 3.160359325312395e-08, "loss": 0.3468, "step": 15470 }, { "epoch": 0.8902691511387164, "grad_norm": 9.204377174377441, "learning_rate": 3.127854485268372e-08, "loss": 0.311, "step": 15480 }, { "epoch": 0.8908442604094778, "grad_norm": 15.211405754089355, "learning_rate": 3.095512271329587e-08, "loss": 0.3755, "step": 15490 }, { "epoch": 0.8914193696802393, "grad_norm": 9.80736255645752, "learning_rate": 3.0633327957095444e-08, "loss": 0.3782, "step": 15500 }, { "epoch": 0.8919944789510007, "grad_norm": 9.6182861328125, "learning_rate": 3.031316170057141e-08, "loss": 0.3494, "step": 15510 }, { "epoch": 0.8925695882217621, "grad_norm": 10.12015151977539, "learning_rate": 2.999462505456224e-08, "loss": 0.3578, "step": 15520 }, { "epoch": 0.8931446974925236, "grad_norm": 9.013107299804688, "learning_rate": 2.9677719124252764e-08, "loss": 0.3693, "step": 15530 }, { "epoch": 0.893719806763285, "grad_norm": 10.038949966430664, "learning_rate": 2.9362445009169566e-08, "loss": 0.3585, "step": 15540 }, { "epoch": 0.8942949160340464, "grad_norm": 10.157662391662598, "learning_rate": 2.9048803803177814e-08, "loss": 0.3667, "step": 15550 }, { "epoch": 0.8948700253048079, "grad_norm": 10.42690372467041, "learning_rate": 2.8736796594476974e-08, "loss": 0.3636, "step": 15560 }, { "epoch": 0.8954451345755694, "grad_norm": 9.941238403320312, "learning_rate": 2.8426424465597288e-08, "loss": 0.3803, "step": 15570 }, { "epoch": 0.8960202438463308, "grad_norm": 10.201858520507812, "learning_rate": 2.8117688493396186e-08, "loss": 0.3704, "step": 15580 }, { "epoch": 0.8965953531170923, "grad_norm": 8.473631858825684, "learning_rate": 2.7810589749054037e-08, "loss": 0.3163, "step": 15590 }, { "epoch": 0.8971704623878537, "grad_norm": 9.668301582336426, "learning_rate": 2.7505129298070916e-08, "loss": 0.3411, "step": 15600 }, { "epoch": 0.8977455716586151, "grad_norm": 13.118300437927246, "learning_rate": 2.7201308200262728e-08, "loss": 0.366, "step": 15610 }, { "epoch": 0.8983206809293766, "grad_norm": 14.1558837890625, "learning_rate": 2.6899127509757423e-08, "loss": 0.3637, "step": 15620 }, { "epoch": 0.898895790200138, "grad_norm": 11.401450157165527, "learning_rate": 2.6598588274991562e-08, "loss": 0.3874, "step": 15630 }, { "epoch": 0.8994708994708994, "grad_norm": 9.774895668029785, "learning_rate": 2.629969153870648e-08, "loss": 0.3458, "step": 15640 }, { "epoch": 0.900046008741661, "grad_norm": 13.037107467651367, "learning_rate": 2.60024383379448e-08, "loss": 0.366, "step": 15650 }, { "epoch": 0.9006211180124224, "grad_norm": 12.392276763916016, "learning_rate": 2.5706829704046806e-08, "loss": 0.3441, "step": 15660 }, { "epoch": 0.9011962272831838, "grad_norm": 14.193033218383789, "learning_rate": 2.5412866662646694e-08, "loss": 0.3607, "step": 15670 }, { "epoch": 0.9017713365539453, "grad_norm": 12.774453163146973, "learning_rate": 2.5120550233669437e-08, "loss": 0.3518, "step": 15680 }, { "epoch": 0.9023464458247067, "grad_norm": 10.698077201843262, "learning_rate": 2.482988143132675e-08, "loss": 0.3788, "step": 15690 }, { "epoch": 0.9029215550954681, "grad_norm": 10.370417594909668, "learning_rate": 2.4540861264113866e-08, "loss": 0.3667, "step": 15700 }, { "epoch": 0.9034966643662296, "grad_norm": 11.826659202575684, "learning_rate": 2.425349073480598e-08, "loss": 0.3285, "step": 15710 }, { "epoch": 0.904071773636991, "grad_norm": 10.145709037780762, "learning_rate": 2.3967770840454647e-08, "loss": 0.3552, "step": 15720 }, { "epoch": 0.9046468829077525, "grad_norm": 11.724169731140137, "learning_rate": 2.3683702572384668e-08, "loss": 0.3603, "step": 15730 }, { "epoch": 0.905221992178514, "grad_norm": 10.78227710723877, "learning_rate": 2.34012869161902e-08, "loss": 0.3349, "step": 15740 }, { "epoch": 0.9057971014492754, "grad_norm": 10.573351860046387, "learning_rate": 2.312052485173166e-08, "loss": 0.3419, "step": 15750 }, { "epoch": 0.9063722107200368, "grad_norm": 10.340923309326172, "learning_rate": 2.284141735313211e-08, "loss": 0.3692, "step": 15760 }, { "epoch": 0.9069473199907983, "grad_norm": 10.517963409423828, "learning_rate": 2.2563965388774085e-08, "loss": 0.3358, "step": 15770 }, { "epoch": 0.9075224292615597, "grad_norm": 9.152632713317871, "learning_rate": 2.228816992129612e-08, "loss": 0.341, "step": 15780 }, { "epoch": 0.9080975385323211, "grad_norm": 9.386467933654785, "learning_rate": 2.2014031907589382e-08, "loss": 0.3505, "step": 15790 }, { "epoch": 0.9086726478030825, "grad_norm": 10.698101997375488, "learning_rate": 2.174155229879432e-08, "loss": 0.3405, "step": 15800 }, { "epoch": 0.909247757073844, "grad_norm": 11.125164031982422, "learning_rate": 2.1470732040297647e-08, "loss": 0.3565, "step": 15810 }, { "epoch": 0.9098228663446055, "grad_norm": 12.168891906738281, "learning_rate": 2.120157207172857e-08, "loss": 0.3382, "step": 15820 }, { "epoch": 0.910397975615367, "grad_norm": 10.894423484802246, "learning_rate": 2.0934073326956125e-08, "loss": 0.3388, "step": 15830 }, { "epoch": 0.9109730848861284, "grad_norm": 10.521305084228516, "learning_rate": 2.0668236734085408e-08, "loss": 0.3423, "step": 15840 }, { "epoch": 0.9115481941568898, "grad_norm": 11.741790771484375, "learning_rate": 2.0404063215454515e-08, "loss": 0.3864, "step": 15850 }, { "epoch": 0.9121233034276512, "grad_norm": 10.856982231140137, "learning_rate": 2.014155368763154e-08, "loss": 0.3495, "step": 15860 }, { "epoch": 0.9126984126984127, "grad_norm": 10.141417503356934, "learning_rate": 1.988070906141115e-08, "loss": 0.3615, "step": 15870 }, { "epoch": 0.9132735219691741, "grad_norm": 12.382743835449219, "learning_rate": 1.9621530241811678e-08, "loss": 0.3413, "step": 15880 }, { "epoch": 0.9138486312399355, "grad_norm": 11.401932716369629, "learning_rate": 1.936401812807159e-08, "loss": 0.3672, "step": 15890 }, { "epoch": 0.9144237405106971, "grad_norm": 11.99639892578125, "learning_rate": 1.9108173613646806e-08, "loss": 0.3717, "step": 15900 }, { "epoch": 0.9149988497814585, "grad_norm": 12.137653350830078, "learning_rate": 1.885399758620726e-08, "loss": 0.3489, "step": 15910 }, { "epoch": 0.9155739590522199, "grad_norm": 11.457845687866211, "learning_rate": 1.860149092763402e-08, "loss": 0.3733, "step": 15920 }, { "epoch": 0.9161490683229814, "grad_norm": 11.016530990600586, "learning_rate": 1.835065451401624e-08, "loss": 0.3464, "step": 15930 }, { "epoch": 0.9167241775937428, "grad_norm": 10.79750919342041, "learning_rate": 1.8101489215647804e-08, "loss": 0.3678, "step": 15940 }, { "epoch": 0.9172992868645042, "grad_norm": 11.251911163330078, "learning_rate": 1.7853995897024852e-08, "loss": 0.3716, "step": 15950 }, { "epoch": 0.9178743961352657, "grad_norm": 11.416596412658691, "learning_rate": 1.7608175416842285e-08, "loss": 0.3306, "step": 15960 }, { "epoch": 0.9184495054060271, "grad_norm": 10.999648094177246, "learning_rate": 1.7364028627990913e-08, "loss": 0.341, "step": 15970 }, { "epoch": 0.9190246146767886, "grad_norm": 11.256757736206055, "learning_rate": 1.7121556377554858e-08, "loss": 0.3825, "step": 15980 }, { "epoch": 0.9195997239475501, "grad_norm": 13.413955688476562, "learning_rate": 1.6880759506808006e-08, "loss": 0.3708, "step": 15990 }, { "epoch": 0.9201748332183115, "grad_norm": 15.258024215698242, "learning_rate": 1.6641638851211557e-08, "loss": 0.3651, "step": 16000 }, { "epoch": 0.9207499424890729, "grad_norm": 13.095954895019531, "learning_rate": 1.640419524041092e-08, "loss": 0.3863, "step": 16010 }, { "epoch": 0.9213250517598344, "grad_norm": 11.403904914855957, "learning_rate": 1.6168429498232883e-08, "loss": 0.3528, "step": 16020 }, { "epoch": 0.9219001610305958, "grad_norm": 9.634064674377441, "learning_rate": 1.5934342442682823e-08, "loss": 0.353, "step": 16030 }, { "epoch": 0.9224752703013572, "grad_norm": 11.792839050292969, "learning_rate": 1.570193488594168e-08, "loss": 0.3281, "step": 16040 }, { "epoch": 0.9230503795721187, "grad_norm": 15.517866134643555, "learning_rate": 1.5471207634363382e-08, "loss": 0.3692, "step": 16050 }, { "epoch": 0.9236254888428801, "grad_norm": 11.871737480163574, "learning_rate": 1.5242161488471794e-08, "loss": 0.3585, "step": 16060 }, { "epoch": 0.9242005981136416, "grad_norm": 9.062506675720215, "learning_rate": 1.5014797242958177e-08, "loss": 0.351, "step": 16070 }, { "epoch": 0.9247757073844031, "grad_norm": 10.90822982788086, "learning_rate": 1.4789115686678289e-08, "loss": 0.3593, "step": 16080 }, { "epoch": 0.9253508166551645, "grad_norm": 15.502705574035645, "learning_rate": 1.4565117602649667e-08, "loss": 0.3626, "step": 16090 }, { "epoch": 0.9259259259259259, "grad_norm": 21.155424118041992, "learning_rate": 1.4342803768048973e-08, "loss": 0.3698, "step": 16100 }, { "epoch": 0.9265010351966874, "grad_norm": 11.442899703979492, "learning_rate": 1.4122174954209143e-08, "loss": 0.3255, "step": 16110 }, { "epoch": 0.9270761444674488, "grad_norm": 13.778807640075684, "learning_rate": 1.390323192661691e-08, "loss": 0.3369, "step": 16120 }, { "epoch": 0.9276512537382102, "grad_norm": 10.557825088500977, "learning_rate": 1.3685975444910069e-08, "loss": 0.3404, "step": 16130 }, { "epoch": 0.9282263630089717, "grad_norm": 10.77072811126709, "learning_rate": 1.3470406262874824e-08, "loss": 0.3541, "step": 16140 }, { "epoch": 0.9288014722797332, "grad_norm": 13.807185173034668, "learning_rate": 1.3256525128443108e-08, "loss": 0.3696, "step": 16150 }, { "epoch": 0.9293765815504946, "grad_norm": 9.097618103027344, "learning_rate": 1.3044332783690104e-08, "loss": 0.3638, "step": 16160 }, { "epoch": 0.9299516908212561, "grad_norm": 10.07507038116455, "learning_rate": 1.2833829964831622e-08, "loss": 0.3577, "step": 16170 }, { "epoch": 0.9305268000920175, "grad_norm": 13.105895042419434, "learning_rate": 1.2625017402221605e-08, "loss": 0.3738, "step": 16180 }, { "epoch": 0.9311019093627789, "grad_norm": 10.269083976745605, "learning_rate": 1.2417895820349411e-08, "loss": 0.3428, "step": 16190 }, { "epoch": 0.9316770186335404, "grad_norm": 13.134654998779297, "learning_rate": 1.221246593783748e-08, "loss": 0.3713, "step": 16200 }, { "epoch": 0.9322521279043018, "grad_norm": 10.30395221710205, "learning_rate": 1.2008728467438945e-08, "loss": 0.368, "step": 16210 }, { "epoch": 0.9328272371750632, "grad_norm": 11.278501510620117, "learning_rate": 1.1806684116034582e-08, "loss": 0.3496, "step": 16220 }, { "epoch": 0.9334023464458248, "grad_norm": 12.51138687133789, "learning_rate": 1.1606333584631256e-08, "loss": 0.3637, "step": 16230 }, { "epoch": 0.9339774557165862, "grad_norm": 9.823774337768555, "learning_rate": 1.1407677568358808e-08, "loss": 0.3555, "step": 16240 }, { "epoch": 0.9345525649873476, "grad_norm": 12.621105194091797, "learning_rate": 1.121071675646773e-08, "loss": 0.3602, "step": 16250 }, { "epoch": 0.935127674258109, "grad_norm": 9.792707443237305, "learning_rate": 1.1015451832326994e-08, "loss": 0.3205, "step": 16260 }, { "epoch": 0.9357027835288705, "grad_norm": 9.927570343017578, "learning_rate": 1.0821883473421612e-08, "loss": 0.3353, "step": 16270 }, { "epoch": 0.9362778927996319, "grad_norm": 11.706604957580566, "learning_rate": 1.0630012351350248e-08, "loss": 0.3342, "step": 16280 }, { "epoch": 0.9368530020703933, "grad_norm": 10.553886413574219, "learning_rate": 1.0439839131822781e-08, "loss": 0.3567, "step": 16290 }, { "epoch": 0.9374281113411548, "grad_norm": 9.487189292907715, "learning_rate": 1.0251364474658186e-08, "loss": 0.3705, "step": 16300 }, { "epoch": 0.9380032206119162, "grad_norm": 13.456291198730469, "learning_rate": 1.0064589033782156e-08, "loss": 0.3453, "step": 16310 }, { "epoch": 0.9385783298826778, "grad_norm": 12.77872085571289, "learning_rate": 9.879513457224709e-09, "loss": 0.3913, "step": 16320 }, { "epoch": 0.9391534391534392, "grad_norm": 12.238675117492676, "learning_rate": 9.696138387118302e-09, "loss": 0.3593, "step": 16330 }, { "epoch": 0.9397285484242006, "grad_norm": 9.848069190979004, "learning_rate": 9.514464459695281e-09, "loss": 0.354, "step": 16340 }, { "epoch": 0.940303657694962, "grad_norm": 9.282127380371094, "learning_rate": 9.334492305285657e-09, "loss": 0.35, "step": 16350 }, { "epoch": 0.9408787669657235, "grad_norm": 11.366129875183105, "learning_rate": 9.156222548315051e-09, "loss": 0.3521, "step": 16360 }, { "epoch": 0.9414538762364849, "grad_norm": 10.10111141204834, "learning_rate": 8.979655807302534e-09, "loss": 0.3609, "step": 16370 }, { "epoch": 0.9420289855072463, "grad_norm": 11.499642372131348, "learning_rate": 8.80479269485851e-09, "loss": 0.3376, "step": 16380 }, { "epoch": 0.9426040947780078, "grad_norm": 13.999999046325684, "learning_rate": 8.631633817682504e-09, "loss": 0.3478, "step": 16390 }, { "epoch": 0.9431792040487693, "grad_norm": 10.673562049865723, "learning_rate": 8.460179776561049e-09, "loss": 0.3585, "step": 16400 }, { "epoch": 0.9437543133195307, "grad_norm": 14.619071960449219, "learning_rate": 8.29043116636552e-09, "loss": 0.3154, "step": 16410 }, { "epoch": 0.9443294225902922, "grad_norm": 10.757208824157715, "learning_rate": 8.12238857605041e-09, "loss": 0.3442, "step": 16420 }, { "epoch": 0.9449045318610536, "grad_norm": 9.080318450927734, "learning_rate": 7.956052588650897e-09, "loss": 0.3055, "step": 16430 }, { "epoch": 0.945479641131815, "grad_norm": 9.728543281555176, "learning_rate": 7.791423781281003e-09, "loss": 0.3226, "step": 16440 }, { "epoch": 0.9460547504025765, "grad_norm": 12.932111740112305, "learning_rate": 7.628502725131714e-09, "loss": 0.3737, "step": 16450 }, { "epoch": 0.9466298596733379, "grad_norm": 12.188366889953613, "learning_rate": 7.467289985468584e-09, "loss": 0.3462, "step": 16460 }, { "epoch": 0.9472049689440993, "grad_norm": 10.762121200561523, "learning_rate": 7.307786121630355e-09, "loss": 0.3746, "step": 16470 }, { "epoch": 0.9477800782148609, "grad_norm": 8.90848445892334, "learning_rate": 7.1499916870265135e-09, "loss": 0.3645, "step": 16480 }, { "epoch": 0.9483551874856223, "grad_norm": 11.018730163574219, "learning_rate": 6.993907229135565e-09, "loss": 0.3405, "step": 16490 }, { "epoch": 0.9489302967563837, "grad_norm": 12.56182861328125, "learning_rate": 6.8395332895032076e-09, "loss": 0.3433, "step": 16500 }, { "epoch": 0.9495054060271452, "grad_norm": 13.92578411102295, "learning_rate": 6.68687040374033e-09, "loss": 0.3408, "step": 16510 }, { "epoch": 0.9500805152979066, "grad_norm": 13.65556526184082, "learning_rate": 6.535919101521237e-09, "loss": 0.3482, "step": 16520 }, { "epoch": 0.950655624568668, "grad_norm": 11.196197509765625, "learning_rate": 6.386679906581816e-09, "loss": 0.3581, "step": 16530 }, { "epoch": 0.9512307338394295, "grad_norm": 9.110787391662598, "learning_rate": 6.239153336717595e-09, "loss": 0.3648, "step": 16540 }, { "epoch": 0.9518058431101909, "grad_norm": 11.412559509277344, "learning_rate": 6.093339903782024e-09, "loss": 0.3556, "step": 16550 }, { "epoch": 0.9523809523809523, "grad_norm": 9.926128387451172, "learning_rate": 5.949240113684862e-09, "loss": 0.3527, "step": 16560 }, { "epoch": 0.9529560616517139, "grad_norm": 10.849930763244629, "learning_rate": 5.80685446639001e-09, "loss": 0.3611, "step": 16570 }, { "epoch": 0.9535311709224753, "grad_norm": 10.978145599365234, "learning_rate": 5.666183455914297e-09, "loss": 0.3324, "step": 16580 }, { "epoch": 0.9541062801932367, "grad_norm": 13.039079666137695, "learning_rate": 5.5272275703253656e-09, "loss": 0.3306, "step": 16590 }, { "epoch": 0.9546813894639982, "grad_norm": 11.077832221984863, "learning_rate": 5.389987291740117e-09, "loss": 0.3518, "step": 16600 }, { "epoch": 0.9552564987347596, "grad_norm": 12.849186897277832, "learning_rate": 5.254463096323047e-09, "loss": 0.3456, "step": 16610 }, { "epoch": 0.955831608005521, "grad_norm": 11.505592346191406, "learning_rate": 5.1206554542846394e-09, "loss": 0.3748, "step": 16620 }, { "epoch": 0.9564067172762825, "grad_norm": 12.972572326660156, "learning_rate": 4.988564829879582e-09, "loss": 0.3542, "step": 16630 }, { "epoch": 0.9569818265470439, "grad_norm": 10.25854206085205, "learning_rate": 4.858191681405277e-09, "loss": 0.3634, "step": 16640 }, { "epoch": 0.9575569358178054, "grad_norm": 13.831768035888672, "learning_rate": 4.72953646120039e-09, "loss": 0.3456, "step": 16650 }, { "epoch": 0.9581320450885669, "grad_norm": 14.497851371765137, "learning_rate": 4.602599615642799e-09, "loss": 0.3635, "step": 16660 }, { "epoch": 0.9587071543593283, "grad_norm": 10.675956726074219, "learning_rate": 4.477381585148654e-09, "loss": 0.3438, "step": 16670 }, { "epoch": 0.9592822636300897, "grad_norm": 10.478116035461426, "learning_rate": 4.353882804170484e-09, "loss": 0.3618, "step": 16680 }, { "epoch": 0.9598573729008512, "grad_norm": 8.508675575256348, "learning_rate": 4.232103701195644e-09, "loss": 0.3985, "step": 16690 }, { "epoch": 0.9604324821716126, "grad_norm": 11.126923561096191, "learning_rate": 4.112044698745098e-09, "loss": 0.3707, "step": 16700 }, { "epoch": 0.961007591442374, "grad_norm": 10.939498901367188, "learning_rate": 3.993706213371695e-09, "loss": 0.3677, "step": 16710 }, { "epoch": 0.9615827007131355, "grad_norm": 10.846037864685059, "learning_rate": 3.877088655658889e-09, "loss": 0.3306, "step": 16720 }, { "epoch": 0.962157809983897, "grad_norm": 11.301316261291504, "learning_rate": 3.762192430219302e-09, "loss": 0.3449, "step": 16730 }, { "epoch": 0.9627329192546584, "grad_norm": 12.614832878112793, "learning_rate": 3.649017935693166e-09, "loss": 0.3591, "step": 16740 }, { "epoch": 0.9633080285254199, "grad_norm": 14.318446159362793, "learning_rate": 3.5375655647471026e-09, "loss": 0.3353, "step": 16750 }, { "epoch": 0.9638831377961813, "grad_norm": 13.013409614562988, "learning_rate": 3.42783570407279e-09, "loss": 0.3532, "step": 16760 }, { "epoch": 0.9644582470669427, "grad_norm": 14.582551002502441, "learning_rate": 3.3198287343853547e-09, "loss": 0.3504, "step": 16770 }, { "epoch": 0.9650333563377042, "grad_norm": 10.84306812286377, "learning_rate": 3.2135450304224266e-09, "loss": 0.3629, "step": 16780 }, { "epoch": 0.9656084656084656, "grad_norm": 8.859792709350586, "learning_rate": 3.108984960942529e-09, "loss": 0.3607, "step": 16790 }, { "epoch": 0.966183574879227, "grad_norm": 11.298430442810059, "learning_rate": 3.0061488887239695e-09, "loss": 0.3368, "step": 16800 }, { "epoch": 0.9667586841499884, "grad_norm": 10.368522644042969, "learning_rate": 2.905037170563507e-09, "loss": 0.3426, "step": 16810 }, { "epoch": 0.96733379342075, "grad_norm": 10.784027099609375, "learning_rate": 2.8056501572750748e-09, "loss": 0.343, "step": 16820 }, { "epoch": 0.9679089026915114, "grad_norm": 11.991925239562988, "learning_rate": 2.707988193688837e-09, "loss": 0.3236, "step": 16830 }, { "epoch": 0.9684840119622729, "grad_norm": 9.97059440612793, "learning_rate": 2.6120516186495243e-09, "loss": 0.3626, "step": 16840 }, { "epoch": 0.9690591212330343, "grad_norm": 11.384905815124512, "learning_rate": 2.5178407650156553e-09, "loss": 0.3523, "step": 16850 }, { "epoch": 0.9696342305037957, "grad_norm": 10.08507251739502, "learning_rate": 2.425355959658204e-09, "loss": 0.358, "step": 16860 }, { "epoch": 0.9702093397745571, "grad_norm": 10.032130241394043, "learning_rate": 2.3345975234594917e-09, "loss": 0.3524, "step": 16870 }, { "epoch": 0.9707844490453186, "grad_norm": 15.847508430480957, "learning_rate": 2.2455657713121857e-09, "loss": 0.3686, "step": 16880 }, { "epoch": 0.97135955831608, "grad_norm": 10.296757698059082, "learning_rate": 2.158261012117968e-09, "loss": 0.349, "step": 16890 }, { "epoch": 0.9719346675868415, "grad_norm": 11.762929916381836, "learning_rate": 2.0726835487866466e-09, "loss": 0.3758, "step": 16900 }, { "epoch": 0.972509776857603, "grad_norm": 8.135751724243164, "learning_rate": 1.9888336782350467e-09, "loss": 0.3488, "step": 16910 }, { "epoch": 0.9730848861283644, "grad_norm": 10.39211654663086, "learning_rate": 1.906711691386065e-09, "loss": 0.341, "step": 16920 }, { "epoch": 0.9736599953991258, "grad_norm": 11.97961711883545, "learning_rate": 1.8263178731675045e-09, "loss": 0.3474, "step": 16930 }, { "epoch": 0.9742351046698873, "grad_norm": 10.728039741516113, "learning_rate": 1.7476525025111876e-09, "loss": 0.3616, "step": 16940 }, { "epoch": 0.9748102139406487, "grad_norm": 10.268973350524902, "learning_rate": 1.6707158523520115e-09, "loss": 0.328, "step": 16950 }, { "epoch": 0.9753853232114101, "grad_norm": 9.969135284423828, "learning_rate": 1.595508189626893e-09, "loss": 0.3516, "step": 16960 }, { "epoch": 0.9759604324821716, "grad_norm": 9.581161499023438, "learning_rate": 1.5220297752739919e-09, "loss": 0.3446, "step": 16970 }, { "epoch": 0.9765355417529331, "grad_norm": 10.132065773010254, "learning_rate": 1.450280864231601e-09, "loss": 0.3593, "step": 16980 }, { "epoch": 0.9771106510236945, "grad_norm": 9.61719799041748, "learning_rate": 1.380261705437591e-09, "loss": 0.3641, "step": 16990 }, { "epoch": 0.977685760294456, "grad_norm": 10.138646125793457, "learning_rate": 1.311972541828077e-09, "loss": 0.3402, "step": 17000 }, { "epoch": 0.9782608695652174, "grad_norm": 11.132647514343262, "learning_rate": 1.2454136103370315e-09, "loss": 0.3555, "step": 17010 }, { "epoch": 0.9788359788359788, "grad_norm": 9.739797592163086, "learning_rate": 1.1805851418952294e-09, "loss": 0.3546, "step": 17020 }, { "epoch": 0.9794110881067403, "grad_norm": 9.894083976745605, "learning_rate": 1.1174873614294145e-09, "loss": 0.3184, "step": 17030 }, { "epoch": 0.9799861973775017, "grad_norm": 20.141143798828125, "learning_rate": 1.0561204878616337e-09, "loss": 0.3555, "step": 17040 }, { "epoch": 0.9805613066482631, "grad_norm": 9.90503978729248, "learning_rate": 9.964847341085158e-10, "loss": 0.357, "step": 17050 }, { "epoch": 0.9811364159190246, "grad_norm": 9.853381156921387, "learning_rate": 9.385803070802723e-10, "loss": 0.3618, "step": 17060 }, { "epoch": 0.9817115251897861, "grad_norm": 10.17227554321289, "learning_rate": 8.824074076803078e-10, "loss": 0.3311, "step": 17070 }, { "epoch": 0.9822866344605475, "grad_norm": 11.115245819091797, "learning_rate": 8.279662308043888e-10, "loss": 0.3348, "step": 17080 }, { "epoch": 0.982861743731309, "grad_norm": 12.686921119689941, "learning_rate": 7.752569653397545e-10, "loss": 0.333, "step": 17090 }, { "epoch": 0.9834368530020704, "grad_norm": 10.707682609558105, "learning_rate": 7.242797941649503e-10, "loss": 0.3492, "step": 17100 }, { "epoch": 0.9840119622728318, "grad_norm": 10.170624732971191, "learning_rate": 6.750348941486073e-10, "loss": 0.3308, "step": 17110 }, { "epoch": 0.9845870715435933, "grad_norm": 10.090374946594238, "learning_rate": 6.275224361493858e-10, "loss": 0.3489, "step": 17120 }, { "epoch": 0.9851621808143547, "grad_norm": 9.269074440002441, "learning_rate": 5.81742585014866e-10, "loss": 0.3488, "step": 17130 }, { "epoch": 0.9857372900851161, "grad_norm": 8.36646556854248, "learning_rate": 5.376954995814919e-10, "loss": 0.3792, "step": 17140 }, { "epoch": 0.9863123993558777, "grad_norm": 10.531664848327637, "learning_rate": 4.953813326735723e-10, "loss": 0.3146, "step": 17150 }, { "epoch": 0.9868875086266391, "grad_norm": 9.710155487060547, "learning_rate": 4.54800231103003e-10, "loss": 0.3539, "step": 17160 }, { "epoch": 0.9874626178974005, "grad_norm": 11.801857948303223, "learning_rate": 4.1595233566865673e-10, "loss": 0.3912, "step": 17170 }, { "epoch": 0.988037727168162, "grad_norm": 11.040277481079102, "learning_rate": 3.7883778115599395e-10, "loss": 0.35, "step": 17180 }, { "epoch": 0.9886128364389234, "grad_norm": 11.53235912322998, "learning_rate": 3.434566963364527e-10, "loss": 0.328, "step": 17190 }, { "epoch": 0.9891879457096848, "grad_norm": 13.099756240844727, "learning_rate": 3.098092039671707e-10, "loss": 0.3617, "step": 17200 }, { "epoch": 0.9897630549804463, "grad_norm": 13.452399253845215, "learning_rate": 2.778954207904305e-10, "loss": 0.332, "step": 17210 }, { "epoch": 0.9903381642512077, "grad_norm": 12.929037094116211, "learning_rate": 2.477154575331597e-10, "loss": 0.3501, "step": 17220 }, { "epoch": 0.9909132735219692, "grad_norm": 11.548810005187988, "learning_rate": 2.1926941890693108e-10, "loss": 0.3415, "step": 17230 }, { "epoch": 0.9914883827927307, "grad_norm": 15.000441551208496, "learning_rate": 1.925574036071298e-10, "loss": 0.3511, "step": 17240 }, { "epoch": 0.9920634920634921, "grad_norm": 13.140406608581543, "learning_rate": 1.6757950431295353e-10, "loss": 0.3349, "step": 17250 }, { "epoch": 0.9926386013342535, "grad_norm": 10.228362083435059, "learning_rate": 1.4433580768696828e-10, "loss": 0.365, "step": 17260 }, { "epoch": 0.993213710605015, "grad_norm": 11.546327590942383, "learning_rate": 1.2282639437466435e-10, "loss": 0.3606, "step": 17270 }, { "epoch": 0.9937888198757764, "grad_norm": 10.614378929138184, "learning_rate": 1.0305133900451179e-10, "loss": 0.3553, "step": 17280 }, { "epoch": 0.9943639291465378, "grad_norm": 13.574530601501465, "learning_rate": 8.501071018729433e-11, "loss": 0.3758, "step": 17290 }, { "epoch": 0.9949390384172992, "grad_norm": 10.464898109436035, "learning_rate": 6.870457051638689e-11, "loss": 0.3593, "step": 17300 }, { "epoch": 0.9955141476880607, "grad_norm": 11.059677124023438, "learning_rate": 5.413297656686744e-11, "loss": 0.3362, "step": 17310 }, { "epoch": 0.9960892569588222, "grad_norm": 11.568893432617188, "learning_rate": 4.129597889601655e-11, "loss": 0.356, "step": 17320 }, { "epoch": 0.9966643662295837, "grad_norm": 12.195451736450195, "learning_rate": 3.019362204254028e-11, "loss": 0.3555, "step": 17330 }, { "epoch": 0.9972394755003451, "grad_norm": 13.027482986450195, "learning_rate": 2.082594452695874e-11, "loss": 0.3381, "step": 17340 }, { "epoch": 0.9978145847711065, "grad_norm": 10.714599609375, "learning_rate": 1.3192978850995463e-11, "loss": 0.355, "step": 17350 }, { "epoch": 0.998389694041868, "grad_norm": 10.665827751159668, "learning_rate": 7.294751497743945e-12, "loss": 0.35, "step": 17360 }, { "epoch": 0.9989648033126294, "grad_norm": 13.261438369750977, "learning_rate": 3.1312829315011114e-12, "loss": 0.3336, "step": 17370 }, { "epoch": 0.9995399125833908, "grad_norm": 11.001378059387207, "learning_rate": 7.025875977673124e-13, "loss": 0.3368, "step": 17380 }, { "epoch": 1.0, "step": 17388, "total_flos": 5.231438585724823e+19, "train_loss": 0.4114093804238773, "train_runtime": 31710.3993, "train_samples_per_second": 35.092, "train_steps_per_second": 0.548 } ], "logging_steps": 10, "max_steps": 17388, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.231438585724823e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }