{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3295, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030350165029022344, "grad_norm": 2.2979175122091453, "learning_rate": 2.7272727272727274e-07, "loss": 0.789, "step": 10 }, { "epoch": 0.006070033005804469, "grad_norm": 1.7852801501566888, "learning_rate": 5.757575757575758e-07, "loss": 0.7904, "step": 20 }, { "epoch": 0.009105049508706704, "grad_norm": 1.1632800870309477, "learning_rate": 8.787878787878788e-07, "loss": 0.744, "step": 30 }, { "epoch": 0.012140066011608937, "grad_norm": 0.8818693207084786, "learning_rate": 1.181818181818182e-06, "loss": 0.693, "step": 40 }, { "epoch": 0.015175082514511173, "grad_norm": 0.657250412764622, "learning_rate": 1.484848484848485e-06, "loss": 0.6656, "step": 50 }, { "epoch": 0.018210099017413408, "grad_norm": 0.5316521889428266, "learning_rate": 1.787878787878788e-06, "loss": 0.6313, "step": 60 }, { "epoch": 0.021245115520315643, "grad_norm": 0.4575711828032985, "learning_rate": 2.090909090909091e-06, "loss": 0.6098, "step": 70 }, { "epoch": 0.024280132023217875, "grad_norm": 0.4548384308100045, "learning_rate": 2.393939393939394e-06, "loss": 0.5885, "step": 80 }, { "epoch": 0.02731514852612011, "grad_norm": 0.4458364948040082, "learning_rate": 2.6969696969696972e-06, "loss": 0.5916, "step": 90 }, { "epoch": 0.030350165029022345, "grad_norm": 0.4794166169125783, "learning_rate": 3e-06, "loss": 0.5753, "step": 100 }, { "epoch": 0.03338518153192458, "grad_norm": 0.4567719754046465, "learning_rate": 3.3030303030303033e-06, "loss": 0.5597, "step": 110 }, { "epoch": 0.036420198034826816, "grad_norm": 0.4307419720442758, "learning_rate": 3.606060606060606e-06, "loss": 0.5684, "step": 120 }, { "epoch": 0.03945521453772905, "grad_norm": 0.4458642819812865, "learning_rate": 3.90909090909091e-06, "loss": 0.5569, "step": 130 }, { "epoch": 0.042490231040631286, "grad_norm": 0.45568817442378473, "learning_rate": 4.212121212121212e-06, "loss": 0.5551, "step": 140 }, { "epoch": 0.045525247543533515, "grad_norm": 0.4950300651709512, "learning_rate": 4.5151515151515155e-06, "loss": 0.5565, "step": 150 }, { "epoch": 0.04856026404643575, "grad_norm": 0.4441786811205893, "learning_rate": 4.818181818181819e-06, "loss": 0.5482, "step": 160 }, { "epoch": 0.051595280549337985, "grad_norm": 0.4926559856436913, "learning_rate": 5.121212121212121e-06, "loss": 0.5482, "step": 170 }, { "epoch": 0.05463029705224022, "grad_norm": 0.46505669754342027, "learning_rate": 5.424242424242425e-06, "loss": 0.5397, "step": 180 }, { "epoch": 0.057665313555142456, "grad_norm": 0.4862932493508676, "learning_rate": 5.727272727272728e-06, "loss": 0.5379, "step": 190 }, { "epoch": 0.06070033005804469, "grad_norm": 0.5633080158535704, "learning_rate": 6.030303030303031e-06, "loss": 0.5487, "step": 200 }, { "epoch": 0.06373534656094693, "grad_norm": 0.5144964810224658, "learning_rate": 6.333333333333333e-06, "loss": 0.5354, "step": 210 }, { "epoch": 0.06677036306384916, "grad_norm": 0.623858354223414, "learning_rate": 6.6363636363636375e-06, "loss": 0.5321, "step": 220 }, { "epoch": 0.0698053795667514, "grad_norm": 0.5384794963785807, "learning_rate": 6.93939393939394e-06, "loss": 0.5286, "step": 230 }, { "epoch": 0.07284039606965363, "grad_norm": 0.535110770579217, "learning_rate": 7.242424242424243e-06, "loss": 0.5277, "step": 240 }, { "epoch": 0.07587541257255587, "grad_norm": 0.5036293665682129, "learning_rate": 7.545454545454546e-06, "loss": 0.5333, "step": 250 }, { "epoch": 0.0789104290754581, "grad_norm": 0.47084801964860584, "learning_rate": 7.848484848484849e-06, "loss": 0.5272, "step": 260 }, { "epoch": 0.08194544557836034, "grad_norm": 0.6020093296202623, "learning_rate": 8.151515151515152e-06, "loss": 0.5283, "step": 270 }, { "epoch": 0.08498046208126257, "grad_norm": 0.5024737305172274, "learning_rate": 8.454545454545455e-06, "loss": 0.5212, "step": 280 }, { "epoch": 0.08801547858416481, "grad_norm": 0.4994326809453112, "learning_rate": 8.757575757575759e-06, "loss": 0.5161, "step": 290 }, { "epoch": 0.09105049508706703, "grad_norm": 0.5231140922773104, "learning_rate": 9.06060606060606e-06, "loss": 0.5281, "step": 300 }, { "epoch": 0.09408551158996926, "grad_norm": 0.5098295511036418, "learning_rate": 9.363636363636365e-06, "loss": 0.5154, "step": 310 }, { "epoch": 0.0971205280928715, "grad_norm": 0.5892467794961541, "learning_rate": 9.666666666666667e-06, "loss": 0.5218, "step": 320 }, { "epoch": 0.10015554459577374, "grad_norm": 0.4951705896851936, "learning_rate": 9.96969696969697e-06, "loss": 0.5169, "step": 330 }, { "epoch": 0.10319056109867597, "grad_norm": 0.6364271310354288, "learning_rate": 9.999772661973056e-06, "loss": 0.5133, "step": 340 }, { "epoch": 0.1062255776015782, "grad_norm": 0.5028738085648048, "learning_rate": 9.99898682866784e-06, "loss": 0.52, "step": 350 }, { "epoch": 0.10926059410448044, "grad_norm": 0.5217859360497503, "learning_rate": 9.997639781643002e-06, "loss": 0.5008, "step": 360 }, { "epoch": 0.11229561060738268, "grad_norm": 0.5157314231456531, "learning_rate": 9.99573167212544e-06, "loss": 0.517, "step": 370 }, { "epoch": 0.11533062711028491, "grad_norm": 0.524618846108783, "learning_rate": 9.993262714330009e-06, "loss": 0.5092, "step": 380 }, { "epoch": 0.11836564361318715, "grad_norm": 0.5191412167795787, "learning_rate": 9.990233185435473e-06, "loss": 0.513, "step": 390 }, { "epoch": 0.12140066011608938, "grad_norm": 0.5047990336080693, "learning_rate": 9.986643425553386e-06, "loss": 0.5129, "step": 400 }, { "epoch": 0.12443567661899162, "grad_norm": 0.49678214945335275, "learning_rate": 9.98249383768991e-06, "loss": 0.5073, "step": 410 }, { "epoch": 0.12747069312189385, "grad_norm": 0.5341967716757828, "learning_rate": 9.977784887700572e-06, "loss": 0.5088, "step": 420 }, { "epoch": 0.13050570962479607, "grad_norm": 0.49917946146850173, "learning_rate": 9.972517104237961e-06, "loss": 0.4991, "step": 430 }, { "epoch": 0.13354072612769832, "grad_norm": 0.49221501327780853, "learning_rate": 9.966691078692386e-06, "loss": 0.506, "step": 440 }, { "epoch": 0.13657574263060054, "grad_norm": 0.5448393355182506, "learning_rate": 9.960307465125472e-06, "loss": 0.5025, "step": 450 }, { "epoch": 0.1396107591335028, "grad_norm": 0.9165522381667764, "learning_rate": 9.953366980196746e-06, "loss": 0.4976, "step": 460 }, { "epoch": 0.14264577563640501, "grad_norm": 0.5122649698502958, "learning_rate": 9.945870403083164e-06, "loss": 0.503, "step": 470 }, { "epoch": 0.14568079213930726, "grad_norm": 0.48003844926471007, "learning_rate": 9.937818575391654e-06, "loss": 0.5044, "step": 480 }, { "epoch": 0.14871580864220948, "grad_norm": 0.5083116253946615, "learning_rate": 9.929212401064616e-06, "loss": 0.505, "step": 490 }, { "epoch": 0.15175082514511173, "grad_norm": 0.4997990195713387, "learning_rate": 9.920052846278455e-06, "loss": 0.4991, "step": 500 }, { "epoch": 0.15478584164801396, "grad_norm": 0.4863244820808186, "learning_rate": 9.910340939335098e-06, "loss": 0.4889, "step": 510 }, { "epoch": 0.1578208581509162, "grad_norm": 0.519835159370644, "learning_rate": 9.900077770546567e-06, "loss": 0.488, "step": 520 }, { "epoch": 0.16085587465381843, "grad_norm": 0.5247955372337217, "learning_rate": 9.889264492112563e-06, "loss": 0.5025, "step": 530 }, { "epoch": 0.16389089115672067, "grad_norm": 0.47839854894729905, "learning_rate": 9.877902317991116e-06, "loss": 0.4946, "step": 540 }, { "epoch": 0.1669259076596229, "grad_norm": 0.4889470867795177, "learning_rate": 9.865992523762306e-06, "loss": 0.4989, "step": 550 }, { "epoch": 0.16996092416252515, "grad_norm": 0.4753575663441252, "learning_rate": 9.853536446485048e-06, "loss": 0.503, "step": 560 }, { "epoch": 0.17299594066542737, "grad_norm": 0.5422582939397118, "learning_rate": 9.840535484546996e-06, "loss": 0.4903, "step": 570 }, { "epoch": 0.17603095716832962, "grad_norm": 0.4706321475205836, "learning_rate": 9.826991097507548e-06, "loss": 0.4958, "step": 580 }, { "epoch": 0.17906597367123184, "grad_norm": 0.4839096225021675, "learning_rate": 9.812904805933989e-06, "loss": 0.4922, "step": 590 }, { "epoch": 0.18210099017413406, "grad_norm": 0.4732593571246822, "learning_rate": 9.798278191230783e-06, "loss": 0.5004, "step": 600 }, { "epoch": 0.1851360066770363, "grad_norm": 0.4505029818283895, "learning_rate": 9.78311289546204e-06, "loss": 0.4802, "step": 610 }, { "epoch": 0.18817102317993853, "grad_norm": 0.5055436622019127, "learning_rate": 9.76741062116716e-06, "loss": 0.4945, "step": 620 }, { "epoch": 0.19120603968284078, "grad_norm": 0.49572149753193984, "learning_rate": 9.751173131169705e-06, "loss": 0.4906, "step": 630 }, { "epoch": 0.194241056185743, "grad_norm": 0.4448394645138839, "learning_rate": 9.73440224837949e-06, "loss": 0.496, "step": 640 }, { "epoch": 0.19727607268864525, "grad_norm": 0.49316009769902025, "learning_rate": 9.717099855587935e-06, "loss": 0.486, "step": 650 }, { "epoch": 0.20031108919154747, "grad_norm": 0.47797040662070445, "learning_rate": 9.699267895256695e-06, "loss": 0.4769, "step": 660 }, { "epoch": 0.20334610569444972, "grad_norm": 0.5092952568059224, "learning_rate": 9.68090836929958e-06, "loss": 0.4918, "step": 670 }, { "epoch": 0.20638112219735194, "grad_norm": 0.45202984420347087, "learning_rate": 9.662023338857822e-06, "loss": 0.485, "step": 680 }, { "epoch": 0.2094161387002542, "grad_norm": 0.4983093319634588, "learning_rate": 9.642614924068667e-06, "loss": 0.4902, "step": 690 }, { "epoch": 0.2124511552031564, "grad_norm": 0.492174959560037, "learning_rate": 9.622685303827366e-06, "loss": 0.4881, "step": 700 }, { "epoch": 0.21548617170605866, "grad_norm": 0.47815379215317105, "learning_rate": 9.602236715542557e-06, "loss": 0.4848, "step": 710 }, { "epoch": 0.21852118820896088, "grad_norm": 0.48589573091096694, "learning_rate": 9.581271454885077e-06, "loss": 0.4903, "step": 720 }, { "epoch": 0.22155620471186313, "grad_norm": 0.46478604800112194, "learning_rate": 9.559791875530247e-06, "loss": 0.489, "step": 730 }, { "epoch": 0.22459122121476535, "grad_norm": 0.5035008932669235, "learning_rate": 9.537800388893628e-06, "loss": 0.4864, "step": 740 }, { "epoch": 0.2276262377176676, "grad_norm": 0.49713803635487736, "learning_rate": 9.515299463860301e-06, "loss": 0.4858, "step": 750 }, { "epoch": 0.23066125422056982, "grad_norm": 0.48728829894131276, "learning_rate": 9.492291626507705e-06, "loss": 0.4874, "step": 760 }, { "epoch": 0.23369627072347207, "grad_norm": 0.5654089866669252, "learning_rate": 9.468779459822034e-06, "loss": 0.4865, "step": 770 }, { "epoch": 0.2367312872263743, "grad_norm": 0.456617843862989, "learning_rate": 9.444765603408273e-06, "loss": 0.4834, "step": 780 }, { "epoch": 0.23976630372927651, "grad_norm": 0.7108327075954629, "learning_rate": 9.420252753193842e-06, "loss": 0.4725, "step": 790 }, { "epoch": 0.24280132023217876, "grad_norm": 0.49021009503467355, "learning_rate": 9.395243661125948e-06, "loss": 0.4882, "step": 800 }, { "epoch": 0.24583633673508098, "grad_norm": 0.48115706251944507, "learning_rate": 9.369741134862636e-06, "loss": 0.4752, "step": 810 }, { "epoch": 0.24887135323798323, "grad_norm": 0.5131051402999688, "learning_rate": 9.343748037457585e-06, "loss": 0.4869, "step": 820 }, { "epoch": 0.2519063697408855, "grad_norm": 0.4658252213252816, "learning_rate": 9.317267287038682e-06, "loss": 0.4884, "step": 830 }, { "epoch": 0.2549413862437877, "grad_norm": 0.4751568661396783, "learning_rate": 9.290301856480425e-06, "loss": 0.4797, "step": 840 }, { "epoch": 0.2579764027466899, "grad_norm": 0.47930138340752015, "learning_rate": 9.262854773070157e-06, "loss": 0.4869, "step": 850 }, { "epoch": 0.26101141924959215, "grad_norm": 0.47733793569696137, "learning_rate": 9.234929118168228e-06, "loss": 0.4712, "step": 860 }, { "epoch": 0.2640464357524944, "grad_norm": 0.4566798290905438, "learning_rate": 9.206528026862043e-06, "loss": 0.4765, "step": 870 }, { "epoch": 0.26708145225539665, "grad_norm": 0.4466757545533174, "learning_rate": 9.177654687614112e-06, "loss": 0.4824, "step": 880 }, { "epoch": 0.27011646875829887, "grad_norm": 0.47012964956941894, "learning_rate": 9.148312341904095e-06, "loss": 0.4768, "step": 890 }, { "epoch": 0.2731514852612011, "grad_norm": 0.4650672010975063, "learning_rate": 9.118504283864891e-06, "loss": 0.4763, "step": 900 }, { "epoch": 0.27618650176410336, "grad_norm": 0.4522875610929074, "learning_rate": 9.088233859912823e-06, "loss": 0.4774, "step": 910 }, { "epoch": 0.2792215182670056, "grad_norm": 0.9218754990609884, "learning_rate": 9.057504468371954e-06, "loss": 0.4774, "step": 920 }, { "epoch": 0.2822565347699078, "grad_norm": 0.4446080760553245, "learning_rate": 9.026319559092566e-06, "loss": 0.4822, "step": 930 }, { "epoch": 0.28529155127281003, "grad_norm": 0.45974314278861356, "learning_rate": 8.994682633063868e-06, "loss": 0.4737, "step": 940 }, { "epoch": 0.2883265677757123, "grad_norm": 0.46640835738246217, "learning_rate": 8.962597242020947e-06, "loss": 0.4772, "step": 950 }, { "epoch": 0.2913615842786145, "grad_norm": 0.45954793881402833, "learning_rate": 8.930066988046042e-06, "loss": 0.4688, "step": 960 }, { "epoch": 0.29439660078151675, "grad_norm": 0.5238237149613728, "learning_rate": 8.897095523164141e-06, "loss": 0.4742, "step": 970 }, { "epoch": 0.29743161728441897, "grad_norm": 0.45541775923959604, "learning_rate": 8.863686548933001e-06, "loss": 0.4786, "step": 980 }, { "epoch": 0.3004666337873212, "grad_norm": 0.46811147224647004, "learning_rate": 8.829843816027575e-06, "loss": 0.4706, "step": 990 }, { "epoch": 0.30350165029022347, "grad_norm": 0.47480506009090023, "learning_rate": 8.795571123818948e-06, "loss": 0.4733, "step": 1000 }, { "epoch": 0.3065366667931257, "grad_norm": 0.46579935990440124, "learning_rate": 8.760872319947796e-06, "loss": 0.467, "step": 1010 }, { "epoch": 0.3095716832960279, "grad_norm": 0.4713650285222608, "learning_rate": 8.72575129989244e-06, "loss": 0.4714, "step": 1020 }, { "epoch": 0.31260669979893013, "grad_norm": 0.5172958915739383, "learning_rate": 8.690212006531498e-06, "loss": 0.4778, "step": 1030 }, { "epoch": 0.3156417163018324, "grad_norm": 0.4636306902344978, "learning_rate": 8.654258429701254e-06, "loss": 0.4766, "step": 1040 }, { "epoch": 0.31867673280473463, "grad_norm": 0.4630632755815923, "learning_rate": 8.617894605747728e-06, "loss": 0.471, "step": 1050 }, { "epoch": 0.32171174930763685, "grad_norm": 0.4726631111043291, "learning_rate": 8.581124617073531e-06, "loss": 0.4754, "step": 1060 }, { "epoch": 0.3247467658105391, "grad_norm": 0.449173460165581, "learning_rate": 8.543952591679565e-06, "loss": 0.4757, "step": 1070 }, { "epoch": 0.32778178231344135, "grad_norm": 0.4593115406956091, "learning_rate": 8.506382702701575e-06, "loss": 0.4682, "step": 1080 }, { "epoch": 0.33081679881634357, "grad_norm": 0.49121579493660045, "learning_rate": 8.468419167941658e-06, "loss": 0.4631, "step": 1090 }, { "epoch": 0.3338518153192458, "grad_norm": 0.4799330378295981, "learning_rate": 8.430066249394754e-06, "loss": 0.4786, "step": 1100 }, { "epoch": 0.336886831822148, "grad_norm": 0.46743240892192656, "learning_rate": 8.391328252770165e-06, "loss": 0.4648, "step": 1110 }, { "epoch": 0.3399218483250503, "grad_norm": 0.49238589329434995, "learning_rate": 8.352209527008164e-06, "loss": 0.4785, "step": 1120 }, { "epoch": 0.3429568648279525, "grad_norm": 0.463123088401655, "learning_rate": 8.31271446379178e-06, "loss": 0.4684, "step": 1130 }, { "epoch": 0.34599188133085473, "grad_norm": 0.45804967626699783, "learning_rate": 8.272847497053745e-06, "loss": 0.467, "step": 1140 }, { "epoch": 0.34902689783375695, "grad_norm": 0.49208327340861874, "learning_rate": 8.232613102478722e-06, "loss": 0.4734, "step": 1150 }, { "epoch": 0.35206191433665923, "grad_norm": 0.4834727766246894, "learning_rate": 8.192015797000849e-06, "loss": 0.4634, "step": 1160 }, { "epoch": 0.35509693083956145, "grad_norm": 0.45802821271745225, "learning_rate": 8.151060138296624e-06, "loss": 0.4769, "step": 1170 }, { "epoch": 0.3581319473424637, "grad_norm": 0.47069229829560316, "learning_rate": 8.10975072427326e-06, "loss": 0.4631, "step": 1180 }, { "epoch": 0.3611669638453659, "grad_norm": 0.4638785009109008, "learning_rate": 8.068092192552473e-06, "loss": 0.4621, "step": 1190 }, { "epoch": 0.3642019803482681, "grad_norm": 0.5267368501556567, "learning_rate": 8.026089219949856e-06, "loss": 0.4707, "step": 1200 }, { "epoch": 0.3672369968511704, "grad_norm": 0.4603778748004369, "learning_rate": 7.983746521949822e-06, "loss": 0.4691, "step": 1210 }, { "epoch": 0.3702720133540726, "grad_norm": 0.5044371212314646, "learning_rate": 7.941068852176233e-06, "loss": 0.4673, "step": 1220 }, { "epoch": 0.37330702985697484, "grad_norm": 0.5023326078920469, "learning_rate": 7.898061001858712e-06, "loss": 0.4652, "step": 1230 }, { "epoch": 0.37634204635987706, "grad_norm": 0.5465382069712866, "learning_rate": 7.854727799294768e-06, "loss": 0.4648, "step": 1240 }, { "epoch": 0.37937706286277934, "grad_norm": 0.46811731936405077, "learning_rate": 7.81107410930774e-06, "loss": 0.474, "step": 1250 }, { "epoch": 0.38241207936568156, "grad_norm": 0.4582110078248361, "learning_rate": 7.767104832700645e-06, "loss": 0.4557, "step": 1260 }, { "epoch": 0.3854470958685838, "grad_norm": 0.47816100008054285, "learning_rate": 7.72282490570599e-06, "loss": 0.4655, "step": 1270 }, { "epoch": 0.388482112371486, "grad_norm": 0.4587552509577914, "learning_rate": 7.678239299431594e-06, "loss": 0.4675, "step": 1280 }, { "epoch": 0.3915171288743883, "grad_norm": 0.48955107625039307, "learning_rate": 7.633353019302519e-06, "loss": 0.4628, "step": 1290 }, { "epoch": 0.3945521453772905, "grad_norm": 0.4697573855941327, "learning_rate": 7.58817110449912e-06, "loss": 0.4705, "step": 1300 }, { "epoch": 0.3975871618801927, "grad_norm": 0.4586083541834714, "learning_rate": 7.5426986273913275e-06, "loss": 0.4633, "step": 1310 }, { "epoch": 0.40062217838309494, "grad_norm": 0.4943303423222579, "learning_rate": 7.496940692969188e-06, "loss": 0.4664, "step": 1320 }, { "epoch": 0.4036571948859972, "grad_norm": 0.4472430460309261, "learning_rate": 7.450902438269761e-06, "loss": 0.466, "step": 1330 }, { "epoch": 0.40669221138889944, "grad_norm": 0.45151069440310115, "learning_rate": 7.404589031800395e-06, "loss": 0.466, "step": 1340 }, { "epoch": 0.40972722789180166, "grad_norm": 0.4931709679843594, "learning_rate": 7.358005672958488e-06, "loss": 0.4638, "step": 1350 }, { "epoch": 0.4127622443947039, "grad_norm": 0.4458955481322604, "learning_rate": 7.311157591447775e-06, "loss": 0.4574, "step": 1360 }, { "epoch": 0.41579726089760616, "grad_norm": 0.5037362623251065, "learning_rate": 7.264050046691211e-06, "loss": 0.4631, "step": 1370 }, { "epoch": 0.4188322774005084, "grad_norm": 0.5401075737360693, "learning_rate": 7.216688327240523e-06, "loss": 0.4672, "step": 1380 }, { "epoch": 0.4218672939034106, "grad_norm": 0.46823354166024683, "learning_rate": 7.16907775018248e-06, "loss": 0.4613, "step": 1390 }, { "epoch": 0.4249023104063128, "grad_norm": 0.4780876891839522, "learning_rate": 7.1212236605419795e-06, "loss": 0.4666, "step": 1400 }, { "epoch": 0.42793732690921504, "grad_norm": 0.5119425577262322, "learning_rate": 7.0731314306819725e-06, "loss": 0.454, "step": 1410 }, { "epoch": 0.4309723434121173, "grad_norm": 0.47436698182963066, "learning_rate": 7.024806459700344e-06, "loss": 0.4745, "step": 1420 }, { "epoch": 0.43400735991501954, "grad_norm": 0.47009240919947903, "learning_rate": 6.976254172823773e-06, "loss": 0.4578, "step": 1430 }, { "epoch": 0.43704237641792176, "grad_norm": 0.4505894481894143, "learning_rate": 6.92748002079867e-06, "loss": 0.4652, "step": 1440 }, { "epoch": 0.440077392920824, "grad_norm": 0.4739852546862808, "learning_rate": 6.878489479279248e-06, "loss": 0.4634, "step": 1450 }, { "epoch": 0.44311240942372626, "grad_norm": 0.43339777098222865, "learning_rate": 6.829288048212789e-06, "loss": 0.4583, "step": 1460 }, { "epoch": 0.4461474259266285, "grad_norm": 0.48444826933680063, "learning_rate": 6.779881251222198e-06, "loss": 0.4654, "step": 1470 }, { "epoch": 0.4491824424295307, "grad_norm": 0.43409574874106044, "learning_rate": 6.730274634985883e-06, "loss": 0.4671, "step": 1480 }, { "epoch": 0.4522174589324329, "grad_norm": 0.4532708590504662, "learning_rate": 6.6804737686150615e-06, "loss": 0.4698, "step": 1490 }, { "epoch": 0.4552524754353352, "grad_norm": 0.473135305256464, "learning_rate": 6.630484243028534e-06, "loss": 0.4737, "step": 1500 }, { "epoch": 0.4582874919382374, "grad_norm": 0.4685015310135646, "learning_rate": 6.580311670325029e-06, "loss": 0.4556, "step": 1510 }, { "epoch": 0.46132250844113964, "grad_norm": 0.46358839605143787, "learning_rate": 6.529961683153136e-06, "loss": 0.4604, "step": 1520 }, { "epoch": 0.46435752494404187, "grad_norm": 0.45388131225935224, "learning_rate": 6.479439934078983e-06, "loss": 0.4559, "step": 1530 }, { "epoch": 0.46739254144694414, "grad_norm": 0.435672967522485, "learning_rate": 6.428752094951621e-06, "loss": 0.4589, "step": 1540 }, { "epoch": 0.47042755794984636, "grad_norm": 0.4681448507359377, "learning_rate": 6.377903856266285e-06, "loss": 0.4656, "step": 1550 }, { "epoch": 0.4734625744527486, "grad_norm": 0.4866196593408997, "learning_rate": 6.326900926525552e-06, "loss": 0.4587, "step": 1560 }, { "epoch": 0.4764975909556508, "grad_norm": 0.5334135903943417, "learning_rate": 6.275749031598457e-06, "loss": 0.4596, "step": 1570 }, { "epoch": 0.47953260745855303, "grad_norm": 0.545053303736673, "learning_rate": 6.224453914077691e-06, "loss": 0.4599, "step": 1580 }, { "epoch": 0.4825676239614553, "grad_norm": 0.43049803862728125, "learning_rate": 6.173021332634899e-06, "loss": 0.4609, "step": 1590 }, { "epoch": 0.4856026404643575, "grad_norm": 0.4679235070374552, "learning_rate": 6.121457061374182e-06, "loss": 0.4659, "step": 1600 }, { "epoch": 0.48863765696725975, "grad_norm": 0.48596466698858914, "learning_rate": 6.06976688918386e-06, "loss": 0.4552, "step": 1610 }, { "epoch": 0.49167267347016197, "grad_norm": 0.4654168221541189, "learning_rate": 6.017956619086585e-06, "loss": 0.4652, "step": 1620 }, { "epoch": 0.49470768997306425, "grad_norm": 0.43813429165646467, "learning_rate": 5.966032067587862e-06, "loss": 0.4596, "step": 1630 }, { "epoch": 0.49774270647596647, "grad_norm": 0.6681280851162651, "learning_rate": 5.913999064023046e-06, "loss": 0.4572, "step": 1640 }, { "epoch": 0.5007777229788687, "grad_norm": 0.5204323449742513, "learning_rate": 5.861863449902926e-06, "loss": 0.4628, "step": 1650 }, { "epoch": 0.503812739481771, "grad_norm": 0.4296041756151093, "learning_rate": 5.80963107825791e-06, "loss": 0.4568, "step": 1660 }, { "epoch": 0.5068477559846731, "grad_norm": 0.4687304772565422, "learning_rate": 5.7573078129809386e-06, "loss": 0.4604, "step": 1670 }, { "epoch": 0.5098827724875754, "grad_norm": 0.4554317567297939, "learning_rate": 5.704899528169175e-06, "loss": 0.4698, "step": 1680 }, { "epoch": 0.5129177889904777, "grad_norm": 0.4550611945171018, "learning_rate": 5.652412107464532e-06, "loss": 0.4559, "step": 1690 }, { "epoch": 0.5159528054933799, "grad_norm": 0.4737844991440093, "learning_rate": 5.5998514433931636e-06, "loss": 0.4657, "step": 1700 }, { "epoch": 0.5189878219962821, "grad_norm": 0.44176755708702975, "learning_rate": 5.547223436703919e-06, "loss": 0.4555, "step": 1710 }, { "epoch": 0.5220228384991843, "grad_norm": 0.4602868804162559, "learning_rate": 5.494533995705904e-06, "loss": 0.4587, "step": 1720 }, { "epoch": 0.5250578550020866, "grad_norm": 0.4619636659852323, "learning_rate": 5.441789035605174e-06, "loss": 0.4605, "step": 1730 }, { "epoch": 0.5280928715049888, "grad_norm": 0.4918743050797947, "learning_rate": 5.3889944778406656e-06, "loss": 0.4601, "step": 1740 }, { "epoch": 0.531127888007891, "grad_norm": 0.5108035906034613, "learning_rate": 5.336156249419422e-06, "loss": 0.4583, "step": 1750 }, { "epoch": 0.5341629045107933, "grad_norm": 0.4644351963738344, "learning_rate": 5.283280282251192e-06, "loss": 0.451, "step": 1760 }, { "epoch": 0.5371979210136956, "grad_norm": 0.466505558488539, "learning_rate": 5.230372512482485e-06, "loss": 0.4569, "step": 1770 }, { "epoch": 0.5402329375165977, "grad_norm": 0.4395984838919295, "learning_rate": 5.177438879830148e-06, "loss": 0.4546, "step": 1780 }, { "epoch": 0.5432679540195, "grad_norm": 0.5081543080746838, "learning_rate": 5.1244853269145315e-06, "loss": 0.4522, "step": 1790 }, { "epoch": 0.5463029705224022, "grad_norm": 0.4739999458440851, "learning_rate": 5.0715177985923454e-06, "loss": 0.4575, "step": 1800 }, { "epoch": 0.5493379870253045, "grad_norm": 0.44986425269809655, "learning_rate": 5.0185422412892615e-06, "loss": 0.4504, "step": 1810 }, { "epoch": 0.5523730035282067, "grad_norm": 0.493838430904318, "learning_rate": 4.96556460233232e-06, "loss": 0.4565, "step": 1820 }, { "epoch": 0.5554080200311089, "grad_norm": 0.4597123498494703, "learning_rate": 4.912590829282269e-06, "loss": 0.4552, "step": 1830 }, { "epoch": 0.5584430365340112, "grad_norm": 0.45249866862931715, "learning_rate": 4.859626869265838e-06, "loss": 0.4646, "step": 1840 }, { "epoch": 0.5614780530369133, "grad_norm": 0.47693139933323253, "learning_rate": 4.806678668308102e-06, "loss": 0.4593, "step": 1850 }, { "epoch": 0.5645130695398156, "grad_norm": 0.4441745555089401, "learning_rate": 4.753752170664926e-06, "loss": 0.4518, "step": 1860 }, { "epoch": 0.5675480860427179, "grad_norm": 0.5407791277506199, "learning_rate": 4.700853318155655e-06, "loss": 0.4537, "step": 1870 }, { "epoch": 0.5705831025456201, "grad_norm": 0.43766170865780535, "learning_rate": 4.647988049496026e-06, "loss": 0.456, "step": 1880 }, { "epoch": 0.5736181190485223, "grad_norm": 0.4381830382862563, "learning_rate": 4.5951622996314785e-06, "loss": 0.4544, "step": 1890 }, { "epoch": 0.5766531355514246, "grad_norm": 0.5008063102656403, "learning_rate": 4.542381999070851e-06, "loss": 0.4576, "step": 1900 }, { "epoch": 0.5796881520543268, "grad_norm": 0.4449319997966233, "learning_rate": 4.489653073220593e-06, "loss": 0.4479, "step": 1910 }, { "epoch": 0.582723168557229, "grad_norm": 0.4325405045901816, "learning_rate": 4.43698144171955e-06, "loss": 0.4566, "step": 1920 }, { "epoch": 0.5857581850601312, "grad_norm": 0.4558072435464798, "learning_rate": 4.3843730177743835e-06, "loss": 0.4522, "step": 1930 }, { "epoch": 0.5887932015630335, "grad_norm": 0.46993480403343113, "learning_rate": 4.331833707495735e-06, "loss": 0.4497, "step": 1940 }, { "epoch": 0.5918282180659358, "grad_norm": 0.48201210672220995, "learning_rate": 4.279369409235159e-06, "loss": 0.4557, "step": 1950 }, { "epoch": 0.5948632345688379, "grad_norm": 0.48886046384215476, "learning_rate": 4.226986012922954e-06, "loss": 0.4527, "step": 1960 }, { "epoch": 0.5978982510717402, "grad_norm": 0.496378890478507, "learning_rate": 4.174689399406917e-06, "loss": 0.4474, "step": 1970 }, { "epoch": 0.6009332675746424, "grad_norm": 0.5586162522102414, "learning_rate": 4.122485439792139e-06, "loss": 0.4525, "step": 1980 }, { "epoch": 0.6039682840775447, "grad_norm": 0.446752242572371, "learning_rate": 4.070379994781865e-06, "loss": 0.446, "step": 1990 }, { "epoch": 0.6070033005804469, "grad_norm": 0.47706411203661847, "learning_rate": 4.018378914019556e-06, "loss": 0.4596, "step": 2000 }, { "epoch": 0.6100383170833491, "grad_norm": 0.538309740370942, "learning_rate": 3.966488035432169e-06, "loss": 0.4421, "step": 2010 }, { "epoch": 0.6130733335862514, "grad_norm": 0.5230632035555158, "learning_rate": 3.914713184574759e-06, "loss": 0.4569, "step": 2020 }, { "epoch": 0.6161083500891537, "grad_norm": 0.4338338834185748, "learning_rate": 3.863060173976466e-06, "loss": 0.4541, "step": 2030 }, { "epoch": 0.6191433665920558, "grad_norm": 0.5603785317621145, "learning_rate": 3.811534802487983e-06, "loss": 0.4551, "step": 2040 }, { "epoch": 0.6221783830949581, "grad_norm": 0.47642244684394447, "learning_rate": 3.7601428546305246e-06, "loss": 0.4523, "step": 2050 }, { "epoch": 0.6252133995978603, "grad_norm": 0.46992960142430185, "learning_rate": 3.7088900999464432e-06, "loss": 0.446, "step": 2060 }, { "epoch": 0.6282484161007625, "grad_norm": 0.4845504298042459, "learning_rate": 3.657782292351501e-06, "loss": 0.4566, "step": 2070 }, { "epoch": 0.6312834326036648, "grad_norm": 0.47017322226992764, "learning_rate": 3.6068251694888973e-06, "loss": 0.4508, "step": 2080 }, { "epoch": 0.634318449106567, "grad_norm": 0.5121234005197363, "learning_rate": 3.556024452085144e-06, "loss": 0.4431, "step": 2090 }, { "epoch": 0.6373534656094693, "grad_norm": 0.487255052511626, "learning_rate": 3.505385843307809e-06, "loss": 0.4473, "step": 2100 }, { "epoch": 0.6403884821123715, "grad_norm": 0.4564465966378128, "learning_rate": 3.4549150281252635e-06, "loss": 0.4472, "step": 2110 }, { "epoch": 0.6434234986152737, "grad_norm": 0.4806890404068221, "learning_rate": 3.404617672668441e-06, "loss": 0.4536, "step": 2120 }, { "epoch": 0.646458515118176, "grad_norm": 0.45235070380959885, "learning_rate": 3.354499423594737e-06, "loss": 0.4522, "step": 2130 }, { "epoch": 0.6494935316210781, "grad_norm": 0.45955408837797435, "learning_rate": 3.3045659074540797e-06, "loss": 0.4441, "step": 2140 }, { "epoch": 0.6525285481239804, "grad_norm": 0.4923322956992551, "learning_rate": 3.254822730057266e-06, "loss": 0.4551, "step": 2150 }, { "epoch": 0.6555635646268827, "grad_norm": 0.48682070386464155, "learning_rate": 3.205275475846614e-06, "loss": 0.4496, "step": 2160 }, { "epoch": 0.6585985811297849, "grad_norm": 0.47952708737796373, "learning_rate": 3.1559297072690376e-06, "loss": 0.4509, "step": 2170 }, { "epoch": 0.6616335976326871, "grad_norm": 0.4543123589314356, "learning_rate": 3.106790964151556e-06, "loss": 0.4469, "step": 2180 }, { "epoch": 0.6646686141355893, "grad_norm": 0.4820597535972369, "learning_rate": 3.0578647630793845e-06, "loss": 0.45, "step": 2190 }, { "epoch": 0.6677036306384916, "grad_norm": 0.47560501988050274, "learning_rate": 3.0091565967765903e-06, "loss": 0.4506, "step": 2200 }, { "epoch": 0.6707386471413939, "grad_norm": 0.4590471436665147, "learning_rate": 2.9606719334894673e-06, "loss": 0.4411, "step": 2210 }, { "epoch": 0.673773663644296, "grad_norm": 0.4871582819595622, "learning_rate": 2.9124162163726333e-06, "loss": 0.4581, "step": 2220 }, { "epoch": 0.6768086801471983, "grad_norm": 0.4621936812853514, "learning_rate": 2.864394862877945e-06, "loss": 0.4392, "step": 2230 }, { "epoch": 0.6798436966501006, "grad_norm": 0.5106734882471429, "learning_rate": 2.8166132641463174e-06, "loss": 0.4514, "step": 2240 }, { "epoch": 0.6828787131530027, "grad_norm": 0.4872901241863599, "learning_rate": 2.7690767844024757e-06, "loss": 0.456, "step": 2250 }, { "epoch": 0.685913729655905, "grad_norm": 0.5327159140172049, "learning_rate": 2.7217907603527425e-06, "loss": 0.4502, "step": 2260 }, { "epoch": 0.6889487461588072, "grad_norm": 0.46596161967531874, "learning_rate": 2.67476050058591e-06, "loss": 0.4368, "step": 2270 }, { "epoch": 0.6919837626617095, "grad_norm": 0.4756057088646994, "learning_rate": 2.627991284977265e-06, "loss": 0.4427, "step": 2280 }, { "epoch": 0.6950187791646117, "grad_norm": 0.457554867967829, "learning_rate": 2.5814883640958425e-06, "loss": 0.4492, "step": 2290 }, { "epoch": 0.6980537956675139, "grad_norm": 0.4914134735789525, "learning_rate": 2.535256958614972e-06, "loss": 0.4521, "step": 2300 }, { "epoch": 0.7010888121704162, "grad_norm": 0.4514454510170551, "learning_rate": 2.489302258726169e-06, "loss": 0.445, "step": 2310 }, { "epoch": 0.7041238286733185, "grad_norm": 0.5541195252480908, "learning_rate": 2.4436294235564616e-06, "loss": 0.4487, "step": 2320 }, { "epoch": 0.7071588451762206, "grad_norm": 0.4701102512963565, "learning_rate": 2.398243580589197e-06, "loss": 0.4467, "step": 2330 }, { "epoch": 0.7101938616791229, "grad_norm": 0.5033348530844566, "learning_rate": 2.353149825088401e-06, "loss": 0.4424, "step": 2340 }, { "epoch": 0.7132288781820251, "grad_norm": 0.4896392329694068, "learning_rate": 2.30835321952675e-06, "loss": 0.4492, "step": 2350 }, { "epoch": 0.7162638946849273, "grad_norm": 0.45357241678052895, "learning_rate": 2.263858793017247e-06, "loss": 0.4399, "step": 2360 }, { "epoch": 0.7192989111878296, "grad_norm": 0.47254537181545697, "learning_rate": 2.219671540748607e-06, "loss": 0.4486, "step": 2370 }, { "epoch": 0.7223339276907318, "grad_norm": 1.0513237554990393, "learning_rate": 2.1757964234244806e-06, "loss": 0.4516, "step": 2380 }, { "epoch": 0.7253689441936341, "grad_norm": 1.211615701259343, "learning_rate": 2.1322383667065328e-06, "loss": 0.4459, "step": 2390 }, { "epoch": 0.7284039606965362, "grad_norm": 0.49549117242754526, "learning_rate": 2.0890022606614658e-06, "loss": 0.4519, "step": 2400 }, { "epoch": 0.7314389771994385, "grad_norm": 0.441302787437587, "learning_rate": 2.0460929592120286e-06, "loss": 0.4421, "step": 2410 }, { "epoch": 0.7344739937023408, "grad_norm": 0.47459997274968196, "learning_rate": 2.0035152795920943e-06, "loss": 0.4474, "step": 2420 }, { "epoch": 0.737509010205243, "grad_norm": 0.5534394858546283, "learning_rate": 1.961274001805844e-06, "loss": 0.4506, "step": 2430 }, { "epoch": 0.7405440267081452, "grad_norm": 0.5940641703049439, "learning_rate": 1.9193738680911444e-06, "loss": 0.4435, "step": 2440 }, { "epoch": 0.7435790432110475, "grad_norm": 0.47749388420290173, "learning_rate": 1.8778195823871537e-06, "loss": 0.4473, "step": 2450 }, { "epoch": 0.7466140597139497, "grad_norm": 0.46649043860109973, "learning_rate": 1.836615809806232e-06, "loss": 0.441, "step": 2460 }, { "epoch": 0.749649076216852, "grad_norm": 0.47765577041447577, "learning_rate": 1.7957671761102142e-06, "loss": 0.4394, "step": 2470 }, { "epoch": 0.7526840927197541, "grad_norm": 0.4609425591065501, "learning_rate": 1.7552782671910845e-06, "loss": 0.4491, "step": 2480 }, { "epoch": 0.7557191092226564, "grad_norm": 0.4952386048466494, "learning_rate": 1.715153628556162e-06, "loss": 0.4429, "step": 2490 }, { "epoch": 0.7587541257255587, "grad_norm": 0.45327322111640855, "learning_rate": 1.6753977648177682e-06, "loss": 0.452, "step": 2500 }, { "epoch": 0.7617891422284608, "grad_norm": 0.560152662522095, "learning_rate": 1.6360151391875395e-06, "loss": 0.4482, "step": 2510 }, { "epoch": 0.7648241587313631, "grad_norm": 0.5588616468957155, "learning_rate": 1.5970101729753485e-06, "loss": 0.4411, "step": 2520 }, { "epoch": 0.7678591752342654, "grad_norm": 0.5012865331807312, "learning_rate": 1.5583872450929455e-06, "loss": 0.4466, "step": 2530 }, { "epoch": 0.7708941917371676, "grad_norm": 0.4458865832841834, "learning_rate": 1.5201506915623621e-06, "loss": 0.443, "step": 2540 }, { "epoch": 0.7739292082400698, "grad_norm": 0.5048770346419175, "learning_rate": 1.4823048050291211e-06, "loss": 0.452, "step": 2550 }, { "epoch": 0.776964224742972, "grad_norm": 0.46420339110163, "learning_rate": 1.4448538342803242e-06, "loss": 0.4405, "step": 2560 }, { "epoch": 0.7799992412458743, "grad_norm": 0.5148070470946807, "learning_rate": 1.407801983767656e-06, "loss": 0.4452, "step": 2570 }, { "epoch": 0.7830342577487766, "grad_norm": 0.4919925143595241, "learning_rate": 1.3711534131353738e-06, "loss": 0.4481, "step": 2580 }, { "epoch": 0.7860692742516787, "grad_norm": 0.8295554704623347, "learning_rate": 1.3349122367533135e-06, "loss": 0.4443, "step": 2590 }, { "epoch": 0.789104290754581, "grad_norm": 0.4715835221582969, "learning_rate": 1.2990825232550065e-06, "loss": 0.4441, "step": 2600 }, { "epoch": 0.7921393072574832, "grad_norm": 0.4888383711595245, "learning_rate": 1.2636682950808882e-06, "loss": 0.4414, "step": 2610 }, { "epoch": 0.7951743237603854, "grad_norm": 0.46367852635576223, "learning_rate": 1.228673528026741e-06, "loss": 0.443, "step": 2620 }, { "epoch": 0.7982093402632877, "grad_norm": 0.458931669471863, "learning_rate": 1.194102150797326e-06, "loss": 0.445, "step": 2630 }, { "epoch": 0.8012443567661899, "grad_norm": 0.5805342428508335, "learning_rate": 1.1599580445653496e-06, "loss": 0.4416, "step": 2640 }, { "epoch": 0.8042793732690922, "grad_norm": 0.47446300706665095, "learning_rate": 1.1262450425357175e-06, "loss": 0.4527, "step": 2650 }, { "epoch": 0.8073143897719944, "grad_norm": 0.4585780912736001, "learning_rate": 1.092966929515218e-06, "loss": 0.44, "step": 2660 }, { "epoch": 0.8103494062748966, "grad_norm": 0.48233598242441533, "learning_rate": 1.0601274414876067e-06, "loss": 0.4455, "step": 2670 }, { "epoch": 0.8133844227777989, "grad_norm": 0.46200584695129754, "learning_rate": 1.0277302651941894e-06, "loss": 0.4446, "step": 2680 }, { "epoch": 0.816419439280701, "grad_norm": 0.4874246650348082, "learning_rate": 9.95779037719926e-07, "loss": 0.4397, "step": 2690 }, { "epoch": 0.8194544557836033, "grad_norm": 0.4547919796548769, "learning_rate": 9.642773460851141e-07, "loss": 0.4473, "step": 2700 }, { "epoch": 0.8224894722865056, "grad_norm": 0.4972539970599672, "learning_rate": 9.332287268426881e-07, "loss": 0.4425, "step": 2710 }, { "epoch": 0.8255244887894078, "grad_norm": 0.43865232063488374, "learning_rate": 9.026366656811835e-07, "loss": 0.4401, "step": 2720 }, { "epoch": 0.82855950529231, "grad_norm": 1.1602238104916722, "learning_rate": 8.725045970334262e-07, "loss": 0.4504, "step": 2730 }, { "epoch": 0.8315945217952123, "grad_norm": 0.4663648778218256, "learning_rate": 8.428359036909455e-07, "loss": 0.4391, "step": 2740 }, { "epoch": 0.8346295382981145, "grad_norm": 0.48682417123785116, "learning_rate": 8.136339164242241e-07, "loss": 0.4467, "step": 2750 }, { "epoch": 0.8376645548010168, "grad_norm": 0.4673166286188927, "learning_rate": 7.849019136087477e-07, "loss": 0.4398, "step": 2760 }, { "epoch": 0.8406995713039189, "grad_norm": 0.4903812570342262, "learning_rate": 7.566431208569747e-07, "loss": 0.4413, "step": 2770 }, { "epoch": 0.8437345878068212, "grad_norm": 0.485844612436569, "learning_rate": 7.288607106561935e-07, "loss": 0.4451, "step": 2780 }, { "epoch": 0.8467696043097235, "grad_norm": 0.4585651698156433, "learning_rate": 7.015578020123804e-07, "loss": 0.439, "step": 2790 }, { "epoch": 0.8498046208126256, "grad_norm": 0.4839043585723541, "learning_rate": 6.747374601000229e-07, "loss": 0.4451, "step": 2800 }, { "epoch": 0.8528396373155279, "grad_norm": 0.45692832107739795, "learning_rate": 6.484026959180256e-07, "loss": 0.439, "step": 2810 }, { "epoch": 0.8558746538184301, "grad_norm": 0.8934971320404392, "learning_rate": 6.225564659516653e-07, "loss": 0.4427, "step": 2820 }, { "epoch": 0.8589096703213324, "grad_norm": 0.42447350665857003, "learning_rate": 5.972016718406832e-07, "loss": 0.445, "step": 2830 }, { "epoch": 0.8619446868242346, "grad_norm": 0.5221390912623963, "learning_rate": 5.723411600535378e-07, "loss": 0.4493, "step": 2840 }, { "epoch": 0.8649797033271368, "grad_norm": 0.470728180227603, "learning_rate": 5.4797772156783e-07, "loss": 0.4436, "step": 2850 }, { "epoch": 0.8680147198300391, "grad_norm": 0.47672096853939294, "learning_rate": 5.24114091556992e-07, "loss": 0.4405, "step": 2860 }, { "epoch": 0.8710497363329414, "grad_norm": 0.4546129075040933, "learning_rate": 5.00752949083202e-07, "loss": 0.4534, "step": 2870 }, { "epoch": 0.8740847528358435, "grad_norm": 0.463106848164873, "learning_rate": 4.778969167966346e-07, "loss": 0.444, "step": 2880 }, { "epoch": 0.8771197693387458, "grad_norm": 0.4491761462781034, "learning_rate": 4.5554856064101314e-07, "loss": 0.436, "step": 2890 }, { "epoch": 0.880154785841648, "grad_norm": 0.4675366494918407, "learning_rate": 4.337103895655581e-07, "loss": 0.4531, "step": 2900 }, { "epoch": 0.8831898023445502, "grad_norm": 0.44541163156936503, "learning_rate": 4.123848552433019e-07, "loss": 0.4375, "step": 2910 }, { "epoch": 0.8862248188474525, "grad_norm": 0.49240823007466994, "learning_rate": 3.9157435179586756e-07, "loss": 0.4374, "step": 2920 }, { "epoch": 0.8892598353503547, "grad_norm": 0.4733511579541881, "learning_rate": 3.712812155246759e-07, "loss": 0.4441, "step": 2930 }, { "epoch": 0.892294851853257, "grad_norm": 0.5930159408214484, "learning_rate": 3.5150772464867314e-07, "loss": 0.4441, "step": 2940 }, { "epoch": 0.8953298683561592, "grad_norm": 0.4915855850221476, "learning_rate": 3.322560990485535e-07, "loss": 0.4475, "step": 2950 }, { "epoch": 0.8983648848590614, "grad_norm": 0.4400578392570043, "learning_rate": 3.135285000175531e-07, "loss": 0.437, "step": 2960 }, { "epoch": 0.9013999013619637, "grad_norm": 1.0625536086809468, "learning_rate": 2.953270300188038e-07, "loss": 0.4461, "step": 2970 }, { "epoch": 0.9044349178648659, "grad_norm": 0.47898803434881615, "learning_rate": 2.776537324493045e-07, "loss": 0.4411, "step": 2980 }, { "epoch": 0.9074699343677681, "grad_norm": 0.48013757465246243, "learning_rate": 2.6051059141051713e-07, "loss": 0.4463, "step": 2990 }, { "epoch": 0.9105049508706704, "grad_norm": 0.46792544413322756, "learning_rate": 2.4389953148561574e-07, "loss": 0.4541, "step": 3000 }, { "epoch": 0.9135399673735726, "grad_norm": 0.47509286467451683, "learning_rate": 2.2782241752343004e-07, "loss": 0.4392, "step": 3010 }, { "epoch": 0.9165749838764748, "grad_norm": 0.5355760986925688, "learning_rate": 2.122810544290782e-07, "loss": 0.4459, "step": 3020 }, { "epoch": 0.919610000379377, "grad_norm": 0.4481951370049728, "learning_rate": 1.972771869613499e-07, "loss": 0.4408, "step": 3030 }, { "epoch": 0.9226450168822793, "grad_norm": 1.212596637597613, "learning_rate": 1.8281249953681633e-07, "loss": 0.4524, "step": 3040 }, { "epoch": 0.9256800333851816, "grad_norm": 0.4653062821594877, "learning_rate": 1.6888861604074158e-07, "loss": 0.4547, "step": 3050 }, { "epoch": 0.9287150498880837, "grad_norm": 0.4417864126009544, "learning_rate": 1.5550709964476606e-07, "loss": 0.4343, "step": 3060 }, { "epoch": 0.931750066390986, "grad_norm": 0.4636627611118036, "learning_rate": 1.4266945263142152e-07, "loss": 0.4442, "step": 3070 }, { "epoch": 0.9347850828938883, "grad_norm": 0.4750602636819015, "learning_rate": 1.3037711622547633e-07, "loss": 0.4402, "step": 3080 }, { "epoch": 0.9378200993967905, "grad_norm": 0.5641879257431062, "learning_rate": 1.1863147043213453e-07, "loss": 0.4463, "step": 3090 }, { "epoch": 0.9408551158996927, "grad_norm": 0.48298472160583994, "learning_rate": 1.0743383388210849e-07, "loss": 0.4509, "step": 3100 }, { "epoch": 0.9438901324025949, "grad_norm": 0.4709867848354663, "learning_rate": 9.678546368358299e-08, "loss": 0.4469, "step": 3110 }, { "epoch": 0.9469251489054972, "grad_norm": 0.45111687792423893, "learning_rate": 8.668755528108586e-08, "loss": 0.446, "step": 3120 }, { "epoch": 0.9499601654083994, "grad_norm": 0.4640080663242888, "learning_rate": 7.714124232127974e-08, "loss": 0.447, "step": 3130 }, { "epoch": 0.9529951819113016, "grad_norm": 0.4835533725431675, "learning_rate": 6.814759652569391e-08, "loss": 0.4471, "step": 3140 }, { "epoch": 0.9560301984142039, "grad_norm": 0.4775784576850005, "learning_rate": 5.970762757040339e-08, "loss": 0.4581, "step": 3150 }, { "epoch": 0.9590652149171061, "grad_norm": 0.4406150853474837, "learning_rate": 5.182228297268388e-08, "loss": 0.4377, "step": 3160 }, { "epoch": 0.9621002314200083, "grad_norm": 0.4875065749326578, "learning_rate": 4.449244798463037e-08, "loss": 0.4466, "step": 3170 }, { "epoch": 0.9651352479229106, "grad_norm": 0.5331920537623118, "learning_rate": 3.7718945493781523e-08, "loss": 0.4457, "step": 3180 }, { "epoch": 0.9681702644258128, "grad_norm": 0.4829553299763894, "learning_rate": 3.150253593073027e-08, "loss": 0.4373, "step": 3190 }, { "epoch": 0.971205280928715, "grad_norm": 0.4683361415791196, "learning_rate": 2.5843917183761002e-08, "loss": 0.4541, "step": 3200 }, { "epoch": 0.9742402974316173, "grad_norm": 0.4647148561695759, "learning_rate": 2.0743724520495e-08, "loss": 0.4372, "step": 3210 }, { "epoch": 0.9772753139345195, "grad_norm": 0.45464384300389377, "learning_rate": 1.6202530516574165e-08, "loss": 0.4495, "step": 3220 }, { "epoch": 0.9803103304374218, "grad_norm": 0.4823468219123769, "learning_rate": 1.222084499138243e-08, "loss": 0.4408, "step": 3230 }, { "epoch": 0.9833453469403239, "grad_norm": 0.4802790220017127, "learning_rate": 8.799114950806542e-09, "loss": 0.4419, "step": 3240 }, { "epoch": 0.9863803634432262, "grad_norm": 0.46706355617957335, "learning_rate": 5.9377245370551005e-09, "loss": 0.4453, "step": 3250 }, { "epoch": 0.9894153799461285, "grad_norm": 0.440626466158207, "learning_rate": 3.636994985534159e-09, "loss": 0.4345, "step": 3260 }, { "epoch": 0.9924503964490307, "grad_norm": 0.4555769223211312, "learning_rate": 1.8971845887794105e-09, "loss": 0.4468, "step": 3270 }, { "epoch": 0.9954854129519329, "grad_norm": 0.4398220412821882, "learning_rate": 7.184886674627134e-10, "loss": 0.4384, "step": 3280 }, { "epoch": 0.9985204294548352, "grad_norm": 0.45149244572807423, "learning_rate": 1.010395484624116e-10, "loss": 0.44, "step": 3290 }, { "epoch": 1.0, "step": 3295, "total_flos": 8.530187773318005e+18, "train_loss": 0.4728999632081421, "train_runtime": 62323.8697, "train_samples_per_second": 6.767, "train_steps_per_second": 0.053 } ], "logging_steps": 10, "max_steps": 3295, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.530187773318005e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }