{ "best_global_step": 96000, "best_metric": 3.5291364192962646, "best_model_checkpoint": "/scratch/cl5625/exceptions/models/swap_require_to_push_5039/checkpoint-40000", "epoch": 33.76065440149045, "eval_steps": 1000, "global_step": 116000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014555193292966931, "grad_norm": 1.59328293800354, "learning_rate": 0.000294, "loss": 8.4848, "step": 50 }, { "epoch": 0.029110386585933862, "grad_norm": 0.6936681270599365, "learning_rate": 0.0005939999999999999, "loss": 6.7461, "step": 100 }, { "epoch": 0.04366557987890079, "grad_norm": 0.5175042152404785, "learning_rate": 0.0005998287711124053, "loss": 6.3607, "step": 150 }, { "epoch": 0.058220773171867725, "grad_norm": 0.49561843276023865, "learning_rate": 0.000599654047757717, "loss": 6.1364, "step": 200 }, { "epoch": 0.07277596646483465, "grad_norm": 0.6248387098312378, "learning_rate": 0.0005994793244030285, "loss": 5.9948, "step": 250 }, { "epoch": 0.08733115975780158, "grad_norm": 0.43479761481285095, "learning_rate": 0.00059930460104834, "loss": 5.8855, "step": 300 }, { "epoch": 0.10188635305076851, "grad_norm": 0.47381791472435, "learning_rate": 0.0005991298776936517, "loss": 5.7523, "step": 350 }, { "epoch": 0.11644154634373545, "grad_norm": 0.5421844124794006, "learning_rate": 0.0005989551543389632, "loss": 5.6323, "step": 400 }, { "epoch": 0.1309967396367024, "grad_norm": 0.4568154811859131, "learning_rate": 0.0005987804309842748, "loss": 5.5149, "step": 450 }, { "epoch": 0.1455519329296693, "grad_norm": 0.42767488956451416, "learning_rate": 0.0005986057076295864, "loss": 5.4131, "step": 500 }, { "epoch": 0.16010712622263623, "grad_norm": 0.44224125146865845, "learning_rate": 0.0005984309842748981, "loss": 5.3338, "step": 550 }, { "epoch": 0.17466231951560315, "grad_norm": 0.41784995794296265, "learning_rate": 0.0005982562609202096, "loss": 5.2578, "step": 600 }, { "epoch": 0.1892175128085701, "grad_norm": 0.4385405480861664, "learning_rate": 0.0005980815375655212, "loss": 5.1979, "step": 650 }, { "epoch": 0.20377270610153703, "grad_norm": 0.44367310404777527, "learning_rate": 0.0005979068142108328, "loss": 5.1412, "step": 700 }, { "epoch": 0.21832789939450395, "grad_norm": 0.4334621727466583, "learning_rate": 0.0005977320908561445, "loss": 5.0751, "step": 750 }, { "epoch": 0.2328830926874709, "grad_norm": 0.5640344023704529, "learning_rate": 0.000597557367501456, "loss": 5.0352, "step": 800 }, { "epoch": 0.24743828598043782, "grad_norm": 0.5306651592254639, "learning_rate": 0.0005973826441467675, "loss": 4.9753, "step": 850 }, { "epoch": 0.2619934792734048, "grad_norm": 0.4476029872894287, "learning_rate": 0.0005972079207920792, "loss": 4.9104, "step": 900 }, { "epoch": 0.27654867256637167, "grad_norm": 0.46251359581947327, "learning_rate": 0.0005970331974373907, "loss": 4.8629, "step": 950 }, { "epoch": 0.2911038658593386, "grad_norm": 0.3950487971305847, "learning_rate": 0.0005968584740827023, "loss": 4.8239, "step": 1000 }, { "epoch": 0.2911038658593386, "eval_accuracy": 0.25577126647776893, "eval_loss": 4.746592044830322, "eval_runtime": 180.336, "eval_samples_per_second": 92.333, "eval_steps_per_second": 5.773, "step": 1000 }, { "epoch": 0.30565905915230557, "grad_norm": 0.4484369456768036, "learning_rate": 0.0005966837507280139, "loss": 4.7872, "step": 1050 }, { "epoch": 0.32021425244527246, "grad_norm": 0.4923895299434662, "learning_rate": 0.0005965090273733256, "loss": 4.7557, "step": 1100 }, { "epoch": 0.3347694457382394, "grad_norm": 0.4452304542064667, "learning_rate": 0.0005963343040186371, "loss": 4.7049, "step": 1150 }, { "epoch": 0.3493246390312063, "grad_norm": 0.4268929958343506, "learning_rate": 0.0005961595806639486, "loss": 4.6585, "step": 1200 }, { "epoch": 0.36387983232417326, "grad_norm": 0.4515587091445923, "learning_rate": 0.0005959848573092603, "loss": 4.6235, "step": 1250 }, { "epoch": 0.3784350256171402, "grad_norm": 0.39001479744911194, "learning_rate": 0.0005958101339545718, "loss": 4.5944, "step": 1300 }, { "epoch": 0.3929902189101071, "grad_norm": 0.41055354475975037, "learning_rate": 0.0005956354105998835, "loss": 4.5803, "step": 1350 }, { "epoch": 0.40754541220307405, "grad_norm": 0.5165393352508545, "learning_rate": 0.000595460687245195, "loss": 4.5481, "step": 1400 }, { "epoch": 0.422100605496041, "grad_norm": 0.4382358193397522, "learning_rate": 0.0005952859638905067, "loss": 4.5157, "step": 1450 }, { "epoch": 0.4366557987890079, "grad_norm": 0.434405118227005, "learning_rate": 0.0005951112405358182, "loss": 4.4938, "step": 1500 }, { "epoch": 0.45121099208197485, "grad_norm": 0.4131261706352234, "learning_rate": 0.0005949365171811299, "loss": 4.4815, "step": 1550 }, { "epoch": 0.4657661853749418, "grad_norm": 0.4448164999485016, "learning_rate": 0.0005947617938264414, "loss": 4.4603, "step": 1600 }, { "epoch": 0.4803213786679087, "grad_norm": 0.42042869329452515, "learning_rate": 0.000594587070471753, "loss": 4.4571, "step": 1650 }, { "epoch": 0.49487657196087564, "grad_norm": 0.4064828157424927, "learning_rate": 0.0005944123471170646, "loss": 4.4301, "step": 1700 }, { "epoch": 0.5094317652538426, "grad_norm": 0.42795395851135254, "learning_rate": 0.0005942376237623762, "loss": 4.3961, "step": 1750 }, { "epoch": 0.5239869585468095, "grad_norm": 0.4005245566368103, "learning_rate": 0.0005940629004076878, "loss": 4.3893, "step": 1800 }, { "epoch": 0.5385421518397764, "grad_norm": 0.4350132346153259, "learning_rate": 0.0005938881770529993, "loss": 4.3894, "step": 1850 }, { "epoch": 0.5530973451327433, "grad_norm": 0.40528157353401184, "learning_rate": 0.000593713453698311, "loss": 4.3639, "step": 1900 }, { "epoch": 0.5676525384257103, "grad_norm": 0.4326445162296295, "learning_rate": 0.0005935387303436226, "loss": 4.3532, "step": 1950 }, { "epoch": 0.5822077317186772, "grad_norm": 0.4028080701828003, "learning_rate": 0.0005933640069889342, "loss": 4.3346, "step": 2000 }, { "epoch": 0.5822077317186772, "eval_accuracy": 0.2990040383557413, "eval_loss": 4.286277770996094, "eval_runtime": 180.6066, "eval_samples_per_second": 92.195, "eval_steps_per_second": 5.764, "step": 2000 }, { "epoch": 0.5967629250116442, "grad_norm": 0.4349151849746704, "learning_rate": 0.0005931892836342457, "loss": 4.3212, "step": 2050 }, { "epoch": 0.6113181183046111, "grad_norm": 0.39076679944992065, "learning_rate": 0.0005930145602795573, "loss": 4.3166, "step": 2100 }, { "epoch": 0.625873311597578, "grad_norm": 0.3575606048107147, "learning_rate": 0.000592839836924869, "loss": 4.3079, "step": 2150 }, { "epoch": 0.6404285048905449, "grad_norm": 0.39779335260391235, "learning_rate": 0.0005926651135701805, "loss": 4.2939, "step": 2200 }, { "epoch": 0.6549836981835119, "grad_norm": 0.3903760612010956, "learning_rate": 0.0005924903902154921, "loss": 4.2769, "step": 2250 }, { "epoch": 0.6695388914764788, "grad_norm": 0.3906783163547516, "learning_rate": 0.0005923156668608037, "loss": 4.2664, "step": 2300 }, { "epoch": 0.6840940847694458, "grad_norm": 0.37090644240379333, "learning_rate": 0.0005921409435061153, "loss": 4.2565, "step": 2350 }, { "epoch": 0.6986492780624126, "grad_norm": 0.3849121928215027, "learning_rate": 0.0005919662201514268, "loss": 4.2491, "step": 2400 }, { "epoch": 0.7132044713553796, "grad_norm": 0.39589667320251465, "learning_rate": 0.0005917914967967384, "loss": 4.2398, "step": 2450 }, { "epoch": 0.7277596646483465, "grad_norm": 0.36322706937789917, "learning_rate": 0.0005916167734420501, "loss": 4.2298, "step": 2500 }, { "epoch": 0.7423148579413135, "grad_norm": 0.38025781512260437, "learning_rate": 0.0005914420500873616, "loss": 4.215, "step": 2550 }, { "epoch": 0.7568700512342804, "grad_norm": 0.3777216076850891, "learning_rate": 0.0005912673267326732, "loss": 4.2232, "step": 2600 }, { "epoch": 0.7714252445272474, "grad_norm": 0.35441768169403076, "learning_rate": 0.0005910926033779848, "loss": 4.1997, "step": 2650 }, { "epoch": 0.7859804378202142, "grad_norm": 0.37016597390174866, "learning_rate": 0.0005909178800232964, "loss": 4.1914, "step": 2700 }, { "epoch": 0.8005356311131812, "grad_norm": 0.39094072580337524, "learning_rate": 0.000590743156668608, "loss": 4.186, "step": 2750 }, { "epoch": 0.8150908244061481, "grad_norm": 0.38974034786224365, "learning_rate": 0.0005905684333139196, "loss": 4.1762, "step": 2800 }, { "epoch": 0.8296460176991151, "grad_norm": 0.3833460807800293, "learning_rate": 0.0005903937099592312, "loss": 4.1783, "step": 2850 }, { "epoch": 0.844201210992082, "grad_norm": 0.37555766105651855, "learning_rate": 0.0005902189866045427, "loss": 4.1713, "step": 2900 }, { "epoch": 0.858756404285049, "grad_norm": 0.36685946583747864, "learning_rate": 0.0005900442632498543, "loss": 4.1463, "step": 2950 }, { "epoch": 0.8733115975780158, "grad_norm": 0.37745383381843567, "learning_rate": 0.0005898695398951659, "loss": 4.1431, "step": 3000 }, { "epoch": 0.8733115975780158, "eval_accuracy": 0.31544481558261633, "eval_loss": 4.0944952964782715, "eval_runtime": 180.5379, "eval_samples_per_second": 92.23, "eval_steps_per_second": 5.766, "step": 3000 }, { "epoch": 0.8878667908709827, "grad_norm": 0.38102462887763977, "learning_rate": 0.0005896948165404776, "loss": 4.1363, "step": 3050 }, { "epoch": 0.9024219841639497, "grad_norm": 0.3536563813686371, "learning_rate": 0.0005895200931857891, "loss": 4.1421, "step": 3100 }, { "epoch": 0.9169771774569166, "grad_norm": 0.3871707320213318, "learning_rate": 0.0005893453698311007, "loss": 4.1249, "step": 3150 }, { "epoch": 0.9315323707498836, "grad_norm": 0.34409627318382263, "learning_rate": 0.0005891706464764123, "loss": 4.1198, "step": 3200 }, { "epoch": 0.9460875640428504, "grad_norm": 0.3520107567310333, "learning_rate": 0.0005889959231217238, "loss": 4.1088, "step": 3250 }, { "epoch": 0.9606427573358174, "grad_norm": 0.36660316586494446, "learning_rate": 0.0005888211997670355, "loss": 4.1079, "step": 3300 }, { "epoch": 0.9751979506287843, "grad_norm": 0.3971594572067261, "learning_rate": 0.000588646476412347, "loss": 4.1049, "step": 3350 }, { "epoch": 0.9897531439217513, "grad_norm": 0.36216068267822266, "learning_rate": 0.0005884717530576587, "loss": 4.0896, "step": 3400 }, { "epoch": 1.0040754541220307, "grad_norm": 0.3755367398262024, "learning_rate": 0.0005882970297029702, "loss": 4.0812, "step": 3450 }, { "epoch": 1.0186306474149978, "grad_norm": 0.3382784426212311, "learning_rate": 0.0005881223063482818, "loss": 4.0265, "step": 3500 }, { "epoch": 1.0331858407079646, "grad_norm": 0.3530285656452179, "learning_rate": 0.0005879475829935934, "loss": 4.0113, "step": 3550 }, { "epoch": 1.0477410340009314, "grad_norm": 0.36967766284942627, "learning_rate": 0.0005877728596389051, "loss": 4.0183, "step": 3600 }, { "epoch": 1.0622962272938985, "grad_norm": 0.35606473684310913, "learning_rate": 0.0005875981362842166, "loss": 4.0121, "step": 3650 }, { "epoch": 1.0768514205868653, "grad_norm": 0.3531215786933899, "learning_rate": 0.0005874234129295281, "loss": 4.0223, "step": 3700 }, { "epoch": 1.0914066138798324, "grad_norm": 0.37718191742897034, "learning_rate": 0.0005872486895748398, "loss": 4.0102, "step": 3750 }, { "epoch": 1.1059618071727992, "grad_norm": 0.3446013331413269, "learning_rate": 0.0005870739662201513, "loss": 4.0073, "step": 3800 }, { "epoch": 1.120517000465766, "grad_norm": 0.36104434728622437, "learning_rate": 0.000586899242865463, "loss": 3.9866, "step": 3850 }, { "epoch": 1.1350721937587331, "grad_norm": 0.34700337052345276, "learning_rate": 0.0005867245195107746, "loss": 3.9953, "step": 3900 }, { "epoch": 1.1496273870517, "grad_norm": 0.3538568913936615, "learning_rate": 0.0005865497961560862, "loss": 3.9803, "step": 3950 }, { "epoch": 1.164182580344667, "grad_norm": 0.3729252219200134, "learning_rate": 0.0005863750728013977, "loss": 3.9889, "step": 4000 }, { "epoch": 1.164182580344667, "eval_accuracy": 0.3248556970362317, "eval_loss": 3.9908831119537354, "eval_runtime": 180.4355, "eval_samples_per_second": 92.282, "eval_steps_per_second": 5.769, "step": 4000 }, { "epoch": 1.1787377736376339, "grad_norm": 0.350835919380188, "learning_rate": 0.0005862003494467094, "loss": 3.9775, "step": 4050 }, { "epoch": 1.193292966930601, "grad_norm": 0.3552381694316864, "learning_rate": 0.0005860256260920209, "loss": 3.9815, "step": 4100 }, { "epoch": 1.2078481602235678, "grad_norm": 0.3644408583641052, "learning_rate": 0.0005858509027373325, "loss": 3.9955, "step": 4150 }, { "epoch": 1.2224033535165346, "grad_norm": 0.34302929043769836, "learning_rate": 0.0005856761793826441, "loss": 3.9613, "step": 4200 }, { "epoch": 1.2369585468095017, "grad_norm": 0.352975457906723, "learning_rate": 0.0005855014560279557, "loss": 3.974, "step": 4250 }, { "epoch": 1.2515137401024685, "grad_norm": 0.3483916223049164, "learning_rate": 0.0005853267326732673, "loss": 3.967, "step": 4300 }, { "epoch": 1.2660689333954354, "grad_norm": 0.3443877100944519, "learning_rate": 0.0005851520093185788, "loss": 3.9741, "step": 4350 }, { "epoch": 1.2806241266884024, "grad_norm": 0.33151453733444214, "learning_rate": 0.0005849772859638905, "loss": 3.9596, "step": 4400 }, { "epoch": 1.2951793199813695, "grad_norm": 0.3247957229614258, "learning_rate": 0.0005848025626092021, "loss": 3.9726, "step": 4450 }, { "epoch": 1.3097345132743363, "grad_norm": 0.3368767201900482, "learning_rate": 0.0005846278392545136, "loss": 3.9584, "step": 4500 }, { "epoch": 1.3242897065673032, "grad_norm": 0.3503372371196747, "learning_rate": 0.0005844531158998252, "loss": 3.9589, "step": 4550 }, { "epoch": 1.3388448998602702, "grad_norm": 0.3575039207935333, "learning_rate": 0.0005842783925451368, "loss": 3.9638, "step": 4600 }, { "epoch": 1.353400093153237, "grad_norm": 0.32507437467575073, "learning_rate": 0.0005841036691904484, "loss": 3.9534, "step": 4650 }, { "epoch": 1.367955286446204, "grad_norm": 0.35451099276542664, "learning_rate": 0.00058392894583576, "loss": 3.9471, "step": 4700 }, { "epoch": 1.382510479739171, "grad_norm": 0.36062905192375183, "learning_rate": 0.0005837542224810716, "loss": 3.9468, "step": 4750 }, { "epoch": 1.3970656730321378, "grad_norm": 0.3349565863609314, "learning_rate": 0.0005835794991263832, "loss": 3.944, "step": 4800 }, { "epoch": 1.4116208663251049, "grad_norm": 0.33899959921836853, "learning_rate": 0.0005834047757716948, "loss": 3.9371, "step": 4850 }, { "epoch": 1.4261760596180717, "grad_norm": 0.32472407817840576, "learning_rate": 0.0005832300524170063, "loss": 3.9396, "step": 4900 }, { "epoch": 1.4407312529110388, "grad_norm": 0.31866762042045593, "learning_rate": 0.0005830553290623179, "loss": 3.9223, "step": 4950 }, { "epoch": 1.4552864462040056, "grad_norm": 0.33732911944389343, "learning_rate": 0.0005828806057076296, "loss": 3.9256, "step": 5000 }, { "epoch": 1.4552864462040056, "eval_accuracy": 0.3322673214974718, "eval_loss": 3.9127135276794434, "eval_runtime": 180.5829, "eval_samples_per_second": 92.207, "eval_steps_per_second": 5.765, "step": 5000 }, { "epoch": 1.4698416394969724, "grad_norm": 0.3338392376899719, "learning_rate": 0.0005827058823529411, "loss": 3.9309, "step": 5050 }, { "epoch": 1.4843968327899395, "grad_norm": 0.3375056982040405, "learning_rate": 0.0005825311589982527, "loss": 3.9315, "step": 5100 }, { "epoch": 1.4989520260829063, "grad_norm": 0.3378584384918213, "learning_rate": 0.0005823564356435643, "loss": 3.9224, "step": 5150 }, { "epoch": 1.5135072193758732, "grad_norm": 0.34928908944129944, "learning_rate": 0.0005821817122888759, "loss": 3.9263, "step": 5200 }, { "epoch": 1.5280624126688402, "grad_norm": 0.32031747698783875, "learning_rate": 0.0005820069889341875, "loss": 3.9183, "step": 5250 }, { "epoch": 1.5426176059618073, "grad_norm": 0.3607842028141022, "learning_rate": 0.000581832265579499, "loss": 3.9094, "step": 5300 }, { "epoch": 1.5571727992547741, "grad_norm": 0.3123273551464081, "learning_rate": 0.0005816575422248107, "loss": 3.8999, "step": 5350 }, { "epoch": 1.571727992547741, "grad_norm": 0.3072376549243927, "learning_rate": 0.0005814828188701222, "loss": 3.9049, "step": 5400 }, { "epoch": 1.586283185840708, "grad_norm": 0.35170677304267883, "learning_rate": 0.0005813080955154338, "loss": 3.9047, "step": 5450 }, { "epoch": 1.6008383791336749, "grad_norm": 0.3275664448738098, "learning_rate": 0.0005811333721607454, "loss": 3.9068, "step": 5500 }, { "epoch": 1.6153935724266417, "grad_norm": 0.34021180868148804, "learning_rate": 0.0005809586488060571, "loss": 3.9025, "step": 5550 }, { "epoch": 1.6299487657196088, "grad_norm": 0.33757513761520386, "learning_rate": 0.0005807839254513686, "loss": 3.8947, "step": 5600 }, { "epoch": 1.6445039590125758, "grad_norm": 0.372082382440567, "learning_rate": 0.0005806092020966802, "loss": 3.8851, "step": 5650 }, { "epoch": 1.6590591523055425, "grad_norm": 0.3107072710990906, "learning_rate": 0.0005804344787419918, "loss": 3.8832, "step": 5700 }, { "epoch": 1.6736143455985095, "grad_norm": 0.32251301407814026, "learning_rate": 0.0005802597553873033, "loss": 3.8966, "step": 5750 }, { "epoch": 1.6881695388914766, "grad_norm": 0.35399067401885986, "learning_rate": 0.000580085032032615, "loss": 3.8792, "step": 5800 }, { "epoch": 1.7027247321844434, "grad_norm": 0.3271541893482208, "learning_rate": 0.0005799103086779265, "loss": 3.8779, "step": 5850 }, { "epoch": 1.7172799254774103, "grad_norm": 0.3056686520576477, "learning_rate": 0.0005797355853232382, "loss": 3.8845, "step": 5900 }, { "epoch": 1.7318351187703773, "grad_norm": 0.34892335534095764, "learning_rate": 0.0005795608619685497, "loss": 3.8734, "step": 5950 }, { "epoch": 1.7463903120633442, "grad_norm": 0.33052363991737366, "learning_rate": 0.0005793861386138614, "loss": 3.8679, "step": 6000 }, { "epoch": 1.7463903120633442, "eval_accuracy": 0.33734191549058073, "eval_loss": 3.8543548583984375, "eval_runtime": 180.5456, "eval_samples_per_second": 92.226, "eval_steps_per_second": 5.766, "step": 6000 }, { "epoch": 1.760945505356311, "grad_norm": 0.3236662745475769, "learning_rate": 0.0005792114152591729, "loss": 3.8761, "step": 6050 }, { "epoch": 1.775500698649278, "grad_norm": 0.3223971128463745, "learning_rate": 0.0005790366919044846, "loss": 3.8762, "step": 6100 }, { "epoch": 1.7900558919422451, "grad_norm": 0.33235934376716614, "learning_rate": 0.0005788619685497961, "loss": 3.8641, "step": 6150 }, { "epoch": 1.804611085235212, "grad_norm": 0.3270372152328491, "learning_rate": 0.0005786872451951077, "loss": 3.8767, "step": 6200 }, { "epoch": 1.8191662785281788, "grad_norm": 0.31036415696144104, "learning_rate": 0.0005785125218404193, "loss": 3.8802, "step": 6250 }, { "epoch": 1.8337214718211459, "grad_norm": 0.33436477184295654, "learning_rate": 0.0005783377984857308, "loss": 3.8665, "step": 6300 }, { "epoch": 1.8482766651141127, "grad_norm": 0.3159632682800293, "learning_rate": 0.0005781630751310425, "loss": 3.8717, "step": 6350 }, { "epoch": 1.8628318584070795, "grad_norm": 0.3587862253189087, "learning_rate": 0.0005779883517763541, "loss": 3.8672, "step": 6400 }, { "epoch": 1.8773870517000466, "grad_norm": 0.3283621668815613, "learning_rate": 0.0005778136284216657, "loss": 3.8751, "step": 6450 }, { "epoch": 1.8919422449930137, "grad_norm": 0.31528571248054504, "learning_rate": 0.0005776389050669772, "loss": 3.8561, "step": 6500 }, { "epoch": 1.9064974382859803, "grad_norm": 0.321069598197937, "learning_rate": 0.0005774641817122889, "loss": 3.8516, "step": 6550 }, { "epoch": 1.9210526315789473, "grad_norm": 0.3143472969532013, "learning_rate": 0.0005772894583576004, "loss": 3.8567, "step": 6600 }, { "epoch": 1.9356078248719144, "grad_norm": 0.3452266752719879, "learning_rate": 0.000577114735002912, "loss": 3.8509, "step": 6650 }, { "epoch": 1.9501630181648812, "grad_norm": 0.3208848834037781, "learning_rate": 0.0005769400116482236, "loss": 3.8378, "step": 6700 }, { "epoch": 1.964718211457848, "grad_norm": 0.31689900159835815, "learning_rate": 0.0005767652882935352, "loss": 3.8385, "step": 6750 }, { "epoch": 1.9792734047508151, "grad_norm": 0.31279170513153076, "learning_rate": 0.0005765905649388468, "loss": 3.8336, "step": 6800 }, { "epoch": 1.993828598043782, "grad_norm": 0.3179241418838501, "learning_rate": 0.0005764158415841583, "loss": 3.8312, "step": 6850 }, { "epoch": 2.0081509082440614, "grad_norm": 0.3182215690612793, "learning_rate": 0.00057624111822947, "loss": 3.7798, "step": 6900 }, { "epoch": 2.0227061015370285, "grad_norm": 0.3253922462463379, "learning_rate": 0.0005760663948747816, "loss": 3.7416, "step": 6950 }, { "epoch": 2.0372612948299955, "grad_norm": 0.30186590552330017, "learning_rate": 0.0005758916715200931, "loss": 3.7456, "step": 7000 }, { "epoch": 2.0372612948299955, "eval_accuracy": 0.34163271988389243, "eval_loss": 3.8115620613098145, "eval_runtime": 180.4372, "eval_samples_per_second": 92.281, "eval_steps_per_second": 5.769, "step": 7000 }, { "epoch": 2.051816488122962, "grad_norm": 0.3521772027015686, "learning_rate": 0.0005757169481654047, "loss": 3.7446, "step": 7050 }, { "epoch": 2.066371681415929, "grad_norm": 0.3291357159614563, "learning_rate": 0.0005755422248107163, "loss": 3.7478, "step": 7100 }, { "epoch": 2.0809268747088963, "grad_norm": 0.3286155164241791, "learning_rate": 0.0005753675014560279, "loss": 3.7519, "step": 7150 }, { "epoch": 2.095482068001863, "grad_norm": 0.33210304379463196, "learning_rate": 0.0005751927781013395, "loss": 3.742, "step": 7200 }, { "epoch": 2.11003726129483, "grad_norm": 0.32869860529899597, "learning_rate": 0.0005750180547466511, "loss": 3.7514, "step": 7250 }, { "epoch": 2.124592454587797, "grad_norm": 0.3232797384262085, "learning_rate": 0.0005748433313919627, "loss": 3.7418, "step": 7300 }, { "epoch": 2.139147647880764, "grad_norm": 0.31019943952560425, "learning_rate": 0.0005746686080372743, "loss": 3.7577, "step": 7350 }, { "epoch": 2.1537028411737307, "grad_norm": 0.3321756422519684, "learning_rate": 0.0005744938846825858, "loss": 3.7577, "step": 7400 }, { "epoch": 2.1682580344666977, "grad_norm": 0.3237709403038025, "learning_rate": 0.0005743191613278974, "loss": 3.765, "step": 7450 }, { "epoch": 2.182813227759665, "grad_norm": 0.3210693299770355, "learning_rate": 0.0005741444379732091, "loss": 3.7457, "step": 7500 }, { "epoch": 2.1973684210526314, "grad_norm": 0.31137460470199585, "learning_rate": 0.0005739697146185206, "loss": 3.74, "step": 7550 }, { "epoch": 2.2119236143455985, "grad_norm": 0.36119261384010315, "learning_rate": 0.0005737949912638322, "loss": 3.7488, "step": 7600 }, { "epoch": 2.2264788076385655, "grad_norm": 0.32139718532562256, "learning_rate": 0.0005736202679091438, "loss": 3.752, "step": 7650 }, { "epoch": 2.241034000931532, "grad_norm": 0.3395932912826538, "learning_rate": 0.0005734455445544554, "loss": 3.7644, "step": 7700 }, { "epoch": 2.255589194224499, "grad_norm": 0.3269149959087372, "learning_rate": 0.000573270821199767, "loss": 3.7553, "step": 7750 }, { "epoch": 2.2701443875174663, "grad_norm": 0.33723190426826477, "learning_rate": 0.0005730960978450785, "loss": 3.7475, "step": 7800 }, { "epoch": 2.2846995808104333, "grad_norm": 0.33637166023254395, "learning_rate": 0.0005729213744903902, "loss": 3.748, "step": 7850 }, { "epoch": 2.2992547741034, "grad_norm": 0.31758731603622437, "learning_rate": 0.0005727466511357017, "loss": 3.7538, "step": 7900 }, { "epoch": 2.313809967396367, "grad_norm": 0.3541746735572815, "learning_rate": 0.0005725719277810134, "loss": 3.7556, "step": 7950 }, { "epoch": 2.328365160689334, "grad_norm": 0.3319081664085388, "learning_rate": 0.0005723972044263249, "loss": 3.7495, "step": 8000 }, { "epoch": 2.328365160689334, "eval_accuracy": 0.3446397735201814, "eval_loss": 3.7811524868011475, "eval_runtime": 180.4018, "eval_samples_per_second": 92.3, "eval_steps_per_second": 5.77, "step": 8000 }, { "epoch": 2.3429203539823007, "grad_norm": 0.3212926387786865, "learning_rate": 0.0005722224810716366, "loss": 3.7507, "step": 8050 }, { "epoch": 2.3574755472752678, "grad_norm": 0.3508698344230652, "learning_rate": 0.0005720477577169481, "loss": 3.7626, "step": 8100 }, { "epoch": 2.372030740568235, "grad_norm": 0.32493266463279724, "learning_rate": 0.0005718730343622598, "loss": 3.7551, "step": 8150 }, { "epoch": 2.386585933861202, "grad_norm": 0.32088997960090637, "learning_rate": 0.0005716983110075713, "loss": 3.7463, "step": 8200 }, { "epoch": 2.4011411271541685, "grad_norm": 0.33391422033309937, "learning_rate": 0.0005715235876528828, "loss": 3.7435, "step": 8250 }, { "epoch": 2.4156963204471356, "grad_norm": 0.32425162196159363, "learning_rate": 0.0005713488642981945, "loss": 3.7502, "step": 8300 }, { "epoch": 2.4302515137401026, "grad_norm": 0.31833091378211975, "learning_rate": 0.0005711741409435061, "loss": 3.7568, "step": 8350 }, { "epoch": 2.4448067070330692, "grad_norm": 0.3318716287612915, "learning_rate": 0.0005709994175888177, "loss": 3.74, "step": 8400 }, { "epoch": 2.4593619003260363, "grad_norm": 0.31370651721954346, "learning_rate": 0.0005708246942341292, "loss": 3.7474, "step": 8450 }, { "epoch": 2.4739170936190034, "grad_norm": 0.3109401762485504, "learning_rate": 0.0005706499708794409, "loss": 3.7389, "step": 8500 }, { "epoch": 2.4884722869119704, "grad_norm": 0.3356344699859619, "learning_rate": 0.0005704752475247524, "loss": 3.7445, "step": 8550 }, { "epoch": 2.503027480204937, "grad_norm": 0.3097998797893524, "learning_rate": 0.0005703005241700641, "loss": 3.739, "step": 8600 }, { "epoch": 2.517582673497904, "grad_norm": 0.3175306022167206, "learning_rate": 0.0005701258008153756, "loss": 3.7322, "step": 8650 }, { "epoch": 2.5321378667908707, "grad_norm": 0.32787057757377625, "learning_rate": 0.0005699510774606872, "loss": 3.7479, "step": 8700 }, { "epoch": 2.546693060083838, "grad_norm": 0.3269086480140686, "learning_rate": 0.0005697763541059988, "loss": 3.7487, "step": 8750 }, { "epoch": 2.561248253376805, "grad_norm": 0.3165985941886902, "learning_rate": 0.0005696016307513103, "loss": 3.7433, "step": 8800 }, { "epoch": 2.575803446669772, "grad_norm": 0.3171882927417755, "learning_rate": 0.000569426907396622, "loss": 3.7381, "step": 8850 }, { "epoch": 2.590358639962739, "grad_norm": 0.32805752754211426, "learning_rate": 0.0005692521840419336, "loss": 3.7386, "step": 8900 }, { "epoch": 2.6049138332557056, "grad_norm": 0.31334584951400757, "learning_rate": 0.0005690774606872452, "loss": 3.7288, "step": 8950 }, { "epoch": 2.6194690265486726, "grad_norm": 0.3102093040943146, "learning_rate": 0.0005689027373325567, "loss": 3.7333, "step": 9000 }, { "epoch": 2.6194690265486726, "eval_accuracy": 0.34755609607669175, "eval_loss": 3.7507848739624023, "eval_runtime": 180.4183, "eval_samples_per_second": 92.291, "eval_steps_per_second": 5.77, "step": 9000 }, { "epoch": 2.6340242198416393, "grad_norm": 0.3321838974952698, "learning_rate": 0.0005687280139778683, "loss": 3.7319, "step": 9050 }, { "epoch": 2.6485794131346063, "grad_norm": 0.3211287260055542, "learning_rate": 0.0005685532906231799, "loss": 3.752, "step": 9100 }, { "epoch": 2.6631346064275734, "grad_norm": 0.3193918466567993, "learning_rate": 0.0005683785672684915, "loss": 3.7149, "step": 9150 }, { "epoch": 2.6776897997205404, "grad_norm": 0.3272760212421417, "learning_rate": 0.0005682038439138031, "loss": 3.7346, "step": 9200 }, { "epoch": 2.692244993013507, "grad_norm": 0.3239770829677582, "learning_rate": 0.0005680291205591147, "loss": 3.732, "step": 9250 }, { "epoch": 2.706800186306474, "grad_norm": 0.31155821681022644, "learning_rate": 0.0005678543972044263, "loss": 3.7331, "step": 9300 }, { "epoch": 2.721355379599441, "grad_norm": 0.3312417268753052, "learning_rate": 0.0005676796738497378, "loss": 3.7357, "step": 9350 }, { "epoch": 2.735910572892408, "grad_norm": 0.3030058741569519, "learning_rate": 0.0005675049504950495, "loss": 3.7368, "step": 9400 }, { "epoch": 2.750465766185375, "grad_norm": 0.31816330552101135, "learning_rate": 0.0005673302271403611, "loss": 3.7218, "step": 9450 }, { "epoch": 2.765020959478342, "grad_norm": 0.3228764235973358, "learning_rate": 0.0005671555037856726, "loss": 3.7303, "step": 9500 }, { "epoch": 2.779576152771309, "grad_norm": 0.32315465807914734, "learning_rate": 0.0005669807804309842, "loss": 3.7211, "step": 9550 }, { "epoch": 2.7941313460642756, "grad_norm": 0.320466011762619, "learning_rate": 0.0005668060570762958, "loss": 3.7183, "step": 9600 }, { "epoch": 2.8086865393572427, "grad_norm": 0.30546510219573975, "learning_rate": 0.0005666313337216074, "loss": 3.7183, "step": 9650 }, { "epoch": 2.8232417326502097, "grad_norm": 0.31467297673225403, "learning_rate": 0.000566456610366919, "loss": 3.7162, "step": 9700 }, { "epoch": 2.8377969259431763, "grad_norm": 0.3176824450492859, "learning_rate": 0.0005662818870122306, "loss": 3.7254, "step": 9750 }, { "epoch": 2.8523521192361434, "grad_norm": 0.2985418438911438, "learning_rate": 0.0005661071636575422, "loss": 3.7329, "step": 9800 }, { "epoch": 2.8669073125291105, "grad_norm": 0.30694860219955444, "learning_rate": 0.0005659324403028537, "loss": 3.7151, "step": 9850 }, { "epoch": 2.8814625058220775, "grad_norm": 0.3074248433113098, "learning_rate": 0.0005657577169481653, "loss": 3.7288, "step": 9900 }, { "epoch": 2.896017699115044, "grad_norm": 0.32826361060142517, "learning_rate": 0.0005655829935934769, "loss": 3.7323, "step": 9950 }, { "epoch": 2.910572892408011, "grad_norm": 0.32125818729400635, "learning_rate": 0.0005654082702387886, "loss": 3.7235, "step": 10000 }, { "epoch": 2.910572892408011, "eval_accuracy": 0.34995247783405636, "eval_loss": 3.72528076171875, "eval_runtime": 180.3706, "eval_samples_per_second": 92.315, "eval_steps_per_second": 5.771, "step": 10000 }, { "epoch": 2.9251280857009783, "grad_norm": 0.3392745554447174, "learning_rate": 0.0005652335468841001, "loss": 3.7138, "step": 10050 }, { "epoch": 2.939683278993945, "grad_norm": 0.319692999124527, "learning_rate": 0.0005650588235294117, "loss": 3.7249, "step": 10100 }, { "epoch": 2.954238472286912, "grad_norm": 0.3139653205871582, "learning_rate": 0.0005648841001747233, "loss": 3.7188, "step": 10150 }, { "epoch": 2.968793665579879, "grad_norm": 0.32949382066726685, "learning_rate": 0.0005647093768200349, "loss": 3.7033, "step": 10200 }, { "epoch": 2.983348858872846, "grad_norm": 0.3081764280796051, "learning_rate": 0.0005645346534653465, "loss": 3.7083, "step": 10250 }, { "epoch": 2.9979040521658127, "grad_norm": 0.3366018831729889, "learning_rate": 0.0005643599301106582, "loss": 3.7284, "step": 10300 }, { "epoch": 3.0122263623660923, "grad_norm": 0.32959625124931335, "learning_rate": 0.0005641852067559697, "loss": 3.6255, "step": 10350 }, { "epoch": 3.026781555659059, "grad_norm": 0.32114019989967346, "learning_rate": 0.0005640104834012812, "loss": 3.6153, "step": 10400 }, { "epoch": 3.041336748952026, "grad_norm": 0.3065637946128845, "learning_rate": 0.0005638357600465929, "loss": 3.6146, "step": 10450 }, { "epoch": 3.055891942244993, "grad_norm": 0.32524827122688293, "learning_rate": 0.0005636610366919044, "loss": 3.6265, "step": 10500 }, { "epoch": 3.07044713553796, "grad_norm": 0.33376210927963257, "learning_rate": 0.0005634863133372161, "loss": 3.6109, "step": 10550 }, { "epoch": 3.0850023288309267, "grad_norm": 0.32622119784355164, "learning_rate": 0.0005633115899825276, "loss": 3.6125, "step": 10600 }, { "epoch": 3.099557522123894, "grad_norm": 0.34294867515563965, "learning_rate": 0.0005631368666278393, "loss": 3.6206, "step": 10650 }, { "epoch": 3.114112715416861, "grad_norm": 0.3191373348236084, "learning_rate": 0.0005629621432731508, "loss": 3.6227, "step": 10700 }, { "epoch": 3.1286679087098275, "grad_norm": 0.30693554878234863, "learning_rate": 0.0005627874199184623, "loss": 3.6275, "step": 10750 }, { "epoch": 3.1432231020027945, "grad_norm": 0.3276565670967102, "learning_rate": 0.000562612696563774, "loss": 3.6399, "step": 10800 }, { "epoch": 3.1577782952957616, "grad_norm": 0.31640875339508057, "learning_rate": 0.0005624379732090856, "loss": 3.626, "step": 10850 }, { "epoch": 3.1723334885887287, "grad_norm": 0.3158662021160126, "learning_rate": 0.0005622632498543972, "loss": 3.6269, "step": 10900 }, { "epoch": 3.1868886818816953, "grad_norm": 0.2997385263442993, "learning_rate": 0.0005620885264997087, "loss": 3.6182, "step": 10950 }, { "epoch": 3.2014438751746623, "grad_norm": 0.31996774673461914, "learning_rate": 0.0005619138031450204, "loss": 3.6348, "step": 11000 }, { "epoch": 3.2014438751746623, "eval_accuracy": 0.35218056049007007, "eval_loss": 3.7120718955993652, "eval_runtime": 180.5995, "eval_samples_per_second": 92.198, "eval_steps_per_second": 5.764, "step": 11000 }, { "epoch": 3.2159990684676294, "grad_norm": 0.6666705012321472, "learning_rate": 0.0005617390797903319, "loss": 3.6281, "step": 11050 }, { "epoch": 3.230554261760596, "grad_norm": 0.3219234347343445, "learning_rate": 0.0005615643564356436, "loss": 3.6394, "step": 11100 }, { "epoch": 3.245109455053563, "grad_norm": 0.31070762872695923, "learning_rate": 0.0005613896330809551, "loss": 3.649, "step": 11150 }, { "epoch": 3.25966464834653, "grad_norm": 0.3365066349506378, "learning_rate": 0.0005612149097262667, "loss": 3.644, "step": 11200 }, { "epoch": 3.274219841639497, "grad_norm": 0.3324010372161865, "learning_rate": 0.0005610401863715783, "loss": 3.6291, "step": 11250 }, { "epoch": 3.288775034932464, "grad_norm": 0.33356112241744995, "learning_rate": 0.0005608654630168898, "loss": 3.6333, "step": 11300 }, { "epoch": 3.303330228225431, "grad_norm": 0.3165360391139984, "learning_rate": 0.0005606907396622015, "loss": 3.6389, "step": 11350 }, { "epoch": 3.317885421518398, "grad_norm": 0.3153711259365082, "learning_rate": 0.0005605160163075131, "loss": 3.6384, "step": 11400 }, { "epoch": 3.3324406148113646, "grad_norm": 0.31237781047821045, "learning_rate": 0.0005603412929528247, "loss": 3.6354, "step": 11450 }, { "epoch": 3.3469958081043316, "grad_norm": 0.3095959424972534, "learning_rate": 0.0005601665695981362, "loss": 3.6423, "step": 11500 }, { "epoch": 3.3615510013972987, "grad_norm": 0.3354591429233551, "learning_rate": 0.0005599918462434478, "loss": 3.6412, "step": 11550 }, { "epoch": 3.3761061946902653, "grad_norm": 0.3205614984035492, "learning_rate": 0.0005598171228887594, "loss": 3.6297, "step": 11600 }, { "epoch": 3.3906613879832324, "grad_norm": 0.3375381827354431, "learning_rate": 0.0005596423995340709, "loss": 3.6476, "step": 11650 }, { "epoch": 3.4052165812761994, "grad_norm": 0.3077198266983032, "learning_rate": 0.0005594676761793826, "loss": 3.647, "step": 11700 }, { "epoch": 3.419771774569166, "grad_norm": 0.31726962327957153, "learning_rate": 0.0005592929528246942, "loss": 3.6382, "step": 11750 }, { "epoch": 3.434326967862133, "grad_norm": 0.32717880606651306, "learning_rate": 0.0005591182294700058, "loss": 3.6427, "step": 11800 }, { "epoch": 3.4488821611551, "grad_norm": 0.3359130322933197, "learning_rate": 0.0005589435061153173, "loss": 3.6424, "step": 11850 }, { "epoch": 3.463437354448067, "grad_norm": 0.30671921372413635, "learning_rate": 0.000558768782760629, "loss": 3.6385, "step": 11900 }, { "epoch": 3.477992547741034, "grad_norm": 0.42041298747062683, "learning_rate": 0.0005585940594059406, "loss": 3.6319, "step": 11950 }, { "epoch": 3.492547741034001, "grad_norm": 0.33865419030189514, "learning_rate": 0.0005584193360512521, "loss": 3.6474, "step": 12000 }, { "epoch": 3.492547741034001, "eval_accuracy": 0.35361169048808033, "eval_loss": 3.693437099456787, "eval_runtime": 180.3628, "eval_samples_per_second": 92.32, "eval_steps_per_second": 5.772, "step": 12000 }, { "epoch": 3.507102934326968, "grad_norm": 0.30656898021698, "learning_rate": 0.0005582446126965637, "loss": 3.6506, "step": 12050 }, { "epoch": 3.5216581276199346, "grad_norm": 0.3042758107185364, "learning_rate": 0.0005580698893418753, "loss": 3.6428, "step": 12100 }, { "epoch": 3.5362133209129016, "grad_norm": 0.3031887710094452, "learning_rate": 0.0005578951659871869, "loss": 3.6357, "step": 12150 }, { "epoch": 3.5507685142058687, "grad_norm": 0.3300098180770874, "learning_rate": 0.0005577204426324985, "loss": 3.6459, "step": 12200 }, { "epoch": 3.5653237074988358, "grad_norm": 0.3157689571380615, "learning_rate": 0.0005575457192778101, "loss": 3.6369, "step": 12250 }, { "epoch": 3.5798789007918024, "grad_norm": 0.3036363124847412, "learning_rate": 0.0005573709959231217, "loss": 3.6428, "step": 12300 }, { "epoch": 3.5944340940847694, "grad_norm": 0.31025630235671997, "learning_rate": 0.0005571962725684332, "loss": 3.6406, "step": 12350 }, { "epoch": 3.6089892873777365, "grad_norm": 0.3138718008995056, "learning_rate": 0.0005570215492137449, "loss": 3.642, "step": 12400 }, { "epoch": 3.623544480670703, "grad_norm": 0.3533076047897339, "learning_rate": 0.0005568468258590564, "loss": 3.6381, "step": 12450 }, { "epoch": 3.63809967396367, "grad_norm": 0.3099108040332794, "learning_rate": 0.0005566721025043681, "loss": 3.6462, "step": 12500 }, { "epoch": 3.6526548672566372, "grad_norm": 0.3372749090194702, "learning_rate": 0.0005564973791496796, "loss": 3.6349, "step": 12550 }, { "epoch": 3.6672100605496043, "grad_norm": 0.30877193808555603, "learning_rate": 0.0005563226557949913, "loss": 3.6511, "step": 12600 }, { "epoch": 3.681765253842571, "grad_norm": 0.3230888247489929, "learning_rate": 0.0005561479324403028, "loss": 3.6516, "step": 12650 }, { "epoch": 3.696320447135538, "grad_norm": 0.33067062497138977, "learning_rate": 0.0005559732090856144, "loss": 3.6405, "step": 12700 }, { "epoch": 3.710875640428505, "grad_norm": 0.3125595152378082, "learning_rate": 0.000555798485730926, "loss": 3.6493, "step": 12750 }, { "epoch": 3.7254308337214717, "grad_norm": 0.31595587730407715, "learning_rate": 0.0005556237623762376, "loss": 3.64, "step": 12800 }, { "epoch": 3.7399860270144387, "grad_norm": 0.3145303726196289, "learning_rate": 0.0005554490390215492, "loss": 3.636, "step": 12850 }, { "epoch": 3.754541220307406, "grad_norm": 0.3151354193687439, "learning_rate": 0.0005552743156668607, "loss": 3.6295, "step": 12900 }, { "epoch": 3.769096413600373, "grad_norm": 0.31332525610923767, "learning_rate": 0.0005550995923121724, "loss": 3.6418, "step": 12950 }, { "epoch": 3.7836516068933395, "grad_norm": 0.30277061462402344, "learning_rate": 0.0005549248689574839, "loss": 3.6408, "step": 13000 }, { "epoch": 3.7836516068933395, "eval_accuracy": 0.3551880842355807, "eval_loss": 3.676245927810669, "eval_runtime": 180.4349, "eval_samples_per_second": 92.283, "eval_steps_per_second": 5.769, "step": 13000 }, { "epoch": 3.7982068001863065, "grad_norm": 0.31708842515945435, "learning_rate": 0.0005547501456027955, "loss": 3.6355, "step": 13050 }, { "epoch": 3.812761993479273, "grad_norm": 0.3286830484867096, "learning_rate": 0.0005545754222481071, "loss": 3.6204, "step": 13100 }, { "epoch": 3.82731718677224, "grad_norm": 0.30938348174095154, "learning_rate": 0.0005544006988934188, "loss": 3.6417, "step": 13150 }, { "epoch": 3.8418723800652073, "grad_norm": 0.33141428232192993, "learning_rate": 0.0005542259755387303, "loss": 3.6262, "step": 13200 }, { "epoch": 3.8564275733581743, "grad_norm": 0.318694144487381, "learning_rate": 0.0005540512521840418, "loss": 3.6301, "step": 13250 }, { "epoch": 3.8709827666511414, "grad_norm": 0.32242947816848755, "learning_rate": 0.0005538765288293535, "loss": 3.6432, "step": 13300 }, { "epoch": 3.885537959944108, "grad_norm": 0.32441020011901855, "learning_rate": 0.0005537018054746651, "loss": 3.6424, "step": 13350 }, { "epoch": 3.900093153237075, "grad_norm": 0.31077954173088074, "learning_rate": 0.0005535270821199767, "loss": 3.6446, "step": 13400 }, { "epoch": 3.9146483465300417, "grad_norm": 0.3134579062461853, "learning_rate": 0.0005533523587652882, "loss": 3.6484, "step": 13450 }, { "epoch": 3.9292035398230087, "grad_norm": 0.32088223099708557, "learning_rate": 0.0005531776354105999, "loss": 3.6319, "step": 13500 }, { "epoch": 3.943758733115976, "grad_norm": 0.3342723548412323, "learning_rate": 0.0005530029120559114, "loss": 3.634, "step": 13550 }, { "epoch": 3.958313926408943, "grad_norm": 0.30833426117897034, "learning_rate": 0.0005528281887012229, "loss": 3.6398, "step": 13600 }, { "epoch": 3.9728691197019095, "grad_norm": 0.3233463168144226, "learning_rate": 0.0005526534653465346, "loss": 3.6338, "step": 13650 }, { "epoch": 3.9874243129948765, "grad_norm": 0.31912580132484436, "learning_rate": 0.0005524787419918462, "loss": 3.6286, "step": 13700 }, { "epoch": 4.001746623195156, "grad_norm": 0.3431865870952606, "learning_rate": 0.0005523040186371578, "loss": 3.6134, "step": 13750 }, { "epoch": 4.016301816488123, "grad_norm": 0.3147588074207306, "learning_rate": 0.0005521292952824693, "loss": 3.5368, "step": 13800 }, { "epoch": 4.03085700978109, "grad_norm": 0.32361555099487305, "learning_rate": 0.000551954571927781, "loss": 3.5237, "step": 13850 }, { "epoch": 4.045412203074057, "grad_norm": 0.3246564567089081, "learning_rate": 0.0005517798485730926, "loss": 3.5313, "step": 13900 }, { "epoch": 4.059967396367024, "grad_norm": 0.3165980279445648, "learning_rate": 0.0005516051252184042, "loss": 3.5284, "step": 13950 }, { "epoch": 4.074522589659991, "grad_norm": 0.32714539766311646, "learning_rate": 0.0005514304018637157, "loss": 3.5411, "step": 14000 }, { "epoch": 4.074522589659991, "eval_accuracy": 0.3561706125088307, "eval_loss": 3.6695468425750732, "eval_runtime": 180.4431, "eval_samples_per_second": 92.278, "eval_steps_per_second": 5.769, "step": 14000 }, { "epoch": 4.089077782952957, "grad_norm": 0.3239183723926544, "learning_rate": 0.0005512556785090273, "loss": 3.5332, "step": 14050 }, { "epoch": 4.103632976245924, "grad_norm": 0.3330540060997009, "learning_rate": 0.0005510809551543389, "loss": 3.5454, "step": 14100 }, { "epoch": 4.118188169538891, "grad_norm": 0.3179191052913666, "learning_rate": 0.0005509062317996504, "loss": 3.5489, "step": 14150 }, { "epoch": 4.132743362831858, "grad_norm": 0.3329083025455475, "learning_rate": 0.0005507315084449621, "loss": 3.5371, "step": 14200 }, { "epoch": 4.1472985561248255, "grad_norm": 0.36331698298454285, "learning_rate": 0.0005505567850902737, "loss": 3.5434, "step": 14250 }, { "epoch": 4.1618537494177925, "grad_norm": 0.2996724545955658, "learning_rate": 0.0005503820617355853, "loss": 3.5422, "step": 14300 }, { "epoch": 4.17640894271076, "grad_norm": 0.3349573612213135, "learning_rate": 0.0005502073383808969, "loss": 3.5535, "step": 14350 }, { "epoch": 4.190964136003726, "grad_norm": 0.3202705979347229, "learning_rate": 0.0005500326150262085, "loss": 3.5518, "step": 14400 }, { "epoch": 4.205519329296693, "grad_norm": 0.3231564164161682, "learning_rate": 0.00054985789167152, "loss": 3.5563, "step": 14450 }, { "epoch": 4.22007452258966, "grad_norm": 0.32384511828422546, "learning_rate": 0.0005496831683168316, "loss": 3.5423, "step": 14500 }, { "epoch": 4.234629715882627, "grad_norm": 0.32724398374557495, "learning_rate": 0.0005495084449621433, "loss": 3.565, "step": 14550 }, { "epoch": 4.249184909175594, "grad_norm": 0.317710816860199, "learning_rate": 0.0005493337216074548, "loss": 3.5519, "step": 14600 }, { "epoch": 4.263740102468561, "grad_norm": 0.3252590298652649, "learning_rate": 0.0005491589982527664, "loss": 3.5683, "step": 14650 }, { "epoch": 4.278295295761528, "grad_norm": 0.3199692368507385, "learning_rate": 0.000548984274898078, "loss": 3.5647, "step": 14700 }, { "epoch": 4.292850489054494, "grad_norm": 0.33188480138778687, "learning_rate": 0.0005488095515433897, "loss": 3.5568, "step": 14750 }, { "epoch": 4.307405682347461, "grad_norm": 0.3168836236000061, "learning_rate": 0.0005486348281887012, "loss": 3.5729, "step": 14800 }, { "epoch": 4.321960875640428, "grad_norm": 0.3001454174518585, "learning_rate": 0.0005484601048340127, "loss": 3.5625, "step": 14850 }, { "epoch": 4.3365160689333955, "grad_norm": 0.33755332231521606, "learning_rate": 0.0005482853814793244, "loss": 3.5661, "step": 14900 }, { "epoch": 4.3510712622263625, "grad_norm": 0.3125065267086029, "learning_rate": 0.0005481106581246359, "loss": 3.5723, "step": 14950 }, { "epoch": 4.36562645551933, "grad_norm": 0.34576064348220825, "learning_rate": 0.0005479359347699475, "loss": 3.5657, "step": 15000 }, { "epoch": 4.36562645551933, "eval_accuracy": 0.35745600864812926, "eval_loss": 3.6553664207458496, "eval_runtime": 180.3556, "eval_samples_per_second": 92.323, "eval_steps_per_second": 5.772, "step": 15000 }, { "epoch": 4.380181648812297, "grad_norm": 0.3235915005207062, "learning_rate": 0.0005477612114152591, "loss": 3.5612, "step": 15050 }, { "epoch": 4.394736842105263, "grad_norm": 0.3036567270755768, "learning_rate": 0.0005475864880605708, "loss": 3.5696, "step": 15100 }, { "epoch": 4.40929203539823, "grad_norm": 0.33459851145744324, "learning_rate": 0.0005474117647058823, "loss": 3.5689, "step": 15150 }, { "epoch": 4.423847228691197, "grad_norm": 0.3202270269393921, "learning_rate": 0.0005472370413511939, "loss": 3.5813, "step": 15200 }, { "epoch": 4.438402421984164, "grad_norm": 0.3253897428512573, "learning_rate": 0.0005470623179965055, "loss": 3.5597, "step": 15250 }, { "epoch": 4.452957615277131, "grad_norm": 0.31558945775032043, "learning_rate": 0.0005468875946418171, "loss": 3.577, "step": 15300 }, { "epoch": 4.467512808570098, "grad_norm": 0.3243916928768158, "learning_rate": 0.0005467128712871287, "loss": 3.5743, "step": 15350 }, { "epoch": 4.482068001863064, "grad_norm": 0.3244709074497223, "learning_rate": 0.0005465381479324402, "loss": 3.5753, "step": 15400 }, { "epoch": 4.496623195156031, "grad_norm": 0.31137195229530334, "learning_rate": 0.0005463634245777519, "loss": 3.578, "step": 15450 }, { "epoch": 4.511178388448998, "grad_norm": 0.3101448118686676, "learning_rate": 0.0005461887012230634, "loss": 3.5803, "step": 15500 }, { "epoch": 4.5257335817419655, "grad_norm": 0.3212225139141083, "learning_rate": 0.000546013977868375, "loss": 3.5731, "step": 15550 }, { "epoch": 4.5402887750349326, "grad_norm": 0.316310852766037, "learning_rate": 0.0005458392545136866, "loss": 3.5775, "step": 15600 }, { "epoch": 4.5548439683279, "grad_norm": 0.3199855387210846, "learning_rate": 0.0005456645311589983, "loss": 3.5637, "step": 15650 }, { "epoch": 4.569399161620867, "grad_norm": 0.3195266127586365, "learning_rate": 0.0005454898078043098, "loss": 3.5693, "step": 15700 }, { "epoch": 4.583954354913834, "grad_norm": 0.30563077330589294, "learning_rate": 0.0005453150844496213, "loss": 3.5743, "step": 15750 }, { "epoch": 4.5985095482068, "grad_norm": 0.3304414451122284, "learning_rate": 0.000545140361094933, "loss": 3.5775, "step": 15800 }, { "epoch": 4.613064741499767, "grad_norm": 0.32884886860847473, "learning_rate": 0.0005449656377402445, "loss": 3.5672, "step": 15850 }, { "epoch": 4.627619934792734, "grad_norm": 0.3263738453388214, "learning_rate": 0.0005447909143855562, "loss": 3.5761, "step": 15900 }, { "epoch": 4.642175128085701, "grad_norm": 0.3209979832172394, "learning_rate": 0.0005446161910308677, "loss": 3.5728, "step": 15950 }, { "epoch": 4.656730321378668, "grad_norm": 0.306265264749527, "learning_rate": 0.0005444414676761794, "loss": 3.5775, "step": 16000 }, { "epoch": 4.656730321378668, "eval_accuracy": 0.35885141034529405, "eval_loss": 3.6422030925750732, "eval_runtime": 180.3496, "eval_samples_per_second": 92.326, "eval_steps_per_second": 5.772, "step": 16000 }, { "epoch": 4.671285514671635, "grad_norm": 0.3310377299785614, "learning_rate": 0.0005442667443214909, "loss": 3.5669, "step": 16050 }, { "epoch": 4.685840707964601, "grad_norm": 0.3084382116794586, "learning_rate": 0.0005440920209668024, "loss": 3.5817, "step": 16100 }, { "epoch": 4.7003959012575685, "grad_norm": 0.3179602026939392, "learning_rate": 0.0005439172976121141, "loss": 3.5694, "step": 16150 }, { "epoch": 4.7149510945505355, "grad_norm": 0.319417268037796, "learning_rate": 0.0005437425742574257, "loss": 3.5722, "step": 16200 }, { "epoch": 4.729506287843503, "grad_norm": 0.31063929200172424, "learning_rate": 0.0005435678509027373, "loss": 3.5842, "step": 16250 }, { "epoch": 4.74406148113647, "grad_norm": 0.30904245376586914, "learning_rate": 0.0005433931275480488, "loss": 3.5799, "step": 16300 }, { "epoch": 4.758616674429437, "grad_norm": 0.33482858538627625, "learning_rate": 0.0005432184041933605, "loss": 3.5862, "step": 16350 }, { "epoch": 4.773171867722404, "grad_norm": 0.31110939383506775, "learning_rate": 0.000543043680838672, "loss": 3.5885, "step": 16400 }, { "epoch": 4.78772706101537, "grad_norm": 0.3215453028678894, "learning_rate": 0.0005428689574839837, "loss": 3.5734, "step": 16450 }, { "epoch": 4.802282254308337, "grad_norm": 0.36134353280067444, "learning_rate": 0.0005426942341292952, "loss": 3.5659, "step": 16500 }, { "epoch": 4.816837447601304, "grad_norm": 0.32289305329322815, "learning_rate": 0.0005425195107746068, "loss": 3.5878, "step": 16550 }, { "epoch": 4.831392640894271, "grad_norm": 0.3091415762901306, "learning_rate": 0.0005423447874199184, "loss": 3.5763, "step": 16600 }, { "epoch": 4.845947834187238, "grad_norm": 0.340044766664505, "learning_rate": 0.00054217006406523, "loss": 3.5832, "step": 16650 }, { "epoch": 4.860503027480205, "grad_norm": 0.3226177990436554, "learning_rate": 0.0005419953407105417, "loss": 3.5799, "step": 16700 }, { "epoch": 4.875058220773171, "grad_norm": 0.32585617899894714, "learning_rate": 0.0005418206173558532, "loss": 3.5762, "step": 16750 }, { "epoch": 4.8896134140661385, "grad_norm": 0.325832337141037, "learning_rate": 0.0005416458940011648, "loss": 3.5801, "step": 16800 }, { "epoch": 4.9041686073591055, "grad_norm": 0.35441067814826965, "learning_rate": 0.0005414711706464764, "loss": 3.5698, "step": 16850 }, { "epoch": 4.918723800652073, "grad_norm": 0.3278867304325104, "learning_rate": 0.000541296447291788, "loss": 3.5767, "step": 16900 }, { "epoch": 4.93327899394504, "grad_norm": 0.31768500804901123, "learning_rate": 0.0005411217239370995, "loss": 3.5723, "step": 16950 }, { "epoch": 4.947834187238007, "grad_norm": 0.31338393688201904, "learning_rate": 0.0005409470005824111, "loss": 3.5844, "step": 17000 }, { "epoch": 4.947834187238007, "eval_accuracy": 0.36009849258302806, "eval_loss": 3.6307671070098877, "eval_runtime": 180.2793, "eval_samples_per_second": 92.362, "eval_steps_per_second": 5.774, "step": 17000 }, { "epoch": 4.962389380530974, "grad_norm": 0.331552654504776, "learning_rate": 0.0005407722772277228, "loss": 3.5585, "step": 17050 }, { "epoch": 4.976944573823941, "grad_norm": 0.31583139300346375, "learning_rate": 0.0005405975538730343, "loss": 3.5675, "step": 17100 }, { "epoch": 4.991499767116907, "grad_norm": 0.31500598788261414, "learning_rate": 0.0005404228305183459, "loss": 3.5705, "step": 17150 }, { "epoch": 5.005822077317187, "grad_norm": 0.3317112624645233, "learning_rate": 0.0005402481071636575, "loss": 3.523, "step": 17200 }, { "epoch": 5.020377270610154, "grad_norm": 0.3370459973812103, "learning_rate": 0.0005400733838089692, "loss": 3.4707, "step": 17250 }, { "epoch": 5.034932463903121, "grad_norm": 0.32439619302749634, "learning_rate": 0.0005398986604542807, "loss": 3.4637, "step": 17300 }, { "epoch": 5.049487657196088, "grad_norm": 0.323920339345932, "learning_rate": 0.0005397239370995922, "loss": 3.4721, "step": 17350 }, { "epoch": 5.064042850489055, "grad_norm": 0.30164316296577454, "learning_rate": 0.0005395492137449039, "loss": 3.478, "step": 17400 }, { "epoch": 5.078598043782021, "grad_norm": 0.3119945824146271, "learning_rate": 0.0005393744903902154, "loss": 3.4785, "step": 17450 }, { "epoch": 5.093153237074988, "grad_norm": 0.3116292357444763, "learning_rate": 0.000539199767035527, "loss": 3.4853, "step": 17500 }, { "epoch": 5.107708430367955, "grad_norm": 0.3407755494117737, "learning_rate": 0.0005390250436808386, "loss": 3.4863, "step": 17550 }, { "epoch": 5.122263623660922, "grad_norm": 0.3576432168483734, "learning_rate": 0.0005388503203261503, "loss": 3.4937, "step": 17600 }, { "epoch": 5.136818816953889, "grad_norm": 0.3124430477619171, "learning_rate": 0.0005386755969714618, "loss": 3.4888, "step": 17650 }, { "epoch": 5.151374010246856, "grad_norm": 0.34951767325401306, "learning_rate": 0.0005385008736167733, "loss": 3.4913, "step": 17700 }, { "epoch": 5.165929203539823, "grad_norm": 0.341585636138916, "learning_rate": 0.000538326150262085, "loss": 3.4913, "step": 17750 }, { "epoch": 5.18048439683279, "grad_norm": 0.31971949338912964, "learning_rate": 0.0005381514269073965, "loss": 3.5012, "step": 17800 }, { "epoch": 5.195039590125757, "grad_norm": 0.35585907101631165, "learning_rate": 0.0005379767035527082, "loss": 3.5005, "step": 17850 }, { "epoch": 5.209594783418724, "grad_norm": 0.3526330888271332, "learning_rate": 0.0005378019801980197, "loss": 3.5101, "step": 17900 }, { "epoch": 5.224149976711691, "grad_norm": 0.31869256496429443, "learning_rate": 0.0005376272568433314, "loss": 3.5044, "step": 17950 }, { "epoch": 5.238705170004658, "grad_norm": 0.357380211353302, "learning_rate": 0.0005374525334886429, "loss": 3.5074, "step": 18000 }, { "epoch": 5.238705170004658, "eval_accuracy": 0.36049714520298787, "eval_loss": 3.6327102184295654, "eval_runtime": 180.3891, "eval_samples_per_second": 92.306, "eval_steps_per_second": 5.771, "step": 18000 }, { "epoch": 5.253260363297625, "grad_norm": 0.3412487804889679, "learning_rate": 0.0005372778101339545, "loss": 3.502, "step": 18050 }, { "epoch": 5.267815556590591, "grad_norm": 0.36009863018989563, "learning_rate": 0.0005371030867792661, "loss": 3.5056, "step": 18100 }, { "epoch": 5.282370749883558, "grad_norm": 0.3359185457229614, "learning_rate": 0.0005369283634245778, "loss": 3.5111, "step": 18150 }, { "epoch": 5.296925943176525, "grad_norm": 0.33600130677223206, "learning_rate": 0.0005367536400698893, "loss": 3.5122, "step": 18200 }, { "epoch": 5.311481136469492, "grad_norm": 0.3453161120414734, "learning_rate": 0.0005365789167152008, "loss": 3.4998, "step": 18250 }, { "epoch": 5.326036329762459, "grad_norm": 0.33084920048713684, "learning_rate": 0.0005364041933605125, "loss": 3.5097, "step": 18300 }, { "epoch": 5.340591523055426, "grad_norm": 0.34047576785087585, "learning_rate": 0.000536229470005824, "loss": 3.5037, "step": 18350 }, { "epoch": 5.3551467163483935, "grad_norm": 0.32609522342681885, "learning_rate": 0.0005360547466511357, "loss": 3.5154, "step": 18400 }, { "epoch": 5.36970190964136, "grad_norm": 0.30840250849723816, "learning_rate": 0.0005358800232964472, "loss": 3.5122, "step": 18450 }, { "epoch": 5.384257102934327, "grad_norm": 0.3481442928314209, "learning_rate": 0.0005357052999417589, "loss": 3.5119, "step": 18500 }, { "epoch": 5.398812296227294, "grad_norm": 0.3191811442375183, "learning_rate": 0.0005355305765870704, "loss": 3.5168, "step": 18550 }, { "epoch": 5.413367489520261, "grad_norm": 0.3202814757823944, "learning_rate": 0.000535355853232382, "loss": 3.5088, "step": 18600 }, { "epoch": 5.427922682813228, "grad_norm": 0.3154858946800232, "learning_rate": 0.0005351811298776936, "loss": 3.5204, "step": 18650 }, { "epoch": 5.442477876106195, "grad_norm": 0.3650684356689453, "learning_rate": 0.0005350064065230052, "loss": 3.5166, "step": 18700 }, { "epoch": 5.457033069399162, "grad_norm": 0.33510497212409973, "learning_rate": 0.0005348316831683168, "loss": 3.5231, "step": 18750 }, { "epoch": 5.471588262692128, "grad_norm": 0.3288259208202362, "learning_rate": 0.0005346569598136284, "loss": 3.5132, "step": 18800 }, { "epoch": 5.486143455985095, "grad_norm": 0.31960999965667725, "learning_rate": 0.00053448223645894, "loss": 3.5132, "step": 18850 }, { "epoch": 5.500698649278062, "grad_norm": 0.33212122321128845, "learning_rate": 0.0005343075131042515, "loss": 3.5386, "step": 18900 }, { "epoch": 5.515253842571029, "grad_norm": 0.31529274582862854, "learning_rate": 0.0005341327897495632, "loss": 3.5126, "step": 18950 }, { "epoch": 5.529809035863996, "grad_norm": 0.3251727819442749, "learning_rate": 0.0005339580663948748, "loss": 3.5232, "step": 19000 }, { "epoch": 5.529809035863996, "eval_accuracy": 0.36127729145631726, "eval_loss": 3.6238110065460205, "eval_runtime": 180.228, "eval_samples_per_second": 92.389, "eval_steps_per_second": 5.776, "step": 19000 }, { "epoch": 5.5443642291569635, "grad_norm": 0.31291624903678894, "learning_rate": 0.0005337833430401863, "loss": 3.5295, "step": 19050 }, { "epoch": 5.5589194224499305, "grad_norm": 0.33338844776153564, "learning_rate": 0.0005336086196854979, "loss": 3.5196, "step": 19100 }, { "epoch": 5.573474615742897, "grad_norm": 0.33093225955963135, "learning_rate": 0.0005334338963308095, "loss": 3.5333, "step": 19150 }, { "epoch": 5.588029809035864, "grad_norm": 0.3218625783920288, "learning_rate": 0.0005332591729761211, "loss": 3.5127, "step": 19200 }, { "epoch": 5.602585002328831, "grad_norm": 0.3261992335319519, "learning_rate": 0.0005330844496214327, "loss": 3.5309, "step": 19250 }, { "epoch": 5.617140195621798, "grad_norm": 0.3156827390193939, "learning_rate": 0.0005329097262667443, "loss": 3.5271, "step": 19300 }, { "epoch": 5.631695388914765, "grad_norm": 0.3294409215450287, "learning_rate": 0.0005327350029120559, "loss": 3.5253, "step": 19350 }, { "epoch": 5.646250582207732, "grad_norm": 0.3337712287902832, "learning_rate": 0.0005325602795573674, "loss": 3.5221, "step": 19400 }, { "epoch": 5.660805775500698, "grad_norm": 0.34715068340301514, "learning_rate": 0.000532385556202679, "loss": 3.5261, "step": 19450 }, { "epoch": 5.675360968793665, "grad_norm": 0.3143634796142578, "learning_rate": 0.0005322108328479906, "loss": 3.5165, "step": 19500 }, { "epoch": 5.689916162086632, "grad_norm": 0.33024221658706665, "learning_rate": 0.0005320361094933023, "loss": 3.5265, "step": 19550 }, { "epoch": 5.704471355379599, "grad_norm": 0.31611496210098267, "learning_rate": 0.0005318613861386138, "loss": 3.5224, "step": 19600 }, { "epoch": 5.719026548672566, "grad_norm": 0.32733219861984253, "learning_rate": 0.0005316866627839254, "loss": 3.5163, "step": 19650 }, { "epoch": 5.7335817419655335, "grad_norm": 0.3083952069282532, "learning_rate": 0.000531511939429237, "loss": 3.5225, "step": 19700 }, { "epoch": 5.748136935258501, "grad_norm": 0.3310599625110626, "learning_rate": 0.0005313372160745486, "loss": 3.5351, "step": 19750 }, { "epoch": 5.762692128551468, "grad_norm": 0.3217622935771942, "learning_rate": 0.0005311624927198602, "loss": 3.5269, "step": 19800 }, { "epoch": 5.777247321844434, "grad_norm": 0.331962913274765, "learning_rate": 0.0005309877693651717, "loss": 3.5122, "step": 19850 }, { "epoch": 5.791802515137401, "grad_norm": 0.33045053482055664, "learning_rate": 0.0005308130460104834, "loss": 3.5242, "step": 19900 }, { "epoch": 5.806357708430368, "grad_norm": 0.33240386843681335, "learning_rate": 0.0005306383226557949, "loss": 3.5224, "step": 19950 }, { "epoch": 5.820912901723335, "grad_norm": 0.2992856800556183, "learning_rate": 0.0005304635993011065, "loss": 3.5269, "step": 20000 }, { "epoch": 5.820912901723335, "eval_accuracy": 0.3623494930635972, "eval_loss": 3.6123011112213135, "eval_runtime": 180.5083, "eval_samples_per_second": 92.245, "eval_steps_per_second": 5.767, "step": 20000 }, { "epoch": 5.835468095016302, "grad_norm": 0.32584938406944275, "learning_rate": 0.0005302888759464181, "loss": 3.528, "step": 20050 }, { "epoch": 5.850023288309269, "grad_norm": 0.3376038074493408, "learning_rate": 0.0005301141525917298, "loss": 3.5396, "step": 20100 }, { "epoch": 5.864578481602235, "grad_norm": 0.33971336483955383, "learning_rate": 0.0005299394292370413, "loss": 3.5467, "step": 20150 }, { "epoch": 5.879133674895202, "grad_norm": 0.3295598030090332, "learning_rate": 0.0005297647058823528, "loss": 3.5243, "step": 20200 }, { "epoch": 5.893688868188169, "grad_norm": 0.31646406650543213, "learning_rate": 0.0005295899825276645, "loss": 3.5256, "step": 20250 }, { "epoch": 5.9082440614811365, "grad_norm": 0.33667871356010437, "learning_rate": 0.000529415259172976, "loss": 3.5183, "step": 20300 }, { "epoch": 5.9227992547741035, "grad_norm": 0.3198454976081848, "learning_rate": 0.0005292405358182877, "loss": 3.5226, "step": 20350 }, { "epoch": 5.937354448067071, "grad_norm": 0.3440448045730591, "learning_rate": 0.0005290658124635992, "loss": 3.5356, "step": 20400 }, { "epoch": 5.951909641360038, "grad_norm": 0.3336467146873474, "learning_rate": 0.0005288910891089109, "loss": 3.5398, "step": 20450 }, { "epoch": 5.966464834653004, "grad_norm": 0.32705527544021606, "learning_rate": 0.0005287163657542224, "loss": 3.5317, "step": 20500 }, { "epoch": 5.981020027945971, "grad_norm": 0.32881981134414673, "learning_rate": 0.000528541642399534, "loss": 3.5201, "step": 20550 }, { "epoch": 5.995575221238938, "grad_norm": 0.31796741485595703, "learning_rate": 0.0005283669190448456, "loss": 3.534, "step": 20600 }, { "epoch": 6.009897531439218, "grad_norm": 0.3202293813228607, "learning_rate": 0.0005281921956901572, "loss": 3.4539, "step": 20650 }, { "epoch": 6.024452724732185, "grad_norm": 0.3355855345726013, "learning_rate": 0.0005280174723354688, "loss": 3.4002, "step": 20700 }, { "epoch": 6.039007918025152, "grad_norm": 0.3445442318916321, "learning_rate": 0.0005278427489807804, "loss": 3.4296, "step": 20750 }, { "epoch": 6.053563111318118, "grad_norm": 0.33696219325065613, "learning_rate": 0.000527668025626092, "loss": 3.4346, "step": 20800 }, { "epoch": 6.068118304611085, "grad_norm": 0.35377752780914307, "learning_rate": 0.0005274933022714035, "loss": 3.4253, "step": 20850 }, { "epoch": 6.082673497904052, "grad_norm": 0.3427239656448364, "learning_rate": 0.0005273185789167152, "loss": 3.4389, "step": 20900 }, { "epoch": 6.097228691197019, "grad_norm": 0.3238384425640106, "learning_rate": 0.0005271438555620268, "loss": 3.4398, "step": 20950 }, { "epoch": 6.111783884489986, "grad_norm": 0.3189035654067993, "learning_rate": 0.0005269691322073384, "loss": 3.45, "step": 21000 }, { "epoch": 6.111783884489986, "eval_accuracy": 0.362332921713534, "eval_loss": 3.617029905319214, "eval_runtime": 180.4177, "eval_samples_per_second": 92.291, "eval_steps_per_second": 5.77, "step": 21000 }, { "epoch": 6.126339077782953, "grad_norm": 0.33343222737312317, "learning_rate": 0.0005267944088526499, "loss": 3.4394, "step": 21050 }, { "epoch": 6.14089427107592, "grad_norm": 0.31434309482574463, "learning_rate": 0.0005266196854979615, "loss": 3.4424, "step": 21100 }, { "epoch": 6.155449464368886, "grad_norm": 0.31062355637550354, "learning_rate": 0.0005264449621432731, "loss": 3.4451, "step": 21150 }, { "epoch": 6.1700046576618535, "grad_norm": 0.33464014530181885, "learning_rate": 0.0005262702387885847, "loss": 3.4454, "step": 21200 }, { "epoch": 6.1845598509548205, "grad_norm": 0.33242765069007874, "learning_rate": 0.0005260955154338963, "loss": 3.4547, "step": 21250 }, { "epoch": 6.199115044247788, "grad_norm": 0.3487234115600586, "learning_rate": 0.0005259207920792079, "loss": 3.4563, "step": 21300 }, { "epoch": 6.213670237540755, "grad_norm": 0.32406190037727356, "learning_rate": 0.0005257460687245195, "loss": 3.4619, "step": 21350 }, { "epoch": 6.228225430833722, "grad_norm": 0.3314816951751709, "learning_rate": 0.000525571345369831, "loss": 3.4534, "step": 21400 }, { "epoch": 6.242780624126689, "grad_norm": 0.34296348690986633, "learning_rate": 0.0005253966220151426, "loss": 3.4597, "step": 21450 }, { "epoch": 6.257335817419655, "grad_norm": 0.3384166359901428, "learning_rate": 0.0005252218986604543, "loss": 3.4594, "step": 21500 }, { "epoch": 6.271891010712622, "grad_norm": 0.34624239802360535, "learning_rate": 0.0005250471753057658, "loss": 3.4748, "step": 21550 }, { "epoch": 6.286446204005589, "grad_norm": 0.3315073847770691, "learning_rate": 0.0005248724519510774, "loss": 3.4704, "step": 21600 }, { "epoch": 6.301001397298556, "grad_norm": 0.3236428499221802, "learning_rate": 0.000524697728596389, "loss": 3.4697, "step": 21650 }, { "epoch": 6.315556590591523, "grad_norm": 0.3306480944156647, "learning_rate": 0.0005245230052417006, "loss": 3.4745, "step": 21700 }, { "epoch": 6.33011178388449, "grad_norm": 0.3375949263572693, "learning_rate": 0.0005243482818870122, "loss": 3.4609, "step": 21750 }, { "epoch": 6.344666977177457, "grad_norm": 0.33086276054382324, "learning_rate": 0.0005241735585323238, "loss": 3.4718, "step": 21800 }, { "epoch": 6.3592221704704235, "grad_norm": 0.32785555720329285, "learning_rate": 0.0005239988351776354, "loss": 3.4648, "step": 21850 }, { "epoch": 6.3737773637633905, "grad_norm": 0.31982743740081787, "learning_rate": 0.0005238241118229469, "loss": 3.4771, "step": 21900 }, { "epoch": 6.388332557056358, "grad_norm": 0.3218741714954376, "learning_rate": 0.0005236493884682585, "loss": 3.466, "step": 21950 }, { "epoch": 6.402887750349325, "grad_norm": 0.35872524976730347, "learning_rate": 0.0005234746651135701, "loss": 3.4654, "step": 22000 }, { "epoch": 6.402887750349325, "eval_accuracy": 0.3633112190037892, "eval_loss": 3.608182668685913, "eval_runtime": 180.1839, "eval_samples_per_second": 92.411, "eval_steps_per_second": 5.777, "step": 22000 }, { "epoch": 6.417442943642292, "grad_norm": 0.344078004360199, "learning_rate": 0.0005232999417588818, "loss": 3.4703, "step": 22050 }, { "epoch": 6.431998136935259, "grad_norm": 0.33091557025909424, "learning_rate": 0.0005231252184041933, "loss": 3.4684, "step": 22100 }, { "epoch": 6.446553330228225, "grad_norm": 0.32295098900794983, "learning_rate": 0.0005229504950495049, "loss": 3.4793, "step": 22150 }, { "epoch": 6.461108523521192, "grad_norm": 0.321740061044693, "learning_rate": 0.0005227757716948165, "loss": 3.4846, "step": 22200 }, { "epoch": 6.475663716814159, "grad_norm": 0.3518405556678772, "learning_rate": 0.000522601048340128, "loss": 3.4745, "step": 22250 }, { "epoch": 6.490218910107126, "grad_norm": 0.32650724053382874, "learning_rate": 0.0005224263249854397, "loss": 3.4974, "step": 22300 }, { "epoch": 6.504774103400093, "grad_norm": 0.33031734824180603, "learning_rate": 0.0005222516016307512, "loss": 3.4762, "step": 22350 }, { "epoch": 6.51932929669306, "grad_norm": 0.3188895583152771, "learning_rate": 0.0005220768782760629, "loss": 3.47, "step": 22400 }, { "epoch": 6.533884489986027, "grad_norm": 0.30965742468833923, "learning_rate": 0.0005219021549213744, "loss": 3.4835, "step": 22450 }, { "epoch": 6.548439683278994, "grad_norm": 0.33755552768707275, "learning_rate": 0.000521727431566686, "loss": 3.4817, "step": 22500 }, { "epoch": 6.562994876571961, "grad_norm": 0.3175261616706848, "learning_rate": 0.0005215527082119976, "loss": 3.502, "step": 22550 }, { "epoch": 6.577550069864928, "grad_norm": 0.35669413208961487, "learning_rate": 0.0005213779848573093, "loss": 3.4824, "step": 22600 }, { "epoch": 6.592105263157895, "grad_norm": 0.3446201682090759, "learning_rate": 0.0005212032615026208, "loss": 3.4867, "step": 22650 }, { "epoch": 6.606660456450862, "grad_norm": 0.3323017358779907, "learning_rate": 0.0005210285381479323, "loss": 3.4752, "step": 22700 }, { "epoch": 6.621215649743829, "grad_norm": 0.3301149904727936, "learning_rate": 0.000520853814793244, "loss": 3.4818, "step": 22750 }, { "epoch": 6.635770843036796, "grad_norm": 0.33560121059417725, "learning_rate": 0.0005206790914385555, "loss": 3.4807, "step": 22800 }, { "epoch": 6.650326036329762, "grad_norm": 0.3351544439792633, "learning_rate": 0.0005205043680838672, "loss": 3.5003, "step": 22850 }, { "epoch": 6.664881229622729, "grad_norm": 0.30252987146377563, "learning_rate": 0.0005203296447291787, "loss": 3.4812, "step": 22900 }, { "epoch": 6.679436422915696, "grad_norm": 0.3330850899219513, "learning_rate": 0.0005201549213744904, "loss": 3.4726, "step": 22950 }, { "epoch": 6.693991616208663, "grad_norm": 0.3413783609867096, "learning_rate": 0.0005199801980198019, "loss": 3.4808, "step": 23000 }, { "epoch": 6.693991616208663, "eval_accuracy": 0.36418997066635983, "eval_loss": 3.59979510307312, "eval_runtime": 180.1834, "eval_samples_per_second": 92.411, "eval_steps_per_second": 5.777, "step": 23000 }, { "epoch": 6.70854680950163, "grad_norm": 0.32982543110847473, "learning_rate": 0.0005198054746651136, "loss": 3.5009, "step": 23050 }, { "epoch": 6.723102002794597, "grad_norm": 0.3471560776233673, "learning_rate": 0.0005196307513104251, "loss": 3.4778, "step": 23100 }, { "epoch": 6.737657196087564, "grad_norm": 0.332000732421875, "learning_rate": 0.0005194560279557367, "loss": 3.4882, "step": 23150 }, { "epoch": 6.752212389380531, "grad_norm": 0.3488910496234894, "learning_rate": 0.0005192813046010483, "loss": 3.4874, "step": 23200 }, { "epoch": 6.766767582673498, "grad_norm": 0.3276961147785187, "learning_rate": 0.0005191065812463599, "loss": 3.4983, "step": 23250 }, { "epoch": 6.781322775966465, "grad_norm": 0.3638952970504761, "learning_rate": 0.0005189318578916715, "loss": 3.4918, "step": 23300 }, { "epoch": 6.795877969259432, "grad_norm": 0.3009364902973175, "learning_rate": 0.000518757134536983, "loss": 3.4812, "step": 23350 }, { "epoch": 6.810433162552399, "grad_norm": 0.31631243228912354, "learning_rate": 0.0005185824111822947, "loss": 3.495, "step": 23400 }, { "epoch": 6.824988355845366, "grad_norm": 0.32096341252326965, "learning_rate": 0.0005184076878276063, "loss": 3.4888, "step": 23450 }, { "epoch": 6.839543549138332, "grad_norm": 0.2987549901008606, "learning_rate": 0.0005182329644729179, "loss": 3.4863, "step": 23500 }, { "epoch": 6.854098742431299, "grad_norm": 0.3552905321121216, "learning_rate": 0.0005180582411182294, "loss": 3.4917, "step": 23550 }, { "epoch": 6.868653935724266, "grad_norm": 0.35249367356300354, "learning_rate": 0.000517883517763541, "loss": 3.4976, "step": 23600 }, { "epoch": 6.883209129017233, "grad_norm": 0.33594492077827454, "learning_rate": 0.0005177087944088526, "loss": 3.4905, "step": 23650 }, { "epoch": 6.8977643223102, "grad_norm": 0.3413904309272766, "learning_rate": 0.0005175340710541642, "loss": 3.4877, "step": 23700 }, { "epoch": 6.912319515603167, "grad_norm": 0.32692745327949524, "learning_rate": 0.0005173593476994758, "loss": 3.4905, "step": 23750 }, { "epoch": 6.926874708896134, "grad_norm": 0.3206517696380615, "learning_rate": 0.0005171846243447874, "loss": 3.4844, "step": 23800 }, { "epoch": 6.9414299021891015, "grad_norm": 0.3234865665435791, "learning_rate": 0.000517009900990099, "loss": 3.4952, "step": 23850 }, { "epoch": 6.955985095482068, "grad_norm": 0.33857715129852295, "learning_rate": 0.0005168351776354105, "loss": 3.4871, "step": 23900 }, { "epoch": 6.970540288775035, "grad_norm": 0.31941258907318115, "learning_rate": 0.0005166604542807221, "loss": 3.5024, "step": 23950 }, { "epoch": 6.985095482068002, "grad_norm": 0.3046661913394928, "learning_rate": 0.0005164857309260338, "loss": 3.494, "step": 24000 }, { "epoch": 6.985095482068002, "eval_accuracy": 0.36480546116480606, "eval_loss": 3.5888559818267822, "eval_runtime": 180.4882, "eval_samples_per_second": 92.255, "eval_steps_per_second": 5.768, "step": 24000 }, { "epoch": 6.999650675360969, "grad_norm": 0.33471864461898804, "learning_rate": 0.0005163110075713453, "loss": 3.4811, "step": 24050 }, { "epoch": 7.0139729855612485, "grad_norm": 0.32821211218833923, "learning_rate": 0.0005161362842166569, "loss": 3.3907, "step": 24100 }, { "epoch": 7.0285281788542155, "grad_norm": 0.3387831747531891, "learning_rate": 0.0005159615608619685, "loss": 3.3759, "step": 24150 }, { "epoch": 7.043083372147182, "grad_norm": 0.3345881700515747, "learning_rate": 0.0005157868375072801, "loss": 3.3712, "step": 24200 }, { "epoch": 7.057638565440149, "grad_norm": 0.3260855972766876, "learning_rate": 0.0005156121141525917, "loss": 3.3896, "step": 24250 }, { "epoch": 7.072193758733116, "grad_norm": 0.3474087417125702, "learning_rate": 0.0005154373907979033, "loss": 3.3901, "step": 24300 }, { "epoch": 7.086748952026083, "grad_norm": 0.3276037573814392, "learning_rate": 0.0005152626674432149, "loss": 3.3977, "step": 24350 }, { "epoch": 7.10130414531905, "grad_norm": 0.30755236744880676, "learning_rate": 0.0005150879440885264, "loss": 3.396, "step": 24400 }, { "epoch": 7.115859338612017, "grad_norm": 0.35878077149391174, "learning_rate": 0.000514913220733838, "loss": 3.4045, "step": 24450 }, { "epoch": 7.130414531904984, "grad_norm": 0.32121166586875916, "learning_rate": 0.0005147384973791496, "loss": 3.4154, "step": 24500 }, { "epoch": 7.14496972519795, "grad_norm": 0.34105220437049866, "learning_rate": 0.0005145637740244613, "loss": 3.4097, "step": 24550 }, { "epoch": 7.159524918490917, "grad_norm": 0.3617614805698395, "learning_rate": 0.0005143890506697728, "loss": 3.4045, "step": 24600 }, { "epoch": 7.174080111783884, "grad_norm": 0.33047446608543396, "learning_rate": 0.0005142143273150844, "loss": 3.4171, "step": 24650 }, { "epoch": 7.1886353050768514, "grad_norm": 0.3147647976875305, "learning_rate": 0.000514039603960396, "loss": 3.4151, "step": 24700 }, { "epoch": 7.2031904983698185, "grad_norm": 0.336982399225235, "learning_rate": 0.0005138648806057075, "loss": 3.4105, "step": 24750 }, { "epoch": 7.217745691662786, "grad_norm": 0.33287307620048523, "learning_rate": 0.0005136901572510192, "loss": 3.4279, "step": 24800 }, { "epoch": 7.232300884955753, "grad_norm": 0.37929704785346985, "learning_rate": 0.0005135154338963307, "loss": 3.4129, "step": 24850 }, { "epoch": 7.246856078248719, "grad_norm": 0.3289410173892975, "learning_rate": 0.0005133407105416424, "loss": 3.4252, "step": 24900 }, { "epoch": 7.261411271541686, "grad_norm": 0.3278612494468689, "learning_rate": 0.0005131659871869539, "loss": 3.4324, "step": 24950 }, { "epoch": 7.275966464834653, "grad_norm": 0.3302483558654785, "learning_rate": 0.0005129912638322656, "loss": 3.4155, "step": 25000 }, { "epoch": 7.275966464834653, "eval_accuracy": 0.36486269696254203, "eval_loss": 3.599623441696167, "eval_runtime": 180.4651, "eval_samples_per_second": 92.267, "eval_steps_per_second": 5.768, "step": 25000 }, { "epoch": 7.29052165812762, "grad_norm": 0.33454978466033936, "learning_rate": 0.0005128165404775771, "loss": 3.4315, "step": 25050 }, { "epoch": 7.305076851420587, "grad_norm": 0.3311181664466858, "learning_rate": 0.0005126418171228888, "loss": 3.4198, "step": 25100 }, { "epoch": 7.319632044713554, "grad_norm": 0.3281189799308777, "learning_rate": 0.0005124670937682003, "loss": 3.4367, "step": 25150 }, { "epoch": 7.334187238006521, "grad_norm": 0.3463766276836395, "learning_rate": 0.000512292370413512, "loss": 3.4301, "step": 25200 }, { "epoch": 7.348742431299487, "grad_norm": 0.3153272867202759, "learning_rate": 0.0005121176470588235, "loss": 3.4252, "step": 25250 }, { "epoch": 7.363297624592454, "grad_norm": 0.3330363631248474, "learning_rate": 0.000511942923704135, "loss": 3.4467, "step": 25300 }, { "epoch": 7.3778528178854215, "grad_norm": 0.341755211353302, "learning_rate": 0.0005117682003494467, "loss": 3.4415, "step": 25350 }, { "epoch": 7.3924080111783885, "grad_norm": 0.34595128893852234, "learning_rate": 0.0005115934769947583, "loss": 3.4426, "step": 25400 }, { "epoch": 7.406963204471356, "grad_norm": 0.31683796644210815, "learning_rate": 0.0005114187536400699, "loss": 3.4441, "step": 25450 }, { "epoch": 7.421518397764323, "grad_norm": 0.3140467703342438, "learning_rate": 0.0005112440302853814, "loss": 3.4488, "step": 25500 }, { "epoch": 7.436073591057289, "grad_norm": 0.33137738704681396, "learning_rate": 0.0005110693069306931, "loss": 3.4327, "step": 25550 }, { "epoch": 7.450628784350256, "grad_norm": 0.3456050157546997, "learning_rate": 0.0005108945835760046, "loss": 3.4322, "step": 25600 }, { "epoch": 7.465183977643223, "grad_norm": 0.3371207118034363, "learning_rate": 0.0005107198602213162, "loss": 3.4505, "step": 25650 }, { "epoch": 7.47973917093619, "grad_norm": 0.3465997874736786, "learning_rate": 0.0005105451368666278, "loss": 3.4465, "step": 25700 }, { "epoch": 7.494294364229157, "grad_norm": 0.33178913593292236, "learning_rate": 0.0005103704135119394, "loss": 3.443, "step": 25750 }, { "epoch": 7.508849557522124, "grad_norm": 0.32614588737487793, "learning_rate": 0.000510195690157251, "loss": 3.4541, "step": 25800 }, { "epoch": 7.523404750815091, "grad_norm": 0.336892694234848, "learning_rate": 0.0005100209668025625, "loss": 3.4503, "step": 25850 }, { "epoch": 7.537959944108058, "grad_norm": 0.3476903736591339, "learning_rate": 0.0005098462434478742, "loss": 3.4423, "step": 25900 }, { "epoch": 7.552515137401024, "grad_norm": 0.3244699239730835, "learning_rate": 0.0005096715200931858, "loss": 3.445, "step": 25950 }, { "epoch": 7.5670703306939915, "grad_norm": 0.3290373384952545, "learning_rate": 0.0005094967967384974, "loss": 3.4493, "step": 26000 }, { "epoch": 7.5670703306939915, "eval_accuracy": 0.3653377423310201, "eval_loss": 3.5898900032043457, "eval_runtime": 180.4047, "eval_samples_per_second": 92.298, "eval_steps_per_second": 5.77, "step": 26000 }, { "epoch": 7.5816255239869585, "grad_norm": 0.3152379095554352, "learning_rate": 0.0005093220733838089, "loss": 3.447, "step": 26050 }, { "epoch": 7.596180717279926, "grad_norm": 0.3170357048511505, "learning_rate": 0.0005091473500291205, "loss": 3.445, "step": 26100 }, { "epoch": 7.610735910572893, "grad_norm": 0.3438595235347748, "learning_rate": 0.0005089726266744321, "loss": 3.4543, "step": 26150 }, { "epoch": 7.625291103865859, "grad_norm": 0.33858057856559753, "learning_rate": 0.0005087979033197437, "loss": 3.4545, "step": 26200 }, { "epoch": 7.639846297158826, "grad_norm": 0.3303052484989166, "learning_rate": 0.0005086231799650553, "loss": 3.4471, "step": 26250 }, { "epoch": 7.654401490451793, "grad_norm": 0.3555377125740051, "learning_rate": 0.0005084484566103669, "loss": 3.4571, "step": 26300 }, { "epoch": 7.66895668374476, "grad_norm": 0.3649464249610901, "learning_rate": 0.0005082737332556785, "loss": 3.4648, "step": 26350 }, { "epoch": 7.683511877037727, "grad_norm": 0.32981374859809875, "learning_rate": 0.00050809900990099, "loss": 3.4557, "step": 26400 }, { "epoch": 7.698067070330694, "grad_norm": 0.33963456749916077, "learning_rate": 0.0005079242865463016, "loss": 3.4524, "step": 26450 }, { "epoch": 7.712622263623661, "grad_norm": 0.31539008021354675, "learning_rate": 0.0005077495631916133, "loss": 3.4554, "step": 26500 }, { "epoch": 7.727177456916628, "grad_norm": 0.34639355540275574, "learning_rate": 0.0005075748398369248, "loss": 3.4679, "step": 26550 }, { "epoch": 7.7417326502095944, "grad_norm": 0.3239724934101105, "learning_rate": 0.0005074001164822364, "loss": 3.4472, "step": 26600 }, { "epoch": 7.7562878435025615, "grad_norm": 0.34899502992630005, "learning_rate": 0.000507225393127548, "loss": 3.4625, "step": 26650 }, { "epoch": 7.770843036795529, "grad_norm": 0.34454357624053955, "learning_rate": 0.0005070506697728596, "loss": 3.4551, "step": 26700 }, { "epoch": 7.785398230088496, "grad_norm": 0.3207840919494629, "learning_rate": 0.0005068759464181711, "loss": 3.4565, "step": 26750 }, { "epoch": 7.799953423381463, "grad_norm": 0.3321252465248108, "learning_rate": 0.0005067012230634828, "loss": 3.4498, "step": 26800 }, { "epoch": 7.81450861667443, "grad_norm": 0.3324463665485382, "learning_rate": 0.0005065264997087944, "loss": 3.4605, "step": 26850 }, { "epoch": 7.829063809967396, "grad_norm": 0.3028212785720825, "learning_rate": 0.0005063517763541059, "loss": 3.4556, "step": 26900 }, { "epoch": 7.843619003260363, "grad_norm": 0.36324483156204224, "learning_rate": 0.0005061770529994175, "loss": 3.4536, "step": 26950 }, { "epoch": 7.85817419655333, "grad_norm": 0.312711238861084, "learning_rate": 0.0005060023296447291, "loss": 3.463, "step": 27000 }, { "epoch": 7.85817419655333, "eval_accuracy": 0.36624293763730864, "eval_loss": 3.580157518386841, "eval_runtime": 180.254, "eval_samples_per_second": 92.375, "eval_steps_per_second": 5.775, "step": 27000 }, { "epoch": 7.872729389846297, "grad_norm": 0.33005058765411377, "learning_rate": 0.0005058276062900408, "loss": 3.4636, "step": 27050 }, { "epoch": 7.887284583139264, "grad_norm": 0.34216201305389404, "learning_rate": 0.0005056528829353523, "loss": 3.4698, "step": 27100 }, { "epoch": 7.901839776432231, "grad_norm": 0.3425782024860382, "learning_rate": 0.000505478159580664, "loss": 3.4616, "step": 27150 }, { "epoch": 7.916394969725198, "grad_norm": 0.3791617155075073, "learning_rate": 0.0005053034362259755, "loss": 3.4571, "step": 27200 }, { "epoch": 7.930950163018165, "grad_norm": 0.32575327157974243, "learning_rate": 0.000505128712871287, "loss": 3.4691, "step": 27250 }, { "epoch": 7.9455053563111315, "grad_norm": 0.3228894770145416, "learning_rate": 0.0005049539895165987, "loss": 3.4653, "step": 27300 }, { "epoch": 7.960060549604099, "grad_norm": 0.35627007484436035, "learning_rate": 0.0005047792661619103, "loss": 3.4473, "step": 27350 }, { "epoch": 7.974615742897066, "grad_norm": 0.3586435616016388, "learning_rate": 0.0005046045428072219, "loss": 3.4565, "step": 27400 }, { "epoch": 7.989170936190033, "grad_norm": 0.3928644061088562, "learning_rate": 0.0005044298194525334, "loss": 3.4705, "step": 27450 }, { "epoch": 8.003493246390311, "grad_norm": 0.3316367268562317, "learning_rate": 0.0005042550960978451, "loss": 3.4414, "step": 27500 }, { "epoch": 8.018048439683279, "grad_norm": 0.35785290598869324, "learning_rate": 0.0005040803727431566, "loss": 3.3468, "step": 27550 }, { "epoch": 8.032603632976246, "grad_norm": 0.3238537013530731, "learning_rate": 0.0005039056493884683, "loss": 3.3564, "step": 27600 }, { "epoch": 8.047158826269213, "grad_norm": 0.33659982681274414, "learning_rate": 0.0005037309260337798, "loss": 3.3583, "step": 27650 }, { "epoch": 8.06171401956218, "grad_norm": 0.355198472738266, "learning_rate": 0.0005035562026790914, "loss": 3.3647, "step": 27700 }, { "epoch": 8.076269212855147, "grad_norm": 0.3489474058151245, "learning_rate": 0.000503381479324403, "loss": 3.3706, "step": 27750 }, { "epoch": 8.090824406148114, "grad_norm": 0.3374377489089966, "learning_rate": 0.0005032067559697145, "loss": 3.3604, "step": 27800 }, { "epoch": 8.10537959944108, "grad_norm": 0.3467634618282318, "learning_rate": 0.0005030320326150262, "loss": 3.3714, "step": 27850 }, { "epoch": 8.119934792734048, "grad_norm": 0.347935289144516, "learning_rate": 0.0005028573092603378, "loss": 3.385, "step": 27900 }, { "epoch": 8.134489986027015, "grad_norm": 0.35712653398513794, "learning_rate": 0.0005026825859056494, "loss": 3.3626, "step": 27950 }, { "epoch": 8.149045179319982, "grad_norm": 0.3268442153930664, "learning_rate": 0.0005025078625509609, "loss": 3.3767, "step": 28000 }, { "epoch": 8.149045179319982, "eval_accuracy": 0.3661386909174076, "eval_loss": 3.5874390602111816, "eval_runtime": 180.28, "eval_samples_per_second": 92.362, "eval_steps_per_second": 5.774, "step": 28000 }, { "epoch": 8.16360037261295, "grad_norm": 0.33844125270843506, "learning_rate": 0.0005023331391962726, "loss": 3.3813, "step": 28050 }, { "epoch": 8.178155565905914, "grad_norm": 0.34955015778541565, "learning_rate": 0.0005021584158415841, "loss": 3.3814, "step": 28100 }, { "epoch": 8.192710759198881, "grad_norm": 0.3521413207054138, "learning_rate": 0.0005019836924868956, "loss": 3.3952, "step": 28150 }, { "epoch": 8.207265952491849, "grad_norm": 0.3510971665382385, "learning_rate": 0.0005018089691322073, "loss": 3.3921, "step": 28200 }, { "epoch": 8.221821145784816, "grad_norm": 0.33943721652030945, "learning_rate": 0.0005016342457775189, "loss": 3.3908, "step": 28250 }, { "epoch": 8.236376339077783, "grad_norm": 0.3289835453033447, "learning_rate": 0.0005014595224228305, "loss": 3.3899, "step": 28300 }, { "epoch": 8.25093153237075, "grad_norm": 0.3255699574947357, "learning_rate": 0.000501284799068142, "loss": 3.4025, "step": 28350 }, { "epoch": 8.265486725663717, "grad_norm": 0.33189740777015686, "learning_rate": 0.0005011100757134537, "loss": 3.4023, "step": 28400 }, { "epoch": 8.280041918956684, "grad_norm": 0.34844663739204407, "learning_rate": 0.0005009353523587653, "loss": 3.3986, "step": 28450 }, { "epoch": 8.294597112249651, "grad_norm": 0.3706410229206085, "learning_rate": 0.0005007606290040768, "loss": 3.3826, "step": 28500 }, { "epoch": 8.309152305542618, "grad_norm": 0.3392998278141022, "learning_rate": 0.0005005859056493884, "loss": 3.4084, "step": 28550 }, { "epoch": 8.323707498835585, "grad_norm": 0.33849596977233887, "learning_rate": 0.0005004111822947, "loss": 3.4007, "step": 28600 }, { "epoch": 8.338262692128552, "grad_norm": 0.3535289764404297, "learning_rate": 0.0005002364589400116, "loss": 3.3973, "step": 28650 }, { "epoch": 8.35281788542152, "grad_norm": 0.32421401143074036, "learning_rate": 0.0005000617355853231, "loss": 3.3954, "step": 28700 }, { "epoch": 8.367373078714486, "grad_norm": 0.3335261940956116, "learning_rate": 0.0004998870122306348, "loss": 3.4094, "step": 28750 }, { "epoch": 8.381928272007451, "grad_norm": 0.3622640371322632, "learning_rate": 0.0004997122888759464, "loss": 3.4038, "step": 28800 }, { "epoch": 8.396483465300419, "grad_norm": 0.33357468247413635, "learning_rate": 0.000499537565521258, "loss": 3.4061, "step": 28850 }, { "epoch": 8.411038658593386, "grad_norm": 0.3338543474674225, "learning_rate": 0.0004993628421665695, "loss": 3.4013, "step": 28900 }, { "epoch": 8.425593851886353, "grad_norm": 0.3319488763809204, "learning_rate": 0.0004991881188118811, "loss": 3.4112, "step": 28950 }, { "epoch": 8.44014904517932, "grad_norm": 0.33868297934532166, "learning_rate": 0.0004990133954571928, "loss": 3.4147, "step": 29000 }, { "epoch": 8.44014904517932, "eval_accuracy": 0.3669886483901521, "eval_loss": 3.580271005630493, "eval_runtime": 180.2731, "eval_samples_per_second": 92.365, "eval_steps_per_second": 5.775, "step": 29000 }, { "epoch": 8.454704238472287, "grad_norm": 0.33619558811187744, "learning_rate": 0.0004988386721025043, "loss": 3.4144, "step": 29050 }, { "epoch": 8.469259431765254, "grad_norm": 0.3467794358730316, "learning_rate": 0.0004986639487478159, "loss": 3.4239, "step": 29100 }, { "epoch": 8.483814625058221, "grad_norm": 0.3543495237827301, "learning_rate": 0.0004984892253931275, "loss": 3.4063, "step": 29150 }, { "epoch": 8.498369818351188, "grad_norm": 0.33876168727874756, "learning_rate": 0.0004983145020384391, "loss": 3.4193, "step": 29200 }, { "epoch": 8.512925011644155, "grad_norm": 0.37324759364128113, "learning_rate": 0.0004981397786837507, "loss": 3.4214, "step": 29250 }, { "epoch": 8.527480204937122, "grad_norm": 0.3415110111236572, "learning_rate": 0.0004979650553290622, "loss": 3.4177, "step": 29300 }, { "epoch": 8.54203539823009, "grad_norm": 0.34661757946014404, "learning_rate": 0.0004977903319743739, "loss": 3.4233, "step": 29350 }, { "epoch": 8.556590591523056, "grad_norm": 0.3273857831954956, "learning_rate": 0.0004976156086196854, "loss": 3.4104, "step": 29400 }, { "epoch": 8.571145784816022, "grad_norm": 0.33362478017807007, "learning_rate": 0.0004974408852649971, "loss": 3.4148, "step": 29450 }, { "epoch": 8.585700978108989, "grad_norm": 0.35230565071105957, "learning_rate": 0.0004972661619103086, "loss": 3.4156, "step": 29500 }, { "epoch": 8.600256171401956, "grad_norm": 0.34774646162986755, "learning_rate": 0.0004970914385556202, "loss": 3.4192, "step": 29550 }, { "epoch": 8.614811364694923, "grad_norm": 0.34162718057632446, "learning_rate": 0.0004969167152009318, "loss": 3.4324, "step": 29600 }, { "epoch": 8.62936655798789, "grad_norm": 0.3288877308368683, "learning_rate": 0.0004967419918462435, "loss": 3.4258, "step": 29650 }, { "epoch": 8.643921751280857, "grad_norm": 0.34925612807273865, "learning_rate": 0.000496567268491555, "loss": 3.4391, "step": 29700 }, { "epoch": 8.658476944573824, "grad_norm": 0.3531148433685303, "learning_rate": 0.0004963925451368665, "loss": 3.4273, "step": 29750 }, { "epoch": 8.673032137866791, "grad_norm": 0.3444005250930786, "learning_rate": 0.0004962178217821782, "loss": 3.4193, "step": 29800 }, { "epoch": 8.687587331159758, "grad_norm": 0.31189289689064026, "learning_rate": 0.0004960430984274898, "loss": 3.4266, "step": 29850 }, { "epoch": 8.702142524452725, "grad_norm": 0.3210609257221222, "learning_rate": 0.0004958683750728014, "loss": 3.4203, "step": 29900 }, { "epoch": 8.716697717745692, "grad_norm": 0.37667086720466614, "learning_rate": 0.0004956936517181129, "loss": 3.4251, "step": 29950 }, { "epoch": 8.73125291103866, "grad_norm": 0.3318817615509033, "learning_rate": 0.0004955189283634246, "loss": 3.4224, "step": 30000 }, { "epoch": 8.73125291103866, "eval_accuracy": 0.36724474038864635, "eval_loss": 3.570685625076294, "eval_runtime": 180.221, "eval_samples_per_second": 92.392, "eval_steps_per_second": 5.776, "step": 30000 }, { "epoch": 8.745808104331626, "grad_norm": 0.32190632820129395, "learning_rate": 0.0004953442050087361, "loss": 3.4281, "step": 30050 }, { "epoch": 8.760363297624593, "grad_norm": 0.31246793270111084, "learning_rate": 0.0004951694816540476, "loss": 3.4317, "step": 30100 }, { "epoch": 8.774918490917559, "grad_norm": 0.31603461503982544, "learning_rate": 0.0004949947582993593, "loss": 3.4304, "step": 30150 }, { "epoch": 8.789473684210526, "grad_norm": 0.35877224802970886, "learning_rate": 0.0004948200349446709, "loss": 3.4269, "step": 30200 }, { "epoch": 8.804028877503493, "grad_norm": 0.31865957379341125, "learning_rate": 0.0004946453115899825, "loss": 3.4184, "step": 30250 }, { "epoch": 8.81858407079646, "grad_norm": 0.3399755656719208, "learning_rate": 0.000494470588235294, "loss": 3.4352, "step": 30300 }, { "epoch": 8.833139264089427, "grad_norm": 0.3316100835800171, "learning_rate": 0.0004942958648806057, "loss": 3.4254, "step": 30350 }, { "epoch": 8.847694457382394, "grad_norm": 0.33851954340934753, "learning_rate": 0.0004941211415259173, "loss": 3.4246, "step": 30400 }, { "epoch": 8.862249650675361, "grad_norm": 0.3344631791114807, "learning_rate": 0.0004939464181712289, "loss": 3.4325, "step": 30450 }, { "epoch": 8.876804843968328, "grad_norm": 0.3349270820617676, "learning_rate": 0.0004937716948165404, "loss": 3.4219, "step": 30500 }, { "epoch": 8.891360037261295, "grad_norm": 0.3200686275959015, "learning_rate": 0.000493596971461852, "loss": 3.432, "step": 30550 }, { "epoch": 8.905915230554262, "grad_norm": 0.31628715991973877, "learning_rate": 0.0004934222481071636, "loss": 3.4319, "step": 30600 }, { "epoch": 8.92047042384723, "grad_norm": 0.32253578305244446, "learning_rate": 0.0004932475247524751, "loss": 3.4424, "step": 30650 }, { "epoch": 8.935025617140196, "grad_norm": 0.33545029163360596, "learning_rate": 0.0004930728013977868, "loss": 3.4383, "step": 30700 }, { "epoch": 8.949580810433163, "grad_norm": 0.33790600299835205, "learning_rate": 0.0004928980780430984, "loss": 3.4362, "step": 30750 }, { "epoch": 8.964136003726129, "grad_norm": 0.32301875948905945, "learning_rate": 0.00049272335468841, "loss": 3.4417, "step": 30800 }, { "epoch": 8.978691197019096, "grad_norm": 0.3249106705188751, "learning_rate": 0.0004925486313337215, "loss": 3.4431, "step": 30850 }, { "epoch": 8.993246390312063, "grad_norm": 0.36046266555786133, "learning_rate": 0.0004923739079790332, "loss": 3.4302, "step": 30900 }, { "epoch": 9.007568700512342, "grad_norm": 0.3235209286212921, "learning_rate": 0.0004921991846243447, "loss": 3.389, "step": 30950 }, { "epoch": 9.02212389380531, "grad_norm": 0.36187535524368286, "learning_rate": 0.0004920244612696563, "loss": 3.3221, "step": 31000 }, { "epoch": 9.02212389380531, "eval_accuracy": 0.3672638973394286, "eval_loss": 3.575305223464966, "eval_runtime": 180.1978, "eval_samples_per_second": 92.404, "eval_steps_per_second": 5.777, "step": 31000 }, { "epoch": 9.036679087098276, "grad_norm": 0.35662412643432617, "learning_rate": 0.0004918497379149679, "loss": 3.3411, "step": 31050 }, { "epoch": 9.051234280391244, "grad_norm": 0.3371991515159607, "learning_rate": 0.0004916750145602795, "loss": 3.3441, "step": 31100 }, { "epoch": 9.06578947368421, "grad_norm": 0.34950143098831177, "learning_rate": 0.0004915002912055911, "loss": 3.3471, "step": 31150 }, { "epoch": 9.080344666977178, "grad_norm": 0.37028005719184875, "learning_rate": 0.0004913255678509026, "loss": 3.3389, "step": 31200 }, { "epoch": 9.094899860270145, "grad_norm": 0.3721117377281189, "learning_rate": 0.0004911508444962143, "loss": 3.3536, "step": 31250 }, { "epoch": 9.109455053563112, "grad_norm": 0.3477531671524048, "learning_rate": 0.0004909761211415259, "loss": 3.3449, "step": 31300 }, { "epoch": 9.124010246856079, "grad_norm": 0.3422900140285492, "learning_rate": 0.0004908013977868375, "loss": 3.3408, "step": 31350 }, { "epoch": 9.138565440149046, "grad_norm": 0.3360619843006134, "learning_rate": 0.0004906266744321491, "loss": 3.3495, "step": 31400 }, { "epoch": 9.153120633442011, "grad_norm": 0.37848517298698425, "learning_rate": 0.0004904519510774606, "loss": 3.3578, "step": 31450 }, { "epoch": 9.167675826734978, "grad_norm": 0.361354798078537, "learning_rate": 0.0004902772277227722, "loss": 3.3468, "step": 31500 }, { "epoch": 9.182231020027945, "grad_norm": 0.35507822036743164, "learning_rate": 0.0004901025043680838, "loss": 3.3568, "step": 31550 }, { "epoch": 9.196786213320912, "grad_norm": 0.33032676577568054, "learning_rate": 0.0004899277810133955, "loss": 3.3565, "step": 31600 }, { "epoch": 9.21134140661388, "grad_norm": 0.35065019130706787, "learning_rate": 0.000489753057658707, "loss": 3.3672, "step": 31650 }, { "epoch": 9.225896599906847, "grad_norm": 0.3741622269153595, "learning_rate": 0.0004895783343040186, "loss": 3.3686, "step": 31700 }, { "epoch": 9.240451793199814, "grad_norm": 0.3545469641685486, "learning_rate": 0.0004894036109493302, "loss": 3.3758, "step": 31750 }, { "epoch": 9.25500698649278, "grad_norm": 0.3496120572090149, "learning_rate": 0.0004892288875946419, "loss": 3.3586, "step": 31800 }, { "epoch": 9.269562179785748, "grad_norm": 0.3568468987941742, "learning_rate": 0.0004890541642399534, "loss": 3.3697, "step": 31850 }, { "epoch": 9.284117373078715, "grad_norm": 0.3583312928676605, "learning_rate": 0.0004888794408852649, "loss": 3.37, "step": 31900 }, { "epoch": 9.298672566371682, "grad_norm": 0.36404433846473694, "learning_rate": 0.0004887047175305766, "loss": 3.3765, "step": 31950 }, { "epoch": 9.313227759664649, "grad_norm": 0.3239692747592926, "learning_rate": 0.0004885299941758881, "loss": 3.3657, "step": 32000 }, { "epoch": 9.313227759664649, "eval_accuracy": 0.3678151004018141, "eval_loss": 3.5762815475463867, "eval_runtime": 180.247, "eval_samples_per_second": 92.379, "eval_steps_per_second": 5.775, "step": 32000 }, { "epoch": 9.327782952957616, "grad_norm": 0.3725309669971466, "learning_rate": 0.0004883552708211997, "loss": 3.3785, "step": 32050 }, { "epoch": 9.342338146250583, "grad_norm": 0.3427802622318268, "learning_rate": 0.00048818054746651137, "loss": 3.3737, "step": 32100 }, { "epoch": 9.356893339543548, "grad_norm": 0.33704817295074463, "learning_rate": 0.0004880058241118229, "loss": 3.3824, "step": 32150 }, { "epoch": 9.371448532836515, "grad_norm": 0.35286030173301697, "learning_rate": 0.0004878311007571345, "loss": 3.3731, "step": 32200 }, { "epoch": 9.386003726129482, "grad_norm": 0.33433571457862854, "learning_rate": 0.0004876563774024461, "loss": 3.3811, "step": 32250 }, { "epoch": 9.40055891942245, "grad_norm": 0.34206315875053406, "learning_rate": 0.00048748165404775763, "loss": 3.3723, "step": 32300 }, { "epoch": 9.415114112715417, "grad_norm": 0.3595890700817108, "learning_rate": 0.0004873069306930693, "loss": 3.3879, "step": 32350 }, { "epoch": 9.429669306008384, "grad_norm": 0.3569333553314209, "learning_rate": 0.0004871322073383809, "loss": 3.3881, "step": 32400 }, { "epoch": 9.44422449930135, "grad_norm": 0.35105010867118835, "learning_rate": 0.00048695748398369247, "loss": 3.3838, "step": 32450 }, { "epoch": 9.458779692594318, "grad_norm": 0.35852131247520447, "learning_rate": 0.000486782760629004, "loss": 3.3908, "step": 32500 }, { "epoch": 9.473334885887285, "grad_norm": 0.328770250082016, "learning_rate": 0.0004866080372743156, "loss": 3.3867, "step": 32550 }, { "epoch": 9.487890079180252, "grad_norm": 0.3429681360721588, "learning_rate": 0.0004864333139196272, "loss": 3.3908, "step": 32600 }, { "epoch": 9.502445272473219, "grad_norm": 0.3401564955711365, "learning_rate": 0.00048625859056493885, "loss": 3.401, "step": 32650 }, { "epoch": 9.517000465766186, "grad_norm": 0.35480111837387085, "learning_rate": 0.0004860838672102504, "loss": 3.3864, "step": 32700 }, { "epoch": 9.531555659059153, "grad_norm": 0.34914788603782654, "learning_rate": 0.000485909143855562, "loss": 3.3759, "step": 32750 }, { "epoch": 9.546110852352118, "grad_norm": 0.3693493902683258, "learning_rate": 0.0004857344205008736, "loss": 3.3912, "step": 32800 }, { "epoch": 9.560666045645085, "grad_norm": 0.3824955224990845, "learning_rate": 0.00048555969714618517, "loss": 3.3948, "step": 32850 }, { "epoch": 9.575221238938052, "grad_norm": 0.3300747871398926, "learning_rate": 0.0004853849737914967, "loss": 3.3848, "step": 32900 }, { "epoch": 9.58977643223102, "grad_norm": 0.37243399024009705, "learning_rate": 0.00048521025043680836, "loss": 3.3803, "step": 32950 }, { "epoch": 9.604331625523987, "grad_norm": 0.33435991406440735, "learning_rate": 0.00048503552708211995, "loss": 3.4001, "step": 33000 }, { "epoch": 9.604331625523987, "eval_accuracy": 0.36851850132470904, "eval_loss": 3.569307804107666, "eval_runtime": 180.3956, "eval_samples_per_second": 92.303, "eval_steps_per_second": 5.771, "step": 33000 }, { "epoch": 9.618886818816954, "grad_norm": 0.3287038803100586, "learning_rate": 0.00048486080372743155, "loss": 3.395, "step": 33050 }, { "epoch": 9.63344201210992, "grad_norm": 0.35405418276786804, "learning_rate": 0.0004846860803727431, "loss": 3.4001, "step": 33100 }, { "epoch": 9.647997205402888, "grad_norm": 0.3454687297344208, "learning_rate": 0.0004845113570180547, "loss": 3.3906, "step": 33150 }, { "epoch": 9.662552398695855, "grad_norm": 0.36389002203941345, "learning_rate": 0.00048433663366336633, "loss": 3.3872, "step": 33200 }, { "epoch": 9.677107591988822, "grad_norm": 0.34604567289352417, "learning_rate": 0.0004841619103086779, "loss": 3.3988, "step": 33250 }, { "epoch": 9.691662785281789, "grad_norm": 0.34256061911582947, "learning_rate": 0.00048398718695398947, "loss": 3.4001, "step": 33300 }, { "epoch": 9.706217978574756, "grad_norm": 0.3654734194278717, "learning_rate": 0.00048381246359930106, "loss": 3.4122, "step": 33350 }, { "epoch": 9.720773171867723, "grad_norm": 0.3377828896045685, "learning_rate": 0.00048363774024461265, "loss": 3.3892, "step": 33400 }, { "epoch": 9.73532836516069, "grad_norm": 0.36031660437583923, "learning_rate": 0.0004834630168899242, "loss": 3.4032, "step": 33450 }, { "epoch": 9.749883558453657, "grad_norm": 0.3316538631916046, "learning_rate": 0.00048328829353523584, "loss": 3.3988, "step": 33500 }, { "epoch": 9.764438751746622, "grad_norm": 0.32376495003700256, "learning_rate": 0.00048311357018054744, "loss": 3.4065, "step": 33550 }, { "epoch": 9.77899394503959, "grad_norm": 0.3392708897590637, "learning_rate": 0.00048293884682585903, "loss": 3.4202, "step": 33600 }, { "epoch": 9.793549138332557, "grad_norm": 0.3366035521030426, "learning_rate": 0.00048276412347117057, "loss": 3.4159, "step": 33650 }, { "epoch": 9.808104331625524, "grad_norm": 0.31964221596717834, "learning_rate": 0.00048258940011648217, "loss": 3.4184, "step": 33700 }, { "epoch": 9.82265952491849, "grad_norm": 0.35629507899284363, "learning_rate": 0.0004824146767617938, "loss": 3.4017, "step": 33750 }, { "epoch": 9.837214718211458, "grad_norm": 0.34650474786758423, "learning_rate": 0.0004822399534071054, "loss": 3.4053, "step": 33800 }, { "epoch": 9.851769911504425, "grad_norm": 0.3471023738384247, "learning_rate": 0.00048206523005241695, "loss": 3.4082, "step": 33850 }, { "epoch": 9.866325104797392, "grad_norm": 0.35872936248779297, "learning_rate": 0.00048189050669772854, "loss": 3.4147, "step": 33900 }, { "epoch": 9.880880298090359, "grad_norm": 0.3455120027065277, "learning_rate": 0.00048171578334304014, "loss": 3.4125, "step": 33950 }, { "epoch": 9.895435491383326, "grad_norm": 0.32333680987358093, "learning_rate": 0.00048154105998835173, "loss": 3.4211, "step": 34000 }, { "epoch": 9.895435491383326, "eval_accuracy": 0.36859818483777884, "eval_loss": 3.5596694946289062, "eval_runtime": 180.3996, "eval_samples_per_second": 92.301, "eval_steps_per_second": 5.771, "step": 34000 }, { "epoch": 9.909990684676293, "grad_norm": 0.32725003361701965, "learning_rate": 0.0004813663366336633, "loss": 3.4146, "step": 34050 }, { "epoch": 9.92454587796926, "grad_norm": 0.36068347096443176, "learning_rate": 0.0004811916132789749, "loss": 3.4132, "step": 34100 }, { "epoch": 9.939101071262227, "grad_norm": 0.3258740305900574, "learning_rate": 0.0004810168899242865, "loss": 3.4114, "step": 34150 }, { "epoch": 9.953656264555192, "grad_norm": 0.32389259338378906, "learning_rate": 0.0004808421665695981, "loss": 3.417, "step": 34200 }, { "epoch": 9.96821145784816, "grad_norm": 0.3479362428188324, "learning_rate": 0.00048066744321490965, "loss": 3.4109, "step": 34250 }, { "epoch": 9.982766651141127, "grad_norm": 0.34827226400375366, "learning_rate": 0.00048049271986022124, "loss": 3.4072, "step": 34300 }, { "epoch": 9.997321844434094, "grad_norm": 0.38424593210220337, "learning_rate": 0.0004803179965055329, "loss": 3.4037, "step": 34350 }, { "epoch": 10.011644154634373, "grad_norm": 0.3425210118293762, "learning_rate": 0.0004801432731508445, "loss": 3.3371, "step": 34400 }, { "epoch": 10.02619934792734, "grad_norm": 0.36271992325782776, "learning_rate": 0.000479968549796156, "loss": 3.3022, "step": 34450 }, { "epoch": 10.040754541220307, "grad_norm": 0.33403560519218445, "learning_rate": 0.0004797938264414676, "loss": 3.3102, "step": 34500 }, { "epoch": 10.055309734513274, "grad_norm": 0.35867980122566223, "learning_rate": 0.0004796191030867792, "loss": 3.2987, "step": 34550 }, { "epoch": 10.069864927806242, "grad_norm": 0.38395223021507263, "learning_rate": 0.00047944437973209086, "loss": 3.3189, "step": 34600 }, { "epoch": 10.084420121099209, "grad_norm": 0.3382980525493622, "learning_rate": 0.0004792696563774024, "loss": 3.3182, "step": 34650 }, { "epoch": 10.098975314392176, "grad_norm": 0.34451884031295776, "learning_rate": 0.000479094933022714, "loss": 3.3255, "step": 34700 }, { "epoch": 10.113530507685143, "grad_norm": 0.37175989151000977, "learning_rate": 0.0004789202096680256, "loss": 3.3201, "step": 34750 }, { "epoch": 10.12808570097811, "grad_norm": 0.35105881094932556, "learning_rate": 0.00047874548631333713, "loss": 3.3201, "step": 34800 }, { "epoch": 10.142640894271075, "grad_norm": 0.34279462695121765, "learning_rate": 0.0004785707629586487, "loss": 3.3235, "step": 34850 }, { "epoch": 10.157196087564042, "grad_norm": 0.3397054672241211, "learning_rate": 0.0004783960396039604, "loss": 3.3373, "step": 34900 }, { "epoch": 10.17175128085701, "grad_norm": 0.3448835611343384, "learning_rate": 0.00047822131624927197, "loss": 3.3277, "step": 34950 }, { "epoch": 10.186306474149976, "grad_norm": 0.3452892005443573, "learning_rate": 0.0004780465928945835, "loss": 3.322, "step": 35000 }, { "epoch": 10.186306474149976, "eval_accuracy": 0.3684472797776289, "eval_loss": 3.5718932151794434, "eval_runtime": 180.4472, "eval_samples_per_second": 92.276, "eval_steps_per_second": 5.769, "step": 35000 }, { "epoch": 10.200861667442943, "grad_norm": 0.40142256021499634, "learning_rate": 0.0004778718695398951, "loss": 3.338, "step": 35050 }, { "epoch": 10.21541686073591, "grad_norm": 0.3434009552001953, "learning_rate": 0.0004776971461852067, "loss": 3.3376, "step": 35100 }, { "epoch": 10.229972054028877, "grad_norm": 0.359845906496048, "learning_rate": 0.00047752242283051835, "loss": 3.3372, "step": 35150 }, { "epoch": 10.244527247321844, "grad_norm": 0.32966330647468567, "learning_rate": 0.00047734769947582994, "loss": 3.3296, "step": 35200 }, { "epoch": 10.259082440614812, "grad_norm": 0.374866783618927, "learning_rate": 0.0004771729761211415, "loss": 3.3491, "step": 35250 }, { "epoch": 10.273637633907779, "grad_norm": 0.34373581409454346, "learning_rate": 0.0004769982527664531, "loss": 3.3564, "step": 35300 }, { "epoch": 10.288192827200746, "grad_norm": 0.3617185354232788, "learning_rate": 0.00047682352941176467, "loss": 3.3548, "step": 35350 }, { "epoch": 10.302748020493713, "grad_norm": 0.3325277268886566, "learning_rate": 0.0004766488060570762, "loss": 3.353, "step": 35400 }, { "epoch": 10.31730321378668, "grad_norm": 0.3682878613471985, "learning_rate": 0.00047647408270238786, "loss": 3.3626, "step": 35450 }, { "epoch": 10.331858407079647, "grad_norm": 0.3664827346801758, "learning_rate": 0.00047629935934769945, "loss": 3.3564, "step": 35500 }, { "epoch": 10.346413600372612, "grad_norm": 0.34020736813545227, "learning_rate": 0.00047612463599301105, "loss": 3.3518, "step": 35550 }, { "epoch": 10.36096879366558, "grad_norm": 0.3571007549762726, "learning_rate": 0.0004759499126383226, "loss": 3.3407, "step": 35600 }, { "epoch": 10.375523986958546, "grad_norm": 0.36048614978790283, "learning_rate": 0.0004757751892836342, "loss": 3.3714, "step": 35650 }, { "epoch": 10.390079180251513, "grad_norm": 0.3628058433532715, "learning_rate": 0.0004756004659289458, "loss": 3.3599, "step": 35700 }, { "epoch": 10.40463437354448, "grad_norm": 0.35612308979034424, "learning_rate": 0.0004754257425742574, "loss": 3.3563, "step": 35750 }, { "epoch": 10.419189566837447, "grad_norm": 0.35725560784339905, "learning_rate": 0.00047525101921956896, "loss": 3.3706, "step": 35800 }, { "epoch": 10.433744760130415, "grad_norm": 0.35436466336250305, "learning_rate": 0.00047507629586488056, "loss": 3.3652, "step": 35850 }, { "epoch": 10.448299953423382, "grad_norm": 0.3382556140422821, "learning_rate": 0.00047490157251019215, "loss": 3.3532, "step": 35900 }, { "epoch": 10.462855146716349, "grad_norm": 0.35225167870521545, "learning_rate": 0.0004747268491555037, "loss": 3.3646, "step": 35950 }, { "epoch": 10.477410340009316, "grad_norm": 0.32531359791755676, "learning_rate": 0.00047455212580081534, "loss": 3.3627, "step": 36000 }, { "epoch": 10.477410340009316, "eval_accuracy": 0.36913105364051996, "eval_loss": 3.563882827758789, "eval_runtime": 180.2613, "eval_samples_per_second": 92.371, "eval_steps_per_second": 5.775, "step": 36000 }, { "epoch": 10.491965533302283, "grad_norm": 0.34755268692970276, "learning_rate": 0.00047437740244612694, "loss": 3.375, "step": 36050 }, { "epoch": 10.50652072659525, "grad_norm": 0.33514404296875, "learning_rate": 0.00047420267909143853, "loss": 3.3773, "step": 36100 }, { "epoch": 10.521075919888217, "grad_norm": 0.38694730401039124, "learning_rate": 0.0004740279557367501, "loss": 3.3542, "step": 36150 }, { "epoch": 10.535631113181182, "grad_norm": 0.3417895436286926, "learning_rate": 0.00047385323238206166, "loss": 3.3665, "step": 36200 }, { "epoch": 10.55018630647415, "grad_norm": 0.3314196765422821, "learning_rate": 0.00047367850902737326, "loss": 3.3636, "step": 36250 }, { "epoch": 10.564741499767116, "grad_norm": 0.34591639041900635, "learning_rate": 0.0004735037856726849, "loss": 3.3788, "step": 36300 }, { "epoch": 10.579296693060083, "grad_norm": 0.358770489692688, "learning_rate": 0.0004733290623179965, "loss": 3.3828, "step": 36350 }, { "epoch": 10.59385188635305, "grad_norm": 0.3519880771636963, "learning_rate": 0.00047315433896330804, "loss": 3.3722, "step": 36400 }, { "epoch": 10.608407079646017, "grad_norm": 0.3622148633003235, "learning_rate": 0.00047297961560861964, "loss": 3.3689, "step": 36450 }, { "epoch": 10.622962272938985, "grad_norm": 0.35043129324913025, "learning_rate": 0.00047280489225393123, "loss": 3.3859, "step": 36500 }, { "epoch": 10.637517466231952, "grad_norm": 0.35105785727500916, "learning_rate": 0.0004726301688992429, "loss": 3.3838, "step": 36550 }, { "epoch": 10.652072659524919, "grad_norm": 0.3403192460536957, "learning_rate": 0.0004724554455445544, "loss": 3.3754, "step": 36600 }, { "epoch": 10.666627852817886, "grad_norm": 0.3527704179286957, "learning_rate": 0.000472280722189866, "loss": 3.3921, "step": 36650 }, { "epoch": 10.681183046110853, "grad_norm": 0.3420533537864685, "learning_rate": 0.0004721059988351776, "loss": 3.3756, "step": 36700 }, { "epoch": 10.69573823940382, "grad_norm": 0.35744109749794006, "learning_rate": 0.00047193127548048915, "loss": 3.3857, "step": 36750 }, { "epoch": 10.710293432696787, "grad_norm": 0.3527224063873291, "learning_rate": 0.00047175655212580074, "loss": 3.3853, "step": 36800 }, { "epoch": 10.724848625989754, "grad_norm": 0.3439313471317291, "learning_rate": 0.0004715818287711124, "loss": 3.3893, "step": 36850 }, { "epoch": 10.73940381928272, "grad_norm": 0.3417615592479706, "learning_rate": 0.000471407105416424, "loss": 3.3822, "step": 36900 }, { "epoch": 10.753959012575686, "grad_norm": 0.3969389796257019, "learning_rate": 0.0004712323820617355, "loss": 3.3851, "step": 36950 }, { "epoch": 10.768514205868653, "grad_norm": 0.3360283672809601, "learning_rate": 0.0004710576587070471, "loss": 3.3818, "step": 37000 }, { "epoch": 10.768514205868653, "eval_accuracy": 0.36942592964980037, "eval_loss": 3.5583348274230957, "eval_runtime": 180.1813, "eval_samples_per_second": 92.412, "eval_steps_per_second": 5.778, "step": 37000 }, { "epoch": 10.78306939916162, "grad_norm": 0.37160441279411316, "learning_rate": 0.0004708829353523587, "loss": 3.3785, "step": 37050 }, { "epoch": 10.797624592454587, "grad_norm": 0.36143237352371216, "learning_rate": 0.0004707082119976703, "loss": 3.3838, "step": 37100 }, { "epoch": 10.812179785747555, "grad_norm": 0.35929399728775024, "learning_rate": 0.0004705334886429819, "loss": 3.3866, "step": 37150 }, { "epoch": 10.826734979040522, "grad_norm": 0.3479052782058716, "learning_rate": 0.0004703587652882935, "loss": 3.3967, "step": 37200 }, { "epoch": 10.841290172333489, "grad_norm": 0.3509685695171356, "learning_rate": 0.0004701840419336051, "loss": 3.3834, "step": 37250 }, { "epoch": 10.855845365626456, "grad_norm": 0.3558409512042999, "learning_rate": 0.0004700093185789167, "loss": 3.3897, "step": 37300 }, { "epoch": 10.870400558919423, "grad_norm": 0.34786784648895264, "learning_rate": 0.0004698345952242282, "loss": 3.3922, "step": 37350 }, { "epoch": 10.88495575221239, "grad_norm": 0.3419925272464752, "learning_rate": 0.00046965987186953987, "loss": 3.3931, "step": 37400 }, { "epoch": 10.899510945505357, "grad_norm": 0.3539605140686035, "learning_rate": 0.00046948514851485147, "loss": 3.3835, "step": 37450 }, { "epoch": 10.914066138798324, "grad_norm": 0.3516765832901001, "learning_rate": 0.00046931042516016306, "loss": 3.3791, "step": 37500 }, { "epoch": 10.92862133209129, "grad_norm": 0.35374191403388977, "learning_rate": 0.0004691357018054746, "loss": 3.3899, "step": 37550 }, { "epoch": 10.943176525384256, "grad_norm": 0.348615825176239, "learning_rate": 0.0004689609784507862, "loss": 3.3911, "step": 37600 }, { "epoch": 10.957731718677223, "grad_norm": 0.3586934804916382, "learning_rate": 0.0004687862550960978, "loss": 3.3837, "step": 37650 }, { "epoch": 10.97228691197019, "grad_norm": 0.3774052560329437, "learning_rate": 0.00046861153174140944, "loss": 3.3899, "step": 37700 }, { "epoch": 10.986842105263158, "grad_norm": 0.3531131148338318, "learning_rate": 0.000468436808386721, "loss": 3.3886, "step": 37750 }, { "epoch": 11.001164415463437, "grad_norm": 0.37890660762786865, "learning_rate": 0.0004682620850320326, "loss": 3.3903, "step": 37800 }, { "epoch": 11.015719608756404, "grad_norm": 0.3617331087589264, "learning_rate": 0.00046808736167734417, "loss": 3.2755, "step": 37850 }, { "epoch": 11.030274802049371, "grad_norm": 0.3595845103263855, "learning_rate": 0.0004679126383226557, "loss": 3.2797, "step": 37900 }, { "epoch": 11.044829995342338, "grad_norm": 0.3700632154941559, "learning_rate": 0.00046773791496796736, "loss": 3.2957, "step": 37950 }, { "epoch": 11.059385188635305, "grad_norm": 0.35127753019332886, "learning_rate": 0.00046756319161327895, "loss": 3.2802, "step": 38000 }, { "epoch": 11.059385188635305, "eval_accuracy": 0.3692489335278489, "eval_loss": 3.566408395767212, "eval_runtime": 180.4233, "eval_samples_per_second": 92.289, "eval_steps_per_second": 5.77, "step": 38000 }, { "epoch": 11.073940381928272, "grad_norm": 0.3485884368419647, "learning_rate": 0.00046738846825859054, "loss": 3.2898, "step": 38050 }, { "epoch": 11.08849557522124, "grad_norm": 0.3383578360080719, "learning_rate": 0.0004672137449039021, "loss": 3.2914, "step": 38100 }, { "epoch": 11.103050768514207, "grad_norm": 0.35948649048805237, "learning_rate": 0.0004670390215492137, "loss": 3.3153, "step": 38150 }, { "epoch": 11.117605961807174, "grad_norm": 0.3707924485206604, "learning_rate": 0.0004668642981945253, "loss": 3.2946, "step": 38200 }, { "epoch": 11.132161155100139, "grad_norm": 0.3674662113189697, "learning_rate": 0.0004666895748398369, "loss": 3.316, "step": 38250 }, { "epoch": 11.146716348393106, "grad_norm": 0.3381125330924988, "learning_rate": 0.00046651485148514846, "loss": 3.3024, "step": 38300 }, { "epoch": 11.161271541686073, "grad_norm": 0.33862781524658203, "learning_rate": 0.00046634012813046006, "loss": 3.3116, "step": 38350 }, { "epoch": 11.17582673497904, "grad_norm": 0.3579528033733368, "learning_rate": 0.00046616540477577165, "loss": 3.324, "step": 38400 }, { "epoch": 11.190381928272007, "grad_norm": 0.3564028739929199, "learning_rate": 0.00046599068142108324, "loss": 3.3027, "step": 38450 }, { "epoch": 11.204937121564974, "grad_norm": 0.3573664128780365, "learning_rate": 0.0004658159580663948, "loss": 3.3219, "step": 38500 }, { "epoch": 11.219492314857941, "grad_norm": 0.39046090841293335, "learning_rate": 0.00046564123471170643, "loss": 3.3317, "step": 38550 }, { "epoch": 11.234047508150908, "grad_norm": 0.3551309406757355, "learning_rate": 0.00046546651135701803, "loss": 3.3281, "step": 38600 }, { "epoch": 11.248602701443875, "grad_norm": 0.39006704092025757, "learning_rate": 0.0004652917880023296, "loss": 3.3157, "step": 38650 }, { "epoch": 11.263157894736842, "grad_norm": 0.37342214584350586, "learning_rate": 0.00046511706464764116, "loss": 3.3242, "step": 38700 }, { "epoch": 11.27771308802981, "grad_norm": 0.38613465428352356, "learning_rate": 0.00046494234129295276, "loss": 3.3317, "step": 38750 }, { "epoch": 11.292268281322777, "grad_norm": 0.3982740044593811, "learning_rate": 0.0004647676179382644, "loss": 3.3216, "step": 38800 }, { "epoch": 11.306823474615744, "grad_norm": 0.3330700397491455, "learning_rate": 0.000464592894583576, "loss": 3.3345, "step": 38850 }, { "epoch": 11.32137866790871, "grad_norm": 0.3406359851360321, "learning_rate": 0.00046441817122888754, "loss": 3.3454, "step": 38900 }, { "epoch": 11.335933861201676, "grad_norm": 0.34419894218444824, "learning_rate": 0.00046424344787419913, "loss": 3.3413, "step": 38950 }, { "epoch": 11.350489054494643, "grad_norm": 0.343436062335968, "learning_rate": 0.00046406872451951073, "loss": 3.3399, "step": 39000 }, { "epoch": 11.350489054494643, "eval_accuracy": 0.3694167625199782, "eval_loss": 3.564185857772827, "eval_runtime": 180.3211, "eval_samples_per_second": 92.341, "eval_steps_per_second": 5.773, "step": 39000 }, { "epoch": 11.36504424778761, "grad_norm": 0.37155765295028687, "learning_rate": 0.00046389400116482227, "loss": 3.3407, "step": 39050 }, { "epoch": 11.379599441080577, "grad_norm": 0.374979704618454, "learning_rate": 0.0004637192778101339, "loss": 3.3381, "step": 39100 }, { "epoch": 11.394154634373544, "grad_norm": 0.3403925597667694, "learning_rate": 0.0004635445544554455, "loss": 3.3316, "step": 39150 }, { "epoch": 11.408709827666511, "grad_norm": 0.4009750783443451, "learning_rate": 0.0004633698311007571, "loss": 3.3433, "step": 39200 }, { "epoch": 11.423265020959478, "grad_norm": 0.3344782590866089, "learning_rate": 0.0004631951077460687, "loss": 3.3446, "step": 39250 }, { "epoch": 11.437820214252445, "grad_norm": 0.34845563769340515, "learning_rate": 0.00046302038439138024, "loss": 3.3432, "step": 39300 }, { "epoch": 11.452375407545413, "grad_norm": 0.3674914538860321, "learning_rate": 0.0004628456610366919, "loss": 3.356, "step": 39350 }, { "epoch": 11.46693060083838, "grad_norm": 0.34272027015686035, "learning_rate": 0.0004626709376820035, "loss": 3.3519, "step": 39400 }, { "epoch": 11.481485794131347, "grad_norm": 0.3808971643447876, "learning_rate": 0.0004624962143273151, "loss": 3.3579, "step": 39450 }, { "epoch": 11.496040987424314, "grad_norm": 0.3545505404472351, "learning_rate": 0.0004623214909726266, "loss": 3.3463, "step": 39500 }, { "epoch": 11.51059618071728, "grad_norm": 0.3522847592830658, "learning_rate": 0.0004621467676179382, "loss": 3.3469, "step": 39550 }, { "epoch": 11.525151374010246, "grad_norm": 0.3556261360645294, "learning_rate": 0.0004619720442632498, "loss": 3.3488, "step": 39600 }, { "epoch": 11.539706567303213, "grad_norm": 0.38514190912246704, "learning_rate": 0.00046179732090856145, "loss": 3.3529, "step": 39650 }, { "epoch": 11.55426176059618, "grad_norm": 0.3602418303489685, "learning_rate": 0.000461622597553873, "loss": 3.3548, "step": 39700 }, { "epoch": 11.568816953889147, "grad_norm": 0.36884602904319763, "learning_rate": 0.0004614478741991846, "loss": 3.3509, "step": 39750 }, { "epoch": 11.583372147182114, "grad_norm": 0.3395715653896332, "learning_rate": 0.0004612731508444962, "loss": 3.352, "step": 39800 }, { "epoch": 11.597927340475081, "grad_norm": 0.3535105288028717, "learning_rate": 0.0004610984274898077, "loss": 3.3637, "step": 39850 }, { "epoch": 11.612482533768048, "grad_norm": 0.34964367747306824, "learning_rate": 0.00046092370413511937, "loss": 3.3548, "step": 39900 }, { "epoch": 11.627037727061015, "grad_norm": 0.33077746629714966, "learning_rate": 0.00046074898078043096, "loss": 3.3675, "step": 39950 }, { "epoch": 11.641592920353983, "grad_norm": 0.3740006685256958, "learning_rate": 0.00046057425742574256, "loss": 3.3617, "step": 40000 }, { "epoch": 11.641592920353983, "eval_accuracy": 0.36993376513648857, "eval_loss": 3.555708646774292, "eval_runtime": 180.3223, "eval_samples_per_second": 92.34, "eval_steps_per_second": 5.773, "step": 40000 }, { "epoch": 11.65614811364695, "grad_norm": 0.3477848470211029, "learning_rate": 0.0004603995340710541, "loss": 3.3557, "step": 40050 }, { "epoch": 11.670703306939917, "grad_norm": 0.3602433502674103, "learning_rate": 0.0004602248107163657, "loss": 3.3637, "step": 40100 }, { "epoch": 11.685258500232884, "grad_norm": 0.3608153760433197, "learning_rate": 0.0004600500873616773, "loss": 3.3517, "step": 40150 }, { "epoch": 11.69981369352585, "grad_norm": 0.33879753947257996, "learning_rate": 0.00045987536400698894, "loss": 3.357, "step": 40200 }, { "epoch": 11.714368886818818, "grad_norm": 0.34680354595184326, "learning_rate": 0.0004597006406523005, "loss": 3.3629, "step": 40250 }, { "epoch": 11.728924080111783, "grad_norm": 0.40730366110801697, "learning_rate": 0.00045952591729761207, "loss": 3.3531, "step": 40300 }, { "epoch": 11.74347927340475, "grad_norm": 0.359557181596756, "learning_rate": 0.00045935119394292367, "loss": 3.359, "step": 40350 }, { "epoch": 11.758034466697717, "grad_norm": 0.364164263010025, "learning_rate": 0.00045917647058823526, "loss": 3.3611, "step": 40400 }, { "epoch": 11.772589659990684, "grad_norm": 0.33116286993026733, "learning_rate": 0.0004590017472335468, "loss": 3.354, "step": 40450 }, { "epoch": 11.787144853283651, "grad_norm": 0.3643271327018738, "learning_rate": 0.00045882702387885845, "loss": 3.3618, "step": 40500 }, { "epoch": 11.801700046576618, "grad_norm": 0.3368518054485321, "learning_rate": 0.00045865230052417004, "loss": 3.3636, "step": 40550 }, { "epoch": 11.816255239869585, "grad_norm": 0.38348156213760376, "learning_rate": 0.00045847757716948164, "loss": 3.3618, "step": 40600 }, { "epoch": 11.830810433162553, "grad_norm": 0.3593296408653259, "learning_rate": 0.0004583028538147932, "loss": 3.363, "step": 40650 }, { "epoch": 11.84536562645552, "grad_norm": 0.32494640350341797, "learning_rate": 0.00045812813046010477, "loss": 3.3647, "step": 40700 }, { "epoch": 11.859920819748487, "grad_norm": 0.368127703666687, "learning_rate": 0.0004579534071054164, "loss": 3.3613, "step": 40750 }, { "epoch": 11.874476013041454, "grad_norm": 0.348438024520874, "learning_rate": 0.000457778683750728, "loss": 3.3815, "step": 40800 }, { "epoch": 11.88903120633442, "grad_norm": 0.36136871576309204, "learning_rate": 0.00045760396039603955, "loss": 3.3584, "step": 40850 }, { "epoch": 11.903586399627388, "grad_norm": 0.35239171981811523, "learning_rate": 0.00045742923704135115, "loss": 3.3709, "step": 40900 }, { "epoch": 11.918141592920353, "grad_norm": 0.3534851372241974, "learning_rate": 0.00045725451368666274, "loss": 3.378, "step": 40950 }, { "epoch": 11.93269678621332, "grad_norm": 0.35740330815315247, "learning_rate": 0.0004570797903319743, "loss": 3.3705, "step": 41000 }, { "epoch": 11.93269678621332, "eval_accuracy": 0.3707223733558077, "eval_loss": 3.548895835876465, "eval_runtime": 180.5799, "eval_samples_per_second": 92.208, "eval_steps_per_second": 5.765, "step": 41000 }, { "epoch": 11.947251979506287, "grad_norm": 0.3446290194988251, "learning_rate": 0.00045690506697728593, "loss": 3.3662, "step": 41050 }, { "epoch": 11.961807172799254, "grad_norm": 0.35064175724983215, "learning_rate": 0.0004567303436225975, "loss": 3.3731, "step": 41100 }, { "epoch": 11.976362366092221, "grad_norm": 0.34023401141166687, "learning_rate": 0.0004565556202679091, "loss": 3.3651, "step": 41150 }, { "epoch": 11.990917559385188, "grad_norm": 0.3787825107574463, "learning_rate": 0.00045638089691322066, "loss": 3.3668, "step": 41200 }, { "epoch": 12.005239869585468, "grad_norm": 0.3622465133666992, "learning_rate": 0.00045620617355853225, "loss": 3.3389, "step": 41250 }, { "epoch": 12.019795062878435, "grad_norm": 0.39711546897888184, "learning_rate": 0.0004560314502038439, "loss": 3.2623, "step": 41300 }, { "epoch": 12.034350256171402, "grad_norm": 0.350892037153244, "learning_rate": 0.0004558567268491555, "loss": 3.2649, "step": 41350 }, { "epoch": 12.04890544946437, "grad_norm": 0.34795042872428894, "learning_rate": 0.00045568200349446704, "loss": 3.2671, "step": 41400 }, { "epoch": 12.063460642757336, "grad_norm": 0.3888186514377594, "learning_rate": 0.00045550728013977863, "loss": 3.2879, "step": 41450 }, { "epoch": 12.078015836050303, "grad_norm": 0.34922316670417786, "learning_rate": 0.0004553325567850902, "loss": 3.2832, "step": 41500 }, { "epoch": 12.09257102934327, "grad_norm": 0.33052611351013184, "learning_rate": 0.0004551578334304018, "loss": 3.2908, "step": 41550 }, { "epoch": 12.107126222636236, "grad_norm": 0.36272740364074707, "learning_rate": 0.00045498311007571347, "loss": 3.2702, "step": 41600 }, { "epoch": 12.121681415929203, "grad_norm": 0.38955602049827576, "learning_rate": 0.000454808386721025, "loss": 3.2789, "step": 41650 }, { "epoch": 12.13623660922217, "grad_norm": 0.36334124207496643, "learning_rate": 0.0004546336633663366, "loss": 3.294, "step": 41700 }, { "epoch": 12.150791802515137, "grad_norm": 0.3944578468799591, "learning_rate": 0.0004544589400116482, "loss": 3.2951, "step": 41750 }, { "epoch": 12.165346995808104, "grad_norm": 0.372695654630661, "learning_rate": 0.00045428421665695974, "loss": 3.2878, "step": 41800 }, { "epoch": 12.179902189101071, "grad_norm": 0.37630680203437805, "learning_rate": 0.00045410949330227133, "loss": 3.3036, "step": 41850 }, { "epoch": 12.194457382394038, "grad_norm": 0.37000468373298645, "learning_rate": 0.000453934769947583, "loss": 3.3052, "step": 41900 }, { "epoch": 12.209012575687005, "grad_norm": 0.405077189207077, "learning_rate": 0.0004537600465928946, "loss": 3.309, "step": 41950 }, { "epoch": 12.223567768979972, "grad_norm": 0.3456836938858032, "learning_rate": 0.0004535853232382061, "loss": 3.3017, "step": 42000 }, { "epoch": 12.223567768979972, "eval_accuracy": 0.36992142476942025, "eval_loss": 3.5639872550964355, "eval_runtime": 180.489, "eval_samples_per_second": 92.255, "eval_steps_per_second": 5.768, "step": 42000 }, { "epoch": 12.23812296227294, "grad_norm": 0.36617839336395264, "learning_rate": 0.0004534105998835177, "loss": 3.3066, "step": 42050 }, { "epoch": 12.252678155565906, "grad_norm": 0.3613641560077667, "learning_rate": 0.0004532358765288293, "loss": 3.3036, "step": 42100 }, { "epoch": 12.267233348858873, "grad_norm": 0.35123613476753235, "learning_rate": 0.00045306115317414095, "loss": 3.307, "step": 42150 }, { "epoch": 12.28178854215184, "grad_norm": 0.3253026306629181, "learning_rate": 0.0004528864298194525, "loss": 3.3034, "step": 42200 }, { "epoch": 12.296343735444808, "grad_norm": 0.40897390246391296, "learning_rate": 0.0004527117064647641, "loss": 3.3096, "step": 42250 }, { "epoch": 12.310898928737773, "grad_norm": 0.3497923016548157, "learning_rate": 0.0004525369831100757, "loss": 3.3168, "step": 42300 }, { "epoch": 12.32545412203074, "grad_norm": 0.37551721930503845, "learning_rate": 0.0004523622597553872, "loss": 3.3101, "step": 42350 }, { "epoch": 12.340009315323707, "grad_norm": 0.3474026918411255, "learning_rate": 0.0004521875364006988, "loss": 3.3126, "step": 42400 }, { "epoch": 12.354564508616674, "grad_norm": 0.3557891845703125, "learning_rate": 0.00045201281304601046, "loss": 3.3148, "step": 42450 }, { "epoch": 12.369119701909641, "grad_norm": 0.35979312658309937, "learning_rate": 0.00045183808969132206, "loss": 3.316, "step": 42500 }, { "epoch": 12.383674895202608, "grad_norm": 0.34894880652427673, "learning_rate": 0.00045166336633663365, "loss": 3.3215, "step": 42550 }, { "epoch": 12.398230088495575, "grad_norm": 0.350329726934433, "learning_rate": 0.0004514886429819452, "loss": 3.3254, "step": 42600 }, { "epoch": 12.412785281788542, "grad_norm": 0.36864790320396423, "learning_rate": 0.0004513139196272568, "loss": 3.3232, "step": 42650 }, { "epoch": 12.42734047508151, "grad_norm": 0.3645729720592499, "learning_rate": 0.00045113919627256843, "loss": 3.3231, "step": 42700 }, { "epoch": 12.441895668374476, "grad_norm": 0.3485332429409027, "learning_rate": 0.00045096447291788003, "loss": 3.3168, "step": 42750 }, { "epoch": 12.456450861667443, "grad_norm": 0.37229326367378235, "learning_rate": 0.00045078974956319157, "loss": 3.3287, "step": 42800 }, { "epoch": 12.47100605496041, "grad_norm": 0.3589822053909302, "learning_rate": 0.00045061502620850316, "loss": 3.3216, "step": 42850 }, { "epoch": 12.485561248253378, "grad_norm": 0.40110453963279724, "learning_rate": 0.00045044030285381476, "loss": 3.3302, "step": 42900 }, { "epoch": 12.500116441546343, "grad_norm": 0.351875901222229, "learning_rate": 0.0004502655794991263, "loss": 3.3319, "step": 42950 }, { "epoch": 12.51467163483931, "grad_norm": 0.3724513053894043, "learning_rate": 0.00045009085614443795, "loss": 3.3312, "step": 43000 }, { "epoch": 12.51467163483931, "eval_accuracy": 0.37046357822928894, "eval_loss": 3.5569114685058594, "eval_runtime": 180.4517, "eval_samples_per_second": 92.274, "eval_steps_per_second": 5.769, "step": 43000 }, { "epoch": 12.529226828132277, "grad_norm": 0.3918021023273468, "learning_rate": 0.00044991613278974954, "loss": 3.3331, "step": 43050 }, { "epoch": 12.543782021425244, "grad_norm": 0.3732805848121643, "learning_rate": 0.00044974140943506113, "loss": 3.3387, "step": 43100 }, { "epoch": 12.558337214718211, "grad_norm": 0.3405734896659851, "learning_rate": 0.0004495666860803727, "loss": 3.3384, "step": 43150 }, { "epoch": 12.572892408011178, "grad_norm": 0.3441692590713501, "learning_rate": 0.00044939196272568427, "loss": 3.3281, "step": 43200 }, { "epoch": 12.587447601304145, "grad_norm": 0.3538864552974701, "learning_rate": 0.00044921723937099586, "loss": 3.3412, "step": 43250 }, { "epoch": 12.602002794597112, "grad_norm": 0.3746154010295868, "learning_rate": 0.0004490425160163075, "loss": 3.3504, "step": 43300 }, { "epoch": 12.61655798789008, "grad_norm": 0.3543429970741272, "learning_rate": 0.00044886779266161905, "loss": 3.3332, "step": 43350 }, { "epoch": 12.631113181183046, "grad_norm": 0.383897066116333, "learning_rate": 0.00044869306930693065, "loss": 3.3323, "step": 43400 }, { "epoch": 12.645668374476013, "grad_norm": 0.34187546372413635, "learning_rate": 0.00044851834595224224, "loss": 3.3465, "step": 43450 }, { "epoch": 12.66022356776898, "grad_norm": 0.34762442111968994, "learning_rate": 0.00044834362259755383, "loss": 3.3391, "step": 43500 }, { "epoch": 12.674778761061948, "grad_norm": 0.3649243414402008, "learning_rate": 0.00044816889924286543, "loss": 3.345, "step": 43550 }, { "epoch": 12.689333954354915, "grad_norm": 0.3724057078361511, "learning_rate": 0.000447994175888177, "loss": 3.3524, "step": 43600 }, { "epoch": 12.703889147647882, "grad_norm": 0.35211169719696045, "learning_rate": 0.0004478194525334886, "loss": 3.3341, "step": 43650 }, { "epoch": 12.718444340940847, "grad_norm": 0.35109853744506836, "learning_rate": 0.0004476447291788002, "loss": 3.3446, "step": 43700 }, { "epoch": 12.732999534233814, "grad_norm": 0.3826819062232971, "learning_rate": 0.00044747000582411175, "loss": 3.3336, "step": 43750 }, { "epoch": 12.747554727526781, "grad_norm": 0.3516761064529419, "learning_rate": 0.00044729528246942335, "loss": 3.3442, "step": 43800 }, { "epoch": 12.762109920819748, "grad_norm": 0.35584548115730286, "learning_rate": 0.000447120559114735, "loss": 3.339, "step": 43850 }, { "epoch": 12.776665114112715, "grad_norm": 0.36919936537742615, "learning_rate": 0.0004469458357600466, "loss": 3.3464, "step": 43900 }, { "epoch": 12.791220307405682, "grad_norm": 0.3403640687465668, "learning_rate": 0.00044677111240535813, "loss": 3.3497, "step": 43950 }, { "epoch": 12.80577550069865, "grad_norm": 0.34703755378723145, "learning_rate": 0.0004465963890506697, "loss": 3.3499, "step": 44000 }, { "epoch": 12.80577550069865, "eval_accuracy": 0.3706825315992728, "eval_loss": 3.5486204624176025, "eval_runtime": 181.1204, "eval_samples_per_second": 91.933, "eval_steps_per_second": 5.748, "step": 44000 }, { "epoch": 12.820330693991616, "grad_norm": 0.35650208592414856, "learning_rate": 0.0004464216656959813, "loss": 3.3583, "step": 44050 }, { "epoch": 12.834885887284583, "grad_norm": 0.3516455590724945, "learning_rate": 0.00044624694234129297, "loss": 3.3528, "step": 44100 }, { "epoch": 12.84944108057755, "grad_norm": 0.38415780663490295, "learning_rate": 0.0004460722189866045, "loss": 3.3487, "step": 44150 }, { "epoch": 12.863996273870518, "grad_norm": 0.35980987548828125, "learning_rate": 0.0004458974956319161, "loss": 3.3573, "step": 44200 }, { "epoch": 12.878551467163485, "grad_norm": 0.35467320680618286, "learning_rate": 0.0004457227722772277, "loss": 3.3508, "step": 44250 }, { "epoch": 12.89310666045645, "grad_norm": 0.3336358964443207, "learning_rate": 0.00044554804892253923, "loss": 3.3512, "step": 44300 }, { "epoch": 12.907661853749417, "grad_norm": 0.3593714237213135, "learning_rate": 0.00044537332556785083, "loss": 3.3485, "step": 44350 }, { "epoch": 12.922217047042384, "grad_norm": 0.3557552099227905, "learning_rate": 0.0004451986022131625, "loss": 3.3421, "step": 44400 }, { "epoch": 12.936772240335351, "grad_norm": 0.3727822005748749, "learning_rate": 0.00044502387885847407, "loss": 3.3561, "step": 44450 }, { "epoch": 12.951327433628318, "grad_norm": 0.3670724034309387, "learning_rate": 0.0004448491555037856, "loss": 3.3642, "step": 44500 }, { "epoch": 12.965882626921285, "grad_norm": 0.37972763180732727, "learning_rate": 0.0004446744321490972, "loss": 3.3539, "step": 44550 }, { "epoch": 12.980437820214252, "grad_norm": 0.36333584785461426, "learning_rate": 0.0004444997087944088, "loss": 3.3456, "step": 44600 }, { "epoch": 12.99499301350722, "grad_norm": 0.37191566824913025, "learning_rate": 0.0004443249854397204, "loss": 3.354, "step": 44650 }, { "epoch": 13.009315323707499, "grad_norm": 0.3661990165710449, "learning_rate": 0.000444150262085032, "loss": 3.2879, "step": 44700 }, { "epoch": 13.023870517000466, "grad_norm": 0.342840313911438, "learning_rate": 0.0004439755387303436, "loss": 3.2537, "step": 44750 }, { "epoch": 13.038425710293433, "grad_norm": 0.3745648264884949, "learning_rate": 0.0004438008153756552, "loss": 3.2493, "step": 44800 }, { "epoch": 13.0529809035864, "grad_norm": 0.3433801829814911, "learning_rate": 0.00044362609202096677, "loss": 3.2534, "step": 44850 }, { "epoch": 13.067536096879367, "grad_norm": 0.40368548035621643, "learning_rate": 0.0004434513686662783, "loss": 3.2495, "step": 44900 }, { "epoch": 13.082091290172334, "grad_norm": 0.3416557013988495, "learning_rate": 0.00044327664531158996, "loss": 3.2603, "step": 44950 }, { "epoch": 13.0966464834653, "grad_norm": 0.35723575949668884, "learning_rate": 0.00044310192195690155, "loss": 3.2567, "step": 45000 }, { "epoch": 13.0966464834653, "eval_accuracy": 0.3703908288272385, "eval_loss": 3.5622401237487793, "eval_runtime": 180.4882, "eval_samples_per_second": 92.255, "eval_steps_per_second": 5.768, "step": 45000 }, { "epoch": 13.111201676758267, "grad_norm": 0.3603862226009369, "learning_rate": 0.00044292719860221315, "loss": 3.2632, "step": 45050 }, { "epoch": 13.125756870051234, "grad_norm": 0.348423570394516, "learning_rate": 0.0004427524752475247, "loss": 3.2712, "step": 45100 }, { "epoch": 13.1403120633442, "grad_norm": 0.35942065715789795, "learning_rate": 0.0004425777518928363, "loss": 3.2842, "step": 45150 }, { "epoch": 13.154867256637168, "grad_norm": 0.39489057660102844, "learning_rate": 0.0004424030285381479, "loss": 3.2781, "step": 45200 }, { "epoch": 13.169422449930135, "grad_norm": 0.36711356043815613, "learning_rate": 0.0004422283051834595, "loss": 3.2889, "step": 45250 }, { "epoch": 13.183977643223102, "grad_norm": 0.3640125095844269, "learning_rate": 0.00044205358182877107, "loss": 3.2888, "step": 45300 }, { "epoch": 13.198532836516069, "grad_norm": 0.3865148425102234, "learning_rate": 0.00044187885847408266, "loss": 3.2854, "step": 45350 }, { "epoch": 13.213088029809036, "grad_norm": 0.36336416006088257, "learning_rate": 0.00044170413511939425, "loss": 3.2787, "step": 45400 }, { "epoch": 13.227643223102003, "grad_norm": 0.3866657018661499, "learning_rate": 0.0004415294117647058, "loss": 3.2815, "step": 45450 }, { "epoch": 13.24219841639497, "grad_norm": 0.3744238018989563, "learning_rate": 0.00044135468841001744, "loss": 3.2905, "step": 45500 }, { "epoch": 13.256753609687937, "grad_norm": 0.3592257797718048, "learning_rate": 0.00044117996505532904, "loss": 3.2828, "step": 45550 }, { "epoch": 13.271308802980904, "grad_norm": 0.34845948219299316, "learning_rate": 0.00044100524170064063, "loss": 3.2852, "step": 45600 }, { "epoch": 13.285863996273871, "grad_norm": 0.3521272540092468, "learning_rate": 0.0004408305183459522, "loss": 3.285, "step": 45650 }, { "epoch": 13.300419189566837, "grad_norm": 0.3670864701271057, "learning_rate": 0.00044065579499126377, "loss": 3.3033, "step": 45700 }, { "epoch": 13.314974382859804, "grad_norm": 0.35708314180374146, "learning_rate": 0.00044048107163657536, "loss": 3.2781, "step": 45750 }, { "epoch": 13.32952957615277, "grad_norm": 0.38420259952545166, "learning_rate": 0.000440306348281887, "loss": 3.3038, "step": 45800 }, { "epoch": 13.344084769445738, "grad_norm": 0.3823310434818268, "learning_rate": 0.0004401316249271986, "loss": 3.3092, "step": 45850 }, { "epoch": 13.358639962738705, "grad_norm": 0.3686697483062744, "learning_rate": 0.00043995690157251014, "loss": 3.291, "step": 45900 }, { "epoch": 13.373195156031672, "grad_norm": 0.36878153681755066, "learning_rate": 0.00043978217821782174, "loss": 3.3143, "step": 45950 }, { "epoch": 13.387750349324639, "grad_norm": 0.36531081795692444, "learning_rate": 0.00043960745486313333, "loss": 3.2984, "step": 46000 }, { "epoch": 13.387750349324639, "eval_accuracy": 0.3708552967382294, "eval_loss": 3.5585896968841553, "eval_runtime": 180.4424, "eval_samples_per_second": 92.279, "eval_steps_per_second": 5.769, "step": 46000 }, { "epoch": 13.402305542617606, "grad_norm": 0.3713867664337158, "learning_rate": 0.00043943273150844487, "loss": 3.3138, "step": 46050 }, { "epoch": 13.416860735910573, "grad_norm": 0.3893279433250427, "learning_rate": 0.0004392580081537565, "loss": 3.3096, "step": 46100 }, { "epoch": 13.43141592920354, "grad_norm": 0.3807111978530884, "learning_rate": 0.0004390832847990681, "loss": 3.3067, "step": 46150 }, { "epoch": 13.445971122496507, "grad_norm": 0.37018829584121704, "learning_rate": 0.0004389085614443797, "loss": 3.3131, "step": 46200 }, { "epoch": 13.460526315789474, "grad_norm": 0.37271586060523987, "learning_rate": 0.00043873383808969125, "loss": 3.2943, "step": 46250 }, { "epoch": 13.475081509082441, "grad_norm": 0.43294456601142883, "learning_rate": 0.00043855911473500284, "loss": 3.3182, "step": 46300 }, { "epoch": 13.489636702375407, "grad_norm": 0.3804895877838135, "learning_rate": 0.0004383843913803145, "loss": 3.2997, "step": 46350 }, { "epoch": 13.504191895668374, "grad_norm": 0.37023502588272095, "learning_rate": 0.0004382096680256261, "loss": 3.3076, "step": 46400 }, { "epoch": 13.51874708896134, "grad_norm": 0.3998282849788666, "learning_rate": 0.0004380349446709376, "loss": 3.3182, "step": 46450 }, { "epoch": 13.533302282254308, "grad_norm": 0.3805851340293884, "learning_rate": 0.0004378602213162492, "loss": 3.3288, "step": 46500 }, { "epoch": 13.547857475547275, "grad_norm": 0.3723335564136505, "learning_rate": 0.0004376854979615608, "loss": 3.3236, "step": 46550 }, { "epoch": 13.562412668840242, "grad_norm": 0.3624033033847809, "learning_rate": 0.0004375107746068724, "loss": 3.3236, "step": 46600 }, { "epoch": 13.576967862133209, "grad_norm": 0.37310343980789185, "learning_rate": 0.000437336051252184, "loss": 3.3204, "step": 46650 }, { "epoch": 13.591523055426176, "grad_norm": 0.3635174632072449, "learning_rate": 0.0004371613278974956, "loss": 3.3321, "step": 46700 }, { "epoch": 13.606078248719143, "grad_norm": 0.37709909677505493, "learning_rate": 0.0004369866045428072, "loss": 3.3223, "step": 46750 }, { "epoch": 13.62063344201211, "grad_norm": 0.38971418142318726, "learning_rate": 0.0004368118811881188, "loss": 3.3053, "step": 46800 }, { "epoch": 13.635188635305077, "grad_norm": 0.3583306670188904, "learning_rate": 0.0004366371578334303, "loss": 3.323, "step": 46850 }, { "epoch": 13.649743828598044, "grad_norm": 0.3505880534648895, "learning_rate": 0.000436462434478742, "loss": 3.3114, "step": 46900 }, { "epoch": 13.664299021891011, "grad_norm": 0.36524346470832825, "learning_rate": 0.00043628771112405357, "loss": 3.3328, "step": 46950 }, { "epoch": 13.678854215183978, "grad_norm": 0.3893430233001709, "learning_rate": 0.00043611298776936516, "loss": 3.3315, "step": 47000 }, { "epoch": 13.678854215183978, "eval_accuracy": 0.3712787476196313, "eval_loss": 3.549983501434326, "eval_runtime": 180.4535, "eval_samples_per_second": 92.273, "eval_steps_per_second": 5.769, "step": 47000 }, { "epoch": 13.693409408476944, "grad_norm": 0.34820255637168884, "learning_rate": 0.0004359382644146767, "loss": 3.3291, "step": 47050 }, { "epoch": 13.70796460176991, "grad_norm": 0.3759825825691223, "learning_rate": 0.0004357635410599883, "loss": 3.3256, "step": 47100 }, { "epoch": 13.722519795062878, "grad_norm": 0.3566809594631195, "learning_rate": 0.0004355888177052999, "loss": 3.3287, "step": 47150 }, { "epoch": 13.737074988355845, "grad_norm": 0.36069244146347046, "learning_rate": 0.00043541409435061154, "loss": 3.3126, "step": 47200 }, { "epoch": 13.751630181648812, "grad_norm": 0.34476426243782043, "learning_rate": 0.0004352393709959231, "loss": 3.3293, "step": 47250 }, { "epoch": 13.766185374941779, "grad_norm": 0.338467001914978, "learning_rate": 0.0004350646476412347, "loss": 3.3335, "step": 47300 }, { "epoch": 13.780740568234746, "grad_norm": 0.3689163625240326, "learning_rate": 0.00043488992428654627, "loss": 3.3423, "step": 47350 }, { "epoch": 13.795295761527713, "grad_norm": 0.3408607542514801, "learning_rate": 0.0004347152009318578, "loss": 3.3361, "step": 47400 }, { "epoch": 13.80985095482068, "grad_norm": 0.3644997775554657, "learning_rate": 0.00043454047757716946, "loss": 3.3273, "step": 47450 }, { "epoch": 13.824406148113647, "grad_norm": 0.36501219868659973, "learning_rate": 0.00043436575422248105, "loss": 3.3387, "step": 47500 }, { "epoch": 13.838961341406614, "grad_norm": 0.3601604104042053, "learning_rate": 0.00043419103086779265, "loss": 3.3355, "step": 47550 }, { "epoch": 13.853516534699581, "grad_norm": 0.3577875792980194, "learning_rate": 0.0004340163075131042, "loss": 3.3324, "step": 47600 }, { "epoch": 13.868071727992549, "grad_norm": 0.34529775381088257, "learning_rate": 0.0004338415841584158, "loss": 3.3308, "step": 47650 }, { "epoch": 13.882626921285514, "grad_norm": 0.37057533860206604, "learning_rate": 0.0004336668608037274, "loss": 3.3261, "step": 47700 }, { "epoch": 13.89718211457848, "grad_norm": 0.34348443150520325, "learning_rate": 0.000433492137449039, "loss": 3.3411, "step": 47750 }, { "epoch": 13.911737307871448, "grad_norm": 0.34097886085510254, "learning_rate": 0.00043331741409435056, "loss": 3.3308, "step": 47800 }, { "epoch": 13.926292501164415, "grad_norm": 0.3649720549583435, "learning_rate": 0.00043314269073966216, "loss": 3.3425, "step": 47850 }, { "epoch": 13.940847694457382, "grad_norm": 0.3554098606109619, "learning_rate": 0.00043296796738497375, "loss": 3.3524, "step": 47900 }, { "epoch": 13.955402887750349, "grad_norm": 0.36760854721069336, "learning_rate": 0.00043279324403028535, "loss": 3.3368, "step": 47950 }, { "epoch": 13.969958081043316, "grad_norm": 0.3529110252857208, "learning_rate": 0.0004326185206755969, "loss": 3.3493, "step": 48000 }, { "epoch": 13.969958081043316, "eval_accuracy": 0.37196357922827106, "eval_loss": 3.5378243923187256, "eval_runtime": 180.495, "eval_samples_per_second": 92.252, "eval_steps_per_second": 5.767, "step": 48000 }, { "epoch": 13.984513274336283, "grad_norm": 0.36286503076553345, "learning_rate": 0.00043244379732090854, "loss": 3.3509, "step": 48050 }, { "epoch": 13.99906846762925, "grad_norm": 0.37462398409843445, "learning_rate": 0.00043226907396622013, "loss": 3.3332, "step": 48100 }, { "epoch": 14.01339077782953, "grad_norm": 0.3753434419631958, "learning_rate": 0.0004320943506115317, "loss": 3.2436, "step": 48150 }, { "epoch": 14.027945971122497, "grad_norm": 0.35388123989105225, "learning_rate": 0.00043191962725684326, "loss": 3.2303, "step": 48200 }, { "epoch": 14.042501164415464, "grad_norm": 0.34403374791145325, "learning_rate": 0.00043174490390215486, "loss": 3.2336, "step": 48250 }, { "epoch": 14.057056357708431, "grad_norm": 0.37131327390670776, "learning_rate": 0.0004315701805474665, "loss": 3.2411, "step": 48300 }, { "epoch": 14.071611551001398, "grad_norm": 0.36330315470695496, "learning_rate": 0.0004313954571927781, "loss": 3.2457, "step": 48350 }, { "epoch": 14.086166744294363, "grad_norm": 0.35816964507102966, "learning_rate": 0.00043122073383808964, "loss": 3.2456, "step": 48400 }, { "epoch": 14.10072193758733, "grad_norm": 0.3898118734359741, "learning_rate": 0.00043104601048340124, "loss": 3.2493, "step": 48450 }, { "epoch": 14.115277130880298, "grad_norm": 0.3512522578239441, "learning_rate": 0.00043087128712871283, "loss": 3.259, "step": 48500 }, { "epoch": 14.129832324173265, "grad_norm": 0.39272579550743103, "learning_rate": 0.00043069656377402437, "loss": 3.2569, "step": 48550 }, { "epoch": 14.144387517466232, "grad_norm": 0.3540444076061249, "learning_rate": 0.000430521840419336, "loss": 3.2633, "step": 48600 }, { "epoch": 14.158942710759199, "grad_norm": 0.3757491409778595, "learning_rate": 0.0004303471170646476, "loss": 3.2588, "step": 48650 }, { "epoch": 14.173497904052166, "grad_norm": 0.37900933623313904, "learning_rate": 0.0004301723937099592, "loss": 3.2658, "step": 48700 }, { "epoch": 14.188053097345133, "grad_norm": 0.39364224672317505, "learning_rate": 0.00042999767035527075, "loss": 3.2744, "step": 48750 }, { "epoch": 14.2026082906381, "grad_norm": 0.3963830769062042, "learning_rate": 0.00042982294700058234, "loss": 3.2708, "step": 48800 }, { "epoch": 14.217163483931067, "grad_norm": 0.35951411724090576, "learning_rate": 0.000429648223645894, "loss": 3.2666, "step": 48850 }, { "epoch": 14.231718677224034, "grad_norm": 0.4134068787097931, "learning_rate": 0.0004294735002912056, "loss": 3.2653, "step": 48900 }, { "epoch": 14.246273870517001, "grad_norm": 0.36823442578315735, "learning_rate": 0.0004292987769365172, "loss": 3.2736, "step": 48950 }, { "epoch": 14.260829063809968, "grad_norm": 0.409718781709671, "learning_rate": 0.0004291240535818287, "loss": 3.2806, "step": 49000 }, { "epoch": 14.260829063809968, "eval_accuracy": 0.3708979591500942, "eval_loss": 3.5577683448791504, "eval_runtime": 180.4382, "eval_samples_per_second": 92.281, "eval_steps_per_second": 5.769, "step": 49000 }, { "epoch": 14.275384257102935, "grad_norm": 0.3630460798740387, "learning_rate": 0.0004289493302271403, "loss": 3.2805, "step": 49050 }, { "epoch": 14.2899394503959, "grad_norm": 0.4071013033390045, "learning_rate": 0.0004287746068724519, "loss": 3.2869, "step": 49100 }, { "epoch": 14.304494643688868, "grad_norm": 0.38805535435676575, "learning_rate": 0.00042859988351776356, "loss": 3.2882, "step": 49150 }, { "epoch": 14.319049836981835, "grad_norm": 0.3587045967578888, "learning_rate": 0.0004284251601630751, "loss": 3.2782, "step": 49200 }, { "epoch": 14.333605030274802, "grad_norm": 0.3769710659980774, "learning_rate": 0.0004282504368083867, "loss": 3.2847, "step": 49250 }, { "epoch": 14.348160223567769, "grad_norm": 0.3556223213672638, "learning_rate": 0.0004280757134536983, "loss": 3.2762, "step": 49300 }, { "epoch": 14.362715416860736, "grad_norm": 0.350798636674881, "learning_rate": 0.0004279009900990098, "loss": 3.2907, "step": 49350 }, { "epoch": 14.377270610153703, "grad_norm": 0.3763960897922516, "learning_rate": 0.0004277262667443214, "loss": 3.289, "step": 49400 }, { "epoch": 14.39182580344667, "grad_norm": 0.3522035777568817, "learning_rate": 0.00042755154338963307, "loss": 3.2861, "step": 49450 }, { "epoch": 14.406380996739637, "grad_norm": 0.35651910305023193, "learning_rate": 0.00042737682003494466, "loss": 3.2896, "step": 49500 }, { "epoch": 14.420936190032604, "grad_norm": 0.35279908776283264, "learning_rate": 0.0004272020966802562, "loss": 3.2861, "step": 49550 }, { "epoch": 14.435491383325571, "grad_norm": 0.3602607548236847, "learning_rate": 0.0004270273733255678, "loss": 3.2924, "step": 49600 }, { "epoch": 14.450046576618538, "grad_norm": 0.36753034591674805, "learning_rate": 0.0004268526499708794, "loss": 3.2983, "step": 49650 }, { "epoch": 14.464601769911505, "grad_norm": 0.388078510761261, "learning_rate": 0.00042667792661619104, "loss": 3.3127, "step": 49700 }, { "epoch": 14.47915696320447, "grad_norm": 0.36958014965057373, "learning_rate": 0.0004265032032615026, "loss": 3.3024, "step": 49750 }, { "epoch": 14.493712156497438, "grad_norm": 0.36868277192115784, "learning_rate": 0.0004263284799068142, "loss": 3.3041, "step": 49800 }, { "epoch": 14.508267349790405, "grad_norm": 0.3976775109767914, "learning_rate": 0.00042615375655212577, "loss": 3.2848, "step": 49850 }, { "epoch": 14.522822543083372, "grad_norm": 0.34939709305763245, "learning_rate": 0.00042597903319743736, "loss": 3.3041, "step": 49900 }, { "epoch": 14.537377736376339, "grad_norm": 0.35492560267448425, "learning_rate": 0.0004258043098427489, "loss": 3.3098, "step": 49950 }, { "epoch": 14.551932929669306, "grad_norm": 0.3550533354282379, "learning_rate": 0.00042562958648806055, "loss": 3.3178, "step": 50000 }, { "epoch": 14.551932929669306, "eval_accuracy": 0.3716823363864185, "eval_loss": 3.5471725463867188, "eval_runtime": 180.4163, "eval_samples_per_second": 92.292, "eval_steps_per_second": 5.77, "step": 50000 }, { "epoch": 14.566488122962273, "grad_norm": 0.3814389109611511, "learning_rate": 0.00042545486313337214, "loss": 3.2964, "step": 50050 }, { "epoch": 14.58104331625524, "grad_norm": 0.38196730613708496, "learning_rate": 0.00042528013977868374, "loss": 3.2915, "step": 50100 }, { "epoch": 14.595598509548207, "grad_norm": 0.3799495995044708, "learning_rate": 0.0004251054164239953, "loss": 3.3072, "step": 50150 }, { "epoch": 14.610153702841174, "grad_norm": 0.3424219489097595, "learning_rate": 0.0004249306930693069, "loss": 3.3003, "step": 50200 }, { "epoch": 14.624708896134141, "grad_norm": 0.3747063875198364, "learning_rate": 0.0004247559697146185, "loss": 3.3019, "step": 50250 }, { "epoch": 14.639264089427108, "grad_norm": 0.39657047390937805, "learning_rate": 0.0004245812463599301, "loss": 3.2963, "step": 50300 }, { "epoch": 14.653819282720075, "grad_norm": 0.4067077040672302, "learning_rate": 0.00042440652300524166, "loss": 3.3035, "step": 50350 }, { "epoch": 14.668374476013042, "grad_norm": 0.34821370244026184, "learning_rate": 0.00042423179965055325, "loss": 3.3222, "step": 50400 }, { "epoch": 14.682929669306008, "grad_norm": 0.4000264108181, "learning_rate": 0.00042405707629586484, "loss": 3.3049, "step": 50450 }, { "epoch": 14.697484862598975, "grad_norm": 0.39220643043518066, "learning_rate": 0.0004238823529411764, "loss": 3.3034, "step": 50500 }, { "epoch": 14.712040055891942, "grad_norm": 0.35506945848464966, "learning_rate": 0.00042370762958648803, "loss": 3.307, "step": 50550 }, { "epoch": 14.726595249184909, "grad_norm": 0.3691282868385315, "learning_rate": 0.00042353290623179963, "loss": 3.3135, "step": 50600 }, { "epoch": 14.741150442477876, "grad_norm": 0.36537739634513855, "learning_rate": 0.0004233581828771112, "loss": 3.3077, "step": 50650 }, { "epoch": 14.755705635770843, "grad_norm": 0.3595852255821228, "learning_rate": 0.00042318345952242276, "loss": 3.311, "step": 50700 }, { "epoch": 14.77026082906381, "grad_norm": 0.3846476078033447, "learning_rate": 0.00042300873616773436, "loss": 3.3165, "step": 50750 }, { "epoch": 14.784816022356777, "grad_norm": 0.39410391449928284, "learning_rate": 0.00042283401281304595, "loss": 3.3094, "step": 50800 }, { "epoch": 14.799371215649744, "grad_norm": 0.35959598422050476, "learning_rate": 0.0004226592894583576, "loss": 3.3189, "step": 50850 }, { "epoch": 14.813926408942711, "grad_norm": 0.38338419795036316, "learning_rate": 0.00042248456610366914, "loss": 3.3313, "step": 50900 }, { "epoch": 14.828481602235678, "grad_norm": 0.3378566801548004, "learning_rate": 0.00042230984274898073, "loss": 3.3199, "step": 50950 }, { "epoch": 14.843036795528645, "grad_norm": 0.36178532242774963, "learning_rate": 0.00042213511939429233, "loss": 3.3285, "step": 51000 }, { "epoch": 14.843036795528645, "eval_accuracy": 0.37214057535022255, "eval_loss": 3.541658639907837, "eval_runtime": 180.4575, "eval_samples_per_second": 92.271, "eval_steps_per_second": 5.769, "step": 51000 }, { "epoch": 14.857591988821612, "grad_norm": 0.37920406460762024, "learning_rate": 0.0004219603960396039, "loss": 3.3277, "step": 51050 }, { "epoch": 14.872147182114578, "grad_norm": 0.3401511013507843, "learning_rate": 0.0004217856726849155, "loss": 3.3211, "step": 51100 }, { "epoch": 14.886702375407545, "grad_norm": 0.3963386118412018, "learning_rate": 0.0004216109493302271, "loss": 3.3176, "step": 51150 }, { "epoch": 14.901257568700512, "grad_norm": 0.363406240940094, "learning_rate": 0.0004214362259755387, "loss": 3.3229, "step": 51200 }, { "epoch": 14.915812761993479, "grad_norm": 0.37761637568473816, "learning_rate": 0.0004212615026208503, "loss": 3.3187, "step": 51250 }, { "epoch": 14.930367955286446, "grad_norm": 0.34835371375083923, "learning_rate": 0.00042108677926616184, "loss": 3.3247, "step": 51300 }, { "epoch": 14.944923148579413, "grad_norm": 0.3946931064128876, "learning_rate": 0.00042091205591147343, "loss": 3.3359, "step": 51350 }, { "epoch": 14.95947834187238, "grad_norm": 0.35312914848327637, "learning_rate": 0.0004207373325567851, "loss": 3.3206, "step": 51400 }, { "epoch": 14.974033535165347, "grad_norm": 0.3725389838218689, "learning_rate": 0.0004205626092020967, "loss": 3.3231, "step": 51450 }, { "epoch": 14.988588728458314, "grad_norm": 0.3966827094554901, "learning_rate": 0.0004203878858474082, "loss": 3.3251, "step": 51500 }, { "epoch": 15.002911038658594, "grad_norm": 0.36925891041755676, "learning_rate": 0.0004202131624927198, "loss": 3.2951, "step": 51550 }, { "epoch": 15.01746623195156, "grad_norm": 0.3605971336364746, "learning_rate": 0.0004200384391380314, "loss": 3.224, "step": 51600 }, { "epoch": 15.032021425244528, "grad_norm": 0.35993263125419617, "learning_rate": 0.00041986371578334305, "loss": 3.2213, "step": 51650 }, { "epoch": 15.046576618537495, "grad_norm": 0.363483726978302, "learning_rate": 0.0004196889924286546, "loss": 3.2261, "step": 51700 }, { "epoch": 15.06113181183046, "grad_norm": 0.36927178502082825, "learning_rate": 0.0004195142690739662, "loss": 3.2109, "step": 51750 }, { "epoch": 15.075687005123427, "grad_norm": 0.3780657947063446, "learning_rate": 0.0004193395457192778, "loss": 3.2325, "step": 51800 }, { "epoch": 15.090242198416394, "grad_norm": 0.36395806074142456, "learning_rate": 0.0004191648223645893, "loss": 3.2292, "step": 51850 }, { "epoch": 15.104797391709361, "grad_norm": 0.39396318793296814, "learning_rate": 0.0004189900990099009, "loss": 3.2328, "step": 51900 }, { "epoch": 15.119352585002328, "grad_norm": 0.37105056643486023, "learning_rate": 0.00041881537565521256, "loss": 3.2263, "step": 51950 }, { "epoch": 15.133907778295296, "grad_norm": 0.3839941918849945, "learning_rate": 0.00041864065230052416, "loss": 3.2349, "step": 52000 }, { "epoch": 15.133907778295296, "eval_accuracy": 0.3715804402126257, "eval_loss": 3.5551469326019287, "eval_runtime": 180.5066, "eval_samples_per_second": 92.246, "eval_steps_per_second": 5.767, "step": 52000 }, { "epoch": 15.148462971588263, "grad_norm": 0.43423089385032654, "learning_rate": 0.00041846592894583575, "loss": 3.2523, "step": 52050 }, { "epoch": 15.16301816488123, "grad_norm": 0.36436381936073303, "learning_rate": 0.0004182912055911473, "loss": 3.2399, "step": 52100 }, { "epoch": 15.177573358174197, "grad_norm": 0.38402456045150757, "learning_rate": 0.0004181164822364589, "loss": 3.2547, "step": 52150 }, { "epoch": 15.192128551467164, "grad_norm": 0.4141615033149719, "learning_rate": 0.0004179417588817705, "loss": 3.2399, "step": 52200 }, { "epoch": 15.20668374476013, "grad_norm": 0.3821197748184204, "learning_rate": 0.00041776703552708213, "loss": 3.254, "step": 52250 }, { "epoch": 15.221238938053098, "grad_norm": 0.3751130998134613, "learning_rate": 0.00041759231217239367, "loss": 3.2598, "step": 52300 }, { "epoch": 15.235794131346065, "grad_norm": 0.35358160734176636, "learning_rate": 0.00041741758881770527, "loss": 3.2479, "step": 52350 }, { "epoch": 15.250349324639032, "grad_norm": 0.36289575695991516, "learning_rate": 0.00041724286546301686, "loss": 3.2578, "step": 52400 }, { "epoch": 15.264904517931997, "grad_norm": 0.3592086136341095, "learning_rate": 0.0004170681421083284, "loss": 3.2531, "step": 52450 }, { "epoch": 15.279459711224964, "grad_norm": 0.3920702040195465, "learning_rate": 0.00041689341875364005, "loss": 3.26, "step": 52500 }, { "epoch": 15.294014904517931, "grad_norm": 0.3840749263763428, "learning_rate": 0.00041671869539895164, "loss": 3.2673, "step": 52550 }, { "epoch": 15.308570097810899, "grad_norm": 0.35413870215415955, "learning_rate": 0.00041654397204426324, "loss": 3.2657, "step": 52600 }, { "epoch": 15.323125291103866, "grad_norm": 0.4196890592575073, "learning_rate": 0.0004163692486895748, "loss": 3.2858, "step": 52650 }, { "epoch": 15.337680484396833, "grad_norm": 0.35870644450187683, "learning_rate": 0.00041619452533488637, "loss": 3.275, "step": 52700 }, { "epoch": 15.3522356776898, "grad_norm": 0.38391217589378357, "learning_rate": 0.00041601980198019797, "loss": 3.2625, "step": 52750 }, { "epoch": 15.366790870982767, "grad_norm": 0.39391493797302246, "learning_rate": 0.0004158450786255096, "loss": 3.2834, "step": 52800 }, { "epoch": 15.381346064275734, "grad_norm": 0.3893953859806061, "learning_rate": 0.00041567035527082115, "loss": 3.2694, "step": 52850 }, { "epoch": 15.3959012575687, "grad_norm": 0.3612153232097626, "learning_rate": 0.00041549563191613275, "loss": 3.2833, "step": 52900 }, { "epoch": 15.410456450861668, "grad_norm": 0.37496301531791687, "learning_rate": 0.00041532090856144434, "loss": 3.283, "step": 52950 }, { "epoch": 15.425011644154635, "grad_norm": 0.3867912292480469, "learning_rate": 0.00041514618520675594, "loss": 3.2865, "step": 53000 }, { "epoch": 15.425011644154635, "eval_accuracy": 0.3719092816131704, "eval_loss": 3.5477328300476074, "eval_runtime": 180.446, "eval_samples_per_second": 92.277, "eval_steps_per_second": 5.769, "step": 53000 }, { "epoch": 15.439566837447602, "grad_norm": 0.36597126722335815, "learning_rate": 0.00041497146185206753, "loss": 3.2785, "step": 53050 }, { "epoch": 15.454122030740567, "grad_norm": 0.3563242554664612, "learning_rate": 0.0004147967384973791, "loss": 3.2796, "step": 53100 }, { "epoch": 15.468677224033534, "grad_norm": 0.38371267914772034, "learning_rate": 0.0004146220151426907, "loss": 3.2781, "step": 53150 }, { "epoch": 15.483232417326501, "grad_norm": 0.37337714433670044, "learning_rate": 0.0004144472917880023, "loss": 3.2827, "step": 53200 }, { "epoch": 15.497787610619469, "grad_norm": 0.40076225996017456, "learning_rate": 0.00041427256843331385, "loss": 3.2852, "step": 53250 }, { "epoch": 15.512342803912436, "grad_norm": 0.3688531816005707, "learning_rate": 0.00041409784507862545, "loss": 3.2753, "step": 53300 }, { "epoch": 15.526897997205403, "grad_norm": 0.3890566825866699, "learning_rate": 0.0004139231217239371, "loss": 3.2987, "step": 53350 }, { "epoch": 15.54145319049837, "grad_norm": 0.3741496801376343, "learning_rate": 0.0004137483983692487, "loss": 3.2979, "step": 53400 }, { "epoch": 15.556008383791337, "grad_norm": 0.3602406680583954, "learning_rate": 0.00041357367501456023, "loss": 3.3013, "step": 53450 }, { "epoch": 15.570563577084304, "grad_norm": 0.356662392616272, "learning_rate": 0.0004133989516598718, "loss": 3.289, "step": 53500 }, { "epoch": 15.585118770377271, "grad_norm": 0.362364262342453, "learning_rate": 0.0004132242283051834, "loss": 3.2831, "step": 53550 }, { "epoch": 15.599673963670238, "grad_norm": 0.38070622086524963, "learning_rate": 0.00041304950495049496, "loss": 3.2797, "step": 53600 }, { "epoch": 15.614229156963205, "grad_norm": 0.3662395477294922, "learning_rate": 0.0004128747815958066, "loss": 3.2879, "step": 53650 }, { "epoch": 15.628784350256172, "grad_norm": 0.3733738660812378, "learning_rate": 0.0004127000582411182, "loss": 3.2984, "step": 53700 }, { "epoch": 15.64333954354914, "grad_norm": 0.40175968408584595, "learning_rate": 0.0004125253348864298, "loss": 3.295, "step": 53750 }, { "epoch": 15.657894736842106, "grad_norm": 0.39177513122558594, "learning_rate": 0.00041235061153174134, "loss": 3.2958, "step": 53800 }, { "epoch": 15.672449930135071, "grad_norm": 0.3641617000102997, "learning_rate": 0.00041217588817705293, "loss": 3.2878, "step": 53850 }, { "epoch": 15.687005123428039, "grad_norm": 0.38100600242614746, "learning_rate": 0.0004120011648223646, "loss": 3.2933, "step": 53900 }, { "epoch": 15.701560316721006, "grad_norm": 0.3599903881549835, "learning_rate": 0.0004118264414676762, "loss": 3.3017, "step": 53950 }, { "epoch": 15.716115510013973, "grad_norm": 0.37543267011642456, "learning_rate": 0.0004116517181129877, "loss": 3.2972, "step": 54000 }, { "epoch": 15.716115510013973, "eval_accuracy": 0.372320979764031, "eval_loss": 3.5453689098358154, "eval_runtime": 180.3536, "eval_samples_per_second": 92.324, "eval_steps_per_second": 5.772, "step": 54000 }, { "epoch": 15.73067070330694, "grad_norm": 0.4046041965484619, "learning_rate": 0.0004114769947582993, "loss": 3.306, "step": 54050 }, { "epoch": 15.745225896599907, "grad_norm": 0.37718820571899414, "learning_rate": 0.0004113022714036109, "loss": 3.3036, "step": 54100 }, { "epoch": 15.759781089892874, "grad_norm": 0.38076573610305786, "learning_rate": 0.0004111275480489225, "loss": 3.3027, "step": 54150 }, { "epoch": 15.774336283185841, "grad_norm": 0.389849454164505, "learning_rate": 0.0004109528246942341, "loss": 3.3144, "step": 54200 }, { "epoch": 15.788891476478808, "grad_norm": 0.3586804270744324, "learning_rate": 0.0004107781013395457, "loss": 3.3095, "step": 54250 }, { "epoch": 15.803446669771775, "grad_norm": 0.3473518490791321, "learning_rate": 0.0004106033779848573, "loss": 3.3077, "step": 54300 }, { "epoch": 15.818001863064742, "grad_norm": 0.3816808760166168, "learning_rate": 0.0004104286546301689, "loss": 3.3148, "step": 54350 }, { "epoch": 15.83255705635771, "grad_norm": 0.3530210554599762, "learning_rate": 0.0004102539312754804, "loss": 3.3064, "step": 54400 }, { "epoch": 15.847112249650674, "grad_norm": 0.376775860786438, "learning_rate": 0.00041007920792079206, "loss": 3.3061, "step": 54450 }, { "epoch": 15.861667442943642, "grad_norm": 0.3644925355911255, "learning_rate": 0.00040990448456610366, "loss": 3.3084, "step": 54500 }, { "epoch": 15.876222636236609, "grad_norm": 0.38478991389274597, "learning_rate": 0.00040972976121141525, "loss": 3.3095, "step": 54550 }, { "epoch": 15.890777829529576, "grad_norm": 0.3731720745563507, "learning_rate": 0.0004095550378567268, "loss": 3.3135, "step": 54600 }, { "epoch": 15.905333022822543, "grad_norm": 0.3699355125427246, "learning_rate": 0.0004093803145020384, "loss": 3.3069, "step": 54650 }, { "epoch": 15.91988821611551, "grad_norm": 0.39868977665901184, "learning_rate": 0.00040920559114735, "loss": 3.3017, "step": 54700 }, { "epoch": 15.934443409408477, "grad_norm": 0.35023972392082214, "learning_rate": 0.00040903086779266163, "loss": 3.3165, "step": 54750 }, { "epoch": 15.948998602701444, "grad_norm": 0.37012845277786255, "learning_rate": 0.00040885614443797317, "loss": 3.312, "step": 54800 }, { "epoch": 15.963553795994411, "grad_norm": 0.3557792901992798, "learning_rate": 0.00040868142108328476, "loss": 3.3121, "step": 54850 }, { "epoch": 15.978108989287378, "grad_norm": 0.35981467366218567, "learning_rate": 0.00040850669772859636, "loss": 3.3074, "step": 54900 }, { "epoch": 15.992664182580345, "grad_norm": 0.38941019773483276, "learning_rate": 0.0004083319743739079, "loss": 3.3188, "step": 54950 }, { "epoch": 16.006986492780623, "grad_norm": 0.3688517212867737, "learning_rate": 0.0004081572510192195, "loss": 3.2481, "step": 55000 }, { "epoch": 16.006986492780623, "eval_accuracy": 0.3720196397529529, "eval_loss": 3.548246383666992, "eval_runtime": 180.4235, "eval_samples_per_second": 92.288, "eval_steps_per_second": 5.77, "step": 55000 }, { "epoch": 16.02154168607359, "grad_norm": 0.3910331428050995, "learning_rate": 0.00040798252766453114, "loss": 3.1929, "step": 55050 }, { "epoch": 16.036096879366557, "grad_norm": 0.3464187979698181, "learning_rate": 0.00040780780430984273, "loss": 3.2234, "step": 55100 }, { "epoch": 16.050652072659524, "grad_norm": 0.37600189447402954, "learning_rate": 0.0004076330809551543, "loss": 3.2048, "step": 55150 }, { "epoch": 16.06520726595249, "grad_norm": 0.3939422369003296, "learning_rate": 0.00040745835760046587, "loss": 3.2186, "step": 55200 }, { "epoch": 16.079762459245458, "grad_norm": 0.37883082032203674, "learning_rate": 0.00040728363424577746, "loss": 3.2257, "step": 55250 }, { "epoch": 16.094317652538425, "grad_norm": 0.3817696273326874, "learning_rate": 0.0004071089108910891, "loss": 3.2299, "step": 55300 }, { "epoch": 16.108872845831392, "grad_norm": 0.3791786730289459, "learning_rate": 0.0004069341875364007, "loss": 3.2236, "step": 55350 }, { "epoch": 16.12342803912436, "grad_norm": 0.4004996418952942, "learning_rate": 0.00040675946418171225, "loss": 3.2305, "step": 55400 }, { "epoch": 16.137983232417326, "grad_norm": 0.4008648991584778, "learning_rate": 0.00040658474082702384, "loss": 3.2306, "step": 55450 }, { "epoch": 16.152538425710294, "grad_norm": 0.40361517667770386, "learning_rate": 0.00040641001747233543, "loss": 3.2338, "step": 55500 }, { "epoch": 16.16709361900326, "grad_norm": 0.38455691933631897, "learning_rate": 0.000406235294117647, "loss": 3.2367, "step": 55550 }, { "epoch": 16.181648812296228, "grad_norm": 0.3895011842250824, "learning_rate": 0.0004060605707629586, "loss": 3.241, "step": 55600 }, { "epoch": 16.196204005589195, "grad_norm": 0.4153099060058594, "learning_rate": 0.0004058858474082702, "loss": 3.238, "step": 55650 }, { "epoch": 16.21075919888216, "grad_norm": 0.3801526427268982, "learning_rate": 0.0004057111240535818, "loss": 3.2406, "step": 55700 }, { "epoch": 16.22531439217513, "grad_norm": 0.39654862880706787, "learning_rate": 0.00040553640069889335, "loss": 3.2473, "step": 55750 }, { "epoch": 16.239869585468096, "grad_norm": 0.3951687812805176, "learning_rate": 0.00040536167734420495, "loss": 3.2479, "step": 55800 }, { "epoch": 16.254424778761063, "grad_norm": 0.40282097458839417, "learning_rate": 0.0004051869539895166, "loss": 3.2512, "step": 55850 }, { "epoch": 16.26897997205403, "grad_norm": 0.40193888545036316, "learning_rate": 0.0004050122306348282, "loss": 3.2385, "step": 55900 }, { "epoch": 16.283535165346997, "grad_norm": 0.3884776532649994, "learning_rate": 0.00040483750728013973, "loss": 3.2413, "step": 55950 }, { "epoch": 16.298090358639964, "grad_norm": 0.3863818943500519, "learning_rate": 0.0004046627839254513, "loss": 3.2594, "step": 56000 }, { "epoch": 16.298090358639964, "eval_accuracy": 0.3720323327019375, "eval_loss": 3.5523624420166016, "eval_runtime": 180.1219, "eval_samples_per_second": 92.443, "eval_steps_per_second": 5.779, "step": 56000 }, { "epoch": 16.31264555193293, "grad_norm": 0.3894936442375183, "learning_rate": 0.0004044880605707629, "loss": 3.2471, "step": 56050 }, { "epoch": 16.3272007452259, "grad_norm": 0.40686696767807007, "learning_rate": 0.00040431333721607446, "loss": 3.25, "step": 56100 }, { "epoch": 16.341755938518865, "grad_norm": 0.36061444878578186, "learning_rate": 0.0004041386138613861, "loss": 3.2657, "step": 56150 }, { "epoch": 16.35631113181183, "grad_norm": 0.35572490096092224, "learning_rate": 0.0004039638905066977, "loss": 3.2598, "step": 56200 }, { "epoch": 16.370866325104796, "grad_norm": 0.37899231910705566, "learning_rate": 0.0004037891671520093, "loss": 3.2609, "step": 56250 }, { "epoch": 16.385421518397763, "grad_norm": 0.38011741638183594, "learning_rate": 0.0004036144437973209, "loss": 3.254, "step": 56300 }, { "epoch": 16.39997671169073, "grad_norm": 0.3768341839313507, "learning_rate": 0.00040343972044263243, "loss": 3.27, "step": 56350 }, { "epoch": 16.414531904983697, "grad_norm": 0.38822469115257263, "learning_rate": 0.0004032649970879441, "loss": 3.2633, "step": 56400 }, { "epoch": 16.429087098276664, "grad_norm": 0.3971615731716156, "learning_rate": 0.00040309027373325567, "loss": 3.2643, "step": 56450 }, { "epoch": 16.44364229156963, "grad_norm": 0.37578991055488586, "learning_rate": 0.00040291555037856727, "loss": 3.2667, "step": 56500 }, { "epoch": 16.4581974848626, "grad_norm": 0.3829331696033478, "learning_rate": 0.0004027408270238788, "loss": 3.2809, "step": 56550 }, { "epoch": 16.472752678155565, "grad_norm": 0.4001660645008087, "learning_rate": 0.0004025661036691904, "loss": 3.2716, "step": 56600 }, { "epoch": 16.487307871448532, "grad_norm": 0.40043875575065613, "learning_rate": 0.000402391380314502, "loss": 3.2557, "step": 56650 }, { "epoch": 16.5018630647415, "grad_norm": 0.4183054566383362, "learning_rate": 0.00040221665695981364, "loss": 3.2633, "step": 56700 }, { "epoch": 16.516418258034467, "grad_norm": 0.3828704059123993, "learning_rate": 0.0004020419336051252, "loss": 3.2798, "step": 56750 }, { "epoch": 16.530973451327434, "grad_norm": 0.3919852077960968, "learning_rate": 0.0004018672102504368, "loss": 3.2812, "step": 56800 }, { "epoch": 16.5455286446204, "grad_norm": 0.371501624584198, "learning_rate": 0.00040169248689574837, "loss": 3.266, "step": 56850 }, { "epoch": 16.560083837913368, "grad_norm": 0.37707042694091797, "learning_rate": 0.0004015177635410599, "loss": 3.2695, "step": 56900 }, { "epoch": 16.574639031206335, "grad_norm": 0.3715432584285736, "learning_rate": 0.0004013430401863715, "loss": 3.271, "step": 56950 }, { "epoch": 16.589194224499302, "grad_norm": 0.3684331178665161, "learning_rate": 0.00040116831683168315, "loss": 3.2754, "step": 57000 }, { "epoch": 16.589194224499302, "eval_accuracy": 0.3725670819415652, "eval_loss": 3.543412446975708, "eval_runtime": 179.9314, "eval_samples_per_second": 92.541, "eval_steps_per_second": 5.786, "step": 57000 }, { "epoch": 16.60374941779227, "grad_norm": 0.36808183789253235, "learning_rate": 0.00040099359347699475, "loss": 3.2798, "step": 57050 }, { "epoch": 16.618304611085236, "grad_norm": 0.3891730010509491, "learning_rate": 0.0004008188701223063, "loss": 3.2758, "step": 57100 }, { "epoch": 16.632859804378203, "grad_norm": 0.3876141905784607, "learning_rate": 0.0004006441467676179, "loss": 3.2796, "step": 57150 }, { "epoch": 16.64741499767117, "grad_norm": 0.39390671253204346, "learning_rate": 0.0004004694234129295, "loss": 3.2877, "step": 57200 }, { "epoch": 16.661970190964137, "grad_norm": 0.3696586489677429, "learning_rate": 0.0004002947000582411, "loss": 3.2796, "step": 57250 }, { "epoch": 16.676525384257104, "grad_norm": 0.3565455973148346, "learning_rate": 0.00040011997670355267, "loss": 3.2703, "step": 57300 }, { "epoch": 16.69108057755007, "grad_norm": 0.4118868410587311, "learning_rate": 0.00039994525334886426, "loss": 3.2922, "step": 57350 }, { "epoch": 16.70563577084304, "grad_norm": 0.37132760882377625, "learning_rate": 0.00039977052999417585, "loss": 3.2914, "step": 57400 }, { "epoch": 16.720190964136005, "grad_norm": 0.36809346079826355, "learning_rate": 0.00039959580663948745, "loss": 3.2891, "step": 57450 }, { "epoch": 16.734746157428972, "grad_norm": 0.36107897758483887, "learning_rate": 0.000399421083284799, "loss": 3.291, "step": 57500 }, { "epoch": 16.749301350721936, "grad_norm": 0.4199921786785126, "learning_rate": 0.00039924635993011064, "loss": 3.2868, "step": 57550 }, { "epoch": 16.763856544014903, "grad_norm": 0.36306288838386536, "learning_rate": 0.00039907163657542223, "loss": 3.2899, "step": 57600 }, { "epoch": 16.77841173730787, "grad_norm": 0.38478779792785645, "learning_rate": 0.0003988969132207338, "loss": 3.2837, "step": 57650 }, { "epoch": 16.792966930600837, "grad_norm": 0.3598993122577667, "learning_rate": 0.00039872218986604537, "loss": 3.2994, "step": 57700 }, { "epoch": 16.807522123893804, "grad_norm": 0.38993242383003235, "learning_rate": 0.00039854746651135696, "loss": 3.2804, "step": 57750 }, { "epoch": 16.82207731718677, "grad_norm": 0.4152110815048218, "learning_rate": 0.0003983727431566686, "loss": 3.3006, "step": 57800 }, { "epoch": 16.83663251047974, "grad_norm": 0.36189135909080505, "learning_rate": 0.0003981980198019802, "loss": 3.3056, "step": 57850 }, { "epoch": 16.851187703772705, "grad_norm": 0.39668968319892883, "learning_rate": 0.00039802329644729174, "loss": 3.2883, "step": 57900 }, { "epoch": 16.865742897065672, "grad_norm": 0.3889414072036743, "learning_rate": 0.00039784857309260334, "loss": 3.303, "step": 57950 }, { "epoch": 16.88029809035864, "grad_norm": 0.3900097608566284, "learning_rate": 0.00039767384973791493, "loss": 3.2861, "step": 58000 }, { "epoch": 16.88029809035864, "eval_accuracy": 0.3730723318275343, "eval_loss": 3.536813259124756, "eval_runtime": 179.9463, "eval_samples_per_second": 92.533, "eval_steps_per_second": 5.785, "step": 58000 }, { "epoch": 16.894853283651607, "grad_norm": 0.34834152460098267, "learning_rate": 0.00039749912638322647, "loss": 3.2945, "step": 58050 }, { "epoch": 16.909408476944574, "grad_norm": 0.372644305229187, "learning_rate": 0.0003973244030285381, "loss": 3.2985, "step": 58100 }, { "epoch": 16.92396367023754, "grad_norm": 0.37328094244003296, "learning_rate": 0.0003971496796738497, "loss": 3.2984, "step": 58150 }, { "epoch": 16.938518863530508, "grad_norm": 0.35464411973953247, "learning_rate": 0.0003969749563191613, "loss": 3.2921, "step": 58200 }, { "epoch": 16.953074056823475, "grad_norm": 0.35444575548171997, "learning_rate": 0.00039680023296447285, "loss": 3.2977, "step": 58250 }, { "epoch": 16.967629250116442, "grad_norm": 0.3729502260684967, "learning_rate": 0.00039662550960978444, "loss": 3.2904, "step": 58300 }, { "epoch": 16.98218444340941, "grad_norm": 0.37876373529434204, "learning_rate": 0.00039645078625509604, "loss": 3.2951, "step": 58350 }, { "epoch": 16.996739636702376, "grad_norm": 0.3637172281742096, "learning_rate": 0.0003962760629004077, "loss": 3.3061, "step": 58400 }, { "epoch": 17.011061946902654, "grad_norm": 0.35592201352119446, "learning_rate": 0.0003961013395457193, "loss": 3.2151, "step": 58450 }, { "epoch": 17.02561714019562, "grad_norm": 0.38119444251060486, "learning_rate": 0.0003959266161910308, "loss": 3.1818, "step": 58500 }, { "epoch": 17.040172333488588, "grad_norm": 0.3767271339893341, "learning_rate": 0.0003957518928363424, "loss": 3.2017, "step": 58550 }, { "epoch": 17.054727526781555, "grad_norm": 0.4044979214668274, "learning_rate": 0.000395577169481654, "loss": 3.2113, "step": 58600 }, { "epoch": 17.069282720074522, "grad_norm": 0.383327454328537, "learning_rate": 0.00039540244612696566, "loss": 3.2021, "step": 58650 }, { "epoch": 17.08383791336749, "grad_norm": 0.36652854084968567, "learning_rate": 0.0003952277227722772, "loss": 3.2158, "step": 58700 }, { "epoch": 17.098393106660456, "grad_norm": 0.40328896045684814, "learning_rate": 0.0003950529994175888, "loss": 3.2248, "step": 58750 }, { "epoch": 17.112948299953423, "grad_norm": 0.4199967682361603, "learning_rate": 0.0003948782760629004, "loss": 3.2186, "step": 58800 }, { "epoch": 17.12750349324639, "grad_norm": 0.39083749055862427, "learning_rate": 0.0003947035527082119, "loss": 3.2288, "step": 58850 }, { "epoch": 17.142058686539357, "grad_norm": 0.3882003426551819, "learning_rate": 0.0003945288293535235, "loss": 3.214, "step": 58900 }, { "epoch": 17.156613879832324, "grad_norm": 0.38862890005111694, "learning_rate": 0.00039435410599883517, "loss": 3.2137, "step": 58950 }, { "epoch": 17.17116907312529, "grad_norm": 0.3689446747303009, "learning_rate": 0.00039417938264414676, "loss": 3.2287, "step": 59000 }, { "epoch": 17.17116907312529, "eval_accuracy": 0.372285956627018, "eval_loss": 3.5500593185424805, "eval_runtime": 180.3273, "eval_samples_per_second": 92.338, "eval_steps_per_second": 5.773, "step": 59000 }, { "epoch": 17.18572426641826, "grad_norm": 0.403664231300354, "learning_rate": 0.0003940046592894583, "loss": 3.2241, "step": 59050 }, { "epoch": 17.200279459711226, "grad_norm": 0.3581107258796692, "learning_rate": 0.0003938299359347699, "loss": 3.2268, "step": 59100 }, { "epoch": 17.214834653004193, "grad_norm": 0.397594153881073, "learning_rate": 0.0003936552125800815, "loss": 3.2394, "step": 59150 }, { "epoch": 17.22938984629716, "grad_norm": 0.35579121112823486, "learning_rate": 0.00039348048922539314, "loss": 3.2239, "step": 59200 }, { "epoch": 17.243945039590127, "grad_norm": 0.36616387963294983, "learning_rate": 0.0003933057658707047, "loss": 3.2287, "step": 59250 }, { "epoch": 17.258500232883094, "grad_norm": 0.4169488847255707, "learning_rate": 0.0003931310425160163, "loss": 3.2367, "step": 59300 }, { "epoch": 17.27305542617606, "grad_norm": 0.3743215799331665, "learning_rate": 0.00039295631916132787, "loss": 3.2394, "step": 59350 }, { "epoch": 17.287610619469028, "grad_norm": 0.3789711594581604, "learning_rate": 0.00039278159580663946, "loss": 3.2365, "step": 59400 }, { "epoch": 17.302165812761995, "grad_norm": 0.4440571963787079, "learning_rate": 0.000392606872451951, "loss": 3.2434, "step": 59450 }, { "epoch": 17.316721006054962, "grad_norm": 0.3888919949531555, "learning_rate": 0.00039243214909726265, "loss": 3.2346, "step": 59500 }, { "epoch": 17.331276199347926, "grad_norm": 0.43626418709754944, "learning_rate": 0.00039225742574257425, "loss": 3.2527, "step": 59550 }, { "epoch": 17.345831392640893, "grad_norm": 0.4046328663825989, "learning_rate": 0.00039208270238788584, "loss": 3.2449, "step": 59600 }, { "epoch": 17.36038658593386, "grad_norm": 0.4085264205932617, "learning_rate": 0.0003919079790331974, "loss": 3.2413, "step": 59650 }, { "epoch": 17.374941779226827, "grad_norm": 0.42200592160224915, "learning_rate": 0.000391733255678509, "loss": 3.247, "step": 59700 }, { "epoch": 17.389496972519794, "grad_norm": 0.3673103153705597, "learning_rate": 0.00039155853232382057, "loss": 3.251, "step": 59750 }, { "epoch": 17.40405216581276, "grad_norm": 0.3644881248474121, "learning_rate": 0.0003913838089691322, "loss": 3.2413, "step": 59800 }, { "epoch": 17.418607359105728, "grad_norm": 0.36705729365348816, "learning_rate": 0.00039120908561444376, "loss": 3.2547, "step": 59850 }, { "epoch": 17.433162552398695, "grad_norm": 0.36907321214675903, "learning_rate": 0.00039103436225975535, "loss": 3.2521, "step": 59900 }, { "epoch": 17.447717745691662, "grad_norm": 0.3926324248313904, "learning_rate": 0.00039085963890506695, "loss": 3.2553, "step": 59950 }, { "epoch": 17.46227293898463, "grad_norm": 0.3525819480419159, "learning_rate": 0.0003906849155503785, "loss": 3.2512, "step": 60000 }, { "epoch": 17.46227293898463, "eval_accuracy": 0.37282129350317283, "eval_loss": 3.5421013832092285, "eval_runtime": 180.0414, "eval_samples_per_second": 92.484, "eval_steps_per_second": 5.782, "step": 60000 }, { "epoch": 17.476828132277596, "grad_norm": 0.38942793011665344, "learning_rate": 0.00039051019219569014, "loss": 3.2603, "step": 60050 }, { "epoch": 17.491383325570563, "grad_norm": 0.40185338258743286, "learning_rate": 0.00039033546884100173, "loss": 3.2521, "step": 60100 }, { "epoch": 17.50593851886353, "grad_norm": 0.3747898042201996, "learning_rate": 0.0003901607454863133, "loss": 3.2653, "step": 60150 }, { "epoch": 17.520493712156497, "grad_norm": 0.38964682817459106, "learning_rate": 0.00038998602213162486, "loss": 3.2776, "step": 60200 }, { "epoch": 17.535048905449464, "grad_norm": 0.3838801681995392, "learning_rate": 0.00038981129877693646, "loss": 3.2638, "step": 60250 }, { "epoch": 17.54960409874243, "grad_norm": 0.3489764928817749, "learning_rate": 0.00038963657542224805, "loss": 3.2706, "step": 60300 }, { "epoch": 17.5641592920354, "grad_norm": 0.41108909249305725, "learning_rate": 0.0003894618520675597, "loss": 3.2585, "step": 60350 }, { "epoch": 17.578714485328366, "grad_norm": 0.38499513268470764, "learning_rate": 0.00038928712871287124, "loss": 3.2572, "step": 60400 }, { "epoch": 17.593269678621333, "grad_norm": 0.40830159187316895, "learning_rate": 0.00038911240535818284, "loss": 3.2562, "step": 60450 }, { "epoch": 17.6078248719143, "grad_norm": 0.4046759009361267, "learning_rate": 0.00038893768200349443, "loss": 3.2662, "step": 60500 }, { "epoch": 17.622380065207267, "grad_norm": 0.37540534138679504, "learning_rate": 0.000388762958648806, "loss": 3.2628, "step": 60550 }, { "epoch": 17.636935258500234, "grad_norm": 0.3739311099052429, "learning_rate": 0.0003885882352941176, "loss": 3.2653, "step": 60600 }, { "epoch": 17.6514904517932, "grad_norm": 0.37558451294898987, "learning_rate": 0.0003884135119394292, "loss": 3.2741, "step": 60650 }, { "epoch": 17.666045645086168, "grad_norm": 0.37424349784851074, "learning_rate": 0.0003882387885847408, "loss": 3.2736, "step": 60700 }, { "epoch": 17.680600838379135, "grad_norm": 0.379001647233963, "learning_rate": 0.0003880640652300524, "loss": 3.2583, "step": 60750 }, { "epoch": 17.695156031672102, "grad_norm": 0.36271026730537415, "learning_rate": 0.00038788934187536394, "loss": 3.2785, "step": 60800 }, { "epoch": 17.70971122496507, "grad_norm": 0.3793847858905792, "learning_rate": 0.00038771461852067554, "loss": 3.2806, "step": 60850 }, { "epoch": 17.724266418258033, "grad_norm": 0.36626124382019043, "learning_rate": 0.0003875398951659872, "loss": 3.2764, "step": 60900 }, { "epoch": 17.738821611551, "grad_norm": 0.3531216084957123, "learning_rate": 0.0003873651718112988, "loss": 3.2689, "step": 60950 }, { "epoch": 17.753376804843967, "grad_norm": 0.3907027542591095, "learning_rate": 0.0003871904484566103, "loss": 3.2781, "step": 61000 }, { "epoch": 17.753376804843967, "eval_accuracy": 0.37280389946197173, "eval_loss": 3.5358006954193115, "eval_runtime": 180.1603, "eval_samples_per_second": 92.423, "eval_steps_per_second": 5.778, "step": 61000 }, { "epoch": 17.767931998136934, "grad_norm": 0.4098285138607025, "learning_rate": 0.0003870157251019219, "loss": 3.2861, "step": 61050 }, { "epoch": 17.7824871914299, "grad_norm": 0.35150986909866333, "learning_rate": 0.0003868410017472335, "loss": 3.2715, "step": 61100 }, { "epoch": 17.797042384722868, "grad_norm": 0.36557427048683167, "learning_rate": 0.00038666627839254505, "loss": 3.2828, "step": 61150 }, { "epoch": 17.811597578015835, "grad_norm": 0.39524245262145996, "learning_rate": 0.0003864915550378567, "loss": 3.2722, "step": 61200 }, { "epoch": 17.826152771308802, "grad_norm": 0.3873816430568695, "learning_rate": 0.0003863168316831683, "loss": 3.2796, "step": 61250 }, { "epoch": 17.84070796460177, "grad_norm": 0.40092021226882935, "learning_rate": 0.0003861421083284799, "loss": 3.2816, "step": 61300 }, { "epoch": 17.855263157894736, "grad_norm": 0.38361555337905884, "learning_rate": 0.0003859673849737914, "loss": 3.2912, "step": 61350 }, { "epoch": 17.869818351187703, "grad_norm": 0.38881802558898926, "learning_rate": 0.000385792661619103, "loss": 3.2786, "step": 61400 }, { "epoch": 17.88437354448067, "grad_norm": 0.3683280348777771, "learning_rate": 0.00038561793826441467, "loss": 3.2749, "step": 61450 }, { "epoch": 17.898928737773637, "grad_norm": 0.39087674021720886, "learning_rate": 0.00038544321490972626, "loss": 3.2871, "step": 61500 }, { "epoch": 17.913483931066605, "grad_norm": 0.37165287137031555, "learning_rate": 0.0003852684915550378, "loss": 3.2775, "step": 61550 }, { "epoch": 17.92803912435957, "grad_norm": 0.3986796736717224, "learning_rate": 0.0003850937682003494, "loss": 3.2802, "step": 61600 }, { "epoch": 17.94259431765254, "grad_norm": 0.35459092259407043, "learning_rate": 0.000384919044845661, "loss": 3.2804, "step": 61650 }, { "epoch": 17.957149510945506, "grad_norm": 0.3967892825603485, "learning_rate": 0.0003847443214909726, "loss": 3.2813, "step": 61700 }, { "epoch": 17.971704704238473, "grad_norm": 0.3752075731754303, "learning_rate": 0.00038456959813628423, "loss": 3.2872, "step": 61750 }, { "epoch": 17.98625989753144, "grad_norm": 0.39225253462791443, "learning_rate": 0.0003843948747815958, "loss": 3.2836, "step": 61800 }, { "epoch": 18.000582207731718, "grad_norm": 0.42140141129493713, "learning_rate": 0.00038422015142690737, "loss": 3.282, "step": 61850 }, { "epoch": 18.015137401024685, "grad_norm": 0.40152743458747864, "learning_rate": 0.00038404542807221896, "loss": 3.1837, "step": 61900 }, { "epoch": 18.029692594317652, "grad_norm": 0.37373530864715576, "learning_rate": 0.0003838707047175305, "loss": 3.1764, "step": 61950 }, { "epoch": 18.04424778761062, "grad_norm": 0.37651729583740234, "learning_rate": 0.00038369598136284215, "loss": 3.1885, "step": 62000 }, { "epoch": 18.04424778761062, "eval_accuracy": 0.37258341823701757, "eval_loss": 3.54862904548645, "eval_runtime": 180.2349, "eval_samples_per_second": 92.385, "eval_steps_per_second": 5.776, "step": 62000 }, { "epoch": 18.058802980903586, "grad_norm": 0.3814041018486023, "learning_rate": 0.00038352125800815374, "loss": 3.1892, "step": 62050 }, { "epoch": 18.073358174196553, "grad_norm": 0.4944424629211426, "learning_rate": 0.00038334653465346534, "loss": 3.1993, "step": 62100 }, { "epoch": 18.08791336748952, "grad_norm": 0.4087458550930023, "learning_rate": 0.0003831718112987769, "loss": 3.2022, "step": 62150 }, { "epoch": 18.102468560782487, "grad_norm": 0.3899799883365631, "learning_rate": 0.0003829970879440885, "loss": 3.2063, "step": 62200 }, { "epoch": 18.117023754075454, "grad_norm": 0.4076121151447296, "learning_rate": 0.00038282236458940007, "loss": 3.2, "step": 62250 }, { "epoch": 18.13157894736842, "grad_norm": 0.3632643222808838, "learning_rate": 0.0003826476412347117, "loss": 3.2039, "step": 62300 }, { "epoch": 18.14613414066139, "grad_norm": 0.42115017771720886, "learning_rate": 0.00038247291788002326, "loss": 3.2063, "step": 62350 }, { "epoch": 18.160689333954355, "grad_norm": 0.3967747390270233, "learning_rate": 0.00038229819452533485, "loss": 3.2026, "step": 62400 }, { "epoch": 18.175244527247322, "grad_norm": 0.3931179344654083, "learning_rate": 0.00038212347117064644, "loss": 3.209, "step": 62450 }, { "epoch": 18.18979972054029, "grad_norm": 0.39802834391593933, "learning_rate": 0.000381948747815958, "loss": 3.2039, "step": 62500 }, { "epoch": 18.204354913833257, "grad_norm": 0.3884234130382538, "learning_rate": 0.0003817740244612696, "loss": 3.2183, "step": 62550 }, { "epoch": 18.218910107126224, "grad_norm": 0.3891511857509613, "learning_rate": 0.00038159930110658123, "loss": 3.2184, "step": 62600 }, { "epoch": 18.23346530041919, "grad_norm": 0.36321720480918884, "learning_rate": 0.0003814245777518928, "loss": 3.2076, "step": 62650 }, { "epoch": 18.248020493712158, "grad_norm": 0.4004864990711212, "learning_rate": 0.0003812498543972044, "loss": 3.2146, "step": 62700 }, { "epoch": 18.262575687005125, "grad_norm": 0.41579169034957886, "learning_rate": 0.00038107513104251596, "loss": 3.2436, "step": 62750 }, { "epoch": 18.277130880298092, "grad_norm": 0.3897056579589844, "learning_rate": 0.00038090040768782755, "loss": 3.2361, "step": 62800 }, { "epoch": 18.29168607359106, "grad_norm": 0.3931843340396881, "learning_rate": 0.0003807256843331392, "loss": 3.2245, "step": 62850 }, { "epoch": 18.306241266884022, "grad_norm": 0.38746121525764465, "learning_rate": 0.0003805509609784508, "loss": 3.2207, "step": 62900 }, { "epoch": 18.32079646017699, "grad_norm": 0.38906243443489075, "learning_rate": 0.00038037623762376233, "loss": 3.2328, "step": 62950 }, { "epoch": 18.335351653469957, "grad_norm": 0.3792373239994049, "learning_rate": 0.00038020151426907393, "loss": 3.2445, "step": 63000 }, { "epoch": 18.335351653469957, "eval_accuracy": 0.3725381702244337, "eval_loss": 3.545516014099121, "eval_runtime": 180.0356, "eval_samples_per_second": 92.487, "eval_steps_per_second": 5.782, "step": 63000 }, { "epoch": 18.349906846762924, "grad_norm": 0.4212195575237274, "learning_rate": 0.0003800267909143855, "loss": 3.237, "step": 63050 }, { "epoch": 18.36446204005589, "grad_norm": 0.4178350269794464, "learning_rate": 0.00037985206755969706, "loss": 3.2283, "step": 63100 }, { "epoch": 18.379017233348858, "grad_norm": 0.41265183687210083, "learning_rate": 0.0003796773442050087, "loss": 3.2409, "step": 63150 }, { "epoch": 18.393572426641825, "grad_norm": 0.3973737061023712, "learning_rate": 0.0003795026208503203, "loss": 3.2316, "step": 63200 }, { "epoch": 18.408127619934792, "grad_norm": 0.4266796112060547, "learning_rate": 0.0003793278974956319, "loss": 3.2305, "step": 63250 }, { "epoch": 18.42268281322776, "grad_norm": 0.40173739194869995, "learning_rate": 0.00037915317414094344, "loss": 3.2346, "step": 63300 }, { "epoch": 18.437238006520726, "grad_norm": 0.39043140411376953, "learning_rate": 0.00037897845078625503, "loss": 3.2434, "step": 63350 }, { "epoch": 18.451793199813693, "grad_norm": 0.373183012008667, "learning_rate": 0.0003788037274315667, "loss": 3.2368, "step": 63400 }, { "epoch": 18.46634839310666, "grad_norm": 0.42309918999671936, "learning_rate": 0.0003786290040768783, "loss": 3.2392, "step": 63450 }, { "epoch": 18.480903586399627, "grad_norm": 0.37451791763305664, "learning_rate": 0.0003784542807221898, "loss": 3.2479, "step": 63500 }, { "epoch": 18.495458779692594, "grad_norm": 0.3538275361061096, "learning_rate": 0.0003782795573675014, "loss": 3.2392, "step": 63550 }, { "epoch": 18.51001397298556, "grad_norm": 0.38027676939964294, "learning_rate": 0.000378104834012813, "loss": 3.2473, "step": 63600 }, { "epoch": 18.52456916627853, "grad_norm": 0.4012594521045685, "learning_rate": 0.0003779301106581246, "loss": 3.2405, "step": 63650 }, { "epoch": 18.539124359571495, "grad_norm": 0.41343775391578674, "learning_rate": 0.0003777553873034362, "loss": 3.2525, "step": 63700 }, { "epoch": 18.553679552864462, "grad_norm": 0.40950021147727966, "learning_rate": 0.0003775806639487478, "loss": 3.2576, "step": 63750 }, { "epoch": 18.56823474615743, "grad_norm": 0.38618969917297363, "learning_rate": 0.0003774059405940594, "loss": 3.2453, "step": 63800 }, { "epoch": 18.582789939450397, "grad_norm": 0.38107502460479736, "learning_rate": 0.000377231217239371, "loss": 3.2594, "step": 63850 }, { "epoch": 18.597345132743364, "grad_norm": 0.38566017150878906, "learning_rate": 0.0003770564938846825, "loss": 3.2561, "step": 63900 }, { "epoch": 18.61190032603633, "grad_norm": 0.39558449387550354, "learning_rate": 0.00037688177052999416, "loss": 3.2493, "step": 63950 }, { "epoch": 18.626455519329298, "grad_norm": 0.4115273952484131, "learning_rate": 0.00037670704717530576, "loss": 3.2465, "step": 64000 }, { "epoch": 18.626455519329298, "eval_accuracy": 0.37315883192431804, "eval_loss": 3.540478467941284, "eval_runtime": 180.2088, "eval_samples_per_second": 92.398, "eval_steps_per_second": 5.777, "step": 64000 }, { "epoch": 18.641010712622265, "grad_norm": 0.368210107088089, "learning_rate": 0.00037653232382061735, "loss": 3.2514, "step": 64050 }, { "epoch": 18.655565905915232, "grad_norm": 0.39872172474861145, "learning_rate": 0.0003763576004659289, "loss": 3.2559, "step": 64100 }, { "epoch": 18.6701210992082, "grad_norm": 0.4296588599681854, "learning_rate": 0.0003761828771112405, "loss": 3.2535, "step": 64150 }, { "epoch": 18.684676292501166, "grad_norm": 0.39066988229751587, "learning_rate": 0.0003760081537565521, "loss": 3.2682, "step": 64200 }, { "epoch": 18.69923148579413, "grad_norm": 0.3903511166572571, "learning_rate": 0.00037583343040186373, "loss": 3.2698, "step": 64250 }, { "epoch": 18.713786679087097, "grad_norm": 0.37758907675743103, "learning_rate": 0.00037565870704717527, "loss": 3.2598, "step": 64300 }, { "epoch": 18.728341872380064, "grad_norm": 0.40596866607666016, "learning_rate": 0.00037548398369248687, "loss": 3.2718, "step": 64350 }, { "epoch": 18.74289706567303, "grad_norm": 0.3940157890319824, "learning_rate": 0.00037530926033779846, "loss": 3.2595, "step": 64400 }, { "epoch": 18.757452258965998, "grad_norm": 0.41992583870887756, "learning_rate": 0.00037513453698311, "loss": 3.2585, "step": 64450 }, { "epoch": 18.772007452258965, "grad_norm": 0.3646014332771301, "learning_rate": 0.0003749598136284216, "loss": 3.2716, "step": 64500 }, { "epoch": 18.786562645551932, "grad_norm": 0.38287869095802307, "learning_rate": 0.00037478509027373324, "loss": 3.2749, "step": 64550 }, { "epoch": 18.8011178388449, "grad_norm": 0.3826562166213989, "learning_rate": 0.00037461036691904484, "loss": 3.2659, "step": 64600 }, { "epoch": 18.815673032137866, "grad_norm": 0.4171614348888397, "learning_rate": 0.0003744356435643564, "loss": 3.2648, "step": 64650 }, { "epoch": 18.830228225430833, "grad_norm": 0.3784530460834503, "learning_rate": 0.00037426092020966797, "loss": 3.2846, "step": 64700 }, { "epoch": 18.8447834187238, "grad_norm": 0.3979530334472656, "learning_rate": 0.00037408619685497957, "loss": 3.2685, "step": 64750 }, { "epoch": 18.859338612016767, "grad_norm": 0.36709171533584595, "learning_rate": 0.0003739114735002912, "loss": 3.2728, "step": 64800 }, { "epoch": 18.873893805309734, "grad_norm": 0.34891071915626526, "learning_rate": 0.0003737367501456028, "loss": 3.2833, "step": 64850 }, { "epoch": 18.8884489986027, "grad_norm": 0.3874875009059906, "learning_rate": 0.00037356202679091435, "loss": 3.2724, "step": 64900 }, { "epoch": 18.90300419189567, "grad_norm": 0.4058416485786438, "learning_rate": 0.00037338730343622594, "loss": 3.2844, "step": 64950 }, { "epoch": 18.917559385188635, "grad_norm": 0.44667288661003113, "learning_rate": 0.00037321258008153754, "loss": 3.2764, "step": 65000 }, { "epoch": 18.917559385188635, "eval_accuracy": 0.3735741734216465, "eval_loss": 3.5327699184417725, "eval_runtime": 180.0167, "eval_samples_per_second": 92.497, "eval_steps_per_second": 5.783, "step": 65000 }, { "epoch": 18.932114578481603, "grad_norm": 0.3843158781528473, "learning_rate": 0.0003730378567268491, "loss": 3.274, "step": 65050 }, { "epoch": 18.94666977177457, "grad_norm": 0.3540723919868469, "learning_rate": 0.0003728631333721607, "loss": 3.2812, "step": 65100 }, { "epoch": 18.961224965067537, "grad_norm": 0.40248191356658936, "learning_rate": 0.0003726884100174723, "loss": 3.275, "step": 65150 }, { "epoch": 18.975780158360504, "grad_norm": 0.34741318225860596, "learning_rate": 0.0003725136866627839, "loss": 3.2775, "step": 65200 }, { "epoch": 18.99033535165347, "grad_norm": 0.39936500787734985, "learning_rate": 0.00037233896330809545, "loss": 3.2672, "step": 65250 }, { "epoch": 19.00465766185375, "grad_norm": 0.3778507113456726, "learning_rate": 0.00037216423995340705, "loss": 3.2513, "step": 65300 }, { "epoch": 19.019212855146716, "grad_norm": 0.3780449330806732, "learning_rate": 0.0003719895165987187, "loss": 3.1684, "step": 65350 }, { "epoch": 19.033768048439683, "grad_norm": 0.39780861139297485, "learning_rate": 0.0003718147932440303, "loss": 3.1865, "step": 65400 }, { "epoch": 19.04832324173265, "grad_norm": 0.3987348675727844, "learning_rate": 0.00037164006988934183, "loss": 3.1687, "step": 65450 }, { "epoch": 19.062878435025617, "grad_norm": 0.38881710171699524, "learning_rate": 0.0003714653465346534, "loss": 3.1707, "step": 65500 }, { "epoch": 19.077433628318584, "grad_norm": 0.3946903645992279, "learning_rate": 0.000371290623179965, "loss": 3.1756, "step": 65550 }, { "epoch": 19.09198882161155, "grad_norm": 0.4039105474948883, "learning_rate": 0.00037111589982527656, "loss": 3.1861, "step": 65600 }, { "epoch": 19.106544014904518, "grad_norm": 0.3881465792655945, "learning_rate": 0.0003709411764705882, "loss": 3.2007, "step": 65650 }, { "epoch": 19.121099208197485, "grad_norm": 0.3852066695690155, "learning_rate": 0.0003707664531158998, "loss": 3.1996, "step": 65700 }, { "epoch": 19.135654401490452, "grad_norm": 0.39785265922546387, "learning_rate": 0.0003705917297612114, "loss": 3.1999, "step": 65750 }, { "epoch": 19.15020959478342, "grad_norm": 0.392558217048645, "learning_rate": 0.000370417006406523, "loss": 3.1988, "step": 65800 }, { "epoch": 19.164764788076386, "grad_norm": 0.39606714248657227, "learning_rate": 0.00037024228305183453, "loss": 3.2025, "step": 65850 }, { "epoch": 19.179319981369353, "grad_norm": 0.404738187789917, "learning_rate": 0.0003700675596971461, "loss": 3.2077, "step": 65900 }, { "epoch": 19.19387517466232, "grad_norm": 0.40327906608581543, "learning_rate": 0.0003698928363424578, "loss": 3.198, "step": 65950 }, { "epoch": 19.208430367955287, "grad_norm": 0.4272768199443817, "learning_rate": 0.00036971811298776937, "loss": 3.219, "step": 66000 }, { "epoch": 19.208430367955287, "eval_accuracy": 0.37296209121505725, "eval_loss": 3.5485544204711914, "eval_runtime": 180.1561, "eval_samples_per_second": 92.425, "eval_steps_per_second": 5.778, "step": 66000 }, { "epoch": 19.222985561248255, "grad_norm": 0.38709867000579834, "learning_rate": 0.0003695433896330809, "loss": 3.208, "step": 66050 }, { "epoch": 19.23754075454122, "grad_norm": 0.40662863850593567, "learning_rate": 0.0003693686662783925, "loss": 3.2098, "step": 66100 }, { "epoch": 19.25209594783419, "grad_norm": 0.37747031450271606, "learning_rate": 0.0003691939429237041, "loss": 3.2196, "step": 66150 }, { "epoch": 19.266651141127156, "grad_norm": 0.3744738698005676, "learning_rate": 0.00036901921956901575, "loss": 3.2079, "step": 66200 }, { "epoch": 19.281206334420123, "grad_norm": 0.3797939419746399, "learning_rate": 0.0003688444962143273, "loss": 3.2086, "step": 66250 }, { "epoch": 19.29576152771309, "grad_norm": 0.423014372587204, "learning_rate": 0.0003686697728596389, "loss": 3.2184, "step": 66300 }, { "epoch": 19.310316721006053, "grad_norm": 0.375533789396286, "learning_rate": 0.0003684950495049505, "loss": 3.2126, "step": 66350 }, { "epoch": 19.32487191429902, "grad_norm": 0.4028799831867218, "learning_rate": 0.000368320326150262, "loss": 3.2137, "step": 66400 }, { "epoch": 19.339427107591987, "grad_norm": 0.3832583427429199, "learning_rate": 0.0003681456027955736, "loss": 3.2249, "step": 66450 }, { "epoch": 19.353982300884955, "grad_norm": 0.36618658900260925, "learning_rate": 0.00036797087944088526, "loss": 3.2198, "step": 66500 }, { "epoch": 19.36853749417792, "grad_norm": 0.3598731458187103, "learning_rate": 0.00036779615608619685, "loss": 3.2163, "step": 66550 }, { "epoch": 19.38309268747089, "grad_norm": 0.39049288630485535, "learning_rate": 0.0003676214327315084, "loss": 3.2243, "step": 66600 }, { "epoch": 19.397647880763856, "grad_norm": 0.3819805383682251, "learning_rate": 0.00036744670937682, "loss": 3.2224, "step": 66650 }, { "epoch": 19.412203074056823, "grad_norm": 0.4180566072463989, "learning_rate": 0.0003672719860221316, "loss": 3.2247, "step": 66700 }, { "epoch": 19.42675826734979, "grad_norm": 0.41168153285980225, "learning_rate": 0.00036709726266744323, "loss": 3.2312, "step": 66750 }, { "epoch": 19.441313460642757, "grad_norm": 0.406559020280838, "learning_rate": 0.00036692253931275477, "loss": 3.2268, "step": 66800 }, { "epoch": 19.455868653935724, "grad_norm": 0.41155582666397095, "learning_rate": 0.00036674781595806636, "loss": 3.2348, "step": 66850 }, { "epoch": 19.47042384722869, "grad_norm": 0.38779836893081665, "learning_rate": 0.00036657309260337796, "loss": 3.2363, "step": 66900 }, { "epoch": 19.484979040521658, "grad_norm": 0.44479212164878845, "learning_rate": 0.00036639836924868955, "loss": 3.2268, "step": 66950 }, { "epoch": 19.499534233814625, "grad_norm": 0.39139696955680847, "learning_rate": 0.0003662236458940011, "loss": 3.2462, "step": 67000 }, { "epoch": 19.499534233814625, "eval_accuracy": 0.3734428954215005, "eval_loss": 3.542003870010376, "eval_runtime": 179.954, "eval_samples_per_second": 92.529, "eval_steps_per_second": 5.785, "step": 67000 }, { "epoch": 19.514089427107592, "grad_norm": 0.3847859501838684, "learning_rate": 0.00036604892253931274, "loss": 3.2392, "step": 67050 }, { "epoch": 19.52864462040056, "grad_norm": 0.3877638578414917, "learning_rate": 0.00036587419918462433, "loss": 3.2374, "step": 67100 }, { "epoch": 19.543199813693526, "grad_norm": 0.39824333786964417, "learning_rate": 0.00036569947582993593, "loss": 3.2341, "step": 67150 }, { "epoch": 19.557755006986493, "grad_norm": 0.37552469968795776, "learning_rate": 0.00036552475247524747, "loss": 3.2486, "step": 67200 }, { "epoch": 19.57231020027946, "grad_norm": 0.4169178903102875, "learning_rate": 0.00036535002912055906, "loss": 3.2467, "step": 67250 }, { "epoch": 19.586865393572428, "grad_norm": 0.3765036463737488, "learning_rate": 0.00036517530576587066, "loss": 3.2498, "step": 67300 }, { "epoch": 19.601420586865395, "grad_norm": 0.39935049414634705, "learning_rate": 0.0003650005824111823, "loss": 3.2462, "step": 67350 }, { "epoch": 19.61597578015836, "grad_norm": 0.40401580929756165, "learning_rate": 0.00036482585905649385, "loss": 3.2514, "step": 67400 }, { "epoch": 19.63053097345133, "grad_norm": 0.3723312318325043, "learning_rate": 0.00036465113570180544, "loss": 3.2506, "step": 67450 }, { "epoch": 19.645086166744296, "grad_norm": 0.36596208810806274, "learning_rate": 0.00036447641234711703, "loss": 3.2478, "step": 67500 }, { "epoch": 19.659641360037263, "grad_norm": 0.37875619530677795, "learning_rate": 0.0003643016889924286, "loss": 3.2353, "step": 67550 }, { "epoch": 19.67419655333023, "grad_norm": 0.3985027074813843, "learning_rate": 0.0003641269656377402, "loss": 3.2523, "step": 67600 }, { "epoch": 19.688751746623197, "grad_norm": 0.37452825903892517, "learning_rate": 0.0003639522422830518, "loss": 3.2468, "step": 67650 }, { "epoch": 19.70330693991616, "grad_norm": 0.38654303550720215, "learning_rate": 0.0003637775189283634, "loss": 3.2443, "step": 67700 }, { "epoch": 19.717862133209128, "grad_norm": 0.42383846640586853, "learning_rate": 0.00036360279557367495, "loss": 3.2495, "step": 67750 }, { "epoch": 19.732417326502095, "grad_norm": 0.41492125391960144, "learning_rate": 0.00036342807221898655, "loss": 3.2478, "step": 67800 }, { "epoch": 19.74697251979506, "grad_norm": 0.4153563976287842, "learning_rate": 0.00036325334886429814, "loss": 3.2594, "step": 67850 }, { "epoch": 19.76152771308803, "grad_norm": 0.3797180950641632, "learning_rate": 0.0003630786255096098, "loss": 3.2477, "step": 67900 }, { "epoch": 19.776082906380996, "grad_norm": 0.45580700039863586, "learning_rate": 0.00036290390215492133, "loss": 3.2621, "step": 67950 }, { "epoch": 19.790638099673963, "grad_norm": 0.3669946789741516, "learning_rate": 0.0003627291788002329, "loss": 3.2577, "step": 68000 }, { "epoch": 19.790638099673963, "eval_accuracy": 0.3737603366734202, "eval_loss": 3.533120632171631, "eval_runtime": 180.1071, "eval_samples_per_second": 92.451, "eval_steps_per_second": 5.78, "step": 68000 }, { "epoch": 19.80519329296693, "grad_norm": 0.3931442201137543, "learning_rate": 0.0003625544554455445, "loss": 3.257, "step": 68050 }, { "epoch": 19.819748486259897, "grad_norm": 0.3938417434692383, "learning_rate": 0.0003623797320908561, "loss": 3.2519, "step": 68100 }, { "epoch": 19.834303679552864, "grad_norm": 0.3742924928665161, "learning_rate": 0.00036220500873616776, "loss": 3.2496, "step": 68150 }, { "epoch": 19.84885887284583, "grad_norm": 0.39316800236701965, "learning_rate": 0.0003620302853814793, "loss": 3.262, "step": 68200 }, { "epoch": 19.863414066138798, "grad_norm": 0.42025673389434814, "learning_rate": 0.0003618555620267909, "loss": 3.26, "step": 68250 }, { "epoch": 19.877969259431765, "grad_norm": 0.395795077085495, "learning_rate": 0.0003616808386721025, "loss": 3.2595, "step": 68300 }, { "epoch": 19.892524452724732, "grad_norm": 0.4050418734550476, "learning_rate": 0.00036150611531741403, "loss": 3.2667, "step": 68350 }, { "epoch": 19.9070796460177, "grad_norm": 0.40615615248680115, "learning_rate": 0.0003613313919627256, "loss": 3.256, "step": 68400 }, { "epoch": 19.921634839310666, "grad_norm": 0.3845473825931549, "learning_rate": 0.00036115666860803727, "loss": 3.2573, "step": 68450 }, { "epoch": 19.936190032603633, "grad_norm": 0.36425554752349854, "learning_rate": 0.00036098194525334887, "loss": 3.2748, "step": 68500 }, { "epoch": 19.9507452258966, "grad_norm": 0.41645219922065735, "learning_rate": 0.0003608072218986604, "loss": 3.2623, "step": 68550 }, { "epoch": 19.965300419189568, "grad_norm": 0.3781222999095917, "learning_rate": 0.000360632498543972, "loss": 3.2599, "step": 68600 }, { "epoch": 19.979855612482535, "grad_norm": 0.4246581792831421, "learning_rate": 0.0003604577751892836, "loss": 3.2604, "step": 68650 }, { "epoch": 19.9944108057755, "grad_norm": 0.41079947352409363, "learning_rate": 0.00036028305183459513, "loss": 3.2589, "step": 68700 }, { "epoch": 20.00873311597578, "grad_norm": 0.3992273211479187, "learning_rate": 0.0003601083284799068, "loss": 3.1972, "step": 68750 }, { "epoch": 20.023288309268747, "grad_norm": 0.3754191994667053, "learning_rate": 0.0003599336051252184, "loss": 3.157, "step": 68800 }, { "epoch": 20.037843502561714, "grad_norm": 0.38159775733947754, "learning_rate": 0.00035975888177052997, "loss": 3.1717, "step": 68850 }, { "epoch": 20.05239869585468, "grad_norm": 0.3973357081413269, "learning_rate": 0.0003595841584158415, "loss": 3.1687, "step": 68900 }, { "epoch": 20.066953889147648, "grad_norm": 0.3836260139942169, "learning_rate": 0.0003594094350611531, "loss": 3.1707, "step": 68950 }, { "epoch": 20.081509082440615, "grad_norm": 0.40186402201652527, "learning_rate": 0.00035923471170646475, "loss": 3.167, "step": 69000 }, { "epoch": 20.081509082440615, "eval_accuracy": 0.3731075900191581, "eval_loss": 3.549851655960083, "eval_runtime": 179.9656, "eval_samples_per_second": 92.523, "eval_steps_per_second": 5.784, "step": 69000 }, { "epoch": 20.096064275733582, "grad_norm": 0.40042492747306824, "learning_rate": 0.00035905998835177635, "loss": 3.1769, "step": 69050 }, { "epoch": 20.11061946902655, "grad_norm": 0.4222542345523834, "learning_rate": 0.00035888526499708794, "loss": 3.1681, "step": 69100 }, { "epoch": 20.125174662319516, "grad_norm": 0.3928000032901764, "learning_rate": 0.0003587105416423995, "loss": 3.1891, "step": 69150 }, { "epoch": 20.139729855612483, "grad_norm": 0.4104302227497101, "learning_rate": 0.0003585358182877111, "loss": 3.1954, "step": 69200 }, { "epoch": 20.15428504890545, "grad_norm": 0.3952016532421112, "learning_rate": 0.00035836109493302267, "loss": 3.1906, "step": 69250 }, { "epoch": 20.168840242198417, "grad_norm": 0.4144646227359772, "learning_rate": 0.0003581863715783343, "loss": 3.1889, "step": 69300 }, { "epoch": 20.183395435491384, "grad_norm": 0.4134828448295593, "learning_rate": 0.00035801164822364586, "loss": 3.1892, "step": 69350 }, { "epoch": 20.19795062878435, "grad_norm": 0.40182971954345703, "learning_rate": 0.00035783692486895745, "loss": 3.1905, "step": 69400 }, { "epoch": 20.21250582207732, "grad_norm": 0.41879725456237793, "learning_rate": 0.00035766220151426905, "loss": 3.1966, "step": 69450 }, { "epoch": 20.227061015370285, "grad_norm": 0.3737371861934662, "learning_rate": 0.0003574874781595806, "loss": 3.2003, "step": 69500 }, { "epoch": 20.241616208663253, "grad_norm": 0.39393800497055054, "learning_rate": 0.00035731275480489224, "loss": 3.2002, "step": 69550 }, { "epoch": 20.25617140195622, "grad_norm": 0.3873503506183624, "learning_rate": 0.00035713803145020383, "loss": 3.1926, "step": 69600 }, { "epoch": 20.270726595249187, "grad_norm": 0.4044394791126251, "learning_rate": 0.0003569633080955154, "loss": 3.2207, "step": 69650 }, { "epoch": 20.28528178854215, "grad_norm": 0.4036005437374115, "learning_rate": 0.00035678858474082697, "loss": 3.2149, "step": 69700 }, { "epoch": 20.299836981835117, "grad_norm": 0.39418116211891174, "learning_rate": 0.00035661386138613856, "loss": 3.1983, "step": 69750 }, { "epoch": 20.314392175128084, "grad_norm": 0.3770759403705597, "learning_rate": 0.00035643913803145015, "loss": 3.1991, "step": 69800 }, { "epoch": 20.32894736842105, "grad_norm": 0.39399826526641846, "learning_rate": 0.0003562644146767618, "loss": 3.1957, "step": 69850 }, { "epoch": 20.34350256171402, "grad_norm": 0.4375757873058319, "learning_rate": 0.00035608969132207334, "loss": 3.2131, "step": 69900 }, { "epoch": 20.358057755006985, "grad_norm": 0.44734153151512146, "learning_rate": 0.00035591496796738494, "loss": 3.2229, "step": 69950 }, { "epoch": 20.372612948299953, "grad_norm": 0.37196409702301025, "learning_rate": 0.00035574024461269653, "loss": 3.222, "step": 70000 }, { "epoch": 20.372612948299953, "eval_accuracy": 0.3733573355431601, "eval_loss": 3.5451903343200684, "eval_runtime": 180.0168, "eval_samples_per_second": 92.497, "eval_steps_per_second": 5.783, "step": 70000 }, { "epoch": 20.38716814159292, "grad_norm": 0.4085177779197693, "learning_rate": 0.0003555655212580081, "loss": 3.2213, "step": 70050 }, { "epoch": 20.401723334885887, "grad_norm": 0.38436266779899597, "learning_rate": 0.00035539079790331967, "loss": 3.2107, "step": 70100 }, { "epoch": 20.416278528178854, "grad_norm": 0.3825179636478424, "learning_rate": 0.0003552160745486313, "loss": 3.2259, "step": 70150 }, { "epoch": 20.43083372147182, "grad_norm": 0.3777851462364197, "learning_rate": 0.0003550413511939429, "loss": 3.2156, "step": 70200 }, { "epoch": 20.445388914764788, "grad_norm": 0.39768534898757935, "learning_rate": 0.0003548666278392545, "loss": 3.2171, "step": 70250 }, { "epoch": 20.459944108057755, "grad_norm": 0.3983946442604065, "learning_rate": 0.00035469190448456604, "loss": 3.2204, "step": 70300 }, { "epoch": 20.474499301350722, "grad_norm": 0.4463108777999878, "learning_rate": 0.00035451718112987764, "loss": 3.2244, "step": 70350 }, { "epoch": 20.48905449464369, "grad_norm": 0.39498451352119446, "learning_rate": 0.0003543424577751893, "loss": 3.2292, "step": 70400 }, { "epoch": 20.503609687936656, "grad_norm": 0.4179089665412903, "learning_rate": 0.0003541677344205009, "loss": 3.2239, "step": 70450 }, { "epoch": 20.518164881229623, "grad_norm": 0.39351847767829895, "learning_rate": 0.0003539930110658124, "loss": 3.2316, "step": 70500 }, { "epoch": 20.53272007452259, "grad_norm": 0.39744892716407776, "learning_rate": 0.000353818287711124, "loss": 3.2368, "step": 70550 }, { "epoch": 20.547275267815557, "grad_norm": 0.4104430079460144, "learning_rate": 0.0003536435643564356, "loss": 3.2287, "step": 70600 }, { "epoch": 20.561830461108524, "grad_norm": 0.3697310984134674, "learning_rate": 0.00035346884100174715, "loss": 3.241, "step": 70650 }, { "epoch": 20.57638565440149, "grad_norm": 0.380302757024765, "learning_rate": 0.0003532941176470588, "loss": 3.2207, "step": 70700 }, { "epoch": 20.59094084769446, "grad_norm": 0.40270739793777466, "learning_rate": 0.0003531193942923704, "loss": 3.2319, "step": 70750 }, { "epoch": 20.605496040987425, "grad_norm": 0.38786470890045166, "learning_rate": 0.000352944670937682, "loss": 3.2265, "step": 70800 }, { "epoch": 20.620051234280393, "grad_norm": 0.3962876796722412, "learning_rate": 0.0003527699475829935, "loss": 3.2257, "step": 70850 }, { "epoch": 20.63460642757336, "grad_norm": 0.38521769642829895, "learning_rate": 0.0003525952242283051, "loss": 3.2221, "step": 70900 }, { "epoch": 20.649161620866327, "grad_norm": 0.3857249915599823, "learning_rate": 0.00035242050087361677, "loss": 3.2421, "step": 70950 }, { "epoch": 20.663716814159294, "grad_norm": 0.42854899168014526, "learning_rate": 0.00035224577751892836, "loss": 3.2491, "step": 71000 }, { "epoch": 20.663716814159294, "eval_accuracy": 0.3735177603150484, "eval_loss": 3.538139581680298, "eval_runtime": 180.0287, "eval_samples_per_second": 92.491, "eval_steps_per_second": 5.782, "step": 71000 }, { "epoch": 20.678272007452257, "grad_norm": 0.4254036843776703, "learning_rate": 0.0003520710541642399, "loss": 3.2446, "step": 71050 }, { "epoch": 20.692827200745224, "grad_norm": 0.428549200296402, "learning_rate": 0.0003518963308095515, "loss": 3.2401, "step": 71100 }, { "epoch": 20.70738239403819, "grad_norm": 0.39716416597366333, "learning_rate": 0.0003517216074548631, "loss": 3.2556, "step": 71150 }, { "epoch": 20.72193758733116, "grad_norm": 0.408746600151062, "learning_rate": 0.0003515468841001747, "loss": 3.2433, "step": 71200 }, { "epoch": 20.736492780624125, "grad_norm": 0.4162278473377228, "learning_rate": 0.00035137216074548634, "loss": 3.2448, "step": 71250 }, { "epoch": 20.751047973917093, "grad_norm": 0.3788270056247711, "learning_rate": 0.0003511974373907979, "loss": 3.237, "step": 71300 }, { "epoch": 20.76560316721006, "grad_norm": 0.39856094121932983, "learning_rate": 0.00035102271403610947, "loss": 3.2449, "step": 71350 }, { "epoch": 20.780158360503027, "grad_norm": 0.38427576422691345, "learning_rate": 0.00035084799068142106, "loss": 3.2359, "step": 71400 }, { "epoch": 20.794713553795994, "grad_norm": 0.39521774649620056, "learning_rate": 0.0003506732673267326, "loss": 3.2485, "step": 71450 }, { "epoch": 20.80926874708896, "grad_norm": 0.4041496515274048, "learning_rate": 0.00035049854397204425, "loss": 3.2346, "step": 71500 }, { "epoch": 20.823823940381928, "grad_norm": 0.3986034095287323, "learning_rate": 0.00035032382061735585, "loss": 3.2398, "step": 71550 }, { "epoch": 20.838379133674895, "grad_norm": 0.41229447722435, "learning_rate": 0.00035014909726266744, "loss": 3.2556, "step": 71600 }, { "epoch": 20.852934326967862, "grad_norm": 0.38755762577056885, "learning_rate": 0.000349974373907979, "loss": 3.2569, "step": 71650 }, { "epoch": 20.86748952026083, "grad_norm": 0.39384886622428894, "learning_rate": 0.0003497996505532906, "loss": 3.2367, "step": 71700 }, { "epoch": 20.882044713553796, "grad_norm": 0.3720444440841675, "learning_rate": 0.00034962492719860217, "loss": 3.2524, "step": 71750 }, { "epoch": 20.896599906846763, "grad_norm": 0.4046402871608734, "learning_rate": 0.0003494502038439138, "loss": 3.2483, "step": 71800 }, { "epoch": 20.91115510013973, "grad_norm": 0.4132034182548523, "learning_rate": 0.00034927548048922536, "loss": 3.247, "step": 71850 }, { "epoch": 20.925710293432697, "grad_norm": 0.3795117437839508, "learning_rate": 0.00034910075713453695, "loss": 3.256, "step": 71900 }, { "epoch": 20.940265486725664, "grad_norm": 0.4026730954647064, "learning_rate": 0.00034892603377984855, "loss": 3.2435, "step": 71950 }, { "epoch": 20.95482068001863, "grad_norm": 0.4107663333415985, "learning_rate": 0.0003487513104251601, "loss": 3.2553, "step": 72000 }, { "epoch": 20.95482068001863, "eval_accuracy": 0.37412960746702684, "eval_loss": 3.530897378921509, "eval_runtime": 180.2486, "eval_samples_per_second": 92.378, "eval_steps_per_second": 5.775, "step": 72000 }, { "epoch": 20.9693758733116, "grad_norm": 0.40838685631752014, "learning_rate": 0.0003485765870704717, "loss": 3.2488, "step": 72050 }, { "epoch": 20.983931066604566, "grad_norm": 0.4090084135532379, "learning_rate": 0.00034840186371578333, "loss": 3.2534, "step": 72100 }, { "epoch": 20.998486259897533, "grad_norm": 0.412088006734848, "learning_rate": 0.0003482271403610949, "loss": 3.2535, "step": 72150 }, { "epoch": 21.01280857009781, "grad_norm": 0.3911440372467041, "learning_rate": 0.0003480524170064065, "loss": 3.168, "step": 72200 }, { "epoch": 21.027363763390778, "grad_norm": 0.4080849289894104, "learning_rate": 0.00034787769365171806, "loss": 3.1416, "step": 72250 }, { "epoch": 21.041918956683745, "grad_norm": 0.40647292137145996, "learning_rate": 0.00034770297029702965, "loss": 3.141, "step": 72300 }, { "epoch": 21.05647414997671, "grad_norm": 0.40753230452537537, "learning_rate": 0.0003475282469423413, "loss": 3.1604, "step": 72350 }, { "epoch": 21.07102934326968, "grad_norm": 0.3992460072040558, "learning_rate": 0.0003473535235876529, "loss": 3.1669, "step": 72400 }, { "epoch": 21.085584536562646, "grad_norm": 0.40380969643592834, "learning_rate": 0.00034717880023296444, "loss": 3.1779, "step": 72450 }, { "epoch": 21.100139729855613, "grad_norm": 0.4181092381477356, "learning_rate": 0.00034700407687827603, "loss": 3.1626, "step": 72500 }, { "epoch": 21.11469492314858, "grad_norm": 0.4171045124530792, "learning_rate": 0.0003468293535235876, "loss": 3.1711, "step": 72550 }, { "epoch": 21.129250116441547, "grad_norm": 0.43060147762298584, "learning_rate": 0.00034665463016889916, "loss": 3.1769, "step": 72600 }, { "epoch": 21.143805309734514, "grad_norm": 0.4202156066894531, "learning_rate": 0.0003464799068142108, "loss": 3.1898, "step": 72650 }, { "epoch": 21.15836050302748, "grad_norm": 0.39121031761169434, "learning_rate": 0.0003463051834595224, "loss": 3.1852, "step": 72700 }, { "epoch": 21.172915696320448, "grad_norm": 0.376960426568985, "learning_rate": 0.000346130460104834, "loss": 3.1689, "step": 72750 }, { "epoch": 21.187470889613415, "grad_norm": 0.39861029386520386, "learning_rate": 0.00034595573675014554, "loss": 3.182, "step": 72800 }, { "epoch": 21.202026082906382, "grad_norm": 0.40720611810684204, "learning_rate": 0.00034578101339545714, "loss": 3.1793, "step": 72850 }, { "epoch": 21.21658127619935, "grad_norm": 0.39739352464675903, "learning_rate": 0.0003456062900407688, "loss": 3.1883, "step": 72900 }, { "epoch": 21.231136469492316, "grad_norm": 0.42822033166885376, "learning_rate": 0.0003454315666860804, "loss": 3.1878, "step": 72950 }, { "epoch": 21.245691662785283, "grad_norm": 0.4081740379333496, "learning_rate": 0.0003452568433313919, "loss": 3.1908, "step": 73000 }, { "epoch": 21.245691662785283, "eval_accuracy": 0.3733120875305762, "eval_loss": 3.5485897064208984, "eval_runtime": 179.9545, "eval_samples_per_second": 92.529, "eval_steps_per_second": 5.785, "step": 73000 }, { "epoch": 21.260246856078247, "grad_norm": 0.38723376393318176, "learning_rate": 0.0003450821199767035, "loss": 3.1966, "step": 73050 }, { "epoch": 21.274802049371214, "grad_norm": 0.3919490575790405, "learning_rate": 0.0003449073966220151, "loss": 3.1982, "step": 73100 }, { "epoch": 21.28935724266418, "grad_norm": 0.39738136529922485, "learning_rate": 0.0003447326732673267, "loss": 3.1933, "step": 73150 }, { "epoch": 21.303912435957148, "grad_norm": 0.4063931107521057, "learning_rate": 0.0003445579499126383, "loss": 3.1874, "step": 73200 }, { "epoch": 21.318467629250115, "grad_norm": 0.43299397826194763, "learning_rate": 0.0003443832265579499, "loss": 3.1968, "step": 73250 }, { "epoch": 21.333022822543082, "grad_norm": 0.40099453926086426, "learning_rate": 0.0003442085032032615, "loss": 3.2096, "step": 73300 }, { "epoch": 21.34757801583605, "grad_norm": 0.39773184061050415, "learning_rate": 0.0003440337798485731, "loss": 3.1968, "step": 73350 }, { "epoch": 21.362133209129016, "grad_norm": 0.3891940116882324, "learning_rate": 0.0003438590564938846, "loss": 3.2047, "step": 73400 }, { "epoch": 21.376688402421983, "grad_norm": 0.4392438232898712, "learning_rate": 0.0003436843331391962, "loss": 3.1968, "step": 73450 }, { "epoch": 21.39124359571495, "grad_norm": 0.3854880630970001, "learning_rate": 0.00034350960978450786, "loss": 3.205, "step": 73500 }, { "epoch": 21.405798789007918, "grad_norm": 0.3821921646595001, "learning_rate": 0.00034333488642981946, "loss": 3.2154, "step": 73550 }, { "epoch": 21.420353982300885, "grad_norm": 0.3998347222805023, "learning_rate": 0.000343160163075131, "loss": 3.2112, "step": 73600 }, { "epoch": 21.43490917559385, "grad_norm": 0.3848041296005249, "learning_rate": 0.0003429854397204426, "loss": 3.1998, "step": 73650 }, { "epoch": 21.44946436888682, "grad_norm": 0.4280722439289093, "learning_rate": 0.0003428107163657542, "loss": 3.2154, "step": 73700 }, { "epoch": 21.464019562179786, "grad_norm": 0.40564489364624023, "learning_rate": 0.00034263599301106583, "loss": 3.2214, "step": 73750 }, { "epoch": 21.478574755472753, "grad_norm": 0.3900902569293976, "learning_rate": 0.0003424612696563774, "loss": 3.2217, "step": 73800 }, { "epoch": 21.49312994876572, "grad_norm": 0.42186617851257324, "learning_rate": 0.00034228654630168897, "loss": 3.2041, "step": 73850 }, { "epoch": 21.507685142058687, "grad_norm": 0.40647152066230774, "learning_rate": 0.00034211182294700056, "loss": 3.2211, "step": 73900 }, { "epoch": 21.522240335351654, "grad_norm": 0.41695690155029297, "learning_rate": 0.0003419370995923121, "loss": 3.2186, "step": 73950 }, { "epoch": 21.53679552864462, "grad_norm": 0.4023297131061554, "learning_rate": 0.0003417623762376237, "loss": 3.2235, "step": 74000 }, { "epoch": 21.53679552864462, "eval_accuracy": 0.3741432406344547, "eval_loss": 3.5417726039886475, "eval_runtime": 181.233, "eval_samples_per_second": 91.876, "eval_steps_per_second": 5.744, "step": 74000 }, { "epoch": 21.551350721937588, "grad_norm": 0.39241155982017517, "learning_rate": 0.00034158765288293534, "loss": 3.2205, "step": 74050 }, { "epoch": 21.565905915230555, "grad_norm": 0.3722202479839325, "learning_rate": 0.00034141292952824694, "loss": 3.2243, "step": 74100 }, { "epoch": 21.580461108523522, "grad_norm": 0.38290977478027344, "learning_rate": 0.0003412382061735585, "loss": 3.226, "step": 74150 }, { "epoch": 21.59501630181649, "grad_norm": 0.43857231736183167, "learning_rate": 0.0003410634828188701, "loss": 3.2162, "step": 74200 }, { "epoch": 21.609571495109456, "grad_norm": 0.39363670349121094, "learning_rate": 0.00034088875946418167, "loss": 3.2247, "step": 74250 }, { "epoch": 21.624126688402423, "grad_norm": 0.3867611885070801, "learning_rate": 0.0003407140361094933, "loss": 3.2143, "step": 74300 }, { "epoch": 21.63868188169539, "grad_norm": 0.3955768942832947, "learning_rate": 0.00034053931275480486, "loss": 3.2255, "step": 74350 }, { "epoch": 21.653237074988354, "grad_norm": 0.39432933926582336, "learning_rate": 0.00034036458940011645, "loss": 3.2358, "step": 74400 }, { "epoch": 21.66779226828132, "grad_norm": 0.4440739154815674, "learning_rate": 0.00034018986604542804, "loss": 3.2295, "step": 74450 }, { "epoch": 21.682347461574288, "grad_norm": 0.41163793206214905, "learning_rate": 0.00034001514269073964, "loss": 3.224, "step": 74500 }, { "epoch": 21.696902654867255, "grad_norm": 0.41176968812942505, "learning_rate": 0.0003398404193360512, "loss": 3.2322, "step": 74550 }, { "epoch": 21.711457848160222, "grad_norm": 0.43713653087615967, "learning_rate": 0.00033966569598136283, "loss": 3.2274, "step": 74600 }, { "epoch": 21.72601304145319, "grad_norm": 0.4290430545806885, "learning_rate": 0.0003394909726266744, "loss": 3.2288, "step": 74650 }, { "epoch": 21.740568234746156, "grad_norm": 0.46461647748947144, "learning_rate": 0.000339316249271986, "loss": 3.2266, "step": 74700 }, { "epoch": 21.755123428039123, "grad_norm": 0.45526576042175293, "learning_rate": 0.00033914152591729756, "loss": 3.2179, "step": 74750 }, { "epoch": 21.76967862133209, "grad_norm": 0.3954998552799225, "learning_rate": 0.00033896680256260915, "loss": 3.229, "step": 74800 }, { "epoch": 21.784233814625058, "grad_norm": 0.40740442276000977, "learning_rate": 0.00033879207920792074, "loss": 3.2283, "step": 74850 }, { "epoch": 21.798789007918025, "grad_norm": 0.4069225788116455, "learning_rate": 0.0003386173558532324, "loss": 3.2387, "step": 74900 }, { "epoch": 21.81334420121099, "grad_norm": 0.3828088343143463, "learning_rate": 0.00033844263249854393, "loss": 3.2361, "step": 74950 }, { "epoch": 21.82789939450396, "grad_norm": 0.410266250371933, "learning_rate": 0.00033826790914385553, "loss": 3.2307, "step": 75000 }, { "epoch": 21.82789939450396, "eval_accuracy": 0.37409681734881667, "eval_loss": 3.5353522300720215, "eval_runtime": 181.2078, "eval_samples_per_second": 91.889, "eval_steps_per_second": 5.745, "step": 75000 }, { "epoch": 21.842454587796926, "grad_norm": 0.42453983426094055, "learning_rate": 0.0003380931857891671, "loss": 3.2427, "step": 75050 }, { "epoch": 21.857009781089893, "grad_norm": 0.4058069884777069, "learning_rate": 0.00033791846243447866, "loss": 3.2342, "step": 75100 }, { "epoch": 21.87156497438286, "grad_norm": 0.3781680464744568, "learning_rate": 0.0003377437390797903, "loss": 3.2345, "step": 75150 }, { "epoch": 21.886120167675827, "grad_norm": 0.4093674421310425, "learning_rate": 0.0003375690157251019, "loss": 3.2572, "step": 75200 }, { "epoch": 21.900675360968794, "grad_norm": 0.4051060676574707, "learning_rate": 0.0003373942923704135, "loss": 3.2447, "step": 75250 }, { "epoch": 21.91523055426176, "grad_norm": 0.39351558685302734, "learning_rate": 0.00033721956901572504, "loss": 3.2351, "step": 75300 }, { "epoch": 21.929785747554728, "grad_norm": 0.41610103845596313, "learning_rate": 0.00033704484566103663, "loss": 3.2474, "step": 75350 }, { "epoch": 21.944340940847695, "grad_norm": 0.40157684683799744, "learning_rate": 0.00033687012230634823, "loss": 3.2448, "step": 75400 }, { "epoch": 21.958896134140662, "grad_norm": 0.384278804063797, "learning_rate": 0.0003366953989516599, "loss": 3.2495, "step": 75450 }, { "epoch": 21.97345132743363, "grad_norm": 0.3966929018497467, "learning_rate": 0.00033652067559697147, "loss": 3.2433, "step": 75500 }, { "epoch": 21.988006520726596, "grad_norm": 0.41684508323669434, "learning_rate": 0.000336345952242283, "loss": 3.2445, "step": 75550 }, { "epoch": 22.002328830926874, "grad_norm": 0.44120311737060547, "learning_rate": 0.0003361712288875946, "loss": 3.2211, "step": 75600 }, { "epoch": 22.01688402421984, "grad_norm": 0.382026731967926, "learning_rate": 0.0003359965055329062, "loss": 3.1453, "step": 75650 }, { "epoch": 22.03143921751281, "grad_norm": 0.4008730947971344, "learning_rate": 0.00033582178217821785, "loss": 3.147, "step": 75700 }, { "epoch": 22.045994410805775, "grad_norm": 0.41892507672309875, "learning_rate": 0.0003356470588235294, "loss": 3.1466, "step": 75750 }, { "epoch": 22.060549604098743, "grad_norm": 0.41688308119773865, "learning_rate": 0.000335472335468841, "loss": 3.1618, "step": 75800 }, { "epoch": 22.07510479739171, "grad_norm": 0.3967905342578888, "learning_rate": 0.0003352976121141526, "loss": 3.1486, "step": 75850 }, { "epoch": 22.089659990684677, "grad_norm": 0.3955349922180176, "learning_rate": 0.0003351228887594641, "loss": 3.1511, "step": 75900 }, { "epoch": 22.104215183977644, "grad_norm": 0.42016756534576416, "learning_rate": 0.0003349481654047757, "loss": 3.1424, "step": 75950 }, { "epoch": 22.11877037727061, "grad_norm": 0.39622655510902405, "learning_rate": 0.00033477344205008736, "loss": 3.1703, "step": 76000 }, { "epoch": 22.11877037727061, "eval_accuracy": 0.3735835756060795, "eval_loss": 3.551079273223877, "eval_runtime": 181.1919, "eval_samples_per_second": 91.897, "eval_steps_per_second": 5.745, "step": 76000 }, { "epoch": 22.133325570563578, "grad_norm": 0.4219890236854553, "learning_rate": 0.00033459871869539895, "loss": 3.1713, "step": 76050 }, { "epoch": 22.147880763856545, "grad_norm": 0.39037638902664185, "learning_rate": 0.0003344239953407105, "loss": 3.1659, "step": 76100 }, { "epoch": 22.162435957149512, "grad_norm": 0.42194926738739014, "learning_rate": 0.0003342492719860221, "loss": 3.1716, "step": 76150 }, { "epoch": 22.17699115044248, "grad_norm": 0.44351381063461304, "learning_rate": 0.0003340745486313337, "loss": 3.1714, "step": 76200 }, { "epoch": 22.191546343735446, "grad_norm": 0.4843369722366333, "learning_rate": 0.0003338998252766453, "loss": 3.1725, "step": 76250 }, { "epoch": 22.206101537028413, "grad_norm": 0.41010621190071106, "learning_rate": 0.00033372510192195687, "loss": 3.1746, "step": 76300 }, { "epoch": 22.22065673032138, "grad_norm": 0.4231889247894287, "learning_rate": 0.00033355037856726847, "loss": 3.1836, "step": 76350 }, { "epoch": 22.235211923614347, "grad_norm": 0.4336659014225006, "learning_rate": 0.00033337565521258006, "loss": 3.1767, "step": 76400 }, { "epoch": 22.24976711690731, "grad_norm": 0.4049505889415741, "learning_rate": 0.00033320093185789165, "loss": 3.1804, "step": 76450 }, { "epoch": 22.264322310200278, "grad_norm": 0.40216079354286194, "learning_rate": 0.0003330262085032032, "loss": 3.1903, "step": 76500 }, { "epoch": 22.278877503493245, "grad_norm": 0.43066754937171936, "learning_rate": 0.00033285148514851484, "loss": 3.1809, "step": 76550 }, { "epoch": 22.293432696786212, "grad_norm": 0.41968709230422974, "learning_rate": 0.00033267676179382644, "loss": 3.19, "step": 76600 }, { "epoch": 22.30798789007918, "grad_norm": 0.41467177867889404, "learning_rate": 0.00033250203843913803, "loss": 3.1901, "step": 76650 }, { "epoch": 22.322543083372146, "grad_norm": 0.40680229663848877, "learning_rate": 0.00033232731508444957, "loss": 3.1795, "step": 76700 }, { "epoch": 22.337098276665113, "grad_norm": 0.42462196946144104, "learning_rate": 0.00033215259172976117, "loss": 3.1945, "step": 76750 }, { "epoch": 22.35165346995808, "grad_norm": 0.396403968334198, "learning_rate": 0.00033197786837507276, "loss": 3.1855, "step": 76800 }, { "epoch": 22.366208663251047, "grad_norm": 0.4305762052536011, "learning_rate": 0.0003318031450203844, "loss": 3.1948, "step": 76850 }, { "epoch": 22.380763856544014, "grad_norm": 0.42148691415786743, "learning_rate": 0.00033162842166569595, "loss": 3.1944, "step": 76900 }, { "epoch": 22.39531904983698, "grad_norm": 0.41125115752220154, "learning_rate": 0.00033145369831100754, "loss": 3.197, "step": 76950 }, { "epoch": 22.40987424312995, "grad_norm": 0.39410680532455444, "learning_rate": 0.00033127897495631914, "loss": 3.1936, "step": 77000 }, { "epoch": 22.40987424312995, "eval_accuracy": 0.37403887638724825, "eval_loss": 3.5436227321624756, "eval_runtime": 180.5916, "eval_samples_per_second": 92.203, "eval_steps_per_second": 5.764, "step": 77000 }, { "epoch": 22.424429436422916, "grad_norm": 0.4024583399295807, "learning_rate": 0.0003311042516016307, "loss": 3.1968, "step": 77050 }, { "epoch": 22.438984629715883, "grad_norm": 0.406925767660141, "learning_rate": 0.0003309295282469423, "loss": 3.1932, "step": 77100 }, { "epoch": 22.45353982300885, "grad_norm": 0.40767523646354675, "learning_rate": 0.0003307548048922539, "loss": 3.1967, "step": 77150 }, { "epoch": 22.468095016301817, "grad_norm": 0.4174925684928894, "learning_rate": 0.0003305800815375655, "loss": 3.2027, "step": 77200 }, { "epoch": 22.482650209594784, "grad_norm": 0.4187748432159424, "learning_rate": 0.00033040535818287705, "loss": 3.204, "step": 77250 }, { "epoch": 22.49720540288775, "grad_norm": 0.4081430733203888, "learning_rate": 0.00033023063482818865, "loss": 3.2084, "step": 77300 }, { "epoch": 22.511760596180718, "grad_norm": 0.41643956303596497, "learning_rate": 0.00033005591147350024, "loss": 3.196, "step": 77350 }, { "epoch": 22.526315789473685, "grad_norm": 0.413574755191803, "learning_rate": 0.0003298811881188119, "loss": 3.218, "step": 77400 }, { "epoch": 22.540870982766652, "grad_norm": 0.41706109046936035, "learning_rate": 0.00032970646476412343, "loss": 3.2047, "step": 77450 }, { "epoch": 22.55542617605962, "grad_norm": 0.42964473366737366, "learning_rate": 0.000329531741409435, "loss": 3.2166, "step": 77500 }, { "epoch": 22.569981369352586, "grad_norm": 0.4270336329936981, "learning_rate": 0.0003293570180547466, "loss": 3.2276, "step": 77550 }, { "epoch": 22.584536562645553, "grad_norm": 0.4078294336795807, "learning_rate": 0.0003291822947000582, "loss": 3.2108, "step": 77600 }, { "epoch": 22.59909175593852, "grad_norm": 0.4054129719734192, "learning_rate": 0.00032900757134536975, "loss": 3.2217, "step": 77650 }, { "epoch": 22.613646949231487, "grad_norm": 0.4075845181941986, "learning_rate": 0.0003288328479906814, "loss": 3.2102, "step": 77700 }, { "epoch": 22.628202142524454, "grad_norm": 0.39583030343055725, "learning_rate": 0.000328658124635993, "loss": 3.2149, "step": 77750 }, { "epoch": 22.64275733581742, "grad_norm": 0.39721065759658813, "learning_rate": 0.0003284834012813046, "loss": 3.2037, "step": 77800 }, { "epoch": 22.657312529110385, "grad_norm": 0.4116293489933014, "learning_rate": 0.00032830867792661613, "loss": 3.2096, "step": 77850 }, { "epoch": 22.671867722403352, "grad_norm": 0.381740540266037, "learning_rate": 0.0003281339545719277, "loss": 3.2147, "step": 77900 }, { "epoch": 22.68642291569632, "grad_norm": 0.400325745344162, "learning_rate": 0.0003279592312172394, "loss": 3.2152, "step": 77950 }, { "epoch": 22.700978108989286, "grad_norm": 0.4306262135505676, "learning_rate": 0.00032778450786255097, "loss": 3.2232, "step": 78000 }, { "epoch": 22.700978108989286, "eval_accuracy": 0.37414547365325757, "eval_loss": 3.5401101112365723, "eval_runtime": 180.5002, "eval_samples_per_second": 92.249, "eval_steps_per_second": 5.767, "step": 78000 }, { "epoch": 22.715533302282253, "grad_norm": 0.3976389765739441, "learning_rate": 0.0003276097845078625, "loss": 3.2265, "step": 78050 }, { "epoch": 22.73008849557522, "grad_norm": 0.4250744581222534, "learning_rate": 0.0003274350611531741, "loss": 3.2223, "step": 78100 }, { "epoch": 22.744643688868187, "grad_norm": 0.40967994928359985, "learning_rate": 0.0003272603377984857, "loss": 3.2121, "step": 78150 }, { "epoch": 22.759198882161154, "grad_norm": 0.3863125145435333, "learning_rate": 0.00032708561444379724, "loss": 3.2196, "step": 78200 }, { "epoch": 22.77375407545412, "grad_norm": 0.3916330635547638, "learning_rate": 0.0003269108910891089, "loss": 3.225, "step": 78250 }, { "epoch": 22.78830926874709, "grad_norm": 0.4228772222995758, "learning_rate": 0.0003267361677344205, "loss": 3.2327, "step": 78300 }, { "epoch": 22.802864462040056, "grad_norm": 0.406265527009964, "learning_rate": 0.0003265614443797321, "loss": 3.2282, "step": 78350 }, { "epoch": 22.817419655333023, "grad_norm": 0.4067714214324951, "learning_rate": 0.0003263867210250436, "loss": 3.2186, "step": 78400 }, { "epoch": 22.83197484862599, "grad_norm": 0.4357832074165344, "learning_rate": 0.0003262119976703552, "loss": 3.2212, "step": 78450 }, { "epoch": 22.846530041918957, "grad_norm": 0.3951626718044281, "learning_rate": 0.00032603727431566686, "loss": 3.2296, "step": 78500 }, { "epoch": 22.861085235211924, "grad_norm": 0.4343777894973755, "learning_rate": 0.00032586255096097845, "loss": 3.2309, "step": 78550 }, { "epoch": 22.87564042850489, "grad_norm": 0.3866595923900604, "learning_rate": 0.00032568782760629005, "loss": 3.2332, "step": 78600 }, { "epoch": 22.890195621797858, "grad_norm": 0.42146727442741394, "learning_rate": 0.0003255131042516016, "loss": 3.2322, "step": 78650 }, { "epoch": 22.904750815090825, "grad_norm": 0.4161469638347626, "learning_rate": 0.0003253383808969132, "loss": 3.2291, "step": 78700 }, { "epoch": 22.919306008383792, "grad_norm": 0.3808055520057678, "learning_rate": 0.0003251636575422248, "loss": 3.2391, "step": 78750 }, { "epoch": 22.93386120167676, "grad_norm": 0.44764867424964905, "learning_rate": 0.0003249889341875364, "loss": 3.2234, "step": 78800 }, { "epoch": 22.948416394969726, "grad_norm": 0.3993889391422272, "learning_rate": 0.00032481421083284796, "loss": 3.2397, "step": 78850 }, { "epoch": 22.962971588262693, "grad_norm": 0.4148414134979248, "learning_rate": 0.00032463948747815956, "loss": 3.2377, "step": 78900 }, { "epoch": 22.97752678155566, "grad_norm": 0.41175973415374756, "learning_rate": 0.00032446476412347115, "loss": 3.2389, "step": 78950 }, { "epoch": 22.992081974848627, "grad_norm": 0.40341904759407043, "learning_rate": 0.0003242900407687827, "loss": 3.2529, "step": 79000 }, { "epoch": 22.992081974848627, "eval_accuracy": 0.3744128482730714, "eval_loss": 3.5317115783691406, "eval_runtime": 180.2314, "eval_samples_per_second": 92.387, "eval_steps_per_second": 5.776, "step": 79000 }, { "epoch": 23.006404285048905, "grad_norm": 0.4090251624584198, "learning_rate": 0.00032411531741409434, "loss": 3.1867, "step": 79050 }, { "epoch": 23.020959478341872, "grad_norm": 0.42285946011543274, "learning_rate": 0.00032394059405940593, "loss": 3.1296, "step": 79100 }, { "epoch": 23.03551467163484, "grad_norm": 0.5036446452140808, "learning_rate": 0.00032376587070471753, "loss": 3.1323, "step": 79150 }, { "epoch": 23.050069864927806, "grad_norm": 0.41659125685691833, "learning_rate": 0.00032359114735002907, "loss": 3.1447, "step": 79200 }, { "epoch": 23.064625058220773, "grad_norm": 0.4147135019302368, "learning_rate": 0.00032341642399534066, "loss": 3.1481, "step": 79250 }, { "epoch": 23.07918025151374, "grad_norm": 0.4449823200702667, "learning_rate": 0.00032324170064065226, "loss": 3.1521, "step": 79300 }, { "epoch": 23.093735444806708, "grad_norm": 0.3893721401691437, "learning_rate": 0.0003230669772859639, "loss": 3.1493, "step": 79350 }, { "epoch": 23.108290638099675, "grad_norm": 0.3994235098361969, "learning_rate": 0.00032289225393127545, "loss": 3.1473, "step": 79400 }, { "epoch": 23.12284583139264, "grad_norm": 0.39284127950668335, "learning_rate": 0.00032271753057658704, "loss": 3.1447, "step": 79450 }, { "epoch": 23.13740102468561, "grad_norm": 0.4207184910774231, "learning_rate": 0.00032254280722189863, "loss": 3.1562, "step": 79500 }, { "epoch": 23.151956217978576, "grad_norm": 0.4245017468929291, "learning_rate": 0.00032236808386721023, "loss": 3.1569, "step": 79550 }, { "epoch": 23.166511411271543, "grad_norm": 0.4493178129196167, "learning_rate": 0.00032219336051252177, "loss": 3.162, "step": 79600 }, { "epoch": 23.18106660456451, "grad_norm": 0.4329953193664551, "learning_rate": 0.0003220186371578334, "loss": 3.165, "step": 79650 }, { "epoch": 23.195621797857477, "grad_norm": 0.4129304885864258, "learning_rate": 0.000321843913803145, "loss": 3.166, "step": 79700 }, { "epoch": 23.210176991150444, "grad_norm": 0.43213337659835815, "learning_rate": 0.0003216691904484566, "loss": 3.1669, "step": 79750 }, { "epoch": 23.22473218444341, "grad_norm": 0.4205012917518616, "learning_rate": 0.00032149446709376815, "loss": 3.1663, "step": 79800 }, { "epoch": 23.239287377736375, "grad_norm": 0.41995224356651306, "learning_rate": 0.00032131974373907974, "loss": 3.1634, "step": 79850 }, { "epoch": 23.25384257102934, "grad_norm": 0.41790732741355896, "learning_rate": 0.0003211450203843914, "loss": 3.1827, "step": 79900 }, { "epoch": 23.26839776432231, "grad_norm": 0.4145872890949249, "learning_rate": 0.000320970297029703, "loss": 3.166, "step": 79950 }, { "epoch": 23.282952957615276, "grad_norm": 0.40969178080558777, "learning_rate": 0.0003207955736750145, "loss": 3.1891, "step": 80000 }, { "epoch": 23.282952957615276, "eval_accuracy": 0.3738053496313932, "eval_loss": 3.54736065864563, "eval_runtime": 180.3289, "eval_samples_per_second": 92.337, "eval_steps_per_second": 5.773, "step": 80000 }, { "epoch": 23.297508150908243, "grad_norm": 0.4115394055843353, "learning_rate": 0.0003206208503203261, "loss": 3.1423, "step": 80050 }, { "epoch": 23.31206334420121, "grad_norm": 0.4162193536758423, "learning_rate": 0.0003204461269656377, "loss": 3.1351, "step": 80100 }, { "epoch": 23.326618537494177, "grad_norm": 0.45002660155296326, "learning_rate": 0.00032027140361094925, "loss": 3.1466, "step": 80150 }, { "epoch": 23.341173730787144, "grad_norm": 0.40258246660232544, "learning_rate": 0.0003200966802562609, "loss": 3.1543, "step": 80200 }, { "epoch": 23.35572892408011, "grad_norm": 0.44384118914604187, "learning_rate": 0.0003199219569015725, "loss": 3.1479, "step": 80250 }, { "epoch": 23.370284117373078, "grad_norm": 0.42511990666389465, "learning_rate": 0.0003197472335468841, "loss": 3.1391, "step": 80300 }, { "epoch": 23.384839310666045, "grad_norm": 0.3872159719467163, "learning_rate": 0.00031957251019219563, "loss": 3.1577, "step": 80350 }, { "epoch": 23.399394503959012, "grad_norm": 0.46670249104499817, "learning_rate": 0.0003193977868375072, "loss": 3.156, "step": 80400 }, { "epoch": 23.41394969725198, "grad_norm": 0.4092205762863159, "learning_rate": 0.00031922306348281887, "loss": 3.1537, "step": 80450 }, { "epoch": 23.428504890544946, "grad_norm": 0.4209052324295044, "learning_rate": 0.00031904834012813047, "loss": 3.1655, "step": 80500 }, { "epoch": 23.443060083837914, "grad_norm": 0.4230552017688751, "learning_rate": 0.000318873616773442, "loss": 3.1611, "step": 80550 }, { "epoch": 23.45761527713088, "grad_norm": 0.462198406457901, "learning_rate": 0.0003186988934187536, "loss": 3.1597, "step": 80600 }, { "epoch": 23.472170470423848, "grad_norm": 0.4439510703086853, "learning_rate": 0.0003185241700640652, "loss": 3.1637, "step": 80650 }, { "epoch": 23.486725663716815, "grad_norm": 0.4088172912597656, "learning_rate": 0.0003183494467093768, "loss": 3.1662, "step": 80700 }, { "epoch": 23.50128085700978, "grad_norm": 0.4460408389568329, "learning_rate": 0.0003181747233546884, "loss": 3.1698, "step": 80750 }, { "epoch": 23.51583605030275, "grad_norm": 0.40806201100349426, "learning_rate": 0.000318, "loss": 3.1677, "step": 80800 }, { "epoch": 23.530391243595716, "grad_norm": 0.418067067861557, "learning_rate": 0.00031782527664531157, "loss": 3.1743, "step": 80850 }, { "epoch": 23.544946436888683, "grad_norm": 0.4435212314128876, "learning_rate": 0.00031765055329062317, "loss": 3.1804, "step": 80900 }, { "epoch": 23.55950163018165, "grad_norm": 0.4240227937698364, "learning_rate": 0.0003174758299359347, "loss": 3.1575, "step": 80950 }, { "epoch": 23.574056823474617, "grad_norm": 0.4234561026096344, "learning_rate": 0.0003173011065812463, "loss": 3.1885, "step": 81000 }, { "epoch": 23.574056823474617, "eval_accuracy": 0.37372954451940205, "eval_loss": 3.5489554405212402, "eval_runtime": 178.5698, "eval_samples_per_second": 93.246, "eval_steps_per_second": 5.83, "step": 81000 }, { "epoch": 23.588612016767584, "grad_norm": 0.404136061668396, "learning_rate": 0.00031712638322655795, "loss": 3.1762, "step": 81050 }, { "epoch": 23.60316721006055, "grad_norm": 0.44025400280952454, "learning_rate": 0.00031695165987186954, "loss": 3.1706, "step": 81100 }, { "epoch": 23.61772240335352, "grad_norm": 0.41822871565818787, "learning_rate": 0.0003167769365171811, "loss": 3.1913, "step": 81150 }, { "epoch": 23.63227759664648, "grad_norm": 0.4196312129497528, "learning_rate": 0.0003166022131624927, "loss": 3.1865, "step": 81200 }, { "epoch": 23.64683278993945, "grad_norm": 0.4244672358036041, "learning_rate": 0.00031642748980780427, "loss": 3.1866, "step": 81250 }, { "epoch": 23.661387983232416, "grad_norm": 0.42409825325012207, "learning_rate": 0.0003162527664531159, "loss": 3.1799, "step": 81300 }, { "epoch": 23.675943176525383, "grad_norm": 0.42242127656936646, "learning_rate": 0.00031607804309842746, "loss": 3.1905, "step": 81350 }, { "epoch": 23.69049836981835, "grad_norm": 0.43008777499198914, "learning_rate": 0.00031590331974373905, "loss": 3.1855, "step": 81400 }, { "epoch": 23.705053563111317, "grad_norm": 0.4073195457458496, "learning_rate": 0.00031572859638905065, "loss": 3.189, "step": 81450 }, { "epoch": 23.719608756404284, "grad_norm": 0.42558857798576355, "learning_rate": 0.0003155538730343622, "loss": 3.1868, "step": 81500 }, { "epoch": 23.73416394969725, "grad_norm": 0.42349162697792053, "learning_rate": 0.0003153791496796738, "loss": 3.1946, "step": 81550 }, { "epoch": 23.74871914299022, "grad_norm": 0.4403086006641388, "learning_rate": 0.00031520442632498543, "loss": 3.2023, "step": 81600 }, { "epoch": 23.763274336283185, "grad_norm": 0.4507555663585663, "learning_rate": 0.000315029702970297, "loss": 3.2127, "step": 81650 }, { "epoch": 23.777829529576152, "grad_norm": 0.44032490253448486, "learning_rate": 0.0003148549796156086, "loss": 3.1864, "step": 81700 }, { "epoch": 23.79238472286912, "grad_norm": 0.39969223737716675, "learning_rate": 0.00031468025626092016, "loss": 3.2012, "step": 81750 }, { "epoch": 23.806939916162086, "grad_norm": 0.43395310640335083, "learning_rate": 0.00031450553290623175, "loss": 3.1875, "step": 81800 }, { "epoch": 23.821495109455054, "grad_norm": 0.42276349663734436, "learning_rate": 0.0003143308095515434, "loss": 3.2046, "step": 81850 }, { "epoch": 23.83605030274802, "grad_norm": 0.45192578434944153, "learning_rate": 0.000314156086196855, "loss": 3.2047, "step": 81900 }, { "epoch": 23.850605496040988, "grad_norm": 0.42621010541915894, "learning_rate": 0.00031398136284216654, "loss": 3.2012, "step": 81950 }, { "epoch": 23.865160689333955, "grad_norm": 0.4181872606277466, "learning_rate": 0.00031380663948747813, "loss": 3.1981, "step": 82000 }, { "epoch": 23.865160689333955, "eval_accuracy": 0.3742756939076548, "eval_loss": 3.5399296283721924, "eval_runtime": 177.8039, "eval_samples_per_second": 93.648, "eval_steps_per_second": 5.855, "step": 82000 }, { "epoch": 23.879715882626922, "grad_norm": 0.3875061571598053, "learning_rate": 0.0003136319161327897, "loss": 3.1984, "step": 82050 }, { "epoch": 23.89427107591989, "grad_norm": 0.42548853158950806, "learning_rate": 0.00031345719277810127, "loss": 3.2015, "step": 82100 }, { "epoch": 23.908826269212856, "grad_norm": 0.4052736163139343, "learning_rate": 0.0003132824694234129, "loss": 3.2148, "step": 82150 }, { "epoch": 23.923381462505823, "grad_norm": 0.4092583954334259, "learning_rate": 0.0003131077460687245, "loss": 3.2071, "step": 82200 }, { "epoch": 23.93793665579879, "grad_norm": 0.4146585762500763, "learning_rate": 0.0003129330227140361, "loss": 3.2061, "step": 82250 }, { "epoch": 23.952491849091757, "grad_norm": 0.42231765389442444, "learning_rate": 0.00031275829935934764, "loss": 3.2096, "step": 82300 }, { "epoch": 23.967047042384724, "grad_norm": 0.43706318736076355, "learning_rate": 0.00031258357600465924, "loss": 3.2062, "step": 82350 }, { "epoch": 23.98160223567769, "grad_norm": 0.4457162022590637, "learning_rate": 0.00031240885264997083, "loss": 3.2077, "step": 82400 }, { "epoch": 23.99615742897066, "grad_norm": 0.3893353343009949, "learning_rate": 0.0003122341292952825, "loss": 3.2128, "step": 82450 }, { "epoch": 24.010770843036795, "grad_norm": 0.4287562370300293, "learning_rate": 0.000312059405940594, "loss": 3.2119, "step": 82500 }, { "epoch": 24.025326036329762, "grad_norm": 0.47328105568885803, "learning_rate": 0.0003118846825859056, "loss": 3.1189, "step": 82550 }, { "epoch": 24.03988122962273, "grad_norm": 0.41763442754745483, "learning_rate": 0.0003117099592312172, "loss": 3.1262, "step": 82600 }, { "epoch": 24.054436422915696, "grad_norm": 0.41365721821784973, "learning_rate": 0.0003115352358765288, "loss": 3.1278, "step": 82650 }, { "epoch": 24.068991616208663, "grad_norm": 0.42613592743873596, "learning_rate": 0.0003113605125218404, "loss": 3.1437, "step": 82700 }, { "epoch": 24.08354680950163, "grad_norm": 0.41430389881134033, "learning_rate": 0.000311185789167152, "loss": 3.13, "step": 82750 }, { "epoch": 24.098102002794597, "grad_norm": 0.438297837972641, "learning_rate": 0.0003110110658124636, "loss": 3.1544, "step": 82800 }, { "epoch": 24.112657196087564, "grad_norm": 0.41660276055336, "learning_rate": 0.0003108363424577752, "loss": 3.1362, "step": 82850 }, { "epoch": 24.12721238938053, "grad_norm": 0.44473373889923096, "learning_rate": 0.0003106616191030867, "loss": 3.1511, "step": 82900 }, { "epoch": 24.1417675826735, "grad_norm": 0.4273695647716522, "learning_rate": 0.0003104868957483983, "loss": 3.1625, "step": 82950 }, { "epoch": 24.156322775966466, "grad_norm": 0.42920610308647156, "learning_rate": 0.00031031217239370996, "loss": 3.1553, "step": 83000 }, { "epoch": 24.156322775966466, "eval_accuracy": 0.373702748293768, "eval_loss": 3.5522077083587646, "eval_runtime": 177.7292, "eval_samples_per_second": 93.687, "eval_steps_per_second": 5.857, "step": 83000 }, { "epoch": 24.170877969259433, "grad_norm": 0.4231477677822113, "learning_rate": 0.00031013744903902156, "loss": 3.1539, "step": 83050 }, { "epoch": 24.1854331625524, "grad_norm": 0.429381787776947, "learning_rate": 0.0003099627256843331, "loss": 3.1646, "step": 83100 }, { "epoch": 24.199988355845367, "grad_norm": 0.458092600107193, "learning_rate": 0.0003097880023296447, "loss": 3.1625, "step": 83150 }, { "epoch": 24.214543549138334, "grad_norm": 0.42336297035217285, "learning_rate": 0.0003096132789749563, "loss": 3.166, "step": 83200 }, { "epoch": 24.2290987424313, "grad_norm": 0.432204008102417, "learning_rate": 0.00030943855562026794, "loss": 3.1701, "step": 83250 }, { "epoch": 24.243653935724268, "grad_norm": 0.4335592985153198, "learning_rate": 0.0003092638322655795, "loss": 3.1778, "step": 83300 }, { "epoch": 24.258209129017235, "grad_norm": 0.4331112205982208, "learning_rate": 0.00030908910891089107, "loss": 3.1755, "step": 83350 }, { "epoch": 24.2727643223102, "grad_norm": 0.3974718451499939, "learning_rate": 0.00030891438555620266, "loss": 3.1706, "step": 83400 }, { "epoch": 24.287319515603166, "grad_norm": 0.4160006046295166, "learning_rate": 0.0003087396622015142, "loss": 3.1598, "step": 83450 }, { "epoch": 24.301874708896133, "grad_norm": 0.4419631063938141, "learning_rate": 0.0003085649388468258, "loss": 3.191, "step": 83500 }, { "epoch": 24.3164299021891, "grad_norm": 0.4415464699268341, "learning_rate": 0.00030839021549213745, "loss": 3.1648, "step": 83550 }, { "epoch": 24.330985095482067, "grad_norm": 0.41344505548477173, "learning_rate": 0.00030821549213744904, "loss": 3.1686, "step": 83600 }, { "epoch": 24.345540288775034, "grad_norm": 0.4480566084384918, "learning_rate": 0.0003080407687827606, "loss": 3.178, "step": 83650 }, { "epoch": 24.360095482068, "grad_norm": 0.41227227449417114, "learning_rate": 0.0003078660454280722, "loss": 3.1783, "step": 83700 }, { "epoch": 24.374650675360968, "grad_norm": 0.4389805197715759, "learning_rate": 0.00030769132207338377, "loss": 3.191, "step": 83750 }, { "epoch": 24.389205868653935, "grad_norm": 0.4029884934425354, "learning_rate": 0.00030751659871869536, "loss": 3.1696, "step": 83800 }, { "epoch": 24.403761061946902, "grad_norm": 0.42409229278564453, "learning_rate": 0.00030734187536400696, "loss": 3.1726, "step": 83850 }, { "epoch": 24.41831625523987, "grad_norm": 0.4129829406738281, "learning_rate": 0.00030716715200931855, "loss": 3.1782, "step": 83900 }, { "epoch": 24.432871448532836, "grad_norm": 0.4360862970352173, "learning_rate": 0.00030699242865463015, "loss": 3.1856, "step": 83950 }, { "epoch": 24.447426641825803, "grad_norm": 0.4500051736831665, "learning_rate": 0.00030681770529994174, "loss": 3.1871, "step": 84000 }, { "epoch": 24.447426641825803, "eval_accuracy": 0.37416780384128595, "eval_loss": 3.5479676723480225, "eval_runtime": 178.3844, "eval_samples_per_second": 93.343, "eval_steps_per_second": 5.836, "step": 84000 }, { "epoch": 24.46198183511877, "grad_norm": 0.42669326066970825, "learning_rate": 0.0003066429819452533, "loss": 3.1807, "step": 84050 }, { "epoch": 24.476537028411737, "grad_norm": 0.45645323395729065, "learning_rate": 0.00030646825859056493, "loss": 3.1884, "step": 84100 }, { "epoch": 24.491092221704704, "grad_norm": 0.4066215753555298, "learning_rate": 0.0003062935352358765, "loss": 3.1873, "step": 84150 }, { "epoch": 24.50564741499767, "grad_norm": 0.41606536507606506, "learning_rate": 0.0003061188118811881, "loss": 3.1933, "step": 84200 }, { "epoch": 24.52020260829064, "grad_norm": 0.41580286622047424, "learning_rate": 0.00030594408852649966, "loss": 3.1968, "step": 84250 }, { "epoch": 24.534757801583606, "grad_norm": 0.4156491756439209, "learning_rate": 0.00030576936517181125, "loss": 3.2047, "step": 84300 }, { "epoch": 24.549312994876573, "grad_norm": 0.4082728326320648, "learning_rate": 0.00030559464181712285, "loss": 3.1886, "step": 84350 }, { "epoch": 24.56386818816954, "grad_norm": 0.46042659878730774, "learning_rate": 0.0003054199184624345, "loss": 3.1916, "step": 84400 }, { "epoch": 24.578423381462507, "grad_norm": 0.4049529731273651, "learning_rate": 0.00030524519510774604, "loss": 3.2045, "step": 84450 }, { "epoch": 24.592978574755474, "grad_norm": 0.4158893823623657, "learning_rate": 0.00030507047175305763, "loss": 3.1986, "step": 84500 }, { "epoch": 24.60753376804844, "grad_norm": 0.4263823926448822, "learning_rate": 0.0003048957483983692, "loss": 3.2023, "step": 84550 }, { "epoch": 24.622088961341408, "grad_norm": 0.40439581871032715, "learning_rate": 0.00030472102504368076, "loss": 3.2079, "step": 84600 }, { "epoch": 24.636644154634375, "grad_norm": 0.4388013780117035, "learning_rate": 0.0003045463016889924, "loss": 3.1873, "step": 84650 }, { "epoch": 24.651199347927342, "grad_norm": 0.40215572714805603, "learning_rate": 0.000304371578334304, "loss": 3.2058, "step": 84700 }, { "epoch": 24.66575454122031, "grad_norm": 0.4271703362464905, "learning_rate": 0.0003041968549796156, "loss": 3.2184, "step": 84750 }, { "epoch": 24.680309734513273, "grad_norm": 0.43761512637138367, "learning_rate": 0.00030402213162492714, "loss": 3.2011, "step": 84800 }, { "epoch": 24.69486492780624, "grad_norm": 0.441463440656662, "learning_rate": 0.00030384740827023874, "loss": 3.2017, "step": 84850 }, { "epoch": 24.709420121099207, "grad_norm": 0.4295259118080139, "learning_rate": 0.00030367268491555033, "loss": 3.194, "step": 84900 }, { "epoch": 24.723975314392174, "grad_norm": 0.42946338653564453, "learning_rate": 0.000303497961560862, "loss": 3.2003, "step": 84950 }, { "epoch": 24.73853050768514, "grad_norm": 0.38940832018852234, "learning_rate": 0.00030332323820617357, "loss": 3.2118, "step": 85000 }, { "epoch": 24.73853050768514, "eval_accuracy": 0.37463885328137997, "eval_loss": 3.537196397781372, "eval_runtime": 178.4737, "eval_samples_per_second": 93.297, "eval_steps_per_second": 5.833, "step": 85000 }, { "epoch": 24.753085700978108, "grad_norm": 0.4103018641471863, "learning_rate": 0.0003031485148514851, "loss": 3.2075, "step": 85050 }, { "epoch": 24.767640894271075, "grad_norm": 0.43657201528549194, "learning_rate": 0.0003029737914967967, "loss": 3.2184, "step": 85100 }, { "epoch": 24.782196087564042, "grad_norm": 0.429223895072937, "learning_rate": 0.0003027990681421083, "loss": 3.2254, "step": 85150 }, { "epoch": 24.79675128085701, "grad_norm": 0.41025614738464355, "learning_rate": 0.00030262434478741984, "loss": 3.2142, "step": 85200 }, { "epoch": 24.811306474149976, "grad_norm": 0.430196613073349, "learning_rate": 0.0003024496214327315, "loss": 3.196, "step": 85250 }, { "epoch": 24.825861667442943, "grad_norm": 0.406845360994339, "learning_rate": 0.0003022748980780431, "loss": 3.1903, "step": 85300 }, { "epoch": 24.84041686073591, "grad_norm": 0.390212744474411, "learning_rate": 0.0003021001747233547, "loss": 3.2196, "step": 85350 }, { "epoch": 24.854972054028877, "grad_norm": 0.4439830482006073, "learning_rate": 0.0003019254513686662, "loss": 3.2014, "step": 85400 }, { "epoch": 24.869527247321844, "grad_norm": 0.4344117045402527, "learning_rate": 0.0003017507280139778, "loss": 3.2074, "step": 85450 }, { "epoch": 24.88408244061481, "grad_norm": 0.46311089396476746, "learning_rate": 0.00030157600465928946, "loss": 3.2158, "step": 85500 }, { "epoch": 24.89863763390778, "grad_norm": 0.3941739499568939, "learning_rate": 0.00030140128130460106, "loss": 3.2128, "step": 85550 }, { "epoch": 24.913192827200746, "grad_norm": 0.4007490873336792, "learning_rate": 0.0003012265579499126, "loss": 3.2184, "step": 85600 }, { "epoch": 24.927748020493713, "grad_norm": 0.4124232828617096, "learning_rate": 0.0003010518345952242, "loss": 3.2106, "step": 85650 }, { "epoch": 24.94230321378668, "grad_norm": 0.4270252585411072, "learning_rate": 0.0003008771112405358, "loss": 3.2169, "step": 85700 }, { "epoch": 24.956858407079647, "grad_norm": 0.4554911255836487, "learning_rate": 0.0003007023878858473, "loss": 3.2171, "step": 85750 }, { "epoch": 24.971413600372614, "grad_norm": 0.39927980303764343, "learning_rate": 0.000300527664531159, "loss": 3.2109, "step": 85800 }, { "epoch": 24.98596879366558, "grad_norm": 0.42868730425834656, "learning_rate": 0.00030035294117647057, "loss": 3.2091, "step": 85850 }, { "epoch": 25.00029110386586, "grad_norm": 0.41933250427246094, "learning_rate": 0.00030017821782178216, "loss": 3.2104, "step": 85900 }, { "epoch": 25.014846297158826, "grad_norm": 0.4270532429218292, "learning_rate": 0.00030000349446709376, "loss": 3.115, "step": 85950 }, { "epoch": 25.029401490451793, "grad_norm": 0.4229387640953064, "learning_rate": 0.00029982877111240535, "loss": 3.1186, "step": 86000 }, { "epoch": 25.029401490451793, "eval_accuracy": 0.37432811108586883, "eval_loss": 3.5472147464752197, "eval_runtime": 178.3516, "eval_samples_per_second": 93.361, "eval_steps_per_second": 5.837, "step": 86000 }, { "epoch": 25.04395668374476, "grad_norm": 0.4088560938835144, "learning_rate": 0.00029965404775771694, "loss": 3.1199, "step": 86050 }, { "epoch": 25.058511877037727, "grad_norm": 0.4071793556213379, "learning_rate": 0.0002994793244030285, "loss": 3.125, "step": 86100 }, { "epoch": 25.073067070330694, "grad_norm": 0.39798495173454285, "learning_rate": 0.00029930460104834013, "loss": 3.1447, "step": 86150 }, { "epoch": 25.08762226362366, "grad_norm": 0.40349072217941284, "learning_rate": 0.0002991298776936517, "loss": 3.1383, "step": 86200 }, { "epoch": 25.10217745691663, "grad_norm": 0.41893699765205383, "learning_rate": 0.00029895515433896327, "loss": 3.1349, "step": 86250 }, { "epoch": 25.116732650209595, "grad_norm": 0.42792829871177673, "learning_rate": 0.00029878043098427486, "loss": 3.1361, "step": 86300 }, { "epoch": 25.131287843502562, "grad_norm": 0.4104972183704376, "learning_rate": 0.00029860570762958646, "loss": 3.1457, "step": 86350 }, { "epoch": 25.14584303679553, "grad_norm": 0.4476071298122406, "learning_rate": 0.00029843098427489805, "loss": 3.1324, "step": 86400 }, { "epoch": 25.160398230088497, "grad_norm": 0.4160546064376831, "learning_rate": 0.00029825626092020964, "loss": 3.1495, "step": 86450 }, { "epoch": 25.174953423381464, "grad_norm": 0.40107226371765137, "learning_rate": 0.00029808153756552124, "loss": 3.1538, "step": 86500 }, { "epoch": 25.18950861667443, "grad_norm": 0.4060494899749756, "learning_rate": 0.00029790681421083283, "loss": 3.1613, "step": 86550 }, { "epoch": 25.204063809967398, "grad_norm": 0.4456344544887543, "learning_rate": 0.00029773209085614443, "loss": 3.1443, "step": 86600 }, { "epoch": 25.218619003260365, "grad_norm": 0.4477149248123169, "learning_rate": 0.00029755736750145597, "loss": 3.1595, "step": 86650 }, { "epoch": 25.233174196553332, "grad_norm": 0.43678659200668335, "learning_rate": 0.0002973826441467676, "loss": 3.1414, "step": 86700 }, { "epoch": 25.2477293898463, "grad_norm": 0.4599519670009613, "learning_rate": 0.00029720792079207916, "loss": 3.1618, "step": 86750 }, { "epoch": 25.262284583139262, "grad_norm": 0.44289496541023254, "learning_rate": 0.00029703319743739075, "loss": 3.1477, "step": 86800 }, { "epoch": 25.27683977643223, "grad_norm": 0.4248427152633667, "learning_rate": 0.00029685847408270234, "loss": 3.1539, "step": 86850 }, { "epoch": 25.291394969725197, "grad_norm": 0.444109171628952, "learning_rate": 0.00029668375072801394, "loss": 3.1548, "step": 86900 }, { "epoch": 25.305950163018164, "grad_norm": 0.4281579256057739, "learning_rate": 0.00029650902737332553, "loss": 3.1611, "step": 86950 }, { "epoch": 25.32050535631113, "grad_norm": 0.4330824017524719, "learning_rate": 0.00029633430401863713, "loss": 3.1519, "step": 87000 }, { "epoch": 25.32050535631113, "eval_accuracy": 0.37392134908183555, "eval_loss": 3.5476152896881104, "eval_runtime": 178.351, "eval_samples_per_second": 93.361, "eval_steps_per_second": 5.837, "step": 87000 }, { "epoch": 25.335060549604098, "grad_norm": 0.4235324561595917, "learning_rate": 0.0002961595806639487, "loss": 3.1792, "step": 87050 }, { "epoch": 25.349615742897065, "grad_norm": 0.4355413615703583, "learning_rate": 0.0002959848573092603, "loss": 3.1715, "step": 87100 }, { "epoch": 25.364170936190032, "grad_norm": 0.42077159881591797, "learning_rate": 0.0002958101339545719, "loss": 3.166, "step": 87150 }, { "epoch": 25.378726129483, "grad_norm": 0.441746324300766, "learning_rate": 0.0002956354105998835, "loss": 3.1764, "step": 87200 }, { "epoch": 25.393281322775966, "grad_norm": 0.45938950777053833, "learning_rate": 0.0002954606872451951, "loss": 3.1725, "step": 87250 }, { "epoch": 25.407836516068933, "grad_norm": 0.4866997301578522, "learning_rate": 0.0002952859638905067, "loss": 3.1598, "step": 87300 }, { "epoch": 25.4223917093619, "grad_norm": 0.42531818151474, "learning_rate": 0.00029511124053581823, "loss": 3.1785, "step": 87350 }, { "epoch": 25.436946902654867, "grad_norm": 0.43883636593818665, "learning_rate": 0.0002949365171811299, "loss": 3.1676, "step": 87400 }, { "epoch": 25.451502095947834, "grad_norm": 0.41908153891563416, "learning_rate": 0.0002947617938264414, "loss": 3.1758, "step": 87450 }, { "epoch": 25.4660572892408, "grad_norm": 0.40582430362701416, "learning_rate": 0.000294587070471753, "loss": 3.1913, "step": 87500 }, { "epoch": 25.48061248253377, "grad_norm": 0.4078127145767212, "learning_rate": 0.0002944123471170646, "loss": 3.1827, "step": 87550 }, { "epoch": 25.495167675826735, "grad_norm": 0.4438916742801666, "learning_rate": 0.0002942376237623762, "loss": 3.1789, "step": 87600 }, { "epoch": 25.509722869119702, "grad_norm": 0.434966117143631, "learning_rate": 0.0002940629004076878, "loss": 3.1688, "step": 87650 }, { "epoch": 25.52427806241267, "grad_norm": 0.4515625834465027, "learning_rate": 0.0002938881770529994, "loss": 3.1848, "step": 87700 }, { "epoch": 25.538833255705637, "grad_norm": 0.4303271770477295, "learning_rate": 0.000293713453698311, "loss": 3.1899, "step": 87750 }, { "epoch": 25.553388448998604, "grad_norm": 0.42675459384918213, "learning_rate": 0.0002935387303436226, "loss": 3.1713, "step": 87800 }, { "epoch": 25.56794364229157, "grad_norm": 0.4226889908313751, "learning_rate": 0.0002933640069889342, "loss": 3.1708, "step": 87850 }, { "epoch": 25.582498835584538, "grad_norm": 0.38473665714263916, "learning_rate": 0.0002931892836342457, "loss": 3.1994, "step": 87900 }, { "epoch": 25.597054028877505, "grad_norm": 0.41067373752593994, "learning_rate": 0.00029301456027955736, "loss": 3.1912, "step": 87950 }, { "epoch": 25.611609222170472, "grad_norm": 0.4160154163837433, "learning_rate": 0.0002928398369248689, "loss": 3.1872, "step": 88000 }, { "epoch": 25.611609222170472, "eval_accuracy": 0.37471971206750393, "eval_loss": 3.5368266105651855, "eval_runtime": 178.2354, "eval_samples_per_second": 93.421, "eval_steps_per_second": 5.841, "step": 88000 }, { "epoch": 25.62616441546344, "grad_norm": 0.42414504289627075, "learning_rate": 0.0002926651135701805, "loss": 3.1878, "step": 88050 }, { "epoch": 25.640719608756406, "grad_norm": 0.4490009844303131, "learning_rate": 0.00029249039021549215, "loss": 3.1933, "step": 88100 }, { "epoch": 25.65527480204937, "grad_norm": 0.4567476212978363, "learning_rate": 0.0002923156668608037, "loss": 3.1887, "step": 88150 }, { "epoch": 25.669829995342337, "grad_norm": 0.4161531925201416, "learning_rate": 0.0002921409435061153, "loss": 3.1952, "step": 88200 }, { "epoch": 25.684385188635304, "grad_norm": 0.42233118414878845, "learning_rate": 0.0002919662201514269, "loss": 3.1985, "step": 88250 }, { "epoch": 25.69894038192827, "grad_norm": 0.4406331777572632, "learning_rate": 0.00029179149679673847, "loss": 3.1852, "step": 88300 }, { "epoch": 25.713495575221238, "grad_norm": 0.4222116768360138, "learning_rate": 0.00029161677344205007, "loss": 3.1803, "step": 88350 }, { "epoch": 25.728050768514205, "grad_norm": 0.42603495717048645, "learning_rate": 0.00029144205008736166, "loss": 3.1839, "step": 88400 }, { "epoch": 25.742605961807172, "grad_norm": 0.4418340027332306, "learning_rate": 0.00029126732673267325, "loss": 3.1956, "step": 88450 }, { "epoch": 25.75716115510014, "grad_norm": 0.4185956120491028, "learning_rate": 0.00029109260337798485, "loss": 3.1888, "step": 88500 }, { "epoch": 25.771716348393106, "grad_norm": 0.4348103702068329, "learning_rate": 0.00029091788002329644, "loss": 3.2036, "step": 88550 }, { "epoch": 25.786271541686073, "grad_norm": 0.4157055616378784, "learning_rate": 0.000290743156668608, "loss": 3.1883, "step": 88600 }, { "epoch": 25.80082673497904, "grad_norm": 0.4167759120464325, "learning_rate": 0.00029056843331391963, "loss": 3.2008, "step": 88650 }, { "epoch": 25.815381928272007, "grad_norm": 0.45029062032699585, "learning_rate": 0.00029039370995923117, "loss": 3.1898, "step": 88700 }, { "epoch": 25.829937121564974, "grad_norm": 0.43252915143966675, "learning_rate": 0.00029021898660454277, "loss": 3.1977, "step": 88750 }, { "epoch": 25.84449231485794, "grad_norm": 0.40199464559555054, "learning_rate": 0.00029004426324985436, "loss": 3.1963, "step": 88800 }, { "epoch": 25.85904750815091, "grad_norm": 0.4148293137550354, "learning_rate": 0.00028986953989516595, "loss": 3.2018, "step": 88850 }, { "epoch": 25.873602701443875, "grad_norm": 0.4121401011943817, "learning_rate": 0.00028969481654047755, "loss": 3.2052, "step": 88900 }, { "epoch": 25.888157894736842, "grad_norm": 0.4078361690044403, "learning_rate": 0.00028952009318578914, "loss": 3.2041, "step": 88950 }, { "epoch": 25.90271308802981, "grad_norm": 0.4040093719959259, "learning_rate": 0.00028934536983110074, "loss": 3.1996, "step": 89000 }, { "epoch": 25.90271308802981, "eval_accuracy": 0.3751646704457963, "eval_loss": 3.5331084728240967, "eval_runtime": 179.0455, "eval_samples_per_second": 92.999, "eval_steps_per_second": 5.814, "step": 89000 }, { "epoch": 25.917268281322777, "grad_norm": 0.4156050682067871, "learning_rate": 0.00028917064647641233, "loss": 3.2138, "step": 89050 }, { "epoch": 25.931823474615744, "grad_norm": 0.41610783338546753, "learning_rate": 0.0002889959231217239, "loss": 3.2033, "step": 89100 }, { "epoch": 25.94637866790871, "grad_norm": 0.44033291935920715, "learning_rate": 0.0002888211997670355, "loss": 3.1947, "step": 89150 }, { "epoch": 25.960933861201678, "grad_norm": 0.418897420167923, "learning_rate": 0.0002886464764123471, "loss": 3.2122, "step": 89200 }, { "epoch": 25.975489054494645, "grad_norm": 0.44611796736717224, "learning_rate": 0.0002884717530576587, "loss": 3.2048, "step": 89250 }, { "epoch": 25.990044247787612, "grad_norm": 0.4740305244922638, "learning_rate": 0.00028829702970297025, "loss": 3.2059, "step": 89300 }, { "epoch": 26.00436655798789, "grad_norm": 0.45718663930892944, "learning_rate": 0.0002881223063482819, "loss": 3.1797, "step": 89350 }, { "epoch": 26.018921751280857, "grad_norm": 0.4130460023880005, "learning_rate": 0.00028794758299359344, "loss": 3.0988, "step": 89400 }, { "epoch": 26.033476944573824, "grad_norm": 0.4237079918384552, "learning_rate": 0.00028777285963890503, "loss": 3.1181, "step": 89450 }, { "epoch": 26.04803213786679, "grad_norm": 0.4581432640552521, "learning_rate": 0.0002875981362842166, "loss": 3.1109, "step": 89500 }, { "epoch": 26.062587331159758, "grad_norm": 0.43361377716064453, "learning_rate": 0.0002874234129295282, "loss": 3.1042, "step": 89550 }, { "epoch": 26.077142524452725, "grad_norm": 0.44345295429229736, "learning_rate": 0.0002872486895748398, "loss": 3.118, "step": 89600 }, { "epoch": 26.091697717745692, "grad_norm": 0.42599573731422424, "learning_rate": 0.0002870739662201514, "loss": 3.1229, "step": 89650 }, { "epoch": 26.10625291103866, "grad_norm": 0.43545016646385193, "learning_rate": 0.000286899242865463, "loss": 3.119, "step": 89700 }, { "epoch": 26.120808104331626, "grad_norm": 0.4526759088039398, "learning_rate": 0.0002867245195107746, "loss": 3.127, "step": 89750 }, { "epoch": 26.135363297624593, "grad_norm": 0.49524712562561035, "learning_rate": 0.0002865497961560862, "loss": 3.1355, "step": 89800 }, { "epoch": 26.14991849091756, "grad_norm": 0.4363962411880493, "learning_rate": 0.00028637507280139773, "loss": 3.1345, "step": 89850 }, { "epoch": 26.164473684210527, "grad_norm": 0.3986723721027374, "learning_rate": 0.0002862003494467094, "loss": 3.1299, "step": 89900 }, { "epoch": 26.179028877503494, "grad_norm": 0.4242367744445801, "learning_rate": 0.0002860256260920209, "loss": 3.1364, "step": 89950 }, { "epoch": 26.19358407079646, "grad_norm": 0.4348607063293457, "learning_rate": 0.0002858509027373325, "loss": 3.1469, "step": 90000 }, { "epoch": 26.19358407079646, "eval_accuracy": 0.37410304629600355, "eval_loss": 3.551170825958252, "eval_runtime": 178.2776, "eval_samples_per_second": 93.399, "eval_steps_per_second": 5.839, "step": 90000 }, { "epoch": 26.20813926408943, "grad_norm": 0.41987913846969604, "learning_rate": 0.0002856761793826441, "loss": 3.1454, "step": 90050 }, { "epoch": 26.222694457382396, "grad_norm": 0.46335843205451965, "learning_rate": 0.0002855014560279557, "loss": 3.1297, "step": 90100 }, { "epoch": 26.23724965067536, "grad_norm": 0.40075039863586426, "learning_rate": 0.0002853267326732673, "loss": 3.1437, "step": 90150 }, { "epoch": 26.251804843968326, "grad_norm": 0.4587453305721283, "learning_rate": 0.0002851520093185789, "loss": 3.1418, "step": 90200 }, { "epoch": 26.266360037261293, "grad_norm": 0.4216047525405884, "learning_rate": 0.0002849772859638905, "loss": 3.1384, "step": 90250 }, { "epoch": 26.28091523055426, "grad_norm": 0.4243725538253784, "learning_rate": 0.0002848025626092021, "loss": 3.1424, "step": 90300 }, { "epoch": 26.295470423847227, "grad_norm": 0.4510405361652374, "learning_rate": 0.0002846278392545137, "loss": 3.151, "step": 90350 }, { "epoch": 26.310025617140194, "grad_norm": 0.41723543405532837, "learning_rate": 0.00028445311589982527, "loss": 3.1602, "step": 90400 }, { "epoch": 26.32458081043316, "grad_norm": 0.45388859510421753, "learning_rate": 0.00028427839254513686, "loss": 3.1465, "step": 90450 }, { "epoch": 26.33913600372613, "grad_norm": 0.4335187077522278, "learning_rate": 0.00028410366919044846, "loss": 3.1518, "step": 90500 }, { "epoch": 26.353691197019096, "grad_norm": 0.4376620054244995, "learning_rate": 0.00028392894583576, "loss": 3.1563, "step": 90550 }, { "epoch": 26.368246390312063, "grad_norm": 0.4251609444618225, "learning_rate": 0.00028375422248107165, "loss": 3.1635, "step": 90600 }, { "epoch": 26.38280158360503, "grad_norm": 0.44332945346832275, "learning_rate": 0.0002835794991263832, "loss": 3.1695, "step": 90650 }, { "epoch": 26.397356776897997, "grad_norm": 0.4631660580635071, "learning_rate": 0.0002834047757716948, "loss": 3.1736, "step": 90700 }, { "epoch": 26.411911970190964, "grad_norm": 0.4380042254924774, "learning_rate": 0.0002832300524170064, "loss": 3.1525, "step": 90750 }, { "epoch": 26.42646716348393, "grad_norm": 0.4693313241004944, "learning_rate": 0.00028305532906231797, "loss": 3.1753, "step": 90800 }, { "epoch": 26.441022356776898, "grad_norm": 0.4185572862625122, "learning_rate": 0.00028288060570762956, "loss": 3.1713, "step": 90850 }, { "epoch": 26.455577550069865, "grad_norm": 0.43091192841529846, "learning_rate": 0.00028270588235294116, "loss": 3.1675, "step": 90900 }, { "epoch": 26.470132743362832, "grad_norm": 0.4310244917869568, "learning_rate": 0.00028253115899825275, "loss": 3.173, "step": 90950 }, { "epoch": 26.4846879366558, "grad_norm": 0.47923094034194946, "learning_rate": 0.0002823564356435643, "loss": 3.1706, "step": 91000 }, { "epoch": 26.4846879366558, "eval_accuracy": 0.37461993138520855, "eval_loss": 3.540400743484497, "eval_runtime": 178.6187, "eval_samples_per_second": 93.221, "eval_steps_per_second": 5.828, "step": 91000 }, { "epoch": 26.499243129948766, "grad_norm": 0.441110759973526, "learning_rate": 0.00028218171228887594, "loss": 3.1661, "step": 91050 }, { "epoch": 26.513798323241733, "grad_norm": 0.42260661721229553, "learning_rate": 0.0002820069889341875, "loss": 3.1693, "step": 91100 }, { "epoch": 26.5283535165347, "grad_norm": 0.4326378405094147, "learning_rate": 0.00028183226557949913, "loss": 3.1718, "step": 91150 }, { "epoch": 26.542908709827667, "grad_norm": 0.4197952449321747, "learning_rate": 0.00028165754222481067, "loss": 3.1747, "step": 91200 }, { "epoch": 26.557463903120635, "grad_norm": 0.40971311926841736, "learning_rate": 0.00028148281887012226, "loss": 3.1862, "step": 91250 }, { "epoch": 26.5720190964136, "grad_norm": 0.4447433650493622, "learning_rate": 0.0002813080955154339, "loss": 3.1756, "step": 91300 }, { "epoch": 26.58657428970657, "grad_norm": 0.45010611414909363, "learning_rate": 0.00028113337216074545, "loss": 3.1645, "step": 91350 }, { "epoch": 26.601129482999536, "grad_norm": 0.43731409311294556, "learning_rate": 0.00028095864880605705, "loss": 3.1739, "step": 91400 }, { "epoch": 26.615684676292503, "grad_norm": 0.4269058406352997, "learning_rate": 0.00028078392545136864, "loss": 3.168, "step": 91450 }, { "epoch": 26.630239869585466, "grad_norm": 0.44210341572761536, "learning_rate": 0.00028060920209668023, "loss": 3.1818, "step": 91500 }, { "epoch": 26.644795062878433, "grad_norm": 0.4212300479412079, "learning_rate": 0.00028043447874199183, "loss": 3.1798, "step": 91550 }, { "epoch": 26.6593502561714, "grad_norm": 0.46399936079978943, "learning_rate": 0.0002802597553873034, "loss": 3.1813, "step": 91600 }, { "epoch": 26.673905449464367, "grad_norm": 0.4186314642429352, "learning_rate": 0.000280085032032615, "loss": 3.1856, "step": 91650 }, { "epoch": 26.688460642757335, "grad_norm": 0.46720054745674133, "learning_rate": 0.00027991030867792656, "loss": 3.1731, "step": 91700 }, { "epoch": 26.7030158360503, "grad_norm": 0.4485943019390106, "learning_rate": 0.0002797355853232382, "loss": 3.1747, "step": 91750 }, { "epoch": 26.71757102934327, "grad_norm": 0.4307052791118622, "learning_rate": 0.00027956086196854975, "loss": 3.1887, "step": 91800 }, { "epoch": 26.732126222636236, "grad_norm": 0.43720656633377075, "learning_rate": 0.0002793861386138614, "loss": 3.1917, "step": 91850 }, { "epoch": 26.746681415929203, "grad_norm": 0.41725441813468933, "learning_rate": 0.00027921141525917293, "loss": 3.1804, "step": 91900 }, { "epoch": 26.76123660922217, "grad_norm": 0.43516477942466736, "learning_rate": 0.00027903669190448453, "loss": 3.1857, "step": 91950 }, { "epoch": 26.775791802515137, "grad_norm": 0.4388038218021393, "learning_rate": 0.0002788619685497961, "loss": 3.1898, "step": 92000 }, { "epoch": 26.775791802515137, "eval_accuracy": 0.37509356642602165, "eval_loss": 3.535130262374878, "eval_runtime": 178.6138, "eval_samples_per_second": 93.223, "eval_steps_per_second": 5.828, "step": 92000 }, { "epoch": 26.790346995808104, "grad_norm": 0.4371972382068634, "learning_rate": 0.0002786872451951077, "loss": 3.195, "step": 92050 }, { "epoch": 26.80490218910107, "grad_norm": 0.43415766954421997, "learning_rate": 0.0002785125218404193, "loss": 3.196, "step": 92100 }, { "epoch": 26.819457382394038, "grad_norm": 0.4587138593196869, "learning_rate": 0.0002783377984857309, "loss": 3.2012, "step": 92150 }, { "epoch": 26.834012575687005, "grad_norm": 0.4176168739795685, "learning_rate": 0.0002781630751310425, "loss": 3.194, "step": 92200 }, { "epoch": 26.848567768979972, "grad_norm": 0.45048636198043823, "learning_rate": 0.0002779883517763541, "loss": 3.2002, "step": 92250 }, { "epoch": 26.86312296227294, "grad_norm": 0.4477289319038391, "learning_rate": 0.0002778136284216657, "loss": 3.1966, "step": 92300 }, { "epoch": 26.877678155565906, "grad_norm": 0.46485674381256104, "learning_rate": 0.0002776389050669773, "loss": 3.2013, "step": 92350 }, { "epoch": 26.892233348858873, "grad_norm": 0.4228600263595581, "learning_rate": 0.0002774641817122888, "loss": 3.1885, "step": 92400 }, { "epoch": 26.90678854215184, "grad_norm": 0.4267354905605316, "learning_rate": 0.00027728945835760047, "loss": 3.1902, "step": 92450 }, { "epoch": 26.921343735444808, "grad_norm": 0.44875380396842957, "learning_rate": 0.000277114735002912, "loss": 3.1996, "step": 92500 }, { "epoch": 26.935898928737775, "grad_norm": 0.45626726746559143, "learning_rate": 0.00027694001164822366, "loss": 3.1861, "step": 92550 }, { "epoch": 26.95045412203074, "grad_norm": 0.4311225414276123, "learning_rate": 0.0002767652882935352, "loss": 3.1912, "step": 92600 }, { "epoch": 26.96500931532371, "grad_norm": 0.4225662350654602, "learning_rate": 0.0002765905649388468, "loss": 3.2042, "step": 92650 }, { "epoch": 26.979564508616676, "grad_norm": 0.4945019781589508, "learning_rate": 0.0002764158415841584, "loss": 3.1968, "step": 92700 }, { "epoch": 26.994119701909643, "grad_norm": 0.4463379383087158, "learning_rate": 0.00027624111822947, "loss": 3.2029, "step": 92750 }, { "epoch": 27.00844201210992, "grad_norm": 0.44404837489128113, "learning_rate": 0.0002760663948747816, "loss": 3.1376, "step": 92800 }, { "epoch": 27.022997205402888, "grad_norm": 0.4302866756916046, "learning_rate": 0.00027589167152009317, "loss": 3.1001, "step": 92850 }, { "epoch": 27.037552398695855, "grad_norm": 0.43204373121261597, "learning_rate": 0.00027571694816540477, "loss": 3.0887, "step": 92900 }, { "epoch": 27.052107591988822, "grad_norm": 0.4462272822856903, "learning_rate": 0.0002755422248107163, "loss": 3.1157, "step": 92950 }, { "epoch": 27.06666278528179, "grad_norm": 0.4375362694263458, "learning_rate": 0.00027536750145602795, "loss": 3.0979, "step": 93000 }, { "epoch": 27.06666278528179, "eval_accuracy": 0.3744526900296063, "eval_loss": 3.549834728240967, "eval_runtime": 178.5628, "eval_samples_per_second": 93.25, "eval_steps_per_second": 5.83, "step": 93000 }, { "epoch": 27.081217978574756, "grad_norm": 0.43857455253601074, "learning_rate": 0.0002751927781013395, "loss": 3.1114, "step": 93050 }, { "epoch": 27.095773171867723, "grad_norm": 0.44870609045028687, "learning_rate": 0.0002750180547466511, "loss": 3.111, "step": 93100 }, { "epoch": 27.11032836516069, "grad_norm": 0.45709872245788574, "learning_rate": 0.0002748433313919627, "loss": 3.1303, "step": 93150 }, { "epoch": 27.124883558453657, "grad_norm": 0.4482116401195526, "learning_rate": 0.0002746686080372743, "loss": 3.1263, "step": 93200 }, { "epoch": 27.139438751746624, "grad_norm": 0.4261331558227539, "learning_rate": 0.00027449388468258587, "loss": 3.1311, "step": 93250 }, { "epoch": 27.15399394503959, "grad_norm": 0.458110511302948, "learning_rate": 0.00027431916132789747, "loss": 3.1178, "step": 93300 }, { "epoch": 27.16854913833256, "grad_norm": 0.44779232144355774, "learning_rate": 0.00027414443797320906, "loss": 3.1323, "step": 93350 }, { "epoch": 27.183104331625525, "grad_norm": 0.4548485279083252, "learning_rate": 0.00027396971461852065, "loss": 3.1276, "step": 93400 }, { "epoch": 27.197659524918492, "grad_norm": 0.41539332270622253, "learning_rate": 0.00027379499126383225, "loss": 3.1344, "step": 93450 }, { "epoch": 27.21221471821146, "grad_norm": 0.42336076498031616, "learning_rate": 0.00027362026790914384, "loss": 3.128, "step": 93500 }, { "epoch": 27.226769911504423, "grad_norm": 0.43021145462989807, "learning_rate": 0.00027344554455445544, "loss": 3.1322, "step": 93550 }, { "epoch": 27.24132510479739, "grad_norm": 0.4417980909347534, "learning_rate": 0.00027327082119976703, "loss": 3.136, "step": 93600 }, { "epoch": 27.255880298090357, "grad_norm": 0.4387443959712982, "learning_rate": 0.00027309609784507857, "loss": 3.1394, "step": 93650 }, { "epoch": 27.270435491383324, "grad_norm": 0.4308546781539917, "learning_rate": 0.0002729213744903902, "loss": 3.1508, "step": 93700 }, { "epoch": 27.28499068467629, "grad_norm": 0.47179627418518066, "learning_rate": 0.00027274665113570176, "loss": 3.1405, "step": 93750 }, { "epoch": 27.29954587796926, "grad_norm": 0.4395540952682495, "learning_rate": 0.00027257192778101335, "loss": 3.1434, "step": 93800 }, { "epoch": 27.314101071262225, "grad_norm": 0.43695396184921265, "learning_rate": 0.00027239720442632495, "loss": 3.1465, "step": 93850 }, { "epoch": 27.328656264555192, "grad_norm": 0.44696319103240967, "learning_rate": 0.00027222248107163654, "loss": 3.1419, "step": 93900 }, { "epoch": 27.34321145784816, "grad_norm": 0.42955282330513, "learning_rate": 0.00027204775771694814, "loss": 3.1439, "step": 93950 }, { "epoch": 27.357766651141127, "grad_norm": 0.4387293756008148, "learning_rate": 0.00027187303436225973, "loss": 3.1498, "step": 94000 }, { "epoch": 27.357766651141127, "eval_accuracy": 0.3745280250323758, "eval_loss": 3.546278715133667, "eval_runtime": 178.6233, "eval_samples_per_second": 93.219, "eval_steps_per_second": 5.828, "step": 94000 }, { "epoch": 27.372321844434094, "grad_norm": 0.44071051478385925, "learning_rate": 0.0002716983110075713, "loss": 3.1543, "step": 94050 }, { "epoch": 27.38687703772706, "grad_norm": 0.47081178426742554, "learning_rate": 0.0002715235876528829, "loss": 3.1525, "step": 94100 }, { "epoch": 27.401432231020028, "grad_norm": 0.4589060842990875, "learning_rate": 0.0002713488642981945, "loss": 3.1552, "step": 94150 }, { "epoch": 27.415987424312995, "grad_norm": 0.4278987646102905, "learning_rate": 0.00027117414094350606, "loss": 3.1519, "step": 94200 }, { "epoch": 27.430542617605962, "grad_norm": 0.4573371410369873, "learning_rate": 0.0002709994175888177, "loss": 3.1656, "step": 94250 }, { "epoch": 27.44509781089893, "grad_norm": 0.4469725787639618, "learning_rate": 0.00027082469423412924, "loss": 3.1553, "step": 94300 }, { "epoch": 27.459653004191896, "grad_norm": 0.4670809209346771, "learning_rate": 0.00027064997087944084, "loss": 3.1731, "step": 94350 }, { "epoch": 27.474208197484863, "grad_norm": 0.44537362456321716, "learning_rate": 0.00027047524752475243, "loss": 3.1667, "step": 94400 }, { "epoch": 27.48876339077783, "grad_norm": 0.44716644287109375, "learning_rate": 0.000270300524170064, "loss": 3.1568, "step": 94450 }, { "epoch": 27.503318584070797, "grad_norm": 0.44701218605041504, "learning_rate": 0.0002701258008153757, "loss": 3.1721, "step": 94500 }, { "epoch": 27.517873777363764, "grad_norm": 0.42892196774482727, "learning_rate": 0.0002699510774606872, "loss": 3.1678, "step": 94550 }, { "epoch": 27.53242897065673, "grad_norm": 0.41372281312942505, "learning_rate": 0.0002697763541059988, "loss": 3.165, "step": 94600 }, { "epoch": 27.5469841639497, "grad_norm": 0.41405484080314636, "learning_rate": 0.0002696016307513104, "loss": 3.1605, "step": 94650 }, { "epoch": 27.561539357242665, "grad_norm": 0.41708678007125854, "learning_rate": 0.000269426907396622, "loss": 3.163, "step": 94700 }, { "epoch": 27.576094550535633, "grad_norm": 0.47939321398735046, "learning_rate": 0.0002692521840419336, "loss": 3.1722, "step": 94750 }, { "epoch": 27.5906497438286, "grad_norm": 0.48287197947502136, "learning_rate": 0.0002690774606872452, "loss": 3.1707, "step": 94800 }, { "epoch": 27.605204937121567, "grad_norm": 0.44868147373199463, "learning_rate": 0.0002689027373325568, "loss": 3.1646, "step": 94850 }, { "epoch": 27.619760130414534, "grad_norm": 0.4690980911254883, "learning_rate": 0.0002687280139778683, "loss": 3.1652, "step": 94900 }, { "epoch": 27.634315323707497, "grad_norm": 0.4309838116168976, "learning_rate": 0.00026855329062317997, "loss": 3.1583, "step": 94950 }, { "epoch": 27.648870517000464, "grad_norm": 0.4600057303905487, "learning_rate": 0.0002683785672684915, "loss": 3.1628, "step": 95000 }, { "epoch": 27.648870517000464, "eval_accuracy": 0.3747204172313364, "eval_loss": 3.541227340698242, "eval_runtime": 178.7188, "eval_samples_per_second": 93.169, "eval_steps_per_second": 5.825, "step": 95000 }, { "epoch": 27.66342571029343, "grad_norm": 0.43608319759368896, "learning_rate": 0.0002682038439138031, "loss": 3.1812, "step": 95050 }, { "epoch": 27.6779809035864, "grad_norm": 0.42355069518089294, "learning_rate": 0.0002680291205591147, "loss": 3.1667, "step": 95100 }, { "epoch": 27.692536096879365, "grad_norm": 0.4440571367740631, "learning_rate": 0.0002678543972044263, "loss": 3.1653, "step": 95150 }, { "epoch": 27.707091290172333, "grad_norm": 0.4445669651031494, "learning_rate": 0.0002676796738497379, "loss": 3.1585, "step": 95200 }, { "epoch": 27.7216464834653, "grad_norm": 0.4321427643299103, "learning_rate": 0.0002675049504950495, "loss": 3.1695, "step": 95250 }, { "epoch": 27.736201676758267, "grad_norm": 0.4369765818119049, "learning_rate": 0.0002673302271403611, "loss": 3.1718, "step": 95300 }, { "epoch": 27.750756870051234, "grad_norm": 0.44305044412612915, "learning_rate": 0.00026715550378567267, "loss": 3.1771, "step": 95350 }, { "epoch": 27.7653120633442, "grad_norm": 0.45802849531173706, "learning_rate": 0.00026698078043098426, "loss": 3.1761, "step": 95400 }, { "epoch": 27.779867256637168, "grad_norm": 0.4291384816169739, "learning_rate": 0.00026680605707629586, "loss": 3.1811, "step": 95450 }, { "epoch": 27.794422449930135, "grad_norm": 0.45684850215911865, "learning_rate": 0.00026663133372160745, "loss": 3.1791, "step": 95500 }, { "epoch": 27.808977643223102, "grad_norm": 0.4409759044647217, "learning_rate": 0.00026645661036691905, "loss": 3.1773, "step": 95550 }, { "epoch": 27.82353283651607, "grad_norm": 0.41172921657562256, "learning_rate": 0.0002662818870122306, "loss": 3.1913, "step": 95600 }, { "epoch": 27.838088029809036, "grad_norm": 0.462798148393631, "learning_rate": 0.00026610716365754224, "loss": 3.1757, "step": 95650 }, { "epoch": 27.852643223102003, "grad_norm": 0.4223795533180237, "learning_rate": 0.0002659324403028538, "loss": 3.1809, "step": 95700 }, { "epoch": 27.86719841639497, "grad_norm": 0.44442063570022583, "learning_rate": 0.00026575771694816537, "loss": 3.1827, "step": 95750 }, { "epoch": 27.881753609687937, "grad_norm": 0.4229409098625183, "learning_rate": 0.00026558299359347696, "loss": 3.1835, "step": 95800 }, { "epoch": 27.896308802980904, "grad_norm": 0.4268472492694855, "learning_rate": 0.00026540827023878856, "loss": 3.1975, "step": 95850 }, { "epoch": 27.91086399627387, "grad_norm": 0.44019851088523865, "learning_rate": 0.00026523354688410015, "loss": 3.1697, "step": 95900 }, { "epoch": 27.92541918956684, "grad_norm": 0.42980363965034485, "learning_rate": 0.00026505882352941175, "loss": 3.1859, "step": 95950 }, { "epoch": 27.939974382859806, "grad_norm": 0.453268438577652, "learning_rate": 0.00026488410017472334, "loss": 3.1913, "step": 96000 }, { "epoch": 27.939974382859806, "eval_accuracy": 0.3754902210817895, "eval_loss": 3.5291364192962646, "eval_runtime": 180.3135, "eval_samples_per_second": 92.345, "eval_steps_per_second": 5.773, "step": 96000 }, { "epoch": 27.954529576152773, "grad_norm": 0.4899916648864746, "learning_rate": 0.00026470937682003494, "loss": 3.2026, "step": 96050 }, { "epoch": 27.96908476944574, "grad_norm": 0.43684250116348267, "learning_rate": 0.00026453465346534653, "loss": 3.1894, "step": 96100 }, { "epoch": 27.983639962738707, "grad_norm": 0.45027562975883484, "learning_rate": 0.00026435993011065807, "loss": 3.1861, "step": 96150 }, { "epoch": 27.998195156031674, "grad_norm": 0.426952987909317, "learning_rate": 0.0002641852067559697, "loss": 3.1927, "step": 96200 }, { "epoch": 28.01251746623195, "grad_norm": 0.4302879273891449, "learning_rate": 0.00026401048340128126, "loss": 3.1153, "step": 96250 }, { "epoch": 28.02707265952492, "grad_norm": 0.4396148920059204, "learning_rate": 0.00026383576004659285, "loss": 3.0919, "step": 96300 }, { "epoch": 28.041627852817886, "grad_norm": 0.4662112593650818, "learning_rate": 0.00026366103669190445, "loss": 3.0964, "step": 96350 }, { "epoch": 28.056183046110853, "grad_norm": 0.44618555903434753, "learning_rate": 0.00026348631333721604, "loss": 3.1003, "step": 96400 }, { "epoch": 28.07073823940382, "grad_norm": 0.44887861609458923, "learning_rate": 0.00026331158998252764, "loss": 3.1172, "step": 96450 }, { "epoch": 28.085293432696787, "grad_norm": 0.43568849563598633, "learning_rate": 0.00026313686662783923, "loss": 3.1006, "step": 96500 }, { "epoch": 28.099848625989754, "grad_norm": 0.44593697786331177, "learning_rate": 0.0002629621432731508, "loss": 3.1288, "step": 96550 }, { "epoch": 28.11440381928272, "grad_norm": 0.4617462754249573, "learning_rate": 0.0002627874199184624, "loss": 3.1096, "step": 96600 }, { "epoch": 28.128959012575688, "grad_norm": 0.4409908950328827, "learning_rate": 0.000262612696563774, "loss": 3.1005, "step": 96650 }, { "epoch": 28.143514205868655, "grad_norm": 0.457353800535202, "learning_rate": 0.0002624379732090856, "loss": 3.1142, "step": 96700 }, { "epoch": 28.158069399161622, "grad_norm": 0.4217976927757263, "learning_rate": 0.0002622632498543972, "loss": 3.1036, "step": 96750 }, { "epoch": 28.17262459245459, "grad_norm": 0.4667282998561859, "learning_rate": 0.0002620885264997088, "loss": 3.1277, "step": 96800 }, { "epoch": 28.187179785747556, "grad_norm": 0.4643479883670807, "learning_rate": 0.00026191380314502034, "loss": 3.1176, "step": 96850 }, { "epoch": 28.201734979040523, "grad_norm": 0.45740798115730286, "learning_rate": 0.000261739079790332, "loss": 3.1243, "step": 96900 }, { "epoch": 28.216290172333487, "grad_norm": 0.4555386006832123, "learning_rate": 0.0002615643564356435, "loss": 3.1106, "step": 96950 }, { "epoch": 28.230845365626454, "grad_norm": 0.44366058707237244, "learning_rate": 0.0002613896330809551, "loss": 3.1256, "step": 97000 }, { "epoch": 28.230845365626454, "eval_accuracy": 0.37466541445240326, "eval_loss": 3.5540220737457275, "eval_runtime": 180.359, "eval_samples_per_second": 92.321, "eval_steps_per_second": 5.772, "step": 97000 }, { "epoch": 28.24540055891942, "grad_norm": 0.4390740394592285, "learning_rate": 0.0002612149097262667, "loss": 3.1308, "step": 97050 }, { "epoch": 28.259955752212388, "grad_norm": 0.4719174802303314, "learning_rate": 0.0002610401863715783, "loss": 3.1291, "step": 97100 }, { "epoch": 28.274510945505355, "grad_norm": 0.43631115555763245, "learning_rate": 0.0002608654630168899, "loss": 3.1326, "step": 97150 }, { "epoch": 28.289066138798322, "grad_norm": 0.46663373708724976, "learning_rate": 0.0002606907396622015, "loss": 3.1393, "step": 97200 }, { "epoch": 28.30362133209129, "grad_norm": 0.45384928584098816, "learning_rate": 0.0002605160163075131, "loss": 3.1296, "step": 97250 }, { "epoch": 28.318176525384256, "grad_norm": 0.4470818042755127, "learning_rate": 0.0002603412929528247, "loss": 3.1334, "step": 97300 }, { "epoch": 28.332731718677223, "grad_norm": 0.42972907423973083, "learning_rate": 0.0002601665695981363, "loss": 3.1433, "step": 97350 }, { "epoch": 28.34728691197019, "grad_norm": 0.45368412137031555, "learning_rate": 0.0002599918462434478, "loss": 3.1349, "step": 97400 }, { "epoch": 28.361842105263158, "grad_norm": 0.43361520767211914, "learning_rate": 0.00025981712288875947, "loss": 3.1483, "step": 97450 }, { "epoch": 28.376397298556125, "grad_norm": 0.43730029463768005, "learning_rate": 0.000259642399534071, "loss": 3.1585, "step": 97500 }, { "epoch": 28.39095249184909, "grad_norm": 0.45435991883277893, "learning_rate": 0.0002594676761793826, "loss": 3.1422, "step": 97550 }, { "epoch": 28.40550768514206, "grad_norm": 0.4923446476459503, "learning_rate": 0.0002592929528246942, "loss": 3.129, "step": 97600 }, { "epoch": 28.420062878435026, "grad_norm": 0.4437585175037384, "learning_rate": 0.0002591182294700058, "loss": 3.1515, "step": 97650 }, { "epoch": 28.434618071727993, "grad_norm": 0.5055341720581055, "learning_rate": 0.0002589435061153174, "loss": 3.1588, "step": 97700 }, { "epoch": 28.44917326502096, "grad_norm": 0.44801563024520874, "learning_rate": 0.000258768782760629, "loss": 3.1522, "step": 97750 }, { "epoch": 28.463728458313927, "grad_norm": 0.46998879313468933, "learning_rate": 0.0002585940594059406, "loss": 3.1494, "step": 97800 }, { "epoch": 28.478283651606894, "grad_norm": 0.4956221282482147, "learning_rate": 0.00025841933605125217, "loss": 3.1632, "step": 97850 }, { "epoch": 28.49283884489986, "grad_norm": 0.44266363978385925, "learning_rate": 0.00025824461269656376, "loss": 3.1472, "step": 97900 }, { "epoch": 28.507394038192828, "grad_norm": 0.4530094265937805, "learning_rate": 0.00025806988934187536, "loss": 3.1572, "step": 97950 }, { "epoch": 28.521949231485795, "grad_norm": 0.4416975975036621, "learning_rate": 0.00025789516598718695, "loss": 3.1429, "step": 98000 }, { "epoch": 28.521949231485795, "eval_accuracy": 0.3748948277525688, "eval_loss": 3.5408880710601807, "eval_runtime": 180.426, "eval_samples_per_second": 92.287, "eval_steps_per_second": 5.77, "step": 98000 }, { "epoch": 28.536504424778762, "grad_norm": 0.41299423575401306, "learning_rate": 0.00025772044263249854, "loss": 3.159, "step": 98050 }, { "epoch": 28.55105961807173, "grad_norm": 0.46825602650642395, "learning_rate": 0.0002575457192778101, "loss": 3.1569, "step": 98100 }, { "epoch": 28.565614811364696, "grad_norm": 0.43501028418540955, "learning_rate": 0.00025737099592312173, "loss": 3.1601, "step": 98150 }, { "epoch": 28.580170004657663, "grad_norm": 0.4388478100299835, "learning_rate": 0.0002571962725684333, "loss": 3.148, "step": 98200 }, { "epoch": 28.59472519795063, "grad_norm": 0.4357254207134247, "learning_rate": 0.00025702154921374487, "loss": 3.1506, "step": 98250 }, { "epoch": 28.609280391243594, "grad_norm": 0.4324454963207245, "learning_rate": 0.00025684682585905646, "loss": 3.1666, "step": 98300 }, { "epoch": 28.62383558453656, "grad_norm": 0.41808149218559265, "learning_rate": 0.00025667210250436806, "loss": 3.1655, "step": 98350 }, { "epoch": 28.638390777829528, "grad_norm": 0.46886593103408813, "learning_rate": 0.00025649737914967965, "loss": 3.1638, "step": 98400 }, { "epoch": 28.652945971122495, "grad_norm": 0.46281686425209045, "learning_rate": 0.00025632265579499124, "loss": 3.1677, "step": 98450 }, { "epoch": 28.667501164415462, "grad_norm": 0.4501182734966278, "learning_rate": 0.00025614793244030284, "loss": 3.1629, "step": 98500 }, { "epoch": 28.68205635770843, "grad_norm": 0.44337964057922363, "learning_rate": 0.0002559732090856144, "loss": 3.1674, "step": 98550 }, { "epoch": 28.696611551001396, "grad_norm": 0.4638313055038452, "learning_rate": 0.00025579848573092603, "loss": 3.1584, "step": 98600 }, { "epoch": 28.711166744294363, "grad_norm": 0.440639853477478, "learning_rate": 0.0002556237623762376, "loss": 3.1665, "step": 98650 }, { "epoch": 28.72572193758733, "grad_norm": 0.44387996196746826, "learning_rate": 0.0002554490390215492, "loss": 3.1742, "step": 98700 }, { "epoch": 28.740277130880298, "grad_norm": 0.4620234966278076, "learning_rate": 0.0002552743156668608, "loss": 3.1676, "step": 98750 }, { "epoch": 28.754832324173265, "grad_norm": 0.4579341411590576, "learning_rate": 0.00025509959231217235, "loss": 3.1733, "step": 98800 }, { "epoch": 28.76938751746623, "grad_norm": 0.44975659251213074, "learning_rate": 0.000254924868957484, "loss": 3.1714, "step": 98850 }, { "epoch": 28.7839427107592, "grad_norm": 0.46107542514801025, "learning_rate": 0.00025475014560279554, "loss": 3.1743, "step": 98900 }, { "epoch": 28.798497904052166, "grad_norm": 0.465629905462265, "learning_rate": 0.00025457542224810713, "loss": 3.1688, "step": 98950 }, { "epoch": 28.813053097345133, "grad_norm": 0.4210714101791382, "learning_rate": 0.00025440069889341873, "loss": 3.153, "step": 99000 }, { "epoch": 28.813053097345133, "eval_accuracy": 0.37529618350055316, "eval_loss": 3.5361506938934326, "eval_runtime": 178.8048, "eval_samples_per_second": 93.124, "eval_steps_per_second": 5.822, "step": 99000 }, { "epoch": 28.8276082906381, "grad_norm": 0.43500855565071106, "learning_rate": 0.0002542259755387303, "loss": 3.1668, "step": 99050 }, { "epoch": 28.842163483931067, "grad_norm": 0.439731627702713, "learning_rate": 0.0002540512521840419, "loss": 3.1743, "step": 99100 }, { "epoch": 28.856718677224034, "grad_norm": 0.476308673620224, "learning_rate": 0.0002538765288293535, "loss": 3.1707, "step": 99150 }, { "epoch": 28.871273870517, "grad_norm": 0.43574225902557373, "learning_rate": 0.0002537018054746651, "loss": 3.173, "step": 99200 }, { "epoch": 28.885829063809968, "grad_norm": 0.45793789625167847, "learning_rate": 0.00025352708211997664, "loss": 3.1602, "step": 99250 }, { "epoch": 28.900384257102935, "grad_norm": 0.42923271656036377, "learning_rate": 0.0002533523587652883, "loss": 3.1745, "step": 99300 }, { "epoch": 28.914939450395902, "grad_norm": 0.45427390933036804, "learning_rate": 0.00025317763541059983, "loss": 3.1794, "step": 99350 }, { "epoch": 28.92949464368887, "grad_norm": 0.4721647799015045, "learning_rate": 0.0002530029120559115, "loss": 3.1874, "step": 99400 }, { "epoch": 28.944049836981836, "grad_norm": 0.4318784773349762, "learning_rate": 0.000252828188701223, "loss": 3.1678, "step": 99450 }, { "epoch": 28.958605030274803, "grad_norm": 0.4156908094882965, "learning_rate": 0.0002526534653465346, "loss": 3.1882, "step": 99500 }, { "epoch": 28.97316022356777, "grad_norm": 0.474138468503952, "learning_rate": 0.0002524787419918462, "loss": 3.176, "step": 99550 }, { "epoch": 28.987715416860738, "grad_norm": 0.4202764630317688, "learning_rate": 0.0002523040186371578, "loss": 3.1908, "step": 99600 }, { "epoch": 29.002037727061015, "grad_norm": 0.4535371959209442, "learning_rate": 0.0002521292952824694, "loss": 3.1569, "step": 99650 }, { "epoch": 29.016592920353983, "grad_norm": 0.4415458142757416, "learning_rate": 0.000251954571927781, "loss": 3.0871, "step": 99700 }, { "epoch": 29.03114811364695, "grad_norm": 0.4552340507507324, "learning_rate": 0.0002517798485730926, "loss": 3.0841, "step": 99750 }, { "epoch": 29.045703306939917, "grad_norm": 0.4531419575214386, "learning_rate": 0.0002516051252184042, "loss": 3.088, "step": 99800 }, { "epoch": 29.060258500232884, "grad_norm": 0.4142932593822479, "learning_rate": 0.0002514304018637158, "loss": 3.0887, "step": 99850 }, { "epoch": 29.07481369352585, "grad_norm": 0.44254419207572937, "learning_rate": 0.00025125567850902737, "loss": 3.0885, "step": 99900 }, { "epoch": 29.089368886818818, "grad_norm": 0.4339526891708374, "learning_rate": 0.0002510809551543389, "loss": 3.1025, "step": 99950 }, { "epoch": 29.103924080111785, "grad_norm": 0.47974538803100586, "learning_rate": 0.00025090623179965056, "loss": 3.096, "step": 100000 }, { "epoch": 29.103924080111785, "eval_accuracy": 0.3745269672866271, "eval_loss": 3.5486350059509277, "eval_runtime": 178.8968, "eval_samples_per_second": 93.076, "eval_steps_per_second": 5.819, "step": 100000 }, { "epoch": 29.118479273404752, "grad_norm": 0.4522671103477478, "learning_rate": 0.0002507315084449621, "loss": 3.1082, "step": 100050 }, { "epoch": 29.13303446669772, "grad_norm": 0.4260740876197815, "learning_rate": 0.00025055678509027375, "loss": 3.0939, "step": 100100 }, { "epoch": 29.147589659990686, "grad_norm": 0.447147011756897, "learning_rate": 0.0002503820617355853, "loss": 3.103, "step": 100150 }, { "epoch": 29.162144853283653, "grad_norm": 0.4594668745994568, "learning_rate": 0.0002502073383808969, "loss": 3.1076, "step": 100200 }, { "epoch": 29.17670004657662, "grad_norm": 0.44465282559394836, "learning_rate": 0.0002500326150262085, "loss": 3.1121, "step": 100250 }, { "epoch": 29.191255239869584, "grad_norm": 0.45539477467536926, "learning_rate": 0.00024985789167152007, "loss": 3.1139, "step": 100300 }, { "epoch": 29.20581043316255, "grad_norm": 0.4591493308544159, "learning_rate": 0.00024968316831683167, "loss": 3.1183, "step": 100350 }, { "epoch": 29.220365626455518, "grad_norm": 0.45283299684524536, "learning_rate": 0.00024950844496214326, "loss": 3.1191, "step": 100400 }, { "epoch": 29.234920819748485, "grad_norm": 0.4387270510196686, "learning_rate": 0.00024933372160745485, "loss": 3.1083, "step": 100450 }, { "epoch": 29.249476013041452, "grad_norm": 0.45270800590515137, "learning_rate": 0.0002491589982527664, "loss": 3.1267, "step": 100500 }, { "epoch": 29.26403120633442, "grad_norm": 0.4369964897632599, "learning_rate": 0.00024898427489807804, "loss": 3.1264, "step": 100550 }, { "epoch": 29.278586399627386, "grad_norm": 0.4369131326675415, "learning_rate": 0.0002488095515433896, "loss": 3.1232, "step": 100600 }, { "epoch": 29.293141592920353, "grad_norm": 0.45754966139793396, "learning_rate": 0.0002486348281887012, "loss": 3.1295, "step": 100650 }, { "epoch": 29.30769678621332, "grad_norm": 0.4375859200954437, "learning_rate": 0.00024846010483401277, "loss": 3.1185, "step": 100700 }, { "epoch": 29.322251979506287, "grad_norm": 0.4830842912197113, "learning_rate": 0.00024828538147932437, "loss": 3.1465, "step": 100750 }, { "epoch": 29.336807172799254, "grad_norm": 0.4631670415401459, "learning_rate": 0.00024811065812463596, "loss": 3.1405, "step": 100800 }, { "epoch": 29.35136236609222, "grad_norm": 0.44618287682533264, "learning_rate": 0.00024793593476994755, "loss": 3.1188, "step": 100850 }, { "epoch": 29.36591755938519, "grad_norm": 0.4755122661590576, "learning_rate": 0.00024776121141525915, "loss": 3.1346, "step": 100900 }, { "epoch": 29.380472752678156, "grad_norm": 0.4321480393409729, "learning_rate": 0.00024758648806057074, "loss": 3.1249, "step": 100950 }, { "epoch": 29.395027945971123, "grad_norm": 0.42231783270835876, "learning_rate": 0.00024741176470588234, "loss": 3.1396, "step": 101000 }, { "epoch": 29.395027945971123, "eval_accuracy": 0.3748001007444062, "eval_loss": 3.548363208770752, "eval_runtime": 178.7651, "eval_samples_per_second": 93.145, "eval_steps_per_second": 5.823, "step": 101000 }, { "epoch": 29.40958313926409, "grad_norm": 0.5084123015403748, "learning_rate": 0.00024723704135119393, "loss": 3.1432, "step": 101050 }, { "epoch": 29.424138332557057, "grad_norm": 0.43953821063041687, "learning_rate": 0.0002470623179965055, "loss": 3.1477, "step": 101100 }, { "epoch": 29.438693525850024, "grad_norm": 0.4613848626613617, "learning_rate": 0.0002468875946418171, "loss": 3.1426, "step": 101150 }, { "epoch": 29.45324871914299, "grad_norm": 0.47989189624786377, "learning_rate": 0.00024671287128712866, "loss": 3.1526, "step": 101200 }, { "epoch": 29.467803912435958, "grad_norm": 0.4898466169834137, "learning_rate": 0.0002465381479324403, "loss": 3.1317, "step": 101250 }, { "epoch": 29.482359105728925, "grad_norm": 0.4452469050884247, "learning_rate": 0.00024636342457775185, "loss": 3.1518, "step": 101300 }, { "epoch": 29.496914299021892, "grad_norm": 0.4506460428237915, "learning_rate": 0.00024618870122306344, "loss": 3.1419, "step": 101350 }, { "epoch": 29.51146949231486, "grad_norm": 0.4382789134979248, "learning_rate": 0.00024601397786837504, "loss": 3.1552, "step": 101400 }, { "epoch": 29.526024685607826, "grad_norm": 0.4696340560913086, "learning_rate": 0.00024583925451368663, "loss": 3.1491, "step": 101450 }, { "epoch": 29.540579878900793, "grad_norm": 0.48290392756462097, "learning_rate": 0.0002456645311589982, "loss": 3.1474, "step": 101500 }, { "epoch": 29.55513507219376, "grad_norm": 0.46145153045654297, "learning_rate": 0.0002454898078043098, "loss": 3.1483, "step": 101550 }, { "epoch": 29.569690265486727, "grad_norm": 0.4575146436691284, "learning_rate": 0.0002453150844496214, "loss": 3.1479, "step": 101600 }, { "epoch": 29.58424545877969, "grad_norm": 0.44925957918167114, "learning_rate": 0.000245140361094933, "loss": 3.1466, "step": 101650 }, { "epoch": 29.598800652072658, "grad_norm": 0.4962725043296814, "learning_rate": 0.0002449656377402446, "loss": 3.1528, "step": 101700 }, { "epoch": 29.613355845365625, "grad_norm": 0.4611053168773651, "learning_rate": 0.00024479091438555614, "loss": 3.1559, "step": 101750 }, { "epoch": 29.627911038658592, "grad_norm": 0.43480363488197327, "learning_rate": 0.0002446161910308678, "loss": 3.1425, "step": 101800 }, { "epoch": 29.64246623195156, "grad_norm": 0.46831223368644714, "learning_rate": 0.0002444414676761794, "loss": 3.1533, "step": 101850 }, { "epoch": 29.657021425244526, "grad_norm": 0.46180784702301025, "learning_rate": 0.0002442667443214909, "loss": 3.1501, "step": 101900 }, { "epoch": 29.671576618537493, "grad_norm": 0.41940245032310486, "learning_rate": 0.00024409202096680255, "loss": 3.158, "step": 101950 }, { "epoch": 29.68613181183046, "grad_norm": 0.46004733443260193, "learning_rate": 0.00024391729761211411, "loss": 3.1598, "step": 102000 }, { "epoch": 29.68613181183046, "eval_accuracy": 0.3753476604603239, "eval_loss": 3.5395259857177734, "eval_runtime": 178.8507, "eval_samples_per_second": 93.1, "eval_steps_per_second": 5.82, "step": 102000 }, { "epoch": 29.700687005123427, "grad_norm": 0.44788387417793274, "learning_rate": 0.00024374257425742574, "loss": 3.1414, "step": 102050 }, { "epoch": 29.715242198416394, "grad_norm": 0.4157842695713043, "learning_rate": 0.0002435678509027373, "loss": 3.1572, "step": 102100 }, { "epoch": 29.72979739170936, "grad_norm": 0.4743036925792694, "learning_rate": 0.0002433931275480489, "loss": 3.1574, "step": 102150 }, { "epoch": 29.74435258500233, "grad_norm": 0.4311430752277374, "learning_rate": 0.00024321840419336052, "loss": 3.1644, "step": 102200 }, { "epoch": 29.758907778295296, "grad_norm": 0.46171319484710693, "learning_rate": 0.00024304368083867209, "loss": 3.1633, "step": 102250 }, { "epoch": 29.773462971588263, "grad_norm": 0.4693566858768463, "learning_rate": 0.00024286895748398365, "loss": 3.1603, "step": 102300 }, { "epoch": 29.78801816488123, "grad_norm": 0.45776861906051636, "learning_rate": 0.00024269423412929527, "loss": 3.1665, "step": 102350 }, { "epoch": 29.802573358174197, "grad_norm": 0.5034863948822021, "learning_rate": 0.00024251951077460684, "loss": 3.1559, "step": 102400 }, { "epoch": 29.817128551467164, "grad_norm": 0.4421289563179016, "learning_rate": 0.00024234478741991844, "loss": 3.1719, "step": 102450 }, { "epoch": 29.83168374476013, "grad_norm": 0.4508364796638489, "learning_rate": 0.00024217006406523003, "loss": 3.1549, "step": 102500 }, { "epoch": 29.846238938053098, "grad_norm": 0.45362427830696106, "learning_rate": 0.00024199534071054162, "loss": 3.157, "step": 102550 }, { "epoch": 29.860794131346065, "grad_norm": 0.44056737422943115, "learning_rate": 0.0002418206173558532, "loss": 3.17, "step": 102600 }, { "epoch": 29.875349324639032, "grad_norm": 0.4519415497779846, "learning_rate": 0.0002416458940011648, "loss": 3.1672, "step": 102650 }, { "epoch": 29.889904517932, "grad_norm": 0.4712349474430084, "learning_rate": 0.00024147117064647638, "loss": 3.1833, "step": 102700 }, { "epoch": 29.904459711224966, "grad_norm": 0.4546526372432709, "learning_rate": 0.000241296447291788, "loss": 3.1662, "step": 102750 }, { "epoch": 29.919014904517933, "grad_norm": 0.48723068833351135, "learning_rate": 0.00024112172393709957, "loss": 3.1711, "step": 102800 }, { "epoch": 29.9335700978109, "grad_norm": 0.5052331686019897, "learning_rate": 0.00024094700058241116, "loss": 3.1653, "step": 102850 }, { "epoch": 29.948125291103867, "grad_norm": 0.42059940099716187, "learning_rate": 0.00024077227722772276, "loss": 3.1755, "step": 102900 }, { "epoch": 29.962680484396834, "grad_norm": 0.4757409691810608, "learning_rate": 0.00024059755387303435, "loss": 3.171, "step": 102950 }, { "epoch": 29.977235677689798, "grad_norm": 0.44011008739471436, "learning_rate": 0.00024042283051834592, "loss": 3.1707, "step": 103000 }, { "epoch": 29.977235677689798, "eval_accuracy": 0.3756088061329509, "eval_loss": 3.529524326324463, "eval_runtime": 178.9486, "eval_samples_per_second": 93.049, "eval_steps_per_second": 5.817, "step": 103000 }, { "epoch": 29.991790870982765, "grad_norm": 0.44571083784103394, "learning_rate": 0.00024024810716365754, "loss": 3.1645, "step": 103050 }, { "epoch": 30.006113181183046, "grad_norm": 0.45272624492645264, "learning_rate": 0.0002400733838089691, "loss": 3.1302, "step": 103100 }, { "epoch": 30.020668374476013, "grad_norm": 0.43727314472198486, "learning_rate": 0.0002398986604542807, "loss": 3.0793, "step": 103150 }, { "epoch": 30.03522356776898, "grad_norm": 0.44057533144950867, "learning_rate": 0.0002397239370995923, "loss": 3.0786, "step": 103200 }, { "epoch": 30.049778761061948, "grad_norm": 0.43019768595695496, "learning_rate": 0.0002395492137449039, "loss": 3.0861, "step": 103250 }, { "epoch": 30.064333954354915, "grad_norm": 0.4350365400314331, "learning_rate": 0.00023937449039021546, "loss": 3.0941, "step": 103300 }, { "epoch": 30.07888914764788, "grad_norm": 0.4855474531650543, "learning_rate": 0.00023919976703552708, "loss": 3.0927, "step": 103350 }, { "epoch": 30.09344434094085, "grad_norm": 0.4501292109489441, "learning_rate": 0.00023902504368083865, "loss": 3.0831, "step": 103400 }, { "epoch": 30.107999534233816, "grad_norm": 0.47538337111473083, "learning_rate": 0.00023885032032615027, "loss": 3.0898, "step": 103450 }, { "epoch": 30.122554727526783, "grad_norm": 0.4833683967590332, "learning_rate": 0.00023867559697146183, "loss": 3.1053, "step": 103500 }, { "epoch": 30.13710992081975, "grad_norm": 0.44806477427482605, "learning_rate": 0.0002385008736167734, "loss": 3.0974, "step": 103550 }, { "epoch": 30.151665114112717, "grad_norm": 0.4546588659286499, "learning_rate": 0.00023832615026208502, "loss": 3.1125, "step": 103600 }, { "epoch": 30.166220307405684, "grad_norm": 0.451274573802948, "learning_rate": 0.0002381514269073966, "loss": 3.1047, "step": 103650 }, { "epoch": 30.180775500698648, "grad_norm": 0.4401869475841522, "learning_rate": 0.00023797670355270818, "loss": 3.1042, "step": 103700 }, { "epoch": 30.195330693991615, "grad_norm": 0.47902384400367737, "learning_rate": 0.0002378019801980198, "loss": 3.1136, "step": 103750 }, { "epoch": 30.20988588728458, "grad_norm": 0.4810866117477417, "learning_rate": 0.00023762725684333137, "loss": 3.1054, "step": 103800 }, { "epoch": 30.22444108057755, "grad_norm": 0.4866798520088196, "learning_rate": 0.00023745253348864294, "loss": 3.1095, "step": 103850 }, { "epoch": 30.238996273870516, "grad_norm": 0.4621856212615967, "learning_rate": 0.00023727781013395456, "loss": 3.1108, "step": 103900 }, { "epoch": 30.253551467163483, "grad_norm": 0.45390817523002625, "learning_rate": 0.00023710308677926613, "loss": 3.1123, "step": 103950 }, { "epoch": 30.26810666045645, "grad_norm": 0.44979554414749146, "learning_rate": 0.00023692836342457772, "loss": 3.121, "step": 104000 }, { "epoch": 30.26810666045645, "eval_accuracy": 0.374520385757524, "eval_loss": 3.5517687797546387, "eval_runtime": 179.1015, "eval_samples_per_second": 92.97, "eval_steps_per_second": 5.812, "step": 104000 }, { "epoch": 30.282661853749417, "grad_norm": 0.4823976457118988, "learning_rate": 0.00023675364006988932, "loss": 3.1166, "step": 104050 }, { "epoch": 30.297217047042384, "grad_norm": 0.4512874484062195, "learning_rate": 0.0002365789167152009, "loss": 3.1291, "step": 104100 }, { "epoch": 30.31177224033535, "grad_norm": 0.46822547912597656, "learning_rate": 0.0002364041933605125, "loss": 3.1218, "step": 104150 }, { "epoch": 30.326327433628318, "grad_norm": 0.4621245861053467, "learning_rate": 0.0002362294700058241, "loss": 3.1261, "step": 104200 }, { "epoch": 30.340882626921285, "grad_norm": 0.45678770542144775, "learning_rate": 0.00023605474665113567, "loss": 3.1201, "step": 104250 }, { "epoch": 30.355437820214252, "grad_norm": 0.4435892403125763, "learning_rate": 0.0002358800232964473, "loss": 3.1164, "step": 104300 }, { "epoch": 30.36999301350722, "grad_norm": 0.4691522717475891, "learning_rate": 0.00023570529994175886, "loss": 3.1397, "step": 104350 }, { "epoch": 30.384548206800186, "grad_norm": 0.44499778747558594, "learning_rate": 0.00023553057658707045, "loss": 3.1219, "step": 104400 }, { "epoch": 30.399103400093153, "grad_norm": 0.47024157643318176, "learning_rate": 0.00023535585323238204, "loss": 3.1235, "step": 104450 }, { "epoch": 30.41365859338612, "grad_norm": 0.47966906428337097, "learning_rate": 0.00023518112987769364, "loss": 3.1358, "step": 104500 }, { "epoch": 30.428213786679088, "grad_norm": 0.4930260479450226, "learning_rate": 0.0002350064065230052, "loss": 3.1356, "step": 104550 }, { "epoch": 30.442768979972055, "grad_norm": 0.47716206312179565, "learning_rate": 0.00023483168316831683, "loss": 3.1299, "step": 104600 }, { "epoch": 30.45732417326502, "grad_norm": 0.43428975343704224, "learning_rate": 0.0002346569598136284, "loss": 3.1381, "step": 104650 }, { "epoch": 30.47187936655799, "grad_norm": 0.46784719824790955, "learning_rate": 0.00023448223645894, "loss": 3.1407, "step": 104700 }, { "epoch": 30.486434559850956, "grad_norm": 0.47711265087127686, "learning_rate": 0.00023430751310425158, "loss": 3.1285, "step": 104750 }, { "epoch": 30.500989753143923, "grad_norm": 0.4717588424682617, "learning_rate": 0.00023413278974956318, "loss": 3.1282, "step": 104800 }, { "epoch": 30.51554494643689, "grad_norm": 0.4573390781879425, "learning_rate": 0.00023395806639487477, "loss": 3.1327, "step": 104850 }, { "epoch": 30.530100139729857, "grad_norm": 0.4628978669643402, "learning_rate": 0.00023378334304018637, "loss": 3.1399, "step": 104900 }, { "epoch": 30.544655333022824, "grad_norm": 0.4464574456214905, "learning_rate": 0.00023360861968549793, "loss": 3.1358, "step": 104950 }, { "epoch": 30.55921052631579, "grad_norm": 0.4731169641017914, "learning_rate": 0.00023343389633080955, "loss": 3.1484, "step": 105000 }, { "epoch": 30.55921052631579, "eval_accuracy": 0.37564277152421516, "eval_loss": 3.5386412143707275, "eval_runtime": 179.018, "eval_samples_per_second": 93.013, "eval_steps_per_second": 5.815, "step": 105000 }, { "epoch": 30.573765719608758, "grad_norm": 0.4915218651294708, "learning_rate": 0.00023325917297612112, "loss": 3.135, "step": 105050 }, { "epoch": 30.58832091290172, "grad_norm": 0.464535653591156, "learning_rate": 0.0002330844496214327, "loss": 3.1484, "step": 105100 }, { "epoch": 30.60287610619469, "grad_norm": 0.46330970525741577, "learning_rate": 0.0002329097262667443, "loss": 3.1367, "step": 105150 }, { "epoch": 30.617431299487656, "grad_norm": 0.4700284004211426, "learning_rate": 0.00023273500291205588, "loss": 3.1401, "step": 105200 }, { "epoch": 30.631986492780623, "grad_norm": 0.4735076129436493, "learning_rate": 0.00023256027955736747, "loss": 3.1347, "step": 105250 }, { "epoch": 30.64654168607359, "grad_norm": 0.4604129493236542, "learning_rate": 0.00023238555620267907, "loss": 3.1593, "step": 105300 }, { "epoch": 30.661096879366557, "grad_norm": 0.4553196430206299, "learning_rate": 0.00023221083284799066, "loss": 3.1415, "step": 105350 }, { "epoch": 30.675652072659524, "grad_norm": 0.48099684715270996, "learning_rate": 0.00023203610949330223, "loss": 3.1433, "step": 105400 }, { "epoch": 30.69020726595249, "grad_norm": 0.478710800409317, "learning_rate": 0.00023186138613861385, "loss": 3.1518, "step": 105450 }, { "epoch": 30.704762459245458, "grad_norm": 0.46138402819633484, "learning_rate": 0.00023168666278392542, "loss": 3.1523, "step": 105500 }, { "epoch": 30.719317652538425, "grad_norm": 0.47966495156288147, "learning_rate": 0.00023151193942923704, "loss": 3.1438, "step": 105550 }, { "epoch": 30.733872845831392, "grad_norm": 0.47663363814353943, "learning_rate": 0.0002313372160745486, "loss": 3.1483, "step": 105600 }, { "epoch": 30.74842803912436, "grad_norm": 0.4620051085948944, "learning_rate": 0.0002311624927198602, "loss": 3.1518, "step": 105650 }, { "epoch": 30.762983232417326, "grad_norm": 0.47040557861328125, "learning_rate": 0.0002309877693651718, "loss": 3.1497, "step": 105700 }, { "epoch": 30.777538425710294, "grad_norm": 0.47730788588523865, "learning_rate": 0.0002308130460104834, "loss": 3.155, "step": 105750 }, { "epoch": 30.79209361900326, "grad_norm": 0.49402493238449097, "learning_rate": 0.00023063832265579496, "loss": 3.1459, "step": 105800 }, { "epoch": 30.806648812296228, "grad_norm": 0.452426016330719, "learning_rate": 0.00023046359930110658, "loss": 3.1468, "step": 105850 }, { "epoch": 30.821204005589195, "grad_norm": 0.4422298073768616, "learning_rate": 0.00023028887594641814, "loss": 3.1575, "step": 105900 }, { "epoch": 30.83575919888216, "grad_norm": 0.4627573788166046, "learning_rate": 0.00023011415259172974, "loss": 3.1408, "step": 105950 }, { "epoch": 30.85031439217513, "grad_norm": 0.45685991644859314, "learning_rate": 0.00022993942923704133, "loss": 3.1598, "step": 106000 }, { "epoch": 30.85031439217513, "eval_accuracy": 0.3756513510175103, "eval_loss": 3.5360524654388428, "eval_runtime": 178.889, "eval_samples_per_second": 93.08, "eval_steps_per_second": 5.819, "step": 106000 }, { "epoch": 30.864869585468096, "grad_norm": 0.47248050570487976, "learning_rate": 0.00022976470588235293, "loss": 3.1518, "step": 106050 }, { "epoch": 30.879424778761063, "grad_norm": 0.446719765663147, "learning_rate": 0.0002295899825276645, "loss": 3.1598, "step": 106100 }, { "epoch": 30.89397997205403, "grad_norm": 0.4616938531398773, "learning_rate": 0.00022941525917297612, "loss": 3.1619, "step": 106150 }, { "epoch": 30.908535165346997, "grad_norm": 0.4327644407749176, "learning_rate": 0.00022924053581828768, "loss": 3.1637, "step": 106200 }, { "epoch": 30.923090358639964, "grad_norm": 0.475490540266037, "learning_rate": 0.0002290658124635993, "loss": 3.1482, "step": 106250 }, { "epoch": 30.93764555193293, "grad_norm": 0.4473344683647156, "learning_rate": 0.00022889108910891087, "loss": 3.165, "step": 106300 }, { "epoch": 30.9522007452259, "grad_norm": 0.43939265608787537, "learning_rate": 0.00022871636575422247, "loss": 3.1661, "step": 106350 }, { "epoch": 30.966755938518865, "grad_norm": 0.48219022154808044, "learning_rate": 0.00022854164239953406, "loss": 3.1675, "step": 106400 }, { "epoch": 30.98131113181183, "grad_norm": 0.4780034124851227, "learning_rate": 0.00022836691904484565, "loss": 3.1583, "step": 106450 }, { "epoch": 30.995866325104796, "grad_norm": 0.47152233123779297, "learning_rate": 0.00022819219569015722, "loss": 3.1605, "step": 106500 }, { "epoch": 31.010188635305077, "grad_norm": 0.45192500948905945, "learning_rate": 0.00022801747233546884, "loss": 3.0934, "step": 106550 }, { "epoch": 31.024743828598044, "grad_norm": 0.4668784439563751, "learning_rate": 0.0002278427489807804, "loss": 3.0671, "step": 106600 }, { "epoch": 31.03929902189101, "grad_norm": 0.4962749183177948, "learning_rate": 0.00022766802562609198, "loss": 3.0759, "step": 106650 }, { "epoch": 31.05385421518398, "grad_norm": 0.45623254776000977, "learning_rate": 0.0002274933022714036, "loss": 3.075, "step": 106700 }, { "epoch": 31.068409408476946, "grad_norm": 0.46809470653533936, "learning_rate": 0.00022731857891671517, "loss": 3.0721, "step": 106750 }, { "epoch": 31.082964601769913, "grad_norm": 0.48478496074676514, "learning_rate": 0.00022714385556202676, "loss": 3.0782, "step": 106800 }, { "epoch": 31.09751979506288, "grad_norm": 0.46201130747795105, "learning_rate": 0.00022696913220733835, "loss": 3.0806, "step": 106850 }, { "epoch": 31.112074988355847, "grad_norm": 0.4738673269748688, "learning_rate": 0.00022679440885264995, "loss": 3.0891, "step": 106900 }, { "epoch": 31.126630181648814, "grad_norm": 0.4977607727050781, "learning_rate": 0.00022661968549796157, "loss": 3.0976, "step": 106950 }, { "epoch": 31.14118537494178, "grad_norm": 0.46951159834861755, "learning_rate": 0.00022644496214327314, "loss": 3.0879, "step": 107000 }, { "epoch": 31.14118537494178, "eval_accuracy": 0.3748574540694476, "eval_loss": 3.552915334701538, "eval_runtime": 178.7982, "eval_samples_per_second": 93.127, "eval_steps_per_second": 5.822, "step": 107000 }, { "epoch": 31.155740568234748, "grad_norm": 0.4681188762187958, "learning_rate": 0.0002262702387885847, "loss": 3.0916, "step": 107050 }, { "epoch": 31.17029576152771, "grad_norm": 0.48094692826271057, "learning_rate": 0.00022609551543389633, "loss": 3.0946, "step": 107100 }, { "epoch": 31.18485095482068, "grad_norm": 0.4932039678096771, "learning_rate": 0.0002259207920792079, "loss": 3.1058, "step": 107150 }, { "epoch": 31.199406148113646, "grad_norm": 0.48230600357055664, "learning_rate": 0.0002257460687245195, "loss": 3.0957, "step": 107200 }, { "epoch": 31.213961341406613, "grad_norm": 0.4459396302700043, "learning_rate": 0.00022557134536983108, "loss": 3.1039, "step": 107250 }, { "epoch": 31.22851653469958, "grad_norm": 0.493160218000412, "learning_rate": 0.00022539662201514268, "loss": 3.1047, "step": 107300 }, { "epoch": 31.243071727992547, "grad_norm": 0.4656211733818054, "learning_rate": 0.00022522189866045424, "loss": 3.1031, "step": 107350 }, { "epoch": 31.257626921285514, "grad_norm": 0.4818492829799652, "learning_rate": 0.00022504717530576586, "loss": 3.1011, "step": 107400 }, { "epoch": 31.27218211457848, "grad_norm": 0.49490463733673096, "learning_rate": 0.00022487245195107743, "loss": 3.1169, "step": 107450 }, { "epoch": 31.286737307871448, "grad_norm": 0.45430421829223633, "learning_rate": 0.00022469772859638903, "loss": 3.1101, "step": 107500 }, { "epoch": 31.301292501164415, "grad_norm": 0.495371013879776, "learning_rate": 0.00022452300524170062, "loss": 3.1195, "step": 107550 }, { "epoch": 31.315847694457382, "grad_norm": 0.4393438696861267, "learning_rate": 0.00022434828188701221, "loss": 3.1165, "step": 107600 }, { "epoch": 31.33040288775035, "grad_norm": 0.4675419330596924, "learning_rate": 0.0002241735585323238, "loss": 3.1256, "step": 107650 }, { "epoch": 31.344958081043316, "grad_norm": 0.46298012137413025, "learning_rate": 0.0002239988351776354, "loss": 3.1232, "step": 107700 }, { "epoch": 31.359513274336283, "grad_norm": 0.48230257630348206, "learning_rate": 0.00022382411182294697, "loss": 3.1149, "step": 107750 }, { "epoch": 31.37406846762925, "grad_norm": 0.4581402838230133, "learning_rate": 0.0002236493884682586, "loss": 3.1143, "step": 107800 }, { "epoch": 31.388623660922217, "grad_norm": 0.47338271141052246, "learning_rate": 0.00022347466511357016, "loss": 3.1202, "step": 107850 }, { "epoch": 31.403178854215184, "grad_norm": 0.4785659909248352, "learning_rate": 0.00022329994175888175, "loss": 3.1174, "step": 107900 }, { "epoch": 31.41773404750815, "grad_norm": 0.4678061306476593, "learning_rate": 0.00022312521840419335, "loss": 3.1154, "step": 107950 }, { "epoch": 31.43228924080112, "grad_norm": 0.48249468207359314, "learning_rate": 0.00022295049504950494, "loss": 3.1218, "step": 108000 }, { "epoch": 31.43228924080112, "eval_accuracy": 0.3748012760174603, "eval_loss": 3.546966552734375, "eval_runtime": 178.6598, "eval_samples_per_second": 93.199, "eval_steps_per_second": 5.827, "step": 108000 }, { "epoch": 31.446844434094086, "grad_norm": 0.4670863449573517, "learning_rate": 0.0002227757716948165, "loss": 3.1273, "step": 108050 }, { "epoch": 31.461399627387053, "grad_norm": 0.4544913172721863, "learning_rate": 0.00022260104834012813, "loss": 3.1348, "step": 108100 }, { "epoch": 31.47595482068002, "grad_norm": 0.47836384177207947, "learning_rate": 0.0002224263249854397, "loss": 3.1225, "step": 108150 }, { "epoch": 31.490510013972987, "grad_norm": 0.5040008425712585, "learning_rate": 0.00022225160163075126, "loss": 3.1312, "step": 108200 }, { "epoch": 31.505065207265954, "grad_norm": 0.48573142290115356, "learning_rate": 0.00022207687827606289, "loss": 3.1263, "step": 108250 }, { "epoch": 31.51962040055892, "grad_norm": 0.48172304034233093, "learning_rate": 0.00022190215492137445, "loss": 3.1238, "step": 108300 }, { "epoch": 31.534175593851888, "grad_norm": 0.46813297271728516, "learning_rate": 0.00022172743156668607, "loss": 3.1282, "step": 108350 }, { "epoch": 31.548730787144855, "grad_norm": 0.49096304178237915, "learning_rate": 0.00022155270821199764, "loss": 3.1231, "step": 108400 }, { "epoch": 31.56328598043782, "grad_norm": 0.5114392638206482, "learning_rate": 0.00022137798485730924, "loss": 3.1277, "step": 108450 }, { "epoch": 31.577841173730786, "grad_norm": 0.4781743586063385, "learning_rate": 0.00022120326150262083, "loss": 3.1319, "step": 108500 }, { "epoch": 31.592396367023753, "grad_norm": 0.463967889547348, "learning_rate": 0.00022102853814793242, "loss": 3.142, "step": 108550 }, { "epoch": 31.60695156031672, "grad_norm": 0.4561751186847687, "learning_rate": 0.000220853814793244, "loss": 3.135, "step": 108600 }, { "epoch": 31.621506753609687, "grad_norm": 0.4930993318557739, "learning_rate": 0.0002206790914385556, "loss": 3.1355, "step": 108650 }, { "epoch": 31.636061946902654, "grad_norm": 0.47756820917129517, "learning_rate": 0.00022050436808386718, "loss": 3.1432, "step": 108700 }, { "epoch": 31.65061714019562, "grad_norm": 0.45308125019073486, "learning_rate": 0.00022032964472917877, "loss": 3.1443, "step": 108750 }, { "epoch": 31.665172333488588, "grad_norm": 0.4814562201499939, "learning_rate": 0.00022015492137449037, "loss": 3.1407, "step": 108800 }, { "epoch": 31.679727526781555, "grad_norm": 0.46541181206703186, "learning_rate": 0.00021998019801980196, "loss": 3.1392, "step": 108850 }, { "epoch": 31.694282720074522, "grad_norm": 0.45067304372787476, "learning_rate": 0.00021980547466511353, "loss": 3.1423, "step": 108900 }, { "epoch": 31.70883791336749, "grad_norm": 0.45165324211120605, "learning_rate": 0.00021963075131042515, "loss": 3.1366, "step": 108950 }, { "epoch": 31.723393106660456, "grad_norm": 0.4571157991886139, "learning_rate": 0.00021945602795573672, "loss": 3.1351, "step": 109000 }, { "epoch": 31.723393106660456, "eval_accuracy": 0.3757617091572928, "eval_loss": 3.5376572608947754, "eval_runtime": 178.7662, "eval_samples_per_second": 93.144, "eval_steps_per_second": 5.823, "step": 109000 }, { "epoch": 31.737948299953423, "grad_norm": 0.4623583257198334, "learning_rate": 0.00021928130460104834, "loss": 3.1399, "step": 109050 }, { "epoch": 31.75250349324639, "grad_norm": 0.4531978368759155, "learning_rate": 0.0002191065812463599, "loss": 3.1415, "step": 109100 }, { "epoch": 31.767058686539357, "grad_norm": 0.4783373177051544, "learning_rate": 0.0002189318578916715, "loss": 3.1351, "step": 109150 }, { "epoch": 31.781613879832324, "grad_norm": 0.48766523599624634, "learning_rate": 0.0002187571345369831, "loss": 3.1442, "step": 109200 }, { "epoch": 31.79616907312529, "grad_norm": 0.48519569635391235, "learning_rate": 0.0002185824111822947, "loss": 3.1431, "step": 109250 }, { "epoch": 31.81072426641826, "grad_norm": 0.4792105555534363, "learning_rate": 0.00021840768782760626, "loss": 3.1432, "step": 109300 }, { "epoch": 31.825279459711226, "grad_norm": 0.4858682155609131, "learning_rate": 0.00021823296447291788, "loss": 3.1446, "step": 109350 }, { "epoch": 31.839834653004193, "grad_norm": 0.4517224133014679, "learning_rate": 0.00021805824111822945, "loss": 3.1578, "step": 109400 }, { "epoch": 31.85438984629716, "grad_norm": 0.4686705470085144, "learning_rate": 0.00021788351776354104, "loss": 3.1517, "step": 109450 }, { "epoch": 31.868945039590127, "grad_norm": 0.467055082321167, "learning_rate": 0.00021770879440885263, "loss": 3.1465, "step": 109500 }, { "epoch": 31.883500232883094, "grad_norm": 0.4561103284358978, "learning_rate": 0.00021753407105416423, "loss": 3.1488, "step": 109550 }, { "epoch": 31.89805542617606, "grad_norm": 0.4442567229270935, "learning_rate": 0.0002173593476994758, "loss": 3.1351, "step": 109600 }, { "epoch": 31.912610619469028, "grad_norm": 0.43746140599250793, "learning_rate": 0.00021718462434478742, "loss": 3.1559, "step": 109650 }, { "epoch": 31.927165812761995, "grad_norm": 0.4594532549381256, "learning_rate": 0.00021700990099009898, "loss": 3.1508, "step": 109700 }, { "epoch": 31.941721006054962, "grad_norm": 0.4692226052284241, "learning_rate": 0.0002168351776354106, "loss": 3.1499, "step": 109750 }, { "epoch": 31.956276199347926, "grad_norm": 0.4692417085170746, "learning_rate": 0.00021666045428072217, "loss": 3.1473, "step": 109800 }, { "epoch": 31.970831392640893, "grad_norm": 0.44723644852638245, "learning_rate": 0.00021648573092603374, "loss": 3.1633, "step": 109850 }, { "epoch": 31.98538658593386, "grad_norm": 0.46695899963378906, "learning_rate": 0.00021631100757134536, "loss": 3.1568, "step": 109900 }, { "epoch": 31.999941779226827, "grad_norm": 0.4549327790737152, "learning_rate": 0.00021613628421665693, "loss": 3.1598, "step": 109950 }, { "epoch": 32.01426408942711, "grad_norm": 0.488547146320343, "learning_rate": 0.00021596156086196852, "loss": 3.069, "step": 110000 }, { "epoch": 32.01426408942711, "eval_accuracy": 0.37522413926233517, "eval_loss": 3.5496914386749268, "eval_runtime": 178.8678, "eval_samples_per_second": 93.091, "eval_steps_per_second": 5.82, "step": 110000 }, { "epoch": 32.02881928272007, "grad_norm": 0.4579232335090637, "learning_rate": 0.00021578683750728012, "loss": 3.0748, "step": 110050 }, { "epoch": 32.04337447601304, "grad_norm": 0.46105533838272095, "learning_rate": 0.0002156121141525917, "loss": 3.0645, "step": 110100 }, { "epoch": 32.057929669306006, "grad_norm": 0.4574984312057495, "learning_rate": 0.00021543739079790328, "loss": 3.068, "step": 110150 }, { "epoch": 32.07248486259898, "grad_norm": 0.45768246054649353, "learning_rate": 0.0002152626674432149, "loss": 3.0714, "step": 110200 }, { "epoch": 32.08704005589194, "grad_norm": 0.48479366302490234, "learning_rate": 0.00021508794408852647, "loss": 3.0688, "step": 110250 }, { "epoch": 32.10159524918491, "grad_norm": 0.45836198329925537, "learning_rate": 0.0002149132207338381, "loss": 3.072, "step": 110300 }, { "epoch": 32.116150442477874, "grad_norm": 0.48688456416130066, "learning_rate": 0.00021473849737914966, "loss": 3.0843, "step": 110350 }, { "epoch": 32.130705635770845, "grad_norm": 0.47388702630996704, "learning_rate": 0.00021456377402446125, "loss": 3.0973, "step": 110400 }, { "epoch": 32.14526082906381, "grad_norm": 0.47046688199043274, "learning_rate": 0.00021438905066977284, "loss": 3.092, "step": 110450 }, { "epoch": 32.15981602235678, "grad_norm": 0.47430869936943054, "learning_rate": 0.00021421432731508444, "loss": 3.0993, "step": 110500 }, { "epoch": 32.17437121564974, "grad_norm": 0.46983247995376587, "learning_rate": 0.000214039603960396, "loss": 3.0874, "step": 110550 }, { "epoch": 32.18892640894271, "grad_norm": 0.46737906336784363, "learning_rate": 0.00021386488060570763, "loss": 3.0852, "step": 110600 }, { "epoch": 32.20348160223568, "grad_norm": 0.45788851380348206, "learning_rate": 0.0002136901572510192, "loss": 3.1098, "step": 110650 }, { "epoch": 32.21803679552865, "grad_norm": 0.5003287196159363, "learning_rate": 0.0002135154338963308, "loss": 3.0889, "step": 110700 }, { "epoch": 32.23259198882161, "grad_norm": 0.4800771176815033, "learning_rate": 0.00021334071054164238, "loss": 3.0896, "step": 110750 }, { "epoch": 32.24714718211458, "grad_norm": 0.4706825017929077, "learning_rate": 0.00021316598718695398, "loss": 3.1033, "step": 110800 }, { "epoch": 32.261702375407545, "grad_norm": 0.4968228042125702, "learning_rate": 0.00021299126383226554, "loss": 3.0965, "step": 110850 }, { "epoch": 32.276257568700515, "grad_norm": 0.4602368175983429, "learning_rate": 0.00021281654047757717, "loss": 3.1089, "step": 110900 }, { "epoch": 32.29081276199348, "grad_norm": 0.46941208839416504, "learning_rate": 0.00021264181712288873, "loss": 3.1227, "step": 110950 }, { "epoch": 32.30536795528645, "grad_norm": 0.4664950668811798, "learning_rate": 0.00021246709376820035, "loss": 3.0951, "step": 111000 }, { "epoch": 32.30536795528645, "eval_accuracy": 0.3753109919410351, "eval_loss": 3.546433448791504, "eval_runtime": 178.8074, "eval_samples_per_second": 93.123, "eval_steps_per_second": 5.822, "step": 111000 }, { "epoch": 32.31992314857941, "grad_norm": 0.4616936147212982, "learning_rate": 0.00021229237041351192, "loss": 3.0917, "step": 111050 }, { "epoch": 32.334478341872384, "grad_norm": 0.45398086309432983, "learning_rate": 0.00021211764705882352, "loss": 3.1078, "step": 111100 }, { "epoch": 32.34903353516535, "grad_norm": 0.47605571150779724, "learning_rate": 0.0002119429237041351, "loss": 3.1096, "step": 111150 }, { "epoch": 32.36358872845831, "grad_norm": 0.47797513008117676, "learning_rate": 0.0002117682003494467, "loss": 3.1048, "step": 111200 }, { "epoch": 32.37814392175128, "grad_norm": 0.46724599599838257, "learning_rate": 0.00021159347699475827, "loss": 3.1056, "step": 111250 }, { "epoch": 32.392699115044245, "grad_norm": 0.48777705430984497, "learning_rate": 0.0002114187536400699, "loss": 3.1172, "step": 111300 }, { "epoch": 32.407254308337215, "grad_norm": 0.4803573191165924, "learning_rate": 0.00021124403028538146, "loss": 3.1166, "step": 111350 }, { "epoch": 32.42180950163018, "grad_norm": 0.468847393989563, "learning_rate": 0.00021106930693069303, "loss": 3.1059, "step": 111400 }, { "epoch": 32.43636469492315, "grad_norm": 0.46275416016578674, "learning_rate": 0.00021089458357600465, "loss": 3.1265, "step": 111450 }, { "epoch": 32.45091988821611, "grad_norm": 0.5064470767974854, "learning_rate": 0.00021071986022131622, "loss": 3.1075, "step": 111500 }, { "epoch": 32.465475081509084, "grad_norm": 0.5147144794464111, "learning_rate": 0.0002105451368666278, "loss": 3.1166, "step": 111550 }, { "epoch": 32.48003027480205, "grad_norm": 0.48303794860839844, "learning_rate": 0.0002103704135119394, "loss": 3.1179, "step": 111600 }, { "epoch": 32.49458546809502, "grad_norm": 0.4648039937019348, "learning_rate": 0.000210195690157251, "loss": 3.1204, "step": 111650 }, { "epoch": 32.50914066138798, "grad_norm": 0.48321637511253357, "learning_rate": 0.0002100209668025626, "loss": 3.1078, "step": 111700 }, { "epoch": 32.52369585468095, "grad_norm": 0.4882371425628662, "learning_rate": 0.0002098462434478742, "loss": 3.134, "step": 111750 }, { "epoch": 32.538251047973915, "grad_norm": 0.48640525341033936, "learning_rate": 0.00020967152009318576, "loss": 3.1196, "step": 111800 }, { "epoch": 32.552806241266886, "grad_norm": 0.4857983887195587, "learning_rate": 0.00020949679673849738, "loss": 3.1299, "step": 111850 }, { "epoch": 32.56736143455985, "grad_norm": 0.47827160358428955, "learning_rate": 0.00020932207338380894, "loss": 3.1222, "step": 111900 }, { "epoch": 32.58191662785282, "grad_norm": 0.4789576232433319, "learning_rate": 0.00020914735002912054, "loss": 3.1328, "step": 111950 }, { "epoch": 32.596471821145784, "grad_norm": 0.46239346265792847, "learning_rate": 0.00020897262667443213, "loss": 3.1199, "step": 112000 }, { "epoch": 32.596471821145784, "eval_accuracy": 0.37519875336436603, "eval_loss": 3.5462632179260254, "eval_runtime": 178.7924, "eval_samples_per_second": 93.13, "eval_steps_per_second": 5.822, "step": 112000 }, { "epoch": 32.611027014438754, "grad_norm": 0.4794664680957794, "learning_rate": 0.00020879790331974373, "loss": 3.1224, "step": 112050 }, { "epoch": 32.62558220773172, "grad_norm": 0.4489709436893463, "learning_rate": 0.0002086231799650553, "loss": 3.1232, "step": 112100 }, { "epoch": 32.64013740102469, "grad_norm": 0.4708504378795624, "learning_rate": 0.00020844845661036692, "loss": 3.1339, "step": 112150 }, { "epoch": 32.65469259431765, "grad_norm": 0.44888994097709656, "learning_rate": 0.00020827373325567848, "loss": 3.1202, "step": 112200 }, { "epoch": 32.66924778761062, "grad_norm": 0.45401710271835327, "learning_rate": 0.00020809900990099008, "loss": 3.1391, "step": 112250 }, { "epoch": 32.683802980903586, "grad_norm": 0.4785029888153076, "learning_rate": 0.00020792428654630167, "loss": 3.1272, "step": 112300 }, { "epoch": 32.69835817419656, "grad_norm": 0.4662979245185852, "learning_rate": 0.00020774956319161327, "loss": 3.1331, "step": 112350 }, { "epoch": 32.71291336748952, "grad_norm": 0.4512777030467987, "learning_rate": 0.00020757483983692486, "loss": 3.1308, "step": 112400 }, { "epoch": 32.72746856078249, "grad_norm": 0.47191712260246277, "learning_rate": 0.00020740011648223645, "loss": 3.1355, "step": 112450 }, { "epoch": 32.742023754075454, "grad_norm": 0.5223751068115234, "learning_rate": 0.00020722539312754802, "loss": 3.141, "step": 112500 }, { "epoch": 32.75657894736842, "grad_norm": 0.4959058165550232, "learning_rate": 0.00020705066977285964, "loss": 3.1209, "step": 112550 }, { "epoch": 32.77113414066139, "grad_norm": 0.4924633800983429, "learning_rate": 0.0002068759464181712, "loss": 3.1311, "step": 112600 }, { "epoch": 32.78568933395435, "grad_norm": 0.4769672751426697, "learning_rate": 0.0002067012230634828, "loss": 3.1332, "step": 112650 }, { "epoch": 32.80024452724732, "grad_norm": 0.48988962173461914, "learning_rate": 0.0002065264997087944, "loss": 3.1372, "step": 112700 }, { "epoch": 32.814799720540286, "grad_norm": 0.46738287806510925, "learning_rate": 0.000206351776354106, "loss": 3.1312, "step": 112750 }, { "epoch": 32.82935491383326, "grad_norm": 0.4991190433502197, "learning_rate": 0.00020617705299941756, "loss": 3.1335, "step": 112800 }, { "epoch": 32.84391010712622, "grad_norm": 0.4654034972190857, "learning_rate": 0.00020600232964472918, "loss": 3.1246, "step": 112850 }, { "epoch": 32.85846530041919, "grad_norm": 0.4580225646495819, "learning_rate": 0.00020582760629004075, "loss": 3.1313, "step": 112900 }, { "epoch": 32.873020493712154, "grad_norm": 0.49401524662971497, "learning_rate": 0.00020565288293535232, "loss": 3.141, "step": 112950 }, { "epoch": 32.887575687005125, "grad_norm": 0.44626525044441223, "learning_rate": 0.00020547815958066394, "loss": 3.1481, "step": 113000 }, { "epoch": 32.887575687005125, "eval_accuracy": 0.37586231253072605, "eval_loss": 3.5356757640838623, "eval_runtime": 179.0522, "eval_samples_per_second": 92.995, "eval_steps_per_second": 5.814, "step": 113000 }, { "epoch": 32.90213088029809, "grad_norm": 0.449421226978302, "learning_rate": 0.0002053034362259755, "loss": 3.1478, "step": 113050 }, { "epoch": 32.91668607359106, "grad_norm": 0.46115419268608093, "learning_rate": 0.00020512871287128713, "loss": 3.138, "step": 113100 }, { "epoch": 32.93124126688402, "grad_norm": 0.48968982696533203, "learning_rate": 0.0002049539895165987, "loss": 3.1352, "step": 113150 }, { "epoch": 32.94579646017699, "grad_norm": 0.46866336464881897, "learning_rate": 0.0002047792661619103, "loss": 3.1601, "step": 113200 }, { "epoch": 32.96035165346996, "grad_norm": 0.5086426138877869, "learning_rate": 0.00020460454280722188, "loss": 3.1405, "step": 113250 }, { "epoch": 32.97490684676293, "grad_norm": 0.45002827048301697, "learning_rate": 0.00020442981945253348, "loss": 3.1474, "step": 113300 }, { "epoch": 32.98946204005589, "grad_norm": 0.5016616582870483, "learning_rate": 0.00020425509609784504, "loss": 3.1398, "step": 113350 }, { "epoch": 33.00378435025617, "grad_norm": 0.4685134291648865, "learning_rate": 0.00020408037274315666, "loss": 3.1196, "step": 113400 }, { "epoch": 33.01833954354914, "grad_norm": 0.47646480798721313, "learning_rate": 0.00020390564938846823, "loss": 3.0559, "step": 113450 }, { "epoch": 33.0328947368421, "grad_norm": 0.4655097723007202, "learning_rate": 0.00020373092603377983, "loss": 3.0522, "step": 113500 }, { "epoch": 33.04744993013507, "grad_norm": 0.46806007623672485, "learning_rate": 0.00020355620267909142, "loss": 3.0691, "step": 113550 }, { "epoch": 33.06200512342804, "grad_norm": 0.4651787281036377, "learning_rate": 0.00020338147932440301, "loss": 3.0643, "step": 113600 }, { "epoch": 33.07656031672101, "grad_norm": 0.4729501008987427, "learning_rate": 0.00020320675596971458, "loss": 3.0694, "step": 113650 }, { "epoch": 33.09111551001397, "grad_norm": 0.4735945463180542, "learning_rate": 0.0002030320326150262, "loss": 3.0715, "step": 113700 }, { "epoch": 33.10567070330694, "grad_norm": 0.4909193813800812, "learning_rate": 0.00020285730926033777, "loss": 3.0657, "step": 113750 }, { "epoch": 33.120225896599905, "grad_norm": 0.4546658396720886, "learning_rate": 0.0002026825859056494, "loss": 3.073, "step": 113800 }, { "epoch": 33.134781089892876, "grad_norm": 0.4555451571941376, "learning_rate": 0.00020250786255096096, "loss": 3.074, "step": 113850 }, { "epoch": 33.14933628318584, "grad_norm": 0.5068358778953552, "learning_rate": 0.00020233313919627255, "loss": 3.0776, "step": 113900 }, { "epoch": 33.16389147647881, "grad_norm": 0.5064336657524109, "learning_rate": 0.00020215841584158415, "loss": 3.0832, "step": 113950 }, { "epoch": 33.17844666977177, "grad_norm": 0.4599713385105133, "learning_rate": 0.00020198369248689574, "loss": 3.0671, "step": 114000 }, { "epoch": 33.17844666977177, "eval_accuracy": 0.37513423087369446, "eval_loss": 3.554382085800171, "eval_runtime": 178.8649, "eval_samples_per_second": 93.093, "eval_steps_per_second": 5.82, "step": 114000 }, { "epoch": 33.193001863064744, "grad_norm": 0.4853729009628296, "learning_rate": 0.0002018089691322073, "loss": 3.0812, "step": 114050 }, { "epoch": 33.20755705635771, "grad_norm": 0.4849071800708771, "learning_rate": 0.00020163424577751893, "loss": 3.0922, "step": 114100 }, { "epoch": 33.22211224965068, "grad_norm": 0.469696044921875, "learning_rate": 0.0002014595224228305, "loss": 3.0867, "step": 114150 }, { "epoch": 33.23666744294364, "grad_norm": 0.48402729630470276, "learning_rate": 0.00020128479906814206, "loss": 3.0773, "step": 114200 }, { "epoch": 33.25122263623661, "grad_norm": 0.49005845189094543, "learning_rate": 0.00020111007571345369, "loss": 3.0876, "step": 114250 }, { "epoch": 33.265777829529576, "grad_norm": 0.473341703414917, "learning_rate": 0.00020093535235876528, "loss": 3.1015, "step": 114300 }, { "epoch": 33.280333022822546, "grad_norm": 0.4849575161933899, "learning_rate": 0.00020076062900407685, "loss": 3.0956, "step": 114350 }, { "epoch": 33.29488821611551, "grad_norm": 0.494200199842453, "learning_rate": 0.00020058590564938847, "loss": 3.0835, "step": 114400 }, { "epoch": 33.30944340940848, "grad_norm": 0.5012302398681641, "learning_rate": 0.00020041118229470004, "loss": 3.1034, "step": 114450 }, { "epoch": 33.323998602701444, "grad_norm": 0.47126054763793945, "learning_rate": 0.00020023645894001166, "loss": 3.1085, "step": 114500 }, { "epoch": 33.33855379599441, "grad_norm": 0.4911480247974396, "learning_rate": 0.00020006173558532322, "loss": 3.1149, "step": 114550 }, { "epoch": 33.35310898928738, "grad_norm": 0.4866625666618347, "learning_rate": 0.0001998870122306348, "loss": 3.0898, "step": 114600 }, { "epoch": 33.36766418258034, "grad_norm": 0.4714414179325104, "learning_rate": 0.0001997122888759464, "loss": 3.1107, "step": 114650 }, { "epoch": 33.38221937587331, "grad_norm": 0.4877467751502991, "learning_rate": 0.00019953756552125798, "loss": 3.1017, "step": 114700 }, { "epoch": 33.396774569166276, "grad_norm": 0.485159307718277, "learning_rate": 0.00019936284216656957, "loss": 3.1153, "step": 114750 }, { "epoch": 33.411329762459246, "grad_norm": 0.4799259901046753, "learning_rate": 0.00019918811881188117, "loss": 3.1096, "step": 114800 }, { "epoch": 33.42588495575221, "grad_norm": 0.47757476568222046, "learning_rate": 0.00019901339545719276, "loss": 3.1026, "step": 114850 }, { "epoch": 33.44044014904518, "grad_norm": 0.4510685205459595, "learning_rate": 0.00019883867210250433, "loss": 3.1046, "step": 114900 }, { "epoch": 33.454995342338144, "grad_norm": 0.4681328237056732, "learning_rate": 0.00019866394874781595, "loss": 3.1112, "step": 114950 }, { "epoch": 33.469550535631114, "grad_norm": 0.47803494334220886, "learning_rate": 0.00019848922539312752, "loss": 3.1046, "step": 115000 }, { "epoch": 33.469550535631114, "eval_accuracy": 0.3753604709366139, "eval_loss": 3.546867609024048, "eval_runtime": 178.779, "eval_samples_per_second": 93.137, "eval_steps_per_second": 5.823, "step": 115000 }, { "epoch": 33.48410572892408, "grad_norm": 0.45089006423950195, "learning_rate": 0.0001983145020384391, "loss": 3.11, "step": 115050 }, { "epoch": 33.49866092221705, "grad_norm": 0.5001338720321655, "learning_rate": 0.0001981397786837507, "loss": 3.103, "step": 115100 }, { "epoch": 33.51321611551001, "grad_norm": 0.46167829632759094, "learning_rate": 0.0001979650553290623, "loss": 3.1014, "step": 115150 }, { "epoch": 33.52777130880298, "grad_norm": 0.46934643387794495, "learning_rate": 0.0001977903319743739, "loss": 3.1158, "step": 115200 }, { "epoch": 33.542326502095946, "grad_norm": 0.4888630509376526, "learning_rate": 0.0001976156086196855, "loss": 3.1215, "step": 115250 }, { "epoch": 33.55688169538892, "grad_norm": 0.5060179233551025, "learning_rate": 0.00019744088526499706, "loss": 3.1092, "step": 115300 }, { "epoch": 33.57143688868188, "grad_norm": 0.48514828085899353, "learning_rate": 0.00019726616191030868, "loss": 3.1201, "step": 115350 }, { "epoch": 33.58599208197485, "grad_norm": 0.4591329097747803, "learning_rate": 0.00019709143855562025, "loss": 3.1113, "step": 115400 }, { "epoch": 33.600547275267814, "grad_norm": 0.5153087973594666, "learning_rate": 0.00019691671520093184, "loss": 3.1173, "step": 115450 }, { "epoch": 33.615102468560785, "grad_norm": 0.47350478172302246, "learning_rate": 0.00019674199184624343, "loss": 3.1185, "step": 115500 }, { "epoch": 33.62965766185375, "grad_norm": 0.46424007415771484, "learning_rate": 0.00019656726849155503, "loss": 3.1159, "step": 115550 }, { "epoch": 33.64421285514672, "grad_norm": 0.45461326837539673, "learning_rate": 0.0001963925451368666, "loss": 3.1286, "step": 115600 }, { "epoch": 33.65876804843968, "grad_norm": 0.4766155183315277, "learning_rate": 0.00019621782178217822, "loss": 3.1215, "step": 115650 }, { "epoch": 33.67332324173265, "grad_norm": 0.47571155428886414, "learning_rate": 0.00019604309842748978, "loss": 3.1233, "step": 115700 }, { "epoch": 33.68787843502562, "grad_norm": 0.46603256464004517, "learning_rate": 0.00019586837507280135, "loss": 3.1216, "step": 115750 }, { "epoch": 33.70243362831859, "grad_norm": 0.462780624628067, "learning_rate": 0.00019569365171811297, "loss": 3.1272, "step": 115800 }, { "epoch": 33.71698882161155, "grad_norm": 0.48087212443351746, "learning_rate": 0.00019551892836342457, "loss": 3.1207, "step": 115850 }, { "epoch": 33.731544014904514, "grad_norm": 0.46344083547592163, "learning_rate": 0.00019534420500873616, "loss": 3.1287, "step": 115900 }, { "epoch": 33.746099208197485, "grad_norm": 0.48390519618988037, "learning_rate": 0.00019516948165404776, "loss": 3.1222, "step": 115950 }, { "epoch": 33.76065440149045, "grad_norm": 0.48883119225502014, "learning_rate": 0.00019499475829935932, "loss": 3.124, "step": 116000 }, { "epoch": 33.76065440149045, "eval_accuracy": 0.37565734491008634, "eval_loss": 3.5397818088531494, "eval_runtime": 178.6993, "eval_samples_per_second": 93.179, "eval_steps_per_second": 5.825, "step": 116000 }, { "epoch": 33.76065440149045, "step": 116000, "total_flos": 2.424181507817472e+18, "train_loss": 0.97679699680723, "train_runtime": 71541.8287, "train_samples_per_second": 192.06, "train_steps_per_second": 2.401 } ], "logging_steps": 50, "max_steps": 171800, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 10000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 20, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 14 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.424181507817472e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }