| { |
| "best_global_step": 79000, |
| "best_metric": 3.5309038162231445, |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/swap_require_to_push_1032/checkpoint-40000", |
| "epoch": 28.812761993479274, |
| "eval_steps": 1000, |
| "global_step": 99000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.014555193292966931, |
| "grad_norm": 0.686847448348999, |
| "learning_rate": 0.000294, |
| "loss": 8.4758, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.029110386585933862, |
| "grad_norm": 0.6339422464370728, |
| "learning_rate": 0.0005939999999999999, |
| "loss": 6.7351, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04366557987890079, |
| "grad_norm": 0.521210253238678, |
| "learning_rate": 0.0005998287711124053, |
| "loss": 6.3454, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.058220773171867725, |
| "grad_norm": 0.4336276948451996, |
| "learning_rate": 0.000599654047757717, |
| "loss": 6.1404, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07277596646483465, |
| "grad_norm": 0.48013415932655334, |
| "learning_rate": 0.0005994793244030285, |
| "loss": 5.9914, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.08733115975780158, |
| "grad_norm": 0.44007983803749084, |
| "learning_rate": 0.00059930460104834, |
| "loss": 5.8606, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.10188635305076851, |
| "grad_norm": 0.5110030174255371, |
| "learning_rate": 0.0005991298776936517, |
| "loss": 5.7424, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.11644154634373545, |
| "grad_norm": 0.4467369318008423, |
| "learning_rate": 0.0005989551543389632, |
| "loss": 5.6216, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.1309967396367024, |
| "grad_norm": 0.4658336043357849, |
| "learning_rate": 0.0005987804309842748, |
| "loss": 5.5233, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.1455519329296693, |
| "grad_norm": 0.44280505180358887, |
| "learning_rate": 0.0005986057076295864, |
| "loss": 5.4187, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.16010712622263623, |
| "grad_norm": 0.5383680462837219, |
| "learning_rate": 0.0005984309842748981, |
| "loss": 5.3322, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.17466231951560315, |
| "grad_norm": 0.48139896988868713, |
| "learning_rate": 0.0005982562609202096, |
| "loss": 5.2656, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.1892175128085701, |
| "grad_norm": 0.4733883738517761, |
| "learning_rate": 0.0005980815375655212, |
| "loss": 5.198, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.20377270610153703, |
| "grad_norm": 0.520087718963623, |
| "learning_rate": 0.0005979068142108328, |
| "loss": 5.1358, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.21832789939450395, |
| "grad_norm": 0.4379572868347168, |
| "learning_rate": 0.0005977320908561445, |
| "loss": 5.0737, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.2328830926874709, |
| "grad_norm": 0.4522136151790619, |
| "learning_rate": 0.000597557367501456, |
| "loss": 5.035, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.24743828598043782, |
| "grad_norm": 0.43239545822143555, |
| "learning_rate": 0.0005973826441467675, |
| "loss": 4.9725, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.2619934792734048, |
| "grad_norm": 0.45372676849365234, |
| "learning_rate": 0.0005972079207920792, |
| "loss": 4.919, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.27654867256637167, |
| "grad_norm": 0.44981423020362854, |
| "learning_rate": 0.0005970331974373907, |
| "loss": 4.9016, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.2911038658593386, |
| "grad_norm": 0.44235020875930786, |
| "learning_rate": 0.0005968584740827023, |
| "loss": 4.8368, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.2911038658593386, |
| "eval_accuracy": 0.25370901484969255, |
| "eval_loss": 4.760239124298096, |
| "eval_runtime": 80.7145, |
| "eval_samples_per_second": 206.295, |
| "eval_steps_per_second": 12.897, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.30565905915230557, |
| "grad_norm": 0.47886380553245544, |
| "learning_rate": 0.0005966837507280139, |
| "loss": 4.7753, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.32021425244527246, |
| "grad_norm": 0.46111685037612915, |
| "learning_rate": 0.0005965090273733256, |
| "loss": 4.7392, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.3347694457382394, |
| "grad_norm": 0.4319298565387726, |
| "learning_rate": 0.0005963343040186371, |
| "loss": 4.7019, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.3493246390312063, |
| "grad_norm": 0.4780100882053375, |
| "learning_rate": 0.0005961595806639486, |
| "loss": 4.6697, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.36387983232417326, |
| "grad_norm": 0.4365830719470978, |
| "learning_rate": 0.0005959848573092603, |
| "loss": 4.636, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.3784350256171402, |
| "grad_norm": 0.4605623781681061, |
| "learning_rate": 0.0005958101339545718, |
| "loss": 4.5988, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.3929902189101071, |
| "grad_norm": 0.42985111474990845, |
| "learning_rate": 0.0005956354105998835, |
| "loss": 4.5778, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.40754541220307405, |
| "grad_norm": 0.4026910364627838, |
| "learning_rate": 0.000595460687245195, |
| "loss": 4.5464, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.422100605496041, |
| "grad_norm": 0.4782453179359436, |
| "learning_rate": 0.0005952859638905067, |
| "loss": 4.5306, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.4366557987890079, |
| "grad_norm": 0.45075687766075134, |
| "learning_rate": 0.0005951112405358182, |
| "loss": 4.516, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.45121099208197485, |
| "grad_norm": 0.4650851786136627, |
| "learning_rate": 0.0005949365171811299, |
| "loss": 4.4787, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.4657661853749418, |
| "grad_norm": 0.3915107846260071, |
| "learning_rate": 0.0005947617938264414, |
| "loss": 4.4689, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.4803213786679087, |
| "grad_norm": 0.41667816042900085, |
| "learning_rate": 0.000594587070471753, |
| "loss": 4.4442, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.49487657196087564, |
| "grad_norm": 0.3894031345844269, |
| "learning_rate": 0.0005944123471170646, |
| "loss": 4.4208, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.5094317652538426, |
| "grad_norm": 0.36939871311187744, |
| "learning_rate": 0.0005942376237623762, |
| "loss": 4.4348, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.5239869585468095, |
| "grad_norm": 0.36502978205680847, |
| "learning_rate": 0.0005940629004076878, |
| "loss": 4.3948, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.5385421518397764, |
| "grad_norm": 0.4573618471622467, |
| "learning_rate": 0.0005938881770529993, |
| "loss": 4.3819, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.5530973451327433, |
| "grad_norm": 0.4011140465736389, |
| "learning_rate": 0.000593713453698311, |
| "loss": 4.373, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5676525384257103, |
| "grad_norm": 0.4717750549316406, |
| "learning_rate": 0.0005935387303436226, |
| "loss": 4.3585, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.5822077317186772, |
| "grad_norm": 0.3939552903175354, |
| "learning_rate": 0.0005933640069889342, |
| "loss": 4.3514, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5822077317186772, |
| "eval_accuracy": 0.29934921605173836, |
| "eval_loss": 4.285079479217529, |
| "eval_runtime": 80.8003, |
| "eval_samples_per_second": 206.076, |
| "eval_steps_per_second": 12.884, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5967629250116442, |
| "grad_norm": 0.3877165615558624, |
| "learning_rate": 0.0005931892836342457, |
| "loss": 4.3361, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.6113181183046111, |
| "grad_norm": 0.43088725209236145, |
| "learning_rate": 0.0005930145602795573, |
| "loss": 4.3091, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.625873311597578, |
| "grad_norm": 0.42761871218681335, |
| "learning_rate": 0.000592839836924869, |
| "loss": 4.3027, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.6404285048905449, |
| "grad_norm": 0.40478938817977905, |
| "learning_rate": 0.0005926651135701805, |
| "loss": 4.2962, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.6549836981835119, |
| "grad_norm": 0.38832443952560425, |
| "learning_rate": 0.0005924903902154921, |
| "loss": 4.2914, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.6695388914764788, |
| "grad_norm": 0.40584078431129456, |
| "learning_rate": 0.0005923156668608037, |
| "loss": 4.2738, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.6840940847694458, |
| "grad_norm": 0.4648968577384949, |
| "learning_rate": 0.0005921409435061153, |
| "loss": 4.2719, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.6986492780624126, |
| "grad_norm": 0.3713863790035248, |
| "learning_rate": 0.0005919662201514268, |
| "loss": 4.2445, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.7132044713553796, |
| "grad_norm": 0.3868687152862549, |
| "learning_rate": 0.0005917914967967384, |
| "loss": 4.2408, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.7277596646483465, |
| "grad_norm": 0.3640577495098114, |
| "learning_rate": 0.0005916167734420501, |
| "loss": 4.2353, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.7423148579413135, |
| "grad_norm": 0.4075905382633209, |
| "learning_rate": 0.0005914420500873616, |
| "loss": 4.2106, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.7568700512342804, |
| "grad_norm": 0.4145139455795288, |
| "learning_rate": 0.0005912673267326732, |
| "loss": 4.2097, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.7714252445272474, |
| "grad_norm": 0.3656455874443054, |
| "learning_rate": 0.0005910926033779848, |
| "loss": 4.204, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.7859804378202142, |
| "grad_norm": 0.3666558265686035, |
| "learning_rate": 0.0005909178800232964, |
| "loss": 4.2007, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.8005356311131812, |
| "grad_norm": 0.36030831933021545, |
| "learning_rate": 0.000590743156668608, |
| "loss": 4.1861, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.8150908244061481, |
| "grad_norm": 0.43598672747612, |
| "learning_rate": 0.0005905684333139196, |
| "loss": 4.1762, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.8296460176991151, |
| "grad_norm": 0.37617698311805725, |
| "learning_rate": 0.0005903937099592312, |
| "loss": 4.1716, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.844201210992082, |
| "grad_norm": 0.3954637348651886, |
| "learning_rate": 0.0005902189866045427, |
| "loss": 4.175, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.858756404285049, |
| "grad_norm": 0.35737791657447815, |
| "learning_rate": 0.0005900442632498543, |
| "loss": 4.1572, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.8733115975780158, |
| "grad_norm": 0.35217055678367615, |
| "learning_rate": 0.0005898695398951659, |
| "loss": 4.1537, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8733115975780158, |
| "eval_accuracy": 0.31552085574921834, |
| "eval_loss": 4.096437931060791, |
| "eval_runtime": 80.6777, |
| "eval_samples_per_second": 206.389, |
| "eval_steps_per_second": 12.903, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8878667908709827, |
| "grad_norm": 0.38153108954429626, |
| "learning_rate": 0.0005896948165404776, |
| "loss": 4.1409, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.9024219841639497, |
| "grad_norm": 0.3713655471801758, |
| "learning_rate": 0.0005895200931857891, |
| "loss": 4.1527, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.9169771774569166, |
| "grad_norm": 0.36295828223228455, |
| "learning_rate": 0.0005893453698311007, |
| "loss": 4.1285, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.9315323707498836, |
| "grad_norm": 0.4058021008968353, |
| "learning_rate": 0.0005891706464764123, |
| "loss": 4.1245, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.9460875640428504, |
| "grad_norm": 0.34858351945877075, |
| "learning_rate": 0.0005889959231217238, |
| "loss": 4.1113, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.9606427573358174, |
| "grad_norm": 0.37450098991394043, |
| "learning_rate": 0.0005888211997670355, |
| "loss": 4.0968, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.9751979506287843, |
| "grad_norm": 0.35292214155197144, |
| "learning_rate": 0.000588646476412347, |
| "loss": 4.0934, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.9897531439217513, |
| "grad_norm": 0.3288467228412628, |
| "learning_rate": 0.0005884717530576587, |
| "loss": 4.0906, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.0040754541220307, |
| "grad_norm": 0.336471825838089, |
| "learning_rate": 0.0005882970297029702, |
| "loss": 4.0816, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.0186306474149978, |
| "grad_norm": 0.35409805178642273, |
| "learning_rate": 0.0005881223063482818, |
| "loss": 4.0133, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.0331858407079646, |
| "grad_norm": 0.34770667552948, |
| "learning_rate": 0.0005879475829935934, |
| "loss": 4.0164, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.0477410340009314, |
| "grad_norm": 0.334242582321167, |
| "learning_rate": 0.0005877728596389051, |
| "loss": 4.0235, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.0622962272938985, |
| "grad_norm": 0.3742446303367615, |
| "learning_rate": 0.0005875981362842166, |
| "loss": 4.0166, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.0768514205868653, |
| "grad_norm": 0.3473527729511261, |
| "learning_rate": 0.0005874234129295281, |
| "loss": 4.0106, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.0914066138798324, |
| "grad_norm": 0.3610967695713043, |
| "learning_rate": 0.0005872486895748398, |
| "loss": 3.9922, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.1059618071727992, |
| "grad_norm": 0.3372357487678528, |
| "learning_rate": 0.0005870739662201513, |
| "loss": 4.0113, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.120517000465766, |
| "grad_norm": 0.3435993194580078, |
| "learning_rate": 0.000586899242865463, |
| "loss": 4.0138, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.1350721937587331, |
| "grad_norm": 0.3696841299533844, |
| "learning_rate": 0.0005867245195107746, |
| "loss": 3.994, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.1496273870517, |
| "grad_norm": 0.33927223086357117, |
| "learning_rate": 0.0005865497961560862, |
| "loss": 3.994, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.164182580344667, |
| "grad_norm": 0.3394480049610138, |
| "learning_rate": 0.0005863750728013977, |
| "loss": 3.984, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.164182580344667, |
| "eval_accuracy": 0.3248662744937188, |
| "eval_loss": 3.9905617237091064, |
| "eval_runtime": 80.772, |
| "eval_samples_per_second": 206.148, |
| "eval_steps_per_second": 12.888, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.1787377736376339, |
| "grad_norm": 0.358623206615448, |
| "learning_rate": 0.0005862003494467094, |
| "loss": 3.9918, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.193292966930601, |
| "grad_norm": 0.34555932879447937, |
| "learning_rate": 0.0005860256260920209, |
| "loss": 3.9909, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.2078481602235678, |
| "grad_norm": 0.35212889313697815, |
| "learning_rate": 0.0005858509027373325, |
| "loss": 3.9801, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.2224033535165346, |
| "grad_norm": 0.3538370728492737, |
| "learning_rate": 0.0005856761793826441, |
| "loss": 3.9793, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.2369585468095017, |
| "grad_norm": 0.3170168995857239, |
| "learning_rate": 0.0005855014560279557, |
| "loss": 3.9947, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.2515137401024685, |
| "grad_norm": 0.3463495671749115, |
| "learning_rate": 0.0005853267326732673, |
| "loss": 3.9787, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.2660689333954354, |
| "grad_norm": 0.3341588079929352, |
| "learning_rate": 0.0005851520093185788, |
| "loss": 3.9695, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.2806241266884024, |
| "grad_norm": 0.348074734210968, |
| "learning_rate": 0.0005849772859638905, |
| "loss": 3.9691, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.2951793199813695, |
| "grad_norm": 0.36371445655822754, |
| "learning_rate": 0.0005848025626092021, |
| "loss": 3.9655, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.3097345132743363, |
| "grad_norm": 0.356099933385849, |
| "learning_rate": 0.0005846278392545136, |
| "loss": 3.955, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.3242897065673032, |
| "grad_norm": 0.35547953844070435, |
| "learning_rate": 0.0005844531158998252, |
| "loss": 3.9673, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.3388448998602702, |
| "grad_norm": 0.3330957293510437, |
| "learning_rate": 0.0005842783925451368, |
| "loss": 3.9521, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.353400093153237, |
| "grad_norm": 0.36023977398872375, |
| "learning_rate": 0.0005841036691904484, |
| "loss": 3.9628, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.367955286446204, |
| "grad_norm": 0.33673009276390076, |
| "learning_rate": 0.00058392894583576, |
| "loss": 3.9497, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.382510479739171, |
| "grad_norm": 0.32880502939224243, |
| "learning_rate": 0.0005837542224810716, |
| "loss": 3.9612, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.3970656730321378, |
| "grad_norm": 0.3557884991168976, |
| "learning_rate": 0.0005835794991263832, |
| "loss": 3.9334, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.4116208663251049, |
| "grad_norm": 0.358247846364975, |
| "learning_rate": 0.0005834047757716948, |
| "loss": 3.9403, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.4261760596180717, |
| "grad_norm": 0.32892918586730957, |
| "learning_rate": 0.0005832300524170063, |
| "loss": 3.9405, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.4407312529110388, |
| "grad_norm": 0.31469571590423584, |
| "learning_rate": 0.0005830553290623179, |
| "loss": 3.9364, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.4552864462040056, |
| "grad_norm": 0.3483119010925293, |
| "learning_rate": 0.0005828806057076296, |
| "loss": 3.9259, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.4552864462040056, |
| "eval_accuracy": 0.3314562655628189, |
| "eval_loss": 3.9168081283569336, |
| "eval_runtime": 80.9479, |
| "eval_samples_per_second": 205.7, |
| "eval_steps_per_second": 12.86, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.4698416394969724, |
| "grad_norm": 0.3485657274723053, |
| "learning_rate": 0.0005827058823529411, |
| "loss": 3.9295, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.4843968327899395, |
| "grad_norm": 0.3288109302520752, |
| "learning_rate": 0.0005825311589982527, |
| "loss": 3.9272, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.4989520260829063, |
| "grad_norm": 0.3118053674697876, |
| "learning_rate": 0.0005823564356435643, |
| "loss": 3.9236, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.5135072193758732, |
| "grad_norm": 0.3337427079677582, |
| "learning_rate": 0.0005821817122888759, |
| "loss": 3.9168, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.5280624126688402, |
| "grad_norm": 0.3375294804573059, |
| "learning_rate": 0.0005820069889341875, |
| "loss": 3.9231, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.5426176059618073, |
| "grad_norm": 0.34049808979034424, |
| "learning_rate": 0.000581832265579499, |
| "loss": 3.9116, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.5571727992547741, |
| "grad_norm": 0.3389519453048706, |
| "learning_rate": 0.0005816575422248107, |
| "loss": 3.9266, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.571727992547741, |
| "grad_norm": 0.34914088249206543, |
| "learning_rate": 0.0005814828188701222, |
| "loss": 3.9127, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.586283185840708, |
| "grad_norm": 0.35371309518814087, |
| "learning_rate": 0.0005813080955154338, |
| "loss": 3.9182, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.6008383791336749, |
| "grad_norm": 0.33470141887664795, |
| "learning_rate": 0.0005811333721607454, |
| "loss": 3.9089, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.6153935724266417, |
| "grad_norm": 0.33131688833236694, |
| "learning_rate": 0.0005809586488060571, |
| "loss": 3.9093, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.6299487657196088, |
| "grad_norm": 0.36808761954307556, |
| "learning_rate": 0.0005807839254513686, |
| "loss": 3.9103, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.6445039590125758, |
| "grad_norm": 0.3227200508117676, |
| "learning_rate": 0.0005806092020966802, |
| "loss": 3.8958, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.6590591523055425, |
| "grad_norm": 0.34133103489875793, |
| "learning_rate": 0.0005804344787419918, |
| "loss": 3.8961, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.6736143455985095, |
| "grad_norm": 0.3205196261405945, |
| "learning_rate": 0.0005802597553873033, |
| "loss": 3.8943, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.6881695388914766, |
| "grad_norm": 0.33680620789527893, |
| "learning_rate": 0.000580085032032615, |
| "loss": 3.8921, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.7027247321844434, |
| "grad_norm": 0.3578755855560303, |
| "learning_rate": 0.0005799103086779265, |
| "loss": 3.8943, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.7172799254774103, |
| "grad_norm": 0.35171619057655334, |
| "learning_rate": 0.0005797355853232382, |
| "loss": 3.8784, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.7318351187703773, |
| "grad_norm": 0.3266540467739105, |
| "learning_rate": 0.0005795608619685497, |
| "loss": 3.8853, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.7463903120633442, |
| "grad_norm": 0.3230031430721283, |
| "learning_rate": 0.0005793861386138614, |
| "loss": 3.8785, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.7463903120633442, |
| "eval_accuracy": 0.3372420172809799, |
| "eval_loss": 3.857877254486084, |
| "eval_runtime": 80.7565, |
| "eval_samples_per_second": 206.188, |
| "eval_steps_per_second": 12.891, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.760945505356311, |
| "grad_norm": 0.3300028145313263, |
| "learning_rate": 0.0005792114152591729, |
| "loss": 3.8906, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.775500698649278, |
| "grad_norm": 0.3108998239040375, |
| "learning_rate": 0.0005790366919044846, |
| "loss": 3.8773, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.7900558919422451, |
| "grad_norm": 0.3450206518173218, |
| "learning_rate": 0.0005788619685497961, |
| "loss": 3.8776, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.804611085235212, |
| "grad_norm": 0.3282497823238373, |
| "learning_rate": 0.0005786872451951077, |
| "loss": 3.8661, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.8191662785281788, |
| "grad_norm": 0.34226560592651367, |
| "learning_rate": 0.0005785125218404193, |
| "loss": 3.867, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.8337214718211459, |
| "grad_norm": 0.3334205448627472, |
| "learning_rate": 0.0005783377984857308, |
| "loss": 3.8708, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.8482766651141127, |
| "grad_norm": 0.32135775685310364, |
| "learning_rate": 0.0005781630751310425, |
| "loss": 3.8721, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.8628318584070795, |
| "grad_norm": 0.3335668742656708, |
| "learning_rate": 0.0005779883517763541, |
| "loss": 3.8587, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.8773870517000466, |
| "grad_norm": 0.3514259159564972, |
| "learning_rate": 0.0005778136284216657, |
| "loss": 3.8572, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.8919422449930137, |
| "grad_norm": 0.325557142496109, |
| "learning_rate": 0.0005776389050669772, |
| "loss": 3.8551, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.9064974382859803, |
| "grad_norm": 0.3148052990436554, |
| "learning_rate": 0.0005774641817122889, |
| "loss": 3.8621, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.9210526315789473, |
| "grad_norm": 0.3131468892097473, |
| "learning_rate": 0.0005772894583576004, |
| "loss": 3.8518, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.9356078248719144, |
| "grad_norm": 0.32041823863983154, |
| "learning_rate": 0.000577114735002912, |
| "loss": 3.8596, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.9501630181648812, |
| "grad_norm": 0.31143495440483093, |
| "learning_rate": 0.0005769400116482236, |
| "loss": 3.8629, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.964718211457848, |
| "grad_norm": 0.3137151598930359, |
| "learning_rate": 0.0005767652882935352, |
| "loss": 3.8451, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.9792734047508151, |
| "grad_norm": 0.3450681269168854, |
| "learning_rate": 0.0005765905649388468, |
| "loss": 3.8645, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.993828598043782, |
| "grad_norm": 0.31859156489372253, |
| "learning_rate": 0.0005764158415841583, |
| "loss": 3.8442, |
| "step": 6850 |
| }, |
| { |
| "epoch": 2.0081509082440614, |
| "grad_norm": 0.31846410036087036, |
| "learning_rate": 0.00057624111822947, |
| "loss": 3.8075, |
| "step": 6900 |
| }, |
| { |
| "epoch": 2.0227061015370285, |
| "grad_norm": 0.3247222304344177, |
| "learning_rate": 0.0005760663948747816, |
| "loss": 3.7526, |
| "step": 6950 |
| }, |
| { |
| "epoch": 2.0372612948299955, |
| "grad_norm": 0.3297727108001709, |
| "learning_rate": 0.0005758916715200931, |
| "loss": 3.749, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.0372612948299955, |
| "eval_accuracy": 0.34135382758814814, |
| "eval_loss": 3.815169334411621, |
| "eval_runtime": 80.6048, |
| "eval_samples_per_second": 206.576, |
| "eval_steps_per_second": 12.915, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.051816488122962, |
| "grad_norm": 0.31669923663139343, |
| "learning_rate": 0.0005757169481654047, |
| "loss": 3.7408, |
| "step": 7050 |
| }, |
| { |
| "epoch": 2.066371681415929, |
| "grad_norm": 0.3360742926597595, |
| "learning_rate": 0.0005755422248107163, |
| "loss": 3.7597, |
| "step": 7100 |
| }, |
| { |
| "epoch": 2.0809268747088963, |
| "grad_norm": 0.33094435930252075, |
| "learning_rate": 0.0005753675014560279, |
| "loss": 3.7534, |
| "step": 7150 |
| }, |
| { |
| "epoch": 2.095482068001863, |
| "grad_norm": 0.3317500352859497, |
| "learning_rate": 0.0005751927781013395, |
| "loss": 3.773, |
| "step": 7200 |
| }, |
| { |
| "epoch": 2.11003726129483, |
| "grad_norm": 0.32213926315307617, |
| "learning_rate": 0.0005750180547466511, |
| "loss": 3.7557, |
| "step": 7250 |
| }, |
| { |
| "epoch": 2.124592454587797, |
| "grad_norm": 0.33375924825668335, |
| "learning_rate": 0.0005748433313919627, |
| "loss": 3.7663, |
| "step": 7300 |
| }, |
| { |
| "epoch": 2.139147647880764, |
| "grad_norm": 0.3372042179107666, |
| "learning_rate": 0.0005746686080372743, |
| "loss": 3.7554, |
| "step": 7350 |
| }, |
| { |
| "epoch": 2.1537028411737307, |
| "grad_norm": 0.32004597783088684, |
| "learning_rate": 0.0005744938846825858, |
| "loss": 3.7509, |
| "step": 7400 |
| }, |
| { |
| "epoch": 2.1682580344666977, |
| "grad_norm": 0.33065932989120483, |
| "learning_rate": 0.0005743191613278974, |
| "loss": 3.7474, |
| "step": 7450 |
| }, |
| { |
| "epoch": 2.182813227759665, |
| "grad_norm": 0.3302080035209656, |
| "learning_rate": 0.0005741444379732091, |
| "loss": 3.7635, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.1973684210526314, |
| "grad_norm": 0.31607386469841003, |
| "learning_rate": 0.0005739697146185206, |
| "loss": 3.762, |
| "step": 7550 |
| }, |
| { |
| "epoch": 2.2119236143455985, |
| "grad_norm": 0.30909621715545654, |
| "learning_rate": 0.0005737949912638322, |
| "loss": 3.7537, |
| "step": 7600 |
| }, |
| { |
| "epoch": 2.2264788076385655, |
| "grad_norm": 0.33623868227005005, |
| "learning_rate": 0.0005736202679091438, |
| "loss": 3.7475, |
| "step": 7650 |
| }, |
| { |
| "epoch": 2.241034000931532, |
| "grad_norm": 0.34129029512405396, |
| "learning_rate": 0.0005734455445544554, |
| "loss": 3.7407, |
| "step": 7700 |
| }, |
| { |
| "epoch": 2.255589194224499, |
| "grad_norm": 0.35044869780540466, |
| "learning_rate": 0.000573270821199767, |
| "loss": 3.7586, |
| "step": 7750 |
| }, |
| { |
| "epoch": 2.2701443875174663, |
| "grad_norm": 0.3177712559700012, |
| "learning_rate": 0.0005730960978450785, |
| "loss": 3.7524, |
| "step": 7800 |
| }, |
| { |
| "epoch": 2.2846995808104333, |
| "grad_norm": 0.31583237648010254, |
| "learning_rate": 0.0005729213744903902, |
| "loss": 3.7529, |
| "step": 7850 |
| }, |
| { |
| "epoch": 2.2992547741034, |
| "grad_norm": 0.3143191635608673, |
| "learning_rate": 0.0005727466511357017, |
| "loss": 3.7554, |
| "step": 7900 |
| }, |
| { |
| "epoch": 2.313809967396367, |
| "grad_norm": 0.3100457787513733, |
| "learning_rate": 0.0005725719277810134, |
| "loss": 3.7575, |
| "step": 7950 |
| }, |
| { |
| "epoch": 2.328365160689334, |
| "grad_norm": 0.33243656158447266, |
| "learning_rate": 0.0005723972044263249, |
| "loss": 3.7559, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.328365160689334, |
| "eval_accuracy": 0.34451072853883824, |
| "eval_loss": 3.7819924354553223, |
| "eval_runtime": 80.6907, |
| "eval_samples_per_second": 206.356, |
| "eval_steps_per_second": 12.901, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.3429203539823007, |
| "grad_norm": 0.3311172127723694, |
| "learning_rate": 0.0005722224810716366, |
| "loss": 3.7465, |
| "step": 8050 |
| }, |
| { |
| "epoch": 2.3574755472752678, |
| "grad_norm": 0.31717216968536377, |
| "learning_rate": 0.0005720477577169481, |
| "loss": 3.7589, |
| "step": 8100 |
| }, |
| { |
| "epoch": 2.372030740568235, |
| "grad_norm": 0.33885911107063293, |
| "learning_rate": 0.0005718730343622598, |
| "loss": 3.7542, |
| "step": 8150 |
| }, |
| { |
| "epoch": 2.386585933861202, |
| "grad_norm": 0.33938223123550415, |
| "learning_rate": 0.0005716983110075713, |
| "loss": 3.7582, |
| "step": 8200 |
| }, |
| { |
| "epoch": 2.4011411271541685, |
| "grad_norm": 0.35579636693000793, |
| "learning_rate": 0.0005715235876528828, |
| "loss": 3.748, |
| "step": 8250 |
| }, |
| { |
| "epoch": 2.4156963204471356, |
| "grad_norm": 0.32005932927131653, |
| "learning_rate": 0.0005713488642981945, |
| "loss": 3.7616, |
| "step": 8300 |
| }, |
| { |
| "epoch": 2.4302515137401026, |
| "grad_norm": 0.3208124041557312, |
| "learning_rate": 0.0005711741409435061, |
| "loss": 3.7637, |
| "step": 8350 |
| }, |
| { |
| "epoch": 2.4448067070330692, |
| "grad_norm": 0.33127275109291077, |
| "learning_rate": 0.0005709994175888177, |
| "loss": 3.7459, |
| "step": 8400 |
| }, |
| { |
| "epoch": 2.4593619003260363, |
| "grad_norm": 0.32771068811416626, |
| "learning_rate": 0.0005708246942341292, |
| "loss": 3.7441, |
| "step": 8450 |
| }, |
| { |
| "epoch": 2.4739170936190034, |
| "grad_norm": 0.3294542133808136, |
| "learning_rate": 0.0005706499708794409, |
| "loss": 3.7477, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.4884722869119704, |
| "grad_norm": 0.32679829001426697, |
| "learning_rate": 0.0005704752475247524, |
| "loss": 3.7332, |
| "step": 8550 |
| }, |
| { |
| "epoch": 2.503027480204937, |
| "grad_norm": 0.32272106409072876, |
| "learning_rate": 0.0005703005241700641, |
| "loss": 3.7496, |
| "step": 8600 |
| }, |
| { |
| "epoch": 2.517582673497904, |
| "grad_norm": 0.3153320252895355, |
| "learning_rate": 0.0005701258008153756, |
| "loss": 3.7467, |
| "step": 8650 |
| }, |
| { |
| "epoch": 2.5321378667908707, |
| "grad_norm": 0.3348556458950043, |
| "learning_rate": 0.0005699510774606872, |
| "loss": 3.7438, |
| "step": 8700 |
| }, |
| { |
| "epoch": 2.546693060083838, |
| "grad_norm": 0.32009074091911316, |
| "learning_rate": 0.0005697763541059988, |
| "loss": 3.7556, |
| "step": 8750 |
| }, |
| { |
| "epoch": 2.561248253376805, |
| "grad_norm": 0.3292238712310791, |
| "learning_rate": 0.0005696016307513103, |
| "loss": 3.7396, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.575803446669772, |
| "grad_norm": 0.31015273928642273, |
| "learning_rate": 0.000569426907396622, |
| "loss": 3.7467, |
| "step": 8850 |
| }, |
| { |
| "epoch": 2.590358639962739, |
| "grad_norm": 0.32287493348121643, |
| "learning_rate": 0.0005692521840419336, |
| "loss": 3.7362, |
| "step": 8900 |
| }, |
| { |
| "epoch": 2.6049138332557056, |
| "grad_norm": 0.3242190182209015, |
| "learning_rate": 0.0005690774606872452, |
| "loss": 3.7454, |
| "step": 8950 |
| }, |
| { |
| "epoch": 2.6194690265486726, |
| "grad_norm": 0.3385515511035919, |
| "learning_rate": 0.0005689027373325567, |
| "loss": 3.7391, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.6194690265486726, |
| "eval_accuracy": 0.3473332643056293, |
| "eval_loss": 3.7549922466278076, |
| "eval_runtime": 80.8735, |
| "eval_samples_per_second": 205.889, |
| "eval_steps_per_second": 12.872, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.6340242198416393, |
| "grad_norm": 0.32262712717056274, |
| "learning_rate": 0.0005687280139778683, |
| "loss": 3.7403, |
| "step": 9050 |
| }, |
| { |
| "epoch": 2.6485794131346063, |
| "grad_norm": 0.29943108558654785, |
| "learning_rate": 0.0005685532906231799, |
| "loss": 3.7475, |
| "step": 9100 |
| }, |
| { |
| "epoch": 2.6631346064275734, |
| "grad_norm": 0.3075816333293915, |
| "learning_rate": 0.0005683785672684915, |
| "loss": 3.7267, |
| "step": 9150 |
| }, |
| { |
| "epoch": 2.6776897997205404, |
| "grad_norm": 0.35581615567207336, |
| "learning_rate": 0.0005682038439138031, |
| "loss": 3.7367, |
| "step": 9200 |
| }, |
| { |
| "epoch": 2.692244993013507, |
| "grad_norm": 0.31568026542663574, |
| "learning_rate": 0.0005680291205591147, |
| "loss": 3.7589, |
| "step": 9250 |
| }, |
| { |
| "epoch": 2.706800186306474, |
| "grad_norm": 0.3257187604904175, |
| "learning_rate": 0.0005678543972044263, |
| "loss": 3.7434, |
| "step": 9300 |
| }, |
| { |
| "epoch": 2.721355379599441, |
| "grad_norm": 0.3284075856208801, |
| "learning_rate": 0.0005676796738497378, |
| "loss": 3.7337, |
| "step": 9350 |
| }, |
| { |
| "epoch": 2.735910572892408, |
| "grad_norm": 0.3168555796146393, |
| "learning_rate": 0.0005675049504950495, |
| "loss": 3.7416, |
| "step": 9400 |
| }, |
| { |
| "epoch": 2.750465766185375, |
| "grad_norm": 0.316857248544693, |
| "learning_rate": 0.0005673302271403611, |
| "loss": 3.7266, |
| "step": 9450 |
| }, |
| { |
| "epoch": 2.765020959478342, |
| "grad_norm": 0.36205533146858215, |
| "learning_rate": 0.0005671555037856726, |
| "loss": 3.7278, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.779576152771309, |
| "grad_norm": 0.31689226627349854, |
| "learning_rate": 0.0005669807804309842, |
| "loss": 3.7282, |
| "step": 9550 |
| }, |
| { |
| "epoch": 2.7941313460642756, |
| "grad_norm": 0.33204737305641174, |
| "learning_rate": 0.0005668060570762958, |
| "loss": 3.7377, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.8086865393572427, |
| "grad_norm": 0.3281012177467346, |
| "learning_rate": 0.0005666313337216074, |
| "loss": 3.7281, |
| "step": 9650 |
| }, |
| { |
| "epoch": 2.8232417326502097, |
| "grad_norm": 0.32469600439071655, |
| "learning_rate": 0.000566456610366919, |
| "loss": 3.7214, |
| "step": 9700 |
| }, |
| { |
| "epoch": 2.8377969259431763, |
| "grad_norm": 0.30553606152534485, |
| "learning_rate": 0.0005662818870122306, |
| "loss": 3.732, |
| "step": 9750 |
| }, |
| { |
| "epoch": 2.8523521192361434, |
| "grad_norm": 0.31606000661849976, |
| "learning_rate": 0.0005661071636575422, |
| "loss": 3.7293, |
| "step": 9800 |
| }, |
| { |
| "epoch": 2.8669073125291105, |
| "grad_norm": 0.3030395805835724, |
| "learning_rate": 0.0005659324403028537, |
| "loss": 3.7173, |
| "step": 9850 |
| }, |
| { |
| "epoch": 2.8814625058220775, |
| "grad_norm": 0.3114665150642395, |
| "learning_rate": 0.0005657577169481653, |
| "loss": 3.7245, |
| "step": 9900 |
| }, |
| { |
| "epoch": 2.896017699115044, |
| "grad_norm": 0.3268154561519623, |
| "learning_rate": 0.0005655829935934769, |
| "loss": 3.7187, |
| "step": 9950 |
| }, |
| { |
| "epoch": 2.910572892408011, |
| "grad_norm": 0.3039146065711975, |
| "learning_rate": 0.0005654082702387886, |
| "loss": 3.7358, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.910572892408011, |
| "eval_accuracy": 0.34964855222225916, |
| "eval_loss": 3.728553056716919, |
| "eval_runtime": 80.6468, |
| "eval_samples_per_second": 206.468, |
| "eval_steps_per_second": 12.908, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.9251280857009783, |
| "grad_norm": 0.3251325786113739, |
| "learning_rate": 0.0005652335468841001, |
| "loss": 3.7181, |
| "step": 10050 |
| }, |
| { |
| "epoch": 2.939683278993945, |
| "grad_norm": 0.32574522495269775, |
| "learning_rate": 0.0005650588235294117, |
| "loss": 3.7253, |
| "step": 10100 |
| }, |
| { |
| "epoch": 2.954238472286912, |
| "grad_norm": 0.3061930537223816, |
| "learning_rate": 0.0005648841001747233, |
| "loss": 3.7225, |
| "step": 10150 |
| }, |
| { |
| "epoch": 2.968793665579879, |
| "grad_norm": 0.3113483786582947, |
| "learning_rate": 0.0005647093768200349, |
| "loss": 3.7303, |
| "step": 10200 |
| }, |
| { |
| "epoch": 2.983348858872846, |
| "grad_norm": 0.3267492353916168, |
| "learning_rate": 0.0005645346534653465, |
| "loss": 3.718, |
| "step": 10250 |
| }, |
| { |
| "epoch": 2.9979040521658127, |
| "grad_norm": 0.30930453538894653, |
| "learning_rate": 0.0005643599301106582, |
| "loss": 3.7165, |
| "step": 10300 |
| }, |
| { |
| "epoch": 3.0122263623660923, |
| "grad_norm": 0.33333995938301086, |
| "learning_rate": 0.0005641852067559697, |
| "loss": 3.6506, |
| "step": 10350 |
| }, |
| { |
| "epoch": 3.026781555659059, |
| "grad_norm": 0.32402825355529785, |
| "learning_rate": 0.0005640104834012812, |
| "loss": 3.6191, |
| "step": 10400 |
| }, |
| { |
| "epoch": 3.041336748952026, |
| "grad_norm": 0.3048689365386963, |
| "learning_rate": 0.0005638357600465929, |
| "loss": 3.612, |
| "step": 10450 |
| }, |
| { |
| "epoch": 3.055891942244993, |
| "grad_norm": 0.3315111994743347, |
| "learning_rate": 0.0005636610366919044, |
| "loss": 3.6134, |
| "step": 10500 |
| }, |
| { |
| "epoch": 3.07044713553796, |
| "grad_norm": 0.33771762251853943, |
| "learning_rate": 0.0005634863133372161, |
| "loss": 3.6258, |
| "step": 10550 |
| }, |
| { |
| "epoch": 3.0850023288309267, |
| "grad_norm": 0.3369899392127991, |
| "learning_rate": 0.0005633115899825276, |
| "loss": 3.6318, |
| "step": 10600 |
| }, |
| { |
| "epoch": 3.099557522123894, |
| "grad_norm": 0.34458309412002563, |
| "learning_rate": 0.0005631368666278393, |
| "loss": 3.631, |
| "step": 10650 |
| }, |
| { |
| "epoch": 3.114112715416861, |
| "grad_norm": 0.3282023072242737, |
| "learning_rate": 0.0005629621432731508, |
| "loss": 3.6488, |
| "step": 10700 |
| }, |
| { |
| "epoch": 3.1286679087098275, |
| "grad_norm": 0.32579246163368225, |
| "learning_rate": 0.0005627874199184623, |
| "loss": 3.6363, |
| "step": 10750 |
| }, |
| { |
| "epoch": 3.1432231020027945, |
| "grad_norm": 0.3263109028339386, |
| "learning_rate": 0.000562612696563774, |
| "loss": 3.6383, |
| "step": 10800 |
| }, |
| { |
| "epoch": 3.1577782952957616, |
| "grad_norm": 0.32289543747901917, |
| "learning_rate": 0.0005624379732090856, |
| "loss": 3.6326, |
| "step": 10850 |
| }, |
| { |
| "epoch": 3.1723334885887287, |
| "grad_norm": 0.3056752383708954, |
| "learning_rate": 0.0005622632498543972, |
| "loss": 3.631, |
| "step": 10900 |
| }, |
| { |
| "epoch": 3.1868886818816953, |
| "grad_norm": 0.33408114314079285, |
| "learning_rate": 0.0005620885264997087, |
| "loss": 3.6395, |
| "step": 10950 |
| }, |
| { |
| "epoch": 3.2014438751746623, |
| "grad_norm": 0.3321467339992523, |
| "learning_rate": 0.0005619138031450204, |
| "loss": 3.6391, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.2014438751746623, |
| "eval_accuracy": 0.35126772590892974, |
| "eval_loss": 3.715968608856201, |
| "eval_runtime": 80.693, |
| "eval_samples_per_second": 206.35, |
| "eval_steps_per_second": 12.901, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.2159990684676294, |
| "grad_norm": 0.3220079243183136, |
| "learning_rate": 0.0005617390797903319, |
| "loss": 3.6311, |
| "step": 11050 |
| }, |
| { |
| "epoch": 3.230554261760596, |
| "grad_norm": 0.344857394695282, |
| "learning_rate": 0.0005615643564356436, |
| "loss": 3.6402, |
| "step": 11100 |
| }, |
| { |
| "epoch": 3.245109455053563, |
| "grad_norm": 0.3287631571292877, |
| "learning_rate": 0.0005613896330809551, |
| "loss": 3.6399, |
| "step": 11150 |
| }, |
| { |
| "epoch": 3.25966464834653, |
| "grad_norm": 0.3120687007904053, |
| "learning_rate": 0.0005612149097262667, |
| "loss": 3.6368, |
| "step": 11200 |
| }, |
| { |
| "epoch": 3.274219841639497, |
| "grad_norm": 0.3104822337627411, |
| "learning_rate": 0.0005610401863715783, |
| "loss": 3.6433, |
| "step": 11250 |
| }, |
| { |
| "epoch": 3.288775034932464, |
| "grad_norm": 0.32921087741851807, |
| "learning_rate": 0.0005608654630168898, |
| "loss": 3.6416, |
| "step": 11300 |
| }, |
| { |
| "epoch": 3.303330228225431, |
| "grad_norm": 0.3301078677177429, |
| "learning_rate": 0.0005606907396622015, |
| "loss": 3.6498, |
| "step": 11350 |
| }, |
| { |
| "epoch": 3.317885421518398, |
| "grad_norm": 0.3328259587287903, |
| "learning_rate": 0.0005605160163075131, |
| "loss": 3.6515, |
| "step": 11400 |
| }, |
| { |
| "epoch": 3.3324406148113646, |
| "grad_norm": 0.3562871813774109, |
| "learning_rate": 0.0005603412929528247, |
| "loss": 3.6428, |
| "step": 11450 |
| }, |
| { |
| "epoch": 3.3469958081043316, |
| "grad_norm": 0.3341737985610962, |
| "learning_rate": 0.0005601665695981362, |
| "loss": 3.6459, |
| "step": 11500 |
| }, |
| { |
| "epoch": 3.3615510013972987, |
| "grad_norm": 0.32263845205307007, |
| "learning_rate": 0.0005599918462434478, |
| "loss": 3.6434, |
| "step": 11550 |
| }, |
| { |
| "epoch": 3.3761061946902653, |
| "grad_norm": 0.34071630239486694, |
| "learning_rate": 0.0005598171228887594, |
| "loss": 3.6477, |
| "step": 11600 |
| }, |
| { |
| "epoch": 3.3906613879832324, |
| "grad_norm": 0.347917765378952, |
| "learning_rate": 0.0005596423995340709, |
| "loss": 3.6382, |
| "step": 11650 |
| }, |
| { |
| "epoch": 3.4052165812761994, |
| "grad_norm": 0.3371036946773529, |
| "learning_rate": 0.0005594676761793826, |
| "loss": 3.6432, |
| "step": 11700 |
| }, |
| { |
| "epoch": 3.419771774569166, |
| "grad_norm": 0.316082626581192, |
| "learning_rate": 0.0005592929528246942, |
| "loss": 3.637, |
| "step": 11750 |
| }, |
| { |
| "epoch": 3.434326967862133, |
| "grad_norm": 0.3077159523963928, |
| "learning_rate": 0.0005591182294700058, |
| "loss": 3.6454, |
| "step": 11800 |
| }, |
| { |
| "epoch": 3.4488821611551, |
| "grad_norm": 0.33680781722068787, |
| "learning_rate": 0.0005589435061153173, |
| "loss": 3.6513, |
| "step": 11850 |
| }, |
| { |
| "epoch": 3.463437354448067, |
| "grad_norm": 0.32402291893959045, |
| "learning_rate": 0.000558768782760629, |
| "loss": 3.6519, |
| "step": 11900 |
| }, |
| { |
| "epoch": 3.477992547741034, |
| "grad_norm": 0.33457881212234497, |
| "learning_rate": 0.0005585940594059406, |
| "loss": 3.641, |
| "step": 11950 |
| }, |
| { |
| "epoch": 3.492547741034001, |
| "grad_norm": 0.3288632333278656, |
| "learning_rate": 0.0005584193360512521, |
| "loss": 3.6562, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.492547741034001, |
| "eval_accuracy": 0.35305966473455697, |
| "eval_loss": 3.6968538761138916, |
| "eval_runtime": 80.7055, |
| "eval_samples_per_second": 206.318, |
| "eval_steps_per_second": 12.899, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.507102934326968, |
| "grad_norm": 0.3138067126274109, |
| "learning_rate": 0.0005582446126965637, |
| "loss": 3.6445, |
| "step": 12050 |
| }, |
| { |
| "epoch": 3.5216581276199346, |
| "grad_norm": 0.337358295917511, |
| "learning_rate": 0.0005580698893418753, |
| "loss": 3.6375, |
| "step": 12100 |
| }, |
| { |
| "epoch": 3.5362133209129016, |
| "grad_norm": 0.3176218867301941, |
| "learning_rate": 0.0005578951659871869, |
| "loss": 3.6429, |
| "step": 12150 |
| }, |
| { |
| "epoch": 3.5507685142058687, |
| "grad_norm": 0.413610577583313, |
| "learning_rate": 0.0005577204426324985, |
| "loss": 3.6526, |
| "step": 12200 |
| }, |
| { |
| "epoch": 3.5653237074988358, |
| "grad_norm": 0.3302057683467865, |
| "learning_rate": 0.0005575457192778101, |
| "loss": 3.6489, |
| "step": 12250 |
| }, |
| { |
| "epoch": 3.5798789007918024, |
| "grad_norm": 0.32902398705482483, |
| "learning_rate": 0.0005573709959231217, |
| "loss": 3.6514, |
| "step": 12300 |
| }, |
| { |
| "epoch": 3.5944340940847694, |
| "grad_norm": 0.3304519057273865, |
| "learning_rate": 0.0005571962725684332, |
| "loss": 3.6577, |
| "step": 12350 |
| }, |
| { |
| "epoch": 3.6089892873777365, |
| "grad_norm": 0.3087259531021118, |
| "learning_rate": 0.0005570215492137449, |
| "loss": 3.6269, |
| "step": 12400 |
| }, |
| { |
| "epoch": 3.623544480670703, |
| "grad_norm": 0.30895984172821045, |
| "learning_rate": 0.0005568468258590564, |
| "loss": 3.6358, |
| "step": 12450 |
| }, |
| { |
| "epoch": 3.63809967396367, |
| "grad_norm": 0.3273976445198059, |
| "learning_rate": 0.0005566721025043681, |
| "loss": 3.6402, |
| "step": 12500 |
| }, |
| { |
| "epoch": 3.6526548672566372, |
| "grad_norm": 0.3480411469936371, |
| "learning_rate": 0.0005564973791496796, |
| "loss": 3.6282, |
| "step": 12550 |
| }, |
| { |
| "epoch": 3.6672100605496043, |
| "grad_norm": 0.31076759099960327, |
| "learning_rate": 0.0005563226557949913, |
| "loss": 3.6484, |
| "step": 12600 |
| }, |
| { |
| "epoch": 3.681765253842571, |
| "grad_norm": 0.31380197405815125, |
| "learning_rate": 0.0005561479324403028, |
| "loss": 3.6503, |
| "step": 12650 |
| }, |
| { |
| "epoch": 3.696320447135538, |
| "grad_norm": 0.32344043254852295, |
| "learning_rate": 0.0005559732090856144, |
| "loss": 3.6326, |
| "step": 12700 |
| }, |
| { |
| "epoch": 3.710875640428505, |
| "grad_norm": 0.34007418155670166, |
| "learning_rate": 0.000555798485730926, |
| "loss": 3.6376, |
| "step": 12750 |
| }, |
| { |
| "epoch": 3.7254308337214717, |
| "grad_norm": 0.3169088661670685, |
| "learning_rate": 0.0005556237623762376, |
| "loss": 3.6522, |
| "step": 12800 |
| }, |
| { |
| "epoch": 3.7399860270144387, |
| "grad_norm": 0.3295254707336426, |
| "learning_rate": 0.0005554490390215492, |
| "loss": 3.6384, |
| "step": 12850 |
| }, |
| { |
| "epoch": 3.754541220307406, |
| "grad_norm": 0.32250064611434937, |
| "learning_rate": 0.0005552743156668607, |
| "loss": 3.6373, |
| "step": 12900 |
| }, |
| { |
| "epoch": 3.769096413600373, |
| "grad_norm": 0.33161431550979614, |
| "learning_rate": 0.0005550995923121724, |
| "loss": 3.644, |
| "step": 12950 |
| }, |
| { |
| "epoch": 3.7836516068933395, |
| "grad_norm": 0.3506690561771393, |
| "learning_rate": 0.0005549248689574839, |
| "loss": 3.6513, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.7836516068933395, |
| "eval_accuracy": 0.35488756691564044, |
| "eval_loss": 3.678985118865967, |
| "eval_runtime": 80.8992, |
| "eval_samples_per_second": 205.824, |
| "eval_steps_per_second": 12.868, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.7982068001863065, |
| "grad_norm": 0.30930566787719727, |
| "learning_rate": 0.0005547501456027955, |
| "loss": 3.6508, |
| "step": 13050 |
| }, |
| { |
| "epoch": 3.812761993479273, |
| "grad_norm": 0.3367414176464081, |
| "learning_rate": 0.0005545754222481071, |
| "loss": 3.6332, |
| "step": 13100 |
| }, |
| { |
| "epoch": 3.82731718677224, |
| "grad_norm": 0.32576629519462585, |
| "learning_rate": 0.0005544006988934188, |
| "loss": 3.6427, |
| "step": 13150 |
| }, |
| { |
| "epoch": 3.8418723800652073, |
| "grad_norm": 0.31796979904174805, |
| "learning_rate": 0.0005542259755387303, |
| "loss": 3.6372, |
| "step": 13200 |
| }, |
| { |
| "epoch": 3.8564275733581743, |
| "grad_norm": 0.32642459869384766, |
| "learning_rate": 0.0005540512521840418, |
| "loss": 3.6309, |
| "step": 13250 |
| }, |
| { |
| "epoch": 3.8709827666511414, |
| "grad_norm": 0.3310360312461853, |
| "learning_rate": 0.0005538765288293535, |
| "loss": 3.631, |
| "step": 13300 |
| }, |
| { |
| "epoch": 3.885537959944108, |
| "grad_norm": 0.3166876435279846, |
| "learning_rate": 0.0005537018054746651, |
| "loss": 3.6359, |
| "step": 13350 |
| }, |
| { |
| "epoch": 3.900093153237075, |
| "grad_norm": 0.33555924892425537, |
| "learning_rate": 0.0005535270821199767, |
| "loss": 3.6349, |
| "step": 13400 |
| }, |
| { |
| "epoch": 3.9146483465300417, |
| "grad_norm": 0.3408661484718323, |
| "learning_rate": 0.0005533523587652882, |
| "loss": 3.6296, |
| "step": 13450 |
| }, |
| { |
| "epoch": 3.9292035398230087, |
| "grad_norm": 0.32099148631095886, |
| "learning_rate": 0.0005531776354105999, |
| "loss": 3.6561, |
| "step": 13500 |
| }, |
| { |
| "epoch": 3.943758733115976, |
| "grad_norm": 0.3195011019706726, |
| "learning_rate": 0.0005530029120559114, |
| "loss": 3.6481, |
| "step": 13550 |
| }, |
| { |
| "epoch": 3.958313926408943, |
| "grad_norm": 0.3143410086631775, |
| "learning_rate": 0.0005528281887012229, |
| "loss": 3.6438, |
| "step": 13600 |
| }, |
| { |
| "epoch": 3.9728691197019095, |
| "grad_norm": 0.30868852138519287, |
| "learning_rate": 0.0005526534653465346, |
| "loss": 3.6328, |
| "step": 13650 |
| }, |
| { |
| "epoch": 3.9874243129948765, |
| "grad_norm": 0.318017840385437, |
| "learning_rate": 0.0005524787419918462, |
| "loss": 3.6393, |
| "step": 13700 |
| }, |
| { |
| "epoch": 4.001746623195156, |
| "grad_norm": 0.3467653691768646, |
| "learning_rate": 0.0005523040186371578, |
| "loss": 3.6257, |
| "step": 13750 |
| }, |
| { |
| "epoch": 4.016301816488123, |
| "grad_norm": 0.3159387409687042, |
| "learning_rate": 0.0005521292952824693, |
| "loss": 3.533, |
| "step": 13800 |
| }, |
| { |
| "epoch": 4.03085700978109, |
| "grad_norm": 0.32453659176826477, |
| "learning_rate": 0.000551954571927781, |
| "loss": 3.5268, |
| "step": 13850 |
| }, |
| { |
| "epoch": 4.045412203074057, |
| "grad_norm": 0.3308572769165039, |
| "learning_rate": 0.0005517798485730926, |
| "loss": 3.5263, |
| "step": 13900 |
| }, |
| { |
| "epoch": 4.059967396367024, |
| "grad_norm": 0.33152005076408386, |
| "learning_rate": 0.0005516051252184042, |
| "loss": 3.5264, |
| "step": 13950 |
| }, |
| { |
| "epoch": 4.074522589659991, |
| "grad_norm": 0.3357018530368805, |
| "learning_rate": 0.0005514304018637157, |
| "loss": 3.5372, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.074522589659991, |
| "eval_accuracy": 0.3562491207488464, |
| "eval_loss": 3.6702051162719727, |
| "eval_runtime": 80.6876, |
| "eval_samples_per_second": 206.364, |
| "eval_steps_per_second": 12.902, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.089077782952957, |
| "grad_norm": 0.3306080996990204, |
| "learning_rate": 0.0005512556785090273, |
| "loss": 3.5466, |
| "step": 14050 |
| }, |
| { |
| "epoch": 4.103632976245924, |
| "grad_norm": 0.3229883909225464, |
| "learning_rate": 0.0005510809551543389, |
| "loss": 3.5388, |
| "step": 14100 |
| }, |
| { |
| "epoch": 4.118188169538891, |
| "grad_norm": 0.3394538462162018, |
| "learning_rate": 0.0005509062317996504, |
| "loss": 3.5427, |
| "step": 14150 |
| }, |
| { |
| "epoch": 4.132743362831858, |
| "grad_norm": 0.3470659852027893, |
| "learning_rate": 0.0005507315084449621, |
| "loss": 3.5527, |
| "step": 14200 |
| }, |
| { |
| "epoch": 4.1472985561248255, |
| "grad_norm": 0.310424268245697, |
| "learning_rate": 0.0005505567850902737, |
| "loss": 3.5556, |
| "step": 14250 |
| }, |
| { |
| "epoch": 4.1618537494177925, |
| "grad_norm": 0.3339404761791229, |
| "learning_rate": 0.0005503820617355853, |
| "loss": 3.552, |
| "step": 14300 |
| }, |
| { |
| "epoch": 4.17640894271076, |
| "grad_norm": 0.3140861690044403, |
| "learning_rate": 0.0005502073383808969, |
| "loss": 3.5591, |
| "step": 14350 |
| }, |
| { |
| "epoch": 4.190964136003726, |
| "grad_norm": 0.32036176323890686, |
| "learning_rate": 0.0005500326150262085, |
| "loss": 3.5628, |
| "step": 14400 |
| }, |
| { |
| "epoch": 4.205519329296693, |
| "grad_norm": 0.3170437216758728, |
| "learning_rate": 0.00054985789167152, |
| "loss": 3.5651, |
| "step": 14450 |
| }, |
| { |
| "epoch": 4.22007452258966, |
| "grad_norm": 0.35987627506256104, |
| "learning_rate": 0.0005496831683168316, |
| "loss": 3.5555, |
| "step": 14500 |
| }, |
| { |
| "epoch": 4.234629715882627, |
| "grad_norm": 0.3336397707462311, |
| "learning_rate": 0.0005495084449621433, |
| "loss": 3.569, |
| "step": 14550 |
| }, |
| { |
| "epoch": 4.249184909175594, |
| "grad_norm": 0.3451847732067108, |
| "learning_rate": 0.0005493337216074548, |
| "loss": 3.5573, |
| "step": 14600 |
| }, |
| { |
| "epoch": 4.263740102468561, |
| "grad_norm": 0.315585732460022, |
| "learning_rate": 0.0005491589982527664, |
| "loss": 3.5588, |
| "step": 14650 |
| }, |
| { |
| "epoch": 4.278295295761528, |
| "grad_norm": 0.32119220495224, |
| "learning_rate": 0.000548984274898078, |
| "loss": 3.5609, |
| "step": 14700 |
| }, |
| { |
| "epoch": 4.292850489054494, |
| "grad_norm": 0.3273337781429291, |
| "learning_rate": 0.0005488095515433897, |
| "loss": 3.5731, |
| "step": 14750 |
| }, |
| { |
| "epoch": 4.307405682347461, |
| "grad_norm": 0.35502392053604126, |
| "learning_rate": 0.0005486348281887012, |
| "loss": 3.5591, |
| "step": 14800 |
| }, |
| { |
| "epoch": 4.321960875640428, |
| "grad_norm": 0.3433130383491516, |
| "learning_rate": 0.0005484601048340127, |
| "loss": 3.5669, |
| "step": 14850 |
| }, |
| { |
| "epoch": 4.3365160689333955, |
| "grad_norm": 0.3262738585472107, |
| "learning_rate": 0.0005482853814793244, |
| "loss": 3.5721, |
| "step": 14900 |
| }, |
| { |
| "epoch": 4.3510712622263625, |
| "grad_norm": 0.31744176149368286, |
| "learning_rate": 0.0005481106581246359, |
| "loss": 3.5664, |
| "step": 14950 |
| }, |
| { |
| "epoch": 4.36562645551933, |
| "grad_norm": 0.3194833993911743, |
| "learning_rate": 0.0005479359347699475, |
| "loss": 3.5803, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.36562645551933, |
| "eval_accuracy": 0.3568811825973558, |
| "eval_loss": 3.663872480392456, |
| "eval_runtime": 80.8042, |
| "eval_samples_per_second": 206.066, |
| "eval_steps_per_second": 12.883, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.380181648812297, |
| "grad_norm": 0.3289634585380554, |
| "learning_rate": 0.0005477612114152591, |
| "loss": 3.5721, |
| "step": 15050 |
| }, |
| { |
| "epoch": 4.394736842105263, |
| "grad_norm": 0.32845333218574524, |
| "learning_rate": 0.0005475864880605708, |
| "loss": 3.5759, |
| "step": 15100 |
| }, |
| { |
| "epoch": 4.40929203539823, |
| "grad_norm": 0.35052117705345154, |
| "learning_rate": 0.0005474117647058823, |
| "loss": 3.5648, |
| "step": 15150 |
| }, |
| { |
| "epoch": 4.423847228691197, |
| "grad_norm": 0.32375937700271606, |
| "learning_rate": 0.0005472370413511939, |
| "loss": 3.5816, |
| "step": 15200 |
| }, |
| { |
| "epoch": 4.438402421984164, |
| "grad_norm": 0.34090960025787354, |
| "learning_rate": 0.0005470623179965055, |
| "loss": 3.5746, |
| "step": 15250 |
| }, |
| { |
| "epoch": 4.452957615277131, |
| "grad_norm": 0.325315922498703, |
| "learning_rate": 0.0005468875946418171, |
| "loss": 3.5782, |
| "step": 15300 |
| }, |
| { |
| "epoch": 4.467512808570098, |
| "grad_norm": 0.325226753950119, |
| "learning_rate": 0.0005467128712871287, |
| "loss": 3.5765, |
| "step": 15350 |
| }, |
| { |
| "epoch": 4.482068001863064, |
| "grad_norm": 0.33783793449401855, |
| "learning_rate": 0.0005465381479324402, |
| "loss": 3.5717, |
| "step": 15400 |
| }, |
| { |
| "epoch": 4.496623195156031, |
| "grad_norm": 0.3271529972553253, |
| "learning_rate": 0.0005463634245777519, |
| "loss": 3.5847, |
| "step": 15450 |
| }, |
| { |
| "epoch": 4.511178388448998, |
| "grad_norm": 0.3214201033115387, |
| "learning_rate": 0.0005461887012230634, |
| "loss": 3.5856, |
| "step": 15500 |
| }, |
| { |
| "epoch": 4.5257335817419655, |
| "grad_norm": 0.3204360008239746, |
| "learning_rate": 0.000546013977868375, |
| "loss": 3.5672, |
| "step": 15550 |
| }, |
| { |
| "epoch": 4.5402887750349326, |
| "grad_norm": 0.32717350125312805, |
| "learning_rate": 0.0005458392545136866, |
| "loss": 3.5786, |
| "step": 15600 |
| }, |
| { |
| "epoch": 4.5548439683279, |
| "grad_norm": 0.3417745530605316, |
| "learning_rate": 0.0005456645311589983, |
| "loss": 3.5754, |
| "step": 15650 |
| }, |
| { |
| "epoch": 4.569399161620867, |
| "grad_norm": 0.3492000997066498, |
| "learning_rate": 0.0005454898078043098, |
| "loss": 3.5793, |
| "step": 15700 |
| }, |
| { |
| "epoch": 4.583954354913834, |
| "grad_norm": 0.3486742377281189, |
| "learning_rate": 0.0005453150844496213, |
| "loss": 3.5684, |
| "step": 15750 |
| }, |
| { |
| "epoch": 4.5985095482068, |
| "grad_norm": 0.32634660601615906, |
| "learning_rate": 0.000545140361094933, |
| "loss": 3.5779, |
| "step": 15800 |
| }, |
| { |
| "epoch": 4.613064741499767, |
| "grad_norm": 0.3425087034702301, |
| "learning_rate": 0.0005449656377402445, |
| "loss": 3.5736, |
| "step": 15850 |
| }, |
| { |
| "epoch": 4.627619934792734, |
| "grad_norm": 0.3174213469028473, |
| "learning_rate": 0.0005447909143855562, |
| "loss": 3.5873, |
| "step": 15900 |
| }, |
| { |
| "epoch": 4.642175128085701, |
| "grad_norm": 0.33729368448257446, |
| "learning_rate": 0.0005446161910308677, |
| "loss": 3.5833, |
| "step": 15950 |
| }, |
| { |
| "epoch": 4.656730321378668, |
| "grad_norm": 0.33884721994400024, |
| "learning_rate": 0.0005444414676761794, |
| "loss": 3.5727, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.656730321378668, |
| "eval_accuracy": 0.3583588534083095, |
| "eval_loss": 3.6462974548339844, |
| "eval_runtime": 80.9977, |
| "eval_samples_per_second": 205.574, |
| "eval_steps_per_second": 12.852, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.671285514671635, |
| "grad_norm": 0.3453623950481415, |
| "learning_rate": 0.0005442667443214909, |
| "loss": 3.5805, |
| "step": 16050 |
| }, |
| { |
| "epoch": 4.685840707964601, |
| "grad_norm": 0.3109801411628723, |
| "learning_rate": 0.0005440920209668024, |
| "loss": 3.5697, |
| "step": 16100 |
| }, |
| { |
| "epoch": 4.7003959012575685, |
| "grad_norm": 0.32847920060157776, |
| "learning_rate": 0.0005439172976121141, |
| "loss": 3.5744, |
| "step": 16150 |
| }, |
| { |
| "epoch": 4.7149510945505355, |
| "grad_norm": 0.3102755546569824, |
| "learning_rate": 0.0005437425742574257, |
| "loss": 3.5653, |
| "step": 16200 |
| }, |
| { |
| "epoch": 4.729506287843503, |
| "grad_norm": 0.347015917301178, |
| "learning_rate": 0.0005435678509027373, |
| "loss": 3.5794, |
| "step": 16250 |
| }, |
| { |
| "epoch": 4.74406148113647, |
| "grad_norm": 0.3300328850746155, |
| "learning_rate": 0.0005433931275480488, |
| "loss": 3.574, |
| "step": 16300 |
| }, |
| { |
| "epoch": 4.758616674429437, |
| "grad_norm": 0.32994967699050903, |
| "learning_rate": 0.0005432184041933605, |
| "loss": 3.5918, |
| "step": 16350 |
| }, |
| { |
| "epoch": 4.773171867722404, |
| "grad_norm": 0.3314038813114166, |
| "learning_rate": 0.000543043680838672, |
| "loss": 3.5834, |
| "step": 16400 |
| }, |
| { |
| "epoch": 4.78772706101537, |
| "grad_norm": 0.342044860124588, |
| "learning_rate": 0.0005428689574839837, |
| "loss": 3.5988, |
| "step": 16450 |
| }, |
| { |
| "epoch": 4.802282254308337, |
| "grad_norm": 0.3658464550971985, |
| "learning_rate": 0.0005426942341292952, |
| "loss": 3.5782, |
| "step": 16500 |
| }, |
| { |
| "epoch": 4.816837447601304, |
| "grad_norm": 0.3310931324958801, |
| "learning_rate": 0.0005425195107746068, |
| "loss": 3.5979, |
| "step": 16550 |
| }, |
| { |
| "epoch": 4.831392640894271, |
| "grad_norm": 0.32096490263938904, |
| "learning_rate": 0.0005423447874199184, |
| "loss": 3.5803, |
| "step": 16600 |
| }, |
| { |
| "epoch": 4.845947834187238, |
| "grad_norm": 0.3225961923599243, |
| "learning_rate": 0.00054217006406523, |
| "loss": 3.5792, |
| "step": 16650 |
| }, |
| { |
| "epoch": 4.860503027480205, |
| "grad_norm": 0.31186428666114807, |
| "learning_rate": 0.0005419953407105417, |
| "loss": 3.577, |
| "step": 16700 |
| }, |
| { |
| "epoch": 4.875058220773171, |
| "grad_norm": 0.34908661246299744, |
| "learning_rate": 0.0005418206173558532, |
| "loss": 3.5696, |
| "step": 16750 |
| }, |
| { |
| "epoch": 4.8896134140661385, |
| "grad_norm": 0.3291402757167816, |
| "learning_rate": 0.0005416458940011648, |
| "loss": 3.5665, |
| "step": 16800 |
| }, |
| { |
| "epoch": 4.9041686073591055, |
| "grad_norm": 0.324118435382843, |
| "learning_rate": 0.0005414711706464764, |
| "loss": 3.5831, |
| "step": 16850 |
| }, |
| { |
| "epoch": 4.918723800652073, |
| "grad_norm": 0.3113384544849396, |
| "learning_rate": 0.000541296447291788, |
| "loss": 3.588, |
| "step": 16900 |
| }, |
| { |
| "epoch": 4.93327899394504, |
| "grad_norm": 0.31743142008781433, |
| "learning_rate": 0.0005411217239370995, |
| "loss": 3.5821, |
| "step": 16950 |
| }, |
| { |
| "epoch": 4.947834187238007, |
| "grad_norm": 0.3153553009033203, |
| "learning_rate": 0.0005409470005824111, |
| "loss": 3.5822, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.947834187238007, |
| "eval_accuracy": 0.35960534800951643, |
| "eval_loss": 3.6361920833587646, |
| "eval_runtime": 80.6318, |
| "eval_samples_per_second": 206.507, |
| "eval_steps_per_second": 12.911, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.962389380530974, |
| "grad_norm": 0.32598674297332764, |
| "learning_rate": 0.0005407722772277228, |
| "loss": 3.5835, |
| "step": 17050 |
| }, |
| { |
| "epoch": 4.976944573823941, |
| "grad_norm": 0.3023843467235565, |
| "learning_rate": 0.0005405975538730343, |
| "loss": 3.583, |
| "step": 17100 |
| }, |
| { |
| "epoch": 4.991499767116907, |
| "grad_norm": 0.34621861577033997, |
| "learning_rate": 0.0005404228305183459, |
| "loss": 3.5923, |
| "step": 17150 |
| }, |
| { |
| "epoch": 5.005822077317187, |
| "grad_norm": 0.3231063187122345, |
| "learning_rate": 0.0005402481071636575, |
| "loss": 3.5416, |
| "step": 17200 |
| }, |
| { |
| "epoch": 5.020377270610154, |
| "grad_norm": 0.3355882167816162, |
| "learning_rate": 0.0005400733838089692, |
| "loss": 3.4799, |
| "step": 17250 |
| }, |
| { |
| "epoch": 5.034932463903121, |
| "grad_norm": 0.3221699297428131, |
| "learning_rate": 0.0005398986604542807, |
| "loss": 3.4788, |
| "step": 17300 |
| }, |
| { |
| "epoch": 5.049487657196088, |
| "grad_norm": 0.32003962993621826, |
| "learning_rate": 0.0005397239370995922, |
| "loss": 3.4764, |
| "step": 17350 |
| }, |
| { |
| "epoch": 5.064042850489055, |
| "grad_norm": 0.3263167142868042, |
| "learning_rate": 0.0005395492137449039, |
| "loss": 3.4664, |
| "step": 17400 |
| }, |
| { |
| "epoch": 5.078598043782021, |
| "grad_norm": 0.3246323764324188, |
| "learning_rate": 0.0005393744903902154, |
| "loss": 3.492, |
| "step": 17450 |
| }, |
| { |
| "epoch": 5.093153237074988, |
| "grad_norm": 0.33185046911239624, |
| "learning_rate": 0.000539199767035527, |
| "loss": 3.4846, |
| "step": 17500 |
| }, |
| { |
| "epoch": 5.107708430367955, |
| "grad_norm": 0.357795387506485, |
| "learning_rate": 0.0005390250436808386, |
| "loss": 3.4787, |
| "step": 17550 |
| }, |
| { |
| "epoch": 5.122263623660922, |
| "grad_norm": 0.3294745683670044, |
| "learning_rate": 0.0005388503203261503, |
| "loss": 3.4884, |
| "step": 17600 |
| }, |
| { |
| "epoch": 5.136818816953889, |
| "grad_norm": 0.30938440561294556, |
| "learning_rate": 0.0005386755969714618, |
| "loss": 3.4896, |
| "step": 17650 |
| }, |
| { |
| "epoch": 5.151374010246856, |
| "grad_norm": 0.3487060070037842, |
| "learning_rate": 0.0005385008736167733, |
| "loss": 3.5001, |
| "step": 17700 |
| }, |
| { |
| "epoch": 5.165929203539823, |
| "grad_norm": 0.3345823287963867, |
| "learning_rate": 0.000538326150262085, |
| "loss": 3.4855, |
| "step": 17750 |
| }, |
| { |
| "epoch": 5.18048439683279, |
| "grad_norm": 0.32364848256111145, |
| "learning_rate": 0.0005381514269073965, |
| "loss": 3.4899, |
| "step": 17800 |
| }, |
| { |
| "epoch": 5.195039590125757, |
| "grad_norm": 0.32645007967948914, |
| "learning_rate": 0.0005379767035527082, |
| "loss": 3.4994, |
| "step": 17850 |
| }, |
| { |
| "epoch": 5.209594783418724, |
| "grad_norm": 0.3121260404586792, |
| "learning_rate": 0.0005378019801980197, |
| "loss": 3.4927, |
| "step": 17900 |
| }, |
| { |
| "epoch": 5.224149976711691, |
| "grad_norm": 0.3230101466178894, |
| "learning_rate": 0.0005376272568433314, |
| "loss": 3.5047, |
| "step": 17950 |
| }, |
| { |
| "epoch": 5.238705170004658, |
| "grad_norm": 0.3242955803871155, |
| "learning_rate": 0.0005374525334886429, |
| "loss": 3.4961, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.238705170004658, |
| "eval_accuracy": 0.36021625494305154, |
| "eval_loss": 3.6362240314483643, |
| "eval_runtime": 80.744, |
| "eval_samples_per_second": 206.22, |
| "eval_steps_per_second": 12.893, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.253260363297625, |
| "grad_norm": 0.3549533188343048, |
| "learning_rate": 0.0005372778101339545, |
| "loss": 3.52, |
| "step": 18050 |
| }, |
| { |
| "epoch": 5.267815556590591, |
| "grad_norm": 0.33276858925819397, |
| "learning_rate": 0.0005371030867792661, |
| "loss": 3.5064, |
| "step": 18100 |
| }, |
| { |
| "epoch": 5.282370749883558, |
| "grad_norm": 0.33603978157043457, |
| "learning_rate": 0.0005369283634245778, |
| "loss": 3.5108, |
| "step": 18150 |
| }, |
| { |
| "epoch": 5.296925943176525, |
| "grad_norm": 0.305027574300766, |
| "learning_rate": 0.0005367536400698893, |
| "loss": 3.5072, |
| "step": 18200 |
| }, |
| { |
| "epoch": 5.311481136469492, |
| "grad_norm": 0.34311068058013916, |
| "learning_rate": 0.0005365789167152008, |
| "loss": 3.4956, |
| "step": 18250 |
| }, |
| { |
| "epoch": 5.326036329762459, |
| "grad_norm": 0.3491630256175995, |
| "learning_rate": 0.0005364041933605125, |
| "loss": 3.5176, |
| "step": 18300 |
| }, |
| { |
| "epoch": 5.340591523055426, |
| "grad_norm": 0.36249032616615295, |
| "learning_rate": 0.000536229470005824, |
| "loss": 3.5246, |
| "step": 18350 |
| }, |
| { |
| "epoch": 5.3551467163483935, |
| "grad_norm": 0.3234519958496094, |
| "learning_rate": 0.0005360547466511357, |
| "loss": 3.528, |
| "step": 18400 |
| }, |
| { |
| "epoch": 5.36970190964136, |
| "grad_norm": 0.33486780524253845, |
| "learning_rate": 0.0005358800232964472, |
| "loss": 3.5129, |
| "step": 18450 |
| }, |
| { |
| "epoch": 5.384257102934327, |
| "grad_norm": 0.31147733330726624, |
| "learning_rate": 0.0005357052999417589, |
| "loss": 3.5227, |
| "step": 18500 |
| }, |
| { |
| "epoch": 5.398812296227294, |
| "grad_norm": 0.32688507437705994, |
| "learning_rate": 0.0005355305765870704, |
| "loss": 3.5156, |
| "step": 18550 |
| }, |
| { |
| "epoch": 5.413367489520261, |
| "grad_norm": 0.3291279077529907, |
| "learning_rate": 0.000535355853232382, |
| "loss": 3.502, |
| "step": 18600 |
| }, |
| { |
| "epoch": 5.427922682813228, |
| "grad_norm": 0.34294310212135315, |
| "learning_rate": 0.0005351811298776936, |
| "loss": 3.5293, |
| "step": 18650 |
| }, |
| { |
| "epoch": 5.442477876106195, |
| "grad_norm": 0.3323133587837219, |
| "learning_rate": 0.0005350064065230052, |
| "loss": 3.5203, |
| "step": 18700 |
| }, |
| { |
| "epoch": 5.457033069399162, |
| "grad_norm": 0.32330361008644104, |
| "learning_rate": 0.0005348316831683168, |
| "loss": 3.5238, |
| "step": 18750 |
| }, |
| { |
| "epoch": 5.471588262692128, |
| "grad_norm": 0.36598992347717285, |
| "learning_rate": 0.0005346569598136284, |
| "loss": 3.5254, |
| "step": 18800 |
| }, |
| { |
| "epoch": 5.486143455985095, |
| "grad_norm": 0.31992754340171814, |
| "learning_rate": 0.00053448223645894, |
| "loss": 3.5193, |
| "step": 18850 |
| }, |
| { |
| "epoch": 5.500698649278062, |
| "grad_norm": 0.3281792998313904, |
| "learning_rate": 0.0005343075131042515, |
| "loss": 3.5291, |
| "step": 18900 |
| }, |
| { |
| "epoch": 5.515253842571029, |
| "grad_norm": 0.3420809805393219, |
| "learning_rate": 0.0005341327897495632, |
| "loss": 3.532, |
| "step": 18950 |
| }, |
| { |
| "epoch": 5.529809035863996, |
| "grad_norm": 0.3127780556678772, |
| "learning_rate": 0.0005339580663948748, |
| "loss": 3.5195, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.529809035863996, |
| "eval_accuracy": 0.3612013688170207, |
| "eval_loss": 3.6283442974090576, |
| "eval_runtime": 80.636, |
| "eval_samples_per_second": 206.496, |
| "eval_steps_per_second": 12.91, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.5443642291569635, |
| "grad_norm": 0.3270993232727051, |
| "learning_rate": 0.0005337833430401863, |
| "loss": 3.5383, |
| "step": 19050 |
| }, |
| { |
| "epoch": 5.5589194224499305, |
| "grad_norm": 0.3306346535682678, |
| "learning_rate": 0.0005336086196854979, |
| "loss": 3.5302, |
| "step": 19100 |
| }, |
| { |
| "epoch": 5.573474615742897, |
| "grad_norm": 0.3481011688709259, |
| "learning_rate": 0.0005334338963308095, |
| "loss": 3.5189, |
| "step": 19150 |
| }, |
| { |
| "epoch": 5.588029809035864, |
| "grad_norm": 0.34454742074012756, |
| "learning_rate": 0.0005332591729761211, |
| "loss": 3.523, |
| "step": 19200 |
| }, |
| { |
| "epoch": 5.602585002328831, |
| "grad_norm": 0.33567488193511963, |
| "learning_rate": 0.0005330844496214327, |
| "loss": 3.5187, |
| "step": 19250 |
| }, |
| { |
| "epoch": 5.617140195621798, |
| "grad_norm": 0.33072608709335327, |
| "learning_rate": 0.0005329097262667443, |
| "loss": 3.5222, |
| "step": 19300 |
| }, |
| { |
| "epoch": 5.631695388914765, |
| "grad_norm": 0.3306700885295868, |
| "learning_rate": 0.0005327350029120559, |
| "loss": 3.5363, |
| "step": 19350 |
| }, |
| { |
| "epoch": 5.646250582207732, |
| "grad_norm": 0.312166303396225, |
| "learning_rate": 0.0005325602795573674, |
| "loss": 3.5251, |
| "step": 19400 |
| }, |
| { |
| "epoch": 5.660805775500698, |
| "grad_norm": 0.3478124141693115, |
| "learning_rate": 0.000532385556202679, |
| "loss": 3.5293, |
| "step": 19450 |
| }, |
| { |
| "epoch": 5.675360968793665, |
| "grad_norm": 0.3342107832431793, |
| "learning_rate": 0.0005322108328479906, |
| "loss": 3.5418, |
| "step": 19500 |
| }, |
| { |
| "epoch": 5.689916162086632, |
| "grad_norm": 0.31559666991233826, |
| "learning_rate": 0.0005320361094933023, |
| "loss": 3.5261, |
| "step": 19550 |
| }, |
| { |
| "epoch": 5.704471355379599, |
| "grad_norm": 0.32865777611732483, |
| "learning_rate": 0.0005318613861386138, |
| "loss": 3.528, |
| "step": 19600 |
| }, |
| { |
| "epoch": 5.719026548672566, |
| "grad_norm": 0.34536466002464294, |
| "learning_rate": 0.0005316866627839254, |
| "loss": 3.5313, |
| "step": 19650 |
| }, |
| { |
| "epoch": 5.7335817419655335, |
| "grad_norm": 0.3410492539405823, |
| "learning_rate": 0.000531511939429237, |
| "loss": 3.5321, |
| "step": 19700 |
| }, |
| { |
| "epoch": 5.748136935258501, |
| "grad_norm": 0.3239506781101227, |
| "learning_rate": 0.0005313372160745486, |
| "loss": 3.5249, |
| "step": 19750 |
| }, |
| { |
| "epoch": 5.762692128551468, |
| "grad_norm": 0.3330889642238617, |
| "learning_rate": 0.0005311624927198602, |
| "loss": 3.5304, |
| "step": 19800 |
| }, |
| { |
| "epoch": 5.777247321844434, |
| "grad_norm": 0.3351874053478241, |
| "learning_rate": 0.0005309877693651717, |
| "loss": 3.5268, |
| "step": 19850 |
| }, |
| { |
| "epoch": 5.791802515137401, |
| "grad_norm": 0.34032142162323, |
| "learning_rate": 0.0005308130460104834, |
| "loss": 3.5326, |
| "step": 19900 |
| }, |
| { |
| "epoch": 5.806357708430368, |
| "grad_norm": 0.43564939498901367, |
| "learning_rate": 0.0005306383226557949, |
| "loss": 3.5357, |
| "step": 19950 |
| }, |
| { |
| "epoch": 5.820912901723335, |
| "grad_norm": 0.35573258996009827, |
| "learning_rate": 0.0005304635993011065, |
| "loss": 3.5309, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.820912901723335, |
| "eval_accuracy": 0.36227027965974906, |
| "eval_loss": 3.6137149333953857, |
| "eval_runtime": 80.6797, |
| "eval_samples_per_second": 206.384, |
| "eval_steps_per_second": 12.903, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.835468095016302, |
| "grad_norm": 0.3097273111343384, |
| "learning_rate": 0.0005302888759464181, |
| "loss": 3.5325, |
| "step": 20050 |
| }, |
| { |
| "epoch": 5.850023288309269, |
| "grad_norm": 0.33340880274772644, |
| "learning_rate": 0.0005301141525917298, |
| "loss": 3.5285, |
| "step": 20100 |
| }, |
| { |
| "epoch": 5.864578481602235, |
| "grad_norm": 0.32990628480911255, |
| "learning_rate": 0.0005299394292370413, |
| "loss": 3.5451, |
| "step": 20150 |
| }, |
| { |
| "epoch": 5.879133674895202, |
| "grad_norm": 0.32696178555488586, |
| "learning_rate": 0.0005297647058823528, |
| "loss": 3.5271, |
| "step": 20200 |
| }, |
| { |
| "epoch": 5.893688868188169, |
| "grad_norm": 0.3369854688644409, |
| "learning_rate": 0.0005295899825276645, |
| "loss": 3.5334, |
| "step": 20250 |
| }, |
| { |
| "epoch": 5.9082440614811365, |
| "grad_norm": 0.3369443714618683, |
| "learning_rate": 0.000529415259172976, |
| "loss": 3.5423, |
| "step": 20300 |
| }, |
| { |
| "epoch": 5.9227992547741035, |
| "grad_norm": 0.324912428855896, |
| "learning_rate": 0.0005292405358182877, |
| "loss": 3.5398, |
| "step": 20350 |
| }, |
| { |
| "epoch": 5.937354448067071, |
| "grad_norm": 0.3217441737651825, |
| "learning_rate": 0.0005290658124635992, |
| "loss": 3.544, |
| "step": 20400 |
| }, |
| { |
| "epoch": 5.951909641360038, |
| "grad_norm": 0.31031718850135803, |
| "learning_rate": 0.0005288910891089109, |
| "loss": 3.542, |
| "step": 20450 |
| }, |
| { |
| "epoch": 5.966464834653004, |
| "grad_norm": 0.3219641447067261, |
| "learning_rate": 0.0005287163657542224, |
| "loss": 3.5293, |
| "step": 20500 |
| }, |
| { |
| "epoch": 5.981020027945971, |
| "grad_norm": 0.31779083609580994, |
| "learning_rate": 0.000528541642399534, |
| "loss": 3.5357, |
| "step": 20550 |
| }, |
| { |
| "epoch": 5.995575221238938, |
| "grad_norm": 0.3212047815322876, |
| "learning_rate": 0.0005283669190448456, |
| "loss": 3.5255, |
| "step": 20600 |
| }, |
| { |
| "epoch": 6.009897531439218, |
| "grad_norm": 0.3332254886627197, |
| "learning_rate": 0.0005281921956901572, |
| "loss": 3.4641, |
| "step": 20650 |
| }, |
| { |
| "epoch": 6.024452724732185, |
| "grad_norm": 0.327150821685791, |
| "learning_rate": 0.0005280174723354688, |
| "loss": 3.4129, |
| "step": 20700 |
| }, |
| { |
| "epoch": 6.039007918025152, |
| "grad_norm": 0.3457014858722687, |
| "learning_rate": 0.0005278427489807804, |
| "loss": 3.4289, |
| "step": 20750 |
| }, |
| { |
| "epoch": 6.053563111318118, |
| "grad_norm": 0.33637768030166626, |
| "learning_rate": 0.000527668025626092, |
| "loss": 3.4282, |
| "step": 20800 |
| }, |
| { |
| "epoch": 6.068118304611085, |
| "grad_norm": 0.3657829165458679, |
| "learning_rate": 0.0005274933022714035, |
| "loss": 3.4312, |
| "step": 20850 |
| }, |
| { |
| "epoch": 6.082673497904052, |
| "grad_norm": 0.3319191336631775, |
| "learning_rate": 0.0005273185789167152, |
| "loss": 3.446, |
| "step": 20900 |
| }, |
| { |
| "epoch": 6.097228691197019, |
| "grad_norm": 0.3340849280357361, |
| "learning_rate": 0.0005271438555620268, |
| "loss": 3.4364, |
| "step": 20950 |
| }, |
| { |
| "epoch": 6.111783884489986, |
| "grad_norm": 0.33662015199661255, |
| "learning_rate": 0.0005269691322073384, |
| "loss": 3.4547, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.111783884489986, |
| "eval_accuracy": 0.3625745578534625, |
| "eval_loss": 3.6171655654907227, |
| "eval_runtime": 81.004, |
| "eval_samples_per_second": 205.558, |
| "eval_steps_per_second": 12.851, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.126339077782953, |
| "grad_norm": 0.33729586005210876, |
| "learning_rate": 0.0005267944088526499, |
| "loss": 3.448, |
| "step": 21050 |
| }, |
| { |
| "epoch": 6.14089427107592, |
| "grad_norm": 0.3400915861129761, |
| "learning_rate": 0.0005266196854979615, |
| "loss": 3.4429, |
| "step": 21100 |
| }, |
| { |
| "epoch": 6.155449464368886, |
| "grad_norm": 0.31154096126556396, |
| "learning_rate": 0.0005264449621432731, |
| "loss": 3.4559, |
| "step": 21150 |
| }, |
| { |
| "epoch": 6.1700046576618535, |
| "grad_norm": 0.33595478534698486, |
| "learning_rate": 0.0005262702387885847, |
| "loss": 3.4434, |
| "step": 21200 |
| }, |
| { |
| "epoch": 6.1845598509548205, |
| "grad_norm": 0.31518498063087463, |
| "learning_rate": 0.0005260955154338963, |
| "loss": 3.4646, |
| "step": 21250 |
| }, |
| { |
| "epoch": 6.199115044247788, |
| "grad_norm": 0.3166975677013397, |
| "learning_rate": 0.0005259207920792079, |
| "loss": 3.4613, |
| "step": 21300 |
| }, |
| { |
| "epoch": 6.213670237540755, |
| "grad_norm": 0.31890466809272766, |
| "learning_rate": 0.0005257460687245195, |
| "loss": 3.4552, |
| "step": 21350 |
| }, |
| { |
| "epoch": 6.228225430833722, |
| "grad_norm": 0.3278941512107849, |
| "learning_rate": 0.000525571345369831, |
| "loss": 3.4563, |
| "step": 21400 |
| }, |
| { |
| "epoch": 6.242780624126689, |
| "grad_norm": 0.3718642294406891, |
| "learning_rate": 0.0005253966220151426, |
| "loss": 3.4698, |
| "step": 21450 |
| }, |
| { |
| "epoch": 6.257335817419655, |
| "grad_norm": 0.3402986228466034, |
| "learning_rate": 0.0005252218986604543, |
| "loss": 3.476, |
| "step": 21500 |
| }, |
| { |
| "epoch": 6.271891010712622, |
| "grad_norm": 0.3532449007034302, |
| "learning_rate": 0.0005250471753057658, |
| "loss": 3.4683, |
| "step": 21550 |
| }, |
| { |
| "epoch": 6.286446204005589, |
| "grad_norm": 0.3616986870765686, |
| "learning_rate": 0.0005248724519510774, |
| "loss": 3.47, |
| "step": 21600 |
| }, |
| { |
| "epoch": 6.301001397298556, |
| "grad_norm": 0.32584360241889954, |
| "learning_rate": 0.000524697728596389, |
| "loss": 3.4652, |
| "step": 21650 |
| }, |
| { |
| "epoch": 6.315556590591523, |
| "grad_norm": 0.32824012637138367, |
| "learning_rate": 0.0005245230052417006, |
| "loss": 3.4637, |
| "step": 21700 |
| }, |
| { |
| "epoch": 6.33011178388449, |
| "grad_norm": 0.3561561107635498, |
| "learning_rate": 0.0005243482818870122, |
| "loss": 3.4683, |
| "step": 21750 |
| }, |
| { |
| "epoch": 6.344666977177457, |
| "grad_norm": 0.3296772837638855, |
| "learning_rate": 0.0005241735585323238, |
| "loss": 3.4721, |
| "step": 21800 |
| }, |
| { |
| "epoch": 6.3592221704704235, |
| "grad_norm": 0.3535138666629791, |
| "learning_rate": 0.0005239988351776354, |
| "loss": 3.469, |
| "step": 21850 |
| }, |
| { |
| "epoch": 6.3737773637633905, |
| "grad_norm": 0.3119599223136902, |
| "learning_rate": 0.0005238241118229469, |
| "loss": 3.4744, |
| "step": 21900 |
| }, |
| { |
| "epoch": 6.388332557056358, |
| "grad_norm": 0.33031851053237915, |
| "learning_rate": 0.0005236493884682585, |
| "loss": 3.4669, |
| "step": 21950 |
| }, |
| { |
| "epoch": 6.402887750349325, |
| "grad_norm": 0.3422005772590637, |
| "learning_rate": 0.0005234746651135701, |
| "loss": 3.4809, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.402887750349325, |
| "eval_accuracy": 0.36318664006005175, |
| "eval_loss": 3.610767126083374, |
| "eval_runtime": 80.6458, |
| "eval_samples_per_second": 206.471, |
| "eval_steps_per_second": 12.908, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.417442943642292, |
| "grad_norm": 0.32802167534828186, |
| "learning_rate": 0.0005232999417588818, |
| "loss": 3.4682, |
| "step": 22050 |
| }, |
| { |
| "epoch": 6.431998136935259, |
| "grad_norm": 0.3188464641571045, |
| "learning_rate": 0.0005231252184041933, |
| "loss": 3.4756, |
| "step": 22100 |
| }, |
| { |
| "epoch": 6.446553330228225, |
| "grad_norm": 0.3288556635379791, |
| "learning_rate": 0.0005229504950495049, |
| "loss": 3.4732, |
| "step": 22150 |
| }, |
| { |
| "epoch": 6.461108523521192, |
| "grad_norm": 0.3288338780403137, |
| "learning_rate": 0.0005227757716948165, |
| "loss": 3.4789, |
| "step": 22200 |
| }, |
| { |
| "epoch": 6.475663716814159, |
| "grad_norm": 0.33035436272621155, |
| "learning_rate": 0.000522601048340128, |
| "loss": 3.4704, |
| "step": 22250 |
| }, |
| { |
| "epoch": 6.490218910107126, |
| "grad_norm": 0.3219433128833771, |
| "learning_rate": 0.0005224263249854397, |
| "loss": 3.4821, |
| "step": 22300 |
| }, |
| { |
| "epoch": 6.504774103400093, |
| "grad_norm": 0.3673233687877655, |
| "learning_rate": 0.0005222516016307512, |
| "loss": 3.5005, |
| "step": 22350 |
| }, |
| { |
| "epoch": 6.51932929669306, |
| "grad_norm": 0.33592429757118225, |
| "learning_rate": 0.0005220768782760629, |
| "loss": 3.4885, |
| "step": 22400 |
| }, |
| { |
| "epoch": 6.533884489986027, |
| "grad_norm": 0.3222636580467224, |
| "learning_rate": 0.0005219021549213744, |
| "loss": 3.4742, |
| "step": 22450 |
| }, |
| { |
| "epoch": 6.548439683278994, |
| "grad_norm": 0.3360852599143982, |
| "learning_rate": 0.000521727431566686, |
| "loss": 3.4918, |
| "step": 22500 |
| }, |
| { |
| "epoch": 6.562994876571961, |
| "grad_norm": 0.34307196736335754, |
| "learning_rate": 0.0005215527082119976, |
| "loss": 3.4795, |
| "step": 22550 |
| }, |
| { |
| "epoch": 6.577550069864928, |
| "grad_norm": 0.3408685028553009, |
| "learning_rate": 0.0005213779848573093, |
| "loss": 3.4851, |
| "step": 22600 |
| }, |
| { |
| "epoch": 6.592105263157895, |
| "grad_norm": 0.31650927662849426, |
| "learning_rate": 0.0005212032615026208, |
| "loss": 3.4909, |
| "step": 22650 |
| }, |
| { |
| "epoch": 6.606660456450862, |
| "grad_norm": 0.34447646141052246, |
| "learning_rate": 0.0005210285381479323, |
| "loss": 3.4788, |
| "step": 22700 |
| }, |
| { |
| "epoch": 6.621215649743829, |
| "grad_norm": 0.32232335209846497, |
| "learning_rate": 0.000520853814793244, |
| "loss": 3.4863, |
| "step": 22750 |
| }, |
| { |
| "epoch": 6.635770843036796, |
| "grad_norm": 0.31169578433036804, |
| "learning_rate": 0.0005206790914385555, |
| "loss": 3.4836, |
| "step": 22800 |
| }, |
| { |
| "epoch": 6.650326036329762, |
| "grad_norm": 0.32257068157196045, |
| "learning_rate": 0.0005205043680838672, |
| "loss": 3.4917, |
| "step": 22850 |
| }, |
| { |
| "epoch": 6.664881229622729, |
| "grad_norm": 0.32953596115112305, |
| "learning_rate": 0.0005203296447291787, |
| "loss": 3.4757, |
| "step": 22900 |
| }, |
| { |
| "epoch": 6.679436422915696, |
| "grad_norm": 0.32181575894355774, |
| "learning_rate": 0.0005201549213744904, |
| "loss": 3.4977, |
| "step": 22950 |
| }, |
| { |
| "epoch": 6.693991616208663, |
| "grad_norm": 0.33119407296180725, |
| "learning_rate": 0.0005199801980198019, |
| "loss": 3.4881, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.693991616208663, |
| "eval_accuracy": 0.36388592752725724, |
| "eval_loss": 3.603271245956421, |
| "eval_runtime": 80.8085, |
| "eval_samples_per_second": 206.055, |
| "eval_steps_per_second": 12.882, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.70854680950163, |
| "grad_norm": 0.3496987521648407, |
| "learning_rate": 0.0005198054746651136, |
| "loss": 3.4832, |
| "step": 23050 |
| }, |
| { |
| "epoch": 6.723102002794597, |
| "grad_norm": 0.3182956874370575, |
| "learning_rate": 0.0005196307513104251, |
| "loss": 3.4974, |
| "step": 23100 |
| }, |
| { |
| "epoch": 6.737657196087564, |
| "grad_norm": 0.32630103826522827, |
| "learning_rate": 0.0005194560279557367, |
| "loss": 3.4986, |
| "step": 23150 |
| }, |
| { |
| "epoch": 6.752212389380531, |
| "grad_norm": 0.3377934992313385, |
| "learning_rate": 0.0005192813046010483, |
| "loss": 3.4818, |
| "step": 23200 |
| }, |
| { |
| "epoch": 6.766767582673498, |
| "grad_norm": 0.3513481616973877, |
| "learning_rate": 0.0005191065812463599, |
| "loss": 3.5032, |
| "step": 23250 |
| }, |
| { |
| "epoch": 6.781322775966465, |
| "grad_norm": 0.34878620505332947, |
| "learning_rate": 0.0005189318578916715, |
| "loss": 3.4924, |
| "step": 23300 |
| }, |
| { |
| "epoch": 6.795877969259432, |
| "grad_norm": 0.3278437852859497, |
| "learning_rate": 0.000518757134536983, |
| "loss": 3.491, |
| "step": 23350 |
| }, |
| { |
| "epoch": 6.810433162552399, |
| "grad_norm": 0.30785784125328064, |
| "learning_rate": 0.0005185824111822947, |
| "loss": 3.492, |
| "step": 23400 |
| }, |
| { |
| "epoch": 6.824988355845366, |
| "grad_norm": 0.3543236553668976, |
| "learning_rate": 0.0005184076878276063, |
| "loss": 3.5088, |
| "step": 23450 |
| }, |
| { |
| "epoch": 6.839543549138332, |
| "grad_norm": 0.32503288984298706, |
| "learning_rate": 0.0005182329644729179, |
| "loss": 3.4963, |
| "step": 23500 |
| }, |
| { |
| "epoch": 6.854098742431299, |
| "grad_norm": 0.3083336353302002, |
| "learning_rate": 0.0005180582411182294, |
| "loss": 3.4893, |
| "step": 23550 |
| }, |
| { |
| "epoch": 6.868653935724266, |
| "grad_norm": 0.31416818499565125, |
| "learning_rate": 0.000517883517763541, |
| "loss": 3.501, |
| "step": 23600 |
| }, |
| { |
| "epoch": 6.883209129017233, |
| "grad_norm": 0.33796000480651855, |
| "learning_rate": 0.0005177087944088526, |
| "loss": 3.4881, |
| "step": 23650 |
| }, |
| { |
| "epoch": 6.8977643223102, |
| "grad_norm": 0.31594499945640564, |
| "learning_rate": 0.0005175340710541642, |
| "loss": 3.5006, |
| "step": 23700 |
| }, |
| { |
| "epoch": 6.912319515603167, |
| "grad_norm": 0.3467632234096527, |
| "learning_rate": 0.0005173593476994758, |
| "loss": 3.4972, |
| "step": 23750 |
| }, |
| { |
| "epoch": 6.926874708896134, |
| "grad_norm": 0.34574928879737854, |
| "learning_rate": 0.0005171846243447874, |
| "loss": 3.4942, |
| "step": 23800 |
| }, |
| { |
| "epoch": 6.9414299021891015, |
| "grad_norm": 0.3243410289287567, |
| "learning_rate": 0.000517009900990099, |
| "loss": 3.4836, |
| "step": 23850 |
| }, |
| { |
| "epoch": 6.955985095482068, |
| "grad_norm": 0.34617167711257935, |
| "learning_rate": 0.0005168351776354105, |
| "loss": 3.5003, |
| "step": 23900 |
| }, |
| { |
| "epoch": 6.970540288775035, |
| "grad_norm": 0.3080184757709503, |
| "learning_rate": 0.0005166604542807221, |
| "loss": 3.5008, |
| "step": 23950 |
| }, |
| { |
| "epoch": 6.985095482068002, |
| "grad_norm": 0.33652469515800476, |
| "learning_rate": 0.0005164857309260338, |
| "loss": 3.5017, |
| "step": 24000 |
| }, |
| { |
| "epoch": 6.985095482068002, |
| "eval_accuracy": 0.36449601176965446, |
| "eval_loss": 3.5923714637756348, |
| "eval_runtime": 80.6897, |
| "eval_samples_per_second": 206.358, |
| "eval_steps_per_second": 12.901, |
| "step": 24000 |
| }, |
| { |
| "epoch": 6.999650675360969, |
| "grad_norm": 0.3190842568874359, |
| "learning_rate": 0.0005163110075713453, |
| "loss": 3.5009, |
| "step": 24050 |
| }, |
| { |
| "epoch": 7.0139729855612485, |
| "grad_norm": 0.35079169273376465, |
| "learning_rate": 0.0005161362842166569, |
| "loss": 3.3919, |
| "step": 24100 |
| }, |
| { |
| "epoch": 7.0285281788542155, |
| "grad_norm": 0.36309176683425903, |
| "learning_rate": 0.0005159615608619685, |
| "loss": 3.396, |
| "step": 24150 |
| }, |
| { |
| "epoch": 7.043083372147182, |
| "grad_norm": 0.317272424697876, |
| "learning_rate": 0.0005157868375072801, |
| "loss": 3.388, |
| "step": 24200 |
| }, |
| { |
| "epoch": 7.057638565440149, |
| "grad_norm": 0.3446238934993744, |
| "learning_rate": 0.0005156121141525917, |
| "loss": 3.3877, |
| "step": 24250 |
| }, |
| { |
| "epoch": 7.072193758733116, |
| "grad_norm": 0.3453896641731262, |
| "learning_rate": 0.0005154373907979033, |
| "loss": 3.3921, |
| "step": 24300 |
| }, |
| { |
| "epoch": 7.086748952026083, |
| "grad_norm": 0.31213486194610596, |
| "learning_rate": 0.0005152626674432149, |
| "loss": 3.397, |
| "step": 24350 |
| }, |
| { |
| "epoch": 7.10130414531905, |
| "grad_norm": 0.3341032564640045, |
| "learning_rate": 0.0005150879440885264, |
| "loss": 3.3957, |
| "step": 24400 |
| }, |
| { |
| "epoch": 7.115859338612017, |
| "grad_norm": 0.34508928656578064, |
| "learning_rate": 0.000514913220733838, |
| "loss": 3.4145, |
| "step": 24450 |
| }, |
| { |
| "epoch": 7.130414531904984, |
| "grad_norm": 0.3692660331726074, |
| "learning_rate": 0.0005147384973791496, |
| "loss": 3.4165, |
| "step": 24500 |
| }, |
| { |
| "epoch": 7.14496972519795, |
| "grad_norm": 0.31498512625694275, |
| "learning_rate": 0.0005145637740244613, |
| "loss": 3.4131, |
| "step": 24550 |
| }, |
| { |
| "epoch": 7.159524918490917, |
| "grad_norm": 0.3347342908382416, |
| "learning_rate": 0.0005143890506697728, |
| "loss": 3.4131, |
| "step": 24600 |
| }, |
| { |
| "epoch": 7.174080111783884, |
| "grad_norm": 0.3238277733325958, |
| "learning_rate": 0.0005142143273150844, |
| "loss": 3.4196, |
| "step": 24650 |
| }, |
| { |
| "epoch": 7.1886353050768514, |
| "grad_norm": 0.3462424576282501, |
| "learning_rate": 0.000514039603960396, |
| "loss": 3.4125, |
| "step": 24700 |
| }, |
| { |
| "epoch": 7.2031904983698185, |
| "grad_norm": 0.3721330761909485, |
| "learning_rate": 0.0005138648806057075, |
| "loss": 3.4191, |
| "step": 24750 |
| }, |
| { |
| "epoch": 7.217745691662786, |
| "grad_norm": 0.3516317903995514, |
| "learning_rate": 0.0005136901572510192, |
| "loss": 3.4243, |
| "step": 24800 |
| }, |
| { |
| "epoch": 7.232300884955753, |
| "grad_norm": 0.3437836170196533, |
| "learning_rate": 0.0005135154338963307, |
| "loss": 3.4231, |
| "step": 24850 |
| }, |
| { |
| "epoch": 7.246856078248719, |
| "grad_norm": 0.3657211363315582, |
| "learning_rate": 0.0005133407105416424, |
| "loss": 3.4387, |
| "step": 24900 |
| }, |
| { |
| "epoch": 7.261411271541686, |
| "grad_norm": 0.37410634756088257, |
| "learning_rate": 0.0005131659871869539, |
| "loss": 3.4288, |
| "step": 24950 |
| }, |
| { |
| "epoch": 7.275966464834653, |
| "grad_norm": 0.3592871129512787, |
| "learning_rate": 0.0005129912638322656, |
| "loss": 3.4179, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.275966464834653, |
| "eval_accuracy": 0.3646205907133919, |
| "eval_loss": 3.599027633666992, |
| "eval_runtime": 80.8615, |
| "eval_samples_per_second": 205.92, |
| "eval_steps_per_second": 12.874, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.29052165812762, |
| "grad_norm": 0.3743526041507721, |
| "learning_rate": 0.0005128165404775771, |
| "loss": 3.4306, |
| "step": 25050 |
| }, |
| { |
| "epoch": 7.305076851420587, |
| "grad_norm": 0.3669351041316986, |
| "learning_rate": 0.0005126418171228888, |
| "loss": 3.4268, |
| "step": 25100 |
| }, |
| { |
| "epoch": 7.319632044713554, |
| "grad_norm": 0.357820987701416, |
| "learning_rate": 0.0005124670937682003, |
| "loss": 3.4278, |
| "step": 25150 |
| }, |
| { |
| "epoch": 7.334187238006521, |
| "grad_norm": 0.3508703410625458, |
| "learning_rate": 0.000512292370413512, |
| "loss": 3.4417, |
| "step": 25200 |
| }, |
| { |
| "epoch": 7.348742431299487, |
| "grad_norm": 0.3238923251628876, |
| "learning_rate": 0.0005121176470588235, |
| "loss": 3.4391, |
| "step": 25250 |
| }, |
| { |
| "epoch": 7.363297624592454, |
| "grad_norm": 0.3186517655849457, |
| "learning_rate": 0.000511942923704135, |
| "loss": 3.4541, |
| "step": 25300 |
| }, |
| { |
| "epoch": 7.3778528178854215, |
| "grad_norm": 0.32173988223075867, |
| "learning_rate": 0.0005117682003494467, |
| "loss": 3.4407, |
| "step": 25350 |
| }, |
| { |
| "epoch": 7.3924080111783885, |
| "grad_norm": 0.3288579285144806, |
| "learning_rate": 0.0005115934769947583, |
| "loss": 3.4359, |
| "step": 25400 |
| }, |
| { |
| "epoch": 7.406963204471356, |
| "grad_norm": 0.3681701421737671, |
| "learning_rate": 0.0005114187536400699, |
| "loss": 3.4328, |
| "step": 25450 |
| }, |
| { |
| "epoch": 7.421518397764323, |
| "grad_norm": 0.3507705628871918, |
| "learning_rate": 0.0005112440302853814, |
| "loss": 3.4391, |
| "step": 25500 |
| }, |
| { |
| "epoch": 7.436073591057289, |
| "grad_norm": 0.33441418409347534, |
| "learning_rate": 0.0005110693069306931, |
| "loss": 3.4319, |
| "step": 25550 |
| }, |
| { |
| "epoch": 7.450628784350256, |
| "grad_norm": 0.35155999660491943, |
| "learning_rate": 0.0005108945835760046, |
| "loss": 3.4518, |
| "step": 25600 |
| }, |
| { |
| "epoch": 7.465183977643223, |
| "grad_norm": 0.3278109133243561, |
| "learning_rate": 0.0005107198602213162, |
| "loss": 3.4509, |
| "step": 25650 |
| }, |
| { |
| "epoch": 7.47973917093619, |
| "grad_norm": 0.3472205698490143, |
| "learning_rate": 0.0005105451368666278, |
| "loss": 3.4573, |
| "step": 25700 |
| }, |
| { |
| "epoch": 7.494294364229157, |
| "grad_norm": 0.33118316531181335, |
| "learning_rate": 0.0005103704135119394, |
| "loss": 3.4505, |
| "step": 25750 |
| }, |
| { |
| "epoch": 7.508849557522124, |
| "grad_norm": 0.3439773619174957, |
| "learning_rate": 0.000510195690157251, |
| "loss": 3.4527, |
| "step": 25800 |
| }, |
| { |
| "epoch": 7.523404750815091, |
| "grad_norm": 0.36599114537239075, |
| "learning_rate": 0.0005100209668025625, |
| "loss": 3.442, |
| "step": 25850 |
| }, |
| { |
| "epoch": 7.537959944108058, |
| "grad_norm": 0.34174075722694397, |
| "learning_rate": 0.0005098462434478742, |
| "loss": 3.4394, |
| "step": 25900 |
| }, |
| { |
| "epoch": 7.552515137401024, |
| "grad_norm": 0.3380934000015259, |
| "learning_rate": 0.0005096715200931858, |
| "loss": 3.4521, |
| "step": 25950 |
| }, |
| { |
| "epoch": 7.5670703306939915, |
| "grad_norm": 0.33770203590393066, |
| "learning_rate": 0.0005094967967384974, |
| "loss": 3.4484, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.5670703306939915, |
| "eval_accuracy": 0.3652045838939875, |
| "eval_loss": 3.5904653072357178, |
| "eval_runtime": 80.7057, |
| "eval_samples_per_second": 206.318, |
| "eval_steps_per_second": 12.899, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.5816255239869585, |
| "grad_norm": 0.35267844796180725, |
| "learning_rate": 0.0005093220733838089, |
| "loss": 3.4489, |
| "step": 26050 |
| }, |
| { |
| "epoch": 7.596180717279926, |
| "grad_norm": 0.334037721157074, |
| "learning_rate": 0.0005091473500291205, |
| "loss": 3.4453, |
| "step": 26100 |
| }, |
| { |
| "epoch": 7.610735910572893, |
| "grad_norm": 0.34790873527526855, |
| "learning_rate": 0.0005089726266744321, |
| "loss": 3.4489, |
| "step": 26150 |
| }, |
| { |
| "epoch": 7.625291103865859, |
| "grad_norm": 0.33023956418037415, |
| "learning_rate": 0.0005087979033197437, |
| "loss": 3.4556, |
| "step": 26200 |
| }, |
| { |
| "epoch": 7.639846297158826, |
| "grad_norm": 0.3459625840187073, |
| "learning_rate": 0.0005086231799650553, |
| "loss": 3.4485, |
| "step": 26250 |
| }, |
| { |
| "epoch": 7.654401490451793, |
| "grad_norm": 0.3560470938682556, |
| "learning_rate": 0.0005084484566103669, |
| "loss": 3.4577, |
| "step": 26300 |
| }, |
| { |
| "epoch": 7.66895668374476, |
| "grad_norm": 0.31734979152679443, |
| "learning_rate": 0.0005082737332556785, |
| "loss": 3.4598, |
| "step": 26350 |
| }, |
| { |
| "epoch": 7.683511877037727, |
| "grad_norm": 0.34644100069999695, |
| "learning_rate": 0.00050809900990099, |
| "loss": 3.4487, |
| "step": 26400 |
| }, |
| { |
| "epoch": 7.698067070330694, |
| "grad_norm": 0.3957260549068451, |
| "learning_rate": 0.0005079242865463016, |
| "loss": 3.4596, |
| "step": 26450 |
| }, |
| { |
| "epoch": 7.712622263623661, |
| "grad_norm": 0.3240290880203247, |
| "learning_rate": 0.0005077495631916133, |
| "loss": 3.4689, |
| "step": 26500 |
| }, |
| { |
| "epoch": 7.727177456916628, |
| "grad_norm": 0.34162086248397827, |
| "learning_rate": 0.0005075748398369248, |
| "loss": 3.4624, |
| "step": 26550 |
| }, |
| { |
| "epoch": 7.7417326502095944, |
| "grad_norm": 0.32350483536720276, |
| "learning_rate": 0.0005074001164822364, |
| "loss": 3.4564, |
| "step": 26600 |
| }, |
| { |
| "epoch": 7.7562878435025615, |
| "grad_norm": 0.32674768567085266, |
| "learning_rate": 0.000507225393127548, |
| "loss": 3.4634, |
| "step": 26650 |
| }, |
| { |
| "epoch": 7.770843036795529, |
| "grad_norm": 0.3366118371486664, |
| "learning_rate": 0.0005070506697728596, |
| "loss": 3.4634, |
| "step": 26700 |
| }, |
| { |
| "epoch": 7.785398230088496, |
| "grad_norm": 0.3206356167793274, |
| "learning_rate": 0.0005068759464181711, |
| "loss": 3.4658, |
| "step": 26750 |
| }, |
| { |
| "epoch": 7.799953423381463, |
| "grad_norm": 0.3736628592014313, |
| "learning_rate": 0.0005067012230634828, |
| "loss": 3.4623, |
| "step": 26800 |
| }, |
| { |
| "epoch": 7.81450861667443, |
| "grad_norm": 0.3338164687156677, |
| "learning_rate": 0.0005065264997087944, |
| "loss": 3.4624, |
| "step": 26850 |
| }, |
| { |
| "epoch": 7.829063809967396, |
| "grad_norm": 0.3364134132862091, |
| "learning_rate": 0.0005063517763541059, |
| "loss": 3.4584, |
| "step": 26900 |
| }, |
| { |
| "epoch": 7.843619003260363, |
| "grad_norm": 0.3480990529060364, |
| "learning_rate": 0.0005061770529994175, |
| "loss": 3.4576, |
| "step": 26950 |
| }, |
| { |
| "epoch": 7.85817419655333, |
| "grad_norm": 0.3255138397216797, |
| "learning_rate": 0.0005060023296447291, |
| "loss": 3.4755, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.85817419655333, |
| "eval_accuracy": 0.3662412922550328, |
| "eval_loss": 3.582038164138794, |
| "eval_runtime": 80.6694, |
| "eval_samples_per_second": 206.41, |
| "eval_steps_per_second": 12.905, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.872729389846297, |
| "grad_norm": 0.3669666349887848, |
| "learning_rate": 0.0005058276062900408, |
| "loss": 3.4551, |
| "step": 27050 |
| }, |
| { |
| "epoch": 7.887284583139264, |
| "grad_norm": 0.3200521469116211, |
| "learning_rate": 0.0005056528829353523, |
| "loss": 3.4637, |
| "step": 27100 |
| }, |
| { |
| "epoch": 7.901839776432231, |
| "grad_norm": 0.33549970388412476, |
| "learning_rate": 0.000505478159580664, |
| "loss": 3.4679, |
| "step": 27150 |
| }, |
| { |
| "epoch": 7.916394969725198, |
| "grad_norm": 0.32390254735946655, |
| "learning_rate": 0.0005053034362259755, |
| "loss": 3.4728, |
| "step": 27200 |
| }, |
| { |
| "epoch": 7.930950163018165, |
| "grad_norm": 0.3328930735588074, |
| "learning_rate": 0.000505128712871287, |
| "loss": 3.4652, |
| "step": 27250 |
| }, |
| { |
| "epoch": 7.9455053563111315, |
| "grad_norm": 0.3249746263027191, |
| "learning_rate": 0.0005049539895165987, |
| "loss": 3.4651, |
| "step": 27300 |
| }, |
| { |
| "epoch": 7.960060549604099, |
| "grad_norm": 0.3543454110622406, |
| "learning_rate": 0.0005047792661619103, |
| "loss": 3.4591, |
| "step": 27350 |
| }, |
| { |
| "epoch": 7.974615742897066, |
| "grad_norm": 0.3269532024860382, |
| "learning_rate": 0.0005046045428072219, |
| "loss": 3.4683, |
| "step": 27400 |
| }, |
| { |
| "epoch": 7.989170936190033, |
| "grad_norm": 0.318595826625824, |
| "learning_rate": 0.0005044298194525334, |
| "loss": 3.4505, |
| "step": 27450 |
| }, |
| { |
| "epoch": 8.003493246390311, |
| "grad_norm": 0.3522030711174011, |
| "learning_rate": 0.0005042550960978451, |
| "loss": 3.4485, |
| "step": 27500 |
| }, |
| { |
| "epoch": 8.018048439683279, |
| "grad_norm": 0.3393933176994324, |
| "learning_rate": 0.0005040803727431566, |
| "loss": 3.3642, |
| "step": 27550 |
| }, |
| { |
| "epoch": 8.032603632976246, |
| "grad_norm": 0.3365688621997833, |
| "learning_rate": 0.0005039056493884683, |
| "loss": 3.3535, |
| "step": 27600 |
| }, |
| { |
| "epoch": 8.047158826269213, |
| "grad_norm": 0.3570943772792816, |
| "learning_rate": 0.0005037309260337798, |
| "loss": 3.3594, |
| "step": 27650 |
| }, |
| { |
| "epoch": 8.06171401956218, |
| "grad_norm": 0.34117403626441956, |
| "learning_rate": 0.0005035562026790914, |
| "loss": 3.3681, |
| "step": 27700 |
| }, |
| { |
| "epoch": 8.076269212855147, |
| "grad_norm": 0.3326704204082489, |
| "learning_rate": 0.000503381479324403, |
| "loss": 3.3644, |
| "step": 27750 |
| }, |
| { |
| "epoch": 8.090824406148114, |
| "grad_norm": 0.35452020168304443, |
| "learning_rate": 0.0005032067559697145, |
| "loss": 3.3689, |
| "step": 27800 |
| }, |
| { |
| "epoch": 8.10537959944108, |
| "grad_norm": 0.3346391022205353, |
| "learning_rate": 0.0005030320326150262, |
| "loss": 3.359, |
| "step": 27850 |
| }, |
| { |
| "epoch": 8.119934792734048, |
| "grad_norm": 0.3348582983016968, |
| "learning_rate": 0.0005028573092603378, |
| "loss": 3.3779, |
| "step": 27900 |
| }, |
| { |
| "epoch": 8.134489986027015, |
| "grad_norm": 0.35497674345970154, |
| "learning_rate": 0.0005026825859056494, |
| "loss": 3.3901, |
| "step": 27950 |
| }, |
| { |
| "epoch": 8.149045179319982, |
| "grad_norm": 0.3535161018371582, |
| "learning_rate": 0.0005025078625509609, |
| "loss": 3.3854, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.149045179319982, |
| "eval_accuracy": 0.36600858819031573, |
| "eval_loss": 3.589311122894287, |
| "eval_runtime": 80.7519, |
| "eval_samples_per_second": 206.199, |
| "eval_steps_per_second": 12.891, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.16360037261295, |
| "grad_norm": 0.33929312229156494, |
| "learning_rate": 0.0005023331391962726, |
| "loss": 3.3852, |
| "step": 28050 |
| }, |
| { |
| "epoch": 8.178155565905914, |
| "grad_norm": 0.37256792187690735, |
| "learning_rate": 0.0005021584158415841, |
| "loss": 3.3846, |
| "step": 28100 |
| }, |
| { |
| "epoch": 8.192710759198881, |
| "grad_norm": 0.3273381292819977, |
| "learning_rate": 0.0005019836924868956, |
| "loss": 3.3934, |
| "step": 28150 |
| }, |
| { |
| "epoch": 8.207265952491849, |
| "grad_norm": 0.3291190266609192, |
| "learning_rate": 0.0005018089691322073, |
| "loss": 3.3943, |
| "step": 28200 |
| }, |
| { |
| "epoch": 8.221821145784816, |
| "grad_norm": 0.3618149161338806, |
| "learning_rate": 0.0005016342457775189, |
| "loss": 3.3959, |
| "step": 28250 |
| }, |
| { |
| "epoch": 8.236376339077783, |
| "grad_norm": 0.3529667556285858, |
| "learning_rate": 0.0005014595224228305, |
| "loss": 3.3832, |
| "step": 28300 |
| }, |
| { |
| "epoch": 8.25093153237075, |
| "grad_norm": 0.33151862025260925, |
| "learning_rate": 0.000501284799068142, |
| "loss": 3.3927, |
| "step": 28350 |
| }, |
| { |
| "epoch": 8.265486725663717, |
| "grad_norm": 0.35264867544174194, |
| "learning_rate": 0.0005011100757134537, |
| "loss": 3.391, |
| "step": 28400 |
| }, |
| { |
| "epoch": 8.280041918956684, |
| "grad_norm": 0.342640221118927, |
| "learning_rate": 0.0005009353523587653, |
| "loss": 3.406, |
| "step": 28450 |
| }, |
| { |
| "epoch": 8.294597112249651, |
| "grad_norm": 0.35768377780914307, |
| "learning_rate": 0.0005007606290040768, |
| "loss": 3.4005, |
| "step": 28500 |
| }, |
| { |
| "epoch": 8.309152305542618, |
| "grad_norm": 0.37022480368614197, |
| "learning_rate": 0.0005005859056493884, |
| "loss": 3.3888, |
| "step": 28550 |
| }, |
| { |
| "epoch": 8.323707498835585, |
| "grad_norm": 0.3579113781452179, |
| "learning_rate": 0.0005004111822947, |
| "loss": 3.3857, |
| "step": 28600 |
| }, |
| { |
| "epoch": 8.338262692128552, |
| "grad_norm": 0.3357008099555969, |
| "learning_rate": 0.0005002364589400116, |
| "loss": 3.4108, |
| "step": 28650 |
| }, |
| { |
| "epoch": 8.35281788542152, |
| "grad_norm": 0.345289945602417, |
| "learning_rate": 0.0005000617355853231, |
| "loss": 3.4068, |
| "step": 28700 |
| }, |
| { |
| "epoch": 8.367373078714486, |
| "grad_norm": 0.3502272069454193, |
| "learning_rate": 0.0004998870122306348, |
| "loss": 3.4201, |
| "step": 28750 |
| }, |
| { |
| "epoch": 8.381928272007451, |
| "grad_norm": 0.3254723846912384, |
| "learning_rate": 0.0004997122888759464, |
| "loss": 3.4172, |
| "step": 28800 |
| }, |
| { |
| "epoch": 8.396483465300419, |
| "grad_norm": 0.33674824237823486, |
| "learning_rate": 0.000499537565521258, |
| "loss": 3.4194, |
| "step": 28850 |
| }, |
| { |
| "epoch": 8.411038658593386, |
| "grad_norm": 0.3852351903915405, |
| "learning_rate": 0.0004993628421665695, |
| "loss": 3.418, |
| "step": 28900 |
| }, |
| { |
| "epoch": 8.425593851886353, |
| "grad_norm": 0.3287079930305481, |
| "learning_rate": 0.0004991881188118811, |
| "loss": 3.4043, |
| "step": 28950 |
| }, |
| { |
| "epoch": 8.44014904517932, |
| "grad_norm": 0.36066827178001404, |
| "learning_rate": 0.0004990133954571928, |
| "loss": 3.4074, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.44014904517932, |
| "eval_accuracy": 0.3665003224361624, |
| "eval_loss": 3.5851986408233643, |
| "eval_runtime": 80.6737, |
| "eval_samples_per_second": 206.399, |
| "eval_steps_per_second": 12.904, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.454704238472287, |
| "grad_norm": 0.34460631012916565, |
| "learning_rate": 0.0004988386721025043, |
| "loss": 3.4078, |
| "step": 29050 |
| }, |
| { |
| "epoch": 8.469259431765254, |
| "grad_norm": 0.3344840705394745, |
| "learning_rate": 0.0004986639487478159, |
| "loss": 3.4159, |
| "step": 29100 |
| }, |
| { |
| "epoch": 8.483814625058221, |
| "grad_norm": 0.3520732522010803, |
| "learning_rate": 0.0004984892253931275, |
| "loss": 3.4152, |
| "step": 29150 |
| }, |
| { |
| "epoch": 8.498369818351188, |
| "grad_norm": 0.325728178024292, |
| "learning_rate": 0.0004983145020384391, |
| "loss": 3.4028, |
| "step": 29200 |
| }, |
| { |
| "epoch": 8.512925011644155, |
| "grad_norm": 0.37337592244148254, |
| "learning_rate": 0.0004981397786837507, |
| "loss": 3.4058, |
| "step": 29250 |
| }, |
| { |
| "epoch": 8.527480204937122, |
| "grad_norm": 0.35207632184028625, |
| "learning_rate": 0.0004979650553290622, |
| "loss": 3.4373, |
| "step": 29300 |
| }, |
| { |
| "epoch": 8.54203539823009, |
| "grad_norm": 0.368791788816452, |
| "learning_rate": 0.0004977903319743739, |
| "loss": 3.4416, |
| "step": 29350 |
| }, |
| { |
| "epoch": 8.556590591523056, |
| "grad_norm": 0.36731842160224915, |
| "learning_rate": 0.0004976156086196854, |
| "loss": 3.425, |
| "step": 29400 |
| }, |
| { |
| "epoch": 8.571145784816022, |
| "grad_norm": 0.3556763529777527, |
| "learning_rate": 0.0004974408852649971, |
| "loss": 3.4242, |
| "step": 29450 |
| }, |
| { |
| "epoch": 8.585700978108989, |
| "grad_norm": 0.3634316027164459, |
| "learning_rate": 0.0004972661619103086, |
| "loss": 3.4212, |
| "step": 29500 |
| }, |
| { |
| "epoch": 8.600256171401956, |
| "grad_norm": 0.33771318197250366, |
| "learning_rate": 0.0004970914385556202, |
| "loss": 3.4255, |
| "step": 29550 |
| }, |
| { |
| "epoch": 8.614811364694923, |
| "grad_norm": 0.32704877853393555, |
| "learning_rate": 0.0004969167152009318, |
| "loss": 3.4262, |
| "step": 29600 |
| }, |
| { |
| "epoch": 8.62936655798789, |
| "grad_norm": 0.33070218563079834, |
| "learning_rate": 0.0004967419918462435, |
| "loss": 3.432, |
| "step": 29650 |
| }, |
| { |
| "epoch": 8.643921751280857, |
| "grad_norm": 0.34220090508461, |
| "learning_rate": 0.000496567268491555, |
| "loss": 3.421, |
| "step": 29700 |
| }, |
| { |
| "epoch": 8.658476944573824, |
| "grad_norm": 0.37015730142593384, |
| "learning_rate": 0.0004963925451368665, |
| "loss": 3.4348, |
| "step": 29750 |
| }, |
| { |
| "epoch": 8.673032137866791, |
| "grad_norm": 0.3155536353588104, |
| "learning_rate": 0.0004962178217821782, |
| "loss": 3.4358, |
| "step": 29800 |
| }, |
| { |
| "epoch": 8.687587331159758, |
| "grad_norm": 0.3707929849624634, |
| "learning_rate": 0.0004960430984274898, |
| "loss": 3.4317, |
| "step": 29850 |
| }, |
| { |
| "epoch": 8.702142524452725, |
| "grad_norm": 0.335113525390625, |
| "learning_rate": 0.0004958683750728014, |
| "loss": 3.4238, |
| "step": 29900 |
| }, |
| { |
| "epoch": 8.716697717745692, |
| "grad_norm": 0.32378828525543213, |
| "learning_rate": 0.0004956936517181129, |
| "loss": 3.4291, |
| "step": 29950 |
| }, |
| { |
| "epoch": 8.73125291103866, |
| "grad_norm": 0.35760384798049927, |
| "learning_rate": 0.0004955189283634246, |
| "loss": 3.4407, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.73125291103866, |
| "eval_accuracy": 0.36699252679123073, |
| "eval_loss": 3.573824405670166, |
| "eval_runtime": 80.6086, |
| "eval_samples_per_second": 206.566, |
| "eval_steps_per_second": 12.914, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.745808104331626, |
| "grad_norm": 0.322648286819458, |
| "learning_rate": 0.0004953442050087361, |
| "loss": 3.4383, |
| "step": 30050 |
| }, |
| { |
| "epoch": 8.760363297624593, |
| "grad_norm": 0.33331647515296936, |
| "learning_rate": 0.0004951694816540476, |
| "loss": 3.4113, |
| "step": 30100 |
| }, |
| { |
| "epoch": 8.774918490917559, |
| "grad_norm": 0.36145666241645813, |
| "learning_rate": 0.0004949947582993593, |
| "loss": 3.43, |
| "step": 30150 |
| }, |
| { |
| "epoch": 8.789473684210526, |
| "grad_norm": 0.3275957703590393, |
| "learning_rate": 0.0004948200349446709, |
| "loss": 3.4282, |
| "step": 30200 |
| }, |
| { |
| "epoch": 8.804028877503493, |
| "grad_norm": 0.3384641408920288, |
| "learning_rate": 0.0004946453115899825, |
| "loss": 3.4326, |
| "step": 30250 |
| }, |
| { |
| "epoch": 8.81858407079646, |
| "grad_norm": 0.3381904363632202, |
| "learning_rate": 0.000494470588235294, |
| "loss": 3.4269, |
| "step": 30300 |
| }, |
| { |
| "epoch": 8.833139264089427, |
| "grad_norm": 0.3424563705921173, |
| "learning_rate": 0.0004942958648806057, |
| "loss": 3.4322, |
| "step": 30350 |
| }, |
| { |
| "epoch": 8.847694457382394, |
| "grad_norm": 0.33910584449768066, |
| "learning_rate": 0.0004941211415259173, |
| "loss": 3.4322, |
| "step": 30400 |
| }, |
| { |
| "epoch": 8.862249650675361, |
| "grad_norm": 0.3556019067764282, |
| "learning_rate": 0.0004939464181712289, |
| "loss": 3.4368, |
| "step": 30450 |
| }, |
| { |
| "epoch": 8.876804843968328, |
| "grad_norm": 0.3441767990589142, |
| "learning_rate": 0.0004937716948165404, |
| "loss": 3.4447, |
| "step": 30500 |
| }, |
| { |
| "epoch": 8.891360037261295, |
| "grad_norm": 0.3617768883705139, |
| "learning_rate": 0.000493596971461852, |
| "loss": 3.4439, |
| "step": 30550 |
| }, |
| { |
| "epoch": 8.905915230554262, |
| "grad_norm": 0.3746488094329834, |
| "learning_rate": 0.0004934222481071636, |
| "loss": 3.4338, |
| "step": 30600 |
| }, |
| { |
| "epoch": 8.92047042384723, |
| "grad_norm": 0.3503645360469818, |
| "learning_rate": 0.0004932475247524751, |
| "loss": 3.4431, |
| "step": 30650 |
| }, |
| { |
| "epoch": 8.935025617140196, |
| "grad_norm": 0.345306396484375, |
| "learning_rate": 0.0004930728013977868, |
| "loss": 3.4443, |
| "step": 30700 |
| }, |
| { |
| "epoch": 8.949580810433163, |
| "grad_norm": 0.32689064741134644, |
| "learning_rate": 0.0004928980780430984, |
| "loss": 3.4392, |
| "step": 30750 |
| }, |
| { |
| "epoch": 8.964136003726129, |
| "grad_norm": 0.33641254901885986, |
| "learning_rate": 0.00049272335468841, |
| "loss": 3.4376, |
| "step": 30800 |
| }, |
| { |
| "epoch": 8.978691197019096, |
| "grad_norm": 0.30834445357322693, |
| "learning_rate": 0.0004925486313337215, |
| "loss": 3.4467, |
| "step": 30850 |
| }, |
| { |
| "epoch": 8.993246390312063, |
| "grad_norm": 0.3289487361907959, |
| "learning_rate": 0.0004923739079790332, |
| "loss": 3.428, |
| "step": 30900 |
| }, |
| { |
| "epoch": 9.007568700512342, |
| "grad_norm": 0.35091307759284973, |
| "learning_rate": 0.0004921991846243447, |
| "loss": 3.3764, |
| "step": 30950 |
| }, |
| { |
| "epoch": 9.02212389380531, |
| "grad_norm": 0.3354990482330322, |
| "learning_rate": 0.0004920244612696563, |
| "loss": 3.3176, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.02212389380531, |
| "eval_accuracy": 0.36725496526421725, |
| "eval_loss": 3.5825090408325195, |
| "eval_runtime": 80.6295, |
| "eval_samples_per_second": 206.513, |
| "eval_steps_per_second": 12.911, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.036679087098276, |
| "grad_norm": 0.3487994074821472, |
| "learning_rate": 0.0004918497379149679, |
| "loss": 3.3376, |
| "step": 31050 |
| }, |
| { |
| "epoch": 9.051234280391244, |
| "grad_norm": 0.34804099798202515, |
| "learning_rate": 0.0004916750145602795, |
| "loss": 3.3385, |
| "step": 31100 |
| }, |
| { |
| "epoch": 9.06578947368421, |
| "grad_norm": 0.37987184524536133, |
| "learning_rate": 0.0004915002912055911, |
| "loss": 3.3381, |
| "step": 31150 |
| }, |
| { |
| "epoch": 9.080344666977178, |
| "grad_norm": 0.34263938665390015, |
| "learning_rate": 0.0004913255678509026, |
| "loss": 3.3492, |
| "step": 31200 |
| }, |
| { |
| "epoch": 9.094899860270145, |
| "grad_norm": 0.353423535823822, |
| "learning_rate": 0.0004911508444962143, |
| "loss": 3.3489, |
| "step": 31250 |
| }, |
| { |
| "epoch": 9.109455053563112, |
| "grad_norm": 0.3398151099681854, |
| "learning_rate": 0.0004909761211415259, |
| "loss": 3.3551, |
| "step": 31300 |
| }, |
| { |
| "epoch": 9.124010246856079, |
| "grad_norm": 0.3823663294315338, |
| "learning_rate": 0.0004908013977868375, |
| "loss": 3.3563, |
| "step": 31350 |
| }, |
| { |
| "epoch": 9.138565440149046, |
| "grad_norm": 0.33569806814193726, |
| "learning_rate": 0.0004906266744321491, |
| "loss": 3.3448, |
| "step": 31400 |
| }, |
| { |
| "epoch": 9.153120633442011, |
| "grad_norm": 0.3402097523212433, |
| "learning_rate": 0.0004904519510774606, |
| "loss": 3.347, |
| "step": 31450 |
| }, |
| { |
| "epoch": 9.167675826734978, |
| "grad_norm": 0.3301268219947815, |
| "learning_rate": 0.0004902772277227722, |
| "loss": 3.3687, |
| "step": 31500 |
| }, |
| { |
| "epoch": 9.182231020027945, |
| "grad_norm": 0.36000022292137146, |
| "learning_rate": 0.0004901025043680838, |
| "loss": 3.3629, |
| "step": 31550 |
| }, |
| { |
| "epoch": 9.196786213320912, |
| "grad_norm": 0.36111563444137573, |
| "learning_rate": 0.0004899277810133955, |
| "loss": 3.3604, |
| "step": 31600 |
| }, |
| { |
| "epoch": 9.21134140661388, |
| "grad_norm": 0.3617996573448181, |
| "learning_rate": 0.000489753057658707, |
| "loss": 3.3642, |
| "step": 31650 |
| }, |
| { |
| "epoch": 9.225896599906847, |
| "grad_norm": 0.3363676369190216, |
| "learning_rate": 0.0004895783343040186, |
| "loss": 3.3719, |
| "step": 31700 |
| }, |
| { |
| "epoch": 9.240451793199814, |
| "grad_norm": 0.33455902338027954, |
| "learning_rate": 0.0004894036109493302, |
| "loss": 3.369, |
| "step": 31750 |
| }, |
| { |
| "epoch": 9.25500698649278, |
| "grad_norm": 0.35149598121643066, |
| "learning_rate": 0.0004892288875946419, |
| "loss": 3.3626, |
| "step": 31800 |
| }, |
| { |
| "epoch": 9.269562179785748, |
| "grad_norm": 0.3641233742237091, |
| "learning_rate": 0.0004890541642399534, |
| "loss": 3.3752, |
| "step": 31850 |
| }, |
| { |
| "epoch": 9.284117373078715, |
| "grad_norm": 0.3771204352378845, |
| "learning_rate": 0.0004888794408852649, |
| "loss": 3.3787, |
| "step": 31900 |
| }, |
| { |
| "epoch": 9.298672566371682, |
| "grad_norm": 0.3395664691925049, |
| "learning_rate": 0.0004887047175305766, |
| "loss": 3.38, |
| "step": 31950 |
| }, |
| { |
| "epoch": 9.313227759664649, |
| "grad_norm": 0.39188772439956665, |
| "learning_rate": 0.0004885299941758881, |
| "loss": 3.3657, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.313227759664649, |
| "eval_accuracy": 0.36743807280605023, |
| "eval_loss": 3.5808088779449463, |
| "eval_runtime": 80.6163, |
| "eval_samples_per_second": 206.546, |
| "eval_steps_per_second": 12.913, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.327782952957616, |
| "grad_norm": 0.32802528142929077, |
| "learning_rate": 0.0004883552708211997, |
| "loss": 3.38, |
| "step": 32050 |
| }, |
| { |
| "epoch": 9.342338146250583, |
| "grad_norm": 0.3449357748031616, |
| "learning_rate": 0.00048818054746651137, |
| "loss": 3.373, |
| "step": 32100 |
| }, |
| { |
| "epoch": 9.356893339543548, |
| "grad_norm": 0.3312017619609833, |
| "learning_rate": 0.0004880058241118229, |
| "loss": 3.38, |
| "step": 32150 |
| }, |
| { |
| "epoch": 9.371448532836515, |
| "grad_norm": 0.3633946180343628, |
| "learning_rate": 0.0004878311007571345, |
| "loss": 3.3821, |
| "step": 32200 |
| }, |
| { |
| "epoch": 9.386003726129482, |
| "grad_norm": 0.339104026556015, |
| "learning_rate": 0.0004876563774024461, |
| "loss": 3.3863, |
| "step": 32250 |
| }, |
| { |
| "epoch": 9.40055891942245, |
| "grad_norm": 0.3600173890590668, |
| "learning_rate": 0.00048748165404775763, |
| "loss": 3.3895, |
| "step": 32300 |
| }, |
| { |
| "epoch": 9.415114112715417, |
| "grad_norm": 0.3350042402744293, |
| "learning_rate": 0.0004873069306930693, |
| "loss": 3.3818, |
| "step": 32350 |
| }, |
| { |
| "epoch": 9.429669306008384, |
| "grad_norm": 0.3640735149383545, |
| "learning_rate": 0.0004871322073383809, |
| "loss": 3.3903, |
| "step": 32400 |
| }, |
| { |
| "epoch": 9.44422449930135, |
| "grad_norm": 0.34667709469795227, |
| "learning_rate": 0.00048695748398369247, |
| "loss": 3.3916, |
| "step": 32450 |
| }, |
| { |
| "epoch": 9.458779692594318, |
| "grad_norm": 0.3317042291164398, |
| "learning_rate": 0.000486782760629004, |
| "loss": 3.3894, |
| "step": 32500 |
| }, |
| { |
| "epoch": 9.473334885887285, |
| "grad_norm": 0.36951321363449097, |
| "learning_rate": 0.0004866080372743156, |
| "loss": 3.3846, |
| "step": 32550 |
| }, |
| { |
| "epoch": 9.487890079180252, |
| "grad_norm": 0.35966700315475464, |
| "learning_rate": 0.0004864333139196272, |
| "loss": 3.3956, |
| "step": 32600 |
| }, |
| { |
| "epoch": 9.502445272473219, |
| "grad_norm": 0.37394630908966064, |
| "learning_rate": 0.00048625859056493885, |
| "loss": 3.3927, |
| "step": 32650 |
| }, |
| { |
| "epoch": 9.517000465766186, |
| "grad_norm": 0.35459792613983154, |
| "learning_rate": 0.0004860838672102504, |
| "loss": 3.3979, |
| "step": 32700 |
| }, |
| { |
| "epoch": 9.531555659059153, |
| "grad_norm": 0.35643768310546875, |
| "learning_rate": 0.000485909143855562, |
| "loss": 3.402, |
| "step": 32750 |
| }, |
| { |
| "epoch": 9.546110852352118, |
| "grad_norm": 0.3416605293750763, |
| "learning_rate": 0.0004857344205008736, |
| "loss": 3.3937, |
| "step": 32800 |
| }, |
| { |
| "epoch": 9.560666045645085, |
| "grad_norm": 0.34179872274398804, |
| "learning_rate": 0.00048555969714618517, |
| "loss": 3.3891, |
| "step": 32850 |
| }, |
| { |
| "epoch": 9.575221238938052, |
| "grad_norm": 0.3436225652694702, |
| "learning_rate": 0.0004853849737914967, |
| "loss": 3.3976, |
| "step": 32900 |
| }, |
| { |
| "epoch": 9.58977643223102, |
| "grad_norm": 0.35517799854278564, |
| "learning_rate": 0.00048521025043680836, |
| "loss": 3.402, |
| "step": 32950 |
| }, |
| { |
| "epoch": 9.604331625523987, |
| "grad_norm": 0.3344137668609619, |
| "learning_rate": 0.00048503552708211995, |
| "loss": 3.3947, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.604331625523987, |
| "eval_accuracy": 0.36812760550690643, |
| "eval_loss": 3.5693697929382324, |
| "eval_runtime": 80.7995, |
| "eval_samples_per_second": 206.078, |
| "eval_steps_per_second": 12.884, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.618886818816954, |
| "grad_norm": 0.3380126953125, |
| "learning_rate": 0.00048486080372743155, |
| "loss": 3.3951, |
| "step": 33050 |
| }, |
| { |
| "epoch": 9.63344201210992, |
| "grad_norm": 0.33093544840812683, |
| "learning_rate": 0.0004846860803727431, |
| "loss": 3.4064, |
| "step": 33100 |
| }, |
| { |
| "epoch": 9.647997205402888, |
| "grad_norm": 0.32893943786621094, |
| "learning_rate": 0.0004845113570180547, |
| "loss": 3.398, |
| "step": 33150 |
| }, |
| { |
| "epoch": 9.662552398695855, |
| "grad_norm": 0.3626655638217926, |
| "learning_rate": 0.00048433663366336633, |
| "loss": 3.4024, |
| "step": 33200 |
| }, |
| { |
| "epoch": 9.677107591988822, |
| "grad_norm": 0.37326326966285706, |
| "learning_rate": 0.0004841619103086779, |
| "loss": 3.4041, |
| "step": 33250 |
| }, |
| { |
| "epoch": 9.691662785281789, |
| "grad_norm": 0.33492520451545715, |
| "learning_rate": 0.00048398718695398947, |
| "loss": 3.4065, |
| "step": 33300 |
| }, |
| { |
| "epoch": 9.706217978574756, |
| "grad_norm": 0.33473700284957886, |
| "learning_rate": 0.00048381246359930106, |
| "loss": 3.4059, |
| "step": 33350 |
| }, |
| { |
| "epoch": 9.720773171867723, |
| "grad_norm": 0.3173752427101135, |
| "learning_rate": 0.00048363774024461265, |
| "loss": 3.4122, |
| "step": 33400 |
| }, |
| { |
| "epoch": 9.73532836516069, |
| "grad_norm": 0.3227013945579529, |
| "learning_rate": 0.0004834630168899242, |
| "loss": 3.3922, |
| "step": 33450 |
| }, |
| { |
| "epoch": 9.749883558453657, |
| "grad_norm": 0.3504323363304138, |
| "learning_rate": 0.00048328829353523584, |
| "loss": 3.4092, |
| "step": 33500 |
| }, |
| { |
| "epoch": 9.764438751746622, |
| "grad_norm": 0.3445659875869751, |
| "learning_rate": 0.00048311357018054744, |
| "loss": 3.3993, |
| "step": 33550 |
| }, |
| { |
| "epoch": 9.77899394503959, |
| "grad_norm": 0.3275570869445801, |
| "learning_rate": 0.00048293884682585903, |
| "loss": 3.4151, |
| "step": 33600 |
| }, |
| { |
| "epoch": 9.793549138332557, |
| "grad_norm": 0.34345829486846924, |
| "learning_rate": 0.00048276412347117057, |
| "loss": 3.4176, |
| "step": 33650 |
| }, |
| { |
| "epoch": 9.808104331625524, |
| "grad_norm": 0.34399914741516113, |
| "learning_rate": 0.00048258940011648217, |
| "loss": 3.4076, |
| "step": 33700 |
| }, |
| { |
| "epoch": 9.82265952491849, |
| "grad_norm": 0.3698842525482178, |
| "learning_rate": 0.0004824146767617938, |
| "loss": 3.4107, |
| "step": 33750 |
| }, |
| { |
| "epoch": 9.837214718211458, |
| "grad_norm": 0.3306451439857483, |
| "learning_rate": 0.0004822399534071054, |
| "loss": 3.4118, |
| "step": 33800 |
| }, |
| { |
| "epoch": 9.851769911504425, |
| "grad_norm": 0.4052211046218872, |
| "learning_rate": 0.00048206523005241695, |
| "loss": 3.4068, |
| "step": 33850 |
| }, |
| { |
| "epoch": 9.866325104797392, |
| "grad_norm": 0.32337918877601624, |
| "learning_rate": 0.00048189050669772854, |
| "loss": 3.4078, |
| "step": 33900 |
| }, |
| { |
| "epoch": 9.880880298090359, |
| "grad_norm": 0.3355276882648468, |
| "learning_rate": 0.00048171578334304014, |
| "loss": 3.4092, |
| "step": 33950 |
| }, |
| { |
| "epoch": 9.895435491383326, |
| "grad_norm": 0.37603944540023804, |
| "learning_rate": 0.00048154105998835173, |
| "loss": 3.423, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.895435491383326, |
| "eval_accuracy": 0.36860147560233036, |
| "eval_loss": 3.5630943775177, |
| "eval_runtime": 80.9801, |
| "eval_samples_per_second": 205.618, |
| "eval_steps_per_second": 12.855, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.909990684676293, |
| "grad_norm": 0.3216181993484497, |
| "learning_rate": 0.0004813663366336633, |
| "loss": 3.4118, |
| "step": 34050 |
| }, |
| { |
| "epoch": 9.92454587796926, |
| "grad_norm": 0.32490965723991394, |
| "learning_rate": 0.0004811916132789749, |
| "loss": 3.4175, |
| "step": 34100 |
| }, |
| { |
| "epoch": 9.939101071262227, |
| "grad_norm": 0.34413382411003113, |
| "learning_rate": 0.0004810168899242865, |
| "loss": 3.3998, |
| "step": 34150 |
| }, |
| { |
| "epoch": 9.953656264555192, |
| "grad_norm": 0.35348689556121826, |
| "learning_rate": 0.0004808421665695981, |
| "loss": 3.4162, |
| "step": 34200 |
| }, |
| { |
| "epoch": 9.96821145784816, |
| "grad_norm": 0.3519061207771301, |
| "learning_rate": 0.00048066744321490965, |
| "loss": 3.4131, |
| "step": 34250 |
| }, |
| { |
| "epoch": 9.982766651141127, |
| "grad_norm": 0.33331534266471863, |
| "learning_rate": 0.00048049271986022124, |
| "loss": 3.4186, |
| "step": 34300 |
| }, |
| { |
| "epoch": 9.997321844434094, |
| "grad_norm": 0.3361709713935852, |
| "learning_rate": 0.0004803179965055329, |
| "loss": 3.4185, |
| "step": 34350 |
| }, |
| { |
| "epoch": 10.011644154634373, |
| "grad_norm": 0.35055112838745117, |
| "learning_rate": 0.0004801432731508445, |
| "loss": 3.3286, |
| "step": 34400 |
| }, |
| { |
| "epoch": 10.02619934792734, |
| "grad_norm": 0.33240410685539246, |
| "learning_rate": 0.000479968549796156, |
| "loss": 3.2997, |
| "step": 34450 |
| }, |
| { |
| "epoch": 10.040754541220307, |
| "grad_norm": 0.35409680008888245, |
| "learning_rate": 0.0004797938264414676, |
| "loss": 3.3063, |
| "step": 34500 |
| }, |
| { |
| "epoch": 10.055309734513274, |
| "grad_norm": 0.32570916414260864, |
| "learning_rate": 0.0004796191030867792, |
| "loss": 3.304, |
| "step": 34550 |
| }, |
| { |
| "epoch": 10.069864927806242, |
| "grad_norm": 0.3497979938983917, |
| "learning_rate": 0.00047944437973209086, |
| "loss": 3.3137, |
| "step": 34600 |
| }, |
| { |
| "epoch": 10.084420121099209, |
| "grad_norm": 0.3470655679702759, |
| "learning_rate": 0.0004792696563774024, |
| "loss": 3.3269, |
| "step": 34650 |
| }, |
| { |
| "epoch": 10.098975314392176, |
| "grad_norm": 0.37032514810562134, |
| "learning_rate": 0.000479094933022714, |
| "loss": 3.3344, |
| "step": 34700 |
| }, |
| { |
| "epoch": 10.113530507685143, |
| "grad_norm": 0.3583158254623413, |
| "learning_rate": 0.0004789202096680256, |
| "loss": 3.3228, |
| "step": 34750 |
| }, |
| { |
| "epoch": 10.12808570097811, |
| "grad_norm": 0.34624138474464417, |
| "learning_rate": 0.00047874548631333713, |
| "loss": 3.3303, |
| "step": 34800 |
| }, |
| { |
| "epoch": 10.142640894271075, |
| "grad_norm": 0.3356753885746002, |
| "learning_rate": 0.0004785707629586487, |
| "loss": 3.3346, |
| "step": 34850 |
| }, |
| { |
| "epoch": 10.157196087564042, |
| "grad_norm": 0.3515348732471466, |
| "learning_rate": 0.0004783960396039604, |
| "loss": 3.3411, |
| "step": 34900 |
| }, |
| { |
| "epoch": 10.17175128085701, |
| "grad_norm": 0.3411479592323303, |
| "learning_rate": 0.00047822131624927197, |
| "loss": 3.3347, |
| "step": 34950 |
| }, |
| { |
| "epoch": 10.186306474149976, |
| "grad_norm": 0.37313827872276306, |
| "learning_rate": 0.0004780465928945835, |
| "loss": 3.3361, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.186306474149976, |
| "eval_accuracy": 0.3683544332063529, |
| "eval_loss": 3.576887845993042, |
| "eval_runtime": 80.5796, |
| "eval_samples_per_second": 206.64, |
| "eval_steps_per_second": 12.919, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.200861667442943, |
| "grad_norm": 0.3882916271686554, |
| "learning_rate": 0.0004778718695398951, |
| "loss": 3.342, |
| "step": 35050 |
| }, |
| { |
| "epoch": 10.21541686073591, |
| "grad_norm": 0.3489084541797638, |
| "learning_rate": 0.0004776971461852067, |
| "loss": 3.3346, |
| "step": 35100 |
| }, |
| { |
| "epoch": 10.229972054028877, |
| "grad_norm": 0.3721896708011627, |
| "learning_rate": 0.00047752242283051835, |
| "loss": 3.3468, |
| "step": 35150 |
| }, |
| { |
| "epoch": 10.244527247321844, |
| "grad_norm": 0.34356698393821716, |
| "learning_rate": 0.00047734769947582994, |
| "loss": 3.3488, |
| "step": 35200 |
| }, |
| { |
| "epoch": 10.259082440614812, |
| "grad_norm": 0.33439069986343384, |
| "learning_rate": 0.0004771729761211415, |
| "loss": 3.3594, |
| "step": 35250 |
| }, |
| { |
| "epoch": 10.273637633907779, |
| "grad_norm": 0.35101476311683655, |
| "learning_rate": 0.0004769982527664531, |
| "loss": 3.3484, |
| "step": 35300 |
| }, |
| { |
| "epoch": 10.288192827200746, |
| "grad_norm": 0.33851975202560425, |
| "learning_rate": 0.00047682352941176467, |
| "loss": 3.3528, |
| "step": 35350 |
| }, |
| { |
| "epoch": 10.302748020493713, |
| "grad_norm": 0.3419068157672882, |
| "learning_rate": 0.0004766488060570762, |
| "loss": 3.3561, |
| "step": 35400 |
| }, |
| { |
| "epoch": 10.31730321378668, |
| "grad_norm": 0.3344142436981201, |
| "learning_rate": 0.00047647408270238786, |
| "loss": 3.3563, |
| "step": 35450 |
| }, |
| { |
| "epoch": 10.331858407079647, |
| "grad_norm": 0.36310529708862305, |
| "learning_rate": 0.00047629935934769945, |
| "loss": 3.3676, |
| "step": 35500 |
| }, |
| { |
| "epoch": 10.346413600372612, |
| "grad_norm": 0.36880937218666077, |
| "learning_rate": 0.00047612463599301105, |
| "loss": 3.3419, |
| "step": 35550 |
| }, |
| { |
| "epoch": 10.36096879366558, |
| "grad_norm": 0.3478317856788635, |
| "learning_rate": 0.0004759499126383226, |
| "loss": 3.3422, |
| "step": 35600 |
| }, |
| { |
| "epoch": 10.375523986958546, |
| "grad_norm": 0.3487631380558014, |
| "learning_rate": 0.0004757751892836342, |
| "loss": 3.3558, |
| "step": 35650 |
| }, |
| { |
| "epoch": 10.390079180251513, |
| "grad_norm": 0.34165093302726746, |
| "learning_rate": 0.0004756004659289458, |
| "loss": 3.3697, |
| "step": 35700 |
| }, |
| { |
| "epoch": 10.40463437354448, |
| "grad_norm": 0.32482489943504333, |
| "learning_rate": 0.0004754257425742574, |
| "loss": 3.3566, |
| "step": 35750 |
| }, |
| { |
| "epoch": 10.419189566837447, |
| "grad_norm": 0.3664044737815857, |
| "learning_rate": 0.00047525101921956896, |
| "loss": 3.3695, |
| "step": 35800 |
| }, |
| { |
| "epoch": 10.433744760130415, |
| "grad_norm": 0.3533634543418884, |
| "learning_rate": 0.00047507629586488056, |
| "loss": 3.3769, |
| "step": 35850 |
| }, |
| { |
| "epoch": 10.448299953423382, |
| "grad_norm": 0.35418933629989624, |
| "learning_rate": 0.00047490157251019215, |
| "loss": 3.3571, |
| "step": 35900 |
| }, |
| { |
| "epoch": 10.462855146716349, |
| "grad_norm": 0.3441552221775055, |
| "learning_rate": 0.0004747268491555037, |
| "loss": 3.3755, |
| "step": 35950 |
| }, |
| { |
| "epoch": 10.477410340009316, |
| "grad_norm": 0.36008110642433167, |
| "learning_rate": 0.00047455212580081534, |
| "loss": 3.3663, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.477410340009316, |
| "eval_accuracy": 0.36862251298999926, |
| "eval_loss": 3.568178653717041, |
| "eval_runtime": 80.6732, |
| "eval_samples_per_second": 206.401, |
| "eval_steps_per_second": 12.904, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.491965533302283, |
| "grad_norm": 0.3479558229446411, |
| "learning_rate": 0.00047437740244612694, |
| "loss": 3.3715, |
| "step": 36050 |
| }, |
| { |
| "epoch": 10.50652072659525, |
| "grad_norm": 0.36907416582107544, |
| "learning_rate": 0.00047420267909143853, |
| "loss": 3.3763, |
| "step": 36100 |
| }, |
| { |
| "epoch": 10.521075919888217, |
| "grad_norm": 0.3439713716506958, |
| "learning_rate": 0.0004740279557367501, |
| "loss": 3.3824, |
| "step": 36150 |
| }, |
| { |
| "epoch": 10.535631113181182, |
| "grad_norm": 0.35420823097229004, |
| "learning_rate": 0.00047385323238206166, |
| "loss": 3.3745, |
| "step": 36200 |
| }, |
| { |
| "epoch": 10.55018630647415, |
| "grad_norm": 0.346736341714859, |
| "learning_rate": 0.00047367850902737326, |
| "loss": 3.3725, |
| "step": 36250 |
| }, |
| { |
| "epoch": 10.564741499767116, |
| "grad_norm": 0.3463842272758484, |
| "learning_rate": 0.0004735037856726849, |
| "loss": 3.3684, |
| "step": 36300 |
| }, |
| { |
| "epoch": 10.579296693060083, |
| "grad_norm": 0.36961302161216736, |
| "learning_rate": 0.0004733290623179965, |
| "loss": 3.3773, |
| "step": 36350 |
| }, |
| { |
| "epoch": 10.59385188635305, |
| "grad_norm": 0.3354074954986572, |
| "learning_rate": 0.00047315433896330804, |
| "loss": 3.3777, |
| "step": 36400 |
| }, |
| { |
| "epoch": 10.608407079646017, |
| "grad_norm": 0.3874535858631134, |
| "learning_rate": 0.00047297961560861964, |
| "loss": 3.38, |
| "step": 36450 |
| }, |
| { |
| "epoch": 10.622962272938985, |
| "grad_norm": 0.3506501615047455, |
| "learning_rate": 0.00047280489225393123, |
| "loss": 3.3809, |
| "step": 36500 |
| }, |
| { |
| "epoch": 10.637517466231952, |
| "grad_norm": 0.36295077204704285, |
| "learning_rate": 0.0004726301688992429, |
| "loss": 3.369, |
| "step": 36550 |
| }, |
| { |
| "epoch": 10.652072659524919, |
| "grad_norm": 0.3743818998336792, |
| "learning_rate": 0.0004724554455445544, |
| "loss": 3.3871, |
| "step": 36600 |
| }, |
| { |
| "epoch": 10.666627852817886, |
| "grad_norm": 0.3380889892578125, |
| "learning_rate": 0.000472280722189866, |
| "loss": 3.3787, |
| "step": 36650 |
| }, |
| { |
| "epoch": 10.681183046110853, |
| "grad_norm": 0.34594374895095825, |
| "learning_rate": 0.0004721059988351776, |
| "loss": 3.3821, |
| "step": 36700 |
| }, |
| { |
| "epoch": 10.69573823940382, |
| "grad_norm": 0.3528180718421936, |
| "learning_rate": 0.00047193127548048915, |
| "loss": 3.3826, |
| "step": 36750 |
| }, |
| { |
| "epoch": 10.710293432696787, |
| "grad_norm": 0.32848599553108215, |
| "learning_rate": 0.00047175655212580074, |
| "loss": 3.3765, |
| "step": 36800 |
| }, |
| { |
| "epoch": 10.724848625989754, |
| "grad_norm": 0.35429057478904724, |
| "learning_rate": 0.0004715818287711124, |
| "loss": 3.3856, |
| "step": 36850 |
| }, |
| { |
| "epoch": 10.73940381928272, |
| "grad_norm": 0.3778534531593323, |
| "learning_rate": 0.000471407105416424, |
| "loss": 3.3716, |
| "step": 36900 |
| }, |
| { |
| "epoch": 10.753959012575686, |
| "grad_norm": 0.3315613865852356, |
| "learning_rate": 0.0004712323820617355, |
| "loss": 3.382, |
| "step": 36950 |
| }, |
| { |
| "epoch": 10.768514205868653, |
| "grad_norm": 0.3546532392501831, |
| "learning_rate": 0.0004710576587070471, |
| "loss": 3.3733, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.768514205868653, |
| "eval_accuracy": 0.36933966460762746, |
| "eval_loss": 3.560076951980591, |
| "eval_runtime": 80.5485, |
| "eval_samples_per_second": 206.72, |
| "eval_steps_per_second": 12.924, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.78306939916162, |
| "grad_norm": 0.33720970153808594, |
| "learning_rate": 0.0004708829353523587, |
| "loss": 3.3881, |
| "step": 37050 |
| }, |
| { |
| "epoch": 10.797624592454587, |
| "grad_norm": 0.3673788011074066, |
| "learning_rate": 0.0004707082119976703, |
| "loss": 3.3832, |
| "step": 37100 |
| }, |
| { |
| "epoch": 10.812179785747555, |
| "grad_norm": 0.3662302792072296, |
| "learning_rate": 0.0004705334886429819, |
| "loss": 3.3823, |
| "step": 37150 |
| }, |
| { |
| "epoch": 10.826734979040522, |
| "grad_norm": 0.33522477746009827, |
| "learning_rate": 0.0004703587652882935, |
| "loss": 3.3849, |
| "step": 37200 |
| }, |
| { |
| "epoch": 10.841290172333489, |
| "grad_norm": 0.3745405972003937, |
| "learning_rate": 0.0004701840419336051, |
| "loss": 3.395, |
| "step": 37250 |
| }, |
| { |
| "epoch": 10.855845365626456, |
| "grad_norm": 0.355096697807312, |
| "learning_rate": 0.0004700093185789167, |
| "loss": 3.3828, |
| "step": 37300 |
| }, |
| { |
| "epoch": 10.870400558919423, |
| "grad_norm": 0.3523814380168915, |
| "learning_rate": 0.0004698345952242282, |
| "loss": 3.3962, |
| "step": 37350 |
| }, |
| { |
| "epoch": 10.88495575221239, |
| "grad_norm": 0.34357428550720215, |
| "learning_rate": 0.00046965987186953987, |
| "loss": 3.3983, |
| "step": 37400 |
| }, |
| { |
| "epoch": 10.899510945505357, |
| "grad_norm": 0.33313852548599243, |
| "learning_rate": 0.00046948514851485147, |
| "loss": 3.4045, |
| "step": 37450 |
| }, |
| { |
| "epoch": 10.914066138798324, |
| "grad_norm": 0.3556467592716217, |
| "learning_rate": 0.00046931042516016306, |
| "loss": 3.3832, |
| "step": 37500 |
| }, |
| { |
| "epoch": 10.92862133209129, |
| "grad_norm": 0.34138110280036926, |
| "learning_rate": 0.0004691357018054746, |
| "loss": 3.3925, |
| "step": 37550 |
| }, |
| { |
| "epoch": 10.943176525384256, |
| "grad_norm": 0.3168981969356537, |
| "learning_rate": 0.0004689609784507862, |
| "loss": 3.3878, |
| "step": 37600 |
| }, |
| { |
| "epoch": 10.957731718677223, |
| "grad_norm": 0.34316354990005493, |
| "learning_rate": 0.0004687862550960978, |
| "loss": 3.3995, |
| "step": 37650 |
| }, |
| { |
| "epoch": 10.97228691197019, |
| "grad_norm": 0.34419411420822144, |
| "learning_rate": 0.00046861153174140944, |
| "loss": 3.3972, |
| "step": 37700 |
| }, |
| { |
| "epoch": 10.986842105263158, |
| "grad_norm": 0.37786999344825745, |
| "learning_rate": 0.000468436808386721, |
| "loss": 3.402, |
| "step": 37750 |
| }, |
| { |
| "epoch": 11.001164415463437, |
| "grad_norm": 0.3877948522567749, |
| "learning_rate": 0.0004682620850320326, |
| "loss": 3.3899, |
| "step": 37800 |
| }, |
| { |
| "epoch": 11.015719608756404, |
| "grad_norm": 0.34007924795150757, |
| "learning_rate": 0.00046808736167734417, |
| "loss": 3.2865, |
| "step": 37850 |
| }, |
| { |
| "epoch": 11.030274802049371, |
| "grad_norm": 0.36556553840637207, |
| "learning_rate": 0.0004679126383226557, |
| "loss": 3.2896, |
| "step": 37900 |
| }, |
| { |
| "epoch": 11.044829995342338, |
| "grad_norm": 0.33104008436203003, |
| "learning_rate": 0.00046773791496796736, |
| "loss": 3.2937, |
| "step": 37950 |
| }, |
| { |
| "epoch": 11.059385188635305, |
| "grad_norm": 0.33673563599586487, |
| "learning_rate": 0.00046756319161327895, |
| "loss": 3.2945, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.059385188635305, |
| "eval_accuracy": 0.3689408944603622, |
| "eval_loss": 3.5687294006347656, |
| "eval_runtime": 80.5465, |
| "eval_samples_per_second": 206.725, |
| "eval_steps_per_second": 12.924, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.073940381928272, |
| "grad_norm": 0.34879371523857117, |
| "learning_rate": 0.00046738846825859054, |
| "loss": 3.3033, |
| "step": 38050 |
| }, |
| { |
| "epoch": 11.08849557522124, |
| "grad_norm": 0.3467499315738678, |
| "learning_rate": 0.0004672137449039021, |
| "loss": 3.2979, |
| "step": 38100 |
| }, |
| { |
| "epoch": 11.103050768514207, |
| "grad_norm": 0.3651968538761139, |
| "learning_rate": 0.0004670390215492137, |
| "loss": 3.3004, |
| "step": 38150 |
| }, |
| { |
| "epoch": 11.117605961807174, |
| "grad_norm": 0.3391800820827484, |
| "learning_rate": 0.0004668642981945253, |
| "loss": 3.3073, |
| "step": 38200 |
| }, |
| { |
| "epoch": 11.132161155100139, |
| "grad_norm": 0.3402816653251648, |
| "learning_rate": 0.0004666895748398369, |
| "loss": 3.2998, |
| "step": 38250 |
| }, |
| { |
| "epoch": 11.146716348393106, |
| "grad_norm": 0.3482193052768707, |
| "learning_rate": 0.00046651485148514846, |
| "loss": 3.3243, |
| "step": 38300 |
| }, |
| { |
| "epoch": 11.161271541686073, |
| "grad_norm": 0.36231711506843567, |
| "learning_rate": 0.00046634012813046006, |
| "loss": 3.3174, |
| "step": 38350 |
| }, |
| { |
| "epoch": 11.17582673497904, |
| "grad_norm": 0.4014952480792999, |
| "learning_rate": 0.00046616540477577165, |
| "loss": 3.3257, |
| "step": 38400 |
| }, |
| { |
| "epoch": 11.190381928272007, |
| "grad_norm": 0.36003029346466064, |
| "learning_rate": 0.00046599068142108324, |
| "loss": 3.3263, |
| "step": 38450 |
| }, |
| { |
| "epoch": 11.204937121564974, |
| "grad_norm": 0.35195857286453247, |
| "learning_rate": 0.0004658159580663948, |
| "loss": 3.3165, |
| "step": 38500 |
| }, |
| { |
| "epoch": 11.219492314857941, |
| "grad_norm": 0.3698701858520508, |
| "learning_rate": 0.00046564123471170643, |
| "loss": 3.3134, |
| "step": 38550 |
| }, |
| { |
| "epoch": 11.234047508150908, |
| "grad_norm": 0.35177069902420044, |
| "learning_rate": 0.00046546651135701803, |
| "loss": 3.3186, |
| "step": 38600 |
| }, |
| { |
| "epoch": 11.248602701443875, |
| "grad_norm": 0.37909796833992004, |
| "learning_rate": 0.0004652917880023296, |
| "loss": 3.3206, |
| "step": 38650 |
| }, |
| { |
| "epoch": 11.263157894736842, |
| "grad_norm": 0.34829825162887573, |
| "learning_rate": 0.00046511706464764116, |
| "loss": 3.3332, |
| "step": 38700 |
| }, |
| { |
| "epoch": 11.27771308802981, |
| "grad_norm": 0.35905563831329346, |
| "learning_rate": 0.00046494234129295276, |
| "loss": 3.334, |
| "step": 38750 |
| }, |
| { |
| "epoch": 11.292268281322777, |
| "grad_norm": 0.3676919639110565, |
| "learning_rate": 0.0004647676179382644, |
| "loss": 3.3245, |
| "step": 38800 |
| }, |
| { |
| "epoch": 11.306823474615744, |
| "grad_norm": 0.3640693128108978, |
| "learning_rate": 0.000464592894583576, |
| "loss": 3.3278, |
| "step": 38850 |
| }, |
| { |
| "epoch": 11.32137866790871, |
| "grad_norm": 0.38337743282318115, |
| "learning_rate": 0.00046441817122888754, |
| "loss": 3.3309, |
| "step": 38900 |
| }, |
| { |
| "epoch": 11.335933861201676, |
| "grad_norm": 0.40273717045783997, |
| "learning_rate": 0.00046424344787419913, |
| "loss": 3.3402, |
| "step": 38950 |
| }, |
| { |
| "epoch": 11.350489054494643, |
| "grad_norm": 0.3661263883113861, |
| "learning_rate": 0.00046406872451951073, |
| "loss": 3.3349, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.350489054494643, |
| "eval_accuracy": 0.3692438798537161, |
| "eval_loss": 3.567197561264038, |
| "eval_runtime": 80.6423, |
| "eval_samples_per_second": 206.48, |
| "eval_steps_per_second": 12.909, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.36504424778761, |
| "grad_norm": 0.35090526938438416, |
| "learning_rate": 0.00046389400116482227, |
| "loss": 3.3383, |
| "step": 39050 |
| }, |
| { |
| "epoch": 11.379599441080577, |
| "grad_norm": 0.34823402762413025, |
| "learning_rate": 0.0004637192778101339, |
| "loss": 3.3438, |
| "step": 39100 |
| }, |
| { |
| "epoch": 11.394154634373544, |
| "grad_norm": 0.36821842193603516, |
| "learning_rate": 0.0004635445544554455, |
| "loss": 3.3405, |
| "step": 39150 |
| }, |
| { |
| "epoch": 11.408709827666511, |
| "grad_norm": 0.3392237722873688, |
| "learning_rate": 0.0004633698311007571, |
| "loss": 3.3475, |
| "step": 39200 |
| }, |
| { |
| "epoch": 11.423265020959478, |
| "grad_norm": 0.3582727015018463, |
| "learning_rate": 0.0004631951077460687, |
| "loss": 3.3406, |
| "step": 39250 |
| }, |
| { |
| "epoch": 11.437820214252445, |
| "grad_norm": 0.37612417340278625, |
| "learning_rate": 0.00046302038439138024, |
| "loss": 3.3438, |
| "step": 39300 |
| }, |
| { |
| "epoch": 11.452375407545413, |
| "grad_norm": 0.3890506625175476, |
| "learning_rate": 0.0004628456610366919, |
| "loss": 3.3468, |
| "step": 39350 |
| }, |
| { |
| "epoch": 11.46693060083838, |
| "grad_norm": 0.3575476408004761, |
| "learning_rate": 0.0004626709376820035, |
| "loss": 3.3367, |
| "step": 39400 |
| }, |
| { |
| "epoch": 11.481485794131347, |
| "grad_norm": 0.34998655319213867, |
| "learning_rate": 0.0004624962143273151, |
| "loss": 3.3514, |
| "step": 39450 |
| }, |
| { |
| "epoch": 11.496040987424314, |
| "grad_norm": 0.3472655415534973, |
| "learning_rate": 0.0004623214909726266, |
| "loss": 3.3563, |
| "step": 39500 |
| }, |
| { |
| "epoch": 11.51059618071728, |
| "grad_norm": 0.368045836687088, |
| "learning_rate": 0.0004621467676179382, |
| "loss": 3.3558, |
| "step": 39550 |
| }, |
| { |
| "epoch": 11.525151374010246, |
| "grad_norm": 0.36388421058654785, |
| "learning_rate": 0.0004619720442632498, |
| "loss": 3.3576, |
| "step": 39600 |
| }, |
| { |
| "epoch": 11.539706567303213, |
| "grad_norm": 0.37991783022880554, |
| "learning_rate": 0.00046179732090856145, |
| "loss": 3.3617, |
| "step": 39650 |
| }, |
| { |
| "epoch": 11.55426176059618, |
| "grad_norm": 0.32521185278892517, |
| "learning_rate": 0.000461622597553873, |
| "loss": 3.3594, |
| "step": 39700 |
| }, |
| { |
| "epoch": 11.568816953889147, |
| "grad_norm": 0.35419413447380066, |
| "learning_rate": 0.0004614478741991846, |
| "loss": 3.3553, |
| "step": 39750 |
| }, |
| { |
| "epoch": 11.583372147182114, |
| "grad_norm": 0.34850800037384033, |
| "learning_rate": 0.0004612731508444962, |
| "loss": 3.3594, |
| "step": 39800 |
| }, |
| { |
| "epoch": 11.597927340475081, |
| "grad_norm": 0.34451690316200256, |
| "learning_rate": 0.0004610984274898077, |
| "loss": 3.3574, |
| "step": 39850 |
| }, |
| { |
| "epoch": 11.612482533768048, |
| "grad_norm": 0.3622346818447113, |
| "learning_rate": 0.00046092370413511937, |
| "loss": 3.3581, |
| "step": 39900 |
| }, |
| { |
| "epoch": 11.627037727061015, |
| "grad_norm": 0.37333163619041443, |
| "learning_rate": 0.00046074898078043096, |
| "loss": 3.3624, |
| "step": 39950 |
| }, |
| { |
| "epoch": 11.641592920353983, |
| "grad_norm": 0.3494884967803955, |
| "learning_rate": 0.00046057425742574256, |
| "loss": 3.3615, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.641592920353983, |
| "eval_accuracy": 0.36974266573788755, |
| "eval_loss": 3.5566110610961914, |
| "eval_runtime": 80.6475, |
| "eval_samples_per_second": 206.466, |
| "eval_steps_per_second": 12.908, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.65614811364695, |
| "grad_norm": 0.3493458032608032, |
| "learning_rate": 0.0004603995340710541, |
| "loss": 3.3612, |
| "step": 40050 |
| }, |
| { |
| "epoch": 11.670703306939917, |
| "grad_norm": 0.3222495913505554, |
| "learning_rate": 0.0004602248107163657, |
| "loss": 3.3665, |
| "step": 40100 |
| }, |
| { |
| "epoch": 11.685258500232884, |
| "grad_norm": 0.35087063908576965, |
| "learning_rate": 0.0004600500873616773, |
| "loss": 3.3693, |
| "step": 40150 |
| }, |
| { |
| "epoch": 11.69981369352585, |
| "grad_norm": 0.3482939302921295, |
| "learning_rate": 0.00045987536400698894, |
| "loss": 3.3622, |
| "step": 40200 |
| }, |
| { |
| "epoch": 11.714368886818818, |
| "grad_norm": 0.35141560435295105, |
| "learning_rate": 0.0004597006406523005, |
| "loss": 3.3577, |
| "step": 40250 |
| }, |
| { |
| "epoch": 11.728924080111783, |
| "grad_norm": 0.3620748519897461, |
| "learning_rate": 0.00045952591729761207, |
| "loss": 3.3663, |
| "step": 40300 |
| }, |
| { |
| "epoch": 11.74347927340475, |
| "grad_norm": 0.3475601375102997, |
| "learning_rate": 0.00045935119394292367, |
| "loss": 3.3631, |
| "step": 40350 |
| }, |
| { |
| "epoch": 11.758034466697717, |
| "grad_norm": 0.35568055510520935, |
| "learning_rate": 0.00045917647058823526, |
| "loss": 3.3724, |
| "step": 40400 |
| }, |
| { |
| "epoch": 11.772589659990684, |
| "grad_norm": 0.37977227568626404, |
| "learning_rate": 0.0004590017472335468, |
| "loss": 3.3771, |
| "step": 40450 |
| }, |
| { |
| "epoch": 11.787144853283651, |
| "grad_norm": 0.35385626554489136, |
| "learning_rate": 0.00045882702387885845, |
| "loss": 3.3664, |
| "step": 40500 |
| }, |
| { |
| "epoch": 11.801700046576618, |
| "grad_norm": 0.3767772912979126, |
| "learning_rate": 0.00045865230052417004, |
| "loss": 3.3565, |
| "step": 40550 |
| }, |
| { |
| "epoch": 11.816255239869585, |
| "grad_norm": 0.35348716378211975, |
| "learning_rate": 0.00045847757716948164, |
| "loss": 3.3598, |
| "step": 40600 |
| }, |
| { |
| "epoch": 11.830810433162553, |
| "grad_norm": 0.3293631970882416, |
| "learning_rate": 0.0004583028538147932, |
| "loss": 3.3691, |
| "step": 40650 |
| }, |
| { |
| "epoch": 11.84536562645552, |
| "grad_norm": 0.3734552562236786, |
| "learning_rate": 0.00045812813046010477, |
| "loss": 3.3749, |
| "step": 40700 |
| }, |
| { |
| "epoch": 11.859920819748487, |
| "grad_norm": 0.35125109553337097, |
| "learning_rate": 0.0004579534071054164, |
| "loss": 3.3705, |
| "step": 40750 |
| }, |
| { |
| "epoch": 11.874476013041454, |
| "grad_norm": 0.3409786820411682, |
| "learning_rate": 0.000457778683750728, |
| "loss": 3.3617, |
| "step": 40800 |
| }, |
| { |
| "epoch": 11.88903120633442, |
| "grad_norm": 0.34281209111213684, |
| "learning_rate": 0.00045760396039603955, |
| "loss": 3.3731, |
| "step": 40850 |
| }, |
| { |
| "epoch": 11.903586399627388, |
| "grad_norm": 0.3612605631351471, |
| "learning_rate": 0.00045742923704135115, |
| "loss": 3.3729, |
| "step": 40900 |
| }, |
| { |
| "epoch": 11.918141592920353, |
| "grad_norm": 0.3471694886684418, |
| "learning_rate": 0.00045725451368666274, |
| "loss": 3.3748, |
| "step": 40950 |
| }, |
| { |
| "epoch": 11.93269678621332, |
| "grad_norm": 0.352755606174469, |
| "learning_rate": 0.0004570797903319743, |
| "loss": 3.3565, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.93269678621332, |
| "eval_accuracy": 0.3703009204385978, |
| "eval_loss": 3.5500266551971436, |
| "eval_runtime": 80.7914, |
| "eval_samples_per_second": 206.099, |
| "eval_steps_per_second": 12.885, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.947251979506287, |
| "grad_norm": 0.35682013630867004, |
| "learning_rate": 0.00045690506697728593, |
| "loss": 3.3781, |
| "step": 41050 |
| }, |
| { |
| "epoch": 11.961807172799254, |
| "grad_norm": 0.34535935521125793, |
| "learning_rate": 0.0004567303436225975, |
| "loss": 3.3685, |
| "step": 41100 |
| }, |
| { |
| "epoch": 11.976362366092221, |
| "grad_norm": 0.34006449580192566, |
| "learning_rate": 0.0004565556202679091, |
| "loss": 3.3782, |
| "step": 41150 |
| }, |
| { |
| "epoch": 11.990917559385188, |
| "grad_norm": 0.3530231714248657, |
| "learning_rate": 0.00045638089691322066, |
| "loss": 3.3652, |
| "step": 41200 |
| }, |
| { |
| "epoch": 12.005239869585468, |
| "grad_norm": 0.39088454842567444, |
| "learning_rate": 0.00045620617355853225, |
| "loss": 3.3385, |
| "step": 41250 |
| }, |
| { |
| "epoch": 12.019795062878435, |
| "grad_norm": 0.3561397194862366, |
| "learning_rate": 0.0004560314502038439, |
| "loss": 3.2589, |
| "step": 41300 |
| }, |
| { |
| "epoch": 12.034350256171402, |
| "grad_norm": 0.3777152895927429, |
| "learning_rate": 0.0004558567268491555, |
| "loss": 3.2616, |
| "step": 41350 |
| }, |
| { |
| "epoch": 12.04890544946437, |
| "grad_norm": 0.362715482711792, |
| "learning_rate": 0.00045568200349446704, |
| "loss": 3.2638, |
| "step": 41400 |
| }, |
| { |
| "epoch": 12.063460642757336, |
| "grad_norm": 0.3643210232257843, |
| "learning_rate": 0.00045550728013977863, |
| "loss": 3.266, |
| "step": 41450 |
| }, |
| { |
| "epoch": 12.078015836050303, |
| "grad_norm": 0.3752143085002899, |
| "learning_rate": 0.0004553325567850902, |
| "loss": 3.2854, |
| "step": 41500 |
| }, |
| { |
| "epoch": 12.09257102934327, |
| "grad_norm": 0.38578253984451294, |
| "learning_rate": 0.0004551578334304018, |
| "loss": 3.2763, |
| "step": 41550 |
| }, |
| { |
| "epoch": 12.107126222636236, |
| "grad_norm": 0.33161571621894836, |
| "learning_rate": 0.00045498311007571347, |
| "loss": 3.2889, |
| "step": 41600 |
| }, |
| { |
| "epoch": 12.121681415929203, |
| "grad_norm": 0.34136560559272766, |
| "learning_rate": 0.000454808386721025, |
| "loss": 3.2931, |
| "step": 41650 |
| }, |
| { |
| "epoch": 12.13623660922217, |
| "grad_norm": 0.34598612785339355, |
| "learning_rate": 0.0004546336633663366, |
| "loss": 3.2821, |
| "step": 41700 |
| }, |
| { |
| "epoch": 12.150791802515137, |
| "grad_norm": 0.3534063398838043, |
| "learning_rate": 0.0004544589400116482, |
| "loss": 3.2787, |
| "step": 41750 |
| }, |
| { |
| "epoch": 12.165346995808104, |
| "grad_norm": 0.392181396484375, |
| "learning_rate": 0.00045428421665695974, |
| "loss": 3.2975, |
| "step": 41800 |
| }, |
| { |
| "epoch": 12.179902189101071, |
| "grad_norm": 0.35500138998031616, |
| "learning_rate": 0.00045410949330227133, |
| "loss": 3.2947, |
| "step": 41850 |
| }, |
| { |
| "epoch": 12.194457382394038, |
| "grad_norm": 0.3367665708065033, |
| "learning_rate": 0.000453934769947583, |
| "loss": 3.2899, |
| "step": 41900 |
| }, |
| { |
| "epoch": 12.209012575687005, |
| "grad_norm": 0.3882429599761963, |
| "learning_rate": 0.0004537600465928946, |
| "loss": 3.307, |
| "step": 41950 |
| }, |
| { |
| "epoch": 12.223567768979972, |
| "grad_norm": 0.3567937910556793, |
| "learning_rate": 0.0004535853232382061, |
| "loss": 3.3014, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.223567768979972, |
| "eval_accuracy": 0.36986383638976805, |
| "eval_loss": 3.563836097717285, |
| "eval_runtime": 80.8147, |
| "eval_samples_per_second": 206.039, |
| "eval_steps_per_second": 12.881, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.23812296227294, |
| "grad_norm": 0.35219383239746094, |
| "learning_rate": 0.0004534105998835177, |
| "loss": 3.3094, |
| "step": 42050 |
| }, |
| { |
| "epoch": 12.252678155565906, |
| "grad_norm": 0.3962681293487549, |
| "learning_rate": 0.0004532358765288293, |
| "loss": 3.3131, |
| "step": 42100 |
| }, |
| { |
| "epoch": 12.267233348858873, |
| "grad_norm": 0.32676026225090027, |
| "learning_rate": 0.00045306115317414095, |
| "loss": 3.3012, |
| "step": 42150 |
| }, |
| { |
| "epoch": 12.28178854215184, |
| "grad_norm": 0.3838665187358856, |
| "learning_rate": 0.0004528864298194525, |
| "loss": 3.3101, |
| "step": 42200 |
| }, |
| { |
| "epoch": 12.296343735444808, |
| "grad_norm": 0.34425997734069824, |
| "learning_rate": 0.0004527117064647641, |
| "loss": 3.3308, |
| "step": 42250 |
| }, |
| { |
| "epoch": 12.310898928737773, |
| "grad_norm": 0.33010685443878174, |
| "learning_rate": 0.0004525369831100757, |
| "loss": 3.3134, |
| "step": 42300 |
| }, |
| { |
| "epoch": 12.32545412203074, |
| "grad_norm": 0.37003856897354126, |
| "learning_rate": 0.0004523622597553872, |
| "loss": 3.3197, |
| "step": 42350 |
| }, |
| { |
| "epoch": 12.340009315323707, |
| "grad_norm": 0.37403255701065063, |
| "learning_rate": 0.0004521875364006988, |
| "loss": 3.3272, |
| "step": 42400 |
| }, |
| { |
| "epoch": 12.354564508616674, |
| "grad_norm": 0.369806170463562, |
| "learning_rate": 0.00045201281304601046, |
| "loss": 3.3366, |
| "step": 42450 |
| }, |
| { |
| "epoch": 12.369119701909641, |
| "grad_norm": 0.3683629333972931, |
| "learning_rate": 0.00045183808969132206, |
| "loss": 3.3306, |
| "step": 42500 |
| }, |
| { |
| "epoch": 12.383674895202608, |
| "grad_norm": 0.34494805335998535, |
| "learning_rate": 0.00045166336633663365, |
| "loss": 3.3171, |
| "step": 42550 |
| }, |
| { |
| "epoch": 12.398230088495575, |
| "grad_norm": 0.3581896722316742, |
| "learning_rate": 0.0004514886429819452, |
| "loss": 3.3247, |
| "step": 42600 |
| }, |
| { |
| "epoch": 12.412785281788542, |
| "grad_norm": 0.3682483732700348, |
| "learning_rate": 0.0004513139196272568, |
| "loss": 3.3321, |
| "step": 42650 |
| }, |
| { |
| "epoch": 12.42734047508151, |
| "grad_norm": 0.3279290795326233, |
| "learning_rate": 0.00045113919627256843, |
| "loss": 3.3277, |
| "step": 42700 |
| }, |
| { |
| "epoch": 12.441895668374476, |
| "grad_norm": 0.36215299367904663, |
| "learning_rate": 0.00045096447291788003, |
| "loss": 3.3214, |
| "step": 42750 |
| }, |
| { |
| "epoch": 12.456450861667443, |
| "grad_norm": 0.3544810116291046, |
| "learning_rate": 0.00045078974956319157, |
| "loss": 3.3281, |
| "step": 42800 |
| }, |
| { |
| "epoch": 12.47100605496041, |
| "grad_norm": 0.34559234976768494, |
| "learning_rate": 0.00045061502620850316, |
| "loss": 3.3289, |
| "step": 42850 |
| }, |
| { |
| "epoch": 12.485561248253378, |
| "grad_norm": 0.35635361075401306, |
| "learning_rate": 0.00045044030285381476, |
| "loss": 3.3309, |
| "step": 42900 |
| }, |
| { |
| "epoch": 12.500116441546343, |
| "grad_norm": 0.3953072726726532, |
| "learning_rate": 0.0004502655794991263, |
| "loss": 3.3472, |
| "step": 42950 |
| }, |
| { |
| "epoch": 12.51467163483931, |
| "grad_norm": 0.34090185165405273, |
| "learning_rate": 0.00045009085614443795, |
| "loss": 3.329, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.51467163483931, |
| "eval_accuracy": 0.3702339298745126, |
| "eval_loss": 3.5574758052825928, |
| "eval_runtime": 80.6612, |
| "eval_samples_per_second": 206.431, |
| "eval_steps_per_second": 12.906, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.529226828132277, |
| "grad_norm": 0.3187888264656067, |
| "learning_rate": 0.00044991613278974954, |
| "loss": 3.3215, |
| "step": 43050 |
| }, |
| { |
| "epoch": 12.543782021425244, |
| "grad_norm": 0.3778735101222992, |
| "learning_rate": 0.00044974140943506113, |
| "loss": 3.3322, |
| "step": 43100 |
| }, |
| { |
| "epoch": 12.558337214718211, |
| "grad_norm": 0.367658406496048, |
| "learning_rate": 0.0004495666860803727, |
| "loss": 3.3335, |
| "step": 43150 |
| }, |
| { |
| "epoch": 12.572892408011178, |
| "grad_norm": 0.3635483682155609, |
| "learning_rate": 0.00044939196272568427, |
| "loss": 3.3409, |
| "step": 43200 |
| }, |
| { |
| "epoch": 12.587447601304145, |
| "grad_norm": 0.38421738147735596, |
| "learning_rate": 0.00044921723937099586, |
| "loss": 3.338, |
| "step": 43250 |
| }, |
| { |
| "epoch": 12.602002794597112, |
| "grad_norm": 0.37833017110824585, |
| "learning_rate": 0.0004490425160163075, |
| "loss": 3.3363, |
| "step": 43300 |
| }, |
| { |
| "epoch": 12.61655798789008, |
| "grad_norm": 0.38908010721206665, |
| "learning_rate": 0.00044886779266161905, |
| "loss": 3.3423, |
| "step": 43350 |
| }, |
| { |
| "epoch": 12.631113181183046, |
| "grad_norm": 0.36583903431892395, |
| "learning_rate": 0.00044869306930693065, |
| "loss": 3.3368, |
| "step": 43400 |
| }, |
| { |
| "epoch": 12.645668374476013, |
| "grad_norm": 0.34624579548835754, |
| "learning_rate": 0.00044851834595224224, |
| "loss": 3.3443, |
| "step": 43450 |
| }, |
| { |
| "epoch": 12.66022356776898, |
| "grad_norm": 0.34427469968795776, |
| "learning_rate": 0.00044834362259755383, |
| "loss": 3.3473, |
| "step": 43500 |
| }, |
| { |
| "epoch": 12.674778761061948, |
| "grad_norm": 0.35878947377204895, |
| "learning_rate": 0.00044816889924286543, |
| "loss": 3.3455, |
| "step": 43550 |
| }, |
| { |
| "epoch": 12.689333954354915, |
| "grad_norm": 0.3516271412372589, |
| "learning_rate": 0.000447994175888177, |
| "loss": 3.3553, |
| "step": 43600 |
| }, |
| { |
| "epoch": 12.703889147647882, |
| "grad_norm": 0.3550575077533722, |
| "learning_rate": 0.0004478194525334886, |
| "loss": 3.3463, |
| "step": 43650 |
| }, |
| { |
| "epoch": 12.718444340940847, |
| "grad_norm": 0.3580949306488037, |
| "learning_rate": 0.0004476447291788002, |
| "loss": 3.3497, |
| "step": 43700 |
| }, |
| { |
| "epoch": 12.732999534233814, |
| "grad_norm": 0.3522760272026062, |
| "learning_rate": 0.00044747000582411175, |
| "loss": 3.3368, |
| "step": 43750 |
| }, |
| { |
| "epoch": 12.747554727526781, |
| "grad_norm": 0.3506473898887634, |
| "learning_rate": 0.00044729528246942335, |
| "loss": 3.3566, |
| "step": 43800 |
| }, |
| { |
| "epoch": 12.762109920819748, |
| "grad_norm": 0.358919233083725, |
| "learning_rate": 0.000447120559114735, |
| "loss": 3.3582, |
| "step": 43850 |
| }, |
| { |
| "epoch": 12.776665114112715, |
| "grad_norm": 0.3589182198047638, |
| "learning_rate": 0.0004469458357600466, |
| "loss": 3.3535, |
| "step": 43900 |
| }, |
| { |
| "epoch": 12.791220307405682, |
| "grad_norm": 0.38385888934135437, |
| "learning_rate": 0.00044677111240535813, |
| "loss": 3.3525, |
| "step": 43950 |
| }, |
| { |
| "epoch": 12.80577550069865, |
| "grad_norm": 0.3656897246837616, |
| "learning_rate": 0.0004465963890506697, |
| "loss": 3.3453, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.80577550069865, |
| "eval_accuracy": 0.37094626287261884, |
| "eval_loss": 3.5471415519714355, |
| "eval_runtime": 80.8526, |
| "eval_samples_per_second": 205.943, |
| "eval_steps_per_second": 12.875, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.820330693991616, |
| "grad_norm": 0.3611510396003723, |
| "learning_rate": 0.0004464216656959813, |
| "loss": 3.357, |
| "step": 44050 |
| }, |
| { |
| "epoch": 12.834885887284583, |
| "grad_norm": 0.37905770540237427, |
| "learning_rate": 0.00044624694234129297, |
| "loss": 3.35, |
| "step": 44100 |
| }, |
| { |
| "epoch": 12.84944108057755, |
| "grad_norm": 0.3715709149837494, |
| "learning_rate": 0.0004460722189866045, |
| "loss": 3.3502, |
| "step": 44150 |
| }, |
| { |
| "epoch": 12.863996273870518, |
| "grad_norm": 0.36706671118736267, |
| "learning_rate": 0.0004458974956319161, |
| "loss": 3.3584, |
| "step": 44200 |
| }, |
| { |
| "epoch": 12.878551467163485, |
| "grad_norm": 0.35801252722740173, |
| "learning_rate": 0.0004457227722772277, |
| "loss": 3.3502, |
| "step": 44250 |
| }, |
| { |
| "epoch": 12.89310666045645, |
| "grad_norm": 0.35899823904037476, |
| "learning_rate": 0.00044554804892253923, |
| "loss": 3.3582, |
| "step": 44300 |
| }, |
| { |
| "epoch": 12.907661853749417, |
| "grad_norm": 0.38531598448753357, |
| "learning_rate": 0.00044537332556785083, |
| "loss": 3.3601, |
| "step": 44350 |
| }, |
| { |
| "epoch": 12.922217047042384, |
| "grad_norm": 0.3540276288986206, |
| "learning_rate": 0.0004451986022131625, |
| "loss": 3.3592, |
| "step": 44400 |
| }, |
| { |
| "epoch": 12.936772240335351, |
| "grad_norm": 0.3795982301235199, |
| "learning_rate": 0.00044502387885847407, |
| "loss": 3.3578, |
| "step": 44450 |
| }, |
| { |
| "epoch": 12.951327433628318, |
| "grad_norm": 0.3632720708847046, |
| "learning_rate": 0.0004448491555037856, |
| "loss": 3.3561, |
| "step": 44500 |
| }, |
| { |
| "epoch": 12.965882626921285, |
| "grad_norm": 0.3583764135837555, |
| "learning_rate": 0.0004446744321490972, |
| "loss": 3.3536, |
| "step": 44550 |
| }, |
| { |
| "epoch": 12.980437820214252, |
| "grad_norm": 0.3445485830307007, |
| "learning_rate": 0.0004444997087944088, |
| "loss": 3.3491, |
| "step": 44600 |
| }, |
| { |
| "epoch": 12.99499301350722, |
| "grad_norm": 0.330710768699646, |
| "learning_rate": 0.0004443249854397204, |
| "loss": 3.3575, |
| "step": 44650 |
| }, |
| { |
| "epoch": 13.009315323707499, |
| "grad_norm": 0.3801380693912506, |
| "learning_rate": 0.000444150262085032, |
| "loss": 3.278, |
| "step": 44700 |
| }, |
| { |
| "epoch": 13.023870517000466, |
| "grad_norm": 0.3726879358291626, |
| "learning_rate": 0.0004439755387303436, |
| "loss": 3.2508, |
| "step": 44750 |
| }, |
| { |
| "epoch": 13.038425710293433, |
| "grad_norm": 0.3740154802799225, |
| "learning_rate": 0.0004438008153756552, |
| "loss": 3.25, |
| "step": 44800 |
| }, |
| { |
| "epoch": 13.0529809035864, |
| "grad_norm": 0.3695807456970215, |
| "learning_rate": 0.00044362609202096677, |
| "loss": 3.2545, |
| "step": 44850 |
| }, |
| { |
| "epoch": 13.067536096879367, |
| "grad_norm": 0.3376754820346832, |
| "learning_rate": 0.0004434513686662783, |
| "loss": 3.2625, |
| "step": 44900 |
| }, |
| { |
| "epoch": 13.082091290172334, |
| "grad_norm": 0.347817599773407, |
| "learning_rate": 0.00044327664531158996, |
| "loss": 3.2663, |
| "step": 44950 |
| }, |
| { |
| "epoch": 13.0966464834653, |
| "grad_norm": 0.3624245822429657, |
| "learning_rate": 0.00044310192195690155, |
| "loss": 3.2699, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.0966464834653, |
| "eval_accuracy": 0.36998724006045136, |
| "eval_loss": 3.5660512447357178, |
| "eval_runtime": 80.9262, |
| "eval_samples_per_second": 205.755, |
| "eval_steps_per_second": 12.864, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.111201676758267, |
| "grad_norm": 0.38243043422698975, |
| "learning_rate": 0.00044292719860221315, |
| "loss": 3.2791, |
| "step": 45050 |
| }, |
| { |
| "epoch": 13.125756870051234, |
| "grad_norm": 0.3722783029079437, |
| "learning_rate": 0.0004427524752475247, |
| "loss": 3.2538, |
| "step": 45100 |
| }, |
| { |
| "epoch": 13.1403120633442, |
| "grad_norm": 0.36125919222831726, |
| "learning_rate": 0.0004425777518928363, |
| "loss": 3.2816, |
| "step": 45150 |
| }, |
| { |
| "epoch": 13.154867256637168, |
| "grad_norm": 0.340777188539505, |
| "learning_rate": 0.0004424030285381479, |
| "loss": 3.2741, |
| "step": 45200 |
| }, |
| { |
| "epoch": 13.169422449930135, |
| "grad_norm": 0.4105551242828369, |
| "learning_rate": 0.0004422283051834595, |
| "loss": 3.2782, |
| "step": 45250 |
| }, |
| { |
| "epoch": 13.183977643223102, |
| "grad_norm": 0.3820706307888031, |
| "learning_rate": 0.00044205358182877107, |
| "loss": 3.2944, |
| "step": 45300 |
| }, |
| { |
| "epoch": 13.198532836516069, |
| "grad_norm": 0.3701717257499695, |
| "learning_rate": 0.00044187885847408266, |
| "loss": 3.2891, |
| "step": 45350 |
| }, |
| { |
| "epoch": 13.213088029809036, |
| "grad_norm": 0.3480601906776428, |
| "learning_rate": 0.00044170413511939425, |
| "loss": 3.2821, |
| "step": 45400 |
| }, |
| { |
| "epoch": 13.227643223102003, |
| "grad_norm": 0.4469354450702667, |
| "learning_rate": 0.0004415294117647058, |
| "loss": 3.2893, |
| "step": 45450 |
| }, |
| { |
| "epoch": 13.24219841639497, |
| "grad_norm": 0.36050304770469666, |
| "learning_rate": 0.00044135468841001744, |
| "loss": 3.3095, |
| "step": 45500 |
| }, |
| { |
| "epoch": 13.256753609687937, |
| "grad_norm": 0.3583950400352478, |
| "learning_rate": 0.00044117996505532904, |
| "loss": 3.2937, |
| "step": 45550 |
| }, |
| { |
| "epoch": 13.271308802980904, |
| "grad_norm": 0.37139999866485596, |
| "learning_rate": 0.00044100524170064063, |
| "loss": 3.2876, |
| "step": 45600 |
| }, |
| { |
| "epoch": 13.285863996273871, |
| "grad_norm": 0.3572840392589569, |
| "learning_rate": 0.0004408305183459522, |
| "loss": 3.2836, |
| "step": 45650 |
| }, |
| { |
| "epoch": 13.300419189566837, |
| "grad_norm": 0.35243862867355347, |
| "learning_rate": 0.00044065579499126377, |
| "loss": 3.294, |
| "step": 45700 |
| }, |
| { |
| "epoch": 13.314974382859804, |
| "grad_norm": 0.35934120416641235, |
| "learning_rate": 0.00044048107163657536, |
| "loss": 3.3044, |
| "step": 45750 |
| }, |
| { |
| "epoch": 13.32952957615277, |
| "grad_norm": 0.3521519601345062, |
| "learning_rate": 0.000440306348281887, |
| "loss": 3.2956, |
| "step": 45800 |
| }, |
| { |
| "epoch": 13.344084769445738, |
| "grad_norm": 0.3356475830078125, |
| "learning_rate": 0.0004401316249271986, |
| "loss": 3.2952, |
| "step": 45850 |
| }, |
| { |
| "epoch": 13.358639962738705, |
| "grad_norm": 0.3391338586807251, |
| "learning_rate": 0.00043995690157251014, |
| "loss": 3.2991, |
| "step": 45900 |
| }, |
| { |
| "epoch": 13.373195156031672, |
| "grad_norm": 0.35844820737838745, |
| "learning_rate": 0.00043978217821782174, |
| "loss": 3.2985, |
| "step": 45950 |
| }, |
| { |
| "epoch": 13.387750349324639, |
| "grad_norm": 0.3737053871154785, |
| "learning_rate": 0.00043960745486313333, |
| "loss": 3.3058, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.387750349324639, |
| "eval_accuracy": 0.37062071223662574, |
| "eval_loss": 3.5592100620269775, |
| "eval_runtime": 80.7305, |
| "eval_samples_per_second": 206.254, |
| "eval_steps_per_second": 12.895, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.402305542617606, |
| "grad_norm": 0.33657747507095337, |
| "learning_rate": 0.00043943273150844487, |
| "loss": 3.3109, |
| "step": 46050 |
| }, |
| { |
| "epoch": 13.416860735910573, |
| "grad_norm": 0.3510534167289734, |
| "learning_rate": 0.0004392580081537565, |
| "loss": 3.3137, |
| "step": 46100 |
| }, |
| { |
| "epoch": 13.43141592920354, |
| "grad_norm": 0.38979029655456543, |
| "learning_rate": 0.0004390832847990681, |
| "loss": 3.3189, |
| "step": 46150 |
| }, |
| { |
| "epoch": 13.445971122496507, |
| "grad_norm": 0.36385878920555115, |
| "learning_rate": 0.0004389085614443797, |
| "loss": 3.3133, |
| "step": 46200 |
| }, |
| { |
| "epoch": 13.460526315789474, |
| "grad_norm": 0.37798404693603516, |
| "learning_rate": 0.00043873383808969125, |
| "loss": 3.309, |
| "step": 46250 |
| }, |
| { |
| "epoch": 13.475081509082441, |
| "grad_norm": 0.3567357063293457, |
| "learning_rate": 0.00043855911473500284, |
| "loss": 3.319, |
| "step": 46300 |
| }, |
| { |
| "epoch": 13.489636702375407, |
| "grad_norm": 0.3558754026889801, |
| "learning_rate": 0.0004383843913803145, |
| "loss": 3.3097, |
| "step": 46350 |
| }, |
| { |
| "epoch": 13.504191895668374, |
| "grad_norm": 0.3527604937553406, |
| "learning_rate": 0.0004382096680256261, |
| "loss": 3.3149, |
| "step": 46400 |
| }, |
| { |
| "epoch": 13.51874708896134, |
| "grad_norm": 0.35432425141334534, |
| "learning_rate": 0.0004380349446709376, |
| "loss": 3.3199, |
| "step": 46450 |
| }, |
| { |
| "epoch": 13.533302282254308, |
| "grad_norm": 0.3399069607257843, |
| "learning_rate": 0.0004378602213162492, |
| "loss": 3.329, |
| "step": 46500 |
| }, |
| { |
| "epoch": 13.547857475547275, |
| "grad_norm": 0.37489521503448486, |
| "learning_rate": 0.0004376854979615608, |
| "loss": 3.3218, |
| "step": 46550 |
| }, |
| { |
| "epoch": 13.562412668840242, |
| "grad_norm": 0.3411555886268616, |
| "learning_rate": 0.0004375107746068724, |
| "loss": 3.3185, |
| "step": 46600 |
| }, |
| { |
| "epoch": 13.576967862133209, |
| "grad_norm": 0.3735775947570801, |
| "learning_rate": 0.000437336051252184, |
| "loss": 3.3289, |
| "step": 46650 |
| }, |
| { |
| "epoch": 13.591523055426176, |
| "grad_norm": 0.3777002692222595, |
| "learning_rate": 0.0004371613278974956, |
| "loss": 3.322, |
| "step": 46700 |
| }, |
| { |
| "epoch": 13.606078248719143, |
| "grad_norm": 0.3491983711719513, |
| "learning_rate": 0.0004369866045428072, |
| "loss": 3.3181, |
| "step": 46750 |
| }, |
| { |
| "epoch": 13.62063344201211, |
| "grad_norm": 0.38500961661338806, |
| "learning_rate": 0.0004368118811881188, |
| "loss": 3.3203, |
| "step": 46800 |
| }, |
| { |
| "epoch": 13.635188635305077, |
| "grad_norm": 0.37621399760246277, |
| "learning_rate": 0.0004366371578334303, |
| "loss": 3.3303, |
| "step": 46850 |
| }, |
| { |
| "epoch": 13.649743828598044, |
| "grad_norm": 0.3332267999649048, |
| "learning_rate": 0.000436462434478742, |
| "loss": 3.3256, |
| "step": 46900 |
| }, |
| { |
| "epoch": 13.664299021891011, |
| "grad_norm": 0.38459545373916626, |
| "learning_rate": 0.00043628771112405357, |
| "loss": 3.3146, |
| "step": 46950 |
| }, |
| { |
| "epoch": 13.678854215183978, |
| "grad_norm": 0.36546584963798523, |
| "learning_rate": 0.00043611298776936516, |
| "loss": 3.319, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.678854215183978, |
| "eval_accuracy": 0.3710756604358782, |
| "eval_loss": 3.546867847442627, |
| "eval_runtime": 80.8041, |
| "eval_samples_per_second": 206.066, |
| "eval_steps_per_second": 12.883, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.693409408476944, |
| "grad_norm": 0.35775843262672424, |
| "learning_rate": 0.0004359382644146767, |
| "loss": 3.3276, |
| "step": 47050 |
| }, |
| { |
| "epoch": 13.70796460176991, |
| "grad_norm": 0.3480321764945984, |
| "learning_rate": 0.0004357635410599883, |
| "loss": 3.3282, |
| "step": 47100 |
| }, |
| { |
| "epoch": 13.722519795062878, |
| "grad_norm": 0.3780113458633423, |
| "learning_rate": 0.0004355888177052999, |
| "loss": 3.3293, |
| "step": 47150 |
| }, |
| { |
| "epoch": 13.737074988355845, |
| "grad_norm": 0.3673963248729706, |
| "learning_rate": 0.00043541409435061154, |
| "loss": 3.3343, |
| "step": 47200 |
| }, |
| { |
| "epoch": 13.751630181648812, |
| "grad_norm": 0.35989508032798767, |
| "learning_rate": 0.0004352393709959231, |
| "loss": 3.3442, |
| "step": 47250 |
| }, |
| { |
| "epoch": 13.766185374941779, |
| "grad_norm": 0.376995325088501, |
| "learning_rate": 0.0004350646476412347, |
| "loss": 3.3316, |
| "step": 47300 |
| }, |
| { |
| "epoch": 13.780740568234746, |
| "grad_norm": 0.3618956506252289, |
| "learning_rate": 0.00043488992428654627, |
| "loss": 3.3355, |
| "step": 47350 |
| }, |
| { |
| "epoch": 13.795295761527713, |
| "grad_norm": 0.3722332715988159, |
| "learning_rate": 0.0004347152009318578, |
| "loss": 3.34, |
| "step": 47400 |
| }, |
| { |
| "epoch": 13.80985095482068, |
| "grad_norm": 0.38128942251205444, |
| "learning_rate": 0.00043454047757716946, |
| "loss": 3.336, |
| "step": 47450 |
| }, |
| { |
| "epoch": 13.824406148113647, |
| "grad_norm": 0.3805423676967621, |
| "learning_rate": 0.00043436575422248105, |
| "loss": 3.3396, |
| "step": 47500 |
| }, |
| { |
| "epoch": 13.838961341406614, |
| "grad_norm": 0.36344876885414124, |
| "learning_rate": 0.00043419103086779265, |
| "loss": 3.3273, |
| "step": 47550 |
| }, |
| { |
| "epoch": 13.853516534699581, |
| "grad_norm": 0.37594860792160034, |
| "learning_rate": 0.0004340163075131042, |
| "loss": 3.3397, |
| "step": 47600 |
| }, |
| { |
| "epoch": 13.868071727992549, |
| "grad_norm": 0.3371012210845947, |
| "learning_rate": 0.0004338415841584158, |
| "loss": 3.3346, |
| "step": 47650 |
| }, |
| { |
| "epoch": 13.882626921285514, |
| "grad_norm": 0.3437005877494812, |
| "learning_rate": 0.0004336668608037274, |
| "loss": 3.3453, |
| "step": 47700 |
| }, |
| { |
| "epoch": 13.89718211457848, |
| "grad_norm": 0.3510890007019043, |
| "learning_rate": 0.000433492137449039, |
| "loss": 3.3425, |
| "step": 47750 |
| }, |
| { |
| "epoch": 13.911737307871448, |
| "grad_norm": 0.3832473158836365, |
| "learning_rate": 0.00043331741409435056, |
| "loss": 3.3409, |
| "step": 47800 |
| }, |
| { |
| "epoch": 13.926292501164415, |
| "grad_norm": 0.37826868891716003, |
| "learning_rate": 0.00043314269073966216, |
| "loss": 3.3397, |
| "step": 47850 |
| }, |
| { |
| "epoch": 13.940847694457382, |
| "grad_norm": 0.3409828543663025, |
| "learning_rate": 0.00043296796738497375, |
| "loss": 3.3361, |
| "step": 47900 |
| }, |
| { |
| "epoch": 13.955402887750349, |
| "grad_norm": 0.337909460067749, |
| "learning_rate": 0.00043279324403028535, |
| "loss": 3.3434, |
| "step": 47950 |
| }, |
| { |
| "epoch": 13.969958081043316, |
| "grad_norm": 0.35256093740463257, |
| "learning_rate": 0.0004326185206755969, |
| "loss": 3.3464, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.969958081043316, |
| "eval_accuracy": 0.371596188871551, |
| "eval_loss": 3.541656732559204, |
| "eval_runtime": 80.8106, |
| "eval_samples_per_second": 206.05, |
| "eval_steps_per_second": 12.882, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.984513274336283, |
| "grad_norm": 0.3490828275680542, |
| "learning_rate": 0.00043244379732090854, |
| "loss": 3.3466, |
| "step": 48050 |
| }, |
| { |
| "epoch": 13.99906846762925, |
| "grad_norm": 0.3917688727378845, |
| "learning_rate": 0.00043226907396622013, |
| "loss": 3.3277, |
| "step": 48100 |
| }, |
| { |
| "epoch": 14.01339077782953, |
| "grad_norm": 0.36740434169769287, |
| "learning_rate": 0.0004320943506115317, |
| "loss": 3.2402, |
| "step": 48150 |
| }, |
| { |
| "epoch": 14.027945971122497, |
| "grad_norm": 0.33615073561668396, |
| "learning_rate": 0.00043191962725684326, |
| "loss": 3.2389, |
| "step": 48200 |
| }, |
| { |
| "epoch": 14.042501164415464, |
| "grad_norm": 0.3751748204231262, |
| "learning_rate": 0.00043174490390215486, |
| "loss": 3.2496, |
| "step": 48250 |
| }, |
| { |
| "epoch": 14.057056357708431, |
| "grad_norm": 0.3675802946090698, |
| "learning_rate": 0.0004315701805474665, |
| "loss": 3.2452, |
| "step": 48300 |
| }, |
| { |
| "epoch": 14.071611551001398, |
| "grad_norm": 0.3610966205596924, |
| "learning_rate": 0.0004313954571927781, |
| "loss": 3.2454, |
| "step": 48350 |
| }, |
| { |
| "epoch": 14.086166744294363, |
| "grad_norm": 0.3592851161956787, |
| "learning_rate": 0.00043122073383808964, |
| "loss": 3.2432, |
| "step": 48400 |
| }, |
| { |
| "epoch": 14.10072193758733, |
| "grad_norm": 0.36754968762397766, |
| "learning_rate": 0.00043104601048340124, |
| "loss": 3.2598, |
| "step": 48450 |
| }, |
| { |
| "epoch": 14.115277130880298, |
| "grad_norm": 0.36106571555137634, |
| "learning_rate": 0.00043087128712871283, |
| "loss": 3.2641, |
| "step": 48500 |
| }, |
| { |
| "epoch": 14.129832324173265, |
| "grad_norm": 0.3604142963886261, |
| "learning_rate": 0.00043069656377402437, |
| "loss": 3.2532, |
| "step": 48550 |
| }, |
| { |
| "epoch": 14.144387517466232, |
| "grad_norm": 0.4012484848499298, |
| "learning_rate": 0.000430521840419336, |
| "loss": 3.2569, |
| "step": 48600 |
| }, |
| { |
| "epoch": 14.158942710759199, |
| "grad_norm": 0.38403233885765076, |
| "learning_rate": 0.0004303471170646476, |
| "loss": 3.275, |
| "step": 48650 |
| }, |
| { |
| "epoch": 14.173497904052166, |
| "grad_norm": 0.3913150429725647, |
| "learning_rate": 0.0004301723937099592, |
| "loss": 3.2615, |
| "step": 48700 |
| }, |
| { |
| "epoch": 14.188053097345133, |
| "grad_norm": 0.3920729160308838, |
| "learning_rate": 0.00042999767035527075, |
| "loss": 3.2695, |
| "step": 48750 |
| }, |
| { |
| "epoch": 14.2026082906381, |
| "grad_norm": 0.34493526816368103, |
| "learning_rate": 0.00042982294700058234, |
| "loss": 3.2674, |
| "step": 48800 |
| }, |
| { |
| "epoch": 14.217163483931067, |
| "grad_norm": 0.3325290083885193, |
| "learning_rate": 0.000429648223645894, |
| "loss": 3.2642, |
| "step": 48850 |
| }, |
| { |
| "epoch": 14.231718677224034, |
| "grad_norm": 0.3610672950744629, |
| "learning_rate": 0.0004294735002912056, |
| "loss": 3.2598, |
| "step": 48900 |
| }, |
| { |
| "epoch": 14.246273870517001, |
| "grad_norm": 0.3817266523838043, |
| "learning_rate": 0.0004292987769365172, |
| "loss": 3.2855, |
| "step": 48950 |
| }, |
| { |
| "epoch": 14.260829063809968, |
| "grad_norm": 0.34767383337020874, |
| "learning_rate": 0.0004291240535818287, |
| "loss": 3.2773, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.260829063809968, |
| "eval_accuracy": 0.37084542444457474, |
| "eval_loss": 3.558955192565918, |
| "eval_runtime": 80.7458, |
| "eval_samples_per_second": 206.215, |
| "eval_steps_per_second": 12.892, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.275384257102935, |
| "grad_norm": 0.37547221779823303, |
| "learning_rate": 0.0004289493302271403, |
| "loss": 3.2872, |
| "step": 49050 |
| }, |
| { |
| "epoch": 14.2899394503959, |
| "grad_norm": 0.41593822836875916, |
| "learning_rate": 0.0004287746068724519, |
| "loss": 3.2714, |
| "step": 49100 |
| }, |
| { |
| "epoch": 14.304494643688868, |
| "grad_norm": 0.34178221225738525, |
| "learning_rate": 0.00042859988351776356, |
| "loss": 3.2797, |
| "step": 49150 |
| }, |
| { |
| "epoch": 14.319049836981835, |
| "grad_norm": 0.3600603938102722, |
| "learning_rate": 0.0004284251601630751, |
| "loss": 3.2684, |
| "step": 49200 |
| }, |
| { |
| "epoch": 14.333605030274802, |
| "grad_norm": 0.367582768201828, |
| "learning_rate": 0.0004282504368083867, |
| "loss": 3.2855, |
| "step": 49250 |
| }, |
| { |
| "epoch": 14.348160223567769, |
| "grad_norm": 0.3863992393016815, |
| "learning_rate": 0.0004280757134536983, |
| "loss": 3.2904, |
| "step": 49300 |
| }, |
| { |
| "epoch": 14.362715416860736, |
| "grad_norm": 0.37199097871780396, |
| "learning_rate": 0.0004279009900990098, |
| "loss": 3.2868, |
| "step": 49350 |
| }, |
| { |
| "epoch": 14.377270610153703, |
| "grad_norm": 0.3646756708621979, |
| "learning_rate": 0.0004277262667443214, |
| "loss": 3.2887, |
| "step": 49400 |
| }, |
| { |
| "epoch": 14.39182580344667, |
| "grad_norm": 0.3616171181201935, |
| "learning_rate": 0.00042755154338963307, |
| "loss": 3.2916, |
| "step": 49450 |
| }, |
| { |
| "epoch": 14.406380996739637, |
| "grad_norm": 0.3641643524169922, |
| "learning_rate": 0.00042737682003494466, |
| "loss": 3.2886, |
| "step": 49500 |
| }, |
| { |
| "epoch": 14.420936190032604, |
| "grad_norm": 0.36623871326446533, |
| "learning_rate": 0.0004272020966802562, |
| "loss": 3.3027, |
| "step": 49550 |
| }, |
| { |
| "epoch": 14.435491383325571, |
| "grad_norm": 0.3587304353713989, |
| "learning_rate": 0.0004270273733255678, |
| "loss": 3.2889, |
| "step": 49600 |
| }, |
| { |
| "epoch": 14.450046576618538, |
| "grad_norm": 0.37353044748306274, |
| "learning_rate": 0.0004268526499708794, |
| "loss": 3.295, |
| "step": 49650 |
| }, |
| { |
| "epoch": 14.464601769911505, |
| "grad_norm": 0.376542866230011, |
| "learning_rate": 0.00042667792661619104, |
| "loss": 3.2916, |
| "step": 49700 |
| }, |
| { |
| "epoch": 14.47915696320447, |
| "grad_norm": 0.3699222803115845, |
| "learning_rate": 0.0004265032032615026, |
| "loss": 3.293, |
| "step": 49750 |
| }, |
| { |
| "epoch": 14.493712156497438, |
| "grad_norm": 0.3968975841999054, |
| "learning_rate": 0.0004263284799068142, |
| "loss": 3.3107, |
| "step": 49800 |
| }, |
| { |
| "epoch": 14.508267349790405, |
| "grad_norm": 0.37532177567481995, |
| "learning_rate": 0.00042615375655212577, |
| "loss": 3.3022, |
| "step": 49850 |
| }, |
| { |
| "epoch": 14.522822543083372, |
| "grad_norm": 0.38349661231040955, |
| "learning_rate": 0.00042597903319743736, |
| "loss": 3.3049, |
| "step": 49900 |
| }, |
| { |
| "epoch": 14.537377736376339, |
| "grad_norm": 0.37298819422721863, |
| "learning_rate": 0.0004258043098427489, |
| "loss": 3.2996, |
| "step": 49950 |
| }, |
| { |
| "epoch": 14.551932929669306, |
| "grad_norm": 0.3718286454677582, |
| "learning_rate": 0.00042562958648806055, |
| "loss": 3.3011, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.551932929669306, |
| "eval_accuracy": 0.3713487938936573, |
| "eval_loss": 3.55180287361145, |
| "eval_runtime": 80.7535, |
| "eval_samples_per_second": 206.195, |
| "eval_steps_per_second": 12.891, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.566488122962273, |
| "grad_norm": 0.3525947630405426, |
| "learning_rate": 0.00042545486313337214, |
| "loss": 3.2978, |
| "step": 50050 |
| }, |
| { |
| "epoch": 14.58104331625524, |
| "grad_norm": 0.3560762107372284, |
| "learning_rate": 0.00042528013977868374, |
| "loss": 3.3095, |
| "step": 50100 |
| }, |
| { |
| "epoch": 14.595598509548207, |
| "grad_norm": 0.3790857195854187, |
| "learning_rate": 0.0004251054164239953, |
| "loss": 3.3054, |
| "step": 50150 |
| }, |
| { |
| "epoch": 14.610153702841174, |
| "grad_norm": 0.3617846965789795, |
| "learning_rate": 0.0004249306930693069, |
| "loss": 3.3013, |
| "step": 50200 |
| }, |
| { |
| "epoch": 14.624708896134141, |
| "grad_norm": 0.35423752665519714, |
| "learning_rate": 0.0004247559697146185, |
| "loss": 3.3116, |
| "step": 50250 |
| }, |
| { |
| "epoch": 14.639264089427108, |
| "grad_norm": 0.346599817276001, |
| "learning_rate": 0.0004245812463599301, |
| "loss": 3.3017, |
| "step": 50300 |
| }, |
| { |
| "epoch": 14.653819282720075, |
| "grad_norm": 0.3855575621128082, |
| "learning_rate": 0.00042440652300524166, |
| "loss": 3.3096, |
| "step": 50350 |
| }, |
| { |
| "epoch": 14.668374476013042, |
| "grad_norm": 0.38708043098449707, |
| "learning_rate": 0.00042423179965055325, |
| "loss": 3.3136, |
| "step": 50400 |
| }, |
| { |
| "epoch": 14.682929669306008, |
| "grad_norm": 0.3753894865512848, |
| "learning_rate": 0.00042405707629586484, |
| "loss": 3.3255, |
| "step": 50450 |
| }, |
| { |
| "epoch": 14.697484862598975, |
| "grad_norm": 0.37022125720977783, |
| "learning_rate": 0.0004238823529411764, |
| "loss": 3.315, |
| "step": 50500 |
| }, |
| { |
| "epoch": 14.712040055891942, |
| "grad_norm": 0.3891310691833496, |
| "learning_rate": 0.00042370762958648803, |
| "loss": 3.3248, |
| "step": 50550 |
| }, |
| { |
| "epoch": 14.726595249184909, |
| "grad_norm": 0.3737620413303375, |
| "learning_rate": 0.00042353290623179963, |
| "loss": 3.3085, |
| "step": 50600 |
| }, |
| { |
| "epoch": 14.741150442477876, |
| "grad_norm": 0.39528465270996094, |
| "learning_rate": 0.0004233581828771112, |
| "loss": 3.3073, |
| "step": 50650 |
| }, |
| { |
| "epoch": 14.755705635770843, |
| "grad_norm": 0.36659860610961914, |
| "learning_rate": 0.00042318345952242276, |
| "loss": 3.3146, |
| "step": 50700 |
| }, |
| { |
| "epoch": 14.77026082906381, |
| "grad_norm": 0.37093621492385864, |
| "learning_rate": 0.00042300873616773436, |
| "loss": 3.3214, |
| "step": 50750 |
| }, |
| { |
| "epoch": 14.784816022356777, |
| "grad_norm": 0.39852967858314514, |
| "learning_rate": 0.00042283401281304595, |
| "loss": 3.3229, |
| "step": 50800 |
| }, |
| { |
| "epoch": 14.799371215649744, |
| "grad_norm": 0.3862568736076355, |
| "learning_rate": 0.0004226592894583576, |
| "loss": 3.3197, |
| "step": 50850 |
| }, |
| { |
| "epoch": 14.813926408942711, |
| "grad_norm": 0.3859146237373352, |
| "learning_rate": 0.00042248456610366914, |
| "loss": 3.3325, |
| "step": 50900 |
| }, |
| { |
| "epoch": 14.828481602235678, |
| "grad_norm": 0.3942420184612274, |
| "learning_rate": 0.00042230984274898073, |
| "loss": 3.315, |
| "step": 50950 |
| }, |
| { |
| "epoch": 14.843036795528645, |
| "grad_norm": 0.3653635084629059, |
| "learning_rate": 0.00042213511939429233, |
| "loss": 3.3317, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.843036795528645, |
| "eval_accuracy": 0.37195887813605455, |
| "eval_loss": 3.54195237159729, |
| "eval_runtime": 80.6461, |
| "eval_samples_per_second": 206.47, |
| "eval_steps_per_second": 12.908, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.857591988821612, |
| "grad_norm": 0.3597552180290222, |
| "learning_rate": 0.0004219603960396039, |
| "loss": 3.3282, |
| "step": 51050 |
| }, |
| { |
| "epoch": 14.872147182114578, |
| "grad_norm": 0.36636701226234436, |
| "learning_rate": 0.0004217856726849155, |
| "loss": 3.3233, |
| "step": 51100 |
| }, |
| { |
| "epoch": 14.886702375407545, |
| "grad_norm": 0.3324110209941864, |
| "learning_rate": 0.0004216109493302271, |
| "loss": 3.3295, |
| "step": 51150 |
| }, |
| { |
| "epoch": 14.901257568700512, |
| "grad_norm": 0.3558681607246399, |
| "learning_rate": 0.0004214362259755387, |
| "loss": 3.3288, |
| "step": 51200 |
| }, |
| { |
| "epoch": 14.915812761993479, |
| "grad_norm": 0.3884345293045044, |
| "learning_rate": 0.0004212615026208503, |
| "loss": 3.335, |
| "step": 51250 |
| }, |
| { |
| "epoch": 14.930367955286446, |
| "grad_norm": 0.3660114109516144, |
| "learning_rate": 0.00042108677926616184, |
| "loss": 3.3263, |
| "step": 51300 |
| }, |
| { |
| "epoch": 14.944923148579413, |
| "grad_norm": 0.34780845046043396, |
| "learning_rate": 0.00042091205591147343, |
| "loss": 3.316, |
| "step": 51350 |
| }, |
| { |
| "epoch": 14.95947834187238, |
| "grad_norm": 0.3973323106765747, |
| "learning_rate": 0.0004207373325567851, |
| "loss": 3.3378, |
| "step": 51400 |
| }, |
| { |
| "epoch": 14.974033535165347, |
| "grad_norm": 0.3800559639930725, |
| "learning_rate": 0.0004205626092020967, |
| "loss": 3.3322, |
| "step": 51450 |
| }, |
| { |
| "epoch": 14.988588728458314, |
| "grad_norm": 0.37440791726112366, |
| "learning_rate": 0.0004203878858474082, |
| "loss": 3.3221, |
| "step": 51500 |
| }, |
| { |
| "epoch": 15.002911038658594, |
| "grad_norm": 0.37643206119537354, |
| "learning_rate": 0.0004202131624927198, |
| "loss": 3.3035, |
| "step": 51550 |
| }, |
| { |
| "epoch": 15.01746623195156, |
| "grad_norm": 0.3641842305660248, |
| "learning_rate": 0.0004200384391380314, |
| "loss": 3.2244, |
| "step": 51600 |
| }, |
| { |
| "epoch": 15.032021425244528, |
| "grad_norm": 0.36003291606903076, |
| "learning_rate": 0.00041986371578334305, |
| "loss": 3.2229, |
| "step": 51650 |
| }, |
| { |
| "epoch": 15.046576618537495, |
| "grad_norm": 0.3611176908016205, |
| "learning_rate": 0.0004196889924286546, |
| "loss": 3.2259, |
| "step": 51700 |
| }, |
| { |
| "epoch": 15.06113181183046, |
| "grad_norm": 0.3740618824958801, |
| "learning_rate": 0.0004195142690739662, |
| "loss": 3.2408, |
| "step": 51750 |
| }, |
| { |
| "epoch": 15.075687005123427, |
| "grad_norm": 0.39059212803840637, |
| "learning_rate": 0.0004193395457192778, |
| "loss": 3.2397, |
| "step": 51800 |
| }, |
| { |
| "epoch": 15.090242198416394, |
| "grad_norm": 0.36870676279067993, |
| "learning_rate": 0.0004191648223645893, |
| "loss": 3.2274, |
| "step": 51850 |
| }, |
| { |
| "epoch": 15.104797391709361, |
| "grad_norm": 0.37898561358451843, |
| "learning_rate": 0.0004189900990099009, |
| "loss": 3.2486, |
| "step": 51900 |
| }, |
| { |
| "epoch": 15.119352585002328, |
| "grad_norm": 0.3992726802825928, |
| "learning_rate": 0.00041881537565521256, |
| "loss": 3.2369, |
| "step": 51950 |
| }, |
| { |
| "epoch": 15.133907778295296, |
| "grad_norm": 0.36570054292678833, |
| "learning_rate": 0.00041864065230052416, |
| "loss": 3.2458, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.133907778295296, |
| "eval_accuracy": 0.37141766489462913, |
| "eval_loss": 3.559756278991699, |
| "eval_runtime": 80.757, |
| "eval_samples_per_second": 206.186, |
| "eval_steps_per_second": 12.891, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.148462971588263, |
| "grad_norm": 0.3593365550041199, |
| "learning_rate": 0.00041846592894583575, |
| "loss": 3.2519, |
| "step": 52050 |
| }, |
| { |
| "epoch": 15.16301816488123, |
| "grad_norm": 0.35268986225128174, |
| "learning_rate": 0.0004182912055911473, |
| "loss": 3.2514, |
| "step": 52100 |
| }, |
| { |
| "epoch": 15.177573358174197, |
| "grad_norm": 0.3755052983760834, |
| "learning_rate": 0.0004181164822364589, |
| "loss": 3.255, |
| "step": 52150 |
| }, |
| { |
| "epoch": 15.192128551467164, |
| "grad_norm": 0.40018025040626526, |
| "learning_rate": 0.0004179417588817705, |
| "loss": 3.2536, |
| "step": 52200 |
| }, |
| { |
| "epoch": 15.20668374476013, |
| "grad_norm": 0.3776955306529999, |
| "learning_rate": 0.00041776703552708213, |
| "loss": 3.2568, |
| "step": 52250 |
| }, |
| { |
| "epoch": 15.221238938053098, |
| "grad_norm": 0.3594443202018738, |
| "learning_rate": 0.00041759231217239367, |
| "loss": 3.2632, |
| "step": 52300 |
| }, |
| { |
| "epoch": 15.235794131346065, |
| "grad_norm": 0.37025004625320435, |
| "learning_rate": 0.00041741758881770527, |
| "loss": 3.255, |
| "step": 52350 |
| }, |
| { |
| "epoch": 15.250349324639032, |
| "grad_norm": 0.34959134459495544, |
| "learning_rate": 0.00041724286546301686, |
| "loss": 3.2659, |
| "step": 52400 |
| }, |
| { |
| "epoch": 15.264904517931997, |
| "grad_norm": 0.361098051071167, |
| "learning_rate": 0.0004170681421083284, |
| "loss": 3.267, |
| "step": 52450 |
| }, |
| { |
| "epoch": 15.279459711224964, |
| "grad_norm": 0.36912664771080017, |
| "learning_rate": 0.00041689341875364005, |
| "loss": 3.2672, |
| "step": 52500 |
| }, |
| { |
| "epoch": 15.294014904517931, |
| "grad_norm": 0.37851908802986145, |
| "learning_rate": 0.00041671869539895164, |
| "loss": 3.2633, |
| "step": 52550 |
| }, |
| { |
| "epoch": 15.308570097810899, |
| "grad_norm": 0.36610502004623413, |
| "learning_rate": 0.00041654397204426324, |
| "loss": 3.2653, |
| "step": 52600 |
| }, |
| { |
| "epoch": 15.323125291103866, |
| "grad_norm": 0.37920302152633667, |
| "learning_rate": 0.0004163692486895748, |
| "loss": 3.2747, |
| "step": 52650 |
| }, |
| { |
| "epoch": 15.337680484396833, |
| "grad_norm": 0.41160011291503906, |
| "learning_rate": 0.00041619452533488637, |
| "loss": 3.2777, |
| "step": 52700 |
| }, |
| { |
| "epoch": 15.3522356776898, |
| "grad_norm": 0.3660199046134949, |
| "learning_rate": 0.00041601980198019797, |
| "loss": 3.2736, |
| "step": 52750 |
| }, |
| { |
| "epoch": 15.366790870982767, |
| "grad_norm": 0.40935975313186646, |
| "learning_rate": 0.0004158450786255096, |
| "loss": 3.2746, |
| "step": 52800 |
| }, |
| { |
| "epoch": 15.381346064275734, |
| "grad_norm": 0.3514406979084015, |
| "learning_rate": 0.00041567035527082115, |
| "loss": 3.2827, |
| "step": 52850 |
| }, |
| { |
| "epoch": 15.3959012575687, |
| "grad_norm": 0.3820960819721222, |
| "learning_rate": 0.00041549563191613275, |
| "loss": 3.2814, |
| "step": 52900 |
| }, |
| { |
| "epoch": 15.410456450861668, |
| "grad_norm": 0.3737422525882721, |
| "learning_rate": 0.00041532090856144434, |
| "loss": 3.2739, |
| "step": 52950 |
| }, |
| { |
| "epoch": 15.425011644154635, |
| "grad_norm": 0.3673994243144989, |
| "learning_rate": 0.00041514618520675594, |
| "loss": 3.2734, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.425011644154635, |
| "eval_accuracy": 0.3716824539137239, |
| "eval_loss": 3.55224609375, |
| "eval_runtime": 80.7258, |
| "eval_samples_per_second": 206.266, |
| "eval_steps_per_second": 12.896, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.439566837447602, |
| "grad_norm": 0.346460223197937, |
| "learning_rate": 0.00041497146185206753, |
| "loss": 3.2808, |
| "step": 53050 |
| }, |
| { |
| "epoch": 15.454122030740567, |
| "grad_norm": 0.40880486369132996, |
| "learning_rate": 0.0004147967384973791, |
| "loss": 3.2725, |
| "step": 53100 |
| }, |
| { |
| "epoch": 15.468677224033534, |
| "grad_norm": 0.3735993504524231, |
| "learning_rate": 0.0004146220151426907, |
| "loss": 3.2861, |
| "step": 53150 |
| }, |
| { |
| "epoch": 15.483232417326501, |
| "grad_norm": 0.3774520754814148, |
| "learning_rate": 0.0004144472917880023, |
| "loss": 3.2969, |
| "step": 53200 |
| }, |
| { |
| "epoch": 15.497787610619469, |
| "grad_norm": 0.37339529395103455, |
| "learning_rate": 0.00041427256843331385, |
| "loss": 3.289, |
| "step": 53250 |
| }, |
| { |
| "epoch": 15.512342803912436, |
| "grad_norm": 0.36602962017059326, |
| "learning_rate": 0.00041409784507862545, |
| "loss": 3.2841, |
| "step": 53300 |
| }, |
| { |
| "epoch": 15.526897997205403, |
| "grad_norm": 0.41036468744277954, |
| "learning_rate": 0.0004139231217239371, |
| "loss": 3.2938, |
| "step": 53350 |
| }, |
| { |
| "epoch": 15.54145319049837, |
| "grad_norm": 0.3478046655654907, |
| "learning_rate": 0.0004137483983692487, |
| "loss": 3.3036, |
| "step": 53400 |
| }, |
| { |
| "epoch": 15.556008383791337, |
| "grad_norm": 0.35059481859207153, |
| "learning_rate": 0.00041357367501456023, |
| "loss": 3.3005, |
| "step": 53450 |
| }, |
| { |
| "epoch": 15.570563577084304, |
| "grad_norm": 0.3484688401222229, |
| "learning_rate": 0.0004133989516598718, |
| "loss": 3.2775, |
| "step": 53500 |
| }, |
| { |
| "epoch": 15.585118770377271, |
| "grad_norm": 0.377468079328537, |
| "learning_rate": 0.0004132242283051834, |
| "loss": 3.2829, |
| "step": 53550 |
| }, |
| { |
| "epoch": 15.599673963670238, |
| "grad_norm": 0.36800843477249146, |
| "learning_rate": 0.00041304950495049496, |
| "loss": 3.3021, |
| "step": 53600 |
| }, |
| { |
| "epoch": 15.614229156963205, |
| "grad_norm": 0.38241273164749146, |
| "learning_rate": 0.0004128747815958066, |
| "loss": 3.2971, |
| "step": 53650 |
| }, |
| { |
| "epoch": 15.628784350256172, |
| "grad_norm": 0.3814278244972229, |
| "learning_rate": 0.0004127000582411182, |
| "loss": 3.2992, |
| "step": 53700 |
| }, |
| { |
| "epoch": 15.64333954354914, |
| "grad_norm": 0.3674376904964447, |
| "learning_rate": 0.0004125253348864298, |
| "loss": 3.2954, |
| "step": 53750 |
| }, |
| { |
| "epoch": 15.657894736842106, |
| "grad_norm": 0.3729366958141327, |
| "learning_rate": 0.00041235061153174134, |
| "loss": 3.2919, |
| "step": 53800 |
| }, |
| { |
| "epoch": 15.672449930135071, |
| "grad_norm": 0.35434195399284363, |
| "learning_rate": 0.00041217588817705293, |
| "loss": 3.2955, |
| "step": 53850 |
| }, |
| { |
| "epoch": 15.687005123428039, |
| "grad_norm": 0.38400375843048096, |
| "learning_rate": 0.0004120011648223646, |
| "loss": 3.3018, |
| "step": 53900 |
| }, |
| { |
| "epoch": 15.701560316721006, |
| "grad_norm": 0.37085700035095215, |
| "learning_rate": 0.0004118264414676762, |
| "loss": 3.313, |
| "step": 53950 |
| }, |
| { |
| "epoch": 15.716115510013973, |
| "grad_norm": 0.3556494116783142, |
| "learning_rate": 0.0004116517181129877, |
| "loss": 3.2856, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.716115510013973, |
| "eval_accuracy": 0.3724631878035804, |
| "eval_loss": 3.541750431060791, |
| "eval_runtime": 80.7071, |
| "eval_samples_per_second": 206.314, |
| "eval_steps_per_second": 12.899, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.73067070330694, |
| "grad_norm": 0.38073569536209106, |
| "learning_rate": 0.0004114769947582993, |
| "loss": 3.3045, |
| "step": 54050 |
| }, |
| { |
| "epoch": 15.745225896599907, |
| "grad_norm": 0.4057568907737732, |
| "learning_rate": 0.0004113022714036109, |
| "loss": 3.3061, |
| "step": 54100 |
| }, |
| { |
| "epoch": 15.759781089892874, |
| "grad_norm": 0.36155977845191956, |
| "learning_rate": 0.0004111275480489225, |
| "loss": 3.2912, |
| "step": 54150 |
| }, |
| { |
| "epoch": 15.774336283185841, |
| "grad_norm": 0.353712797164917, |
| "learning_rate": 0.0004109528246942341, |
| "loss": 3.3017, |
| "step": 54200 |
| }, |
| { |
| "epoch": 15.788891476478808, |
| "grad_norm": 0.37780117988586426, |
| "learning_rate": 0.0004107781013395457, |
| "loss": 3.2924, |
| "step": 54250 |
| }, |
| { |
| "epoch": 15.803446669771775, |
| "grad_norm": 0.38512688875198364, |
| "learning_rate": 0.0004106033779848573, |
| "loss": 3.3167, |
| "step": 54300 |
| }, |
| { |
| "epoch": 15.818001863064742, |
| "grad_norm": 0.41960573196411133, |
| "learning_rate": 0.0004104286546301689, |
| "loss": 3.3106, |
| "step": 54350 |
| }, |
| { |
| "epoch": 15.83255705635771, |
| "grad_norm": 0.3920349180698395, |
| "learning_rate": 0.0004102539312754804, |
| "loss": 3.3054, |
| "step": 54400 |
| }, |
| { |
| "epoch": 15.847112249650674, |
| "grad_norm": 0.35872983932495117, |
| "learning_rate": 0.00041007920792079206, |
| "loss": 3.31, |
| "step": 54450 |
| }, |
| { |
| "epoch": 15.861667442943642, |
| "grad_norm": 0.3779909014701843, |
| "learning_rate": 0.00040990448456610366, |
| "loss": 3.303, |
| "step": 54500 |
| }, |
| { |
| "epoch": 15.876222636236609, |
| "grad_norm": 0.365961492061615, |
| "learning_rate": 0.00040972976121141525, |
| "loss": 3.2956, |
| "step": 54550 |
| }, |
| { |
| "epoch": 15.890777829529576, |
| "grad_norm": 0.383811354637146, |
| "learning_rate": 0.0004095550378567268, |
| "loss": 3.3247, |
| "step": 54600 |
| }, |
| { |
| "epoch": 15.905333022822543, |
| "grad_norm": 0.3860636353492737, |
| "learning_rate": 0.0004093803145020384, |
| "loss": 3.3051, |
| "step": 54650 |
| }, |
| { |
| "epoch": 15.91988821611551, |
| "grad_norm": 0.3569910228252411, |
| "learning_rate": 0.00040920559114735, |
| "loss": 3.3135, |
| "step": 54700 |
| }, |
| { |
| "epoch": 15.934443409408477, |
| "grad_norm": 0.3867490291595459, |
| "learning_rate": 0.00040903086779266163, |
| "loss": 3.3124, |
| "step": 54750 |
| }, |
| { |
| "epoch": 15.948998602701444, |
| "grad_norm": 0.3425275385379791, |
| "learning_rate": 0.00040885614443797317, |
| "loss": 3.3082, |
| "step": 54800 |
| }, |
| { |
| "epoch": 15.963553795994411, |
| "grad_norm": 0.37110820412635803, |
| "learning_rate": 0.00040868142108328476, |
| "loss": 3.3136, |
| "step": 54850 |
| }, |
| { |
| "epoch": 15.978108989287378, |
| "grad_norm": 0.35990962386131287, |
| "learning_rate": 0.00040850669772859636, |
| "loss": 3.3079, |
| "step": 54900 |
| }, |
| { |
| "epoch": 15.992664182580345, |
| "grad_norm": 0.40666720271110535, |
| "learning_rate": 0.0004083319743739079, |
| "loss": 3.308, |
| "step": 54950 |
| }, |
| { |
| "epoch": 16.006986492780623, |
| "grad_norm": 0.3485197126865387, |
| "learning_rate": 0.0004081572510192195, |
| "loss": 3.2596, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.006986492780623, |
| "eval_accuracy": 0.3721861759447227, |
| "eval_loss": 3.5502216815948486, |
| "eval_runtime": 80.7089, |
| "eval_samples_per_second": 206.309, |
| "eval_steps_per_second": 12.898, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.02154168607359, |
| "grad_norm": 0.3760972321033478, |
| "learning_rate": 0.00040798252766453114, |
| "loss": 3.1993, |
| "step": 55050 |
| }, |
| { |
| "epoch": 16.036096879366557, |
| "grad_norm": 0.37333351373672485, |
| "learning_rate": 0.00040780780430984273, |
| "loss": 3.2011, |
| "step": 55100 |
| }, |
| { |
| "epoch": 16.050652072659524, |
| "grad_norm": 0.38070735335350037, |
| "learning_rate": 0.0004076330809551543, |
| "loss": 3.2201, |
| "step": 55150 |
| }, |
| { |
| "epoch": 16.06520726595249, |
| "grad_norm": 0.4045502841472626, |
| "learning_rate": 0.00040745835760046587, |
| "loss": 3.2142, |
| "step": 55200 |
| }, |
| { |
| "epoch": 16.079762459245458, |
| "grad_norm": 0.3948613405227661, |
| "learning_rate": 0.00040728363424577746, |
| "loss": 3.2345, |
| "step": 55250 |
| }, |
| { |
| "epoch": 16.094317652538425, |
| "grad_norm": 0.3689858019351959, |
| "learning_rate": 0.0004071089108910891, |
| "loss": 3.2128, |
| "step": 55300 |
| }, |
| { |
| "epoch": 16.108872845831392, |
| "grad_norm": 0.3550013303756714, |
| "learning_rate": 0.0004069341875364007, |
| "loss": 3.2268, |
| "step": 55350 |
| }, |
| { |
| "epoch": 16.12342803912436, |
| "grad_norm": 0.36508285999298096, |
| "learning_rate": 0.00040675946418171225, |
| "loss": 3.2193, |
| "step": 55400 |
| }, |
| { |
| "epoch": 16.137983232417326, |
| "grad_norm": 0.3798517882823944, |
| "learning_rate": 0.00040658474082702384, |
| "loss": 3.2297, |
| "step": 55450 |
| }, |
| { |
| "epoch": 16.152538425710294, |
| "grad_norm": 0.3686642348766327, |
| "learning_rate": 0.00040641001747233543, |
| "loss": 3.2324, |
| "step": 55500 |
| }, |
| { |
| "epoch": 16.16709361900326, |
| "grad_norm": 0.3705148696899414, |
| "learning_rate": 0.000406235294117647, |
| "loss": 3.2456, |
| "step": 55550 |
| }, |
| { |
| "epoch": 16.181648812296228, |
| "grad_norm": 0.3792201578617096, |
| "learning_rate": 0.0004060605707629586, |
| "loss": 3.2464, |
| "step": 55600 |
| }, |
| { |
| "epoch": 16.196204005589195, |
| "grad_norm": 0.3715910315513611, |
| "learning_rate": 0.0004058858474082702, |
| "loss": 3.2432, |
| "step": 55650 |
| }, |
| { |
| "epoch": 16.21075919888216, |
| "grad_norm": 0.38456979393959045, |
| "learning_rate": 0.0004057111240535818, |
| "loss": 3.2347, |
| "step": 55700 |
| }, |
| { |
| "epoch": 16.22531439217513, |
| "grad_norm": 0.37881287932395935, |
| "learning_rate": 0.00040553640069889335, |
| "loss": 3.2608, |
| "step": 55750 |
| }, |
| { |
| "epoch": 16.239869585468096, |
| "grad_norm": 0.42177900671958923, |
| "learning_rate": 0.00040536167734420495, |
| "loss": 3.2476, |
| "step": 55800 |
| }, |
| { |
| "epoch": 16.254424778761063, |
| "grad_norm": 0.36965277791023254, |
| "learning_rate": 0.0004051869539895166, |
| "loss": 3.2482, |
| "step": 55850 |
| }, |
| { |
| "epoch": 16.26897997205403, |
| "grad_norm": 0.3698367178440094, |
| "learning_rate": 0.0004050122306348282, |
| "loss": 3.2679, |
| "step": 55900 |
| }, |
| { |
| "epoch": 16.283535165346997, |
| "grad_norm": 0.36005067825317383, |
| "learning_rate": 0.00040483750728013973, |
| "loss": 3.2495, |
| "step": 55950 |
| }, |
| { |
| "epoch": 16.298090358639964, |
| "grad_norm": 0.3601491451263428, |
| "learning_rate": 0.0004046627839254513, |
| "loss": 3.2503, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.298090358639964, |
| "eval_accuracy": 0.3718862462613095, |
| "eval_loss": 3.5522119998931885, |
| "eval_runtime": 80.8507, |
| "eval_samples_per_second": 205.947, |
| "eval_steps_per_second": 12.876, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.31264555193293, |
| "grad_norm": 0.3787119388580322, |
| "learning_rate": 0.0004044880605707629, |
| "loss": 3.259, |
| "step": 56050 |
| }, |
| { |
| "epoch": 16.3272007452259, |
| "grad_norm": 0.35941311717033386, |
| "learning_rate": 0.00040431333721607446, |
| "loss": 3.2605, |
| "step": 56100 |
| }, |
| { |
| "epoch": 16.341755938518865, |
| "grad_norm": 0.4253566265106201, |
| "learning_rate": 0.0004041386138613861, |
| "loss": 3.2414, |
| "step": 56150 |
| }, |
| { |
| "epoch": 16.35631113181183, |
| "grad_norm": 0.3907557427883148, |
| "learning_rate": 0.0004039638905066977, |
| "loss": 3.261, |
| "step": 56200 |
| }, |
| { |
| "epoch": 16.370866325104796, |
| "grad_norm": 0.38112950325012207, |
| "learning_rate": 0.0004037891671520093, |
| "loss": 3.2715, |
| "step": 56250 |
| }, |
| { |
| "epoch": 16.385421518397763, |
| "grad_norm": 0.37178608775138855, |
| "learning_rate": 0.0004036144437973209, |
| "loss": 3.2679, |
| "step": 56300 |
| }, |
| { |
| "epoch": 16.39997671169073, |
| "grad_norm": 0.41026216745376587, |
| "learning_rate": 0.00040343972044263243, |
| "loss": 3.2668, |
| "step": 56350 |
| }, |
| { |
| "epoch": 16.414531904983697, |
| "grad_norm": 0.39276382327079773, |
| "learning_rate": 0.0004032649970879441, |
| "loss": 3.2679, |
| "step": 56400 |
| }, |
| { |
| "epoch": 16.429087098276664, |
| "grad_norm": 0.4075048267841339, |
| "learning_rate": 0.00040309027373325567, |
| "loss": 3.2809, |
| "step": 56450 |
| }, |
| { |
| "epoch": 16.44364229156963, |
| "grad_norm": 0.3888840675354004, |
| "learning_rate": 0.00040291555037856727, |
| "loss": 3.2788, |
| "step": 56500 |
| }, |
| { |
| "epoch": 16.4581974848626, |
| "grad_norm": 0.37678566575050354, |
| "learning_rate": 0.0004027408270238788, |
| "loss": 3.2708, |
| "step": 56550 |
| }, |
| { |
| "epoch": 16.472752678155565, |
| "grad_norm": 0.3959866762161255, |
| "learning_rate": 0.0004025661036691904, |
| "loss": 3.2852, |
| "step": 56600 |
| }, |
| { |
| "epoch": 16.487307871448532, |
| "grad_norm": 0.38866668939590454, |
| "learning_rate": 0.000402391380314502, |
| "loss": 3.2776, |
| "step": 56650 |
| }, |
| { |
| "epoch": 16.5018630647415, |
| "grad_norm": 0.39661064743995667, |
| "learning_rate": 0.00040221665695981364, |
| "loss": 3.2789, |
| "step": 56700 |
| }, |
| { |
| "epoch": 16.516418258034467, |
| "grad_norm": 0.3465977907180786, |
| "learning_rate": 0.0004020419336051252, |
| "loss": 3.2688, |
| "step": 56750 |
| }, |
| { |
| "epoch": 16.530973451327434, |
| "grad_norm": 0.3973502218723297, |
| "learning_rate": 0.0004018672102504368, |
| "loss": 3.2687, |
| "step": 56800 |
| }, |
| { |
| "epoch": 16.5455286446204, |
| "grad_norm": 0.347652792930603, |
| "learning_rate": 0.00040169248689574837, |
| "loss": 3.2723, |
| "step": 56850 |
| }, |
| { |
| "epoch": 16.560083837913368, |
| "grad_norm": 0.36513060331344604, |
| "learning_rate": 0.0004015177635410599, |
| "loss": 3.2818, |
| "step": 56900 |
| }, |
| { |
| "epoch": 16.574639031206335, |
| "grad_norm": 0.3936389982700348, |
| "learning_rate": 0.0004013430401863715, |
| "loss": 3.2654, |
| "step": 56950 |
| }, |
| { |
| "epoch": 16.589194224499302, |
| "grad_norm": 0.36767658591270447, |
| "learning_rate": 0.00040116831683168315, |
| "loss": 3.2839, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.589194224499302, |
| "eval_accuracy": 0.37220545042281034, |
| "eval_loss": 3.5469257831573486, |
| "eval_runtime": 80.6706, |
| "eval_samples_per_second": 206.407, |
| "eval_steps_per_second": 12.904, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.60374941779227, |
| "grad_norm": 0.3595813810825348, |
| "learning_rate": 0.00040099359347699475, |
| "loss": 3.2867, |
| "step": 57050 |
| }, |
| { |
| "epoch": 16.618304611085236, |
| "grad_norm": 0.3898159861564636, |
| "learning_rate": 0.0004008188701223063, |
| "loss": 3.2728, |
| "step": 57100 |
| }, |
| { |
| "epoch": 16.632859804378203, |
| "grad_norm": 0.3986470699310303, |
| "learning_rate": 0.0004006441467676179, |
| "loss": 3.2874, |
| "step": 57150 |
| }, |
| { |
| "epoch": 16.64741499767117, |
| "grad_norm": 0.37127336859703064, |
| "learning_rate": 0.0004004694234129295, |
| "loss": 3.281, |
| "step": 57200 |
| }, |
| { |
| "epoch": 16.661970190964137, |
| "grad_norm": 0.4040543735027313, |
| "learning_rate": 0.0004002947000582411, |
| "loss": 3.2898, |
| "step": 57250 |
| }, |
| { |
| "epoch": 16.676525384257104, |
| "grad_norm": 0.3757478594779968, |
| "learning_rate": 0.00040011997670355267, |
| "loss": 3.2916, |
| "step": 57300 |
| }, |
| { |
| "epoch": 16.69108057755007, |
| "grad_norm": 0.3522929847240448, |
| "learning_rate": 0.00039994525334886426, |
| "loss": 3.2948, |
| "step": 57350 |
| }, |
| { |
| "epoch": 16.70563577084304, |
| "grad_norm": 0.3687567412853241, |
| "learning_rate": 0.00039977052999417585, |
| "loss": 3.2891, |
| "step": 57400 |
| }, |
| { |
| "epoch": 16.720190964136005, |
| "grad_norm": 0.38311392068862915, |
| "learning_rate": 0.00039959580663948745, |
| "loss": 3.2787, |
| "step": 57450 |
| }, |
| { |
| "epoch": 16.734746157428972, |
| "grad_norm": 0.34893175959587097, |
| "learning_rate": 0.000399421083284799, |
| "loss": 3.2905, |
| "step": 57500 |
| }, |
| { |
| "epoch": 16.749301350721936, |
| "grad_norm": 0.3817479610443115, |
| "learning_rate": 0.00039924635993011064, |
| "loss": 3.2952, |
| "step": 57550 |
| }, |
| { |
| "epoch": 16.763856544014903, |
| "grad_norm": 0.3581146001815796, |
| "learning_rate": 0.00039907163657542223, |
| "loss": 3.2921, |
| "step": 57600 |
| }, |
| { |
| "epoch": 16.77841173730787, |
| "grad_norm": 0.4336063265800476, |
| "learning_rate": 0.0003988969132207338, |
| "loss": 3.2913, |
| "step": 57650 |
| }, |
| { |
| "epoch": 16.792966930600837, |
| "grad_norm": 0.3745361566543579, |
| "learning_rate": 0.00039872218986604537, |
| "loss": 3.2978, |
| "step": 57700 |
| }, |
| { |
| "epoch": 16.807522123893804, |
| "grad_norm": 0.40274861454963684, |
| "learning_rate": 0.00039854746651135696, |
| "loss": 3.2781, |
| "step": 57750 |
| }, |
| { |
| "epoch": 16.82207731718677, |
| "grad_norm": 0.3722064197063446, |
| "learning_rate": 0.0003983727431566686, |
| "loss": 3.2926, |
| "step": 57800 |
| }, |
| { |
| "epoch": 16.83663251047974, |
| "grad_norm": 0.3850577473640442, |
| "learning_rate": 0.0003981980198019802, |
| "loss": 3.2955, |
| "step": 57850 |
| }, |
| { |
| "epoch": 16.851187703772705, |
| "grad_norm": 0.37323853373527527, |
| "learning_rate": 0.00039802329644729174, |
| "loss": 3.2902, |
| "step": 57900 |
| }, |
| { |
| "epoch": 16.865742897065672, |
| "grad_norm": 0.37659692764282227, |
| "learning_rate": 0.00039784857309260334, |
| "loss": 3.3011, |
| "step": 57950 |
| }, |
| { |
| "epoch": 16.88029809035864, |
| "grad_norm": 0.37374237179756165, |
| "learning_rate": 0.00039767384973791493, |
| "loss": 3.2964, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.88029809035864, |
| "eval_accuracy": 0.3728755911182735, |
| "eval_loss": 3.5368783473968506, |
| "eval_runtime": 80.6519, |
| "eval_samples_per_second": 206.455, |
| "eval_steps_per_second": 12.907, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.894853283651607, |
| "grad_norm": 0.3639906346797943, |
| "learning_rate": 0.00039749912638322647, |
| "loss": 3.299, |
| "step": 58050 |
| }, |
| { |
| "epoch": 16.909408476944574, |
| "grad_norm": 0.3662974536418915, |
| "learning_rate": 0.0003973244030285381, |
| "loss": 3.3013, |
| "step": 58100 |
| }, |
| { |
| "epoch": 16.92396367023754, |
| "grad_norm": 0.39329737424850464, |
| "learning_rate": 0.0003971496796738497, |
| "loss": 3.2998, |
| "step": 58150 |
| }, |
| { |
| "epoch": 16.938518863530508, |
| "grad_norm": 0.3817102611064911, |
| "learning_rate": 0.0003969749563191613, |
| "loss": 3.3008, |
| "step": 58200 |
| }, |
| { |
| "epoch": 16.953074056823475, |
| "grad_norm": 0.3732031583786011, |
| "learning_rate": 0.00039680023296447285, |
| "loss": 3.286, |
| "step": 58250 |
| }, |
| { |
| "epoch": 16.967629250116442, |
| "grad_norm": 0.40994203090667725, |
| "learning_rate": 0.00039662550960978444, |
| "loss": 3.3023, |
| "step": 58300 |
| }, |
| { |
| "epoch": 16.98218444340941, |
| "grad_norm": 0.3623109459877014, |
| "learning_rate": 0.00039645078625509604, |
| "loss": 3.3091, |
| "step": 58350 |
| }, |
| { |
| "epoch": 16.996739636702376, |
| "grad_norm": 0.3911479413509369, |
| "learning_rate": 0.0003962760629004077, |
| "loss": 3.2853, |
| "step": 58400 |
| }, |
| { |
| "epoch": 17.011061946902654, |
| "grad_norm": 0.41258203983306885, |
| "learning_rate": 0.0003961013395457193, |
| "loss": 3.2065, |
| "step": 58450 |
| }, |
| { |
| "epoch": 17.02561714019562, |
| "grad_norm": 0.38237568736076355, |
| "learning_rate": 0.0003959266161910308, |
| "loss": 3.1838, |
| "step": 58500 |
| }, |
| { |
| "epoch": 17.040172333488588, |
| "grad_norm": 0.35858142375946045, |
| "learning_rate": 0.0003957518928363424, |
| "loss": 3.1886, |
| "step": 58550 |
| }, |
| { |
| "epoch": 17.054727526781555, |
| "grad_norm": 0.3622501492500305, |
| "learning_rate": 0.000395577169481654, |
| "loss": 3.1968, |
| "step": 58600 |
| }, |
| { |
| "epoch": 17.069282720074522, |
| "grad_norm": 0.37220996618270874, |
| "learning_rate": 0.00039540244612696566, |
| "loss": 3.1963, |
| "step": 58650 |
| }, |
| { |
| "epoch": 17.08383791336749, |
| "grad_norm": 0.3873729705810547, |
| "learning_rate": 0.0003952277227722772, |
| "loss": 3.208, |
| "step": 58700 |
| }, |
| { |
| "epoch": 17.098393106660456, |
| "grad_norm": 0.37132883071899414, |
| "learning_rate": 0.0003950529994175888, |
| "loss": 3.2098, |
| "step": 58750 |
| }, |
| { |
| "epoch": 17.112948299953423, |
| "grad_norm": 0.38237062096595764, |
| "learning_rate": 0.0003948782760629004, |
| "loss": 3.2256, |
| "step": 58800 |
| }, |
| { |
| "epoch": 17.12750349324639, |
| "grad_norm": 0.3828263282775879, |
| "learning_rate": 0.0003947035527082119, |
| "loss": 3.2293, |
| "step": 58850 |
| }, |
| { |
| "epoch": 17.142058686539357, |
| "grad_norm": 0.4045010805130005, |
| "learning_rate": 0.0003945288293535235, |
| "loss": 3.2185, |
| "step": 58900 |
| }, |
| { |
| "epoch": 17.156613879832324, |
| "grad_norm": 0.36804038286209106, |
| "learning_rate": 0.00039435410599883517, |
| "loss": 3.2234, |
| "step": 58950 |
| }, |
| { |
| "epoch": 17.17116907312529, |
| "grad_norm": 0.37531086802482605, |
| "learning_rate": 0.00039417938264414676, |
| "loss": 3.2298, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.17116907312529, |
| "eval_accuracy": 0.3723900858196137, |
| "eval_loss": 3.552824020385742, |
| "eval_runtime": 80.7226, |
| "eval_samples_per_second": 206.274, |
| "eval_steps_per_second": 12.896, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.18572426641826, |
| "grad_norm": 0.36014947295188904, |
| "learning_rate": 0.0003940046592894583, |
| "loss": 3.2272, |
| "step": 59050 |
| }, |
| { |
| "epoch": 17.200279459711226, |
| "grad_norm": 0.4071590304374695, |
| "learning_rate": 0.0003938299359347699, |
| "loss": 3.234, |
| "step": 59100 |
| }, |
| { |
| "epoch": 17.214834653004193, |
| "grad_norm": 0.39101675152778625, |
| "learning_rate": 0.0003936552125800815, |
| "loss": 3.2266, |
| "step": 59150 |
| }, |
| { |
| "epoch": 17.22938984629716, |
| "grad_norm": 0.3784657418727875, |
| "learning_rate": 0.00039348048922539314, |
| "loss": 3.2352, |
| "step": 59200 |
| }, |
| { |
| "epoch": 17.243945039590127, |
| "grad_norm": 0.41391733288764954, |
| "learning_rate": 0.0003933057658707047, |
| "loss": 3.2429, |
| "step": 59250 |
| }, |
| { |
| "epoch": 17.258500232883094, |
| "grad_norm": 0.399127721786499, |
| "learning_rate": 0.0003931310425160163, |
| "loss": 3.251, |
| "step": 59300 |
| }, |
| { |
| "epoch": 17.27305542617606, |
| "grad_norm": 0.36477163434028625, |
| "learning_rate": 0.00039295631916132787, |
| "loss": 3.2448, |
| "step": 59350 |
| }, |
| { |
| "epoch": 17.287610619469028, |
| "grad_norm": 0.38781842589378357, |
| "learning_rate": 0.00039278159580663946, |
| "loss": 3.238, |
| "step": 59400 |
| }, |
| { |
| "epoch": 17.302165812761995, |
| "grad_norm": 0.4136674106121063, |
| "learning_rate": 0.000392606872451951, |
| "loss": 3.2426, |
| "step": 59450 |
| }, |
| { |
| "epoch": 17.316721006054962, |
| "grad_norm": 0.38425910472869873, |
| "learning_rate": 0.00039243214909726265, |
| "loss": 3.2426, |
| "step": 59500 |
| }, |
| { |
| "epoch": 17.331276199347926, |
| "grad_norm": 0.38384538888931274, |
| "learning_rate": 0.00039225742574257425, |
| "loss": 3.2398, |
| "step": 59550 |
| }, |
| { |
| "epoch": 17.345831392640893, |
| "grad_norm": 0.36757904291152954, |
| "learning_rate": 0.00039208270238788584, |
| "loss": 3.2425, |
| "step": 59600 |
| }, |
| { |
| "epoch": 17.36038658593386, |
| "grad_norm": 0.40921059250831604, |
| "learning_rate": 0.0003919079790331974, |
| "loss": 3.2477, |
| "step": 59650 |
| }, |
| { |
| "epoch": 17.374941779226827, |
| "grad_norm": 0.3589450716972351, |
| "learning_rate": 0.000391733255678509, |
| "loss": 3.2577, |
| "step": 59700 |
| }, |
| { |
| "epoch": 17.389496972519794, |
| "grad_norm": 0.3744639754295349, |
| "learning_rate": 0.00039155853232382057, |
| "loss": 3.263, |
| "step": 59750 |
| }, |
| { |
| "epoch": 17.40405216581276, |
| "grad_norm": 0.38087818026542664, |
| "learning_rate": 0.0003913838089691322, |
| "loss": 3.2542, |
| "step": 59800 |
| }, |
| { |
| "epoch": 17.418607359105728, |
| "grad_norm": 0.36054182052612305, |
| "learning_rate": 0.00039120908561444376, |
| "loss": 3.2602, |
| "step": 59850 |
| }, |
| { |
| "epoch": 17.433162552398695, |
| "grad_norm": 0.39330026507377625, |
| "learning_rate": 0.00039103436225975535, |
| "loss": 3.2441, |
| "step": 59900 |
| }, |
| { |
| "epoch": 17.447717745691662, |
| "grad_norm": 0.3876330554485321, |
| "learning_rate": 0.00039085963890506695, |
| "loss": 3.2618, |
| "step": 59950 |
| }, |
| { |
| "epoch": 17.46227293898463, |
| "grad_norm": 0.39032647013664246, |
| "learning_rate": 0.0003906849155503785, |
| "loss": 3.2623, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.46227293898463, |
| "eval_accuracy": 0.37241359128069623, |
| "eval_loss": 3.5480120182037354, |
| "eval_runtime": 80.675, |
| "eval_samples_per_second": 206.396, |
| "eval_steps_per_second": 12.904, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.476828132277596, |
| "grad_norm": 0.3893525004386902, |
| "learning_rate": 0.00039051019219569014, |
| "loss": 3.2667, |
| "step": 60050 |
| }, |
| { |
| "epoch": 17.491383325570563, |
| "grad_norm": 0.3922117352485657, |
| "learning_rate": 0.00039033546884100173, |
| "loss": 3.2554, |
| "step": 60100 |
| }, |
| { |
| "epoch": 17.50593851886353, |
| "grad_norm": 0.37369105219841003, |
| "learning_rate": 0.0003901607454863133, |
| "loss": 3.2612, |
| "step": 60150 |
| }, |
| { |
| "epoch": 17.520493712156497, |
| "grad_norm": 0.38372310996055603, |
| "learning_rate": 0.00038998602213162486, |
| "loss": 3.2645, |
| "step": 60200 |
| }, |
| { |
| "epoch": 17.535048905449464, |
| "grad_norm": 0.36489877104759216, |
| "learning_rate": 0.00038981129877693646, |
| "loss": 3.263, |
| "step": 60250 |
| }, |
| { |
| "epoch": 17.54960409874243, |
| "grad_norm": 0.4149305820465088, |
| "learning_rate": 0.00038963657542224805, |
| "loss": 3.2666, |
| "step": 60300 |
| }, |
| { |
| "epoch": 17.5641592920354, |
| "grad_norm": 0.3787636160850525, |
| "learning_rate": 0.0003894618520675597, |
| "loss": 3.2667, |
| "step": 60350 |
| }, |
| { |
| "epoch": 17.578714485328366, |
| "grad_norm": 0.366481214761734, |
| "learning_rate": 0.00038928712871287124, |
| "loss": 3.2693, |
| "step": 60400 |
| }, |
| { |
| "epoch": 17.593269678621333, |
| "grad_norm": 0.387067049741745, |
| "learning_rate": 0.00038911240535818284, |
| "loss": 3.2722, |
| "step": 60450 |
| }, |
| { |
| "epoch": 17.6078248719143, |
| "grad_norm": 0.36823612451553345, |
| "learning_rate": 0.00038893768200349443, |
| "loss": 3.2798, |
| "step": 60500 |
| }, |
| { |
| "epoch": 17.622380065207267, |
| "grad_norm": 0.4126061201095581, |
| "learning_rate": 0.000388762958648806, |
| "loss": 3.2802, |
| "step": 60550 |
| }, |
| { |
| "epoch": 17.636935258500234, |
| "grad_norm": 0.36839792132377625, |
| "learning_rate": 0.0003885882352941176, |
| "loss": 3.2636, |
| "step": 60600 |
| }, |
| { |
| "epoch": 17.6514904517932, |
| "grad_norm": 0.36374685168266296, |
| "learning_rate": 0.0003884135119394292, |
| "loss": 3.2791, |
| "step": 60650 |
| }, |
| { |
| "epoch": 17.666045645086168, |
| "grad_norm": 0.37513571977615356, |
| "learning_rate": 0.0003882387885847408, |
| "loss": 3.2667, |
| "step": 60700 |
| }, |
| { |
| "epoch": 17.680600838379135, |
| "grad_norm": 0.38282519578933716, |
| "learning_rate": 0.0003880640652300524, |
| "loss": 3.2719, |
| "step": 60750 |
| }, |
| { |
| "epoch": 17.695156031672102, |
| "grad_norm": 0.4076358377933502, |
| "learning_rate": 0.00038788934187536394, |
| "loss": 3.2739, |
| "step": 60800 |
| }, |
| { |
| "epoch": 17.70971122496507, |
| "grad_norm": 0.39459848403930664, |
| "learning_rate": 0.00038771461852067554, |
| "loss": 3.2879, |
| "step": 60850 |
| }, |
| { |
| "epoch": 17.724266418258033, |
| "grad_norm": 0.36787110567092896, |
| "learning_rate": 0.0003875398951659872, |
| "loss": 3.2712, |
| "step": 60900 |
| }, |
| { |
| "epoch": 17.738821611551, |
| "grad_norm": 0.3767246901988983, |
| "learning_rate": 0.0003873651718112988, |
| "loss": 3.2852, |
| "step": 60950 |
| }, |
| { |
| "epoch": 17.753376804843967, |
| "grad_norm": 0.40687525272369385, |
| "learning_rate": 0.0003871904484566103, |
| "loss": 3.2786, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.753376804843967, |
| "eval_accuracy": 0.37292648044151716, |
| "eval_loss": 3.5376880168914795, |
| "eval_runtime": 80.6783, |
| "eval_samples_per_second": 206.388, |
| "eval_steps_per_second": 12.903, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.767931998136934, |
| "grad_norm": 0.4051224887371063, |
| "learning_rate": 0.0003870157251019219, |
| "loss": 3.2765, |
| "step": 61050 |
| }, |
| { |
| "epoch": 17.7824871914299, |
| "grad_norm": 0.38209521770477295, |
| "learning_rate": 0.0003868410017472335, |
| "loss": 3.2639, |
| "step": 61100 |
| }, |
| { |
| "epoch": 17.797042384722868, |
| "grad_norm": 0.4038717746734619, |
| "learning_rate": 0.00038666627839254505, |
| "loss": 3.2816, |
| "step": 61150 |
| }, |
| { |
| "epoch": 17.811597578015835, |
| "grad_norm": 0.36336156725883484, |
| "learning_rate": 0.0003864915550378567, |
| "loss": 3.2792, |
| "step": 61200 |
| }, |
| { |
| "epoch": 17.826152771308802, |
| "grad_norm": 0.3712412416934967, |
| "learning_rate": 0.0003863168316831683, |
| "loss": 3.2868, |
| "step": 61250 |
| }, |
| { |
| "epoch": 17.84070796460177, |
| "grad_norm": 0.35908547043800354, |
| "learning_rate": 0.0003861421083284799, |
| "loss": 3.2818, |
| "step": 61300 |
| }, |
| { |
| "epoch": 17.855263157894736, |
| "grad_norm": 0.38663116097450256, |
| "learning_rate": 0.0003859673849737914, |
| "loss": 3.2766, |
| "step": 61350 |
| }, |
| { |
| "epoch": 17.869818351187703, |
| "grad_norm": 0.3836953341960907, |
| "learning_rate": 0.000385792661619103, |
| "loss": 3.2769, |
| "step": 61400 |
| }, |
| { |
| "epoch": 17.88437354448067, |
| "grad_norm": 0.40757477283477783, |
| "learning_rate": 0.00038561793826441467, |
| "loss": 3.279, |
| "step": 61450 |
| }, |
| { |
| "epoch": 17.898928737773637, |
| "grad_norm": 0.3693258762359619, |
| "learning_rate": 0.00038544321490972626, |
| "loss": 3.2979, |
| "step": 61500 |
| }, |
| { |
| "epoch": 17.913483931066605, |
| "grad_norm": 0.37590938806533813, |
| "learning_rate": 0.0003852684915550378, |
| "loss": 3.2901, |
| "step": 61550 |
| }, |
| { |
| "epoch": 17.92803912435957, |
| "grad_norm": 0.41181281208992004, |
| "learning_rate": 0.0003850937682003494, |
| "loss": 3.2731, |
| "step": 61600 |
| }, |
| { |
| "epoch": 17.94259431765254, |
| "grad_norm": 0.36585724353790283, |
| "learning_rate": 0.000384919044845661, |
| "loss": 3.2941, |
| "step": 61650 |
| }, |
| { |
| "epoch": 17.957149510945506, |
| "grad_norm": 0.3707817792892456, |
| "learning_rate": 0.0003847443214909726, |
| "loss": 3.2831, |
| "step": 61700 |
| }, |
| { |
| "epoch": 17.971704704238473, |
| "grad_norm": 0.35417839884757996, |
| "learning_rate": 0.00038456959813628423, |
| "loss": 3.2863, |
| "step": 61750 |
| }, |
| { |
| "epoch": 17.98625989753144, |
| "grad_norm": 0.36485251784324646, |
| "learning_rate": 0.0003843948747815958, |
| "loss": 3.2936, |
| "step": 61800 |
| }, |
| { |
| "epoch": 18.000582207731718, |
| "grad_norm": 0.3956283628940582, |
| "learning_rate": 0.00038422015142690737, |
| "loss": 3.2755, |
| "step": 61850 |
| }, |
| { |
| "epoch": 18.015137401024685, |
| "grad_norm": 0.37683695554733276, |
| "learning_rate": 0.00038404542807221896, |
| "loss": 3.173, |
| "step": 61900 |
| }, |
| { |
| "epoch": 18.029692594317652, |
| "grad_norm": 0.35242313146591187, |
| "learning_rate": 0.0003838707047175305, |
| "loss": 3.1823, |
| "step": 61950 |
| }, |
| { |
| "epoch": 18.04424778761062, |
| "grad_norm": 0.3618054986000061, |
| "learning_rate": 0.00038369598136284215, |
| "loss": 3.1831, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.04424778761062, |
| "eval_accuracy": 0.37251043378035625, |
| "eval_loss": 3.550104856491089, |
| "eval_runtime": 80.6535, |
| "eval_samples_per_second": 206.451, |
| "eval_steps_per_second": 12.907, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.058802980903586, |
| "grad_norm": 0.40269213914871216, |
| "learning_rate": 0.00038352125800815374, |
| "loss": 3.1951, |
| "step": 62050 |
| }, |
| { |
| "epoch": 18.073358174196553, |
| "grad_norm": 0.37385231256484985, |
| "learning_rate": 0.00038334653465346534, |
| "loss": 3.1966, |
| "step": 62100 |
| }, |
| { |
| "epoch": 18.08791336748952, |
| "grad_norm": 0.4258227050304413, |
| "learning_rate": 0.0003831718112987769, |
| "loss": 3.2037, |
| "step": 62150 |
| }, |
| { |
| "epoch": 18.102468560782487, |
| "grad_norm": 0.381752610206604, |
| "learning_rate": 0.0003829970879440885, |
| "loss": 3.2049, |
| "step": 62200 |
| }, |
| { |
| "epoch": 18.117023754075454, |
| "grad_norm": 0.379851758480072, |
| "learning_rate": 0.00038282236458940007, |
| "loss": 3.2094, |
| "step": 62250 |
| }, |
| { |
| "epoch": 18.13157894736842, |
| "grad_norm": 0.3839494287967682, |
| "learning_rate": 0.0003826476412347117, |
| "loss": 3.2004, |
| "step": 62300 |
| }, |
| { |
| "epoch": 18.14613414066139, |
| "grad_norm": 0.40136855840682983, |
| "learning_rate": 0.00038247291788002326, |
| "loss": 3.2027, |
| "step": 62350 |
| }, |
| { |
| "epoch": 18.160689333954355, |
| "grad_norm": 0.4199560284614563, |
| "learning_rate": 0.00038229819452533485, |
| "loss": 3.221, |
| "step": 62400 |
| }, |
| { |
| "epoch": 18.175244527247322, |
| "grad_norm": 0.39128363132476807, |
| "learning_rate": 0.00038212347117064644, |
| "loss": 3.2177, |
| "step": 62450 |
| }, |
| { |
| "epoch": 18.18979972054029, |
| "grad_norm": 0.4027943015098572, |
| "learning_rate": 0.000381948747815958, |
| "loss": 3.2189, |
| "step": 62500 |
| }, |
| { |
| "epoch": 18.204354913833257, |
| "grad_norm": 0.378438800573349, |
| "learning_rate": 0.0003817740244612696, |
| "loss": 3.2159, |
| "step": 62550 |
| }, |
| { |
| "epoch": 18.218910107126224, |
| "grad_norm": 0.3851422369480133, |
| "learning_rate": 0.00038159930110658123, |
| "loss": 3.2244, |
| "step": 62600 |
| }, |
| { |
| "epoch": 18.23346530041919, |
| "grad_norm": 0.41445624828338623, |
| "learning_rate": 0.0003814245777518928, |
| "loss": 3.2207, |
| "step": 62650 |
| }, |
| { |
| "epoch": 18.248020493712158, |
| "grad_norm": 0.36086156964302063, |
| "learning_rate": 0.0003812498543972044, |
| "loss": 3.2302, |
| "step": 62700 |
| }, |
| { |
| "epoch": 18.262575687005125, |
| "grad_norm": 0.4074300527572632, |
| "learning_rate": 0.00038107513104251596, |
| "loss": 3.2286, |
| "step": 62750 |
| }, |
| { |
| "epoch": 18.277130880298092, |
| "grad_norm": 0.4142097234725952, |
| "learning_rate": 0.00038090040768782755, |
| "loss": 3.2381, |
| "step": 62800 |
| }, |
| { |
| "epoch": 18.29168607359106, |
| "grad_norm": 0.41950494050979614, |
| "learning_rate": 0.0003807256843331392, |
| "loss": 3.2346, |
| "step": 62850 |
| }, |
| { |
| "epoch": 18.306241266884022, |
| "grad_norm": 0.4051865041255951, |
| "learning_rate": 0.0003805509609784508, |
| "loss": 3.2352, |
| "step": 62900 |
| }, |
| { |
| "epoch": 18.32079646017699, |
| "grad_norm": 0.4207635223865509, |
| "learning_rate": 0.00038037623762376233, |
| "loss": 3.2263, |
| "step": 62950 |
| }, |
| { |
| "epoch": 18.335351653469957, |
| "grad_norm": 0.405259370803833, |
| "learning_rate": 0.00038020151426907393, |
| "loss": 3.2437, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.335351653469957, |
| "eval_accuracy": 0.37275395035717135, |
| "eval_loss": 3.5502216815948486, |
| "eval_runtime": 80.7031, |
| "eval_samples_per_second": 206.324, |
| "eval_steps_per_second": 12.899, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.349906846762924, |
| "grad_norm": 0.37889447808265686, |
| "learning_rate": 0.0003800267909143855, |
| "loss": 3.238, |
| "step": 63050 |
| }, |
| { |
| "epoch": 18.36446204005589, |
| "grad_norm": 0.40762853622436523, |
| "learning_rate": 0.00037985206755969706, |
| "loss": 3.2307, |
| "step": 63100 |
| }, |
| { |
| "epoch": 18.379017233348858, |
| "grad_norm": 0.3893378674983978, |
| "learning_rate": 0.0003796773442050087, |
| "loss": 3.249, |
| "step": 63150 |
| }, |
| { |
| "epoch": 18.393572426641825, |
| "grad_norm": 0.4034741520881653, |
| "learning_rate": 0.0003795026208503203, |
| "loss": 3.2398, |
| "step": 63200 |
| }, |
| { |
| "epoch": 18.408127619934792, |
| "grad_norm": 0.411862850189209, |
| "learning_rate": 0.0003793278974956319, |
| "loss": 3.2359, |
| "step": 63250 |
| }, |
| { |
| "epoch": 18.42268281322776, |
| "grad_norm": 0.373924195766449, |
| "learning_rate": 0.00037915317414094344, |
| "loss": 3.245, |
| "step": 63300 |
| }, |
| { |
| "epoch": 18.437238006520726, |
| "grad_norm": 0.3809811472892761, |
| "learning_rate": 0.00037897845078625503, |
| "loss": 3.2491, |
| "step": 63350 |
| }, |
| { |
| "epoch": 18.451793199813693, |
| "grad_norm": 0.37844493985176086, |
| "learning_rate": 0.0003788037274315667, |
| "loss": 3.2418, |
| "step": 63400 |
| }, |
| { |
| "epoch": 18.46634839310666, |
| "grad_norm": 0.3905605971813202, |
| "learning_rate": 0.0003786290040768783, |
| "loss": 3.2487, |
| "step": 63450 |
| }, |
| { |
| "epoch": 18.480903586399627, |
| "grad_norm": 0.3903323709964752, |
| "learning_rate": 0.0003784542807221898, |
| "loss": 3.2498, |
| "step": 63500 |
| }, |
| { |
| "epoch": 18.495458779692594, |
| "grad_norm": 0.40007704496383667, |
| "learning_rate": 0.0003782795573675014, |
| "loss": 3.2331, |
| "step": 63550 |
| }, |
| { |
| "epoch": 18.51001397298556, |
| "grad_norm": 0.38210824131965637, |
| "learning_rate": 0.000378104834012813, |
| "loss": 3.2545, |
| "step": 63600 |
| }, |
| { |
| "epoch": 18.52456916627853, |
| "grad_norm": 0.3924643099308014, |
| "learning_rate": 0.0003779301106581246, |
| "loss": 3.2437, |
| "step": 63650 |
| }, |
| { |
| "epoch": 18.539124359571495, |
| "grad_norm": 0.38086479902267456, |
| "learning_rate": 0.0003777553873034362, |
| "loss": 3.2669, |
| "step": 63700 |
| }, |
| { |
| "epoch": 18.553679552864462, |
| "grad_norm": 0.42106711864471436, |
| "learning_rate": 0.0003775806639487478, |
| "loss": 3.2493, |
| "step": 63750 |
| }, |
| { |
| "epoch": 18.56823474615743, |
| "grad_norm": 0.38551023602485657, |
| "learning_rate": 0.0003774059405940594, |
| "loss": 3.2768, |
| "step": 63800 |
| }, |
| { |
| "epoch": 18.582789939450397, |
| "grad_norm": 0.4239996075630188, |
| "learning_rate": 0.000377231217239371, |
| "loss": 3.251, |
| "step": 63850 |
| }, |
| { |
| "epoch": 18.597345132743364, |
| "grad_norm": 0.4009993374347687, |
| "learning_rate": 0.0003770564938846825, |
| "loss": 3.2492, |
| "step": 63900 |
| }, |
| { |
| "epoch": 18.61190032603633, |
| "grad_norm": 0.39667460322380066, |
| "learning_rate": 0.00037688177052999416, |
| "loss": 3.2585, |
| "step": 63950 |
| }, |
| { |
| "epoch": 18.626455519329298, |
| "grad_norm": 0.3974549472332001, |
| "learning_rate": 0.00037670704717530576, |
| "loss": 3.267, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.626455519329298, |
| "eval_accuracy": 0.3732502681679291, |
| "eval_loss": 3.5376315116882324, |
| "eval_runtime": 80.9542, |
| "eval_samples_per_second": 205.684, |
| "eval_steps_per_second": 12.859, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.641010712622265, |
| "grad_norm": 0.37749361991882324, |
| "learning_rate": 0.00037653232382061735, |
| "loss": 3.2566, |
| "step": 64050 |
| }, |
| { |
| "epoch": 18.655565905915232, |
| "grad_norm": 0.4106753468513489, |
| "learning_rate": 0.0003763576004659289, |
| "loss": 3.2468, |
| "step": 64100 |
| }, |
| { |
| "epoch": 18.6701210992082, |
| "grad_norm": 0.3997870683670044, |
| "learning_rate": 0.0003761828771112405, |
| "loss": 3.2728, |
| "step": 64150 |
| }, |
| { |
| "epoch": 18.684676292501166, |
| "grad_norm": 0.3999560475349426, |
| "learning_rate": 0.0003760081537565521, |
| "loss": 3.2733, |
| "step": 64200 |
| }, |
| { |
| "epoch": 18.69923148579413, |
| "grad_norm": 0.4081985354423523, |
| "learning_rate": 0.00037583343040186373, |
| "loss": 3.2674, |
| "step": 64250 |
| }, |
| { |
| "epoch": 18.713786679087097, |
| "grad_norm": 0.3831663429737091, |
| "learning_rate": 0.00037565870704717527, |
| "loss": 3.2682, |
| "step": 64300 |
| }, |
| { |
| "epoch": 18.728341872380064, |
| "grad_norm": 0.389864981174469, |
| "learning_rate": 0.00037548398369248687, |
| "loss": 3.2591, |
| "step": 64350 |
| }, |
| { |
| "epoch": 18.74289706567303, |
| "grad_norm": 0.37658265233039856, |
| "learning_rate": 0.00037530926033779846, |
| "loss": 3.2511, |
| "step": 64400 |
| }, |
| { |
| "epoch": 18.757452258965998, |
| "grad_norm": 0.40715891122817993, |
| "learning_rate": 0.00037513453698311, |
| "loss": 3.2826, |
| "step": 64450 |
| }, |
| { |
| "epoch": 18.772007452258965, |
| "grad_norm": 0.36773937940597534, |
| "learning_rate": 0.0003749598136284216, |
| "loss": 3.2746, |
| "step": 64500 |
| }, |
| { |
| "epoch": 18.786562645551932, |
| "grad_norm": 0.40152689814567566, |
| "learning_rate": 0.00037478509027373324, |
| "loss": 3.2742, |
| "step": 64550 |
| }, |
| { |
| "epoch": 18.8011178388449, |
| "grad_norm": 0.40193498134613037, |
| "learning_rate": 0.00037461036691904484, |
| "loss": 3.2675, |
| "step": 64600 |
| }, |
| { |
| "epoch": 18.815673032137866, |
| "grad_norm": 0.4008418917655945, |
| "learning_rate": 0.0003744356435643564, |
| "loss": 3.2675, |
| "step": 64650 |
| }, |
| { |
| "epoch": 18.830228225430833, |
| "grad_norm": 0.41167858242988586, |
| "learning_rate": 0.00037426092020966797, |
| "loss": 3.267, |
| "step": 64700 |
| }, |
| { |
| "epoch": 18.8447834187238, |
| "grad_norm": 0.4141235053539276, |
| "learning_rate": 0.00037408619685497957, |
| "loss": 3.2637, |
| "step": 64750 |
| }, |
| { |
| "epoch": 18.859338612016767, |
| "grad_norm": 0.4026458263397217, |
| "learning_rate": 0.0003739114735002912, |
| "loss": 3.2873, |
| "step": 64800 |
| }, |
| { |
| "epoch": 18.873893805309734, |
| "grad_norm": 0.392117440700531, |
| "learning_rate": 0.0003737367501456028, |
| "loss": 3.2686, |
| "step": 64850 |
| }, |
| { |
| "epoch": 18.8884489986027, |
| "grad_norm": 0.43712759017944336, |
| "learning_rate": 0.00037356202679091435, |
| "loss": 3.2692, |
| "step": 64900 |
| }, |
| { |
| "epoch": 18.90300419189567, |
| "grad_norm": 0.3687138855457306, |
| "learning_rate": 0.00037338730343622594, |
| "loss": 3.2665, |
| "step": 64950 |
| }, |
| { |
| "epoch": 18.917559385188635, |
| "grad_norm": 0.41365763545036316, |
| "learning_rate": 0.00037321258008153754, |
| "loss": 3.2656, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.917559385188635, |
| "eval_accuracy": 0.37336556245453895, |
| "eval_loss": 3.5345816612243652, |
| "eval_runtime": 80.6783, |
| "eval_samples_per_second": 206.388, |
| "eval_steps_per_second": 12.903, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.932114578481603, |
| "grad_norm": 0.40160247683525085, |
| "learning_rate": 0.0003730378567268491, |
| "loss": 3.2802, |
| "step": 65050 |
| }, |
| { |
| "epoch": 18.94666977177457, |
| "grad_norm": 0.3640778362751007, |
| "learning_rate": 0.0003728631333721607, |
| "loss": 3.2721, |
| "step": 65100 |
| }, |
| { |
| "epoch": 18.961224965067537, |
| "grad_norm": 0.3737909495830536, |
| "learning_rate": 0.0003726884100174723, |
| "loss": 3.2737, |
| "step": 65150 |
| }, |
| { |
| "epoch": 18.975780158360504, |
| "grad_norm": 0.40158703923225403, |
| "learning_rate": 0.0003725136866627839, |
| "loss": 3.2748, |
| "step": 65200 |
| }, |
| { |
| "epoch": 18.99033535165347, |
| "grad_norm": 0.3827795386314392, |
| "learning_rate": 0.00037233896330809545, |
| "loss": 3.2692, |
| "step": 65250 |
| }, |
| { |
| "epoch": 19.00465766185375, |
| "grad_norm": 0.46527865529060364, |
| "learning_rate": 0.00037216423995340705, |
| "loss": 3.2471, |
| "step": 65300 |
| }, |
| { |
| "epoch": 19.019212855146716, |
| "grad_norm": 0.4251295328140259, |
| "learning_rate": 0.0003719895165987187, |
| "loss": 3.1694, |
| "step": 65350 |
| }, |
| { |
| "epoch": 19.033768048439683, |
| "grad_norm": 0.39228615164756775, |
| "learning_rate": 0.0003718147932440303, |
| "loss": 3.1746, |
| "step": 65400 |
| }, |
| { |
| "epoch": 19.04832324173265, |
| "grad_norm": 0.4059659242630005, |
| "learning_rate": 0.00037164006988934183, |
| "loss": 3.1858, |
| "step": 65450 |
| }, |
| { |
| "epoch": 19.062878435025617, |
| "grad_norm": 0.3963252604007721, |
| "learning_rate": 0.0003714653465346534, |
| "loss": 3.1904, |
| "step": 65500 |
| }, |
| { |
| "epoch": 19.077433628318584, |
| "grad_norm": 0.47725194692611694, |
| "learning_rate": 0.000371290623179965, |
| "loss": 3.1765, |
| "step": 65550 |
| }, |
| { |
| "epoch": 19.09198882161155, |
| "grad_norm": 0.435397744178772, |
| "learning_rate": 0.00037111589982527656, |
| "loss": 3.1922, |
| "step": 65600 |
| }, |
| { |
| "epoch": 19.106544014904518, |
| "grad_norm": 0.4138501286506653, |
| "learning_rate": 0.0003709411764705882, |
| "loss": 3.1968, |
| "step": 65650 |
| }, |
| { |
| "epoch": 19.121099208197485, |
| "grad_norm": 0.3707689046859741, |
| "learning_rate": 0.0003707664531158998, |
| "loss": 3.1958, |
| "step": 65700 |
| }, |
| { |
| "epoch": 19.135654401490452, |
| "grad_norm": 0.3851637840270996, |
| "learning_rate": 0.0003705917297612114, |
| "loss": 3.2047, |
| "step": 65750 |
| }, |
| { |
| "epoch": 19.15020959478342, |
| "grad_norm": 0.3792588412761688, |
| "learning_rate": 0.000370417006406523, |
| "loss": 3.1999, |
| "step": 65800 |
| }, |
| { |
| "epoch": 19.164764788076386, |
| "grad_norm": 0.4595699608325958, |
| "learning_rate": 0.00037024228305183453, |
| "loss": 3.1949, |
| "step": 65850 |
| }, |
| { |
| "epoch": 19.179319981369353, |
| "grad_norm": 0.4074689745903015, |
| "learning_rate": 0.0003700675596971461, |
| "loss": 3.2022, |
| "step": 65900 |
| }, |
| { |
| "epoch": 19.19387517466232, |
| "grad_norm": 0.4259937107563019, |
| "learning_rate": 0.0003698928363424578, |
| "loss": 3.1989, |
| "step": 65950 |
| }, |
| { |
| "epoch": 19.208430367955287, |
| "grad_norm": 0.3872113823890686, |
| "learning_rate": 0.00036971811298776937, |
| "loss": 3.2097, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.208430367955287, |
| "eval_accuracy": 0.3726486458915216, |
| "eval_loss": 3.5505411624908447, |
| "eval_runtime": 80.7381, |
| "eval_samples_per_second": 206.235, |
| "eval_steps_per_second": 12.894, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.222985561248255, |
| "grad_norm": 0.4027664363384247, |
| "learning_rate": 0.0003695433896330809, |
| "loss": 3.2034, |
| "step": 66050 |
| }, |
| { |
| "epoch": 19.23754075454122, |
| "grad_norm": 0.40276873111724854, |
| "learning_rate": 0.0003693686662783925, |
| "loss": 3.1997, |
| "step": 66100 |
| }, |
| { |
| "epoch": 19.25209594783419, |
| "grad_norm": 0.4078857898712158, |
| "learning_rate": 0.0003691939429237041, |
| "loss": 3.2198, |
| "step": 66150 |
| }, |
| { |
| "epoch": 19.266651141127156, |
| "grad_norm": 0.42560309171676636, |
| "learning_rate": 0.00036901921956901575, |
| "loss": 3.2188, |
| "step": 66200 |
| }, |
| { |
| "epoch": 19.281206334420123, |
| "grad_norm": 0.4098570644855499, |
| "learning_rate": 0.0003688444962143273, |
| "loss": 3.2196, |
| "step": 66250 |
| }, |
| { |
| "epoch": 19.29576152771309, |
| "grad_norm": 0.398127943277359, |
| "learning_rate": 0.0003686697728596389, |
| "loss": 3.218, |
| "step": 66300 |
| }, |
| { |
| "epoch": 19.310316721006053, |
| "grad_norm": 0.39331769943237305, |
| "learning_rate": 0.0003684950495049505, |
| "loss": 3.2198, |
| "step": 66350 |
| }, |
| { |
| "epoch": 19.32487191429902, |
| "grad_norm": 0.3819613754749298, |
| "learning_rate": 0.000368320326150262, |
| "loss": 3.228, |
| "step": 66400 |
| }, |
| { |
| "epoch": 19.339427107591987, |
| "grad_norm": 0.41020679473876953, |
| "learning_rate": 0.0003681456027955736, |
| "loss": 3.2152, |
| "step": 66450 |
| }, |
| { |
| "epoch": 19.353982300884955, |
| "grad_norm": 0.3911140561103821, |
| "learning_rate": 0.00036797087944088526, |
| "loss": 3.2225, |
| "step": 66500 |
| }, |
| { |
| "epoch": 19.36853749417792, |
| "grad_norm": 0.39398056268692017, |
| "learning_rate": 0.00036779615608619685, |
| "loss": 3.2295, |
| "step": 66550 |
| }, |
| { |
| "epoch": 19.38309268747089, |
| "grad_norm": 0.4079400897026062, |
| "learning_rate": 0.0003676214327315084, |
| "loss": 3.2395, |
| "step": 66600 |
| }, |
| { |
| "epoch": 19.397647880763856, |
| "grad_norm": 0.3981572091579437, |
| "learning_rate": 0.00036744670937682, |
| "loss": 3.2415, |
| "step": 66650 |
| }, |
| { |
| "epoch": 19.412203074056823, |
| "grad_norm": 0.37718790769577026, |
| "learning_rate": 0.0003672719860221316, |
| "loss": 3.2304, |
| "step": 66700 |
| }, |
| { |
| "epoch": 19.42675826734979, |
| "grad_norm": 0.39809155464172363, |
| "learning_rate": 0.00036709726266744323, |
| "loss": 3.2231, |
| "step": 66750 |
| }, |
| { |
| "epoch": 19.441313460642757, |
| "grad_norm": 0.38468989729881287, |
| "learning_rate": 0.00036692253931275477, |
| "loss": 3.2352, |
| "step": 66800 |
| }, |
| { |
| "epoch": 19.455868653935724, |
| "grad_norm": 0.3992660343647003, |
| "learning_rate": 0.00036674781595806636, |
| "loss": 3.2358, |
| "step": 66850 |
| }, |
| { |
| "epoch": 19.47042384722869, |
| "grad_norm": 0.399263471364975, |
| "learning_rate": 0.00036657309260337796, |
| "loss": 3.2441, |
| "step": 66900 |
| }, |
| { |
| "epoch": 19.484979040521658, |
| "grad_norm": 0.364732563495636, |
| "learning_rate": 0.00036639836924868955, |
| "loss": 3.2357, |
| "step": 66950 |
| }, |
| { |
| "epoch": 19.499534233814625, |
| "grad_norm": 0.39381417632102966, |
| "learning_rate": 0.0003662236458940011, |
| "loss": 3.2486, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.499534233814625, |
| "eval_accuracy": 0.37292542269576845, |
| "eval_loss": 3.54436993598938, |
| "eval_runtime": 80.7256, |
| "eval_samples_per_second": 206.267, |
| "eval_steps_per_second": 12.896, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.514089427107592, |
| "grad_norm": 0.3713568449020386, |
| "learning_rate": 0.00036604892253931274, |
| "loss": 3.2264, |
| "step": 67050 |
| }, |
| { |
| "epoch": 19.52864462040056, |
| "grad_norm": 0.4108469486236572, |
| "learning_rate": 0.00036587419918462433, |
| "loss": 3.246, |
| "step": 67100 |
| }, |
| { |
| "epoch": 19.543199813693526, |
| "grad_norm": 0.4463003873825073, |
| "learning_rate": 0.00036569947582993593, |
| "loss": 3.2509, |
| "step": 67150 |
| }, |
| { |
| "epoch": 19.557755006986493, |
| "grad_norm": 0.3756413161754608, |
| "learning_rate": 0.00036552475247524747, |
| "loss": 3.2384, |
| "step": 67200 |
| }, |
| { |
| "epoch": 19.57231020027946, |
| "grad_norm": 0.40886735916137695, |
| "learning_rate": 0.00036535002912055906, |
| "loss": 3.2482, |
| "step": 67250 |
| }, |
| { |
| "epoch": 19.586865393572428, |
| "grad_norm": 0.40938055515289307, |
| "learning_rate": 0.00036517530576587066, |
| "loss": 3.2437, |
| "step": 67300 |
| }, |
| { |
| "epoch": 19.601420586865395, |
| "grad_norm": 0.38628724217414856, |
| "learning_rate": 0.0003650005824111823, |
| "loss": 3.2554, |
| "step": 67350 |
| }, |
| { |
| "epoch": 19.61597578015836, |
| "grad_norm": 0.47480347752571106, |
| "learning_rate": 0.00036482585905649385, |
| "loss": 3.2536, |
| "step": 67400 |
| }, |
| { |
| "epoch": 19.63053097345133, |
| "grad_norm": 0.38943493366241455, |
| "learning_rate": 0.00036465113570180544, |
| "loss": 3.2585, |
| "step": 67450 |
| }, |
| { |
| "epoch": 19.645086166744296, |
| "grad_norm": 0.37005797028541565, |
| "learning_rate": 0.00036447641234711703, |
| "loss": 3.2348, |
| "step": 67500 |
| }, |
| { |
| "epoch": 19.659641360037263, |
| "grad_norm": 0.37122735381126404, |
| "learning_rate": 0.0003643016889924286, |
| "loss": 3.252, |
| "step": 67550 |
| }, |
| { |
| "epoch": 19.67419655333023, |
| "grad_norm": 0.39388713240623474, |
| "learning_rate": 0.0003641269656377402, |
| "loss": 3.256, |
| "step": 67600 |
| }, |
| { |
| "epoch": 19.688751746623197, |
| "grad_norm": 0.37956586480140686, |
| "learning_rate": 0.0003639522422830518, |
| "loss": 3.2492, |
| "step": 67650 |
| }, |
| { |
| "epoch": 19.70330693991616, |
| "grad_norm": 0.40325236320495605, |
| "learning_rate": 0.0003637775189283634, |
| "loss": 3.2547, |
| "step": 67700 |
| }, |
| { |
| "epoch": 19.717862133209128, |
| "grad_norm": 0.40572822093963623, |
| "learning_rate": 0.00036360279557367495, |
| "loss": 3.2541, |
| "step": 67750 |
| }, |
| { |
| "epoch": 19.732417326502095, |
| "grad_norm": 0.38434678316116333, |
| "learning_rate": 0.00036342807221898655, |
| "loss": 3.2534, |
| "step": 67800 |
| }, |
| { |
| "epoch": 19.74697251979506, |
| "grad_norm": 0.38787752389907837, |
| "learning_rate": 0.00036325334886429814, |
| "loss": 3.2513, |
| "step": 67850 |
| }, |
| { |
| "epoch": 19.76152771308803, |
| "grad_norm": 0.40014466643333435, |
| "learning_rate": 0.0003630786255096098, |
| "loss": 3.2484, |
| "step": 67900 |
| }, |
| { |
| "epoch": 19.776082906380996, |
| "grad_norm": 0.3978581428527832, |
| "learning_rate": 0.00036290390215492133, |
| "loss": 3.2673, |
| "step": 67950 |
| }, |
| { |
| "epoch": 19.790638099673963, |
| "grad_norm": 0.3934991657733917, |
| "learning_rate": 0.0003627291788002329, |
| "loss": 3.2669, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.790638099673963, |
| "eval_accuracy": 0.3736332896562691, |
| "eval_loss": 3.5357913970947266, |
| "eval_runtime": 80.8543, |
| "eval_samples_per_second": 205.938, |
| "eval_steps_per_second": 12.875, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.80519329296693, |
| "grad_norm": 0.3822427988052368, |
| "learning_rate": 0.0003625544554455445, |
| "loss": 3.2595, |
| "step": 68050 |
| }, |
| { |
| "epoch": 19.819748486259897, |
| "grad_norm": 0.3646906316280365, |
| "learning_rate": 0.0003623797320908561, |
| "loss": 3.2499, |
| "step": 68100 |
| }, |
| { |
| "epoch": 19.834303679552864, |
| "grad_norm": 0.3811250925064087, |
| "learning_rate": 0.00036220500873616776, |
| "loss": 3.2587, |
| "step": 68150 |
| }, |
| { |
| "epoch": 19.84885887284583, |
| "grad_norm": 0.4192066192626953, |
| "learning_rate": 0.0003620302853814793, |
| "loss": 3.2607, |
| "step": 68200 |
| }, |
| { |
| "epoch": 19.863414066138798, |
| "grad_norm": 0.39267846941947937, |
| "learning_rate": 0.0003618555620267909, |
| "loss": 3.2504, |
| "step": 68250 |
| }, |
| { |
| "epoch": 19.877969259431765, |
| "grad_norm": 0.4115963876247406, |
| "learning_rate": 0.0003616808386721025, |
| "loss": 3.2494, |
| "step": 68300 |
| }, |
| { |
| "epoch": 19.892524452724732, |
| "grad_norm": 0.38645699620246887, |
| "learning_rate": 0.00036150611531741403, |
| "loss": 3.2564, |
| "step": 68350 |
| }, |
| { |
| "epoch": 19.9070796460177, |
| "grad_norm": 0.38116270303726196, |
| "learning_rate": 0.0003613313919627256, |
| "loss": 3.2698, |
| "step": 68400 |
| }, |
| { |
| "epoch": 19.921634839310666, |
| "grad_norm": 0.39199966192245483, |
| "learning_rate": 0.00036115666860803727, |
| "loss": 3.2674, |
| "step": 68450 |
| }, |
| { |
| "epoch": 19.936190032603633, |
| "grad_norm": 0.39350375533103943, |
| "learning_rate": 0.00036098194525334887, |
| "loss": 3.2659, |
| "step": 68500 |
| }, |
| { |
| "epoch": 19.9507452258966, |
| "grad_norm": 0.39801281690597534, |
| "learning_rate": 0.0003608072218986604, |
| "loss": 3.2631, |
| "step": 68550 |
| }, |
| { |
| "epoch": 19.965300419189568, |
| "grad_norm": 0.37695860862731934, |
| "learning_rate": 0.000360632498543972, |
| "loss": 3.2645, |
| "step": 68600 |
| }, |
| { |
| "epoch": 19.979855612482535, |
| "grad_norm": 0.3968189060688019, |
| "learning_rate": 0.0003604577751892836, |
| "loss": 3.2745, |
| "step": 68650 |
| }, |
| { |
| "epoch": 19.9944108057755, |
| "grad_norm": 0.4010901153087616, |
| "learning_rate": 0.00036028305183459513, |
| "loss": 3.2698, |
| "step": 68700 |
| }, |
| { |
| "epoch": 20.00873311597578, |
| "grad_norm": 0.403112530708313, |
| "learning_rate": 0.0003601083284799068, |
| "loss": 3.2089, |
| "step": 68750 |
| }, |
| { |
| "epoch": 20.023288309268747, |
| "grad_norm": 0.4048195481300354, |
| "learning_rate": 0.0003599336051252184, |
| "loss": 3.1616, |
| "step": 68800 |
| }, |
| { |
| "epoch": 20.037843502561714, |
| "grad_norm": 0.4083423316478729, |
| "learning_rate": 0.00035975888177052997, |
| "loss": 3.1626, |
| "step": 68850 |
| }, |
| { |
| "epoch": 20.05239869585468, |
| "grad_norm": 0.4024417996406555, |
| "learning_rate": 0.0003595841584158415, |
| "loss": 3.1614, |
| "step": 68900 |
| }, |
| { |
| "epoch": 20.066953889147648, |
| "grad_norm": 0.4063968062400818, |
| "learning_rate": 0.0003594094350611531, |
| "loss": 3.1729, |
| "step": 68950 |
| }, |
| { |
| "epoch": 20.081509082440615, |
| "grad_norm": 0.4128325283527374, |
| "learning_rate": 0.00035923471170646475, |
| "loss": 3.1694, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.081509082440615, |
| "eval_accuracy": 0.3729174308390004, |
| "eval_loss": 3.5497817993164062, |
| "eval_runtime": 80.6758, |
| "eval_samples_per_second": 206.394, |
| "eval_steps_per_second": 12.903, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.096064275733582, |
| "grad_norm": 0.42597904801368713, |
| "learning_rate": 0.00035905998835177635, |
| "loss": 3.1715, |
| "step": 69050 |
| }, |
| { |
| "epoch": 20.11061946902655, |
| "grad_norm": 0.3879733383655548, |
| "learning_rate": 0.00035888526499708794, |
| "loss": 3.1818, |
| "step": 69100 |
| }, |
| { |
| "epoch": 20.125174662319516, |
| "grad_norm": 0.39771685004234314, |
| "learning_rate": 0.0003587105416423995, |
| "loss": 3.1917, |
| "step": 69150 |
| }, |
| { |
| "epoch": 20.139729855612483, |
| "grad_norm": 0.4266234040260315, |
| "learning_rate": 0.0003585358182877111, |
| "loss": 3.1715, |
| "step": 69200 |
| }, |
| { |
| "epoch": 20.15428504890545, |
| "grad_norm": 0.3949197232723236, |
| "learning_rate": 0.00035836109493302267, |
| "loss": 3.1837, |
| "step": 69250 |
| }, |
| { |
| "epoch": 20.168840242198417, |
| "grad_norm": 0.37246423959732056, |
| "learning_rate": 0.0003581863715783343, |
| "loss": 3.1903, |
| "step": 69300 |
| }, |
| { |
| "epoch": 20.183395435491384, |
| "grad_norm": 0.4120253622531891, |
| "learning_rate": 0.00035801164822364586, |
| "loss": 3.1986, |
| "step": 69350 |
| }, |
| { |
| "epoch": 20.19795062878435, |
| "grad_norm": 0.3683554530143738, |
| "learning_rate": 0.00035783692486895745, |
| "loss": 3.1952, |
| "step": 69400 |
| }, |
| { |
| "epoch": 20.21250582207732, |
| "grad_norm": 0.40573716163635254, |
| "learning_rate": 0.00035766220151426905, |
| "loss": 3.2114, |
| "step": 69450 |
| }, |
| { |
| "epoch": 20.227061015370285, |
| "grad_norm": 0.41922950744628906, |
| "learning_rate": 0.0003574874781595806, |
| "loss": 3.1968, |
| "step": 69500 |
| }, |
| { |
| "epoch": 20.241616208663253, |
| "grad_norm": 0.40111833810806274, |
| "learning_rate": 0.00035731275480489224, |
| "loss": 3.2154, |
| "step": 69550 |
| }, |
| { |
| "epoch": 20.25617140195622, |
| "grad_norm": 0.4058787226676941, |
| "learning_rate": 0.00035713803145020383, |
| "loss": 3.2045, |
| "step": 69600 |
| }, |
| { |
| "epoch": 20.270726595249187, |
| "grad_norm": 0.38896775245666504, |
| "learning_rate": 0.0003569633080955154, |
| "loss": 3.2156, |
| "step": 69650 |
| }, |
| { |
| "epoch": 20.28528178854215, |
| "grad_norm": 0.40761247277259827, |
| "learning_rate": 0.00035678858474082697, |
| "loss": 3.2195, |
| "step": 69700 |
| }, |
| { |
| "epoch": 20.299836981835117, |
| "grad_norm": 0.401114284992218, |
| "learning_rate": 0.00035661386138613856, |
| "loss": 3.2073, |
| "step": 69750 |
| }, |
| { |
| "epoch": 20.314392175128084, |
| "grad_norm": 0.4161500036716461, |
| "learning_rate": 0.00035643913803145015, |
| "loss": 3.2107, |
| "step": 69800 |
| }, |
| { |
| "epoch": 20.32894736842105, |
| "grad_norm": 0.4027375876903534, |
| "learning_rate": 0.0003562644146767618, |
| "loss": 3.2106, |
| "step": 69850 |
| }, |
| { |
| "epoch": 20.34350256171402, |
| "grad_norm": 0.4335081875324249, |
| "learning_rate": 0.00035608969132207334, |
| "loss": 3.21, |
| "step": 69900 |
| }, |
| { |
| "epoch": 20.358057755006985, |
| "grad_norm": 0.3922847509384155, |
| "learning_rate": 0.00035591496796738494, |
| "loss": 3.21, |
| "step": 69950 |
| }, |
| { |
| "epoch": 20.372612948299953, |
| "grad_norm": 0.4095343351364136, |
| "learning_rate": 0.00035574024461269653, |
| "loss": 3.2119, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.372612948299953, |
| "eval_accuracy": 0.37324756503990464, |
| "eval_loss": 3.549363136291504, |
| "eval_runtime": 81.2803, |
| "eval_samples_per_second": 204.859, |
| "eval_steps_per_second": 12.808, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.38716814159292, |
| "grad_norm": 0.39625418186187744, |
| "learning_rate": 0.0003555655212580081, |
| "loss": 3.231, |
| "step": 70050 |
| }, |
| { |
| "epoch": 20.401723334885887, |
| "grad_norm": 0.378966361284256, |
| "learning_rate": 0.00035539079790331967, |
| "loss": 3.2265, |
| "step": 70100 |
| }, |
| { |
| "epoch": 20.416278528178854, |
| "grad_norm": 0.3928792476654053, |
| "learning_rate": 0.0003552160745486313, |
| "loss": 3.2201, |
| "step": 70150 |
| }, |
| { |
| "epoch": 20.43083372147182, |
| "grad_norm": 0.39280930161476135, |
| "learning_rate": 0.0003550413511939429, |
| "loss": 3.2298, |
| "step": 70200 |
| }, |
| { |
| "epoch": 20.445388914764788, |
| "grad_norm": 0.4404183030128479, |
| "learning_rate": 0.0003548666278392545, |
| "loss": 3.2202, |
| "step": 70250 |
| }, |
| { |
| "epoch": 20.459944108057755, |
| "grad_norm": 0.4224923253059387, |
| "learning_rate": 0.00035469190448456604, |
| "loss": 3.2235, |
| "step": 70300 |
| }, |
| { |
| "epoch": 20.474499301350722, |
| "grad_norm": 0.3936331868171692, |
| "learning_rate": 0.00035451718112987764, |
| "loss": 3.2351, |
| "step": 70350 |
| }, |
| { |
| "epoch": 20.48905449464369, |
| "grad_norm": 0.38486790657043457, |
| "learning_rate": 0.0003543424577751893, |
| "loss": 3.2319, |
| "step": 70400 |
| }, |
| { |
| "epoch": 20.503609687936656, |
| "grad_norm": 0.3987591862678528, |
| "learning_rate": 0.0003541677344205009, |
| "loss": 3.2132, |
| "step": 70450 |
| }, |
| { |
| "epoch": 20.518164881229623, |
| "grad_norm": 0.42211002111434937, |
| "learning_rate": 0.0003539930110658124, |
| "loss": 3.2283, |
| "step": 70500 |
| }, |
| { |
| "epoch": 20.53272007452259, |
| "grad_norm": 0.4197330176830292, |
| "learning_rate": 0.000353818287711124, |
| "loss": 3.239, |
| "step": 70550 |
| }, |
| { |
| "epoch": 20.547275267815557, |
| "grad_norm": 0.3813665807247162, |
| "learning_rate": 0.0003536435643564356, |
| "loss": 3.2301, |
| "step": 70600 |
| }, |
| { |
| "epoch": 20.561830461108524, |
| "grad_norm": 0.40063029527664185, |
| "learning_rate": 0.00035346884100174715, |
| "loss": 3.229, |
| "step": 70650 |
| }, |
| { |
| "epoch": 20.57638565440149, |
| "grad_norm": 0.39176955819129944, |
| "learning_rate": 0.0003532941176470588, |
| "loss": 3.2284, |
| "step": 70700 |
| }, |
| { |
| "epoch": 20.59094084769446, |
| "grad_norm": 0.4090353846549988, |
| "learning_rate": 0.0003531193942923704, |
| "loss": 3.2307, |
| "step": 70750 |
| }, |
| { |
| "epoch": 20.605496040987425, |
| "grad_norm": 0.42943519353866577, |
| "learning_rate": 0.000352944670937682, |
| "loss": 3.2411, |
| "step": 70800 |
| }, |
| { |
| "epoch": 20.620051234280393, |
| "grad_norm": 0.38608500361442566, |
| "learning_rate": 0.0003527699475829935, |
| "loss": 3.2443, |
| "step": 70850 |
| }, |
| { |
| "epoch": 20.63460642757336, |
| "grad_norm": 0.38904619216918945, |
| "learning_rate": 0.0003525952242283051, |
| "loss": 3.2351, |
| "step": 70900 |
| }, |
| { |
| "epoch": 20.649161620866327, |
| "grad_norm": 0.4100426733493805, |
| "learning_rate": 0.00035242050087361677, |
| "loss": 3.2224, |
| "step": 70950 |
| }, |
| { |
| "epoch": 20.663716814159294, |
| "grad_norm": 0.3781289756298065, |
| "learning_rate": 0.00035224577751892836, |
| "loss": 3.2432, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.663716814159294, |
| "eval_accuracy": 0.373671721085139, |
| "eval_loss": 3.5387377738952637, |
| "eval_runtime": 80.7695, |
| "eval_samples_per_second": 206.154, |
| "eval_steps_per_second": 12.889, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.678272007452257, |
| "grad_norm": 0.4063239097595215, |
| "learning_rate": 0.0003520710541642399, |
| "loss": 3.2369, |
| "step": 71050 |
| }, |
| { |
| "epoch": 20.692827200745224, |
| "grad_norm": 0.39835068583488464, |
| "learning_rate": 0.0003518963308095515, |
| "loss": 3.2454, |
| "step": 71100 |
| }, |
| { |
| "epoch": 20.70738239403819, |
| "grad_norm": 0.4016174376010895, |
| "learning_rate": 0.0003517216074548631, |
| "loss": 3.2432, |
| "step": 71150 |
| }, |
| { |
| "epoch": 20.72193758733116, |
| "grad_norm": 0.39168989658355713, |
| "learning_rate": 0.0003515468841001747, |
| "loss": 3.2452, |
| "step": 71200 |
| }, |
| { |
| "epoch": 20.736492780624125, |
| "grad_norm": 0.3941398561000824, |
| "learning_rate": 0.00035137216074548634, |
| "loss": 3.2391, |
| "step": 71250 |
| }, |
| { |
| "epoch": 20.751047973917093, |
| "grad_norm": 0.40801143646240234, |
| "learning_rate": 0.0003511974373907979, |
| "loss": 3.2518, |
| "step": 71300 |
| }, |
| { |
| "epoch": 20.76560316721006, |
| "grad_norm": 0.3803810477256775, |
| "learning_rate": 0.00035102271403610947, |
| "loss": 3.2532, |
| "step": 71350 |
| }, |
| { |
| "epoch": 20.780158360503027, |
| "grad_norm": 0.4036867022514343, |
| "learning_rate": 0.00035084799068142106, |
| "loss": 3.2342, |
| "step": 71400 |
| }, |
| { |
| "epoch": 20.794713553795994, |
| "grad_norm": 0.41173118352890015, |
| "learning_rate": 0.0003506732673267326, |
| "loss": 3.2547, |
| "step": 71450 |
| }, |
| { |
| "epoch": 20.80926874708896, |
| "grad_norm": 0.40105438232421875, |
| "learning_rate": 0.00035049854397204425, |
| "loss": 3.2528, |
| "step": 71500 |
| }, |
| { |
| "epoch": 20.823823940381928, |
| "grad_norm": 0.40136584639549255, |
| "learning_rate": 0.00035032382061735585, |
| "loss": 3.2499, |
| "step": 71550 |
| }, |
| { |
| "epoch": 20.838379133674895, |
| "grad_norm": 0.3959749937057495, |
| "learning_rate": 0.00035014909726266744, |
| "loss": 3.2437, |
| "step": 71600 |
| }, |
| { |
| "epoch": 20.852934326967862, |
| "grad_norm": 0.41055890917778015, |
| "learning_rate": 0.000349974373907979, |
| "loss": 3.2504, |
| "step": 71650 |
| }, |
| { |
| "epoch": 20.86748952026083, |
| "grad_norm": 0.3951647877693176, |
| "learning_rate": 0.0003497996505532906, |
| "loss": 3.242, |
| "step": 71700 |
| }, |
| { |
| "epoch": 20.882044713553796, |
| "grad_norm": 0.3959544599056244, |
| "learning_rate": 0.00034962492719860217, |
| "loss": 3.2547, |
| "step": 71750 |
| }, |
| { |
| "epoch": 20.896599906846763, |
| "grad_norm": 0.3922771215438843, |
| "learning_rate": 0.0003494502038439138, |
| "loss": 3.2431, |
| "step": 71800 |
| }, |
| { |
| "epoch": 20.91115510013973, |
| "grad_norm": 0.45298540592193604, |
| "learning_rate": 0.00034927548048922536, |
| "loss": 3.2549, |
| "step": 71850 |
| }, |
| { |
| "epoch": 20.925710293432697, |
| "grad_norm": 0.3808850049972534, |
| "learning_rate": 0.00034910075713453695, |
| "loss": 3.2601, |
| "step": 71900 |
| }, |
| { |
| "epoch": 20.940265486725664, |
| "grad_norm": 0.40494149923324585, |
| "learning_rate": 0.00034892603377984855, |
| "loss": 3.2613, |
| "step": 71950 |
| }, |
| { |
| "epoch": 20.95482068001863, |
| "grad_norm": 0.3943309187889099, |
| "learning_rate": 0.0003487513104251601, |
| "loss": 3.2457, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.95482068001863, |
| "eval_accuracy": 0.3740248906379041, |
| "eval_loss": 3.5320045948028564, |
| "eval_runtime": 80.6452, |
| "eval_samples_per_second": 206.472, |
| "eval_steps_per_second": 12.908, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.9693758733116, |
| "grad_norm": 0.4346085488796234, |
| "learning_rate": 0.0003485765870704717, |
| "loss": 3.2557, |
| "step": 72050 |
| }, |
| { |
| "epoch": 20.983931066604566, |
| "grad_norm": 0.42012107372283936, |
| "learning_rate": 0.00034840186371578333, |
| "loss": 3.2649, |
| "step": 72100 |
| }, |
| { |
| "epoch": 20.998486259897533, |
| "grad_norm": 0.39540523290634155, |
| "learning_rate": 0.0003482271403610949, |
| "loss": 3.2595, |
| "step": 72150 |
| }, |
| { |
| "epoch": 21.01280857009781, |
| "grad_norm": 0.41544434428215027, |
| "learning_rate": 0.0003480524170064065, |
| "loss": 3.1656, |
| "step": 72200 |
| }, |
| { |
| "epoch": 21.027363763390778, |
| "grad_norm": 0.43322494626045227, |
| "learning_rate": 0.00034787769365171806, |
| "loss": 3.1503, |
| "step": 72250 |
| }, |
| { |
| "epoch": 21.041918956683745, |
| "grad_norm": 0.40528708696365356, |
| "learning_rate": 0.00034770297029702965, |
| "loss": 3.1498, |
| "step": 72300 |
| }, |
| { |
| "epoch": 21.05647414997671, |
| "grad_norm": 0.4161442518234253, |
| "learning_rate": 0.0003475282469423413, |
| "loss": 3.157, |
| "step": 72350 |
| }, |
| { |
| "epoch": 21.07102934326968, |
| "grad_norm": 0.4052940905094147, |
| "learning_rate": 0.0003473535235876529, |
| "loss": 3.1702, |
| "step": 72400 |
| }, |
| { |
| "epoch": 21.085584536562646, |
| "grad_norm": 0.39261749386787415, |
| "learning_rate": 0.00034717880023296444, |
| "loss": 3.1794, |
| "step": 72450 |
| }, |
| { |
| "epoch": 21.100139729855613, |
| "grad_norm": 0.42689916491508484, |
| "learning_rate": 0.00034700407687827603, |
| "loss": 3.1775, |
| "step": 72500 |
| }, |
| { |
| "epoch": 21.11469492314858, |
| "grad_norm": 0.4157993793487549, |
| "learning_rate": 0.0003468293535235876, |
| "loss": 3.1759, |
| "step": 72550 |
| }, |
| { |
| "epoch": 21.129250116441547, |
| "grad_norm": 0.4013780951499939, |
| "learning_rate": 0.00034665463016889916, |
| "loss": 3.1695, |
| "step": 72600 |
| }, |
| { |
| "epoch": 21.143805309734514, |
| "grad_norm": 0.41451194882392883, |
| "learning_rate": 0.0003464799068142108, |
| "loss": 3.1736, |
| "step": 72650 |
| }, |
| { |
| "epoch": 21.15836050302748, |
| "grad_norm": 0.4027930796146393, |
| "learning_rate": 0.0003463051834595224, |
| "loss": 3.1837, |
| "step": 72700 |
| }, |
| { |
| "epoch": 21.172915696320448, |
| "grad_norm": 0.4034494459629059, |
| "learning_rate": 0.000346130460104834, |
| "loss": 3.1869, |
| "step": 72750 |
| }, |
| { |
| "epoch": 21.187470889613415, |
| "grad_norm": 0.4335845708847046, |
| "learning_rate": 0.00034595573675014554, |
| "loss": 3.1889, |
| "step": 72800 |
| }, |
| { |
| "epoch": 21.202026082906382, |
| "grad_norm": 0.40240973234176636, |
| "learning_rate": 0.00034578101339545714, |
| "loss": 3.1841, |
| "step": 72850 |
| }, |
| { |
| "epoch": 21.21658127619935, |
| "grad_norm": 0.40595680475234985, |
| "learning_rate": 0.0003456062900407688, |
| "loss": 3.1843, |
| "step": 72900 |
| }, |
| { |
| "epoch": 21.231136469492316, |
| "grad_norm": 0.41040468215942383, |
| "learning_rate": 0.0003454315666860804, |
| "loss": 3.1922, |
| "step": 72950 |
| }, |
| { |
| "epoch": 21.245691662785283, |
| "grad_norm": 0.4233982563018799, |
| "learning_rate": 0.0003452568433313919, |
| "loss": 3.2013, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.245691662785283, |
| "eval_accuracy": 0.3730792659385537, |
| "eval_loss": 3.5517797470092773, |
| "eval_runtime": 80.7172, |
| "eval_samples_per_second": 206.288, |
| "eval_steps_per_second": 12.897, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.260246856078247, |
| "grad_norm": 0.4166684150695801, |
| "learning_rate": 0.0003450821199767035, |
| "loss": 3.2078, |
| "step": 73050 |
| }, |
| { |
| "epoch": 21.274802049371214, |
| "grad_norm": 0.45311158895492554, |
| "learning_rate": 0.0003449073966220151, |
| "loss": 3.1969, |
| "step": 73100 |
| }, |
| { |
| "epoch": 21.28935724266418, |
| "grad_norm": 0.3919454514980316, |
| "learning_rate": 0.0003447326732673267, |
| "loss": 3.193, |
| "step": 73150 |
| }, |
| { |
| "epoch": 21.303912435957148, |
| "grad_norm": 0.412781298160553, |
| "learning_rate": 0.0003445579499126383, |
| "loss": 3.1928, |
| "step": 73200 |
| }, |
| { |
| "epoch": 21.318467629250115, |
| "grad_norm": 0.43045079708099365, |
| "learning_rate": 0.0003443832265579499, |
| "loss": 3.2015, |
| "step": 73250 |
| }, |
| { |
| "epoch": 21.333022822543082, |
| "grad_norm": 0.3964080810546875, |
| "learning_rate": 0.0003442085032032615, |
| "loss": 3.2069, |
| "step": 73300 |
| }, |
| { |
| "epoch": 21.34757801583605, |
| "grad_norm": 0.4025682508945465, |
| "learning_rate": 0.0003440337798485731, |
| "loss": 3.1937, |
| "step": 73350 |
| }, |
| { |
| "epoch": 21.362133209129016, |
| "grad_norm": 0.4110333025455475, |
| "learning_rate": 0.0003438590564938846, |
| "loss": 3.2084, |
| "step": 73400 |
| }, |
| { |
| "epoch": 21.376688402421983, |
| "grad_norm": 0.40080559253692627, |
| "learning_rate": 0.0003436843331391962, |
| "loss": 3.201, |
| "step": 73450 |
| }, |
| { |
| "epoch": 21.39124359571495, |
| "grad_norm": 0.4053649306297302, |
| "learning_rate": 0.00034350960978450786, |
| "loss": 3.2129, |
| "step": 73500 |
| }, |
| { |
| "epoch": 21.405798789007918, |
| "grad_norm": 0.41887786984443665, |
| "learning_rate": 0.00034333488642981946, |
| "loss": 3.202, |
| "step": 73550 |
| }, |
| { |
| "epoch": 21.420353982300885, |
| "grad_norm": 0.45456013083457947, |
| "learning_rate": 0.000343160163075131, |
| "loss": 3.2063, |
| "step": 73600 |
| }, |
| { |
| "epoch": 21.43490917559385, |
| "grad_norm": 0.39207208156585693, |
| "learning_rate": 0.0003429854397204426, |
| "loss": 3.2209, |
| "step": 73650 |
| }, |
| { |
| "epoch": 21.44946436888682, |
| "grad_norm": 0.3982885777950287, |
| "learning_rate": 0.0003428107163657542, |
| "loss": 3.2061, |
| "step": 73700 |
| }, |
| { |
| "epoch": 21.464019562179786, |
| "grad_norm": 0.3940013647079468, |
| "learning_rate": 0.00034263599301106583, |
| "loss": 3.2137, |
| "step": 73750 |
| }, |
| { |
| "epoch": 21.478574755472753, |
| "grad_norm": 0.37735414505004883, |
| "learning_rate": 0.0003424612696563774, |
| "loss": 3.2163, |
| "step": 73800 |
| }, |
| { |
| "epoch": 21.49312994876572, |
| "grad_norm": 0.40237730741500854, |
| "learning_rate": 0.00034228654630168897, |
| "loss": 3.2157, |
| "step": 73850 |
| }, |
| { |
| "epoch": 21.507685142058687, |
| "grad_norm": 0.40874889492988586, |
| "learning_rate": 0.00034211182294700056, |
| "loss": 3.2159, |
| "step": 73900 |
| }, |
| { |
| "epoch": 21.522240335351654, |
| "grad_norm": 0.39473971724510193, |
| "learning_rate": 0.0003419370995923121, |
| "loss": 3.212, |
| "step": 73950 |
| }, |
| { |
| "epoch": 21.53679552864462, |
| "grad_norm": 0.4250940978527069, |
| "learning_rate": 0.0003417623762376237, |
| "loss": 3.2214, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.53679552864462, |
| "eval_accuracy": 0.37359485822739913, |
| "eval_loss": 3.5439980030059814, |
| "eval_runtime": 80.9944, |
| "eval_samples_per_second": 205.582, |
| "eval_steps_per_second": 12.853, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.551350721937588, |
| "grad_norm": 0.41113966703414917, |
| "learning_rate": 0.00034158765288293534, |
| "loss": 3.2343, |
| "step": 74050 |
| }, |
| { |
| "epoch": 21.565905915230555, |
| "grad_norm": 0.42064738273620605, |
| "learning_rate": 0.00034141292952824694, |
| "loss": 3.2064, |
| "step": 74100 |
| }, |
| { |
| "epoch": 21.580461108523522, |
| "grad_norm": 0.4072885513305664, |
| "learning_rate": 0.0003412382061735585, |
| "loss": 3.2228, |
| "step": 74150 |
| }, |
| { |
| "epoch": 21.59501630181649, |
| "grad_norm": 0.38789334893226624, |
| "learning_rate": 0.0003410634828188701, |
| "loss": 3.2275, |
| "step": 74200 |
| }, |
| { |
| "epoch": 21.609571495109456, |
| "grad_norm": 0.44646182656288147, |
| "learning_rate": 0.00034088875946418167, |
| "loss": 3.233, |
| "step": 74250 |
| }, |
| { |
| "epoch": 21.624126688402423, |
| "grad_norm": 0.4055272936820984, |
| "learning_rate": 0.0003407140361094933, |
| "loss": 3.2255, |
| "step": 74300 |
| }, |
| { |
| "epoch": 21.63868188169539, |
| "grad_norm": 0.4098580479621887, |
| "learning_rate": 0.00034053931275480486, |
| "loss": 3.2336, |
| "step": 74350 |
| }, |
| { |
| "epoch": 21.653237074988354, |
| "grad_norm": 0.41825008392333984, |
| "learning_rate": 0.00034036458940011645, |
| "loss": 3.2368, |
| "step": 74400 |
| }, |
| { |
| "epoch": 21.66779226828132, |
| "grad_norm": 0.41744616627693176, |
| "learning_rate": 0.00034018986604542804, |
| "loss": 3.2365, |
| "step": 74450 |
| }, |
| { |
| "epoch": 21.682347461574288, |
| "grad_norm": 0.3983094096183777, |
| "learning_rate": 0.00034001514269073964, |
| "loss": 3.2272, |
| "step": 74500 |
| }, |
| { |
| "epoch": 21.696902654867255, |
| "grad_norm": 0.4330258369445801, |
| "learning_rate": 0.0003398404193360512, |
| "loss": 3.2393, |
| "step": 74550 |
| }, |
| { |
| "epoch": 21.711457848160222, |
| "grad_norm": 0.4024401903152466, |
| "learning_rate": 0.00033966569598136283, |
| "loss": 3.2359, |
| "step": 74600 |
| }, |
| { |
| "epoch": 21.72601304145319, |
| "grad_norm": 0.4028503894805908, |
| "learning_rate": 0.0003394909726266744, |
| "loss": 3.2444, |
| "step": 74650 |
| }, |
| { |
| "epoch": 21.740568234746156, |
| "grad_norm": 0.411708801984787, |
| "learning_rate": 0.000339316249271986, |
| "loss": 3.2328, |
| "step": 74700 |
| }, |
| { |
| "epoch": 21.755123428039123, |
| "grad_norm": 0.40173205733299255, |
| "learning_rate": 0.00033914152591729756, |
| "loss": 3.2289, |
| "step": 74750 |
| }, |
| { |
| "epoch": 21.76967862133209, |
| "grad_norm": 0.4219015836715698, |
| "learning_rate": 0.00033896680256260915, |
| "loss": 3.2468, |
| "step": 74800 |
| }, |
| { |
| "epoch": 21.784233814625058, |
| "grad_norm": 0.38005444407463074, |
| "learning_rate": 0.00033879207920792074, |
| "loss": 3.2404, |
| "step": 74850 |
| }, |
| { |
| "epoch": 21.798789007918025, |
| "grad_norm": 0.3999958336353302, |
| "learning_rate": 0.0003386173558532324, |
| "loss": 3.2385, |
| "step": 74900 |
| }, |
| { |
| "epoch": 21.81334420121099, |
| "grad_norm": 0.41185006499290466, |
| "learning_rate": 0.00033844263249854393, |
| "loss": 3.2282, |
| "step": 74950 |
| }, |
| { |
| "epoch": 21.82789939450396, |
| "grad_norm": 0.419685423374176, |
| "learning_rate": 0.00033826790914385553, |
| "loss": 3.2267, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.82789939450396, |
| "eval_accuracy": 0.3742968488226291, |
| "eval_loss": 3.533151865005493, |
| "eval_runtime": 80.7673, |
| "eval_samples_per_second": 206.16, |
| "eval_steps_per_second": 12.889, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.842454587796926, |
| "grad_norm": 0.40660983324050903, |
| "learning_rate": 0.0003380931857891671, |
| "loss": 3.2389, |
| "step": 75050 |
| }, |
| { |
| "epoch": 21.857009781089893, |
| "grad_norm": 0.38929277658462524, |
| "learning_rate": 0.00033791846243447866, |
| "loss": 3.2242, |
| "step": 75100 |
| }, |
| { |
| "epoch": 21.87156497438286, |
| "grad_norm": 0.4097066819667816, |
| "learning_rate": 0.0003377437390797903, |
| "loss": 3.2337, |
| "step": 75150 |
| }, |
| { |
| "epoch": 21.886120167675827, |
| "grad_norm": 0.418094664812088, |
| "learning_rate": 0.0003375690157251019, |
| "loss": 3.2386, |
| "step": 75200 |
| }, |
| { |
| "epoch": 21.900675360968794, |
| "grad_norm": 0.40189552307128906, |
| "learning_rate": 0.0003373942923704135, |
| "loss": 3.2332, |
| "step": 75250 |
| }, |
| { |
| "epoch": 21.91523055426176, |
| "grad_norm": 0.44740450382232666, |
| "learning_rate": 0.00033721956901572504, |
| "loss": 3.2361, |
| "step": 75300 |
| }, |
| { |
| "epoch": 21.929785747554728, |
| "grad_norm": 0.39397916197776794, |
| "learning_rate": 0.00033704484566103663, |
| "loss": 3.2436, |
| "step": 75350 |
| }, |
| { |
| "epoch": 21.944340940847695, |
| "grad_norm": 0.4027535021305084, |
| "learning_rate": 0.00033687012230634823, |
| "loss": 3.2537, |
| "step": 75400 |
| }, |
| { |
| "epoch": 21.958896134140662, |
| "grad_norm": 0.3945886492729187, |
| "learning_rate": 0.0003366953989516599, |
| "loss": 3.2489, |
| "step": 75450 |
| }, |
| { |
| "epoch": 21.97345132743363, |
| "grad_norm": 0.4445384442806244, |
| "learning_rate": 0.00033652067559697147, |
| "loss": 3.2561, |
| "step": 75500 |
| }, |
| { |
| "epoch": 21.988006520726596, |
| "grad_norm": 0.4155162572860718, |
| "learning_rate": 0.000336345952242283, |
| "loss": 3.2445, |
| "step": 75550 |
| }, |
| { |
| "epoch": 22.002328830926874, |
| "grad_norm": 0.4118129014968872, |
| "learning_rate": 0.0003361712288875946, |
| "loss": 3.237, |
| "step": 75600 |
| }, |
| { |
| "epoch": 22.01688402421984, |
| "grad_norm": 0.3969469368457794, |
| "learning_rate": 0.0003359965055329062, |
| "loss": 3.1362, |
| "step": 75650 |
| }, |
| { |
| "epoch": 22.03143921751281, |
| "grad_norm": 0.41340717673301697, |
| "learning_rate": 0.00033582178217821785, |
| "loss": 3.1479, |
| "step": 75700 |
| }, |
| { |
| "epoch": 22.045994410805775, |
| "grad_norm": 0.39047229290008545, |
| "learning_rate": 0.0003356470588235294, |
| "loss": 3.1411, |
| "step": 75750 |
| }, |
| { |
| "epoch": 22.060549604098743, |
| "grad_norm": 0.4004015028476715, |
| "learning_rate": 0.000335472335468841, |
| "loss": 3.1407, |
| "step": 75800 |
| }, |
| { |
| "epoch": 22.07510479739171, |
| "grad_norm": 0.3989570736885071, |
| "learning_rate": 0.0003352976121141526, |
| "loss": 3.1702, |
| "step": 75850 |
| }, |
| { |
| "epoch": 22.089659990684677, |
| "grad_norm": 0.3973100781440735, |
| "learning_rate": 0.0003351228887594641, |
| "loss": 3.1504, |
| "step": 75900 |
| }, |
| { |
| "epoch": 22.104215183977644, |
| "grad_norm": 0.4538271427154541, |
| "learning_rate": 0.0003349481654047757, |
| "loss": 3.1522, |
| "step": 75950 |
| }, |
| { |
| "epoch": 22.11877037727061, |
| "grad_norm": 0.4443368911743164, |
| "learning_rate": 0.00033477344205008736, |
| "loss": 3.1637, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.11877037727061, |
| "eval_accuracy": 0.37344700887718996, |
| "eval_loss": 3.5503103733062744, |
| "eval_runtime": 80.9432, |
| "eval_samples_per_second": 205.712, |
| "eval_steps_per_second": 12.861, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.133325570563578, |
| "grad_norm": 0.42081600427627563, |
| "learning_rate": 0.00033459871869539895, |
| "loss": 3.1631, |
| "step": 76050 |
| }, |
| { |
| "epoch": 22.147880763856545, |
| "grad_norm": 0.4120270311832428, |
| "learning_rate": 0.0003344239953407105, |
| "loss": 3.1674, |
| "step": 76100 |
| }, |
| { |
| "epoch": 22.162435957149512, |
| "grad_norm": 0.4159543216228485, |
| "learning_rate": 0.0003342492719860221, |
| "loss": 3.1778, |
| "step": 76150 |
| }, |
| { |
| "epoch": 22.17699115044248, |
| "grad_norm": 0.4129338562488556, |
| "learning_rate": 0.0003340745486313337, |
| "loss": 3.1694, |
| "step": 76200 |
| }, |
| { |
| "epoch": 22.191546343735446, |
| "grad_norm": 0.41513359546661377, |
| "learning_rate": 0.0003338998252766453, |
| "loss": 3.1752, |
| "step": 76250 |
| }, |
| { |
| "epoch": 22.206101537028413, |
| "grad_norm": 0.4214684069156647, |
| "learning_rate": 0.00033372510192195687, |
| "loss": 3.1865, |
| "step": 76300 |
| }, |
| { |
| "epoch": 22.22065673032138, |
| "grad_norm": 0.4023610055446625, |
| "learning_rate": 0.00033355037856726847, |
| "loss": 3.1832, |
| "step": 76350 |
| }, |
| { |
| "epoch": 22.235211923614347, |
| "grad_norm": 0.44188499450683594, |
| "learning_rate": 0.00033337565521258006, |
| "loss": 3.1718, |
| "step": 76400 |
| }, |
| { |
| "epoch": 22.24976711690731, |
| "grad_norm": 0.40883350372314453, |
| "learning_rate": 0.00033320093185789165, |
| "loss": 3.1872, |
| "step": 76450 |
| }, |
| { |
| "epoch": 22.264322310200278, |
| "grad_norm": 0.4030850827693939, |
| "learning_rate": 0.0003330262085032032, |
| "loss": 3.1854, |
| "step": 76500 |
| }, |
| { |
| "epoch": 22.278877503493245, |
| "grad_norm": 0.42440441250801086, |
| "learning_rate": 0.00033285148514851484, |
| "loss": 3.1908, |
| "step": 76550 |
| }, |
| { |
| "epoch": 22.293432696786212, |
| "grad_norm": 0.4246731996536255, |
| "learning_rate": 0.00033267676179382644, |
| "loss": 3.1873, |
| "step": 76600 |
| }, |
| { |
| "epoch": 22.30798789007918, |
| "grad_norm": 0.41315269470214844, |
| "learning_rate": 0.00033250203843913803, |
| "loss": 3.1927, |
| "step": 76650 |
| }, |
| { |
| "epoch": 22.322543083372146, |
| "grad_norm": 0.46535906195640564, |
| "learning_rate": 0.00033232731508444957, |
| "loss": 3.1885, |
| "step": 76700 |
| }, |
| { |
| "epoch": 22.337098276665113, |
| "grad_norm": 0.4259510636329651, |
| "learning_rate": 0.00033215259172976117, |
| "loss": 3.2002, |
| "step": 76750 |
| }, |
| { |
| "epoch": 22.35165346995808, |
| "grad_norm": 0.39366328716278076, |
| "learning_rate": 0.00033197786837507276, |
| "loss": 3.2124, |
| "step": 76800 |
| }, |
| { |
| "epoch": 22.366208663251047, |
| "grad_norm": 0.4226546883583069, |
| "learning_rate": 0.0003318031450203844, |
| "loss": 3.1926, |
| "step": 76850 |
| }, |
| { |
| "epoch": 22.380763856544014, |
| "grad_norm": 0.42132630944252014, |
| "learning_rate": 0.00033162842166569595, |
| "loss": 3.1939, |
| "step": 76900 |
| }, |
| { |
| "epoch": 22.39531904983698, |
| "grad_norm": 0.44684508442878723, |
| "learning_rate": 0.00033145369831100754, |
| "loss": 3.2011, |
| "step": 76950 |
| }, |
| { |
| "epoch": 22.40987424312995, |
| "grad_norm": 0.4263452887535095, |
| "learning_rate": 0.00033127897495631914, |
| "loss": 3.2052, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.40987424312995, |
| "eval_accuracy": 0.37365256413435677, |
| "eval_loss": 3.5459089279174805, |
| "eval_runtime": 80.6619, |
| "eval_samples_per_second": 206.429, |
| "eval_steps_per_second": 12.906, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.424429436422916, |
| "grad_norm": 0.40241318941116333, |
| "learning_rate": 0.0003311042516016307, |
| "loss": 3.216, |
| "step": 77050 |
| }, |
| { |
| "epoch": 22.438984629715883, |
| "grad_norm": 0.4463818371295929, |
| "learning_rate": 0.0003309295282469423, |
| "loss": 3.206, |
| "step": 77100 |
| }, |
| { |
| "epoch": 22.45353982300885, |
| "grad_norm": 0.4178822338581085, |
| "learning_rate": 0.0003307548048922539, |
| "loss": 3.1992, |
| "step": 77150 |
| }, |
| { |
| "epoch": 22.468095016301817, |
| "grad_norm": 0.40396687388420105, |
| "learning_rate": 0.0003305800815375655, |
| "loss": 3.2046, |
| "step": 77200 |
| }, |
| { |
| "epoch": 22.482650209594784, |
| "grad_norm": 0.3951391577720642, |
| "learning_rate": 0.00033040535818287705, |
| "loss": 3.2082, |
| "step": 77250 |
| }, |
| { |
| "epoch": 22.49720540288775, |
| "grad_norm": 0.3887251913547516, |
| "learning_rate": 0.00033023063482818865, |
| "loss": 3.2187, |
| "step": 77300 |
| }, |
| { |
| "epoch": 22.511760596180718, |
| "grad_norm": 0.4324551224708557, |
| "learning_rate": 0.00033005591147350024, |
| "loss": 3.2267, |
| "step": 77350 |
| }, |
| { |
| "epoch": 22.526315789473685, |
| "grad_norm": 0.4100211262702942, |
| "learning_rate": 0.0003298811881188119, |
| "loss": 3.2095, |
| "step": 77400 |
| }, |
| { |
| "epoch": 22.540870982766652, |
| "grad_norm": 0.40569639205932617, |
| "learning_rate": 0.00032970646476412343, |
| "loss": 3.2245, |
| "step": 77450 |
| }, |
| { |
| "epoch": 22.55542617605962, |
| "grad_norm": 0.4006408154964447, |
| "learning_rate": 0.000329531741409435, |
| "loss": 3.2066, |
| "step": 77500 |
| }, |
| { |
| "epoch": 22.569981369352586, |
| "grad_norm": 0.4547775387763977, |
| "learning_rate": 0.0003293570180547466, |
| "loss": 3.2116, |
| "step": 77550 |
| }, |
| { |
| "epoch": 22.584536562645553, |
| "grad_norm": 0.4142012298107147, |
| "learning_rate": 0.0003291822947000582, |
| "loss": 3.2038, |
| "step": 77600 |
| }, |
| { |
| "epoch": 22.59909175593852, |
| "grad_norm": 0.4487040936946869, |
| "learning_rate": 0.00032900757134536975, |
| "loss": 3.2169, |
| "step": 77650 |
| }, |
| { |
| "epoch": 22.613646949231487, |
| "grad_norm": 0.4124731719493866, |
| "learning_rate": 0.0003288328479906814, |
| "loss": 3.2121, |
| "step": 77700 |
| }, |
| { |
| "epoch": 22.628202142524454, |
| "grad_norm": 0.41468000411987305, |
| "learning_rate": 0.000328658124635993, |
| "loss": 3.2247, |
| "step": 77750 |
| }, |
| { |
| "epoch": 22.64275733581742, |
| "grad_norm": 0.4135768413543701, |
| "learning_rate": 0.0003284834012813046, |
| "loss": 3.2156, |
| "step": 77800 |
| }, |
| { |
| "epoch": 22.657312529110385, |
| "grad_norm": 0.4599646031856537, |
| "learning_rate": 0.00032830867792661613, |
| "loss": 3.2172, |
| "step": 77850 |
| }, |
| { |
| "epoch": 22.671867722403352, |
| "grad_norm": 0.3863218426704407, |
| "learning_rate": 0.0003281339545719277, |
| "loss": 3.2299, |
| "step": 77900 |
| }, |
| { |
| "epoch": 22.68642291569632, |
| "grad_norm": 0.39928266406059265, |
| "learning_rate": 0.0003279592312172394, |
| "loss": 3.2216, |
| "step": 77950 |
| }, |
| { |
| "epoch": 22.700978108989286, |
| "grad_norm": 0.4237167537212372, |
| "learning_rate": 0.00032778450786255097, |
| "loss": 3.219, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.700978108989286, |
| "eval_accuracy": 0.3740679056316852, |
| "eval_loss": 3.536851406097412, |
| "eval_runtime": 80.9725, |
| "eval_samples_per_second": 205.638, |
| "eval_steps_per_second": 12.856, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.715533302282253, |
| "grad_norm": 0.47020766139030457, |
| "learning_rate": 0.0003276097845078625, |
| "loss": 3.2167, |
| "step": 78050 |
| }, |
| { |
| "epoch": 22.73008849557522, |
| "grad_norm": 0.3998420834541321, |
| "learning_rate": 0.0003274350611531741, |
| "loss": 3.2252, |
| "step": 78100 |
| }, |
| { |
| "epoch": 22.744643688868187, |
| "grad_norm": 0.4020930230617523, |
| "learning_rate": 0.0003272603377984857, |
| "loss": 3.2194, |
| "step": 78150 |
| }, |
| { |
| "epoch": 22.759198882161154, |
| "grad_norm": 0.4224435091018677, |
| "learning_rate": 0.00032708561444379724, |
| "loss": 3.2285, |
| "step": 78200 |
| }, |
| { |
| "epoch": 22.77375407545412, |
| "grad_norm": 0.4129151701927185, |
| "learning_rate": 0.0003269108910891089, |
| "loss": 3.2286, |
| "step": 78250 |
| }, |
| { |
| "epoch": 22.78830926874709, |
| "grad_norm": 0.48509684205055237, |
| "learning_rate": 0.0003267361677344205, |
| "loss": 3.232, |
| "step": 78300 |
| }, |
| { |
| "epoch": 22.802864462040056, |
| "grad_norm": 0.39431512355804443, |
| "learning_rate": 0.0003265614443797321, |
| "loss": 3.2227, |
| "step": 78350 |
| }, |
| { |
| "epoch": 22.817419655333023, |
| "grad_norm": 0.4320165514945984, |
| "learning_rate": 0.0003263867210250436, |
| "loss": 3.2245, |
| "step": 78400 |
| }, |
| { |
| "epoch": 22.83197484862599, |
| "grad_norm": 0.42519912123680115, |
| "learning_rate": 0.0003262119976703552, |
| "loss": 3.2311, |
| "step": 78450 |
| }, |
| { |
| "epoch": 22.846530041918957, |
| "grad_norm": 0.41585737466812134, |
| "learning_rate": 0.00032603727431566686, |
| "loss": 3.2226, |
| "step": 78500 |
| }, |
| { |
| "epoch": 22.861085235211924, |
| "grad_norm": 0.4260340631008148, |
| "learning_rate": 0.00032586255096097845, |
| "loss": 3.2291, |
| "step": 78550 |
| }, |
| { |
| "epoch": 22.87564042850489, |
| "grad_norm": 0.4221453070640564, |
| "learning_rate": 0.00032568782760629005, |
| "loss": 3.2287, |
| "step": 78600 |
| }, |
| { |
| "epoch": 22.890195621797858, |
| "grad_norm": 0.42393553256988525, |
| "learning_rate": 0.0003255131042516016, |
| "loss": 3.2342, |
| "step": 78650 |
| }, |
| { |
| "epoch": 22.904750815090825, |
| "grad_norm": 0.4393133223056793, |
| "learning_rate": 0.0003253383808969132, |
| "loss": 3.2276, |
| "step": 78700 |
| }, |
| { |
| "epoch": 22.919306008383792, |
| "grad_norm": 0.4100169837474823, |
| "learning_rate": 0.0003251636575422248, |
| "loss": 3.2379, |
| "step": 78750 |
| }, |
| { |
| "epoch": 22.93386120167676, |
| "grad_norm": 0.3902326226234436, |
| "learning_rate": 0.0003249889341875364, |
| "loss": 3.2289, |
| "step": 78800 |
| }, |
| { |
| "epoch": 22.948416394969726, |
| "grad_norm": 0.40721309185028076, |
| "learning_rate": 0.00032481421083284796, |
| "loss": 3.2481, |
| "step": 78850 |
| }, |
| { |
| "epoch": 22.962971588262693, |
| "grad_norm": 0.4227712154388428, |
| "learning_rate": 0.00032463948747815956, |
| "loss": 3.2341, |
| "step": 78900 |
| }, |
| { |
| "epoch": 22.97752678155566, |
| "grad_norm": 0.4186897873878479, |
| "learning_rate": 0.00032446476412347115, |
| "loss": 3.2335, |
| "step": 78950 |
| }, |
| { |
| "epoch": 22.992081974848627, |
| "grad_norm": 0.42580723762512207, |
| "learning_rate": 0.0003242900407687827, |
| "loss": 3.2287, |
| "step": 79000 |
| }, |
| { |
| "epoch": 22.992081974848627, |
| "eval_accuracy": 0.3746499008480888, |
| "eval_loss": 3.5309038162231445, |
| "eval_runtime": 80.8084, |
| "eval_samples_per_second": 206.055, |
| "eval_steps_per_second": 12.882, |
| "step": 79000 |
| }, |
| { |
| "epoch": 23.006404285048905, |
| "grad_norm": 0.4105288088321686, |
| "learning_rate": 0.00032411531741409434, |
| "loss": 3.1871, |
| "step": 79050 |
| }, |
| { |
| "epoch": 23.020959478341872, |
| "grad_norm": 0.4195365607738495, |
| "learning_rate": 0.00032394059405940593, |
| "loss": 3.1296, |
| "step": 79100 |
| }, |
| { |
| "epoch": 23.03551467163484, |
| "grad_norm": 0.4220030903816223, |
| "learning_rate": 0.00032376587070471753, |
| "loss": 3.1216, |
| "step": 79150 |
| }, |
| { |
| "epoch": 23.050069864927806, |
| "grad_norm": 0.39678916335105896, |
| "learning_rate": 0.00032359114735002907, |
| "loss": 3.1453, |
| "step": 79200 |
| }, |
| { |
| "epoch": 23.064625058220773, |
| "grad_norm": 0.4193277359008789, |
| "learning_rate": 0.00032341642399534066, |
| "loss": 3.152, |
| "step": 79250 |
| }, |
| { |
| "epoch": 23.07918025151374, |
| "grad_norm": 0.41241586208343506, |
| "learning_rate": 0.00032324170064065226, |
| "loss": 3.1419, |
| "step": 79300 |
| }, |
| { |
| "epoch": 23.093735444806708, |
| "grad_norm": 0.4127679169178009, |
| "learning_rate": 0.0003230669772859639, |
| "loss": 3.1593, |
| "step": 79350 |
| }, |
| { |
| "epoch": 23.108290638099675, |
| "grad_norm": 0.418415904045105, |
| "learning_rate": 0.00032289225393127545, |
| "loss": 3.1685, |
| "step": 79400 |
| }, |
| { |
| "epoch": 23.12284583139264, |
| "grad_norm": 0.4319279193878174, |
| "learning_rate": 0.00032271753057658704, |
| "loss": 3.1453, |
| "step": 79450 |
| }, |
| { |
| "epoch": 23.13740102468561, |
| "grad_norm": 0.4311137795448303, |
| "learning_rate": 0.00032254280722189863, |
| "loss": 3.1656, |
| "step": 79500 |
| }, |
| { |
| "epoch": 23.151956217978576, |
| "grad_norm": 0.4086917042732239, |
| "learning_rate": 0.00032236808386721023, |
| "loss": 3.17, |
| "step": 79550 |
| }, |
| { |
| "epoch": 23.166511411271543, |
| "grad_norm": 0.39349040389060974, |
| "learning_rate": 0.00032219336051252177, |
| "loss": 3.1663, |
| "step": 79600 |
| }, |
| { |
| "epoch": 23.18106660456451, |
| "grad_norm": 0.460382342338562, |
| "learning_rate": 0.0003220186371578334, |
| "loss": 3.1559, |
| "step": 79650 |
| }, |
| { |
| "epoch": 23.195621797857477, |
| "grad_norm": 0.41337937116622925, |
| "learning_rate": 0.000321843913803145, |
| "loss": 3.1628, |
| "step": 79700 |
| }, |
| { |
| "epoch": 23.210176991150444, |
| "grad_norm": 0.4093291759490967, |
| "learning_rate": 0.0003216691904484566, |
| "loss": 3.1612, |
| "step": 79750 |
| }, |
| { |
| "epoch": 23.22473218444341, |
| "grad_norm": 0.4589628577232361, |
| "learning_rate": 0.00032149446709376815, |
| "loss": 3.1779, |
| "step": 79800 |
| }, |
| { |
| "epoch": 23.239287377736375, |
| "grad_norm": 0.41005656123161316, |
| "learning_rate": 0.00032131974373907974, |
| "loss": 3.1664, |
| "step": 79850 |
| }, |
| { |
| "epoch": 23.25384257102934, |
| "grad_norm": 0.42591962218284607, |
| "learning_rate": 0.0003211450203843914, |
| "loss": 3.1826, |
| "step": 79900 |
| }, |
| { |
| "epoch": 23.26839776432231, |
| "grad_norm": 0.420188844203949, |
| "learning_rate": 0.000320970297029703, |
| "loss": 3.1726, |
| "step": 79950 |
| }, |
| { |
| "epoch": 23.282952957615276, |
| "grad_norm": 0.431878000497818, |
| "learning_rate": 0.0003207955736750145, |
| "loss": 3.1933, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.282952957615276, |
| "eval_accuracy": 0.3739259326467467, |
| "eval_loss": 3.5472726821899414, |
| "eval_runtime": 112.5166, |
| "eval_samples_per_second": 147.987, |
| "eval_steps_per_second": 9.252, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.297508150908243, |
| "grad_norm": 0.4298938810825348, |
| "learning_rate": 0.0003206208503203261, |
| "loss": 3.1757, |
| "step": 80050 |
| }, |
| { |
| "epoch": 23.31206334420121, |
| "grad_norm": 0.4381411373615265, |
| "learning_rate": 0.0003204461269656377, |
| "loss": 3.1799, |
| "step": 80100 |
| }, |
| { |
| "epoch": 23.326618537494177, |
| "grad_norm": 0.4479910135269165, |
| "learning_rate": 0.00032027140361094925, |
| "loss": 3.1872, |
| "step": 80150 |
| }, |
| { |
| "epoch": 23.341173730787144, |
| "grad_norm": 0.453216016292572, |
| "learning_rate": 0.0003200966802562609, |
| "loss": 3.1866, |
| "step": 80200 |
| }, |
| { |
| "epoch": 23.35572892408011, |
| "grad_norm": 0.39096808433532715, |
| "learning_rate": 0.0003199219569015725, |
| "loss": 3.1841, |
| "step": 80250 |
| }, |
| { |
| "epoch": 23.370284117373078, |
| "grad_norm": 0.4329829216003418, |
| "learning_rate": 0.0003197472335468841, |
| "loss": 3.182, |
| "step": 80300 |
| }, |
| { |
| "epoch": 23.384839310666045, |
| "grad_norm": 0.420579195022583, |
| "learning_rate": 0.00031957251019219563, |
| "loss": 3.1887, |
| "step": 80350 |
| }, |
| { |
| "epoch": 23.399394503959012, |
| "grad_norm": 0.43218526244163513, |
| "learning_rate": 0.0003193977868375072, |
| "loss": 3.1906, |
| "step": 80400 |
| }, |
| { |
| "epoch": 23.41394969725198, |
| "grad_norm": 0.43532559275627136, |
| "learning_rate": 0.00031922306348281887, |
| "loss": 3.1839, |
| "step": 80450 |
| }, |
| { |
| "epoch": 23.428504890544946, |
| "grad_norm": 0.4089663326740265, |
| "learning_rate": 0.00031904834012813047, |
| "loss": 3.1966, |
| "step": 80500 |
| }, |
| { |
| "epoch": 23.443060083837914, |
| "grad_norm": 0.42285382747650146, |
| "learning_rate": 0.000318873616773442, |
| "loss": 3.1952, |
| "step": 80550 |
| }, |
| { |
| "epoch": 23.45761527713088, |
| "grad_norm": 0.4344150424003601, |
| "learning_rate": 0.0003186988934187536, |
| "loss": 3.2047, |
| "step": 80600 |
| }, |
| { |
| "epoch": 23.472170470423848, |
| "grad_norm": 0.4165898263454437, |
| "learning_rate": 0.0003185241700640652, |
| "loss": 3.1949, |
| "step": 80650 |
| }, |
| { |
| "epoch": 23.486725663716815, |
| "grad_norm": 0.43037399649620056, |
| "learning_rate": 0.0003183494467093768, |
| "loss": 3.2025, |
| "step": 80700 |
| }, |
| { |
| "epoch": 23.50128085700978, |
| "grad_norm": 0.4395822286605835, |
| "learning_rate": 0.0003181747233546884, |
| "loss": 3.187, |
| "step": 80750 |
| }, |
| { |
| "epoch": 23.51583605030275, |
| "grad_norm": 0.4260178506374359, |
| "learning_rate": 0.000318, |
| "loss": 3.1952, |
| "step": 80800 |
| }, |
| { |
| "epoch": 23.530391243595716, |
| "grad_norm": 0.4306070804595947, |
| "learning_rate": 0.00031782527664531157, |
| "loss": 3.2104, |
| "step": 80850 |
| }, |
| { |
| "epoch": 23.544946436888683, |
| "grad_norm": 0.43296995759010315, |
| "learning_rate": 0.00031765055329062317, |
| "loss": 3.2045, |
| "step": 80900 |
| }, |
| { |
| "epoch": 23.55950163018165, |
| "grad_norm": 0.41853171586990356, |
| "learning_rate": 0.0003174758299359347, |
| "loss": 3.2064, |
| "step": 80950 |
| }, |
| { |
| "epoch": 23.574056823474617, |
| "grad_norm": 0.4107804596424103, |
| "learning_rate": 0.0003173011065812463, |
| "loss": 3.2034, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.574056823474617, |
| "eval_accuracy": 0.3741944825396146, |
| "eval_loss": 3.542091131210327, |
| "eval_runtime": 80.953, |
| "eval_samples_per_second": 205.687, |
| "eval_steps_per_second": 12.859, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.588612016767584, |
| "grad_norm": 0.43713459372520447, |
| "learning_rate": 0.00031712638322655795, |
| "loss": 3.2102, |
| "step": 81050 |
| }, |
| { |
| "epoch": 23.60316721006055, |
| "grad_norm": 0.4073716700077057, |
| "learning_rate": 0.00031695165987186954, |
| "loss": 3.2013, |
| "step": 81100 |
| }, |
| { |
| "epoch": 23.61772240335352, |
| "grad_norm": 0.4581024944782257, |
| "learning_rate": 0.0003167769365171811, |
| "loss": 3.2071, |
| "step": 81150 |
| }, |
| { |
| "epoch": 23.63227759664648, |
| "grad_norm": 0.39260411262512207, |
| "learning_rate": 0.0003166022131624927, |
| "loss": 3.216, |
| "step": 81200 |
| }, |
| { |
| "epoch": 23.64683278993945, |
| "grad_norm": 0.43183180689811707, |
| "learning_rate": 0.00031642748980780427, |
| "loss": 3.202, |
| "step": 81250 |
| }, |
| { |
| "epoch": 23.661387983232416, |
| "grad_norm": 0.39969465136528015, |
| "learning_rate": 0.0003162527664531159, |
| "loss": 3.2185, |
| "step": 81300 |
| }, |
| { |
| "epoch": 23.675943176525383, |
| "grad_norm": 0.4348007142543793, |
| "learning_rate": 0.00031607804309842746, |
| "loss": 3.2154, |
| "step": 81350 |
| }, |
| { |
| "epoch": 23.69049836981835, |
| "grad_norm": 0.4009442627429962, |
| "learning_rate": 0.00031590331974373905, |
| "loss": 3.2212, |
| "step": 81400 |
| }, |
| { |
| "epoch": 23.705053563111317, |
| "grad_norm": 0.40583693981170654, |
| "learning_rate": 0.00031572859638905065, |
| "loss": 3.2182, |
| "step": 81450 |
| }, |
| { |
| "epoch": 23.719608756404284, |
| "grad_norm": 0.417510449886322, |
| "learning_rate": 0.0003155538730343622, |
| "loss": 3.2228, |
| "step": 81500 |
| }, |
| { |
| "epoch": 23.73416394969725, |
| "grad_norm": 0.4350418448448181, |
| "learning_rate": 0.0003153791496796738, |
| "loss": 3.2255, |
| "step": 81550 |
| }, |
| { |
| "epoch": 23.74871914299022, |
| "grad_norm": 0.41379818320274353, |
| "learning_rate": 0.00031520442632498543, |
| "loss": 3.211, |
| "step": 81600 |
| }, |
| { |
| "epoch": 23.763274336283185, |
| "grad_norm": 0.41111648082733154, |
| "learning_rate": 0.000315029702970297, |
| "loss": 3.215, |
| "step": 81650 |
| }, |
| { |
| "epoch": 23.777829529576152, |
| "grad_norm": 0.41041308641433716, |
| "learning_rate": 0.0003148549796156086, |
| "loss": 3.2171, |
| "step": 81700 |
| }, |
| { |
| "epoch": 23.79238472286912, |
| "grad_norm": 0.4201701581478119, |
| "learning_rate": 0.00031468025626092016, |
| "loss": 3.2136, |
| "step": 81750 |
| }, |
| { |
| "epoch": 23.806939916162086, |
| "grad_norm": 0.40979939699172974, |
| "learning_rate": 0.00031450553290623175, |
| "loss": 3.2163, |
| "step": 81800 |
| }, |
| { |
| "epoch": 23.821495109455054, |
| "grad_norm": 0.46327435970306396, |
| "learning_rate": 0.0003143308095515434, |
| "loss": 3.2211, |
| "step": 81850 |
| }, |
| { |
| "epoch": 23.83605030274802, |
| "grad_norm": 0.4368557035923004, |
| "learning_rate": 0.000314156086196855, |
| "loss": 3.2125, |
| "step": 81900 |
| }, |
| { |
| "epoch": 23.850605496040988, |
| "grad_norm": 0.3848903775215149, |
| "learning_rate": 0.00031398136284216654, |
| "loss": 3.2265, |
| "step": 81950 |
| }, |
| { |
| "epoch": 23.865160689333955, |
| "grad_norm": 0.4104125499725342, |
| "learning_rate": 0.00031380663948747813, |
| "loss": 3.2039, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.865160689333955, |
| "eval_accuracy": 0.3744857152024273, |
| "eval_loss": 3.5353684425354004, |
| "eval_runtime": 80.7262, |
| "eval_samples_per_second": 206.265, |
| "eval_steps_per_second": 12.895, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.879715882626922, |
| "grad_norm": 0.42983537912368774, |
| "learning_rate": 0.0003136319161327897, |
| "loss": 3.2328, |
| "step": 82050 |
| }, |
| { |
| "epoch": 23.89427107591989, |
| "grad_norm": 0.4303259253501892, |
| "learning_rate": 0.00031345719277810127, |
| "loss": 3.2256, |
| "step": 82100 |
| }, |
| { |
| "epoch": 23.908826269212856, |
| "grad_norm": 0.396254301071167, |
| "learning_rate": 0.0003132824694234129, |
| "loss": 3.2234, |
| "step": 82150 |
| }, |
| { |
| "epoch": 23.923381462505823, |
| "grad_norm": 0.42929279804229736, |
| "learning_rate": 0.0003131077460687245, |
| "loss": 3.224, |
| "step": 82200 |
| }, |
| { |
| "epoch": 23.93793665579879, |
| "grad_norm": 0.41629648208618164, |
| "learning_rate": 0.0003129330227140361, |
| "loss": 3.2105, |
| "step": 82250 |
| }, |
| { |
| "epoch": 23.952491849091757, |
| "grad_norm": 0.43482571840286255, |
| "learning_rate": 0.00031275829935934764, |
| "loss": 3.2204, |
| "step": 82300 |
| }, |
| { |
| "epoch": 23.967047042384724, |
| "grad_norm": 0.4044232666492462, |
| "learning_rate": 0.00031258357600465924, |
| "loss": 3.2109, |
| "step": 82350 |
| }, |
| { |
| "epoch": 23.98160223567769, |
| "grad_norm": 0.4244318902492523, |
| "learning_rate": 0.00031240885264997083, |
| "loss": 3.2299, |
| "step": 82400 |
| }, |
| { |
| "epoch": 23.99615742897066, |
| "grad_norm": 0.4148140847682953, |
| "learning_rate": 0.0003122341292952825, |
| "loss": 3.223, |
| "step": 82450 |
| }, |
| { |
| "epoch": 24.010479739170936, |
| "grad_norm": 0.41952717304229736, |
| "learning_rate": 0.000312059405940594, |
| "loss": 3.1566, |
| "step": 82500 |
| }, |
| { |
| "epoch": 24.025034932463903, |
| "grad_norm": 0.42778280377388, |
| "learning_rate": 0.0003118846825859056, |
| "loss": 3.112, |
| "step": 82550 |
| }, |
| { |
| "epoch": 24.03959012575687, |
| "grad_norm": 0.4334464371204376, |
| "learning_rate": 0.0003117099592312172, |
| "loss": 3.1322, |
| "step": 82600 |
| }, |
| { |
| "epoch": 24.054145319049837, |
| "grad_norm": 0.4023035168647766, |
| "learning_rate": 0.0003115352358765288, |
| "loss": 3.1326, |
| "step": 82650 |
| }, |
| { |
| "epoch": 24.068700512342804, |
| "grad_norm": 0.4370701014995575, |
| "learning_rate": 0.0003113605125218404, |
| "loss": 3.1434, |
| "step": 82700 |
| }, |
| { |
| "epoch": 24.08325570563577, |
| "grad_norm": 0.4512668251991272, |
| "learning_rate": 0.000311185789167152, |
| "loss": 3.136, |
| "step": 82750 |
| }, |
| { |
| "epoch": 24.09781089892874, |
| "grad_norm": 0.4344898760318756, |
| "learning_rate": 0.0003110110658124636, |
| "loss": 3.1312, |
| "step": 82800 |
| }, |
| { |
| "epoch": 24.112366092221706, |
| "grad_norm": 0.43205738067626953, |
| "learning_rate": 0.0003108363424577752, |
| "loss": 3.1412, |
| "step": 82850 |
| }, |
| { |
| "epoch": 24.126921285514673, |
| "grad_norm": 0.4670618176460266, |
| "learning_rate": 0.0003106616191030867, |
| "loss": 3.1449, |
| "step": 82900 |
| }, |
| { |
| "epoch": 24.14147647880764, |
| "grad_norm": 0.4348713457584381, |
| "learning_rate": 0.0003104868957483983, |
| "loss": 3.1422, |
| "step": 82950 |
| }, |
| { |
| "epoch": 24.156031672100607, |
| "grad_norm": 0.4189150035381317, |
| "learning_rate": 0.00031031217239370996, |
| "loss": 3.1477, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.156031672100607, |
| "eval_accuracy": 0.373769973912464, |
| "eval_loss": 3.550147533416748, |
| "eval_runtime": 80.7406, |
| "eval_samples_per_second": 206.228, |
| "eval_steps_per_second": 12.893, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.170586865393574, |
| "grad_norm": 0.43405938148498535, |
| "learning_rate": 0.00031013744903902156, |
| "loss": 3.1574, |
| "step": 83050 |
| }, |
| { |
| "epoch": 24.18514205868654, |
| "grad_norm": 0.42298412322998047, |
| "learning_rate": 0.0003099627256843331, |
| "loss": 3.1557, |
| "step": 83100 |
| }, |
| { |
| "epoch": 24.199697251979508, |
| "grad_norm": 0.48131051659584045, |
| "learning_rate": 0.0003097880023296447, |
| "loss": 3.1597, |
| "step": 83150 |
| }, |
| { |
| "epoch": 24.21425244527247, |
| "grad_norm": 0.4325181543827057, |
| "learning_rate": 0.0003096132789749563, |
| "loss": 3.1609, |
| "step": 83200 |
| }, |
| { |
| "epoch": 24.22880763856544, |
| "grad_norm": 0.4598114490509033, |
| "learning_rate": 0.00030943855562026794, |
| "loss": 3.159, |
| "step": 83250 |
| }, |
| { |
| "epoch": 24.243362831858406, |
| "grad_norm": 0.42593055963516235, |
| "learning_rate": 0.0003092638322655795, |
| "loss": 3.1595, |
| "step": 83300 |
| }, |
| { |
| "epoch": 24.257918025151373, |
| "grad_norm": 0.44727107882499695, |
| "learning_rate": 0.00030908910891089107, |
| "loss": 3.1577, |
| "step": 83350 |
| }, |
| { |
| "epoch": 24.27247321844434, |
| "grad_norm": 0.4468149244785309, |
| "learning_rate": 0.00030891438555620266, |
| "loss": 3.1653, |
| "step": 83400 |
| }, |
| { |
| "epoch": 24.287028411737307, |
| "grad_norm": 0.44687238335609436, |
| "learning_rate": 0.0003087396622015142, |
| "loss": 3.1739, |
| "step": 83450 |
| }, |
| { |
| "epoch": 24.301583605030274, |
| "grad_norm": 0.4304378032684326, |
| "learning_rate": 0.0003085649388468258, |
| "loss": 3.183, |
| "step": 83500 |
| }, |
| { |
| "epoch": 24.31613879832324, |
| "grad_norm": 0.4155465364456177, |
| "learning_rate": 0.00030839021549213745, |
| "loss": 3.1836, |
| "step": 83550 |
| }, |
| { |
| "epoch": 24.330693991616208, |
| "grad_norm": 0.43251293897628784, |
| "learning_rate": 0.00030821549213744904, |
| "loss": 3.1771, |
| "step": 83600 |
| }, |
| { |
| "epoch": 24.345249184909175, |
| "grad_norm": 0.4176350235939026, |
| "learning_rate": 0.0003080407687827606, |
| "loss": 3.1774, |
| "step": 83650 |
| }, |
| { |
| "epoch": 24.359804378202142, |
| "grad_norm": 0.40085741877555847, |
| "learning_rate": 0.0003078660454280722, |
| "loss": 3.1868, |
| "step": 83700 |
| }, |
| { |
| "epoch": 24.37435957149511, |
| "grad_norm": 0.4163469076156616, |
| "learning_rate": 0.00030769132207338377, |
| "loss": 3.1761, |
| "step": 83750 |
| }, |
| { |
| "epoch": 24.388914764788076, |
| "grad_norm": 0.4343576431274414, |
| "learning_rate": 0.00030751659871869536, |
| "loss": 3.1805, |
| "step": 83800 |
| }, |
| { |
| "epoch": 24.403469958081043, |
| "grad_norm": 0.43104803562164307, |
| "learning_rate": 0.00030734187536400696, |
| "loss": 3.1765, |
| "step": 83850 |
| }, |
| { |
| "epoch": 24.41802515137401, |
| "grad_norm": 0.44864028692245483, |
| "learning_rate": 0.00030716715200931855, |
| "loss": 3.1841, |
| "step": 83900 |
| }, |
| { |
| "epoch": 24.432580344666977, |
| "grad_norm": 0.40288910269737244, |
| "learning_rate": 0.00030699242865463015, |
| "loss": 3.1825, |
| "step": 83950 |
| }, |
| { |
| "epoch": 24.447135537959944, |
| "grad_norm": 0.4211686849594116, |
| "learning_rate": 0.00030681770529994174, |
| "loss": 3.1925, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.447135537959944, |
| "eval_accuracy": 0.37384166556876575, |
| "eval_loss": 3.5458691120147705, |
| "eval_runtime": 81.0767, |
| "eval_samples_per_second": 205.373, |
| "eval_steps_per_second": 12.84, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.46169073125291, |
| "grad_norm": 0.4207664728164673, |
| "learning_rate": 0.0003066429819452533, |
| "loss": 3.1823, |
| "step": 84050 |
| }, |
| { |
| "epoch": 24.47624592454588, |
| "grad_norm": 0.4400728642940521, |
| "learning_rate": 0.00030646825859056493, |
| "loss": 3.1884, |
| "step": 84100 |
| }, |
| { |
| "epoch": 24.490801117838846, |
| "grad_norm": 0.42612317204475403, |
| "learning_rate": 0.0003062935352358765, |
| "loss": 3.1869, |
| "step": 84150 |
| }, |
| { |
| "epoch": 24.505356311131813, |
| "grad_norm": 0.4102015495300293, |
| "learning_rate": 0.0003061188118811881, |
| "loss": 3.1981, |
| "step": 84200 |
| }, |
| { |
| "epoch": 24.51991150442478, |
| "grad_norm": 0.41967734694480896, |
| "learning_rate": 0.00030594408852649966, |
| "loss": 3.1881, |
| "step": 84250 |
| }, |
| { |
| "epoch": 24.534466697717747, |
| "grad_norm": 0.4522620737552643, |
| "learning_rate": 0.00030576936517181125, |
| "loss": 3.2006, |
| "step": 84300 |
| }, |
| { |
| "epoch": 24.549021891010714, |
| "grad_norm": 0.43760499358177185, |
| "learning_rate": 0.00030559464181712285, |
| "loss": 3.1868, |
| "step": 84350 |
| }, |
| { |
| "epoch": 24.56357708430368, |
| "grad_norm": 0.43872225284576416, |
| "learning_rate": 0.0003054199184624345, |
| "loss": 3.2031, |
| "step": 84400 |
| }, |
| { |
| "epoch": 24.578132277596648, |
| "grad_norm": 0.4419388473033905, |
| "learning_rate": 0.00030524519510774604, |
| "loss": 3.1955, |
| "step": 84450 |
| }, |
| { |
| "epoch": 24.592687470889615, |
| "grad_norm": 0.4160863757133484, |
| "learning_rate": 0.00030507047175305763, |
| "loss": 3.19, |
| "step": 84500 |
| }, |
| { |
| "epoch": 24.60724266418258, |
| "grad_norm": 0.42310062050819397, |
| "learning_rate": 0.0003048957483983692, |
| "loss": 3.1962, |
| "step": 84550 |
| }, |
| { |
| "epoch": 24.621797857475546, |
| "grad_norm": 0.4006286561489105, |
| "learning_rate": 0.00030472102504368076, |
| "loss": 3.2033, |
| "step": 84600 |
| }, |
| { |
| "epoch": 24.636353050768513, |
| "grad_norm": 0.4215250015258789, |
| "learning_rate": 0.0003045463016889924, |
| "loss": 3.1891, |
| "step": 84650 |
| }, |
| { |
| "epoch": 24.65090824406148, |
| "grad_norm": 0.42358335852622986, |
| "learning_rate": 0.000304371578334304, |
| "loss": 3.1928, |
| "step": 84700 |
| }, |
| { |
| "epoch": 24.665463437354447, |
| "grad_norm": 0.46760815382003784, |
| "learning_rate": 0.0003041968549796156, |
| "loss": 3.2008, |
| "step": 84750 |
| }, |
| { |
| "epoch": 24.680018630647414, |
| "grad_norm": 0.40488535165786743, |
| "learning_rate": 0.00030402213162492714, |
| "loss": 3.1954, |
| "step": 84800 |
| }, |
| { |
| "epoch": 24.69457382394038, |
| "grad_norm": 0.4222564995288849, |
| "learning_rate": 0.00030384740827023874, |
| "loss": 3.2137, |
| "step": 84850 |
| }, |
| { |
| "epoch": 24.709129017233348, |
| "grad_norm": 0.38904741406440735, |
| "learning_rate": 0.00030367268491555033, |
| "loss": 3.2102, |
| "step": 84900 |
| }, |
| { |
| "epoch": 24.723684210526315, |
| "grad_norm": 0.413474977016449, |
| "learning_rate": 0.000303497961560862, |
| "loss": 3.197, |
| "step": 84950 |
| }, |
| { |
| "epoch": 24.738239403819282, |
| "grad_norm": 0.4101502001285553, |
| "learning_rate": 0.00030332323820617357, |
| "loss": 3.2035, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.738239403819282, |
| "eval_accuracy": 0.37460864876388894, |
| "eval_loss": 3.536444902420044, |
| "eval_runtime": 80.7649, |
| "eval_samples_per_second": 206.166, |
| "eval_steps_per_second": 12.889, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.75279459711225, |
| "grad_norm": 0.4247758388519287, |
| "learning_rate": 0.0003031485148514851, |
| "loss": 3.2134, |
| "step": 85050 |
| }, |
| { |
| "epoch": 24.767349790405216, |
| "grad_norm": 0.4600059688091278, |
| "learning_rate": 0.0003029737914967967, |
| "loss": 3.2086, |
| "step": 85100 |
| }, |
| { |
| "epoch": 24.781904983698183, |
| "grad_norm": 0.4369814693927765, |
| "learning_rate": 0.0003027990681421083, |
| "loss": 3.2068, |
| "step": 85150 |
| }, |
| { |
| "epoch": 24.79646017699115, |
| "grad_norm": 0.44090044498443604, |
| "learning_rate": 0.00030262434478741984, |
| "loss": 3.2051, |
| "step": 85200 |
| }, |
| { |
| "epoch": 24.811015370284117, |
| "grad_norm": 0.4195573031902313, |
| "learning_rate": 0.0003024496214327315, |
| "loss": 3.2186, |
| "step": 85250 |
| }, |
| { |
| "epoch": 24.825570563577084, |
| "grad_norm": 0.4263896942138672, |
| "learning_rate": 0.0003022748980780431, |
| "loss": 3.207, |
| "step": 85300 |
| }, |
| { |
| "epoch": 24.84012575687005, |
| "grad_norm": 0.4285949468612671, |
| "learning_rate": 0.0003021001747233547, |
| "loss": 3.2114, |
| "step": 85350 |
| }, |
| { |
| "epoch": 24.85468095016302, |
| "grad_norm": 0.40272489190101624, |
| "learning_rate": 0.0003019254513686662, |
| "loss": 3.2162, |
| "step": 85400 |
| }, |
| { |
| "epoch": 24.869236143455986, |
| "grad_norm": 0.4345651865005493, |
| "learning_rate": 0.0003017507280139778, |
| "loss": 3.2167, |
| "step": 85450 |
| }, |
| { |
| "epoch": 24.883791336748953, |
| "grad_norm": 0.4243450462818146, |
| "learning_rate": 0.00030157600465928946, |
| "loss": 3.2115, |
| "step": 85500 |
| }, |
| { |
| "epoch": 24.89834653004192, |
| "grad_norm": 0.48578545451164246, |
| "learning_rate": 0.00030140128130460106, |
| "loss": 3.2121, |
| "step": 85550 |
| }, |
| { |
| "epoch": 24.912901723334887, |
| "grad_norm": 0.4001424014568329, |
| "learning_rate": 0.0003012265579499126, |
| "loss": 3.22, |
| "step": 85600 |
| }, |
| { |
| "epoch": 24.927456916627854, |
| "grad_norm": 0.4222460389137268, |
| "learning_rate": 0.0003010518345952242, |
| "loss": 3.2037, |
| "step": 85650 |
| }, |
| { |
| "epoch": 24.94201210992082, |
| "grad_norm": 0.4225076735019684, |
| "learning_rate": 0.0003008771112405358, |
| "loss": 3.2165, |
| "step": 85700 |
| }, |
| { |
| "epoch": 24.956567303213788, |
| "grad_norm": 0.4223094880580902, |
| "learning_rate": 0.0003007023878858473, |
| "loss": 3.2222, |
| "step": 85750 |
| }, |
| { |
| "epoch": 24.971122496506755, |
| "grad_norm": 0.42355045676231384, |
| "learning_rate": 0.000300527664531159, |
| "loss": 3.2131, |
| "step": 85800 |
| }, |
| { |
| "epoch": 24.985677689799722, |
| "grad_norm": 0.4157885015010834, |
| "learning_rate": 0.00030035294117647057, |
| "loss": 3.2224, |
| "step": 85850 |
| }, |
| { |
| "epoch": 25.0, |
| "grad_norm": 1.2900067567825317, |
| "learning_rate": 0.00030017821782178216, |
| "loss": 3.2182, |
| "step": 85900 |
| }, |
| { |
| "epoch": 25.014555193292967, |
| "grad_norm": 0.4563400447368622, |
| "learning_rate": 0.00030000349446709376, |
| "loss": 3.1204, |
| "step": 85950 |
| }, |
| { |
| "epoch": 25.029110386585934, |
| "grad_norm": 0.4078611731529236, |
| "learning_rate": 0.00029982877111240535, |
| "loss": 3.1201, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.029110386585934, |
| "eval_accuracy": 0.373996096448078, |
| "eval_loss": 3.5485706329345703, |
| "eval_runtime": 81.302, |
| "eval_samples_per_second": 204.804, |
| "eval_steps_per_second": 12.804, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.0436655798789, |
| "grad_norm": 0.4429575204849243, |
| "learning_rate": 0.00029965404775771694, |
| "loss": 3.1123, |
| "step": 86050 |
| }, |
| { |
| "epoch": 25.05822077317187, |
| "grad_norm": 0.45872652530670166, |
| "learning_rate": 0.0002994793244030285, |
| "loss": 3.1305, |
| "step": 86100 |
| }, |
| { |
| "epoch": 25.072775966464835, |
| "grad_norm": 0.44822704792022705, |
| "learning_rate": 0.00029930460104834013, |
| "loss": 3.1239, |
| "step": 86150 |
| }, |
| { |
| "epoch": 25.087331159757802, |
| "grad_norm": 0.4247264564037323, |
| "learning_rate": 0.0002991298776936517, |
| "loss": 3.1238, |
| "step": 86200 |
| }, |
| { |
| "epoch": 25.10188635305077, |
| "grad_norm": 0.4382804036140442, |
| "learning_rate": 0.00029895515433896327, |
| "loss": 3.136, |
| "step": 86250 |
| }, |
| { |
| "epoch": 25.116441546343736, |
| "grad_norm": 0.43370020389556885, |
| "learning_rate": 0.00029878043098427486, |
| "loss": 3.1316, |
| "step": 86300 |
| }, |
| { |
| "epoch": 25.130996739636704, |
| "grad_norm": 0.4210694134235382, |
| "learning_rate": 0.00029860570762958646, |
| "loss": 3.1338, |
| "step": 86350 |
| }, |
| { |
| "epoch": 25.14555193292967, |
| "grad_norm": 0.4323810935020447, |
| "learning_rate": 0.00029843098427489805, |
| "loss": 3.1483, |
| "step": 86400 |
| }, |
| { |
| "epoch": 25.160107126222638, |
| "grad_norm": 0.42826271057128906, |
| "learning_rate": 0.00029825626092020964, |
| "loss": 3.1422, |
| "step": 86450 |
| }, |
| { |
| "epoch": 25.174662319515605, |
| "grad_norm": 0.41014766693115234, |
| "learning_rate": 0.00029808153756552124, |
| "loss": 3.137, |
| "step": 86500 |
| }, |
| { |
| "epoch": 25.189217512808572, |
| "grad_norm": 0.4239286184310913, |
| "learning_rate": 0.00029790681421083283, |
| "loss": 3.1598, |
| "step": 86550 |
| }, |
| { |
| "epoch": 25.203772706101535, |
| "grad_norm": 0.4386337101459503, |
| "learning_rate": 0.00029773209085614443, |
| "loss": 3.1498, |
| "step": 86600 |
| }, |
| { |
| "epoch": 25.218327899394502, |
| "grad_norm": 0.41120046377182007, |
| "learning_rate": 0.00029755736750145597, |
| "loss": 3.1447, |
| "step": 86650 |
| }, |
| { |
| "epoch": 25.23288309268747, |
| "grad_norm": 0.4621043801307678, |
| "learning_rate": 0.0002973826441467676, |
| "loss": 3.1409, |
| "step": 86700 |
| }, |
| { |
| "epoch": 25.247438285980436, |
| "grad_norm": 0.43041834235191345, |
| "learning_rate": 0.00029720792079207916, |
| "loss": 3.1516, |
| "step": 86750 |
| }, |
| { |
| "epoch": 25.261993479273404, |
| "grad_norm": 0.44179674983024597, |
| "learning_rate": 0.00029703319743739075, |
| "loss": 3.1575, |
| "step": 86800 |
| }, |
| { |
| "epoch": 25.27654867256637, |
| "grad_norm": 0.4127698838710785, |
| "learning_rate": 0.00029685847408270234, |
| "loss": 3.156, |
| "step": 86850 |
| }, |
| { |
| "epoch": 25.291103865859338, |
| "grad_norm": 0.43058666586875916, |
| "learning_rate": 0.00029668375072801394, |
| "loss": 3.1695, |
| "step": 86900 |
| }, |
| { |
| "epoch": 25.305659059152305, |
| "grad_norm": 0.4375165104866028, |
| "learning_rate": 0.00029650902737332553, |
| "loss": 3.1694, |
| "step": 86950 |
| }, |
| { |
| "epoch": 25.320214252445272, |
| "grad_norm": 0.41480395197868347, |
| "learning_rate": 0.00029633430401863713, |
| "loss": 3.1666, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.320214252445272, |
| "eval_accuracy": 0.3742105837804562, |
| "eval_loss": 3.551380157470703, |
| "eval_runtime": 81.1258, |
| "eval_samples_per_second": 205.249, |
| "eval_steps_per_second": 12.832, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.33476944573824, |
| "grad_norm": 0.411484032869339, |
| "learning_rate": 0.0002961595806639487, |
| "loss": 3.1611, |
| "step": 87050 |
| }, |
| { |
| "epoch": 25.349324639031206, |
| "grad_norm": 0.42418307065963745, |
| "learning_rate": 0.0002959848573092603, |
| "loss": 3.1661, |
| "step": 87100 |
| }, |
| { |
| "epoch": 25.363879832324173, |
| "grad_norm": 0.4323679506778717, |
| "learning_rate": 0.0002958101339545719, |
| "loss": 3.1743, |
| "step": 87150 |
| }, |
| { |
| "epoch": 25.37843502561714, |
| "grad_norm": 0.4265500009059906, |
| "learning_rate": 0.0002956354105998835, |
| "loss": 3.1673, |
| "step": 87200 |
| }, |
| { |
| "epoch": 25.392990218910107, |
| "grad_norm": 0.4311152994632721, |
| "learning_rate": 0.0002954606872451951, |
| "loss": 3.1649, |
| "step": 87250 |
| }, |
| { |
| "epoch": 25.407545412203074, |
| "grad_norm": 0.4068090617656708, |
| "learning_rate": 0.0002952859638905067, |
| "loss": 3.1745, |
| "step": 87300 |
| }, |
| { |
| "epoch": 25.42210060549604, |
| "grad_norm": 0.43490806221961975, |
| "learning_rate": 0.00029511124053581823, |
| "loss": 3.1863, |
| "step": 87350 |
| }, |
| { |
| "epoch": 25.43665579878901, |
| "grad_norm": 0.44787442684173584, |
| "learning_rate": 0.0002949365171811299, |
| "loss": 3.1737, |
| "step": 87400 |
| }, |
| { |
| "epoch": 25.451210992081975, |
| "grad_norm": 0.4076807498931885, |
| "learning_rate": 0.0002947617938264414, |
| "loss": 3.1773, |
| "step": 87450 |
| }, |
| { |
| "epoch": 25.465766185374942, |
| "grad_norm": 0.43871185183525085, |
| "learning_rate": 0.000294587070471753, |
| "loss": 3.1798, |
| "step": 87500 |
| }, |
| { |
| "epoch": 25.48032137866791, |
| "grad_norm": 0.43555015325546265, |
| "learning_rate": 0.0002944123471170646, |
| "loss": 3.1825, |
| "step": 87550 |
| }, |
| { |
| "epoch": 25.494876571960877, |
| "grad_norm": 0.45612266659736633, |
| "learning_rate": 0.0002942376237623762, |
| "loss": 3.1831, |
| "step": 87600 |
| }, |
| { |
| "epoch": 25.509431765253844, |
| "grad_norm": 0.4167602062225342, |
| "learning_rate": 0.0002940629004076878, |
| "loss": 3.1823, |
| "step": 87650 |
| }, |
| { |
| "epoch": 25.52398695854681, |
| "grad_norm": 0.4125515818595886, |
| "learning_rate": 0.0002938881770529994, |
| "loss": 3.1718, |
| "step": 87700 |
| }, |
| { |
| "epoch": 25.538542151839778, |
| "grad_norm": 0.404072642326355, |
| "learning_rate": 0.000293713453698311, |
| "loss": 3.1855, |
| "step": 87750 |
| }, |
| { |
| "epoch": 25.553097345132745, |
| "grad_norm": 0.44043952226638794, |
| "learning_rate": 0.0002935387303436226, |
| "loss": 3.1853, |
| "step": 87800 |
| }, |
| { |
| "epoch": 25.567652538425712, |
| "grad_norm": 0.4379127323627472, |
| "learning_rate": 0.0002933640069889342, |
| "loss": 3.1877, |
| "step": 87850 |
| }, |
| { |
| "epoch": 25.58220773171868, |
| "grad_norm": 0.42356306314468384, |
| "learning_rate": 0.0002931892836342457, |
| "loss": 3.1865, |
| "step": 87900 |
| }, |
| { |
| "epoch": 25.596762925011646, |
| "grad_norm": 0.4248952269554138, |
| "learning_rate": 0.00029301456027955736, |
| "loss": 3.1818, |
| "step": 87950 |
| }, |
| { |
| "epoch": 25.61131811830461, |
| "grad_norm": 0.4216628968715668, |
| "learning_rate": 0.0002928398369248689, |
| "loss": 3.1911, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.61131811830461, |
| "eval_accuracy": 0.3744086172900766, |
| "eval_loss": 3.5424342155456543, |
| "eval_runtime": 80.9438, |
| "eval_samples_per_second": 205.711, |
| "eval_steps_per_second": 12.861, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.625873311597577, |
| "grad_norm": 0.46165731549263, |
| "learning_rate": 0.0002926651135701805, |
| "loss": 3.1888, |
| "step": 88050 |
| }, |
| { |
| "epoch": 25.640428504890544, |
| "grad_norm": 0.4584922194480896, |
| "learning_rate": 0.00029249039021549215, |
| "loss": 3.1891, |
| "step": 88100 |
| }, |
| { |
| "epoch": 25.65498369818351, |
| "grad_norm": 0.42006292939186096, |
| "learning_rate": 0.0002923156668608037, |
| "loss": 3.1924, |
| "step": 88150 |
| }, |
| { |
| "epoch": 25.669538891476478, |
| "grad_norm": 0.42789939045906067, |
| "learning_rate": 0.0002921409435061153, |
| "loss": 3.1871, |
| "step": 88200 |
| }, |
| { |
| "epoch": 25.684094084769445, |
| "grad_norm": 0.431919664144516, |
| "learning_rate": 0.0002919662201514269, |
| "loss": 3.2093, |
| "step": 88250 |
| }, |
| { |
| "epoch": 25.698649278062412, |
| "grad_norm": 0.4444367587566376, |
| "learning_rate": 0.00029179149679673847, |
| "loss": 3.2044, |
| "step": 88300 |
| }, |
| { |
| "epoch": 25.71320447135538, |
| "grad_norm": 0.42872244119644165, |
| "learning_rate": 0.00029161677344205007, |
| "loss": 3.195, |
| "step": 88350 |
| }, |
| { |
| "epoch": 25.727759664648346, |
| "grad_norm": 0.4801745116710663, |
| "learning_rate": 0.00029144205008736166, |
| "loss": 3.199, |
| "step": 88400 |
| }, |
| { |
| "epoch": 25.742314857941313, |
| "grad_norm": 0.4228304624557495, |
| "learning_rate": 0.00029126732673267325, |
| "loss": 3.1929, |
| "step": 88450 |
| }, |
| { |
| "epoch": 25.75687005123428, |
| "grad_norm": 0.3924422264099121, |
| "learning_rate": 0.00029109260337798485, |
| "loss": 3.1874, |
| "step": 88500 |
| }, |
| { |
| "epoch": 25.771425244527247, |
| "grad_norm": 0.4239204525947571, |
| "learning_rate": 0.00029091788002329644, |
| "loss": 3.1935, |
| "step": 88550 |
| }, |
| { |
| "epoch": 25.785980437820214, |
| "grad_norm": 0.4381544589996338, |
| "learning_rate": 0.000290743156668608, |
| "loss": 3.1957, |
| "step": 88600 |
| }, |
| { |
| "epoch": 25.80053563111318, |
| "grad_norm": 0.4344896674156189, |
| "learning_rate": 0.00029056843331391963, |
| "loss": 3.2097, |
| "step": 88650 |
| }, |
| { |
| "epoch": 25.81509082440615, |
| "grad_norm": 0.46235960721969604, |
| "learning_rate": 0.00029039370995923117, |
| "loss": 3.2021, |
| "step": 88700 |
| }, |
| { |
| "epoch": 25.829646017699115, |
| "grad_norm": 0.42208707332611084, |
| "learning_rate": 0.00029021898660454277, |
| "loss": 3.1917, |
| "step": 88750 |
| }, |
| { |
| "epoch": 25.844201210992082, |
| "grad_norm": 0.4277696907520294, |
| "learning_rate": 0.00029004426324985436, |
| "loss": 3.2036, |
| "step": 88800 |
| }, |
| { |
| "epoch": 25.85875640428505, |
| "grad_norm": 0.48033884167671204, |
| "learning_rate": 0.00028986953989516595, |
| "loss": 3.1935, |
| "step": 88850 |
| }, |
| { |
| "epoch": 25.873311597578017, |
| "grad_norm": 0.4262528419494629, |
| "learning_rate": 0.00028969481654047755, |
| "loss": 3.2056, |
| "step": 88900 |
| }, |
| { |
| "epoch": 25.887866790870984, |
| "grad_norm": 0.4301201403141022, |
| "learning_rate": 0.00028952009318578914, |
| "loss": 3.205, |
| "step": 88950 |
| }, |
| { |
| "epoch": 25.90242198416395, |
| "grad_norm": 0.43471401929855347, |
| "learning_rate": 0.00028934536983110074, |
| "loss": 3.2058, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.90242198416395, |
| "eval_accuracy": 0.37485216534070404, |
| "eval_loss": 3.5316591262817383, |
| "eval_runtime": 81.2596, |
| "eval_samples_per_second": 204.911, |
| "eval_steps_per_second": 12.811, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.916977177456918, |
| "grad_norm": 0.40113165974617004, |
| "learning_rate": 0.00028917064647641233, |
| "loss": 3.2142, |
| "step": 89050 |
| }, |
| { |
| "epoch": 25.931532370749885, |
| "grad_norm": 0.43097642064094543, |
| "learning_rate": 0.0002889959231217239, |
| "loss": 3.2001, |
| "step": 89100 |
| }, |
| { |
| "epoch": 25.946087564042852, |
| "grad_norm": 0.41572996973991394, |
| "learning_rate": 0.0002888211997670355, |
| "loss": 3.2148, |
| "step": 89150 |
| }, |
| { |
| "epoch": 25.96064275733582, |
| "grad_norm": 0.4611375033855438, |
| "learning_rate": 0.0002886464764123471, |
| "loss": 3.2088, |
| "step": 89200 |
| }, |
| { |
| "epoch": 25.975197950628786, |
| "grad_norm": 0.42225733399391174, |
| "learning_rate": 0.0002884717530576587, |
| "loss": 3.2203, |
| "step": 89250 |
| }, |
| { |
| "epoch": 25.989753143921753, |
| "grad_norm": 0.42600035667419434, |
| "learning_rate": 0.00028829702970297025, |
| "loss": 3.2183, |
| "step": 89300 |
| }, |
| { |
| "epoch": 26.00407545412203, |
| "grad_norm": 0.43270421028137207, |
| "learning_rate": 0.0002881223063482819, |
| "loss": 3.1643, |
| "step": 89350 |
| }, |
| { |
| "epoch": 26.018630647414998, |
| "grad_norm": 0.4024989604949951, |
| "learning_rate": 0.00028794758299359344, |
| "loss": 3.1084, |
| "step": 89400 |
| }, |
| { |
| "epoch": 26.033185840707965, |
| "grad_norm": 0.42071107029914856, |
| "learning_rate": 0.00028777285963890503, |
| "loss": 3.1121, |
| "step": 89450 |
| }, |
| { |
| "epoch": 26.047741034000932, |
| "grad_norm": 0.48040148615837097, |
| "learning_rate": 0.0002875981362842166, |
| "loss": 3.1319, |
| "step": 89500 |
| }, |
| { |
| "epoch": 26.0622962272939, |
| "grad_norm": 0.44271162152290344, |
| "learning_rate": 0.0002874234129295282, |
| "loss": 3.1191, |
| "step": 89550 |
| }, |
| { |
| "epoch": 26.076851420586866, |
| "grad_norm": 0.43037092685699463, |
| "learning_rate": 0.0002872486895748398, |
| "loss": 3.1188, |
| "step": 89600 |
| }, |
| { |
| "epoch": 26.091406613879833, |
| "grad_norm": 0.4287130534648895, |
| "learning_rate": 0.0002870739662201514, |
| "loss": 3.1361, |
| "step": 89650 |
| }, |
| { |
| "epoch": 26.1059618071728, |
| "grad_norm": 0.44484972953796387, |
| "learning_rate": 0.000286899242865463, |
| "loss": 3.1284, |
| "step": 89700 |
| }, |
| { |
| "epoch": 26.120517000465767, |
| "grad_norm": 0.446280300617218, |
| "learning_rate": 0.0002867245195107746, |
| "loss": 3.1364, |
| "step": 89750 |
| }, |
| { |
| "epoch": 26.135072193758734, |
| "grad_norm": 0.4400237798690796, |
| "learning_rate": 0.0002865497961560862, |
| "loss": 3.1289, |
| "step": 89800 |
| }, |
| { |
| "epoch": 26.1496273870517, |
| "grad_norm": 0.5028014779090881, |
| "learning_rate": 0.00028637507280139773, |
| "loss": 3.1344, |
| "step": 89850 |
| }, |
| { |
| "epoch": 26.16418258034467, |
| "grad_norm": 0.43802520632743835, |
| "learning_rate": 0.0002862003494467094, |
| "loss": 3.1506, |
| "step": 89900 |
| }, |
| { |
| "epoch": 26.178737773637636, |
| "grad_norm": 0.4275193214416504, |
| "learning_rate": 0.0002860256260920209, |
| "loss": 3.1381, |
| "step": 89950 |
| }, |
| { |
| "epoch": 26.1932929669306, |
| "grad_norm": 0.40965625643730164, |
| "learning_rate": 0.0002858509027373325, |
| "loss": 3.1429, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.1932929669306, |
| "eval_accuracy": 0.37406802315899057, |
| "eval_loss": 3.5528557300567627, |
| "eval_runtime": 80.9649, |
| "eval_samples_per_second": 205.657, |
| "eval_steps_per_second": 12.857, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.207848160223566, |
| "grad_norm": 0.4387039840221405, |
| "learning_rate": 0.0002856761793826441, |
| "loss": 3.149, |
| "step": 90050 |
| }, |
| { |
| "epoch": 26.222403353516533, |
| "grad_norm": 0.4295991063117981, |
| "learning_rate": 0.0002855014560279557, |
| "loss": 3.1403, |
| "step": 90100 |
| }, |
| { |
| "epoch": 26.2369585468095, |
| "grad_norm": 0.4722476005554199, |
| "learning_rate": 0.0002853267326732673, |
| "loss": 3.1477, |
| "step": 90150 |
| }, |
| { |
| "epoch": 26.251513740102467, |
| "grad_norm": 0.4025357961654663, |
| "learning_rate": 0.0002851520093185789, |
| "loss": 3.1542, |
| "step": 90200 |
| }, |
| { |
| "epoch": 26.266068933395434, |
| "grad_norm": 0.4364268183708191, |
| "learning_rate": 0.0002849772859638905, |
| "loss": 3.1479, |
| "step": 90250 |
| }, |
| { |
| "epoch": 26.2806241266884, |
| "grad_norm": 0.4534520208835602, |
| "learning_rate": 0.0002848025626092021, |
| "loss": 3.1541, |
| "step": 90300 |
| }, |
| { |
| "epoch": 26.29517931998137, |
| "grad_norm": 0.4252232611179352, |
| "learning_rate": 0.0002846278392545137, |
| "loss": 3.1336, |
| "step": 90350 |
| }, |
| { |
| "epoch": 26.309734513274336, |
| "grad_norm": 0.4548446238040924, |
| "learning_rate": 0.00028445311589982527, |
| "loss": 3.1537, |
| "step": 90400 |
| }, |
| { |
| "epoch": 26.324289706567303, |
| "grad_norm": 0.4191991090774536, |
| "learning_rate": 0.00028427839254513686, |
| "loss": 3.1613, |
| "step": 90450 |
| }, |
| { |
| "epoch": 26.33884489986027, |
| "grad_norm": 0.44181784987449646, |
| "learning_rate": 0.00028410366919044846, |
| "loss": 3.1577, |
| "step": 90500 |
| }, |
| { |
| "epoch": 26.353400093153237, |
| "grad_norm": 0.43153467774391174, |
| "learning_rate": 0.00028392894583576, |
| "loss": 3.1516, |
| "step": 90550 |
| }, |
| { |
| "epoch": 26.367955286446204, |
| "grad_norm": 0.479218453168869, |
| "learning_rate": 0.00028375422248107165, |
| "loss": 3.1653, |
| "step": 90600 |
| }, |
| { |
| "epoch": 26.38251047973917, |
| "grad_norm": 0.454546183347702, |
| "learning_rate": 0.0002835794991263832, |
| "loss": 3.1591, |
| "step": 90650 |
| }, |
| { |
| "epoch": 26.397065673032138, |
| "grad_norm": 0.4449160397052765, |
| "learning_rate": 0.0002834047757716948, |
| "loss": 3.1657, |
| "step": 90700 |
| }, |
| { |
| "epoch": 26.411620866325105, |
| "grad_norm": 0.4321635365486145, |
| "learning_rate": 0.0002832300524170064, |
| "loss": 3.1687, |
| "step": 90750 |
| }, |
| { |
| "epoch": 26.426176059618072, |
| "grad_norm": 0.4208000898361206, |
| "learning_rate": 0.00028305532906231797, |
| "loss": 3.1763, |
| "step": 90800 |
| }, |
| { |
| "epoch": 26.44073125291104, |
| "grad_norm": 0.4508783221244812, |
| "learning_rate": 0.00028288060570762956, |
| "loss": 3.1613, |
| "step": 90850 |
| }, |
| { |
| "epoch": 26.455286446204006, |
| "grad_norm": 0.426058828830719, |
| "learning_rate": 0.00028270588235294116, |
| "loss": 3.1703, |
| "step": 90900 |
| }, |
| { |
| "epoch": 26.469841639496973, |
| "grad_norm": 0.4629392921924591, |
| "learning_rate": 0.00028253115899825275, |
| "loss": 3.1736, |
| "step": 90950 |
| }, |
| { |
| "epoch": 26.48439683278994, |
| "grad_norm": 0.4240282475948334, |
| "learning_rate": 0.0002823564356435643, |
| "loss": 3.174, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.48439683278994, |
| "eval_accuracy": 0.3747352256718184, |
| "eval_loss": 3.540186643600464, |
| "eval_runtime": 81.0892, |
| "eval_samples_per_second": 205.342, |
| "eval_steps_per_second": 12.838, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.498952026082907, |
| "grad_norm": 0.43761128187179565, |
| "learning_rate": 0.00028218171228887594, |
| "loss": 3.1817, |
| "step": 91050 |
| }, |
| { |
| "epoch": 26.513507219375875, |
| "grad_norm": 0.4533585011959076, |
| "learning_rate": 0.0002820069889341875, |
| "loss": 3.1722, |
| "step": 91100 |
| }, |
| { |
| "epoch": 26.52806241266884, |
| "grad_norm": 0.4645179510116577, |
| "learning_rate": 0.00028183226557949913, |
| "loss": 3.1618, |
| "step": 91150 |
| }, |
| { |
| "epoch": 26.54261760596181, |
| "grad_norm": 0.43274015188217163, |
| "learning_rate": 0.00028165754222481067, |
| "loss": 3.1659, |
| "step": 91200 |
| }, |
| { |
| "epoch": 26.557172799254776, |
| "grad_norm": 0.4452027380466461, |
| "learning_rate": 0.00028148281887012226, |
| "loss": 3.1752, |
| "step": 91250 |
| }, |
| { |
| "epoch": 26.571727992547743, |
| "grad_norm": 0.4114816188812256, |
| "learning_rate": 0.0002813080955154339, |
| "loss": 3.1745, |
| "step": 91300 |
| }, |
| { |
| "epoch": 26.586283185840706, |
| "grad_norm": 0.42357751727104187, |
| "learning_rate": 0.00028113337216074545, |
| "loss": 3.1679, |
| "step": 91350 |
| }, |
| { |
| "epoch": 26.600838379133673, |
| "grad_norm": 0.43339043855667114, |
| "learning_rate": 0.00028095864880605705, |
| "loss": 3.1821, |
| "step": 91400 |
| }, |
| { |
| "epoch": 26.61539357242664, |
| "grad_norm": 0.4122457504272461, |
| "learning_rate": 0.00028078392545136864, |
| "loss": 3.1751, |
| "step": 91450 |
| }, |
| { |
| "epoch": 26.629948765719607, |
| "grad_norm": 0.45421797037124634, |
| "learning_rate": 0.00028060920209668023, |
| "loss": 3.1877, |
| "step": 91500 |
| }, |
| { |
| "epoch": 26.644503959012575, |
| "grad_norm": 0.4309142231941223, |
| "learning_rate": 0.00028043447874199183, |
| "loss": 3.1805, |
| "step": 91550 |
| }, |
| { |
| "epoch": 26.65905915230554, |
| "grad_norm": 0.4100479185581207, |
| "learning_rate": 0.0002802597553873034, |
| "loss": 3.1762, |
| "step": 91600 |
| }, |
| { |
| "epoch": 26.67361434559851, |
| "grad_norm": 0.44715777039527893, |
| "learning_rate": 0.000280085032032615, |
| "loss": 3.1874, |
| "step": 91650 |
| }, |
| { |
| "epoch": 26.688169538891476, |
| "grad_norm": 0.43265244364738464, |
| "learning_rate": 0.00027991030867792656, |
| "loss": 3.18, |
| "step": 91700 |
| }, |
| { |
| "epoch": 26.702724732184443, |
| "grad_norm": 0.43956711888313293, |
| "learning_rate": 0.0002797355853232382, |
| "loss": 3.1809, |
| "step": 91750 |
| }, |
| { |
| "epoch": 26.71727992547741, |
| "grad_norm": 0.44515565037727356, |
| "learning_rate": 0.00027956086196854975, |
| "loss": 3.1833, |
| "step": 91800 |
| }, |
| { |
| "epoch": 26.731835118770377, |
| "grad_norm": 0.4233599305152893, |
| "learning_rate": 0.0002793861386138614, |
| "loss": 3.1806, |
| "step": 91850 |
| }, |
| { |
| "epoch": 26.746390312063344, |
| "grad_norm": 0.4397084712982178, |
| "learning_rate": 0.00027921141525917293, |
| "loss": 3.1943, |
| "step": 91900 |
| }, |
| { |
| "epoch": 26.76094550535631, |
| "grad_norm": 0.4042757749557495, |
| "learning_rate": 0.00027903669190448453, |
| "loss": 3.187, |
| "step": 91950 |
| }, |
| { |
| "epoch": 26.775500698649278, |
| "grad_norm": 0.4798316955566406, |
| "learning_rate": 0.0002788619685497961, |
| "loss": 3.1897, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.775500698649278, |
| "eval_accuracy": 0.3748420579924385, |
| "eval_loss": 3.5378575325012207, |
| "eval_runtime": 80.9637, |
| "eval_samples_per_second": 205.66, |
| "eval_steps_per_second": 12.858, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.790055891942245, |
| "grad_norm": 0.4386082887649536, |
| "learning_rate": 0.0002786872451951077, |
| "loss": 3.2016, |
| "step": 92050 |
| }, |
| { |
| "epoch": 26.804611085235212, |
| "grad_norm": 0.49084118008613586, |
| "learning_rate": 0.0002785125218404193, |
| "loss": 3.2039, |
| "step": 92100 |
| }, |
| { |
| "epoch": 26.81916627852818, |
| "grad_norm": 0.4145147204399109, |
| "learning_rate": 0.0002783377984857309, |
| "loss": 3.1862, |
| "step": 92150 |
| }, |
| { |
| "epoch": 26.833721471821146, |
| "grad_norm": 0.4321693480014801, |
| "learning_rate": 0.0002781630751310425, |
| "loss": 3.1935, |
| "step": 92200 |
| }, |
| { |
| "epoch": 26.848276665114113, |
| "grad_norm": 0.4226089417934418, |
| "learning_rate": 0.0002779883517763541, |
| "loss": 3.1908, |
| "step": 92250 |
| }, |
| { |
| "epoch": 26.86283185840708, |
| "grad_norm": 0.4028742015361786, |
| "learning_rate": 0.0002778136284216657, |
| "loss": 3.2032, |
| "step": 92300 |
| }, |
| { |
| "epoch": 26.877387051700047, |
| "grad_norm": 0.43310707807540894, |
| "learning_rate": 0.0002776389050669773, |
| "loss": 3.1871, |
| "step": 92350 |
| }, |
| { |
| "epoch": 26.891942244993015, |
| "grad_norm": 0.4450521171092987, |
| "learning_rate": 0.0002774641817122888, |
| "loss": 3.186, |
| "step": 92400 |
| }, |
| { |
| "epoch": 26.90649743828598, |
| "grad_norm": 0.42048415541648865, |
| "learning_rate": 0.00027728945835760047, |
| "loss": 3.1745, |
| "step": 92450 |
| }, |
| { |
| "epoch": 26.92105263157895, |
| "grad_norm": 0.4643785357475281, |
| "learning_rate": 0.000277114735002912, |
| "loss": 3.1977, |
| "step": 92500 |
| }, |
| { |
| "epoch": 26.935607824871916, |
| "grad_norm": 0.4301290512084961, |
| "learning_rate": 0.00027694001164822366, |
| "loss": 3.187, |
| "step": 92550 |
| }, |
| { |
| "epoch": 26.950163018164883, |
| "grad_norm": 0.44596704840660095, |
| "learning_rate": 0.0002767652882935352, |
| "loss": 3.193, |
| "step": 92600 |
| }, |
| { |
| "epoch": 26.96471821145785, |
| "grad_norm": 0.4527270793914795, |
| "learning_rate": 0.0002765905649388468, |
| "loss": 3.2039, |
| "step": 92650 |
| }, |
| { |
| "epoch": 26.979273404750813, |
| "grad_norm": 0.39088284969329834, |
| "learning_rate": 0.0002764158415841584, |
| "loss": 3.1918, |
| "step": 92700 |
| }, |
| { |
| "epoch": 26.99382859804378, |
| "grad_norm": 0.4439548850059509, |
| "learning_rate": 0.00027624111822947, |
| "loss": 3.2091, |
| "step": 92750 |
| }, |
| { |
| "epoch": 27.008150908244062, |
| "grad_norm": 0.4328957200050354, |
| "learning_rate": 0.0002760663948747816, |
| "loss": 3.1435, |
| "step": 92800 |
| }, |
| { |
| "epoch": 27.02270610153703, |
| "grad_norm": 0.47242867946624756, |
| "learning_rate": 0.00027589167152009317, |
| "loss": 3.1013, |
| "step": 92850 |
| }, |
| { |
| "epoch": 27.037261294829996, |
| "grad_norm": 0.4380267858505249, |
| "learning_rate": 0.00027571694816540477, |
| "loss": 3.1072, |
| "step": 92900 |
| }, |
| { |
| "epoch": 27.051816488122963, |
| "grad_norm": 0.4467245936393738, |
| "learning_rate": 0.0002755422248107163, |
| "loss": 3.0989, |
| "step": 92950 |
| }, |
| { |
| "epoch": 27.06637168141593, |
| "grad_norm": 0.4757293164730072, |
| "learning_rate": 0.00027536750145602795, |
| "loss": 3.1068, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.06637168141593, |
| "eval_accuracy": 0.37422022101950003, |
| "eval_loss": 3.5528392791748047, |
| "eval_runtime": 81.0939, |
| "eval_samples_per_second": 205.33, |
| "eval_steps_per_second": 12.837, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.080926874708897, |
| "grad_norm": 0.4595012664794922, |
| "learning_rate": 0.0002751927781013395, |
| "loss": 3.1148, |
| "step": 93050 |
| }, |
| { |
| "epoch": 27.095482068001864, |
| "grad_norm": 0.4173579216003418, |
| "learning_rate": 0.0002750180547466511, |
| "loss": 3.112, |
| "step": 93100 |
| }, |
| { |
| "epoch": 27.11003726129483, |
| "grad_norm": 0.4196181297302246, |
| "learning_rate": 0.0002748433313919627, |
| "loss": 3.1232, |
| "step": 93150 |
| }, |
| { |
| "epoch": 27.1245924545878, |
| "grad_norm": 0.46813368797302246, |
| "learning_rate": 0.0002746686080372743, |
| "loss": 3.1175, |
| "step": 93200 |
| }, |
| { |
| "epoch": 27.139147647880765, |
| "grad_norm": 0.4601026475429535, |
| "learning_rate": 0.00027449388468258587, |
| "loss": 3.1241, |
| "step": 93250 |
| }, |
| { |
| "epoch": 27.153702841173732, |
| "grad_norm": 0.453408807516098, |
| "learning_rate": 0.00027431916132789747, |
| "loss": 3.1106, |
| "step": 93300 |
| }, |
| { |
| "epoch": 27.168258034466696, |
| "grad_norm": 0.4332129955291748, |
| "learning_rate": 0.00027414443797320906, |
| "loss": 3.1427, |
| "step": 93350 |
| }, |
| { |
| "epoch": 27.182813227759663, |
| "grad_norm": 0.4542979896068573, |
| "learning_rate": 0.00027396971461852065, |
| "loss": 3.1294, |
| "step": 93400 |
| }, |
| { |
| "epoch": 27.19736842105263, |
| "grad_norm": 0.42211589217185974, |
| "learning_rate": 0.00027379499126383225, |
| "loss": 3.1395, |
| "step": 93450 |
| }, |
| { |
| "epoch": 27.211923614345597, |
| "grad_norm": 0.4546970725059509, |
| "learning_rate": 0.00027362026790914384, |
| "loss": 3.1356, |
| "step": 93500 |
| }, |
| { |
| "epoch": 27.226478807638564, |
| "grad_norm": 0.4617122709751129, |
| "learning_rate": 0.00027344554455445544, |
| "loss": 3.1507, |
| "step": 93550 |
| }, |
| { |
| "epoch": 27.24103400093153, |
| "grad_norm": 0.4626840651035309, |
| "learning_rate": 0.00027327082119976703, |
| "loss": 3.1357, |
| "step": 93600 |
| }, |
| { |
| "epoch": 27.2555891942245, |
| "grad_norm": 0.43889328837394714, |
| "learning_rate": 0.00027309609784507857, |
| "loss": 3.1404, |
| "step": 93650 |
| }, |
| { |
| "epoch": 27.270144387517465, |
| "grad_norm": 0.4172461926937103, |
| "learning_rate": 0.0002729213744903902, |
| "loss": 3.135, |
| "step": 93700 |
| }, |
| { |
| "epoch": 27.284699580810432, |
| "grad_norm": 0.44945260882377625, |
| "learning_rate": 0.00027274665113570176, |
| "loss": 3.141, |
| "step": 93750 |
| }, |
| { |
| "epoch": 27.2992547741034, |
| "grad_norm": 0.44916418194770813, |
| "learning_rate": 0.00027257192778101335, |
| "loss": 3.1339, |
| "step": 93800 |
| }, |
| { |
| "epoch": 27.313809967396367, |
| "grad_norm": 0.4344363212585449, |
| "learning_rate": 0.00027239720442632495, |
| "loss": 3.1617, |
| "step": 93850 |
| }, |
| { |
| "epoch": 27.328365160689334, |
| "grad_norm": 0.4550100266933441, |
| "learning_rate": 0.00027222248107163654, |
| "loss": 3.1437, |
| "step": 93900 |
| }, |
| { |
| "epoch": 27.3429203539823, |
| "grad_norm": 0.45779842138290405, |
| "learning_rate": 0.00027204775771694814, |
| "loss": 3.1409, |
| "step": 93950 |
| }, |
| { |
| "epoch": 27.357475547275268, |
| "grad_norm": 0.4399350881576538, |
| "learning_rate": 0.00027187303436225973, |
| "loss": 3.1598, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.357475547275268, |
| "eval_accuracy": 0.3745378973260305, |
| "eval_loss": 3.546377658843994, |
| "eval_runtime": 80.7427, |
| "eval_samples_per_second": 206.223, |
| "eval_steps_per_second": 12.893, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.372030740568235, |
| "grad_norm": 0.4308851957321167, |
| "learning_rate": 0.0002716983110075713, |
| "loss": 3.1584, |
| "step": 94050 |
| }, |
| { |
| "epoch": 27.386585933861202, |
| "grad_norm": 0.4552328586578369, |
| "learning_rate": 0.0002715235876528829, |
| "loss": 3.1597, |
| "step": 94100 |
| }, |
| { |
| "epoch": 27.40114112715417, |
| "grad_norm": 0.42795246839523315, |
| "learning_rate": 0.0002713488642981945, |
| "loss": 3.1604, |
| "step": 94150 |
| }, |
| { |
| "epoch": 27.415696320447136, |
| "grad_norm": 0.4725009799003601, |
| "learning_rate": 0.00027117414094350606, |
| "loss": 3.1571, |
| "step": 94200 |
| }, |
| { |
| "epoch": 27.430251513740103, |
| "grad_norm": 0.4435836374759674, |
| "learning_rate": 0.0002709994175888177, |
| "loss": 3.1623, |
| "step": 94250 |
| }, |
| { |
| "epoch": 27.44480670703307, |
| "grad_norm": 0.44227319955825806, |
| "learning_rate": 0.00027082469423412924, |
| "loss": 3.163, |
| "step": 94300 |
| }, |
| { |
| "epoch": 27.459361900326037, |
| "grad_norm": 0.446064829826355, |
| "learning_rate": 0.00027064997087944084, |
| "loss": 3.1469, |
| "step": 94350 |
| }, |
| { |
| "epoch": 27.473917093619004, |
| "grad_norm": 0.43536844849586487, |
| "learning_rate": 0.00027047524752475243, |
| "loss": 3.1612, |
| "step": 94400 |
| }, |
| { |
| "epoch": 27.48847228691197, |
| "grad_norm": 0.42750251293182373, |
| "learning_rate": 0.000270300524170064, |
| "loss": 3.155, |
| "step": 94450 |
| }, |
| { |
| "epoch": 27.50302748020494, |
| "grad_norm": 0.46864956617355347, |
| "learning_rate": 0.0002701258008153757, |
| "loss": 3.1709, |
| "step": 94500 |
| }, |
| { |
| "epoch": 27.517582673497905, |
| "grad_norm": 0.42518338561058044, |
| "learning_rate": 0.0002699510774606872, |
| "loss": 3.1721, |
| "step": 94550 |
| }, |
| { |
| "epoch": 27.532137866790872, |
| "grad_norm": 0.4215596914291382, |
| "learning_rate": 0.0002697763541059988, |
| "loss": 3.1638, |
| "step": 94600 |
| }, |
| { |
| "epoch": 27.54669306008384, |
| "grad_norm": 0.4434783160686493, |
| "learning_rate": 0.0002696016307513104, |
| "loss": 3.1563, |
| "step": 94650 |
| }, |
| { |
| "epoch": 27.561248253376803, |
| "grad_norm": 0.4654253125190735, |
| "learning_rate": 0.000269426907396622, |
| "loss": 3.1549, |
| "step": 94700 |
| }, |
| { |
| "epoch": 27.57580344666977, |
| "grad_norm": 0.44654175639152527, |
| "learning_rate": 0.0002692521840419336, |
| "loss": 3.1734, |
| "step": 94750 |
| }, |
| { |
| "epoch": 27.590358639962737, |
| "grad_norm": 0.44911614060401917, |
| "learning_rate": 0.0002690774606872452, |
| "loss": 3.1678, |
| "step": 94800 |
| }, |
| { |
| "epoch": 27.604913833255704, |
| "grad_norm": 0.4285270571708679, |
| "learning_rate": 0.0002689027373325568, |
| "loss": 3.1687, |
| "step": 94850 |
| }, |
| { |
| "epoch": 27.61946902654867, |
| "grad_norm": 0.4781116843223572, |
| "learning_rate": 0.0002687280139778683, |
| "loss": 3.1765, |
| "step": 94900 |
| }, |
| { |
| "epoch": 27.63402421984164, |
| "grad_norm": 0.4637291133403778, |
| "learning_rate": 0.00026855329062317997, |
| "loss": 3.1669, |
| "step": 94950 |
| }, |
| { |
| "epoch": 27.648579413134605, |
| "grad_norm": 0.4673132300376892, |
| "learning_rate": 0.0002683785672684915, |
| "loss": 3.171, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.648579413134605, |
| "eval_accuracy": 0.3748576891240584, |
| "eval_loss": 3.54510235786438, |
| "eval_runtime": 80.8115, |
| "eval_samples_per_second": 206.047, |
| "eval_steps_per_second": 12.882, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.663134606427572, |
| "grad_norm": 0.4657650291919708, |
| "learning_rate": 0.0002682038439138031, |
| "loss": 3.1831, |
| "step": 95050 |
| }, |
| { |
| "epoch": 27.67768979972054, |
| "grad_norm": 0.4400046467781067, |
| "learning_rate": 0.0002680291205591147, |
| "loss": 3.1822, |
| "step": 95100 |
| }, |
| { |
| "epoch": 27.692244993013507, |
| "grad_norm": 0.45291808247566223, |
| "learning_rate": 0.0002678543972044263, |
| "loss": 3.1665, |
| "step": 95150 |
| }, |
| { |
| "epoch": 27.706800186306474, |
| "grad_norm": 0.46261754631996155, |
| "learning_rate": 0.0002676796738497379, |
| "loss": 3.1794, |
| "step": 95200 |
| }, |
| { |
| "epoch": 27.72135537959944, |
| "grad_norm": 0.4905010759830475, |
| "learning_rate": 0.0002675049504950495, |
| "loss": 3.1672, |
| "step": 95250 |
| }, |
| { |
| "epoch": 27.735910572892408, |
| "grad_norm": 0.42721977829933167, |
| "learning_rate": 0.0002673302271403611, |
| "loss": 3.1886, |
| "step": 95300 |
| }, |
| { |
| "epoch": 27.750465766185375, |
| "grad_norm": 0.45352891087532043, |
| "learning_rate": 0.00026715550378567267, |
| "loss": 3.1802, |
| "step": 95350 |
| }, |
| { |
| "epoch": 27.765020959478342, |
| "grad_norm": 0.4502638280391693, |
| "learning_rate": 0.00026698078043098426, |
| "loss": 3.1766, |
| "step": 95400 |
| }, |
| { |
| "epoch": 27.77957615277131, |
| "grad_norm": 0.40890103578567505, |
| "learning_rate": 0.00026680605707629586, |
| "loss": 3.1786, |
| "step": 95450 |
| }, |
| { |
| "epoch": 27.794131346064276, |
| "grad_norm": 0.44698986411094666, |
| "learning_rate": 0.00026663133372160745, |
| "loss": 3.181, |
| "step": 95500 |
| }, |
| { |
| "epoch": 27.808686539357243, |
| "grad_norm": 0.4251572787761688, |
| "learning_rate": 0.00026645661036691905, |
| "loss": 3.1709, |
| "step": 95550 |
| }, |
| { |
| "epoch": 27.82324173265021, |
| "grad_norm": 0.4705631732940674, |
| "learning_rate": 0.0002662818870122306, |
| "loss": 3.1837, |
| "step": 95600 |
| }, |
| { |
| "epoch": 27.837796925943177, |
| "grad_norm": 0.45506125688552856, |
| "learning_rate": 0.00026610716365754224, |
| "loss": 3.1883, |
| "step": 95650 |
| }, |
| { |
| "epoch": 27.852352119236144, |
| "grad_norm": 0.4505208730697632, |
| "learning_rate": 0.0002659324403028538, |
| "loss": 3.1754, |
| "step": 95700 |
| }, |
| { |
| "epoch": 27.86690731252911, |
| "grad_norm": 0.4309210181236267, |
| "learning_rate": 0.00026575771694816537, |
| "loss": 3.1926, |
| "step": 95750 |
| }, |
| { |
| "epoch": 27.88146250582208, |
| "grad_norm": 0.4628695547580719, |
| "learning_rate": 0.00026558299359347696, |
| "loss": 3.1974, |
| "step": 95800 |
| }, |
| { |
| "epoch": 27.896017699115045, |
| "grad_norm": 0.4354335367679596, |
| "learning_rate": 0.00026540827023878856, |
| "loss": 3.1944, |
| "step": 95850 |
| }, |
| { |
| "epoch": 27.910572892408013, |
| "grad_norm": 0.416464239358902, |
| "learning_rate": 0.00026523354688410015, |
| "loss": 3.1836, |
| "step": 95900 |
| }, |
| { |
| "epoch": 27.92512808570098, |
| "grad_norm": 0.46018555760383606, |
| "learning_rate": 0.00026505882352941175, |
| "loss": 3.1771, |
| "step": 95950 |
| }, |
| { |
| "epoch": 27.939683278993947, |
| "grad_norm": 0.4998409152030945, |
| "learning_rate": 0.00026488410017472334, |
| "loss": 3.1805, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.939683278993947, |
| "eval_accuracy": 0.3751727798298698, |
| "eval_loss": 3.5356526374816895, |
| "eval_runtime": 80.8446, |
| "eval_samples_per_second": 205.963, |
| "eval_steps_per_second": 12.877, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.95423847228691, |
| "grad_norm": 0.4487084448337555, |
| "learning_rate": 0.00026470937682003494, |
| "loss": 3.1821, |
| "step": 96050 |
| }, |
| { |
| "epoch": 27.968793665579877, |
| "grad_norm": 0.4654010832309723, |
| "learning_rate": 0.00026453465346534653, |
| "loss": 3.1943, |
| "step": 96100 |
| }, |
| { |
| "epoch": 27.983348858872844, |
| "grad_norm": 0.45850756764411926, |
| "learning_rate": 0.00026435993011065807, |
| "loss": 3.1908, |
| "step": 96150 |
| }, |
| { |
| "epoch": 27.99790405216581, |
| "grad_norm": 0.4561852216720581, |
| "learning_rate": 0.0002641852067559697, |
| "loss": 3.2035, |
| "step": 96200 |
| }, |
| { |
| "epoch": 28.012226362366093, |
| "grad_norm": 0.43813368678092957, |
| "learning_rate": 0.00026401048340128126, |
| "loss": 3.1046, |
| "step": 96250 |
| }, |
| { |
| "epoch": 28.02678155565906, |
| "grad_norm": 0.46887195110321045, |
| "learning_rate": 0.00026383576004659285, |
| "loss": 3.0863, |
| "step": 96300 |
| }, |
| { |
| "epoch": 28.041336748952027, |
| "grad_norm": 0.4430742859840393, |
| "learning_rate": 0.00026366103669190445, |
| "loss": 3.0946, |
| "step": 96350 |
| }, |
| { |
| "epoch": 28.055891942244994, |
| "grad_norm": 0.4444419741630554, |
| "learning_rate": 0.00026348631333721604, |
| "loss": 3.1045, |
| "step": 96400 |
| }, |
| { |
| "epoch": 28.07044713553796, |
| "grad_norm": 0.4665871858596802, |
| "learning_rate": 0.00026331158998252764, |
| "loss": 3.1043, |
| "step": 96450 |
| }, |
| { |
| "epoch": 28.085002328830928, |
| "grad_norm": 0.45947691798210144, |
| "learning_rate": 0.00026313686662783923, |
| "loss": 3.1089, |
| "step": 96500 |
| }, |
| { |
| "epoch": 28.099557522123895, |
| "grad_norm": 0.42693883180618286, |
| "learning_rate": 0.0002629621432731508, |
| "loss": 3.1113, |
| "step": 96550 |
| }, |
| { |
| "epoch": 28.114112715416862, |
| "grad_norm": 0.464823454618454, |
| "learning_rate": 0.0002627874199184624, |
| "loss": 3.1097, |
| "step": 96600 |
| }, |
| { |
| "epoch": 28.12866790870983, |
| "grad_norm": 0.43090617656707764, |
| "learning_rate": 0.000262612696563774, |
| "loss": 3.1244, |
| "step": 96650 |
| }, |
| { |
| "epoch": 28.143223102002796, |
| "grad_norm": 0.4418310523033142, |
| "learning_rate": 0.0002624379732090856, |
| "loss": 3.1062, |
| "step": 96700 |
| }, |
| { |
| "epoch": 28.15777829529576, |
| "grad_norm": 0.4696221649646759, |
| "learning_rate": 0.0002622632498543972, |
| "loss": 3.1273, |
| "step": 96750 |
| }, |
| { |
| "epoch": 28.172333488588727, |
| "grad_norm": 0.4728420674800873, |
| "learning_rate": 0.0002620885264997088, |
| "loss": 3.1262, |
| "step": 96800 |
| }, |
| { |
| "epoch": 28.186888681881694, |
| "grad_norm": 0.44601503014564514, |
| "learning_rate": 0.00026191380314502034, |
| "loss": 3.1423, |
| "step": 96850 |
| }, |
| { |
| "epoch": 28.20144387517466, |
| "grad_norm": 0.4399915635585785, |
| "learning_rate": 0.000261739079790332, |
| "loss": 3.1198, |
| "step": 96900 |
| }, |
| { |
| "epoch": 28.215999068467628, |
| "grad_norm": 0.4591466188430786, |
| "learning_rate": 0.0002615643564356435, |
| "loss": 3.1237, |
| "step": 96950 |
| }, |
| { |
| "epoch": 28.230554261760595, |
| "grad_norm": 0.4566672742366791, |
| "learning_rate": 0.0002613896330809551, |
| "loss": 3.1208, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.230554261760595, |
| "eval_accuracy": 0.37435408462036507, |
| "eval_loss": 3.5521373748779297, |
| "eval_runtime": 80.8128, |
| "eval_samples_per_second": 206.044, |
| "eval_steps_per_second": 12.882, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.245109455053562, |
| "grad_norm": 0.4582904279232025, |
| "learning_rate": 0.0002612149097262667, |
| "loss": 3.1233, |
| "step": 97050 |
| }, |
| { |
| "epoch": 28.25966464834653, |
| "grad_norm": 0.47682955861091614, |
| "learning_rate": 0.0002610401863715783, |
| "loss": 3.1266, |
| "step": 97100 |
| }, |
| { |
| "epoch": 28.274219841639496, |
| "grad_norm": 0.4697798490524292, |
| "learning_rate": 0.0002608654630168899, |
| "loss": 3.1344, |
| "step": 97150 |
| }, |
| { |
| "epoch": 28.288775034932463, |
| "grad_norm": 0.443227618932724, |
| "learning_rate": 0.0002606907396622015, |
| "loss": 3.1291, |
| "step": 97200 |
| }, |
| { |
| "epoch": 28.30333022822543, |
| "grad_norm": 0.4299347698688507, |
| "learning_rate": 0.0002605160163075131, |
| "loss": 3.146, |
| "step": 97250 |
| }, |
| { |
| "epoch": 28.317885421518397, |
| "grad_norm": 0.4761723577976227, |
| "learning_rate": 0.0002603412929528247, |
| "loss": 3.1276, |
| "step": 97300 |
| }, |
| { |
| "epoch": 28.332440614811365, |
| "grad_norm": 0.45339974761009216, |
| "learning_rate": 0.0002601665695981363, |
| "loss": 3.1392, |
| "step": 97350 |
| }, |
| { |
| "epoch": 28.34699580810433, |
| "grad_norm": 0.44024261832237244, |
| "learning_rate": 0.0002599918462434478, |
| "loss": 3.1409, |
| "step": 97400 |
| }, |
| { |
| "epoch": 28.3615510013973, |
| "grad_norm": 0.43010157346725464, |
| "learning_rate": 0.00025981712288875947, |
| "loss": 3.1497, |
| "step": 97450 |
| }, |
| { |
| "epoch": 28.376106194690266, |
| "grad_norm": 0.4567290246486664, |
| "learning_rate": 0.000259642399534071, |
| "loss": 3.1489, |
| "step": 97500 |
| }, |
| { |
| "epoch": 28.390661387983233, |
| "grad_norm": 0.4604106545448303, |
| "learning_rate": 0.0002594676761793826, |
| "loss": 3.1456, |
| "step": 97550 |
| }, |
| { |
| "epoch": 28.4052165812762, |
| "grad_norm": 0.4699397683143616, |
| "learning_rate": 0.0002592929528246942, |
| "loss": 3.1565, |
| "step": 97600 |
| }, |
| { |
| "epoch": 28.419771774569167, |
| "grad_norm": 0.4464460015296936, |
| "learning_rate": 0.0002591182294700058, |
| "loss": 3.1549, |
| "step": 97650 |
| }, |
| { |
| "epoch": 28.434326967862134, |
| "grad_norm": 0.4644322991371155, |
| "learning_rate": 0.0002589435061153174, |
| "loss": 3.1521, |
| "step": 97700 |
| }, |
| { |
| "epoch": 28.4488821611551, |
| "grad_norm": 0.4581911563873291, |
| "learning_rate": 0.000258768782760629, |
| "loss": 3.1462, |
| "step": 97750 |
| }, |
| { |
| "epoch": 28.463437354448068, |
| "grad_norm": 0.4426102638244629, |
| "learning_rate": 0.0002585940594059406, |
| "loss": 3.1476, |
| "step": 97800 |
| }, |
| { |
| "epoch": 28.477992547741035, |
| "grad_norm": 0.45935672521591187, |
| "learning_rate": 0.00025841933605125217, |
| "loss": 3.1575, |
| "step": 97850 |
| }, |
| { |
| "epoch": 28.492547741034002, |
| "grad_norm": 0.4442085921764374, |
| "learning_rate": 0.00025824461269656376, |
| "loss": 3.159, |
| "step": 97900 |
| }, |
| { |
| "epoch": 28.50710293432697, |
| "grad_norm": 0.43588027358055115, |
| "learning_rate": 0.00025806988934187536, |
| "loss": 3.1435, |
| "step": 97950 |
| }, |
| { |
| "epoch": 28.521658127619936, |
| "grad_norm": 0.4424988925457001, |
| "learning_rate": 0.00025789516598718695, |
| "loss": 3.1498, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.521658127619936, |
| "eval_accuracy": 0.37475355993146275, |
| "eval_loss": 3.5443122386932373, |
| "eval_runtime": 80.8093, |
| "eval_samples_per_second": 206.053, |
| "eval_steps_per_second": 12.882, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.536213320912903, |
| "grad_norm": 0.45757943391799927, |
| "learning_rate": 0.00025772044263249854, |
| "loss": 3.1479, |
| "step": 98050 |
| }, |
| { |
| "epoch": 28.55076851420587, |
| "grad_norm": 0.46656733751296997, |
| "learning_rate": 0.0002575457192778101, |
| "loss": 3.1615, |
| "step": 98100 |
| }, |
| { |
| "epoch": 28.565323707498834, |
| "grad_norm": 0.4589703679084778, |
| "learning_rate": 0.00025737099592312173, |
| "loss": 3.1545, |
| "step": 98150 |
| }, |
| { |
| "epoch": 28.5798789007918, |
| "grad_norm": 0.4591953456401825, |
| "learning_rate": 0.0002571962725684333, |
| "loss": 3.1466, |
| "step": 98200 |
| }, |
| { |
| "epoch": 28.594434094084768, |
| "grad_norm": 0.4264989495277405, |
| "learning_rate": 0.00025702154921374487, |
| "loss": 3.1617, |
| "step": 98250 |
| }, |
| { |
| "epoch": 28.608989287377735, |
| "grad_norm": 0.4395191967487335, |
| "learning_rate": 0.00025684682585905646, |
| "loss": 3.154, |
| "step": 98300 |
| }, |
| { |
| "epoch": 28.623544480670702, |
| "grad_norm": 0.443545937538147, |
| "learning_rate": 0.00025667210250436806, |
| "loss": 3.1591, |
| "step": 98350 |
| }, |
| { |
| "epoch": 28.63809967396367, |
| "grad_norm": 0.433398962020874, |
| "learning_rate": 0.00025649737914967965, |
| "loss": 3.1693, |
| "step": 98400 |
| }, |
| { |
| "epoch": 28.652654867256636, |
| "grad_norm": 0.47089698910713196, |
| "learning_rate": 0.00025632265579499124, |
| "loss": 3.1623, |
| "step": 98450 |
| }, |
| { |
| "epoch": 28.667210060549603, |
| "grad_norm": 0.4389559030532837, |
| "learning_rate": 0.00025614793244030284, |
| "loss": 3.1727, |
| "step": 98500 |
| }, |
| { |
| "epoch": 28.68176525384257, |
| "grad_norm": 0.5161491632461548, |
| "learning_rate": 0.0002559732090856144, |
| "loss": 3.1572, |
| "step": 98550 |
| }, |
| { |
| "epoch": 28.696320447135538, |
| "grad_norm": 0.4519852101802826, |
| "learning_rate": 0.00025579848573092603, |
| "loss": 3.1735, |
| "step": 98600 |
| }, |
| { |
| "epoch": 28.710875640428505, |
| "grad_norm": 0.4415338933467865, |
| "learning_rate": 0.0002556237623762376, |
| "loss": 3.1619, |
| "step": 98650 |
| }, |
| { |
| "epoch": 28.72543083372147, |
| "grad_norm": 0.44179877638816833, |
| "learning_rate": 0.0002554490390215492, |
| "loss": 3.176, |
| "step": 98700 |
| }, |
| { |
| "epoch": 28.73998602701444, |
| "grad_norm": 0.4931659996509552, |
| "learning_rate": 0.0002552743156668608, |
| "loss": 3.1684, |
| "step": 98750 |
| }, |
| { |
| "epoch": 28.754541220307406, |
| "grad_norm": 0.4671033024787903, |
| "learning_rate": 0.00025509959231217235, |
| "loss": 3.1614, |
| "step": 98800 |
| }, |
| { |
| "epoch": 28.769096413600373, |
| "grad_norm": 0.48016655445098877, |
| "learning_rate": 0.000254924868957484, |
| "loss": 3.1748, |
| "step": 98850 |
| }, |
| { |
| "epoch": 28.78365160689334, |
| "grad_norm": 0.48242220282554626, |
| "learning_rate": 0.00025475014560279554, |
| "loss": 3.171, |
| "step": 98900 |
| }, |
| { |
| "epoch": 28.798206800186307, |
| "grad_norm": 0.479145348072052, |
| "learning_rate": 0.00025457542224810713, |
| "loss": 3.1889, |
| "step": 98950 |
| }, |
| { |
| "epoch": 28.812761993479274, |
| "grad_norm": 0.4503076672554016, |
| "learning_rate": 0.00025440069889341873, |
| "loss": 3.1754, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.812761993479274, |
| "eval_accuracy": 0.3753554172624811, |
| "eval_loss": 3.5349862575531006, |
| "eval_runtime": 80.7474, |
| "eval_samples_per_second": 206.211, |
| "eval_steps_per_second": 12.892, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.812761993479274, |
| "step": 99000, |
| "total_flos": 2.068898812526592e+18, |
| "train_loss": 3.3949128382519036, |
| "train_runtime": 72609.3608, |
| "train_samples_per_second": 189.237, |
| "train_steps_per_second": 2.366 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 171800, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 20, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 11 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.068898812526592e+18, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|