| { |
| "best_global_step": 79000, |
| "best_metric": 3.5285708904266357, |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/swap_require_to_carry_3591/checkpoint-40000", |
| "epoch": 29.103924080111785, |
| "eval_steps": 1000, |
| "global_step": 100000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.014555193292966931, |
| "grad_norm": 0.7359105348587036, |
| "learning_rate": 0.000294, |
| "loss": 8.4385, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.029110386585933862, |
| "grad_norm": 0.7309923768043518, |
| "learning_rate": 0.0005939999999999999, |
| "loss": 6.728, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04366557987890079, |
| "grad_norm": 0.5441356897354126, |
| "learning_rate": 0.0005998287711124053, |
| "loss": 6.3624, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.058220773171867725, |
| "grad_norm": 0.5081177949905396, |
| "learning_rate": 0.000599654047757717, |
| "loss": 6.1462, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07277596646483465, |
| "grad_norm": 0.4880349338054657, |
| "learning_rate": 0.0005994793244030285, |
| "loss": 5.9999, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.08733115975780158, |
| "grad_norm": 0.44926661252975464, |
| "learning_rate": 0.00059930460104834, |
| "loss": 5.8789, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.10188635305076851, |
| "grad_norm": 0.42646145820617676, |
| "learning_rate": 0.0005991298776936517, |
| "loss": 5.7411, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.11644154634373545, |
| "grad_norm": 0.5302311778068542, |
| "learning_rate": 0.0005989551543389632, |
| "loss": 5.6293, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.1309967396367024, |
| "grad_norm": 0.4889351427555084, |
| "learning_rate": 0.0005987804309842748, |
| "loss": 5.515, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.1455519329296693, |
| "grad_norm": 0.42970505356788635, |
| "learning_rate": 0.0005986057076295864, |
| "loss": 5.4076, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.16010712622263623, |
| "grad_norm": 0.4154544770717621, |
| "learning_rate": 0.0005984309842748981, |
| "loss": 5.3341, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.17466231951560315, |
| "grad_norm": 0.4824340343475342, |
| "learning_rate": 0.0005982562609202096, |
| "loss": 5.2568, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.1892175128085701, |
| "grad_norm": 0.42126211524009705, |
| "learning_rate": 0.0005980815375655212, |
| "loss": 5.1928, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.20377270610153703, |
| "grad_norm": 0.4139862358570099, |
| "learning_rate": 0.0005979068142108328, |
| "loss": 5.1269, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.21832789939450395, |
| "grad_norm": 0.48572856187820435, |
| "learning_rate": 0.0005977320908561445, |
| "loss": 5.0723, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.2328830926874709, |
| "grad_norm": 0.4250296950340271, |
| "learning_rate": 0.000597557367501456, |
| "loss": 5.0148, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.24743828598043782, |
| "grad_norm": 0.3736257255077362, |
| "learning_rate": 0.0005973826441467675, |
| "loss": 4.9691, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.2619934792734048, |
| "grad_norm": 0.3763846457004547, |
| "learning_rate": 0.0005972079207920792, |
| "loss": 4.9133, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.27654867256637167, |
| "grad_norm": 0.43584969639778137, |
| "learning_rate": 0.0005970331974373907, |
| "loss": 4.8652, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.2911038658593386, |
| "grad_norm": 0.4402410686016083, |
| "learning_rate": 0.0005968584740827023, |
| "loss": 4.8353, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.2911038658593386, |
| "eval_accuracy": 0.2531285474882593, |
| "eval_loss": 4.762944221496582, |
| "eval_runtime": 179.4777, |
| "eval_samples_per_second": 92.775, |
| "eval_steps_per_second": 5.8, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.30565905915230557, |
| "grad_norm": 0.4653228223323822, |
| "learning_rate": 0.0005966837507280139, |
| "loss": 4.7825, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.32021425244527246, |
| "grad_norm": 0.44444817304611206, |
| "learning_rate": 0.0005965090273733256, |
| "loss": 4.7472, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.3347694457382394, |
| "grad_norm": 0.4688259959220886, |
| "learning_rate": 0.0005963343040186371, |
| "loss": 4.7077, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.3493246390312063, |
| "grad_norm": 0.4208512306213379, |
| "learning_rate": 0.0005961595806639486, |
| "loss": 4.6714, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.36387983232417326, |
| "grad_norm": 0.4392108619213104, |
| "learning_rate": 0.0005959848573092603, |
| "loss": 4.6226, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.3784350256171402, |
| "grad_norm": 0.5245473384857178, |
| "learning_rate": 0.0005958101339545718, |
| "loss": 4.6081, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.3929902189101071, |
| "grad_norm": 0.42627111077308655, |
| "learning_rate": 0.0005956354105998835, |
| "loss": 4.5746, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.40754541220307405, |
| "grad_norm": 0.45299196243286133, |
| "learning_rate": 0.000595460687245195, |
| "loss": 4.5448, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.422100605496041, |
| "grad_norm": 0.40141919255256653, |
| "learning_rate": 0.0005952859638905067, |
| "loss": 4.5306, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.4366557987890079, |
| "grad_norm": 0.4180101156234741, |
| "learning_rate": 0.0005951112405358182, |
| "loss": 4.5104, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.45121099208197485, |
| "grad_norm": 0.4382331073284149, |
| "learning_rate": 0.0005949365171811299, |
| "loss": 4.4792, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.4657661853749418, |
| "grad_norm": 0.45820483565330505, |
| "learning_rate": 0.0005947617938264414, |
| "loss": 4.4604, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.4803213786679087, |
| "grad_norm": 0.41873323917388916, |
| "learning_rate": 0.000594587070471753, |
| "loss": 4.4525, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.49487657196087564, |
| "grad_norm": 0.4274512231349945, |
| "learning_rate": 0.0005944123471170646, |
| "loss": 4.4232, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.5094317652538426, |
| "grad_norm": 0.4086366891860962, |
| "learning_rate": 0.0005942376237623762, |
| "loss": 4.4096, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.5239869585468095, |
| "grad_norm": 0.38669702410697937, |
| "learning_rate": 0.0005940629004076878, |
| "loss": 4.3981, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.5385421518397764, |
| "grad_norm": 0.4255729913711548, |
| "learning_rate": 0.0005938881770529993, |
| "loss": 4.3785, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.5530973451327433, |
| "grad_norm": 0.40904727578163147, |
| "learning_rate": 0.000593713453698311, |
| "loss": 4.3684, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5676525384257103, |
| "grad_norm": 0.384636789560318, |
| "learning_rate": 0.0005935387303436226, |
| "loss": 4.3485, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.5822077317186772, |
| "grad_norm": 0.4042646884918213, |
| "learning_rate": 0.0005933640069889342, |
| "loss": 4.3374, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5822077317186772, |
| "eval_accuracy": 0.2996315166393396, |
| "eval_loss": 4.279468059539795, |
| "eval_runtime": 179.5389, |
| "eval_samples_per_second": 92.743, |
| "eval_steps_per_second": 5.798, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5967629250116442, |
| "grad_norm": 0.41661450266838074, |
| "learning_rate": 0.0005931892836342457, |
| "loss": 4.3235, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.6113181183046111, |
| "grad_norm": 0.42830735445022583, |
| "learning_rate": 0.0005930145602795573, |
| "loss": 4.3108, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.625873311597578, |
| "grad_norm": 0.37084299325942993, |
| "learning_rate": 0.000592839836924869, |
| "loss": 4.3092, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.6404285048905449, |
| "grad_norm": 0.4014033377170563, |
| "learning_rate": 0.0005926651135701805, |
| "loss": 4.3037, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.6549836981835119, |
| "grad_norm": 0.3967902660369873, |
| "learning_rate": 0.0005924903902154921, |
| "loss": 4.2826, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.6695388914764788, |
| "grad_norm": 0.4075942635536194, |
| "learning_rate": 0.0005923156668608037, |
| "loss": 4.2677, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.6840940847694458, |
| "grad_norm": 0.4896312355995178, |
| "learning_rate": 0.0005921409435061153, |
| "loss": 4.2633, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.6986492780624126, |
| "grad_norm": 0.4202996492385864, |
| "learning_rate": 0.0005919662201514268, |
| "loss": 4.2499, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.7132044713553796, |
| "grad_norm": 0.4072986841201782, |
| "learning_rate": 0.0005917914967967384, |
| "loss": 4.2487, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.7277596646483465, |
| "grad_norm": 0.3957359790802002, |
| "learning_rate": 0.0005916167734420501, |
| "loss": 4.2499, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.7423148579413135, |
| "grad_norm": 0.3910154700279236, |
| "learning_rate": 0.0005914420500873616, |
| "loss": 4.2258, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.7568700512342804, |
| "grad_norm": 0.3771495223045349, |
| "learning_rate": 0.0005912673267326732, |
| "loss": 4.2105, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.7714252445272474, |
| "grad_norm": 0.37781083583831787, |
| "learning_rate": 0.0005910926033779848, |
| "loss": 4.2069, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.7859804378202142, |
| "grad_norm": 0.3904173672199249, |
| "learning_rate": 0.0005909178800232964, |
| "loss": 4.1878, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.8005356311131812, |
| "grad_norm": 0.3640347421169281, |
| "learning_rate": 0.000590743156668608, |
| "loss": 4.1915, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.8150908244061481, |
| "grad_norm": 0.3591432273387909, |
| "learning_rate": 0.0005905684333139196, |
| "loss": 4.1809, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.8296460176991151, |
| "grad_norm": 0.40700843930244446, |
| "learning_rate": 0.0005903937099592312, |
| "loss": 4.1689, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.844201210992082, |
| "grad_norm": 0.3366829454898834, |
| "learning_rate": 0.0005902189866045427, |
| "loss": 4.1601, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.858756404285049, |
| "grad_norm": 0.35641607642173767, |
| "learning_rate": 0.0005900442632498543, |
| "loss": 4.1486, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.8733115975780158, |
| "grad_norm": 0.35798075795173645, |
| "learning_rate": 0.0005898695398951659, |
| "loss": 4.1489, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8733115975780158, |
| "eval_accuracy": 0.315511923674007, |
| "eval_loss": 4.095459938049316, |
| "eval_runtime": 179.6196, |
| "eval_samples_per_second": 92.701, |
| "eval_steps_per_second": 5.796, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8878667908709827, |
| "grad_norm": 0.3598261773586273, |
| "learning_rate": 0.0005896948165404776, |
| "loss": 4.1458, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.9024219841639497, |
| "grad_norm": 0.3457593321800232, |
| "learning_rate": 0.0005895200931857891, |
| "loss": 4.1294, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.9169771774569166, |
| "grad_norm": 0.3695979416370392, |
| "learning_rate": 0.0005893453698311007, |
| "loss": 4.1345, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.9315323707498836, |
| "grad_norm": 0.3489490747451782, |
| "learning_rate": 0.0005891706464764123, |
| "loss": 4.1108, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.9460875640428504, |
| "grad_norm": 0.34958118200302124, |
| "learning_rate": 0.0005889959231217238, |
| "loss": 4.1165, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.9606427573358174, |
| "grad_norm": 0.3432953357696533, |
| "learning_rate": 0.0005888211997670355, |
| "loss": 4.1136, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.9751979506287843, |
| "grad_norm": 0.38065826892852783, |
| "learning_rate": 0.000588646476412347, |
| "loss": 4.1089, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.9897531439217513, |
| "grad_norm": 0.34400883316993713, |
| "learning_rate": 0.0005884717530576587, |
| "loss": 4.11, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.0040754541220307, |
| "grad_norm": 0.33822906017303467, |
| "learning_rate": 0.0005882970297029702, |
| "loss": 4.0757, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.0186306474149978, |
| "grad_norm": 0.35206469893455505, |
| "learning_rate": 0.0005881223063482818, |
| "loss": 4.0187, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.0331858407079646, |
| "grad_norm": 0.3783675730228424, |
| "learning_rate": 0.0005879475829935934, |
| "loss": 4.0276, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.0477410340009314, |
| "grad_norm": 0.35457679629325867, |
| "learning_rate": 0.0005877728596389051, |
| "loss": 4.0168, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.0622962272938985, |
| "grad_norm": 0.37590548396110535, |
| "learning_rate": 0.0005875981362842166, |
| "loss": 4.0154, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.0768514205868653, |
| "grad_norm": 0.35166940093040466, |
| "learning_rate": 0.0005874234129295281, |
| "loss": 4.0346, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.0914066138798324, |
| "grad_norm": 0.3491305708885193, |
| "learning_rate": 0.0005872486895748398, |
| "loss": 4.0116, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.1059618071727992, |
| "grad_norm": 0.36400511860847473, |
| "learning_rate": 0.0005870739662201513, |
| "loss": 4.0084, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.120517000465766, |
| "grad_norm": 0.35419315099716187, |
| "learning_rate": 0.000586899242865463, |
| "loss": 4.0014, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.1350721937587331, |
| "grad_norm": 0.3940429985523224, |
| "learning_rate": 0.0005867245195107746, |
| "loss": 4.006, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.1496273870517, |
| "grad_norm": 0.3528949022293091, |
| "learning_rate": 0.0005865497961560862, |
| "loss": 4.0051, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.164182580344667, |
| "grad_norm": 0.35174238681793213, |
| "learning_rate": 0.0005863750728013977, |
| "loss": 3.996, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.164182580344667, |
| "eval_accuracy": 0.32519629116731763, |
| "eval_loss": 3.99649977684021, |
| "eval_runtime": 179.7575, |
| "eval_samples_per_second": 92.63, |
| "eval_steps_per_second": 5.791, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.1787377736376339, |
| "grad_norm": 0.3291102945804596, |
| "learning_rate": 0.0005862003494467094, |
| "loss": 3.9821, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.193292966930601, |
| "grad_norm": 0.35790738463401794, |
| "learning_rate": 0.0005860256260920209, |
| "loss": 4.0037, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.2078481602235678, |
| "grad_norm": 0.37376832962036133, |
| "learning_rate": 0.0005858509027373325, |
| "loss": 3.9955, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.2224033535165346, |
| "grad_norm": 0.34248632192611694, |
| "learning_rate": 0.0005856761793826441, |
| "loss": 3.9708, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.2369585468095017, |
| "grad_norm": 0.3399963676929474, |
| "learning_rate": 0.0005855014560279557, |
| "loss": 3.9767, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.2515137401024685, |
| "grad_norm": 0.3411734104156494, |
| "learning_rate": 0.0005853267326732673, |
| "loss": 3.9673, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.2660689333954354, |
| "grad_norm": 0.3493206202983856, |
| "learning_rate": 0.0005851520093185788, |
| "loss": 3.9599, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.2806241266884024, |
| "grad_norm": 0.34816136956214905, |
| "learning_rate": 0.0005849772859638905, |
| "loss": 3.9579, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.2951793199813695, |
| "grad_norm": 0.3242569863796234, |
| "learning_rate": 0.0005848025626092021, |
| "loss": 3.9739, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.3097345132743363, |
| "grad_norm": 0.34569236636161804, |
| "learning_rate": 0.0005846278392545136, |
| "loss": 3.9623, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.3242897065673032, |
| "grad_norm": 0.34033289551734924, |
| "learning_rate": 0.0005844531158998252, |
| "loss": 3.9518, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.3388448998602702, |
| "grad_norm": 0.3365550637245178, |
| "learning_rate": 0.0005842783925451368, |
| "loss": 3.9437, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.353400093153237, |
| "grad_norm": 0.3506678342819214, |
| "learning_rate": 0.0005841036691904484, |
| "loss": 3.9446, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.367955286446204, |
| "grad_norm": 0.3359658718109131, |
| "learning_rate": 0.00058392894583576, |
| "loss": 3.9492, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.382510479739171, |
| "grad_norm": 0.3435382843017578, |
| "learning_rate": 0.0005837542224810716, |
| "loss": 3.9445, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.3970656730321378, |
| "grad_norm": 0.35683852434158325, |
| "learning_rate": 0.0005835794991263832, |
| "loss": 3.9433, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.4116208663251049, |
| "grad_norm": 0.3335430324077606, |
| "learning_rate": 0.0005834047757716948, |
| "loss": 3.936, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.4261760596180717, |
| "grad_norm": 0.3474925756454468, |
| "learning_rate": 0.0005832300524170063, |
| "loss": 3.9431, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.4407312529110388, |
| "grad_norm": 0.35509905219078064, |
| "learning_rate": 0.0005830553290623179, |
| "loss": 3.9349, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.4552864462040056, |
| "grad_norm": 0.33583182096481323, |
| "learning_rate": 0.0005828806057076296, |
| "loss": 3.9347, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.4552864462040056, |
| "eval_accuracy": 0.33179591947546155, |
| "eval_loss": 3.9166266918182373, |
| "eval_runtime": 179.7795, |
| "eval_samples_per_second": 92.619, |
| "eval_steps_per_second": 5.79, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.4698416394969724, |
| "grad_norm": 0.3611011505126953, |
| "learning_rate": 0.0005827058823529411, |
| "loss": 3.9249, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.4843968327899395, |
| "grad_norm": 0.35978731513023376, |
| "learning_rate": 0.0005825311589982527, |
| "loss": 3.9307, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.4989520260829063, |
| "grad_norm": 0.3362000286579132, |
| "learning_rate": 0.0005823564356435643, |
| "loss": 3.9162, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.5135072193758732, |
| "grad_norm": 0.3377557098865509, |
| "learning_rate": 0.0005821817122888759, |
| "loss": 3.9344, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.5280624126688402, |
| "grad_norm": 0.33132773637771606, |
| "learning_rate": 0.0005820069889341875, |
| "loss": 3.93, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.5426176059618073, |
| "grad_norm": 0.3384888470172882, |
| "learning_rate": 0.000581832265579499, |
| "loss": 3.9109, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.5571727992547741, |
| "grad_norm": 0.3428712785243988, |
| "learning_rate": 0.0005816575422248107, |
| "loss": 3.9228, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.571727992547741, |
| "grad_norm": 0.3271981179714203, |
| "learning_rate": 0.0005814828188701222, |
| "loss": 3.9087, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.586283185840708, |
| "grad_norm": 0.3368676006793976, |
| "learning_rate": 0.0005813080955154338, |
| "loss": 3.9175, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.6008383791336749, |
| "grad_norm": 0.32926130294799805, |
| "learning_rate": 0.0005811333721607454, |
| "loss": 3.8967, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.6153935724266417, |
| "grad_norm": 0.32293272018432617, |
| "learning_rate": 0.0005809586488060571, |
| "loss": 3.8934, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.6299487657196088, |
| "grad_norm": 0.35573187470436096, |
| "learning_rate": 0.0005807839254513686, |
| "loss": 3.8959, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.6445039590125758, |
| "grad_norm": 0.3371495306491852, |
| "learning_rate": 0.0005806092020966802, |
| "loss": 3.8953, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.6590591523055425, |
| "grad_norm": 0.3150193989276886, |
| "learning_rate": 0.0005804344787419918, |
| "loss": 3.8906, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.6736143455985095, |
| "grad_norm": 0.34463661909103394, |
| "learning_rate": 0.0005802597553873033, |
| "loss": 3.8892, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.6881695388914766, |
| "grad_norm": 0.3225216865539551, |
| "learning_rate": 0.000580085032032615, |
| "loss": 3.8853, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.7027247321844434, |
| "grad_norm": 0.33168166875839233, |
| "learning_rate": 0.0005799103086779265, |
| "loss": 3.8872, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.7172799254774103, |
| "grad_norm": 0.3117685914039612, |
| "learning_rate": 0.0005797355853232382, |
| "loss": 3.8712, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.7318351187703773, |
| "grad_norm": 0.3171573579311371, |
| "learning_rate": 0.0005795608619685497, |
| "loss": 3.8869, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.7463903120633442, |
| "grad_norm": 0.3368074893951416, |
| "learning_rate": 0.0005793861386138614, |
| "loss": 3.8665, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.7463903120633442, |
| "eval_accuracy": 0.3372165138557054, |
| "eval_loss": 3.8597424030303955, |
| "eval_runtime": 179.7463, |
| "eval_samples_per_second": 92.636, |
| "eval_steps_per_second": 5.791, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.760945505356311, |
| "grad_norm": 0.33153268694877625, |
| "learning_rate": 0.0005792114152591729, |
| "loss": 3.8785, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.775500698649278, |
| "grad_norm": 0.3446453809738159, |
| "learning_rate": 0.0005790366919044846, |
| "loss": 3.8873, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.7900558919422451, |
| "grad_norm": 0.34094732999801636, |
| "learning_rate": 0.0005788619685497961, |
| "loss": 3.8715, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.804611085235212, |
| "grad_norm": 0.332516610622406, |
| "learning_rate": 0.0005786872451951077, |
| "loss": 3.8612, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.8191662785281788, |
| "grad_norm": 0.33042222261428833, |
| "learning_rate": 0.0005785125218404193, |
| "loss": 3.8693, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.8337214718211459, |
| "grad_norm": 0.31963613629341125, |
| "learning_rate": 0.0005783377984857308, |
| "loss": 3.8765, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.8482766651141127, |
| "grad_norm": 0.3356238305568695, |
| "learning_rate": 0.0005781630751310425, |
| "loss": 3.8714, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.8628318584070795, |
| "grad_norm": 0.32635051012039185, |
| "learning_rate": 0.0005779883517763541, |
| "loss": 3.8626, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.8773870517000466, |
| "grad_norm": 0.3183547556400299, |
| "learning_rate": 0.0005778136284216657, |
| "loss": 3.8541, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.8919422449930137, |
| "grad_norm": 0.332824170589447, |
| "learning_rate": 0.0005776389050669772, |
| "loss": 3.8672, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.9064974382859803, |
| "grad_norm": 0.3161756694316864, |
| "learning_rate": 0.0005774641817122889, |
| "loss": 3.8409, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.9210526315789473, |
| "grad_norm": 0.33893078565597534, |
| "learning_rate": 0.0005772894583576004, |
| "loss": 3.8608, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.9356078248719144, |
| "grad_norm": 0.32037511467933655, |
| "learning_rate": 0.000577114735002912, |
| "loss": 3.8494, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.9501630181648812, |
| "grad_norm": 0.30871209502220154, |
| "learning_rate": 0.0005769400116482236, |
| "loss": 3.8421, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.964718211457848, |
| "grad_norm": 0.3202383518218994, |
| "learning_rate": 0.0005767652882935352, |
| "loss": 3.8567, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.9792734047508151, |
| "grad_norm": 0.3361697494983673, |
| "learning_rate": 0.0005765905649388468, |
| "loss": 3.8522, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.993828598043782, |
| "grad_norm": 0.32010895013809204, |
| "learning_rate": 0.0005764158415841583, |
| "loss": 3.8407, |
| "step": 6850 |
| }, |
| { |
| "epoch": 2.0081509082440614, |
| "grad_norm": 0.33929452300071716, |
| "learning_rate": 0.00057624111822947, |
| "loss": 3.7821, |
| "step": 6900 |
| }, |
| { |
| "epoch": 2.0227061015370285, |
| "grad_norm": 0.32835519313812256, |
| "learning_rate": 0.0005760663948747816, |
| "loss": 3.7356, |
| "step": 6950 |
| }, |
| { |
| "epoch": 2.0372612948299955, |
| "grad_norm": 0.31738170981407166, |
| "learning_rate": 0.0005758916715200931, |
| "loss": 3.7343, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.0372612948299955, |
| "eval_accuracy": 0.3415684324478317, |
| "eval_loss": 3.8141865730285645, |
| "eval_runtime": 179.7242, |
| "eval_samples_per_second": 92.648, |
| "eval_steps_per_second": 5.792, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.051816488122962, |
| "grad_norm": 0.3155139088630676, |
| "learning_rate": 0.0005757169481654047, |
| "loss": 3.7447, |
| "step": 7050 |
| }, |
| { |
| "epoch": 2.066371681415929, |
| "grad_norm": 0.3158959448337555, |
| "learning_rate": 0.0005755422248107163, |
| "loss": 3.7481, |
| "step": 7100 |
| }, |
| { |
| "epoch": 2.0809268747088963, |
| "grad_norm": 0.3319007158279419, |
| "learning_rate": 0.0005753675014560279, |
| "loss": 3.7539, |
| "step": 7150 |
| }, |
| { |
| "epoch": 2.095482068001863, |
| "grad_norm": 0.31912368535995483, |
| "learning_rate": 0.0005751927781013395, |
| "loss": 3.7488, |
| "step": 7200 |
| }, |
| { |
| "epoch": 2.11003726129483, |
| "grad_norm": 0.31239715218544006, |
| "learning_rate": 0.0005750180547466511, |
| "loss": 3.7673, |
| "step": 7250 |
| }, |
| { |
| "epoch": 2.124592454587797, |
| "grad_norm": 0.3297717273235321, |
| "learning_rate": 0.0005748433313919627, |
| "loss": 3.7545, |
| "step": 7300 |
| }, |
| { |
| "epoch": 2.139147647880764, |
| "grad_norm": 0.34258368611335754, |
| "learning_rate": 0.0005746686080372743, |
| "loss": 3.7494, |
| "step": 7350 |
| }, |
| { |
| "epoch": 2.1537028411737307, |
| "grad_norm": 0.323652058839798, |
| "learning_rate": 0.0005744938846825858, |
| "loss": 3.7428, |
| "step": 7400 |
| }, |
| { |
| "epoch": 2.1682580344666977, |
| "grad_norm": 0.3314710557460785, |
| "learning_rate": 0.0005743191613278974, |
| "loss": 3.7627, |
| "step": 7450 |
| }, |
| { |
| "epoch": 2.182813227759665, |
| "grad_norm": 0.3517455458641052, |
| "learning_rate": 0.0005741444379732091, |
| "loss": 3.7598, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.1973684210526314, |
| "grad_norm": 0.34565964341163635, |
| "learning_rate": 0.0005739697146185206, |
| "loss": 3.7568, |
| "step": 7550 |
| }, |
| { |
| "epoch": 2.2119236143455985, |
| "grad_norm": 0.31987765431404114, |
| "learning_rate": 0.0005737949912638322, |
| "loss": 3.7692, |
| "step": 7600 |
| }, |
| { |
| "epoch": 2.2264788076385655, |
| "grad_norm": 0.3262273371219635, |
| "learning_rate": 0.0005736202679091438, |
| "loss": 3.7536, |
| "step": 7650 |
| }, |
| { |
| "epoch": 2.241034000931532, |
| "grad_norm": 0.32346490025520325, |
| "learning_rate": 0.0005734455445544554, |
| "loss": 3.7582, |
| "step": 7700 |
| }, |
| { |
| "epoch": 2.255589194224499, |
| "grad_norm": 0.3117275834083557, |
| "learning_rate": 0.000573270821199767, |
| "loss": 3.741, |
| "step": 7750 |
| }, |
| { |
| "epoch": 2.2701443875174663, |
| "grad_norm": 0.3295489251613617, |
| "learning_rate": 0.0005730960978450785, |
| "loss": 3.7585, |
| "step": 7800 |
| }, |
| { |
| "epoch": 2.2846995808104333, |
| "grad_norm": 0.30941474437713623, |
| "learning_rate": 0.0005729213744903902, |
| "loss": 3.7513, |
| "step": 7850 |
| }, |
| { |
| "epoch": 2.2992547741034, |
| "grad_norm": 0.3114382028579712, |
| "learning_rate": 0.0005727466511357017, |
| "loss": 3.7662, |
| "step": 7900 |
| }, |
| { |
| "epoch": 2.313809967396367, |
| "grad_norm": 0.3253454267978668, |
| "learning_rate": 0.0005725719277810134, |
| "loss": 3.757, |
| "step": 7950 |
| }, |
| { |
| "epoch": 2.328365160689334, |
| "grad_norm": 0.32134369015693665, |
| "learning_rate": 0.0005723972044263249, |
| "loss": 3.7434, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.328365160689334, |
| "eval_accuracy": 0.34472039725169445, |
| "eval_loss": 3.7803711891174316, |
| "eval_runtime": 179.7407, |
| "eval_samples_per_second": 92.639, |
| "eval_steps_per_second": 5.792, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.3429203539823007, |
| "grad_norm": 0.31129831075668335, |
| "learning_rate": 0.0005722224810716366, |
| "loss": 3.7715, |
| "step": 8050 |
| }, |
| { |
| "epoch": 2.3574755472752678, |
| "grad_norm": 0.31825774908065796, |
| "learning_rate": 0.0005720477577169481, |
| "loss": 3.7519, |
| "step": 8100 |
| }, |
| { |
| "epoch": 2.372030740568235, |
| "grad_norm": 0.3173275291919708, |
| "learning_rate": 0.0005718730343622598, |
| "loss": 3.7538, |
| "step": 8150 |
| }, |
| { |
| "epoch": 2.386585933861202, |
| "grad_norm": 0.3088977038860321, |
| "learning_rate": 0.0005716983110075713, |
| "loss": 3.7434, |
| "step": 8200 |
| }, |
| { |
| "epoch": 2.4011411271541685, |
| "grad_norm": 0.31404852867126465, |
| "learning_rate": 0.0005715235876528828, |
| "loss": 3.7509, |
| "step": 8250 |
| }, |
| { |
| "epoch": 2.4156963204471356, |
| "grad_norm": 0.34154555201530457, |
| "learning_rate": 0.0005713488642981945, |
| "loss": 3.7509, |
| "step": 8300 |
| }, |
| { |
| "epoch": 2.4302515137401026, |
| "grad_norm": 0.3258163332939148, |
| "learning_rate": 0.0005711741409435061, |
| "loss": 3.7483, |
| "step": 8350 |
| }, |
| { |
| "epoch": 2.4448067070330692, |
| "grad_norm": 0.30587008595466614, |
| "learning_rate": 0.0005709994175888177, |
| "loss": 3.7547, |
| "step": 8400 |
| }, |
| { |
| "epoch": 2.4593619003260363, |
| "grad_norm": 0.304930180311203, |
| "learning_rate": 0.0005708246942341292, |
| "loss": 3.7466, |
| "step": 8450 |
| }, |
| { |
| "epoch": 2.4739170936190034, |
| "grad_norm": 0.31150105595588684, |
| "learning_rate": 0.0005706499708794409, |
| "loss": 3.7452, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.4884722869119704, |
| "grad_norm": 0.31906256079673767, |
| "learning_rate": 0.0005704752475247524, |
| "loss": 3.7465, |
| "step": 8550 |
| }, |
| { |
| "epoch": 2.503027480204937, |
| "grad_norm": 0.30793827772140503, |
| "learning_rate": 0.0005703005241700641, |
| "loss": 3.757, |
| "step": 8600 |
| }, |
| { |
| "epoch": 2.517582673497904, |
| "grad_norm": 0.312449187040329, |
| "learning_rate": 0.0005701258008153756, |
| "loss": 3.744, |
| "step": 8650 |
| }, |
| { |
| "epoch": 2.5321378667908707, |
| "grad_norm": 0.3282853364944458, |
| "learning_rate": 0.0005699510774606872, |
| "loss": 3.7454, |
| "step": 8700 |
| }, |
| { |
| "epoch": 2.546693060083838, |
| "grad_norm": 0.3083302080631256, |
| "learning_rate": 0.0005697763541059988, |
| "loss": 3.7455, |
| "step": 8750 |
| }, |
| { |
| "epoch": 2.561248253376805, |
| "grad_norm": 0.314094215631485, |
| "learning_rate": 0.0005696016307513103, |
| "loss": 3.7441, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.575803446669772, |
| "grad_norm": 0.3158092796802521, |
| "learning_rate": 0.000569426907396622, |
| "loss": 3.7521, |
| "step": 8850 |
| }, |
| { |
| "epoch": 2.590358639962739, |
| "grad_norm": 0.3241555094718933, |
| "learning_rate": 0.0005692521840419336, |
| "loss": 3.7385, |
| "step": 8900 |
| }, |
| { |
| "epoch": 2.6049138332557056, |
| "grad_norm": 0.34334230422973633, |
| "learning_rate": 0.0005690774606872452, |
| "loss": 3.7346, |
| "step": 8950 |
| }, |
| { |
| "epoch": 2.6194690265486726, |
| "grad_norm": 0.3200402855873108, |
| "learning_rate": 0.0005689027373325567, |
| "loss": 3.7318, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.6194690265486726, |
| "eval_accuracy": 0.34772286732307234, |
| "eval_loss": 3.7532577514648438, |
| "eval_runtime": 179.9031, |
| "eval_samples_per_second": 92.555, |
| "eval_steps_per_second": 5.786, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.6340242198416393, |
| "grad_norm": 0.3395897150039673, |
| "learning_rate": 0.0005687280139778683, |
| "loss": 3.7331, |
| "step": 9050 |
| }, |
| { |
| "epoch": 2.6485794131346063, |
| "grad_norm": 0.3154236972332001, |
| "learning_rate": 0.0005685532906231799, |
| "loss": 3.7301, |
| "step": 9100 |
| }, |
| { |
| "epoch": 2.6631346064275734, |
| "grad_norm": 0.31031179428100586, |
| "learning_rate": 0.0005683785672684915, |
| "loss": 3.7267, |
| "step": 9150 |
| }, |
| { |
| "epoch": 2.6776897997205404, |
| "grad_norm": 0.3203895688056946, |
| "learning_rate": 0.0005682038439138031, |
| "loss": 3.7326, |
| "step": 9200 |
| }, |
| { |
| "epoch": 2.692244993013507, |
| "grad_norm": 0.3058587312698364, |
| "learning_rate": 0.0005680291205591147, |
| "loss": 3.7222, |
| "step": 9250 |
| }, |
| { |
| "epoch": 2.706800186306474, |
| "grad_norm": 0.31644535064697266, |
| "learning_rate": 0.0005678543972044263, |
| "loss": 3.7268, |
| "step": 9300 |
| }, |
| { |
| "epoch": 2.721355379599441, |
| "grad_norm": 0.30357447266578674, |
| "learning_rate": 0.0005676796738497378, |
| "loss": 3.7311, |
| "step": 9350 |
| }, |
| { |
| "epoch": 2.735910572892408, |
| "grad_norm": 0.3125144839286804, |
| "learning_rate": 0.0005675049504950495, |
| "loss": 3.7375, |
| "step": 9400 |
| }, |
| { |
| "epoch": 2.750465766185375, |
| "grad_norm": 0.3147082030773163, |
| "learning_rate": 0.0005673302271403611, |
| "loss": 3.7399, |
| "step": 9450 |
| }, |
| { |
| "epoch": 2.765020959478342, |
| "grad_norm": 0.304993599653244, |
| "learning_rate": 0.0005671555037856726, |
| "loss": 3.7334, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.779576152771309, |
| "grad_norm": 0.32299643754959106, |
| "learning_rate": 0.0005669807804309842, |
| "loss": 3.7218, |
| "step": 9550 |
| }, |
| { |
| "epoch": 2.7941313460642756, |
| "grad_norm": 0.32130691409111023, |
| "learning_rate": 0.0005668060570762958, |
| "loss": 3.7313, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.8086865393572427, |
| "grad_norm": 0.3162747621536255, |
| "learning_rate": 0.0005666313337216074, |
| "loss": 3.7299, |
| "step": 9650 |
| }, |
| { |
| "epoch": 2.8232417326502097, |
| "grad_norm": 0.3274405002593994, |
| "learning_rate": 0.000566456610366919, |
| "loss": 3.7252, |
| "step": 9700 |
| }, |
| { |
| "epoch": 2.8377969259431763, |
| "grad_norm": 0.31984663009643555, |
| "learning_rate": 0.0005662818870122306, |
| "loss": 3.735, |
| "step": 9750 |
| }, |
| { |
| "epoch": 2.8523521192361434, |
| "grad_norm": 0.2940428555011749, |
| "learning_rate": 0.0005661071636575422, |
| "loss": 3.7267, |
| "step": 9800 |
| }, |
| { |
| "epoch": 2.8669073125291105, |
| "grad_norm": 0.306113064289093, |
| "learning_rate": 0.0005659324403028537, |
| "loss": 3.7327, |
| "step": 9850 |
| }, |
| { |
| "epoch": 2.8814625058220775, |
| "grad_norm": 0.3145885467529297, |
| "learning_rate": 0.0005657577169481653, |
| "loss": 3.7142, |
| "step": 9900 |
| }, |
| { |
| "epoch": 2.896017699115044, |
| "grad_norm": 0.29591280221939087, |
| "learning_rate": 0.0005655829935934769, |
| "loss": 3.7039, |
| "step": 9950 |
| }, |
| { |
| "epoch": 2.910572892408011, |
| "grad_norm": 0.3004099726676941, |
| "learning_rate": 0.0005654082702387886, |
| "loss": 3.7166, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.910572892408011, |
| "eval_accuracy": 0.34956734085421903, |
| "eval_loss": 3.726909637451172, |
| "eval_runtime": 179.5861, |
| "eval_samples_per_second": 92.719, |
| "eval_steps_per_second": 5.797, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.9251280857009783, |
| "grad_norm": 0.3109896183013916, |
| "learning_rate": 0.0005652335468841001, |
| "loss": 3.7304, |
| "step": 10050 |
| }, |
| { |
| "epoch": 2.939683278993945, |
| "grad_norm": 0.3140173554420471, |
| "learning_rate": 0.0005650588235294117, |
| "loss": 3.7058, |
| "step": 10100 |
| }, |
| { |
| "epoch": 2.954238472286912, |
| "grad_norm": 0.31375789642333984, |
| "learning_rate": 0.0005648841001747233, |
| "loss": 3.7248, |
| "step": 10150 |
| }, |
| { |
| "epoch": 2.968793665579879, |
| "grad_norm": 0.309282511472702, |
| "learning_rate": 0.0005647093768200349, |
| "loss": 3.7292, |
| "step": 10200 |
| }, |
| { |
| "epoch": 2.983348858872846, |
| "grad_norm": 0.3091147243976593, |
| "learning_rate": 0.0005645346534653465, |
| "loss": 3.7245, |
| "step": 10250 |
| }, |
| { |
| "epoch": 2.9979040521658127, |
| "grad_norm": 0.31467899680137634, |
| "learning_rate": 0.0005643599301106582, |
| "loss": 3.717, |
| "step": 10300 |
| }, |
| { |
| "epoch": 3.0122263623660923, |
| "grad_norm": 0.31328141689300537, |
| "learning_rate": 0.0005641852067559697, |
| "loss": 3.6388, |
| "step": 10350 |
| }, |
| { |
| "epoch": 3.026781555659059, |
| "grad_norm": 0.30440211296081543, |
| "learning_rate": 0.0005640104834012812, |
| "loss": 3.6052, |
| "step": 10400 |
| }, |
| { |
| "epoch": 3.041336748952026, |
| "grad_norm": 0.31965646147727966, |
| "learning_rate": 0.0005638357600465929, |
| "loss": 3.6184, |
| "step": 10450 |
| }, |
| { |
| "epoch": 3.055891942244993, |
| "grad_norm": 0.3041605055332184, |
| "learning_rate": 0.0005636610366919044, |
| "loss": 3.6315, |
| "step": 10500 |
| }, |
| { |
| "epoch": 3.07044713553796, |
| "grad_norm": 0.3063664734363556, |
| "learning_rate": 0.0005634863133372161, |
| "loss": 3.6164, |
| "step": 10550 |
| }, |
| { |
| "epoch": 3.0850023288309267, |
| "grad_norm": 0.3086493909358978, |
| "learning_rate": 0.0005633115899825276, |
| "loss": 3.6269, |
| "step": 10600 |
| }, |
| { |
| "epoch": 3.099557522123894, |
| "grad_norm": 0.3314305245876312, |
| "learning_rate": 0.0005631368666278393, |
| "loss": 3.626, |
| "step": 10650 |
| }, |
| { |
| "epoch": 3.114112715416861, |
| "grad_norm": 0.3184899687767029, |
| "learning_rate": 0.0005629621432731508, |
| "loss": 3.6244, |
| "step": 10700 |
| }, |
| { |
| "epoch": 3.1286679087098275, |
| "grad_norm": 0.30493825674057007, |
| "learning_rate": 0.0005627874199184623, |
| "loss": 3.6374, |
| "step": 10750 |
| }, |
| { |
| "epoch": 3.1432231020027945, |
| "grad_norm": 0.32075250148773193, |
| "learning_rate": 0.000562612696563774, |
| "loss": 3.6312, |
| "step": 10800 |
| }, |
| { |
| "epoch": 3.1577782952957616, |
| "grad_norm": 0.31725215911865234, |
| "learning_rate": 0.0005624379732090856, |
| "loss": 3.6337, |
| "step": 10850 |
| }, |
| { |
| "epoch": 3.1723334885887287, |
| "grad_norm": 0.3105776607990265, |
| "learning_rate": 0.0005622632498543972, |
| "loss": 3.6398, |
| "step": 10900 |
| }, |
| { |
| "epoch": 3.1868886818816953, |
| "grad_norm": 0.31388428807258606, |
| "learning_rate": 0.0005620885264997087, |
| "loss": 3.6342, |
| "step": 10950 |
| }, |
| { |
| "epoch": 3.2014438751746623, |
| "grad_norm": 0.338467538356781, |
| "learning_rate": 0.0005619138031450204, |
| "loss": 3.6328, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.2014438751746623, |
| "eval_accuracy": 0.3517692149211257, |
| "eval_loss": 3.71359920501709, |
| "eval_runtime": 179.6628, |
| "eval_samples_per_second": 92.679, |
| "eval_steps_per_second": 5.794, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.2159990684676294, |
| "grad_norm": 0.33954986929893494, |
| "learning_rate": 0.0005617390797903319, |
| "loss": 3.634, |
| "step": 11050 |
| }, |
| { |
| "epoch": 3.230554261760596, |
| "grad_norm": 0.3152860999107361, |
| "learning_rate": 0.0005615643564356436, |
| "loss": 3.6452, |
| "step": 11100 |
| }, |
| { |
| "epoch": 3.245109455053563, |
| "grad_norm": 0.31836217641830444, |
| "learning_rate": 0.0005613896330809551, |
| "loss": 3.6352, |
| "step": 11150 |
| }, |
| { |
| "epoch": 3.25966464834653, |
| "grad_norm": 0.3215279281139374, |
| "learning_rate": 0.0005612149097262667, |
| "loss": 3.6452, |
| "step": 11200 |
| }, |
| { |
| "epoch": 3.274219841639497, |
| "grad_norm": 0.31674569845199585, |
| "learning_rate": 0.0005610401863715783, |
| "loss": 3.6434, |
| "step": 11250 |
| }, |
| { |
| "epoch": 3.288775034932464, |
| "grad_norm": 0.2988327741622925, |
| "learning_rate": 0.0005608654630168898, |
| "loss": 3.6226, |
| "step": 11300 |
| }, |
| { |
| "epoch": 3.303330228225431, |
| "grad_norm": 0.322599858045578, |
| "learning_rate": 0.0005606907396622015, |
| "loss": 3.6409, |
| "step": 11350 |
| }, |
| { |
| "epoch": 3.317885421518398, |
| "grad_norm": 0.3030698001384735, |
| "learning_rate": 0.0005605160163075131, |
| "loss": 3.6371, |
| "step": 11400 |
| }, |
| { |
| "epoch": 3.3324406148113646, |
| "grad_norm": 0.3100179433822632, |
| "learning_rate": 0.0005603412929528247, |
| "loss": 3.6327, |
| "step": 11450 |
| }, |
| { |
| "epoch": 3.3469958081043316, |
| "grad_norm": 0.30968624353408813, |
| "learning_rate": 0.0005601665695981362, |
| "loss": 3.6423, |
| "step": 11500 |
| }, |
| { |
| "epoch": 3.3615510013972987, |
| "grad_norm": 0.3197573125362396, |
| "learning_rate": 0.0005599918462434478, |
| "loss": 3.6325, |
| "step": 11550 |
| }, |
| { |
| "epoch": 3.3761061946902653, |
| "grad_norm": 0.3116932809352875, |
| "learning_rate": 0.0005598171228887594, |
| "loss": 3.6313, |
| "step": 11600 |
| }, |
| { |
| "epoch": 3.3906613879832324, |
| "grad_norm": 0.31749212741851807, |
| "learning_rate": 0.0005596423995340709, |
| "loss": 3.6446, |
| "step": 11650 |
| }, |
| { |
| "epoch": 3.4052165812761994, |
| "grad_norm": 0.31616830825805664, |
| "learning_rate": 0.0005594676761793826, |
| "loss": 3.6578, |
| "step": 11700 |
| }, |
| { |
| "epoch": 3.419771774569166, |
| "grad_norm": 0.30816033482551575, |
| "learning_rate": 0.0005592929528246942, |
| "loss": 3.6325, |
| "step": 11750 |
| }, |
| { |
| "epoch": 3.434326967862133, |
| "grad_norm": 0.3149195909500122, |
| "learning_rate": 0.0005591182294700058, |
| "loss": 3.6345, |
| "step": 11800 |
| }, |
| { |
| "epoch": 3.4488821611551, |
| "grad_norm": 0.318624883890152, |
| "learning_rate": 0.0005589435061153173, |
| "loss": 3.6419, |
| "step": 11850 |
| }, |
| { |
| "epoch": 3.463437354448067, |
| "grad_norm": 0.3080650269985199, |
| "learning_rate": 0.000558768782760629, |
| "loss": 3.6258, |
| "step": 11900 |
| }, |
| { |
| "epoch": 3.477992547741034, |
| "grad_norm": 0.3114171326160431, |
| "learning_rate": 0.0005585940594059406, |
| "loss": 3.6486, |
| "step": 11950 |
| }, |
| { |
| "epoch": 3.492547741034001, |
| "grad_norm": 0.30843472480773926, |
| "learning_rate": 0.0005584193360512521, |
| "loss": 3.6293, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.492547741034001, |
| "eval_accuracy": 0.35355715781836883, |
| "eval_loss": 3.6956379413604736, |
| "eval_runtime": 179.6151, |
| "eval_samples_per_second": 92.704, |
| "eval_steps_per_second": 5.796, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.507102934326968, |
| "grad_norm": 0.3114839196205139, |
| "learning_rate": 0.0005582446126965637, |
| "loss": 3.6463, |
| "step": 12050 |
| }, |
| { |
| "epoch": 3.5216581276199346, |
| "grad_norm": 0.31944605708122253, |
| "learning_rate": 0.0005580698893418753, |
| "loss": 3.63, |
| "step": 12100 |
| }, |
| { |
| "epoch": 3.5362133209129016, |
| "grad_norm": 0.3223167955875397, |
| "learning_rate": 0.0005578951659871869, |
| "loss": 3.6368, |
| "step": 12150 |
| }, |
| { |
| "epoch": 3.5507685142058687, |
| "grad_norm": 0.31075170636177063, |
| "learning_rate": 0.0005577204426324985, |
| "loss": 3.6487, |
| "step": 12200 |
| }, |
| { |
| "epoch": 3.5653237074988358, |
| "grad_norm": 0.30599287152290344, |
| "learning_rate": 0.0005575457192778101, |
| "loss": 3.6363, |
| "step": 12250 |
| }, |
| { |
| "epoch": 3.5798789007918024, |
| "grad_norm": 0.3183053433895111, |
| "learning_rate": 0.0005573709959231217, |
| "loss": 3.6563, |
| "step": 12300 |
| }, |
| { |
| "epoch": 3.5944340940847694, |
| "grad_norm": 0.3029063642024994, |
| "learning_rate": 0.0005571962725684332, |
| "loss": 3.6467, |
| "step": 12350 |
| }, |
| { |
| "epoch": 3.6089892873777365, |
| "grad_norm": 0.3174987733364105, |
| "learning_rate": 0.0005570215492137449, |
| "loss": 3.6467, |
| "step": 12400 |
| }, |
| { |
| "epoch": 3.623544480670703, |
| "grad_norm": 0.32621219754219055, |
| "learning_rate": 0.0005568468258590564, |
| "loss": 3.6284, |
| "step": 12450 |
| }, |
| { |
| "epoch": 3.63809967396367, |
| "grad_norm": 0.31085944175720215, |
| "learning_rate": 0.0005566721025043681, |
| "loss": 3.6346, |
| "step": 12500 |
| }, |
| { |
| "epoch": 3.6526548672566372, |
| "grad_norm": 0.30973267555236816, |
| "learning_rate": 0.0005564973791496796, |
| "loss": 3.6303, |
| "step": 12550 |
| }, |
| { |
| "epoch": 3.6672100605496043, |
| "grad_norm": 0.3051224946975708, |
| "learning_rate": 0.0005563226557949913, |
| "loss": 3.6498, |
| "step": 12600 |
| }, |
| { |
| "epoch": 3.681765253842571, |
| "grad_norm": 0.30966806411743164, |
| "learning_rate": 0.0005561479324403028, |
| "loss": 3.6451, |
| "step": 12650 |
| }, |
| { |
| "epoch": 3.696320447135538, |
| "grad_norm": 0.31790247559547424, |
| "learning_rate": 0.0005559732090856144, |
| "loss": 3.6464, |
| "step": 12700 |
| }, |
| { |
| "epoch": 3.710875640428505, |
| "grad_norm": 0.3122842311859131, |
| "learning_rate": 0.000555798485730926, |
| "loss": 3.6368, |
| "step": 12750 |
| }, |
| { |
| "epoch": 3.7254308337214717, |
| "grad_norm": 0.3073221445083618, |
| "learning_rate": 0.0005556237623762376, |
| "loss": 3.6385, |
| "step": 12800 |
| }, |
| { |
| "epoch": 3.7399860270144387, |
| "grad_norm": 0.3260260224342346, |
| "learning_rate": 0.0005554490390215492, |
| "loss": 3.6424, |
| "step": 12850 |
| }, |
| { |
| "epoch": 3.754541220307406, |
| "grad_norm": 0.29862716794013977, |
| "learning_rate": 0.0005552743156668607, |
| "loss": 3.6521, |
| "step": 12900 |
| }, |
| { |
| "epoch": 3.769096413600373, |
| "grad_norm": 0.3145371675491333, |
| "learning_rate": 0.0005550995923121724, |
| "loss": 3.6345, |
| "step": 12950 |
| }, |
| { |
| "epoch": 3.7836516068933395, |
| "grad_norm": 0.31774255633354187, |
| "learning_rate": 0.0005549248689574839, |
| "loss": 3.6383, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.7836516068933395, |
| "eval_accuracy": 0.3552514314531981, |
| "eval_loss": 3.6770644187927246, |
| "eval_runtime": 179.7074, |
| "eval_samples_per_second": 92.656, |
| "eval_steps_per_second": 5.793, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.7982068001863065, |
| "grad_norm": 0.31905579566955566, |
| "learning_rate": 0.0005547501456027955, |
| "loss": 3.6483, |
| "step": 13050 |
| }, |
| { |
| "epoch": 3.812761993479273, |
| "grad_norm": 0.29292047023773193, |
| "learning_rate": 0.0005545754222481071, |
| "loss": 3.6467, |
| "step": 13100 |
| }, |
| { |
| "epoch": 3.82731718677224, |
| "grad_norm": 0.3134244978427887, |
| "learning_rate": 0.0005544006988934188, |
| "loss": 3.6342, |
| "step": 13150 |
| }, |
| { |
| "epoch": 3.8418723800652073, |
| "grad_norm": 0.3155539035797119, |
| "learning_rate": 0.0005542259755387303, |
| "loss": 3.6396, |
| "step": 13200 |
| }, |
| { |
| "epoch": 3.8564275733581743, |
| "grad_norm": 0.30693483352661133, |
| "learning_rate": 0.0005540512521840418, |
| "loss": 3.6287, |
| "step": 13250 |
| }, |
| { |
| "epoch": 3.8709827666511414, |
| "grad_norm": 0.3106670677661896, |
| "learning_rate": 0.0005538765288293535, |
| "loss": 3.6293, |
| "step": 13300 |
| }, |
| { |
| "epoch": 3.885537959944108, |
| "grad_norm": 0.3124571442604065, |
| "learning_rate": 0.0005537018054746651, |
| "loss": 3.6383, |
| "step": 13350 |
| }, |
| { |
| "epoch": 3.900093153237075, |
| "grad_norm": 0.2964450418949127, |
| "learning_rate": 0.0005535270821199767, |
| "loss": 3.6412, |
| "step": 13400 |
| }, |
| { |
| "epoch": 3.9146483465300417, |
| "grad_norm": 0.2931179702281952, |
| "learning_rate": 0.0005533523587652882, |
| "loss": 3.6425, |
| "step": 13450 |
| }, |
| { |
| "epoch": 3.9292035398230087, |
| "grad_norm": 0.30559033155441284, |
| "learning_rate": 0.0005531776354105999, |
| "loss": 3.626, |
| "step": 13500 |
| }, |
| { |
| "epoch": 3.943758733115976, |
| "grad_norm": 0.30951324105262756, |
| "learning_rate": 0.0005530029120559114, |
| "loss": 3.6453, |
| "step": 13550 |
| }, |
| { |
| "epoch": 3.958313926408943, |
| "grad_norm": 0.3029481768608093, |
| "learning_rate": 0.0005528281887012229, |
| "loss": 3.6301, |
| "step": 13600 |
| }, |
| { |
| "epoch": 3.9728691197019095, |
| "grad_norm": 0.2964765727519989, |
| "learning_rate": 0.0005526534653465346, |
| "loss": 3.643, |
| "step": 13650 |
| }, |
| { |
| "epoch": 3.9874243129948765, |
| "grad_norm": 0.2986086905002594, |
| "learning_rate": 0.0005524787419918462, |
| "loss": 3.6376, |
| "step": 13700 |
| }, |
| { |
| "epoch": 4.001746623195156, |
| "grad_norm": 0.30963781476020813, |
| "learning_rate": 0.0005523040186371578, |
| "loss": 3.6197, |
| "step": 13750 |
| }, |
| { |
| "epoch": 4.016301816488123, |
| "grad_norm": 0.3227597177028656, |
| "learning_rate": 0.0005521292952824693, |
| "loss": 3.5223, |
| "step": 13800 |
| }, |
| { |
| "epoch": 4.03085700978109, |
| "grad_norm": 0.32853272557258606, |
| "learning_rate": 0.000551954571927781, |
| "loss": 3.5391, |
| "step": 13850 |
| }, |
| { |
| "epoch": 4.045412203074057, |
| "grad_norm": 0.32738175988197327, |
| "learning_rate": 0.0005517798485730926, |
| "loss": 3.5281, |
| "step": 13900 |
| }, |
| { |
| "epoch": 4.059967396367024, |
| "grad_norm": 0.31711599230766296, |
| "learning_rate": 0.0005516051252184042, |
| "loss": 3.5304, |
| "step": 13950 |
| }, |
| { |
| "epoch": 4.074522589659991, |
| "grad_norm": 0.2948492169380188, |
| "learning_rate": 0.0005514304018637157, |
| "loss": 3.5359, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.074522589659991, |
| "eval_accuracy": 0.356798090792429, |
| "eval_loss": 3.6693131923675537, |
| "eval_runtime": 179.7092, |
| "eval_samples_per_second": 92.655, |
| "eval_steps_per_second": 5.793, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.089077782952957, |
| "grad_norm": 0.32520484924316406, |
| "learning_rate": 0.0005512556785090273, |
| "loss": 3.5517, |
| "step": 14050 |
| }, |
| { |
| "epoch": 4.103632976245924, |
| "grad_norm": 0.3147394061088562, |
| "learning_rate": 0.0005510809551543389, |
| "loss": 3.5448, |
| "step": 14100 |
| }, |
| { |
| "epoch": 4.118188169538891, |
| "grad_norm": 0.303621768951416, |
| "learning_rate": 0.0005509062317996504, |
| "loss": 3.5423, |
| "step": 14150 |
| }, |
| { |
| "epoch": 4.132743362831858, |
| "grad_norm": 0.3130205571651459, |
| "learning_rate": 0.0005507315084449621, |
| "loss": 3.5559, |
| "step": 14200 |
| }, |
| { |
| "epoch": 4.1472985561248255, |
| "grad_norm": 0.342622846364975, |
| "learning_rate": 0.0005505567850902737, |
| "loss": 3.5519, |
| "step": 14250 |
| }, |
| { |
| "epoch": 4.1618537494177925, |
| "grad_norm": 0.3157576322555542, |
| "learning_rate": 0.0005503820617355853, |
| "loss": 3.5574, |
| "step": 14300 |
| }, |
| { |
| "epoch": 4.17640894271076, |
| "grad_norm": 0.3237878084182739, |
| "learning_rate": 0.0005502073383808969, |
| "loss": 3.5596, |
| "step": 14350 |
| }, |
| { |
| "epoch": 4.190964136003726, |
| "grad_norm": 0.3167899250984192, |
| "learning_rate": 0.0005500326150262085, |
| "loss": 3.5716, |
| "step": 14400 |
| }, |
| { |
| "epoch": 4.205519329296693, |
| "grad_norm": 0.3111872971057892, |
| "learning_rate": 0.00054985789167152, |
| "loss": 3.5554, |
| "step": 14450 |
| }, |
| { |
| "epoch": 4.22007452258966, |
| "grad_norm": 0.3192780315876007, |
| "learning_rate": 0.0005496831683168316, |
| "loss": 3.5633, |
| "step": 14500 |
| }, |
| { |
| "epoch": 4.234629715882627, |
| "grad_norm": 0.3170258104801178, |
| "learning_rate": 0.0005495084449621433, |
| "loss": 3.5539, |
| "step": 14550 |
| }, |
| { |
| "epoch": 4.249184909175594, |
| "grad_norm": 0.31680041551589966, |
| "learning_rate": 0.0005493337216074548, |
| "loss": 3.5602, |
| "step": 14600 |
| }, |
| { |
| "epoch": 4.263740102468561, |
| "grad_norm": 0.3066939413547516, |
| "learning_rate": 0.0005491589982527664, |
| "loss": 3.5584, |
| "step": 14650 |
| }, |
| { |
| "epoch": 4.278295295761528, |
| "grad_norm": 0.35077103972435, |
| "learning_rate": 0.000548984274898078, |
| "loss": 3.5494, |
| "step": 14700 |
| }, |
| { |
| "epoch": 4.292850489054494, |
| "grad_norm": 0.321871280670166, |
| "learning_rate": 0.0005488095515433897, |
| "loss": 3.5569, |
| "step": 14750 |
| }, |
| { |
| "epoch": 4.307405682347461, |
| "grad_norm": 0.34034264087677, |
| "learning_rate": 0.0005486348281887012, |
| "loss": 3.5536, |
| "step": 14800 |
| }, |
| { |
| "epoch": 4.321960875640428, |
| "grad_norm": 0.30980339646339417, |
| "learning_rate": 0.0005484601048340127, |
| "loss": 3.5674, |
| "step": 14850 |
| }, |
| { |
| "epoch": 4.3365160689333955, |
| "grad_norm": 0.3066848814487457, |
| "learning_rate": 0.0005482853814793244, |
| "loss": 3.5601, |
| "step": 14900 |
| }, |
| { |
| "epoch": 4.3510712622263625, |
| "grad_norm": 0.3109862804412842, |
| "learning_rate": 0.0005481106581246359, |
| "loss": 3.5693, |
| "step": 14950 |
| }, |
| { |
| "epoch": 4.36562645551933, |
| "grad_norm": 0.321115106344223, |
| "learning_rate": 0.0005479359347699475, |
| "loss": 3.5563, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.36562645551933, |
| "eval_accuracy": 0.35777850357418167, |
| "eval_loss": 3.657255172729492, |
| "eval_runtime": 179.8875, |
| "eval_samples_per_second": 92.563, |
| "eval_steps_per_second": 5.787, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.380181648812297, |
| "grad_norm": 0.3224615454673767, |
| "learning_rate": 0.0005477612114152591, |
| "loss": 3.5622, |
| "step": 15050 |
| }, |
| { |
| "epoch": 4.394736842105263, |
| "grad_norm": 0.3030092120170593, |
| "learning_rate": 0.0005475864880605708, |
| "loss": 3.5642, |
| "step": 15100 |
| }, |
| { |
| "epoch": 4.40929203539823, |
| "grad_norm": 0.33209550380706787, |
| "learning_rate": 0.0005474117647058823, |
| "loss": 3.5749, |
| "step": 15150 |
| }, |
| { |
| "epoch": 4.423847228691197, |
| "grad_norm": 0.3086824119091034, |
| "learning_rate": 0.0005472370413511939, |
| "loss": 3.5624, |
| "step": 15200 |
| }, |
| { |
| "epoch": 4.438402421984164, |
| "grad_norm": 0.31303492188453674, |
| "learning_rate": 0.0005470623179965055, |
| "loss": 3.5803, |
| "step": 15250 |
| }, |
| { |
| "epoch": 4.452957615277131, |
| "grad_norm": 0.30704519152641296, |
| "learning_rate": 0.0005468875946418171, |
| "loss": 3.5768, |
| "step": 15300 |
| }, |
| { |
| "epoch": 4.467512808570098, |
| "grad_norm": 0.31732097268104553, |
| "learning_rate": 0.0005467128712871287, |
| "loss": 3.5743, |
| "step": 15350 |
| }, |
| { |
| "epoch": 4.482068001863064, |
| "grad_norm": 0.32602792978286743, |
| "learning_rate": 0.0005465381479324402, |
| "loss": 3.5603, |
| "step": 15400 |
| }, |
| { |
| "epoch": 4.496623195156031, |
| "grad_norm": 0.3007522523403168, |
| "learning_rate": 0.0005463634245777519, |
| "loss": 3.5684, |
| "step": 15450 |
| }, |
| { |
| "epoch": 4.511178388448998, |
| "grad_norm": 0.3011171519756317, |
| "learning_rate": 0.0005461887012230634, |
| "loss": 3.5605, |
| "step": 15500 |
| }, |
| { |
| "epoch": 4.5257335817419655, |
| "grad_norm": 0.31969600915908813, |
| "learning_rate": 0.000546013977868375, |
| "loss": 3.5675, |
| "step": 15550 |
| }, |
| { |
| "epoch": 4.5402887750349326, |
| "grad_norm": 0.3020431399345398, |
| "learning_rate": 0.0005458392545136866, |
| "loss": 3.596, |
| "step": 15600 |
| }, |
| { |
| "epoch": 4.5548439683279, |
| "grad_norm": 0.3164878189563751, |
| "learning_rate": 0.0005456645311589983, |
| "loss": 3.5748, |
| "step": 15650 |
| }, |
| { |
| "epoch": 4.569399161620867, |
| "grad_norm": 0.3169105350971222, |
| "learning_rate": 0.0005454898078043098, |
| "loss": 3.5658, |
| "step": 15700 |
| }, |
| { |
| "epoch": 4.583954354913834, |
| "grad_norm": 0.3334912359714508, |
| "learning_rate": 0.0005453150844496213, |
| "loss": 3.5709, |
| "step": 15750 |
| }, |
| { |
| "epoch": 4.5985095482068, |
| "grad_norm": 0.3061586022377014, |
| "learning_rate": 0.000545140361094933, |
| "loss": 3.5786, |
| "step": 15800 |
| }, |
| { |
| "epoch": 4.613064741499767, |
| "grad_norm": 0.3311856687068939, |
| "learning_rate": 0.0005449656377402445, |
| "loss": 3.5711, |
| "step": 15850 |
| }, |
| { |
| "epoch": 4.627619934792734, |
| "grad_norm": 0.3058924674987793, |
| "learning_rate": 0.0005447909143855562, |
| "loss": 3.5846, |
| "step": 15900 |
| }, |
| { |
| "epoch": 4.642175128085701, |
| "grad_norm": 0.3133726119995117, |
| "learning_rate": 0.0005446161910308677, |
| "loss": 3.5762, |
| "step": 15950 |
| }, |
| { |
| "epoch": 4.656730321378668, |
| "grad_norm": 0.2965134084224701, |
| "learning_rate": 0.0005444414676761794, |
| "loss": 3.569, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.656730321378668, |
| "eval_accuracy": 0.3587089672511339, |
| "eval_loss": 3.643244981765747, |
| "eval_runtime": 179.7262, |
| "eval_samples_per_second": 92.646, |
| "eval_steps_per_second": 5.792, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.671285514671635, |
| "grad_norm": 0.30532756447792053, |
| "learning_rate": 0.0005442667443214909, |
| "loss": 3.5707, |
| "step": 16050 |
| }, |
| { |
| "epoch": 4.685840707964601, |
| "grad_norm": 0.327919065952301, |
| "learning_rate": 0.0005440920209668024, |
| "loss": 3.5688, |
| "step": 16100 |
| }, |
| { |
| "epoch": 4.7003959012575685, |
| "grad_norm": 0.30351272225379944, |
| "learning_rate": 0.0005439172976121141, |
| "loss": 3.5807, |
| "step": 16150 |
| }, |
| { |
| "epoch": 4.7149510945505355, |
| "grad_norm": 0.3110066056251526, |
| "learning_rate": 0.0005437425742574257, |
| "loss": 3.5803, |
| "step": 16200 |
| }, |
| { |
| "epoch": 4.729506287843503, |
| "grad_norm": 0.31730541586875916, |
| "learning_rate": 0.0005435678509027373, |
| "loss": 3.5656, |
| "step": 16250 |
| }, |
| { |
| "epoch": 4.74406148113647, |
| "grad_norm": 0.30847522616386414, |
| "learning_rate": 0.0005433931275480488, |
| "loss": 3.5674, |
| "step": 16300 |
| }, |
| { |
| "epoch": 4.758616674429437, |
| "grad_norm": 0.29101788997650146, |
| "learning_rate": 0.0005432184041933605, |
| "loss": 3.5671, |
| "step": 16350 |
| }, |
| { |
| "epoch": 4.773171867722404, |
| "grad_norm": 0.2971099615097046, |
| "learning_rate": 0.000543043680838672, |
| "loss": 3.5663, |
| "step": 16400 |
| }, |
| { |
| "epoch": 4.78772706101537, |
| "grad_norm": 0.292220801115036, |
| "learning_rate": 0.0005428689574839837, |
| "loss": 3.5817, |
| "step": 16450 |
| }, |
| { |
| "epoch": 4.802282254308337, |
| "grad_norm": 0.3278649151325226, |
| "learning_rate": 0.0005426942341292952, |
| "loss": 3.5734, |
| "step": 16500 |
| }, |
| { |
| "epoch": 4.816837447601304, |
| "grad_norm": 0.3087938725948334, |
| "learning_rate": 0.0005425195107746068, |
| "loss": 3.5699, |
| "step": 16550 |
| }, |
| { |
| "epoch": 4.831392640894271, |
| "grad_norm": 0.3141016662120819, |
| "learning_rate": 0.0005423447874199184, |
| "loss": 3.571, |
| "step": 16600 |
| }, |
| { |
| "epoch": 4.845947834187238, |
| "grad_norm": 0.32644930481910706, |
| "learning_rate": 0.00054217006406523, |
| "loss": 3.5744, |
| "step": 16650 |
| }, |
| { |
| "epoch": 4.860503027480205, |
| "grad_norm": 0.33965256810188293, |
| "learning_rate": 0.0005419953407105417, |
| "loss": 3.577, |
| "step": 16700 |
| }, |
| { |
| "epoch": 4.875058220773171, |
| "grad_norm": 0.2934809923171997, |
| "learning_rate": 0.0005418206173558532, |
| "loss": 3.5731, |
| "step": 16750 |
| }, |
| { |
| "epoch": 4.8896134140661385, |
| "grad_norm": 0.32220134139060974, |
| "learning_rate": 0.0005416458940011648, |
| "loss": 3.5702, |
| "step": 16800 |
| }, |
| { |
| "epoch": 4.9041686073591055, |
| "grad_norm": 0.294540137052536, |
| "learning_rate": 0.0005414711706464764, |
| "loss": 3.5755, |
| "step": 16850 |
| }, |
| { |
| "epoch": 4.918723800652073, |
| "grad_norm": 0.33389946818351746, |
| "learning_rate": 0.000541296447291788, |
| "loss": 3.5798, |
| "step": 16900 |
| }, |
| { |
| "epoch": 4.93327899394504, |
| "grad_norm": 0.3083215057849884, |
| "learning_rate": 0.0005411217239370995, |
| "loss": 3.5754, |
| "step": 16950 |
| }, |
| { |
| "epoch": 4.947834187238007, |
| "grad_norm": 0.3171711564064026, |
| "learning_rate": 0.0005409470005824111, |
| "loss": 3.5858, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.947834187238007, |
| "eval_accuracy": 0.36012622902710545, |
| "eval_loss": 3.632199287414551, |
| "eval_runtime": 179.7045, |
| "eval_samples_per_second": 92.658, |
| "eval_steps_per_second": 5.793, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.962389380530974, |
| "grad_norm": 0.29613155126571655, |
| "learning_rate": 0.0005407722772277228, |
| "loss": 3.5864, |
| "step": 17050 |
| }, |
| { |
| "epoch": 4.976944573823941, |
| "grad_norm": 0.305269330739975, |
| "learning_rate": 0.0005405975538730343, |
| "loss": 3.5816, |
| "step": 17100 |
| }, |
| { |
| "epoch": 4.991499767116907, |
| "grad_norm": 0.293756902217865, |
| "learning_rate": 0.0005404228305183459, |
| "loss": 3.5746, |
| "step": 17150 |
| }, |
| { |
| "epoch": 5.005822077317187, |
| "grad_norm": 0.33331188559532166, |
| "learning_rate": 0.0005402481071636575, |
| "loss": 3.5379, |
| "step": 17200 |
| }, |
| { |
| "epoch": 5.020377270610154, |
| "grad_norm": 0.30909281969070435, |
| "learning_rate": 0.0005400733838089692, |
| "loss": 3.4594, |
| "step": 17250 |
| }, |
| { |
| "epoch": 5.034932463903121, |
| "grad_norm": 0.3031615912914276, |
| "learning_rate": 0.0005398986604542807, |
| "loss": 3.4567, |
| "step": 17300 |
| }, |
| { |
| "epoch": 5.049487657196088, |
| "grad_norm": 0.3327254354953766, |
| "learning_rate": 0.0005397239370995922, |
| "loss": 3.4685, |
| "step": 17350 |
| }, |
| { |
| "epoch": 5.064042850489055, |
| "grad_norm": 0.32414567470550537, |
| "learning_rate": 0.0005395492137449039, |
| "loss": 3.483, |
| "step": 17400 |
| }, |
| { |
| "epoch": 5.078598043782021, |
| "grad_norm": 0.2953879237174988, |
| "learning_rate": 0.0005393744903902154, |
| "loss": 3.484, |
| "step": 17450 |
| }, |
| { |
| "epoch": 5.093153237074988, |
| "grad_norm": 0.32500144839286804, |
| "learning_rate": 0.000539199767035527, |
| "loss": 3.4798, |
| "step": 17500 |
| }, |
| { |
| "epoch": 5.107708430367955, |
| "grad_norm": 0.33330488204956055, |
| "learning_rate": 0.0005390250436808386, |
| "loss": 3.4824, |
| "step": 17550 |
| }, |
| { |
| "epoch": 5.122263623660922, |
| "grad_norm": 0.3294694423675537, |
| "learning_rate": 0.0005388503203261503, |
| "loss": 3.4841, |
| "step": 17600 |
| }, |
| { |
| "epoch": 5.136818816953889, |
| "grad_norm": 0.328597754240036, |
| "learning_rate": 0.0005386755969714618, |
| "loss": 3.4937, |
| "step": 17650 |
| }, |
| { |
| "epoch": 5.151374010246856, |
| "grad_norm": 0.34227892756462097, |
| "learning_rate": 0.0005385008736167733, |
| "loss": 3.4842, |
| "step": 17700 |
| }, |
| { |
| "epoch": 5.165929203539823, |
| "grad_norm": 0.30092424154281616, |
| "learning_rate": 0.000538326150262085, |
| "loss": 3.4961, |
| "step": 17750 |
| }, |
| { |
| "epoch": 5.18048439683279, |
| "grad_norm": 0.31993362307548523, |
| "learning_rate": 0.0005381514269073965, |
| "loss": 3.5018, |
| "step": 17800 |
| }, |
| { |
| "epoch": 5.195039590125757, |
| "grad_norm": 0.31290343403816223, |
| "learning_rate": 0.0005379767035527082, |
| "loss": 3.4916, |
| "step": 17850 |
| }, |
| { |
| "epoch": 5.209594783418724, |
| "grad_norm": 0.31315720081329346, |
| "learning_rate": 0.0005378019801980197, |
| "loss": 3.4899, |
| "step": 17900 |
| }, |
| { |
| "epoch": 5.224149976711691, |
| "grad_norm": 0.33438387513160706, |
| "learning_rate": 0.0005376272568433314, |
| "loss": 3.5003, |
| "step": 17950 |
| }, |
| { |
| "epoch": 5.238705170004658, |
| "grad_norm": 0.3011881709098816, |
| "learning_rate": 0.0005374525334886429, |
| "loss": 3.4991, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.238705170004658, |
| "eval_accuracy": 0.3604810439621463, |
| "eval_loss": 3.6344430446624756, |
| "eval_runtime": 179.6987, |
| "eval_samples_per_second": 92.661, |
| "eval_steps_per_second": 5.793, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.253260363297625, |
| "grad_norm": 0.30630162358283997, |
| "learning_rate": 0.0005372778101339545, |
| "loss": 3.501, |
| "step": 18050 |
| }, |
| { |
| "epoch": 5.267815556590591, |
| "grad_norm": 0.31994864344596863, |
| "learning_rate": 0.0005371030867792661, |
| "loss": 3.5159, |
| "step": 18100 |
| }, |
| { |
| "epoch": 5.282370749883558, |
| "grad_norm": 0.30697357654571533, |
| "learning_rate": 0.0005369283634245778, |
| "loss": 3.5036, |
| "step": 18150 |
| }, |
| { |
| "epoch": 5.296925943176525, |
| "grad_norm": 0.30527111887931824, |
| "learning_rate": 0.0005367536400698893, |
| "loss": 3.5074, |
| "step": 18200 |
| }, |
| { |
| "epoch": 5.311481136469492, |
| "grad_norm": 0.3237697184085846, |
| "learning_rate": 0.0005365789167152008, |
| "loss": 3.5075, |
| "step": 18250 |
| }, |
| { |
| "epoch": 5.326036329762459, |
| "grad_norm": 0.3105727434158325, |
| "learning_rate": 0.0005364041933605125, |
| "loss": 3.5095, |
| "step": 18300 |
| }, |
| { |
| "epoch": 5.340591523055426, |
| "grad_norm": 0.3035230040550232, |
| "learning_rate": 0.000536229470005824, |
| "loss": 3.5185, |
| "step": 18350 |
| }, |
| { |
| "epoch": 5.3551467163483935, |
| "grad_norm": 0.3358999788761139, |
| "learning_rate": 0.0005360547466511357, |
| "loss": 3.5145, |
| "step": 18400 |
| }, |
| { |
| "epoch": 5.36970190964136, |
| "grad_norm": 0.3387795388698578, |
| "learning_rate": 0.0005358800232964472, |
| "loss": 3.5195, |
| "step": 18450 |
| }, |
| { |
| "epoch": 5.384257102934327, |
| "grad_norm": 0.30367156863212585, |
| "learning_rate": 0.0005357052999417589, |
| "loss": 3.5161, |
| "step": 18500 |
| }, |
| { |
| "epoch": 5.398812296227294, |
| "grad_norm": 0.32554861903190613, |
| "learning_rate": 0.0005355305765870704, |
| "loss": 3.5156, |
| "step": 18550 |
| }, |
| { |
| "epoch": 5.413367489520261, |
| "grad_norm": 0.32227516174316406, |
| "learning_rate": 0.000535355853232382, |
| "loss": 3.521, |
| "step": 18600 |
| }, |
| { |
| "epoch": 5.427922682813228, |
| "grad_norm": 0.30939218401908875, |
| "learning_rate": 0.0005351811298776936, |
| "loss": 3.5177, |
| "step": 18650 |
| }, |
| { |
| "epoch": 5.442477876106195, |
| "grad_norm": 0.3210090696811676, |
| "learning_rate": 0.0005350064065230052, |
| "loss": 3.5075, |
| "step": 18700 |
| }, |
| { |
| "epoch": 5.457033069399162, |
| "grad_norm": 0.30049794912338257, |
| "learning_rate": 0.0005348316831683168, |
| "loss": 3.5269, |
| "step": 18750 |
| }, |
| { |
| "epoch": 5.471588262692128, |
| "grad_norm": 0.31089526414871216, |
| "learning_rate": 0.0005346569598136284, |
| "loss": 3.5171, |
| "step": 18800 |
| }, |
| { |
| "epoch": 5.486143455985095, |
| "grad_norm": 0.3039928674697876, |
| "learning_rate": 0.00053448223645894, |
| "loss": 3.5192, |
| "step": 18850 |
| }, |
| { |
| "epoch": 5.500698649278062, |
| "grad_norm": 0.3058149814605713, |
| "learning_rate": 0.0005343075131042515, |
| "loss": 3.514, |
| "step": 18900 |
| }, |
| { |
| "epoch": 5.515253842571029, |
| "grad_norm": 0.30949902534484863, |
| "learning_rate": 0.0005341327897495632, |
| "loss": 3.513, |
| "step": 18950 |
| }, |
| { |
| "epoch": 5.529809035863996, |
| "grad_norm": 0.2995462417602539, |
| "learning_rate": 0.0005339580663948748, |
| "loss": 3.5221, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.529809035863996, |
| "eval_accuracy": 0.3616167103143491, |
| "eval_loss": 3.6234872341156006, |
| "eval_runtime": 179.8697, |
| "eval_samples_per_second": 92.573, |
| "eval_steps_per_second": 5.788, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.5443642291569635, |
| "grad_norm": 0.3011551797389984, |
| "learning_rate": 0.0005337833430401863, |
| "loss": 3.5138, |
| "step": 19050 |
| }, |
| { |
| "epoch": 5.5589194224499305, |
| "grad_norm": 0.33986860513687134, |
| "learning_rate": 0.0005336086196854979, |
| "loss": 3.5246, |
| "step": 19100 |
| }, |
| { |
| "epoch": 5.573474615742897, |
| "grad_norm": 0.32027342915534973, |
| "learning_rate": 0.0005334338963308095, |
| "loss": 3.5271, |
| "step": 19150 |
| }, |
| { |
| "epoch": 5.588029809035864, |
| "grad_norm": 0.31938230991363525, |
| "learning_rate": 0.0005332591729761211, |
| "loss": 3.5304, |
| "step": 19200 |
| }, |
| { |
| "epoch": 5.602585002328831, |
| "grad_norm": 0.3016068935394287, |
| "learning_rate": 0.0005330844496214327, |
| "loss": 3.5239, |
| "step": 19250 |
| }, |
| { |
| "epoch": 5.617140195621798, |
| "grad_norm": 0.33804088830947876, |
| "learning_rate": 0.0005329097262667443, |
| "loss": 3.5175, |
| "step": 19300 |
| }, |
| { |
| "epoch": 5.631695388914765, |
| "grad_norm": 0.31818464398384094, |
| "learning_rate": 0.0005327350029120559, |
| "loss": 3.5263, |
| "step": 19350 |
| }, |
| { |
| "epoch": 5.646250582207732, |
| "grad_norm": 0.2964155673980713, |
| "learning_rate": 0.0005325602795573674, |
| "loss": 3.5271, |
| "step": 19400 |
| }, |
| { |
| "epoch": 5.660805775500698, |
| "grad_norm": 0.31857946515083313, |
| "learning_rate": 0.000532385556202679, |
| "loss": 3.5196, |
| "step": 19450 |
| }, |
| { |
| "epoch": 5.675360968793665, |
| "grad_norm": 0.3290652334690094, |
| "learning_rate": 0.0005322108328479906, |
| "loss": 3.5299, |
| "step": 19500 |
| }, |
| { |
| "epoch": 5.689916162086632, |
| "grad_norm": 0.337568461894989, |
| "learning_rate": 0.0005320361094933023, |
| "loss": 3.5341, |
| "step": 19550 |
| }, |
| { |
| "epoch": 5.704471355379599, |
| "grad_norm": 0.3184109032154083, |
| "learning_rate": 0.0005318613861386138, |
| "loss": 3.5251, |
| "step": 19600 |
| }, |
| { |
| "epoch": 5.719026548672566, |
| "grad_norm": 0.30031880736351013, |
| "learning_rate": 0.0005316866627839254, |
| "loss": 3.5197, |
| "step": 19650 |
| }, |
| { |
| "epoch": 5.7335817419655335, |
| "grad_norm": 0.3102143406867981, |
| "learning_rate": 0.000531511939429237, |
| "loss": 3.5203, |
| "step": 19700 |
| }, |
| { |
| "epoch": 5.748136935258501, |
| "grad_norm": 0.31440630555152893, |
| "learning_rate": 0.0005313372160745486, |
| "loss": 3.5217, |
| "step": 19750 |
| }, |
| { |
| "epoch": 5.762692128551468, |
| "grad_norm": 0.3147021532058716, |
| "learning_rate": 0.0005311624927198602, |
| "loss": 3.505, |
| "step": 19800 |
| }, |
| { |
| "epoch": 5.777247321844434, |
| "grad_norm": 0.312235027551651, |
| "learning_rate": 0.0005309877693651717, |
| "loss": 3.5302, |
| "step": 19850 |
| }, |
| { |
| "epoch": 5.791802515137401, |
| "grad_norm": 0.30304163694381714, |
| "learning_rate": 0.0005308130460104834, |
| "loss": 3.5261, |
| "step": 19900 |
| }, |
| { |
| "epoch": 5.806357708430368, |
| "grad_norm": 0.32415395975112915, |
| "learning_rate": 0.0005306383226557949, |
| "loss": 3.5251, |
| "step": 19950 |
| }, |
| { |
| "epoch": 5.820912901723335, |
| "grad_norm": 0.29884156584739685, |
| "learning_rate": 0.0005304635993011065, |
| "loss": 3.5329, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.820912901723335, |
| "eval_accuracy": 0.3623818130725857, |
| "eval_loss": 3.611879587173462, |
| "eval_runtime": 179.674, |
| "eval_samples_per_second": 92.673, |
| "eval_steps_per_second": 5.794, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.835468095016302, |
| "grad_norm": 0.30554407835006714, |
| "learning_rate": 0.0005302888759464181, |
| "loss": 3.5344, |
| "step": 20050 |
| }, |
| { |
| "epoch": 5.850023288309269, |
| "grad_norm": 0.3254547715187073, |
| "learning_rate": 0.0005301141525917298, |
| "loss": 3.5282, |
| "step": 20100 |
| }, |
| { |
| "epoch": 5.864578481602235, |
| "grad_norm": 0.3139106035232544, |
| "learning_rate": 0.0005299394292370413, |
| "loss": 3.5229, |
| "step": 20150 |
| }, |
| { |
| "epoch": 5.879133674895202, |
| "grad_norm": 0.32929766178131104, |
| "learning_rate": 0.0005297647058823528, |
| "loss": 3.5214, |
| "step": 20200 |
| }, |
| { |
| "epoch": 5.893688868188169, |
| "grad_norm": 0.31918251514434814, |
| "learning_rate": 0.0005295899825276645, |
| "loss": 3.5211, |
| "step": 20250 |
| }, |
| { |
| "epoch": 5.9082440614811365, |
| "grad_norm": 0.3251992166042328, |
| "learning_rate": 0.000529415259172976, |
| "loss": 3.5237, |
| "step": 20300 |
| }, |
| { |
| "epoch": 5.9227992547741035, |
| "grad_norm": 0.30213046073913574, |
| "learning_rate": 0.0005292405358182877, |
| "loss": 3.5333, |
| "step": 20350 |
| }, |
| { |
| "epoch": 5.937354448067071, |
| "grad_norm": 0.3040863275527954, |
| "learning_rate": 0.0005290658124635992, |
| "loss": 3.5296, |
| "step": 20400 |
| }, |
| { |
| "epoch": 5.951909641360038, |
| "grad_norm": 0.3090716600418091, |
| "learning_rate": 0.0005288910891089109, |
| "loss": 3.5339, |
| "step": 20450 |
| }, |
| { |
| "epoch": 5.966464834653004, |
| "grad_norm": 0.31416988372802734, |
| "learning_rate": 0.0005287163657542224, |
| "loss": 3.5275, |
| "step": 20500 |
| }, |
| { |
| "epoch": 5.981020027945971, |
| "grad_norm": 0.32365185022354126, |
| "learning_rate": 0.000528541642399534, |
| "loss": 3.5326, |
| "step": 20550 |
| }, |
| { |
| "epoch": 5.995575221238938, |
| "grad_norm": 0.31189417839050293, |
| "learning_rate": 0.0005283669190448456, |
| "loss": 3.5301, |
| "step": 20600 |
| }, |
| { |
| "epoch": 6.009897531439218, |
| "grad_norm": 0.30607596039772034, |
| "learning_rate": 0.0005281921956901572, |
| "loss": 3.4514, |
| "step": 20650 |
| }, |
| { |
| "epoch": 6.024452724732185, |
| "grad_norm": 0.2989142835140228, |
| "learning_rate": 0.0005280174723354688, |
| "loss": 3.4082, |
| "step": 20700 |
| }, |
| { |
| "epoch": 6.039007918025152, |
| "grad_norm": 0.30611732602119446, |
| "learning_rate": 0.0005278427489807804, |
| "loss": 3.4312, |
| "step": 20750 |
| }, |
| { |
| "epoch": 6.053563111318118, |
| "grad_norm": 0.33311450481414795, |
| "learning_rate": 0.000527668025626092, |
| "loss": 3.4145, |
| "step": 20800 |
| }, |
| { |
| "epoch": 6.068118304611085, |
| "grad_norm": 0.3017202913761139, |
| "learning_rate": 0.0005274933022714035, |
| "loss": 3.4378, |
| "step": 20850 |
| }, |
| { |
| "epoch": 6.082673497904052, |
| "grad_norm": 0.331897109746933, |
| "learning_rate": 0.0005273185789167152, |
| "loss": 3.4208, |
| "step": 20900 |
| }, |
| { |
| "epoch": 6.097228691197019, |
| "grad_norm": 0.32341882586479187, |
| "learning_rate": 0.0005271438555620268, |
| "loss": 3.4489, |
| "step": 20950 |
| }, |
| { |
| "epoch": 6.111783884489986, |
| "grad_norm": 0.3440253734588623, |
| "learning_rate": 0.0005269691322073384, |
| "loss": 3.4393, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.111783884489986, |
| "eval_accuracy": 0.3628803639021463, |
| "eval_loss": 3.613924503326416, |
| "eval_runtime": 179.8223, |
| "eval_samples_per_second": 92.597, |
| "eval_steps_per_second": 5.789, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.126339077782953, |
| "grad_norm": 0.3371540904045105, |
| "learning_rate": 0.0005267944088526499, |
| "loss": 3.446, |
| "step": 21050 |
| }, |
| { |
| "epoch": 6.14089427107592, |
| "grad_norm": 0.328134685754776, |
| "learning_rate": 0.0005266196854979615, |
| "loss": 3.4384, |
| "step": 21100 |
| }, |
| { |
| "epoch": 6.155449464368886, |
| "grad_norm": 0.33417844772338867, |
| "learning_rate": 0.0005264449621432731, |
| "loss": 3.4554, |
| "step": 21150 |
| }, |
| { |
| "epoch": 6.1700046576618535, |
| "grad_norm": 0.32764503359794617, |
| "learning_rate": 0.0005262702387885847, |
| "loss": 3.4483, |
| "step": 21200 |
| }, |
| { |
| "epoch": 6.1845598509548205, |
| "grad_norm": 0.3138699233531952, |
| "learning_rate": 0.0005260955154338963, |
| "loss": 3.4495, |
| "step": 21250 |
| }, |
| { |
| "epoch": 6.199115044247788, |
| "grad_norm": 0.33888301253318787, |
| "learning_rate": 0.0005259207920792079, |
| "loss": 3.4616, |
| "step": 21300 |
| }, |
| { |
| "epoch": 6.213670237540755, |
| "grad_norm": 0.3592548668384552, |
| "learning_rate": 0.0005257460687245195, |
| "loss": 3.4615, |
| "step": 21350 |
| }, |
| { |
| "epoch": 6.228225430833722, |
| "grad_norm": 0.30623868107795715, |
| "learning_rate": 0.000525571345369831, |
| "loss": 3.4654, |
| "step": 21400 |
| }, |
| { |
| "epoch": 6.242780624126689, |
| "grad_norm": 0.3069242835044861, |
| "learning_rate": 0.0005253966220151426, |
| "loss": 3.4524, |
| "step": 21450 |
| }, |
| { |
| "epoch": 6.257335817419655, |
| "grad_norm": 0.3082742989063263, |
| "learning_rate": 0.0005252218986604543, |
| "loss": 3.4639, |
| "step": 21500 |
| }, |
| { |
| "epoch": 6.271891010712622, |
| "grad_norm": 0.31266793608665466, |
| "learning_rate": 0.0005250471753057658, |
| "loss": 3.4535, |
| "step": 21550 |
| }, |
| { |
| "epoch": 6.286446204005589, |
| "grad_norm": 0.3180970847606659, |
| "learning_rate": 0.0005248724519510774, |
| "loss": 3.4587, |
| "step": 21600 |
| }, |
| { |
| "epoch": 6.301001397298556, |
| "grad_norm": 0.31825992465019226, |
| "learning_rate": 0.000524697728596389, |
| "loss": 3.4618, |
| "step": 21650 |
| }, |
| { |
| "epoch": 6.315556590591523, |
| "grad_norm": 0.31838515400886536, |
| "learning_rate": 0.0005245230052417006, |
| "loss": 3.4509, |
| "step": 21700 |
| }, |
| { |
| "epoch": 6.33011178388449, |
| "grad_norm": 0.3172382116317749, |
| "learning_rate": 0.0005243482818870122, |
| "loss": 3.4688, |
| "step": 21750 |
| }, |
| { |
| "epoch": 6.344666977177457, |
| "grad_norm": 0.34158939123153687, |
| "learning_rate": 0.0005241735585323238, |
| "loss": 3.4639, |
| "step": 21800 |
| }, |
| { |
| "epoch": 6.3592221704704235, |
| "grad_norm": 0.31548604369163513, |
| "learning_rate": 0.0005239988351776354, |
| "loss": 3.4599, |
| "step": 21850 |
| }, |
| { |
| "epoch": 6.3737773637633905, |
| "grad_norm": 0.3228817880153656, |
| "learning_rate": 0.0005238241118229469, |
| "loss": 3.4779, |
| "step": 21900 |
| }, |
| { |
| "epoch": 6.388332557056358, |
| "grad_norm": 0.32255199551582336, |
| "learning_rate": 0.0005236493884682585, |
| "loss": 3.4593, |
| "step": 21950 |
| }, |
| { |
| "epoch": 6.402887750349325, |
| "grad_norm": 0.3081405460834503, |
| "learning_rate": 0.0005234746651135701, |
| "loss": 3.4739, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.402887750349325, |
| "eval_accuracy": 0.3633811477505097, |
| "eval_loss": 3.608250856399536, |
| "eval_runtime": 179.8112, |
| "eval_samples_per_second": 92.603, |
| "eval_steps_per_second": 5.789, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.417442943642292, |
| "grad_norm": 0.3040979504585266, |
| "learning_rate": 0.0005232999417588818, |
| "loss": 3.4686, |
| "step": 22050 |
| }, |
| { |
| "epoch": 6.431998136935259, |
| "grad_norm": 0.32507723569869995, |
| "learning_rate": 0.0005231252184041933, |
| "loss": 3.4753, |
| "step": 22100 |
| }, |
| { |
| "epoch": 6.446553330228225, |
| "grad_norm": 0.35055863857269287, |
| "learning_rate": 0.0005229504950495049, |
| "loss": 3.459, |
| "step": 22150 |
| }, |
| { |
| "epoch": 6.461108523521192, |
| "grad_norm": 0.34218981862068176, |
| "learning_rate": 0.0005227757716948165, |
| "loss": 3.4631, |
| "step": 22200 |
| }, |
| { |
| "epoch": 6.475663716814159, |
| "grad_norm": 0.3296588063240051, |
| "learning_rate": 0.000522601048340128, |
| "loss": 3.4817, |
| "step": 22250 |
| }, |
| { |
| "epoch": 6.490218910107126, |
| "grad_norm": 0.33755651116371155, |
| "learning_rate": 0.0005224263249854397, |
| "loss": 3.4749, |
| "step": 22300 |
| }, |
| { |
| "epoch": 6.504774103400093, |
| "grad_norm": 0.36849409341812134, |
| "learning_rate": 0.0005222516016307512, |
| "loss": 3.4844, |
| "step": 22350 |
| }, |
| { |
| "epoch": 6.51932929669306, |
| "grad_norm": 0.3198098838329315, |
| "learning_rate": 0.0005220768782760629, |
| "loss": 3.4798, |
| "step": 22400 |
| }, |
| { |
| "epoch": 6.533884489986027, |
| "grad_norm": 0.36485329270362854, |
| "learning_rate": 0.0005219021549213744, |
| "loss": 3.4739, |
| "step": 22450 |
| }, |
| { |
| "epoch": 6.548439683278994, |
| "grad_norm": 0.36852753162384033, |
| "learning_rate": 0.000521727431566686, |
| "loss": 3.4807, |
| "step": 22500 |
| }, |
| { |
| "epoch": 6.562994876571961, |
| "grad_norm": 0.31377214193344116, |
| "learning_rate": 0.0005215527082119976, |
| "loss": 3.4769, |
| "step": 22550 |
| }, |
| { |
| "epoch": 6.577550069864928, |
| "grad_norm": 0.3116380274295807, |
| "learning_rate": 0.0005213779848573093, |
| "loss": 3.4823, |
| "step": 22600 |
| }, |
| { |
| "epoch": 6.592105263157895, |
| "grad_norm": 0.31712695956230164, |
| "learning_rate": 0.0005212032615026208, |
| "loss": 3.4833, |
| "step": 22650 |
| }, |
| { |
| "epoch": 6.606660456450862, |
| "grad_norm": 0.31625989079475403, |
| "learning_rate": 0.0005210285381479323, |
| "loss": 3.4778, |
| "step": 22700 |
| }, |
| { |
| "epoch": 6.621215649743829, |
| "grad_norm": 0.32391512393951416, |
| "learning_rate": 0.000520853814793244, |
| "loss": 3.4826, |
| "step": 22750 |
| }, |
| { |
| "epoch": 6.635770843036796, |
| "grad_norm": 0.3269652724266052, |
| "learning_rate": 0.0005206790914385555, |
| "loss": 3.4782, |
| "step": 22800 |
| }, |
| { |
| "epoch": 6.650326036329762, |
| "grad_norm": 0.32664456963539124, |
| "learning_rate": 0.0005205043680838672, |
| "loss": 3.4819, |
| "step": 22850 |
| }, |
| { |
| "epoch": 6.664881229622729, |
| "grad_norm": 0.32027164101600647, |
| "learning_rate": 0.0005203296447291787, |
| "loss": 3.4821, |
| "step": 22900 |
| }, |
| { |
| "epoch": 6.679436422915696, |
| "grad_norm": 0.3294582664966583, |
| "learning_rate": 0.0005201549213744904, |
| "loss": 3.4927, |
| "step": 22950 |
| }, |
| { |
| "epoch": 6.693991616208663, |
| "grad_norm": 0.32960760593414307, |
| "learning_rate": 0.0005199801980198019, |
| "loss": 3.4852, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.693991616208663, |
| "eval_accuracy": 0.3639370519051117, |
| "eval_loss": 3.6000916957855225, |
| "eval_runtime": 179.769, |
| "eval_samples_per_second": 92.624, |
| "eval_steps_per_second": 5.791, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.70854680950163, |
| "grad_norm": 0.31536969542503357, |
| "learning_rate": 0.0005198054746651136, |
| "loss": 3.4909, |
| "step": 23050 |
| }, |
| { |
| "epoch": 6.723102002794597, |
| "grad_norm": 0.3355516195297241, |
| "learning_rate": 0.0005196307513104251, |
| "loss": 3.482, |
| "step": 23100 |
| }, |
| { |
| "epoch": 6.737657196087564, |
| "grad_norm": 0.31154128909111023, |
| "learning_rate": 0.0005194560279557367, |
| "loss": 3.4761, |
| "step": 23150 |
| }, |
| { |
| "epoch": 6.752212389380531, |
| "grad_norm": 0.33221757411956787, |
| "learning_rate": 0.0005192813046010483, |
| "loss": 3.4905, |
| "step": 23200 |
| }, |
| { |
| "epoch": 6.766767582673498, |
| "grad_norm": 0.32357120513916016, |
| "learning_rate": 0.0005191065812463599, |
| "loss": 3.483, |
| "step": 23250 |
| }, |
| { |
| "epoch": 6.781322775966465, |
| "grad_norm": 0.30440056324005127, |
| "learning_rate": 0.0005189318578916715, |
| "loss": 3.4882, |
| "step": 23300 |
| }, |
| { |
| "epoch": 6.795877969259432, |
| "grad_norm": 0.33167630434036255, |
| "learning_rate": 0.000518757134536983, |
| "loss": 3.4971, |
| "step": 23350 |
| }, |
| { |
| "epoch": 6.810433162552399, |
| "grad_norm": 0.33919447660446167, |
| "learning_rate": 0.0005185824111822947, |
| "loss": 3.4903, |
| "step": 23400 |
| }, |
| { |
| "epoch": 6.824988355845366, |
| "grad_norm": 0.3298689126968384, |
| "learning_rate": 0.0005184076878276063, |
| "loss": 3.4918, |
| "step": 23450 |
| }, |
| { |
| "epoch": 6.839543549138332, |
| "grad_norm": 0.33077070116996765, |
| "learning_rate": 0.0005182329644729179, |
| "loss": 3.4843, |
| "step": 23500 |
| }, |
| { |
| "epoch": 6.854098742431299, |
| "grad_norm": 0.32967621088027954, |
| "learning_rate": 0.0005180582411182294, |
| "loss": 3.4893, |
| "step": 23550 |
| }, |
| { |
| "epoch": 6.868653935724266, |
| "grad_norm": 0.3005642294883728, |
| "learning_rate": 0.000517883517763541, |
| "loss": 3.496, |
| "step": 23600 |
| }, |
| { |
| "epoch": 6.883209129017233, |
| "grad_norm": 0.33780768513679504, |
| "learning_rate": 0.0005177087944088526, |
| "loss": 3.4948, |
| "step": 23650 |
| }, |
| { |
| "epoch": 6.8977643223102, |
| "grad_norm": 0.30605146288871765, |
| "learning_rate": 0.0005175340710541642, |
| "loss": 3.4951, |
| "step": 23700 |
| }, |
| { |
| "epoch": 6.912319515603167, |
| "grad_norm": 0.30990949273109436, |
| "learning_rate": 0.0005173593476994758, |
| "loss": 3.4877, |
| "step": 23750 |
| }, |
| { |
| "epoch": 6.926874708896134, |
| "grad_norm": 0.30463626980781555, |
| "learning_rate": 0.0005171846243447874, |
| "loss": 3.5054, |
| "step": 23800 |
| }, |
| { |
| "epoch": 6.9414299021891015, |
| "grad_norm": 0.3081115484237671, |
| "learning_rate": 0.000517009900990099, |
| "loss": 3.4877, |
| "step": 23850 |
| }, |
| { |
| "epoch": 6.955985095482068, |
| "grad_norm": 0.3156517446041107, |
| "learning_rate": 0.0005168351776354105, |
| "loss": 3.4985, |
| "step": 23900 |
| }, |
| { |
| "epoch": 6.970540288775035, |
| "grad_norm": 0.3113146722316742, |
| "learning_rate": 0.0005166604542807221, |
| "loss": 3.5097, |
| "step": 23950 |
| }, |
| { |
| "epoch": 6.985095482068002, |
| "grad_norm": 0.30410292744636536, |
| "learning_rate": 0.0005164857309260338, |
| "loss": 3.4901, |
| "step": 24000 |
| }, |
| { |
| "epoch": 6.985095482068002, |
| "eval_accuracy": 0.3646075451824911, |
| "eval_loss": 3.5917577743530273, |
| "eval_runtime": 179.7023, |
| "eval_samples_per_second": 92.659, |
| "eval_steps_per_second": 5.793, |
| "step": 24000 |
| }, |
| { |
| "epoch": 6.999650675360969, |
| "grad_norm": 0.3223191797733307, |
| "learning_rate": 0.0005163110075713453, |
| "loss": 3.4891, |
| "step": 24050 |
| }, |
| { |
| "epoch": 7.0139729855612485, |
| "grad_norm": 0.32697129249572754, |
| "learning_rate": 0.0005161362842166569, |
| "loss": 3.3921, |
| "step": 24100 |
| }, |
| { |
| "epoch": 7.0285281788542155, |
| "grad_norm": 0.31622612476348877, |
| "learning_rate": 0.0005159615608619685, |
| "loss": 3.3832, |
| "step": 24150 |
| }, |
| { |
| "epoch": 7.043083372147182, |
| "grad_norm": 0.31813427805900574, |
| "learning_rate": 0.0005157868375072801, |
| "loss": 3.382, |
| "step": 24200 |
| }, |
| { |
| "epoch": 7.057638565440149, |
| "grad_norm": 0.3346388638019562, |
| "learning_rate": 0.0005156121141525917, |
| "loss": 3.3943, |
| "step": 24250 |
| }, |
| { |
| "epoch": 7.072193758733116, |
| "grad_norm": 0.32561907172203064, |
| "learning_rate": 0.0005154373907979033, |
| "loss": 3.3955, |
| "step": 24300 |
| }, |
| { |
| "epoch": 7.086748952026083, |
| "grad_norm": 0.34515753388404846, |
| "learning_rate": 0.0005152626674432149, |
| "loss": 3.4014, |
| "step": 24350 |
| }, |
| { |
| "epoch": 7.10130414531905, |
| "grad_norm": 0.3135473430156708, |
| "learning_rate": 0.0005150879440885264, |
| "loss": 3.3965, |
| "step": 24400 |
| }, |
| { |
| "epoch": 7.115859338612017, |
| "grad_norm": 0.31428104639053345, |
| "learning_rate": 0.000514913220733838, |
| "loss": 3.3987, |
| "step": 24450 |
| }, |
| { |
| "epoch": 7.130414531904984, |
| "grad_norm": 0.3100883960723877, |
| "learning_rate": 0.0005147384973791496, |
| "loss": 3.4165, |
| "step": 24500 |
| }, |
| { |
| "epoch": 7.14496972519795, |
| "grad_norm": 0.32985416054725647, |
| "learning_rate": 0.0005145637740244613, |
| "loss": 3.4222, |
| "step": 24550 |
| }, |
| { |
| "epoch": 7.159524918490917, |
| "grad_norm": 0.3290567100048065, |
| "learning_rate": 0.0005143890506697728, |
| "loss": 3.4001, |
| "step": 24600 |
| }, |
| { |
| "epoch": 7.174080111783884, |
| "grad_norm": 0.33694422245025635, |
| "learning_rate": 0.0005142143273150844, |
| "loss": 3.4296, |
| "step": 24650 |
| }, |
| { |
| "epoch": 7.1886353050768514, |
| "grad_norm": 0.3293610215187073, |
| "learning_rate": 0.000514039603960396, |
| "loss": 3.405, |
| "step": 24700 |
| }, |
| { |
| "epoch": 7.2031904983698185, |
| "grad_norm": 0.3535860776901245, |
| "learning_rate": 0.0005138648806057075, |
| "loss": 3.4236, |
| "step": 24750 |
| }, |
| { |
| "epoch": 7.217745691662786, |
| "grad_norm": 0.329603374004364, |
| "learning_rate": 0.0005136901572510192, |
| "loss": 3.4145, |
| "step": 24800 |
| }, |
| { |
| "epoch": 7.232300884955753, |
| "grad_norm": 0.3592855632305145, |
| "learning_rate": 0.0005135154338963307, |
| "loss": 3.4108, |
| "step": 24850 |
| }, |
| { |
| "epoch": 7.246856078248719, |
| "grad_norm": 0.31975114345550537, |
| "learning_rate": 0.0005133407105416424, |
| "loss": 3.4276, |
| "step": 24900 |
| }, |
| { |
| "epoch": 7.261411271541686, |
| "grad_norm": 0.30115067958831787, |
| "learning_rate": 0.0005131659871869539, |
| "loss": 3.4279, |
| "step": 24950 |
| }, |
| { |
| "epoch": 7.275966464834653, |
| "grad_norm": 0.3238639831542969, |
| "learning_rate": 0.0005129912638322656, |
| "loss": 3.4147, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.275966464834653, |
| "eval_accuracy": 0.36494050003872525, |
| "eval_loss": 3.5967977046966553, |
| "eval_runtime": 179.8383, |
| "eval_samples_per_second": 92.589, |
| "eval_steps_per_second": 5.789, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.29052165812762, |
| "grad_norm": 0.3399578332901001, |
| "learning_rate": 0.0005128165404775771, |
| "loss": 3.4347, |
| "step": 25050 |
| }, |
| { |
| "epoch": 7.305076851420587, |
| "grad_norm": 0.3150589168071747, |
| "learning_rate": 0.0005126418171228888, |
| "loss": 3.4344, |
| "step": 25100 |
| }, |
| { |
| "epoch": 7.319632044713554, |
| "grad_norm": 0.3032225966453552, |
| "learning_rate": 0.0005124670937682003, |
| "loss": 3.4145, |
| "step": 25150 |
| }, |
| { |
| "epoch": 7.334187238006521, |
| "grad_norm": 0.3271799683570862, |
| "learning_rate": 0.000512292370413512, |
| "loss": 3.4279, |
| "step": 25200 |
| }, |
| { |
| "epoch": 7.348742431299487, |
| "grad_norm": 0.339102566242218, |
| "learning_rate": 0.0005121176470588235, |
| "loss": 3.4321, |
| "step": 25250 |
| }, |
| { |
| "epoch": 7.363297624592454, |
| "grad_norm": 0.3249066174030304, |
| "learning_rate": 0.000511942923704135, |
| "loss": 3.4173, |
| "step": 25300 |
| }, |
| { |
| "epoch": 7.3778528178854215, |
| "grad_norm": 0.3434494137763977, |
| "learning_rate": 0.0005117682003494467, |
| "loss": 3.44, |
| "step": 25350 |
| }, |
| { |
| "epoch": 7.3924080111783885, |
| "grad_norm": 0.3373579680919647, |
| "learning_rate": 0.0005115934769947583, |
| "loss": 3.445, |
| "step": 25400 |
| }, |
| { |
| "epoch": 7.406963204471356, |
| "grad_norm": 0.3416643440723419, |
| "learning_rate": 0.0005114187536400699, |
| "loss": 3.4308, |
| "step": 25450 |
| }, |
| { |
| "epoch": 7.421518397764323, |
| "grad_norm": 0.3268771171569824, |
| "learning_rate": 0.0005112440302853814, |
| "loss": 3.4366, |
| "step": 25500 |
| }, |
| { |
| "epoch": 7.436073591057289, |
| "grad_norm": 0.3285571038722992, |
| "learning_rate": 0.0005110693069306931, |
| "loss": 3.4416, |
| "step": 25550 |
| }, |
| { |
| "epoch": 7.450628784350256, |
| "grad_norm": 0.33393388986587524, |
| "learning_rate": 0.0005108945835760046, |
| "loss": 3.4464, |
| "step": 25600 |
| }, |
| { |
| "epoch": 7.465183977643223, |
| "grad_norm": 0.33005261421203613, |
| "learning_rate": 0.0005107198602213162, |
| "loss": 3.4413, |
| "step": 25650 |
| }, |
| { |
| "epoch": 7.47973917093619, |
| "grad_norm": 0.31219834089279175, |
| "learning_rate": 0.0005105451368666278, |
| "loss": 3.4401, |
| "step": 25700 |
| }, |
| { |
| "epoch": 7.494294364229157, |
| "grad_norm": 0.315737247467041, |
| "learning_rate": 0.0005103704135119394, |
| "loss": 3.4436, |
| "step": 25750 |
| }, |
| { |
| "epoch": 7.508849557522124, |
| "grad_norm": 0.349681556224823, |
| "learning_rate": 0.000510195690157251, |
| "loss": 3.4555, |
| "step": 25800 |
| }, |
| { |
| "epoch": 7.523404750815091, |
| "grad_norm": 0.31493327021598816, |
| "learning_rate": 0.0005100209668025625, |
| "loss": 3.4331, |
| "step": 25850 |
| }, |
| { |
| "epoch": 7.537959944108058, |
| "grad_norm": 0.3189566433429718, |
| "learning_rate": 0.0005098462434478742, |
| "loss": 3.4371, |
| "step": 25900 |
| }, |
| { |
| "epoch": 7.552515137401024, |
| "grad_norm": 0.3341345191001892, |
| "learning_rate": 0.0005096715200931858, |
| "loss": 3.456, |
| "step": 25950 |
| }, |
| { |
| "epoch": 7.5670703306939915, |
| "grad_norm": 0.30145561695098877, |
| "learning_rate": 0.0005094967967384974, |
| "loss": 3.4439, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.5670703306939915, |
| "eval_accuracy": 0.36580832166189253, |
| "eval_loss": 3.5877833366394043, |
| "eval_runtime": 179.756, |
| "eval_samples_per_second": 92.631, |
| "eval_steps_per_second": 5.791, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.5816255239869585, |
| "grad_norm": 0.33717626333236694, |
| "learning_rate": 0.0005093220733838089, |
| "loss": 3.4481, |
| "step": 26050 |
| }, |
| { |
| "epoch": 7.596180717279926, |
| "grad_norm": 0.3323234021663666, |
| "learning_rate": 0.0005091473500291205, |
| "loss": 3.4396, |
| "step": 26100 |
| }, |
| { |
| "epoch": 7.610735910572893, |
| "grad_norm": 0.33281031250953674, |
| "learning_rate": 0.0005089726266744321, |
| "loss": 3.4597, |
| "step": 26150 |
| }, |
| { |
| "epoch": 7.625291103865859, |
| "grad_norm": 0.30492302775382996, |
| "learning_rate": 0.0005087979033197437, |
| "loss": 3.4504, |
| "step": 26200 |
| }, |
| { |
| "epoch": 7.639846297158826, |
| "grad_norm": 0.3305889666080475, |
| "learning_rate": 0.0005086231799650553, |
| "loss": 3.4563, |
| "step": 26250 |
| }, |
| { |
| "epoch": 7.654401490451793, |
| "grad_norm": 0.3255027234554291, |
| "learning_rate": 0.0005084484566103669, |
| "loss": 3.4421, |
| "step": 26300 |
| }, |
| { |
| "epoch": 7.66895668374476, |
| "grad_norm": 0.3166560232639313, |
| "learning_rate": 0.0005082737332556785, |
| "loss": 3.4486, |
| "step": 26350 |
| }, |
| { |
| "epoch": 7.683511877037727, |
| "grad_norm": 0.3044646382331848, |
| "learning_rate": 0.00050809900990099, |
| "loss": 3.4568, |
| "step": 26400 |
| }, |
| { |
| "epoch": 7.698067070330694, |
| "grad_norm": 0.33230146765708923, |
| "learning_rate": 0.0005079242865463016, |
| "loss": 3.4496, |
| "step": 26450 |
| }, |
| { |
| "epoch": 7.712622263623661, |
| "grad_norm": 0.3186247646808624, |
| "learning_rate": 0.0005077495631916133, |
| "loss": 3.4491, |
| "step": 26500 |
| }, |
| { |
| "epoch": 7.727177456916628, |
| "grad_norm": 0.31578993797302246, |
| "learning_rate": 0.0005075748398369248, |
| "loss": 3.4557, |
| "step": 26550 |
| }, |
| { |
| "epoch": 7.7417326502095944, |
| "grad_norm": 0.32597461342811584, |
| "learning_rate": 0.0005074001164822364, |
| "loss": 3.462, |
| "step": 26600 |
| }, |
| { |
| "epoch": 7.7562878435025615, |
| "grad_norm": 0.31171032786369324, |
| "learning_rate": 0.000507225393127548, |
| "loss": 3.4615, |
| "step": 26650 |
| }, |
| { |
| "epoch": 7.770843036795529, |
| "grad_norm": 0.3156268894672394, |
| "learning_rate": 0.0005070506697728596, |
| "loss": 3.4549, |
| "step": 26700 |
| }, |
| { |
| "epoch": 7.785398230088496, |
| "grad_norm": 0.31520920991897583, |
| "learning_rate": 0.0005068759464181711, |
| "loss": 3.4521, |
| "step": 26750 |
| }, |
| { |
| "epoch": 7.799953423381463, |
| "grad_norm": 0.3372666835784912, |
| "learning_rate": 0.0005067012230634828, |
| "loss": 3.4647, |
| "step": 26800 |
| }, |
| { |
| "epoch": 7.81450861667443, |
| "grad_norm": 0.30733582377433777, |
| "learning_rate": 0.0005065264997087944, |
| "loss": 3.4506, |
| "step": 26850 |
| }, |
| { |
| "epoch": 7.829063809967396, |
| "grad_norm": 0.35157862305641174, |
| "learning_rate": 0.0005063517763541059, |
| "loss": 3.4553, |
| "step": 26900 |
| }, |
| { |
| "epoch": 7.843619003260363, |
| "grad_norm": 0.3399302661418915, |
| "learning_rate": 0.0005061770529994175, |
| "loss": 3.4566, |
| "step": 26950 |
| }, |
| { |
| "epoch": 7.85817419655333, |
| "grad_norm": 0.33593276143074036, |
| "learning_rate": 0.0005060023296447291, |
| "loss": 3.4694, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.85817419655333, |
| "eval_accuracy": 0.3663316707528952, |
| "eval_loss": 3.5779364109039307, |
| "eval_runtime": 179.7006, |
| "eval_samples_per_second": 92.66, |
| "eval_steps_per_second": 5.793, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.872729389846297, |
| "grad_norm": 0.32834339141845703, |
| "learning_rate": 0.0005058276062900408, |
| "loss": 3.4545, |
| "step": 27050 |
| }, |
| { |
| "epoch": 7.887284583139264, |
| "grad_norm": 0.3266600966453552, |
| "learning_rate": 0.0005056528829353523, |
| "loss": 3.4561, |
| "step": 27100 |
| }, |
| { |
| "epoch": 7.901839776432231, |
| "grad_norm": 0.34114813804626465, |
| "learning_rate": 0.000505478159580664, |
| "loss": 3.4538, |
| "step": 27150 |
| }, |
| { |
| "epoch": 7.916394969725198, |
| "grad_norm": 0.31827306747436523, |
| "learning_rate": 0.0005053034362259755, |
| "loss": 3.4641, |
| "step": 27200 |
| }, |
| { |
| "epoch": 7.930950163018165, |
| "grad_norm": 0.36237582564353943, |
| "learning_rate": 0.000505128712871287, |
| "loss": 3.453, |
| "step": 27250 |
| }, |
| { |
| "epoch": 7.9455053563111315, |
| "grad_norm": 0.3262782096862793, |
| "learning_rate": 0.0005049539895165987, |
| "loss": 3.4534, |
| "step": 27300 |
| }, |
| { |
| "epoch": 7.960060549604099, |
| "grad_norm": 0.31556040048599243, |
| "learning_rate": 0.0005047792661619103, |
| "loss": 3.4556, |
| "step": 27350 |
| }, |
| { |
| "epoch": 7.974615742897066, |
| "grad_norm": 0.3208247423171997, |
| "learning_rate": 0.0005046045428072219, |
| "loss": 3.4579, |
| "step": 27400 |
| }, |
| { |
| "epoch": 7.989170936190033, |
| "grad_norm": 0.3132985234260559, |
| "learning_rate": 0.0005044298194525334, |
| "loss": 3.4575, |
| "step": 27450 |
| }, |
| { |
| "epoch": 8.003493246390311, |
| "grad_norm": 0.39999905228614807, |
| "learning_rate": 0.0005042550960978451, |
| "loss": 3.4277, |
| "step": 27500 |
| }, |
| { |
| "epoch": 8.018048439683279, |
| "grad_norm": 0.3299867808818817, |
| "learning_rate": 0.0005040803727431566, |
| "loss": 3.3549, |
| "step": 27550 |
| }, |
| { |
| "epoch": 8.032603632976246, |
| "grad_norm": 0.33842816948890686, |
| "learning_rate": 0.0005039056493884683, |
| "loss": 3.3549, |
| "step": 27600 |
| }, |
| { |
| "epoch": 8.047158826269213, |
| "grad_norm": 0.33577972650527954, |
| "learning_rate": 0.0005037309260337798, |
| "loss": 3.3558, |
| "step": 27650 |
| }, |
| { |
| "epoch": 8.06171401956218, |
| "grad_norm": 0.329557329416275, |
| "learning_rate": 0.0005035562026790914, |
| "loss": 3.3599, |
| "step": 27700 |
| }, |
| { |
| "epoch": 8.076269212855147, |
| "grad_norm": 0.33456891775131226, |
| "learning_rate": 0.000503381479324403, |
| "loss": 3.3565, |
| "step": 27750 |
| }, |
| { |
| "epoch": 8.090824406148114, |
| "grad_norm": 0.34147799015045166, |
| "learning_rate": 0.0005032067559697145, |
| "loss": 3.3836, |
| "step": 27800 |
| }, |
| { |
| "epoch": 8.10537959944108, |
| "grad_norm": 0.3332184851169586, |
| "learning_rate": 0.0005030320326150262, |
| "loss": 3.3786, |
| "step": 27850 |
| }, |
| { |
| "epoch": 8.119934792734048, |
| "grad_norm": 0.3092395067214966, |
| "learning_rate": 0.0005028573092603378, |
| "loss": 3.3869, |
| "step": 27900 |
| }, |
| { |
| "epoch": 8.134489986027015, |
| "grad_norm": 0.3140532374382019, |
| "learning_rate": 0.0005026825859056494, |
| "loss": 3.3799, |
| "step": 27950 |
| }, |
| { |
| "epoch": 8.149045179319982, |
| "grad_norm": 0.3436073064804077, |
| "learning_rate": 0.0005025078625509609, |
| "loss": 3.3723, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.149045179319982, |
| "eval_accuracy": 0.36610496058075415, |
| "eval_loss": 3.591127395629883, |
| "eval_runtime": 179.6817, |
| "eval_samples_per_second": 92.669, |
| "eval_steps_per_second": 5.794, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.16360037261295, |
| "grad_norm": 0.3282712697982788, |
| "learning_rate": 0.0005023331391962726, |
| "loss": 3.382, |
| "step": 28050 |
| }, |
| { |
| "epoch": 8.178155565905914, |
| "grad_norm": 0.3222522735595703, |
| "learning_rate": 0.0005021584158415841, |
| "loss": 3.3743, |
| "step": 28100 |
| }, |
| { |
| "epoch": 8.192710759198881, |
| "grad_norm": 0.3165663182735443, |
| "learning_rate": 0.0005019836924868956, |
| "loss": 3.3873, |
| "step": 28150 |
| }, |
| { |
| "epoch": 8.207265952491849, |
| "grad_norm": 0.3453287184238434, |
| "learning_rate": 0.0005018089691322073, |
| "loss": 3.3833, |
| "step": 28200 |
| }, |
| { |
| "epoch": 8.221821145784816, |
| "grad_norm": 0.34769949316978455, |
| "learning_rate": 0.0005016342457775189, |
| "loss": 3.3829, |
| "step": 28250 |
| }, |
| { |
| "epoch": 8.236376339077783, |
| "grad_norm": 0.33653950691223145, |
| "learning_rate": 0.0005014595224228305, |
| "loss": 3.381, |
| "step": 28300 |
| }, |
| { |
| "epoch": 8.25093153237075, |
| "grad_norm": 0.3488893210887909, |
| "learning_rate": 0.000501284799068142, |
| "loss": 3.397, |
| "step": 28350 |
| }, |
| { |
| "epoch": 8.265486725663717, |
| "grad_norm": 0.32981276512145996, |
| "learning_rate": 0.0005011100757134537, |
| "loss": 3.393, |
| "step": 28400 |
| }, |
| { |
| "epoch": 8.280041918956684, |
| "grad_norm": 0.3737078607082367, |
| "learning_rate": 0.0005009353523587653, |
| "loss": 3.401, |
| "step": 28450 |
| }, |
| { |
| "epoch": 8.294597112249651, |
| "grad_norm": 0.36423400044441223, |
| "learning_rate": 0.0005007606290040768, |
| "loss": 3.401, |
| "step": 28500 |
| }, |
| { |
| "epoch": 8.309152305542618, |
| "grad_norm": 0.31796398758888245, |
| "learning_rate": 0.0005005859056493884, |
| "loss": 3.4029, |
| "step": 28550 |
| }, |
| { |
| "epoch": 8.323707498835585, |
| "grad_norm": 0.31824707984924316, |
| "learning_rate": 0.0005004111822947, |
| "loss": 3.3987, |
| "step": 28600 |
| }, |
| { |
| "epoch": 8.338262692128552, |
| "grad_norm": 0.3500267565250397, |
| "learning_rate": 0.0005002364589400116, |
| "loss": 3.4124, |
| "step": 28650 |
| }, |
| { |
| "epoch": 8.35281788542152, |
| "grad_norm": 0.3271293342113495, |
| "learning_rate": 0.0005000617355853231, |
| "loss": 3.4052, |
| "step": 28700 |
| }, |
| { |
| "epoch": 8.367373078714486, |
| "grad_norm": 0.346746027469635, |
| "learning_rate": 0.0004998870122306348, |
| "loss": 3.4036, |
| "step": 28750 |
| }, |
| { |
| "epoch": 8.381928272007451, |
| "grad_norm": 0.32007408142089844, |
| "learning_rate": 0.0004997122888759464, |
| "loss": 3.3994, |
| "step": 28800 |
| }, |
| { |
| "epoch": 8.396483465300419, |
| "grad_norm": 0.3441210687160492, |
| "learning_rate": 0.000499537565521258, |
| "loss": 3.4007, |
| "step": 28850 |
| }, |
| { |
| "epoch": 8.411038658593386, |
| "grad_norm": 0.33618438243865967, |
| "learning_rate": 0.0004993628421665695, |
| "loss": 3.4075, |
| "step": 28900 |
| }, |
| { |
| "epoch": 8.425593851886353, |
| "grad_norm": 0.31930792331695557, |
| "learning_rate": 0.0004991881188118811, |
| "loss": 3.4101, |
| "step": 28950 |
| }, |
| { |
| "epoch": 8.44014904517932, |
| "grad_norm": 0.32625195384025574, |
| "learning_rate": 0.0004990133954571928, |
| "loss": 3.405, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.44014904517932, |
| "eval_accuracy": 0.36678896949825596, |
| "eval_loss": 3.5803894996643066, |
| "eval_runtime": 179.7312, |
| "eval_samples_per_second": 92.644, |
| "eval_steps_per_second": 5.792, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.454704238472287, |
| "grad_norm": 0.3442314863204956, |
| "learning_rate": 0.0004988386721025043, |
| "loss": 3.4119, |
| "step": 29050 |
| }, |
| { |
| "epoch": 8.469259431765254, |
| "grad_norm": 0.3489319682121277, |
| "learning_rate": 0.0004986639487478159, |
| "loss": 3.4213, |
| "step": 29100 |
| }, |
| { |
| "epoch": 8.483814625058221, |
| "grad_norm": 0.32982802391052246, |
| "learning_rate": 0.0004984892253931275, |
| "loss": 3.418, |
| "step": 29150 |
| }, |
| { |
| "epoch": 8.498369818351188, |
| "grad_norm": 0.32021379470825195, |
| "learning_rate": 0.0004983145020384391, |
| "loss": 3.4164, |
| "step": 29200 |
| }, |
| { |
| "epoch": 8.512925011644155, |
| "grad_norm": 0.3246486783027649, |
| "learning_rate": 0.0004981397786837507, |
| "loss": 3.3948, |
| "step": 29250 |
| }, |
| { |
| "epoch": 8.527480204937122, |
| "grad_norm": 0.33790332078933716, |
| "learning_rate": 0.0004979650553290622, |
| "loss": 3.4134, |
| "step": 29300 |
| }, |
| { |
| "epoch": 8.54203539823009, |
| "grad_norm": 0.3419322073459625, |
| "learning_rate": 0.0004977903319743739, |
| "loss": 3.414, |
| "step": 29350 |
| }, |
| { |
| "epoch": 8.556590591523056, |
| "grad_norm": 0.333080530166626, |
| "learning_rate": 0.0004976156086196854, |
| "loss": 3.4146, |
| "step": 29400 |
| }, |
| { |
| "epoch": 8.571145784816022, |
| "grad_norm": 0.34474682807922363, |
| "learning_rate": 0.0004974408852649971, |
| "loss": 3.4239, |
| "step": 29450 |
| }, |
| { |
| "epoch": 8.585700978108989, |
| "grad_norm": 0.33647671341896057, |
| "learning_rate": 0.0004972661619103086, |
| "loss": 3.4339, |
| "step": 29500 |
| }, |
| { |
| "epoch": 8.600256171401956, |
| "grad_norm": 0.3294098973274231, |
| "learning_rate": 0.0004970914385556202, |
| "loss": 3.4124, |
| "step": 29550 |
| }, |
| { |
| "epoch": 8.614811364694923, |
| "grad_norm": 0.3614839017391205, |
| "learning_rate": 0.0004969167152009318, |
| "loss": 3.4181, |
| "step": 29600 |
| }, |
| { |
| "epoch": 8.62936655798789, |
| "grad_norm": 0.33004382252693176, |
| "learning_rate": 0.0004967419918462435, |
| "loss": 3.4296, |
| "step": 29650 |
| }, |
| { |
| "epoch": 8.643921751280857, |
| "grad_norm": 0.32359495759010315, |
| "learning_rate": 0.000496567268491555, |
| "loss": 3.4182, |
| "step": 29700 |
| }, |
| { |
| "epoch": 8.658476944573824, |
| "grad_norm": 0.3327469825744629, |
| "learning_rate": 0.0004963925451368665, |
| "loss": 3.4334, |
| "step": 29750 |
| }, |
| { |
| "epoch": 8.673032137866791, |
| "grad_norm": 0.3434790074825287, |
| "learning_rate": 0.0004962178217821782, |
| "loss": 3.4206, |
| "step": 29800 |
| }, |
| { |
| "epoch": 8.687587331159758, |
| "grad_norm": 0.32667237520217896, |
| "learning_rate": 0.0004960430984274898, |
| "loss": 3.4193, |
| "step": 29850 |
| }, |
| { |
| "epoch": 8.702142524452725, |
| "grad_norm": 0.36787742376327515, |
| "learning_rate": 0.0004958683750728014, |
| "loss": 3.4198, |
| "step": 29900 |
| }, |
| { |
| "epoch": 8.716697717745692, |
| "grad_norm": 0.30659085512161255, |
| "learning_rate": 0.0004956936517181129, |
| "loss": 3.4216, |
| "step": 29950 |
| }, |
| { |
| "epoch": 8.73125291103866, |
| "grad_norm": 0.322503924369812, |
| "learning_rate": 0.0004955189283634246, |
| "loss": 3.4347, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.73125291103866, |
| "eval_accuracy": 0.3674749763799498, |
| "eval_loss": 3.5715341567993164, |
| "eval_runtime": 179.5919, |
| "eval_samples_per_second": 92.716, |
| "eval_steps_per_second": 5.796, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.745808104331626, |
| "grad_norm": 0.33062100410461426, |
| "learning_rate": 0.0004953442050087361, |
| "loss": 3.4324, |
| "step": 30050 |
| }, |
| { |
| "epoch": 8.760363297624593, |
| "grad_norm": 0.32108139991760254, |
| "learning_rate": 0.0004951694816540476, |
| "loss": 3.4105, |
| "step": 30100 |
| }, |
| { |
| "epoch": 8.774918490917559, |
| "grad_norm": 0.3835145831108093, |
| "learning_rate": 0.0004949947582993593, |
| "loss": 3.424, |
| "step": 30150 |
| }, |
| { |
| "epoch": 8.789473684210526, |
| "grad_norm": 0.3131749629974365, |
| "learning_rate": 0.0004948200349446709, |
| "loss": 3.4238, |
| "step": 30200 |
| }, |
| { |
| "epoch": 8.804028877503493, |
| "grad_norm": 0.3232799470424652, |
| "learning_rate": 0.0004946453115899825, |
| "loss": 3.4309, |
| "step": 30250 |
| }, |
| { |
| "epoch": 8.81858407079646, |
| "grad_norm": 0.31317904591560364, |
| "learning_rate": 0.000494470588235294, |
| "loss": 3.4317, |
| "step": 30300 |
| }, |
| { |
| "epoch": 8.833139264089427, |
| "grad_norm": 0.35502809286117554, |
| "learning_rate": 0.0004942958648806057, |
| "loss": 3.4321, |
| "step": 30350 |
| }, |
| { |
| "epoch": 8.847694457382394, |
| "grad_norm": 0.3296915292739868, |
| "learning_rate": 0.0004941211415259173, |
| "loss": 3.4246, |
| "step": 30400 |
| }, |
| { |
| "epoch": 8.862249650675361, |
| "grad_norm": 0.332200288772583, |
| "learning_rate": 0.0004939464181712289, |
| "loss": 3.4364, |
| "step": 30450 |
| }, |
| { |
| "epoch": 8.876804843968328, |
| "grad_norm": 0.31839796900749207, |
| "learning_rate": 0.0004937716948165404, |
| "loss": 3.4393, |
| "step": 30500 |
| }, |
| { |
| "epoch": 8.891360037261295, |
| "grad_norm": 0.3107447028160095, |
| "learning_rate": 0.000493596971461852, |
| "loss": 3.4403, |
| "step": 30550 |
| }, |
| { |
| "epoch": 8.905915230554262, |
| "grad_norm": 0.32366111874580383, |
| "learning_rate": 0.0004934222481071636, |
| "loss": 3.4284, |
| "step": 30600 |
| }, |
| { |
| "epoch": 8.92047042384723, |
| "grad_norm": 0.3176148235797882, |
| "learning_rate": 0.0004932475247524751, |
| "loss": 3.4152, |
| "step": 30650 |
| }, |
| { |
| "epoch": 8.935025617140196, |
| "grad_norm": 0.30004048347473145, |
| "learning_rate": 0.0004930728013977868, |
| "loss": 3.4174, |
| "step": 30700 |
| }, |
| { |
| "epoch": 8.949580810433163, |
| "grad_norm": 0.3007476031780243, |
| "learning_rate": 0.0004928980780430984, |
| "loss": 3.4378, |
| "step": 30750 |
| }, |
| { |
| "epoch": 8.964136003726129, |
| "grad_norm": 0.32137224078178406, |
| "learning_rate": 0.00049272335468841, |
| "loss": 3.4267, |
| "step": 30800 |
| }, |
| { |
| "epoch": 8.978691197019096, |
| "grad_norm": 0.31897372007369995, |
| "learning_rate": 0.0004925486313337215, |
| "loss": 3.4309, |
| "step": 30850 |
| }, |
| { |
| "epoch": 8.993246390312063, |
| "grad_norm": 0.3516756296157837, |
| "learning_rate": 0.0004923739079790332, |
| "loss": 3.426, |
| "step": 30900 |
| }, |
| { |
| "epoch": 9.007568700512342, |
| "grad_norm": 0.341561883687973, |
| "learning_rate": 0.0004921991846243447, |
| "loss": 3.3706, |
| "step": 30950 |
| }, |
| { |
| "epoch": 9.02212389380531, |
| "grad_norm": 0.3258677124977112, |
| "learning_rate": 0.0004920244612696563, |
| "loss": 3.3197, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.02212389380531, |
| "eval_accuracy": 0.3673204279733321, |
| "eval_loss": 3.5789639949798584, |
| "eval_runtime": 179.612, |
| "eval_samples_per_second": 92.705, |
| "eval_steps_per_second": 5.796, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.036679087098276, |
| "grad_norm": 0.35316187143325806, |
| "learning_rate": 0.0004918497379149679, |
| "loss": 3.3323, |
| "step": 31050 |
| }, |
| { |
| "epoch": 9.051234280391244, |
| "grad_norm": 0.35009464621543884, |
| "learning_rate": 0.0004916750145602795, |
| "loss": 3.3375, |
| "step": 31100 |
| }, |
| { |
| "epoch": 9.06578947368421, |
| "grad_norm": 0.3244270086288452, |
| "learning_rate": 0.0004915002912055911, |
| "loss": 3.3319, |
| "step": 31150 |
| }, |
| { |
| "epoch": 9.080344666977178, |
| "grad_norm": 0.33380645513534546, |
| "learning_rate": 0.0004913255678509026, |
| "loss": 3.3444, |
| "step": 31200 |
| }, |
| { |
| "epoch": 9.094899860270145, |
| "grad_norm": 0.3167516887187958, |
| "learning_rate": 0.0004911508444962143, |
| "loss": 3.3275, |
| "step": 31250 |
| }, |
| { |
| "epoch": 9.109455053563112, |
| "grad_norm": 0.3265177309513092, |
| "learning_rate": 0.0004909761211415259, |
| "loss": 3.3431, |
| "step": 31300 |
| }, |
| { |
| "epoch": 9.124010246856079, |
| "grad_norm": 0.350564181804657, |
| "learning_rate": 0.0004908013977868375, |
| "loss": 3.3494, |
| "step": 31350 |
| }, |
| { |
| "epoch": 9.138565440149046, |
| "grad_norm": 0.3303062915802002, |
| "learning_rate": 0.0004906266744321491, |
| "loss": 3.3463, |
| "step": 31400 |
| }, |
| { |
| "epoch": 9.153120633442011, |
| "grad_norm": 0.3373015820980072, |
| "learning_rate": 0.0004904519510774606, |
| "loss": 3.3478, |
| "step": 31450 |
| }, |
| { |
| "epoch": 9.167675826734978, |
| "grad_norm": 0.33251577615737915, |
| "learning_rate": 0.0004902772277227722, |
| "loss": 3.3607, |
| "step": 31500 |
| }, |
| { |
| "epoch": 9.182231020027945, |
| "grad_norm": 0.3346186578273773, |
| "learning_rate": 0.0004901025043680838, |
| "loss": 3.3683, |
| "step": 31550 |
| }, |
| { |
| "epoch": 9.196786213320912, |
| "grad_norm": 0.32972466945648193, |
| "learning_rate": 0.0004899277810133955, |
| "loss": 3.3544, |
| "step": 31600 |
| }, |
| { |
| "epoch": 9.21134140661388, |
| "grad_norm": 0.3555352985858917, |
| "learning_rate": 0.000489753057658707, |
| "loss": 3.3542, |
| "step": 31650 |
| }, |
| { |
| "epoch": 9.225896599906847, |
| "grad_norm": 0.34347039461135864, |
| "learning_rate": 0.0004895783343040186, |
| "loss": 3.3548, |
| "step": 31700 |
| }, |
| { |
| "epoch": 9.240451793199814, |
| "grad_norm": 0.3755747377872467, |
| "learning_rate": 0.0004894036109493302, |
| "loss": 3.3721, |
| "step": 31750 |
| }, |
| { |
| "epoch": 9.25500698649278, |
| "grad_norm": 0.34164270758628845, |
| "learning_rate": 0.0004892288875946419, |
| "loss": 3.3667, |
| "step": 31800 |
| }, |
| { |
| "epoch": 9.269562179785748, |
| "grad_norm": 0.33080416917800903, |
| "learning_rate": 0.0004890541642399534, |
| "loss": 3.3655, |
| "step": 31850 |
| }, |
| { |
| "epoch": 9.284117373078715, |
| "grad_norm": 0.32492369413375854, |
| "learning_rate": 0.0004888794408852649, |
| "loss": 3.3689, |
| "step": 31900 |
| }, |
| { |
| "epoch": 9.298672566371682, |
| "grad_norm": 0.3133104145526886, |
| "learning_rate": 0.0004887047175305766, |
| "loss": 3.37, |
| "step": 31950 |
| }, |
| { |
| "epoch": 9.313227759664649, |
| "grad_norm": 0.3155703842639923, |
| "learning_rate": 0.0004885299941758881, |
| "loss": 3.3719, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.313227759664649, |
| "eval_accuracy": 0.3677861886846826, |
| "eval_loss": 3.5757789611816406, |
| "eval_runtime": 179.647, |
| "eval_samples_per_second": 92.687, |
| "eval_steps_per_second": 5.795, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.327782952957616, |
| "grad_norm": 0.32102909684181213, |
| "learning_rate": 0.0004883552708211997, |
| "loss": 3.364, |
| "step": 32050 |
| }, |
| { |
| "epoch": 9.342338146250583, |
| "grad_norm": 0.31682634353637695, |
| "learning_rate": 0.00048818054746651137, |
| "loss": 3.3759, |
| "step": 32100 |
| }, |
| { |
| "epoch": 9.356893339543548, |
| "grad_norm": 0.33725234866142273, |
| "learning_rate": 0.0004880058241118229, |
| "loss": 3.376, |
| "step": 32150 |
| }, |
| { |
| "epoch": 9.371448532836515, |
| "grad_norm": 0.33588212728500366, |
| "learning_rate": 0.0004878311007571345, |
| "loss": 3.3753, |
| "step": 32200 |
| }, |
| { |
| "epoch": 9.386003726129482, |
| "grad_norm": 0.341235488653183, |
| "learning_rate": 0.0004876563774024461, |
| "loss": 3.3824, |
| "step": 32250 |
| }, |
| { |
| "epoch": 9.40055891942245, |
| "grad_norm": 0.39756667613983154, |
| "learning_rate": 0.00048748165404775763, |
| "loss": 3.3874, |
| "step": 32300 |
| }, |
| { |
| "epoch": 9.415114112715417, |
| "grad_norm": 0.3262716233730316, |
| "learning_rate": 0.0004873069306930693, |
| "loss": 3.3839, |
| "step": 32350 |
| }, |
| { |
| "epoch": 9.429669306008384, |
| "grad_norm": 0.3105382025241852, |
| "learning_rate": 0.0004871322073383809, |
| "loss": 3.3724, |
| "step": 32400 |
| }, |
| { |
| "epoch": 9.44422449930135, |
| "grad_norm": 0.3270449936389923, |
| "learning_rate": 0.00048695748398369247, |
| "loss": 3.379, |
| "step": 32450 |
| }, |
| { |
| "epoch": 9.458779692594318, |
| "grad_norm": 0.34322798252105713, |
| "learning_rate": 0.000486782760629004, |
| "loss": 3.3976, |
| "step": 32500 |
| }, |
| { |
| "epoch": 9.473334885887285, |
| "grad_norm": 0.32397979497909546, |
| "learning_rate": 0.0004866080372743156, |
| "loss": 3.3871, |
| "step": 32550 |
| }, |
| { |
| "epoch": 9.487890079180252, |
| "grad_norm": 0.33422529697418213, |
| "learning_rate": 0.0004864333139196272, |
| "loss": 3.3824, |
| "step": 32600 |
| }, |
| { |
| "epoch": 9.502445272473219, |
| "grad_norm": 0.3257046341896057, |
| "learning_rate": 0.00048625859056493885, |
| "loss": 3.3782, |
| "step": 32650 |
| }, |
| { |
| "epoch": 9.517000465766186, |
| "grad_norm": 0.337839275598526, |
| "learning_rate": 0.0004860838672102504, |
| "loss": 3.3972, |
| "step": 32700 |
| }, |
| { |
| "epoch": 9.531555659059153, |
| "grad_norm": 0.3106997013092041, |
| "learning_rate": 0.000485909143855562, |
| "loss": 3.4099, |
| "step": 32750 |
| }, |
| { |
| "epoch": 9.546110852352118, |
| "grad_norm": 0.3634362518787384, |
| "learning_rate": 0.0004857344205008736, |
| "loss": 3.3984, |
| "step": 32800 |
| }, |
| { |
| "epoch": 9.560666045645085, |
| "grad_norm": 0.36889997124671936, |
| "learning_rate": 0.00048555969714618517, |
| "loss": 3.3959, |
| "step": 32850 |
| }, |
| { |
| "epoch": 9.575221238938052, |
| "grad_norm": 0.3163613975048065, |
| "learning_rate": 0.0004853849737914967, |
| "loss": 3.3775, |
| "step": 32900 |
| }, |
| { |
| "epoch": 9.58977643223102, |
| "grad_norm": 0.33509278297424316, |
| "learning_rate": 0.00048521025043680836, |
| "loss": 3.3918, |
| "step": 32950 |
| }, |
| { |
| "epoch": 9.604331625523987, |
| "grad_norm": 0.3135107755661011, |
| "learning_rate": 0.00048503552708211995, |
| "loss": 3.394, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.604331625523987, |
| "eval_accuracy": 0.3684542138886483, |
| "eval_loss": 3.568233013153076, |
| "eval_runtime": 179.6101, |
| "eval_samples_per_second": 92.706, |
| "eval_steps_per_second": 5.796, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.618886818816954, |
| "grad_norm": 0.3235209584236145, |
| "learning_rate": 0.00048486080372743155, |
| "loss": 3.3925, |
| "step": 33050 |
| }, |
| { |
| "epoch": 9.63344201210992, |
| "grad_norm": 0.34784653782844543, |
| "learning_rate": 0.0004846860803727431, |
| "loss": 3.4051, |
| "step": 33100 |
| }, |
| { |
| "epoch": 9.647997205402888, |
| "grad_norm": 0.3406641483306885, |
| "learning_rate": 0.0004845113570180547, |
| "loss": 3.3897, |
| "step": 33150 |
| }, |
| { |
| "epoch": 9.662552398695855, |
| "grad_norm": 0.3359287977218628, |
| "learning_rate": 0.00048433663366336633, |
| "loss": 3.394, |
| "step": 33200 |
| }, |
| { |
| "epoch": 9.677107591988822, |
| "grad_norm": 0.3356068432331085, |
| "learning_rate": 0.0004841619103086779, |
| "loss": 3.408, |
| "step": 33250 |
| }, |
| { |
| "epoch": 9.691662785281789, |
| "grad_norm": 0.36484211683273315, |
| "learning_rate": 0.00048398718695398947, |
| "loss": 3.4018, |
| "step": 33300 |
| }, |
| { |
| "epoch": 9.706217978574756, |
| "grad_norm": 0.3308541774749756, |
| "learning_rate": 0.00048381246359930106, |
| "loss": 3.4026, |
| "step": 33350 |
| }, |
| { |
| "epoch": 9.720773171867723, |
| "grad_norm": 0.33220258355140686, |
| "learning_rate": 0.00048363774024461265, |
| "loss": 3.3921, |
| "step": 33400 |
| }, |
| { |
| "epoch": 9.73532836516069, |
| "grad_norm": 0.34359830617904663, |
| "learning_rate": 0.0004834630168899242, |
| "loss": 3.401, |
| "step": 33450 |
| }, |
| { |
| "epoch": 9.749883558453657, |
| "grad_norm": 0.3357202708721161, |
| "learning_rate": 0.00048328829353523584, |
| "loss": 3.3974, |
| "step": 33500 |
| }, |
| { |
| "epoch": 9.764438751746622, |
| "grad_norm": 0.3662799894809723, |
| "learning_rate": 0.00048311357018054744, |
| "loss": 3.4082, |
| "step": 33550 |
| }, |
| { |
| "epoch": 9.77899394503959, |
| "grad_norm": 0.33433783054351807, |
| "learning_rate": 0.00048293884682585903, |
| "loss": 3.3953, |
| "step": 33600 |
| }, |
| { |
| "epoch": 9.793549138332557, |
| "grad_norm": 0.3398445248603821, |
| "learning_rate": 0.00048276412347117057, |
| "loss": 3.4051, |
| "step": 33650 |
| }, |
| { |
| "epoch": 9.808104331625524, |
| "grad_norm": 0.32314440608024597, |
| "learning_rate": 0.00048258940011648217, |
| "loss": 3.4146, |
| "step": 33700 |
| }, |
| { |
| "epoch": 9.82265952491849, |
| "grad_norm": 0.31336554884910583, |
| "learning_rate": 0.0004824146767617938, |
| "loss": 3.4036, |
| "step": 33750 |
| }, |
| { |
| "epoch": 9.837214718211458, |
| "grad_norm": 0.366742342710495, |
| "learning_rate": 0.0004822399534071054, |
| "loss": 3.399, |
| "step": 33800 |
| }, |
| { |
| "epoch": 9.851769911504425, |
| "grad_norm": 0.33634525537490845, |
| "learning_rate": 0.00048206523005241695, |
| "loss": 3.4209, |
| "step": 33850 |
| }, |
| { |
| "epoch": 9.866325104797392, |
| "grad_norm": 0.3509387671947479, |
| "learning_rate": 0.00048189050669772854, |
| "loss": 3.4114, |
| "step": 33900 |
| }, |
| { |
| "epoch": 9.880880298090359, |
| "grad_norm": 0.33496803045272827, |
| "learning_rate": 0.00048171578334304014, |
| "loss": 3.4134, |
| "step": 33950 |
| }, |
| { |
| "epoch": 9.895435491383326, |
| "grad_norm": 0.3415274918079376, |
| "learning_rate": 0.00048154105998835173, |
| "loss": 3.4089, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.895435491383326, |
| "eval_accuracy": 0.36880409267686187, |
| "eval_loss": 3.559906482696533, |
| "eval_runtime": 179.6841, |
| "eval_samples_per_second": 92.668, |
| "eval_steps_per_second": 5.794, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.909990684676293, |
| "grad_norm": 0.3290272653102875, |
| "learning_rate": 0.0004813663366336633, |
| "loss": 3.4046, |
| "step": 34050 |
| }, |
| { |
| "epoch": 9.92454587796926, |
| "grad_norm": 0.3289201259613037, |
| "learning_rate": 0.0004811916132789749, |
| "loss": 3.4071, |
| "step": 34100 |
| }, |
| { |
| "epoch": 9.939101071262227, |
| "grad_norm": 0.35677555203437805, |
| "learning_rate": 0.0004810168899242865, |
| "loss": 3.3999, |
| "step": 34150 |
| }, |
| { |
| "epoch": 9.953656264555192, |
| "grad_norm": 0.3309026062488556, |
| "learning_rate": 0.0004808421665695981, |
| "loss": 3.4169, |
| "step": 34200 |
| }, |
| { |
| "epoch": 9.96821145784816, |
| "grad_norm": 0.32284677028656006, |
| "learning_rate": 0.00048066744321490965, |
| "loss": 3.4157, |
| "step": 34250 |
| }, |
| { |
| "epoch": 9.982766651141127, |
| "grad_norm": 0.32936713099479675, |
| "learning_rate": 0.00048049271986022124, |
| "loss": 3.395, |
| "step": 34300 |
| }, |
| { |
| "epoch": 9.997321844434094, |
| "grad_norm": 0.33557114005088806, |
| "learning_rate": 0.0004803179965055329, |
| "loss": 3.4141, |
| "step": 34350 |
| }, |
| { |
| "epoch": 10.011644154634373, |
| "grad_norm": 0.32910844683647156, |
| "learning_rate": 0.0004801432731508445, |
| "loss": 3.3246, |
| "step": 34400 |
| }, |
| { |
| "epoch": 10.02619934792734, |
| "grad_norm": 0.3272988498210907, |
| "learning_rate": 0.000479968549796156, |
| "loss": 3.3019, |
| "step": 34450 |
| }, |
| { |
| "epoch": 10.040754541220307, |
| "grad_norm": 0.34475770592689514, |
| "learning_rate": 0.0004797938264414676, |
| "loss": 3.3026, |
| "step": 34500 |
| }, |
| { |
| "epoch": 10.055309734513274, |
| "grad_norm": 0.3495998680591583, |
| "learning_rate": 0.0004796191030867792, |
| "loss": 3.3068, |
| "step": 34550 |
| }, |
| { |
| "epoch": 10.069864927806242, |
| "grad_norm": 0.3315109312534332, |
| "learning_rate": 0.00047944437973209086, |
| "loss": 3.3101, |
| "step": 34600 |
| }, |
| { |
| "epoch": 10.084420121099209, |
| "grad_norm": 0.3565497100353241, |
| "learning_rate": 0.0004792696563774024, |
| "loss": 3.3121, |
| "step": 34650 |
| }, |
| { |
| "epoch": 10.098975314392176, |
| "grad_norm": 0.33841779828071594, |
| "learning_rate": 0.000479094933022714, |
| "loss": 3.3233, |
| "step": 34700 |
| }, |
| { |
| "epoch": 10.113530507685143, |
| "grad_norm": 0.31984496116638184, |
| "learning_rate": 0.0004789202096680256, |
| "loss": 3.3232, |
| "step": 34750 |
| }, |
| { |
| "epoch": 10.12808570097811, |
| "grad_norm": 0.33919304609298706, |
| "learning_rate": 0.00047874548631333713, |
| "loss": 3.3161, |
| "step": 34800 |
| }, |
| { |
| "epoch": 10.142640894271075, |
| "grad_norm": 0.3212825357913971, |
| "learning_rate": 0.0004785707629586487, |
| "loss": 3.3306, |
| "step": 34850 |
| }, |
| { |
| "epoch": 10.157196087564042, |
| "grad_norm": 0.34842801094055176, |
| "learning_rate": 0.0004783960396039604, |
| "loss": 3.3233, |
| "step": 34900 |
| }, |
| { |
| "epoch": 10.17175128085701, |
| "grad_norm": 0.33531925082206726, |
| "learning_rate": 0.00047822131624927197, |
| "loss": 3.3363, |
| "step": 34950 |
| }, |
| { |
| "epoch": 10.186306474149976, |
| "grad_norm": 0.3403702974319458, |
| "learning_rate": 0.0004780465928945835, |
| "loss": 3.3351, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.186306474149976, |
| "eval_accuracy": 0.3685656297741795, |
| "eval_loss": 3.571756362915039, |
| "eval_runtime": 179.8081, |
| "eval_samples_per_second": 92.604, |
| "eval_steps_per_second": 5.79, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.200861667442943, |
| "grad_norm": 0.34296759963035583, |
| "learning_rate": 0.0004778718695398951, |
| "loss": 3.34, |
| "step": 35050 |
| }, |
| { |
| "epoch": 10.21541686073591, |
| "grad_norm": 0.3556338846683502, |
| "learning_rate": 0.0004776971461852067, |
| "loss": 3.3419, |
| "step": 35100 |
| }, |
| { |
| "epoch": 10.229972054028877, |
| "grad_norm": 0.3632368743419647, |
| "learning_rate": 0.00047752242283051835, |
| "loss": 3.3468, |
| "step": 35150 |
| }, |
| { |
| "epoch": 10.244527247321844, |
| "grad_norm": 0.370301216840744, |
| "learning_rate": 0.00047734769947582994, |
| "loss": 3.3431, |
| "step": 35200 |
| }, |
| { |
| "epoch": 10.259082440614812, |
| "grad_norm": 0.3300345242023468, |
| "learning_rate": 0.0004771729761211415, |
| "loss": 3.3484, |
| "step": 35250 |
| }, |
| { |
| "epoch": 10.273637633907779, |
| "grad_norm": 0.35405808687210083, |
| "learning_rate": 0.0004769982527664531, |
| "loss": 3.3431, |
| "step": 35300 |
| }, |
| { |
| "epoch": 10.288192827200746, |
| "grad_norm": 0.33882415294647217, |
| "learning_rate": 0.00047682352941176467, |
| "loss": 3.358, |
| "step": 35350 |
| }, |
| { |
| "epoch": 10.302748020493713, |
| "grad_norm": 0.32428914308547974, |
| "learning_rate": 0.0004766488060570762, |
| "loss": 3.3416, |
| "step": 35400 |
| }, |
| { |
| "epoch": 10.31730321378668, |
| "grad_norm": 0.3725070655345917, |
| "learning_rate": 0.00047647408270238786, |
| "loss": 3.3579, |
| "step": 35450 |
| }, |
| { |
| "epoch": 10.331858407079647, |
| "grad_norm": 0.32831016182899475, |
| "learning_rate": 0.00047629935934769945, |
| "loss": 3.3651, |
| "step": 35500 |
| }, |
| { |
| "epoch": 10.346413600372612, |
| "grad_norm": 0.32686808705329895, |
| "learning_rate": 0.00047612463599301105, |
| "loss": 3.3474, |
| "step": 35550 |
| }, |
| { |
| "epoch": 10.36096879366558, |
| "grad_norm": 0.3326481580734253, |
| "learning_rate": 0.0004759499126383226, |
| "loss": 3.343, |
| "step": 35600 |
| }, |
| { |
| "epoch": 10.375523986958546, |
| "grad_norm": 0.34560704231262207, |
| "learning_rate": 0.0004757751892836342, |
| "loss": 3.3622, |
| "step": 35650 |
| }, |
| { |
| "epoch": 10.390079180251513, |
| "grad_norm": 0.35292690992355347, |
| "learning_rate": 0.0004756004659289458, |
| "loss": 3.3523, |
| "step": 35700 |
| }, |
| { |
| "epoch": 10.40463437354448, |
| "grad_norm": 0.33068376779556274, |
| "learning_rate": 0.0004754257425742574, |
| "loss": 3.3536, |
| "step": 35750 |
| }, |
| { |
| "epoch": 10.419189566837447, |
| "grad_norm": 0.3444773852825165, |
| "learning_rate": 0.00047525101921956896, |
| "loss": 3.3609, |
| "step": 35800 |
| }, |
| { |
| "epoch": 10.433744760130415, |
| "grad_norm": 0.35411354899406433, |
| "learning_rate": 0.00047507629586488056, |
| "loss": 3.3738, |
| "step": 35850 |
| }, |
| { |
| "epoch": 10.448299953423382, |
| "grad_norm": 0.3531155586242676, |
| "learning_rate": 0.00047490157251019215, |
| "loss": 3.3521, |
| "step": 35900 |
| }, |
| { |
| "epoch": 10.462855146716349, |
| "grad_norm": 0.34749075770378113, |
| "learning_rate": 0.0004747268491555037, |
| "loss": 3.3709, |
| "step": 35950 |
| }, |
| { |
| "epoch": 10.477410340009316, |
| "grad_norm": 0.3302483558654785, |
| "learning_rate": 0.00047455212580081534, |
| "loss": 3.3608, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.477410340009316, |
| "eval_accuracy": 0.3686402596131166, |
| "eval_loss": 3.5644166469573975, |
| "eval_runtime": 179.674, |
| "eval_samples_per_second": 92.673, |
| "eval_steps_per_second": 5.794, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.491965533302283, |
| "grad_norm": 0.3223860263824463, |
| "learning_rate": 0.00047437740244612694, |
| "loss": 3.3601, |
| "step": 36050 |
| }, |
| { |
| "epoch": 10.50652072659525, |
| "grad_norm": 0.3294314742088318, |
| "learning_rate": 0.00047420267909143853, |
| "loss": 3.3779, |
| "step": 36100 |
| }, |
| { |
| "epoch": 10.521075919888217, |
| "grad_norm": 0.3386666178703308, |
| "learning_rate": 0.0004740279557367501, |
| "loss": 3.3699, |
| "step": 36150 |
| }, |
| { |
| "epoch": 10.535631113181182, |
| "grad_norm": 0.3316313624382019, |
| "learning_rate": 0.00047385323238206166, |
| "loss": 3.3679, |
| "step": 36200 |
| }, |
| { |
| "epoch": 10.55018630647415, |
| "grad_norm": 0.3450184464454651, |
| "learning_rate": 0.00047367850902737326, |
| "loss": 3.3768, |
| "step": 36250 |
| }, |
| { |
| "epoch": 10.564741499767116, |
| "grad_norm": 0.3605281114578247, |
| "learning_rate": 0.0004735037856726849, |
| "loss": 3.362, |
| "step": 36300 |
| }, |
| { |
| "epoch": 10.579296693060083, |
| "grad_norm": 0.3162323534488678, |
| "learning_rate": 0.0004733290623179965, |
| "loss": 3.3737, |
| "step": 36350 |
| }, |
| { |
| "epoch": 10.59385188635305, |
| "grad_norm": 0.34284910559654236, |
| "learning_rate": 0.00047315433896330804, |
| "loss": 3.3803, |
| "step": 36400 |
| }, |
| { |
| "epoch": 10.608407079646017, |
| "grad_norm": 0.3211812376976013, |
| "learning_rate": 0.00047297961560861964, |
| "loss": 3.3822, |
| "step": 36450 |
| }, |
| { |
| "epoch": 10.622962272938985, |
| "grad_norm": 0.3258812725543976, |
| "learning_rate": 0.00047280489225393123, |
| "loss": 3.3763, |
| "step": 36500 |
| }, |
| { |
| "epoch": 10.637517466231952, |
| "grad_norm": 0.3411952257156372, |
| "learning_rate": 0.0004726301688992429, |
| "loss": 3.379, |
| "step": 36550 |
| }, |
| { |
| "epoch": 10.652072659524919, |
| "grad_norm": 0.3464188873767853, |
| "learning_rate": 0.0004724554455445544, |
| "loss": 3.3756, |
| "step": 36600 |
| }, |
| { |
| "epoch": 10.666627852817886, |
| "grad_norm": 0.3489798307418823, |
| "learning_rate": 0.000472280722189866, |
| "loss": 3.3666, |
| "step": 36650 |
| }, |
| { |
| "epoch": 10.681183046110853, |
| "grad_norm": 0.3330119550228119, |
| "learning_rate": 0.0004721059988351776, |
| "loss": 3.3725, |
| "step": 36700 |
| }, |
| { |
| "epoch": 10.69573823940382, |
| "grad_norm": 0.37871816754341125, |
| "learning_rate": 0.00047193127548048915, |
| "loss": 3.3732, |
| "step": 36750 |
| }, |
| { |
| "epoch": 10.710293432696787, |
| "grad_norm": 0.30662286281585693, |
| "learning_rate": 0.00047175655212580074, |
| "loss": 3.3792, |
| "step": 36800 |
| }, |
| { |
| "epoch": 10.724848625989754, |
| "grad_norm": 0.34182503819465637, |
| "learning_rate": 0.0004715818287711124, |
| "loss": 3.3996, |
| "step": 36850 |
| }, |
| { |
| "epoch": 10.73940381928272, |
| "grad_norm": 0.3572937250137329, |
| "learning_rate": 0.000471407105416424, |
| "loss": 3.3756, |
| "step": 36900 |
| }, |
| { |
| "epoch": 10.753959012575686, |
| "grad_norm": 0.33789294958114624, |
| "learning_rate": 0.0004712323820617355, |
| "loss": 3.3804, |
| "step": 36950 |
| }, |
| { |
| "epoch": 10.768514205868653, |
| "grad_norm": 0.34808850288391113, |
| "learning_rate": 0.0004710576587070471, |
| "loss": 3.3839, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.768514205868653, |
| "eval_accuracy": 0.36958905754971316, |
| "eval_loss": 3.5576395988464355, |
| "eval_runtime": 179.691, |
| "eval_samples_per_second": 92.665, |
| "eval_steps_per_second": 5.793, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.78306939916162, |
| "grad_norm": 0.3317504823207855, |
| "learning_rate": 0.0004708829353523587, |
| "loss": 3.3828, |
| "step": 37050 |
| }, |
| { |
| "epoch": 10.797624592454587, |
| "grad_norm": 0.3486345112323761, |
| "learning_rate": 0.0004707082119976703, |
| "loss": 3.378, |
| "step": 37100 |
| }, |
| { |
| "epoch": 10.812179785747555, |
| "grad_norm": 0.3379111886024475, |
| "learning_rate": 0.0004705334886429819, |
| "loss": 3.3915, |
| "step": 37150 |
| }, |
| { |
| "epoch": 10.826734979040522, |
| "grad_norm": 0.3323996365070343, |
| "learning_rate": 0.0004703587652882935, |
| "loss": 3.3739, |
| "step": 37200 |
| }, |
| { |
| "epoch": 10.841290172333489, |
| "grad_norm": 0.341294527053833, |
| "learning_rate": 0.0004701840419336051, |
| "loss": 3.3687, |
| "step": 37250 |
| }, |
| { |
| "epoch": 10.855845365626456, |
| "grad_norm": 0.3258977234363556, |
| "learning_rate": 0.0004700093185789167, |
| "loss": 3.3836, |
| "step": 37300 |
| }, |
| { |
| "epoch": 10.870400558919423, |
| "grad_norm": 0.327525794506073, |
| "learning_rate": 0.0004698345952242282, |
| "loss": 3.3697, |
| "step": 37350 |
| }, |
| { |
| "epoch": 10.88495575221239, |
| "grad_norm": 0.3258748948574066, |
| "learning_rate": 0.00046965987186953987, |
| "loss": 3.3864, |
| "step": 37400 |
| }, |
| { |
| "epoch": 10.899510945505357, |
| "grad_norm": 0.3566642701625824, |
| "learning_rate": 0.00046948514851485147, |
| "loss": 3.3892, |
| "step": 37450 |
| }, |
| { |
| "epoch": 10.914066138798324, |
| "grad_norm": 0.32489481568336487, |
| "learning_rate": 0.00046931042516016306, |
| "loss": 3.3924, |
| "step": 37500 |
| }, |
| { |
| "epoch": 10.92862133209129, |
| "grad_norm": 0.3091423809528351, |
| "learning_rate": 0.0004691357018054746, |
| "loss": 3.3915, |
| "step": 37550 |
| }, |
| { |
| "epoch": 10.943176525384256, |
| "grad_norm": 0.3518022894859314, |
| "learning_rate": 0.0004689609784507862, |
| "loss": 3.3855, |
| "step": 37600 |
| }, |
| { |
| "epoch": 10.957731718677223, |
| "grad_norm": 0.3540588915348053, |
| "learning_rate": 0.0004687862550960978, |
| "loss": 3.3879, |
| "step": 37650 |
| }, |
| { |
| "epoch": 10.97228691197019, |
| "grad_norm": 0.339358389377594, |
| "learning_rate": 0.00046861153174140944, |
| "loss": 3.3936, |
| "step": 37700 |
| }, |
| { |
| "epoch": 10.986842105263158, |
| "grad_norm": 0.3372827172279358, |
| "learning_rate": 0.000468436808386721, |
| "loss": 3.3831, |
| "step": 37750 |
| }, |
| { |
| "epoch": 11.001164415463437, |
| "grad_norm": 0.37063729763031006, |
| "learning_rate": 0.0004682620850320326, |
| "loss": 3.3715, |
| "step": 37800 |
| }, |
| { |
| "epoch": 11.015719608756404, |
| "grad_norm": 0.3537955582141876, |
| "learning_rate": 0.00046808736167734417, |
| "loss": 3.266, |
| "step": 37850 |
| }, |
| { |
| "epoch": 11.030274802049371, |
| "grad_norm": 0.34943118691444397, |
| "learning_rate": 0.0004679126383226557, |
| "loss": 3.2636, |
| "step": 37900 |
| }, |
| { |
| "epoch": 11.044829995342338, |
| "grad_norm": 0.323003351688385, |
| "learning_rate": 0.00046773791496796736, |
| "loss": 3.292, |
| "step": 37950 |
| }, |
| { |
| "epoch": 11.059385188635305, |
| "grad_norm": 0.351985901594162, |
| "learning_rate": 0.00046756319161327895, |
| "loss": 3.2872, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.059385188635305, |
| "eval_accuracy": 0.3697098756196774, |
| "eval_loss": 3.5635266304016113, |
| "eval_runtime": 179.5649, |
| "eval_samples_per_second": 92.73, |
| "eval_steps_per_second": 5.797, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.073940381928272, |
| "grad_norm": 0.3405875265598297, |
| "learning_rate": 0.00046738846825859054, |
| "loss": 3.2988, |
| "step": 38050 |
| }, |
| { |
| "epoch": 11.08849557522124, |
| "grad_norm": 0.3895106613636017, |
| "learning_rate": 0.0004672137449039021, |
| "loss": 3.2898, |
| "step": 38100 |
| }, |
| { |
| "epoch": 11.103050768514207, |
| "grad_norm": 0.34889012575149536, |
| "learning_rate": 0.0004670390215492137, |
| "loss": 3.2914, |
| "step": 38150 |
| }, |
| { |
| "epoch": 11.117605961807174, |
| "grad_norm": 0.3436652719974518, |
| "learning_rate": 0.0004668642981945253, |
| "loss": 3.2922, |
| "step": 38200 |
| }, |
| { |
| "epoch": 11.132161155100139, |
| "grad_norm": 0.3298552334308624, |
| "learning_rate": 0.0004666895748398369, |
| "loss": 3.3036, |
| "step": 38250 |
| }, |
| { |
| "epoch": 11.146716348393106, |
| "grad_norm": 0.374026358127594, |
| "learning_rate": 0.00046651485148514846, |
| "loss": 3.3078, |
| "step": 38300 |
| }, |
| { |
| "epoch": 11.161271541686073, |
| "grad_norm": 0.3402121067047119, |
| "learning_rate": 0.00046634012813046006, |
| "loss": 3.3049, |
| "step": 38350 |
| }, |
| { |
| "epoch": 11.17582673497904, |
| "grad_norm": 0.347309947013855, |
| "learning_rate": 0.00046616540477577165, |
| "loss": 3.3056, |
| "step": 38400 |
| }, |
| { |
| "epoch": 11.190381928272007, |
| "grad_norm": 0.38802456855773926, |
| "learning_rate": 0.00046599068142108324, |
| "loss": 3.3165, |
| "step": 38450 |
| }, |
| { |
| "epoch": 11.204937121564974, |
| "grad_norm": 0.3473553955554962, |
| "learning_rate": 0.0004658159580663948, |
| "loss": 3.314, |
| "step": 38500 |
| }, |
| { |
| "epoch": 11.219492314857941, |
| "grad_norm": 0.35995396971702576, |
| "learning_rate": 0.00046564123471170643, |
| "loss": 3.3263, |
| "step": 38550 |
| }, |
| { |
| "epoch": 11.234047508150908, |
| "grad_norm": 0.34611260890960693, |
| "learning_rate": 0.00046546651135701803, |
| "loss": 3.3238, |
| "step": 38600 |
| }, |
| { |
| "epoch": 11.248602701443875, |
| "grad_norm": 0.34514832496643066, |
| "learning_rate": 0.0004652917880023296, |
| "loss": 3.3183, |
| "step": 38650 |
| }, |
| { |
| "epoch": 11.263157894736842, |
| "grad_norm": 0.3476332724094391, |
| "learning_rate": 0.00046511706464764116, |
| "loss": 3.3175, |
| "step": 38700 |
| }, |
| { |
| "epoch": 11.27771308802981, |
| "grad_norm": 0.3759239912033081, |
| "learning_rate": 0.00046494234129295276, |
| "loss": 3.3355, |
| "step": 38750 |
| }, |
| { |
| "epoch": 11.292268281322777, |
| "grad_norm": 0.3533199727535248, |
| "learning_rate": 0.0004647676179382644, |
| "loss": 3.3161, |
| "step": 38800 |
| }, |
| { |
| "epoch": 11.306823474615744, |
| "grad_norm": 0.37267446517944336, |
| "learning_rate": 0.000464592894583576, |
| "loss": 3.331, |
| "step": 38850 |
| }, |
| { |
| "epoch": 11.32137866790871, |
| "grad_norm": 0.34054192900657654, |
| "learning_rate": 0.00046441817122888754, |
| "loss": 3.3426, |
| "step": 38900 |
| }, |
| { |
| "epoch": 11.335933861201676, |
| "grad_norm": 0.34166160225868225, |
| "learning_rate": 0.00046424344787419913, |
| "loss": 3.3492, |
| "step": 38950 |
| }, |
| { |
| "epoch": 11.350489054494643, |
| "grad_norm": 0.34584760665893555, |
| "learning_rate": 0.00046406872451951073, |
| "loss": 3.3239, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.350489054494643, |
| "eval_accuracy": 0.36916642935944916, |
| "eval_loss": 3.564857244491577, |
| "eval_runtime": 179.5454, |
| "eval_samples_per_second": 92.74, |
| "eval_steps_per_second": 5.798, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.36504424778761, |
| "grad_norm": 0.34652185440063477, |
| "learning_rate": 0.00046389400116482227, |
| "loss": 3.3311, |
| "step": 39050 |
| }, |
| { |
| "epoch": 11.379599441080577, |
| "grad_norm": 0.35235944390296936, |
| "learning_rate": 0.0004637192778101339, |
| "loss": 3.3451, |
| "step": 39100 |
| }, |
| { |
| "epoch": 11.394154634373544, |
| "grad_norm": 0.3453175723552704, |
| "learning_rate": 0.0004635445544554455, |
| "loss": 3.3386, |
| "step": 39150 |
| }, |
| { |
| "epoch": 11.408709827666511, |
| "grad_norm": 0.3537910282611847, |
| "learning_rate": 0.0004633698311007571, |
| "loss": 3.3399, |
| "step": 39200 |
| }, |
| { |
| "epoch": 11.423265020959478, |
| "grad_norm": 0.354648232460022, |
| "learning_rate": 0.0004631951077460687, |
| "loss": 3.3416, |
| "step": 39250 |
| }, |
| { |
| "epoch": 11.437820214252445, |
| "grad_norm": 0.3352389931678772, |
| "learning_rate": 0.00046302038439138024, |
| "loss": 3.3472, |
| "step": 39300 |
| }, |
| { |
| "epoch": 11.452375407545413, |
| "grad_norm": 0.33255258202552795, |
| "learning_rate": 0.0004628456610366919, |
| "loss": 3.3551, |
| "step": 39350 |
| }, |
| { |
| "epoch": 11.46693060083838, |
| "grad_norm": 0.36602121591567993, |
| "learning_rate": 0.0004626709376820035, |
| "loss": 3.3424, |
| "step": 39400 |
| }, |
| { |
| "epoch": 11.481485794131347, |
| "grad_norm": 0.3826133608818054, |
| "learning_rate": 0.0004624962143273151, |
| "loss": 3.3527, |
| "step": 39450 |
| }, |
| { |
| "epoch": 11.496040987424314, |
| "grad_norm": 0.3627864122390747, |
| "learning_rate": 0.0004623214909726266, |
| "loss": 3.344, |
| "step": 39500 |
| }, |
| { |
| "epoch": 11.51059618071728, |
| "grad_norm": 0.33537665009498596, |
| "learning_rate": 0.0004621467676179382, |
| "loss": 3.3587, |
| "step": 39550 |
| }, |
| { |
| "epoch": 11.525151374010246, |
| "grad_norm": 0.33809006214141846, |
| "learning_rate": 0.0004619720442632498, |
| "loss": 3.3333, |
| "step": 39600 |
| }, |
| { |
| "epoch": 11.539706567303213, |
| "grad_norm": 0.35963302850723267, |
| "learning_rate": 0.00046179732090856145, |
| "loss": 3.3527, |
| "step": 39650 |
| }, |
| { |
| "epoch": 11.55426176059618, |
| "grad_norm": 0.3328627347946167, |
| "learning_rate": 0.000461622597553873, |
| "loss": 3.3601, |
| "step": 39700 |
| }, |
| { |
| "epoch": 11.568816953889147, |
| "grad_norm": 0.34587082266807556, |
| "learning_rate": 0.0004614478741991846, |
| "loss": 3.3477, |
| "step": 39750 |
| }, |
| { |
| "epoch": 11.583372147182114, |
| "grad_norm": 0.347168505191803, |
| "learning_rate": 0.0004612731508444962, |
| "loss": 3.35, |
| "step": 39800 |
| }, |
| { |
| "epoch": 11.597927340475081, |
| "grad_norm": 0.3339124917984009, |
| "learning_rate": 0.0004610984274898077, |
| "loss": 3.3537, |
| "step": 39850 |
| }, |
| { |
| "epoch": 11.612482533768048, |
| "grad_norm": 0.3587067127227783, |
| "learning_rate": 0.00046092370413511937, |
| "loss": 3.3494, |
| "step": 39900 |
| }, |
| { |
| "epoch": 11.627037727061015, |
| "grad_norm": 0.34786462783813477, |
| "learning_rate": 0.00046074898078043096, |
| "loss": 3.3506, |
| "step": 39950 |
| }, |
| { |
| "epoch": 11.641592920353983, |
| "grad_norm": 0.3410889804363251, |
| "learning_rate": 0.00046057425742574256, |
| "loss": 3.3604, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.641592920353983, |
| "eval_accuracy": 0.3702274658727149, |
| "eval_loss": 3.556337594985962, |
| "eval_runtime": 179.6688, |
| "eval_samples_per_second": 92.676, |
| "eval_steps_per_second": 5.794, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.65614811364695, |
| "grad_norm": 0.32578304409980774, |
| "learning_rate": 0.0004603995340710541, |
| "loss": 3.3519, |
| "step": 40050 |
| }, |
| { |
| "epoch": 11.670703306939917, |
| "grad_norm": 0.33926668763160706, |
| "learning_rate": 0.0004602248107163657, |
| "loss": 3.3591, |
| "step": 40100 |
| }, |
| { |
| "epoch": 11.685258500232884, |
| "grad_norm": 0.3640845715999603, |
| "learning_rate": 0.0004600500873616773, |
| "loss": 3.3618, |
| "step": 40150 |
| }, |
| { |
| "epoch": 11.69981369352585, |
| "grad_norm": 0.34970220923423767, |
| "learning_rate": 0.00045987536400698894, |
| "loss": 3.3563, |
| "step": 40200 |
| }, |
| { |
| "epoch": 11.714368886818818, |
| "grad_norm": 0.3340449631214142, |
| "learning_rate": 0.0004597006406523005, |
| "loss": 3.3529, |
| "step": 40250 |
| }, |
| { |
| "epoch": 11.728924080111783, |
| "grad_norm": 0.3313457667827606, |
| "learning_rate": 0.00045952591729761207, |
| "loss": 3.3554, |
| "step": 40300 |
| }, |
| { |
| "epoch": 11.74347927340475, |
| "grad_norm": 0.3290119171142578, |
| "learning_rate": 0.00045935119394292367, |
| "loss": 3.3634, |
| "step": 40350 |
| }, |
| { |
| "epoch": 11.758034466697717, |
| "grad_norm": 0.34706369042396545, |
| "learning_rate": 0.00045917647058823526, |
| "loss": 3.3626, |
| "step": 40400 |
| }, |
| { |
| "epoch": 11.772589659990684, |
| "grad_norm": 0.3630402982234955, |
| "learning_rate": 0.0004590017472335468, |
| "loss": 3.3507, |
| "step": 40450 |
| }, |
| { |
| "epoch": 11.787144853283651, |
| "grad_norm": 0.3313291668891907, |
| "learning_rate": 0.00045882702387885845, |
| "loss": 3.3646, |
| "step": 40500 |
| }, |
| { |
| "epoch": 11.801700046576618, |
| "grad_norm": 0.3491012454032898, |
| "learning_rate": 0.00045865230052417004, |
| "loss": 3.3703, |
| "step": 40550 |
| }, |
| { |
| "epoch": 11.816255239869585, |
| "grad_norm": 0.3649109899997711, |
| "learning_rate": 0.00045847757716948164, |
| "loss": 3.3689, |
| "step": 40600 |
| }, |
| { |
| "epoch": 11.830810433162553, |
| "grad_norm": 0.3404376804828644, |
| "learning_rate": 0.0004583028538147932, |
| "loss": 3.3697, |
| "step": 40650 |
| }, |
| { |
| "epoch": 11.84536562645552, |
| "grad_norm": 0.35776814818382263, |
| "learning_rate": 0.00045812813046010477, |
| "loss": 3.3625, |
| "step": 40700 |
| }, |
| { |
| "epoch": 11.859920819748487, |
| "grad_norm": 0.3335564434528351, |
| "learning_rate": 0.0004579534071054164, |
| "loss": 3.3575, |
| "step": 40750 |
| }, |
| { |
| "epoch": 11.874476013041454, |
| "grad_norm": 0.3584432899951935, |
| "learning_rate": 0.000457778683750728, |
| "loss": 3.3683, |
| "step": 40800 |
| }, |
| { |
| "epoch": 11.88903120633442, |
| "grad_norm": 0.372200071811676, |
| "learning_rate": 0.00045760396039603955, |
| "loss": 3.3574, |
| "step": 40850 |
| }, |
| { |
| "epoch": 11.903586399627388, |
| "grad_norm": 0.3619687855243683, |
| "learning_rate": 0.00045742923704135115, |
| "loss": 3.3737, |
| "step": 40900 |
| }, |
| { |
| "epoch": 11.918141592920353, |
| "grad_norm": 0.36458566784858704, |
| "learning_rate": 0.00045725451368666274, |
| "loss": 3.3734, |
| "step": 40950 |
| }, |
| { |
| "epoch": 11.93269678621332, |
| "grad_norm": 0.36355504393577576, |
| "learning_rate": 0.0004570797903319743, |
| "loss": 3.3633, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.93269678621332, |
| "eval_accuracy": 0.37053221417564997, |
| "eval_loss": 3.548603057861328, |
| "eval_runtime": 179.6555, |
| "eval_samples_per_second": 92.683, |
| "eval_steps_per_second": 5.794, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.947251979506287, |
| "grad_norm": 0.3612082302570343, |
| "learning_rate": 0.00045690506697728593, |
| "loss": 3.3748, |
| "step": 41050 |
| }, |
| { |
| "epoch": 11.961807172799254, |
| "grad_norm": 0.31688255071640015, |
| "learning_rate": 0.0004567303436225975, |
| "loss": 3.3693, |
| "step": 41100 |
| }, |
| { |
| "epoch": 11.976362366092221, |
| "grad_norm": 0.33474868535995483, |
| "learning_rate": 0.0004565556202679091, |
| "loss": 3.3685, |
| "step": 41150 |
| }, |
| { |
| "epoch": 11.990917559385188, |
| "grad_norm": 0.33027711510658264, |
| "learning_rate": 0.00045638089691322066, |
| "loss": 3.381, |
| "step": 41200 |
| }, |
| { |
| "epoch": 12.005239869585468, |
| "grad_norm": 0.3528062105178833, |
| "learning_rate": 0.00045620617355853225, |
| "loss": 3.3416, |
| "step": 41250 |
| }, |
| { |
| "epoch": 12.019795062878435, |
| "grad_norm": 0.32908952236175537, |
| "learning_rate": 0.0004560314502038439, |
| "loss": 3.2629, |
| "step": 41300 |
| }, |
| { |
| "epoch": 12.034350256171402, |
| "grad_norm": 0.37203720211982727, |
| "learning_rate": 0.0004558567268491555, |
| "loss": 3.2617, |
| "step": 41350 |
| }, |
| { |
| "epoch": 12.04890544946437, |
| "grad_norm": 0.34509584307670593, |
| "learning_rate": 0.00045568200349446704, |
| "loss": 3.2684, |
| "step": 41400 |
| }, |
| { |
| "epoch": 12.063460642757336, |
| "grad_norm": 0.3308143615722656, |
| "learning_rate": 0.00045550728013977863, |
| "loss": 3.2681, |
| "step": 41450 |
| }, |
| { |
| "epoch": 12.078015836050303, |
| "grad_norm": 0.3766533136367798, |
| "learning_rate": 0.0004553325567850902, |
| "loss": 3.2629, |
| "step": 41500 |
| }, |
| { |
| "epoch": 12.09257102934327, |
| "grad_norm": 0.3604935109615326, |
| "learning_rate": 0.0004551578334304018, |
| "loss": 3.2869, |
| "step": 41550 |
| }, |
| { |
| "epoch": 12.107126222636236, |
| "grad_norm": 0.34294411540031433, |
| "learning_rate": 0.00045498311007571347, |
| "loss": 3.2633, |
| "step": 41600 |
| }, |
| { |
| "epoch": 12.121681415929203, |
| "grad_norm": 0.34739822149276733, |
| "learning_rate": 0.000454808386721025, |
| "loss": 3.2834, |
| "step": 41650 |
| }, |
| { |
| "epoch": 12.13623660922217, |
| "grad_norm": 0.3293208181858063, |
| "learning_rate": 0.0004546336633663366, |
| "loss": 3.286, |
| "step": 41700 |
| }, |
| { |
| "epoch": 12.150791802515137, |
| "grad_norm": 0.346708208322525, |
| "learning_rate": 0.0004544589400116482, |
| "loss": 3.287, |
| "step": 41750 |
| }, |
| { |
| "epoch": 12.165346995808104, |
| "grad_norm": 0.3570074439048767, |
| "learning_rate": 0.00045428421665695974, |
| "loss": 3.2899, |
| "step": 41800 |
| }, |
| { |
| "epoch": 12.179902189101071, |
| "grad_norm": 0.34867045283317566, |
| "learning_rate": 0.00045410949330227133, |
| "loss": 3.2963, |
| "step": 41850 |
| }, |
| { |
| "epoch": 12.194457382394038, |
| "grad_norm": 0.3396155834197998, |
| "learning_rate": 0.000453934769947583, |
| "loss": 3.2904, |
| "step": 41900 |
| }, |
| { |
| "epoch": 12.209012575687005, |
| "grad_norm": 0.3608848452568054, |
| "learning_rate": 0.0004537600465928946, |
| "loss": 3.3067, |
| "step": 41950 |
| }, |
| { |
| "epoch": 12.223567768979972, |
| "grad_norm": 0.3715206980705261, |
| "learning_rate": 0.0004535853232382061, |
| "loss": 3.3018, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.223567768979972, |
| "eval_accuracy": 0.369946693140084, |
| "eval_loss": 3.5617122650146484, |
| "eval_runtime": 179.8742, |
| "eval_samples_per_second": 92.57, |
| "eval_steps_per_second": 5.787, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.23812296227294, |
| "grad_norm": 0.3436983823776245, |
| "learning_rate": 0.0004534105998835177, |
| "loss": 3.3002, |
| "step": 42050 |
| }, |
| { |
| "epoch": 12.252678155565906, |
| "grad_norm": 0.3525453805923462, |
| "learning_rate": 0.0004532358765288293, |
| "loss": 3.3219, |
| "step": 42100 |
| }, |
| { |
| "epoch": 12.267233348858873, |
| "grad_norm": 0.3803021013736725, |
| "learning_rate": 0.00045306115317414095, |
| "loss": 3.2979, |
| "step": 42150 |
| }, |
| { |
| "epoch": 12.28178854215184, |
| "grad_norm": 0.36688610911369324, |
| "learning_rate": 0.0004528864298194525, |
| "loss": 3.3118, |
| "step": 42200 |
| }, |
| { |
| "epoch": 12.296343735444808, |
| "grad_norm": 0.3614189326763153, |
| "learning_rate": 0.0004527117064647641, |
| "loss": 3.3043, |
| "step": 42250 |
| }, |
| { |
| "epoch": 12.310898928737773, |
| "grad_norm": 0.41557541489601135, |
| "learning_rate": 0.0004525369831100757, |
| "loss": 3.3259, |
| "step": 42300 |
| }, |
| { |
| "epoch": 12.32545412203074, |
| "grad_norm": 0.34607186913490295, |
| "learning_rate": 0.0004523622597553872, |
| "loss": 3.3075, |
| "step": 42350 |
| }, |
| { |
| "epoch": 12.340009315323707, |
| "grad_norm": 0.36357954144477844, |
| "learning_rate": 0.0004521875364006988, |
| "loss": 3.3127, |
| "step": 42400 |
| }, |
| { |
| "epoch": 12.354564508616674, |
| "grad_norm": 0.35901084542274475, |
| "learning_rate": 0.00045201281304601046, |
| "loss": 3.3222, |
| "step": 42450 |
| }, |
| { |
| "epoch": 12.369119701909641, |
| "grad_norm": 0.34429091215133667, |
| "learning_rate": 0.00045183808969132206, |
| "loss": 3.3268, |
| "step": 42500 |
| }, |
| { |
| "epoch": 12.383674895202608, |
| "grad_norm": 0.3401288688182831, |
| "learning_rate": 0.00045166336633663365, |
| "loss": 3.3147, |
| "step": 42550 |
| }, |
| { |
| "epoch": 12.398230088495575, |
| "grad_norm": 0.3882862329483032, |
| "learning_rate": 0.0004514886429819452, |
| "loss": 3.3241, |
| "step": 42600 |
| }, |
| { |
| "epoch": 12.412785281788542, |
| "grad_norm": 0.33733412623405457, |
| "learning_rate": 0.0004513139196272568, |
| "loss": 3.3212, |
| "step": 42650 |
| }, |
| { |
| "epoch": 12.42734047508151, |
| "grad_norm": 0.35020992159843445, |
| "learning_rate": 0.00045113919627256843, |
| "loss": 3.3179, |
| "step": 42700 |
| }, |
| { |
| "epoch": 12.441895668374476, |
| "grad_norm": 0.36065271496772766, |
| "learning_rate": 0.00045096447291788003, |
| "loss": 3.3266, |
| "step": 42750 |
| }, |
| { |
| "epoch": 12.456450861667443, |
| "grad_norm": 0.34550905227661133, |
| "learning_rate": 0.00045078974956319157, |
| "loss": 3.3288, |
| "step": 42800 |
| }, |
| { |
| "epoch": 12.47100605496041, |
| "grad_norm": 0.3330172896385193, |
| "learning_rate": 0.00045061502620850316, |
| "loss": 3.328, |
| "step": 42850 |
| }, |
| { |
| "epoch": 12.485561248253378, |
| "grad_norm": 0.3609847128391266, |
| "learning_rate": 0.00045044030285381476, |
| "loss": 3.3256, |
| "step": 42900 |
| }, |
| { |
| "epoch": 12.500116441546343, |
| "grad_norm": 0.35363641381263733, |
| "learning_rate": 0.0004502655794991263, |
| "loss": 3.3383, |
| "step": 42950 |
| }, |
| { |
| "epoch": 12.51467163483931, |
| "grad_norm": 0.3697940707206726, |
| "learning_rate": 0.00045009085614443795, |
| "loss": 3.3237, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.51467163483931, |
| "eval_accuracy": 0.3703875380626869, |
| "eval_loss": 3.5561211109161377, |
| "eval_runtime": 179.7227, |
| "eval_samples_per_second": 92.648, |
| "eval_steps_per_second": 5.792, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.529226828132277, |
| "grad_norm": 0.37186741828918457, |
| "learning_rate": 0.00044991613278974954, |
| "loss": 3.3344, |
| "step": 43050 |
| }, |
| { |
| "epoch": 12.543782021425244, |
| "grad_norm": 0.3866070508956909, |
| "learning_rate": 0.00044974140943506113, |
| "loss": 3.3314, |
| "step": 43100 |
| }, |
| { |
| "epoch": 12.558337214718211, |
| "grad_norm": 0.363854318857193, |
| "learning_rate": 0.0004495666860803727, |
| "loss": 3.3361, |
| "step": 43150 |
| }, |
| { |
| "epoch": 12.572892408011178, |
| "grad_norm": 0.3626774251461029, |
| "learning_rate": 0.00044939196272568427, |
| "loss": 3.3318, |
| "step": 43200 |
| }, |
| { |
| "epoch": 12.587447601304145, |
| "grad_norm": 0.38811835646629333, |
| "learning_rate": 0.00044921723937099586, |
| "loss": 3.3316, |
| "step": 43250 |
| }, |
| { |
| "epoch": 12.602002794597112, |
| "grad_norm": 0.35809630155563354, |
| "learning_rate": 0.0004490425160163075, |
| "loss": 3.3345, |
| "step": 43300 |
| }, |
| { |
| "epoch": 12.61655798789008, |
| "grad_norm": 0.3418888747692108, |
| "learning_rate": 0.00044886779266161905, |
| "loss": 3.3445, |
| "step": 43350 |
| }, |
| { |
| "epoch": 12.631113181183046, |
| "grad_norm": 0.3369224965572357, |
| "learning_rate": 0.00044869306930693065, |
| "loss": 3.3289, |
| "step": 43400 |
| }, |
| { |
| "epoch": 12.645668374476013, |
| "grad_norm": 0.3346802592277527, |
| "learning_rate": 0.00044851834595224224, |
| "loss": 3.3417, |
| "step": 43450 |
| }, |
| { |
| "epoch": 12.66022356776898, |
| "grad_norm": 0.3884475827217102, |
| "learning_rate": 0.00044834362259755383, |
| "loss": 3.3461, |
| "step": 43500 |
| }, |
| { |
| "epoch": 12.674778761061948, |
| "grad_norm": 0.35574793815612793, |
| "learning_rate": 0.00044816889924286543, |
| "loss": 3.3445, |
| "step": 43550 |
| }, |
| { |
| "epoch": 12.689333954354915, |
| "grad_norm": 0.3697713315486908, |
| "learning_rate": 0.000447994175888177, |
| "loss": 3.3392, |
| "step": 43600 |
| }, |
| { |
| "epoch": 12.703889147647882, |
| "grad_norm": 0.35330986976623535, |
| "learning_rate": 0.0004478194525334886, |
| "loss": 3.3326, |
| "step": 43650 |
| }, |
| { |
| "epoch": 12.718444340940847, |
| "grad_norm": 0.3685454726219177, |
| "learning_rate": 0.0004476447291788002, |
| "loss": 3.3491, |
| "step": 43700 |
| }, |
| { |
| "epoch": 12.732999534233814, |
| "grad_norm": 0.3632424473762512, |
| "learning_rate": 0.00044747000582411175, |
| "loss": 3.3416, |
| "step": 43750 |
| }, |
| { |
| "epoch": 12.747554727526781, |
| "grad_norm": 0.35729971528053284, |
| "learning_rate": 0.00044729528246942335, |
| "loss": 3.3286, |
| "step": 43800 |
| }, |
| { |
| "epoch": 12.762109920819748, |
| "grad_norm": 0.36447879672050476, |
| "learning_rate": 0.000447120559114735, |
| "loss": 3.331, |
| "step": 43850 |
| }, |
| { |
| "epoch": 12.776665114112715, |
| "grad_norm": 0.34526267647743225, |
| "learning_rate": 0.0004469458357600466, |
| "loss": 3.3558, |
| "step": 43900 |
| }, |
| { |
| "epoch": 12.791220307405682, |
| "grad_norm": 0.36058372259140015, |
| "learning_rate": 0.00044677111240535813, |
| "loss": 3.3435, |
| "step": 43950 |
| }, |
| { |
| "epoch": 12.80577550069865, |
| "grad_norm": 0.37143009901046753, |
| "learning_rate": 0.0004465963890506697, |
| "loss": 3.3553, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.80577550069865, |
| "eval_accuracy": 0.37099597692280845, |
| "eval_loss": 3.545367479324341, |
| "eval_runtime": 179.758, |
| "eval_samples_per_second": 92.63, |
| "eval_steps_per_second": 5.791, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.820330693991616, |
| "grad_norm": 0.35331812500953674, |
| "learning_rate": 0.0004464216656959813, |
| "loss": 3.3553, |
| "step": 44050 |
| }, |
| { |
| "epoch": 12.834885887284583, |
| "grad_norm": 0.3394143283367157, |
| "learning_rate": 0.00044624694234129297, |
| "loss": 3.3457, |
| "step": 44100 |
| }, |
| { |
| "epoch": 12.84944108057755, |
| "grad_norm": 0.36839503049850464, |
| "learning_rate": 0.0004460722189866045, |
| "loss": 3.3476, |
| "step": 44150 |
| }, |
| { |
| "epoch": 12.863996273870518, |
| "grad_norm": 0.3979646563529968, |
| "learning_rate": 0.0004458974956319161, |
| "loss": 3.3549, |
| "step": 44200 |
| }, |
| { |
| "epoch": 12.878551467163485, |
| "grad_norm": 0.3441554605960846, |
| "learning_rate": 0.0004457227722772277, |
| "loss": 3.3421, |
| "step": 44250 |
| }, |
| { |
| "epoch": 12.89310666045645, |
| "grad_norm": 0.3389278054237366, |
| "learning_rate": 0.00044554804892253923, |
| "loss": 3.3441, |
| "step": 44300 |
| }, |
| { |
| "epoch": 12.907661853749417, |
| "grad_norm": 0.3513207733631134, |
| "learning_rate": 0.00044537332556785083, |
| "loss": 3.3579, |
| "step": 44350 |
| }, |
| { |
| "epoch": 12.922217047042384, |
| "grad_norm": 0.3634328544139862, |
| "learning_rate": 0.0004451986022131625, |
| "loss": 3.3447, |
| "step": 44400 |
| }, |
| { |
| "epoch": 12.936772240335351, |
| "grad_norm": 0.34999966621398926, |
| "learning_rate": 0.00044502387885847407, |
| "loss": 3.3522, |
| "step": 44450 |
| }, |
| { |
| "epoch": 12.951327433628318, |
| "grad_norm": 0.34144464135169983, |
| "learning_rate": 0.0004448491555037856, |
| "loss": 3.3623, |
| "step": 44500 |
| }, |
| { |
| "epoch": 12.965882626921285, |
| "grad_norm": 0.35570549964904785, |
| "learning_rate": 0.0004446744321490972, |
| "loss": 3.3436, |
| "step": 44550 |
| }, |
| { |
| "epoch": 12.980437820214252, |
| "grad_norm": 0.351752907037735, |
| "learning_rate": 0.0004444997087944088, |
| "loss": 3.3526, |
| "step": 44600 |
| }, |
| { |
| "epoch": 12.99499301350722, |
| "grad_norm": 0.3449532389640808, |
| "learning_rate": 0.0004443249854397204, |
| "loss": 3.3437, |
| "step": 44650 |
| }, |
| { |
| "epoch": 13.009315323707499, |
| "grad_norm": 0.33416709303855896, |
| "learning_rate": 0.000444150262085032, |
| "loss": 3.2859, |
| "step": 44700 |
| }, |
| { |
| "epoch": 13.023870517000466, |
| "grad_norm": 0.34715038537979126, |
| "learning_rate": 0.0004439755387303436, |
| "loss": 3.2449, |
| "step": 44750 |
| }, |
| { |
| "epoch": 13.038425710293433, |
| "grad_norm": 0.3305808901786804, |
| "learning_rate": 0.0004438008153756552, |
| "loss": 3.2539, |
| "step": 44800 |
| }, |
| { |
| "epoch": 13.0529809035864, |
| "grad_norm": 0.38518592715263367, |
| "learning_rate": 0.00044362609202096677, |
| "loss": 3.2619, |
| "step": 44850 |
| }, |
| { |
| "epoch": 13.067536096879367, |
| "grad_norm": 0.35659927129745483, |
| "learning_rate": 0.0004434513686662783, |
| "loss": 3.2589, |
| "step": 44900 |
| }, |
| { |
| "epoch": 13.082091290172334, |
| "grad_norm": 0.3617592453956604, |
| "learning_rate": 0.00044327664531158996, |
| "loss": 3.2477, |
| "step": 44950 |
| }, |
| { |
| "epoch": 13.0966464834653, |
| "grad_norm": 0.36155426502227783, |
| "learning_rate": 0.00044310192195690155, |
| "loss": 3.266, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.0966464834653, |
| "eval_accuracy": 0.370269658175358, |
| "eval_loss": 3.562215566635132, |
| "eval_runtime": 179.6666, |
| "eval_samples_per_second": 92.677, |
| "eval_steps_per_second": 5.794, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.111201676758267, |
| "grad_norm": 0.35711902379989624, |
| "learning_rate": 0.00044292719860221315, |
| "loss": 3.2665, |
| "step": 45050 |
| }, |
| { |
| "epoch": 13.125756870051234, |
| "grad_norm": 0.3325355648994446, |
| "learning_rate": 0.0004427524752475247, |
| "loss": 3.2799, |
| "step": 45100 |
| }, |
| { |
| "epoch": 13.1403120633442, |
| "grad_norm": 0.3590795695781708, |
| "learning_rate": 0.0004425777518928363, |
| "loss": 3.2741, |
| "step": 45150 |
| }, |
| { |
| "epoch": 13.154867256637168, |
| "grad_norm": 0.36923661828041077, |
| "learning_rate": 0.0004424030285381479, |
| "loss": 3.2727, |
| "step": 45200 |
| }, |
| { |
| "epoch": 13.169422449930135, |
| "grad_norm": 0.3838081359863281, |
| "learning_rate": 0.0004422283051834595, |
| "loss": 3.2678, |
| "step": 45250 |
| }, |
| { |
| "epoch": 13.183977643223102, |
| "grad_norm": 0.36855098605155945, |
| "learning_rate": 0.00044205358182877107, |
| "loss": 3.2836, |
| "step": 45300 |
| }, |
| { |
| "epoch": 13.198532836516069, |
| "grad_norm": 0.3482199013233185, |
| "learning_rate": 0.00044187885847408266, |
| "loss": 3.2763, |
| "step": 45350 |
| }, |
| { |
| "epoch": 13.213088029809036, |
| "grad_norm": 0.3591024875640869, |
| "learning_rate": 0.00044170413511939425, |
| "loss": 3.2913, |
| "step": 45400 |
| }, |
| { |
| "epoch": 13.227643223102003, |
| "grad_norm": 0.3705367147922516, |
| "learning_rate": 0.0004415294117647058, |
| "loss": 3.2844, |
| "step": 45450 |
| }, |
| { |
| "epoch": 13.24219841639497, |
| "grad_norm": 0.35116633772850037, |
| "learning_rate": 0.00044135468841001744, |
| "loss": 3.2834, |
| "step": 45500 |
| }, |
| { |
| "epoch": 13.256753609687937, |
| "grad_norm": 0.3566817343235016, |
| "learning_rate": 0.00044117996505532904, |
| "loss": 3.2826, |
| "step": 45550 |
| }, |
| { |
| "epoch": 13.271308802980904, |
| "grad_norm": 0.35147467255592346, |
| "learning_rate": 0.00044100524170064063, |
| "loss": 3.2919, |
| "step": 45600 |
| }, |
| { |
| "epoch": 13.285863996273871, |
| "grad_norm": 0.3430477976799011, |
| "learning_rate": 0.0004408305183459522, |
| "loss": 3.2787, |
| "step": 45650 |
| }, |
| { |
| "epoch": 13.300419189566837, |
| "grad_norm": 0.37397006154060364, |
| "learning_rate": 0.00044065579499126377, |
| "loss": 3.2915, |
| "step": 45700 |
| }, |
| { |
| "epoch": 13.314974382859804, |
| "grad_norm": 0.38754358887672424, |
| "learning_rate": 0.00044048107163657536, |
| "loss": 3.3088, |
| "step": 45750 |
| }, |
| { |
| "epoch": 13.32952957615277, |
| "grad_norm": 0.3674768805503845, |
| "learning_rate": 0.000440306348281887, |
| "loss": 3.2898, |
| "step": 45800 |
| }, |
| { |
| "epoch": 13.344084769445738, |
| "grad_norm": 0.3887562155723572, |
| "learning_rate": 0.0004401316249271986, |
| "loss": 3.2986, |
| "step": 45850 |
| }, |
| { |
| "epoch": 13.358639962738705, |
| "grad_norm": 0.36924323439598083, |
| "learning_rate": 0.00043995690157251014, |
| "loss": 3.2946, |
| "step": 45900 |
| }, |
| { |
| "epoch": 13.373195156031672, |
| "grad_norm": 0.3964468538761139, |
| "learning_rate": 0.00043978217821782174, |
| "loss": 3.2932, |
| "step": 45950 |
| }, |
| { |
| "epoch": 13.387750349324639, |
| "grad_norm": 0.3599477708339691, |
| "learning_rate": 0.00043960745486313333, |
| "loss": 3.298, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.387750349324639, |
| "eval_accuracy": 0.37062588343806385, |
| "eval_loss": 3.556173086166382, |
| "eval_runtime": 179.6561, |
| "eval_samples_per_second": 92.683, |
| "eval_steps_per_second": 5.794, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.402305542617606, |
| "grad_norm": 0.3589121103286743, |
| "learning_rate": 0.00043943273150844487, |
| "loss": 3.3072, |
| "step": 46050 |
| }, |
| { |
| "epoch": 13.416860735910573, |
| "grad_norm": 0.3620617687702179, |
| "learning_rate": 0.0004392580081537565, |
| "loss": 3.3076, |
| "step": 46100 |
| }, |
| { |
| "epoch": 13.43141592920354, |
| "grad_norm": 0.3338106870651245, |
| "learning_rate": 0.0004390832847990681, |
| "loss": 3.306, |
| "step": 46150 |
| }, |
| { |
| "epoch": 13.445971122496507, |
| "grad_norm": 0.34184029698371887, |
| "learning_rate": 0.0004389085614443797, |
| "loss": 3.3005, |
| "step": 46200 |
| }, |
| { |
| "epoch": 13.460526315789474, |
| "grad_norm": 0.3470214009284973, |
| "learning_rate": 0.00043873383808969125, |
| "loss": 3.3065, |
| "step": 46250 |
| }, |
| { |
| "epoch": 13.475081509082441, |
| "grad_norm": 0.39131850004196167, |
| "learning_rate": 0.00043855911473500284, |
| "loss": 3.3043, |
| "step": 46300 |
| }, |
| { |
| "epoch": 13.489636702375407, |
| "grad_norm": 0.37218213081359863, |
| "learning_rate": 0.0004383843913803145, |
| "loss": 3.329, |
| "step": 46350 |
| }, |
| { |
| "epoch": 13.504191895668374, |
| "grad_norm": 0.3680576980113983, |
| "learning_rate": 0.0004382096680256261, |
| "loss": 3.3217, |
| "step": 46400 |
| }, |
| { |
| "epoch": 13.51874708896134, |
| "grad_norm": 0.4002123177051544, |
| "learning_rate": 0.0004380349446709376, |
| "loss": 3.3121, |
| "step": 46450 |
| }, |
| { |
| "epoch": 13.533302282254308, |
| "grad_norm": 0.36680230498313904, |
| "learning_rate": 0.0004378602213162492, |
| "loss": 3.3098, |
| "step": 46500 |
| }, |
| { |
| "epoch": 13.547857475547275, |
| "grad_norm": 0.32874155044555664, |
| "learning_rate": 0.0004376854979615608, |
| "loss": 3.304, |
| "step": 46550 |
| }, |
| { |
| "epoch": 13.562412668840242, |
| "grad_norm": 0.3655678629875183, |
| "learning_rate": 0.0004375107746068724, |
| "loss": 3.3152, |
| "step": 46600 |
| }, |
| { |
| "epoch": 13.576967862133209, |
| "grad_norm": 0.3674863874912262, |
| "learning_rate": 0.000437336051252184, |
| "loss": 3.3161, |
| "step": 46650 |
| }, |
| { |
| "epoch": 13.591523055426176, |
| "grad_norm": 0.34729015827178955, |
| "learning_rate": 0.0004371613278974956, |
| "loss": 3.3201, |
| "step": 46700 |
| }, |
| { |
| "epoch": 13.606078248719143, |
| "grad_norm": 0.3326950967311859, |
| "learning_rate": 0.0004369866045428072, |
| "loss": 3.305, |
| "step": 46750 |
| }, |
| { |
| "epoch": 13.62063344201211, |
| "grad_norm": 0.38385745882987976, |
| "learning_rate": 0.0004368118811881188, |
| "loss": 3.3202, |
| "step": 46800 |
| }, |
| { |
| "epoch": 13.635188635305077, |
| "grad_norm": 0.35021325945854187, |
| "learning_rate": 0.0004366371578334303, |
| "loss": 3.3206, |
| "step": 46850 |
| }, |
| { |
| "epoch": 13.649743828598044, |
| "grad_norm": 0.36092448234558105, |
| "learning_rate": 0.000436462434478742, |
| "loss": 3.3223, |
| "step": 46900 |
| }, |
| { |
| "epoch": 13.664299021891011, |
| "grad_norm": 0.35010001063346863, |
| "learning_rate": 0.00043628771112405357, |
| "loss": 3.3283, |
| "step": 46950 |
| }, |
| { |
| "epoch": 13.678854215183978, |
| "grad_norm": 0.33859771490097046, |
| "learning_rate": 0.00043611298776936516, |
| "loss": 3.3193, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.678854215183978, |
| "eval_accuracy": 0.37116862453445965, |
| "eval_loss": 3.5511391162872314, |
| "eval_runtime": 179.5909, |
| "eval_samples_per_second": 92.716, |
| "eval_steps_per_second": 5.797, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.693409408476944, |
| "grad_norm": 0.35317522287368774, |
| "learning_rate": 0.0004359382644146767, |
| "loss": 3.3187, |
| "step": 47050 |
| }, |
| { |
| "epoch": 13.70796460176991, |
| "grad_norm": 0.4268363118171692, |
| "learning_rate": 0.0004357635410599883, |
| "loss": 3.3372, |
| "step": 47100 |
| }, |
| { |
| "epoch": 13.722519795062878, |
| "grad_norm": 0.3827083706855774, |
| "learning_rate": 0.0004355888177052999, |
| "loss": 3.3369, |
| "step": 47150 |
| }, |
| { |
| "epoch": 13.737074988355845, |
| "grad_norm": 0.3427564203739166, |
| "learning_rate": 0.00043541409435061154, |
| "loss": 3.3282, |
| "step": 47200 |
| }, |
| { |
| "epoch": 13.751630181648812, |
| "grad_norm": 0.36137208342552185, |
| "learning_rate": 0.0004352393709959231, |
| "loss": 3.3189, |
| "step": 47250 |
| }, |
| { |
| "epoch": 13.766185374941779, |
| "grad_norm": 0.33945193886756897, |
| "learning_rate": 0.0004350646476412347, |
| "loss": 3.3331, |
| "step": 47300 |
| }, |
| { |
| "epoch": 13.780740568234746, |
| "grad_norm": 0.40009352564811707, |
| "learning_rate": 0.00043488992428654627, |
| "loss": 3.3407, |
| "step": 47350 |
| }, |
| { |
| "epoch": 13.795295761527713, |
| "grad_norm": 0.3917628228664398, |
| "learning_rate": 0.0004347152009318578, |
| "loss": 3.33, |
| "step": 47400 |
| }, |
| { |
| "epoch": 13.80985095482068, |
| "grad_norm": 0.3363507091999054, |
| "learning_rate": 0.00043454047757716946, |
| "loss": 3.3353, |
| "step": 47450 |
| }, |
| { |
| "epoch": 13.824406148113647, |
| "grad_norm": 0.33481940627098083, |
| "learning_rate": 0.00043436575422248105, |
| "loss": 3.3309, |
| "step": 47500 |
| }, |
| { |
| "epoch": 13.838961341406614, |
| "grad_norm": 0.3893332779407501, |
| "learning_rate": 0.00043419103086779265, |
| "loss": 3.3376, |
| "step": 47550 |
| }, |
| { |
| "epoch": 13.853516534699581, |
| "grad_norm": 0.3612382411956787, |
| "learning_rate": 0.0004340163075131042, |
| "loss": 3.3238, |
| "step": 47600 |
| }, |
| { |
| "epoch": 13.868071727992549, |
| "grad_norm": 0.3898193836212158, |
| "learning_rate": 0.0004338415841584158, |
| "loss": 3.3458, |
| "step": 47650 |
| }, |
| { |
| "epoch": 13.882626921285514, |
| "grad_norm": 0.38638564944267273, |
| "learning_rate": 0.0004336668608037274, |
| "loss": 3.3269, |
| "step": 47700 |
| }, |
| { |
| "epoch": 13.89718211457848, |
| "grad_norm": 0.3925230801105499, |
| "learning_rate": 0.000433492137449039, |
| "loss": 3.3353, |
| "step": 47750 |
| }, |
| { |
| "epoch": 13.911737307871448, |
| "grad_norm": 0.3488530218601227, |
| "learning_rate": 0.00043331741409435056, |
| "loss": 3.3336, |
| "step": 47800 |
| }, |
| { |
| "epoch": 13.926292501164415, |
| "grad_norm": 0.35001108050346375, |
| "learning_rate": 0.00043314269073966216, |
| "loss": 3.3381, |
| "step": 47850 |
| }, |
| { |
| "epoch": 13.940847694457382, |
| "grad_norm": 0.3923191428184509, |
| "learning_rate": 0.00043296796738497375, |
| "loss": 3.3239, |
| "step": 47900 |
| }, |
| { |
| "epoch": 13.955402887750349, |
| "grad_norm": 0.37925252318382263, |
| "learning_rate": 0.00043279324403028535, |
| "loss": 3.352, |
| "step": 47950 |
| }, |
| { |
| "epoch": 13.969958081043316, |
| "grad_norm": 0.3492947220802307, |
| "learning_rate": 0.0004326185206755969, |
| "loss": 3.3377, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.969958081043316, |
| "eval_accuracy": 0.3718900071350827, |
| "eval_loss": 3.5379514694213867, |
| "eval_runtime": 179.6614, |
| "eval_samples_per_second": 92.68, |
| "eval_steps_per_second": 5.794, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.984513274336283, |
| "grad_norm": 0.3619728982448578, |
| "learning_rate": 0.00043244379732090854, |
| "loss": 3.3463, |
| "step": 48050 |
| }, |
| { |
| "epoch": 13.99906846762925, |
| "grad_norm": 0.34894421696662903, |
| "learning_rate": 0.00043226907396622013, |
| "loss": 3.3319, |
| "step": 48100 |
| }, |
| { |
| "epoch": 14.01339077782953, |
| "grad_norm": 0.3701537549495697, |
| "learning_rate": 0.0004320943506115317, |
| "loss": 3.2336, |
| "step": 48150 |
| }, |
| { |
| "epoch": 14.027945971122497, |
| "grad_norm": 0.33407357335090637, |
| "learning_rate": 0.00043191962725684326, |
| "loss": 3.2156, |
| "step": 48200 |
| }, |
| { |
| "epoch": 14.042501164415464, |
| "grad_norm": 0.3887421190738678, |
| "learning_rate": 0.00043174490390215486, |
| "loss": 3.2451, |
| "step": 48250 |
| }, |
| { |
| "epoch": 14.057056357708431, |
| "grad_norm": 0.34901162981987, |
| "learning_rate": 0.0004315701805474665, |
| "loss": 3.2342, |
| "step": 48300 |
| }, |
| { |
| "epoch": 14.071611551001398, |
| "grad_norm": 0.36356404423713684, |
| "learning_rate": 0.0004313954571927781, |
| "loss": 3.2384, |
| "step": 48350 |
| }, |
| { |
| "epoch": 14.086166744294363, |
| "grad_norm": 0.3423268496990204, |
| "learning_rate": 0.00043122073383808964, |
| "loss": 3.2502, |
| "step": 48400 |
| }, |
| { |
| "epoch": 14.10072193758733, |
| "grad_norm": 0.3743983507156372, |
| "learning_rate": 0.00043104601048340124, |
| "loss": 3.2497, |
| "step": 48450 |
| }, |
| { |
| "epoch": 14.115277130880298, |
| "grad_norm": 0.40020278096199036, |
| "learning_rate": 0.00043087128712871283, |
| "loss": 3.2558, |
| "step": 48500 |
| }, |
| { |
| "epoch": 14.129832324173265, |
| "grad_norm": 0.33450552821159363, |
| "learning_rate": 0.00043069656377402437, |
| "loss": 3.2594, |
| "step": 48550 |
| }, |
| { |
| "epoch": 14.144387517466232, |
| "grad_norm": 0.3431876599788666, |
| "learning_rate": 0.000430521840419336, |
| "loss": 3.2497, |
| "step": 48600 |
| }, |
| { |
| "epoch": 14.158942710759199, |
| "grad_norm": 0.357308954000473, |
| "learning_rate": 0.0004303471170646476, |
| "loss": 3.2554, |
| "step": 48650 |
| }, |
| { |
| "epoch": 14.173497904052166, |
| "grad_norm": 0.355058491230011, |
| "learning_rate": 0.0004301723937099592, |
| "loss": 3.2586, |
| "step": 48700 |
| }, |
| { |
| "epoch": 14.188053097345133, |
| "grad_norm": 0.39194077253341675, |
| "learning_rate": 0.00042999767035527075, |
| "loss": 3.2703, |
| "step": 48750 |
| }, |
| { |
| "epoch": 14.2026082906381, |
| "grad_norm": 0.3674411177635193, |
| "learning_rate": 0.00042982294700058234, |
| "loss": 3.2657, |
| "step": 48800 |
| }, |
| { |
| "epoch": 14.217163483931067, |
| "grad_norm": 0.3595729470252991, |
| "learning_rate": 0.000429648223645894, |
| "loss": 3.271, |
| "step": 48850 |
| }, |
| { |
| "epoch": 14.231718677224034, |
| "grad_norm": 0.36931276321411133, |
| "learning_rate": 0.0004294735002912056, |
| "loss": 3.2785, |
| "step": 48900 |
| }, |
| { |
| "epoch": 14.246273870517001, |
| "grad_norm": 0.3702554702758789, |
| "learning_rate": 0.0004292987769365172, |
| "loss": 3.2676, |
| "step": 48950 |
| }, |
| { |
| "epoch": 14.260829063809968, |
| "grad_norm": 0.3658660352230072, |
| "learning_rate": 0.0004291240535818287, |
| "loss": 3.2707, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.260829063809968, |
| "eval_accuracy": 0.3710951699685767, |
| "eval_loss": 3.5558922290802, |
| "eval_runtime": 179.6472, |
| "eval_samples_per_second": 92.687, |
| "eval_steps_per_second": 5.795, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.275384257102935, |
| "grad_norm": 0.35185012221336365, |
| "learning_rate": 0.0004289493302271403, |
| "loss": 3.2838, |
| "step": 49050 |
| }, |
| { |
| "epoch": 14.2899394503959, |
| "grad_norm": 0.3410630524158478, |
| "learning_rate": 0.0004287746068724519, |
| "loss": 3.288, |
| "step": 49100 |
| }, |
| { |
| "epoch": 14.304494643688868, |
| "grad_norm": 0.3545389771461487, |
| "learning_rate": 0.00042859988351776356, |
| "loss": 3.2729, |
| "step": 49150 |
| }, |
| { |
| "epoch": 14.319049836981835, |
| "grad_norm": 0.3831106424331665, |
| "learning_rate": 0.0004284251601630751, |
| "loss": 3.2881, |
| "step": 49200 |
| }, |
| { |
| "epoch": 14.333605030274802, |
| "grad_norm": 0.36799466609954834, |
| "learning_rate": 0.0004282504368083867, |
| "loss": 3.2728, |
| "step": 49250 |
| }, |
| { |
| "epoch": 14.348160223567769, |
| "grad_norm": 0.35214364528656006, |
| "learning_rate": 0.0004280757134536983, |
| "loss": 3.2897, |
| "step": 49300 |
| }, |
| { |
| "epoch": 14.362715416860736, |
| "grad_norm": 0.366825133562088, |
| "learning_rate": 0.0004279009900990098, |
| "loss": 3.2857, |
| "step": 49350 |
| }, |
| { |
| "epoch": 14.377270610153703, |
| "grad_norm": 0.3538905382156372, |
| "learning_rate": 0.0004277262667443214, |
| "loss": 3.2864, |
| "step": 49400 |
| }, |
| { |
| "epoch": 14.39182580344667, |
| "grad_norm": 0.3406408727169037, |
| "learning_rate": 0.00042755154338963307, |
| "loss": 3.2917, |
| "step": 49450 |
| }, |
| { |
| "epoch": 14.406380996739637, |
| "grad_norm": 0.3695243299007416, |
| "learning_rate": 0.00042737682003494466, |
| "loss": 3.2993, |
| "step": 49500 |
| }, |
| { |
| "epoch": 14.420936190032604, |
| "grad_norm": 0.38720056414604187, |
| "learning_rate": 0.0004272020966802562, |
| "loss": 3.3039, |
| "step": 49550 |
| }, |
| { |
| "epoch": 14.435491383325571, |
| "grad_norm": 0.35300391912460327, |
| "learning_rate": 0.0004270273733255678, |
| "loss": 3.2896, |
| "step": 49600 |
| }, |
| { |
| "epoch": 14.450046576618538, |
| "grad_norm": 0.37260785698890686, |
| "learning_rate": 0.0004268526499708794, |
| "loss": 3.2924, |
| "step": 49650 |
| }, |
| { |
| "epoch": 14.464601769911505, |
| "grad_norm": 0.3798984885215759, |
| "learning_rate": 0.00042667792661619104, |
| "loss": 3.2954, |
| "step": 49700 |
| }, |
| { |
| "epoch": 14.47915696320447, |
| "grad_norm": 0.37590479850769043, |
| "learning_rate": 0.0004265032032615026, |
| "loss": 3.2964, |
| "step": 49750 |
| }, |
| { |
| "epoch": 14.493712156497438, |
| "grad_norm": 0.3550393879413605, |
| "learning_rate": 0.0004263284799068142, |
| "loss": 3.2941, |
| "step": 49800 |
| }, |
| { |
| "epoch": 14.508267349790405, |
| "grad_norm": 0.3456578552722931, |
| "learning_rate": 0.00042615375655212577, |
| "loss": 3.3006, |
| "step": 49850 |
| }, |
| { |
| "epoch": 14.522822543083372, |
| "grad_norm": 0.3781253397464752, |
| "learning_rate": 0.00042597903319743736, |
| "loss": 3.287, |
| "step": 49900 |
| }, |
| { |
| "epoch": 14.537377736376339, |
| "grad_norm": 0.361044704914093, |
| "learning_rate": 0.0004258043098427489, |
| "loss": 3.293, |
| "step": 49950 |
| }, |
| { |
| "epoch": 14.551932929669306, |
| "grad_norm": 0.37364527583122253, |
| "learning_rate": 0.00042562958648806055, |
| "loss": 3.2986, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.551932929669306, |
| "eval_accuracy": 0.3713522021855143, |
| "eval_loss": 3.547536849975586, |
| "eval_runtime": 179.586, |
| "eval_samples_per_second": 92.719, |
| "eval_steps_per_second": 5.797, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.566488122962273, |
| "grad_norm": 0.38099926710128784, |
| "learning_rate": 0.00042545486313337214, |
| "loss": 3.2904, |
| "step": 50050 |
| }, |
| { |
| "epoch": 14.58104331625524, |
| "grad_norm": 0.37132102251052856, |
| "learning_rate": 0.00042528013977868374, |
| "loss": 3.2997, |
| "step": 50100 |
| }, |
| { |
| "epoch": 14.595598509548207, |
| "grad_norm": 0.35718464851379395, |
| "learning_rate": 0.0004251054164239953, |
| "loss": 3.2906, |
| "step": 50150 |
| }, |
| { |
| "epoch": 14.610153702841174, |
| "grad_norm": 0.4218481481075287, |
| "learning_rate": 0.0004249306930693069, |
| "loss": 3.3184, |
| "step": 50200 |
| }, |
| { |
| "epoch": 14.624708896134141, |
| "grad_norm": 0.37840861082077026, |
| "learning_rate": 0.0004247559697146185, |
| "loss": 3.3032, |
| "step": 50250 |
| }, |
| { |
| "epoch": 14.639264089427108, |
| "grad_norm": 0.35636797547340393, |
| "learning_rate": 0.0004245812463599301, |
| "loss": 3.3033, |
| "step": 50300 |
| }, |
| { |
| "epoch": 14.653819282720075, |
| "grad_norm": 0.35813337564468384, |
| "learning_rate": 0.00042440652300524166, |
| "loss": 3.2904, |
| "step": 50350 |
| }, |
| { |
| "epoch": 14.668374476013042, |
| "grad_norm": 0.36577391624450684, |
| "learning_rate": 0.00042423179965055325, |
| "loss": 3.305, |
| "step": 50400 |
| }, |
| { |
| "epoch": 14.682929669306008, |
| "grad_norm": 0.37621748447418213, |
| "learning_rate": 0.00042405707629586484, |
| "loss": 3.3105, |
| "step": 50450 |
| }, |
| { |
| "epoch": 14.697484862598975, |
| "grad_norm": 0.3534005284309387, |
| "learning_rate": 0.0004238823529411764, |
| "loss": 3.3143, |
| "step": 50500 |
| }, |
| { |
| "epoch": 14.712040055891942, |
| "grad_norm": 0.33455362915992737, |
| "learning_rate": 0.00042370762958648803, |
| "loss": 3.3092, |
| "step": 50550 |
| }, |
| { |
| "epoch": 14.726595249184909, |
| "grad_norm": 0.36176812648773193, |
| "learning_rate": 0.00042353290623179963, |
| "loss": 3.3064, |
| "step": 50600 |
| }, |
| { |
| "epoch": 14.741150442477876, |
| "grad_norm": 0.3369339108467102, |
| "learning_rate": 0.0004233581828771112, |
| "loss": 3.3078, |
| "step": 50650 |
| }, |
| { |
| "epoch": 14.755705635770843, |
| "grad_norm": 0.37076041102409363, |
| "learning_rate": 0.00042318345952242276, |
| "loss": 3.3203, |
| "step": 50700 |
| }, |
| { |
| "epoch": 14.77026082906381, |
| "grad_norm": 0.3623945415019989, |
| "learning_rate": 0.00042300873616773436, |
| "loss": 3.3136, |
| "step": 50750 |
| }, |
| { |
| "epoch": 14.784816022356777, |
| "grad_norm": 0.3696160316467285, |
| "learning_rate": 0.00042283401281304595, |
| "loss": 3.3155, |
| "step": 50800 |
| }, |
| { |
| "epoch": 14.799371215649744, |
| "grad_norm": 0.3646029829978943, |
| "learning_rate": 0.0004226592894583576, |
| "loss": 3.3136, |
| "step": 50850 |
| }, |
| { |
| "epoch": 14.813926408942711, |
| "grad_norm": 0.3717617094516754, |
| "learning_rate": 0.00042248456610366914, |
| "loss": 3.3075, |
| "step": 50900 |
| }, |
| { |
| "epoch": 14.828481602235678, |
| "grad_norm": 0.3565121591091156, |
| "learning_rate": 0.00042230984274898073, |
| "loss": 3.3208, |
| "step": 50950 |
| }, |
| { |
| "epoch": 14.843036795528645, |
| "grad_norm": 0.36482352018356323, |
| "learning_rate": 0.00042213511939429233, |
| "loss": 3.3352, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.843036795528645, |
| "eval_accuracy": 0.3720694713304479, |
| "eval_loss": 3.5395095348358154, |
| "eval_runtime": 179.5403, |
| "eval_samples_per_second": 92.742, |
| "eval_steps_per_second": 5.798, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.857591988821612, |
| "grad_norm": 0.34361740946769714, |
| "learning_rate": 0.0004219603960396039, |
| "loss": 3.3154, |
| "step": 51050 |
| }, |
| { |
| "epoch": 14.872147182114578, |
| "grad_norm": 0.3463952839374542, |
| "learning_rate": 0.0004217856726849155, |
| "loss": 3.3222, |
| "step": 51100 |
| }, |
| { |
| "epoch": 14.886702375407545, |
| "grad_norm": 0.35919591784477234, |
| "learning_rate": 0.0004216109493302271, |
| "loss": 3.3276, |
| "step": 51150 |
| }, |
| { |
| "epoch": 14.901257568700512, |
| "grad_norm": 0.3705897331237793, |
| "learning_rate": 0.0004214362259755387, |
| "loss": 3.317, |
| "step": 51200 |
| }, |
| { |
| "epoch": 14.915812761993479, |
| "grad_norm": 0.3571067750453949, |
| "learning_rate": 0.0004212615026208503, |
| "loss": 3.324, |
| "step": 51250 |
| }, |
| { |
| "epoch": 14.930367955286446, |
| "grad_norm": 0.34250903129577637, |
| "learning_rate": 0.00042108677926616184, |
| "loss": 3.3237, |
| "step": 51300 |
| }, |
| { |
| "epoch": 14.944923148579413, |
| "grad_norm": 0.37873947620391846, |
| "learning_rate": 0.00042091205591147343, |
| "loss": 3.3237, |
| "step": 51350 |
| }, |
| { |
| "epoch": 14.95947834187238, |
| "grad_norm": 0.409123957157135, |
| "learning_rate": 0.0004207373325567851, |
| "loss": 3.3299, |
| "step": 51400 |
| }, |
| { |
| "epoch": 14.974033535165347, |
| "grad_norm": 0.3576182425022125, |
| "learning_rate": 0.0004205626092020967, |
| "loss": 3.3202, |
| "step": 51450 |
| }, |
| { |
| "epoch": 14.988588728458314, |
| "grad_norm": 0.36448413133621216, |
| "learning_rate": 0.0004203878858474082, |
| "loss": 3.3059, |
| "step": 51500 |
| }, |
| { |
| "epoch": 15.002911038658594, |
| "grad_norm": 0.3610822856426239, |
| "learning_rate": 0.0004202131624927198, |
| "loss": 3.3108, |
| "step": 51550 |
| }, |
| { |
| "epoch": 15.01746623195156, |
| "grad_norm": 0.38162854313850403, |
| "learning_rate": 0.0004200384391380314, |
| "loss": 3.2257, |
| "step": 51600 |
| }, |
| { |
| "epoch": 15.032021425244528, |
| "grad_norm": 0.40614140033721924, |
| "learning_rate": 0.00041986371578334305, |
| "loss": 3.2096, |
| "step": 51650 |
| }, |
| { |
| "epoch": 15.046576618537495, |
| "grad_norm": 0.3821380138397217, |
| "learning_rate": 0.0004196889924286546, |
| "loss": 3.2207, |
| "step": 51700 |
| }, |
| { |
| "epoch": 15.06113181183046, |
| "grad_norm": 0.3656452000141144, |
| "learning_rate": 0.0004195142690739662, |
| "loss": 3.2344, |
| "step": 51750 |
| }, |
| { |
| "epoch": 15.075687005123427, |
| "grad_norm": 0.37217843532562256, |
| "learning_rate": 0.0004193395457192778, |
| "loss": 3.2278, |
| "step": 51800 |
| }, |
| { |
| "epoch": 15.090242198416394, |
| "grad_norm": 0.3583957254886627, |
| "learning_rate": 0.0004191648223645893, |
| "loss": 3.2347, |
| "step": 51850 |
| }, |
| { |
| "epoch": 15.104797391709361, |
| "grad_norm": 0.3432309329509735, |
| "learning_rate": 0.0004189900990099009, |
| "loss": 3.236, |
| "step": 51900 |
| }, |
| { |
| "epoch": 15.119352585002328, |
| "grad_norm": 0.409397691488266, |
| "learning_rate": 0.00041881537565521256, |
| "loss": 3.2372, |
| "step": 51950 |
| }, |
| { |
| "epoch": 15.133907778295296, |
| "grad_norm": 0.3543730676174164, |
| "learning_rate": 0.00041864065230052416, |
| "loss": 3.2328, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.133907778295296, |
| "eval_accuracy": 0.3717191224330127, |
| "eval_loss": 3.5530853271484375, |
| "eval_runtime": 179.6972, |
| "eval_samples_per_second": 92.661, |
| "eval_steps_per_second": 5.793, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.148462971588263, |
| "grad_norm": 0.3441791832447052, |
| "learning_rate": 0.00041846592894583575, |
| "loss": 3.2443, |
| "step": 52050 |
| }, |
| { |
| "epoch": 15.16301816488123, |
| "grad_norm": 0.3853805661201477, |
| "learning_rate": 0.0004182912055911473, |
| "loss": 3.244, |
| "step": 52100 |
| }, |
| { |
| "epoch": 15.177573358174197, |
| "grad_norm": 0.3840022683143616, |
| "learning_rate": 0.0004181164822364589, |
| "loss": 3.2354, |
| "step": 52150 |
| }, |
| { |
| "epoch": 15.192128551467164, |
| "grad_norm": 0.3698272705078125, |
| "learning_rate": 0.0004179417588817705, |
| "loss": 3.2504, |
| "step": 52200 |
| }, |
| { |
| "epoch": 15.20668374476013, |
| "grad_norm": 0.36755281686782837, |
| "learning_rate": 0.00041776703552708213, |
| "loss": 3.2553, |
| "step": 52250 |
| }, |
| { |
| "epoch": 15.221238938053098, |
| "grad_norm": 0.3694024085998535, |
| "learning_rate": 0.00041759231217239367, |
| "loss": 3.2514, |
| "step": 52300 |
| }, |
| { |
| "epoch": 15.235794131346065, |
| "grad_norm": 0.3874948024749756, |
| "learning_rate": 0.00041741758881770527, |
| "loss": 3.2526, |
| "step": 52350 |
| }, |
| { |
| "epoch": 15.250349324639032, |
| "grad_norm": 0.36800023913383484, |
| "learning_rate": 0.00041724286546301686, |
| "loss": 3.2682, |
| "step": 52400 |
| }, |
| { |
| "epoch": 15.264904517931997, |
| "grad_norm": 0.3868958652019501, |
| "learning_rate": 0.0004170681421083284, |
| "loss": 3.2656, |
| "step": 52450 |
| }, |
| { |
| "epoch": 15.279459711224964, |
| "grad_norm": 0.3841627836227417, |
| "learning_rate": 0.00041689341875364005, |
| "loss": 3.2586, |
| "step": 52500 |
| }, |
| { |
| "epoch": 15.294014904517931, |
| "grad_norm": 0.36817166209220886, |
| "learning_rate": 0.00041671869539895164, |
| "loss": 3.2491, |
| "step": 52550 |
| }, |
| { |
| "epoch": 15.308570097810899, |
| "grad_norm": 0.3713952898979187, |
| "learning_rate": 0.00041654397204426324, |
| "loss": 3.2714, |
| "step": 52600 |
| }, |
| { |
| "epoch": 15.323125291103866, |
| "grad_norm": 0.36914798617362976, |
| "learning_rate": 0.0004163692486895748, |
| "loss": 3.2747, |
| "step": 52650 |
| }, |
| { |
| "epoch": 15.337680484396833, |
| "grad_norm": 0.39278385043144226, |
| "learning_rate": 0.00041619452533488637, |
| "loss": 3.2763, |
| "step": 52700 |
| }, |
| { |
| "epoch": 15.3522356776898, |
| "grad_norm": 0.3813706636428833, |
| "learning_rate": 0.00041601980198019797, |
| "loss": 3.2517, |
| "step": 52750 |
| }, |
| { |
| "epoch": 15.366790870982767, |
| "grad_norm": 0.3740118145942688, |
| "learning_rate": 0.0004158450786255096, |
| "loss": 3.2634, |
| "step": 52800 |
| }, |
| { |
| "epoch": 15.381346064275734, |
| "grad_norm": 0.35879942774772644, |
| "learning_rate": 0.00041567035527082115, |
| "loss": 3.2687, |
| "step": 52850 |
| }, |
| { |
| "epoch": 15.3959012575687, |
| "grad_norm": 0.3741402328014374, |
| "learning_rate": 0.00041549563191613275, |
| "loss": 3.2697, |
| "step": 52900 |
| }, |
| { |
| "epoch": 15.410456450861668, |
| "grad_norm": 0.3618059456348419, |
| "learning_rate": 0.00041532090856144434, |
| "loss": 3.2675, |
| "step": 52950 |
| }, |
| { |
| "epoch": 15.425011644154635, |
| "grad_norm": 0.3848046660423279, |
| "learning_rate": 0.00041514618520675594, |
| "loss": 3.278, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.425011644154635, |
| "eval_accuracy": 0.37196287406443856, |
| "eval_loss": 3.5493414402008057, |
| "eval_runtime": 179.6539, |
| "eval_samples_per_second": 92.684, |
| "eval_steps_per_second": 5.794, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.439566837447602, |
| "grad_norm": 0.37163642048835754, |
| "learning_rate": 0.00041497146185206753, |
| "loss": 3.2886, |
| "step": 53050 |
| }, |
| { |
| "epoch": 15.454122030740567, |
| "grad_norm": 0.3669773042201996, |
| "learning_rate": 0.0004147967384973791, |
| "loss": 3.2606, |
| "step": 53100 |
| }, |
| { |
| "epoch": 15.468677224033534, |
| "grad_norm": 0.3907168209552765, |
| "learning_rate": 0.0004146220151426907, |
| "loss": 3.2783, |
| "step": 53150 |
| }, |
| { |
| "epoch": 15.483232417326501, |
| "grad_norm": 0.3907336890697479, |
| "learning_rate": 0.0004144472917880023, |
| "loss": 3.2893, |
| "step": 53200 |
| }, |
| { |
| "epoch": 15.497787610619469, |
| "grad_norm": 0.3639543652534485, |
| "learning_rate": 0.00041427256843331385, |
| "loss": 3.272, |
| "step": 53250 |
| }, |
| { |
| "epoch": 15.512342803912436, |
| "grad_norm": 0.41702601313591003, |
| "learning_rate": 0.00041409784507862545, |
| "loss": 3.2886, |
| "step": 53300 |
| }, |
| { |
| "epoch": 15.526897997205403, |
| "grad_norm": 0.4025951027870178, |
| "learning_rate": 0.0004139231217239371, |
| "loss": 3.2761, |
| "step": 53350 |
| }, |
| { |
| "epoch": 15.54145319049837, |
| "grad_norm": 0.3820459842681885, |
| "learning_rate": 0.0004137483983692487, |
| "loss": 3.2812, |
| "step": 53400 |
| }, |
| { |
| "epoch": 15.556008383791337, |
| "grad_norm": 0.36580777168273926, |
| "learning_rate": 0.00041357367501456023, |
| "loss": 3.2967, |
| "step": 53450 |
| }, |
| { |
| "epoch": 15.570563577084304, |
| "grad_norm": 0.36303627490997314, |
| "learning_rate": 0.0004133989516598718, |
| "loss": 3.2896, |
| "step": 53500 |
| }, |
| { |
| "epoch": 15.585118770377271, |
| "grad_norm": 0.35470396280288696, |
| "learning_rate": 0.0004132242283051834, |
| "loss": 3.2799, |
| "step": 53550 |
| }, |
| { |
| "epoch": 15.599673963670238, |
| "grad_norm": 0.3685864508152008, |
| "learning_rate": 0.00041304950495049496, |
| "loss": 3.29, |
| "step": 53600 |
| }, |
| { |
| "epoch": 15.614229156963205, |
| "grad_norm": 0.3831583857536316, |
| "learning_rate": 0.0004128747815958066, |
| "loss": 3.2894, |
| "step": 53650 |
| }, |
| { |
| "epoch": 15.628784350256172, |
| "grad_norm": 0.3941470682621002, |
| "learning_rate": 0.0004127000582411182, |
| "loss": 3.2939, |
| "step": 53700 |
| }, |
| { |
| "epoch": 15.64333954354914, |
| "grad_norm": 0.3580646514892578, |
| "learning_rate": 0.0004125253348864298, |
| "loss": 3.2979, |
| "step": 53750 |
| }, |
| { |
| "epoch": 15.657894736842106, |
| "grad_norm": 0.3721872866153717, |
| "learning_rate": 0.00041235061153174134, |
| "loss": 3.2968, |
| "step": 53800 |
| }, |
| { |
| "epoch": 15.672449930135071, |
| "grad_norm": 0.3598230481147766, |
| "learning_rate": 0.00041217588817705293, |
| "loss": 3.2864, |
| "step": 53850 |
| }, |
| { |
| "epoch": 15.687005123428039, |
| "grad_norm": 0.3656480312347412, |
| "learning_rate": 0.0004120011648223646, |
| "loss": 3.2915, |
| "step": 53900 |
| }, |
| { |
| "epoch": 15.701560316721006, |
| "grad_norm": 0.35835638642311096, |
| "learning_rate": 0.0004118264414676762, |
| "loss": 3.3011, |
| "step": 53950 |
| }, |
| { |
| "epoch": 15.716115510013973, |
| "grad_norm": 0.38179880380630493, |
| "learning_rate": 0.0004116517181129877, |
| "loss": 3.2919, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.716115510013973, |
| "eval_accuracy": 0.3724329832860893, |
| "eval_loss": 3.5413262844085693, |
| "eval_runtime": 179.6208, |
| "eval_samples_per_second": 92.701, |
| "eval_steps_per_second": 5.796, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.73067070330694, |
| "grad_norm": 0.3575129210948944, |
| "learning_rate": 0.0004114769947582993, |
| "loss": 3.3032, |
| "step": 54050 |
| }, |
| { |
| "epoch": 15.745225896599907, |
| "grad_norm": 0.4014807343482971, |
| "learning_rate": 0.0004113022714036109, |
| "loss": 3.3143, |
| "step": 54100 |
| }, |
| { |
| "epoch": 15.759781089892874, |
| "grad_norm": 0.39006999135017395, |
| "learning_rate": 0.0004111275480489225, |
| "loss": 3.2954, |
| "step": 54150 |
| }, |
| { |
| "epoch": 15.774336283185841, |
| "grad_norm": 0.35772159695625305, |
| "learning_rate": 0.0004109528246942341, |
| "loss": 3.3076, |
| "step": 54200 |
| }, |
| { |
| "epoch": 15.788891476478808, |
| "grad_norm": 0.39875340461730957, |
| "learning_rate": 0.0004107781013395457, |
| "loss": 3.2932, |
| "step": 54250 |
| }, |
| { |
| "epoch": 15.803446669771775, |
| "grad_norm": 0.3441701829433441, |
| "learning_rate": 0.0004106033779848573, |
| "loss": 3.3028, |
| "step": 54300 |
| }, |
| { |
| "epoch": 15.818001863064742, |
| "grad_norm": 0.35036149621009827, |
| "learning_rate": 0.0004104286546301689, |
| "loss": 3.3049, |
| "step": 54350 |
| }, |
| { |
| "epoch": 15.83255705635771, |
| "grad_norm": 0.3675341010093689, |
| "learning_rate": 0.0004102539312754804, |
| "loss": 3.3186, |
| "step": 54400 |
| }, |
| { |
| "epoch": 15.847112249650674, |
| "grad_norm": 0.36607494950294495, |
| "learning_rate": 0.00041007920792079206, |
| "loss": 3.3086, |
| "step": 54450 |
| }, |
| { |
| "epoch": 15.861667442943642, |
| "grad_norm": 0.36645710468292236, |
| "learning_rate": 0.00040990448456610366, |
| "loss": 3.3141, |
| "step": 54500 |
| }, |
| { |
| "epoch": 15.876222636236609, |
| "grad_norm": 0.36055368185043335, |
| "learning_rate": 0.00040972976121141525, |
| "loss": 3.3134, |
| "step": 54550 |
| }, |
| { |
| "epoch": 15.890777829529576, |
| "grad_norm": 0.36668238043785095, |
| "learning_rate": 0.0004095550378567268, |
| "loss": 3.3035, |
| "step": 54600 |
| }, |
| { |
| "epoch": 15.905333022822543, |
| "grad_norm": 0.3563118278980255, |
| "learning_rate": 0.0004093803145020384, |
| "loss": 3.3093, |
| "step": 54650 |
| }, |
| { |
| "epoch": 15.91988821611551, |
| "grad_norm": 0.3816479742527008, |
| "learning_rate": 0.00040920559114735, |
| "loss": 3.3128, |
| "step": 54700 |
| }, |
| { |
| "epoch": 15.934443409408477, |
| "grad_norm": 0.3647186756134033, |
| "learning_rate": 0.00040903086779266163, |
| "loss": 3.3119, |
| "step": 54750 |
| }, |
| { |
| "epoch": 15.948998602701444, |
| "grad_norm": 0.35991621017456055, |
| "learning_rate": 0.00040885614443797317, |
| "loss": 3.3154, |
| "step": 54800 |
| }, |
| { |
| "epoch": 15.963553795994411, |
| "grad_norm": 0.3633652329444885, |
| "learning_rate": 0.00040868142108328476, |
| "loss": 3.3102, |
| "step": 54850 |
| }, |
| { |
| "epoch": 15.978108989287378, |
| "grad_norm": 0.38223573565483093, |
| "learning_rate": 0.00040850669772859636, |
| "loss": 3.3094, |
| "step": 54900 |
| }, |
| { |
| "epoch": 15.992664182580345, |
| "grad_norm": 0.3739972710609436, |
| "learning_rate": 0.0004083319743739079, |
| "loss": 3.3059, |
| "step": 54950 |
| }, |
| { |
| "epoch": 16.006986492780623, |
| "grad_norm": 0.37632840871810913, |
| "learning_rate": 0.0004081572510192195, |
| "loss": 3.2682, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.006986492780623, |
| "eval_accuracy": 0.3719969569830083, |
| "eval_loss": 3.5475385189056396, |
| "eval_runtime": 179.7295, |
| "eval_samples_per_second": 92.645, |
| "eval_steps_per_second": 5.792, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.02154168607359, |
| "grad_norm": 0.3803386688232422, |
| "learning_rate": 0.00040798252766453114, |
| "loss": 3.2038, |
| "step": 55050 |
| }, |
| { |
| "epoch": 16.036096879366557, |
| "grad_norm": 0.35486406087875366, |
| "learning_rate": 0.00040780780430984273, |
| "loss": 3.207, |
| "step": 55100 |
| }, |
| { |
| "epoch": 16.050652072659524, |
| "grad_norm": 0.36758115887641907, |
| "learning_rate": 0.0004076330809551543, |
| "loss": 3.2083, |
| "step": 55150 |
| }, |
| { |
| "epoch": 16.06520726595249, |
| "grad_norm": 0.3653200566768646, |
| "learning_rate": 0.00040745835760046587, |
| "loss": 3.2015, |
| "step": 55200 |
| }, |
| { |
| "epoch": 16.079762459245458, |
| "grad_norm": 0.4169802963733673, |
| "learning_rate": 0.00040728363424577746, |
| "loss": 3.2054, |
| "step": 55250 |
| }, |
| { |
| "epoch": 16.094317652538425, |
| "grad_norm": 0.3609280586242676, |
| "learning_rate": 0.0004071089108910891, |
| "loss": 3.2186, |
| "step": 55300 |
| }, |
| { |
| "epoch": 16.108872845831392, |
| "grad_norm": 0.38168197870254517, |
| "learning_rate": 0.0004069341875364007, |
| "loss": 3.2337, |
| "step": 55350 |
| }, |
| { |
| "epoch": 16.12342803912436, |
| "grad_norm": 0.4037674367427826, |
| "learning_rate": 0.00040675946418171225, |
| "loss": 3.225, |
| "step": 55400 |
| }, |
| { |
| "epoch": 16.137983232417326, |
| "grad_norm": 0.3925740718841553, |
| "learning_rate": 0.00040658474082702384, |
| "loss": 3.2271, |
| "step": 55450 |
| }, |
| { |
| "epoch": 16.152538425710294, |
| "grad_norm": 0.37324100732803345, |
| "learning_rate": 0.00040641001747233543, |
| "loss": 3.2314, |
| "step": 55500 |
| }, |
| { |
| "epoch": 16.16709361900326, |
| "grad_norm": 0.38064682483673096, |
| "learning_rate": 0.000406235294117647, |
| "loss": 3.2391, |
| "step": 55550 |
| }, |
| { |
| "epoch": 16.181648812296228, |
| "grad_norm": 0.3883419334888458, |
| "learning_rate": 0.0004060605707629586, |
| "loss": 3.2359, |
| "step": 55600 |
| }, |
| { |
| "epoch": 16.196204005589195, |
| "grad_norm": 0.3888667821884155, |
| "learning_rate": 0.0004058858474082702, |
| "loss": 3.2457, |
| "step": 55650 |
| }, |
| { |
| "epoch": 16.21075919888216, |
| "grad_norm": 0.3523971736431122, |
| "learning_rate": 0.0004057111240535818, |
| "loss": 3.2474, |
| "step": 55700 |
| }, |
| { |
| "epoch": 16.22531439217513, |
| "grad_norm": 0.3806838393211365, |
| "learning_rate": 0.00040553640069889335, |
| "loss": 3.241, |
| "step": 55750 |
| }, |
| { |
| "epoch": 16.239869585468096, |
| "grad_norm": 0.37035635113716125, |
| "learning_rate": 0.00040536167734420495, |
| "loss": 3.2556, |
| "step": 55800 |
| }, |
| { |
| "epoch": 16.254424778761063, |
| "grad_norm": 0.36657461524009705, |
| "learning_rate": 0.0004051869539895166, |
| "loss": 3.2501, |
| "step": 55850 |
| }, |
| { |
| "epoch": 16.26897997205403, |
| "grad_norm": 0.40011465549468994, |
| "learning_rate": 0.0004050122306348282, |
| "loss": 3.2552, |
| "step": 55900 |
| }, |
| { |
| "epoch": 16.283535165346997, |
| "grad_norm": 0.3992847800254822, |
| "learning_rate": 0.00040483750728013973, |
| "loss": 3.2518, |
| "step": 55950 |
| }, |
| { |
| "epoch": 16.298090358639964, |
| "grad_norm": 0.3844752609729767, |
| "learning_rate": 0.0004046627839254513, |
| "loss": 3.2366, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.298090358639964, |
| "eval_accuracy": 0.3725179555279027, |
| "eval_loss": 3.549312114715576, |
| "eval_runtime": 179.6265, |
| "eval_samples_per_second": 92.698, |
| "eval_steps_per_second": 5.795, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.31264555193293, |
| "grad_norm": 0.39455828070640564, |
| "learning_rate": 0.0004044880605707629, |
| "loss": 3.256, |
| "step": 56050 |
| }, |
| { |
| "epoch": 16.3272007452259, |
| "grad_norm": 0.3753792941570282, |
| "learning_rate": 0.00040431333721607446, |
| "loss": 3.2485, |
| "step": 56100 |
| }, |
| { |
| "epoch": 16.341755938518865, |
| "grad_norm": 0.3579533100128174, |
| "learning_rate": 0.0004041386138613861, |
| "loss": 3.2474, |
| "step": 56150 |
| }, |
| { |
| "epoch": 16.35631113181183, |
| "grad_norm": 0.3628307580947876, |
| "learning_rate": 0.0004039638905066977, |
| "loss": 3.2552, |
| "step": 56200 |
| }, |
| { |
| "epoch": 16.370866325104796, |
| "grad_norm": 0.3599070608615875, |
| "learning_rate": 0.0004037891671520093, |
| "loss": 3.2571, |
| "step": 56250 |
| }, |
| { |
| "epoch": 16.385421518397763, |
| "grad_norm": 0.37618762254714966, |
| "learning_rate": 0.0004036144437973209, |
| "loss": 3.2502, |
| "step": 56300 |
| }, |
| { |
| "epoch": 16.39997671169073, |
| "grad_norm": 0.37751829624176025, |
| "learning_rate": 0.00040343972044263243, |
| "loss": 3.265, |
| "step": 56350 |
| }, |
| { |
| "epoch": 16.414531904983697, |
| "grad_norm": 0.37696489691734314, |
| "learning_rate": 0.0004032649970879441, |
| "loss": 3.2667, |
| "step": 56400 |
| }, |
| { |
| "epoch": 16.429087098276664, |
| "grad_norm": 0.36506879329681396, |
| "learning_rate": 0.00040309027373325567, |
| "loss": 3.2734, |
| "step": 56450 |
| }, |
| { |
| "epoch": 16.44364229156963, |
| "grad_norm": 0.37977832555770874, |
| "learning_rate": 0.00040291555037856727, |
| "loss": 3.2638, |
| "step": 56500 |
| }, |
| { |
| "epoch": 16.4581974848626, |
| "grad_norm": 0.38764917850494385, |
| "learning_rate": 0.0004027408270238788, |
| "loss": 3.2674, |
| "step": 56550 |
| }, |
| { |
| "epoch": 16.472752678155565, |
| "grad_norm": 0.3823941648006439, |
| "learning_rate": 0.0004025661036691904, |
| "loss": 3.2692, |
| "step": 56600 |
| }, |
| { |
| "epoch": 16.487307871448532, |
| "grad_norm": 0.3607601225376129, |
| "learning_rate": 0.000402391380314502, |
| "loss": 3.2756, |
| "step": 56650 |
| }, |
| { |
| "epoch": 16.5018630647415, |
| "grad_norm": 0.40973618626594543, |
| "learning_rate": 0.00040221665695981364, |
| "loss": 3.2684, |
| "step": 56700 |
| }, |
| { |
| "epoch": 16.516418258034467, |
| "grad_norm": 0.3708013594150543, |
| "learning_rate": 0.0004020419336051252, |
| "loss": 3.2718, |
| "step": 56750 |
| }, |
| { |
| "epoch": 16.530973451327434, |
| "grad_norm": 0.371579110622406, |
| "learning_rate": 0.0004018672102504368, |
| "loss": 3.2746, |
| "step": 56800 |
| }, |
| { |
| "epoch": 16.5455286446204, |
| "grad_norm": 0.373159259557724, |
| "learning_rate": 0.00040169248689574837, |
| "loss": 3.2793, |
| "step": 56850 |
| }, |
| { |
| "epoch": 16.560083837913368, |
| "grad_norm": 0.39505279064178467, |
| "learning_rate": 0.0004015177635410599, |
| "loss": 3.2851, |
| "step": 56900 |
| }, |
| { |
| "epoch": 16.574639031206335, |
| "grad_norm": 0.4192643165588379, |
| "learning_rate": 0.0004013430401863715, |
| "loss": 3.2629, |
| "step": 56950 |
| }, |
| { |
| "epoch": 16.589194224499302, |
| "grad_norm": 0.3601832687854767, |
| "learning_rate": 0.00040116831683168315, |
| "loss": 3.2881, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.589194224499302, |
| "eval_accuracy": 0.37245072990920663, |
| "eval_loss": 3.5421199798583984, |
| "eval_runtime": 179.6507, |
| "eval_samples_per_second": 92.685, |
| "eval_steps_per_second": 5.795, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.60374941779227, |
| "grad_norm": 0.3881416320800781, |
| "learning_rate": 0.00040099359347699475, |
| "loss": 3.2822, |
| "step": 57050 |
| }, |
| { |
| "epoch": 16.618304611085236, |
| "grad_norm": 0.36859115958213806, |
| "learning_rate": 0.0004008188701223063, |
| "loss": 3.2766, |
| "step": 57100 |
| }, |
| { |
| "epoch": 16.632859804378203, |
| "grad_norm": 0.37605735659599304, |
| "learning_rate": 0.0004006441467676179, |
| "loss": 3.2849, |
| "step": 57150 |
| }, |
| { |
| "epoch": 16.64741499767117, |
| "grad_norm": 0.38148075342178345, |
| "learning_rate": 0.0004004694234129295, |
| "loss": 3.2891, |
| "step": 57200 |
| }, |
| { |
| "epoch": 16.661970190964137, |
| "grad_norm": 0.4013352692127228, |
| "learning_rate": 0.0004002947000582411, |
| "loss": 3.2706, |
| "step": 57250 |
| }, |
| { |
| "epoch": 16.676525384257104, |
| "grad_norm": 0.33865901827812195, |
| "learning_rate": 0.00040011997670355267, |
| "loss": 3.2819, |
| "step": 57300 |
| }, |
| { |
| "epoch": 16.69108057755007, |
| "grad_norm": 0.361101895570755, |
| "learning_rate": 0.00039994525334886426, |
| "loss": 3.2909, |
| "step": 57350 |
| }, |
| { |
| "epoch": 16.70563577084304, |
| "grad_norm": 0.3946160674095154, |
| "learning_rate": 0.00039977052999417585, |
| "loss": 3.2848, |
| "step": 57400 |
| }, |
| { |
| "epoch": 16.720190964136005, |
| "grad_norm": 0.3926425278186798, |
| "learning_rate": 0.00039959580663948745, |
| "loss": 3.2931, |
| "step": 57450 |
| }, |
| { |
| "epoch": 16.734746157428972, |
| "grad_norm": 0.35905447602272034, |
| "learning_rate": 0.000399421083284799, |
| "loss": 3.2666, |
| "step": 57500 |
| }, |
| { |
| "epoch": 16.749301350721936, |
| "grad_norm": 0.3647949993610382, |
| "learning_rate": 0.00039924635993011064, |
| "loss": 3.2931, |
| "step": 57550 |
| }, |
| { |
| "epoch": 16.763856544014903, |
| "grad_norm": 0.4049411416053772, |
| "learning_rate": 0.00039907163657542223, |
| "loss": 3.3049, |
| "step": 57600 |
| }, |
| { |
| "epoch": 16.77841173730787, |
| "grad_norm": 0.38806965947151184, |
| "learning_rate": 0.0003988969132207338, |
| "loss": 3.2901, |
| "step": 57650 |
| }, |
| { |
| "epoch": 16.792966930600837, |
| "grad_norm": 0.39097949862480164, |
| "learning_rate": 0.00039872218986604537, |
| "loss": 3.2737, |
| "step": 57700 |
| }, |
| { |
| "epoch": 16.807522123893804, |
| "grad_norm": 0.36895087361335754, |
| "learning_rate": 0.00039854746651135696, |
| "loss": 3.2756, |
| "step": 57750 |
| }, |
| { |
| "epoch": 16.82207731718677, |
| "grad_norm": 0.35981959104537964, |
| "learning_rate": 0.0003983727431566686, |
| "loss": 3.2876, |
| "step": 57800 |
| }, |
| { |
| "epoch": 16.83663251047974, |
| "grad_norm": 0.36862266063690186, |
| "learning_rate": 0.0003981980198019802, |
| "loss": 3.2796, |
| "step": 57850 |
| }, |
| { |
| "epoch": 16.851187703772705, |
| "grad_norm": 0.3888986110687256, |
| "learning_rate": 0.00039802329644729174, |
| "loss": 3.2916, |
| "step": 57900 |
| }, |
| { |
| "epoch": 16.865742897065672, |
| "grad_norm": 0.3622966706752777, |
| "learning_rate": 0.00039784857309260334, |
| "loss": 3.2801, |
| "step": 57950 |
| }, |
| { |
| "epoch": 16.88029809035864, |
| "grad_norm": 0.3616342544555664, |
| "learning_rate": 0.00039767384973791493, |
| "loss": 3.2976, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.88029809035864, |
| "eval_accuracy": 0.3728305781603004, |
| "eval_loss": 3.5370852947235107, |
| "eval_runtime": 179.6611, |
| "eval_samples_per_second": 92.68, |
| "eval_steps_per_second": 5.794, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.894853283651607, |
| "grad_norm": 0.37034207582473755, |
| "learning_rate": 0.00039749912638322647, |
| "loss": 3.2898, |
| "step": 58050 |
| }, |
| { |
| "epoch": 16.909408476944574, |
| "grad_norm": 0.36878299713134766, |
| "learning_rate": 0.0003973244030285381, |
| "loss": 3.2922, |
| "step": 58100 |
| }, |
| { |
| "epoch": 16.92396367023754, |
| "grad_norm": 0.4003487527370453, |
| "learning_rate": 0.0003971496796738497, |
| "loss": 3.2961, |
| "step": 58150 |
| }, |
| { |
| "epoch": 16.938518863530508, |
| "grad_norm": 0.37082305550575256, |
| "learning_rate": 0.0003969749563191613, |
| "loss": 3.2972, |
| "step": 58200 |
| }, |
| { |
| "epoch": 16.953074056823475, |
| "grad_norm": 0.3500148355960846, |
| "learning_rate": 0.00039680023296447285, |
| "loss": 3.2897, |
| "step": 58250 |
| }, |
| { |
| "epoch": 16.967629250116442, |
| "grad_norm": 0.3859942853450775, |
| "learning_rate": 0.00039662550960978444, |
| "loss": 3.2784, |
| "step": 58300 |
| }, |
| { |
| "epoch": 16.98218444340941, |
| "grad_norm": 0.3568435609340668, |
| "learning_rate": 0.00039645078625509604, |
| "loss": 3.2951, |
| "step": 58350 |
| }, |
| { |
| "epoch": 16.996739636702376, |
| "grad_norm": 0.3656318485736847, |
| "learning_rate": 0.0003962760629004077, |
| "loss": 3.297, |
| "step": 58400 |
| }, |
| { |
| "epoch": 17.011061946902654, |
| "grad_norm": 0.3973788917064667, |
| "learning_rate": 0.0003961013395457193, |
| "loss": 3.1995, |
| "step": 58450 |
| }, |
| { |
| "epoch": 17.02561714019562, |
| "grad_norm": 0.3912462592124939, |
| "learning_rate": 0.0003959266161910308, |
| "loss": 3.1886, |
| "step": 58500 |
| }, |
| { |
| "epoch": 17.040172333488588, |
| "grad_norm": 0.41491520404815674, |
| "learning_rate": 0.0003957518928363424, |
| "loss": 3.1952, |
| "step": 58550 |
| }, |
| { |
| "epoch": 17.054727526781555, |
| "grad_norm": 0.3734089136123657, |
| "learning_rate": 0.000395577169481654, |
| "loss": 3.2013, |
| "step": 58600 |
| }, |
| { |
| "epoch": 17.069282720074522, |
| "grad_norm": 0.3561474084854126, |
| "learning_rate": 0.00039540244612696566, |
| "loss": 3.2074, |
| "step": 58650 |
| }, |
| { |
| "epoch": 17.08383791336749, |
| "grad_norm": 0.35811877250671387, |
| "learning_rate": 0.0003952277227722772, |
| "loss": 3.2098, |
| "step": 58700 |
| }, |
| { |
| "epoch": 17.098393106660456, |
| "grad_norm": 0.4044589698314667, |
| "learning_rate": 0.0003950529994175888, |
| "loss": 3.1951, |
| "step": 58750 |
| }, |
| { |
| "epoch": 17.112948299953423, |
| "grad_norm": 0.40245088934898376, |
| "learning_rate": 0.0003948782760629004, |
| "loss": 3.2112, |
| "step": 58800 |
| }, |
| { |
| "epoch": 17.12750349324639, |
| "grad_norm": 0.40546953678131104, |
| "learning_rate": 0.0003947035527082119, |
| "loss": 3.2241, |
| "step": 58850 |
| }, |
| { |
| "epoch": 17.142058686539357, |
| "grad_norm": 0.3817267417907715, |
| "learning_rate": 0.0003945288293535235, |
| "loss": 3.2153, |
| "step": 58900 |
| }, |
| { |
| "epoch": 17.156613879832324, |
| "grad_norm": 0.3912336230278015, |
| "learning_rate": 0.00039435410599883517, |
| "loss": 3.2246, |
| "step": 58950 |
| }, |
| { |
| "epoch": 17.17116907312529, |
| "grad_norm": 0.36735278367996216, |
| "learning_rate": 0.00039417938264414676, |
| "loss": 3.2211, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.17116907312529, |
| "eval_accuracy": 0.3723176889994795, |
| "eval_loss": 3.5510292053222656, |
| "eval_runtime": 179.6357, |
| "eval_samples_per_second": 92.693, |
| "eval_steps_per_second": 5.795, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.18572426641826, |
| "grad_norm": 0.37078139185905457, |
| "learning_rate": 0.0003940046592894583, |
| "loss": 3.2257, |
| "step": 59050 |
| }, |
| { |
| "epoch": 17.200279459711226, |
| "grad_norm": 0.3937833607196808, |
| "learning_rate": 0.0003938299359347699, |
| "loss": 3.2258, |
| "step": 59100 |
| }, |
| { |
| "epoch": 17.214834653004193, |
| "grad_norm": 0.3787386417388916, |
| "learning_rate": 0.0003936552125800815, |
| "loss": 3.2316, |
| "step": 59150 |
| }, |
| { |
| "epoch": 17.22938984629716, |
| "grad_norm": 0.4125954806804657, |
| "learning_rate": 0.00039348048922539314, |
| "loss": 3.2169, |
| "step": 59200 |
| }, |
| { |
| "epoch": 17.243945039590127, |
| "grad_norm": 0.3592980206012726, |
| "learning_rate": 0.0003933057658707047, |
| "loss": 3.2361, |
| "step": 59250 |
| }, |
| { |
| "epoch": 17.258500232883094, |
| "grad_norm": 0.36557480692863464, |
| "learning_rate": 0.0003931310425160163, |
| "loss": 3.2276, |
| "step": 59300 |
| }, |
| { |
| "epoch": 17.27305542617606, |
| "grad_norm": 0.3812400698661804, |
| "learning_rate": 0.00039295631916132787, |
| "loss": 3.2349, |
| "step": 59350 |
| }, |
| { |
| "epoch": 17.287610619469028, |
| "grad_norm": 0.37743037939071655, |
| "learning_rate": 0.00039278159580663946, |
| "loss": 3.2271, |
| "step": 59400 |
| }, |
| { |
| "epoch": 17.302165812761995, |
| "grad_norm": 0.3797909915447235, |
| "learning_rate": 0.000392606872451951, |
| "loss": 3.2436, |
| "step": 59450 |
| }, |
| { |
| "epoch": 17.316721006054962, |
| "grad_norm": 0.3824479281902313, |
| "learning_rate": 0.00039243214909726265, |
| "loss": 3.2359, |
| "step": 59500 |
| }, |
| { |
| "epoch": 17.331276199347926, |
| "grad_norm": 0.36629578471183777, |
| "learning_rate": 0.00039225742574257425, |
| "loss": 3.2425, |
| "step": 59550 |
| }, |
| { |
| "epoch": 17.345831392640893, |
| "grad_norm": 0.3833477199077606, |
| "learning_rate": 0.00039208270238788584, |
| "loss": 3.246, |
| "step": 59600 |
| }, |
| { |
| "epoch": 17.36038658593386, |
| "grad_norm": 0.393760621547699, |
| "learning_rate": 0.0003919079790331974, |
| "loss": 3.2417, |
| "step": 59650 |
| }, |
| { |
| "epoch": 17.374941779226827, |
| "grad_norm": 0.40368106961250305, |
| "learning_rate": 0.000391733255678509, |
| "loss": 3.2376, |
| "step": 59700 |
| }, |
| { |
| "epoch": 17.389496972519794, |
| "grad_norm": 0.3892704248428345, |
| "learning_rate": 0.00039155853232382057, |
| "loss": 3.2509, |
| "step": 59750 |
| }, |
| { |
| "epoch": 17.40405216581276, |
| "grad_norm": 0.3821728229522705, |
| "learning_rate": 0.0003913838089691322, |
| "loss": 3.2461, |
| "step": 59800 |
| }, |
| { |
| "epoch": 17.418607359105728, |
| "grad_norm": 0.3901355564594269, |
| "learning_rate": 0.00039120908561444376, |
| "loss": 3.2564, |
| "step": 59850 |
| }, |
| { |
| "epoch": 17.433162552398695, |
| "grad_norm": 0.3782642185688019, |
| "learning_rate": 0.00039103436225975535, |
| "loss": 3.2574, |
| "step": 59900 |
| }, |
| { |
| "epoch": 17.447717745691662, |
| "grad_norm": 0.37162768840789795, |
| "learning_rate": 0.00039085963890506695, |
| "loss": 3.2565, |
| "step": 59950 |
| }, |
| { |
| "epoch": 17.46227293898463, |
| "grad_norm": 0.38288211822509766, |
| "learning_rate": 0.0003906849155503785, |
| "loss": 3.265, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.46227293898463, |
| "eval_accuracy": 0.37270200328817893, |
| "eval_loss": 3.546358823776245, |
| "eval_runtime": 179.5231, |
| "eval_samples_per_second": 92.751, |
| "eval_steps_per_second": 5.799, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.476828132277596, |
| "grad_norm": 0.38734108209609985, |
| "learning_rate": 0.00039051019219569014, |
| "loss": 3.2502, |
| "step": 60050 |
| }, |
| { |
| "epoch": 17.491383325570563, |
| "grad_norm": 0.38259828090667725, |
| "learning_rate": 0.00039033546884100173, |
| "loss": 3.2445, |
| "step": 60100 |
| }, |
| { |
| "epoch": 17.50593851886353, |
| "grad_norm": 0.38961061835289, |
| "learning_rate": 0.0003901607454863133, |
| "loss": 3.2673, |
| "step": 60150 |
| }, |
| { |
| "epoch": 17.520493712156497, |
| "grad_norm": 0.34726908802986145, |
| "learning_rate": 0.00038998602213162486, |
| "loss": 3.2568, |
| "step": 60200 |
| }, |
| { |
| "epoch": 17.535048905449464, |
| "grad_norm": 0.42472442984580994, |
| "learning_rate": 0.00038981129877693646, |
| "loss": 3.2546, |
| "step": 60250 |
| }, |
| { |
| "epoch": 17.54960409874243, |
| "grad_norm": 0.38472869992256165, |
| "learning_rate": 0.00038963657542224805, |
| "loss": 3.2622, |
| "step": 60300 |
| }, |
| { |
| "epoch": 17.5641592920354, |
| "grad_norm": 0.4024789333343506, |
| "learning_rate": 0.0003894618520675597, |
| "loss": 3.2635, |
| "step": 60350 |
| }, |
| { |
| "epoch": 17.578714485328366, |
| "grad_norm": 0.3788979649543762, |
| "learning_rate": 0.00038928712871287124, |
| "loss": 3.252, |
| "step": 60400 |
| }, |
| { |
| "epoch": 17.593269678621333, |
| "grad_norm": 0.39442792534828186, |
| "learning_rate": 0.00038911240535818284, |
| "loss": 3.2777, |
| "step": 60450 |
| }, |
| { |
| "epoch": 17.6078248719143, |
| "grad_norm": 0.39352068305015564, |
| "learning_rate": 0.00038893768200349443, |
| "loss": 3.2663, |
| "step": 60500 |
| }, |
| { |
| "epoch": 17.622380065207267, |
| "grad_norm": 0.3962717354297638, |
| "learning_rate": 0.000388762958648806, |
| "loss": 3.2626, |
| "step": 60550 |
| }, |
| { |
| "epoch": 17.636935258500234, |
| "grad_norm": 0.38364487886428833, |
| "learning_rate": 0.0003885882352941176, |
| "loss": 3.2788, |
| "step": 60600 |
| }, |
| { |
| "epoch": 17.6514904517932, |
| "grad_norm": 0.3541100025177002, |
| "learning_rate": 0.0003884135119394292, |
| "loss": 3.2587, |
| "step": 60650 |
| }, |
| { |
| "epoch": 17.666045645086168, |
| "grad_norm": 0.3765043318271637, |
| "learning_rate": 0.0003882387885847408, |
| "loss": 3.2667, |
| "step": 60700 |
| }, |
| { |
| "epoch": 17.680600838379135, |
| "grad_norm": 0.37509429454803467, |
| "learning_rate": 0.0003880640652300524, |
| "loss": 3.2663, |
| "step": 60750 |
| }, |
| { |
| "epoch": 17.695156031672102, |
| "grad_norm": 0.35692986845970154, |
| "learning_rate": 0.00038788934187536394, |
| "loss": 3.2728, |
| "step": 60800 |
| }, |
| { |
| "epoch": 17.70971122496507, |
| "grad_norm": 0.34927189350128174, |
| "learning_rate": 0.00038771461852067554, |
| "loss": 3.2791, |
| "step": 60850 |
| }, |
| { |
| "epoch": 17.724266418258033, |
| "grad_norm": 0.359375923871994, |
| "learning_rate": 0.0003875398951659872, |
| "loss": 3.2762, |
| "step": 60900 |
| }, |
| { |
| "epoch": 17.738821611551, |
| "grad_norm": 0.3698371648788452, |
| "learning_rate": 0.0003873651718112988, |
| "loss": 3.2751, |
| "step": 60950 |
| }, |
| { |
| "epoch": 17.753376804843967, |
| "grad_norm": 0.39195096492767334, |
| "learning_rate": 0.0003871904484566103, |
| "loss": 3.2771, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.753376804843967, |
| "eval_accuracy": 0.37315612879629356, |
| "eval_loss": 3.535740375518799, |
| "eval_runtime": 179.6914, |
| "eval_samples_per_second": 92.664, |
| "eval_steps_per_second": 5.793, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.767931998136934, |
| "grad_norm": 0.34754666686058044, |
| "learning_rate": 0.0003870157251019219, |
| "loss": 3.2756, |
| "step": 61050 |
| }, |
| { |
| "epoch": 17.7824871914299, |
| "grad_norm": 0.40687552094459534, |
| "learning_rate": 0.0003868410017472335, |
| "loss": 3.2797, |
| "step": 61100 |
| }, |
| { |
| "epoch": 17.797042384722868, |
| "grad_norm": 0.3617575764656067, |
| "learning_rate": 0.00038666627839254505, |
| "loss": 3.2713, |
| "step": 61150 |
| }, |
| { |
| "epoch": 17.811597578015835, |
| "grad_norm": 0.36929965019226074, |
| "learning_rate": 0.0003864915550378567, |
| "loss": 3.2807, |
| "step": 61200 |
| }, |
| { |
| "epoch": 17.826152771308802, |
| "grad_norm": 0.36952269077301025, |
| "learning_rate": 0.0003863168316831683, |
| "loss": 3.285, |
| "step": 61250 |
| }, |
| { |
| "epoch": 17.84070796460177, |
| "grad_norm": 0.35455524921417236, |
| "learning_rate": 0.0003861421083284799, |
| "loss": 3.2878, |
| "step": 61300 |
| }, |
| { |
| "epoch": 17.855263157894736, |
| "grad_norm": 0.4048665463924408, |
| "learning_rate": 0.0003859673849737914, |
| "loss": 3.2765, |
| "step": 61350 |
| }, |
| { |
| "epoch": 17.869818351187703, |
| "grad_norm": 0.3863668441772461, |
| "learning_rate": 0.000385792661619103, |
| "loss": 3.2885, |
| "step": 61400 |
| }, |
| { |
| "epoch": 17.88437354448067, |
| "grad_norm": 0.38117608428001404, |
| "learning_rate": 0.00038561793826441467, |
| "loss": 3.2785, |
| "step": 61450 |
| }, |
| { |
| "epoch": 17.898928737773637, |
| "grad_norm": 0.3930187225341797, |
| "learning_rate": 0.00038544321490972626, |
| "loss": 3.291, |
| "step": 61500 |
| }, |
| { |
| "epoch": 17.913483931066605, |
| "grad_norm": 0.361481636762619, |
| "learning_rate": 0.0003852684915550378, |
| "loss": 3.2738, |
| "step": 61550 |
| }, |
| { |
| "epoch": 17.92803912435957, |
| "grad_norm": 0.36943328380584717, |
| "learning_rate": 0.0003850937682003494, |
| "loss": 3.2778, |
| "step": 61600 |
| }, |
| { |
| "epoch": 17.94259431765254, |
| "grad_norm": 0.3720901608467102, |
| "learning_rate": 0.000384919044845661, |
| "loss": 3.2895, |
| "step": 61650 |
| }, |
| { |
| "epoch": 17.957149510945506, |
| "grad_norm": 0.37087762355804443, |
| "learning_rate": 0.0003847443214909726, |
| "loss": 3.2686, |
| "step": 61700 |
| }, |
| { |
| "epoch": 17.971704704238473, |
| "grad_norm": 0.4020977020263672, |
| "learning_rate": 0.00038456959813628423, |
| "loss": 3.2813, |
| "step": 61750 |
| }, |
| { |
| "epoch": 17.98625989753144, |
| "grad_norm": 0.38199225068092346, |
| "learning_rate": 0.0003843948747815958, |
| "loss": 3.2921, |
| "step": 61800 |
| }, |
| { |
| "epoch": 18.000582207731718, |
| "grad_norm": 0.4147983491420746, |
| "learning_rate": 0.00038422015142690737, |
| "loss": 3.287, |
| "step": 61850 |
| }, |
| { |
| "epoch": 18.015137401024685, |
| "grad_norm": 0.35553550720214844, |
| "learning_rate": 0.00038404542807221896, |
| "loss": 3.1681, |
| "step": 61900 |
| }, |
| { |
| "epoch": 18.029692594317652, |
| "grad_norm": 0.38814523816108704, |
| "learning_rate": 0.0003838707047175305, |
| "loss": 3.1671, |
| "step": 61950 |
| }, |
| { |
| "epoch": 18.04424778761062, |
| "grad_norm": 0.36407405138015747, |
| "learning_rate": 0.00038369598136284215, |
| "loss": 3.1751, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.04424778761062, |
| "eval_accuracy": 0.37241570677219366, |
| "eval_loss": 3.549715518951416, |
| "eval_runtime": 179.8427, |
| "eval_samples_per_second": 92.586, |
| "eval_steps_per_second": 5.788, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.058802980903586, |
| "grad_norm": 0.35381069779396057, |
| "learning_rate": 0.00038352125800815374, |
| "loss": 3.1806, |
| "step": 62050 |
| }, |
| { |
| "epoch": 18.073358174196553, |
| "grad_norm": 0.38974687457084656, |
| "learning_rate": 0.00038334653465346534, |
| "loss": 3.188, |
| "step": 62100 |
| }, |
| { |
| "epoch": 18.08791336748952, |
| "grad_norm": 0.39390093088150024, |
| "learning_rate": 0.0003831718112987769, |
| "loss": 3.1949, |
| "step": 62150 |
| }, |
| { |
| "epoch": 18.102468560782487, |
| "grad_norm": 0.38337382674217224, |
| "learning_rate": 0.0003829970879440885, |
| "loss": 3.1987, |
| "step": 62200 |
| }, |
| { |
| "epoch": 18.117023754075454, |
| "grad_norm": 0.39741766452789307, |
| "learning_rate": 0.00038282236458940007, |
| "loss": 3.214, |
| "step": 62250 |
| }, |
| { |
| "epoch": 18.13157894736842, |
| "grad_norm": 0.36907315254211426, |
| "learning_rate": 0.0003826476412347117, |
| "loss": 3.1865, |
| "step": 62300 |
| }, |
| { |
| "epoch": 18.14613414066139, |
| "grad_norm": 0.41438016295433044, |
| "learning_rate": 0.00038247291788002326, |
| "loss": 3.2109, |
| "step": 62350 |
| }, |
| { |
| "epoch": 18.160689333954355, |
| "grad_norm": 0.3838878273963928, |
| "learning_rate": 0.00038229819452533485, |
| "loss": 3.2182, |
| "step": 62400 |
| }, |
| { |
| "epoch": 18.175244527247322, |
| "grad_norm": 0.3881027400493622, |
| "learning_rate": 0.00038212347117064644, |
| "loss": 3.2179, |
| "step": 62450 |
| }, |
| { |
| "epoch": 18.18979972054029, |
| "grad_norm": 0.3932623267173767, |
| "learning_rate": 0.000381948747815958, |
| "loss": 3.2089, |
| "step": 62500 |
| }, |
| { |
| "epoch": 18.204354913833257, |
| "grad_norm": 0.3784829378128052, |
| "learning_rate": 0.0003817740244612696, |
| "loss": 3.2142, |
| "step": 62550 |
| }, |
| { |
| "epoch": 18.218910107126224, |
| "grad_norm": 0.3763687014579773, |
| "learning_rate": 0.00038159930110658123, |
| "loss": 3.2179, |
| "step": 62600 |
| }, |
| { |
| "epoch": 18.23346530041919, |
| "grad_norm": 0.39583879709243774, |
| "learning_rate": 0.0003814245777518928, |
| "loss": 3.2143, |
| "step": 62650 |
| }, |
| { |
| "epoch": 18.248020493712158, |
| "grad_norm": 0.38479939103126526, |
| "learning_rate": 0.0003812498543972044, |
| "loss": 3.2218, |
| "step": 62700 |
| }, |
| { |
| "epoch": 18.262575687005125, |
| "grad_norm": 0.363050639629364, |
| "learning_rate": 0.00038107513104251596, |
| "loss": 3.2286, |
| "step": 62750 |
| }, |
| { |
| "epoch": 18.277130880298092, |
| "grad_norm": 0.3779996931552887, |
| "learning_rate": 0.00038090040768782755, |
| "loss": 3.2342, |
| "step": 62800 |
| }, |
| { |
| "epoch": 18.29168607359106, |
| "grad_norm": 0.39983800053596497, |
| "learning_rate": 0.0003807256843331392, |
| "loss": 3.2353, |
| "step": 62850 |
| }, |
| { |
| "epoch": 18.306241266884022, |
| "grad_norm": 0.37936192750930786, |
| "learning_rate": 0.0003805509609784508, |
| "loss": 3.2282, |
| "step": 62900 |
| }, |
| { |
| "epoch": 18.32079646017699, |
| "grad_norm": 0.38810208439826965, |
| "learning_rate": 0.00038037623762376233, |
| "loss": 3.2334, |
| "step": 62950 |
| }, |
| { |
| "epoch": 18.335351653469957, |
| "grad_norm": 0.40988579392433167, |
| "learning_rate": 0.00038020151426907393, |
| "loss": 3.2348, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.335351653469957, |
| "eval_accuracy": 0.3728442113277283, |
| "eval_loss": 3.550168752670288, |
| "eval_runtime": 179.6745, |
| "eval_samples_per_second": 92.673, |
| "eval_steps_per_second": 5.794, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.349906846762924, |
| "grad_norm": 0.3667028844356537, |
| "learning_rate": 0.0003800267909143855, |
| "loss": 3.23, |
| "step": 63050 |
| }, |
| { |
| "epoch": 18.36446204005589, |
| "grad_norm": 0.4316708743572235, |
| "learning_rate": 0.00037985206755969706, |
| "loss": 3.2315, |
| "step": 63100 |
| }, |
| { |
| "epoch": 18.379017233348858, |
| "grad_norm": 0.3851536214351654, |
| "learning_rate": 0.0003796773442050087, |
| "loss": 3.2306, |
| "step": 63150 |
| }, |
| { |
| "epoch": 18.393572426641825, |
| "grad_norm": 0.3705301284790039, |
| "learning_rate": 0.0003795026208503203, |
| "loss": 3.2534, |
| "step": 63200 |
| }, |
| { |
| "epoch": 18.408127619934792, |
| "grad_norm": 0.4015274941921234, |
| "learning_rate": 0.0003793278974956319, |
| "loss": 3.2605, |
| "step": 63250 |
| }, |
| { |
| "epoch": 18.42268281322776, |
| "grad_norm": 0.40232664346694946, |
| "learning_rate": 0.00037915317414094344, |
| "loss": 3.2504, |
| "step": 63300 |
| }, |
| { |
| "epoch": 18.437238006520726, |
| "grad_norm": 0.4108823835849762, |
| "learning_rate": 0.00037897845078625503, |
| "loss": 3.2411, |
| "step": 63350 |
| }, |
| { |
| "epoch": 18.451793199813693, |
| "grad_norm": 0.3592471778392792, |
| "learning_rate": 0.0003788037274315667, |
| "loss": 3.2475, |
| "step": 63400 |
| }, |
| { |
| "epoch": 18.46634839310666, |
| "grad_norm": 0.385998010635376, |
| "learning_rate": 0.0003786290040768783, |
| "loss": 3.2484, |
| "step": 63450 |
| }, |
| { |
| "epoch": 18.480903586399627, |
| "grad_norm": 0.39755505323410034, |
| "learning_rate": 0.0003784542807221898, |
| "loss": 3.2334, |
| "step": 63500 |
| }, |
| { |
| "epoch": 18.495458779692594, |
| "grad_norm": 0.38933873176574707, |
| "learning_rate": 0.0003782795573675014, |
| "loss": 3.2383, |
| "step": 63550 |
| }, |
| { |
| "epoch": 18.51001397298556, |
| "grad_norm": 0.36488500237464905, |
| "learning_rate": 0.000378104834012813, |
| "loss": 3.2655, |
| "step": 63600 |
| }, |
| { |
| "epoch": 18.52456916627853, |
| "grad_norm": 0.3819720447063446, |
| "learning_rate": 0.0003779301106581246, |
| "loss": 3.2466, |
| "step": 63650 |
| }, |
| { |
| "epoch": 18.539124359571495, |
| "grad_norm": 0.38391003012657166, |
| "learning_rate": 0.0003777553873034362, |
| "loss": 3.2557, |
| "step": 63700 |
| }, |
| { |
| "epoch": 18.553679552864462, |
| "grad_norm": 0.3939170837402344, |
| "learning_rate": 0.0003775806639487478, |
| "loss": 3.2513, |
| "step": 63750 |
| }, |
| { |
| "epoch": 18.56823474615743, |
| "grad_norm": 0.39664459228515625, |
| "learning_rate": 0.0003774059405940594, |
| "loss": 3.2397, |
| "step": 63800 |
| }, |
| { |
| "epoch": 18.582789939450397, |
| "grad_norm": 0.3669049143791199, |
| "learning_rate": 0.000377231217239371, |
| "loss": 3.243, |
| "step": 63850 |
| }, |
| { |
| "epoch": 18.597345132743364, |
| "grad_norm": 0.4121682047843933, |
| "learning_rate": 0.0003770564938846825, |
| "loss": 3.2451, |
| "step": 63900 |
| }, |
| { |
| "epoch": 18.61190032603633, |
| "grad_norm": 0.40818139910697937, |
| "learning_rate": 0.00037688177052999416, |
| "loss": 3.2538, |
| "step": 63950 |
| }, |
| { |
| "epoch": 18.626455519329298, |
| "grad_norm": 0.40750160813331604, |
| "learning_rate": 0.00037670704717530576, |
| "loss": 3.2587, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.626455519329298, |
| "eval_accuracy": 0.3730311972706399, |
| "eval_loss": 3.5414130687713623, |
| "eval_runtime": 179.6839, |
| "eval_samples_per_second": 92.668, |
| "eval_steps_per_second": 5.794, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.641010712622265, |
| "grad_norm": 0.39473462104797363, |
| "learning_rate": 0.00037653232382061735, |
| "loss": 3.2561, |
| "step": 64050 |
| }, |
| { |
| "epoch": 18.655565905915232, |
| "grad_norm": 0.36568084359169006, |
| "learning_rate": 0.0003763576004659289, |
| "loss": 3.2541, |
| "step": 64100 |
| }, |
| { |
| "epoch": 18.6701210992082, |
| "grad_norm": 0.39155519008636475, |
| "learning_rate": 0.0003761828771112405, |
| "loss": 3.2566, |
| "step": 64150 |
| }, |
| { |
| "epoch": 18.684676292501166, |
| "grad_norm": 0.39551350474357605, |
| "learning_rate": 0.0003760081537565521, |
| "loss": 3.2546, |
| "step": 64200 |
| }, |
| { |
| "epoch": 18.69923148579413, |
| "grad_norm": 0.4288475811481476, |
| "learning_rate": 0.00037583343040186373, |
| "loss": 3.249, |
| "step": 64250 |
| }, |
| { |
| "epoch": 18.713786679087097, |
| "grad_norm": 0.386665403842926, |
| "learning_rate": 0.00037565870704717527, |
| "loss": 3.2676, |
| "step": 64300 |
| }, |
| { |
| "epoch": 18.728341872380064, |
| "grad_norm": 0.37789034843444824, |
| "learning_rate": 0.00037548398369248687, |
| "loss": 3.2622, |
| "step": 64350 |
| }, |
| { |
| "epoch": 18.74289706567303, |
| "grad_norm": 0.3938154876232147, |
| "learning_rate": 0.00037530926033779846, |
| "loss": 3.2729, |
| "step": 64400 |
| }, |
| { |
| "epoch": 18.757452258965998, |
| "grad_norm": 0.36355528235435486, |
| "learning_rate": 0.00037513453698311, |
| "loss": 3.268, |
| "step": 64450 |
| }, |
| { |
| "epoch": 18.772007452258965, |
| "grad_norm": 0.416415810585022, |
| "learning_rate": 0.0003749598136284216, |
| "loss": 3.2616, |
| "step": 64500 |
| }, |
| { |
| "epoch": 18.786562645551932, |
| "grad_norm": 0.4128302335739136, |
| "learning_rate": 0.00037478509027373324, |
| "loss": 3.2657, |
| "step": 64550 |
| }, |
| { |
| "epoch": 18.8011178388449, |
| "grad_norm": 0.37560078501701355, |
| "learning_rate": 0.00037461036691904484, |
| "loss": 3.2485, |
| "step": 64600 |
| }, |
| { |
| "epoch": 18.815673032137866, |
| "grad_norm": 0.40694931149482727, |
| "learning_rate": 0.0003744356435643564, |
| "loss": 3.2506, |
| "step": 64650 |
| }, |
| { |
| "epoch": 18.830228225430833, |
| "grad_norm": 0.3949992060661316, |
| "learning_rate": 0.00037426092020966797, |
| "loss": 3.2647, |
| "step": 64700 |
| }, |
| { |
| "epoch": 18.8447834187238, |
| "grad_norm": 0.38129428029060364, |
| "learning_rate": 0.00037408619685497957, |
| "loss": 3.2756, |
| "step": 64750 |
| }, |
| { |
| "epoch": 18.859338612016767, |
| "grad_norm": 0.3728366494178772, |
| "learning_rate": 0.0003739114735002912, |
| "loss": 3.2647, |
| "step": 64800 |
| }, |
| { |
| "epoch": 18.873893805309734, |
| "grad_norm": 0.40122315287590027, |
| "learning_rate": 0.0003737367501456028, |
| "loss": 3.2654, |
| "step": 64850 |
| }, |
| { |
| "epoch": 18.8884489986027, |
| "grad_norm": 0.37444445490837097, |
| "learning_rate": 0.00037356202679091435, |
| "loss": 3.277, |
| "step": 64900 |
| }, |
| { |
| "epoch": 18.90300419189567, |
| "grad_norm": 0.3745873272418976, |
| "learning_rate": 0.00037338730343622594, |
| "loss": 3.2704, |
| "step": 64950 |
| }, |
| { |
| "epoch": 18.917559385188635, |
| "grad_norm": 0.38452088832855225, |
| "learning_rate": 0.00037321258008153754, |
| "loss": 3.2776, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.917559385188635, |
| "eval_accuracy": 0.37377173682204523, |
| "eval_loss": 3.5317463874816895, |
| "eval_runtime": 179.6892, |
| "eval_samples_per_second": 92.666, |
| "eval_steps_per_second": 5.793, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.932114578481603, |
| "grad_norm": 0.40762460231781006, |
| "learning_rate": 0.0003730378567268491, |
| "loss": 3.2639, |
| "step": 65050 |
| }, |
| { |
| "epoch": 18.94666977177457, |
| "grad_norm": 0.42345935106277466, |
| "learning_rate": 0.0003728631333721607, |
| "loss": 3.2639, |
| "step": 65100 |
| }, |
| { |
| "epoch": 18.961224965067537, |
| "grad_norm": 0.3937210738658905, |
| "learning_rate": 0.0003726884100174723, |
| "loss": 3.2795, |
| "step": 65150 |
| }, |
| { |
| "epoch": 18.975780158360504, |
| "grad_norm": 0.38212597370147705, |
| "learning_rate": 0.0003725136866627839, |
| "loss": 3.277, |
| "step": 65200 |
| }, |
| { |
| "epoch": 18.99033535165347, |
| "grad_norm": 0.4089849889278412, |
| "learning_rate": 0.00037233896330809545, |
| "loss": 3.2783, |
| "step": 65250 |
| }, |
| { |
| "epoch": 19.00465766185375, |
| "grad_norm": 0.3767828643321991, |
| "learning_rate": 0.00037216423995340705, |
| "loss": 3.2387, |
| "step": 65300 |
| }, |
| { |
| "epoch": 19.019212855146716, |
| "grad_norm": 0.39858773350715637, |
| "learning_rate": 0.0003719895165987187, |
| "loss": 3.1656, |
| "step": 65350 |
| }, |
| { |
| "epoch": 19.033768048439683, |
| "grad_norm": 0.40623700618743896, |
| "learning_rate": 0.0003718147932440303, |
| "loss": 3.1762, |
| "step": 65400 |
| }, |
| { |
| "epoch": 19.04832324173265, |
| "grad_norm": 0.40791580080986023, |
| "learning_rate": 0.00037164006988934183, |
| "loss": 3.1803, |
| "step": 65450 |
| }, |
| { |
| "epoch": 19.062878435025617, |
| "grad_norm": 0.35862547159194946, |
| "learning_rate": 0.0003714653465346534, |
| "loss": 3.174, |
| "step": 65500 |
| }, |
| { |
| "epoch": 19.077433628318584, |
| "grad_norm": 0.38739070296287537, |
| "learning_rate": 0.000371290623179965, |
| "loss": 3.1929, |
| "step": 65550 |
| }, |
| { |
| "epoch": 19.09198882161155, |
| "grad_norm": 0.4318332076072693, |
| "learning_rate": 0.00037111589982527656, |
| "loss": 3.183, |
| "step": 65600 |
| }, |
| { |
| "epoch": 19.106544014904518, |
| "grad_norm": 0.4079286754131317, |
| "learning_rate": 0.0003709411764705882, |
| "loss": 3.1921, |
| "step": 65650 |
| }, |
| { |
| "epoch": 19.121099208197485, |
| "grad_norm": 0.3845118582248688, |
| "learning_rate": 0.0003707664531158998, |
| "loss": 3.191, |
| "step": 65700 |
| }, |
| { |
| "epoch": 19.135654401490452, |
| "grad_norm": 0.38291704654693604, |
| "learning_rate": 0.0003705917297612114, |
| "loss": 3.1975, |
| "step": 65750 |
| }, |
| { |
| "epoch": 19.15020959478342, |
| "grad_norm": 0.3601689338684082, |
| "learning_rate": 0.000370417006406523, |
| "loss": 3.201, |
| "step": 65800 |
| }, |
| { |
| "epoch": 19.164764788076386, |
| "grad_norm": 0.3908986747264862, |
| "learning_rate": 0.00037024228305183453, |
| "loss": 3.1925, |
| "step": 65850 |
| }, |
| { |
| "epoch": 19.179319981369353, |
| "grad_norm": 0.36919277906417847, |
| "learning_rate": 0.0003700675596971461, |
| "loss": 3.1987, |
| "step": 65900 |
| }, |
| { |
| "epoch": 19.19387517466232, |
| "grad_norm": 0.4059826135635376, |
| "learning_rate": 0.0003698928363424578, |
| "loss": 3.1942, |
| "step": 65950 |
| }, |
| { |
| "epoch": 19.208430367955287, |
| "grad_norm": 0.43888577818870544, |
| "learning_rate": 0.00036971811298776937, |
| "loss": 3.1939, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.208430367955287, |
| "eval_accuracy": 0.3727541854117822, |
| "eval_loss": 3.551252841949463, |
| "eval_runtime": 179.6276, |
| "eval_samples_per_second": 92.697, |
| "eval_steps_per_second": 5.795, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.222985561248255, |
| "grad_norm": 0.4140142500400543, |
| "learning_rate": 0.0003695433896330809, |
| "loss": 3.2038, |
| "step": 66050 |
| }, |
| { |
| "epoch": 19.23754075454122, |
| "grad_norm": 0.40382882952690125, |
| "learning_rate": 0.0003693686662783925, |
| "loss": 3.2215, |
| "step": 66100 |
| }, |
| { |
| "epoch": 19.25209594783419, |
| "grad_norm": 0.39210304617881775, |
| "learning_rate": 0.0003691939429237041, |
| "loss": 3.2032, |
| "step": 66150 |
| }, |
| { |
| "epoch": 19.266651141127156, |
| "grad_norm": 0.4092456102371216, |
| "learning_rate": 0.00036901921956901575, |
| "loss": 3.2133, |
| "step": 66200 |
| }, |
| { |
| "epoch": 19.281206334420123, |
| "grad_norm": 0.38151684403419495, |
| "learning_rate": 0.0003688444962143273, |
| "loss": 3.2171, |
| "step": 66250 |
| }, |
| { |
| "epoch": 19.29576152771309, |
| "grad_norm": 0.36478516459465027, |
| "learning_rate": 0.0003686697728596389, |
| "loss": 3.2124, |
| "step": 66300 |
| }, |
| { |
| "epoch": 19.310316721006053, |
| "grad_norm": 0.41321274638175964, |
| "learning_rate": 0.0003684950495049505, |
| "loss": 3.2126, |
| "step": 66350 |
| }, |
| { |
| "epoch": 19.32487191429902, |
| "grad_norm": 0.384111225605011, |
| "learning_rate": 0.000368320326150262, |
| "loss": 3.2266, |
| "step": 66400 |
| }, |
| { |
| "epoch": 19.339427107591987, |
| "grad_norm": 0.37913626432418823, |
| "learning_rate": 0.0003681456027955736, |
| "loss": 3.2197, |
| "step": 66450 |
| }, |
| { |
| "epoch": 19.353982300884955, |
| "grad_norm": 0.4188009798526764, |
| "learning_rate": 0.00036797087944088526, |
| "loss": 3.2158, |
| "step": 66500 |
| }, |
| { |
| "epoch": 19.36853749417792, |
| "grad_norm": 0.39769816398620605, |
| "learning_rate": 0.00036779615608619685, |
| "loss": 3.2273, |
| "step": 66550 |
| }, |
| { |
| "epoch": 19.38309268747089, |
| "grad_norm": 0.3958772122859955, |
| "learning_rate": 0.0003676214327315084, |
| "loss": 3.2329, |
| "step": 66600 |
| }, |
| { |
| "epoch": 19.397647880763856, |
| "grad_norm": 0.3825916051864624, |
| "learning_rate": 0.00036744670937682, |
| "loss": 3.2261, |
| "step": 66650 |
| }, |
| { |
| "epoch": 19.412203074056823, |
| "grad_norm": 0.3826241195201874, |
| "learning_rate": 0.0003672719860221316, |
| "loss": 3.2166, |
| "step": 66700 |
| }, |
| { |
| "epoch": 19.42675826734979, |
| "grad_norm": 0.38634222745895386, |
| "learning_rate": 0.00036709726266744323, |
| "loss": 3.2484, |
| "step": 66750 |
| }, |
| { |
| "epoch": 19.441313460642757, |
| "grad_norm": 0.3904113471508026, |
| "learning_rate": 0.00036692253931275477, |
| "loss": 3.2139, |
| "step": 66800 |
| }, |
| { |
| "epoch": 19.455868653935724, |
| "grad_norm": 0.3914354741573334, |
| "learning_rate": 0.00036674781595806636, |
| "loss": 3.2258, |
| "step": 66850 |
| }, |
| { |
| "epoch": 19.47042384722869, |
| "grad_norm": 0.38262632489204407, |
| "learning_rate": 0.00036657309260337796, |
| "loss": 3.2338, |
| "step": 66900 |
| }, |
| { |
| "epoch": 19.484979040521658, |
| "grad_norm": 0.40577489137649536, |
| "learning_rate": 0.00036639836924868955, |
| "loss": 3.2281, |
| "step": 66950 |
| }, |
| { |
| "epoch": 19.499534233814625, |
| "grad_norm": 0.39221397042274475, |
| "learning_rate": 0.0003662236458940011, |
| "loss": 3.2337, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.499534233814625, |
| "eval_accuracy": 0.37309760019819804, |
| "eval_loss": 3.542379140853882, |
| "eval_runtime": 179.58, |
| "eval_samples_per_second": 92.722, |
| "eval_steps_per_second": 5.797, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.514089427107592, |
| "grad_norm": 0.3849264681339264, |
| "learning_rate": 0.00036604892253931274, |
| "loss": 3.2324, |
| "step": 67050 |
| }, |
| { |
| "epoch": 19.52864462040056, |
| "grad_norm": 0.41369107365608215, |
| "learning_rate": 0.00036587419918462433, |
| "loss": 3.2369, |
| "step": 67100 |
| }, |
| { |
| "epoch": 19.543199813693526, |
| "grad_norm": 0.4056357443332672, |
| "learning_rate": 0.00036569947582993593, |
| "loss": 3.2423, |
| "step": 67150 |
| }, |
| { |
| "epoch": 19.557755006986493, |
| "grad_norm": 0.3950081467628479, |
| "learning_rate": 0.00036552475247524747, |
| "loss": 3.2423, |
| "step": 67200 |
| }, |
| { |
| "epoch": 19.57231020027946, |
| "grad_norm": 0.3796003460884094, |
| "learning_rate": 0.00036535002912055906, |
| "loss": 3.2345, |
| "step": 67250 |
| }, |
| { |
| "epoch": 19.586865393572428, |
| "grad_norm": 0.37646934390068054, |
| "learning_rate": 0.00036517530576587066, |
| "loss": 3.2447, |
| "step": 67300 |
| }, |
| { |
| "epoch": 19.601420586865395, |
| "grad_norm": 0.4380941689014435, |
| "learning_rate": 0.0003650005824111823, |
| "loss": 3.2474, |
| "step": 67350 |
| }, |
| { |
| "epoch": 19.61597578015836, |
| "grad_norm": 0.37116000056266785, |
| "learning_rate": 0.00036482585905649385, |
| "loss": 3.2474, |
| "step": 67400 |
| }, |
| { |
| "epoch": 19.63053097345133, |
| "grad_norm": 0.37322819232940674, |
| "learning_rate": 0.00036465113570180544, |
| "loss": 3.2529, |
| "step": 67450 |
| }, |
| { |
| "epoch": 19.645086166744296, |
| "grad_norm": 0.3834007680416107, |
| "learning_rate": 0.00036447641234711703, |
| "loss": 3.2464, |
| "step": 67500 |
| }, |
| { |
| "epoch": 19.659641360037263, |
| "grad_norm": 0.3989749550819397, |
| "learning_rate": 0.0003643016889924286, |
| "loss": 3.2585, |
| "step": 67550 |
| }, |
| { |
| "epoch": 19.67419655333023, |
| "grad_norm": 0.3850140869617462, |
| "learning_rate": 0.0003641269656377402, |
| "loss": 3.2513, |
| "step": 67600 |
| }, |
| { |
| "epoch": 19.688751746623197, |
| "grad_norm": 0.41245153546333313, |
| "learning_rate": 0.0003639522422830518, |
| "loss": 3.2449, |
| "step": 67650 |
| }, |
| { |
| "epoch": 19.70330693991616, |
| "grad_norm": 0.41811299324035645, |
| "learning_rate": 0.0003637775189283634, |
| "loss": 3.2421, |
| "step": 67700 |
| }, |
| { |
| "epoch": 19.717862133209128, |
| "grad_norm": 0.40594467520713806, |
| "learning_rate": 0.00036360279557367495, |
| "loss": 3.2448, |
| "step": 67750 |
| }, |
| { |
| "epoch": 19.732417326502095, |
| "grad_norm": 0.38768884539604187, |
| "learning_rate": 0.00036342807221898655, |
| "loss": 3.2512, |
| "step": 67800 |
| }, |
| { |
| "epoch": 19.74697251979506, |
| "grad_norm": 0.40855497121810913, |
| "learning_rate": 0.00036325334886429814, |
| "loss": 3.2382, |
| "step": 67850 |
| }, |
| { |
| "epoch": 19.76152771308803, |
| "grad_norm": 0.39450258016586304, |
| "learning_rate": 0.0003630786255096098, |
| "loss": 3.2425, |
| "step": 67900 |
| }, |
| { |
| "epoch": 19.776082906380996, |
| "grad_norm": 0.39796581864356995, |
| "learning_rate": 0.00036290390215492133, |
| "loss": 3.2672, |
| "step": 67950 |
| }, |
| { |
| "epoch": 19.790638099673963, |
| "grad_norm": 0.43509823083877563, |
| "learning_rate": 0.0003627291788002329, |
| "loss": 3.2667, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.790638099673963, |
| "eval_accuracy": 0.37376562540216374, |
| "eval_loss": 3.536363363265991, |
| "eval_runtime": 179.6113, |
| "eval_samples_per_second": 92.706, |
| "eval_steps_per_second": 5.796, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.80519329296693, |
| "grad_norm": 0.4296112060546875, |
| "learning_rate": 0.0003625544554455445, |
| "loss": 3.2532, |
| "step": 68050 |
| }, |
| { |
| "epoch": 19.819748486259897, |
| "grad_norm": 0.39242368936538696, |
| "learning_rate": 0.0003623797320908561, |
| "loss": 3.2546, |
| "step": 68100 |
| }, |
| { |
| "epoch": 19.834303679552864, |
| "grad_norm": 0.37693583965301514, |
| "learning_rate": 0.00036220500873616776, |
| "loss": 3.2589, |
| "step": 68150 |
| }, |
| { |
| "epoch": 19.84885887284583, |
| "grad_norm": 0.36907529830932617, |
| "learning_rate": 0.0003620302853814793, |
| "loss": 3.2491, |
| "step": 68200 |
| }, |
| { |
| "epoch": 19.863414066138798, |
| "grad_norm": 0.394565224647522, |
| "learning_rate": 0.0003618555620267909, |
| "loss": 3.2555, |
| "step": 68250 |
| }, |
| { |
| "epoch": 19.877969259431765, |
| "grad_norm": 0.3643400967121124, |
| "learning_rate": 0.0003616808386721025, |
| "loss": 3.2494, |
| "step": 68300 |
| }, |
| { |
| "epoch": 19.892524452724732, |
| "grad_norm": 0.3661266267299652, |
| "learning_rate": 0.00036150611531741403, |
| "loss": 3.2576, |
| "step": 68350 |
| }, |
| { |
| "epoch": 19.9070796460177, |
| "grad_norm": 0.4081973433494568, |
| "learning_rate": 0.0003613313919627256, |
| "loss": 3.2615, |
| "step": 68400 |
| }, |
| { |
| "epoch": 19.921634839310666, |
| "grad_norm": 0.363898366689682, |
| "learning_rate": 0.00036115666860803727, |
| "loss": 3.258, |
| "step": 68450 |
| }, |
| { |
| "epoch": 19.936190032603633, |
| "grad_norm": 0.3852180540561676, |
| "learning_rate": 0.00036098194525334887, |
| "loss": 3.2626, |
| "step": 68500 |
| }, |
| { |
| "epoch": 19.9507452258966, |
| "grad_norm": 0.383724182844162, |
| "learning_rate": 0.0003608072218986604, |
| "loss": 3.2609, |
| "step": 68550 |
| }, |
| { |
| "epoch": 19.965300419189568, |
| "grad_norm": 0.41124436259269714, |
| "learning_rate": 0.000360632498543972, |
| "loss": 3.2613, |
| "step": 68600 |
| }, |
| { |
| "epoch": 19.979855612482535, |
| "grad_norm": 0.3976960778236389, |
| "learning_rate": 0.0003604577751892836, |
| "loss": 3.2653, |
| "step": 68650 |
| }, |
| { |
| "epoch": 19.9944108057755, |
| "grad_norm": 0.38881340622901917, |
| "learning_rate": 0.00036028305183459513, |
| "loss": 3.2707, |
| "step": 68700 |
| }, |
| { |
| "epoch": 20.00873311597578, |
| "grad_norm": 0.40865153074264526, |
| "learning_rate": 0.0003601083284799068, |
| "loss": 3.201, |
| "step": 68750 |
| }, |
| { |
| "epoch": 20.023288309268747, |
| "grad_norm": 0.40403062105178833, |
| "learning_rate": 0.0003599336051252184, |
| "loss": 3.1603, |
| "step": 68800 |
| }, |
| { |
| "epoch": 20.037843502561714, |
| "grad_norm": 0.4033607840538025, |
| "learning_rate": 0.00035975888177052997, |
| "loss": 3.1707, |
| "step": 68850 |
| }, |
| { |
| "epoch": 20.05239869585468, |
| "grad_norm": 0.3669825792312622, |
| "learning_rate": 0.0003595841584158415, |
| "loss": 3.1716, |
| "step": 68900 |
| }, |
| { |
| "epoch": 20.066953889147648, |
| "grad_norm": 0.39947494864463806, |
| "learning_rate": 0.0003594094350611531, |
| "loss": 3.1686, |
| "step": 68950 |
| }, |
| { |
| "epoch": 20.081509082440615, |
| "grad_norm": 0.3739069998264313, |
| "learning_rate": 0.00035923471170646475, |
| "loss": 3.1624, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.081509082440615, |
| "eval_accuracy": 0.37318891891450373, |
| "eval_loss": 3.548868179321289, |
| "eval_runtime": 179.9292, |
| "eval_samples_per_second": 92.542, |
| "eval_steps_per_second": 5.786, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.096064275733582, |
| "grad_norm": 0.4172305762767792, |
| "learning_rate": 0.00035905998835177635, |
| "loss": 3.1698, |
| "step": 69050 |
| }, |
| { |
| "epoch": 20.11061946902655, |
| "grad_norm": 0.42960259318351746, |
| "learning_rate": 0.00035888526499708794, |
| "loss": 3.1756, |
| "step": 69100 |
| }, |
| { |
| "epoch": 20.125174662319516, |
| "grad_norm": 0.3955884575843811, |
| "learning_rate": 0.0003587105416423995, |
| "loss": 3.1886, |
| "step": 69150 |
| }, |
| { |
| "epoch": 20.139729855612483, |
| "grad_norm": 0.40036845207214355, |
| "learning_rate": 0.0003585358182877111, |
| "loss": 3.1768, |
| "step": 69200 |
| }, |
| { |
| "epoch": 20.15428504890545, |
| "grad_norm": 0.41901588439941406, |
| "learning_rate": 0.00035836109493302267, |
| "loss": 3.1899, |
| "step": 69250 |
| }, |
| { |
| "epoch": 20.168840242198417, |
| "grad_norm": 0.3858005106449127, |
| "learning_rate": 0.0003581863715783343, |
| "loss": 3.1951, |
| "step": 69300 |
| }, |
| { |
| "epoch": 20.183395435491384, |
| "grad_norm": 0.37416332960128784, |
| "learning_rate": 0.00035801164822364586, |
| "loss": 3.1942, |
| "step": 69350 |
| }, |
| { |
| "epoch": 20.19795062878435, |
| "grad_norm": 0.40615788102149963, |
| "learning_rate": 0.00035783692486895745, |
| "loss": 3.185, |
| "step": 69400 |
| }, |
| { |
| "epoch": 20.21250582207732, |
| "grad_norm": 0.3919961154460907, |
| "learning_rate": 0.00035766220151426905, |
| "loss": 3.1983, |
| "step": 69450 |
| }, |
| { |
| "epoch": 20.227061015370285, |
| "grad_norm": 0.42501819133758545, |
| "learning_rate": 0.0003574874781595806, |
| "loss": 3.1967, |
| "step": 69500 |
| }, |
| { |
| "epoch": 20.241616208663253, |
| "grad_norm": 0.3887561857700348, |
| "learning_rate": 0.00035731275480489224, |
| "loss": 3.198, |
| "step": 69550 |
| }, |
| { |
| "epoch": 20.25617140195622, |
| "grad_norm": 0.4249391257762909, |
| "learning_rate": 0.00035713803145020383, |
| "loss": 3.1956, |
| "step": 69600 |
| }, |
| { |
| "epoch": 20.270726595249187, |
| "grad_norm": 0.4223974049091339, |
| "learning_rate": 0.0003569633080955154, |
| "loss": 3.1909, |
| "step": 69650 |
| }, |
| { |
| "epoch": 20.28528178854215, |
| "grad_norm": 0.4471769630908966, |
| "learning_rate": 0.00035678858474082697, |
| "loss": 3.2013, |
| "step": 69700 |
| }, |
| { |
| "epoch": 20.299836981835117, |
| "grad_norm": 0.3672872483730316, |
| "learning_rate": 0.00035661386138613856, |
| "loss": 3.1967, |
| "step": 69750 |
| }, |
| { |
| "epoch": 20.314392175128084, |
| "grad_norm": 0.40826860070228577, |
| "learning_rate": 0.00035643913803145015, |
| "loss": 3.1972, |
| "step": 69800 |
| }, |
| { |
| "epoch": 20.32894736842105, |
| "grad_norm": 0.39870554208755493, |
| "learning_rate": 0.0003562644146767618, |
| "loss": 3.2159, |
| "step": 69850 |
| }, |
| { |
| "epoch": 20.34350256171402, |
| "grad_norm": 0.37478652596473694, |
| "learning_rate": 0.00035608969132207334, |
| "loss": 3.2043, |
| "step": 69900 |
| }, |
| { |
| "epoch": 20.358057755006985, |
| "grad_norm": 0.39846566319465637, |
| "learning_rate": 0.00035591496796738494, |
| "loss": 3.2099, |
| "step": 69950 |
| }, |
| { |
| "epoch": 20.372612948299953, |
| "grad_norm": 0.39627769589424133, |
| "learning_rate": 0.00035574024461269653, |
| "loss": 3.2091, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.372612948299953, |
| "eval_accuracy": 0.3736645519195088, |
| "eval_loss": 3.5438168048858643, |
| "eval_runtime": 179.754, |
| "eval_samples_per_second": 92.632, |
| "eval_steps_per_second": 5.791, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.38716814159292, |
| "grad_norm": 0.4286830723285675, |
| "learning_rate": 0.0003555655212580081, |
| "loss": 3.2232, |
| "step": 70050 |
| }, |
| { |
| "epoch": 20.401723334885887, |
| "grad_norm": 0.3937196731567383, |
| "learning_rate": 0.00035539079790331967, |
| "loss": 3.2197, |
| "step": 70100 |
| }, |
| { |
| "epoch": 20.416278528178854, |
| "grad_norm": 0.39053255319595337, |
| "learning_rate": 0.0003552160745486313, |
| "loss": 3.2137, |
| "step": 70150 |
| }, |
| { |
| "epoch": 20.43083372147182, |
| "grad_norm": 0.419683039188385, |
| "learning_rate": 0.0003550413511939429, |
| "loss": 3.2258, |
| "step": 70200 |
| }, |
| { |
| "epoch": 20.445388914764788, |
| "grad_norm": 0.37828585505485535, |
| "learning_rate": 0.0003548666278392545, |
| "loss": 3.2279, |
| "step": 70250 |
| }, |
| { |
| "epoch": 20.459944108057755, |
| "grad_norm": 0.4096558094024658, |
| "learning_rate": 0.00035469190448456604, |
| "loss": 3.2081, |
| "step": 70300 |
| }, |
| { |
| "epoch": 20.474499301350722, |
| "grad_norm": 0.41085392236709595, |
| "learning_rate": 0.00035451718112987764, |
| "loss": 3.2231, |
| "step": 70350 |
| }, |
| { |
| "epoch": 20.48905449464369, |
| "grad_norm": 0.39229652285575867, |
| "learning_rate": 0.0003543424577751893, |
| "loss": 3.2201, |
| "step": 70400 |
| }, |
| { |
| "epoch": 20.503609687936656, |
| "grad_norm": 0.40823259949684143, |
| "learning_rate": 0.0003541677344205009, |
| "loss": 3.2291, |
| "step": 70450 |
| }, |
| { |
| "epoch": 20.518164881229623, |
| "grad_norm": 0.3808740973472595, |
| "learning_rate": 0.0003539930110658124, |
| "loss": 3.2391, |
| "step": 70500 |
| }, |
| { |
| "epoch": 20.53272007452259, |
| "grad_norm": 0.4064353108406067, |
| "learning_rate": 0.000353818287711124, |
| "loss": 3.2238, |
| "step": 70550 |
| }, |
| { |
| "epoch": 20.547275267815557, |
| "grad_norm": 0.40563496947288513, |
| "learning_rate": 0.0003536435643564356, |
| "loss": 3.2274, |
| "step": 70600 |
| }, |
| { |
| "epoch": 20.561830461108524, |
| "grad_norm": 0.4227381646633148, |
| "learning_rate": 0.00035346884100174715, |
| "loss": 3.2376, |
| "step": 70650 |
| }, |
| { |
| "epoch": 20.57638565440149, |
| "grad_norm": 0.39980873465538025, |
| "learning_rate": 0.0003532941176470588, |
| "loss": 3.2329, |
| "step": 70700 |
| }, |
| { |
| "epoch": 20.59094084769446, |
| "grad_norm": 0.36712634563446045, |
| "learning_rate": 0.0003531193942923704, |
| "loss": 3.2385, |
| "step": 70750 |
| }, |
| { |
| "epoch": 20.605496040987425, |
| "grad_norm": 0.4282870590686798, |
| "learning_rate": 0.000352944670937682, |
| "loss": 3.2392, |
| "step": 70800 |
| }, |
| { |
| "epoch": 20.620051234280393, |
| "grad_norm": 0.4079741835594177, |
| "learning_rate": 0.0003527699475829935, |
| "loss": 3.2345, |
| "step": 70850 |
| }, |
| { |
| "epoch": 20.63460642757336, |
| "grad_norm": 0.40561193227767944, |
| "learning_rate": 0.0003525952242283051, |
| "loss": 3.2285, |
| "step": 70900 |
| }, |
| { |
| "epoch": 20.649161620866327, |
| "grad_norm": 0.39606034755706787, |
| "learning_rate": 0.00035242050087361677, |
| "loss": 3.2346, |
| "step": 70950 |
| }, |
| { |
| "epoch": 20.663716814159294, |
| "grad_norm": 0.4000726640224457, |
| "learning_rate": 0.00035224577751892836, |
| "loss": 3.245, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.663716814159294, |
| "eval_accuracy": 0.37348931870713853, |
| "eval_loss": 3.5370469093322754, |
| "eval_runtime": 179.6602, |
| "eval_samples_per_second": 92.681, |
| "eval_steps_per_second": 5.794, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.678272007452257, |
| "grad_norm": 0.4043654203414917, |
| "learning_rate": 0.0003520710541642399, |
| "loss": 3.2364, |
| "step": 71050 |
| }, |
| { |
| "epoch": 20.692827200745224, |
| "grad_norm": 0.40891769528388977, |
| "learning_rate": 0.0003518963308095515, |
| "loss": 3.2401, |
| "step": 71100 |
| }, |
| { |
| "epoch": 20.70738239403819, |
| "grad_norm": 0.43435272574424744, |
| "learning_rate": 0.0003517216074548631, |
| "loss": 3.2437, |
| "step": 71150 |
| }, |
| { |
| "epoch": 20.72193758733116, |
| "grad_norm": 0.37514767050743103, |
| "learning_rate": 0.0003515468841001747, |
| "loss": 3.2427, |
| "step": 71200 |
| }, |
| { |
| "epoch": 20.736492780624125, |
| "grad_norm": 0.40798866748809814, |
| "learning_rate": 0.00035137216074548634, |
| "loss": 3.2485, |
| "step": 71250 |
| }, |
| { |
| "epoch": 20.751047973917093, |
| "grad_norm": 0.4111190140247345, |
| "learning_rate": 0.0003511974373907979, |
| "loss": 3.234, |
| "step": 71300 |
| }, |
| { |
| "epoch": 20.76560316721006, |
| "grad_norm": 0.4060057997703552, |
| "learning_rate": 0.00035102271403610947, |
| "loss": 3.242, |
| "step": 71350 |
| }, |
| { |
| "epoch": 20.780158360503027, |
| "grad_norm": 0.4118170142173767, |
| "learning_rate": 0.00035084799068142106, |
| "loss": 3.2461, |
| "step": 71400 |
| }, |
| { |
| "epoch": 20.794713553795994, |
| "grad_norm": 0.40792983770370483, |
| "learning_rate": 0.0003506732673267326, |
| "loss": 3.2456, |
| "step": 71450 |
| }, |
| { |
| "epoch": 20.80926874708896, |
| "grad_norm": 0.3925841748714447, |
| "learning_rate": 0.00035049854397204425, |
| "loss": 3.2465, |
| "step": 71500 |
| }, |
| { |
| "epoch": 20.823823940381928, |
| "grad_norm": 0.3952105939388275, |
| "learning_rate": 0.00035032382061735585, |
| "loss": 3.2509, |
| "step": 71550 |
| }, |
| { |
| "epoch": 20.838379133674895, |
| "grad_norm": 0.4176660180091858, |
| "learning_rate": 0.00035014909726266744, |
| "loss": 3.2386, |
| "step": 71600 |
| }, |
| { |
| "epoch": 20.852934326967862, |
| "grad_norm": 0.3933994174003601, |
| "learning_rate": 0.000349974373907979, |
| "loss": 3.2424, |
| "step": 71650 |
| }, |
| { |
| "epoch": 20.86748952026083, |
| "grad_norm": 0.38902366161346436, |
| "learning_rate": 0.0003497996505532906, |
| "loss": 3.2306, |
| "step": 71700 |
| }, |
| { |
| "epoch": 20.882044713553796, |
| "grad_norm": 0.38957300782203674, |
| "learning_rate": 0.00034962492719860217, |
| "loss": 3.2595, |
| "step": 71750 |
| }, |
| { |
| "epoch": 20.896599906846763, |
| "grad_norm": 0.4192463159561157, |
| "learning_rate": 0.0003494502038439138, |
| "loss": 3.2512, |
| "step": 71800 |
| }, |
| { |
| "epoch": 20.91115510013973, |
| "grad_norm": 0.39436599612236023, |
| "learning_rate": 0.00034927548048922536, |
| "loss": 3.2299, |
| "step": 71850 |
| }, |
| { |
| "epoch": 20.925710293432697, |
| "grad_norm": 0.4020264148712158, |
| "learning_rate": 0.00034910075713453695, |
| "loss": 3.2395, |
| "step": 71900 |
| }, |
| { |
| "epoch": 20.940265486725664, |
| "grad_norm": 0.4359363317489624, |
| "learning_rate": 0.00034892603377984855, |
| "loss": 3.2572, |
| "step": 71950 |
| }, |
| { |
| "epoch": 20.95482068001863, |
| "grad_norm": 0.4143703579902649, |
| "learning_rate": 0.0003487513104251601, |
| "loss": 3.2399, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.95482068001863, |
| "eval_accuracy": 0.37419577533997417, |
| "eval_loss": 3.530885934829712, |
| "eval_runtime": 180.8418, |
| "eval_samples_per_second": 92.075, |
| "eval_steps_per_second": 5.756, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.9693758733116, |
| "grad_norm": 0.40823879837989807, |
| "learning_rate": 0.0003485765870704717, |
| "loss": 3.2552, |
| "step": 72050 |
| }, |
| { |
| "epoch": 20.983931066604566, |
| "grad_norm": 0.3722114861011505, |
| "learning_rate": 0.00034840186371578333, |
| "loss": 3.246, |
| "step": 72100 |
| }, |
| { |
| "epoch": 20.998486259897533, |
| "grad_norm": 0.38992324471473694, |
| "learning_rate": 0.0003482271403610949, |
| "loss": 3.261, |
| "step": 72150 |
| }, |
| { |
| "epoch": 21.01280857009781, |
| "grad_norm": 0.3851622939109802, |
| "learning_rate": 0.0003480524170064065, |
| "loss": 3.1588, |
| "step": 72200 |
| }, |
| { |
| "epoch": 21.027363763390778, |
| "grad_norm": 0.41066426038742065, |
| "learning_rate": 0.00034787769365171806, |
| "loss": 3.1497, |
| "step": 72250 |
| }, |
| { |
| "epoch": 21.041918956683745, |
| "grad_norm": 0.3948858082294464, |
| "learning_rate": 0.00034770297029702965, |
| "loss": 3.1566, |
| "step": 72300 |
| }, |
| { |
| "epoch": 21.05647414997671, |
| "grad_norm": 0.4023319482803345, |
| "learning_rate": 0.0003475282469423413, |
| "loss": 3.1449, |
| "step": 72350 |
| }, |
| { |
| "epoch": 21.07102934326968, |
| "grad_norm": 0.4502675235271454, |
| "learning_rate": 0.0003473535235876529, |
| "loss": 3.1525, |
| "step": 72400 |
| }, |
| { |
| "epoch": 21.085584536562646, |
| "grad_norm": 0.38525882363319397, |
| "learning_rate": 0.00034717880023296444, |
| "loss": 3.164, |
| "step": 72450 |
| }, |
| { |
| "epoch": 21.100139729855613, |
| "grad_norm": 0.4162638187408447, |
| "learning_rate": 0.00034700407687827603, |
| "loss": 3.1658, |
| "step": 72500 |
| }, |
| { |
| "epoch": 21.11469492314858, |
| "grad_norm": 0.39325228333473206, |
| "learning_rate": 0.0003468293535235876, |
| "loss": 3.1698, |
| "step": 72550 |
| }, |
| { |
| "epoch": 21.129250116441547, |
| "grad_norm": 0.40508460998535156, |
| "learning_rate": 0.00034665463016889916, |
| "loss": 3.1801, |
| "step": 72600 |
| }, |
| { |
| "epoch": 21.143805309734514, |
| "grad_norm": 0.4105875492095947, |
| "learning_rate": 0.0003464799068142108, |
| "loss": 3.1711, |
| "step": 72650 |
| }, |
| { |
| "epoch": 21.15836050302748, |
| "grad_norm": 0.4069672226905823, |
| "learning_rate": 0.0003463051834595224, |
| "loss": 3.1771, |
| "step": 72700 |
| }, |
| { |
| "epoch": 21.172915696320448, |
| "grad_norm": 0.38891884684562683, |
| "learning_rate": 0.000346130460104834, |
| "loss": 3.18, |
| "step": 72750 |
| }, |
| { |
| "epoch": 21.187470889613415, |
| "grad_norm": 0.3792036175727844, |
| "learning_rate": 0.00034595573675014554, |
| "loss": 3.1873, |
| "step": 72800 |
| }, |
| { |
| "epoch": 21.202026082906382, |
| "grad_norm": 0.41310176253318787, |
| "learning_rate": 0.00034578101339545714, |
| "loss": 3.1875, |
| "step": 72850 |
| }, |
| { |
| "epoch": 21.21658127619935, |
| "grad_norm": 0.37777265906333923, |
| "learning_rate": 0.0003456062900407688, |
| "loss": 3.1911, |
| "step": 72900 |
| }, |
| { |
| "epoch": 21.231136469492316, |
| "grad_norm": 0.4072975814342499, |
| "learning_rate": 0.0003454315666860804, |
| "loss": 3.1877, |
| "step": 72950 |
| }, |
| { |
| "epoch": 21.245691662785283, |
| "grad_norm": 0.40285155177116394, |
| "learning_rate": 0.0003452568433313919, |
| "loss": 3.1951, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.245691662785283, |
| "eval_accuracy": 0.3731553061051557, |
| "eval_loss": 3.54914927482605, |
| "eval_runtime": 181.1286, |
| "eval_samples_per_second": 91.929, |
| "eval_steps_per_second": 5.747, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.260246856078247, |
| "grad_norm": 0.43861880898475647, |
| "learning_rate": 0.0003450821199767035, |
| "loss": 3.1788, |
| "step": 73050 |
| }, |
| { |
| "epoch": 21.274802049371214, |
| "grad_norm": 0.388239324092865, |
| "learning_rate": 0.0003449073966220151, |
| "loss": 3.1891, |
| "step": 73100 |
| }, |
| { |
| "epoch": 21.28935724266418, |
| "grad_norm": 0.39079049229621887, |
| "learning_rate": 0.0003447326732673267, |
| "loss": 3.2009, |
| "step": 73150 |
| }, |
| { |
| "epoch": 21.303912435957148, |
| "grad_norm": 0.41338858008384705, |
| "learning_rate": 0.0003445579499126383, |
| "loss": 3.196, |
| "step": 73200 |
| }, |
| { |
| "epoch": 21.318467629250115, |
| "grad_norm": 0.3875797986984253, |
| "learning_rate": 0.0003443832265579499, |
| "loss": 3.1977, |
| "step": 73250 |
| }, |
| { |
| "epoch": 21.333022822543082, |
| "grad_norm": 0.41410455107688904, |
| "learning_rate": 0.0003442085032032615, |
| "loss": 3.2012, |
| "step": 73300 |
| }, |
| { |
| "epoch": 21.34757801583605, |
| "grad_norm": 0.4161223769187927, |
| "learning_rate": 0.0003440337798485731, |
| "loss": 3.2108, |
| "step": 73350 |
| }, |
| { |
| "epoch": 21.362133209129016, |
| "grad_norm": 0.4099974036216736, |
| "learning_rate": 0.0003438590564938846, |
| "loss": 3.1971, |
| "step": 73400 |
| }, |
| { |
| "epoch": 21.376688402421983, |
| "grad_norm": 0.40403392910957336, |
| "learning_rate": 0.0003436843331391962, |
| "loss": 3.2197, |
| "step": 73450 |
| }, |
| { |
| "epoch": 21.39124359571495, |
| "grad_norm": 0.38444963097572327, |
| "learning_rate": 0.00034350960978450786, |
| "loss": 3.2057, |
| "step": 73500 |
| }, |
| { |
| "epoch": 21.405798789007918, |
| "grad_norm": 0.3847692012786865, |
| "learning_rate": 0.00034333488642981946, |
| "loss": 3.2085, |
| "step": 73550 |
| }, |
| { |
| "epoch": 21.420353982300885, |
| "grad_norm": 0.4129888713359833, |
| "learning_rate": 0.000343160163075131, |
| "loss": 3.195, |
| "step": 73600 |
| }, |
| { |
| "epoch": 21.43490917559385, |
| "grad_norm": 0.40269991755485535, |
| "learning_rate": 0.0003429854397204426, |
| "loss": 3.2062, |
| "step": 73650 |
| }, |
| { |
| "epoch": 21.44946436888682, |
| "grad_norm": 0.42505478858947754, |
| "learning_rate": 0.0003428107163657542, |
| "loss": 3.2191, |
| "step": 73700 |
| }, |
| { |
| "epoch": 21.464019562179786, |
| "grad_norm": 0.3840712904930115, |
| "learning_rate": 0.00034263599301106583, |
| "loss": 3.2103, |
| "step": 73750 |
| }, |
| { |
| "epoch": 21.478574755472753, |
| "grad_norm": 0.393205463886261, |
| "learning_rate": 0.0003424612696563774, |
| "loss": 3.2007, |
| "step": 73800 |
| }, |
| { |
| "epoch": 21.49312994876572, |
| "grad_norm": 0.4051612317562103, |
| "learning_rate": 0.00034228654630168897, |
| "loss": 3.2217, |
| "step": 73850 |
| }, |
| { |
| "epoch": 21.507685142058687, |
| "grad_norm": 0.39363154768943787, |
| "learning_rate": 0.00034211182294700056, |
| "loss": 3.2072, |
| "step": 73900 |
| }, |
| { |
| "epoch": 21.522240335351654, |
| "grad_norm": 0.41303566098213196, |
| "learning_rate": 0.0003419370995923121, |
| "loss": 3.2194, |
| "step": 73950 |
| }, |
| { |
| "epoch": 21.53679552864462, |
| "grad_norm": 0.40681394934654236, |
| "learning_rate": 0.0003417623762376237, |
| "loss": 3.2234, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.53679552864462, |
| "eval_accuracy": 0.3737651552929421, |
| "eval_loss": 3.5365705490112305, |
| "eval_runtime": 179.8936, |
| "eval_samples_per_second": 92.56, |
| "eval_steps_per_second": 5.787, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.551350721937588, |
| "grad_norm": 0.4122057557106018, |
| "learning_rate": 0.00034158765288293534, |
| "loss": 3.2145, |
| "step": 74050 |
| }, |
| { |
| "epoch": 21.565905915230555, |
| "grad_norm": 0.3874419331550598, |
| "learning_rate": 0.00034141292952824694, |
| "loss": 3.2164, |
| "step": 74100 |
| }, |
| { |
| "epoch": 21.580461108523522, |
| "grad_norm": 0.4723573625087738, |
| "learning_rate": 0.0003412382061735585, |
| "loss": 3.2241, |
| "step": 74150 |
| }, |
| { |
| "epoch": 21.59501630181649, |
| "grad_norm": 0.39037251472473145, |
| "learning_rate": 0.0003410634828188701, |
| "loss": 3.2313, |
| "step": 74200 |
| }, |
| { |
| "epoch": 21.609571495109456, |
| "grad_norm": 0.4305918216705322, |
| "learning_rate": 0.00034088875946418167, |
| "loss": 3.2167, |
| "step": 74250 |
| }, |
| { |
| "epoch": 21.624126688402423, |
| "grad_norm": 0.4206292927265167, |
| "learning_rate": 0.0003407140361094933, |
| "loss": 3.2183, |
| "step": 74300 |
| }, |
| { |
| "epoch": 21.63868188169539, |
| "grad_norm": 0.3807690739631653, |
| "learning_rate": 0.00034053931275480486, |
| "loss": 3.217, |
| "step": 74350 |
| }, |
| { |
| "epoch": 21.653237074988354, |
| "grad_norm": 0.45432552695274353, |
| "learning_rate": 0.00034036458940011645, |
| "loss": 3.2195, |
| "step": 74400 |
| }, |
| { |
| "epoch": 21.66779226828132, |
| "grad_norm": 0.4133349359035492, |
| "learning_rate": 0.00034018986604542804, |
| "loss": 3.2259, |
| "step": 74450 |
| }, |
| { |
| "epoch": 21.682347461574288, |
| "grad_norm": 0.38990363478660583, |
| "learning_rate": 0.00034001514269073964, |
| "loss": 3.2348, |
| "step": 74500 |
| }, |
| { |
| "epoch": 21.696902654867255, |
| "grad_norm": 0.43620920181274414, |
| "learning_rate": 0.0003398404193360512, |
| "loss": 3.2293, |
| "step": 74550 |
| }, |
| { |
| "epoch": 21.711457848160222, |
| "grad_norm": 0.4420872628688812, |
| "learning_rate": 0.00033966569598136283, |
| "loss": 3.2225, |
| "step": 74600 |
| }, |
| { |
| "epoch": 21.72601304145319, |
| "grad_norm": 0.3839336335659027, |
| "learning_rate": 0.0003394909726266744, |
| "loss": 3.2255, |
| "step": 74650 |
| }, |
| { |
| "epoch": 21.740568234746156, |
| "grad_norm": 0.42757683992385864, |
| "learning_rate": 0.000339316249271986, |
| "loss": 3.225, |
| "step": 74700 |
| }, |
| { |
| "epoch": 21.755123428039123, |
| "grad_norm": 0.39489442110061646, |
| "learning_rate": 0.00033914152591729756, |
| "loss": 3.2369, |
| "step": 74750 |
| }, |
| { |
| "epoch": 21.76967862133209, |
| "grad_norm": 0.4031055271625519, |
| "learning_rate": 0.00033896680256260915, |
| "loss": 3.2364, |
| "step": 74800 |
| }, |
| { |
| "epoch": 21.784233814625058, |
| "grad_norm": 0.4320981502532959, |
| "learning_rate": 0.00033879207920792074, |
| "loss": 3.2302, |
| "step": 74850 |
| }, |
| { |
| "epoch": 21.798789007918025, |
| "grad_norm": 0.40397652983665466, |
| "learning_rate": 0.0003386173558532324, |
| "loss": 3.2205, |
| "step": 74900 |
| }, |
| { |
| "epoch": 21.81334420121099, |
| "grad_norm": 0.39160752296447754, |
| "learning_rate": 0.00033844263249854393, |
| "loss": 3.2293, |
| "step": 74950 |
| }, |
| { |
| "epoch": 21.82789939450396, |
| "grad_norm": 0.39503806829452515, |
| "learning_rate": 0.00033826790914385553, |
| "loss": 3.2386, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.82789939450396, |
| "eval_accuracy": 0.3742340892415387, |
| "eval_loss": 3.5338709354400635, |
| "eval_runtime": 179.9943, |
| "eval_samples_per_second": 92.508, |
| "eval_steps_per_second": 5.784, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.842454587796926, |
| "grad_norm": 0.4009569585323334, |
| "learning_rate": 0.0003380931857891671, |
| "loss": 3.2343, |
| "step": 75050 |
| }, |
| { |
| "epoch": 21.857009781089893, |
| "grad_norm": 0.40219414234161377, |
| "learning_rate": 0.00033791846243447866, |
| "loss": 3.2429, |
| "step": 75100 |
| }, |
| { |
| "epoch": 21.87156497438286, |
| "grad_norm": 0.4035627841949463, |
| "learning_rate": 0.0003377437390797903, |
| "loss": 3.2337, |
| "step": 75150 |
| }, |
| { |
| "epoch": 21.886120167675827, |
| "grad_norm": 0.4356299936771393, |
| "learning_rate": 0.0003375690157251019, |
| "loss": 3.2375, |
| "step": 75200 |
| }, |
| { |
| "epoch": 21.900675360968794, |
| "grad_norm": 0.3984715938568115, |
| "learning_rate": 0.0003373942923704135, |
| "loss": 3.2308, |
| "step": 75250 |
| }, |
| { |
| "epoch": 21.91523055426176, |
| "grad_norm": 0.3872779309749603, |
| "learning_rate": 0.00033721956901572504, |
| "loss": 3.2432, |
| "step": 75300 |
| }, |
| { |
| "epoch": 21.929785747554728, |
| "grad_norm": 0.39195629954338074, |
| "learning_rate": 0.00033704484566103663, |
| "loss": 3.2495, |
| "step": 75350 |
| }, |
| { |
| "epoch": 21.944340940847695, |
| "grad_norm": 0.41744935512542725, |
| "learning_rate": 0.00033687012230634823, |
| "loss": 3.2373, |
| "step": 75400 |
| }, |
| { |
| "epoch": 21.958896134140662, |
| "grad_norm": 0.4085555374622345, |
| "learning_rate": 0.0003366953989516599, |
| "loss": 3.2461, |
| "step": 75450 |
| }, |
| { |
| "epoch": 21.97345132743363, |
| "grad_norm": 0.3933025598526001, |
| "learning_rate": 0.00033652067559697147, |
| "loss": 3.237, |
| "step": 75500 |
| }, |
| { |
| "epoch": 21.988006520726596, |
| "grad_norm": 0.3915697932243347, |
| "learning_rate": 0.000336345952242283, |
| "loss": 3.2568, |
| "step": 75550 |
| }, |
| { |
| "epoch": 22.002328830926874, |
| "grad_norm": 0.3964177668094635, |
| "learning_rate": 0.0003361712288875946, |
| "loss": 3.2169, |
| "step": 75600 |
| }, |
| { |
| "epoch": 22.01688402421984, |
| "grad_norm": 0.40610381960868835, |
| "learning_rate": 0.0003359965055329062, |
| "loss": 3.1147, |
| "step": 75650 |
| }, |
| { |
| "epoch": 22.03143921751281, |
| "grad_norm": 0.40210819244384766, |
| "learning_rate": 0.00033582178217821785, |
| "loss": 3.1422, |
| "step": 75700 |
| }, |
| { |
| "epoch": 22.045994410805775, |
| "grad_norm": 0.419974148273468, |
| "learning_rate": 0.0003356470588235294, |
| "loss": 3.1387, |
| "step": 75750 |
| }, |
| { |
| "epoch": 22.060549604098743, |
| "grad_norm": 0.4376601278781891, |
| "learning_rate": 0.000335472335468841, |
| "loss": 3.151, |
| "step": 75800 |
| }, |
| { |
| "epoch": 22.07510479739171, |
| "grad_norm": 0.42732951045036316, |
| "learning_rate": 0.0003352976121141526, |
| "loss": 3.1584, |
| "step": 75850 |
| }, |
| { |
| "epoch": 22.089659990684677, |
| "grad_norm": 0.45031747221946716, |
| "learning_rate": 0.0003351228887594641, |
| "loss": 3.169, |
| "step": 75900 |
| }, |
| { |
| "epoch": 22.104215183977644, |
| "grad_norm": 0.4007145166397095, |
| "learning_rate": 0.0003349481654047757, |
| "loss": 3.1676, |
| "step": 75950 |
| }, |
| { |
| "epoch": 22.11877037727061, |
| "grad_norm": 0.39019888639450073, |
| "learning_rate": 0.00033477344205008736, |
| "loss": 3.1712, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.11877037727061, |
| "eval_accuracy": 0.3735336265012791, |
| "eval_loss": 3.547504425048828, |
| "eval_runtime": 179.7692, |
| "eval_samples_per_second": 92.624, |
| "eval_steps_per_second": 5.791, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.133325570563578, |
| "grad_norm": 0.40922585129737854, |
| "learning_rate": 0.00033459871869539895, |
| "loss": 3.1625, |
| "step": 76050 |
| }, |
| { |
| "epoch": 22.147880763856545, |
| "grad_norm": 0.39725741744041443, |
| "learning_rate": 0.0003344239953407105, |
| "loss": 3.1695, |
| "step": 76100 |
| }, |
| { |
| "epoch": 22.162435957149512, |
| "grad_norm": 0.442618191242218, |
| "learning_rate": 0.0003342492719860221, |
| "loss": 3.1558, |
| "step": 76150 |
| }, |
| { |
| "epoch": 22.17699115044248, |
| "grad_norm": 0.3919195234775543, |
| "learning_rate": 0.0003340745486313337, |
| "loss": 3.1578, |
| "step": 76200 |
| }, |
| { |
| "epoch": 22.191546343735446, |
| "grad_norm": 0.4100443720817566, |
| "learning_rate": 0.0003338998252766453, |
| "loss": 3.168, |
| "step": 76250 |
| }, |
| { |
| "epoch": 22.206101537028413, |
| "grad_norm": 0.4021892547607422, |
| "learning_rate": 0.00033372510192195687, |
| "loss": 3.1668, |
| "step": 76300 |
| }, |
| { |
| "epoch": 22.22065673032138, |
| "grad_norm": 0.41602620482444763, |
| "learning_rate": 0.00033355037856726847, |
| "loss": 3.1793, |
| "step": 76350 |
| }, |
| { |
| "epoch": 22.235211923614347, |
| "grad_norm": 0.40192118287086487, |
| "learning_rate": 0.00033337565521258006, |
| "loss": 3.1831, |
| "step": 76400 |
| }, |
| { |
| "epoch": 22.24976711690731, |
| "grad_norm": 0.4379929304122925, |
| "learning_rate": 0.00033320093185789165, |
| "loss": 3.1783, |
| "step": 76450 |
| }, |
| { |
| "epoch": 22.264322310200278, |
| "grad_norm": 0.42289242148399353, |
| "learning_rate": 0.0003330262085032032, |
| "loss": 3.1886, |
| "step": 76500 |
| }, |
| { |
| "epoch": 22.278877503493245, |
| "grad_norm": 0.4312947392463684, |
| "learning_rate": 0.00033285148514851484, |
| "loss": 3.1825, |
| "step": 76550 |
| }, |
| { |
| "epoch": 22.293432696786212, |
| "grad_norm": 0.41763949394226074, |
| "learning_rate": 0.00033267676179382644, |
| "loss": 3.1947, |
| "step": 76600 |
| }, |
| { |
| "epoch": 22.30798789007918, |
| "grad_norm": 0.4036341905593872, |
| "learning_rate": 0.00033250203843913803, |
| "loss": 3.1833, |
| "step": 76650 |
| }, |
| { |
| "epoch": 22.322543083372146, |
| "grad_norm": 0.4084683954715729, |
| "learning_rate": 0.00033232731508444957, |
| "loss": 3.1957, |
| "step": 76700 |
| }, |
| { |
| "epoch": 22.337098276665113, |
| "grad_norm": 0.3992261290550232, |
| "learning_rate": 0.00033215259172976117, |
| "loss": 3.1843, |
| "step": 76750 |
| }, |
| { |
| "epoch": 22.35165346995808, |
| "grad_norm": 0.4478423595428467, |
| "learning_rate": 0.00033197786837507276, |
| "loss": 3.1849, |
| "step": 76800 |
| }, |
| { |
| "epoch": 22.366208663251047, |
| "grad_norm": 0.420403391122818, |
| "learning_rate": 0.0003318031450203844, |
| "loss": 3.2038, |
| "step": 76850 |
| }, |
| { |
| "epoch": 22.380763856544014, |
| "grad_norm": 0.4012890160083771, |
| "learning_rate": 0.00033162842166569595, |
| "loss": 3.189, |
| "step": 76900 |
| }, |
| { |
| "epoch": 22.39531904983698, |
| "grad_norm": 0.3808094263076782, |
| "learning_rate": 0.00033145369831100754, |
| "loss": 3.2134, |
| "step": 76950 |
| }, |
| { |
| "epoch": 22.40987424312995, |
| "grad_norm": 0.4185815155506134, |
| "learning_rate": 0.00033127897495631914, |
| "loss": 3.1945, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.40987424312995, |
| "eval_accuracy": 0.37388820638170916, |
| "eval_loss": 3.5480434894561768, |
| "eval_runtime": 179.8842, |
| "eval_samples_per_second": 92.565, |
| "eval_steps_per_second": 5.787, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.424429436422916, |
| "grad_norm": 0.40921658277511597, |
| "learning_rate": 0.0003311042516016307, |
| "loss": 3.1995, |
| "step": 77050 |
| }, |
| { |
| "epoch": 22.438984629715883, |
| "grad_norm": 0.40508735179901123, |
| "learning_rate": 0.0003309295282469423, |
| "loss": 3.2075, |
| "step": 77100 |
| }, |
| { |
| "epoch": 22.45353982300885, |
| "grad_norm": 0.41545870900154114, |
| "learning_rate": 0.0003307548048922539, |
| "loss": 3.1968, |
| "step": 77150 |
| }, |
| { |
| "epoch": 22.468095016301817, |
| "grad_norm": 0.41426533460617065, |
| "learning_rate": 0.0003305800815375655, |
| "loss": 3.1945, |
| "step": 77200 |
| }, |
| { |
| "epoch": 22.482650209594784, |
| "grad_norm": 0.4070603549480438, |
| "learning_rate": 0.00033040535818287705, |
| "loss": 3.203, |
| "step": 77250 |
| }, |
| { |
| "epoch": 22.49720540288775, |
| "grad_norm": 0.41937437653541565, |
| "learning_rate": 0.00033023063482818865, |
| "loss": 3.2106, |
| "step": 77300 |
| }, |
| { |
| "epoch": 22.511760596180718, |
| "grad_norm": 0.41913917660713196, |
| "learning_rate": 0.00033005591147350024, |
| "loss": 3.1997, |
| "step": 77350 |
| }, |
| { |
| "epoch": 22.526315789473685, |
| "grad_norm": 0.42557600140571594, |
| "learning_rate": 0.0003298811881188119, |
| "loss": 3.2032, |
| "step": 77400 |
| }, |
| { |
| "epoch": 22.540870982766652, |
| "grad_norm": 0.4355701506137848, |
| "learning_rate": 0.00032970646476412343, |
| "loss": 3.2055, |
| "step": 77450 |
| }, |
| { |
| "epoch": 22.55542617605962, |
| "grad_norm": 0.3803400695323944, |
| "learning_rate": 0.000329531741409435, |
| "loss": 3.2191, |
| "step": 77500 |
| }, |
| { |
| "epoch": 22.569981369352586, |
| "grad_norm": 0.39761683344841003, |
| "learning_rate": 0.0003293570180547466, |
| "loss": 3.2094, |
| "step": 77550 |
| }, |
| { |
| "epoch": 22.584536562645553, |
| "grad_norm": 0.41126570105552673, |
| "learning_rate": 0.0003291822947000582, |
| "loss": 3.2084, |
| "step": 77600 |
| }, |
| { |
| "epoch": 22.59909175593852, |
| "grad_norm": 0.4171556830406189, |
| "learning_rate": 0.00032900757134536975, |
| "loss": 3.208, |
| "step": 77650 |
| }, |
| { |
| "epoch": 22.613646949231487, |
| "grad_norm": 0.39230960607528687, |
| "learning_rate": 0.0003288328479906814, |
| "loss": 3.2135, |
| "step": 77700 |
| }, |
| { |
| "epoch": 22.628202142524454, |
| "grad_norm": 0.4159952700138092, |
| "learning_rate": 0.000328658124635993, |
| "loss": 3.2063, |
| "step": 77750 |
| }, |
| { |
| "epoch": 22.64275733581742, |
| "grad_norm": 0.4337736666202545, |
| "learning_rate": 0.0003284834012813046, |
| "loss": 3.2097, |
| "step": 77800 |
| }, |
| { |
| "epoch": 22.657312529110385, |
| "grad_norm": 0.4640325605869293, |
| "learning_rate": 0.00032830867792661613, |
| "loss": 3.2151, |
| "step": 77850 |
| }, |
| { |
| "epoch": 22.671867722403352, |
| "grad_norm": 0.4161394238471985, |
| "learning_rate": 0.0003281339545719277, |
| "loss": 3.2275, |
| "step": 77900 |
| }, |
| { |
| "epoch": 22.68642291569632, |
| "grad_norm": 0.41482967138290405, |
| "learning_rate": 0.0003279592312172394, |
| "loss": 3.2264, |
| "step": 77950 |
| }, |
| { |
| "epoch": 22.700978108989286, |
| "grad_norm": 0.3908606469631195, |
| "learning_rate": 0.00032778450786255097, |
| "loss": 3.2309, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.700978108989286, |
| "eval_accuracy": 0.37420458988788013, |
| "eval_loss": 3.5342824459075928, |
| "eval_runtime": 179.7395, |
| "eval_samples_per_second": 92.64, |
| "eval_steps_per_second": 5.792, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.715533302282253, |
| "grad_norm": 0.44009092450141907, |
| "learning_rate": 0.0003276097845078625, |
| "loss": 3.2251, |
| "step": 78050 |
| }, |
| { |
| "epoch": 22.73008849557522, |
| "grad_norm": 0.4078558087348938, |
| "learning_rate": 0.0003274350611531741, |
| "loss": 3.2101, |
| "step": 78100 |
| }, |
| { |
| "epoch": 22.744643688868187, |
| "grad_norm": 0.4620960056781769, |
| "learning_rate": 0.0003272603377984857, |
| "loss": 3.2185, |
| "step": 78150 |
| }, |
| { |
| "epoch": 22.759198882161154, |
| "grad_norm": 0.3999122083187103, |
| "learning_rate": 0.00032708561444379724, |
| "loss": 3.2213, |
| "step": 78200 |
| }, |
| { |
| "epoch": 22.77375407545412, |
| "grad_norm": 0.387114942073822, |
| "learning_rate": 0.0003269108910891089, |
| "loss": 3.2123, |
| "step": 78250 |
| }, |
| { |
| "epoch": 22.78830926874709, |
| "grad_norm": 0.4111201763153076, |
| "learning_rate": 0.0003267361677344205, |
| "loss": 3.2208, |
| "step": 78300 |
| }, |
| { |
| "epoch": 22.802864462040056, |
| "grad_norm": 0.4094546437263489, |
| "learning_rate": 0.0003265614443797321, |
| "loss": 3.2251, |
| "step": 78350 |
| }, |
| { |
| "epoch": 22.817419655333023, |
| "grad_norm": 0.404011070728302, |
| "learning_rate": 0.0003263867210250436, |
| "loss": 3.2301, |
| "step": 78400 |
| }, |
| { |
| "epoch": 22.83197484862599, |
| "grad_norm": 0.410820335149765, |
| "learning_rate": 0.0003262119976703552, |
| "loss": 3.2216, |
| "step": 78450 |
| }, |
| { |
| "epoch": 22.846530041918957, |
| "grad_norm": 0.3758966624736786, |
| "learning_rate": 0.00032603727431566686, |
| "loss": 3.2169, |
| "step": 78500 |
| }, |
| { |
| "epoch": 22.861085235211924, |
| "grad_norm": 0.3818444311618805, |
| "learning_rate": 0.00032586255096097845, |
| "loss": 3.2228, |
| "step": 78550 |
| }, |
| { |
| "epoch": 22.87564042850489, |
| "grad_norm": 0.41991475224494934, |
| "learning_rate": 0.00032568782760629005, |
| "loss": 3.2161, |
| "step": 78600 |
| }, |
| { |
| "epoch": 22.890195621797858, |
| "grad_norm": 0.42864012718200684, |
| "learning_rate": 0.0003255131042516016, |
| "loss": 3.2297, |
| "step": 78650 |
| }, |
| { |
| "epoch": 22.904750815090825, |
| "grad_norm": 0.40493571758270264, |
| "learning_rate": 0.0003253383808969132, |
| "loss": 3.2292, |
| "step": 78700 |
| }, |
| { |
| "epoch": 22.919306008383792, |
| "grad_norm": 0.4492877721786499, |
| "learning_rate": 0.0003251636575422248, |
| "loss": 3.2415, |
| "step": 78750 |
| }, |
| { |
| "epoch": 22.93386120167676, |
| "grad_norm": 0.4454686939716339, |
| "learning_rate": 0.0003249889341875364, |
| "loss": 3.2403, |
| "step": 78800 |
| }, |
| { |
| "epoch": 22.948416394969726, |
| "grad_norm": 0.4318472146987915, |
| "learning_rate": 0.00032481421083284796, |
| "loss": 3.2177, |
| "step": 78850 |
| }, |
| { |
| "epoch": 22.962971588262693, |
| "grad_norm": 0.40572306513786316, |
| "learning_rate": 0.00032463948747815956, |
| "loss": 3.2211, |
| "step": 78900 |
| }, |
| { |
| "epoch": 22.97752678155566, |
| "grad_norm": 0.40303680300712585, |
| "learning_rate": 0.00032446476412347115, |
| "loss": 3.2347, |
| "step": 78950 |
| }, |
| { |
| "epoch": 22.992081974848627, |
| "grad_norm": 0.3872184455394745, |
| "learning_rate": 0.0003242900407687827, |
| "loss": 3.2372, |
| "step": 79000 |
| }, |
| { |
| "epoch": 22.992081974848627, |
| "eval_accuracy": 0.3749660492996489, |
| "eval_loss": 3.5285708904266357, |
| "eval_runtime": 179.7425, |
| "eval_samples_per_second": 92.638, |
| "eval_steps_per_second": 5.792, |
| "step": 79000 |
| }, |
| { |
| "epoch": 23.006404285048905, |
| "grad_norm": 0.42881467938423157, |
| "learning_rate": 0.00032411531741409434, |
| "loss": 3.1789, |
| "step": 79050 |
| }, |
| { |
| "epoch": 23.020959478341872, |
| "grad_norm": 0.4351500868797302, |
| "learning_rate": 0.00032394059405940593, |
| "loss": 3.1258, |
| "step": 79100 |
| }, |
| { |
| "epoch": 23.03551467163484, |
| "grad_norm": 0.3753584623336792, |
| "learning_rate": 0.00032376587070471753, |
| "loss": 3.1297, |
| "step": 79150 |
| }, |
| { |
| "epoch": 23.050069864927806, |
| "grad_norm": 0.41720595955848694, |
| "learning_rate": 0.00032359114735002907, |
| "loss": 3.1342, |
| "step": 79200 |
| }, |
| { |
| "epoch": 23.064625058220773, |
| "grad_norm": 0.4204729497432709, |
| "learning_rate": 0.00032341642399534066, |
| "loss": 3.1364, |
| "step": 79250 |
| }, |
| { |
| "epoch": 23.07918025151374, |
| "grad_norm": 0.4257517158985138, |
| "learning_rate": 0.00032324170064065226, |
| "loss": 3.1494, |
| "step": 79300 |
| }, |
| { |
| "epoch": 23.093735444806708, |
| "grad_norm": 0.4221104681491852, |
| "learning_rate": 0.0003230669772859639, |
| "loss": 3.1491, |
| "step": 79350 |
| }, |
| { |
| "epoch": 23.108290638099675, |
| "grad_norm": 0.40155747532844543, |
| "learning_rate": 0.00032289225393127545, |
| "loss": 3.1436, |
| "step": 79400 |
| }, |
| { |
| "epoch": 23.12284583139264, |
| "grad_norm": 0.4350748062133789, |
| "learning_rate": 0.00032271753057658704, |
| "loss": 3.1517, |
| "step": 79450 |
| }, |
| { |
| "epoch": 23.13740102468561, |
| "grad_norm": 0.3839072585105896, |
| "learning_rate": 0.00032254280722189863, |
| "loss": 3.157, |
| "step": 79500 |
| }, |
| { |
| "epoch": 23.151956217978576, |
| "grad_norm": 0.4622083306312561, |
| "learning_rate": 0.00032236808386721023, |
| "loss": 3.1603, |
| "step": 79550 |
| }, |
| { |
| "epoch": 23.166511411271543, |
| "grad_norm": 0.40241503715515137, |
| "learning_rate": 0.00032219336051252177, |
| "loss": 3.1538, |
| "step": 79600 |
| }, |
| { |
| "epoch": 23.18106660456451, |
| "grad_norm": 0.4790761172771454, |
| "learning_rate": 0.0003220186371578334, |
| "loss": 3.1553, |
| "step": 79650 |
| }, |
| { |
| "epoch": 23.195621797857477, |
| "grad_norm": 0.402031272649765, |
| "learning_rate": 0.000321843913803145, |
| "loss": 3.1628, |
| "step": 79700 |
| }, |
| { |
| "epoch": 23.210176991150444, |
| "grad_norm": 0.4585934579372406, |
| "learning_rate": 0.0003216691904484566, |
| "loss": 3.1701, |
| "step": 79750 |
| }, |
| { |
| "epoch": 23.22473218444341, |
| "grad_norm": 0.3923730254173279, |
| "learning_rate": 0.00032149446709376815, |
| "loss": 3.1829, |
| "step": 79800 |
| }, |
| { |
| "epoch": 23.239287377736375, |
| "grad_norm": 0.4399240016937256, |
| "learning_rate": 0.00032131974373907974, |
| "loss": 3.167, |
| "step": 79850 |
| }, |
| { |
| "epoch": 23.25384257102934, |
| "grad_norm": 0.43063780665397644, |
| "learning_rate": 0.0003211450203843914, |
| "loss": 3.1584, |
| "step": 79900 |
| }, |
| { |
| "epoch": 23.26839776432231, |
| "grad_norm": 0.431403785943985, |
| "learning_rate": 0.000320970297029703, |
| "loss": 3.1723, |
| "step": 79950 |
| }, |
| { |
| "epoch": 23.282952957615276, |
| "grad_norm": 0.4274817705154419, |
| "learning_rate": 0.0003207955736750145, |
| "loss": 3.1752, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.282952957615276, |
| "eval_accuracy": 0.37391077162434844, |
| "eval_loss": 3.5465760231018066, |
| "eval_runtime": 179.7261, |
| "eval_samples_per_second": 92.647, |
| "eval_steps_per_second": 5.792, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.297508150908243, |
| "grad_norm": 0.42905303835868835, |
| "learning_rate": 0.0003206208503203261, |
| "loss": 3.1283, |
| "step": 80050 |
| }, |
| { |
| "epoch": 23.31206334420121, |
| "grad_norm": 0.41787195205688477, |
| "learning_rate": 0.0003204461269656377, |
| "loss": 3.1484, |
| "step": 80100 |
| }, |
| { |
| "epoch": 23.326618537494177, |
| "grad_norm": 0.4279998242855072, |
| "learning_rate": 0.00032027140361094925, |
| "loss": 3.1479, |
| "step": 80150 |
| }, |
| { |
| "epoch": 23.341173730787144, |
| "grad_norm": 0.45045384764671326, |
| "learning_rate": 0.0003200966802562609, |
| "loss": 3.1503, |
| "step": 80200 |
| }, |
| { |
| "epoch": 23.35572892408011, |
| "grad_norm": 0.43174025416374207, |
| "learning_rate": 0.0003199219569015725, |
| "loss": 3.1515, |
| "step": 80250 |
| }, |
| { |
| "epoch": 23.370284117373078, |
| "grad_norm": 0.4571380913257599, |
| "learning_rate": 0.0003197472335468841, |
| "loss": 3.145, |
| "step": 80300 |
| }, |
| { |
| "epoch": 23.384839310666045, |
| "grad_norm": 0.4241339862346649, |
| "learning_rate": 0.00031957251019219563, |
| "loss": 3.1444, |
| "step": 80350 |
| }, |
| { |
| "epoch": 23.399394503959012, |
| "grad_norm": 0.4476516842842102, |
| "learning_rate": 0.0003193977868375072, |
| "loss": 3.1501, |
| "step": 80400 |
| }, |
| { |
| "epoch": 23.41394969725198, |
| "grad_norm": 0.40319111943244934, |
| "learning_rate": 0.00031922306348281887, |
| "loss": 3.1557, |
| "step": 80450 |
| }, |
| { |
| "epoch": 23.428504890544946, |
| "grad_norm": 0.41085711121559143, |
| "learning_rate": 0.00031904834012813047, |
| "loss": 3.1622, |
| "step": 80500 |
| }, |
| { |
| "epoch": 23.443060083837914, |
| "grad_norm": 0.44014430046081543, |
| "learning_rate": 0.000318873616773442, |
| "loss": 3.1623, |
| "step": 80550 |
| }, |
| { |
| "epoch": 23.45761527713088, |
| "grad_norm": 0.4018969237804413, |
| "learning_rate": 0.0003186988934187536, |
| "loss": 3.1623, |
| "step": 80600 |
| }, |
| { |
| "epoch": 23.472170470423848, |
| "grad_norm": 0.4306873679161072, |
| "learning_rate": 0.0003185241700640652, |
| "loss": 3.1734, |
| "step": 80650 |
| }, |
| { |
| "epoch": 23.486725663716815, |
| "grad_norm": 0.41860219836235046, |
| "learning_rate": 0.0003183494467093768, |
| "loss": 3.1688, |
| "step": 80700 |
| }, |
| { |
| "epoch": 23.50128085700978, |
| "grad_norm": 0.41949665546417236, |
| "learning_rate": 0.0003181747233546884, |
| "loss": 3.1631, |
| "step": 80750 |
| }, |
| { |
| "epoch": 23.51583605030275, |
| "grad_norm": 0.4215621054172516, |
| "learning_rate": 0.000318, |
| "loss": 3.1708, |
| "step": 80800 |
| }, |
| { |
| "epoch": 23.530391243595716, |
| "grad_norm": 0.456132709980011, |
| "learning_rate": 0.00031782527664531157, |
| "loss": 3.1717, |
| "step": 80850 |
| }, |
| { |
| "epoch": 23.544946436888683, |
| "grad_norm": 0.47575968503952026, |
| "learning_rate": 0.00031765055329062317, |
| "loss": 3.1729, |
| "step": 80900 |
| }, |
| { |
| "epoch": 23.55950163018165, |
| "grad_norm": 0.4039624035358429, |
| "learning_rate": 0.0003174758299359347, |
| "loss": 3.178, |
| "step": 80950 |
| }, |
| { |
| "epoch": 23.574056823474617, |
| "grad_norm": 0.42520585656166077, |
| "learning_rate": 0.0003173011065812463, |
| "loss": 3.1701, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.574056823474617, |
| "eval_accuracy": 0.3734778010312081, |
| "eval_loss": 3.5532686710357666, |
| "eval_runtime": 180.8181, |
| "eval_samples_per_second": 92.087, |
| "eval_steps_per_second": 5.757, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.588612016767584, |
| "grad_norm": 0.43075016140937805, |
| "learning_rate": 0.00031712638322655795, |
| "loss": 3.1776, |
| "step": 81050 |
| }, |
| { |
| "epoch": 23.60316721006055, |
| "grad_norm": 0.39215949177742004, |
| "learning_rate": 0.00031695165987186954, |
| "loss": 3.176, |
| "step": 81100 |
| }, |
| { |
| "epoch": 23.61772240335352, |
| "grad_norm": 0.3989499807357788, |
| "learning_rate": 0.0003167769365171811, |
| "loss": 3.175, |
| "step": 81150 |
| }, |
| { |
| "epoch": 23.63227759664648, |
| "grad_norm": 0.43521571159362793, |
| "learning_rate": 0.0003166022131624927, |
| "loss": 3.1837, |
| "step": 81200 |
| }, |
| { |
| "epoch": 23.64683278993945, |
| "grad_norm": 0.4280226528644562, |
| "learning_rate": 0.00031642748980780427, |
| "loss": 3.1874, |
| "step": 81250 |
| }, |
| { |
| "epoch": 23.661387983232416, |
| "grad_norm": 0.40798208117485046, |
| "learning_rate": 0.0003162527664531159, |
| "loss": 3.1861, |
| "step": 81300 |
| }, |
| { |
| "epoch": 23.675943176525383, |
| "grad_norm": 0.4074450433254242, |
| "learning_rate": 0.00031607804309842746, |
| "loss": 3.1853, |
| "step": 81350 |
| }, |
| { |
| "epoch": 23.69049836981835, |
| "grad_norm": 0.43709275126457214, |
| "learning_rate": 0.00031590331974373905, |
| "loss": 3.1914, |
| "step": 81400 |
| }, |
| { |
| "epoch": 23.705053563111317, |
| "grad_norm": 0.4177038371562958, |
| "learning_rate": 0.00031572859638905065, |
| "loss": 3.1864, |
| "step": 81450 |
| }, |
| { |
| "epoch": 23.719608756404284, |
| "grad_norm": 0.43477728962898254, |
| "learning_rate": 0.0003155538730343622, |
| "loss": 3.1916, |
| "step": 81500 |
| }, |
| { |
| "epoch": 23.73416394969725, |
| "grad_norm": 0.4035871624946594, |
| "learning_rate": 0.0003153791496796738, |
| "loss": 3.1793, |
| "step": 81550 |
| }, |
| { |
| "epoch": 23.74871914299022, |
| "grad_norm": 0.4537198543548584, |
| "learning_rate": 0.00031520442632498543, |
| "loss": 3.2004, |
| "step": 81600 |
| }, |
| { |
| "epoch": 23.763274336283185, |
| "grad_norm": 0.45444414019584656, |
| "learning_rate": 0.000315029702970297, |
| "loss": 3.1969, |
| "step": 81650 |
| }, |
| { |
| "epoch": 23.777829529576152, |
| "grad_norm": 0.4409884214401245, |
| "learning_rate": 0.0003148549796156086, |
| "loss": 3.1884, |
| "step": 81700 |
| }, |
| { |
| "epoch": 23.79238472286912, |
| "grad_norm": 0.44495829939842224, |
| "learning_rate": 0.00031468025626092016, |
| "loss": 3.1847, |
| "step": 81750 |
| }, |
| { |
| "epoch": 23.806939916162086, |
| "grad_norm": 0.391963928937912, |
| "learning_rate": 0.00031450553290623175, |
| "loss": 3.1951, |
| "step": 81800 |
| }, |
| { |
| "epoch": 23.821495109455054, |
| "grad_norm": 0.4508795738220215, |
| "learning_rate": 0.0003143308095515434, |
| "loss": 3.2053, |
| "step": 81850 |
| }, |
| { |
| "epoch": 23.83605030274802, |
| "grad_norm": 0.45079120993614197, |
| "learning_rate": 0.000314156086196855, |
| "loss": 3.194, |
| "step": 81900 |
| }, |
| { |
| "epoch": 23.850605496040988, |
| "grad_norm": 0.4412509500980377, |
| "learning_rate": 0.00031398136284216654, |
| "loss": 3.1898, |
| "step": 81950 |
| }, |
| { |
| "epoch": 23.865160689333955, |
| "grad_norm": 0.4032447040081024, |
| "learning_rate": 0.00031380663948747813, |
| "loss": 3.1977, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.865160689333955, |
| "eval_accuracy": 0.37412596412055904, |
| "eval_loss": 3.540754795074463, |
| "eval_runtime": 178.8063, |
| "eval_samples_per_second": 93.123, |
| "eval_steps_per_second": 5.822, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.879715882626922, |
| "grad_norm": 0.41658344864845276, |
| "learning_rate": 0.0003136319161327897, |
| "loss": 3.1999, |
| "step": 82050 |
| }, |
| { |
| "epoch": 23.89427107591989, |
| "grad_norm": 0.42867329716682434, |
| "learning_rate": 0.00031345719277810127, |
| "loss": 3.1996, |
| "step": 82100 |
| }, |
| { |
| "epoch": 23.908826269212856, |
| "grad_norm": 0.4257943034172058, |
| "learning_rate": 0.0003132824694234129, |
| "loss": 3.1991, |
| "step": 82150 |
| }, |
| { |
| "epoch": 23.923381462505823, |
| "grad_norm": 0.4132334887981415, |
| "learning_rate": 0.0003131077460687245, |
| "loss": 3.203, |
| "step": 82200 |
| }, |
| { |
| "epoch": 23.93793665579879, |
| "grad_norm": 0.4158965051174164, |
| "learning_rate": 0.0003129330227140361, |
| "loss": 3.1978, |
| "step": 82250 |
| }, |
| { |
| "epoch": 23.952491849091757, |
| "grad_norm": 0.44557440280914307, |
| "learning_rate": 0.00031275829935934764, |
| "loss": 3.197, |
| "step": 82300 |
| }, |
| { |
| "epoch": 23.967047042384724, |
| "grad_norm": 0.4208097457885742, |
| "learning_rate": 0.00031258357600465924, |
| "loss": 3.2107, |
| "step": 82350 |
| }, |
| { |
| "epoch": 23.98160223567769, |
| "grad_norm": 0.41734635829925537, |
| "learning_rate": 0.00031240885264997083, |
| "loss": 3.2195, |
| "step": 82400 |
| }, |
| { |
| "epoch": 23.99615742897066, |
| "grad_norm": 0.4121745228767395, |
| "learning_rate": 0.0003122341292952825, |
| "loss": 3.2018, |
| "step": 82450 |
| }, |
| { |
| "epoch": 24.010770843036795, |
| "grad_norm": 0.43925735354423523, |
| "learning_rate": 0.000312059405940594, |
| "loss": 3.2022, |
| "step": 82500 |
| }, |
| { |
| "epoch": 24.025326036329762, |
| "grad_norm": 0.43141815066337585, |
| "learning_rate": 0.0003118846825859056, |
| "loss": 3.1254, |
| "step": 82550 |
| }, |
| { |
| "epoch": 24.03988122962273, |
| "grad_norm": 0.44609951972961426, |
| "learning_rate": 0.0003117099592312172, |
| "loss": 3.1231, |
| "step": 82600 |
| }, |
| { |
| "epoch": 24.054436422915696, |
| "grad_norm": 0.4301334619522095, |
| "learning_rate": 0.0003115352358765288, |
| "loss": 3.1492, |
| "step": 82650 |
| }, |
| { |
| "epoch": 24.068991616208663, |
| "grad_norm": 0.45953306555747986, |
| "learning_rate": 0.0003113605125218404, |
| "loss": 3.1395, |
| "step": 82700 |
| }, |
| { |
| "epoch": 24.08354680950163, |
| "grad_norm": 0.4030922055244446, |
| "learning_rate": 0.000311185789167152, |
| "loss": 3.1388, |
| "step": 82750 |
| }, |
| { |
| "epoch": 24.098102002794597, |
| "grad_norm": 0.42163464426994324, |
| "learning_rate": 0.0003110110658124636, |
| "loss": 3.1503, |
| "step": 82800 |
| }, |
| { |
| "epoch": 24.112657196087564, |
| "grad_norm": 0.42299437522888184, |
| "learning_rate": 0.0003108363424577752, |
| "loss": 3.1515, |
| "step": 82850 |
| }, |
| { |
| "epoch": 24.12721238938053, |
| "grad_norm": 0.4112618863582611, |
| "learning_rate": 0.0003106616191030867, |
| "loss": 3.1419, |
| "step": 82900 |
| }, |
| { |
| "epoch": 24.1417675826735, |
| "grad_norm": 0.4395235776901245, |
| "learning_rate": 0.0003104868957483983, |
| "loss": 3.1511, |
| "step": 82950 |
| }, |
| { |
| "epoch": 24.156322775966466, |
| "grad_norm": 0.4338419735431671, |
| "learning_rate": 0.00031031217239370996, |
| "loss": 3.154, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.156322775966466, |
| "eval_accuracy": 0.3735538411978101, |
| "eval_loss": 3.55212664604187, |
| "eval_runtime": 178.7314, |
| "eval_samples_per_second": 93.162, |
| "eval_steps_per_second": 5.824, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.170877969259433, |
| "grad_norm": 0.410587340593338, |
| "learning_rate": 0.00031013744903902156, |
| "loss": 3.1564, |
| "step": 83050 |
| }, |
| { |
| "epoch": 24.1854331625524, |
| "grad_norm": 0.4255504012107849, |
| "learning_rate": 0.0003099627256843331, |
| "loss": 3.16, |
| "step": 83100 |
| }, |
| { |
| "epoch": 24.199988355845367, |
| "grad_norm": 0.44280168414115906, |
| "learning_rate": 0.0003097880023296447, |
| "loss": 3.1635, |
| "step": 83150 |
| }, |
| { |
| "epoch": 24.214543549138334, |
| "grad_norm": 0.4021797180175781, |
| "learning_rate": 0.0003096132789749563, |
| "loss": 3.1546, |
| "step": 83200 |
| }, |
| { |
| "epoch": 24.2290987424313, |
| "grad_norm": 0.41863852739334106, |
| "learning_rate": 0.00030943855562026794, |
| "loss": 3.1633, |
| "step": 83250 |
| }, |
| { |
| "epoch": 24.243653935724268, |
| "grad_norm": 0.42614641785621643, |
| "learning_rate": 0.0003092638322655795, |
| "loss": 3.16, |
| "step": 83300 |
| }, |
| { |
| "epoch": 24.258209129017235, |
| "grad_norm": 0.42498284578323364, |
| "learning_rate": 0.00030908910891089107, |
| "loss": 3.1721, |
| "step": 83350 |
| }, |
| { |
| "epoch": 24.2727643223102, |
| "grad_norm": 0.44240236282348633, |
| "learning_rate": 0.00030891438555620266, |
| "loss": 3.1735, |
| "step": 83400 |
| }, |
| { |
| "epoch": 24.287319515603166, |
| "grad_norm": 0.4178884029388428, |
| "learning_rate": 0.0003087396622015142, |
| "loss": 3.1825, |
| "step": 83450 |
| }, |
| { |
| "epoch": 24.301874708896133, |
| "grad_norm": 0.4449654519557953, |
| "learning_rate": 0.0003085649388468258, |
| "loss": 3.1667, |
| "step": 83500 |
| }, |
| { |
| "epoch": 24.3164299021891, |
| "grad_norm": 0.43948787450790405, |
| "learning_rate": 0.00030839021549213745, |
| "loss": 3.17, |
| "step": 83550 |
| }, |
| { |
| "epoch": 24.330985095482067, |
| "grad_norm": 0.3911823630332947, |
| "learning_rate": 0.00030821549213744904, |
| "loss": 3.1732, |
| "step": 83600 |
| }, |
| { |
| "epoch": 24.345540288775034, |
| "grad_norm": 0.427975058555603, |
| "learning_rate": 0.0003080407687827606, |
| "loss": 3.1786, |
| "step": 83650 |
| }, |
| { |
| "epoch": 24.360095482068, |
| "grad_norm": 0.45086926221847534, |
| "learning_rate": 0.0003078660454280722, |
| "loss": 3.1817, |
| "step": 83700 |
| }, |
| { |
| "epoch": 24.374650675360968, |
| "grad_norm": 0.4079881012439728, |
| "learning_rate": 0.00030769132207338377, |
| "loss": 3.1748, |
| "step": 83750 |
| }, |
| { |
| "epoch": 24.389205868653935, |
| "grad_norm": 0.43696466088294983, |
| "learning_rate": 0.00030751659871869536, |
| "loss": 3.1725, |
| "step": 83800 |
| }, |
| { |
| "epoch": 24.403761061946902, |
| "grad_norm": 0.404164582490921, |
| "learning_rate": 0.00030734187536400696, |
| "loss": 3.1947, |
| "step": 83850 |
| }, |
| { |
| "epoch": 24.41831625523987, |
| "grad_norm": 0.4178932309150696, |
| "learning_rate": 0.00030716715200931855, |
| "loss": 3.1885, |
| "step": 83900 |
| }, |
| { |
| "epoch": 24.432871448532836, |
| "grad_norm": 0.4175661504268646, |
| "learning_rate": 0.00030699242865463015, |
| "loss": 3.178, |
| "step": 83950 |
| }, |
| { |
| "epoch": 24.447426641825803, |
| "grad_norm": 0.4395267367362976, |
| "learning_rate": 0.00030681770529994174, |
| "loss": 3.1835, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.447426641825803, |
| "eval_accuracy": 0.3743471505093457, |
| "eval_loss": 3.5441832542419434, |
| "eval_runtime": 178.6435, |
| "eval_samples_per_second": 93.208, |
| "eval_steps_per_second": 5.827, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.46198183511877, |
| "grad_norm": 0.4422329068183899, |
| "learning_rate": 0.0003066429819452533, |
| "loss": 3.1938, |
| "step": 84050 |
| }, |
| { |
| "epoch": 24.476537028411737, |
| "grad_norm": 0.40939897298812866, |
| "learning_rate": 0.00030646825859056493, |
| "loss": 3.1864, |
| "step": 84100 |
| }, |
| { |
| "epoch": 24.491092221704704, |
| "grad_norm": 0.4184223413467407, |
| "learning_rate": 0.0003062935352358765, |
| "loss": 3.1853, |
| "step": 84150 |
| }, |
| { |
| "epoch": 24.50564741499767, |
| "grad_norm": 0.4515139162540436, |
| "learning_rate": 0.0003061188118811881, |
| "loss": 3.1952, |
| "step": 84200 |
| }, |
| { |
| "epoch": 24.52020260829064, |
| "grad_norm": 0.4533407390117645, |
| "learning_rate": 0.00030594408852649966, |
| "loss": 3.1857, |
| "step": 84250 |
| }, |
| { |
| "epoch": 24.534757801583606, |
| "grad_norm": 0.40750402212142944, |
| "learning_rate": 0.00030576936517181125, |
| "loss": 3.1891, |
| "step": 84300 |
| }, |
| { |
| "epoch": 24.549312994876573, |
| "grad_norm": 0.42680811882019043, |
| "learning_rate": 0.00030559464181712285, |
| "loss": 3.1951, |
| "step": 84350 |
| }, |
| { |
| "epoch": 24.56386818816954, |
| "grad_norm": 0.41157272458076477, |
| "learning_rate": 0.0003054199184624345, |
| "loss": 3.1828, |
| "step": 84400 |
| }, |
| { |
| "epoch": 24.578423381462507, |
| "grad_norm": 0.45721128582954407, |
| "learning_rate": 0.00030524519510774604, |
| "loss": 3.1925, |
| "step": 84450 |
| }, |
| { |
| "epoch": 24.592978574755474, |
| "grad_norm": 0.43095633387565613, |
| "learning_rate": 0.00030507047175305763, |
| "loss": 3.2138, |
| "step": 84500 |
| }, |
| { |
| "epoch": 24.60753376804844, |
| "grad_norm": 0.4171367585659027, |
| "learning_rate": 0.0003048957483983692, |
| "loss": 3.1886, |
| "step": 84550 |
| }, |
| { |
| "epoch": 24.622088961341408, |
| "grad_norm": 0.4192904531955719, |
| "learning_rate": 0.00030472102504368076, |
| "loss": 3.1885, |
| "step": 84600 |
| }, |
| { |
| "epoch": 24.636644154634375, |
| "grad_norm": 0.42844662070274353, |
| "learning_rate": 0.0003045463016889924, |
| "loss": 3.1945, |
| "step": 84650 |
| }, |
| { |
| "epoch": 24.651199347927342, |
| "grad_norm": 0.40473800897598267, |
| "learning_rate": 0.000304371578334304, |
| "loss": 3.1873, |
| "step": 84700 |
| }, |
| { |
| "epoch": 24.66575454122031, |
| "grad_norm": 0.4121386408805847, |
| "learning_rate": 0.0003041968549796156, |
| "loss": 3.2009, |
| "step": 84750 |
| }, |
| { |
| "epoch": 24.680309734513273, |
| "grad_norm": 0.4244219958782196, |
| "learning_rate": 0.00030402213162492714, |
| "loss": 3.2004, |
| "step": 84800 |
| }, |
| { |
| "epoch": 24.69486492780624, |
| "grad_norm": 0.42446020245552063, |
| "learning_rate": 0.00030384740827023874, |
| "loss": 3.1985, |
| "step": 84850 |
| }, |
| { |
| "epoch": 24.709420121099207, |
| "grad_norm": 0.40833067893981934, |
| "learning_rate": 0.00030367268491555033, |
| "loss": 3.1967, |
| "step": 84900 |
| }, |
| { |
| "epoch": 24.723975314392174, |
| "grad_norm": 0.41140016913414, |
| "learning_rate": 0.000303497961560862, |
| "loss": 3.1969, |
| "step": 84950 |
| }, |
| { |
| "epoch": 24.73853050768514, |
| "grad_norm": 0.4671842157840729, |
| "learning_rate": 0.00030332323820617357, |
| "loss": 3.205, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.73853050768514, |
| "eval_accuracy": 0.37468727453121004, |
| "eval_loss": 3.537564754486084, |
| "eval_runtime": 178.3167, |
| "eval_samples_per_second": 93.379, |
| "eval_steps_per_second": 5.838, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.753085700978108, |
| "grad_norm": 0.4072178304195404, |
| "learning_rate": 0.0003031485148514851, |
| "loss": 3.1955, |
| "step": 85050 |
| }, |
| { |
| "epoch": 24.767640894271075, |
| "grad_norm": 0.4268096387386322, |
| "learning_rate": 0.0003029737914967967, |
| "loss": 3.2029, |
| "step": 85100 |
| }, |
| { |
| "epoch": 24.782196087564042, |
| "grad_norm": 0.445999413728714, |
| "learning_rate": 0.0003027990681421083, |
| "loss": 3.2048, |
| "step": 85150 |
| }, |
| { |
| "epoch": 24.79675128085701, |
| "grad_norm": 0.45961517095565796, |
| "learning_rate": 0.00030262434478741984, |
| "loss": 3.2095, |
| "step": 85200 |
| }, |
| { |
| "epoch": 24.811306474149976, |
| "grad_norm": 0.4011656641960144, |
| "learning_rate": 0.0003024496214327315, |
| "loss": 3.2021, |
| "step": 85250 |
| }, |
| { |
| "epoch": 24.825861667442943, |
| "grad_norm": 0.4256271719932556, |
| "learning_rate": 0.0003022748980780431, |
| "loss": 3.2095, |
| "step": 85300 |
| }, |
| { |
| "epoch": 24.84041686073591, |
| "grad_norm": 0.4334579408168793, |
| "learning_rate": 0.0003021001747233547, |
| "loss": 3.2105, |
| "step": 85350 |
| }, |
| { |
| "epoch": 24.854972054028877, |
| "grad_norm": 0.4193101227283478, |
| "learning_rate": 0.0003019254513686662, |
| "loss": 3.2101, |
| "step": 85400 |
| }, |
| { |
| "epoch": 24.869527247321844, |
| "grad_norm": 0.4495272934436798, |
| "learning_rate": 0.0003017507280139778, |
| "loss": 3.2186, |
| "step": 85450 |
| }, |
| { |
| "epoch": 24.88408244061481, |
| "grad_norm": 0.4173179566860199, |
| "learning_rate": 0.00030157600465928946, |
| "loss": 3.2105, |
| "step": 85500 |
| }, |
| { |
| "epoch": 24.89863763390778, |
| "grad_norm": 0.43967458605766296, |
| "learning_rate": 0.00030140128130460106, |
| "loss": 3.1966, |
| "step": 85550 |
| }, |
| { |
| "epoch": 24.913192827200746, |
| "grad_norm": 0.3922642767429352, |
| "learning_rate": 0.0003012265579499126, |
| "loss": 3.2005, |
| "step": 85600 |
| }, |
| { |
| "epoch": 24.927748020493713, |
| "grad_norm": 0.40184563398361206, |
| "learning_rate": 0.0003010518345952242, |
| "loss": 3.2247, |
| "step": 85650 |
| }, |
| { |
| "epoch": 24.94230321378668, |
| "grad_norm": 0.44088226556777954, |
| "learning_rate": 0.0003008771112405358, |
| "loss": 3.2217, |
| "step": 85700 |
| }, |
| { |
| "epoch": 24.956858407079647, |
| "grad_norm": 0.4002504348754883, |
| "learning_rate": 0.0003007023878858473, |
| "loss": 3.2126, |
| "step": 85750 |
| }, |
| { |
| "epoch": 24.971413600372614, |
| "grad_norm": 0.45589736104011536, |
| "learning_rate": 0.000300527664531159, |
| "loss": 3.2148, |
| "step": 85800 |
| }, |
| { |
| "epoch": 24.98596879366558, |
| "grad_norm": 0.41425585746765137, |
| "learning_rate": 0.00030035294117647057, |
| "loss": 3.2123, |
| "step": 85850 |
| }, |
| { |
| "epoch": 25.00029110386586, |
| "grad_norm": 0.460923433303833, |
| "learning_rate": 0.00030017821782178216, |
| "loss": 3.2125, |
| "step": 85900 |
| }, |
| { |
| "epoch": 25.014846297158826, |
| "grad_norm": 0.4303196370601654, |
| "learning_rate": 0.00030000349446709376, |
| "loss": 3.1173, |
| "step": 85950 |
| }, |
| { |
| "epoch": 25.029401490451793, |
| "grad_norm": 0.42220667004585266, |
| "learning_rate": 0.00029982877111240535, |
| "loss": 3.1135, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.029401490451793, |
| "eval_accuracy": 0.3740235978375446, |
| "eval_loss": 3.548854112625122, |
| "eval_runtime": 178.4682, |
| "eval_samples_per_second": 93.3, |
| "eval_steps_per_second": 5.833, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.04395668374476, |
| "grad_norm": 0.43721678853034973, |
| "learning_rate": 0.00029965404775771694, |
| "loss": 3.1255, |
| "step": 86050 |
| }, |
| { |
| "epoch": 25.058511877037727, |
| "grad_norm": 0.4440969228744507, |
| "learning_rate": 0.0002994793244030285, |
| "loss": 3.1214, |
| "step": 86100 |
| }, |
| { |
| "epoch": 25.073067070330694, |
| "grad_norm": 0.4456615746021271, |
| "learning_rate": 0.00029930460104834013, |
| "loss": 3.1181, |
| "step": 86150 |
| }, |
| { |
| "epoch": 25.08762226362366, |
| "grad_norm": 0.4187942147254944, |
| "learning_rate": 0.0002991298776936517, |
| "loss": 3.1234, |
| "step": 86200 |
| }, |
| { |
| "epoch": 25.10217745691663, |
| "grad_norm": 0.4579191207885742, |
| "learning_rate": 0.00029895515433896327, |
| "loss": 3.1356, |
| "step": 86250 |
| }, |
| { |
| "epoch": 25.116732650209595, |
| "grad_norm": 0.4180799722671509, |
| "learning_rate": 0.00029878043098427486, |
| "loss": 3.1369, |
| "step": 86300 |
| }, |
| { |
| "epoch": 25.131287843502562, |
| "grad_norm": 0.43055835366249084, |
| "learning_rate": 0.00029860570762958646, |
| "loss": 3.1436, |
| "step": 86350 |
| }, |
| { |
| "epoch": 25.14584303679553, |
| "grad_norm": 0.44100478291511536, |
| "learning_rate": 0.00029843098427489805, |
| "loss": 3.1481, |
| "step": 86400 |
| }, |
| { |
| "epoch": 25.160398230088497, |
| "grad_norm": 0.44051438570022583, |
| "learning_rate": 0.00029825626092020964, |
| "loss": 3.1397, |
| "step": 86450 |
| }, |
| { |
| "epoch": 25.174953423381464, |
| "grad_norm": 0.4257868528366089, |
| "learning_rate": 0.00029808153756552124, |
| "loss": 3.139, |
| "step": 86500 |
| }, |
| { |
| "epoch": 25.18950861667443, |
| "grad_norm": 0.4832608103752136, |
| "learning_rate": 0.00029790681421083283, |
| "loss": 3.1449, |
| "step": 86550 |
| }, |
| { |
| "epoch": 25.204063809967398, |
| "grad_norm": 0.44304877519607544, |
| "learning_rate": 0.00029773209085614443, |
| "loss": 3.1539, |
| "step": 86600 |
| }, |
| { |
| "epoch": 25.218619003260365, |
| "grad_norm": 0.4223651587963104, |
| "learning_rate": 0.00029755736750145597, |
| "loss": 3.1409, |
| "step": 86650 |
| }, |
| { |
| "epoch": 25.233174196553332, |
| "grad_norm": 0.42087194323539734, |
| "learning_rate": 0.0002973826441467676, |
| "loss": 3.1611, |
| "step": 86700 |
| }, |
| { |
| "epoch": 25.2477293898463, |
| "grad_norm": 0.4258721172809601, |
| "learning_rate": 0.00029720792079207916, |
| "loss": 3.1438, |
| "step": 86750 |
| }, |
| { |
| "epoch": 25.262284583139262, |
| "grad_norm": 0.43097466230392456, |
| "learning_rate": 0.00029703319743739075, |
| "loss": 3.1442, |
| "step": 86800 |
| }, |
| { |
| "epoch": 25.27683977643223, |
| "grad_norm": 0.44095996022224426, |
| "learning_rate": 0.00029685847408270234, |
| "loss": 3.1555, |
| "step": 86850 |
| }, |
| { |
| "epoch": 25.291394969725197, |
| "grad_norm": 0.4092617332935333, |
| "learning_rate": 0.00029668375072801394, |
| "loss": 3.16, |
| "step": 86900 |
| }, |
| { |
| "epoch": 25.305950163018164, |
| "grad_norm": 0.42463231086730957, |
| "learning_rate": 0.00029650902737332553, |
| "loss": 3.1714, |
| "step": 86950 |
| }, |
| { |
| "epoch": 25.32050535631113, |
| "grad_norm": 0.4192950427532196, |
| "learning_rate": 0.00029633430401863713, |
| "loss": 3.1552, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.32050535631113, |
| "eval_accuracy": 0.3736667849383117, |
| "eval_loss": 3.5517146587371826, |
| "eval_runtime": 178.1632, |
| "eval_samples_per_second": 93.459, |
| "eval_steps_per_second": 5.843, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.335060549604098, |
| "grad_norm": 0.4604697823524475, |
| "learning_rate": 0.0002961595806639487, |
| "loss": 3.1667, |
| "step": 87050 |
| }, |
| { |
| "epoch": 25.349615742897065, |
| "grad_norm": 0.4282155930995941, |
| "learning_rate": 0.0002959848573092603, |
| "loss": 3.1786, |
| "step": 87100 |
| }, |
| { |
| "epoch": 25.364170936190032, |
| "grad_norm": 0.43425947427749634, |
| "learning_rate": 0.0002958101339545719, |
| "loss": 3.1807, |
| "step": 87150 |
| }, |
| { |
| "epoch": 25.378726129483, |
| "grad_norm": 0.4362315833568573, |
| "learning_rate": 0.0002956354105998835, |
| "loss": 3.1767, |
| "step": 87200 |
| }, |
| { |
| "epoch": 25.393281322775966, |
| "grad_norm": 0.44421157240867615, |
| "learning_rate": 0.0002954606872451951, |
| "loss": 3.1611, |
| "step": 87250 |
| }, |
| { |
| "epoch": 25.407836516068933, |
| "grad_norm": 0.4191991984844208, |
| "learning_rate": 0.0002952859638905067, |
| "loss": 3.1799, |
| "step": 87300 |
| }, |
| { |
| "epoch": 25.4223917093619, |
| "grad_norm": 0.428579717874527, |
| "learning_rate": 0.00029511124053581823, |
| "loss": 3.1655, |
| "step": 87350 |
| }, |
| { |
| "epoch": 25.436946902654867, |
| "grad_norm": 0.40432706475257874, |
| "learning_rate": 0.0002949365171811299, |
| "loss": 3.1687, |
| "step": 87400 |
| }, |
| { |
| "epoch": 25.451502095947834, |
| "grad_norm": 0.4361858069896698, |
| "learning_rate": 0.0002947617938264414, |
| "loss": 3.1667, |
| "step": 87450 |
| }, |
| { |
| "epoch": 25.4660572892408, |
| "grad_norm": 0.44031381607055664, |
| "learning_rate": 0.000294587070471753, |
| "loss": 3.1805, |
| "step": 87500 |
| }, |
| { |
| "epoch": 25.48061248253377, |
| "grad_norm": 0.4382694661617279, |
| "learning_rate": 0.0002944123471170646, |
| "loss": 3.1793, |
| "step": 87550 |
| }, |
| { |
| "epoch": 25.495167675826735, |
| "grad_norm": 0.43290427327156067, |
| "learning_rate": 0.0002942376237623762, |
| "loss": 3.1917, |
| "step": 87600 |
| }, |
| { |
| "epoch": 25.509722869119702, |
| "grad_norm": 0.4313948452472687, |
| "learning_rate": 0.0002940629004076878, |
| "loss": 3.1664, |
| "step": 87650 |
| }, |
| { |
| "epoch": 25.52427806241267, |
| "grad_norm": 0.40133073925971985, |
| "learning_rate": 0.0002938881770529994, |
| "loss": 3.1776, |
| "step": 87700 |
| }, |
| { |
| "epoch": 25.538833255705637, |
| "grad_norm": 0.468847393989563, |
| "learning_rate": 0.000293713453698311, |
| "loss": 3.189, |
| "step": 87750 |
| }, |
| { |
| "epoch": 25.553388448998604, |
| "grad_norm": 0.4637649655342102, |
| "learning_rate": 0.0002935387303436226, |
| "loss": 3.1765, |
| "step": 87800 |
| }, |
| { |
| "epoch": 25.56794364229157, |
| "grad_norm": 0.43740376830101013, |
| "learning_rate": 0.0002933640069889342, |
| "loss": 3.1829, |
| "step": 87850 |
| }, |
| { |
| "epoch": 25.582498835584538, |
| "grad_norm": 0.44679924845695496, |
| "learning_rate": 0.0002931892836342457, |
| "loss": 3.1873, |
| "step": 87900 |
| }, |
| { |
| "epoch": 25.597054028877505, |
| "grad_norm": 0.42557457089424133, |
| "learning_rate": 0.00029301456027955736, |
| "loss": 3.176, |
| "step": 87950 |
| }, |
| { |
| "epoch": 25.611609222170472, |
| "grad_norm": 0.4091741144657135, |
| "learning_rate": 0.0002928398369248689, |
| "loss": 3.1777, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.611609222170472, |
| "eval_accuracy": 0.3743444473813212, |
| "eval_loss": 3.5408666133880615, |
| "eval_runtime": 178.3812, |
| "eval_samples_per_second": 93.345, |
| "eval_steps_per_second": 5.836, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.62616441546344, |
| "grad_norm": 0.4278210997581482, |
| "learning_rate": 0.0002926651135701805, |
| "loss": 3.1852, |
| "step": 88050 |
| }, |
| { |
| "epoch": 25.640719608756406, |
| "grad_norm": 0.41783830523490906, |
| "learning_rate": 0.00029249039021549215, |
| "loss": 3.19, |
| "step": 88100 |
| }, |
| { |
| "epoch": 25.65527480204937, |
| "grad_norm": 0.4398443102836609, |
| "learning_rate": 0.0002923156668608037, |
| "loss": 3.1858, |
| "step": 88150 |
| }, |
| { |
| "epoch": 25.669829995342337, |
| "grad_norm": 0.4523797035217285, |
| "learning_rate": 0.0002921409435061153, |
| "loss": 3.1921, |
| "step": 88200 |
| }, |
| { |
| "epoch": 25.684385188635304, |
| "grad_norm": 0.44268232583999634, |
| "learning_rate": 0.0002919662201514269, |
| "loss": 3.1922, |
| "step": 88250 |
| }, |
| { |
| "epoch": 25.69894038192827, |
| "grad_norm": 0.41883960366249084, |
| "learning_rate": 0.00029179149679673847, |
| "loss": 3.1956, |
| "step": 88300 |
| }, |
| { |
| "epoch": 25.713495575221238, |
| "grad_norm": 0.43059247732162476, |
| "learning_rate": 0.00029161677344205007, |
| "loss": 3.1949, |
| "step": 88350 |
| }, |
| { |
| "epoch": 25.728050768514205, |
| "grad_norm": 0.43987998366355896, |
| "learning_rate": 0.00029144205008736166, |
| "loss": 3.1903, |
| "step": 88400 |
| }, |
| { |
| "epoch": 25.742605961807172, |
| "grad_norm": 0.4239983558654785, |
| "learning_rate": 0.00029126732673267325, |
| "loss": 3.1882, |
| "step": 88450 |
| }, |
| { |
| "epoch": 25.75716115510014, |
| "grad_norm": 0.4204862713813782, |
| "learning_rate": 0.00029109260337798485, |
| "loss": 3.1831, |
| "step": 88500 |
| }, |
| { |
| "epoch": 25.771716348393106, |
| "grad_norm": 0.4111863672733307, |
| "learning_rate": 0.00029091788002329644, |
| "loss": 3.1927, |
| "step": 88550 |
| }, |
| { |
| "epoch": 25.786271541686073, |
| "grad_norm": 0.42216384410858154, |
| "learning_rate": 0.000290743156668608, |
| "loss": 3.1872, |
| "step": 88600 |
| }, |
| { |
| "epoch": 25.80082673497904, |
| "grad_norm": 0.4510778486728668, |
| "learning_rate": 0.00029056843331391963, |
| "loss": 3.2054, |
| "step": 88650 |
| }, |
| { |
| "epoch": 25.815381928272007, |
| "grad_norm": 0.4231010675430298, |
| "learning_rate": 0.00029039370995923117, |
| "loss": 3.1947, |
| "step": 88700 |
| }, |
| { |
| "epoch": 25.829937121564974, |
| "grad_norm": 0.4821763336658478, |
| "learning_rate": 0.00029021898660454277, |
| "loss": 3.1927, |
| "step": 88750 |
| }, |
| { |
| "epoch": 25.84449231485794, |
| "grad_norm": 0.4131713807582855, |
| "learning_rate": 0.00029004426324985436, |
| "loss": 3.2026, |
| "step": 88800 |
| }, |
| { |
| "epoch": 25.85904750815091, |
| "grad_norm": 0.42869827151298523, |
| "learning_rate": 0.00028986953989516595, |
| "loss": 3.1973, |
| "step": 88850 |
| }, |
| { |
| "epoch": 25.873602701443875, |
| "grad_norm": 0.41724643111228943, |
| "learning_rate": 0.00028969481654047755, |
| "loss": 3.2063, |
| "step": 88900 |
| }, |
| { |
| "epoch": 25.888157894736842, |
| "grad_norm": 0.43880873918533325, |
| "learning_rate": 0.00028952009318578914, |
| "loss": 3.1844, |
| "step": 88950 |
| }, |
| { |
| "epoch": 25.90271308802981, |
| "grad_norm": 0.437773734331131, |
| "learning_rate": 0.00028934536983110074, |
| "loss": 3.2144, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.90271308802981, |
| "eval_accuracy": 0.3750591309255358, |
| "eval_loss": 3.5333504676818848, |
| "eval_runtime": 178.1036, |
| "eval_samples_per_second": 93.491, |
| "eval_steps_per_second": 5.845, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.917268281322777, |
| "grad_norm": 0.44758129119873047, |
| "learning_rate": 0.00028917064647641233, |
| "loss": 3.2018, |
| "step": 89050 |
| }, |
| { |
| "epoch": 25.931823474615744, |
| "grad_norm": 0.43168768286705017, |
| "learning_rate": 0.0002889959231217239, |
| "loss": 3.1961, |
| "step": 89100 |
| }, |
| { |
| "epoch": 25.94637866790871, |
| "grad_norm": 0.43598511815071106, |
| "learning_rate": 0.0002888211997670355, |
| "loss": 3.2, |
| "step": 89150 |
| }, |
| { |
| "epoch": 25.960933861201678, |
| "grad_norm": 0.4103822708129883, |
| "learning_rate": 0.0002886464764123471, |
| "loss": 3.2047, |
| "step": 89200 |
| }, |
| { |
| "epoch": 25.975489054494645, |
| "grad_norm": 0.4364749789237976, |
| "learning_rate": 0.0002884717530576587, |
| "loss": 3.2125, |
| "step": 89250 |
| }, |
| { |
| "epoch": 25.990044247787612, |
| "grad_norm": 0.42849549651145935, |
| "learning_rate": 0.00028829702970297025, |
| "loss": 3.2003, |
| "step": 89300 |
| }, |
| { |
| "epoch": 26.00436655798789, |
| "grad_norm": 0.4636796712875366, |
| "learning_rate": 0.0002881223063482819, |
| "loss": 3.1681, |
| "step": 89350 |
| }, |
| { |
| "epoch": 26.018921751280857, |
| "grad_norm": 0.46553125977516174, |
| "learning_rate": 0.00028794758299359344, |
| "loss": 3.108, |
| "step": 89400 |
| }, |
| { |
| "epoch": 26.033476944573824, |
| "grad_norm": 0.4672216475009918, |
| "learning_rate": 0.00028777285963890503, |
| "loss": 3.1076, |
| "step": 89450 |
| }, |
| { |
| "epoch": 26.04803213786679, |
| "grad_norm": 0.42928624153137207, |
| "learning_rate": 0.0002875981362842166, |
| "loss": 3.1082, |
| "step": 89500 |
| }, |
| { |
| "epoch": 26.062587331159758, |
| "grad_norm": 0.43934693932533264, |
| "learning_rate": 0.0002874234129295282, |
| "loss": 3.1121, |
| "step": 89550 |
| }, |
| { |
| "epoch": 26.077142524452725, |
| "grad_norm": 0.4280041456222534, |
| "learning_rate": 0.0002872486895748398, |
| "loss": 3.1098, |
| "step": 89600 |
| }, |
| { |
| "epoch": 26.091697717745692, |
| "grad_norm": 0.43326443433761597, |
| "learning_rate": 0.0002870739662201514, |
| "loss": 3.1155, |
| "step": 89650 |
| }, |
| { |
| "epoch": 26.10625291103866, |
| "grad_norm": 0.44299453496932983, |
| "learning_rate": 0.000286899242865463, |
| "loss": 3.1239, |
| "step": 89700 |
| }, |
| { |
| "epoch": 26.120808104331626, |
| "grad_norm": 0.43149131536483765, |
| "learning_rate": 0.0002867245195107746, |
| "loss": 3.1264, |
| "step": 89750 |
| }, |
| { |
| "epoch": 26.135363297624593, |
| "grad_norm": 0.4228934645652771, |
| "learning_rate": 0.0002865497961560862, |
| "loss": 3.1284, |
| "step": 89800 |
| }, |
| { |
| "epoch": 26.14991849091756, |
| "grad_norm": 0.4010949730873108, |
| "learning_rate": 0.00028637507280139773, |
| "loss": 3.1208, |
| "step": 89850 |
| }, |
| { |
| "epoch": 26.164473684210527, |
| "grad_norm": 0.4526248872280121, |
| "learning_rate": 0.0002862003494467094, |
| "loss": 3.1424, |
| "step": 89900 |
| }, |
| { |
| "epoch": 26.179028877503494, |
| "grad_norm": 0.441843718290329, |
| "learning_rate": 0.0002860256260920209, |
| "loss": 3.1476, |
| "step": 89950 |
| }, |
| { |
| "epoch": 26.19358407079646, |
| "grad_norm": 0.4254477322101593, |
| "learning_rate": 0.0002858509027373325, |
| "loss": 3.1364, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.19358407079646, |
| "eval_accuracy": 0.3741916618842847, |
| "eval_loss": 3.5503010749816895, |
| "eval_runtime": 181.3304, |
| "eval_samples_per_second": 91.827, |
| "eval_steps_per_second": 5.741, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.20813926408943, |
| "grad_norm": 0.44659337401390076, |
| "learning_rate": 0.0002856761793826441, |
| "loss": 3.1355, |
| "step": 90050 |
| }, |
| { |
| "epoch": 26.222694457382396, |
| "grad_norm": 0.42724499106407166, |
| "learning_rate": 0.0002855014560279557, |
| "loss": 3.1442, |
| "step": 90100 |
| }, |
| { |
| "epoch": 26.23724965067536, |
| "grad_norm": 0.42329496145248413, |
| "learning_rate": 0.0002853267326732673, |
| "loss": 3.1397, |
| "step": 90150 |
| }, |
| { |
| "epoch": 26.251804843968326, |
| "grad_norm": 0.48526531457901, |
| "learning_rate": 0.0002851520093185789, |
| "loss": 3.1581, |
| "step": 90200 |
| }, |
| { |
| "epoch": 26.266360037261293, |
| "grad_norm": 0.45286616683006287, |
| "learning_rate": 0.0002849772859638905, |
| "loss": 3.1552, |
| "step": 90250 |
| }, |
| { |
| "epoch": 26.28091523055426, |
| "grad_norm": 0.4201975464820862, |
| "learning_rate": 0.0002848025626092021, |
| "loss": 3.1467, |
| "step": 90300 |
| }, |
| { |
| "epoch": 26.295470423847227, |
| "grad_norm": 0.45757290720939636, |
| "learning_rate": 0.0002846278392545137, |
| "loss": 3.1504, |
| "step": 90350 |
| }, |
| { |
| "epoch": 26.310025617140194, |
| "grad_norm": 0.46606433391571045, |
| "learning_rate": 0.00028445311589982527, |
| "loss": 3.1574, |
| "step": 90400 |
| }, |
| { |
| "epoch": 26.32458081043316, |
| "grad_norm": 0.42274847626686096, |
| "learning_rate": 0.00028427839254513686, |
| "loss": 3.1672, |
| "step": 90450 |
| }, |
| { |
| "epoch": 26.33913600372613, |
| "grad_norm": 0.4176158905029297, |
| "learning_rate": 0.00028410366919044846, |
| "loss": 3.1589, |
| "step": 90500 |
| }, |
| { |
| "epoch": 26.353691197019096, |
| "grad_norm": 0.40520918369293213, |
| "learning_rate": 0.00028392894583576, |
| "loss": 3.1548, |
| "step": 90550 |
| }, |
| { |
| "epoch": 26.368246390312063, |
| "grad_norm": 0.43299931287765503, |
| "learning_rate": 0.00028375422248107165, |
| "loss": 3.159, |
| "step": 90600 |
| }, |
| { |
| "epoch": 26.38280158360503, |
| "grad_norm": 0.42173489928245544, |
| "learning_rate": 0.0002835794991263832, |
| "loss": 3.1499, |
| "step": 90650 |
| }, |
| { |
| "epoch": 26.397356776897997, |
| "grad_norm": 0.41218259930610657, |
| "learning_rate": 0.0002834047757716948, |
| "loss": 3.15, |
| "step": 90700 |
| }, |
| { |
| "epoch": 26.411911970190964, |
| "grad_norm": 0.4249900281429291, |
| "learning_rate": 0.0002832300524170064, |
| "loss": 3.1647, |
| "step": 90750 |
| }, |
| { |
| "epoch": 26.42646716348393, |
| "grad_norm": 0.4059372842311859, |
| "learning_rate": 0.00028305532906231797, |
| "loss": 3.1671, |
| "step": 90800 |
| }, |
| { |
| "epoch": 26.441022356776898, |
| "grad_norm": 0.4203883111476898, |
| "learning_rate": 0.00028288060570762956, |
| "loss": 3.1634, |
| "step": 90850 |
| }, |
| { |
| "epoch": 26.455577550069865, |
| "grad_norm": 0.4686765968799591, |
| "learning_rate": 0.00028270588235294116, |
| "loss": 3.1646, |
| "step": 90900 |
| }, |
| { |
| "epoch": 26.470132743362832, |
| "grad_norm": 0.47062376141548157, |
| "learning_rate": 0.00028253115899825275, |
| "loss": 3.1541, |
| "step": 90950 |
| }, |
| { |
| "epoch": 26.4846879366558, |
| "grad_norm": 0.4375110864639282, |
| "learning_rate": 0.0002823564356435643, |
| "loss": 3.1675, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.4846879366558, |
| "eval_accuracy": 0.3745528232938179, |
| "eval_loss": 3.5466043949127197, |
| "eval_runtime": 179.6775, |
| "eval_samples_per_second": 92.672, |
| "eval_steps_per_second": 5.794, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.499243129948766, |
| "grad_norm": 0.42747896909713745, |
| "learning_rate": 0.00028218171228887594, |
| "loss": 3.1751, |
| "step": 91050 |
| }, |
| { |
| "epoch": 26.513798323241733, |
| "grad_norm": 0.44301602244377136, |
| "learning_rate": 0.0002820069889341875, |
| "loss": 3.1676, |
| "step": 91100 |
| }, |
| { |
| "epoch": 26.5283535165347, |
| "grad_norm": 0.40379756689071655, |
| "learning_rate": 0.00028183226557949913, |
| "loss": 3.1587, |
| "step": 91150 |
| }, |
| { |
| "epoch": 26.542908709827667, |
| "grad_norm": 0.41646426916122437, |
| "learning_rate": 0.00028165754222481067, |
| "loss": 3.1727, |
| "step": 91200 |
| }, |
| { |
| "epoch": 26.557463903120635, |
| "grad_norm": 0.4359433948993683, |
| "learning_rate": 0.00028148281887012226, |
| "loss": 3.1746, |
| "step": 91250 |
| }, |
| { |
| "epoch": 26.5720190964136, |
| "grad_norm": 0.43240684270858765, |
| "learning_rate": 0.0002813080955154339, |
| "loss": 3.166, |
| "step": 91300 |
| }, |
| { |
| "epoch": 26.58657428970657, |
| "grad_norm": 0.4111410975456238, |
| "learning_rate": 0.00028113337216074545, |
| "loss": 3.1663, |
| "step": 91350 |
| }, |
| { |
| "epoch": 26.601129482999536, |
| "grad_norm": 0.4109819233417511, |
| "learning_rate": 0.00028095864880605705, |
| "loss": 3.1846, |
| "step": 91400 |
| }, |
| { |
| "epoch": 26.615684676292503, |
| "grad_norm": 0.44175857305526733, |
| "learning_rate": 0.00028078392545136864, |
| "loss": 3.1713, |
| "step": 91450 |
| }, |
| { |
| "epoch": 26.630239869585466, |
| "grad_norm": 0.43947285413742065, |
| "learning_rate": 0.00028060920209668023, |
| "loss": 3.169, |
| "step": 91500 |
| }, |
| { |
| "epoch": 26.644795062878433, |
| "grad_norm": 0.45988473296165466, |
| "learning_rate": 0.00028043447874199183, |
| "loss": 3.1822, |
| "step": 91550 |
| }, |
| { |
| "epoch": 26.6593502561714, |
| "grad_norm": 0.4398362934589386, |
| "learning_rate": 0.0002802597553873034, |
| "loss": 3.1828, |
| "step": 91600 |
| }, |
| { |
| "epoch": 26.673905449464367, |
| "grad_norm": 0.3981860876083374, |
| "learning_rate": 0.000280085032032615, |
| "loss": 3.1872, |
| "step": 91650 |
| }, |
| { |
| "epoch": 26.688460642757335, |
| "grad_norm": 0.46240130066871643, |
| "learning_rate": 0.00027991030867792656, |
| "loss": 3.1883, |
| "step": 91700 |
| }, |
| { |
| "epoch": 26.7030158360503, |
| "grad_norm": 0.46508920192718506, |
| "learning_rate": 0.0002797355853232382, |
| "loss": 3.1839, |
| "step": 91750 |
| }, |
| { |
| "epoch": 26.71757102934327, |
| "grad_norm": 0.4539368748664856, |
| "learning_rate": 0.00027956086196854975, |
| "loss": 3.1831, |
| "step": 91800 |
| }, |
| { |
| "epoch": 26.732126222636236, |
| "grad_norm": 0.4178009331226349, |
| "learning_rate": 0.0002793861386138614, |
| "loss": 3.1841, |
| "step": 91850 |
| }, |
| { |
| "epoch": 26.746681415929203, |
| "grad_norm": 0.4389614760875702, |
| "learning_rate": 0.00027921141525917293, |
| "loss": 3.1728, |
| "step": 91900 |
| }, |
| { |
| "epoch": 26.76123660922217, |
| "grad_norm": 0.444723904132843, |
| "learning_rate": 0.00027903669190448453, |
| "loss": 3.1765, |
| "step": 91950 |
| }, |
| { |
| "epoch": 26.775791802515137, |
| "grad_norm": 0.4468613266944885, |
| "learning_rate": 0.0002788619685497961, |
| "loss": 3.201, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.775791802515137, |
| "eval_accuracy": 0.37513763916555143, |
| "eval_loss": 3.536406993865967, |
| "eval_runtime": 178.3326, |
| "eval_samples_per_second": 93.371, |
| "eval_steps_per_second": 5.837, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.790346995808104, |
| "grad_norm": 0.4519892930984497, |
| "learning_rate": 0.0002786872451951077, |
| "loss": 3.1741, |
| "step": 92050 |
| }, |
| { |
| "epoch": 26.80490218910107, |
| "grad_norm": 0.45573484897613525, |
| "learning_rate": 0.0002785125218404193, |
| "loss": 3.1827, |
| "step": 92100 |
| }, |
| { |
| "epoch": 26.819457382394038, |
| "grad_norm": 0.43908625841140747, |
| "learning_rate": 0.0002783377984857309, |
| "loss": 3.1877, |
| "step": 92150 |
| }, |
| { |
| "epoch": 26.834012575687005, |
| "grad_norm": 0.4402633011341095, |
| "learning_rate": 0.0002781630751310425, |
| "loss": 3.1838, |
| "step": 92200 |
| }, |
| { |
| "epoch": 26.848567768979972, |
| "grad_norm": 0.4282047748565674, |
| "learning_rate": 0.0002779883517763541, |
| "loss": 3.193, |
| "step": 92250 |
| }, |
| { |
| "epoch": 26.86312296227294, |
| "grad_norm": 0.42211318016052246, |
| "learning_rate": 0.0002778136284216657, |
| "loss": 3.1904, |
| "step": 92300 |
| }, |
| { |
| "epoch": 26.877678155565906, |
| "grad_norm": 0.48066452145576477, |
| "learning_rate": 0.0002776389050669773, |
| "loss": 3.1884, |
| "step": 92350 |
| }, |
| { |
| "epoch": 26.892233348858873, |
| "grad_norm": 0.474904328584671, |
| "learning_rate": 0.0002774641817122888, |
| "loss": 3.1759, |
| "step": 92400 |
| }, |
| { |
| "epoch": 26.90678854215184, |
| "grad_norm": 0.45568016171455383, |
| "learning_rate": 0.00027728945835760047, |
| "loss": 3.1967, |
| "step": 92450 |
| }, |
| { |
| "epoch": 26.921343735444808, |
| "grad_norm": 0.45983555912971497, |
| "learning_rate": 0.000277114735002912, |
| "loss": 3.1979, |
| "step": 92500 |
| }, |
| { |
| "epoch": 26.935898928737775, |
| "grad_norm": 0.4195818603038788, |
| "learning_rate": 0.00027694001164822366, |
| "loss": 3.1885, |
| "step": 92550 |
| }, |
| { |
| "epoch": 26.95045412203074, |
| "grad_norm": 0.43301427364349365, |
| "learning_rate": 0.0002767652882935352, |
| "loss": 3.1931, |
| "step": 92600 |
| }, |
| { |
| "epoch": 26.96500931532371, |
| "grad_norm": 0.4479294419288635, |
| "learning_rate": 0.0002765905649388468, |
| "loss": 3.1973, |
| "step": 92650 |
| }, |
| { |
| "epoch": 26.979564508616676, |
| "grad_norm": 0.43495991826057434, |
| "learning_rate": 0.0002764158415841584, |
| "loss": 3.1899, |
| "step": 92700 |
| }, |
| { |
| "epoch": 26.994119701909643, |
| "grad_norm": 0.431049108505249, |
| "learning_rate": 0.00027624111822947, |
| "loss": 3.2076, |
| "step": 92750 |
| }, |
| { |
| "epoch": 27.00844201210992, |
| "grad_norm": 0.4351713955402374, |
| "learning_rate": 0.0002760663948747816, |
| "loss": 3.1334, |
| "step": 92800 |
| }, |
| { |
| "epoch": 27.022997205402888, |
| "grad_norm": 0.4406755268573761, |
| "learning_rate": 0.00027589167152009317, |
| "loss": 3.109, |
| "step": 92850 |
| }, |
| { |
| "epoch": 27.037552398695855, |
| "grad_norm": 0.46634653210639954, |
| "learning_rate": 0.00027571694816540477, |
| "loss": 3.1128, |
| "step": 92900 |
| }, |
| { |
| "epoch": 27.052107591988822, |
| "grad_norm": 0.4551210105419159, |
| "learning_rate": 0.0002755422248107163, |
| "loss": 3.0966, |
| "step": 92950 |
| }, |
| { |
| "epoch": 27.06666278528179, |
| "grad_norm": 0.45429322123527527, |
| "learning_rate": 0.00027536750145602795, |
| "loss": 3.1089, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.06666278528179, |
| "eval_accuracy": 0.374071548978153, |
| "eval_loss": 3.554753065109253, |
| "eval_runtime": 178.296, |
| "eval_samples_per_second": 93.39, |
| "eval_steps_per_second": 5.839, |
| "step": 93000 |
| }, |
| { |
| "epoch": 27.081217978574756, |
| "grad_norm": 0.44996699690818787, |
| "learning_rate": 0.0002751927781013395, |
| "loss": 3.1094, |
| "step": 93050 |
| }, |
| { |
| "epoch": 27.095773171867723, |
| "grad_norm": 0.45104679465293884, |
| "learning_rate": 0.0002750180547466511, |
| "loss": 3.11, |
| "step": 93100 |
| }, |
| { |
| "epoch": 27.11032836516069, |
| "grad_norm": 0.4221501052379608, |
| "learning_rate": 0.0002748433313919627, |
| "loss": 3.1181, |
| "step": 93150 |
| }, |
| { |
| "epoch": 27.124883558453657, |
| "grad_norm": 0.46069955825805664, |
| "learning_rate": 0.0002746686080372743, |
| "loss": 3.1173, |
| "step": 93200 |
| }, |
| { |
| "epoch": 27.139438751746624, |
| "grad_norm": 0.5119186043739319, |
| "learning_rate": 0.00027449388468258587, |
| "loss": 3.122, |
| "step": 93250 |
| }, |
| { |
| "epoch": 27.15399394503959, |
| "grad_norm": 0.4407660961151123, |
| "learning_rate": 0.00027431916132789747, |
| "loss": 3.1249, |
| "step": 93300 |
| }, |
| { |
| "epoch": 27.16854913833256, |
| "grad_norm": 0.426318496465683, |
| "learning_rate": 0.00027414443797320906, |
| "loss": 3.1265, |
| "step": 93350 |
| }, |
| { |
| "epoch": 27.183104331625525, |
| "grad_norm": 0.4476647973060608, |
| "learning_rate": 0.00027396971461852065, |
| "loss": 3.1378, |
| "step": 93400 |
| }, |
| { |
| "epoch": 27.197659524918492, |
| "grad_norm": 0.45376473665237427, |
| "learning_rate": 0.00027379499126383225, |
| "loss": 3.1141, |
| "step": 93450 |
| }, |
| { |
| "epoch": 27.21221471821146, |
| "grad_norm": 0.46606171131134033, |
| "learning_rate": 0.00027362026790914384, |
| "loss": 3.1289, |
| "step": 93500 |
| }, |
| { |
| "epoch": 27.226769911504423, |
| "grad_norm": 0.4278535842895508, |
| "learning_rate": 0.00027344554455445544, |
| "loss": 3.1362, |
| "step": 93550 |
| }, |
| { |
| "epoch": 27.24132510479739, |
| "grad_norm": 0.4427182972431183, |
| "learning_rate": 0.00027327082119976703, |
| "loss": 3.1378, |
| "step": 93600 |
| }, |
| { |
| "epoch": 27.255880298090357, |
| "grad_norm": 0.4301832616329193, |
| "learning_rate": 0.00027309609784507857, |
| "loss": 3.1287, |
| "step": 93650 |
| }, |
| { |
| "epoch": 27.270435491383324, |
| "grad_norm": 0.4756440222263336, |
| "learning_rate": 0.0002729213744903902, |
| "loss": 3.1426, |
| "step": 93700 |
| }, |
| { |
| "epoch": 27.28499068467629, |
| "grad_norm": 0.4330383837223053, |
| "learning_rate": 0.00027274665113570176, |
| "loss": 3.139, |
| "step": 93750 |
| }, |
| { |
| "epoch": 27.29954587796926, |
| "grad_norm": 0.44224536418914795, |
| "learning_rate": 0.00027257192778101335, |
| "loss": 3.14, |
| "step": 93800 |
| }, |
| { |
| "epoch": 27.314101071262225, |
| "grad_norm": 0.453418105840683, |
| "learning_rate": 0.00027239720442632495, |
| "loss": 3.1404, |
| "step": 93850 |
| }, |
| { |
| "epoch": 27.328656264555192, |
| "grad_norm": 0.4232199490070343, |
| "learning_rate": 0.00027222248107163654, |
| "loss": 3.1517, |
| "step": 93900 |
| }, |
| { |
| "epoch": 27.34321145784816, |
| "grad_norm": 0.4673709273338318, |
| "learning_rate": 0.00027204775771694814, |
| "loss": 3.1443, |
| "step": 93950 |
| }, |
| { |
| "epoch": 27.357766651141127, |
| "grad_norm": 0.46243950724601746, |
| "learning_rate": 0.00027187303436225973, |
| "loss": 3.151, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.357766651141127, |
| "eval_accuracy": 0.3744229556213369, |
| "eval_loss": 3.544090747833252, |
| "eval_runtime": 178.3649, |
| "eval_samples_per_second": 93.354, |
| "eval_steps_per_second": 5.836, |
| "step": 94000 |
| }, |
| { |
| "epoch": 27.372321844434094, |
| "grad_norm": 0.418197363615036, |
| "learning_rate": 0.0002716983110075713, |
| "loss": 3.1402, |
| "step": 94050 |
| }, |
| { |
| "epoch": 27.38687703772706, |
| "grad_norm": 0.48369506001472473, |
| "learning_rate": 0.0002715235876528829, |
| "loss": 3.137, |
| "step": 94100 |
| }, |
| { |
| "epoch": 27.401432231020028, |
| "grad_norm": 0.42239800095558167, |
| "learning_rate": 0.0002713488642981945, |
| "loss": 3.1435, |
| "step": 94150 |
| }, |
| { |
| "epoch": 27.415987424312995, |
| "grad_norm": 0.4270704984664917, |
| "learning_rate": 0.00027117414094350606, |
| "loss": 3.1562, |
| "step": 94200 |
| }, |
| { |
| "epoch": 27.430542617605962, |
| "grad_norm": 0.4096634089946747, |
| "learning_rate": 0.0002709994175888177, |
| "loss": 3.1447, |
| "step": 94250 |
| }, |
| { |
| "epoch": 27.44509781089893, |
| "grad_norm": 0.43484556674957275, |
| "learning_rate": 0.00027082469423412924, |
| "loss": 3.1578, |
| "step": 94300 |
| }, |
| { |
| "epoch": 27.459653004191896, |
| "grad_norm": 0.44652125239372253, |
| "learning_rate": 0.00027064997087944084, |
| "loss": 3.1537, |
| "step": 94350 |
| }, |
| { |
| "epoch": 27.474208197484863, |
| "grad_norm": 0.4425007104873657, |
| "learning_rate": 0.00027047524752475243, |
| "loss": 3.1643, |
| "step": 94400 |
| }, |
| { |
| "epoch": 27.48876339077783, |
| "grad_norm": 0.4466058909893036, |
| "learning_rate": 0.000270300524170064, |
| "loss": 3.1598, |
| "step": 94450 |
| }, |
| { |
| "epoch": 27.503318584070797, |
| "grad_norm": 0.4788243770599365, |
| "learning_rate": 0.0002701258008153757, |
| "loss": 3.1604, |
| "step": 94500 |
| }, |
| { |
| "epoch": 27.517873777363764, |
| "grad_norm": 0.4294137954711914, |
| "learning_rate": 0.0002699510774606872, |
| "loss": 3.16, |
| "step": 94550 |
| }, |
| { |
| "epoch": 27.53242897065673, |
| "grad_norm": 0.46710747480392456, |
| "learning_rate": 0.0002697763541059988, |
| "loss": 3.1648, |
| "step": 94600 |
| }, |
| { |
| "epoch": 27.5469841639497, |
| "grad_norm": 0.43919190764427185, |
| "learning_rate": 0.0002696016307513104, |
| "loss": 3.1596, |
| "step": 94650 |
| }, |
| { |
| "epoch": 27.561539357242665, |
| "grad_norm": 0.42766788601875305, |
| "learning_rate": 0.000269426907396622, |
| "loss": 3.1594, |
| "step": 94700 |
| }, |
| { |
| "epoch": 27.576094550535633, |
| "grad_norm": 0.4629672169685364, |
| "learning_rate": 0.0002692521840419336, |
| "loss": 3.168, |
| "step": 94750 |
| }, |
| { |
| "epoch": 27.5906497438286, |
| "grad_norm": 0.4353528916835785, |
| "learning_rate": 0.0002690774606872452, |
| "loss": 3.1524, |
| "step": 94800 |
| }, |
| { |
| "epoch": 27.605204937121567, |
| "grad_norm": 0.4430389404296875, |
| "learning_rate": 0.0002689027373325568, |
| "loss": 3.1609, |
| "step": 94850 |
| }, |
| { |
| "epoch": 27.619760130414534, |
| "grad_norm": 0.42469891905784607, |
| "learning_rate": 0.0002687280139778683, |
| "loss": 3.1755, |
| "step": 94900 |
| }, |
| { |
| "epoch": 27.634315323707497, |
| "grad_norm": 0.4509817957878113, |
| "learning_rate": 0.00026855329062317997, |
| "loss": 3.18, |
| "step": 94950 |
| }, |
| { |
| "epoch": 27.648870517000464, |
| "grad_norm": 0.4238099455833435, |
| "learning_rate": 0.0002683785672684915, |
| "loss": 3.1649, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.648870517000464, |
| "eval_accuracy": 0.37499155272492346, |
| "eval_loss": 3.537271022796631, |
| "eval_runtime": 178.3239, |
| "eval_samples_per_second": 93.375, |
| "eval_steps_per_second": 5.838, |
| "step": 95000 |
| }, |
| { |
| "epoch": 27.66342571029343, |
| "grad_norm": 0.4029273986816406, |
| "learning_rate": 0.0002682038439138031, |
| "loss": 3.1908, |
| "step": 95050 |
| }, |
| { |
| "epoch": 27.6779809035864, |
| "grad_norm": 0.4985395669937134, |
| "learning_rate": 0.0002680291205591147, |
| "loss": 3.1838, |
| "step": 95100 |
| }, |
| { |
| "epoch": 27.692536096879365, |
| "grad_norm": 0.43642306327819824, |
| "learning_rate": 0.0002678543972044263, |
| "loss": 3.174, |
| "step": 95150 |
| }, |
| { |
| "epoch": 27.707091290172333, |
| "grad_norm": 0.42214831709861755, |
| "learning_rate": 0.0002676796738497379, |
| "loss": 3.1736, |
| "step": 95200 |
| }, |
| { |
| "epoch": 27.7216464834653, |
| "grad_norm": 0.4335307478904724, |
| "learning_rate": 0.0002675049504950495, |
| "loss": 3.1655, |
| "step": 95250 |
| }, |
| { |
| "epoch": 27.736201676758267, |
| "grad_norm": 0.45082777738571167, |
| "learning_rate": 0.0002673302271403611, |
| "loss": 3.1698, |
| "step": 95300 |
| }, |
| { |
| "epoch": 27.750756870051234, |
| "grad_norm": 0.44629359245300293, |
| "learning_rate": 0.00026715550378567267, |
| "loss": 3.1739, |
| "step": 95350 |
| }, |
| { |
| "epoch": 27.7653120633442, |
| "grad_norm": 0.44939613342285156, |
| "learning_rate": 0.00026698078043098426, |
| "loss": 3.1689, |
| "step": 95400 |
| }, |
| { |
| "epoch": 27.779867256637168, |
| "grad_norm": 0.4434186816215515, |
| "learning_rate": 0.00026680605707629586, |
| "loss": 3.1773, |
| "step": 95450 |
| }, |
| { |
| "epoch": 27.794422449930135, |
| "grad_norm": 0.42752760648727417, |
| "learning_rate": 0.00026663133372160745, |
| "loss": 3.1754, |
| "step": 95500 |
| }, |
| { |
| "epoch": 27.808977643223102, |
| "grad_norm": 0.41842010617256165, |
| "learning_rate": 0.00026645661036691905, |
| "loss": 3.1773, |
| "step": 95550 |
| }, |
| { |
| "epoch": 27.82353283651607, |
| "grad_norm": 0.42154571413993835, |
| "learning_rate": 0.0002662818870122306, |
| "loss": 3.1826, |
| "step": 95600 |
| }, |
| { |
| "epoch": 27.838088029809036, |
| "grad_norm": 0.4588300585746765, |
| "learning_rate": 0.00026610716365754224, |
| "loss": 3.1839, |
| "step": 95650 |
| }, |
| { |
| "epoch": 27.852643223102003, |
| "grad_norm": 0.46981218457221985, |
| "learning_rate": 0.0002659324403028538, |
| "loss": 3.1816, |
| "step": 95700 |
| }, |
| { |
| "epoch": 27.86719841639497, |
| "grad_norm": 0.4390254020690918, |
| "learning_rate": 0.00026575771694816537, |
| "loss": 3.1891, |
| "step": 95750 |
| }, |
| { |
| "epoch": 27.881753609687937, |
| "grad_norm": 0.44484564661979675, |
| "learning_rate": 0.00026558299359347696, |
| "loss": 3.1896, |
| "step": 95800 |
| }, |
| { |
| "epoch": 27.896308802980904, |
| "grad_norm": 0.44375529885292053, |
| "learning_rate": 0.00026540827023878856, |
| "loss": 3.1737, |
| "step": 95850 |
| }, |
| { |
| "epoch": 27.91086399627387, |
| "grad_norm": 0.41514551639556885, |
| "learning_rate": 0.00026523354688410015, |
| "loss": 3.1779, |
| "step": 95900 |
| }, |
| { |
| "epoch": 27.92541918956684, |
| "grad_norm": 0.4646185636520386, |
| "learning_rate": 0.00026505882352941175, |
| "loss": 3.1753, |
| "step": 95950 |
| }, |
| { |
| "epoch": 27.939974382859806, |
| "grad_norm": 0.42588916420936584, |
| "learning_rate": 0.00026488410017472334, |
| "loss": 3.1918, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.939974382859806, |
| "eval_accuracy": 0.37517289735717524, |
| "eval_loss": 3.534158945083618, |
| "eval_runtime": 178.3519, |
| "eval_samples_per_second": 93.36, |
| "eval_steps_per_second": 5.837, |
| "step": 96000 |
| }, |
| { |
| "epoch": 27.954529576152773, |
| "grad_norm": 0.42573437094688416, |
| "learning_rate": 0.00026470937682003494, |
| "loss": 3.1895, |
| "step": 96050 |
| }, |
| { |
| "epoch": 27.96908476944574, |
| "grad_norm": 0.46956467628479004, |
| "learning_rate": 0.00026453465346534653, |
| "loss": 3.1848, |
| "step": 96100 |
| }, |
| { |
| "epoch": 27.983639962738707, |
| "grad_norm": 0.41468024253845215, |
| "learning_rate": 0.00026435993011065807, |
| "loss": 3.1891, |
| "step": 96150 |
| }, |
| { |
| "epoch": 27.998195156031674, |
| "grad_norm": 0.4592526853084564, |
| "learning_rate": 0.0002641852067559697, |
| "loss": 3.1807, |
| "step": 96200 |
| }, |
| { |
| "epoch": 28.01251746623195, |
| "grad_norm": 0.42601925134658813, |
| "learning_rate": 0.00026401048340128126, |
| "loss": 3.1132, |
| "step": 96250 |
| }, |
| { |
| "epoch": 28.02707265952492, |
| "grad_norm": 0.45896121859550476, |
| "learning_rate": 0.00026383576004659285, |
| "loss": 3.0964, |
| "step": 96300 |
| }, |
| { |
| "epoch": 28.041627852817886, |
| "grad_norm": 0.4385358691215515, |
| "learning_rate": 0.00026366103669190445, |
| "loss": 3.0943, |
| "step": 96350 |
| }, |
| { |
| "epoch": 28.056183046110853, |
| "grad_norm": 0.4664997160434723, |
| "learning_rate": 0.00026348631333721604, |
| "loss": 3.0952, |
| "step": 96400 |
| }, |
| { |
| "epoch": 28.07073823940382, |
| "grad_norm": 0.4414977729320526, |
| "learning_rate": 0.00026331158998252764, |
| "loss": 3.1015, |
| "step": 96450 |
| }, |
| { |
| "epoch": 28.085293432696787, |
| "grad_norm": 0.47265881299972534, |
| "learning_rate": 0.00026313686662783923, |
| "loss": 3.1015, |
| "step": 96500 |
| }, |
| { |
| "epoch": 28.099848625989754, |
| "grad_norm": 0.45517316460609436, |
| "learning_rate": 0.0002629621432731508, |
| "loss": 3.1052, |
| "step": 96550 |
| }, |
| { |
| "epoch": 28.11440381928272, |
| "grad_norm": 0.43620362877845764, |
| "learning_rate": 0.0002627874199184624, |
| "loss": 3.1148, |
| "step": 96600 |
| }, |
| { |
| "epoch": 28.128959012575688, |
| "grad_norm": 0.49415022134780884, |
| "learning_rate": 0.000262612696563774, |
| "loss": 3.101, |
| "step": 96650 |
| }, |
| { |
| "epoch": 28.143514205868655, |
| "grad_norm": 0.43687793612480164, |
| "learning_rate": 0.0002624379732090856, |
| "loss": 3.1072, |
| "step": 96700 |
| }, |
| { |
| "epoch": 28.158069399161622, |
| "grad_norm": 0.46462565660476685, |
| "learning_rate": 0.0002622632498543972, |
| "loss": 3.1101, |
| "step": 96750 |
| }, |
| { |
| "epoch": 28.17262459245459, |
| "grad_norm": 0.4543834328651428, |
| "learning_rate": 0.0002620885264997088, |
| "loss": 3.1162, |
| "step": 96800 |
| }, |
| { |
| "epoch": 28.187179785747556, |
| "grad_norm": 0.4572249948978424, |
| "learning_rate": 0.00026191380314502034, |
| "loss": 3.1219, |
| "step": 96850 |
| }, |
| { |
| "epoch": 28.201734979040523, |
| "grad_norm": 0.44651177525520325, |
| "learning_rate": 0.000261739079790332, |
| "loss": 3.1183, |
| "step": 96900 |
| }, |
| { |
| "epoch": 28.216290172333487, |
| "grad_norm": 0.43274229764938354, |
| "learning_rate": 0.0002615643564356435, |
| "loss": 3.1271, |
| "step": 96950 |
| }, |
| { |
| "epoch": 28.230845365626454, |
| "grad_norm": 0.4469480514526367, |
| "learning_rate": 0.0002613896330809551, |
| "loss": 3.1161, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.230845365626454, |
| "eval_accuracy": 0.37445022195619265, |
| "eval_loss": 3.5514791011810303, |
| "eval_runtime": 180.2815, |
| "eval_samples_per_second": 92.361, |
| "eval_steps_per_second": 5.774, |
| "step": 97000 |
| }, |
| { |
| "epoch": 28.24540055891942, |
| "grad_norm": 0.4637867212295532, |
| "learning_rate": 0.0002612149097262667, |
| "loss": 3.1273, |
| "step": 97050 |
| }, |
| { |
| "epoch": 28.259955752212388, |
| "grad_norm": 0.46112245321273804, |
| "learning_rate": 0.0002610401863715783, |
| "loss": 3.1342, |
| "step": 97100 |
| }, |
| { |
| "epoch": 28.274510945505355, |
| "grad_norm": 0.4241412580013275, |
| "learning_rate": 0.0002608654630168899, |
| "loss": 3.1403, |
| "step": 97150 |
| }, |
| { |
| "epoch": 28.289066138798322, |
| "grad_norm": 0.5504874587059021, |
| "learning_rate": 0.0002606907396622015, |
| "loss": 3.1314, |
| "step": 97200 |
| }, |
| { |
| "epoch": 28.30362133209129, |
| "grad_norm": 0.4429568946361542, |
| "learning_rate": 0.0002605160163075131, |
| "loss": 3.1404, |
| "step": 97250 |
| }, |
| { |
| "epoch": 28.318176525384256, |
| "grad_norm": 0.4465397000312805, |
| "learning_rate": 0.0002603412929528247, |
| "loss": 3.1339, |
| "step": 97300 |
| }, |
| { |
| "epoch": 28.332731718677223, |
| "grad_norm": 0.4697418510913849, |
| "learning_rate": 0.0002601665695981363, |
| "loss": 3.1313, |
| "step": 97350 |
| }, |
| { |
| "epoch": 28.34728691197019, |
| "grad_norm": 0.4520648121833801, |
| "learning_rate": 0.0002599918462434478, |
| "loss": 3.151, |
| "step": 97400 |
| }, |
| { |
| "epoch": 28.361842105263158, |
| "grad_norm": 0.41424447298049927, |
| "learning_rate": 0.00025981712288875947, |
| "loss": 3.1435, |
| "step": 97450 |
| }, |
| { |
| "epoch": 28.376397298556125, |
| "grad_norm": 0.4646261930465698, |
| "learning_rate": 0.000259642399534071, |
| "loss": 3.1326, |
| "step": 97500 |
| }, |
| { |
| "epoch": 28.39095249184909, |
| "grad_norm": 0.4540160298347473, |
| "learning_rate": 0.0002594676761793826, |
| "loss": 3.14, |
| "step": 97550 |
| }, |
| { |
| "epoch": 28.40550768514206, |
| "grad_norm": 0.475751668214798, |
| "learning_rate": 0.0002592929528246942, |
| "loss": 3.1422, |
| "step": 97600 |
| }, |
| { |
| "epoch": 28.420062878435026, |
| "grad_norm": 0.4772433936595917, |
| "learning_rate": 0.0002591182294700058, |
| "loss": 3.1381, |
| "step": 97650 |
| }, |
| { |
| "epoch": 28.434618071727993, |
| "grad_norm": 0.4552571773529053, |
| "learning_rate": 0.0002589435061153174, |
| "loss": 3.1463, |
| "step": 97700 |
| }, |
| { |
| "epoch": 28.44917326502096, |
| "grad_norm": 0.4398738145828247, |
| "learning_rate": 0.000258768782760629, |
| "loss": 3.1451, |
| "step": 97750 |
| }, |
| { |
| "epoch": 28.463728458313927, |
| "grad_norm": 0.4349164068698883, |
| "learning_rate": 0.0002585940594059406, |
| "loss": 3.1429, |
| "step": 97800 |
| }, |
| { |
| "epoch": 28.478283651606894, |
| "grad_norm": 0.45709657669067383, |
| "learning_rate": 0.00025841933605125217, |
| "loss": 3.1559, |
| "step": 97850 |
| }, |
| { |
| "epoch": 28.49283884489986, |
| "grad_norm": 0.4473492205142975, |
| "learning_rate": 0.00025824461269656376, |
| "loss": 3.1534, |
| "step": 97900 |
| }, |
| { |
| "epoch": 28.507394038192828, |
| "grad_norm": 0.4502778649330139, |
| "learning_rate": 0.00025806988934187536, |
| "loss": 3.1626, |
| "step": 97950 |
| }, |
| { |
| "epoch": 28.521949231485795, |
| "grad_norm": 0.4576888680458069, |
| "learning_rate": 0.00025789516598718695, |
| "loss": 3.1603, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.521949231485795, |
| "eval_accuracy": 0.3747342854533751, |
| "eval_loss": 3.542839527130127, |
| "eval_runtime": 180.6667, |
| "eval_samples_per_second": 92.164, |
| "eval_steps_per_second": 5.762, |
| "step": 98000 |
| }, |
| { |
| "epoch": 28.536504424778762, |
| "grad_norm": 0.45554453134536743, |
| "learning_rate": 0.00025772044263249854, |
| "loss": 3.1546, |
| "step": 98050 |
| }, |
| { |
| "epoch": 28.55105961807173, |
| "grad_norm": 0.44775858521461487, |
| "learning_rate": 0.0002575457192778101, |
| "loss": 3.1434, |
| "step": 98100 |
| }, |
| { |
| "epoch": 28.565614811364696, |
| "grad_norm": 0.46019670367240906, |
| "learning_rate": 0.00025737099592312173, |
| "loss": 3.158, |
| "step": 98150 |
| }, |
| { |
| "epoch": 28.580170004657663, |
| "grad_norm": 0.45176613330841064, |
| "learning_rate": 0.0002571962725684333, |
| "loss": 3.1486, |
| "step": 98200 |
| }, |
| { |
| "epoch": 28.59472519795063, |
| "grad_norm": 0.4416624903678894, |
| "learning_rate": 0.00025702154921374487, |
| "loss": 3.1602, |
| "step": 98250 |
| }, |
| { |
| "epoch": 28.609280391243594, |
| "grad_norm": 0.42979925870895386, |
| "learning_rate": 0.00025684682585905646, |
| "loss": 3.1566, |
| "step": 98300 |
| }, |
| { |
| "epoch": 28.62383558453656, |
| "grad_norm": 0.4476456344127655, |
| "learning_rate": 0.00025667210250436806, |
| "loss": 3.1632, |
| "step": 98350 |
| }, |
| { |
| "epoch": 28.638390777829528, |
| "grad_norm": 0.47356441617012024, |
| "learning_rate": 0.00025649737914967965, |
| "loss": 3.1572, |
| "step": 98400 |
| }, |
| { |
| "epoch": 28.652945971122495, |
| "grad_norm": 0.44925108551979065, |
| "learning_rate": 0.00025632265579499124, |
| "loss": 3.1615, |
| "step": 98450 |
| }, |
| { |
| "epoch": 28.667501164415462, |
| "grad_norm": 0.4434398114681244, |
| "learning_rate": 0.00025614793244030284, |
| "loss": 3.162, |
| "step": 98500 |
| }, |
| { |
| "epoch": 28.68205635770843, |
| "grad_norm": 0.4198032021522522, |
| "learning_rate": 0.0002559732090856144, |
| "loss": 3.1689, |
| "step": 98550 |
| }, |
| { |
| "epoch": 28.696611551001396, |
| "grad_norm": 0.4542519450187683, |
| "learning_rate": 0.00025579848573092603, |
| "loss": 3.1684, |
| "step": 98600 |
| }, |
| { |
| "epoch": 28.711166744294363, |
| "grad_norm": 0.4310658872127533, |
| "learning_rate": 0.0002556237623762376, |
| "loss": 3.1717, |
| "step": 98650 |
| }, |
| { |
| "epoch": 28.72572193758733, |
| "grad_norm": 0.45117637515068054, |
| "learning_rate": 0.0002554490390215492, |
| "loss": 3.1588, |
| "step": 98700 |
| }, |
| { |
| "epoch": 28.740277130880298, |
| "grad_norm": 0.4620664417743683, |
| "learning_rate": 0.0002552743156668608, |
| "loss": 3.1613, |
| "step": 98750 |
| }, |
| { |
| "epoch": 28.754832324173265, |
| "grad_norm": 0.44088515639305115, |
| "learning_rate": 0.00025509959231217235, |
| "loss": 3.1553, |
| "step": 98800 |
| }, |
| { |
| "epoch": 28.76938751746623, |
| "grad_norm": 0.4798021912574768, |
| "learning_rate": 0.000254924868957484, |
| "loss": 3.1638, |
| "step": 98850 |
| }, |
| { |
| "epoch": 28.7839427107592, |
| "grad_norm": 0.4578956365585327, |
| "learning_rate": 0.00025475014560279554, |
| "loss": 3.1662, |
| "step": 98900 |
| }, |
| { |
| "epoch": 28.798497904052166, |
| "grad_norm": 0.4738767147064209, |
| "learning_rate": 0.00025457542224810713, |
| "loss": 3.1644, |
| "step": 98950 |
| }, |
| { |
| "epoch": 28.813053097345133, |
| "grad_norm": 0.4682692587375641, |
| "learning_rate": 0.00025440069889341873, |
| "loss": 3.1628, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.813053097345133, |
| "eval_accuracy": 0.3754198222258473, |
| "eval_loss": 3.5394463539123535, |
| "eval_runtime": 180.8378, |
| "eval_samples_per_second": 92.077, |
| "eval_steps_per_second": 5.757, |
| "step": 99000 |
| }, |
| { |
| "epoch": 28.8276082906381, |
| "grad_norm": 0.46600857377052307, |
| "learning_rate": 0.0002542259755387303, |
| "loss": 3.1667, |
| "step": 99050 |
| }, |
| { |
| "epoch": 28.842163483931067, |
| "grad_norm": 0.47476571798324585, |
| "learning_rate": 0.0002540512521840419, |
| "loss": 3.1602, |
| "step": 99100 |
| }, |
| { |
| "epoch": 28.856718677224034, |
| "grad_norm": 0.4336269199848175, |
| "learning_rate": 0.0002538765288293535, |
| "loss": 3.1728, |
| "step": 99150 |
| }, |
| { |
| "epoch": 28.871273870517, |
| "grad_norm": 0.4457774758338928, |
| "learning_rate": 0.0002537018054746651, |
| "loss": 3.1742, |
| "step": 99200 |
| }, |
| { |
| "epoch": 28.885829063809968, |
| "grad_norm": 0.45064619183540344, |
| "learning_rate": 0.00025352708211997664, |
| "loss": 3.176, |
| "step": 99250 |
| }, |
| { |
| "epoch": 28.900384257102935, |
| "grad_norm": 0.45002007484436035, |
| "learning_rate": 0.0002533523587652883, |
| "loss": 3.1727, |
| "step": 99300 |
| }, |
| { |
| "epoch": 28.914939450395902, |
| "grad_norm": 0.4713496267795563, |
| "learning_rate": 0.00025317763541059983, |
| "loss": 3.1729, |
| "step": 99350 |
| }, |
| { |
| "epoch": 28.92949464368887, |
| "grad_norm": 0.43779316544532776, |
| "learning_rate": 0.0002530029120559115, |
| "loss": 3.1756, |
| "step": 99400 |
| }, |
| { |
| "epoch": 28.944049836981836, |
| "grad_norm": 0.5112631320953369, |
| "learning_rate": 0.000252828188701223, |
| "loss": 3.1696, |
| "step": 99450 |
| }, |
| { |
| "epoch": 28.958605030274803, |
| "grad_norm": 0.4868534803390503, |
| "learning_rate": 0.0002526534653465346, |
| "loss": 3.1802, |
| "step": 99500 |
| }, |
| { |
| "epoch": 28.97316022356777, |
| "grad_norm": 0.4310745596885681, |
| "learning_rate": 0.0002524787419918462, |
| "loss": 3.1854, |
| "step": 99550 |
| }, |
| { |
| "epoch": 28.987715416860738, |
| "grad_norm": 0.4424709975719452, |
| "learning_rate": 0.0002523040186371578, |
| "loss": 3.1797, |
| "step": 99600 |
| }, |
| { |
| "epoch": 29.002037727061015, |
| "grad_norm": 0.4721603989601135, |
| "learning_rate": 0.0002521292952824694, |
| "loss": 3.1789, |
| "step": 99650 |
| }, |
| { |
| "epoch": 29.016592920353983, |
| "grad_norm": 0.480665922164917, |
| "learning_rate": 0.000251954571927781, |
| "loss": 3.0871, |
| "step": 99700 |
| }, |
| { |
| "epoch": 29.03114811364695, |
| "grad_norm": 0.4653773009777069, |
| "learning_rate": 0.0002517798485730926, |
| "loss": 3.0885, |
| "step": 99750 |
| }, |
| { |
| "epoch": 29.045703306939917, |
| "grad_norm": 0.49100857973098755, |
| "learning_rate": 0.0002516051252184042, |
| "loss": 3.0812, |
| "step": 99800 |
| }, |
| { |
| "epoch": 29.060258500232884, |
| "grad_norm": 0.45538330078125, |
| "learning_rate": 0.0002514304018637158, |
| "loss": 3.0897, |
| "step": 99850 |
| }, |
| { |
| "epoch": 29.07481369352585, |
| "grad_norm": 0.43507033586502075, |
| "learning_rate": 0.00025125567850902737, |
| "loss": 3.091, |
| "step": 99900 |
| }, |
| { |
| "epoch": 29.089368886818818, |
| "grad_norm": 0.4787675142288208, |
| "learning_rate": 0.0002510809551543389, |
| "loss": 3.1075, |
| "step": 99950 |
| }, |
| { |
| "epoch": 29.103924080111785, |
| "grad_norm": 0.447716623544693, |
| "learning_rate": 0.00025090623179965056, |
| "loss": 3.087, |
| "step": 100000 |
| }, |
| { |
| "epoch": 29.103924080111785, |
| "eval_accuracy": 0.3742630009586702, |
| "eval_loss": 3.554548978805542, |
| "eval_runtime": 179.2564, |
| "eval_samples_per_second": 92.889, |
| "eval_steps_per_second": 5.807, |
| "step": 100000 |
| }, |
| { |
| "epoch": 29.103924080111785, |
| "step": 100000, |
| "total_flos": 2.089804004130816e+18, |
| "train_loss": 0.6329161505126953, |
| "train_runtime": 39779.2124, |
| "train_samples_per_second": 345.415, |
| "train_steps_per_second": 4.319 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 171800, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 20, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 20 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.089804004130816e+18, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|