| { |
| "best_metric": 3.4000582695007324, |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M__8397/checkpoint-50000", |
| "epoch": 5.390835579514825, |
| "eval_steps": 1000, |
| "global_step": 50000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005390835579514825, |
| "grad_norm": 1.9401150941848755, |
| "learning_rate": 0.0003, |
| "loss": 8.6351, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.01078167115902965, |
| "grad_norm": 1.2623108625411987, |
| "learning_rate": 0.0006, |
| "loss": 6.8864, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.016172506738544475, |
| "grad_norm": 3.276719808578491, |
| "learning_rate": 0.0005996762007555315, |
| "loss": 6.4636, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.0215633423180593, |
| "grad_norm": 2.706618309020996, |
| "learning_rate": 0.000599352401511063, |
| "loss": 6.2398, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.026954177897574125, |
| "grad_norm": 1.0366517305374146, |
| "learning_rate": 0.0005990286022665946, |
| "loss": 6.0879, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.03234501347708895, |
| "grad_norm": 2.069812297821045, |
| "learning_rate": 0.0005987048030221263, |
| "loss": 5.9796, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.03773584905660377, |
| "grad_norm": 1.1950629949569702, |
| "learning_rate": 0.0005983810037776578, |
| "loss": 5.8688, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.0431266846361186, |
| "grad_norm": 1.0007357597351074, |
| "learning_rate": 0.0005980572045331894, |
| "loss": 5.8233, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.04851752021563342, |
| "grad_norm": 2.0554916858673096, |
| "learning_rate": 0.0005977334052887209, |
| "loss": 5.7452, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.05390835579514825, |
| "grad_norm": 1.996433973312378, |
| "learning_rate": 0.0005974096060442526, |
| "loss": 5.6484, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.05929919137466307, |
| "grad_norm": 1.439592719078064, |
| "learning_rate": 0.0005970858067997841, |
| "loss": 5.5734, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.0646900269541779, |
| "grad_norm": 1.0077263116836548, |
| "learning_rate": 0.0005967620075553157, |
| "loss": 5.4903, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.07008086253369272, |
| "grad_norm": 1.460601568222046, |
| "learning_rate": 0.0005964382083108472, |
| "loss": 5.4273, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.07547169811320754, |
| "grad_norm": 1.5479509830474854, |
| "learning_rate": 0.0005961144090663788, |
| "loss": 5.3803, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.08086253369272237, |
| "grad_norm": 1.2044662237167358, |
| "learning_rate": 0.0005957906098219104, |
| "loss": 5.305, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.0862533692722372, |
| "grad_norm": 1.4456267356872559, |
| "learning_rate": 0.0005954668105774419, |
| "loss": 5.2577, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.09164420485175202, |
| "grad_norm": 1.4306107759475708, |
| "learning_rate": 0.0005951430113329735, |
| "loss": 5.2168, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.09703504043126684, |
| "grad_norm": 1.5723680257797241, |
| "learning_rate": 0.0005948192120885051, |
| "loss": 5.1595, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.10242587601078167, |
| "grad_norm": 1.4030689001083374, |
| "learning_rate": 0.0005944954128440366, |
| "loss": 5.1322, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.1078167115902965, |
| "grad_norm": 0.9765119552612305, |
| "learning_rate": 0.0005941716135995682, |
| "loss": 5.0934, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.1078167115902965, |
| "eval_accuracy": 0.22754229014862737, |
| "eval_loss": 5.022522926330566, |
| "eval_runtime": 185.9242, |
| "eval_samples_per_second": 96.873, |
| "eval_steps_per_second": 6.056, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.11320754716981132, |
| "grad_norm": 0.8278682827949524, |
| "learning_rate": 0.0005938478143550997, |
| "loss": 5.0442, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.11859838274932614, |
| "grad_norm": 1.0204647779464722, |
| "learning_rate": 0.0005935240151106314, |
| "loss": 5.0209, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.12398921832884097, |
| "grad_norm": 1.004250407218933, |
| "learning_rate": 0.0005932002158661629, |
| "loss": 4.9915, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.1293800539083558, |
| "grad_norm": 1.2949087619781494, |
| "learning_rate": 0.0005928764166216945, |
| "loss": 4.9493, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.1347708894878706, |
| "grad_norm": 0.8809250593185425, |
| "learning_rate": 0.000592552617377226, |
| "loss": 4.9039, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.14016172506738545, |
| "grad_norm": 1.0270975828170776, |
| "learning_rate": 0.0005922288181327577, |
| "loss": 4.8792, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.14555256064690028, |
| "grad_norm": 0.9649767279624939, |
| "learning_rate": 0.0005919050188882893, |
| "loss": 4.8622, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.1509433962264151, |
| "grad_norm": 0.9902446866035461, |
| "learning_rate": 0.0005915812196438207, |
| "loss": 4.8481, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.15633423180592992, |
| "grad_norm": 0.7882956862449646, |
| "learning_rate": 0.0005912574203993524, |
| "loss": 4.7968, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.16172506738544473, |
| "grad_norm": 0.8941258788108826, |
| "learning_rate": 0.0005909336211548839, |
| "loss": 4.8149, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.16711590296495957, |
| "grad_norm": 1.3297462463378906, |
| "learning_rate": 0.0005906098219104155, |
| "loss": 4.7656, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.1725067385444744, |
| "grad_norm": 1.497783899307251, |
| "learning_rate": 0.000590286022665947, |
| "loss": 4.7642, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.1778975741239892, |
| "grad_norm": 1.0149500370025635, |
| "learning_rate": 0.0005899622234214787, |
| "loss": 4.7241, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.18328840970350405, |
| "grad_norm": 1.067275047302246, |
| "learning_rate": 0.0005896384241770102, |
| "loss": 4.7115, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.18867924528301888, |
| "grad_norm": 0.7510073781013489, |
| "learning_rate": 0.0005893146249325418, |
| "loss": 4.6734, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.1940700808625337, |
| "grad_norm": 1.0857677459716797, |
| "learning_rate": 0.0005889908256880733, |
| "loss": 4.6802, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.19946091644204852, |
| "grad_norm": 0.9117745161056519, |
| "learning_rate": 0.0005886670264436049, |
| "loss": 4.6481, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.20485175202156333, |
| "grad_norm": 0.9455132484436035, |
| "learning_rate": 0.0005883432271991365, |
| "loss": 4.6385, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.21024258760107817, |
| "grad_norm": 0.8344852924346924, |
| "learning_rate": 0.0005880194279546681, |
| "loss": 4.6113, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.215633423180593, |
| "grad_norm": 1.0357882976531982, |
| "learning_rate": 0.0005876956287101996, |
| "loss": 4.5937, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.215633423180593, |
| "eval_accuracy": 0.27015801715433524, |
| "eval_loss": 4.516484260559082, |
| "eval_runtime": 185.6539, |
| "eval_samples_per_second": 97.014, |
| "eval_steps_per_second": 6.065, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.2210242587601078, |
| "grad_norm": 0.6943921446800232, |
| "learning_rate": 0.0005873718294657312, |
| "loss": 4.5747, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.22641509433962265, |
| "grad_norm": 0.8243718147277832, |
| "learning_rate": 0.0005870480302212628, |
| "loss": 4.5582, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.23180592991913745, |
| "grad_norm": 1.0496394634246826, |
| "learning_rate": 0.0005867242309767943, |
| "loss": 4.5315, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.2371967654986523, |
| "grad_norm": 0.8335784077644348, |
| "learning_rate": 0.0005864004317323259, |
| "loss": 4.5086, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.24258760107816713, |
| "grad_norm": 0.8698639273643494, |
| "learning_rate": 0.0005860766324878575, |
| "loss": 4.4958, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.24797843665768193, |
| "grad_norm": 0.7829259634017944, |
| "learning_rate": 0.000585752833243389, |
| "loss": 4.5067, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.25336927223719674, |
| "grad_norm": 0.8259062767028809, |
| "learning_rate": 0.0005854290339989206, |
| "loss": 4.4569, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.2587601078167116, |
| "grad_norm": 0.8445196151733398, |
| "learning_rate": 0.0005851052347544521, |
| "loss": 4.4611, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.2641509433962264, |
| "grad_norm": 0.8523595929145813, |
| "learning_rate": 0.0005847814355099838, |
| "loss": 4.4402, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.2695417789757412, |
| "grad_norm": 0.8830829858779907, |
| "learning_rate": 0.0005844576362655154, |
| "loss": 4.4235, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.2749326145552561, |
| "grad_norm": 0.8365247845649719, |
| "learning_rate": 0.0005841338370210469, |
| "loss": 4.4101, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.2803234501347709, |
| "grad_norm": 0.853497326374054, |
| "learning_rate": 0.0005838100377765785, |
| "loss": 4.3826, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.2857142857142857, |
| "grad_norm": 0.9241225719451904, |
| "learning_rate": 0.0005834862385321101, |
| "loss": 4.3931, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.29110512129380056, |
| "grad_norm": 0.8265724778175354, |
| "learning_rate": 0.0005831624392876417, |
| "loss": 4.3887, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.29649595687331537, |
| "grad_norm": 0.8722224831581116, |
| "learning_rate": 0.0005828386400431731, |
| "loss": 4.3563, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.3018867924528302, |
| "grad_norm": 0.7775854468345642, |
| "learning_rate": 0.0005825148407987048, |
| "loss": 4.3271, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.30727762803234504, |
| "grad_norm": 0.7281326055526733, |
| "learning_rate": 0.0005821910415542363, |
| "loss": 4.3363, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.31266846361185985, |
| "grad_norm": 0.8523993492126465, |
| "learning_rate": 0.0005818672423097679, |
| "loss": 4.3284, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.31805929919137466, |
| "grad_norm": 0.8152425289154053, |
| "learning_rate": 0.0005815434430652994, |
| "loss": 4.313, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.32345013477088946, |
| "grad_norm": 0.9258395433425903, |
| "learning_rate": 0.0005812196438208311, |
| "loss": 4.3092, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.32345013477088946, |
| "eval_accuracy": 0.29783223363934697, |
| "eval_loss": 4.246051788330078, |
| "eval_runtime": 185.609, |
| "eval_samples_per_second": 97.037, |
| "eval_steps_per_second": 6.067, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.3288409703504043, |
| "grad_norm": 0.78269362449646, |
| "learning_rate": 0.0005808958445763626, |
| "loss": 4.304, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.33423180592991913, |
| "grad_norm": 0.7207179069519043, |
| "learning_rate": 0.0005805720453318942, |
| "loss": 4.3041, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.33962264150943394, |
| "grad_norm": 0.791796088218689, |
| "learning_rate": 0.0005802482460874257, |
| "loss": 4.2633, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.3450134770889488, |
| "grad_norm": 0.6603590250015259, |
| "learning_rate": 0.0005799244468429573, |
| "loss": 4.2613, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.3504043126684636, |
| "grad_norm": 0.922822892665863, |
| "learning_rate": 0.0005796006475984889, |
| "loss": 4.2617, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.3557951482479784, |
| "grad_norm": 0.7605053782463074, |
| "learning_rate": 0.0005792768483540205, |
| "loss": 4.2446, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.3611859838274933, |
| "grad_norm": 0.7679175734519958, |
| "learning_rate": 0.000578953049109552, |
| "loss": 4.26, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.3665768194070081, |
| "grad_norm": 0.7325921654701233, |
| "learning_rate": 0.0005786292498650836, |
| "loss": 4.2337, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.3719676549865229, |
| "grad_norm": 0.6152936816215515, |
| "learning_rate": 0.0005783054506206152, |
| "loss": 4.21, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.37735849056603776, |
| "grad_norm": 0.7148654460906982, |
| "learning_rate": 0.0005779816513761467, |
| "loss": 4.2203, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.38274932614555257, |
| "grad_norm": 0.8136675953865051, |
| "learning_rate": 0.0005776578521316782, |
| "loss": 4.2143, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.3881401617250674, |
| "grad_norm": 0.6423671245574951, |
| "learning_rate": 0.0005773340528872099, |
| "loss": 4.2164, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.3935309973045822, |
| "grad_norm": 0.8761276602745056, |
| "learning_rate": 0.0005770102536427414, |
| "loss": 4.1887, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.39892183288409705, |
| "grad_norm": 0.7816224694252014, |
| "learning_rate": 0.000576686454398273, |
| "loss": 4.1937, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.40431266846361186, |
| "grad_norm": 0.7834933400154114, |
| "learning_rate": 0.0005763626551538045, |
| "loss": 4.1965, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.40970350404312667, |
| "grad_norm": 0.6208221912384033, |
| "learning_rate": 0.0005760388559093362, |
| "loss": 4.1864, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.41509433962264153, |
| "grad_norm": 0.6872389316558838, |
| "learning_rate": 0.0005757150566648678, |
| "loss": 4.1732, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.42048517520215634, |
| "grad_norm": 0.7426034808158875, |
| "learning_rate": 0.0005753912574203993, |
| "loss": 4.1682, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.42587601078167114, |
| "grad_norm": 0.7201637625694275, |
| "learning_rate": 0.0005750674581759309, |
| "loss": 4.157, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.431266846361186, |
| "grad_norm": 0.7274760007858276, |
| "learning_rate": 0.0005747436589314624, |
| "loss": 4.1586, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.431266846361186, |
| "eval_accuracy": 0.3127459290207626, |
| "eval_loss": 4.088518142700195, |
| "eval_runtime": 185.6568, |
| "eval_samples_per_second": 97.012, |
| "eval_steps_per_second": 6.065, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.4366576819407008, |
| "grad_norm": 0.5464039444923401, |
| "learning_rate": 0.0005744198596869941, |
| "loss": 4.149, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.4420485175202156, |
| "grad_norm": 0.7344082593917847, |
| "learning_rate": 0.0005740960604425255, |
| "loss": 4.1532, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.4474393530997305, |
| "grad_norm": 0.6581411957740784, |
| "learning_rate": 0.0005737722611980572, |
| "loss": 4.1547, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.4528301886792453, |
| "grad_norm": 0.6639128923416138, |
| "learning_rate": 0.0005734484619535887, |
| "loss": 4.1367, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.4582210242587601, |
| "grad_norm": 0.734761655330658, |
| "learning_rate": 0.0005731246627091203, |
| "loss": 4.1221, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.4636118598382749, |
| "grad_norm": 0.8860589265823364, |
| "learning_rate": 0.0005728008634646518, |
| "loss": 4.1218, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.46900269541778977, |
| "grad_norm": 0.5995060205459595, |
| "learning_rate": 0.0005724770642201835, |
| "loss": 4.1167, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.4743935309973046, |
| "grad_norm": 0.7229450345039368, |
| "learning_rate": 0.000572153264975715, |
| "loss": 4.1148, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.4797843665768194, |
| "grad_norm": 0.731573224067688, |
| "learning_rate": 0.0005718294657312466, |
| "loss": 4.1018, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.48517520215633425, |
| "grad_norm": 0.7716799974441528, |
| "learning_rate": 0.0005715056664867781, |
| "loss": 4.109, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.49056603773584906, |
| "grad_norm": 0.542386531829834, |
| "learning_rate": 0.0005711818672423097, |
| "loss": 4.1017, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.49595687331536387, |
| "grad_norm": 0.794573187828064, |
| "learning_rate": 0.0005708580679978413, |
| "loss": 4.086, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.5013477088948787, |
| "grad_norm": 0.6180745959281921, |
| "learning_rate": 0.0005705342687533729, |
| "loss": 4.0787, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.5067385444743935, |
| "grad_norm": 0.6277428269386292, |
| "learning_rate": 0.0005702104695089044, |
| "loss": 4.0932, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.5121293800539084, |
| "grad_norm": 0.6428430676460266, |
| "learning_rate": 0.000569886670264436, |
| "loss": 4.0812, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.5175202156334232, |
| "grad_norm": 0.6423416137695312, |
| "learning_rate": 0.0005695628710199675, |
| "loss": 4.0791, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.522911051212938, |
| "grad_norm": 0.6095171570777893, |
| "learning_rate": 0.0005692390717754991, |
| "loss": 4.0655, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.5283018867924528, |
| "grad_norm": 0.6891219615936279, |
| "learning_rate": 0.0005689152725310306, |
| "loss": 4.0527, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.5336927223719676, |
| "grad_norm": 0.6701120734214783, |
| "learning_rate": 0.0005685914732865623, |
| "loss": 4.0714, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.5390835579514824, |
| "grad_norm": 0.5252517461776733, |
| "learning_rate": 0.0005682676740420939, |
| "loss": 4.0492, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5390835579514824, |
| "eval_accuracy": 0.32119879773406573, |
| "eval_loss": 3.9914467334747314, |
| "eval_runtime": 185.5695, |
| "eval_samples_per_second": 97.058, |
| "eval_steps_per_second": 6.068, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5444743935309974, |
| "grad_norm": 0.6187731623649597, |
| "learning_rate": 0.0005679438747976254, |
| "loss": 4.0591, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.5498652291105122, |
| "grad_norm": 0.6102878451347351, |
| "learning_rate": 0.000567620075553157, |
| "loss": 4.0513, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.555256064690027, |
| "grad_norm": 0.708706259727478, |
| "learning_rate": 0.0005672962763086886, |
| "loss": 4.0489, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.5606469002695418, |
| "grad_norm": 0.6461382508277893, |
| "learning_rate": 0.0005669724770642202, |
| "loss": 4.0661, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.5660377358490566, |
| "grad_norm": 0.7211845517158508, |
| "learning_rate": 0.0005666486778197517, |
| "loss": 4.0418, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.5714285714285714, |
| "grad_norm": 0.7692673802375793, |
| "learning_rate": 0.0005663248785752833, |
| "loss": 4.0325, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.5768194070080862, |
| "grad_norm": 0.5925557613372803, |
| "learning_rate": 0.0005660010793308148, |
| "loss": 4.0266, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.5822102425876011, |
| "grad_norm": 0.5735204815864563, |
| "learning_rate": 0.0005656772800863465, |
| "loss": 4.0372, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.5876010781671159, |
| "grad_norm": 0.5482949018478394, |
| "learning_rate": 0.0005653534808418779, |
| "loss": 4.036, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.5929919137466307, |
| "grad_norm": 0.7778071165084839, |
| "learning_rate": 0.0005650296815974096, |
| "loss": 4.0203, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.5983827493261455, |
| "grad_norm": 0.603145956993103, |
| "learning_rate": 0.0005647058823529411, |
| "loss": 4.009, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.6037735849056604, |
| "grad_norm": 0.5814369916915894, |
| "learning_rate": 0.0005643820831084727, |
| "loss": 4.0256, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.6091644204851752, |
| "grad_norm": 0.6685661673545837, |
| "learning_rate": 0.0005640582838640042, |
| "loss": 4.0055, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.6145552560646901, |
| "grad_norm": 0.5345515608787537, |
| "learning_rate": 0.0005637344846195358, |
| "loss": 4.0021, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.6199460916442049, |
| "grad_norm": 0.6599013209342957, |
| "learning_rate": 0.0005634106853750674, |
| "loss": 3.9773, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.6253369272237197, |
| "grad_norm": 0.6814241409301758, |
| "learning_rate": 0.000563086886130599, |
| "loss": 3.985, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.6307277628032345, |
| "grad_norm": 0.6166688203811646, |
| "learning_rate": 0.0005627630868861305, |
| "loss": 3.9948, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.6361185983827493, |
| "grad_norm": 0.6375031471252441, |
| "learning_rate": 0.0005624392876416621, |
| "loss": 3.995, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.6415094339622641, |
| "grad_norm": 0.6021779179573059, |
| "learning_rate": 0.0005621154883971937, |
| "loss": 3.9838, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.6469002695417789, |
| "grad_norm": 0.6350939273834229, |
| "learning_rate": 0.0005617916891527253, |
| "loss": 3.9962, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.6469002695417789, |
| "eval_accuracy": 0.3278974655736041, |
| "eval_loss": 3.919321060180664, |
| "eval_runtime": 185.6272, |
| "eval_samples_per_second": 97.028, |
| "eval_steps_per_second": 6.066, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.6522911051212938, |
| "grad_norm": 0.6063998937606812, |
| "learning_rate": 0.0005614678899082568, |
| "loss": 3.9623, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.6576819407008087, |
| "grad_norm": 0.6236124634742737, |
| "learning_rate": 0.0005611440906637884, |
| "loss": 3.9998, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.6630727762803235, |
| "grad_norm": 0.7479600310325623, |
| "learning_rate": 0.0005608267674042094, |
| "loss": 3.9792, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.6684636118598383, |
| "grad_norm": 0.5813738703727722, |
| "learning_rate": 0.0005605029681597409, |
| "loss": 3.953, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.6738544474393531, |
| "grad_norm": 0.5313798785209656, |
| "learning_rate": 0.0005601791689152725, |
| "loss": 3.972, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.6792452830188679, |
| "grad_norm": 0.6238154768943787, |
| "learning_rate": 0.000559855369670804, |
| "loss": 3.9769, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.6846361185983828, |
| "grad_norm": 0.5220109224319458, |
| "learning_rate": 0.0005595315704263357, |
| "loss": 3.9613, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.6900269541778976, |
| "grad_norm": 0.673324704170227, |
| "learning_rate": 0.0005592077711818672, |
| "loss": 3.9706, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.6954177897574124, |
| "grad_norm": 0.6563860177993774, |
| "learning_rate": 0.0005588839719373988, |
| "loss": 3.969, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.7008086253369272, |
| "grad_norm": 0.6665467023849487, |
| "learning_rate": 0.0005585601726929303, |
| "loss": 3.9606, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.706199460916442, |
| "grad_norm": 0.5975165367126465, |
| "learning_rate": 0.0005582363734484619, |
| "loss": 3.959, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.7115902964959568, |
| "grad_norm": 0.6370920538902283, |
| "learning_rate": 0.0005579125742039935, |
| "loss": 3.9606, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.7169811320754716, |
| "grad_norm": 0.6234825849533081, |
| "learning_rate": 0.0005575887749595251, |
| "loss": 3.97, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.7223719676549866, |
| "grad_norm": 0.6267138123512268, |
| "learning_rate": 0.0005572649757150566, |
| "loss": 3.9344, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.7277628032345014, |
| "grad_norm": 0.5611273050308228, |
| "learning_rate": 0.0005569411764705882, |
| "loss": 3.9631, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.7331536388140162, |
| "grad_norm": 0.6250568628311157, |
| "learning_rate": 0.0005566173772261198, |
| "loss": 3.9344, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.738544474393531, |
| "grad_norm": 0.5902653932571411, |
| "learning_rate": 0.0005562935779816513, |
| "loss": 3.9345, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.7439353099730458, |
| "grad_norm": 0.5684419274330139, |
| "learning_rate": 0.0005559697787371828, |
| "loss": 3.9367, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.7493261455525606, |
| "grad_norm": 0.5950194001197815, |
| "learning_rate": 0.0005556459794927145, |
| "loss": 3.9418, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.7547169811320755, |
| "grad_norm": 0.6283013820648193, |
| "learning_rate": 0.000555322180248246, |
| "loss": 3.9219, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.7547169811320755, |
| "eval_accuracy": 0.33346299244612526, |
| "eval_loss": 3.8631539344787598, |
| "eval_runtime": 185.3774, |
| "eval_samples_per_second": 97.159, |
| "eval_steps_per_second": 6.074, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.7601078167115903, |
| "grad_norm": 0.6419661641120911, |
| "learning_rate": 0.0005549983810037776, |
| "loss": 3.9341, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.7654986522911051, |
| "grad_norm": 0.8758557438850403, |
| "learning_rate": 0.0005546745817593091, |
| "loss": 3.9093, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.77088948787062, |
| "grad_norm": 0.6024784445762634, |
| "learning_rate": 0.0005543507825148408, |
| "loss": 3.9347, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.7762803234501348, |
| "grad_norm": 0.5663654208183289, |
| "learning_rate": 0.0005540269832703723, |
| "loss": 3.9072, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.7816711590296496, |
| "grad_norm": 0.6945359110832214, |
| "learning_rate": 0.0005537031840259039, |
| "loss": 3.899, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.7870619946091644, |
| "grad_norm": 0.5187634229660034, |
| "learning_rate": 0.0005533793847814354, |
| "loss": 3.9124, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.7924528301886793, |
| "grad_norm": 0.6671504378318787, |
| "learning_rate": 0.000553055585536967, |
| "loss": 3.9133, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.7978436657681941, |
| "grad_norm": 0.5317527055740356, |
| "learning_rate": 0.0005527317862924987, |
| "loss": 3.9103, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.8032345013477089, |
| "grad_norm": 0.567512571811676, |
| "learning_rate": 0.0005524079870480301, |
| "loss": 3.8879, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.8086253369272237, |
| "grad_norm": 0.5598046183586121, |
| "learning_rate": 0.0005520841878035618, |
| "loss": 3.9029, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.8140161725067385, |
| "grad_norm": 0.6032840013504028, |
| "learning_rate": 0.0005517603885590933, |
| "loss": 3.8964, |
| "step": 7550 |
| }, |
| { |
| "epoch": 0.8194070080862533, |
| "grad_norm": 0.5689701437950134, |
| "learning_rate": 0.0005514365893146249, |
| "loss": 3.8911, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.8247978436657682, |
| "grad_norm": 0.5858896374702454, |
| "learning_rate": 0.0005511127900701564, |
| "loss": 3.8974, |
| "step": 7650 |
| }, |
| { |
| "epoch": 0.8301886792452831, |
| "grad_norm": 0.6379143595695496, |
| "learning_rate": 0.000550788990825688, |
| "loss": 3.8913, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.8355795148247979, |
| "grad_norm": 0.6346825361251831, |
| "learning_rate": 0.0005504651915812196, |
| "loss": 3.8938, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.8409703504043127, |
| "grad_norm": 0.651748776435852, |
| "learning_rate": 0.0005501413923367512, |
| "loss": 3.8768, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.8463611859838275, |
| "grad_norm": 0.5529057383537292, |
| "learning_rate": 0.0005498175930922827, |
| "loss": 3.8814, |
| "step": 7850 |
| }, |
| { |
| "epoch": 0.8517520215633423, |
| "grad_norm": 0.5781732797622681, |
| "learning_rate": 0.0005494937938478143, |
| "loss": 3.8778, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.8571428571428571, |
| "grad_norm": 0.599770188331604, |
| "learning_rate": 0.0005491699946033459, |
| "loss": 3.9077, |
| "step": 7950 |
| }, |
| { |
| "epoch": 0.862533692722372, |
| "grad_norm": 0.6126531958580017, |
| "learning_rate": 0.0005488461953588775, |
| "loss": 3.8566, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.862533692722372, |
| "eval_accuracy": 0.3376529737589151, |
| "eval_loss": 3.815525531768799, |
| "eval_runtime": 186.958, |
| "eval_samples_per_second": 96.337, |
| "eval_steps_per_second": 6.023, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.8679245283018868, |
| "grad_norm": 0.6914803981781006, |
| "learning_rate": 0.000548522396114409, |
| "loss": 3.8954, |
| "step": 8050 |
| }, |
| { |
| "epoch": 0.8733153638814016, |
| "grad_norm": 0.6908184289932251, |
| "learning_rate": 0.0005481985968699406, |
| "loss": 3.8747, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.8787061994609164, |
| "grad_norm": 0.5247132778167725, |
| "learning_rate": 0.0005478747976254721, |
| "loss": 3.8785, |
| "step": 8150 |
| }, |
| { |
| "epoch": 0.8840970350404312, |
| "grad_norm": 0.585969090461731, |
| "learning_rate": 0.0005475509983810037, |
| "loss": 3.8734, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.889487870619946, |
| "grad_norm": 0.5993847250938416, |
| "learning_rate": 0.0005472271991365352, |
| "loss": 3.8685, |
| "step": 8250 |
| }, |
| { |
| "epoch": 0.894878706199461, |
| "grad_norm": 0.6115968227386475, |
| "learning_rate": 0.0005469033998920669, |
| "loss": 3.8867, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.9002695417789758, |
| "grad_norm": 0.5844521522521973, |
| "learning_rate": 0.0005465860766324878, |
| "loss": 3.8566, |
| "step": 8350 |
| }, |
| { |
| "epoch": 0.9056603773584906, |
| "grad_norm": 0.5750303864479065, |
| "learning_rate": 0.0005462622773880194, |
| "loss": 3.8639, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.9110512129380054, |
| "grad_norm": 0.5799904465675354, |
| "learning_rate": 0.000545938478143551, |
| "loss": 3.859, |
| "step": 8450 |
| }, |
| { |
| "epoch": 0.9164420485175202, |
| "grad_norm": 0.6240545511245728, |
| "learning_rate": 0.0005456146788990825, |
| "loss": 3.8648, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.921832884097035, |
| "grad_norm": 0.5578630566596985, |
| "learning_rate": 0.000545290879654614, |
| "loss": 3.8597, |
| "step": 8550 |
| }, |
| { |
| "epoch": 0.9272237196765498, |
| "grad_norm": 0.5732514262199402, |
| "learning_rate": 0.0005449670804101457, |
| "loss": 3.8652, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.9326145552560647, |
| "grad_norm": 0.614829957485199, |
| "learning_rate": 0.0005446432811656773, |
| "loss": 3.845, |
| "step": 8650 |
| }, |
| { |
| "epoch": 0.9380053908355795, |
| "grad_norm": 0.6199792623519897, |
| "learning_rate": 0.0005443194819212088, |
| "loss": 3.8484, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.9433962264150944, |
| "grad_norm": 0.5770998597145081, |
| "learning_rate": 0.0005439956826767404, |
| "loss": 3.8456, |
| "step": 8750 |
| }, |
| { |
| "epoch": 0.9487870619946092, |
| "grad_norm": 0.5756552815437317, |
| "learning_rate": 0.000543671883432272, |
| "loss": 3.8492, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.954177897574124, |
| "grad_norm": 0.5448219180107117, |
| "learning_rate": 0.0005433480841878035, |
| "loss": 3.8518, |
| "step": 8850 |
| }, |
| { |
| "epoch": 0.9595687331536388, |
| "grad_norm": 0.5587062239646912, |
| "learning_rate": 0.000543024284943335, |
| "loss": 3.8389, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.9649595687331537, |
| "grad_norm": 0.5609190464019775, |
| "learning_rate": 0.0005427004856988667, |
| "loss": 3.8303, |
| "step": 8950 |
| }, |
| { |
| "epoch": 0.9703504043126685, |
| "grad_norm": 0.6230307817459106, |
| "learning_rate": 0.0005423766864543982, |
| "loss": 3.856, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.9703504043126685, |
| "eval_accuracy": 0.34079347682830485, |
| "eval_loss": 3.7813315391540527, |
| "eval_runtime": 186.6989, |
| "eval_samples_per_second": 96.471, |
| "eval_steps_per_second": 6.031, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.9757412398921833, |
| "grad_norm": 0.6406134963035583, |
| "learning_rate": 0.0005420528872099298, |
| "loss": 3.871, |
| "step": 9050 |
| }, |
| { |
| "epoch": 0.9811320754716981, |
| "grad_norm": 0.49706608057022095, |
| "learning_rate": 0.0005417290879654613, |
| "loss": 3.8273, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.9865229110512129, |
| "grad_norm": 0.5765336751937866, |
| "learning_rate": 0.000541405288720993, |
| "loss": 3.8418, |
| "step": 9150 |
| }, |
| { |
| "epoch": 0.9919137466307277, |
| "grad_norm": 0.5545505881309509, |
| "learning_rate": 0.0005410814894765245, |
| "loss": 3.8285, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.9973045822102425, |
| "grad_norm": 0.5445045232772827, |
| "learning_rate": 0.0005407576902320561, |
| "loss": 3.8485, |
| "step": 9250 |
| }, |
| { |
| "epoch": 1.0026954177897573, |
| "grad_norm": 0.5650607347488403, |
| "learning_rate": 0.0005404338909875876, |
| "loss": 3.7963, |
| "step": 9300 |
| }, |
| { |
| "epoch": 1.0080862533692723, |
| "grad_norm": 0.5752312541007996, |
| "learning_rate": 0.0005401100917431192, |
| "loss": 3.7605, |
| "step": 9350 |
| }, |
| { |
| "epoch": 1.013477088948787, |
| "grad_norm": 0.5649228096008301, |
| "learning_rate": 0.0005397862924986508, |
| "loss": 3.7564, |
| "step": 9400 |
| }, |
| { |
| "epoch": 1.0188679245283019, |
| "grad_norm": 0.5715786218643188, |
| "learning_rate": 0.0005394624932541824, |
| "loss": 3.7554, |
| "step": 9450 |
| }, |
| { |
| "epoch": 1.0242587601078168, |
| "grad_norm": 0.5207133889198303, |
| "learning_rate": 0.0005391386940097139, |
| "loss": 3.7592, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.0296495956873315, |
| "grad_norm": 0.5603737235069275, |
| "learning_rate": 0.0005388148947652455, |
| "loss": 3.7704, |
| "step": 9550 |
| }, |
| { |
| "epoch": 1.0350404312668464, |
| "grad_norm": 0.6097581386566162, |
| "learning_rate": 0.000538491095520777, |
| "loss": 3.7686, |
| "step": 9600 |
| }, |
| { |
| "epoch": 1.0404312668463611, |
| "grad_norm": 0.6030071377754211, |
| "learning_rate": 0.0005381672962763086, |
| "loss": 3.7726, |
| "step": 9650 |
| }, |
| { |
| "epoch": 1.045822102425876, |
| "grad_norm": 0.546688437461853, |
| "learning_rate": 0.0005378434970318403, |
| "loss": 3.7567, |
| "step": 9700 |
| }, |
| { |
| "epoch": 1.0512129380053907, |
| "grad_norm": 0.5595793724060059, |
| "learning_rate": 0.0005375196977873718, |
| "loss": 3.755, |
| "step": 9750 |
| }, |
| { |
| "epoch": 1.0566037735849056, |
| "grad_norm": 0.5992977619171143, |
| "learning_rate": 0.0005371958985429034, |
| "loss": 3.7821, |
| "step": 9800 |
| }, |
| { |
| "epoch": 1.0619946091644206, |
| "grad_norm": 0.5221476554870605, |
| "learning_rate": 0.0005368720992984349, |
| "loss": 3.7845, |
| "step": 9850 |
| }, |
| { |
| "epoch": 1.0673854447439353, |
| "grad_norm": 0.5831781029701233, |
| "learning_rate": 0.0005365483000539665, |
| "loss": 3.772, |
| "step": 9900 |
| }, |
| { |
| "epoch": 1.0727762803234502, |
| "grad_norm": 0.5980172753334045, |
| "learning_rate": 0.0005362245008094981, |
| "loss": 3.7854, |
| "step": 9950 |
| }, |
| { |
| "epoch": 1.0781671159029649, |
| "grad_norm": 0.5590375661849976, |
| "learning_rate": 0.0005359007015650297, |
| "loss": 3.7649, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.0781671159029649, |
| "eval_accuracy": 0.34435424926776104, |
| "eval_loss": 3.7518138885498047, |
| "eval_runtime": 186.0381, |
| "eval_samples_per_second": 96.814, |
| "eval_steps_per_second": 6.053, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.0835579514824798, |
| "grad_norm": 0.5709946751594543, |
| "learning_rate": 0.0005355769023205612, |
| "loss": 3.7581, |
| "step": 10050 |
| }, |
| { |
| "epoch": 1.0889487870619945, |
| "grad_norm": 0.618277370929718, |
| "learning_rate": 0.0005352531030760928, |
| "loss": 3.7592, |
| "step": 10100 |
| }, |
| { |
| "epoch": 1.0943396226415094, |
| "grad_norm": 0.6374980211257935, |
| "learning_rate": 0.0005349293038316244, |
| "loss": 3.7555, |
| "step": 10150 |
| }, |
| { |
| "epoch": 1.0997304582210243, |
| "grad_norm": 0.5958013534545898, |
| "learning_rate": 0.0005346055045871559, |
| "loss": 3.7666, |
| "step": 10200 |
| }, |
| { |
| "epoch": 1.105121293800539, |
| "grad_norm": 0.6115665435791016, |
| "learning_rate": 0.0005342817053426874, |
| "loss": 3.7524, |
| "step": 10250 |
| }, |
| { |
| "epoch": 1.110512129380054, |
| "grad_norm": 0.585763692855835, |
| "learning_rate": 0.0005339579060982191, |
| "loss": 3.764, |
| "step": 10300 |
| }, |
| { |
| "epoch": 1.1159029649595686, |
| "grad_norm": 0.653211772441864, |
| "learning_rate": 0.0005336341068537506, |
| "loss": 3.7568, |
| "step": 10350 |
| }, |
| { |
| "epoch": 1.1212938005390836, |
| "grad_norm": 0.5737949013710022, |
| "learning_rate": 0.0005333103076092822, |
| "loss": 3.7638, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.1266846361185983, |
| "grad_norm": 0.5587084293365479, |
| "learning_rate": 0.0005329865083648137, |
| "loss": 3.7609, |
| "step": 10450 |
| }, |
| { |
| "epoch": 1.1320754716981132, |
| "grad_norm": 0.6286759972572327, |
| "learning_rate": 0.0005326627091203454, |
| "loss": 3.7635, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.137466307277628, |
| "grad_norm": 0.5914649367332458, |
| "learning_rate": 0.0005323453858607662, |
| "loss": 3.7659, |
| "step": 10550 |
| }, |
| { |
| "epoch": 1.1428571428571428, |
| "grad_norm": 0.574865460395813, |
| "learning_rate": 0.0005320215866162979, |
| "loss": 3.7552, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.1482479784366577, |
| "grad_norm": 0.570719838142395, |
| "learning_rate": 0.0005316977873718294, |
| "loss": 3.7441, |
| "step": 10650 |
| }, |
| { |
| "epoch": 1.1536388140161726, |
| "grad_norm": 0.5437813997268677, |
| "learning_rate": 0.000531373988127361, |
| "loss": 3.7548, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.1590296495956873, |
| "grad_norm": 0.575222909450531, |
| "learning_rate": 0.0005310501888828925, |
| "loss": 3.7359, |
| "step": 10750 |
| }, |
| { |
| "epoch": 1.1644204851752022, |
| "grad_norm": 0.5911815762519836, |
| "learning_rate": 0.0005307263896384242, |
| "loss": 3.7398, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.169811320754717, |
| "grad_norm": 0.5724892020225525, |
| "learning_rate": 0.0005304025903939556, |
| "loss": 3.7319, |
| "step": 10850 |
| }, |
| { |
| "epoch": 1.1752021563342319, |
| "grad_norm": 0.6011896729469299, |
| "learning_rate": 0.0005300787911494873, |
| "loss": 3.7465, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.1805929919137466, |
| "grad_norm": 0.5386224985122681, |
| "learning_rate": 0.0005297549919050189, |
| "loss": 3.7593, |
| "step": 10950 |
| }, |
| { |
| "epoch": 1.1859838274932615, |
| "grad_norm": 0.5424002408981323, |
| "learning_rate": 0.0005294311926605504, |
| "loss": 3.769, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.1859838274932615, |
| "eval_accuracy": 0.3473514391781235, |
| "eval_loss": 3.723193883895874, |
| "eval_runtime": 186.4484, |
| "eval_samples_per_second": 96.6, |
| "eval_steps_per_second": 6.039, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.1913746630727764, |
| "grad_norm": 0.5702701210975647, |
| "learning_rate": 0.000529107393416082, |
| "loss": 3.7567, |
| "step": 11050 |
| }, |
| { |
| "epoch": 1.196765498652291, |
| "grad_norm": 0.5534523129463196, |
| "learning_rate": 0.0005287835941716135, |
| "loss": 3.7412, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.202156334231806, |
| "grad_norm": 0.5500940084457397, |
| "learning_rate": 0.0005284597949271452, |
| "loss": 3.7407, |
| "step": 11150 |
| }, |
| { |
| "epoch": 1.2075471698113207, |
| "grad_norm": 0.5379878878593445, |
| "learning_rate": 0.0005281359956826767, |
| "loss": 3.7631, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.2129380053908356, |
| "grad_norm": 0.5732437372207642, |
| "learning_rate": 0.0005278121964382083, |
| "loss": 3.7498, |
| "step": 11250 |
| }, |
| { |
| "epoch": 1.2183288409703503, |
| "grad_norm": 0.5423194766044617, |
| "learning_rate": 0.0005274883971937398, |
| "loss": 3.7371, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.2237196765498652, |
| "grad_norm": 0.542545735836029, |
| "learning_rate": 0.0005271645979492714, |
| "loss": 3.7301, |
| "step": 11350 |
| }, |
| { |
| "epoch": 1.2291105121293802, |
| "grad_norm": 0.8030261397361755, |
| "learning_rate": 0.0005268472746896923, |
| "loss": 3.7452, |
| "step": 11400 |
| }, |
| { |
| "epoch": 1.2345013477088949, |
| "grad_norm": 0.5232232809066772, |
| "learning_rate": 0.000526523475445224, |
| "loss": 3.755, |
| "step": 11450 |
| }, |
| { |
| "epoch": 1.2398921832884098, |
| "grad_norm": 0.5808193683624268, |
| "learning_rate": 0.0005261996762007554, |
| "loss": 3.7452, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.2452830188679245, |
| "grad_norm": 0.5781920552253723, |
| "learning_rate": 0.0005258758769562871, |
| "loss": 3.7392, |
| "step": 11550 |
| }, |
| { |
| "epoch": 1.2506738544474394, |
| "grad_norm": 0.5637895464897156, |
| "learning_rate": 0.0005255520777118186, |
| "loss": 3.7512, |
| "step": 11600 |
| }, |
| { |
| "epoch": 1.256064690026954, |
| "grad_norm": 0.5855022668838501, |
| "learning_rate": 0.0005252282784673502, |
| "loss": 3.7318, |
| "step": 11650 |
| }, |
| { |
| "epoch": 1.261455525606469, |
| "grad_norm": 0.5060122013092041, |
| "learning_rate": 0.0005249044792228817, |
| "loss": 3.7424, |
| "step": 11700 |
| }, |
| { |
| "epoch": 1.266846361185984, |
| "grad_norm": 0.5578184723854065, |
| "learning_rate": 0.0005245806799784133, |
| "loss": 3.7421, |
| "step": 11750 |
| }, |
| { |
| "epoch": 1.2722371967654986, |
| "grad_norm": 0.5692754983901978, |
| "learning_rate": 0.0005242568807339449, |
| "loss": 3.7541, |
| "step": 11800 |
| }, |
| { |
| "epoch": 1.2776280323450135, |
| "grad_norm": 0.5293059349060059, |
| "learning_rate": 0.0005239330814894765, |
| "loss": 3.7389, |
| "step": 11850 |
| }, |
| { |
| "epoch": 1.2830188679245282, |
| "grad_norm": 0.5823394060134888, |
| "learning_rate": 0.000523609282245008, |
| "loss": 3.7336, |
| "step": 11900 |
| }, |
| { |
| "epoch": 1.2884097035040432, |
| "grad_norm": 0.5567950010299683, |
| "learning_rate": 0.0005232854830005396, |
| "loss": 3.7176, |
| "step": 11950 |
| }, |
| { |
| "epoch": 1.2938005390835579, |
| "grad_norm": 0.5795886516571045, |
| "learning_rate": 0.0005229616837560712, |
| "loss": 3.7288, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.2938005390835579, |
| "eval_accuracy": 0.34931164592718456, |
| "eval_loss": 3.697058916091919, |
| "eval_runtime": 186.8168, |
| "eval_samples_per_second": 96.41, |
| "eval_steps_per_second": 6.027, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.2991913746630728, |
| "grad_norm": 0.6153177618980408, |
| "learning_rate": 0.0005226378845116028, |
| "loss": 3.7453, |
| "step": 12050 |
| }, |
| { |
| "epoch": 1.3045822102425877, |
| "grad_norm": 0.6005005836486816, |
| "learning_rate": 0.0005223140852671344, |
| "loss": 3.7398, |
| "step": 12100 |
| }, |
| { |
| "epoch": 1.3099730458221024, |
| "grad_norm": 0.5839811563491821, |
| "learning_rate": 0.0005219902860226659, |
| "loss": 3.7295, |
| "step": 12150 |
| }, |
| { |
| "epoch": 1.3153638814016173, |
| "grad_norm": 0.54124516248703, |
| "learning_rate": 0.0005216664867781975, |
| "loss": 3.7451, |
| "step": 12200 |
| }, |
| { |
| "epoch": 1.320754716981132, |
| "grad_norm": 0.5872597098350525, |
| "learning_rate": 0.000521342687533729, |
| "loss": 3.7195, |
| "step": 12250 |
| }, |
| { |
| "epoch": 1.326145552560647, |
| "grad_norm": 0.6764320731163025, |
| "learning_rate": 0.0005210188882892606, |
| "loss": 3.7273, |
| "step": 12300 |
| }, |
| { |
| "epoch": 1.3315363881401616, |
| "grad_norm": 0.5127867460250854, |
| "learning_rate": 0.0005206950890447922, |
| "loss": 3.7354, |
| "step": 12350 |
| }, |
| { |
| "epoch": 1.3369272237196765, |
| "grad_norm": 0.5766441822052002, |
| "learning_rate": 0.0005203712898003238, |
| "loss": 3.7329, |
| "step": 12400 |
| }, |
| { |
| "epoch": 1.3423180592991915, |
| "grad_norm": 0.5768564343452454, |
| "learning_rate": 0.0005200474905558553, |
| "loss": 3.7196, |
| "step": 12450 |
| }, |
| { |
| "epoch": 1.3477088948787062, |
| "grad_norm": 0.5573644638061523, |
| "learning_rate": 0.0005197236913113869, |
| "loss": 3.7398, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.353099730458221, |
| "grad_norm": 0.519527018070221, |
| "learning_rate": 0.0005193998920669184, |
| "loss": 3.7141, |
| "step": 12550 |
| }, |
| { |
| "epoch": 1.3584905660377358, |
| "grad_norm": 0.6151216626167297, |
| "learning_rate": 0.0005190760928224501, |
| "loss": 3.7121, |
| "step": 12600 |
| }, |
| { |
| "epoch": 1.3638814016172507, |
| "grad_norm": 0.6031010746955872, |
| "learning_rate": 0.0005187522935779816, |
| "loss": 3.7035, |
| "step": 12650 |
| }, |
| { |
| "epoch": 1.3692722371967654, |
| "grad_norm": 0.534813642501831, |
| "learning_rate": 0.0005184284943335132, |
| "loss": 3.7193, |
| "step": 12700 |
| }, |
| { |
| "epoch": 1.3746630727762803, |
| "grad_norm": 0.5405371785163879, |
| "learning_rate": 0.0005181046950890447, |
| "loss": 3.7074, |
| "step": 12750 |
| }, |
| { |
| "epoch": 1.3800539083557952, |
| "grad_norm": 0.537695050239563, |
| "learning_rate": 0.0005177808958445764, |
| "loss": 3.7067, |
| "step": 12800 |
| }, |
| { |
| "epoch": 1.38544474393531, |
| "grad_norm": 0.6358603239059448, |
| "learning_rate": 0.0005174570966001078, |
| "loss": 3.7174, |
| "step": 12850 |
| }, |
| { |
| "epoch": 1.3908355795148248, |
| "grad_norm": 0.6034652590751648, |
| "learning_rate": 0.0005171332973556395, |
| "loss": 3.7168, |
| "step": 12900 |
| }, |
| { |
| "epoch": 1.3962264150943398, |
| "grad_norm": 0.553503692150116, |
| "learning_rate": 0.000516809498111171, |
| "loss": 3.7269, |
| "step": 12950 |
| }, |
| { |
| "epoch": 1.4016172506738545, |
| "grad_norm": 0.5765125155448914, |
| "learning_rate": 0.0005164856988667026, |
| "loss": 3.7086, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.4016172506738545, |
| "eval_accuracy": 0.35153598784652257, |
| "eval_loss": 3.674765110015869, |
| "eval_runtime": 186.3343, |
| "eval_samples_per_second": 96.66, |
| "eval_steps_per_second": 6.043, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.4070080862533692, |
| "grad_norm": 0.6072783470153809, |
| "learning_rate": 0.0005161618996222341, |
| "loss": 3.7063, |
| "step": 13050 |
| }, |
| { |
| "epoch": 1.412398921832884, |
| "grad_norm": 0.5731549859046936, |
| "learning_rate": 0.0005158381003777657, |
| "loss": 3.6997, |
| "step": 13100 |
| }, |
| { |
| "epoch": 1.417789757412399, |
| "grad_norm": 0.5856156945228577, |
| "learning_rate": 0.0005155143011332973, |
| "loss": 3.7057, |
| "step": 13150 |
| }, |
| { |
| "epoch": 1.4231805929919137, |
| "grad_norm": 0.566673755645752, |
| "learning_rate": 0.0005151905018888289, |
| "loss": 3.7061, |
| "step": 13200 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 0.527152419090271, |
| "learning_rate": 0.0005148667026443604, |
| "loss": 3.7152, |
| "step": 13250 |
| }, |
| { |
| "epoch": 1.4339622641509435, |
| "grad_norm": 0.5248644948005676, |
| "learning_rate": 0.000514542903399892, |
| "loss": 3.6882, |
| "step": 13300 |
| }, |
| { |
| "epoch": 1.4393530997304582, |
| "grad_norm": 0.5552476048469543, |
| "learning_rate": 0.0005142191041554237, |
| "loss": 3.708, |
| "step": 13350 |
| }, |
| { |
| "epoch": 1.444743935309973, |
| "grad_norm": 0.5420221090316772, |
| "learning_rate": 0.0005138953049109552, |
| "loss": 3.723, |
| "step": 13400 |
| }, |
| { |
| "epoch": 1.4501347708894878, |
| "grad_norm": 0.546251654624939, |
| "learning_rate": 0.0005135715056664868, |
| "loss": 3.7274, |
| "step": 13450 |
| }, |
| { |
| "epoch": 1.4555256064690028, |
| "grad_norm": 0.656278669834137, |
| "learning_rate": 0.0005132477064220183, |
| "loss": 3.719, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.4609164420485174, |
| "grad_norm": 0.5882102847099304, |
| "learning_rate": 0.0005129239071775499, |
| "loss": 3.7077, |
| "step": 13550 |
| }, |
| { |
| "epoch": 1.4663072776280324, |
| "grad_norm": 0.5904348492622375, |
| "learning_rate": 0.0005126001079330814, |
| "loss": 3.7233, |
| "step": 13600 |
| }, |
| { |
| "epoch": 1.4716981132075473, |
| "grad_norm": 0.5616297721862793, |
| "learning_rate": 0.000512276308688613, |
| "loss": 3.7126, |
| "step": 13650 |
| }, |
| { |
| "epoch": 1.477088948787062, |
| "grad_norm": 0.588398277759552, |
| "learning_rate": 0.0005119525094441446, |
| "loss": 3.6976, |
| "step": 13700 |
| }, |
| { |
| "epoch": 1.482479784366577, |
| "grad_norm": 0.6392883062362671, |
| "learning_rate": 0.0005116287101996762, |
| "loss": 3.701, |
| "step": 13750 |
| }, |
| { |
| "epoch": 1.4878706199460916, |
| "grad_norm": 0.5546556711196899, |
| "learning_rate": 0.0005113049109552077, |
| "loss": 3.7046, |
| "step": 13800 |
| }, |
| { |
| "epoch": 1.4932614555256065, |
| "grad_norm": 0.5846448540687561, |
| "learning_rate": 0.0005109811117107393, |
| "loss": 3.7087, |
| "step": 13850 |
| }, |
| { |
| "epoch": 1.4986522911051212, |
| "grad_norm": 0.6359493732452393, |
| "learning_rate": 0.0005106573124662708, |
| "loss": 3.6907, |
| "step": 13900 |
| }, |
| { |
| "epoch": 1.5040431266846361, |
| "grad_norm": 0.5580483078956604, |
| "learning_rate": 0.0005103335132218025, |
| "loss": 3.688, |
| "step": 13950 |
| }, |
| { |
| "epoch": 1.509433962264151, |
| "grad_norm": 0.5794188976287842, |
| "learning_rate": 0.000510009713977334, |
| "loss": 3.703, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.509433962264151, |
| "eval_accuracy": 0.35349782438890087, |
| "eval_loss": 3.6583781242370605, |
| "eval_runtime": 185.6312, |
| "eval_samples_per_second": 97.026, |
| "eval_steps_per_second": 6.066, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.5148247978436657, |
| "grad_norm": 0.5856167078018188, |
| "learning_rate": 0.0005096859147328656, |
| "loss": 3.6999, |
| "step": 14050 |
| }, |
| { |
| "epoch": 1.5202156334231804, |
| "grad_norm": 0.6054800152778625, |
| "learning_rate": 0.0005093621154883971, |
| "loss": 3.6984, |
| "step": 14100 |
| }, |
| { |
| "epoch": 1.5256064690026954, |
| "grad_norm": 0.5391426086425781, |
| "learning_rate": 0.0005090383162439288, |
| "loss": 3.695, |
| "step": 14150 |
| }, |
| { |
| "epoch": 1.5309973045822103, |
| "grad_norm": 0.5351890921592712, |
| "learning_rate": 0.0005087145169994602, |
| "loss": 3.701, |
| "step": 14200 |
| }, |
| { |
| "epoch": 1.536388140161725, |
| "grad_norm": 0.5853469371795654, |
| "learning_rate": 0.0005083907177549918, |
| "loss": 3.7001, |
| "step": 14250 |
| }, |
| { |
| "epoch": 1.54177897574124, |
| "grad_norm": 0.5472508668899536, |
| "learning_rate": 0.0005080669185105234, |
| "loss": 3.697, |
| "step": 14300 |
| }, |
| { |
| "epoch": 1.5471698113207548, |
| "grad_norm": 0.5463733673095703, |
| "learning_rate": 0.000507743119266055, |
| "loss": 3.6871, |
| "step": 14350 |
| }, |
| { |
| "epoch": 1.5525606469002695, |
| "grad_norm": 0.549136757850647, |
| "learning_rate": 0.0005074193200215865, |
| "loss": 3.6895, |
| "step": 14400 |
| }, |
| { |
| "epoch": 1.5579514824797842, |
| "grad_norm": 0.5161502957344055, |
| "learning_rate": 0.0005070955207771181, |
| "loss": 3.693, |
| "step": 14450 |
| }, |
| { |
| "epoch": 1.5633423180592994, |
| "grad_norm": 0.5291047096252441, |
| "learning_rate": 0.0005067717215326498, |
| "loss": 3.6909, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.568733153638814, |
| "grad_norm": 0.5388016700744629, |
| "learning_rate": 0.0005064479222881813, |
| "loss": 3.6906, |
| "step": 14550 |
| }, |
| { |
| "epoch": 1.5741239892183287, |
| "grad_norm": 0.5703091621398926, |
| "learning_rate": 0.0005061241230437129, |
| "loss": 3.6751, |
| "step": 14600 |
| }, |
| { |
| "epoch": 1.5795148247978437, |
| "grad_norm": 0.5753449201583862, |
| "learning_rate": 0.0005058003237992444, |
| "loss": 3.6803, |
| "step": 14650 |
| }, |
| { |
| "epoch": 1.5849056603773586, |
| "grad_norm": 0.5543888807296753, |
| "learning_rate": 0.000505476524554776, |
| "loss": 3.6864, |
| "step": 14700 |
| }, |
| { |
| "epoch": 1.5902964959568733, |
| "grad_norm": 0.5601723790168762, |
| "learning_rate": 0.0005051527253103076, |
| "loss": 3.6654, |
| "step": 14750 |
| }, |
| { |
| "epoch": 1.595687331536388, |
| "grad_norm": 0.5522553324699402, |
| "learning_rate": 0.0005048289260658392, |
| "loss": 3.6576, |
| "step": 14800 |
| }, |
| { |
| "epoch": 1.6010781671159031, |
| "grad_norm": 0.5866943001747131, |
| "learning_rate": 0.0005045051268213707, |
| "loss": 3.6834, |
| "step": 14850 |
| }, |
| { |
| "epoch": 1.6064690026954178, |
| "grad_norm": 0.5801655650138855, |
| "learning_rate": 0.0005041813275769023, |
| "loss": 3.6865, |
| "step": 14900 |
| }, |
| { |
| "epoch": 1.6118598382749325, |
| "grad_norm": 0.5756445527076721, |
| "learning_rate": 0.0005038575283324338, |
| "loss": 3.6724, |
| "step": 14950 |
| }, |
| { |
| "epoch": 1.6172506738544474, |
| "grad_norm": 0.4960193932056427, |
| "learning_rate": 0.0005035337290879654, |
| "loss": 3.664, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.6172506738544474, |
| "eval_accuracy": 0.35552974204391946, |
| "eval_loss": 3.637425184249878, |
| "eval_runtime": 185.9925, |
| "eval_samples_per_second": 96.837, |
| "eval_steps_per_second": 6.054, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.6226415094339623, |
| "grad_norm": 0.5649005770683289, |
| "learning_rate": 0.000503209929843497, |
| "loss": 3.6632, |
| "step": 15050 |
| }, |
| { |
| "epoch": 1.628032345013477, |
| "grad_norm": 0.5567517876625061, |
| "learning_rate": 0.0005028861305990286, |
| "loss": 3.6753, |
| "step": 15100 |
| }, |
| { |
| "epoch": 1.633423180592992, |
| "grad_norm": 0.5907676219940186, |
| "learning_rate": 0.0005025623313545601, |
| "loss": 3.6967, |
| "step": 15150 |
| }, |
| { |
| "epoch": 1.6388140161725069, |
| "grad_norm": 0.5697629451751709, |
| "learning_rate": 0.0005022385321100917, |
| "loss": 3.6818, |
| "step": 15200 |
| }, |
| { |
| "epoch": 1.6442048517520216, |
| "grad_norm": 0.5234145522117615, |
| "learning_rate": 0.0005019147328656232, |
| "loss": 3.6737, |
| "step": 15250 |
| }, |
| { |
| "epoch": 1.6495956873315363, |
| "grad_norm": 0.5655122995376587, |
| "learning_rate": 0.0005015909336211549, |
| "loss": 3.6731, |
| "step": 15300 |
| }, |
| { |
| "epoch": 1.6549865229110512, |
| "grad_norm": 0.5556879639625549, |
| "learning_rate": 0.0005012671343766864, |
| "loss": 3.6583, |
| "step": 15350 |
| }, |
| { |
| "epoch": 1.6603773584905661, |
| "grad_norm": 0.5384864211082458, |
| "learning_rate": 0.000500943335132218, |
| "loss": 3.6766, |
| "step": 15400 |
| }, |
| { |
| "epoch": 1.6657681940700808, |
| "grad_norm": 0.6065835952758789, |
| "learning_rate": 0.0005006195358877495, |
| "loss": 3.6646, |
| "step": 15450 |
| }, |
| { |
| "epoch": 1.6711590296495957, |
| "grad_norm": 0.5641660690307617, |
| "learning_rate": 0.0005002957366432812, |
| "loss": 3.666, |
| "step": 15500 |
| }, |
| { |
| "epoch": 1.6765498652291106, |
| "grad_norm": 0.5256580710411072, |
| "learning_rate": 0.0004999719373988127, |
| "loss": 3.6515, |
| "step": 15550 |
| }, |
| { |
| "epoch": 1.6819407008086253, |
| "grad_norm": 0.5791158676147461, |
| "learning_rate": 0.0004996481381543442, |
| "loss": 3.6708, |
| "step": 15600 |
| }, |
| { |
| "epoch": 1.68733153638814, |
| "grad_norm": 0.5627703666687012, |
| "learning_rate": 0.0004993308148947651, |
| "loss": 3.6779, |
| "step": 15650 |
| }, |
| { |
| "epoch": 1.692722371967655, |
| "grad_norm": 0.6009765863418579, |
| "learning_rate": 0.0004990070156502968, |
| "loss": 3.6717, |
| "step": 15700 |
| }, |
| { |
| "epoch": 1.6981132075471699, |
| "grad_norm": 0.6165124177932739, |
| "learning_rate": 0.0004986832164058284, |
| "loss": 3.6558, |
| "step": 15750 |
| }, |
| { |
| "epoch": 1.7035040431266846, |
| "grad_norm": 0.573556661605835, |
| "learning_rate": 0.0004983594171613599, |
| "loss": 3.6659, |
| "step": 15800 |
| }, |
| { |
| "epoch": 1.7088948787061995, |
| "grad_norm": 0.5521446466445923, |
| "learning_rate": 0.0004980356179168915, |
| "loss": 3.6825, |
| "step": 15850 |
| }, |
| { |
| "epoch": 1.7142857142857144, |
| "grad_norm": 0.5886697769165039, |
| "learning_rate": 0.000497711818672423, |
| "loss": 3.6819, |
| "step": 15900 |
| }, |
| { |
| "epoch": 1.719676549865229, |
| "grad_norm": 0.6262629628181458, |
| "learning_rate": 0.0004973880194279547, |
| "loss": 3.6689, |
| "step": 15950 |
| }, |
| { |
| "epoch": 1.7250673854447438, |
| "grad_norm": 0.6005555391311646, |
| "learning_rate": 0.0004970642201834862, |
| "loss": 3.6639, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.7250673854447438, |
| "eval_accuracy": 0.3572108195241851, |
| "eval_loss": 3.6206066608428955, |
| "eval_runtime": 185.4999, |
| "eval_samples_per_second": 97.094, |
| "eval_steps_per_second": 6.07, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.7304582210242587, |
| "grad_norm": 0.5764271020889282, |
| "learning_rate": 0.0004967404209390178, |
| "loss": 3.6523, |
| "step": 16050 |
| }, |
| { |
| "epoch": 1.7358490566037736, |
| "grad_norm": 0.5741286277770996, |
| "learning_rate": 0.0004964166216945493, |
| "loss": 3.653, |
| "step": 16100 |
| }, |
| { |
| "epoch": 1.7412398921832883, |
| "grad_norm": 0.6296107172966003, |
| "learning_rate": 0.000496092822450081, |
| "loss": 3.6606, |
| "step": 16150 |
| }, |
| { |
| "epoch": 1.7466307277628033, |
| "grad_norm": 0.6621016263961792, |
| "learning_rate": 0.0004957690232056125, |
| "loss": 3.667, |
| "step": 16200 |
| }, |
| { |
| "epoch": 1.7520215633423182, |
| "grad_norm": 0.6163436770439148, |
| "learning_rate": 0.0004954452239611441, |
| "loss": 3.6514, |
| "step": 16250 |
| }, |
| { |
| "epoch": 1.7574123989218329, |
| "grad_norm": 0.5336316823959351, |
| "learning_rate": 0.0004951214247166756, |
| "loss": 3.645, |
| "step": 16300 |
| }, |
| { |
| "epoch": 1.7628032345013476, |
| "grad_norm": 0.5538309812545776, |
| "learning_rate": 0.0004947976254722072, |
| "loss": 3.6681, |
| "step": 16350 |
| }, |
| { |
| "epoch": 1.7681940700808625, |
| "grad_norm": 0.5325199365615845, |
| "learning_rate": 0.0004944738262277387, |
| "loss": 3.6466, |
| "step": 16400 |
| }, |
| { |
| "epoch": 1.7735849056603774, |
| "grad_norm": 0.576173722743988, |
| "learning_rate": 0.0004941500269832703, |
| "loss": 3.6569, |
| "step": 16450 |
| }, |
| { |
| "epoch": 1.778975741239892, |
| "grad_norm": 0.5102316737174988, |
| "learning_rate": 0.0004938262277388019, |
| "loss": 3.6425, |
| "step": 16500 |
| }, |
| { |
| "epoch": 1.784366576819407, |
| "grad_norm": 0.5195789933204651, |
| "learning_rate": 0.0004935024284943335, |
| "loss": 3.6499, |
| "step": 16550 |
| }, |
| { |
| "epoch": 1.789757412398922, |
| "grad_norm": 0.5175392031669617, |
| "learning_rate": 0.000493178629249865, |
| "loss": 3.6622, |
| "step": 16600 |
| }, |
| { |
| "epoch": 1.7951482479784366, |
| "grad_norm": 0.5237419605255127, |
| "learning_rate": 0.0004928548300053966, |
| "loss": 3.6531, |
| "step": 16650 |
| }, |
| { |
| "epoch": 1.8005390835579513, |
| "grad_norm": 0.6199319958686829, |
| "learning_rate": 0.0004925310307609282, |
| "loss": 3.66, |
| "step": 16700 |
| }, |
| { |
| "epoch": 1.8059299191374663, |
| "grad_norm": 0.5926080346107483, |
| "learning_rate": 0.0004922072315164598, |
| "loss": 3.659, |
| "step": 16750 |
| }, |
| { |
| "epoch": 1.8113207547169812, |
| "grad_norm": 0.5379118919372559, |
| "learning_rate": 0.0004918834322719913, |
| "loss": 3.6662, |
| "step": 16800 |
| }, |
| { |
| "epoch": 1.8167115902964959, |
| "grad_norm": 0.6137577295303345, |
| "learning_rate": 0.0004915596330275229, |
| "loss": 3.6746, |
| "step": 16850 |
| }, |
| { |
| "epoch": 1.8221024258760108, |
| "grad_norm": 0.5653969645500183, |
| "learning_rate": 0.0004912358337830544, |
| "loss": 3.6474, |
| "step": 16900 |
| }, |
| { |
| "epoch": 1.8274932614555257, |
| "grad_norm": 0.56174635887146, |
| "learning_rate": 0.000490912034538586, |
| "loss": 3.6592, |
| "step": 16950 |
| }, |
| { |
| "epoch": 1.8328840970350404, |
| "grad_norm": 0.5398900508880615, |
| "learning_rate": 0.0004905882352941175, |
| "loss": 3.6383, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.8328840970350404, |
| "eval_accuracy": 0.3587294609371681, |
| "eval_loss": 3.6051440238952637, |
| "eval_runtime": 185.6742, |
| "eval_samples_per_second": 97.003, |
| "eval_steps_per_second": 6.064, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.838274932614555, |
| "grad_norm": 0.5739971399307251, |
| "learning_rate": 0.0004902644360496492, |
| "loss": 3.6431, |
| "step": 17050 |
| }, |
| { |
| "epoch": 1.8436657681940702, |
| "grad_norm": 0.5605867505073547, |
| "learning_rate": 0.0004899406368051808, |
| "loss": 3.6362, |
| "step": 17100 |
| }, |
| { |
| "epoch": 1.849056603773585, |
| "grad_norm": 0.5735852718353271, |
| "learning_rate": 0.0004896168375607123, |
| "loss": 3.6416, |
| "step": 17150 |
| }, |
| { |
| "epoch": 1.8544474393530996, |
| "grad_norm": 0.5647099614143372, |
| "learning_rate": 0.0004892930383162439, |
| "loss": 3.6467, |
| "step": 17200 |
| }, |
| { |
| "epoch": 1.8598382749326146, |
| "grad_norm": 0.5226011276245117, |
| "learning_rate": 0.0004889692390717754, |
| "loss": 3.6534, |
| "step": 17250 |
| }, |
| { |
| "epoch": 1.8652291105121295, |
| "grad_norm": 0.5790987610816956, |
| "learning_rate": 0.0004886454398273071, |
| "loss": 3.6564, |
| "step": 17300 |
| }, |
| { |
| "epoch": 1.8706199460916442, |
| "grad_norm": 0.5586245059967041, |
| "learning_rate": 0.0004883216405828386, |
| "loss": 3.6568, |
| "step": 17350 |
| }, |
| { |
| "epoch": 1.8760107816711589, |
| "grad_norm": 0.5110759139060974, |
| "learning_rate": 0.00048799784133837017, |
| "loss": 3.6508, |
| "step": 17400 |
| }, |
| { |
| "epoch": 1.881401617250674, |
| "grad_norm": 0.5973682999610901, |
| "learning_rate": 0.0004876740420939017, |
| "loss": 3.6405, |
| "step": 17450 |
| }, |
| { |
| "epoch": 1.8867924528301887, |
| "grad_norm": 0.5328883528709412, |
| "learning_rate": 0.0004873502428494333, |
| "loss": 3.6423, |
| "step": 17500 |
| }, |
| { |
| "epoch": 1.8921832884097034, |
| "grad_norm": 0.6016663312911987, |
| "learning_rate": 0.0004870264436049649, |
| "loss": 3.6395, |
| "step": 17550 |
| }, |
| { |
| "epoch": 1.8975741239892183, |
| "grad_norm": 0.5924084186553955, |
| "learning_rate": 0.00048670264436049643, |
| "loss": 3.6529, |
| "step": 17600 |
| }, |
| { |
| "epoch": 1.9029649595687332, |
| "grad_norm": 0.5941994190216064, |
| "learning_rate": 0.00048637884511602803, |
| "loss": 3.6441, |
| "step": 17650 |
| }, |
| { |
| "epoch": 1.908355795148248, |
| "grad_norm": 0.5356640219688416, |
| "learning_rate": 0.0004860550458715596, |
| "loss": 3.638, |
| "step": 17700 |
| }, |
| { |
| "epoch": 1.9137466307277629, |
| "grad_norm": 0.6006718277931213, |
| "learning_rate": 0.0004857312466270912, |
| "loss": 3.6598, |
| "step": 17750 |
| }, |
| { |
| "epoch": 1.9191374663072778, |
| "grad_norm": 0.5845404863357544, |
| "learning_rate": 0.00048540744738262274, |
| "loss": 3.6449, |
| "step": 17800 |
| }, |
| { |
| "epoch": 1.9245283018867925, |
| "grad_norm": 0.5526080131530762, |
| "learning_rate": 0.00048508364813815434, |
| "loss": 3.6354, |
| "step": 17850 |
| }, |
| { |
| "epoch": 1.9299191374663072, |
| "grad_norm": 0.5421946048736572, |
| "learning_rate": 0.0004847663248785753, |
| "loss": 3.6465, |
| "step": 17900 |
| }, |
| { |
| "epoch": 1.935309973045822, |
| "grad_norm": 0.5944849848747253, |
| "learning_rate": 0.0004844425256341068, |
| "loss": 3.6269, |
| "step": 17950 |
| }, |
| { |
| "epoch": 1.940700808625337, |
| "grad_norm": 0.5479409694671631, |
| "learning_rate": 0.00048411872638963834, |
| "loss": 3.6321, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.940700808625337, |
| "eval_accuracy": 0.35996886442846787, |
| "eval_loss": 3.5911636352539062, |
| "eval_runtime": 185.6549, |
| "eval_samples_per_second": 97.013, |
| "eval_steps_per_second": 6.065, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.9460916442048517, |
| "grad_norm": 0.5909312963485718, |
| "learning_rate": 0.00048379492714516995, |
| "loss": 3.6343, |
| "step": 18050 |
| }, |
| { |
| "epoch": 1.9514824797843666, |
| "grad_norm": 0.5868159532546997, |
| "learning_rate": 0.0004834711279007015, |
| "loss": 3.6358, |
| "step": 18100 |
| }, |
| { |
| "epoch": 1.9568733153638815, |
| "grad_norm": 0.5208358764648438, |
| "learning_rate": 0.0004831473286562331, |
| "loss": 3.6319, |
| "step": 18150 |
| }, |
| { |
| "epoch": 1.9622641509433962, |
| "grad_norm": 0.5705773234367371, |
| "learning_rate": 0.00048282352941176465, |
| "loss": 3.6214, |
| "step": 18200 |
| }, |
| { |
| "epoch": 1.967654986522911, |
| "grad_norm": 0.5833699703216553, |
| "learning_rate": 0.00048249973016729626, |
| "loss": 3.6428, |
| "step": 18250 |
| }, |
| { |
| "epoch": 1.9730458221024259, |
| "grad_norm": 0.594480574131012, |
| "learning_rate": 0.0004821759309228278, |
| "loss": 3.6519, |
| "step": 18300 |
| }, |
| { |
| "epoch": 1.9784366576819408, |
| "grad_norm": 0.5162302851676941, |
| "learning_rate": 0.00048185213167835936, |
| "loss": 3.6368, |
| "step": 18350 |
| }, |
| { |
| "epoch": 1.9838274932614555, |
| "grad_norm": 0.6090728640556335, |
| "learning_rate": 0.00048152833243389096, |
| "loss": 3.6373, |
| "step": 18400 |
| }, |
| { |
| "epoch": 1.9892183288409704, |
| "grad_norm": 0.5224543809890747, |
| "learning_rate": 0.0004812045331894225, |
| "loss": 3.6452, |
| "step": 18450 |
| }, |
| { |
| "epoch": 1.9946091644204853, |
| "grad_norm": 0.6286558508872986, |
| "learning_rate": 0.0004808807339449541, |
| "loss": 3.6383, |
| "step": 18500 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.1893088817596436, |
| "learning_rate": 0.0004805569347004856, |
| "loss": 3.6417, |
| "step": 18550 |
| }, |
| { |
| "epoch": 2.0053908355795147, |
| "grad_norm": 0.5484069585800171, |
| "learning_rate": 0.0004802331354560173, |
| "loss": 3.5707, |
| "step": 18600 |
| }, |
| { |
| "epoch": 2.01078167115903, |
| "grad_norm": 0.5843707323074341, |
| "learning_rate": 0.00047990933621154877, |
| "loss": 3.5399, |
| "step": 18650 |
| }, |
| { |
| "epoch": 2.0161725067385445, |
| "grad_norm": 0.5230047106742859, |
| "learning_rate": 0.0004795855369670804, |
| "loss": 3.5512, |
| "step": 18700 |
| }, |
| { |
| "epoch": 2.0215633423180592, |
| "grad_norm": 0.5233069658279419, |
| "learning_rate": 0.0004792617377226119, |
| "loss": 3.5319, |
| "step": 18750 |
| }, |
| { |
| "epoch": 2.026954177897574, |
| "grad_norm": 0.5597031712532043, |
| "learning_rate": 0.0004789379384781435, |
| "loss": 3.5518, |
| "step": 18800 |
| }, |
| { |
| "epoch": 2.032345013477089, |
| "grad_norm": 0.5443568825721741, |
| "learning_rate": 0.0004786141392336751, |
| "loss": 3.5586, |
| "step": 18850 |
| }, |
| { |
| "epoch": 2.0377358490566038, |
| "grad_norm": 0.5472878217697144, |
| "learning_rate": 0.00047829033998920663, |
| "loss": 3.5469, |
| "step": 18900 |
| }, |
| { |
| "epoch": 2.0431266846361185, |
| "grad_norm": 0.5626387000083923, |
| "learning_rate": 0.00047796654074473824, |
| "loss": 3.5408, |
| "step": 18950 |
| }, |
| { |
| "epoch": 2.0485175202156336, |
| "grad_norm": 0.5888155698776245, |
| "learning_rate": 0.0004776427415002698, |
| "loss": 3.5672, |
| "step": 19000 |
| }, |
| { |
| "epoch": 2.0485175202156336, |
| "eval_accuracy": 0.361499131700447, |
| "eval_loss": 3.5821456909179688, |
| "eval_runtime": 185.3913, |
| "eval_samples_per_second": 97.151, |
| "eval_steps_per_second": 6.074, |
| "step": 19000 |
| }, |
| { |
| "epoch": 2.0539083557951483, |
| "grad_norm": 0.5674854516983032, |
| "learning_rate": 0.0004773189422558014, |
| "loss": 3.5436, |
| "step": 19050 |
| }, |
| { |
| "epoch": 2.059299191374663, |
| "grad_norm": 0.5584875345230103, |
| "learning_rate": 0.00047699514301133294, |
| "loss": 3.567, |
| "step": 19100 |
| }, |
| { |
| "epoch": 2.0646900269541777, |
| "grad_norm": 0.5911595821380615, |
| "learning_rate": 0.00047667134376686455, |
| "loss": 3.5623, |
| "step": 19150 |
| }, |
| { |
| "epoch": 2.070080862533693, |
| "grad_norm": 0.5857378840446472, |
| "learning_rate": 0.0004763475445223961, |
| "loss": 3.5702, |
| "step": 19200 |
| }, |
| { |
| "epoch": 2.0754716981132075, |
| "grad_norm": 0.5929583311080933, |
| "learning_rate": 0.0004760237452779276, |
| "loss": 3.5495, |
| "step": 19250 |
| }, |
| { |
| "epoch": 2.0808625336927222, |
| "grad_norm": 0.5647702813148499, |
| "learning_rate": 0.0004756999460334592, |
| "loss": 3.5433, |
| "step": 19300 |
| }, |
| { |
| "epoch": 2.0862533692722374, |
| "grad_norm": 0.5857425332069397, |
| "learning_rate": 0.00047537614678899075, |
| "loss": 3.547, |
| "step": 19350 |
| }, |
| { |
| "epoch": 2.091644204851752, |
| "grad_norm": 0.5703374743461609, |
| "learning_rate": 0.00047505234754452235, |
| "loss": 3.5461, |
| "step": 19400 |
| }, |
| { |
| "epoch": 2.0970350404312668, |
| "grad_norm": 0.8500975966453552, |
| "learning_rate": 0.0004747285483000539, |
| "loss": 3.5538, |
| "step": 19450 |
| }, |
| { |
| "epoch": 2.1024258760107815, |
| "grad_norm": 0.6051983833312988, |
| "learning_rate": 0.0004744047490555855, |
| "loss": 3.5454, |
| "step": 19500 |
| }, |
| { |
| "epoch": 2.1078167115902966, |
| "grad_norm": 0.7593019008636475, |
| "learning_rate": 0.00047408094981111706, |
| "loss": 3.5555, |
| "step": 19550 |
| }, |
| { |
| "epoch": 2.1132075471698113, |
| "grad_norm": 0.5925678014755249, |
| "learning_rate": 0.000473763626551538, |
| "loss": 3.5514, |
| "step": 19600 |
| }, |
| { |
| "epoch": 2.118598382749326, |
| "grad_norm": 0.5761629343032837, |
| "learning_rate": 0.00047343982730706956, |
| "loss": 3.5705, |
| "step": 19650 |
| }, |
| { |
| "epoch": 2.123989218328841, |
| "grad_norm": 0.5896238088607788, |
| "learning_rate": 0.00047311602806260117, |
| "loss": 3.5423, |
| "step": 19700 |
| }, |
| { |
| "epoch": 2.129380053908356, |
| "grad_norm": 0.5506173968315125, |
| "learning_rate": 0.0004727922288181327, |
| "loss": 3.5619, |
| "step": 19750 |
| }, |
| { |
| "epoch": 2.1347708894878705, |
| "grad_norm": 0.5747953653335571, |
| "learning_rate": 0.0004724684295736643, |
| "loss": 3.5549, |
| "step": 19800 |
| }, |
| { |
| "epoch": 2.1401617250673857, |
| "grad_norm": 0.6055583953857422, |
| "learning_rate": 0.0004721446303291959, |
| "loss": 3.5466, |
| "step": 19850 |
| }, |
| { |
| "epoch": 2.1455525606469004, |
| "grad_norm": 0.5854771733283997, |
| "learning_rate": 0.0004718208310847275, |
| "loss": 3.5586, |
| "step": 19900 |
| }, |
| { |
| "epoch": 2.150943396226415, |
| "grad_norm": 0.5524225234985352, |
| "learning_rate": 0.000471497031840259, |
| "loss": 3.5506, |
| "step": 19950 |
| }, |
| { |
| "epoch": 2.1563342318059298, |
| "grad_norm": 0.6449847221374512, |
| "learning_rate": 0.0004711732325957905, |
| "loss": 3.5754, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.1563342318059298, |
| "eval_accuracy": 0.3623671596211969, |
| "eval_loss": 3.573341131210327, |
| "eval_runtime": 185.7691, |
| "eval_samples_per_second": 96.954, |
| "eval_steps_per_second": 6.061, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.161725067385445, |
| "grad_norm": 0.5967618823051453, |
| "learning_rate": 0.00047084943335132213, |
| "loss": 3.5752, |
| "step": 20050 |
| }, |
| { |
| "epoch": 2.1671159029649596, |
| "grad_norm": 0.5871074795722961, |
| "learning_rate": 0.0004705256341068537, |
| "loss": 3.5762, |
| "step": 20100 |
| }, |
| { |
| "epoch": 2.1725067385444743, |
| "grad_norm": 0.5428000688552856, |
| "learning_rate": 0.0004702018348623853, |
| "loss": 3.5692, |
| "step": 20150 |
| }, |
| { |
| "epoch": 2.177897574123989, |
| "grad_norm": 0.606965959072113, |
| "learning_rate": 0.00046987803561791684, |
| "loss": 3.5672, |
| "step": 20200 |
| }, |
| { |
| "epoch": 2.183288409703504, |
| "grad_norm": 0.5462236404418945, |
| "learning_rate": 0.00046955423637344844, |
| "loss": 3.5669, |
| "step": 20250 |
| }, |
| { |
| "epoch": 2.188679245283019, |
| "grad_norm": 0.5761914253234863, |
| "learning_rate": 0.00046923043712898, |
| "loss": 3.5573, |
| "step": 20300 |
| }, |
| { |
| "epoch": 2.1940700808625335, |
| "grad_norm": 0.5329501628875732, |
| "learning_rate": 0.0004689066378845116, |
| "loss": 3.5705, |
| "step": 20350 |
| }, |
| { |
| "epoch": 2.1994609164420487, |
| "grad_norm": 0.5812281966209412, |
| "learning_rate": 0.00046858283864004315, |
| "loss": 3.5734, |
| "step": 20400 |
| }, |
| { |
| "epoch": 2.2048517520215634, |
| "grad_norm": 0.5665324926376343, |
| "learning_rate": 0.0004682590393955747, |
| "loss": 3.5702, |
| "step": 20450 |
| }, |
| { |
| "epoch": 2.210242587601078, |
| "grad_norm": 0.5955948233604431, |
| "learning_rate": 0.0004679352401511063, |
| "loss": 3.5426, |
| "step": 20500 |
| }, |
| { |
| "epoch": 2.215633423180593, |
| "grad_norm": 0.5881284475326538, |
| "learning_rate": 0.0004676114409066378, |
| "loss": 3.5686, |
| "step": 20550 |
| }, |
| { |
| "epoch": 2.221024258760108, |
| "grad_norm": 0.6288923025131226, |
| "learning_rate": 0.00046728764166216946, |
| "loss": 3.5468, |
| "step": 20600 |
| }, |
| { |
| "epoch": 2.2264150943396226, |
| "grad_norm": 0.5576397776603699, |
| "learning_rate": 0.00046696384241770095, |
| "loss": 3.5532, |
| "step": 20650 |
| }, |
| { |
| "epoch": 2.2318059299191373, |
| "grad_norm": 0.5199394226074219, |
| "learning_rate": 0.00046664004317323256, |
| "loss": 3.551, |
| "step": 20700 |
| }, |
| { |
| "epoch": 2.2371967654986524, |
| "grad_norm": 0.5937060713768005, |
| "learning_rate": 0.0004663162439287641, |
| "loss": 3.5577, |
| "step": 20750 |
| }, |
| { |
| "epoch": 2.242587601078167, |
| "grad_norm": 0.6562036871910095, |
| "learning_rate": 0.00046599244468429566, |
| "loss": 3.5528, |
| "step": 20800 |
| }, |
| { |
| "epoch": 2.247978436657682, |
| "grad_norm": 0.565091609954834, |
| "learning_rate": 0.00046566864543982726, |
| "loss": 3.5683, |
| "step": 20850 |
| }, |
| { |
| "epoch": 2.2533692722371965, |
| "grad_norm": 0.5420801043510437, |
| "learning_rate": 0.0004653448461953588, |
| "loss": 3.5628, |
| "step": 20900 |
| }, |
| { |
| "epoch": 2.2587601078167117, |
| "grad_norm": 0.5261815786361694, |
| "learning_rate": 0.0004650210469508904, |
| "loss": 3.5508, |
| "step": 20950 |
| }, |
| { |
| "epoch": 2.2641509433962264, |
| "grad_norm": 0.5820636749267578, |
| "learning_rate": 0.00046469724770642197, |
| "loss": 3.5637, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.2641509433962264, |
| "eval_accuracy": 0.36372227843801913, |
| "eval_loss": 3.5613205432891846, |
| "eval_runtime": 185.2619, |
| "eval_samples_per_second": 97.219, |
| "eval_steps_per_second": 6.078, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.269541778975741, |
| "grad_norm": 0.5688766837120056, |
| "learning_rate": 0.0004643734484619536, |
| "loss": 3.5568, |
| "step": 21050 |
| }, |
| { |
| "epoch": 2.274932614555256, |
| "grad_norm": 0.5932535529136658, |
| "learning_rate": 0.0004640496492174851, |
| "loss": 3.5535, |
| "step": 21100 |
| }, |
| { |
| "epoch": 2.280323450134771, |
| "grad_norm": 0.5635634064674377, |
| "learning_rate": 0.00046372584997301673, |
| "loss": 3.5756, |
| "step": 21150 |
| }, |
| { |
| "epoch": 2.2857142857142856, |
| "grad_norm": 0.6333921551704407, |
| "learning_rate": 0.0004634020507285483, |
| "loss": 3.5627, |
| "step": 21200 |
| }, |
| { |
| "epoch": 2.2911051212938007, |
| "grad_norm": 0.5962139964103699, |
| "learning_rate": 0.0004630782514840798, |
| "loss": 3.571, |
| "step": 21250 |
| }, |
| { |
| "epoch": 2.2964959568733154, |
| "grad_norm": 0.601997435092926, |
| "learning_rate": 0.0004627544522396114, |
| "loss": 3.5379, |
| "step": 21300 |
| }, |
| { |
| "epoch": 2.30188679245283, |
| "grad_norm": 0.83173668384552, |
| "learning_rate": 0.00046243065299514293, |
| "loss": 3.559, |
| "step": 21350 |
| }, |
| { |
| "epoch": 2.3072776280323453, |
| "grad_norm": 0.550325334072113, |
| "learning_rate": 0.00046210685375067454, |
| "loss": 3.5504, |
| "step": 21400 |
| }, |
| { |
| "epoch": 2.31266846361186, |
| "grad_norm": 0.5884050130844116, |
| "learning_rate": 0.0004617830545062061, |
| "loss": 3.5759, |
| "step": 21450 |
| }, |
| { |
| "epoch": 2.3180592991913747, |
| "grad_norm": 0.5583473443984985, |
| "learning_rate": 0.0004614592552617377, |
| "loss": 3.5461, |
| "step": 21500 |
| }, |
| { |
| "epoch": 2.3234501347708894, |
| "grad_norm": 0.5997599363327026, |
| "learning_rate": 0.00046113545601726924, |
| "loss": 3.5626, |
| "step": 21550 |
| }, |
| { |
| "epoch": 2.3288409703504045, |
| "grad_norm": 0.5541661977767944, |
| "learning_rate": 0.00046081165677280085, |
| "loss": 3.5549, |
| "step": 21600 |
| }, |
| { |
| "epoch": 2.334231805929919, |
| "grad_norm": 0.6619417071342468, |
| "learning_rate": 0.0004604878575283324, |
| "loss": 3.5543, |
| "step": 21650 |
| }, |
| { |
| "epoch": 2.339622641509434, |
| "grad_norm": 0.5892545580863953, |
| "learning_rate": 0.00046016405828386395, |
| "loss": 3.5725, |
| "step": 21700 |
| }, |
| { |
| "epoch": 2.3450134770889486, |
| "grad_norm": 0.5789877772331238, |
| "learning_rate": 0.00045984025903939555, |
| "loss": 3.5618, |
| "step": 21750 |
| }, |
| { |
| "epoch": 2.3504043126684637, |
| "grad_norm": 0.5967502593994141, |
| "learning_rate": 0.0004595164597949271, |
| "loss": 3.5435, |
| "step": 21800 |
| }, |
| { |
| "epoch": 2.3557951482479784, |
| "grad_norm": 0.5737834572792053, |
| "learning_rate": 0.0004591926605504587, |
| "loss": 3.5549, |
| "step": 21850 |
| }, |
| { |
| "epoch": 2.361185983827493, |
| "grad_norm": 0.5817111134529114, |
| "learning_rate": 0.0004588688613059902, |
| "loss": 3.5489, |
| "step": 21900 |
| }, |
| { |
| "epoch": 2.3665768194070083, |
| "grad_norm": 0.5602347254753113, |
| "learning_rate": 0.00045854506206152186, |
| "loss": 3.5488, |
| "step": 21950 |
| }, |
| { |
| "epoch": 2.371967654986523, |
| "grad_norm": 0.5591195821762085, |
| "learning_rate": 0.00045822126281705336, |
| "loss": 3.5706, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.371967654986523, |
| "eval_accuracy": 0.3645871554250224, |
| "eval_loss": 3.5503523349761963, |
| "eval_runtime": 185.7321, |
| "eval_samples_per_second": 96.973, |
| "eval_steps_per_second": 6.062, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.3773584905660377, |
| "grad_norm": 0.5747423768043518, |
| "learning_rate": 0.00045789746357258497, |
| "loss": 3.5462, |
| "step": 22050 |
| }, |
| { |
| "epoch": 2.382749326145553, |
| "grad_norm": 0.6654125452041626, |
| "learning_rate": 0.0004575736643281165, |
| "loss": 3.5505, |
| "step": 22100 |
| }, |
| { |
| "epoch": 2.3881401617250675, |
| "grad_norm": 0.593645453453064, |
| "learning_rate": 0.00045724986508364807, |
| "loss": 3.5528, |
| "step": 22150 |
| }, |
| { |
| "epoch": 2.393530997304582, |
| "grad_norm": 0.5858222842216492, |
| "learning_rate": 0.00045692606583917967, |
| "loss": 3.5645, |
| "step": 22200 |
| }, |
| { |
| "epoch": 2.398921832884097, |
| "grad_norm": 0.6029301285743713, |
| "learning_rate": 0.0004566022665947112, |
| "loss": 3.5695, |
| "step": 22250 |
| }, |
| { |
| "epoch": 2.404312668463612, |
| "grad_norm": 0.584065854549408, |
| "learning_rate": 0.0004562784673502428, |
| "loss": 3.5708, |
| "step": 22300 |
| }, |
| { |
| "epoch": 2.4097035040431267, |
| "grad_norm": 0.5794286727905273, |
| "learning_rate": 0.0004559546681057744, |
| "loss": 3.5516, |
| "step": 22350 |
| }, |
| { |
| "epoch": 2.4150943396226414, |
| "grad_norm": 0.6180821657180786, |
| "learning_rate": 0.000455630868861306, |
| "loss": 3.5726, |
| "step": 22400 |
| }, |
| { |
| "epoch": 2.420485175202156, |
| "grad_norm": 0.5881420373916626, |
| "learning_rate": 0.00045530706961683753, |
| "loss": 3.5624, |
| "step": 22450 |
| }, |
| { |
| "epoch": 2.4258760107816713, |
| "grad_norm": 0.5598190426826477, |
| "learning_rate": 0.00045498327037236914, |
| "loss": 3.556, |
| "step": 22500 |
| }, |
| { |
| "epoch": 2.431266846361186, |
| "grad_norm": 0.5753728151321411, |
| "learning_rate": 0.0004546594711279007, |
| "loss": 3.5505, |
| "step": 22550 |
| }, |
| { |
| "epoch": 2.4366576819407006, |
| "grad_norm": 0.5574933886528015, |
| "learning_rate": 0.0004543356718834322, |
| "loss": 3.551, |
| "step": 22600 |
| }, |
| { |
| "epoch": 2.442048517520216, |
| "grad_norm": 0.5815372467041016, |
| "learning_rate": 0.0004540118726389638, |
| "loss": 3.5393, |
| "step": 22650 |
| }, |
| { |
| "epoch": 2.4474393530997305, |
| "grad_norm": 0.5866022706031799, |
| "learning_rate": 0.00045368807339449534, |
| "loss": 3.5423, |
| "step": 22700 |
| }, |
| { |
| "epoch": 2.452830188679245, |
| "grad_norm": 0.5673452615737915, |
| "learning_rate": 0.00045336427415002694, |
| "loss": 3.547, |
| "step": 22750 |
| }, |
| { |
| "epoch": 2.4582210242587603, |
| "grad_norm": 0.587451159954071, |
| "learning_rate": 0.0004530404749055585, |
| "loss": 3.5455, |
| "step": 22800 |
| }, |
| { |
| "epoch": 2.463611859838275, |
| "grad_norm": 0.6395877599716187, |
| "learning_rate": 0.0004527166756610901, |
| "loss": 3.5518, |
| "step": 22850 |
| }, |
| { |
| "epoch": 2.4690026954177897, |
| "grad_norm": 0.6238613724708557, |
| "learning_rate": 0.00045239287641662165, |
| "loss": 3.5595, |
| "step": 22900 |
| }, |
| { |
| "epoch": 2.4743935309973044, |
| "grad_norm": 0.6073823571205139, |
| "learning_rate": 0.0004520690771721532, |
| "loss": 3.5612, |
| "step": 22950 |
| }, |
| { |
| "epoch": 2.4797843665768196, |
| "grad_norm": 0.582822322845459, |
| "learning_rate": 0.0004517452779276848, |
| "loss": 3.5408, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.4797843665768196, |
| "eval_accuracy": 0.3659634615549684, |
| "eval_loss": 3.5402395725250244, |
| "eval_runtime": 185.4548, |
| "eval_samples_per_second": 97.118, |
| "eval_steps_per_second": 6.072, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.4851752021563343, |
| "grad_norm": 0.5883828401565552, |
| "learning_rate": 0.00045142147868321636, |
| "loss": 3.543, |
| "step": 23050 |
| }, |
| { |
| "epoch": 2.490566037735849, |
| "grad_norm": 0.5678288340568542, |
| "learning_rate": 0.00045109767943874796, |
| "loss": 3.5585, |
| "step": 23100 |
| }, |
| { |
| "epoch": 2.4959568733153636, |
| "grad_norm": 0.57308030128479, |
| "learning_rate": 0.0004507738801942795, |
| "loss": 3.5585, |
| "step": 23150 |
| }, |
| { |
| "epoch": 2.501347708894879, |
| "grad_norm": 0.6100690364837646, |
| "learning_rate": 0.0004504500809498111, |
| "loss": 3.5512, |
| "step": 23200 |
| }, |
| { |
| "epoch": 2.5067385444743935, |
| "grad_norm": 0.5841269493103027, |
| "learning_rate": 0.0004501262817053426, |
| "loss": 3.5446, |
| "step": 23250 |
| }, |
| { |
| "epoch": 2.512129380053908, |
| "grad_norm": 0.6322735548019409, |
| "learning_rate": 0.00044980248246087427, |
| "loss": 3.5466, |
| "step": 23300 |
| }, |
| { |
| "epoch": 2.5175202156334233, |
| "grad_norm": 0.6177630424499512, |
| "learning_rate": 0.00044947868321640577, |
| "loss": 3.5425, |
| "step": 23350 |
| }, |
| { |
| "epoch": 2.522911051212938, |
| "grad_norm": 0.6240397691726685, |
| "learning_rate": 0.0004491548839719373, |
| "loss": 3.5322, |
| "step": 23400 |
| }, |
| { |
| "epoch": 2.5283018867924527, |
| "grad_norm": 0.587017834186554, |
| "learning_rate": 0.0004488310847274689, |
| "loss": 3.5453, |
| "step": 23450 |
| }, |
| { |
| "epoch": 2.533692722371968, |
| "grad_norm": 0.5576516389846802, |
| "learning_rate": 0.0004485072854830005, |
| "loss": 3.5428, |
| "step": 23500 |
| }, |
| { |
| "epoch": 2.5390835579514826, |
| "grad_norm": 0.5701577663421631, |
| "learning_rate": 0.0004481834862385321, |
| "loss": 3.5514, |
| "step": 23550 |
| }, |
| { |
| "epoch": 2.5444743935309972, |
| "grad_norm": 0.5828137397766113, |
| "learning_rate": 0.00044785968699406363, |
| "loss": 3.5517, |
| "step": 23600 |
| }, |
| { |
| "epoch": 2.5498652291105124, |
| "grad_norm": 0.6262269020080566, |
| "learning_rate": 0.0004475423637344846, |
| "loss": 3.556, |
| "step": 23650 |
| }, |
| { |
| "epoch": 2.555256064690027, |
| "grad_norm": 0.5848754644393921, |
| "learning_rate": 0.00044721856449001613, |
| "loss": 3.5631, |
| "step": 23700 |
| }, |
| { |
| "epoch": 2.560646900269542, |
| "grad_norm": 0.5634021162986755, |
| "learning_rate": 0.00044689476524554774, |
| "loss": 3.5699, |
| "step": 23750 |
| }, |
| { |
| "epoch": 2.5660377358490565, |
| "grad_norm": 0.6959907412528992, |
| "learning_rate": 0.0004465709660010793, |
| "loss": 3.5458, |
| "step": 23800 |
| }, |
| { |
| "epoch": 2.571428571428571, |
| "grad_norm": 0.5664647817611694, |
| "learning_rate": 0.0004462471667566109, |
| "loss": 3.5597, |
| "step": 23850 |
| }, |
| { |
| "epoch": 2.5768194070080863, |
| "grad_norm": 0.5963898301124573, |
| "learning_rate": 0.00044592336751214244, |
| "loss": 3.5427, |
| "step": 23900 |
| }, |
| { |
| "epoch": 2.582210242587601, |
| "grad_norm": 0.5585032105445862, |
| "learning_rate": 0.00044559956826767405, |
| "loss": 3.5489, |
| "step": 23950 |
| }, |
| { |
| "epoch": 2.5876010781671157, |
| "grad_norm": 0.6078963279724121, |
| "learning_rate": 0.00044527576902320554, |
| "loss": 3.532, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.5876010781671157, |
| "eval_accuracy": 0.36688788032449404, |
| "eval_loss": 3.531247854232788, |
| "eval_runtime": 185.6875, |
| "eval_samples_per_second": 96.996, |
| "eval_steps_per_second": 6.064, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.592991913746631, |
| "grad_norm": 0.5676623582839966, |
| "learning_rate": 0.00044495196977873715, |
| "loss": 3.5458, |
| "step": 24050 |
| }, |
| { |
| "epoch": 2.5983827493261455, |
| "grad_norm": 0.6343262195587158, |
| "learning_rate": 0.0004446281705342687, |
| "loss": 3.5379, |
| "step": 24100 |
| }, |
| { |
| "epoch": 2.6037735849056602, |
| "grad_norm": 0.5848167538642883, |
| "learning_rate": 0.00044430437128980025, |
| "loss": 3.5677, |
| "step": 24150 |
| }, |
| { |
| "epoch": 2.6091644204851754, |
| "grad_norm": 0.6168308854103088, |
| "learning_rate": 0.00044398057204533185, |
| "loss": 3.5298, |
| "step": 24200 |
| }, |
| { |
| "epoch": 2.61455525606469, |
| "grad_norm": 0.5785287022590637, |
| "learning_rate": 0.0004436567728008634, |
| "loss": 3.5411, |
| "step": 24250 |
| }, |
| { |
| "epoch": 2.6199460916442048, |
| "grad_norm": 0.5798066258430481, |
| "learning_rate": 0.000443332973556395, |
| "loss": 3.5491, |
| "step": 24300 |
| }, |
| { |
| "epoch": 2.62533692722372, |
| "grad_norm": 0.5612218379974365, |
| "learning_rate": 0.00044300917431192656, |
| "loss": 3.5649, |
| "step": 24350 |
| }, |
| { |
| "epoch": 2.6307277628032346, |
| "grad_norm": 0.5887892842292786, |
| "learning_rate": 0.00044268537506745816, |
| "loss": 3.5273, |
| "step": 24400 |
| }, |
| { |
| "epoch": 2.6361185983827493, |
| "grad_norm": 0.6022453308105469, |
| "learning_rate": 0.0004423615758229897, |
| "loss": 3.5447, |
| "step": 24450 |
| }, |
| { |
| "epoch": 2.641509433962264, |
| "grad_norm": 0.5853219032287598, |
| "learning_rate": 0.0004420377765785213, |
| "loss": 3.5591, |
| "step": 24500 |
| }, |
| { |
| "epoch": 2.6469002695417787, |
| "grad_norm": 0.5891228318214417, |
| "learning_rate": 0.00044171397733405287, |
| "loss": 3.5263, |
| "step": 24550 |
| }, |
| { |
| "epoch": 2.652291105121294, |
| "grad_norm": 0.6075050830841064, |
| "learning_rate": 0.00044139017808958437, |
| "loss": 3.5344, |
| "step": 24600 |
| }, |
| { |
| "epoch": 2.6576819407008085, |
| "grad_norm": 0.5453810095787048, |
| "learning_rate": 0.00044106637884511597, |
| "loss": 3.5361, |
| "step": 24650 |
| }, |
| { |
| "epoch": 2.6630727762803232, |
| "grad_norm": 0.6543773412704468, |
| "learning_rate": 0.0004407425796006475, |
| "loss": 3.5407, |
| "step": 24700 |
| }, |
| { |
| "epoch": 2.6684636118598384, |
| "grad_norm": 0.5871316194534302, |
| "learning_rate": 0.00044041878035617913, |
| "loss": 3.5404, |
| "step": 24750 |
| }, |
| { |
| "epoch": 2.673854447439353, |
| "grad_norm": 0.5868498086929321, |
| "learning_rate": 0.0004400949811117107, |
| "loss": 3.5162, |
| "step": 24800 |
| }, |
| { |
| "epoch": 2.6792452830188678, |
| "grad_norm": 0.5242236852645874, |
| "learning_rate": 0.0004397711818672423, |
| "loss": 3.5146, |
| "step": 24850 |
| }, |
| { |
| "epoch": 2.684636118598383, |
| "grad_norm": 0.5566399097442627, |
| "learning_rate": 0.00043944738262277383, |
| "loss": 3.5539, |
| "step": 24900 |
| }, |
| { |
| "epoch": 2.6900269541778976, |
| "grad_norm": 0.5947811603546143, |
| "learning_rate": 0.00043912358337830544, |
| "loss": 3.5379, |
| "step": 24950 |
| }, |
| { |
| "epoch": 2.6954177897574123, |
| "grad_norm": 0.5996220111846924, |
| "learning_rate": 0.000438799784133837, |
| "loss": 3.527, |
| "step": 25000 |
| }, |
| { |
| "epoch": 2.6954177897574123, |
| "eval_accuracy": 0.3672525194159994, |
| "eval_loss": 3.5217697620391846, |
| "eval_runtime": 185.7585, |
| "eval_samples_per_second": 96.959, |
| "eval_steps_per_second": 6.062, |
| "step": 25000 |
| }, |
| { |
| "epoch": 2.7008086253369274, |
| "grad_norm": 0.5733533501625061, |
| "learning_rate": 0.00043847598488936854, |
| "loss": 3.5383, |
| "step": 25050 |
| }, |
| { |
| "epoch": 2.706199460916442, |
| "grad_norm": 0.5578649044036865, |
| "learning_rate": 0.00043815218564490014, |
| "loss": 3.5233, |
| "step": 25100 |
| }, |
| { |
| "epoch": 2.711590296495957, |
| "grad_norm": 0.5936254858970642, |
| "learning_rate": 0.0004378283864004317, |
| "loss": 3.5137, |
| "step": 25150 |
| }, |
| { |
| "epoch": 2.7169811320754715, |
| "grad_norm": 0.591632068157196, |
| "learning_rate": 0.0004375045871559633, |
| "loss": 3.5193, |
| "step": 25200 |
| }, |
| { |
| "epoch": 2.7223719676549867, |
| "grad_norm": 0.6370254158973694, |
| "learning_rate": 0.00043718078791149485, |
| "loss": 3.5535, |
| "step": 25250 |
| }, |
| { |
| "epoch": 2.7277628032345014, |
| "grad_norm": 0.5756546258926392, |
| "learning_rate": 0.00043685698866702645, |
| "loss": 3.5101, |
| "step": 25300 |
| }, |
| { |
| "epoch": 2.733153638814016, |
| "grad_norm": 0.595561146736145, |
| "learning_rate": 0.00043653318942255795, |
| "loss": 3.5358, |
| "step": 25350 |
| }, |
| { |
| "epoch": 2.7385444743935308, |
| "grad_norm": 0.6051717400550842, |
| "learning_rate": 0.00043620939017808956, |
| "loss": 3.5376, |
| "step": 25400 |
| }, |
| { |
| "epoch": 2.743935309973046, |
| "grad_norm": 0.5473976731300354, |
| "learning_rate": 0.0004358855909336211, |
| "loss": 3.5087, |
| "step": 25450 |
| }, |
| { |
| "epoch": 2.7493261455525606, |
| "grad_norm": 0.6153759360313416, |
| "learning_rate": 0.00043556179168915266, |
| "loss": 3.5703, |
| "step": 25500 |
| }, |
| { |
| "epoch": 2.7547169811320753, |
| "grad_norm": 0.5839317440986633, |
| "learning_rate": 0.00043523799244468426, |
| "loss": 3.5587, |
| "step": 25550 |
| }, |
| { |
| "epoch": 2.7601078167115904, |
| "grad_norm": 0.6133059859275818, |
| "learning_rate": 0.0004349141932002158, |
| "loss": 3.5361, |
| "step": 25600 |
| }, |
| { |
| "epoch": 2.765498652291105, |
| "grad_norm": 0.6802798509597778, |
| "learning_rate": 0.0004345903939557474, |
| "loss": 3.5245, |
| "step": 25650 |
| }, |
| { |
| "epoch": 2.77088948787062, |
| "grad_norm": 0.5973533987998962, |
| "learning_rate": 0.00043426659471127897, |
| "loss": 3.5436, |
| "step": 25700 |
| }, |
| { |
| "epoch": 2.776280323450135, |
| "grad_norm": 0.6152960658073425, |
| "learning_rate": 0.0004339492714516999, |
| "loss": 3.5327, |
| "step": 25750 |
| }, |
| { |
| "epoch": 2.7816711590296497, |
| "grad_norm": 0.5712870359420776, |
| "learning_rate": 0.00043362547220723147, |
| "loss": 3.5337, |
| "step": 25800 |
| }, |
| { |
| "epoch": 2.7870619946091644, |
| "grad_norm": 0.5867605209350586, |
| "learning_rate": 0.0004333016729627631, |
| "loss": 3.5213, |
| "step": 25850 |
| }, |
| { |
| "epoch": 2.7924528301886795, |
| "grad_norm": 0.6075189113616943, |
| "learning_rate": 0.0004329778737182946, |
| "loss": 3.5308, |
| "step": 25900 |
| }, |
| { |
| "epoch": 2.797843665768194, |
| "grad_norm": 0.5805151462554932, |
| "learning_rate": 0.00043265407447382623, |
| "loss": 3.5207, |
| "step": 25950 |
| }, |
| { |
| "epoch": 2.803234501347709, |
| "grad_norm": 0.5881994962692261, |
| "learning_rate": 0.0004323302752293577, |
| "loss": 3.5333, |
| "step": 26000 |
| }, |
| { |
| "epoch": 2.803234501347709, |
| "eval_accuracy": 0.368511371774218, |
| "eval_loss": 3.51200795173645, |
| "eval_runtime": 185.141, |
| "eval_samples_per_second": 97.283, |
| "eval_steps_per_second": 6.082, |
| "step": 26000 |
| }, |
| { |
| "epoch": 2.8086253369272236, |
| "grad_norm": 0.5690694451332092, |
| "learning_rate": 0.00043200647598488933, |
| "loss": 3.5296, |
| "step": 26050 |
| }, |
| { |
| "epoch": 2.8140161725067383, |
| "grad_norm": 0.5980532169342041, |
| "learning_rate": 0.0004316826767404209, |
| "loss": 3.5044, |
| "step": 26100 |
| }, |
| { |
| "epoch": 2.8194070080862534, |
| "grad_norm": 0.5740624070167542, |
| "learning_rate": 0.0004313588774959525, |
| "loss": 3.5082, |
| "step": 26150 |
| }, |
| { |
| "epoch": 2.824797843665768, |
| "grad_norm": 0.6146470904350281, |
| "learning_rate": 0.00043103507825148404, |
| "loss": 3.532, |
| "step": 26200 |
| }, |
| { |
| "epoch": 2.830188679245283, |
| "grad_norm": 0.5921385288238525, |
| "learning_rate": 0.0004307112790070156, |
| "loss": 3.5445, |
| "step": 26250 |
| }, |
| { |
| "epoch": 2.835579514824798, |
| "grad_norm": 0.5627673268318176, |
| "learning_rate": 0.0004303874797625472, |
| "loss": 3.5604, |
| "step": 26300 |
| }, |
| { |
| "epoch": 2.8409703504043127, |
| "grad_norm": 0.6661468744277954, |
| "learning_rate": 0.00043006368051807874, |
| "loss": 3.5291, |
| "step": 26350 |
| }, |
| { |
| "epoch": 2.8463611859838274, |
| "grad_norm": 0.6103497743606567, |
| "learning_rate": 0.00042973988127361035, |
| "loss": 3.5391, |
| "step": 26400 |
| }, |
| { |
| "epoch": 2.8517520215633425, |
| "grad_norm": 0.6032962799072266, |
| "learning_rate": 0.0004294160820291419, |
| "loss": 3.545, |
| "step": 26450 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 0.5415673851966858, |
| "learning_rate": 0.0004290922827846735, |
| "loss": 3.5161, |
| "step": 26500 |
| }, |
| { |
| "epoch": 2.862533692722372, |
| "grad_norm": 0.5705322027206421, |
| "learning_rate": 0.00042876848354020505, |
| "loss": 3.5323, |
| "step": 26550 |
| }, |
| { |
| "epoch": 2.867924528301887, |
| "grad_norm": 0.5669793486595154, |
| "learning_rate": 0.00042844468429573655, |
| "loss": 3.5407, |
| "step": 26600 |
| }, |
| { |
| "epoch": 2.8733153638814017, |
| "grad_norm": 0.605086624622345, |
| "learning_rate": 0.00042812088505126815, |
| "loss": 3.5466, |
| "step": 26650 |
| }, |
| { |
| "epoch": 2.8787061994609164, |
| "grad_norm": 0.5618116855621338, |
| "learning_rate": 0.0004277970858067997, |
| "loss": 3.5318, |
| "step": 26700 |
| }, |
| { |
| "epoch": 2.884097035040431, |
| "grad_norm": 0.6186133027076721, |
| "learning_rate": 0.0004274732865623313, |
| "loss": 3.5432, |
| "step": 26750 |
| }, |
| { |
| "epoch": 2.889487870619946, |
| "grad_norm": 0.5849158763885498, |
| "learning_rate": 0.00042714948731786286, |
| "loss": 3.5203, |
| "step": 26800 |
| }, |
| { |
| "epoch": 2.894878706199461, |
| "grad_norm": 0.6805973052978516, |
| "learning_rate": 0.00042682568807339447, |
| "loss": 3.5437, |
| "step": 26850 |
| }, |
| { |
| "epoch": 2.9002695417789757, |
| "grad_norm": 0.6036573648452759, |
| "learning_rate": 0.000426501888828926, |
| "loss": 3.5082, |
| "step": 26900 |
| }, |
| { |
| "epoch": 2.9056603773584904, |
| "grad_norm": 0.6047782301902771, |
| "learning_rate": 0.0004261780895844576, |
| "loss": 3.5299, |
| "step": 26950 |
| }, |
| { |
| "epoch": 2.9110512129380055, |
| "grad_norm": 0.5614576935768127, |
| "learning_rate": 0.00042585429033998917, |
| "loss": 3.5442, |
| "step": 27000 |
| }, |
| { |
| "epoch": 2.9110512129380055, |
| "eval_accuracy": 0.3696261504031946, |
| "eval_loss": 3.5053839683532715, |
| "eval_runtime": 185.4428, |
| "eval_samples_per_second": 97.124, |
| "eval_steps_per_second": 6.072, |
| "step": 27000 |
| }, |
| { |
| "epoch": 2.91644204851752, |
| "grad_norm": 0.6214551329612732, |
| "learning_rate": 0.0004255304910955207, |
| "loss": 3.5288, |
| "step": 27050 |
| }, |
| { |
| "epoch": 2.921832884097035, |
| "grad_norm": 0.5811451077461243, |
| "learning_rate": 0.0004252066918510523, |
| "loss": 3.5338, |
| "step": 27100 |
| }, |
| { |
| "epoch": 2.92722371967655, |
| "grad_norm": 0.6257526874542236, |
| "learning_rate": 0.0004248828926065839, |
| "loss": 3.5296, |
| "step": 27150 |
| }, |
| { |
| "epoch": 2.9326145552560647, |
| "grad_norm": 0.6285461187362671, |
| "learning_rate": 0.0004245590933621155, |
| "loss": 3.5139, |
| "step": 27200 |
| }, |
| { |
| "epoch": 2.9380053908355794, |
| "grad_norm": 0.549082338809967, |
| "learning_rate": 0.00042423529411764703, |
| "loss": 3.5302, |
| "step": 27250 |
| }, |
| { |
| "epoch": 2.9433962264150946, |
| "grad_norm": 0.5633655786514282, |
| "learning_rate": 0.00042391149487317864, |
| "loss": 3.5334, |
| "step": 27300 |
| }, |
| { |
| "epoch": 2.9487870619946093, |
| "grad_norm": 0.5984169244766235, |
| "learning_rate": 0.00042358769562871013, |
| "loss": 3.5178, |
| "step": 27350 |
| }, |
| { |
| "epoch": 2.954177897574124, |
| "grad_norm": 0.6494591236114502, |
| "learning_rate": 0.00042326389638424174, |
| "loss": 3.5112, |
| "step": 27400 |
| }, |
| { |
| "epoch": 2.9595687331536387, |
| "grad_norm": 0.5794746279716492, |
| "learning_rate": 0.0004229400971397733, |
| "loss": 3.5172, |
| "step": 27450 |
| }, |
| { |
| "epoch": 2.964959568733154, |
| "grad_norm": 0.5832740068435669, |
| "learning_rate": 0.00042261629789530484, |
| "loss": 3.5293, |
| "step": 27500 |
| }, |
| { |
| "epoch": 2.9703504043126685, |
| "grad_norm": 0.5642590522766113, |
| "learning_rate": 0.00042229249865083644, |
| "loss": 3.5181, |
| "step": 27550 |
| }, |
| { |
| "epoch": 2.975741239892183, |
| "grad_norm": 0.5790451765060425, |
| "learning_rate": 0.000421968699406368, |
| "loss": 3.5252, |
| "step": 27600 |
| }, |
| { |
| "epoch": 2.981132075471698, |
| "grad_norm": 0.6082571744918823, |
| "learning_rate": 0.0004216449001618996, |
| "loss": 3.5057, |
| "step": 27650 |
| }, |
| { |
| "epoch": 2.986522911051213, |
| "grad_norm": 0.5873731374740601, |
| "learning_rate": 0.00042132110091743115, |
| "loss": 3.5168, |
| "step": 27700 |
| }, |
| { |
| "epoch": 2.9919137466307277, |
| "grad_norm": 0.6429144144058228, |
| "learning_rate": 0.00042099730167296275, |
| "loss": 3.5223, |
| "step": 27750 |
| }, |
| { |
| "epoch": 2.9973045822102424, |
| "grad_norm": 0.642909586429596, |
| "learning_rate": 0.00042067997841338365, |
| "loss": 3.5247, |
| "step": 27800 |
| }, |
| { |
| "epoch": 3.0026954177897576, |
| "grad_norm": 0.6168606281280518, |
| "learning_rate": 0.00042035617916891526, |
| "loss": 3.4874, |
| "step": 27850 |
| }, |
| { |
| "epoch": 3.0080862533692723, |
| "grad_norm": 0.6800719499588013, |
| "learning_rate": 0.0004200323799244468, |
| "loss": 3.4281, |
| "step": 27900 |
| }, |
| { |
| "epoch": 3.013477088948787, |
| "grad_norm": 0.57563316822052, |
| "learning_rate": 0.0004197085806799784, |
| "loss": 3.4331, |
| "step": 27950 |
| }, |
| { |
| "epoch": 3.018867924528302, |
| "grad_norm": 0.607589602470398, |
| "learning_rate": 0.0004193847814355099, |
| "loss": 3.4451, |
| "step": 28000 |
| }, |
| { |
| "epoch": 3.018867924528302, |
| "eval_accuracy": 0.3699707973633421, |
| "eval_loss": 3.5011649131774902, |
| "eval_runtime": 185.7113, |
| "eval_samples_per_second": 96.984, |
| "eval_steps_per_second": 6.063, |
| "step": 28000 |
| }, |
| { |
| "epoch": 3.024258760107817, |
| "grad_norm": 0.6603455543518066, |
| "learning_rate": 0.0004190609821910415, |
| "loss": 3.4317, |
| "step": 28050 |
| }, |
| { |
| "epoch": 3.0296495956873315, |
| "grad_norm": 0.63117516040802, |
| "learning_rate": 0.00041873718294657306, |
| "loss": 3.4264, |
| "step": 28100 |
| }, |
| { |
| "epoch": 3.035040431266846, |
| "grad_norm": 0.6193031072616577, |
| "learning_rate": 0.00041841338370210467, |
| "loss": 3.466, |
| "step": 28150 |
| }, |
| { |
| "epoch": 3.0404312668463613, |
| "grad_norm": 0.6265783905982971, |
| "learning_rate": 0.0004180895844576362, |
| "loss": 3.4467, |
| "step": 28200 |
| }, |
| { |
| "epoch": 3.045822102425876, |
| "grad_norm": 0.5731920003890991, |
| "learning_rate": 0.00041776578521316777, |
| "loss": 3.4543, |
| "step": 28250 |
| }, |
| { |
| "epoch": 3.0512129380053907, |
| "grad_norm": 0.6172985434532166, |
| "learning_rate": 0.0004174419859686994, |
| "loss": 3.4541, |
| "step": 28300 |
| }, |
| { |
| "epoch": 3.056603773584906, |
| "grad_norm": 0.5971702337265015, |
| "learning_rate": 0.0004171181867242309, |
| "loss": 3.4452, |
| "step": 28350 |
| }, |
| { |
| "epoch": 3.0619946091644206, |
| "grad_norm": 0.6083709001541138, |
| "learning_rate": 0.00041679438747976253, |
| "loss": 3.4237, |
| "step": 28400 |
| }, |
| { |
| "epoch": 3.0673854447439353, |
| "grad_norm": 0.595926821231842, |
| "learning_rate": 0.0004164705882352941, |
| "loss": 3.4655, |
| "step": 28450 |
| }, |
| { |
| "epoch": 3.07277628032345, |
| "grad_norm": 0.6328946352005005, |
| "learning_rate": 0.0004161467889908257, |
| "loss": 3.4581, |
| "step": 28500 |
| }, |
| { |
| "epoch": 3.078167115902965, |
| "grad_norm": 0.62444669008255, |
| "learning_rate": 0.00041582298974635724, |
| "loss": 3.4453, |
| "step": 28550 |
| }, |
| { |
| "epoch": 3.08355795148248, |
| "grad_norm": 0.5992231965065002, |
| "learning_rate": 0.00041549919050188884, |
| "loss": 3.442, |
| "step": 28600 |
| }, |
| { |
| "epoch": 3.0889487870619945, |
| "grad_norm": 0.6712383031845093, |
| "learning_rate": 0.00041517539125742034, |
| "loss": 3.4401, |
| "step": 28650 |
| }, |
| { |
| "epoch": 3.0943396226415096, |
| "grad_norm": 0.58417809009552, |
| "learning_rate": 0.0004148515920129519, |
| "loss": 3.4495, |
| "step": 28700 |
| }, |
| { |
| "epoch": 3.0997304582210243, |
| "grad_norm": 0.6253139972686768, |
| "learning_rate": 0.0004145277927684835, |
| "loss": 3.4385, |
| "step": 28750 |
| }, |
| { |
| "epoch": 3.105121293800539, |
| "grad_norm": 0.5860809683799744, |
| "learning_rate": 0.00041420399352401504, |
| "loss": 3.4427, |
| "step": 28800 |
| }, |
| { |
| "epoch": 3.1105121293800537, |
| "grad_norm": 0.6193458437919617, |
| "learning_rate": 0.00041388019427954665, |
| "loss": 3.4264, |
| "step": 28850 |
| }, |
| { |
| "epoch": 3.115902964959569, |
| "grad_norm": 0.5895789861679077, |
| "learning_rate": 0.0004135563950350782, |
| "loss": 3.4446, |
| "step": 28900 |
| }, |
| { |
| "epoch": 3.1212938005390836, |
| "grad_norm": 0.6106541156768799, |
| "learning_rate": 0.0004132325957906098, |
| "loss": 3.4618, |
| "step": 28950 |
| }, |
| { |
| "epoch": 3.1266846361185983, |
| "grad_norm": 0.6361096501350403, |
| "learning_rate": 0.00041290879654614135, |
| "loss": 3.4295, |
| "step": 29000 |
| }, |
| { |
| "epoch": 3.1266846361185983, |
| "eval_accuracy": 0.3709342225195931, |
| "eval_loss": 3.496896266937256, |
| "eval_runtime": 184.8429, |
| "eval_samples_per_second": 97.439, |
| "eval_steps_per_second": 6.092, |
| "step": 29000 |
| }, |
| { |
| "epoch": 3.1320754716981134, |
| "grad_norm": 0.665164589881897, |
| "learning_rate": 0.00041258499730167296, |
| "loss": 3.4525, |
| "step": 29050 |
| }, |
| { |
| "epoch": 3.137466307277628, |
| "grad_norm": 0.6416609883308411, |
| "learning_rate": 0.0004122611980572045, |
| "loss": 3.4413, |
| "step": 29100 |
| }, |
| { |
| "epoch": 3.142857142857143, |
| "grad_norm": 0.6141297817230225, |
| "learning_rate": 0.00041193739881273606, |
| "loss": 3.4591, |
| "step": 29150 |
| }, |
| { |
| "epoch": 3.1482479784366575, |
| "grad_norm": 0.5525941848754883, |
| "learning_rate": 0.00041161359956826766, |
| "loss": 3.4582, |
| "step": 29200 |
| }, |
| { |
| "epoch": 3.1536388140161726, |
| "grad_norm": 0.5938381552696228, |
| "learning_rate": 0.0004112898003237992, |
| "loss": 3.4456, |
| "step": 29250 |
| }, |
| { |
| "epoch": 3.1590296495956873, |
| "grad_norm": 0.6277801990509033, |
| "learning_rate": 0.0004109660010793308, |
| "loss": 3.4546, |
| "step": 29300 |
| }, |
| { |
| "epoch": 3.164420485175202, |
| "grad_norm": 0.6602087616920471, |
| "learning_rate": 0.0004106422018348623, |
| "loss": 3.4666, |
| "step": 29350 |
| }, |
| { |
| "epoch": 3.169811320754717, |
| "grad_norm": 0.5656922459602356, |
| "learning_rate": 0.0004103184025903939, |
| "loss": 3.458, |
| "step": 29400 |
| }, |
| { |
| "epoch": 3.175202156334232, |
| "grad_norm": 0.605190634727478, |
| "learning_rate": 0.00040999460334592547, |
| "loss": 3.4687, |
| "step": 29450 |
| }, |
| { |
| "epoch": 3.1805929919137466, |
| "grad_norm": 0.6249133944511414, |
| "learning_rate": 0.000409670804101457, |
| "loss": 3.465, |
| "step": 29500 |
| }, |
| { |
| "epoch": 3.1859838274932613, |
| "grad_norm": 0.5883227586746216, |
| "learning_rate": 0.00040934700485698863, |
| "loss": 3.4413, |
| "step": 29550 |
| }, |
| { |
| "epoch": 3.1913746630727764, |
| "grad_norm": 0.5973005294799805, |
| "learning_rate": 0.0004090232056125202, |
| "loss": 3.4628, |
| "step": 29600 |
| }, |
| { |
| "epoch": 3.196765498652291, |
| "grad_norm": 0.5894708633422852, |
| "learning_rate": 0.0004086994063680518, |
| "loss": 3.4505, |
| "step": 29650 |
| }, |
| { |
| "epoch": 3.202156334231806, |
| "grad_norm": 0.5822822451591492, |
| "learning_rate": 0.00040837560712358333, |
| "loss": 3.4559, |
| "step": 29700 |
| }, |
| { |
| "epoch": 3.207547169811321, |
| "grad_norm": 0.6240995526313782, |
| "learning_rate": 0.00040805180787911494, |
| "loss": 3.4638, |
| "step": 29750 |
| }, |
| { |
| "epoch": 3.2129380053908356, |
| "grad_norm": 0.5901066064834595, |
| "learning_rate": 0.0004077344846195359, |
| "loss": 3.4456, |
| "step": 29800 |
| }, |
| { |
| "epoch": 3.2183288409703503, |
| "grad_norm": 0.6369420289993286, |
| "learning_rate": 0.00040741068537506744, |
| "loss": 3.467, |
| "step": 29850 |
| }, |
| { |
| "epoch": 3.223719676549865, |
| "grad_norm": 0.6294394135475159, |
| "learning_rate": 0.000407086886130599, |
| "loss": 3.4735, |
| "step": 29900 |
| }, |
| { |
| "epoch": 3.22911051212938, |
| "grad_norm": 0.6184892654418945, |
| "learning_rate": 0.0004067630868861306, |
| "loss": 3.4679, |
| "step": 29950 |
| }, |
| { |
| "epoch": 3.234501347708895, |
| "grad_norm": 0.6455713510513306, |
| "learning_rate": 0.0004064392876416621, |
| "loss": 3.4506, |
| "step": 30000 |
| }, |
| { |
| "epoch": 3.234501347708895, |
| "eval_accuracy": 0.3713301536427891, |
| "eval_loss": 3.490234613418579, |
| "eval_runtime": 185.1295, |
| "eval_samples_per_second": 97.289, |
| "eval_steps_per_second": 6.082, |
| "step": 30000 |
| }, |
| { |
| "epoch": 3.2398921832884096, |
| "grad_norm": 0.642624020576477, |
| "learning_rate": 0.0004061154883971937, |
| "loss": 3.4476, |
| "step": 30050 |
| }, |
| { |
| "epoch": 3.2452830188679247, |
| "grad_norm": 0.626826822757721, |
| "learning_rate": 0.00040579168915272525, |
| "loss": 3.4482, |
| "step": 30100 |
| }, |
| { |
| "epoch": 3.2506738544474394, |
| "grad_norm": 0.5784972310066223, |
| "learning_rate": 0.00040546788990825685, |
| "loss": 3.4517, |
| "step": 30150 |
| }, |
| { |
| "epoch": 3.256064690026954, |
| "grad_norm": 0.64665287733078, |
| "learning_rate": 0.0004051440906637884, |
| "loss": 3.4719, |
| "step": 30200 |
| }, |
| { |
| "epoch": 3.2614555256064692, |
| "grad_norm": 0.6225249171257019, |
| "learning_rate": 0.00040482029141931995, |
| "loss": 3.464, |
| "step": 30250 |
| }, |
| { |
| "epoch": 3.266846361185984, |
| "grad_norm": 0.7104143500328064, |
| "learning_rate": 0.00040449649217485156, |
| "loss": 3.465, |
| "step": 30300 |
| }, |
| { |
| "epoch": 3.2722371967654986, |
| "grad_norm": 0.6647838950157166, |
| "learning_rate": 0.0004041726929303831, |
| "loss": 3.4433, |
| "step": 30350 |
| }, |
| { |
| "epoch": 3.2776280323450133, |
| "grad_norm": 0.678033173084259, |
| "learning_rate": 0.0004038488936859147, |
| "loss": 3.4726, |
| "step": 30400 |
| }, |
| { |
| "epoch": 3.2830188679245285, |
| "grad_norm": 0.6125953197479248, |
| "learning_rate": 0.00040352509444144626, |
| "loss": 3.4584, |
| "step": 30450 |
| }, |
| { |
| "epoch": 3.288409703504043, |
| "grad_norm": 0.5959815979003906, |
| "learning_rate": 0.00040320129519697787, |
| "loss": 3.4427, |
| "step": 30500 |
| }, |
| { |
| "epoch": 3.293800539083558, |
| "grad_norm": 0.6709897518157959, |
| "learning_rate": 0.0004028774959525094, |
| "loss": 3.4719, |
| "step": 30550 |
| }, |
| { |
| "epoch": 3.2991913746630726, |
| "grad_norm": 0.7089282870292664, |
| "learning_rate": 0.000402553696708041, |
| "loss": 3.4638, |
| "step": 30600 |
| }, |
| { |
| "epoch": 3.3045822102425877, |
| "grad_norm": 0.6118188500404358, |
| "learning_rate": 0.0004022298974635726, |
| "loss": 3.4443, |
| "step": 30650 |
| }, |
| { |
| "epoch": 3.3099730458221024, |
| "grad_norm": 0.6362579464912415, |
| "learning_rate": 0.00040190609821910407, |
| "loss": 3.4604, |
| "step": 30700 |
| }, |
| { |
| "epoch": 3.315363881401617, |
| "grad_norm": 0.6267250776290894, |
| "learning_rate": 0.0004015822989746357, |
| "loss": 3.4298, |
| "step": 30750 |
| }, |
| { |
| "epoch": 3.3207547169811322, |
| "grad_norm": 0.5901178121566772, |
| "learning_rate": 0.0004012584997301672, |
| "loss": 3.4663, |
| "step": 30800 |
| }, |
| { |
| "epoch": 3.326145552560647, |
| "grad_norm": 0.6500908732414246, |
| "learning_rate": 0.00040093470048569883, |
| "loss": 3.446, |
| "step": 30850 |
| }, |
| { |
| "epoch": 3.3315363881401616, |
| "grad_norm": 0.6370844841003418, |
| "learning_rate": 0.0004006109012412304, |
| "loss": 3.4689, |
| "step": 30900 |
| }, |
| { |
| "epoch": 3.3369272237196768, |
| "grad_norm": 0.6281841993331909, |
| "learning_rate": 0.000400287101996762, |
| "loss": 3.4456, |
| "step": 30950 |
| }, |
| { |
| "epoch": 3.3423180592991915, |
| "grad_norm": 0.6539614200592041, |
| "learning_rate": 0.00039996330275229354, |
| "loss": 3.4582, |
| "step": 31000 |
| }, |
| { |
| "epoch": 3.3423180592991915, |
| "eval_accuracy": 0.37187863342047656, |
| "eval_loss": 3.48494291305542, |
| "eval_runtime": 184.9741, |
| "eval_samples_per_second": 97.37, |
| "eval_steps_per_second": 6.087, |
| "step": 31000 |
| }, |
| { |
| "epoch": 3.347708894878706, |
| "grad_norm": 0.8110284209251404, |
| "learning_rate": 0.00039963950350782514, |
| "loss": 3.4593, |
| "step": 31050 |
| }, |
| { |
| "epoch": 3.353099730458221, |
| "grad_norm": 0.6377867460250854, |
| "learning_rate": 0.0003993157042633567, |
| "loss": 3.4541, |
| "step": 31100 |
| }, |
| { |
| "epoch": 3.358490566037736, |
| "grad_norm": 0.6073828935623169, |
| "learning_rate": 0.00039899190501888824, |
| "loss": 3.4652, |
| "step": 31150 |
| }, |
| { |
| "epoch": 3.3638814016172507, |
| "grad_norm": 0.6192326545715332, |
| "learning_rate": 0.00039866810577441985, |
| "loss": 3.4487, |
| "step": 31200 |
| }, |
| { |
| "epoch": 3.3692722371967654, |
| "grad_norm": 0.6012443900108337, |
| "learning_rate": 0.0003983443065299514, |
| "loss": 3.4543, |
| "step": 31250 |
| }, |
| { |
| "epoch": 3.37466307277628, |
| "grad_norm": 0.5912206172943115, |
| "learning_rate": 0.000398020507285483, |
| "loss": 3.4795, |
| "step": 31300 |
| }, |
| { |
| "epoch": 3.3800539083557952, |
| "grad_norm": 0.615149974822998, |
| "learning_rate": 0.0003976967080410145, |
| "loss": 3.4505, |
| "step": 31350 |
| }, |
| { |
| "epoch": 3.38544474393531, |
| "grad_norm": 0.6301441192626953, |
| "learning_rate": 0.0003973729087965461, |
| "loss": 3.4511, |
| "step": 31400 |
| }, |
| { |
| "epoch": 3.3908355795148246, |
| "grad_norm": 0.6270235180854797, |
| "learning_rate": 0.00039704910955207765, |
| "loss": 3.4568, |
| "step": 31450 |
| }, |
| { |
| "epoch": 3.3962264150943398, |
| "grad_norm": 0.5964970588684082, |
| "learning_rate": 0.00039672531030760926, |
| "loss": 3.449, |
| "step": 31500 |
| }, |
| { |
| "epoch": 3.4016172506738545, |
| "grad_norm": 0.6153132319450378, |
| "learning_rate": 0.0003964015110631408, |
| "loss": 3.4681, |
| "step": 31550 |
| }, |
| { |
| "epoch": 3.407008086253369, |
| "grad_norm": 0.6561875343322754, |
| "learning_rate": 0.00039607771181867236, |
| "loss": 3.4452, |
| "step": 31600 |
| }, |
| { |
| "epoch": 3.4123989218328843, |
| "grad_norm": 0.6051456332206726, |
| "learning_rate": 0.00039575391257420397, |
| "loss": 3.4482, |
| "step": 31650 |
| }, |
| { |
| "epoch": 3.417789757412399, |
| "grad_norm": 0.6496569514274597, |
| "learning_rate": 0.0003954301133297355, |
| "loss": 3.4746, |
| "step": 31700 |
| }, |
| { |
| "epoch": 3.4231805929919137, |
| "grad_norm": 0.6450279355049133, |
| "learning_rate": 0.0003951063140852671, |
| "loss": 3.4468, |
| "step": 31750 |
| }, |
| { |
| "epoch": 3.4285714285714284, |
| "grad_norm": 0.6020802855491638, |
| "learning_rate": 0.00039478251484079867, |
| "loss": 3.4509, |
| "step": 31800 |
| }, |
| { |
| "epoch": 3.4339622641509435, |
| "grad_norm": 0.6902250647544861, |
| "learning_rate": 0.0003944587155963303, |
| "loss": 3.4817, |
| "step": 31850 |
| }, |
| { |
| "epoch": 3.439353099730458, |
| "grad_norm": 0.6622137427330017, |
| "learning_rate": 0.0003941349163518618, |
| "loss": 3.4645, |
| "step": 31900 |
| }, |
| { |
| "epoch": 3.444743935309973, |
| "grad_norm": 0.6333610415458679, |
| "learning_rate": 0.0003938175930922828, |
| "loss": 3.4427, |
| "step": 31950 |
| }, |
| { |
| "epoch": 3.450134770889488, |
| "grad_norm": 0.6233834624290466, |
| "learning_rate": 0.0003934937938478143, |
| "loss": 3.4597, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.450134770889488, |
| "eval_accuracy": 0.3733303446545658, |
| "eval_loss": 3.4757399559020996, |
| "eval_runtime": 185.0469, |
| "eval_samples_per_second": 97.332, |
| "eval_steps_per_second": 6.085, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.4555256064690028, |
| "grad_norm": 0.6415245532989502, |
| "learning_rate": 0.0003931699946033459, |
| "loss": 3.4613, |
| "step": 32050 |
| }, |
| { |
| "epoch": 3.4609164420485174, |
| "grad_norm": 0.6769406199455261, |
| "learning_rate": 0.00039284619535887743, |
| "loss": 3.4545, |
| "step": 32100 |
| }, |
| { |
| "epoch": 3.466307277628032, |
| "grad_norm": 0.6122919321060181, |
| "learning_rate": 0.00039252239611440904, |
| "loss": 3.4669, |
| "step": 32150 |
| }, |
| { |
| "epoch": 3.4716981132075473, |
| "grad_norm": 0.6024535894393921, |
| "learning_rate": 0.0003921985968699406, |
| "loss": 3.4484, |
| "step": 32200 |
| }, |
| { |
| "epoch": 3.477088948787062, |
| "grad_norm": 0.6409429311752319, |
| "learning_rate": 0.0003918747976254722, |
| "loss": 3.4661, |
| "step": 32250 |
| }, |
| { |
| "epoch": 3.4824797843665767, |
| "grad_norm": 0.6143214106559753, |
| "learning_rate": 0.00039155099838100374, |
| "loss": 3.4796, |
| "step": 32300 |
| }, |
| { |
| "epoch": 3.487870619946092, |
| "grad_norm": 0.5818310976028442, |
| "learning_rate": 0.0003912271991365353, |
| "loss": 3.4407, |
| "step": 32350 |
| }, |
| { |
| "epoch": 3.4932614555256065, |
| "grad_norm": 0.6170235276222229, |
| "learning_rate": 0.0003909033998920669, |
| "loss": 3.4722, |
| "step": 32400 |
| }, |
| { |
| "epoch": 3.498652291105121, |
| "grad_norm": 0.6212165951728821, |
| "learning_rate": 0.00039057960064759845, |
| "loss": 3.446, |
| "step": 32450 |
| }, |
| { |
| "epoch": 3.5040431266846364, |
| "grad_norm": 0.6115076541900635, |
| "learning_rate": 0.00039025580140313005, |
| "loss": 3.4707, |
| "step": 32500 |
| }, |
| { |
| "epoch": 3.509433962264151, |
| "grad_norm": 0.5680184960365295, |
| "learning_rate": 0.0003899320021586616, |
| "loss": 3.4612, |
| "step": 32550 |
| }, |
| { |
| "epoch": 3.5148247978436657, |
| "grad_norm": 0.6204341650009155, |
| "learning_rate": 0.0003896082029141932, |
| "loss": 3.4577, |
| "step": 32600 |
| }, |
| { |
| "epoch": 3.5202156334231804, |
| "grad_norm": 0.6625291705131531, |
| "learning_rate": 0.00038928440366972476, |
| "loss": 3.4669, |
| "step": 32650 |
| }, |
| { |
| "epoch": 3.525606469002695, |
| "grad_norm": 0.6184507608413696, |
| "learning_rate": 0.00038896060442525636, |
| "loss": 3.4411, |
| "step": 32700 |
| }, |
| { |
| "epoch": 3.5309973045822103, |
| "grad_norm": 0.6391364932060242, |
| "learning_rate": 0.00038863680518078786, |
| "loss": 3.4536, |
| "step": 32750 |
| }, |
| { |
| "epoch": 3.536388140161725, |
| "grad_norm": 0.6026434898376465, |
| "learning_rate": 0.0003883130059363194, |
| "loss": 3.4641, |
| "step": 32800 |
| }, |
| { |
| "epoch": 3.5417789757412397, |
| "grad_norm": 0.642951250076294, |
| "learning_rate": 0.000387989206691851, |
| "loss": 3.4257, |
| "step": 32850 |
| }, |
| { |
| "epoch": 3.547169811320755, |
| "grad_norm": 0.6193267107009888, |
| "learning_rate": 0.00038766540744738256, |
| "loss": 3.4491, |
| "step": 32900 |
| }, |
| { |
| "epoch": 3.5525606469002695, |
| "grad_norm": 0.6317867636680603, |
| "learning_rate": 0.00038734160820291417, |
| "loss": 3.4632, |
| "step": 32950 |
| }, |
| { |
| "epoch": 3.557951482479784, |
| "grad_norm": 0.654092013835907, |
| "learning_rate": 0.0003870178089584457, |
| "loss": 3.4639, |
| "step": 33000 |
| }, |
| { |
| "epoch": 3.557951482479784, |
| "eval_accuracy": 0.37359350194885255, |
| "eval_loss": 3.4721765518188477, |
| "eval_runtime": 185.0176, |
| "eval_samples_per_second": 97.347, |
| "eval_steps_per_second": 6.086, |
| "step": 33000 |
| }, |
| { |
| "epoch": 3.5633423180592994, |
| "grad_norm": 0.5980980396270752, |
| "learning_rate": 0.0003866940097139773, |
| "loss": 3.4638, |
| "step": 33050 |
| }, |
| { |
| "epoch": 3.568733153638814, |
| "grad_norm": 0.671788215637207, |
| "learning_rate": 0.0003863702104695089, |
| "loss": 3.4525, |
| "step": 33100 |
| }, |
| { |
| "epoch": 3.5741239892183287, |
| "grad_norm": 0.6702181100845337, |
| "learning_rate": 0.0003860464112250404, |
| "loss": 3.4601, |
| "step": 33150 |
| }, |
| { |
| "epoch": 3.579514824797844, |
| "grad_norm": 0.6230493187904358, |
| "learning_rate": 0.00038572261198057203, |
| "loss": 3.4616, |
| "step": 33200 |
| }, |
| { |
| "epoch": 3.5849056603773586, |
| "grad_norm": 0.6334137916564941, |
| "learning_rate": 0.0003853988127361036, |
| "loss": 3.4536, |
| "step": 33250 |
| }, |
| { |
| "epoch": 3.5902964959568733, |
| "grad_norm": 0.6252257227897644, |
| "learning_rate": 0.0003850750134916352, |
| "loss": 3.4431, |
| "step": 33300 |
| }, |
| { |
| "epoch": 3.595687331536388, |
| "grad_norm": 0.5797781348228455, |
| "learning_rate": 0.0003847512142471667, |
| "loss": 3.454, |
| "step": 33350 |
| }, |
| { |
| "epoch": 3.601078167115903, |
| "grad_norm": 0.6160080432891846, |
| "learning_rate": 0.0003844274150026983, |
| "loss": 3.4549, |
| "step": 33400 |
| }, |
| { |
| "epoch": 3.606469002695418, |
| "grad_norm": 0.6341015696525574, |
| "learning_rate": 0.00038410361575822984, |
| "loss": 3.4639, |
| "step": 33450 |
| }, |
| { |
| "epoch": 3.6118598382749325, |
| "grad_norm": 0.6332781910896301, |
| "learning_rate": 0.00038377981651376144, |
| "loss": 3.4719, |
| "step": 33500 |
| }, |
| { |
| "epoch": 3.617250673854447, |
| "grad_norm": 0.6350259184837341, |
| "learning_rate": 0.000383456017269293, |
| "loss": 3.4501, |
| "step": 33550 |
| }, |
| { |
| "epoch": 3.6226415094339623, |
| "grad_norm": 0.6050942540168762, |
| "learning_rate": 0.00038313221802482454, |
| "loss": 3.4529, |
| "step": 33600 |
| }, |
| { |
| "epoch": 3.628032345013477, |
| "grad_norm": 0.679898202419281, |
| "learning_rate": 0.00038280841878035615, |
| "loss": 3.4496, |
| "step": 33650 |
| }, |
| { |
| "epoch": 3.6334231805929917, |
| "grad_norm": 0.6030776500701904, |
| "learning_rate": 0.0003824846195358877, |
| "loss": 3.438, |
| "step": 33700 |
| }, |
| { |
| "epoch": 3.638814016172507, |
| "grad_norm": 0.6465497016906738, |
| "learning_rate": 0.0003821608202914193, |
| "loss": 3.46, |
| "step": 33750 |
| }, |
| { |
| "epoch": 3.6442048517520216, |
| "grad_norm": 0.6347126960754395, |
| "learning_rate": 0.00038183702104695085, |
| "loss": 3.4418, |
| "step": 33800 |
| }, |
| { |
| "epoch": 3.6495956873315363, |
| "grad_norm": 0.6211069822311401, |
| "learning_rate": 0.00038151322180248246, |
| "loss": 3.4722, |
| "step": 33850 |
| }, |
| { |
| "epoch": 3.6549865229110514, |
| "grad_norm": 0.6023736000061035, |
| "learning_rate": 0.000381189422558014, |
| "loss": 3.4463, |
| "step": 33900 |
| }, |
| { |
| "epoch": 3.660377358490566, |
| "grad_norm": 0.6519888043403625, |
| "learning_rate": 0.00038087209929843496, |
| "loss": 3.4477, |
| "step": 33950 |
| }, |
| { |
| "epoch": 3.665768194070081, |
| "grad_norm": 0.6298272013664246, |
| "learning_rate": 0.00038054830005396646, |
| "loss": 3.4602, |
| "step": 34000 |
| }, |
| { |
| "epoch": 3.665768194070081, |
| "eval_accuracy": 0.3735198352909143, |
| "eval_loss": 3.4653146266937256, |
| "eval_runtime": 185.1109, |
| "eval_samples_per_second": 97.298, |
| "eval_steps_per_second": 6.083, |
| "step": 34000 |
| }, |
| { |
| "epoch": 3.671159029649596, |
| "grad_norm": 0.6116703152656555, |
| "learning_rate": 0.00038022450080949806, |
| "loss": 3.4527, |
| "step": 34050 |
| }, |
| { |
| "epoch": 3.6765498652291106, |
| "grad_norm": 0.6061373949050903, |
| "learning_rate": 0.0003799007015650296, |
| "loss": 3.4695, |
| "step": 34100 |
| }, |
| { |
| "epoch": 3.6819407008086253, |
| "grad_norm": 0.6899125576019287, |
| "learning_rate": 0.0003795769023205612, |
| "loss": 3.4656, |
| "step": 34150 |
| }, |
| { |
| "epoch": 3.68733153638814, |
| "grad_norm": 0.6152036190032959, |
| "learning_rate": 0.00037925310307609277, |
| "loss": 3.4485, |
| "step": 34200 |
| }, |
| { |
| "epoch": 3.6927223719676547, |
| "grad_norm": 0.6439741253852844, |
| "learning_rate": 0.0003789293038316244, |
| "loss": 3.4411, |
| "step": 34250 |
| }, |
| { |
| "epoch": 3.69811320754717, |
| "grad_norm": 0.5817996263504028, |
| "learning_rate": 0.0003786055045871559, |
| "loss": 3.4418, |
| "step": 34300 |
| }, |
| { |
| "epoch": 3.7035040431266846, |
| "grad_norm": 0.633815348148346, |
| "learning_rate": 0.0003782817053426875, |
| "loss": 3.446, |
| "step": 34350 |
| }, |
| { |
| "epoch": 3.7088948787061993, |
| "grad_norm": 0.593890905380249, |
| "learning_rate": 0.0003779579060982191, |
| "loss": 3.4419, |
| "step": 34400 |
| }, |
| { |
| "epoch": 3.7142857142857144, |
| "grad_norm": 0.6376861929893494, |
| "learning_rate": 0.00037763410685375063, |
| "loss": 3.4648, |
| "step": 34450 |
| }, |
| { |
| "epoch": 3.719676549865229, |
| "grad_norm": 0.616452157497406, |
| "learning_rate": 0.00037731030760928223, |
| "loss": 3.4457, |
| "step": 34500 |
| }, |
| { |
| "epoch": 3.725067385444744, |
| "grad_norm": 0.6425932049751282, |
| "learning_rate": 0.0003769865083648138, |
| "loss": 3.4385, |
| "step": 34550 |
| }, |
| { |
| "epoch": 3.730458221024259, |
| "grad_norm": 0.6993752717971802, |
| "learning_rate": 0.0003766627091203454, |
| "loss": 3.4588, |
| "step": 34600 |
| }, |
| { |
| "epoch": 3.7358490566037736, |
| "grad_norm": 0.6306095123291016, |
| "learning_rate": 0.00037633890987587694, |
| "loss": 3.4574, |
| "step": 34650 |
| }, |
| { |
| "epoch": 3.7412398921832883, |
| "grad_norm": 0.6183214783668518, |
| "learning_rate": 0.00037602158661629784, |
| "loss": 3.4664, |
| "step": 34700 |
| }, |
| { |
| "epoch": 3.7466307277628035, |
| "grad_norm": 0.616973340511322, |
| "learning_rate": 0.0003756977873718294, |
| "loss": 3.4632, |
| "step": 34750 |
| }, |
| { |
| "epoch": 3.752021563342318, |
| "grad_norm": 0.64264976978302, |
| "learning_rate": 0.000375373988127361, |
| "loss": 3.4428, |
| "step": 34800 |
| }, |
| { |
| "epoch": 3.757412398921833, |
| "grad_norm": 0.6926430463790894, |
| "learning_rate": 0.00037505018888289254, |
| "loss": 3.4469, |
| "step": 34850 |
| }, |
| { |
| "epoch": 3.7628032345013476, |
| "grad_norm": 0.6002678871154785, |
| "learning_rate": 0.00037472638963842415, |
| "loss": 3.4612, |
| "step": 34900 |
| }, |
| { |
| "epoch": 3.7681940700808623, |
| "grad_norm": 0.7651305794715881, |
| "learning_rate": 0.0003744025903939557, |
| "loss": 3.454, |
| "step": 34950 |
| }, |
| { |
| "epoch": 3.7735849056603774, |
| "grad_norm": 0.6672982573509216, |
| "learning_rate": 0.0003740787911494873, |
| "loss": 3.4613, |
| "step": 35000 |
| }, |
| { |
| "epoch": 3.7735849056603774, |
| "eval_accuracy": 0.37492178350238453, |
| "eval_loss": 3.4591267108917236, |
| "eval_runtime": 185.1296, |
| "eval_samples_per_second": 97.289, |
| "eval_steps_per_second": 6.082, |
| "step": 35000 |
| }, |
| { |
| "epoch": 3.778975741239892, |
| "grad_norm": 0.6062513589859009, |
| "learning_rate": 0.00037375499190501885, |
| "loss": 3.47, |
| "step": 35050 |
| }, |
| { |
| "epoch": 3.784366576819407, |
| "grad_norm": 0.6883726716041565, |
| "learning_rate": 0.0003734311926605504, |
| "loss": 3.4365, |
| "step": 35100 |
| }, |
| { |
| "epoch": 3.789757412398922, |
| "grad_norm": 0.6614878177642822, |
| "learning_rate": 0.000373107393416082, |
| "loss": 3.4592, |
| "step": 35150 |
| }, |
| { |
| "epoch": 3.7951482479784366, |
| "grad_norm": 0.6300371289253235, |
| "learning_rate": 0.00037278359417161356, |
| "loss": 3.4572, |
| "step": 35200 |
| }, |
| { |
| "epoch": 3.8005390835579513, |
| "grad_norm": 0.6380262970924377, |
| "learning_rate": 0.00037245979492714517, |
| "loss": 3.4392, |
| "step": 35250 |
| }, |
| { |
| "epoch": 3.8059299191374665, |
| "grad_norm": 0.6063967943191528, |
| "learning_rate": 0.0003721359956826767, |
| "loss": 3.4528, |
| "step": 35300 |
| }, |
| { |
| "epoch": 3.811320754716981, |
| "grad_norm": 0.6829068064689636, |
| "learning_rate": 0.0003718121964382083, |
| "loss": 3.4549, |
| "step": 35350 |
| }, |
| { |
| "epoch": 3.816711590296496, |
| "grad_norm": 0.6208524703979492, |
| "learning_rate": 0.0003714883971937398, |
| "loss": 3.4547, |
| "step": 35400 |
| }, |
| { |
| "epoch": 3.822102425876011, |
| "grad_norm": 0.6724206209182739, |
| "learning_rate": 0.0003711645979492714, |
| "loss": 3.4494, |
| "step": 35450 |
| }, |
| { |
| "epoch": 3.8274932614555257, |
| "grad_norm": 0.586846113204956, |
| "learning_rate": 0.00037084079870480297, |
| "loss": 3.4517, |
| "step": 35500 |
| }, |
| { |
| "epoch": 3.8328840970350404, |
| "grad_norm": 0.7098872065544128, |
| "learning_rate": 0.0003705169994603345, |
| "loss": 3.4652, |
| "step": 35550 |
| }, |
| { |
| "epoch": 3.838274932614555, |
| "grad_norm": 0.6315362453460693, |
| "learning_rate": 0.00037019320021586613, |
| "loss": 3.4693, |
| "step": 35600 |
| }, |
| { |
| "epoch": 3.8436657681940702, |
| "grad_norm": 0.6404989957809448, |
| "learning_rate": 0.0003698694009713977, |
| "loss": 3.4676, |
| "step": 35650 |
| }, |
| { |
| "epoch": 3.849056603773585, |
| "grad_norm": 0.5798198580741882, |
| "learning_rate": 0.0003695456017269293, |
| "loss": 3.469, |
| "step": 35700 |
| }, |
| { |
| "epoch": 3.8544474393530996, |
| "grad_norm": 0.6681696772575378, |
| "learning_rate": 0.00036922180248246083, |
| "loss": 3.4485, |
| "step": 35750 |
| }, |
| { |
| "epoch": 3.8598382749326143, |
| "grad_norm": 0.8003295660018921, |
| "learning_rate": 0.00036889800323799244, |
| "loss": 3.4547, |
| "step": 35800 |
| }, |
| { |
| "epoch": 3.8652291105121295, |
| "grad_norm": 0.6079601645469666, |
| "learning_rate": 0.000368574203993524, |
| "loss": 3.4312, |
| "step": 35850 |
| }, |
| { |
| "epoch": 3.870619946091644, |
| "grad_norm": 0.6316880583763123, |
| "learning_rate": 0.0003682504047490556, |
| "loss": 3.4479, |
| "step": 35900 |
| }, |
| { |
| "epoch": 3.876010781671159, |
| "grad_norm": 0.6452414393424988, |
| "learning_rate": 0.00036792660550458714, |
| "loss": 3.4706, |
| "step": 35950 |
| }, |
| { |
| "epoch": 3.881401617250674, |
| "grad_norm": 0.6125510931015015, |
| "learning_rate": 0.00036760280626011864, |
| "loss": 3.4517, |
| "step": 36000 |
| }, |
| { |
| "epoch": 3.881401617250674, |
| "eval_accuracy": 0.3754607018259444, |
| "eval_loss": 3.4510653018951416, |
| "eval_runtime": 185.2329, |
| "eval_samples_per_second": 97.234, |
| "eval_steps_per_second": 6.079, |
| "step": 36000 |
| }, |
| { |
| "epoch": 3.8867924528301887, |
| "grad_norm": 0.7082051038742065, |
| "learning_rate": 0.0003672790070156503, |
| "loss": 3.4576, |
| "step": 36050 |
| }, |
| { |
| "epoch": 3.8921832884097034, |
| "grad_norm": 0.6857859492301941, |
| "learning_rate": 0.0003669552077711818, |
| "loss": 3.4543, |
| "step": 36100 |
| }, |
| { |
| "epoch": 3.8975741239892185, |
| "grad_norm": 0.5955175161361694, |
| "learning_rate": 0.0003666314085267134, |
| "loss": 3.4599, |
| "step": 36150 |
| }, |
| { |
| "epoch": 3.9029649595687332, |
| "grad_norm": 0.6065194010734558, |
| "learning_rate": 0.00036630760928224495, |
| "loss": 3.4475, |
| "step": 36200 |
| }, |
| { |
| "epoch": 3.908355795148248, |
| "grad_norm": 0.6187525391578674, |
| "learning_rate": 0.00036598381003777656, |
| "loss": 3.4643, |
| "step": 36250 |
| }, |
| { |
| "epoch": 3.913746630727763, |
| "grad_norm": 0.637075662612915, |
| "learning_rate": 0.0003656600107933081, |
| "loss": 3.4443, |
| "step": 36300 |
| }, |
| { |
| "epoch": 3.9191374663072778, |
| "grad_norm": 0.613892138004303, |
| "learning_rate": 0.0003653362115488397, |
| "loss": 3.4656, |
| "step": 36350 |
| }, |
| { |
| "epoch": 3.9245283018867925, |
| "grad_norm": 0.598813533782959, |
| "learning_rate": 0.00036501241230437126, |
| "loss": 3.4665, |
| "step": 36400 |
| }, |
| { |
| "epoch": 3.929919137466307, |
| "grad_norm": 0.6649259924888611, |
| "learning_rate": 0.0003646886130599028, |
| "loss": 3.4416, |
| "step": 36450 |
| }, |
| { |
| "epoch": 3.935309973045822, |
| "grad_norm": 0.6432799696922302, |
| "learning_rate": 0.0003643648138154344, |
| "loss": 3.4519, |
| "step": 36500 |
| }, |
| { |
| "epoch": 3.940700808625337, |
| "grad_norm": 0.6609790921211243, |
| "learning_rate": 0.00036404101457096597, |
| "loss": 3.4532, |
| "step": 36550 |
| }, |
| { |
| "epoch": 3.9460916442048517, |
| "grad_norm": 0.5976753234863281, |
| "learning_rate": 0.00036371721532649757, |
| "loss": 3.4501, |
| "step": 36600 |
| }, |
| { |
| "epoch": 3.9514824797843664, |
| "grad_norm": 0.7079935669898987, |
| "learning_rate": 0.0003633934160820291, |
| "loss": 3.4404, |
| "step": 36650 |
| }, |
| { |
| "epoch": 3.9568733153638815, |
| "grad_norm": 0.6590920686721802, |
| "learning_rate": 0.00036306961683756073, |
| "loss": 3.452, |
| "step": 36700 |
| }, |
| { |
| "epoch": 3.9622641509433962, |
| "grad_norm": 0.6470325589179993, |
| "learning_rate": 0.0003627458175930922, |
| "loss": 3.4424, |
| "step": 36750 |
| }, |
| { |
| "epoch": 3.967654986522911, |
| "grad_norm": 0.625947117805481, |
| "learning_rate": 0.0003624220183486238, |
| "loss": 3.4346, |
| "step": 36800 |
| }, |
| { |
| "epoch": 3.973045822102426, |
| "grad_norm": 0.6342935562133789, |
| "learning_rate": 0.0003620982191041554, |
| "loss": 3.4505, |
| "step": 36850 |
| }, |
| { |
| "epoch": 3.9784366576819408, |
| "grad_norm": 0.6525506973266602, |
| "learning_rate": 0.00036177441985968693, |
| "loss": 3.4421, |
| "step": 36900 |
| }, |
| { |
| "epoch": 3.9838274932614555, |
| "grad_norm": 0.6463566422462463, |
| "learning_rate": 0.00036145062061521854, |
| "loss": 3.4535, |
| "step": 36950 |
| }, |
| { |
| "epoch": 3.9892183288409706, |
| "grad_norm": 0.6641883254051208, |
| "learning_rate": 0.0003611268213707501, |
| "loss": 3.4488, |
| "step": 37000 |
| }, |
| { |
| "epoch": 3.9892183288409706, |
| "eval_accuracy": 0.3760902366579415, |
| "eval_loss": 3.445734977722168, |
| "eval_runtime": 185.0242, |
| "eval_samples_per_second": 97.344, |
| "eval_steps_per_second": 6.086, |
| "step": 37000 |
| }, |
| { |
| "epoch": 3.9946091644204853, |
| "grad_norm": 0.6596526503562927, |
| "learning_rate": 0.0003608030221262817, |
| "loss": 3.4613, |
| "step": 37050 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 1.2152858972549438, |
| "learning_rate": 0.00036047922288181324, |
| "loss": 3.4334, |
| "step": 37100 |
| }, |
| { |
| "epoch": 4.005390835579515, |
| "grad_norm": 0.6752334833145142, |
| "learning_rate": 0.00036015542363734485, |
| "loss": 3.3286, |
| "step": 37150 |
| }, |
| { |
| "epoch": 4.010781671159029, |
| "grad_norm": 0.6589771509170532, |
| "learning_rate": 0.0003598316243928764, |
| "loss": 3.3567, |
| "step": 37200 |
| }, |
| { |
| "epoch": 4.0161725067385445, |
| "grad_norm": 0.6467882990837097, |
| "learning_rate": 0.00035950782514840795, |
| "loss": 3.3498, |
| "step": 37250 |
| }, |
| { |
| "epoch": 4.02156334231806, |
| "grad_norm": 0.6317258477210999, |
| "learning_rate": 0.00035918402590393955, |
| "loss": 3.3546, |
| "step": 37300 |
| }, |
| { |
| "epoch": 4.026954177897574, |
| "grad_norm": 0.6882931590080261, |
| "learning_rate": 0.00035886022665947105, |
| "loss": 3.3413, |
| "step": 37350 |
| }, |
| { |
| "epoch": 4.032345013477089, |
| "grad_norm": 0.6179378628730774, |
| "learning_rate": 0.0003585364274150027, |
| "loss": 3.361, |
| "step": 37400 |
| }, |
| { |
| "epoch": 4.037735849056604, |
| "grad_norm": 0.7111193537712097, |
| "learning_rate": 0.0003582126281705342, |
| "loss": 3.3608, |
| "step": 37450 |
| }, |
| { |
| "epoch": 4.0431266846361185, |
| "grad_norm": 0.6103177666664124, |
| "learning_rate": 0.0003578888289260658, |
| "loss": 3.3477, |
| "step": 37500 |
| }, |
| { |
| "epoch": 4.048517520215634, |
| "grad_norm": 0.67775958776474, |
| "learning_rate": 0.00035756502968159736, |
| "loss": 3.3644, |
| "step": 37550 |
| }, |
| { |
| "epoch": 4.053908355795148, |
| "grad_norm": 0.663544774055481, |
| "learning_rate": 0.00035724123043712896, |
| "loss": 3.3578, |
| "step": 37600 |
| }, |
| { |
| "epoch": 4.059299191374663, |
| "grad_norm": 0.6494781374931335, |
| "learning_rate": 0.0003569174311926605, |
| "loss": 3.3734, |
| "step": 37650 |
| }, |
| { |
| "epoch": 4.064690026954178, |
| "grad_norm": 0.6415791511535645, |
| "learning_rate": 0.00035659363194819206, |
| "loss": 3.3584, |
| "step": 37700 |
| }, |
| { |
| "epoch": 4.070080862533692, |
| "grad_norm": 0.6472448110580444, |
| "learning_rate": 0.00035626983270372367, |
| "loss": 3.3797, |
| "step": 37750 |
| }, |
| { |
| "epoch": 4.0754716981132075, |
| "grad_norm": 0.6979560256004333, |
| "learning_rate": 0.0003559460334592552, |
| "loss": 3.3631, |
| "step": 37800 |
| }, |
| { |
| "epoch": 4.080862533692723, |
| "grad_norm": 0.6640351414680481, |
| "learning_rate": 0.0003556222342147868, |
| "loss": 3.359, |
| "step": 37850 |
| }, |
| { |
| "epoch": 4.086253369272237, |
| "grad_norm": 0.6525617837905884, |
| "learning_rate": 0.0003552984349703184, |
| "loss": 3.359, |
| "step": 37900 |
| }, |
| { |
| "epoch": 4.091644204851752, |
| "grad_norm": 0.6497607231140137, |
| "learning_rate": 0.00035497463572585, |
| "loss": 3.3734, |
| "step": 37950 |
| }, |
| { |
| "epoch": 4.097035040431267, |
| "grad_norm": 0.6940391063690186, |
| "learning_rate": 0.00035465083648138153, |
| "loss": 3.3644, |
| "step": 38000 |
| }, |
| { |
| "epoch": 4.097035040431267, |
| "eval_accuracy": 0.3764235837177563, |
| "eval_loss": 3.4489781856536865, |
| "eval_runtime": 185.0419, |
| "eval_samples_per_second": 97.335, |
| "eval_steps_per_second": 6.085, |
| "step": 38000 |
| }, |
| { |
| "epoch": 4.1024258760107815, |
| "grad_norm": 0.7142944931983948, |
| "learning_rate": 0.00035432703723691314, |
| "loss": 3.3681, |
| "step": 38050 |
| }, |
| { |
| "epoch": 4.107816711590297, |
| "grad_norm": 0.6247251629829407, |
| "learning_rate": 0.00035400323799244463, |
| "loss": 3.3706, |
| "step": 38100 |
| }, |
| { |
| "epoch": 4.113207547169812, |
| "grad_norm": 0.6462594866752625, |
| "learning_rate": 0.0003536794387479762, |
| "loss": 3.3803, |
| "step": 38150 |
| }, |
| { |
| "epoch": 4.118598382749326, |
| "grad_norm": 0.659847617149353, |
| "learning_rate": 0.0003533556395035078, |
| "loss": 3.3622, |
| "step": 38200 |
| }, |
| { |
| "epoch": 4.123989218328841, |
| "grad_norm": 0.6913278102874756, |
| "learning_rate": 0.00035303184025903934, |
| "loss": 3.3643, |
| "step": 38250 |
| }, |
| { |
| "epoch": 4.129380053908355, |
| "grad_norm": 0.6582707166671753, |
| "learning_rate": 0.00035270804101457094, |
| "loss": 3.3768, |
| "step": 38300 |
| }, |
| { |
| "epoch": 4.1347708894878705, |
| "grad_norm": 0.6678318977355957, |
| "learning_rate": 0.0003523842417701025, |
| "loss": 3.3553, |
| "step": 38350 |
| }, |
| { |
| "epoch": 4.140161725067386, |
| "grad_norm": 0.6429058909416199, |
| "learning_rate": 0.0003520604425256341, |
| "loss": 3.379, |
| "step": 38400 |
| }, |
| { |
| "epoch": 4.1455525606469, |
| "grad_norm": 0.6227869391441345, |
| "learning_rate": 0.00035173664328116565, |
| "loss": 3.3626, |
| "step": 38450 |
| }, |
| { |
| "epoch": 4.150943396226415, |
| "grad_norm": 0.6584265828132629, |
| "learning_rate": 0.00035141284403669725, |
| "loss": 3.3735, |
| "step": 38500 |
| }, |
| { |
| "epoch": 4.15633423180593, |
| "grad_norm": 0.6800707578659058, |
| "learning_rate": 0.0003510890447922288, |
| "loss": 3.398, |
| "step": 38550 |
| }, |
| { |
| "epoch": 4.1617250673854445, |
| "grad_norm": 0.6075555086135864, |
| "learning_rate": 0.00035077172153264976, |
| "loss": 3.3731, |
| "step": 38600 |
| }, |
| { |
| "epoch": 4.16711590296496, |
| "grad_norm": 0.6489547491073608, |
| "learning_rate": 0.0003504479222881813, |
| "loss": 3.3858, |
| "step": 38650 |
| }, |
| { |
| "epoch": 4.172506738544475, |
| "grad_norm": 0.7513884902000427, |
| "learning_rate": 0.0003501241230437129, |
| "loss": 3.3517, |
| "step": 38700 |
| }, |
| { |
| "epoch": 4.177897574123989, |
| "grad_norm": 0.6607750058174133, |
| "learning_rate": 0.0003498003237992444, |
| "loss": 3.3763, |
| "step": 38750 |
| }, |
| { |
| "epoch": 4.183288409703504, |
| "grad_norm": 0.6463075280189514, |
| "learning_rate": 0.000349476524554776, |
| "loss": 3.3807, |
| "step": 38800 |
| }, |
| { |
| "epoch": 4.188679245283019, |
| "grad_norm": 0.7830615639686584, |
| "learning_rate": 0.00034915272531030756, |
| "loss": 3.3839, |
| "step": 38850 |
| }, |
| { |
| "epoch": 4.1940700808625335, |
| "grad_norm": 0.6614366769790649, |
| "learning_rate": 0.0003488289260658391, |
| "loss": 3.3912, |
| "step": 38900 |
| }, |
| { |
| "epoch": 4.199460916442049, |
| "grad_norm": 0.7006166577339172, |
| "learning_rate": 0.0003485051268213707, |
| "loss": 3.3844, |
| "step": 38950 |
| }, |
| { |
| "epoch": 4.204851752021563, |
| "grad_norm": 0.6897702813148499, |
| "learning_rate": 0.00034818132757690227, |
| "loss": 3.3636, |
| "step": 39000 |
| }, |
| { |
| "epoch": 4.204851752021563, |
| "eval_accuracy": 0.37732518538084087, |
| "eval_loss": 3.445204734802246, |
| "eval_runtime": 184.9796, |
| "eval_samples_per_second": 97.367, |
| "eval_steps_per_second": 6.087, |
| "step": 39000 |
| }, |
| { |
| "epoch": 4.210242587601078, |
| "grad_norm": 0.6401898264884949, |
| "learning_rate": 0.0003478575283324339, |
| "loss": 3.3922, |
| "step": 39050 |
| }, |
| { |
| "epoch": 4.215633423180593, |
| "grad_norm": 0.6454992294311523, |
| "learning_rate": 0.0003475337290879654, |
| "loss": 3.3863, |
| "step": 39100 |
| }, |
| { |
| "epoch": 4.2210242587601075, |
| "grad_norm": 0.6486014723777771, |
| "learning_rate": 0.00034720992984349703, |
| "loss": 3.395, |
| "step": 39150 |
| }, |
| { |
| "epoch": 4.226415094339623, |
| "grad_norm": 0.6316015720367432, |
| "learning_rate": 0.0003468861305990286, |
| "loss": 3.3762, |
| "step": 39200 |
| }, |
| { |
| "epoch": 4.231805929919138, |
| "grad_norm": 0.6304581761360168, |
| "learning_rate": 0.0003465623313545602, |
| "loss": 3.3712, |
| "step": 39250 |
| }, |
| { |
| "epoch": 4.237196765498652, |
| "grad_norm": 0.6496357917785645, |
| "learning_rate": 0.00034623853211009173, |
| "loss": 3.3843, |
| "step": 39300 |
| }, |
| { |
| "epoch": 4.242587601078167, |
| "grad_norm": 0.6726136207580566, |
| "learning_rate": 0.00034591473286562323, |
| "loss": 3.3732, |
| "step": 39350 |
| }, |
| { |
| "epoch": 4.247978436657682, |
| "grad_norm": 0.6343677043914795, |
| "learning_rate": 0.0003455909336211549, |
| "loss": 3.3922, |
| "step": 39400 |
| }, |
| { |
| "epoch": 4.2533692722371965, |
| "grad_norm": 0.6977464556694031, |
| "learning_rate": 0.0003452671343766864, |
| "loss": 3.3806, |
| "step": 39450 |
| }, |
| { |
| "epoch": 4.258760107816712, |
| "grad_norm": 0.6857298016548157, |
| "learning_rate": 0.000344943335132218, |
| "loss": 3.3852, |
| "step": 39500 |
| }, |
| { |
| "epoch": 4.264150943396227, |
| "grad_norm": 0.6643324494361877, |
| "learning_rate": 0.00034461953588774954, |
| "loss": 3.3899, |
| "step": 39550 |
| }, |
| { |
| "epoch": 4.269541778975741, |
| "grad_norm": 0.6354033946990967, |
| "learning_rate": 0.00034429573664328115, |
| "loss": 3.3981, |
| "step": 39600 |
| }, |
| { |
| "epoch": 4.274932614555256, |
| "grad_norm": 0.6460668444633484, |
| "learning_rate": 0.0003439719373988127, |
| "loss": 3.3938, |
| "step": 39650 |
| }, |
| { |
| "epoch": 4.280323450134771, |
| "grad_norm": 0.7137424945831299, |
| "learning_rate": 0.00034364813815434425, |
| "loss": 3.3745, |
| "step": 39700 |
| }, |
| { |
| "epoch": 4.285714285714286, |
| "grad_norm": 0.7198200821876526, |
| "learning_rate": 0.00034332433890987585, |
| "loss": 3.3825, |
| "step": 39750 |
| }, |
| { |
| "epoch": 4.291105121293801, |
| "grad_norm": 0.6877724528312683, |
| "learning_rate": 0.0003430005396654074, |
| "loss": 3.383, |
| "step": 39800 |
| }, |
| { |
| "epoch": 4.296495956873315, |
| "grad_norm": 0.6581820249557495, |
| "learning_rate": 0.000342676740420939, |
| "loss": 3.3829, |
| "step": 39850 |
| }, |
| { |
| "epoch": 4.30188679245283, |
| "grad_norm": 0.7357942461967468, |
| "learning_rate": 0.00034235294117647056, |
| "loss": 3.3896, |
| "step": 39900 |
| }, |
| { |
| "epoch": 4.307277628032345, |
| "grad_norm": 0.6755037903785706, |
| "learning_rate": 0.00034202914193200216, |
| "loss": 3.3776, |
| "step": 39950 |
| }, |
| { |
| "epoch": 4.3126684636118595, |
| "grad_norm": 0.604346752166748, |
| "learning_rate": 0.0003417053426875337, |
| "loss": 3.3899, |
| "step": 40000 |
| }, |
| { |
| "epoch": 4.3126684636118595, |
| "eval_accuracy": 0.37746034957328206, |
| "eval_loss": 3.44191312789917, |
| "eval_runtime": 185.1596, |
| "eval_samples_per_second": 97.273, |
| "eval_steps_per_second": 6.081, |
| "step": 40000 |
| }, |
| { |
| "epoch": 4.318059299191375, |
| "grad_norm": 0.7762167453765869, |
| "learning_rate": 0.0003413815434430653, |
| "loss": 3.3884, |
| "step": 40050 |
| }, |
| { |
| "epoch": 4.32345013477089, |
| "grad_norm": 0.7179322242736816, |
| "learning_rate": 0.0003410577441985968, |
| "loss": 3.3853, |
| "step": 40100 |
| }, |
| { |
| "epoch": 4.328840970350404, |
| "grad_norm": 0.6719879508018494, |
| "learning_rate": 0.00034073394495412837, |
| "loss": 3.3954, |
| "step": 40150 |
| }, |
| { |
| "epoch": 4.334231805929919, |
| "grad_norm": 0.705895185470581, |
| "learning_rate": 0.00034041014570965997, |
| "loss": 3.3915, |
| "step": 40200 |
| }, |
| { |
| "epoch": 4.339622641509434, |
| "grad_norm": 0.6576878428459167, |
| "learning_rate": 0.0003400863464651915, |
| "loss": 3.378, |
| "step": 40250 |
| }, |
| { |
| "epoch": 4.345013477088949, |
| "grad_norm": 0.6394710540771484, |
| "learning_rate": 0.0003397625472207231, |
| "loss": 3.3928, |
| "step": 40300 |
| }, |
| { |
| "epoch": 4.350404312668464, |
| "grad_norm": 0.6182326078414917, |
| "learning_rate": 0.0003394387479762547, |
| "loss": 3.3743, |
| "step": 40350 |
| }, |
| { |
| "epoch": 4.355795148247978, |
| "grad_norm": 0.6548103094100952, |
| "learning_rate": 0.0003391149487317863, |
| "loss": 3.3867, |
| "step": 40400 |
| }, |
| { |
| "epoch": 4.361185983827493, |
| "grad_norm": 0.6793950796127319, |
| "learning_rate": 0.00033879114948731783, |
| "loss": 3.3844, |
| "step": 40450 |
| }, |
| { |
| "epoch": 4.366576819407008, |
| "grad_norm": 0.6479836702346802, |
| "learning_rate": 0.00033846735024284944, |
| "loss": 3.3977, |
| "step": 40500 |
| }, |
| { |
| "epoch": 4.3719676549865225, |
| "grad_norm": 0.6476645469665527, |
| "learning_rate": 0.000338143550998381, |
| "loss": 3.3889, |
| "step": 40550 |
| }, |
| { |
| "epoch": 4.377358490566038, |
| "grad_norm": 0.6995546221733093, |
| "learning_rate": 0.00033781975175391254, |
| "loss": 3.3794, |
| "step": 40600 |
| }, |
| { |
| "epoch": 4.382749326145553, |
| "grad_norm": 0.7507542371749878, |
| "learning_rate": 0.00033749595250944414, |
| "loss": 3.3889, |
| "step": 40650 |
| }, |
| { |
| "epoch": 4.388140161725067, |
| "grad_norm": 0.7036803364753723, |
| "learning_rate": 0.00033717215326497564, |
| "loss": 3.3894, |
| "step": 40700 |
| }, |
| { |
| "epoch": 4.393530997304582, |
| "grad_norm": 0.6556727886199951, |
| "learning_rate": 0.0003368483540205073, |
| "loss": 3.3865, |
| "step": 40750 |
| }, |
| { |
| "epoch": 4.398921832884097, |
| "grad_norm": 0.6941169500350952, |
| "learning_rate": 0.0003365245547760388, |
| "loss": 3.3765, |
| "step": 40800 |
| }, |
| { |
| "epoch": 4.404312668463612, |
| "grad_norm": 0.6608136892318726, |
| "learning_rate": 0.0003362007555315704, |
| "loss": 3.3915, |
| "step": 40850 |
| }, |
| { |
| "epoch": 4.409703504043127, |
| "grad_norm": 0.8930730223655701, |
| "learning_rate": 0.00033587695628710195, |
| "loss": 3.3998, |
| "step": 40900 |
| }, |
| { |
| "epoch": 4.415094339622642, |
| "grad_norm": 0.6588183045387268, |
| "learning_rate": 0.00033555315704263355, |
| "loss": 3.3953, |
| "step": 40950 |
| }, |
| { |
| "epoch": 4.420485175202156, |
| "grad_norm": 0.6629270315170288, |
| "learning_rate": 0.0003352293577981651, |
| "loss": 3.3969, |
| "step": 41000 |
| }, |
| { |
| "epoch": 4.420485175202156, |
| "eval_accuracy": 0.3781523598157725, |
| "eval_loss": 3.4341278076171875, |
| "eval_runtime": 185.2042, |
| "eval_samples_per_second": 97.249, |
| "eval_steps_per_second": 6.08, |
| "step": 41000 |
| }, |
| { |
| "epoch": 4.425876010781671, |
| "grad_norm": 0.6934705972671509, |
| "learning_rate": 0.00033490555855369665, |
| "loss": 3.402, |
| "step": 41050 |
| }, |
| { |
| "epoch": 4.431266846361186, |
| "grad_norm": 0.608988881111145, |
| "learning_rate": 0.00033458175930922826, |
| "loss": 3.3712, |
| "step": 41100 |
| }, |
| { |
| "epoch": 4.436657681940701, |
| "grad_norm": 0.6706856489181519, |
| "learning_rate": 0.0003342579600647598, |
| "loss": 3.3936, |
| "step": 41150 |
| }, |
| { |
| "epoch": 4.442048517520216, |
| "grad_norm": 0.6479840874671936, |
| "learning_rate": 0.0003339341608202914, |
| "loss": 3.4055, |
| "step": 41200 |
| }, |
| { |
| "epoch": 4.44743935309973, |
| "grad_norm": 0.680960476398468, |
| "learning_rate": 0.00033361036157582297, |
| "loss": 3.4043, |
| "step": 41250 |
| }, |
| { |
| "epoch": 4.452830188679245, |
| "grad_norm": 0.6902644634246826, |
| "learning_rate": 0.00033328656233135457, |
| "loss": 3.3727, |
| "step": 41300 |
| }, |
| { |
| "epoch": 4.45822102425876, |
| "grad_norm": 0.6928199529647827, |
| "learning_rate": 0.0003329627630868861, |
| "loss": 3.379, |
| "step": 41350 |
| }, |
| { |
| "epoch": 4.463611859838275, |
| "grad_norm": 0.7097663879394531, |
| "learning_rate": 0.0003326389638424177, |
| "loss": 3.3974, |
| "step": 41400 |
| }, |
| { |
| "epoch": 4.46900269541779, |
| "grad_norm": 0.6607456803321838, |
| "learning_rate": 0.0003323151645979492, |
| "loss": 3.4084, |
| "step": 41450 |
| }, |
| { |
| "epoch": 4.474393530997305, |
| "grad_norm": 0.7242568731307983, |
| "learning_rate": 0.00033199136535348077, |
| "loss": 3.4067, |
| "step": 41500 |
| }, |
| { |
| "epoch": 4.479784366576819, |
| "grad_norm": 0.6340367197990417, |
| "learning_rate": 0.0003316675661090124, |
| "loss": 3.3938, |
| "step": 41550 |
| }, |
| { |
| "epoch": 4.485175202156334, |
| "grad_norm": 0.6760000586509705, |
| "learning_rate": 0.00033134376686454393, |
| "loss": 3.3863, |
| "step": 41600 |
| }, |
| { |
| "epoch": 4.490566037735849, |
| "grad_norm": 0.6576944589614868, |
| "learning_rate": 0.00033101996762007553, |
| "loss": 3.3976, |
| "step": 41650 |
| }, |
| { |
| "epoch": 4.495956873315364, |
| "grad_norm": 0.6730291247367859, |
| "learning_rate": 0.0003306961683756071, |
| "loss": 3.3886, |
| "step": 41700 |
| }, |
| { |
| "epoch": 4.501347708894879, |
| "grad_norm": 0.685353696346283, |
| "learning_rate": 0.0003303723691311387, |
| "loss": 3.387, |
| "step": 41750 |
| }, |
| { |
| "epoch": 4.506738544474393, |
| "grad_norm": 0.9739151000976562, |
| "learning_rate": 0.00033004856988667024, |
| "loss": 3.4041, |
| "step": 41800 |
| }, |
| { |
| "epoch": 4.512129380053908, |
| "grad_norm": 0.6757647395133972, |
| "learning_rate": 0.0003297247706422018, |
| "loss": 3.3808, |
| "step": 41850 |
| }, |
| { |
| "epoch": 4.517520215633423, |
| "grad_norm": 0.6179267168045044, |
| "learning_rate": 0.0003294009713977334, |
| "loss": 3.3965, |
| "step": 41900 |
| }, |
| { |
| "epoch": 4.5229110512129385, |
| "grad_norm": 0.677100658416748, |
| "learning_rate": 0.00032907717215326494, |
| "loss": 3.3982, |
| "step": 41950 |
| }, |
| { |
| "epoch": 4.528301886792453, |
| "grad_norm": 0.6717151999473572, |
| "learning_rate": 0.00032875337290879655, |
| "loss": 3.3848, |
| "step": 42000 |
| }, |
| { |
| "epoch": 4.528301886792453, |
| "eval_accuracy": 0.37859935779624126, |
| "eval_loss": 3.430816888809204, |
| "eval_runtime": 185.1614, |
| "eval_samples_per_second": 97.272, |
| "eval_steps_per_second": 6.081, |
| "step": 42000 |
| }, |
| { |
| "epoch": 4.533692722371968, |
| "grad_norm": 0.647710919380188, |
| "learning_rate": 0.00032842957366432805, |
| "loss": 3.3855, |
| "step": 42050 |
| }, |
| { |
| "epoch": 4.539083557951482, |
| "grad_norm": 0.7008846998214722, |
| "learning_rate": 0.000328112250404749, |
| "loss": 3.3938, |
| "step": 42100 |
| }, |
| { |
| "epoch": 4.544474393530997, |
| "grad_norm": 0.6753333806991577, |
| "learning_rate": 0.00032778845116028066, |
| "loss": 3.3952, |
| "step": 42150 |
| }, |
| { |
| "epoch": 4.549865229110512, |
| "grad_norm": 0.6874195337295532, |
| "learning_rate": 0.00032746465191581215, |
| "loss": 3.3911, |
| "step": 42200 |
| }, |
| { |
| "epoch": 4.555256064690027, |
| "grad_norm": 0.6420570611953735, |
| "learning_rate": 0.0003271408526713437, |
| "loss": 3.4027, |
| "step": 42250 |
| }, |
| { |
| "epoch": 4.560646900269542, |
| "grad_norm": 0.7065884470939636, |
| "learning_rate": 0.0003268170534268753, |
| "loss": 3.3818, |
| "step": 42300 |
| }, |
| { |
| "epoch": 4.566037735849057, |
| "grad_norm": 0.6392266750335693, |
| "learning_rate": 0.00032649325418240686, |
| "loss": 3.4136, |
| "step": 42350 |
| }, |
| { |
| "epoch": 4.571428571428571, |
| "grad_norm": 0.662189781665802, |
| "learning_rate": 0.00032616945493793846, |
| "loss": 3.374, |
| "step": 42400 |
| }, |
| { |
| "epoch": 4.576819407008086, |
| "grad_norm": 0.6861265301704407, |
| "learning_rate": 0.00032584565569347, |
| "loss": 3.3951, |
| "step": 42450 |
| }, |
| { |
| "epoch": 4.5822102425876015, |
| "grad_norm": 0.6285902857780457, |
| "learning_rate": 0.0003255218564490016, |
| "loss": 3.4016, |
| "step": 42500 |
| }, |
| { |
| "epoch": 4.587601078167116, |
| "grad_norm": 0.6859892010688782, |
| "learning_rate": 0.00032519805720453317, |
| "loss": 3.3806, |
| "step": 42550 |
| }, |
| { |
| "epoch": 4.592991913746631, |
| "grad_norm": 0.6380650401115417, |
| "learning_rate": 0.0003248742579600647, |
| "loss": 3.4063, |
| "step": 42600 |
| }, |
| { |
| "epoch": 4.598382749326145, |
| "grad_norm": 0.6524938344955444, |
| "learning_rate": 0.0003245504587155963, |
| "loss": 3.3911, |
| "step": 42650 |
| }, |
| { |
| "epoch": 4.60377358490566, |
| "grad_norm": 0.6308937072753906, |
| "learning_rate": 0.0003242266594711278, |
| "loss": 3.3892, |
| "step": 42700 |
| }, |
| { |
| "epoch": 4.609164420485175, |
| "grad_norm": 0.648926854133606, |
| "learning_rate": 0.0003239028602266595, |
| "loss": 3.3987, |
| "step": 42750 |
| }, |
| { |
| "epoch": 4.6145552560646905, |
| "grad_norm": 0.6324000954627991, |
| "learning_rate": 0.000323579060982191, |
| "loss": 3.3729, |
| "step": 42800 |
| }, |
| { |
| "epoch": 4.619946091644205, |
| "grad_norm": 0.6731909513473511, |
| "learning_rate": 0.0003232552617377226, |
| "loss": 3.4003, |
| "step": 42850 |
| }, |
| { |
| "epoch": 4.62533692722372, |
| "grad_norm": 0.6357996463775635, |
| "learning_rate": 0.00032293146249325413, |
| "loss": 3.3966, |
| "step": 42900 |
| }, |
| { |
| "epoch": 4.630727762803234, |
| "grad_norm": 0.7101162075996399, |
| "learning_rate": 0.00032260766324878574, |
| "loss": 3.3918, |
| "step": 42950 |
| }, |
| { |
| "epoch": 4.636118598382749, |
| "grad_norm": 0.7121390104293823, |
| "learning_rate": 0.0003222838640043173, |
| "loss": 3.3952, |
| "step": 43000 |
| }, |
| { |
| "epoch": 4.636118598382749, |
| "eval_accuracy": 0.3787855888459553, |
| "eval_loss": 3.424402952194214, |
| "eval_runtime": 185.282, |
| "eval_samples_per_second": 97.209, |
| "eval_steps_per_second": 6.077, |
| "step": 43000 |
| }, |
| { |
| "epoch": 4.6415094339622645, |
| "grad_norm": 0.6777333617210388, |
| "learning_rate": 0.00032196006475984884, |
| "loss": 3.3872, |
| "step": 43050 |
| }, |
| { |
| "epoch": 4.646900269541779, |
| "grad_norm": 0.6901153922080994, |
| "learning_rate": 0.00032163626551538044, |
| "loss": 3.3889, |
| "step": 43100 |
| }, |
| { |
| "epoch": 4.652291105121294, |
| "grad_norm": 0.6530250310897827, |
| "learning_rate": 0.000321312466270912, |
| "loss": 3.4072, |
| "step": 43150 |
| }, |
| { |
| "epoch": 4.657681940700809, |
| "grad_norm": 0.7110807299613953, |
| "learning_rate": 0.0003209886670264436, |
| "loss": 3.3743, |
| "step": 43200 |
| }, |
| { |
| "epoch": 4.663072776280323, |
| "grad_norm": 0.648343563079834, |
| "learning_rate": 0.00032066486778197515, |
| "loss": 3.396, |
| "step": 43250 |
| }, |
| { |
| "epoch": 4.668463611859838, |
| "grad_norm": 0.6666106581687927, |
| "learning_rate": 0.00032034106853750675, |
| "loss": 3.3839, |
| "step": 43300 |
| }, |
| { |
| "epoch": 4.6738544474393535, |
| "grad_norm": 0.7402759194374084, |
| "learning_rate": 0.0003200172692930383, |
| "loss": 3.3912, |
| "step": 43350 |
| }, |
| { |
| "epoch": 4.679245283018868, |
| "grad_norm": 0.6613388061523438, |
| "learning_rate": 0.0003196934700485699, |
| "loss": 3.3894, |
| "step": 43400 |
| }, |
| { |
| "epoch": 4.684636118598383, |
| "grad_norm": 0.6671817898750305, |
| "learning_rate": 0.0003193696708041014, |
| "loss": 3.3974, |
| "step": 43450 |
| }, |
| { |
| "epoch": 4.690026954177897, |
| "grad_norm": 0.6724131107330322, |
| "learning_rate": 0.00031904587155963296, |
| "loss": 3.3766, |
| "step": 43500 |
| }, |
| { |
| "epoch": 4.695417789757412, |
| "grad_norm": 0.6639029383659363, |
| "learning_rate": 0.00031872207231516456, |
| "loss": 3.3921, |
| "step": 43550 |
| }, |
| { |
| "epoch": 4.7008086253369274, |
| "grad_norm": 0.6928759217262268, |
| "learning_rate": 0.0003183982730706961, |
| "loss": 3.3984, |
| "step": 43600 |
| }, |
| { |
| "epoch": 4.706199460916442, |
| "grad_norm": 0.6988749504089355, |
| "learning_rate": 0.0003180744738262277, |
| "loss": 3.3954, |
| "step": 43650 |
| }, |
| { |
| "epoch": 4.711590296495957, |
| "grad_norm": 0.675839900970459, |
| "learning_rate": 0.00031775067458175927, |
| "loss": 3.3914, |
| "step": 43700 |
| }, |
| { |
| "epoch": 4.716981132075472, |
| "grad_norm": 0.6638389229774475, |
| "learning_rate": 0.00031742687533729087, |
| "loss": 3.4078, |
| "step": 43750 |
| }, |
| { |
| "epoch": 4.722371967654986, |
| "grad_norm": 0.6970890164375305, |
| "learning_rate": 0.0003171030760928224, |
| "loss": 3.3917, |
| "step": 43800 |
| }, |
| { |
| "epoch": 4.727762803234501, |
| "grad_norm": 0.6743645071983337, |
| "learning_rate": 0.000316779276848354, |
| "loss": 3.399, |
| "step": 43850 |
| }, |
| { |
| "epoch": 4.7331536388140165, |
| "grad_norm": 0.6509005427360535, |
| "learning_rate": 0.0003164554776038856, |
| "loss": 3.3884, |
| "step": 43900 |
| }, |
| { |
| "epoch": 4.738544474393531, |
| "grad_norm": 0.6444371938705444, |
| "learning_rate": 0.00031613167835941713, |
| "loss": 3.3879, |
| "step": 43950 |
| }, |
| { |
| "epoch": 4.743935309973046, |
| "grad_norm": 0.6728962063789368, |
| "learning_rate": 0.00031580787911494873, |
| "loss": 3.3875, |
| "step": 44000 |
| }, |
| { |
| "epoch": 4.743935309973046, |
| "eval_accuracy": 0.37932092162421727, |
| "eval_loss": 3.421056032180786, |
| "eval_runtime": 185.3808, |
| "eval_samples_per_second": 97.157, |
| "eval_steps_per_second": 6.074, |
| "step": 44000 |
| }, |
| { |
| "epoch": 4.74932614555256, |
| "grad_norm": 0.6364440321922302, |
| "learning_rate": 0.00031548407987048023, |
| "loss": 3.3854, |
| "step": 44050 |
| }, |
| { |
| "epoch": 4.754716981132075, |
| "grad_norm": 0.7157145738601685, |
| "learning_rate": 0.0003151602806260119, |
| "loss": 3.3942, |
| "step": 44100 |
| }, |
| { |
| "epoch": 4.7601078167115904, |
| "grad_norm": 0.7098649144172668, |
| "learning_rate": 0.0003148364813815434, |
| "loss": 3.3875, |
| "step": 44150 |
| }, |
| { |
| "epoch": 4.765498652291106, |
| "grad_norm": 0.691125214099884, |
| "learning_rate": 0.000314512682137075, |
| "loss": 3.3911, |
| "step": 44200 |
| }, |
| { |
| "epoch": 4.77088948787062, |
| "grad_norm": 0.6948944330215454, |
| "learning_rate": 0.00031418888289260654, |
| "loss": 3.3756, |
| "step": 44250 |
| }, |
| { |
| "epoch": 4.776280323450135, |
| "grad_norm": 0.6892076134681702, |
| "learning_rate": 0.0003138650836481381, |
| "loss": 3.3961, |
| "step": 44300 |
| }, |
| { |
| "epoch": 4.781671159029649, |
| "grad_norm": 0.7187123894691467, |
| "learning_rate": 0.0003135412844036697, |
| "loss": 3.3782, |
| "step": 44350 |
| }, |
| { |
| "epoch": 4.787061994609164, |
| "grad_norm": 0.6381692886352539, |
| "learning_rate": 0.00031321748515920124, |
| "loss": 3.4169, |
| "step": 44400 |
| }, |
| { |
| "epoch": 4.7924528301886795, |
| "grad_norm": 0.6556568145751953, |
| "learning_rate": 0.00031289368591473285, |
| "loss": 3.3752, |
| "step": 44450 |
| }, |
| { |
| "epoch": 4.797843665768194, |
| "grad_norm": 0.6677922010421753, |
| "learning_rate": 0.0003125698866702644, |
| "loss": 3.3898, |
| "step": 44500 |
| }, |
| { |
| "epoch": 4.803234501347709, |
| "grad_norm": 0.6631757616996765, |
| "learning_rate": 0.000312246087425796, |
| "loss": 3.3838, |
| "step": 44550 |
| }, |
| { |
| "epoch": 4.808625336927224, |
| "grad_norm": 0.6678897738456726, |
| "learning_rate": 0.00031192228818132756, |
| "loss": 3.3972, |
| "step": 44600 |
| }, |
| { |
| "epoch": 4.814016172506738, |
| "grad_norm": 0.7124047875404358, |
| "learning_rate": 0.00031159848893685916, |
| "loss": 3.3808, |
| "step": 44650 |
| }, |
| { |
| "epoch": 4.819407008086253, |
| "grad_norm": 0.6782823801040649, |
| "learning_rate": 0.0003112746896923907, |
| "loss": 3.3945, |
| "step": 44700 |
| }, |
| { |
| "epoch": 4.824797843665769, |
| "grad_norm": 0.687823474407196, |
| "learning_rate": 0.0003109508904479222, |
| "loss": 3.3879, |
| "step": 44750 |
| }, |
| { |
| "epoch": 4.830188679245283, |
| "grad_norm": 0.7118351459503174, |
| "learning_rate": 0.0003106270912034538, |
| "loss": 3.3857, |
| "step": 44800 |
| }, |
| { |
| "epoch": 4.835579514824798, |
| "grad_norm": 0.6740395426750183, |
| "learning_rate": 0.00031030329195898536, |
| "loss": 3.3894, |
| "step": 44850 |
| }, |
| { |
| "epoch": 4.840970350404312, |
| "grad_norm": 0.6710334420204163, |
| "learning_rate": 0.00030997949271451697, |
| "loss": 3.392, |
| "step": 44900 |
| }, |
| { |
| "epoch": 4.846361185983827, |
| "grad_norm": 0.6429509520530701, |
| "learning_rate": 0.0003096556934700485, |
| "loss": 3.3808, |
| "step": 44950 |
| }, |
| { |
| "epoch": 4.8517520215633425, |
| "grad_norm": 0.6663042306900024, |
| "learning_rate": 0.00030933837021046947, |
| "loss": 3.4071, |
| "step": 45000 |
| }, |
| { |
| "epoch": 4.8517520215633425, |
| "eval_accuracy": 0.3799037357144541, |
| "eval_loss": 3.415198802947998, |
| "eval_runtime": 185.3679, |
| "eval_samples_per_second": 97.164, |
| "eval_steps_per_second": 6.074, |
| "step": 45000 |
| }, |
| { |
| "epoch": 4.857142857142857, |
| "grad_norm": 0.7604882717132568, |
| "learning_rate": 0.0003090145709660011, |
| "loss": 3.377, |
| "step": 45050 |
| }, |
| { |
| "epoch": 4.862533692722372, |
| "grad_norm": 0.694174587726593, |
| "learning_rate": 0.0003086907717215326, |
| "loss": 3.3797, |
| "step": 45100 |
| }, |
| { |
| "epoch": 4.867924528301887, |
| "grad_norm": 0.690619945526123, |
| "learning_rate": 0.0003083669724770642, |
| "loss": 3.391, |
| "step": 45150 |
| }, |
| { |
| "epoch": 4.873315363881401, |
| "grad_norm": 0.7311848402023315, |
| "learning_rate": 0.0003080431732325958, |
| "loss": 3.3779, |
| "step": 45200 |
| }, |
| { |
| "epoch": 4.878706199460916, |
| "grad_norm": 0.7099789381027222, |
| "learning_rate": 0.00030771937398812733, |
| "loss": 3.3939, |
| "step": 45250 |
| }, |
| { |
| "epoch": 4.884097035040432, |
| "grad_norm": 0.7288787961006165, |
| "learning_rate": 0.00030739557474365894, |
| "loss": 3.3783, |
| "step": 45300 |
| }, |
| { |
| "epoch": 4.889487870619946, |
| "grad_norm": 0.7027914524078369, |
| "learning_rate": 0.0003070717754991905, |
| "loss": 3.3861, |
| "step": 45350 |
| }, |
| { |
| "epoch": 4.894878706199461, |
| "grad_norm": 0.6941841244697571, |
| "learning_rate": 0.0003067479762547221, |
| "loss": 3.3837, |
| "step": 45400 |
| }, |
| { |
| "epoch": 4.900269541778976, |
| "grad_norm": 0.756320059299469, |
| "learning_rate": 0.0003064241770102536, |
| "loss": 3.3929, |
| "step": 45450 |
| }, |
| { |
| "epoch": 4.90566037735849, |
| "grad_norm": 0.6624318361282349, |
| "learning_rate": 0.00030610037776578514, |
| "loss": 3.3948, |
| "step": 45500 |
| }, |
| { |
| "epoch": 4.9110512129380055, |
| "grad_norm": 0.6968894600868225, |
| "learning_rate": 0.00030577657852131674, |
| "loss": 3.3916, |
| "step": 45550 |
| }, |
| { |
| "epoch": 4.916442048517521, |
| "grad_norm": 0.6658734083175659, |
| "learning_rate": 0.0003054527792768483, |
| "loss": 3.3918, |
| "step": 45600 |
| }, |
| { |
| "epoch": 4.921832884097035, |
| "grad_norm": 0.6801126003265381, |
| "learning_rate": 0.0003051289800323799, |
| "loss": 3.4074, |
| "step": 45650 |
| }, |
| { |
| "epoch": 4.92722371967655, |
| "grad_norm": 0.679326057434082, |
| "learning_rate": 0.00030480518078791145, |
| "loss": 3.401, |
| "step": 45700 |
| }, |
| { |
| "epoch": 4.932614555256064, |
| "grad_norm": 0.6885223984718323, |
| "learning_rate": 0.00030448138154344305, |
| "loss": 3.3811, |
| "step": 45750 |
| }, |
| { |
| "epoch": 4.938005390835579, |
| "grad_norm": 0.7149697542190552, |
| "learning_rate": 0.0003041575822989746, |
| "loss": 3.4065, |
| "step": 45800 |
| }, |
| { |
| "epoch": 4.943396226415095, |
| "grad_norm": 0.6741778254508972, |
| "learning_rate": 0.0003038337830545062, |
| "loss": 3.3787, |
| "step": 45850 |
| }, |
| { |
| "epoch": 4.948787061994609, |
| "grad_norm": 0.6654149889945984, |
| "learning_rate": 0.00030350998381003776, |
| "loss": 3.3748, |
| "step": 45900 |
| }, |
| { |
| "epoch": 4.954177897574124, |
| "grad_norm": 0.682928740978241, |
| "learning_rate": 0.0003031861845655693, |
| "loss": 3.3726, |
| "step": 45950 |
| }, |
| { |
| "epoch": 4.959568733153639, |
| "grad_norm": 0.7227015495300293, |
| "learning_rate": 0.0003028623853211009, |
| "loss": 3.3742, |
| "step": 46000 |
| }, |
| { |
| "epoch": 4.959568733153639, |
| "eval_accuracy": 0.3803156388121588, |
| "eval_loss": 3.4095218181610107, |
| "eval_runtime": 185.3393, |
| "eval_samples_per_second": 97.179, |
| "eval_steps_per_second": 6.075, |
| "step": 46000 |
| }, |
| { |
| "epoch": 4.964959568733153, |
| "grad_norm": 0.6969720721244812, |
| "learning_rate": 0.00030253858607663247, |
| "loss": 3.3797, |
| "step": 46050 |
| }, |
| { |
| "epoch": 4.9703504043126685, |
| "grad_norm": 0.7063660621643066, |
| "learning_rate": 0.00030221478683216407, |
| "loss": 3.3897, |
| "step": 46100 |
| }, |
| { |
| "epoch": 4.975741239892184, |
| "grad_norm": 0.6650516986846924, |
| "learning_rate": 0.00030189098758769557, |
| "loss": 3.3888, |
| "step": 46150 |
| }, |
| { |
| "epoch": 4.981132075471698, |
| "grad_norm": 0.6772712469100952, |
| "learning_rate": 0.00030156718834322717, |
| "loss": 3.3944, |
| "step": 46200 |
| }, |
| { |
| "epoch": 4.986522911051213, |
| "grad_norm": 0.7059131860733032, |
| "learning_rate": 0.0003012433890987587, |
| "loss": 3.392, |
| "step": 46250 |
| }, |
| { |
| "epoch": 4.991913746630727, |
| "grad_norm": 0.7540570497512817, |
| "learning_rate": 0.0003009195898542903, |
| "loss": 3.3841, |
| "step": 46300 |
| }, |
| { |
| "epoch": 4.997304582210242, |
| "grad_norm": 0.7052427530288696, |
| "learning_rate": 0.0003005957906098219, |
| "loss": 3.3774, |
| "step": 46350 |
| }, |
| { |
| "epoch": 5.002695417789758, |
| "grad_norm": 0.7645816206932068, |
| "learning_rate": 0.00030027199136535343, |
| "loss": 3.3627, |
| "step": 46400 |
| }, |
| { |
| "epoch": 5.008086253369272, |
| "grad_norm": 0.6885704398155212, |
| "learning_rate": 0.00029994819212088503, |
| "loss": 3.2878, |
| "step": 46450 |
| }, |
| { |
| "epoch": 5.013477088948787, |
| "grad_norm": 1.085796594619751, |
| "learning_rate": 0.0002996243928764166, |
| "loss": 3.2952, |
| "step": 46500 |
| }, |
| { |
| "epoch": 5.018867924528302, |
| "grad_norm": 0.6787880659103394, |
| "learning_rate": 0.0002993005936319482, |
| "loss": 3.3063, |
| "step": 46550 |
| }, |
| { |
| "epoch": 5.024258760107816, |
| "grad_norm": 0.7390757203102112, |
| "learning_rate": 0.00029897679438747974, |
| "loss": 3.2968, |
| "step": 46600 |
| }, |
| { |
| "epoch": 5.0296495956873315, |
| "grad_norm": 0.7662341594696045, |
| "learning_rate": 0.0002986529951430113, |
| "loss": 3.2848, |
| "step": 46650 |
| }, |
| { |
| "epoch": 5.035040431266847, |
| "grad_norm": 0.6801024675369263, |
| "learning_rate": 0.0002983291958985429, |
| "loss": 3.2963, |
| "step": 46700 |
| }, |
| { |
| "epoch": 5.040431266846361, |
| "grad_norm": 0.6480717658996582, |
| "learning_rate": 0.00029800539665407444, |
| "loss": 3.2874, |
| "step": 46750 |
| }, |
| { |
| "epoch": 5.045822102425876, |
| "grad_norm": 0.7626944780349731, |
| "learning_rate": 0.000297681597409606, |
| "loss": 3.2959, |
| "step": 46800 |
| }, |
| { |
| "epoch": 5.051212938005391, |
| "grad_norm": 0.6716665625572205, |
| "learning_rate": 0.0002973577981651376, |
| "loss": 3.3083, |
| "step": 46850 |
| }, |
| { |
| "epoch": 5.056603773584905, |
| "grad_norm": 0.6842003464698792, |
| "learning_rate": 0.00029703399892066915, |
| "loss": 3.316, |
| "step": 46900 |
| }, |
| { |
| "epoch": 5.061994609164421, |
| "grad_norm": 0.6711278557777405, |
| "learning_rate": 0.00029671019967620076, |
| "loss": 3.3085, |
| "step": 46950 |
| }, |
| { |
| "epoch": 5.067385444743936, |
| "grad_norm": 0.6872760653495789, |
| "learning_rate": 0.0002963864004317323, |
| "loss": 3.3158, |
| "step": 47000 |
| }, |
| { |
| "epoch": 5.067385444743936, |
| "eval_accuracy": 0.380729606314732, |
| "eval_loss": 3.410980463027954, |
| "eval_runtime": 185.6161, |
| "eval_samples_per_second": 97.034, |
| "eval_steps_per_second": 6.066, |
| "step": 47000 |
| }, |
| { |
| "epoch": 5.07277628032345, |
| "grad_norm": 0.7116502523422241, |
| "learning_rate": 0.00029606260118726386, |
| "loss": 3.2866, |
| "step": 47050 |
| }, |
| { |
| "epoch": 5.078167115902965, |
| "grad_norm": 0.6567325592041016, |
| "learning_rate": 0.0002957388019427954, |
| "loss": 3.3009, |
| "step": 47100 |
| }, |
| { |
| "epoch": 5.083557951482479, |
| "grad_norm": 0.7238079309463501, |
| "learning_rate": 0.000295415002698327, |
| "loss": 3.3246, |
| "step": 47150 |
| }, |
| { |
| "epoch": 5.0889487870619945, |
| "grad_norm": 0.7564477324485779, |
| "learning_rate": 0.00029509120345385856, |
| "loss": 3.3181, |
| "step": 47200 |
| }, |
| { |
| "epoch": 5.09433962264151, |
| "grad_norm": 0.7536827921867371, |
| "learning_rate": 0.00029476740420939017, |
| "loss": 3.2986, |
| "step": 47250 |
| }, |
| { |
| "epoch": 5.099730458221024, |
| "grad_norm": 0.7181034684181213, |
| "learning_rate": 0.0002944436049649217, |
| "loss": 3.3121, |
| "step": 47300 |
| }, |
| { |
| "epoch": 5.105121293800539, |
| "grad_norm": 0.7820267081260681, |
| "learning_rate": 0.00029412628170534267, |
| "loss": 3.3151, |
| "step": 47350 |
| }, |
| { |
| "epoch": 5.110512129380054, |
| "grad_norm": 0.676846444606781, |
| "learning_rate": 0.0002938024824608742, |
| "loss": 3.3098, |
| "step": 47400 |
| }, |
| { |
| "epoch": 5.115902964959568, |
| "grad_norm": 0.7352178692817688, |
| "learning_rate": 0.00029347868321640577, |
| "loss": 3.3171, |
| "step": 47450 |
| }, |
| { |
| "epoch": 5.121293800539084, |
| "grad_norm": 0.7145463824272156, |
| "learning_rate": 0.0002931548839719374, |
| "loss": 3.3112, |
| "step": 47500 |
| }, |
| { |
| "epoch": 5.126684636118599, |
| "grad_norm": 0.6998820900917053, |
| "learning_rate": 0.0002928310847274689, |
| "loss": 3.3068, |
| "step": 47550 |
| }, |
| { |
| "epoch": 5.132075471698113, |
| "grad_norm": 0.6971082091331482, |
| "learning_rate": 0.00029250728548300053, |
| "loss": 3.3033, |
| "step": 47600 |
| }, |
| { |
| "epoch": 5.137466307277628, |
| "grad_norm": 0.6818746328353882, |
| "learning_rate": 0.0002921834862385321, |
| "loss": 3.3174, |
| "step": 47650 |
| }, |
| { |
| "epoch": 5.142857142857143, |
| "grad_norm": 0.6603133678436279, |
| "learning_rate": 0.0002918596869940637, |
| "loss": 3.32, |
| "step": 47700 |
| }, |
| { |
| "epoch": 5.1482479784366575, |
| "grad_norm": 0.720582902431488, |
| "learning_rate": 0.0002915358877495952, |
| "loss": 3.3214, |
| "step": 47750 |
| }, |
| { |
| "epoch": 5.153638814016173, |
| "grad_norm": 0.7110748291015625, |
| "learning_rate": 0.0002912120885051268, |
| "loss": 3.3299, |
| "step": 47800 |
| }, |
| { |
| "epoch": 5.159029649595688, |
| "grad_norm": 0.6903175115585327, |
| "learning_rate": 0.00029088828926065834, |
| "loss": 3.3209, |
| "step": 47850 |
| }, |
| { |
| "epoch": 5.164420485175202, |
| "grad_norm": 0.7338415384292603, |
| "learning_rate": 0.00029056449001618994, |
| "loss": 3.3033, |
| "step": 47900 |
| }, |
| { |
| "epoch": 5.169811320754717, |
| "grad_norm": 0.723596453666687, |
| "learning_rate": 0.0002902406907717215, |
| "loss": 3.3219, |
| "step": 47950 |
| }, |
| { |
| "epoch": 5.175202156334231, |
| "grad_norm": 0.7516893148422241, |
| "learning_rate": 0.0002899168915272531, |
| "loss": 3.3119, |
| "step": 48000 |
| }, |
| { |
| "epoch": 5.175202156334231, |
| "eval_accuracy": 0.3811521573954425, |
| "eval_loss": 3.411118268966675, |
| "eval_runtime": 185.4994, |
| "eval_samples_per_second": 97.095, |
| "eval_steps_per_second": 6.07, |
| "step": 48000 |
| }, |
| { |
| "epoch": 5.180592991913747, |
| "grad_norm": 0.7215257883071899, |
| "learning_rate": 0.00028959309228278465, |
| "loss": 3.3424, |
| "step": 48050 |
| }, |
| { |
| "epoch": 5.185983827493262, |
| "grad_norm": 0.7489564418792725, |
| "learning_rate": 0.00028926929303831625, |
| "loss": 3.3215, |
| "step": 48100 |
| }, |
| { |
| "epoch": 5.191374663072776, |
| "grad_norm": 0.6841164231300354, |
| "learning_rate": 0.0002889454937938478, |
| "loss": 3.3318, |
| "step": 48150 |
| }, |
| { |
| "epoch": 5.196765498652291, |
| "grad_norm": 0.6923016309738159, |
| "learning_rate": 0.00028862169454937935, |
| "loss": 3.3199, |
| "step": 48200 |
| }, |
| { |
| "epoch": 5.202156334231806, |
| "grad_norm": 0.7165120840072632, |
| "learning_rate": 0.0002882978953049109, |
| "loss": 3.3028, |
| "step": 48250 |
| }, |
| { |
| "epoch": 5.2075471698113205, |
| "grad_norm": 0.6918339729309082, |
| "learning_rate": 0.0002879740960604425, |
| "loss": 3.3329, |
| "step": 48300 |
| }, |
| { |
| "epoch": 5.212938005390836, |
| "grad_norm": 0.6800699830055237, |
| "learning_rate": 0.00028765029681597406, |
| "loss": 3.3145, |
| "step": 48350 |
| }, |
| { |
| "epoch": 5.218328840970351, |
| "grad_norm": 0.7082724571228027, |
| "learning_rate": 0.00028732649757150566, |
| "loss": 3.3271, |
| "step": 48400 |
| }, |
| { |
| "epoch": 5.223719676549865, |
| "grad_norm": 0.719948947429657, |
| "learning_rate": 0.0002870026983270372, |
| "loss": 3.3173, |
| "step": 48450 |
| }, |
| { |
| "epoch": 5.22911051212938, |
| "grad_norm": 0.6641173362731934, |
| "learning_rate": 0.00028667889908256877, |
| "loss": 3.317, |
| "step": 48500 |
| }, |
| { |
| "epoch": 5.234501347708895, |
| "grad_norm": 0.6939883828163147, |
| "learning_rate": 0.00028635509983810037, |
| "loss": 3.3035, |
| "step": 48550 |
| }, |
| { |
| "epoch": 5.2398921832884096, |
| "grad_norm": 0.695296049118042, |
| "learning_rate": 0.0002860313005936319, |
| "loss": 3.3286, |
| "step": 48600 |
| }, |
| { |
| "epoch": 5.245283018867925, |
| "grad_norm": 0.7294591069221497, |
| "learning_rate": 0.00028570750134916347, |
| "loss": 3.3226, |
| "step": 48650 |
| }, |
| { |
| "epoch": 5.250673854447439, |
| "grad_norm": 0.710701048374176, |
| "learning_rate": 0.0002853837021046951, |
| "loss": 3.3203, |
| "step": 48700 |
| }, |
| { |
| "epoch": 5.256064690026954, |
| "grad_norm": 0.7199147343635559, |
| "learning_rate": 0.00028505990286022663, |
| "loss": 3.311, |
| "step": 48750 |
| }, |
| { |
| "epoch": 5.261455525606469, |
| "grad_norm": 0.6639696359634399, |
| "learning_rate": 0.0002847361036157582, |
| "loss": 3.3255, |
| "step": 48800 |
| }, |
| { |
| "epoch": 5.2668463611859835, |
| "grad_norm": 0.6906806826591492, |
| "learning_rate": 0.0002844123043712898, |
| "loss": 3.3368, |
| "step": 48850 |
| }, |
| { |
| "epoch": 5.272237196765499, |
| "grad_norm": 0.6999244689941406, |
| "learning_rate": 0.00028408850512682133, |
| "loss": 3.3156, |
| "step": 48900 |
| }, |
| { |
| "epoch": 5.277628032345014, |
| "grad_norm": 0.7082576751708984, |
| "learning_rate": 0.00028376470588235294, |
| "loss": 3.3318, |
| "step": 48950 |
| }, |
| { |
| "epoch": 5.283018867924528, |
| "grad_norm": 0.7592863440513611, |
| "learning_rate": 0.0002834409066378845, |
| "loss": 3.3434, |
| "step": 49000 |
| }, |
| { |
| "epoch": 5.283018867924528, |
| "eval_accuracy": 0.38130264164506555, |
| "eval_loss": 3.4073922634124756, |
| "eval_runtime": 185.3808, |
| "eval_samples_per_second": 97.157, |
| "eval_steps_per_second": 6.074, |
| "step": 49000 |
| }, |
| { |
| "epoch": 5.288409703504043, |
| "grad_norm": 0.7438406944274902, |
| "learning_rate": 0.00028311710739341604, |
| "loss": 3.3356, |
| "step": 49050 |
| }, |
| { |
| "epoch": 5.293800539083558, |
| "grad_norm": 0.7723414301872253, |
| "learning_rate": 0.00028279330814894764, |
| "loss": 3.3378, |
| "step": 49100 |
| }, |
| { |
| "epoch": 5.2991913746630726, |
| "grad_norm": 0.7213961482048035, |
| "learning_rate": 0.0002824695089044792, |
| "loss": 3.3135, |
| "step": 49150 |
| }, |
| { |
| "epoch": 5.304582210242588, |
| "grad_norm": 0.6973984241485596, |
| "learning_rate": 0.00028214570966001075, |
| "loss": 3.3393, |
| "step": 49200 |
| }, |
| { |
| "epoch": 5.309973045822103, |
| "grad_norm": 0.6829140186309814, |
| "learning_rate": 0.00028182191041554235, |
| "loss": 3.327, |
| "step": 49250 |
| }, |
| { |
| "epoch": 5.315363881401617, |
| "grad_norm": 0.7045508027076721, |
| "learning_rate": 0.0002814981111710739, |
| "loss": 3.324, |
| "step": 49300 |
| }, |
| { |
| "epoch": 5.320754716981132, |
| "grad_norm": 0.6908060908317566, |
| "learning_rate": 0.0002811743119266055, |
| "loss": 3.3408, |
| "step": 49350 |
| }, |
| { |
| "epoch": 5.3261455525606465, |
| "grad_norm": 0.6872555613517761, |
| "learning_rate": 0.00028085051268213706, |
| "loss": 3.3154, |
| "step": 49400 |
| }, |
| { |
| "epoch": 5.331536388140162, |
| "grad_norm": 0.7134802341461182, |
| "learning_rate": 0.0002805267134376686, |
| "loss": 3.3139, |
| "step": 49450 |
| }, |
| { |
| "epoch": 5.336927223719677, |
| "grad_norm": 0.6903613805770874, |
| "learning_rate": 0.00028020291419320016, |
| "loss": 3.3408, |
| "step": 49500 |
| }, |
| { |
| "epoch": 5.342318059299191, |
| "grad_norm": 0.7270494103431702, |
| "learning_rate": 0.00027987911494873176, |
| "loss": 3.3396, |
| "step": 49550 |
| }, |
| { |
| "epoch": 5.347708894878706, |
| "grad_norm": 0.7505331635475159, |
| "learning_rate": 0.0002795553157042633, |
| "loss": 3.3326, |
| "step": 49600 |
| }, |
| { |
| "epoch": 5.353099730458221, |
| "grad_norm": 0.71560138463974, |
| "learning_rate": 0.0002792315164597949, |
| "loss": 3.3208, |
| "step": 49650 |
| }, |
| { |
| "epoch": 5.3584905660377355, |
| "grad_norm": 0.7327362895011902, |
| "learning_rate": 0.00027890771721532647, |
| "loss": 3.3467, |
| "step": 49700 |
| }, |
| { |
| "epoch": 5.363881401617251, |
| "grad_norm": 0.707087516784668, |
| "learning_rate": 0.00027858391797085807, |
| "loss": 3.3443, |
| "step": 49750 |
| }, |
| { |
| "epoch": 5.369272237196766, |
| "grad_norm": 0.6669909358024597, |
| "learning_rate": 0.0002782601187263896, |
| "loss": 3.3169, |
| "step": 49800 |
| }, |
| { |
| "epoch": 5.37466307277628, |
| "grad_norm": 0.6940752863883972, |
| "learning_rate": 0.0002779363194819212, |
| "loss": 3.3494, |
| "step": 49850 |
| }, |
| { |
| "epoch": 5.380053908355795, |
| "grad_norm": 0.7098177671432495, |
| "learning_rate": 0.0002776125202374527, |
| "loss": 3.3276, |
| "step": 49900 |
| }, |
| { |
| "epoch": 5.38544474393531, |
| "grad_norm": 0.7212302088737488, |
| "learning_rate": 0.00027728872099298433, |
| "loss": 3.3396, |
| "step": 49950 |
| }, |
| { |
| "epoch": 5.390835579514825, |
| "grad_norm": 0.7480193972587585, |
| "learning_rate": 0.0002769713977334053, |
| "loss": 3.3422, |
| "step": 50000 |
| }, |
| { |
| "epoch": 5.390835579514825, |
| "eval_accuracy": 0.3820361572907011, |
| "eval_loss": 3.4000582695007324, |
| "eval_runtime": 185.5499, |
| "eval_samples_per_second": 97.068, |
| "eval_steps_per_second": 6.068, |
| "step": 50000 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 92750, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.18034589696e+17, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|