diff --git "a/checkpoint-80000/trainer_state.json" "b/checkpoint-80000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-80000/trainer_state.json" @@ -0,0 +1,11953 @@ +{ + "best_metric": 3.3219223022460938, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M__8397/checkpoint-80000", + "epoch": 8.625336927223719, + "eval_steps": 1000, + "global_step": 80000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005390835579514825, + "grad_norm": 1.9401150941848755, + "learning_rate": 0.0003, + "loss": 8.6351, + "step": 50 + }, + { + "epoch": 0.01078167115902965, + "grad_norm": 1.2623108625411987, + "learning_rate": 0.0006, + "loss": 6.8864, + "step": 100 + }, + { + "epoch": 0.016172506738544475, + "grad_norm": 3.276719808578491, + "learning_rate": 0.0005996762007555315, + "loss": 6.4636, + "step": 150 + }, + { + "epoch": 0.0215633423180593, + "grad_norm": 2.706618309020996, + "learning_rate": 0.000599352401511063, + "loss": 6.2398, + "step": 200 + }, + { + "epoch": 0.026954177897574125, + "grad_norm": 1.0366517305374146, + "learning_rate": 0.0005990286022665946, + "loss": 6.0879, + "step": 250 + }, + { + "epoch": 0.03234501347708895, + "grad_norm": 2.069812297821045, + "learning_rate": 0.0005987048030221263, + "loss": 5.9796, + "step": 300 + }, + { + "epoch": 0.03773584905660377, + "grad_norm": 1.1950629949569702, + "learning_rate": 0.0005983810037776578, + "loss": 5.8688, + "step": 350 + }, + { + "epoch": 0.0431266846361186, + "grad_norm": 1.0007357597351074, + "learning_rate": 0.0005980572045331894, + "loss": 5.8233, + "step": 400 + }, + { + "epoch": 0.04851752021563342, + "grad_norm": 2.0554916858673096, + "learning_rate": 0.0005977334052887209, + "loss": 5.7452, + "step": 450 + }, + { + "epoch": 0.05390835579514825, + "grad_norm": 1.996433973312378, + "learning_rate": 0.0005974096060442526, + "loss": 5.6484, + "step": 500 + }, + { + "epoch": 0.05929919137466307, + "grad_norm": 1.439592719078064, + "learning_rate": 0.0005970858067997841, + "loss": 5.5734, + "step": 550 + }, + { + "epoch": 0.0646900269541779, + "grad_norm": 1.0077263116836548, + "learning_rate": 0.0005967620075553157, + "loss": 5.4903, + "step": 600 + }, + { + "epoch": 0.07008086253369272, + "grad_norm": 1.460601568222046, + "learning_rate": 0.0005964382083108472, + "loss": 5.4273, + "step": 650 + }, + { + "epoch": 0.07547169811320754, + "grad_norm": 1.5479509830474854, + "learning_rate": 0.0005961144090663788, + "loss": 5.3803, + "step": 700 + }, + { + "epoch": 0.08086253369272237, + "grad_norm": 1.2044662237167358, + "learning_rate": 0.0005957906098219104, + "loss": 5.305, + "step": 750 + }, + { + "epoch": 0.0862533692722372, + "grad_norm": 1.4456267356872559, + "learning_rate": 0.0005954668105774419, + "loss": 5.2577, + "step": 800 + }, + { + "epoch": 0.09164420485175202, + "grad_norm": 1.4306107759475708, + "learning_rate": 0.0005951430113329735, + "loss": 5.2168, + "step": 850 + }, + { + "epoch": 0.09703504043126684, + "grad_norm": 1.5723680257797241, + "learning_rate": 0.0005948192120885051, + "loss": 5.1595, + "step": 900 + }, + { + "epoch": 0.10242587601078167, + "grad_norm": 1.4030689001083374, + "learning_rate": 0.0005944954128440366, + "loss": 5.1322, + "step": 950 + }, + { + "epoch": 0.1078167115902965, + "grad_norm": 0.9765119552612305, + "learning_rate": 0.0005941716135995682, + "loss": 5.0934, + "step": 1000 + }, + { + "epoch": 0.1078167115902965, + "eval_accuracy": 0.22754229014862737, + "eval_loss": 5.022522926330566, + "eval_runtime": 185.9242, + "eval_samples_per_second": 96.873, + "eval_steps_per_second": 6.056, + "step": 1000 + }, + { + "epoch": 0.11320754716981132, + "grad_norm": 0.8278682827949524, + "learning_rate": 0.0005938478143550997, + "loss": 5.0442, + "step": 1050 + }, + { + "epoch": 0.11859838274932614, + "grad_norm": 1.0204647779464722, + "learning_rate": 0.0005935240151106314, + "loss": 5.0209, + "step": 1100 + }, + { + "epoch": 0.12398921832884097, + "grad_norm": 1.004250407218933, + "learning_rate": 0.0005932002158661629, + "loss": 4.9915, + "step": 1150 + }, + { + "epoch": 0.1293800539083558, + "grad_norm": 1.2949087619781494, + "learning_rate": 0.0005928764166216945, + "loss": 4.9493, + "step": 1200 + }, + { + "epoch": 0.1347708894878706, + "grad_norm": 0.8809250593185425, + "learning_rate": 0.000592552617377226, + "loss": 4.9039, + "step": 1250 + }, + { + "epoch": 0.14016172506738545, + "grad_norm": 1.0270975828170776, + "learning_rate": 0.0005922288181327577, + "loss": 4.8792, + "step": 1300 + }, + { + "epoch": 0.14555256064690028, + "grad_norm": 0.9649767279624939, + "learning_rate": 0.0005919050188882893, + "loss": 4.8622, + "step": 1350 + }, + { + "epoch": 0.1509433962264151, + "grad_norm": 0.9902446866035461, + "learning_rate": 0.0005915812196438207, + "loss": 4.8481, + "step": 1400 + }, + { + "epoch": 0.15633423180592992, + "grad_norm": 0.7882956862449646, + "learning_rate": 0.0005912574203993524, + "loss": 4.7968, + "step": 1450 + }, + { + "epoch": 0.16172506738544473, + "grad_norm": 0.8941258788108826, + "learning_rate": 0.0005909336211548839, + "loss": 4.8149, + "step": 1500 + }, + { + "epoch": 0.16711590296495957, + "grad_norm": 1.3297462463378906, + "learning_rate": 0.0005906098219104155, + "loss": 4.7656, + "step": 1550 + }, + { + "epoch": 0.1725067385444744, + "grad_norm": 1.497783899307251, + "learning_rate": 0.000590286022665947, + "loss": 4.7642, + "step": 1600 + }, + { + "epoch": 0.1778975741239892, + "grad_norm": 1.0149500370025635, + "learning_rate": 0.0005899622234214787, + "loss": 4.7241, + "step": 1650 + }, + { + "epoch": 0.18328840970350405, + "grad_norm": 1.067275047302246, + "learning_rate": 0.0005896384241770102, + "loss": 4.7115, + "step": 1700 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 0.7510073781013489, + "learning_rate": 0.0005893146249325418, + "loss": 4.6734, + "step": 1750 + }, + { + "epoch": 0.1940700808625337, + "grad_norm": 1.0857677459716797, + "learning_rate": 0.0005889908256880733, + "loss": 4.6802, + "step": 1800 + }, + { + "epoch": 0.19946091644204852, + "grad_norm": 0.9117745161056519, + "learning_rate": 0.0005886670264436049, + "loss": 4.6481, + "step": 1850 + }, + { + "epoch": 0.20485175202156333, + "grad_norm": 0.9455132484436035, + "learning_rate": 0.0005883432271991365, + "loss": 4.6385, + "step": 1900 + }, + { + "epoch": 0.21024258760107817, + "grad_norm": 0.8344852924346924, + "learning_rate": 0.0005880194279546681, + "loss": 4.6113, + "step": 1950 + }, + { + "epoch": 0.215633423180593, + "grad_norm": 1.0357882976531982, + "learning_rate": 0.0005876956287101996, + "loss": 4.5937, + "step": 2000 + }, + { + "epoch": 0.215633423180593, + "eval_accuracy": 0.27015801715433524, + "eval_loss": 4.516484260559082, + "eval_runtime": 185.6539, + "eval_samples_per_second": 97.014, + "eval_steps_per_second": 6.065, + "step": 2000 + }, + { + "epoch": 0.2210242587601078, + "grad_norm": 0.6943921446800232, + "learning_rate": 0.0005873718294657312, + "loss": 4.5747, + "step": 2050 + }, + { + "epoch": 0.22641509433962265, + "grad_norm": 0.8243718147277832, + "learning_rate": 0.0005870480302212628, + "loss": 4.5582, + "step": 2100 + }, + { + "epoch": 0.23180592991913745, + "grad_norm": 1.0496394634246826, + "learning_rate": 0.0005867242309767943, + "loss": 4.5315, + "step": 2150 + }, + { + "epoch": 0.2371967654986523, + "grad_norm": 0.8335784077644348, + "learning_rate": 0.0005864004317323259, + "loss": 4.5086, + "step": 2200 + }, + { + "epoch": 0.24258760107816713, + "grad_norm": 0.8698639273643494, + "learning_rate": 0.0005860766324878575, + "loss": 4.4958, + "step": 2250 + }, + { + "epoch": 0.24797843665768193, + "grad_norm": 0.7829259634017944, + "learning_rate": 0.000585752833243389, + "loss": 4.5067, + "step": 2300 + }, + { + "epoch": 0.25336927223719674, + "grad_norm": 0.8259062767028809, + "learning_rate": 0.0005854290339989206, + "loss": 4.4569, + "step": 2350 + }, + { + "epoch": 0.2587601078167116, + "grad_norm": 0.8445196151733398, + "learning_rate": 0.0005851052347544521, + "loss": 4.4611, + "step": 2400 + }, + { + "epoch": 0.2641509433962264, + "grad_norm": 0.8523595929145813, + "learning_rate": 0.0005847814355099838, + "loss": 4.4402, + "step": 2450 + }, + { + "epoch": 0.2695417789757412, + "grad_norm": 0.8830829858779907, + "learning_rate": 0.0005844576362655154, + "loss": 4.4235, + "step": 2500 + }, + { + "epoch": 0.2749326145552561, + "grad_norm": 0.8365247845649719, + "learning_rate": 0.0005841338370210469, + "loss": 4.4101, + "step": 2550 + }, + { + "epoch": 0.2803234501347709, + "grad_norm": 0.853497326374054, + "learning_rate": 0.0005838100377765785, + "loss": 4.3826, + "step": 2600 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.9241225719451904, + "learning_rate": 0.0005834862385321101, + "loss": 4.3931, + "step": 2650 + }, + { + "epoch": 0.29110512129380056, + "grad_norm": 0.8265724778175354, + "learning_rate": 0.0005831624392876417, + "loss": 4.3887, + "step": 2700 + }, + { + "epoch": 0.29649595687331537, + "grad_norm": 0.8722224831581116, + "learning_rate": 0.0005828386400431731, + "loss": 4.3563, + "step": 2750 + }, + { + "epoch": 0.3018867924528302, + "grad_norm": 0.7775854468345642, + "learning_rate": 0.0005825148407987048, + "loss": 4.3271, + "step": 2800 + }, + { + "epoch": 0.30727762803234504, + "grad_norm": 0.7281326055526733, + "learning_rate": 0.0005821910415542363, + "loss": 4.3363, + "step": 2850 + }, + { + "epoch": 0.31266846361185985, + "grad_norm": 0.8523993492126465, + "learning_rate": 0.0005818672423097679, + "loss": 4.3284, + "step": 2900 + }, + { + "epoch": 0.31805929919137466, + "grad_norm": 0.8152425289154053, + "learning_rate": 0.0005815434430652994, + "loss": 4.313, + "step": 2950 + }, + { + "epoch": 0.32345013477088946, + "grad_norm": 0.9258395433425903, + "learning_rate": 0.0005812196438208311, + "loss": 4.3092, + "step": 3000 + }, + { + "epoch": 0.32345013477088946, + "eval_accuracy": 0.29783223363934697, + "eval_loss": 4.246051788330078, + "eval_runtime": 185.609, + "eval_samples_per_second": 97.037, + "eval_steps_per_second": 6.067, + "step": 3000 + }, + { + "epoch": 0.3288409703504043, + "grad_norm": 0.78269362449646, + "learning_rate": 0.0005808958445763626, + "loss": 4.304, + "step": 3050 + }, + { + "epoch": 0.33423180592991913, + "grad_norm": 0.7207179069519043, + "learning_rate": 0.0005805720453318942, + "loss": 4.3041, + "step": 3100 + }, + { + "epoch": 0.33962264150943394, + "grad_norm": 0.791796088218689, + "learning_rate": 0.0005802482460874257, + "loss": 4.2633, + "step": 3150 + }, + { + "epoch": 0.3450134770889488, + "grad_norm": 0.6603590250015259, + "learning_rate": 0.0005799244468429573, + "loss": 4.2613, + "step": 3200 + }, + { + "epoch": 0.3504043126684636, + "grad_norm": 0.922822892665863, + "learning_rate": 0.0005796006475984889, + "loss": 4.2617, + "step": 3250 + }, + { + "epoch": 0.3557951482479784, + "grad_norm": 0.7605053782463074, + "learning_rate": 0.0005792768483540205, + "loss": 4.2446, + "step": 3300 + }, + { + "epoch": 0.3611859838274933, + "grad_norm": 0.7679175734519958, + "learning_rate": 0.000578953049109552, + "loss": 4.26, + "step": 3350 + }, + { + "epoch": 0.3665768194070081, + "grad_norm": 0.7325921654701233, + "learning_rate": 0.0005786292498650836, + "loss": 4.2337, + "step": 3400 + }, + { + "epoch": 0.3719676549865229, + "grad_norm": 0.6152936816215515, + "learning_rate": 0.0005783054506206152, + "loss": 4.21, + "step": 3450 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 0.7148654460906982, + "learning_rate": 0.0005779816513761467, + "loss": 4.2203, + "step": 3500 + }, + { + "epoch": 0.38274932614555257, + "grad_norm": 0.8136675953865051, + "learning_rate": 0.0005776578521316782, + "loss": 4.2143, + "step": 3550 + }, + { + "epoch": 0.3881401617250674, + "grad_norm": 0.6423671245574951, + "learning_rate": 0.0005773340528872099, + "loss": 4.2164, + "step": 3600 + }, + { + "epoch": 0.3935309973045822, + "grad_norm": 0.8761276602745056, + "learning_rate": 0.0005770102536427414, + "loss": 4.1887, + "step": 3650 + }, + { + "epoch": 0.39892183288409705, + "grad_norm": 0.7816224694252014, + "learning_rate": 0.000576686454398273, + "loss": 4.1937, + "step": 3700 + }, + { + "epoch": 0.40431266846361186, + "grad_norm": 0.7834933400154114, + "learning_rate": 0.0005763626551538045, + "loss": 4.1965, + "step": 3750 + }, + { + "epoch": 0.40970350404312667, + "grad_norm": 0.6208221912384033, + "learning_rate": 0.0005760388559093362, + "loss": 4.1864, + "step": 3800 + }, + { + "epoch": 0.41509433962264153, + "grad_norm": 0.6872389316558838, + "learning_rate": 0.0005757150566648678, + "loss": 4.1732, + "step": 3850 + }, + { + "epoch": 0.42048517520215634, + "grad_norm": 0.7426034808158875, + "learning_rate": 0.0005753912574203993, + "loss": 4.1682, + "step": 3900 + }, + { + "epoch": 0.42587601078167114, + "grad_norm": 0.7201637625694275, + "learning_rate": 0.0005750674581759309, + "loss": 4.157, + "step": 3950 + }, + { + "epoch": 0.431266846361186, + "grad_norm": 0.7274760007858276, + "learning_rate": 0.0005747436589314624, + "loss": 4.1586, + "step": 4000 + }, + { + "epoch": 0.431266846361186, + "eval_accuracy": 0.3127459290207626, + "eval_loss": 4.088518142700195, + "eval_runtime": 185.6568, + "eval_samples_per_second": 97.012, + "eval_steps_per_second": 6.065, + "step": 4000 + }, + { + "epoch": 0.4366576819407008, + "grad_norm": 0.5464039444923401, + "learning_rate": 0.0005744198596869941, + "loss": 4.149, + "step": 4050 + }, + { + "epoch": 0.4420485175202156, + "grad_norm": 0.7344082593917847, + "learning_rate": 0.0005740960604425255, + "loss": 4.1532, + "step": 4100 + }, + { + "epoch": 0.4474393530997305, + "grad_norm": 0.6581411957740784, + "learning_rate": 0.0005737722611980572, + "loss": 4.1547, + "step": 4150 + }, + { + "epoch": 0.4528301886792453, + "grad_norm": 0.6639128923416138, + "learning_rate": 0.0005734484619535887, + "loss": 4.1367, + "step": 4200 + }, + { + "epoch": 0.4582210242587601, + "grad_norm": 0.734761655330658, + "learning_rate": 0.0005731246627091203, + "loss": 4.1221, + "step": 4250 + }, + { + "epoch": 0.4636118598382749, + "grad_norm": 0.8860589265823364, + "learning_rate": 0.0005728008634646518, + "loss": 4.1218, + "step": 4300 + }, + { + "epoch": 0.46900269541778977, + "grad_norm": 0.5995060205459595, + "learning_rate": 0.0005724770642201835, + "loss": 4.1167, + "step": 4350 + }, + { + "epoch": 0.4743935309973046, + "grad_norm": 0.7229450345039368, + "learning_rate": 0.000572153264975715, + "loss": 4.1148, + "step": 4400 + }, + { + "epoch": 0.4797843665768194, + "grad_norm": 0.731573224067688, + "learning_rate": 0.0005718294657312466, + "loss": 4.1018, + "step": 4450 + }, + { + "epoch": 0.48517520215633425, + "grad_norm": 0.7716799974441528, + "learning_rate": 0.0005715056664867781, + "loss": 4.109, + "step": 4500 + }, + { + "epoch": 0.49056603773584906, + "grad_norm": 0.542386531829834, + "learning_rate": 0.0005711818672423097, + "loss": 4.1017, + "step": 4550 + }, + { + "epoch": 0.49595687331536387, + "grad_norm": 0.794573187828064, + "learning_rate": 0.0005708580679978413, + "loss": 4.086, + "step": 4600 + }, + { + "epoch": 0.5013477088948787, + "grad_norm": 0.6180745959281921, + "learning_rate": 0.0005705342687533729, + "loss": 4.0787, + "step": 4650 + }, + { + "epoch": 0.5067385444743935, + "grad_norm": 0.6277428269386292, + "learning_rate": 0.0005702104695089044, + "loss": 4.0932, + "step": 4700 + }, + { + "epoch": 0.5121293800539084, + "grad_norm": 0.6428430676460266, + "learning_rate": 0.000569886670264436, + "loss": 4.0812, + "step": 4750 + }, + { + "epoch": 0.5175202156334232, + "grad_norm": 0.6423416137695312, + "learning_rate": 0.0005695628710199675, + "loss": 4.0791, + "step": 4800 + }, + { + "epoch": 0.522911051212938, + "grad_norm": 0.6095171570777893, + "learning_rate": 0.0005692390717754991, + "loss": 4.0655, + "step": 4850 + }, + { + "epoch": 0.5283018867924528, + "grad_norm": 0.6891219615936279, + "learning_rate": 0.0005689152725310306, + "loss": 4.0527, + "step": 4900 + }, + { + "epoch": 0.5336927223719676, + "grad_norm": 0.6701120734214783, + "learning_rate": 0.0005685914732865623, + "loss": 4.0714, + "step": 4950 + }, + { + "epoch": 0.5390835579514824, + "grad_norm": 0.5252517461776733, + "learning_rate": 0.0005682676740420939, + "loss": 4.0492, + "step": 5000 + }, + { + "epoch": 0.5390835579514824, + "eval_accuracy": 0.32119879773406573, + "eval_loss": 3.9914467334747314, + "eval_runtime": 185.5695, + "eval_samples_per_second": 97.058, + "eval_steps_per_second": 6.068, + "step": 5000 + }, + { + "epoch": 0.5444743935309974, + "grad_norm": 0.6187731623649597, + "learning_rate": 0.0005679438747976254, + "loss": 4.0591, + "step": 5050 + }, + { + "epoch": 0.5498652291105122, + "grad_norm": 0.6102878451347351, + "learning_rate": 0.000567620075553157, + "loss": 4.0513, + "step": 5100 + }, + { + "epoch": 0.555256064690027, + "grad_norm": 0.708706259727478, + "learning_rate": 0.0005672962763086886, + "loss": 4.0489, + "step": 5150 + }, + { + "epoch": 0.5606469002695418, + "grad_norm": 0.6461382508277893, + "learning_rate": 0.0005669724770642202, + "loss": 4.0661, + "step": 5200 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 0.7211845517158508, + "learning_rate": 0.0005666486778197517, + "loss": 4.0418, + "step": 5250 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.7692673802375793, + "learning_rate": 0.0005663248785752833, + "loss": 4.0325, + "step": 5300 + }, + { + "epoch": 0.5768194070080862, + "grad_norm": 0.5925557613372803, + "learning_rate": 0.0005660010793308148, + "loss": 4.0266, + "step": 5350 + }, + { + "epoch": 0.5822102425876011, + "grad_norm": 0.5735204815864563, + "learning_rate": 0.0005656772800863465, + "loss": 4.0372, + "step": 5400 + }, + { + "epoch": 0.5876010781671159, + "grad_norm": 0.5482949018478394, + "learning_rate": 0.0005653534808418779, + "loss": 4.036, + "step": 5450 + }, + { + "epoch": 0.5929919137466307, + "grad_norm": 0.7778071165084839, + "learning_rate": 0.0005650296815974096, + "loss": 4.0203, + "step": 5500 + }, + { + "epoch": 0.5983827493261455, + "grad_norm": 0.603145956993103, + "learning_rate": 0.0005647058823529411, + "loss": 4.009, + "step": 5550 + }, + { + "epoch": 0.6037735849056604, + "grad_norm": 0.5814369916915894, + "learning_rate": 0.0005643820831084727, + "loss": 4.0256, + "step": 5600 + }, + { + "epoch": 0.6091644204851752, + "grad_norm": 0.6685661673545837, + "learning_rate": 0.0005640582838640042, + "loss": 4.0055, + "step": 5650 + }, + { + "epoch": 0.6145552560646901, + "grad_norm": 0.5345515608787537, + "learning_rate": 0.0005637344846195358, + "loss": 4.0021, + "step": 5700 + }, + { + "epoch": 0.6199460916442049, + "grad_norm": 0.6599013209342957, + "learning_rate": 0.0005634106853750674, + "loss": 3.9773, + "step": 5750 + }, + { + "epoch": 0.6253369272237197, + "grad_norm": 0.6814241409301758, + "learning_rate": 0.000563086886130599, + "loss": 3.985, + "step": 5800 + }, + { + "epoch": 0.6307277628032345, + "grad_norm": 0.6166688203811646, + "learning_rate": 0.0005627630868861305, + "loss": 3.9948, + "step": 5850 + }, + { + "epoch": 0.6361185983827493, + "grad_norm": 0.6375031471252441, + "learning_rate": 0.0005624392876416621, + "loss": 3.995, + "step": 5900 + }, + { + "epoch": 0.6415094339622641, + "grad_norm": 0.6021779179573059, + "learning_rate": 0.0005621154883971937, + "loss": 3.9838, + "step": 5950 + }, + { + "epoch": 0.6469002695417789, + "grad_norm": 0.6350939273834229, + "learning_rate": 0.0005617916891527253, + "loss": 3.9962, + "step": 6000 + }, + { + "epoch": 0.6469002695417789, + "eval_accuracy": 0.3278974655736041, + "eval_loss": 3.919321060180664, + "eval_runtime": 185.6272, + "eval_samples_per_second": 97.028, + "eval_steps_per_second": 6.066, + "step": 6000 + }, + { + "epoch": 0.6522911051212938, + "grad_norm": 0.6063998937606812, + "learning_rate": 0.0005614678899082568, + "loss": 3.9623, + "step": 6050 + }, + { + "epoch": 0.6576819407008087, + "grad_norm": 0.6236124634742737, + "learning_rate": 0.0005611440906637884, + "loss": 3.9998, + "step": 6100 + }, + { + "epoch": 0.6630727762803235, + "grad_norm": 0.7479600310325623, + "learning_rate": 0.0005608267674042094, + "loss": 3.9792, + "step": 6150 + }, + { + "epoch": 0.6684636118598383, + "grad_norm": 0.5813738703727722, + "learning_rate": 0.0005605029681597409, + "loss": 3.953, + "step": 6200 + }, + { + "epoch": 0.6738544474393531, + "grad_norm": 0.5313798785209656, + "learning_rate": 0.0005601791689152725, + "loss": 3.972, + "step": 6250 + }, + { + "epoch": 0.6792452830188679, + "grad_norm": 0.6238154768943787, + "learning_rate": 0.000559855369670804, + "loss": 3.9769, + "step": 6300 + }, + { + "epoch": 0.6846361185983828, + "grad_norm": 0.5220109224319458, + "learning_rate": 0.0005595315704263357, + "loss": 3.9613, + "step": 6350 + }, + { + "epoch": 0.6900269541778976, + "grad_norm": 0.673324704170227, + "learning_rate": 0.0005592077711818672, + "loss": 3.9706, + "step": 6400 + }, + { + "epoch": 0.6954177897574124, + "grad_norm": 0.6563860177993774, + "learning_rate": 0.0005588839719373988, + "loss": 3.969, + "step": 6450 + }, + { + "epoch": 0.7008086253369272, + "grad_norm": 0.6665467023849487, + "learning_rate": 0.0005585601726929303, + "loss": 3.9606, + "step": 6500 + }, + { + "epoch": 0.706199460916442, + "grad_norm": 0.5975165367126465, + "learning_rate": 0.0005582363734484619, + "loss": 3.959, + "step": 6550 + }, + { + "epoch": 0.7115902964959568, + "grad_norm": 0.6370920538902283, + "learning_rate": 0.0005579125742039935, + "loss": 3.9606, + "step": 6600 + }, + { + "epoch": 0.7169811320754716, + "grad_norm": 0.6234825849533081, + "learning_rate": 0.0005575887749595251, + "loss": 3.97, + "step": 6650 + }, + { + "epoch": 0.7223719676549866, + "grad_norm": 0.6267138123512268, + "learning_rate": 0.0005572649757150566, + "loss": 3.9344, + "step": 6700 + }, + { + "epoch": 0.7277628032345014, + "grad_norm": 0.5611273050308228, + "learning_rate": 0.0005569411764705882, + "loss": 3.9631, + "step": 6750 + }, + { + "epoch": 0.7331536388140162, + "grad_norm": 0.6250568628311157, + "learning_rate": 0.0005566173772261198, + "loss": 3.9344, + "step": 6800 + }, + { + "epoch": 0.738544474393531, + "grad_norm": 0.5902653932571411, + "learning_rate": 0.0005562935779816513, + "loss": 3.9345, + "step": 6850 + }, + { + "epoch": 0.7439353099730458, + "grad_norm": 0.5684419274330139, + "learning_rate": 0.0005559697787371828, + "loss": 3.9367, + "step": 6900 + }, + { + "epoch": 0.7493261455525606, + "grad_norm": 0.5950194001197815, + "learning_rate": 0.0005556459794927145, + "loss": 3.9418, + "step": 6950 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 0.6283013820648193, + "learning_rate": 0.000555322180248246, + "loss": 3.9219, + "step": 7000 + }, + { + "epoch": 0.7547169811320755, + "eval_accuracy": 0.33346299244612526, + "eval_loss": 3.8631539344787598, + "eval_runtime": 185.3774, + "eval_samples_per_second": 97.159, + "eval_steps_per_second": 6.074, + "step": 7000 + }, + { + "epoch": 0.7601078167115903, + "grad_norm": 0.6419661641120911, + "learning_rate": 0.0005549983810037776, + "loss": 3.9341, + "step": 7050 + }, + { + "epoch": 0.7654986522911051, + "grad_norm": 0.8758557438850403, + "learning_rate": 0.0005546745817593091, + "loss": 3.9093, + "step": 7100 + }, + { + "epoch": 0.77088948787062, + "grad_norm": 0.6024784445762634, + "learning_rate": 0.0005543507825148408, + "loss": 3.9347, + "step": 7150 + }, + { + "epoch": 0.7762803234501348, + "grad_norm": 0.5663654208183289, + "learning_rate": 0.0005540269832703723, + "loss": 3.9072, + "step": 7200 + }, + { + "epoch": 0.7816711590296496, + "grad_norm": 0.6945359110832214, + "learning_rate": 0.0005537031840259039, + "loss": 3.899, + "step": 7250 + }, + { + "epoch": 0.7870619946091644, + "grad_norm": 0.5187634229660034, + "learning_rate": 0.0005533793847814354, + "loss": 3.9124, + "step": 7300 + }, + { + "epoch": 0.7924528301886793, + "grad_norm": 0.6671504378318787, + "learning_rate": 0.000553055585536967, + "loss": 3.9133, + "step": 7350 + }, + { + "epoch": 0.7978436657681941, + "grad_norm": 0.5317527055740356, + "learning_rate": 0.0005527317862924987, + "loss": 3.9103, + "step": 7400 + }, + { + "epoch": 0.8032345013477089, + "grad_norm": 0.567512571811676, + "learning_rate": 0.0005524079870480301, + "loss": 3.8879, + "step": 7450 + }, + { + "epoch": 0.8086253369272237, + "grad_norm": 0.5598046183586121, + "learning_rate": 0.0005520841878035618, + "loss": 3.9029, + "step": 7500 + }, + { + "epoch": 0.8140161725067385, + "grad_norm": 0.6032840013504028, + "learning_rate": 0.0005517603885590933, + "loss": 3.8964, + "step": 7550 + }, + { + "epoch": 0.8194070080862533, + "grad_norm": 0.5689701437950134, + "learning_rate": 0.0005514365893146249, + "loss": 3.8911, + "step": 7600 + }, + { + "epoch": 0.8247978436657682, + "grad_norm": 0.5858896374702454, + "learning_rate": 0.0005511127900701564, + "loss": 3.8974, + "step": 7650 + }, + { + "epoch": 0.8301886792452831, + "grad_norm": 0.6379143595695496, + "learning_rate": 0.000550788990825688, + "loss": 3.8913, + "step": 7700 + }, + { + "epoch": 0.8355795148247979, + "grad_norm": 0.6346825361251831, + "learning_rate": 0.0005504651915812196, + "loss": 3.8938, + "step": 7750 + }, + { + "epoch": 0.8409703504043127, + "grad_norm": 0.651748776435852, + "learning_rate": 0.0005501413923367512, + "loss": 3.8768, + "step": 7800 + }, + { + "epoch": 0.8463611859838275, + "grad_norm": 0.5529057383537292, + "learning_rate": 0.0005498175930922827, + "loss": 3.8814, + "step": 7850 + }, + { + "epoch": 0.8517520215633423, + "grad_norm": 0.5781732797622681, + "learning_rate": 0.0005494937938478143, + "loss": 3.8778, + "step": 7900 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.599770188331604, + "learning_rate": 0.0005491699946033459, + "loss": 3.9077, + "step": 7950 + }, + { + "epoch": 0.862533692722372, + "grad_norm": 0.6126531958580017, + "learning_rate": 0.0005488461953588775, + "loss": 3.8566, + "step": 8000 + }, + { + "epoch": 0.862533692722372, + "eval_accuracy": 0.3376529737589151, + "eval_loss": 3.815525531768799, + "eval_runtime": 186.958, + "eval_samples_per_second": 96.337, + "eval_steps_per_second": 6.023, + "step": 8000 + }, + { + "epoch": 0.8679245283018868, + "grad_norm": 0.6914803981781006, + "learning_rate": 0.000548522396114409, + "loss": 3.8954, + "step": 8050 + }, + { + "epoch": 0.8733153638814016, + "grad_norm": 0.6908184289932251, + "learning_rate": 0.0005481985968699406, + "loss": 3.8747, + "step": 8100 + }, + { + "epoch": 0.8787061994609164, + "grad_norm": 0.5247132778167725, + "learning_rate": 0.0005478747976254721, + "loss": 3.8785, + "step": 8150 + }, + { + "epoch": 0.8840970350404312, + "grad_norm": 0.585969090461731, + "learning_rate": 0.0005475509983810037, + "loss": 3.8734, + "step": 8200 + }, + { + "epoch": 0.889487870619946, + "grad_norm": 0.5993847250938416, + "learning_rate": 0.0005472271991365352, + "loss": 3.8685, + "step": 8250 + }, + { + "epoch": 0.894878706199461, + "grad_norm": 0.6115968227386475, + "learning_rate": 0.0005469033998920669, + "loss": 3.8867, + "step": 8300 + }, + { + "epoch": 0.9002695417789758, + "grad_norm": 0.5844521522521973, + "learning_rate": 0.0005465860766324878, + "loss": 3.8566, + "step": 8350 + }, + { + "epoch": 0.9056603773584906, + "grad_norm": 0.5750303864479065, + "learning_rate": 0.0005462622773880194, + "loss": 3.8639, + "step": 8400 + }, + { + "epoch": 0.9110512129380054, + "grad_norm": 0.5799904465675354, + "learning_rate": 0.000545938478143551, + "loss": 3.859, + "step": 8450 + }, + { + "epoch": 0.9164420485175202, + "grad_norm": 0.6240545511245728, + "learning_rate": 0.0005456146788990825, + "loss": 3.8648, + "step": 8500 + }, + { + "epoch": 0.921832884097035, + "grad_norm": 0.5578630566596985, + "learning_rate": 0.000545290879654614, + "loss": 3.8597, + "step": 8550 + }, + { + "epoch": 0.9272237196765498, + "grad_norm": 0.5732514262199402, + "learning_rate": 0.0005449670804101457, + "loss": 3.8652, + "step": 8600 + }, + { + "epoch": 0.9326145552560647, + "grad_norm": 0.614829957485199, + "learning_rate": 0.0005446432811656773, + "loss": 3.845, + "step": 8650 + }, + { + "epoch": 0.9380053908355795, + "grad_norm": 0.6199792623519897, + "learning_rate": 0.0005443194819212088, + "loss": 3.8484, + "step": 8700 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 0.5770998597145081, + "learning_rate": 0.0005439956826767404, + "loss": 3.8456, + "step": 8750 + }, + { + "epoch": 0.9487870619946092, + "grad_norm": 0.5756552815437317, + "learning_rate": 0.000543671883432272, + "loss": 3.8492, + "step": 8800 + }, + { + "epoch": 0.954177897574124, + "grad_norm": 0.5448219180107117, + "learning_rate": 0.0005433480841878035, + "loss": 3.8518, + "step": 8850 + }, + { + "epoch": 0.9595687331536388, + "grad_norm": 0.5587062239646912, + "learning_rate": 0.000543024284943335, + "loss": 3.8389, + "step": 8900 + }, + { + "epoch": 0.9649595687331537, + "grad_norm": 0.5609190464019775, + "learning_rate": 0.0005427004856988667, + "loss": 3.8303, + "step": 8950 + }, + { + "epoch": 0.9703504043126685, + "grad_norm": 0.6230307817459106, + "learning_rate": 0.0005423766864543982, + "loss": 3.856, + "step": 9000 + }, + { + "epoch": 0.9703504043126685, + "eval_accuracy": 0.34079347682830485, + "eval_loss": 3.7813315391540527, + "eval_runtime": 186.6989, + "eval_samples_per_second": 96.471, + "eval_steps_per_second": 6.031, + "step": 9000 + }, + { + "epoch": 0.9757412398921833, + "grad_norm": 0.6406134963035583, + "learning_rate": 0.0005420528872099298, + "loss": 3.871, + "step": 9050 + }, + { + "epoch": 0.9811320754716981, + "grad_norm": 0.49706608057022095, + "learning_rate": 0.0005417290879654613, + "loss": 3.8273, + "step": 9100 + }, + { + "epoch": 0.9865229110512129, + "grad_norm": 0.5765336751937866, + "learning_rate": 0.000541405288720993, + "loss": 3.8418, + "step": 9150 + }, + { + "epoch": 0.9919137466307277, + "grad_norm": 0.5545505881309509, + "learning_rate": 0.0005410814894765245, + "loss": 3.8285, + "step": 9200 + }, + { + "epoch": 0.9973045822102425, + "grad_norm": 0.5445045232772827, + "learning_rate": 0.0005407576902320561, + "loss": 3.8485, + "step": 9250 + }, + { + "epoch": 1.0026954177897573, + "grad_norm": 0.5650607347488403, + "learning_rate": 0.0005404338909875876, + "loss": 3.7963, + "step": 9300 + }, + { + "epoch": 1.0080862533692723, + "grad_norm": 0.5752312541007996, + "learning_rate": 0.0005401100917431192, + "loss": 3.7605, + "step": 9350 + }, + { + "epoch": 1.013477088948787, + "grad_norm": 0.5649228096008301, + "learning_rate": 0.0005397862924986508, + "loss": 3.7564, + "step": 9400 + }, + { + "epoch": 1.0188679245283019, + "grad_norm": 0.5715786218643188, + "learning_rate": 0.0005394624932541824, + "loss": 3.7554, + "step": 9450 + }, + { + "epoch": 1.0242587601078168, + "grad_norm": 0.5207133889198303, + "learning_rate": 0.0005391386940097139, + "loss": 3.7592, + "step": 9500 + }, + { + "epoch": 1.0296495956873315, + "grad_norm": 0.5603737235069275, + "learning_rate": 0.0005388148947652455, + "loss": 3.7704, + "step": 9550 + }, + { + "epoch": 1.0350404312668464, + "grad_norm": 0.6097581386566162, + "learning_rate": 0.000538491095520777, + "loss": 3.7686, + "step": 9600 + }, + { + "epoch": 1.0404312668463611, + "grad_norm": 0.6030071377754211, + "learning_rate": 0.0005381672962763086, + "loss": 3.7726, + "step": 9650 + }, + { + "epoch": 1.045822102425876, + "grad_norm": 0.546688437461853, + "learning_rate": 0.0005378434970318403, + "loss": 3.7567, + "step": 9700 + }, + { + "epoch": 1.0512129380053907, + "grad_norm": 0.5595793724060059, + "learning_rate": 0.0005375196977873718, + "loss": 3.755, + "step": 9750 + }, + { + "epoch": 1.0566037735849056, + "grad_norm": 0.5992977619171143, + "learning_rate": 0.0005371958985429034, + "loss": 3.7821, + "step": 9800 + }, + { + "epoch": 1.0619946091644206, + "grad_norm": 0.5221476554870605, + "learning_rate": 0.0005368720992984349, + "loss": 3.7845, + "step": 9850 + }, + { + "epoch": 1.0673854447439353, + "grad_norm": 0.5831781029701233, + "learning_rate": 0.0005365483000539665, + "loss": 3.772, + "step": 9900 + }, + { + "epoch": 1.0727762803234502, + "grad_norm": 0.5980172753334045, + "learning_rate": 0.0005362245008094981, + "loss": 3.7854, + "step": 9950 + }, + { + "epoch": 1.0781671159029649, + "grad_norm": 0.5590375661849976, + "learning_rate": 0.0005359007015650297, + "loss": 3.7649, + "step": 10000 + }, + { + "epoch": 1.0781671159029649, + "eval_accuracy": 0.34435424926776104, + "eval_loss": 3.7518138885498047, + "eval_runtime": 186.0381, + "eval_samples_per_second": 96.814, + "eval_steps_per_second": 6.053, + "step": 10000 + }, + { + "epoch": 1.0835579514824798, + "grad_norm": 0.5709946751594543, + "learning_rate": 0.0005355769023205612, + "loss": 3.7581, + "step": 10050 + }, + { + "epoch": 1.0889487870619945, + "grad_norm": 0.618277370929718, + "learning_rate": 0.0005352531030760928, + "loss": 3.7592, + "step": 10100 + }, + { + "epoch": 1.0943396226415094, + "grad_norm": 0.6374980211257935, + "learning_rate": 0.0005349293038316244, + "loss": 3.7555, + "step": 10150 + }, + { + "epoch": 1.0997304582210243, + "grad_norm": 0.5958013534545898, + "learning_rate": 0.0005346055045871559, + "loss": 3.7666, + "step": 10200 + }, + { + "epoch": 1.105121293800539, + "grad_norm": 0.6115665435791016, + "learning_rate": 0.0005342817053426874, + "loss": 3.7524, + "step": 10250 + }, + { + "epoch": 1.110512129380054, + "grad_norm": 0.585763692855835, + "learning_rate": 0.0005339579060982191, + "loss": 3.764, + "step": 10300 + }, + { + "epoch": 1.1159029649595686, + "grad_norm": 0.653211772441864, + "learning_rate": 0.0005336341068537506, + "loss": 3.7568, + "step": 10350 + }, + { + "epoch": 1.1212938005390836, + "grad_norm": 0.5737949013710022, + "learning_rate": 0.0005333103076092822, + "loss": 3.7638, + "step": 10400 + }, + { + "epoch": 1.1266846361185983, + "grad_norm": 0.5587084293365479, + "learning_rate": 0.0005329865083648137, + "loss": 3.7609, + "step": 10450 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 0.6286759972572327, + "learning_rate": 0.0005326627091203454, + "loss": 3.7635, + "step": 10500 + }, + { + "epoch": 1.137466307277628, + "grad_norm": 0.5914649367332458, + "learning_rate": 0.0005323453858607662, + "loss": 3.7659, + "step": 10550 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.574865460395813, + "learning_rate": 0.0005320215866162979, + "loss": 3.7552, + "step": 10600 + }, + { + "epoch": 1.1482479784366577, + "grad_norm": 0.570719838142395, + "learning_rate": 0.0005316977873718294, + "loss": 3.7441, + "step": 10650 + }, + { + "epoch": 1.1536388140161726, + "grad_norm": 0.5437813997268677, + "learning_rate": 0.000531373988127361, + "loss": 3.7548, + "step": 10700 + }, + { + "epoch": 1.1590296495956873, + "grad_norm": 0.575222909450531, + "learning_rate": 0.0005310501888828925, + "loss": 3.7359, + "step": 10750 + }, + { + "epoch": 1.1644204851752022, + "grad_norm": 0.5911815762519836, + "learning_rate": 0.0005307263896384242, + "loss": 3.7398, + "step": 10800 + }, + { + "epoch": 1.169811320754717, + "grad_norm": 0.5724892020225525, + "learning_rate": 0.0005304025903939556, + "loss": 3.7319, + "step": 10850 + }, + { + "epoch": 1.1752021563342319, + "grad_norm": 0.6011896729469299, + "learning_rate": 0.0005300787911494873, + "loss": 3.7465, + "step": 10900 + }, + { + "epoch": 1.1805929919137466, + "grad_norm": 0.5386224985122681, + "learning_rate": 0.0005297549919050189, + "loss": 3.7593, + "step": 10950 + }, + { + "epoch": 1.1859838274932615, + "grad_norm": 0.5424002408981323, + "learning_rate": 0.0005294311926605504, + "loss": 3.769, + "step": 11000 + }, + { + "epoch": 1.1859838274932615, + "eval_accuracy": 0.3473514391781235, + "eval_loss": 3.723193883895874, + "eval_runtime": 186.4484, + "eval_samples_per_second": 96.6, + "eval_steps_per_second": 6.039, + "step": 11000 + }, + { + "epoch": 1.1913746630727764, + "grad_norm": 0.5702701210975647, + "learning_rate": 0.000529107393416082, + "loss": 3.7567, + "step": 11050 + }, + { + "epoch": 1.196765498652291, + "grad_norm": 0.5534523129463196, + "learning_rate": 0.0005287835941716135, + "loss": 3.7412, + "step": 11100 + }, + { + "epoch": 1.202156334231806, + "grad_norm": 0.5500940084457397, + "learning_rate": 0.0005284597949271452, + "loss": 3.7407, + "step": 11150 + }, + { + "epoch": 1.2075471698113207, + "grad_norm": 0.5379878878593445, + "learning_rate": 0.0005281359956826767, + "loss": 3.7631, + "step": 11200 + }, + { + "epoch": 1.2129380053908356, + "grad_norm": 0.5732437372207642, + "learning_rate": 0.0005278121964382083, + "loss": 3.7498, + "step": 11250 + }, + { + "epoch": 1.2183288409703503, + "grad_norm": 0.5423194766044617, + "learning_rate": 0.0005274883971937398, + "loss": 3.7371, + "step": 11300 + }, + { + "epoch": 1.2237196765498652, + "grad_norm": 0.542545735836029, + "learning_rate": 0.0005271645979492714, + "loss": 3.7301, + "step": 11350 + }, + { + "epoch": 1.2291105121293802, + "grad_norm": 0.8030261397361755, + "learning_rate": 0.0005268472746896923, + "loss": 3.7452, + "step": 11400 + }, + { + "epoch": 1.2345013477088949, + "grad_norm": 0.5232232809066772, + "learning_rate": 0.000526523475445224, + "loss": 3.755, + "step": 11450 + }, + { + "epoch": 1.2398921832884098, + "grad_norm": 0.5808193683624268, + "learning_rate": 0.0005261996762007554, + "loss": 3.7452, + "step": 11500 + }, + { + "epoch": 1.2452830188679245, + "grad_norm": 0.5781920552253723, + "learning_rate": 0.0005258758769562871, + "loss": 3.7392, + "step": 11550 + }, + { + "epoch": 1.2506738544474394, + "grad_norm": 0.5637895464897156, + "learning_rate": 0.0005255520777118186, + "loss": 3.7512, + "step": 11600 + }, + { + "epoch": 1.256064690026954, + "grad_norm": 0.5855022668838501, + "learning_rate": 0.0005252282784673502, + "loss": 3.7318, + "step": 11650 + }, + { + "epoch": 1.261455525606469, + "grad_norm": 0.5060122013092041, + "learning_rate": 0.0005249044792228817, + "loss": 3.7424, + "step": 11700 + }, + { + "epoch": 1.266846361185984, + "grad_norm": 0.5578184723854065, + "learning_rate": 0.0005245806799784133, + "loss": 3.7421, + "step": 11750 + }, + { + "epoch": 1.2722371967654986, + "grad_norm": 0.5692754983901978, + "learning_rate": 0.0005242568807339449, + "loss": 3.7541, + "step": 11800 + }, + { + "epoch": 1.2776280323450135, + "grad_norm": 0.5293059349060059, + "learning_rate": 0.0005239330814894765, + "loss": 3.7389, + "step": 11850 + }, + { + "epoch": 1.2830188679245282, + "grad_norm": 0.5823394060134888, + "learning_rate": 0.000523609282245008, + "loss": 3.7336, + "step": 11900 + }, + { + "epoch": 1.2884097035040432, + "grad_norm": 0.5567950010299683, + "learning_rate": 0.0005232854830005396, + "loss": 3.7176, + "step": 11950 + }, + { + "epoch": 1.2938005390835579, + "grad_norm": 0.5795886516571045, + "learning_rate": 0.0005229616837560712, + "loss": 3.7288, + "step": 12000 + }, + { + "epoch": 1.2938005390835579, + "eval_accuracy": 0.34931164592718456, + "eval_loss": 3.697058916091919, + "eval_runtime": 186.8168, + "eval_samples_per_second": 96.41, + "eval_steps_per_second": 6.027, + "step": 12000 + }, + { + "epoch": 1.2991913746630728, + "grad_norm": 0.6153177618980408, + "learning_rate": 0.0005226378845116028, + "loss": 3.7453, + "step": 12050 + }, + { + "epoch": 1.3045822102425877, + "grad_norm": 0.6005005836486816, + "learning_rate": 0.0005223140852671344, + "loss": 3.7398, + "step": 12100 + }, + { + "epoch": 1.3099730458221024, + "grad_norm": 0.5839811563491821, + "learning_rate": 0.0005219902860226659, + "loss": 3.7295, + "step": 12150 + }, + { + "epoch": 1.3153638814016173, + "grad_norm": 0.54124516248703, + "learning_rate": 0.0005216664867781975, + "loss": 3.7451, + "step": 12200 + }, + { + "epoch": 1.320754716981132, + "grad_norm": 0.5872597098350525, + "learning_rate": 0.000521342687533729, + "loss": 3.7195, + "step": 12250 + }, + { + "epoch": 1.326145552560647, + "grad_norm": 0.6764320731163025, + "learning_rate": 0.0005210188882892606, + "loss": 3.7273, + "step": 12300 + }, + { + "epoch": 1.3315363881401616, + "grad_norm": 0.5127867460250854, + "learning_rate": 0.0005206950890447922, + "loss": 3.7354, + "step": 12350 + }, + { + "epoch": 1.3369272237196765, + "grad_norm": 0.5766441822052002, + "learning_rate": 0.0005203712898003238, + "loss": 3.7329, + "step": 12400 + }, + { + "epoch": 1.3423180592991915, + "grad_norm": 0.5768564343452454, + "learning_rate": 0.0005200474905558553, + "loss": 3.7196, + "step": 12450 + }, + { + "epoch": 1.3477088948787062, + "grad_norm": 0.5573644638061523, + "learning_rate": 0.0005197236913113869, + "loss": 3.7398, + "step": 12500 + }, + { + "epoch": 1.353099730458221, + "grad_norm": 0.519527018070221, + "learning_rate": 0.0005193998920669184, + "loss": 3.7141, + "step": 12550 + }, + { + "epoch": 1.3584905660377358, + "grad_norm": 0.6151216626167297, + "learning_rate": 0.0005190760928224501, + "loss": 3.7121, + "step": 12600 + }, + { + "epoch": 1.3638814016172507, + "grad_norm": 0.6031010746955872, + "learning_rate": 0.0005187522935779816, + "loss": 3.7035, + "step": 12650 + }, + { + "epoch": 1.3692722371967654, + "grad_norm": 0.534813642501831, + "learning_rate": 0.0005184284943335132, + "loss": 3.7193, + "step": 12700 + }, + { + "epoch": 1.3746630727762803, + "grad_norm": 0.5405371785163879, + "learning_rate": 0.0005181046950890447, + "loss": 3.7074, + "step": 12750 + }, + { + "epoch": 1.3800539083557952, + "grad_norm": 0.537695050239563, + "learning_rate": 0.0005177808958445764, + "loss": 3.7067, + "step": 12800 + }, + { + "epoch": 1.38544474393531, + "grad_norm": 0.6358603239059448, + "learning_rate": 0.0005174570966001078, + "loss": 3.7174, + "step": 12850 + }, + { + "epoch": 1.3908355795148248, + "grad_norm": 0.6034652590751648, + "learning_rate": 0.0005171332973556395, + "loss": 3.7168, + "step": 12900 + }, + { + "epoch": 1.3962264150943398, + "grad_norm": 0.553503692150116, + "learning_rate": 0.000516809498111171, + "loss": 3.7269, + "step": 12950 + }, + { + "epoch": 1.4016172506738545, + "grad_norm": 0.5765125155448914, + "learning_rate": 0.0005164856988667026, + "loss": 3.7086, + "step": 13000 + }, + { + "epoch": 1.4016172506738545, + "eval_accuracy": 0.35153598784652257, + "eval_loss": 3.674765110015869, + "eval_runtime": 186.3343, + "eval_samples_per_second": 96.66, + "eval_steps_per_second": 6.043, + "step": 13000 + }, + { + "epoch": 1.4070080862533692, + "grad_norm": 0.6072783470153809, + "learning_rate": 0.0005161618996222341, + "loss": 3.7063, + "step": 13050 + }, + { + "epoch": 1.412398921832884, + "grad_norm": 0.5731549859046936, + "learning_rate": 0.0005158381003777657, + "loss": 3.6997, + "step": 13100 + }, + { + "epoch": 1.417789757412399, + "grad_norm": 0.5856156945228577, + "learning_rate": 0.0005155143011332973, + "loss": 3.7057, + "step": 13150 + }, + { + "epoch": 1.4231805929919137, + "grad_norm": 0.566673755645752, + "learning_rate": 0.0005151905018888289, + "loss": 3.7061, + "step": 13200 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.527152419090271, + "learning_rate": 0.0005148667026443604, + "loss": 3.7152, + "step": 13250 + }, + { + "epoch": 1.4339622641509435, + "grad_norm": 0.5248644948005676, + "learning_rate": 0.000514542903399892, + "loss": 3.6882, + "step": 13300 + }, + { + "epoch": 1.4393530997304582, + "grad_norm": 0.5552476048469543, + "learning_rate": 0.0005142191041554237, + "loss": 3.708, + "step": 13350 + }, + { + "epoch": 1.444743935309973, + "grad_norm": 0.5420221090316772, + "learning_rate": 0.0005138953049109552, + "loss": 3.723, + "step": 13400 + }, + { + "epoch": 1.4501347708894878, + "grad_norm": 0.546251654624939, + "learning_rate": 0.0005135715056664868, + "loss": 3.7274, + "step": 13450 + }, + { + "epoch": 1.4555256064690028, + "grad_norm": 0.656278669834137, + "learning_rate": 0.0005132477064220183, + "loss": 3.719, + "step": 13500 + }, + { + "epoch": 1.4609164420485174, + "grad_norm": 0.5882102847099304, + "learning_rate": 0.0005129239071775499, + "loss": 3.7077, + "step": 13550 + }, + { + "epoch": 1.4663072776280324, + "grad_norm": 0.5904348492622375, + "learning_rate": 0.0005126001079330814, + "loss": 3.7233, + "step": 13600 + }, + { + "epoch": 1.4716981132075473, + "grad_norm": 0.5616297721862793, + "learning_rate": 0.000512276308688613, + "loss": 3.7126, + "step": 13650 + }, + { + "epoch": 1.477088948787062, + "grad_norm": 0.588398277759552, + "learning_rate": 0.0005119525094441446, + "loss": 3.6976, + "step": 13700 + }, + { + "epoch": 1.482479784366577, + "grad_norm": 0.6392883062362671, + "learning_rate": 0.0005116287101996762, + "loss": 3.701, + "step": 13750 + }, + { + "epoch": 1.4878706199460916, + "grad_norm": 0.5546556711196899, + "learning_rate": 0.0005113049109552077, + "loss": 3.7046, + "step": 13800 + }, + { + "epoch": 1.4932614555256065, + "grad_norm": 0.5846448540687561, + "learning_rate": 0.0005109811117107393, + "loss": 3.7087, + "step": 13850 + }, + { + "epoch": 1.4986522911051212, + "grad_norm": 0.6359493732452393, + "learning_rate": 0.0005106573124662708, + "loss": 3.6907, + "step": 13900 + }, + { + "epoch": 1.5040431266846361, + "grad_norm": 0.5580483078956604, + "learning_rate": 0.0005103335132218025, + "loss": 3.688, + "step": 13950 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 0.5794188976287842, + "learning_rate": 0.000510009713977334, + "loss": 3.703, + "step": 14000 + }, + { + "epoch": 1.509433962264151, + "eval_accuracy": 0.35349782438890087, + "eval_loss": 3.6583781242370605, + "eval_runtime": 185.6312, + "eval_samples_per_second": 97.026, + "eval_steps_per_second": 6.066, + "step": 14000 + }, + { + "epoch": 1.5148247978436657, + "grad_norm": 0.5856167078018188, + "learning_rate": 0.0005096859147328656, + "loss": 3.6999, + "step": 14050 + }, + { + "epoch": 1.5202156334231804, + "grad_norm": 0.6054800152778625, + "learning_rate": 0.0005093621154883971, + "loss": 3.6984, + "step": 14100 + }, + { + "epoch": 1.5256064690026954, + "grad_norm": 0.5391426086425781, + "learning_rate": 0.0005090383162439288, + "loss": 3.695, + "step": 14150 + }, + { + "epoch": 1.5309973045822103, + "grad_norm": 0.5351890921592712, + "learning_rate": 0.0005087145169994602, + "loss": 3.701, + "step": 14200 + }, + { + "epoch": 1.536388140161725, + "grad_norm": 0.5853469371795654, + "learning_rate": 0.0005083907177549918, + "loss": 3.7001, + "step": 14250 + }, + { + "epoch": 1.54177897574124, + "grad_norm": 0.5472508668899536, + "learning_rate": 0.0005080669185105234, + "loss": 3.697, + "step": 14300 + }, + { + "epoch": 1.5471698113207548, + "grad_norm": 0.5463733673095703, + "learning_rate": 0.000507743119266055, + "loss": 3.6871, + "step": 14350 + }, + { + "epoch": 1.5525606469002695, + "grad_norm": 0.549136757850647, + "learning_rate": 0.0005074193200215865, + "loss": 3.6895, + "step": 14400 + }, + { + "epoch": 1.5579514824797842, + "grad_norm": 0.5161502957344055, + "learning_rate": 0.0005070955207771181, + "loss": 3.693, + "step": 14450 + }, + { + "epoch": 1.5633423180592994, + "grad_norm": 0.5291047096252441, + "learning_rate": 0.0005067717215326498, + "loss": 3.6909, + "step": 14500 + }, + { + "epoch": 1.568733153638814, + "grad_norm": 0.5388016700744629, + "learning_rate": 0.0005064479222881813, + "loss": 3.6906, + "step": 14550 + }, + { + "epoch": 1.5741239892183287, + "grad_norm": 0.5703091621398926, + "learning_rate": 0.0005061241230437129, + "loss": 3.6751, + "step": 14600 + }, + { + "epoch": 1.5795148247978437, + "grad_norm": 0.5753449201583862, + "learning_rate": 0.0005058003237992444, + "loss": 3.6803, + "step": 14650 + }, + { + "epoch": 1.5849056603773586, + "grad_norm": 0.5543888807296753, + "learning_rate": 0.000505476524554776, + "loss": 3.6864, + "step": 14700 + }, + { + "epoch": 1.5902964959568733, + "grad_norm": 0.5601723790168762, + "learning_rate": 0.0005051527253103076, + "loss": 3.6654, + "step": 14750 + }, + { + "epoch": 1.595687331536388, + "grad_norm": 0.5522553324699402, + "learning_rate": 0.0005048289260658392, + "loss": 3.6576, + "step": 14800 + }, + { + "epoch": 1.6010781671159031, + "grad_norm": 0.5866943001747131, + "learning_rate": 0.0005045051268213707, + "loss": 3.6834, + "step": 14850 + }, + { + "epoch": 1.6064690026954178, + "grad_norm": 0.5801655650138855, + "learning_rate": 0.0005041813275769023, + "loss": 3.6865, + "step": 14900 + }, + { + "epoch": 1.6118598382749325, + "grad_norm": 0.5756445527076721, + "learning_rate": 0.0005038575283324338, + "loss": 3.6724, + "step": 14950 + }, + { + "epoch": 1.6172506738544474, + "grad_norm": 0.4960193932056427, + "learning_rate": 0.0005035337290879654, + "loss": 3.664, + "step": 15000 + }, + { + "epoch": 1.6172506738544474, + "eval_accuracy": 0.35552974204391946, + "eval_loss": 3.637425184249878, + "eval_runtime": 185.9925, + "eval_samples_per_second": 96.837, + "eval_steps_per_second": 6.054, + "step": 15000 + }, + { + "epoch": 1.6226415094339623, + "grad_norm": 0.5649005770683289, + "learning_rate": 0.000503209929843497, + "loss": 3.6632, + "step": 15050 + }, + { + "epoch": 1.628032345013477, + "grad_norm": 0.5567517876625061, + "learning_rate": 0.0005028861305990286, + "loss": 3.6753, + "step": 15100 + }, + { + "epoch": 1.633423180592992, + "grad_norm": 0.5907676219940186, + "learning_rate": 0.0005025623313545601, + "loss": 3.6967, + "step": 15150 + }, + { + "epoch": 1.6388140161725069, + "grad_norm": 0.5697629451751709, + "learning_rate": 0.0005022385321100917, + "loss": 3.6818, + "step": 15200 + }, + { + "epoch": 1.6442048517520216, + "grad_norm": 0.5234145522117615, + "learning_rate": 0.0005019147328656232, + "loss": 3.6737, + "step": 15250 + }, + { + "epoch": 1.6495956873315363, + "grad_norm": 0.5655122995376587, + "learning_rate": 0.0005015909336211549, + "loss": 3.6731, + "step": 15300 + }, + { + "epoch": 1.6549865229110512, + "grad_norm": 0.5556879639625549, + "learning_rate": 0.0005012671343766864, + "loss": 3.6583, + "step": 15350 + }, + { + "epoch": 1.6603773584905661, + "grad_norm": 0.5384864211082458, + "learning_rate": 0.000500943335132218, + "loss": 3.6766, + "step": 15400 + }, + { + "epoch": 1.6657681940700808, + "grad_norm": 0.6065835952758789, + "learning_rate": 0.0005006195358877495, + "loss": 3.6646, + "step": 15450 + }, + { + "epoch": 1.6711590296495957, + "grad_norm": 0.5641660690307617, + "learning_rate": 0.0005002957366432812, + "loss": 3.666, + "step": 15500 + }, + { + "epoch": 1.6765498652291106, + "grad_norm": 0.5256580710411072, + "learning_rate": 0.0004999719373988127, + "loss": 3.6515, + "step": 15550 + }, + { + "epoch": 1.6819407008086253, + "grad_norm": 0.5791158676147461, + "learning_rate": 0.0004996481381543442, + "loss": 3.6708, + "step": 15600 + }, + { + "epoch": 1.68733153638814, + "grad_norm": 0.5627703666687012, + "learning_rate": 0.0004993308148947651, + "loss": 3.6779, + "step": 15650 + }, + { + "epoch": 1.692722371967655, + "grad_norm": 0.6009765863418579, + "learning_rate": 0.0004990070156502968, + "loss": 3.6717, + "step": 15700 + }, + { + "epoch": 1.6981132075471699, + "grad_norm": 0.6165124177932739, + "learning_rate": 0.0004986832164058284, + "loss": 3.6558, + "step": 15750 + }, + { + "epoch": 1.7035040431266846, + "grad_norm": 0.573556661605835, + "learning_rate": 0.0004983594171613599, + "loss": 3.6659, + "step": 15800 + }, + { + "epoch": 1.7088948787061995, + "grad_norm": 0.5521446466445923, + "learning_rate": 0.0004980356179168915, + "loss": 3.6825, + "step": 15850 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.5886697769165039, + "learning_rate": 0.000497711818672423, + "loss": 3.6819, + "step": 15900 + }, + { + "epoch": 1.719676549865229, + "grad_norm": 0.6262629628181458, + "learning_rate": 0.0004973880194279547, + "loss": 3.6689, + "step": 15950 + }, + { + "epoch": 1.7250673854447438, + "grad_norm": 0.6005555391311646, + "learning_rate": 0.0004970642201834862, + "loss": 3.6639, + "step": 16000 + }, + { + "epoch": 1.7250673854447438, + "eval_accuracy": 0.3572108195241851, + "eval_loss": 3.6206066608428955, + "eval_runtime": 185.4999, + "eval_samples_per_second": 97.094, + "eval_steps_per_second": 6.07, + "step": 16000 + }, + { + "epoch": 1.7304582210242587, + "grad_norm": 0.5764271020889282, + "learning_rate": 0.0004967404209390178, + "loss": 3.6523, + "step": 16050 + }, + { + "epoch": 1.7358490566037736, + "grad_norm": 0.5741286277770996, + "learning_rate": 0.0004964166216945493, + "loss": 3.653, + "step": 16100 + }, + { + "epoch": 1.7412398921832883, + "grad_norm": 0.6296107172966003, + "learning_rate": 0.000496092822450081, + "loss": 3.6606, + "step": 16150 + }, + { + "epoch": 1.7466307277628033, + "grad_norm": 0.6621016263961792, + "learning_rate": 0.0004957690232056125, + "loss": 3.667, + "step": 16200 + }, + { + "epoch": 1.7520215633423182, + "grad_norm": 0.6163436770439148, + "learning_rate": 0.0004954452239611441, + "loss": 3.6514, + "step": 16250 + }, + { + "epoch": 1.7574123989218329, + "grad_norm": 0.5336316823959351, + "learning_rate": 0.0004951214247166756, + "loss": 3.645, + "step": 16300 + }, + { + "epoch": 1.7628032345013476, + "grad_norm": 0.5538309812545776, + "learning_rate": 0.0004947976254722072, + "loss": 3.6681, + "step": 16350 + }, + { + "epoch": 1.7681940700808625, + "grad_norm": 0.5325199365615845, + "learning_rate": 0.0004944738262277387, + "loss": 3.6466, + "step": 16400 + }, + { + "epoch": 1.7735849056603774, + "grad_norm": 0.576173722743988, + "learning_rate": 0.0004941500269832703, + "loss": 3.6569, + "step": 16450 + }, + { + "epoch": 1.778975741239892, + "grad_norm": 0.5102316737174988, + "learning_rate": 0.0004938262277388019, + "loss": 3.6425, + "step": 16500 + }, + { + "epoch": 1.784366576819407, + "grad_norm": 0.5195789933204651, + "learning_rate": 0.0004935024284943335, + "loss": 3.6499, + "step": 16550 + }, + { + "epoch": 1.789757412398922, + "grad_norm": 0.5175392031669617, + "learning_rate": 0.000493178629249865, + "loss": 3.6622, + "step": 16600 + }, + { + "epoch": 1.7951482479784366, + "grad_norm": 0.5237419605255127, + "learning_rate": 0.0004928548300053966, + "loss": 3.6531, + "step": 16650 + }, + { + "epoch": 1.8005390835579513, + "grad_norm": 0.6199319958686829, + "learning_rate": 0.0004925310307609282, + "loss": 3.66, + "step": 16700 + }, + { + "epoch": 1.8059299191374663, + "grad_norm": 0.5926080346107483, + "learning_rate": 0.0004922072315164598, + "loss": 3.659, + "step": 16750 + }, + { + "epoch": 1.8113207547169812, + "grad_norm": 0.5379118919372559, + "learning_rate": 0.0004918834322719913, + "loss": 3.6662, + "step": 16800 + }, + { + "epoch": 1.8167115902964959, + "grad_norm": 0.6137577295303345, + "learning_rate": 0.0004915596330275229, + "loss": 3.6746, + "step": 16850 + }, + { + "epoch": 1.8221024258760108, + "grad_norm": 0.5653969645500183, + "learning_rate": 0.0004912358337830544, + "loss": 3.6474, + "step": 16900 + }, + { + "epoch": 1.8274932614555257, + "grad_norm": 0.56174635887146, + "learning_rate": 0.000490912034538586, + "loss": 3.6592, + "step": 16950 + }, + { + "epoch": 1.8328840970350404, + "grad_norm": 0.5398900508880615, + "learning_rate": 0.0004905882352941175, + "loss": 3.6383, + "step": 17000 + }, + { + "epoch": 1.8328840970350404, + "eval_accuracy": 0.3587294609371681, + "eval_loss": 3.6051440238952637, + "eval_runtime": 185.6742, + "eval_samples_per_second": 97.003, + "eval_steps_per_second": 6.064, + "step": 17000 + }, + { + "epoch": 1.838274932614555, + "grad_norm": 0.5739971399307251, + "learning_rate": 0.0004902644360496492, + "loss": 3.6431, + "step": 17050 + }, + { + "epoch": 1.8436657681940702, + "grad_norm": 0.5605867505073547, + "learning_rate": 0.0004899406368051808, + "loss": 3.6362, + "step": 17100 + }, + { + "epoch": 1.849056603773585, + "grad_norm": 0.5735852718353271, + "learning_rate": 0.0004896168375607123, + "loss": 3.6416, + "step": 17150 + }, + { + "epoch": 1.8544474393530996, + "grad_norm": 0.5647099614143372, + "learning_rate": 0.0004892930383162439, + "loss": 3.6467, + "step": 17200 + }, + { + "epoch": 1.8598382749326146, + "grad_norm": 0.5226011276245117, + "learning_rate": 0.0004889692390717754, + "loss": 3.6534, + "step": 17250 + }, + { + "epoch": 1.8652291105121295, + "grad_norm": 0.5790987610816956, + "learning_rate": 0.0004886454398273071, + "loss": 3.6564, + "step": 17300 + }, + { + "epoch": 1.8706199460916442, + "grad_norm": 0.5586245059967041, + "learning_rate": 0.0004883216405828386, + "loss": 3.6568, + "step": 17350 + }, + { + "epoch": 1.8760107816711589, + "grad_norm": 0.5110759139060974, + "learning_rate": 0.00048799784133837017, + "loss": 3.6508, + "step": 17400 + }, + { + "epoch": 1.881401617250674, + "grad_norm": 0.5973682999610901, + "learning_rate": 0.0004876740420939017, + "loss": 3.6405, + "step": 17450 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 0.5328883528709412, + "learning_rate": 0.0004873502428494333, + "loss": 3.6423, + "step": 17500 + }, + { + "epoch": 1.8921832884097034, + "grad_norm": 0.6016663312911987, + "learning_rate": 0.0004870264436049649, + "loss": 3.6395, + "step": 17550 + }, + { + "epoch": 1.8975741239892183, + "grad_norm": 0.5924084186553955, + "learning_rate": 0.00048670264436049643, + "loss": 3.6529, + "step": 17600 + }, + { + "epoch": 1.9029649595687332, + "grad_norm": 0.5941994190216064, + "learning_rate": 0.00048637884511602803, + "loss": 3.6441, + "step": 17650 + }, + { + "epoch": 1.908355795148248, + "grad_norm": 0.5356640219688416, + "learning_rate": 0.0004860550458715596, + "loss": 3.638, + "step": 17700 + }, + { + "epoch": 1.9137466307277629, + "grad_norm": 0.6006718277931213, + "learning_rate": 0.0004857312466270912, + "loss": 3.6598, + "step": 17750 + }, + { + "epoch": 1.9191374663072778, + "grad_norm": 0.5845404863357544, + "learning_rate": 0.00048540744738262274, + "loss": 3.6449, + "step": 17800 + }, + { + "epoch": 1.9245283018867925, + "grad_norm": 0.5526080131530762, + "learning_rate": 0.00048508364813815434, + "loss": 3.6354, + "step": 17850 + }, + { + "epoch": 1.9299191374663072, + "grad_norm": 0.5421946048736572, + "learning_rate": 0.0004847663248785753, + "loss": 3.6465, + "step": 17900 + }, + { + "epoch": 1.935309973045822, + "grad_norm": 0.5944849848747253, + "learning_rate": 0.0004844425256341068, + "loss": 3.6269, + "step": 17950 + }, + { + "epoch": 1.940700808625337, + "grad_norm": 0.5479409694671631, + "learning_rate": 0.00048411872638963834, + "loss": 3.6321, + "step": 18000 + }, + { + "epoch": 1.940700808625337, + "eval_accuracy": 0.35996886442846787, + "eval_loss": 3.5911636352539062, + "eval_runtime": 185.6549, + "eval_samples_per_second": 97.013, + "eval_steps_per_second": 6.065, + "step": 18000 + }, + { + "epoch": 1.9460916442048517, + "grad_norm": 0.5909312963485718, + "learning_rate": 0.00048379492714516995, + "loss": 3.6343, + "step": 18050 + }, + { + "epoch": 1.9514824797843666, + "grad_norm": 0.5868159532546997, + "learning_rate": 0.0004834711279007015, + "loss": 3.6358, + "step": 18100 + }, + { + "epoch": 1.9568733153638815, + "grad_norm": 0.5208358764648438, + "learning_rate": 0.0004831473286562331, + "loss": 3.6319, + "step": 18150 + }, + { + "epoch": 1.9622641509433962, + "grad_norm": 0.5705773234367371, + "learning_rate": 0.00048282352941176465, + "loss": 3.6214, + "step": 18200 + }, + { + "epoch": 1.967654986522911, + "grad_norm": 0.5833699703216553, + "learning_rate": 0.00048249973016729626, + "loss": 3.6428, + "step": 18250 + }, + { + "epoch": 1.9730458221024259, + "grad_norm": 0.594480574131012, + "learning_rate": 0.0004821759309228278, + "loss": 3.6519, + "step": 18300 + }, + { + "epoch": 1.9784366576819408, + "grad_norm": 0.5162302851676941, + "learning_rate": 0.00048185213167835936, + "loss": 3.6368, + "step": 18350 + }, + { + "epoch": 1.9838274932614555, + "grad_norm": 0.6090728640556335, + "learning_rate": 0.00048152833243389096, + "loss": 3.6373, + "step": 18400 + }, + { + "epoch": 1.9892183288409704, + "grad_norm": 0.5224543809890747, + "learning_rate": 0.0004812045331894225, + "loss": 3.6452, + "step": 18450 + }, + { + "epoch": 1.9946091644204853, + "grad_norm": 0.6286558508872986, + "learning_rate": 0.0004808807339449541, + "loss": 3.6383, + "step": 18500 + }, + { + "epoch": 2.0, + "grad_norm": 1.1893088817596436, + "learning_rate": 0.0004805569347004856, + "loss": 3.6417, + "step": 18550 + }, + { + "epoch": 2.0053908355795147, + "grad_norm": 0.5484069585800171, + "learning_rate": 0.0004802331354560173, + "loss": 3.5707, + "step": 18600 + }, + { + "epoch": 2.01078167115903, + "grad_norm": 0.5843707323074341, + "learning_rate": 0.00047990933621154877, + "loss": 3.5399, + "step": 18650 + }, + { + "epoch": 2.0161725067385445, + "grad_norm": 0.5230047106742859, + "learning_rate": 0.0004795855369670804, + "loss": 3.5512, + "step": 18700 + }, + { + "epoch": 2.0215633423180592, + "grad_norm": 0.5233069658279419, + "learning_rate": 0.0004792617377226119, + "loss": 3.5319, + "step": 18750 + }, + { + "epoch": 2.026954177897574, + "grad_norm": 0.5597031712532043, + "learning_rate": 0.0004789379384781435, + "loss": 3.5518, + "step": 18800 + }, + { + "epoch": 2.032345013477089, + "grad_norm": 0.5443568825721741, + "learning_rate": 0.0004786141392336751, + "loss": 3.5586, + "step": 18850 + }, + { + "epoch": 2.0377358490566038, + "grad_norm": 0.5472878217697144, + "learning_rate": 0.00047829033998920663, + "loss": 3.5469, + "step": 18900 + }, + { + "epoch": 2.0431266846361185, + "grad_norm": 0.5626387000083923, + "learning_rate": 0.00047796654074473824, + "loss": 3.5408, + "step": 18950 + }, + { + "epoch": 2.0485175202156336, + "grad_norm": 0.5888155698776245, + "learning_rate": 0.0004776427415002698, + "loss": 3.5672, + "step": 19000 + }, + { + "epoch": 2.0485175202156336, + "eval_accuracy": 0.361499131700447, + "eval_loss": 3.5821456909179688, + "eval_runtime": 185.3913, + "eval_samples_per_second": 97.151, + "eval_steps_per_second": 6.074, + "step": 19000 + }, + { + "epoch": 2.0539083557951483, + "grad_norm": 0.5674854516983032, + "learning_rate": 0.0004773189422558014, + "loss": 3.5436, + "step": 19050 + }, + { + "epoch": 2.059299191374663, + "grad_norm": 0.5584875345230103, + "learning_rate": 0.00047699514301133294, + "loss": 3.567, + "step": 19100 + }, + { + "epoch": 2.0646900269541777, + "grad_norm": 0.5911595821380615, + "learning_rate": 0.00047667134376686455, + "loss": 3.5623, + "step": 19150 + }, + { + "epoch": 2.070080862533693, + "grad_norm": 0.5857378840446472, + "learning_rate": 0.0004763475445223961, + "loss": 3.5702, + "step": 19200 + }, + { + "epoch": 2.0754716981132075, + "grad_norm": 0.5929583311080933, + "learning_rate": 0.0004760237452779276, + "loss": 3.5495, + "step": 19250 + }, + { + "epoch": 2.0808625336927222, + "grad_norm": 0.5647702813148499, + "learning_rate": 0.0004756999460334592, + "loss": 3.5433, + "step": 19300 + }, + { + "epoch": 2.0862533692722374, + "grad_norm": 0.5857425332069397, + "learning_rate": 0.00047537614678899075, + "loss": 3.547, + "step": 19350 + }, + { + "epoch": 2.091644204851752, + "grad_norm": 0.5703374743461609, + "learning_rate": 0.00047505234754452235, + "loss": 3.5461, + "step": 19400 + }, + { + "epoch": 2.0970350404312668, + "grad_norm": 0.8500975966453552, + "learning_rate": 0.0004747285483000539, + "loss": 3.5538, + "step": 19450 + }, + { + "epoch": 2.1024258760107815, + "grad_norm": 0.6051983833312988, + "learning_rate": 0.0004744047490555855, + "loss": 3.5454, + "step": 19500 + }, + { + "epoch": 2.1078167115902966, + "grad_norm": 0.7593019008636475, + "learning_rate": 0.00047408094981111706, + "loss": 3.5555, + "step": 19550 + }, + { + "epoch": 2.1132075471698113, + "grad_norm": 0.5925678014755249, + "learning_rate": 0.000473763626551538, + "loss": 3.5514, + "step": 19600 + }, + { + "epoch": 2.118598382749326, + "grad_norm": 0.5761629343032837, + "learning_rate": 0.00047343982730706956, + "loss": 3.5705, + "step": 19650 + }, + { + "epoch": 2.123989218328841, + "grad_norm": 0.5896238088607788, + "learning_rate": 0.00047311602806260117, + "loss": 3.5423, + "step": 19700 + }, + { + "epoch": 2.129380053908356, + "grad_norm": 0.5506173968315125, + "learning_rate": 0.0004727922288181327, + "loss": 3.5619, + "step": 19750 + }, + { + "epoch": 2.1347708894878705, + "grad_norm": 0.5747953653335571, + "learning_rate": 0.0004724684295736643, + "loss": 3.5549, + "step": 19800 + }, + { + "epoch": 2.1401617250673857, + "grad_norm": 0.6055583953857422, + "learning_rate": 0.0004721446303291959, + "loss": 3.5466, + "step": 19850 + }, + { + "epoch": 2.1455525606469004, + "grad_norm": 0.5854771733283997, + "learning_rate": 0.0004718208310847275, + "loss": 3.5586, + "step": 19900 + }, + { + "epoch": 2.150943396226415, + "grad_norm": 0.5524225234985352, + "learning_rate": 0.000471497031840259, + "loss": 3.5506, + "step": 19950 + }, + { + "epoch": 2.1563342318059298, + "grad_norm": 0.6449847221374512, + "learning_rate": 0.0004711732325957905, + "loss": 3.5754, + "step": 20000 + }, + { + "epoch": 2.1563342318059298, + "eval_accuracy": 0.3623671596211969, + "eval_loss": 3.573341131210327, + "eval_runtime": 185.7691, + "eval_samples_per_second": 96.954, + "eval_steps_per_second": 6.061, + "step": 20000 + }, + { + "epoch": 2.161725067385445, + "grad_norm": 0.5967618823051453, + "learning_rate": 0.00047084943335132213, + "loss": 3.5752, + "step": 20050 + }, + { + "epoch": 2.1671159029649596, + "grad_norm": 0.5871074795722961, + "learning_rate": 0.0004705256341068537, + "loss": 3.5762, + "step": 20100 + }, + { + "epoch": 2.1725067385444743, + "grad_norm": 0.5428000688552856, + "learning_rate": 0.0004702018348623853, + "loss": 3.5692, + "step": 20150 + }, + { + "epoch": 2.177897574123989, + "grad_norm": 0.606965959072113, + "learning_rate": 0.00046987803561791684, + "loss": 3.5672, + "step": 20200 + }, + { + "epoch": 2.183288409703504, + "grad_norm": 0.5462236404418945, + "learning_rate": 0.00046955423637344844, + "loss": 3.5669, + "step": 20250 + }, + { + "epoch": 2.188679245283019, + "grad_norm": 0.5761914253234863, + "learning_rate": 0.00046923043712898, + "loss": 3.5573, + "step": 20300 + }, + { + "epoch": 2.1940700808625335, + "grad_norm": 0.5329501628875732, + "learning_rate": 0.0004689066378845116, + "loss": 3.5705, + "step": 20350 + }, + { + "epoch": 2.1994609164420487, + "grad_norm": 0.5812281966209412, + "learning_rate": 0.00046858283864004315, + "loss": 3.5734, + "step": 20400 + }, + { + "epoch": 2.2048517520215634, + "grad_norm": 0.5665324926376343, + "learning_rate": 0.0004682590393955747, + "loss": 3.5702, + "step": 20450 + }, + { + "epoch": 2.210242587601078, + "grad_norm": 0.5955948233604431, + "learning_rate": 0.0004679352401511063, + "loss": 3.5426, + "step": 20500 + }, + { + "epoch": 2.215633423180593, + "grad_norm": 0.5881284475326538, + "learning_rate": 0.0004676114409066378, + "loss": 3.5686, + "step": 20550 + }, + { + "epoch": 2.221024258760108, + "grad_norm": 0.6288923025131226, + "learning_rate": 0.00046728764166216946, + "loss": 3.5468, + "step": 20600 + }, + { + "epoch": 2.2264150943396226, + "grad_norm": 0.5576397776603699, + "learning_rate": 0.00046696384241770095, + "loss": 3.5532, + "step": 20650 + }, + { + "epoch": 2.2318059299191373, + "grad_norm": 0.5199394226074219, + "learning_rate": 0.00046664004317323256, + "loss": 3.551, + "step": 20700 + }, + { + "epoch": 2.2371967654986524, + "grad_norm": 0.5937060713768005, + "learning_rate": 0.0004663162439287641, + "loss": 3.5577, + "step": 20750 + }, + { + "epoch": 2.242587601078167, + "grad_norm": 0.6562036871910095, + "learning_rate": 0.00046599244468429566, + "loss": 3.5528, + "step": 20800 + }, + { + "epoch": 2.247978436657682, + "grad_norm": 0.565091609954834, + "learning_rate": 0.00046566864543982726, + "loss": 3.5683, + "step": 20850 + }, + { + "epoch": 2.2533692722371965, + "grad_norm": 0.5420801043510437, + "learning_rate": 0.0004653448461953588, + "loss": 3.5628, + "step": 20900 + }, + { + "epoch": 2.2587601078167117, + "grad_norm": 0.5261815786361694, + "learning_rate": 0.0004650210469508904, + "loss": 3.5508, + "step": 20950 + }, + { + "epoch": 2.2641509433962264, + "grad_norm": 0.5820636749267578, + "learning_rate": 0.00046469724770642197, + "loss": 3.5637, + "step": 21000 + }, + { + "epoch": 2.2641509433962264, + "eval_accuracy": 0.36372227843801913, + "eval_loss": 3.5613205432891846, + "eval_runtime": 185.2619, + "eval_samples_per_second": 97.219, + "eval_steps_per_second": 6.078, + "step": 21000 + }, + { + "epoch": 2.269541778975741, + "grad_norm": 0.5688766837120056, + "learning_rate": 0.0004643734484619536, + "loss": 3.5568, + "step": 21050 + }, + { + "epoch": 2.274932614555256, + "grad_norm": 0.5932535529136658, + "learning_rate": 0.0004640496492174851, + "loss": 3.5535, + "step": 21100 + }, + { + "epoch": 2.280323450134771, + "grad_norm": 0.5635634064674377, + "learning_rate": 0.00046372584997301673, + "loss": 3.5756, + "step": 21150 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.6333921551704407, + "learning_rate": 0.0004634020507285483, + "loss": 3.5627, + "step": 21200 + }, + { + "epoch": 2.2911051212938007, + "grad_norm": 0.5962139964103699, + "learning_rate": 0.0004630782514840798, + "loss": 3.571, + "step": 21250 + }, + { + "epoch": 2.2964959568733154, + "grad_norm": 0.601997435092926, + "learning_rate": 0.0004627544522396114, + "loss": 3.5379, + "step": 21300 + }, + { + "epoch": 2.30188679245283, + "grad_norm": 0.83173668384552, + "learning_rate": 0.00046243065299514293, + "loss": 3.559, + "step": 21350 + }, + { + "epoch": 2.3072776280323453, + "grad_norm": 0.550325334072113, + "learning_rate": 0.00046210685375067454, + "loss": 3.5504, + "step": 21400 + }, + { + "epoch": 2.31266846361186, + "grad_norm": 0.5884050130844116, + "learning_rate": 0.0004617830545062061, + "loss": 3.5759, + "step": 21450 + }, + { + "epoch": 2.3180592991913747, + "grad_norm": 0.5583473443984985, + "learning_rate": 0.0004614592552617377, + "loss": 3.5461, + "step": 21500 + }, + { + "epoch": 2.3234501347708894, + "grad_norm": 0.5997599363327026, + "learning_rate": 0.00046113545601726924, + "loss": 3.5626, + "step": 21550 + }, + { + "epoch": 2.3288409703504045, + "grad_norm": 0.5541661977767944, + "learning_rate": 0.00046081165677280085, + "loss": 3.5549, + "step": 21600 + }, + { + "epoch": 2.334231805929919, + "grad_norm": 0.6619417071342468, + "learning_rate": 0.0004604878575283324, + "loss": 3.5543, + "step": 21650 + }, + { + "epoch": 2.339622641509434, + "grad_norm": 0.5892545580863953, + "learning_rate": 0.00046016405828386395, + "loss": 3.5725, + "step": 21700 + }, + { + "epoch": 2.3450134770889486, + "grad_norm": 0.5789877772331238, + "learning_rate": 0.00045984025903939555, + "loss": 3.5618, + "step": 21750 + }, + { + "epoch": 2.3504043126684637, + "grad_norm": 0.5967502593994141, + "learning_rate": 0.0004595164597949271, + "loss": 3.5435, + "step": 21800 + }, + { + "epoch": 2.3557951482479784, + "grad_norm": 0.5737834572792053, + "learning_rate": 0.0004591926605504587, + "loss": 3.5549, + "step": 21850 + }, + { + "epoch": 2.361185983827493, + "grad_norm": 0.5817111134529114, + "learning_rate": 0.0004588688613059902, + "loss": 3.5489, + "step": 21900 + }, + { + "epoch": 2.3665768194070083, + "grad_norm": 0.5602347254753113, + "learning_rate": 0.00045854506206152186, + "loss": 3.5488, + "step": 21950 + }, + { + "epoch": 2.371967654986523, + "grad_norm": 0.5591195821762085, + "learning_rate": 0.00045822126281705336, + "loss": 3.5706, + "step": 22000 + }, + { + "epoch": 2.371967654986523, + "eval_accuracy": 0.3645871554250224, + "eval_loss": 3.5503523349761963, + "eval_runtime": 185.7321, + "eval_samples_per_second": 96.973, + "eval_steps_per_second": 6.062, + "step": 22000 + }, + { + "epoch": 2.3773584905660377, + "grad_norm": 0.5747423768043518, + "learning_rate": 0.00045789746357258497, + "loss": 3.5462, + "step": 22050 + }, + { + "epoch": 2.382749326145553, + "grad_norm": 0.6654125452041626, + "learning_rate": 0.0004575736643281165, + "loss": 3.5505, + "step": 22100 + }, + { + "epoch": 2.3881401617250675, + "grad_norm": 0.593645453453064, + "learning_rate": 0.00045724986508364807, + "loss": 3.5528, + "step": 22150 + }, + { + "epoch": 2.393530997304582, + "grad_norm": 0.5858222842216492, + "learning_rate": 0.00045692606583917967, + "loss": 3.5645, + "step": 22200 + }, + { + "epoch": 2.398921832884097, + "grad_norm": 0.6029301285743713, + "learning_rate": 0.0004566022665947112, + "loss": 3.5695, + "step": 22250 + }, + { + "epoch": 2.404312668463612, + "grad_norm": 0.584065854549408, + "learning_rate": 0.0004562784673502428, + "loss": 3.5708, + "step": 22300 + }, + { + "epoch": 2.4097035040431267, + "grad_norm": 0.5794286727905273, + "learning_rate": 0.0004559546681057744, + "loss": 3.5516, + "step": 22350 + }, + { + "epoch": 2.4150943396226414, + "grad_norm": 0.6180821657180786, + "learning_rate": 0.000455630868861306, + "loss": 3.5726, + "step": 22400 + }, + { + "epoch": 2.420485175202156, + "grad_norm": 0.5881420373916626, + "learning_rate": 0.00045530706961683753, + "loss": 3.5624, + "step": 22450 + }, + { + "epoch": 2.4258760107816713, + "grad_norm": 0.5598190426826477, + "learning_rate": 0.00045498327037236914, + "loss": 3.556, + "step": 22500 + }, + { + "epoch": 2.431266846361186, + "grad_norm": 0.5753728151321411, + "learning_rate": 0.0004546594711279007, + "loss": 3.5505, + "step": 22550 + }, + { + "epoch": 2.4366576819407006, + "grad_norm": 0.5574933886528015, + "learning_rate": 0.0004543356718834322, + "loss": 3.551, + "step": 22600 + }, + { + "epoch": 2.442048517520216, + "grad_norm": 0.5815372467041016, + "learning_rate": 0.0004540118726389638, + "loss": 3.5393, + "step": 22650 + }, + { + "epoch": 2.4474393530997305, + "grad_norm": 0.5866022706031799, + "learning_rate": 0.00045368807339449534, + "loss": 3.5423, + "step": 22700 + }, + { + "epoch": 2.452830188679245, + "grad_norm": 0.5673452615737915, + "learning_rate": 0.00045336427415002694, + "loss": 3.547, + "step": 22750 + }, + { + "epoch": 2.4582210242587603, + "grad_norm": 0.587451159954071, + "learning_rate": 0.0004530404749055585, + "loss": 3.5455, + "step": 22800 + }, + { + "epoch": 2.463611859838275, + "grad_norm": 0.6395877599716187, + "learning_rate": 0.0004527166756610901, + "loss": 3.5518, + "step": 22850 + }, + { + "epoch": 2.4690026954177897, + "grad_norm": 0.6238613724708557, + "learning_rate": 0.00045239287641662165, + "loss": 3.5595, + "step": 22900 + }, + { + "epoch": 2.4743935309973044, + "grad_norm": 0.6073823571205139, + "learning_rate": 0.0004520690771721532, + "loss": 3.5612, + "step": 22950 + }, + { + "epoch": 2.4797843665768196, + "grad_norm": 0.582822322845459, + "learning_rate": 0.0004517452779276848, + "loss": 3.5408, + "step": 23000 + }, + { + "epoch": 2.4797843665768196, + "eval_accuracy": 0.3659634615549684, + "eval_loss": 3.5402395725250244, + "eval_runtime": 185.4548, + "eval_samples_per_second": 97.118, + "eval_steps_per_second": 6.072, + "step": 23000 + }, + { + "epoch": 2.4851752021563343, + "grad_norm": 0.5883828401565552, + "learning_rate": 0.00045142147868321636, + "loss": 3.543, + "step": 23050 + }, + { + "epoch": 2.490566037735849, + "grad_norm": 0.5678288340568542, + "learning_rate": 0.00045109767943874796, + "loss": 3.5585, + "step": 23100 + }, + { + "epoch": 2.4959568733153636, + "grad_norm": 0.57308030128479, + "learning_rate": 0.0004507738801942795, + "loss": 3.5585, + "step": 23150 + }, + { + "epoch": 2.501347708894879, + "grad_norm": 0.6100690364837646, + "learning_rate": 0.0004504500809498111, + "loss": 3.5512, + "step": 23200 + }, + { + "epoch": 2.5067385444743935, + "grad_norm": 0.5841269493103027, + "learning_rate": 0.0004501262817053426, + "loss": 3.5446, + "step": 23250 + }, + { + "epoch": 2.512129380053908, + "grad_norm": 0.6322735548019409, + "learning_rate": 0.00044980248246087427, + "loss": 3.5466, + "step": 23300 + }, + { + "epoch": 2.5175202156334233, + "grad_norm": 0.6177630424499512, + "learning_rate": 0.00044947868321640577, + "loss": 3.5425, + "step": 23350 + }, + { + "epoch": 2.522911051212938, + "grad_norm": 0.6240397691726685, + "learning_rate": 0.0004491548839719373, + "loss": 3.5322, + "step": 23400 + }, + { + "epoch": 2.5283018867924527, + "grad_norm": 0.587017834186554, + "learning_rate": 0.0004488310847274689, + "loss": 3.5453, + "step": 23450 + }, + { + "epoch": 2.533692722371968, + "grad_norm": 0.5576516389846802, + "learning_rate": 0.0004485072854830005, + "loss": 3.5428, + "step": 23500 + }, + { + "epoch": 2.5390835579514826, + "grad_norm": 0.5701577663421631, + "learning_rate": 0.0004481834862385321, + "loss": 3.5514, + "step": 23550 + }, + { + "epoch": 2.5444743935309972, + "grad_norm": 0.5828137397766113, + "learning_rate": 0.00044785968699406363, + "loss": 3.5517, + "step": 23600 + }, + { + "epoch": 2.5498652291105124, + "grad_norm": 0.6262269020080566, + "learning_rate": 0.0004475423637344846, + "loss": 3.556, + "step": 23650 + }, + { + "epoch": 2.555256064690027, + "grad_norm": 0.5848754644393921, + "learning_rate": 0.00044721856449001613, + "loss": 3.5631, + "step": 23700 + }, + { + "epoch": 2.560646900269542, + "grad_norm": 0.5634021162986755, + "learning_rate": 0.00044689476524554774, + "loss": 3.5699, + "step": 23750 + }, + { + "epoch": 2.5660377358490565, + "grad_norm": 0.6959907412528992, + "learning_rate": 0.0004465709660010793, + "loss": 3.5458, + "step": 23800 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.5664647817611694, + "learning_rate": 0.0004462471667566109, + "loss": 3.5597, + "step": 23850 + }, + { + "epoch": 2.5768194070080863, + "grad_norm": 0.5963898301124573, + "learning_rate": 0.00044592336751214244, + "loss": 3.5427, + "step": 23900 + }, + { + "epoch": 2.582210242587601, + "grad_norm": 0.5585032105445862, + "learning_rate": 0.00044559956826767405, + "loss": 3.5489, + "step": 23950 + }, + { + "epoch": 2.5876010781671157, + "grad_norm": 0.6078963279724121, + "learning_rate": 0.00044527576902320554, + "loss": 3.532, + "step": 24000 + }, + { + "epoch": 2.5876010781671157, + "eval_accuracy": 0.36688788032449404, + "eval_loss": 3.531247854232788, + "eval_runtime": 185.6875, + "eval_samples_per_second": 96.996, + "eval_steps_per_second": 6.064, + "step": 24000 + }, + { + "epoch": 2.592991913746631, + "grad_norm": 0.5676623582839966, + "learning_rate": 0.00044495196977873715, + "loss": 3.5458, + "step": 24050 + }, + { + "epoch": 2.5983827493261455, + "grad_norm": 0.6343262195587158, + "learning_rate": 0.0004446281705342687, + "loss": 3.5379, + "step": 24100 + }, + { + "epoch": 2.6037735849056602, + "grad_norm": 0.5848167538642883, + "learning_rate": 0.00044430437128980025, + "loss": 3.5677, + "step": 24150 + }, + { + "epoch": 2.6091644204851754, + "grad_norm": 0.6168308854103088, + "learning_rate": 0.00044398057204533185, + "loss": 3.5298, + "step": 24200 + }, + { + "epoch": 2.61455525606469, + "grad_norm": 0.5785287022590637, + "learning_rate": 0.0004436567728008634, + "loss": 3.5411, + "step": 24250 + }, + { + "epoch": 2.6199460916442048, + "grad_norm": 0.5798066258430481, + "learning_rate": 0.000443332973556395, + "loss": 3.5491, + "step": 24300 + }, + { + "epoch": 2.62533692722372, + "grad_norm": 0.5612218379974365, + "learning_rate": 0.00044300917431192656, + "loss": 3.5649, + "step": 24350 + }, + { + "epoch": 2.6307277628032346, + "grad_norm": 0.5887892842292786, + "learning_rate": 0.00044268537506745816, + "loss": 3.5273, + "step": 24400 + }, + { + "epoch": 2.6361185983827493, + "grad_norm": 0.6022453308105469, + "learning_rate": 0.0004423615758229897, + "loss": 3.5447, + "step": 24450 + }, + { + "epoch": 2.641509433962264, + "grad_norm": 0.5853219032287598, + "learning_rate": 0.0004420377765785213, + "loss": 3.5591, + "step": 24500 + }, + { + "epoch": 2.6469002695417787, + "grad_norm": 0.5891228318214417, + "learning_rate": 0.00044171397733405287, + "loss": 3.5263, + "step": 24550 + }, + { + "epoch": 2.652291105121294, + "grad_norm": 0.6075050830841064, + "learning_rate": 0.00044139017808958437, + "loss": 3.5344, + "step": 24600 + }, + { + "epoch": 2.6576819407008085, + "grad_norm": 0.5453810095787048, + "learning_rate": 0.00044106637884511597, + "loss": 3.5361, + "step": 24650 + }, + { + "epoch": 2.6630727762803232, + "grad_norm": 0.6543773412704468, + "learning_rate": 0.0004407425796006475, + "loss": 3.5407, + "step": 24700 + }, + { + "epoch": 2.6684636118598384, + "grad_norm": 0.5871316194534302, + "learning_rate": 0.00044041878035617913, + "loss": 3.5404, + "step": 24750 + }, + { + "epoch": 2.673854447439353, + "grad_norm": 0.5868498086929321, + "learning_rate": 0.0004400949811117107, + "loss": 3.5162, + "step": 24800 + }, + { + "epoch": 2.6792452830188678, + "grad_norm": 0.5242236852645874, + "learning_rate": 0.0004397711818672423, + "loss": 3.5146, + "step": 24850 + }, + { + "epoch": 2.684636118598383, + "grad_norm": 0.5566399097442627, + "learning_rate": 0.00043944738262277383, + "loss": 3.5539, + "step": 24900 + }, + { + "epoch": 2.6900269541778976, + "grad_norm": 0.5947811603546143, + "learning_rate": 0.00043912358337830544, + "loss": 3.5379, + "step": 24950 + }, + { + "epoch": 2.6954177897574123, + "grad_norm": 0.5996220111846924, + "learning_rate": 0.000438799784133837, + "loss": 3.527, + "step": 25000 + }, + { + "epoch": 2.6954177897574123, + "eval_accuracy": 0.3672525194159994, + "eval_loss": 3.5217697620391846, + "eval_runtime": 185.7585, + "eval_samples_per_second": 96.959, + "eval_steps_per_second": 6.062, + "step": 25000 + }, + { + "epoch": 2.7008086253369274, + "grad_norm": 0.5733533501625061, + "learning_rate": 0.00043847598488936854, + "loss": 3.5383, + "step": 25050 + }, + { + "epoch": 2.706199460916442, + "grad_norm": 0.5578649044036865, + "learning_rate": 0.00043815218564490014, + "loss": 3.5233, + "step": 25100 + }, + { + "epoch": 2.711590296495957, + "grad_norm": 0.5936254858970642, + "learning_rate": 0.0004378283864004317, + "loss": 3.5137, + "step": 25150 + }, + { + "epoch": 2.7169811320754715, + "grad_norm": 0.591632068157196, + "learning_rate": 0.0004375045871559633, + "loss": 3.5193, + "step": 25200 + }, + { + "epoch": 2.7223719676549867, + "grad_norm": 0.6370254158973694, + "learning_rate": 0.00043718078791149485, + "loss": 3.5535, + "step": 25250 + }, + { + "epoch": 2.7277628032345014, + "grad_norm": 0.5756546258926392, + "learning_rate": 0.00043685698866702645, + "loss": 3.5101, + "step": 25300 + }, + { + "epoch": 2.733153638814016, + "grad_norm": 0.595561146736145, + "learning_rate": 0.00043653318942255795, + "loss": 3.5358, + "step": 25350 + }, + { + "epoch": 2.7385444743935308, + "grad_norm": 0.6051717400550842, + "learning_rate": 0.00043620939017808956, + "loss": 3.5376, + "step": 25400 + }, + { + "epoch": 2.743935309973046, + "grad_norm": 0.5473976731300354, + "learning_rate": 0.0004358855909336211, + "loss": 3.5087, + "step": 25450 + }, + { + "epoch": 2.7493261455525606, + "grad_norm": 0.6153759360313416, + "learning_rate": 0.00043556179168915266, + "loss": 3.5703, + "step": 25500 + }, + { + "epoch": 2.7547169811320753, + "grad_norm": 0.5839317440986633, + "learning_rate": 0.00043523799244468426, + "loss": 3.5587, + "step": 25550 + }, + { + "epoch": 2.7601078167115904, + "grad_norm": 0.6133059859275818, + "learning_rate": 0.0004349141932002158, + "loss": 3.5361, + "step": 25600 + }, + { + "epoch": 2.765498652291105, + "grad_norm": 0.6802798509597778, + "learning_rate": 0.0004345903939557474, + "loss": 3.5245, + "step": 25650 + }, + { + "epoch": 2.77088948787062, + "grad_norm": 0.5973533987998962, + "learning_rate": 0.00043426659471127897, + "loss": 3.5436, + "step": 25700 + }, + { + "epoch": 2.776280323450135, + "grad_norm": 0.6152960658073425, + "learning_rate": 0.0004339492714516999, + "loss": 3.5327, + "step": 25750 + }, + { + "epoch": 2.7816711590296497, + "grad_norm": 0.5712870359420776, + "learning_rate": 0.00043362547220723147, + "loss": 3.5337, + "step": 25800 + }, + { + "epoch": 2.7870619946091644, + "grad_norm": 0.5867605209350586, + "learning_rate": 0.0004333016729627631, + "loss": 3.5213, + "step": 25850 + }, + { + "epoch": 2.7924528301886795, + "grad_norm": 0.6075189113616943, + "learning_rate": 0.0004329778737182946, + "loss": 3.5308, + "step": 25900 + }, + { + "epoch": 2.797843665768194, + "grad_norm": 0.5805151462554932, + "learning_rate": 0.00043265407447382623, + "loss": 3.5207, + "step": 25950 + }, + { + "epoch": 2.803234501347709, + "grad_norm": 0.5881994962692261, + "learning_rate": 0.0004323302752293577, + "loss": 3.5333, + "step": 26000 + }, + { + "epoch": 2.803234501347709, + "eval_accuracy": 0.368511371774218, + "eval_loss": 3.51200795173645, + "eval_runtime": 185.141, + "eval_samples_per_second": 97.283, + "eval_steps_per_second": 6.082, + "step": 26000 + }, + { + "epoch": 2.8086253369272236, + "grad_norm": 0.5690694451332092, + "learning_rate": 0.00043200647598488933, + "loss": 3.5296, + "step": 26050 + }, + { + "epoch": 2.8140161725067383, + "grad_norm": 0.5980532169342041, + "learning_rate": 0.0004316826767404209, + "loss": 3.5044, + "step": 26100 + }, + { + "epoch": 2.8194070080862534, + "grad_norm": 0.5740624070167542, + "learning_rate": 0.0004313588774959525, + "loss": 3.5082, + "step": 26150 + }, + { + "epoch": 2.824797843665768, + "grad_norm": 0.6146470904350281, + "learning_rate": 0.00043103507825148404, + "loss": 3.532, + "step": 26200 + }, + { + "epoch": 2.830188679245283, + "grad_norm": 0.5921385288238525, + "learning_rate": 0.0004307112790070156, + "loss": 3.5445, + "step": 26250 + }, + { + "epoch": 2.835579514824798, + "grad_norm": 0.5627673268318176, + "learning_rate": 0.0004303874797625472, + "loss": 3.5604, + "step": 26300 + }, + { + "epoch": 2.8409703504043127, + "grad_norm": 0.6661468744277954, + "learning_rate": 0.00043006368051807874, + "loss": 3.5291, + "step": 26350 + }, + { + "epoch": 2.8463611859838274, + "grad_norm": 0.6103497743606567, + "learning_rate": 0.00042973988127361035, + "loss": 3.5391, + "step": 26400 + }, + { + "epoch": 2.8517520215633425, + "grad_norm": 0.6032962799072266, + "learning_rate": 0.0004294160820291419, + "loss": 3.545, + "step": 26450 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.5415673851966858, + "learning_rate": 0.0004290922827846735, + "loss": 3.5161, + "step": 26500 + }, + { + "epoch": 2.862533692722372, + "grad_norm": 0.5705322027206421, + "learning_rate": 0.00042876848354020505, + "loss": 3.5323, + "step": 26550 + }, + { + "epoch": 2.867924528301887, + "grad_norm": 0.5669793486595154, + "learning_rate": 0.00042844468429573655, + "loss": 3.5407, + "step": 26600 + }, + { + "epoch": 2.8733153638814017, + "grad_norm": 0.605086624622345, + "learning_rate": 0.00042812088505126815, + "loss": 3.5466, + "step": 26650 + }, + { + "epoch": 2.8787061994609164, + "grad_norm": 0.5618116855621338, + "learning_rate": 0.0004277970858067997, + "loss": 3.5318, + "step": 26700 + }, + { + "epoch": 2.884097035040431, + "grad_norm": 0.6186133027076721, + "learning_rate": 0.0004274732865623313, + "loss": 3.5432, + "step": 26750 + }, + { + "epoch": 2.889487870619946, + "grad_norm": 0.5849158763885498, + "learning_rate": 0.00042714948731786286, + "loss": 3.5203, + "step": 26800 + }, + { + "epoch": 2.894878706199461, + "grad_norm": 0.6805973052978516, + "learning_rate": 0.00042682568807339447, + "loss": 3.5437, + "step": 26850 + }, + { + "epoch": 2.9002695417789757, + "grad_norm": 0.6036573648452759, + "learning_rate": 0.000426501888828926, + "loss": 3.5082, + "step": 26900 + }, + { + "epoch": 2.9056603773584904, + "grad_norm": 0.6047782301902771, + "learning_rate": 0.0004261780895844576, + "loss": 3.5299, + "step": 26950 + }, + { + "epoch": 2.9110512129380055, + "grad_norm": 0.5614576935768127, + "learning_rate": 0.00042585429033998917, + "loss": 3.5442, + "step": 27000 + }, + { + "epoch": 2.9110512129380055, + "eval_accuracy": 0.3696261504031946, + "eval_loss": 3.5053839683532715, + "eval_runtime": 185.4428, + "eval_samples_per_second": 97.124, + "eval_steps_per_second": 6.072, + "step": 27000 + }, + { + "epoch": 2.91644204851752, + "grad_norm": 0.6214551329612732, + "learning_rate": 0.0004255304910955207, + "loss": 3.5288, + "step": 27050 + }, + { + "epoch": 2.921832884097035, + "grad_norm": 0.5811451077461243, + "learning_rate": 0.0004252066918510523, + "loss": 3.5338, + "step": 27100 + }, + { + "epoch": 2.92722371967655, + "grad_norm": 0.6257526874542236, + "learning_rate": 0.0004248828926065839, + "loss": 3.5296, + "step": 27150 + }, + { + "epoch": 2.9326145552560647, + "grad_norm": 0.6285461187362671, + "learning_rate": 0.0004245590933621155, + "loss": 3.5139, + "step": 27200 + }, + { + "epoch": 2.9380053908355794, + "grad_norm": 0.549082338809967, + "learning_rate": 0.00042423529411764703, + "loss": 3.5302, + "step": 27250 + }, + { + "epoch": 2.9433962264150946, + "grad_norm": 0.5633655786514282, + "learning_rate": 0.00042391149487317864, + "loss": 3.5334, + "step": 27300 + }, + { + "epoch": 2.9487870619946093, + "grad_norm": 0.5984169244766235, + "learning_rate": 0.00042358769562871013, + "loss": 3.5178, + "step": 27350 + }, + { + "epoch": 2.954177897574124, + "grad_norm": 0.6494591236114502, + "learning_rate": 0.00042326389638424174, + "loss": 3.5112, + "step": 27400 + }, + { + "epoch": 2.9595687331536387, + "grad_norm": 0.5794746279716492, + "learning_rate": 0.0004229400971397733, + "loss": 3.5172, + "step": 27450 + }, + { + "epoch": 2.964959568733154, + "grad_norm": 0.5832740068435669, + "learning_rate": 0.00042261629789530484, + "loss": 3.5293, + "step": 27500 + }, + { + "epoch": 2.9703504043126685, + "grad_norm": 0.5642590522766113, + "learning_rate": 0.00042229249865083644, + "loss": 3.5181, + "step": 27550 + }, + { + "epoch": 2.975741239892183, + "grad_norm": 0.5790451765060425, + "learning_rate": 0.000421968699406368, + "loss": 3.5252, + "step": 27600 + }, + { + "epoch": 2.981132075471698, + "grad_norm": 0.6082571744918823, + "learning_rate": 0.0004216449001618996, + "loss": 3.5057, + "step": 27650 + }, + { + "epoch": 2.986522911051213, + "grad_norm": 0.5873731374740601, + "learning_rate": 0.00042132110091743115, + "loss": 3.5168, + "step": 27700 + }, + { + "epoch": 2.9919137466307277, + "grad_norm": 0.6429144144058228, + "learning_rate": 0.00042099730167296275, + "loss": 3.5223, + "step": 27750 + }, + { + "epoch": 2.9973045822102424, + "grad_norm": 0.642909586429596, + "learning_rate": 0.00042067997841338365, + "loss": 3.5247, + "step": 27800 + }, + { + "epoch": 3.0026954177897576, + "grad_norm": 0.6168606281280518, + "learning_rate": 0.00042035617916891526, + "loss": 3.4874, + "step": 27850 + }, + { + "epoch": 3.0080862533692723, + "grad_norm": 0.6800719499588013, + "learning_rate": 0.0004200323799244468, + "loss": 3.4281, + "step": 27900 + }, + { + "epoch": 3.013477088948787, + "grad_norm": 0.57563316822052, + "learning_rate": 0.0004197085806799784, + "loss": 3.4331, + "step": 27950 + }, + { + "epoch": 3.018867924528302, + "grad_norm": 0.607589602470398, + "learning_rate": 0.0004193847814355099, + "loss": 3.4451, + "step": 28000 + }, + { + "epoch": 3.018867924528302, + "eval_accuracy": 0.3699707973633421, + "eval_loss": 3.5011649131774902, + "eval_runtime": 185.7113, + "eval_samples_per_second": 96.984, + "eval_steps_per_second": 6.063, + "step": 28000 + }, + { + "epoch": 3.024258760107817, + "grad_norm": 0.6603455543518066, + "learning_rate": 0.0004190609821910415, + "loss": 3.4317, + "step": 28050 + }, + { + "epoch": 3.0296495956873315, + "grad_norm": 0.63117516040802, + "learning_rate": 0.00041873718294657306, + "loss": 3.4264, + "step": 28100 + }, + { + "epoch": 3.035040431266846, + "grad_norm": 0.6193031072616577, + "learning_rate": 0.00041841338370210467, + "loss": 3.466, + "step": 28150 + }, + { + "epoch": 3.0404312668463613, + "grad_norm": 0.6265783905982971, + "learning_rate": 0.0004180895844576362, + "loss": 3.4467, + "step": 28200 + }, + { + "epoch": 3.045822102425876, + "grad_norm": 0.5731920003890991, + "learning_rate": 0.00041776578521316777, + "loss": 3.4543, + "step": 28250 + }, + { + "epoch": 3.0512129380053907, + "grad_norm": 0.6172985434532166, + "learning_rate": 0.0004174419859686994, + "loss": 3.4541, + "step": 28300 + }, + { + "epoch": 3.056603773584906, + "grad_norm": 0.5971702337265015, + "learning_rate": 0.0004171181867242309, + "loss": 3.4452, + "step": 28350 + }, + { + "epoch": 3.0619946091644206, + "grad_norm": 0.6083709001541138, + "learning_rate": 0.00041679438747976253, + "loss": 3.4237, + "step": 28400 + }, + { + "epoch": 3.0673854447439353, + "grad_norm": 0.595926821231842, + "learning_rate": 0.0004164705882352941, + "loss": 3.4655, + "step": 28450 + }, + { + "epoch": 3.07277628032345, + "grad_norm": 0.6328946352005005, + "learning_rate": 0.0004161467889908257, + "loss": 3.4581, + "step": 28500 + }, + { + "epoch": 3.078167115902965, + "grad_norm": 0.62444669008255, + "learning_rate": 0.00041582298974635724, + "loss": 3.4453, + "step": 28550 + }, + { + "epoch": 3.08355795148248, + "grad_norm": 0.5992231965065002, + "learning_rate": 0.00041549919050188884, + "loss": 3.442, + "step": 28600 + }, + { + "epoch": 3.0889487870619945, + "grad_norm": 0.6712383031845093, + "learning_rate": 0.00041517539125742034, + "loss": 3.4401, + "step": 28650 + }, + { + "epoch": 3.0943396226415096, + "grad_norm": 0.58417809009552, + "learning_rate": 0.0004148515920129519, + "loss": 3.4495, + "step": 28700 + }, + { + "epoch": 3.0997304582210243, + "grad_norm": 0.6253139972686768, + "learning_rate": 0.0004145277927684835, + "loss": 3.4385, + "step": 28750 + }, + { + "epoch": 3.105121293800539, + "grad_norm": 0.5860809683799744, + "learning_rate": 0.00041420399352401504, + "loss": 3.4427, + "step": 28800 + }, + { + "epoch": 3.1105121293800537, + "grad_norm": 0.6193458437919617, + "learning_rate": 0.00041388019427954665, + "loss": 3.4264, + "step": 28850 + }, + { + "epoch": 3.115902964959569, + "grad_norm": 0.5895789861679077, + "learning_rate": 0.0004135563950350782, + "loss": 3.4446, + "step": 28900 + }, + { + "epoch": 3.1212938005390836, + "grad_norm": 0.6106541156768799, + "learning_rate": 0.0004132325957906098, + "loss": 3.4618, + "step": 28950 + }, + { + "epoch": 3.1266846361185983, + "grad_norm": 0.6361096501350403, + "learning_rate": 0.00041290879654614135, + "loss": 3.4295, + "step": 29000 + }, + { + "epoch": 3.1266846361185983, + "eval_accuracy": 0.3709342225195931, + "eval_loss": 3.496896266937256, + "eval_runtime": 184.8429, + "eval_samples_per_second": 97.439, + "eval_steps_per_second": 6.092, + "step": 29000 + }, + { + "epoch": 3.1320754716981134, + "grad_norm": 0.665164589881897, + "learning_rate": 0.00041258499730167296, + "loss": 3.4525, + "step": 29050 + }, + { + "epoch": 3.137466307277628, + "grad_norm": 0.6416609883308411, + "learning_rate": 0.0004122611980572045, + "loss": 3.4413, + "step": 29100 + }, + { + "epoch": 3.142857142857143, + "grad_norm": 0.6141297817230225, + "learning_rate": 0.00041193739881273606, + "loss": 3.4591, + "step": 29150 + }, + { + "epoch": 3.1482479784366575, + "grad_norm": 0.5525941848754883, + "learning_rate": 0.00041161359956826766, + "loss": 3.4582, + "step": 29200 + }, + { + "epoch": 3.1536388140161726, + "grad_norm": 0.5938381552696228, + "learning_rate": 0.0004112898003237992, + "loss": 3.4456, + "step": 29250 + }, + { + "epoch": 3.1590296495956873, + "grad_norm": 0.6277801990509033, + "learning_rate": 0.0004109660010793308, + "loss": 3.4546, + "step": 29300 + }, + { + "epoch": 3.164420485175202, + "grad_norm": 0.6602087616920471, + "learning_rate": 0.0004106422018348623, + "loss": 3.4666, + "step": 29350 + }, + { + "epoch": 3.169811320754717, + "grad_norm": 0.5656922459602356, + "learning_rate": 0.0004103184025903939, + "loss": 3.458, + "step": 29400 + }, + { + "epoch": 3.175202156334232, + "grad_norm": 0.605190634727478, + "learning_rate": 0.00040999460334592547, + "loss": 3.4687, + "step": 29450 + }, + { + "epoch": 3.1805929919137466, + "grad_norm": 0.6249133944511414, + "learning_rate": 0.000409670804101457, + "loss": 3.465, + "step": 29500 + }, + { + "epoch": 3.1859838274932613, + "grad_norm": 0.5883227586746216, + "learning_rate": 0.00040934700485698863, + "loss": 3.4413, + "step": 29550 + }, + { + "epoch": 3.1913746630727764, + "grad_norm": 0.5973005294799805, + "learning_rate": 0.0004090232056125202, + "loss": 3.4628, + "step": 29600 + }, + { + "epoch": 3.196765498652291, + "grad_norm": 0.5894708633422852, + "learning_rate": 0.0004086994063680518, + "loss": 3.4505, + "step": 29650 + }, + { + "epoch": 3.202156334231806, + "grad_norm": 0.5822822451591492, + "learning_rate": 0.00040837560712358333, + "loss": 3.4559, + "step": 29700 + }, + { + "epoch": 3.207547169811321, + "grad_norm": 0.6240995526313782, + "learning_rate": 0.00040805180787911494, + "loss": 3.4638, + "step": 29750 + }, + { + "epoch": 3.2129380053908356, + "grad_norm": 0.5901066064834595, + "learning_rate": 0.0004077344846195359, + "loss": 3.4456, + "step": 29800 + }, + { + "epoch": 3.2183288409703503, + "grad_norm": 0.6369420289993286, + "learning_rate": 0.00040741068537506744, + "loss": 3.467, + "step": 29850 + }, + { + "epoch": 3.223719676549865, + "grad_norm": 0.6294394135475159, + "learning_rate": 0.000407086886130599, + "loss": 3.4735, + "step": 29900 + }, + { + "epoch": 3.22911051212938, + "grad_norm": 0.6184892654418945, + "learning_rate": 0.0004067630868861306, + "loss": 3.4679, + "step": 29950 + }, + { + "epoch": 3.234501347708895, + "grad_norm": 0.6455713510513306, + "learning_rate": 0.0004064392876416621, + "loss": 3.4506, + "step": 30000 + }, + { + "epoch": 3.234501347708895, + "eval_accuracy": 0.3713301536427891, + "eval_loss": 3.490234613418579, + "eval_runtime": 185.1295, + "eval_samples_per_second": 97.289, + "eval_steps_per_second": 6.082, + "step": 30000 + }, + { + "epoch": 3.2398921832884096, + "grad_norm": 0.642624020576477, + "learning_rate": 0.0004061154883971937, + "loss": 3.4476, + "step": 30050 + }, + { + "epoch": 3.2452830188679247, + "grad_norm": 0.626826822757721, + "learning_rate": 0.00040579168915272525, + "loss": 3.4482, + "step": 30100 + }, + { + "epoch": 3.2506738544474394, + "grad_norm": 0.5784972310066223, + "learning_rate": 0.00040546788990825685, + "loss": 3.4517, + "step": 30150 + }, + { + "epoch": 3.256064690026954, + "grad_norm": 0.64665287733078, + "learning_rate": 0.0004051440906637884, + "loss": 3.4719, + "step": 30200 + }, + { + "epoch": 3.2614555256064692, + "grad_norm": 0.6225249171257019, + "learning_rate": 0.00040482029141931995, + "loss": 3.464, + "step": 30250 + }, + { + "epoch": 3.266846361185984, + "grad_norm": 0.7104143500328064, + "learning_rate": 0.00040449649217485156, + "loss": 3.465, + "step": 30300 + }, + { + "epoch": 3.2722371967654986, + "grad_norm": 0.6647838950157166, + "learning_rate": 0.0004041726929303831, + "loss": 3.4433, + "step": 30350 + }, + { + "epoch": 3.2776280323450133, + "grad_norm": 0.678033173084259, + "learning_rate": 0.0004038488936859147, + "loss": 3.4726, + "step": 30400 + }, + { + "epoch": 3.2830188679245285, + "grad_norm": 0.6125953197479248, + "learning_rate": 0.00040352509444144626, + "loss": 3.4584, + "step": 30450 + }, + { + "epoch": 3.288409703504043, + "grad_norm": 0.5959815979003906, + "learning_rate": 0.00040320129519697787, + "loss": 3.4427, + "step": 30500 + }, + { + "epoch": 3.293800539083558, + "grad_norm": 0.6709897518157959, + "learning_rate": 0.0004028774959525094, + "loss": 3.4719, + "step": 30550 + }, + { + "epoch": 3.2991913746630726, + "grad_norm": 0.7089282870292664, + "learning_rate": 0.000402553696708041, + "loss": 3.4638, + "step": 30600 + }, + { + "epoch": 3.3045822102425877, + "grad_norm": 0.6118188500404358, + "learning_rate": 0.0004022298974635726, + "loss": 3.4443, + "step": 30650 + }, + { + "epoch": 3.3099730458221024, + "grad_norm": 0.6362579464912415, + "learning_rate": 0.00040190609821910407, + "loss": 3.4604, + "step": 30700 + }, + { + "epoch": 3.315363881401617, + "grad_norm": 0.6267250776290894, + "learning_rate": 0.0004015822989746357, + "loss": 3.4298, + "step": 30750 + }, + { + "epoch": 3.3207547169811322, + "grad_norm": 0.5901178121566772, + "learning_rate": 0.0004012584997301672, + "loss": 3.4663, + "step": 30800 + }, + { + "epoch": 3.326145552560647, + "grad_norm": 0.6500908732414246, + "learning_rate": 0.00040093470048569883, + "loss": 3.446, + "step": 30850 + }, + { + "epoch": 3.3315363881401616, + "grad_norm": 0.6370844841003418, + "learning_rate": 0.0004006109012412304, + "loss": 3.4689, + "step": 30900 + }, + { + "epoch": 3.3369272237196768, + "grad_norm": 0.6281841993331909, + "learning_rate": 0.000400287101996762, + "loss": 3.4456, + "step": 30950 + }, + { + "epoch": 3.3423180592991915, + "grad_norm": 0.6539614200592041, + "learning_rate": 0.00039996330275229354, + "loss": 3.4582, + "step": 31000 + }, + { + "epoch": 3.3423180592991915, + "eval_accuracy": 0.37187863342047656, + "eval_loss": 3.48494291305542, + "eval_runtime": 184.9741, + "eval_samples_per_second": 97.37, + "eval_steps_per_second": 6.087, + "step": 31000 + }, + { + "epoch": 3.347708894878706, + "grad_norm": 0.8110284209251404, + "learning_rate": 0.00039963950350782514, + "loss": 3.4593, + "step": 31050 + }, + { + "epoch": 3.353099730458221, + "grad_norm": 0.6377867460250854, + "learning_rate": 0.0003993157042633567, + "loss": 3.4541, + "step": 31100 + }, + { + "epoch": 3.358490566037736, + "grad_norm": 0.6073828935623169, + "learning_rate": 0.00039899190501888824, + "loss": 3.4652, + "step": 31150 + }, + { + "epoch": 3.3638814016172507, + "grad_norm": 0.6192326545715332, + "learning_rate": 0.00039866810577441985, + "loss": 3.4487, + "step": 31200 + }, + { + "epoch": 3.3692722371967654, + "grad_norm": 0.6012443900108337, + "learning_rate": 0.0003983443065299514, + "loss": 3.4543, + "step": 31250 + }, + { + "epoch": 3.37466307277628, + "grad_norm": 0.5912206172943115, + "learning_rate": 0.000398020507285483, + "loss": 3.4795, + "step": 31300 + }, + { + "epoch": 3.3800539083557952, + "grad_norm": 0.615149974822998, + "learning_rate": 0.0003976967080410145, + "loss": 3.4505, + "step": 31350 + }, + { + "epoch": 3.38544474393531, + "grad_norm": 0.6301441192626953, + "learning_rate": 0.0003973729087965461, + "loss": 3.4511, + "step": 31400 + }, + { + "epoch": 3.3908355795148246, + "grad_norm": 0.6270235180854797, + "learning_rate": 0.00039704910955207765, + "loss": 3.4568, + "step": 31450 + }, + { + "epoch": 3.3962264150943398, + "grad_norm": 0.5964970588684082, + "learning_rate": 0.00039672531030760926, + "loss": 3.449, + "step": 31500 + }, + { + "epoch": 3.4016172506738545, + "grad_norm": 0.6153132319450378, + "learning_rate": 0.0003964015110631408, + "loss": 3.4681, + "step": 31550 + }, + { + "epoch": 3.407008086253369, + "grad_norm": 0.6561875343322754, + "learning_rate": 0.00039607771181867236, + "loss": 3.4452, + "step": 31600 + }, + { + "epoch": 3.4123989218328843, + "grad_norm": 0.6051456332206726, + "learning_rate": 0.00039575391257420397, + "loss": 3.4482, + "step": 31650 + }, + { + "epoch": 3.417789757412399, + "grad_norm": 0.6496569514274597, + "learning_rate": 0.0003954301133297355, + "loss": 3.4746, + "step": 31700 + }, + { + "epoch": 3.4231805929919137, + "grad_norm": 0.6450279355049133, + "learning_rate": 0.0003951063140852671, + "loss": 3.4468, + "step": 31750 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 0.6020802855491638, + "learning_rate": 0.00039478251484079867, + "loss": 3.4509, + "step": 31800 + }, + { + "epoch": 3.4339622641509435, + "grad_norm": 0.6902250647544861, + "learning_rate": 0.0003944587155963303, + "loss": 3.4817, + "step": 31850 + }, + { + "epoch": 3.439353099730458, + "grad_norm": 0.6622137427330017, + "learning_rate": 0.0003941349163518618, + "loss": 3.4645, + "step": 31900 + }, + { + "epoch": 3.444743935309973, + "grad_norm": 0.6333610415458679, + "learning_rate": 0.0003938175930922828, + "loss": 3.4427, + "step": 31950 + }, + { + "epoch": 3.450134770889488, + "grad_norm": 0.6233834624290466, + "learning_rate": 0.0003934937938478143, + "loss": 3.4597, + "step": 32000 + }, + { + "epoch": 3.450134770889488, + "eval_accuracy": 0.3733303446545658, + "eval_loss": 3.4757399559020996, + "eval_runtime": 185.0469, + "eval_samples_per_second": 97.332, + "eval_steps_per_second": 6.085, + "step": 32000 + }, + { + "epoch": 3.4555256064690028, + "grad_norm": 0.6415245532989502, + "learning_rate": 0.0003931699946033459, + "loss": 3.4613, + "step": 32050 + }, + { + "epoch": 3.4609164420485174, + "grad_norm": 0.6769406199455261, + "learning_rate": 0.00039284619535887743, + "loss": 3.4545, + "step": 32100 + }, + { + "epoch": 3.466307277628032, + "grad_norm": 0.6122919321060181, + "learning_rate": 0.00039252239611440904, + "loss": 3.4669, + "step": 32150 + }, + { + "epoch": 3.4716981132075473, + "grad_norm": 0.6024535894393921, + "learning_rate": 0.0003921985968699406, + "loss": 3.4484, + "step": 32200 + }, + { + "epoch": 3.477088948787062, + "grad_norm": 0.6409429311752319, + "learning_rate": 0.0003918747976254722, + "loss": 3.4661, + "step": 32250 + }, + { + "epoch": 3.4824797843665767, + "grad_norm": 0.6143214106559753, + "learning_rate": 0.00039155099838100374, + "loss": 3.4796, + "step": 32300 + }, + { + "epoch": 3.487870619946092, + "grad_norm": 0.5818310976028442, + "learning_rate": 0.0003912271991365353, + "loss": 3.4407, + "step": 32350 + }, + { + "epoch": 3.4932614555256065, + "grad_norm": 0.6170235276222229, + "learning_rate": 0.0003909033998920669, + "loss": 3.4722, + "step": 32400 + }, + { + "epoch": 3.498652291105121, + "grad_norm": 0.6212165951728821, + "learning_rate": 0.00039057960064759845, + "loss": 3.446, + "step": 32450 + }, + { + "epoch": 3.5040431266846364, + "grad_norm": 0.6115076541900635, + "learning_rate": 0.00039025580140313005, + "loss": 3.4707, + "step": 32500 + }, + { + "epoch": 3.509433962264151, + "grad_norm": 0.5680184960365295, + "learning_rate": 0.0003899320021586616, + "loss": 3.4612, + "step": 32550 + }, + { + "epoch": 3.5148247978436657, + "grad_norm": 0.6204341650009155, + "learning_rate": 0.0003896082029141932, + "loss": 3.4577, + "step": 32600 + }, + { + "epoch": 3.5202156334231804, + "grad_norm": 0.6625291705131531, + "learning_rate": 0.00038928440366972476, + "loss": 3.4669, + "step": 32650 + }, + { + "epoch": 3.525606469002695, + "grad_norm": 0.6184507608413696, + "learning_rate": 0.00038896060442525636, + "loss": 3.4411, + "step": 32700 + }, + { + "epoch": 3.5309973045822103, + "grad_norm": 0.6391364932060242, + "learning_rate": 0.00038863680518078786, + "loss": 3.4536, + "step": 32750 + }, + { + "epoch": 3.536388140161725, + "grad_norm": 0.6026434898376465, + "learning_rate": 0.0003883130059363194, + "loss": 3.4641, + "step": 32800 + }, + { + "epoch": 3.5417789757412397, + "grad_norm": 0.642951250076294, + "learning_rate": 0.000387989206691851, + "loss": 3.4257, + "step": 32850 + }, + { + "epoch": 3.547169811320755, + "grad_norm": 0.6193267107009888, + "learning_rate": 0.00038766540744738256, + "loss": 3.4491, + "step": 32900 + }, + { + "epoch": 3.5525606469002695, + "grad_norm": 0.6317867636680603, + "learning_rate": 0.00038734160820291417, + "loss": 3.4632, + "step": 32950 + }, + { + "epoch": 3.557951482479784, + "grad_norm": 0.654092013835907, + "learning_rate": 0.0003870178089584457, + "loss": 3.4639, + "step": 33000 + }, + { + "epoch": 3.557951482479784, + "eval_accuracy": 0.37359350194885255, + "eval_loss": 3.4721765518188477, + "eval_runtime": 185.0176, + "eval_samples_per_second": 97.347, + "eval_steps_per_second": 6.086, + "step": 33000 + }, + { + "epoch": 3.5633423180592994, + "grad_norm": 0.5980980396270752, + "learning_rate": 0.0003866940097139773, + "loss": 3.4638, + "step": 33050 + }, + { + "epoch": 3.568733153638814, + "grad_norm": 0.671788215637207, + "learning_rate": 0.0003863702104695089, + "loss": 3.4525, + "step": 33100 + }, + { + "epoch": 3.5741239892183287, + "grad_norm": 0.6702181100845337, + "learning_rate": 0.0003860464112250404, + "loss": 3.4601, + "step": 33150 + }, + { + "epoch": 3.579514824797844, + "grad_norm": 0.6230493187904358, + "learning_rate": 0.00038572261198057203, + "loss": 3.4616, + "step": 33200 + }, + { + "epoch": 3.5849056603773586, + "grad_norm": 0.6334137916564941, + "learning_rate": 0.0003853988127361036, + "loss": 3.4536, + "step": 33250 + }, + { + "epoch": 3.5902964959568733, + "grad_norm": 0.6252257227897644, + "learning_rate": 0.0003850750134916352, + "loss": 3.4431, + "step": 33300 + }, + { + "epoch": 3.595687331536388, + "grad_norm": 0.5797781348228455, + "learning_rate": 0.0003847512142471667, + "loss": 3.454, + "step": 33350 + }, + { + "epoch": 3.601078167115903, + "grad_norm": 0.6160080432891846, + "learning_rate": 0.0003844274150026983, + "loss": 3.4549, + "step": 33400 + }, + { + "epoch": 3.606469002695418, + "grad_norm": 0.6341015696525574, + "learning_rate": 0.00038410361575822984, + "loss": 3.4639, + "step": 33450 + }, + { + "epoch": 3.6118598382749325, + "grad_norm": 0.6332781910896301, + "learning_rate": 0.00038377981651376144, + "loss": 3.4719, + "step": 33500 + }, + { + "epoch": 3.617250673854447, + "grad_norm": 0.6350259184837341, + "learning_rate": 0.000383456017269293, + "loss": 3.4501, + "step": 33550 + }, + { + "epoch": 3.6226415094339623, + "grad_norm": 0.6050942540168762, + "learning_rate": 0.00038313221802482454, + "loss": 3.4529, + "step": 33600 + }, + { + "epoch": 3.628032345013477, + "grad_norm": 0.679898202419281, + "learning_rate": 0.00038280841878035615, + "loss": 3.4496, + "step": 33650 + }, + { + "epoch": 3.6334231805929917, + "grad_norm": 0.6030776500701904, + "learning_rate": 0.0003824846195358877, + "loss": 3.438, + "step": 33700 + }, + { + "epoch": 3.638814016172507, + "grad_norm": 0.6465497016906738, + "learning_rate": 0.0003821608202914193, + "loss": 3.46, + "step": 33750 + }, + { + "epoch": 3.6442048517520216, + "grad_norm": 0.6347126960754395, + "learning_rate": 0.00038183702104695085, + "loss": 3.4418, + "step": 33800 + }, + { + "epoch": 3.6495956873315363, + "grad_norm": 0.6211069822311401, + "learning_rate": 0.00038151322180248246, + "loss": 3.4722, + "step": 33850 + }, + { + "epoch": 3.6549865229110514, + "grad_norm": 0.6023736000061035, + "learning_rate": 0.000381189422558014, + "loss": 3.4463, + "step": 33900 + }, + { + "epoch": 3.660377358490566, + "grad_norm": 0.6519888043403625, + "learning_rate": 0.00038087209929843496, + "loss": 3.4477, + "step": 33950 + }, + { + "epoch": 3.665768194070081, + "grad_norm": 0.6298272013664246, + "learning_rate": 0.00038054830005396646, + "loss": 3.4602, + "step": 34000 + }, + { + "epoch": 3.665768194070081, + "eval_accuracy": 0.3735198352909143, + "eval_loss": 3.4653146266937256, + "eval_runtime": 185.1109, + "eval_samples_per_second": 97.298, + "eval_steps_per_second": 6.083, + "step": 34000 + }, + { + "epoch": 3.671159029649596, + "grad_norm": 0.6116703152656555, + "learning_rate": 0.00038022450080949806, + "loss": 3.4527, + "step": 34050 + }, + { + "epoch": 3.6765498652291106, + "grad_norm": 0.6061373949050903, + "learning_rate": 0.0003799007015650296, + "loss": 3.4695, + "step": 34100 + }, + { + "epoch": 3.6819407008086253, + "grad_norm": 0.6899125576019287, + "learning_rate": 0.0003795769023205612, + "loss": 3.4656, + "step": 34150 + }, + { + "epoch": 3.68733153638814, + "grad_norm": 0.6152036190032959, + "learning_rate": 0.00037925310307609277, + "loss": 3.4485, + "step": 34200 + }, + { + "epoch": 3.6927223719676547, + "grad_norm": 0.6439741253852844, + "learning_rate": 0.0003789293038316244, + "loss": 3.4411, + "step": 34250 + }, + { + "epoch": 3.69811320754717, + "grad_norm": 0.5817996263504028, + "learning_rate": 0.0003786055045871559, + "loss": 3.4418, + "step": 34300 + }, + { + "epoch": 3.7035040431266846, + "grad_norm": 0.633815348148346, + "learning_rate": 0.0003782817053426875, + "loss": 3.446, + "step": 34350 + }, + { + "epoch": 3.7088948787061993, + "grad_norm": 0.593890905380249, + "learning_rate": 0.0003779579060982191, + "loss": 3.4419, + "step": 34400 + }, + { + "epoch": 3.7142857142857144, + "grad_norm": 0.6376861929893494, + "learning_rate": 0.00037763410685375063, + "loss": 3.4648, + "step": 34450 + }, + { + "epoch": 3.719676549865229, + "grad_norm": 0.616452157497406, + "learning_rate": 0.00037731030760928223, + "loss": 3.4457, + "step": 34500 + }, + { + "epoch": 3.725067385444744, + "grad_norm": 0.6425932049751282, + "learning_rate": 0.0003769865083648138, + "loss": 3.4385, + "step": 34550 + }, + { + "epoch": 3.730458221024259, + "grad_norm": 0.6993752717971802, + "learning_rate": 0.0003766627091203454, + "loss": 3.4588, + "step": 34600 + }, + { + "epoch": 3.7358490566037736, + "grad_norm": 0.6306095123291016, + "learning_rate": 0.00037633890987587694, + "loss": 3.4574, + "step": 34650 + }, + { + "epoch": 3.7412398921832883, + "grad_norm": 0.6183214783668518, + "learning_rate": 0.00037602158661629784, + "loss": 3.4664, + "step": 34700 + }, + { + "epoch": 3.7466307277628035, + "grad_norm": 0.616973340511322, + "learning_rate": 0.0003756977873718294, + "loss": 3.4632, + "step": 34750 + }, + { + "epoch": 3.752021563342318, + "grad_norm": 0.64264976978302, + "learning_rate": 0.000375373988127361, + "loss": 3.4428, + "step": 34800 + }, + { + "epoch": 3.757412398921833, + "grad_norm": 0.6926430463790894, + "learning_rate": 0.00037505018888289254, + "loss": 3.4469, + "step": 34850 + }, + { + "epoch": 3.7628032345013476, + "grad_norm": 0.6002678871154785, + "learning_rate": 0.00037472638963842415, + "loss": 3.4612, + "step": 34900 + }, + { + "epoch": 3.7681940700808623, + "grad_norm": 0.7651305794715881, + "learning_rate": 0.0003744025903939557, + "loss": 3.454, + "step": 34950 + }, + { + "epoch": 3.7735849056603774, + "grad_norm": 0.6672982573509216, + "learning_rate": 0.0003740787911494873, + "loss": 3.4613, + "step": 35000 + }, + { + "epoch": 3.7735849056603774, + "eval_accuracy": 0.37492178350238453, + "eval_loss": 3.4591267108917236, + "eval_runtime": 185.1296, + "eval_samples_per_second": 97.289, + "eval_steps_per_second": 6.082, + "step": 35000 + }, + { + "epoch": 3.778975741239892, + "grad_norm": 0.6062513589859009, + "learning_rate": 0.00037375499190501885, + "loss": 3.47, + "step": 35050 + }, + { + "epoch": 3.784366576819407, + "grad_norm": 0.6883726716041565, + "learning_rate": 0.0003734311926605504, + "loss": 3.4365, + "step": 35100 + }, + { + "epoch": 3.789757412398922, + "grad_norm": 0.6614878177642822, + "learning_rate": 0.000373107393416082, + "loss": 3.4592, + "step": 35150 + }, + { + "epoch": 3.7951482479784366, + "grad_norm": 0.6300371289253235, + "learning_rate": 0.00037278359417161356, + "loss": 3.4572, + "step": 35200 + }, + { + "epoch": 3.8005390835579513, + "grad_norm": 0.6380262970924377, + "learning_rate": 0.00037245979492714517, + "loss": 3.4392, + "step": 35250 + }, + { + "epoch": 3.8059299191374665, + "grad_norm": 0.6063967943191528, + "learning_rate": 0.0003721359956826767, + "loss": 3.4528, + "step": 35300 + }, + { + "epoch": 3.811320754716981, + "grad_norm": 0.6829068064689636, + "learning_rate": 0.0003718121964382083, + "loss": 3.4549, + "step": 35350 + }, + { + "epoch": 3.816711590296496, + "grad_norm": 0.6208524703979492, + "learning_rate": 0.0003714883971937398, + "loss": 3.4547, + "step": 35400 + }, + { + "epoch": 3.822102425876011, + "grad_norm": 0.6724206209182739, + "learning_rate": 0.0003711645979492714, + "loss": 3.4494, + "step": 35450 + }, + { + "epoch": 3.8274932614555257, + "grad_norm": 0.586846113204956, + "learning_rate": 0.00037084079870480297, + "loss": 3.4517, + "step": 35500 + }, + { + "epoch": 3.8328840970350404, + "grad_norm": 0.7098872065544128, + "learning_rate": 0.0003705169994603345, + "loss": 3.4652, + "step": 35550 + }, + { + "epoch": 3.838274932614555, + "grad_norm": 0.6315362453460693, + "learning_rate": 0.00037019320021586613, + "loss": 3.4693, + "step": 35600 + }, + { + "epoch": 3.8436657681940702, + "grad_norm": 0.6404989957809448, + "learning_rate": 0.0003698694009713977, + "loss": 3.4676, + "step": 35650 + }, + { + "epoch": 3.849056603773585, + "grad_norm": 0.5798198580741882, + "learning_rate": 0.0003695456017269293, + "loss": 3.469, + "step": 35700 + }, + { + "epoch": 3.8544474393530996, + "grad_norm": 0.6681696772575378, + "learning_rate": 0.00036922180248246083, + "loss": 3.4485, + "step": 35750 + }, + { + "epoch": 3.8598382749326143, + "grad_norm": 0.8003295660018921, + "learning_rate": 0.00036889800323799244, + "loss": 3.4547, + "step": 35800 + }, + { + "epoch": 3.8652291105121295, + "grad_norm": 0.6079601645469666, + "learning_rate": 0.000368574203993524, + "loss": 3.4312, + "step": 35850 + }, + { + "epoch": 3.870619946091644, + "grad_norm": 0.6316880583763123, + "learning_rate": 0.0003682504047490556, + "loss": 3.4479, + "step": 35900 + }, + { + "epoch": 3.876010781671159, + "grad_norm": 0.6452414393424988, + "learning_rate": 0.00036792660550458714, + "loss": 3.4706, + "step": 35950 + }, + { + "epoch": 3.881401617250674, + "grad_norm": 0.6125510931015015, + "learning_rate": 0.00036760280626011864, + "loss": 3.4517, + "step": 36000 + }, + { + "epoch": 3.881401617250674, + "eval_accuracy": 0.3754607018259444, + "eval_loss": 3.4510653018951416, + "eval_runtime": 185.2329, + "eval_samples_per_second": 97.234, + "eval_steps_per_second": 6.079, + "step": 36000 + }, + { + "epoch": 3.8867924528301887, + "grad_norm": 0.7082051038742065, + "learning_rate": 0.0003672790070156503, + "loss": 3.4576, + "step": 36050 + }, + { + "epoch": 3.8921832884097034, + "grad_norm": 0.6857859492301941, + "learning_rate": 0.0003669552077711818, + "loss": 3.4543, + "step": 36100 + }, + { + "epoch": 3.8975741239892185, + "grad_norm": 0.5955175161361694, + "learning_rate": 0.0003666314085267134, + "loss": 3.4599, + "step": 36150 + }, + { + "epoch": 3.9029649595687332, + "grad_norm": 0.6065194010734558, + "learning_rate": 0.00036630760928224495, + "loss": 3.4475, + "step": 36200 + }, + { + "epoch": 3.908355795148248, + "grad_norm": 0.6187525391578674, + "learning_rate": 0.00036598381003777656, + "loss": 3.4643, + "step": 36250 + }, + { + "epoch": 3.913746630727763, + "grad_norm": 0.637075662612915, + "learning_rate": 0.0003656600107933081, + "loss": 3.4443, + "step": 36300 + }, + { + "epoch": 3.9191374663072778, + "grad_norm": 0.613892138004303, + "learning_rate": 0.0003653362115488397, + "loss": 3.4656, + "step": 36350 + }, + { + "epoch": 3.9245283018867925, + "grad_norm": 0.598813533782959, + "learning_rate": 0.00036501241230437126, + "loss": 3.4665, + "step": 36400 + }, + { + "epoch": 3.929919137466307, + "grad_norm": 0.6649259924888611, + "learning_rate": 0.0003646886130599028, + "loss": 3.4416, + "step": 36450 + }, + { + "epoch": 3.935309973045822, + "grad_norm": 0.6432799696922302, + "learning_rate": 0.0003643648138154344, + "loss": 3.4519, + "step": 36500 + }, + { + "epoch": 3.940700808625337, + "grad_norm": 0.6609790921211243, + "learning_rate": 0.00036404101457096597, + "loss": 3.4532, + "step": 36550 + }, + { + "epoch": 3.9460916442048517, + "grad_norm": 0.5976753234863281, + "learning_rate": 0.00036371721532649757, + "loss": 3.4501, + "step": 36600 + }, + { + "epoch": 3.9514824797843664, + "grad_norm": 0.7079935669898987, + "learning_rate": 0.0003633934160820291, + "loss": 3.4404, + "step": 36650 + }, + { + "epoch": 3.9568733153638815, + "grad_norm": 0.6590920686721802, + "learning_rate": 0.00036306961683756073, + "loss": 3.452, + "step": 36700 + }, + { + "epoch": 3.9622641509433962, + "grad_norm": 0.6470325589179993, + "learning_rate": 0.0003627458175930922, + "loss": 3.4424, + "step": 36750 + }, + { + "epoch": 3.967654986522911, + "grad_norm": 0.625947117805481, + "learning_rate": 0.0003624220183486238, + "loss": 3.4346, + "step": 36800 + }, + { + "epoch": 3.973045822102426, + "grad_norm": 0.6342935562133789, + "learning_rate": 0.0003620982191041554, + "loss": 3.4505, + "step": 36850 + }, + { + "epoch": 3.9784366576819408, + "grad_norm": 0.6525506973266602, + "learning_rate": 0.00036177441985968693, + "loss": 3.4421, + "step": 36900 + }, + { + "epoch": 3.9838274932614555, + "grad_norm": 0.6463566422462463, + "learning_rate": 0.00036145062061521854, + "loss": 3.4535, + "step": 36950 + }, + { + "epoch": 3.9892183288409706, + "grad_norm": 0.6641883254051208, + "learning_rate": 0.0003611268213707501, + "loss": 3.4488, + "step": 37000 + }, + { + "epoch": 3.9892183288409706, + "eval_accuracy": 0.3760902366579415, + "eval_loss": 3.445734977722168, + "eval_runtime": 185.0242, + "eval_samples_per_second": 97.344, + "eval_steps_per_second": 6.086, + "step": 37000 + }, + { + "epoch": 3.9946091644204853, + "grad_norm": 0.6596526503562927, + "learning_rate": 0.0003608030221262817, + "loss": 3.4613, + "step": 37050 + }, + { + "epoch": 4.0, + "grad_norm": 1.2152858972549438, + "learning_rate": 0.00036047922288181324, + "loss": 3.4334, + "step": 37100 + }, + { + "epoch": 4.005390835579515, + "grad_norm": 0.6752334833145142, + "learning_rate": 0.00036015542363734485, + "loss": 3.3286, + "step": 37150 + }, + { + "epoch": 4.010781671159029, + "grad_norm": 0.6589771509170532, + "learning_rate": 0.0003598316243928764, + "loss": 3.3567, + "step": 37200 + }, + { + "epoch": 4.0161725067385445, + "grad_norm": 0.6467882990837097, + "learning_rate": 0.00035950782514840795, + "loss": 3.3498, + "step": 37250 + }, + { + "epoch": 4.02156334231806, + "grad_norm": 0.6317258477210999, + "learning_rate": 0.00035918402590393955, + "loss": 3.3546, + "step": 37300 + }, + { + "epoch": 4.026954177897574, + "grad_norm": 0.6882931590080261, + "learning_rate": 0.00035886022665947105, + "loss": 3.3413, + "step": 37350 + }, + { + "epoch": 4.032345013477089, + "grad_norm": 0.6179378628730774, + "learning_rate": 0.0003585364274150027, + "loss": 3.361, + "step": 37400 + }, + { + "epoch": 4.037735849056604, + "grad_norm": 0.7111193537712097, + "learning_rate": 0.0003582126281705342, + "loss": 3.3608, + "step": 37450 + }, + { + "epoch": 4.0431266846361185, + "grad_norm": 0.6103177666664124, + "learning_rate": 0.0003578888289260658, + "loss": 3.3477, + "step": 37500 + }, + { + "epoch": 4.048517520215634, + "grad_norm": 0.67775958776474, + "learning_rate": 0.00035756502968159736, + "loss": 3.3644, + "step": 37550 + }, + { + "epoch": 4.053908355795148, + "grad_norm": 0.663544774055481, + "learning_rate": 0.00035724123043712896, + "loss": 3.3578, + "step": 37600 + }, + { + "epoch": 4.059299191374663, + "grad_norm": 0.6494781374931335, + "learning_rate": 0.0003569174311926605, + "loss": 3.3734, + "step": 37650 + }, + { + "epoch": 4.064690026954178, + "grad_norm": 0.6415791511535645, + "learning_rate": 0.00035659363194819206, + "loss": 3.3584, + "step": 37700 + }, + { + "epoch": 4.070080862533692, + "grad_norm": 0.6472448110580444, + "learning_rate": 0.00035626983270372367, + "loss": 3.3797, + "step": 37750 + }, + { + "epoch": 4.0754716981132075, + "grad_norm": 0.6979560256004333, + "learning_rate": 0.0003559460334592552, + "loss": 3.3631, + "step": 37800 + }, + { + "epoch": 4.080862533692723, + "grad_norm": 0.6640351414680481, + "learning_rate": 0.0003556222342147868, + "loss": 3.359, + "step": 37850 + }, + { + "epoch": 4.086253369272237, + "grad_norm": 0.6525617837905884, + "learning_rate": 0.0003552984349703184, + "loss": 3.359, + "step": 37900 + }, + { + "epoch": 4.091644204851752, + "grad_norm": 0.6497607231140137, + "learning_rate": 0.00035497463572585, + "loss": 3.3734, + "step": 37950 + }, + { + "epoch": 4.097035040431267, + "grad_norm": 0.6940391063690186, + "learning_rate": 0.00035465083648138153, + "loss": 3.3644, + "step": 38000 + }, + { + "epoch": 4.097035040431267, + "eval_accuracy": 0.3764235837177563, + "eval_loss": 3.4489781856536865, + "eval_runtime": 185.0419, + "eval_samples_per_second": 97.335, + "eval_steps_per_second": 6.085, + "step": 38000 + }, + { + "epoch": 4.1024258760107815, + "grad_norm": 0.7142944931983948, + "learning_rate": 0.00035432703723691314, + "loss": 3.3681, + "step": 38050 + }, + { + "epoch": 4.107816711590297, + "grad_norm": 0.6247251629829407, + "learning_rate": 0.00035400323799244463, + "loss": 3.3706, + "step": 38100 + }, + { + "epoch": 4.113207547169812, + "grad_norm": 0.6462594866752625, + "learning_rate": 0.0003536794387479762, + "loss": 3.3803, + "step": 38150 + }, + { + "epoch": 4.118598382749326, + "grad_norm": 0.659847617149353, + "learning_rate": 0.0003533556395035078, + "loss": 3.3622, + "step": 38200 + }, + { + "epoch": 4.123989218328841, + "grad_norm": 0.6913278102874756, + "learning_rate": 0.00035303184025903934, + "loss": 3.3643, + "step": 38250 + }, + { + "epoch": 4.129380053908355, + "grad_norm": 0.6582707166671753, + "learning_rate": 0.00035270804101457094, + "loss": 3.3768, + "step": 38300 + }, + { + "epoch": 4.1347708894878705, + "grad_norm": 0.6678318977355957, + "learning_rate": 0.0003523842417701025, + "loss": 3.3553, + "step": 38350 + }, + { + "epoch": 4.140161725067386, + "grad_norm": 0.6429058909416199, + "learning_rate": 0.0003520604425256341, + "loss": 3.379, + "step": 38400 + }, + { + "epoch": 4.1455525606469, + "grad_norm": 0.6227869391441345, + "learning_rate": 0.00035173664328116565, + "loss": 3.3626, + "step": 38450 + }, + { + "epoch": 4.150943396226415, + "grad_norm": 0.6584265828132629, + "learning_rate": 0.00035141284403669725, + "loss": 3.3735, + "step": 38500 + }, + { + "epoch": 4.15633423180593, + "grad_norm": 0.6800707578659058, + "learning_rate": 0.0003510890447922288, + "loss": 3.398, + "step": 38550 + }, + { + "epoch": 4.1617250673854445, + "grad_norm": 0.6075555086135864, + "learning_rate": 0.00035077172153264976, + "loss": 3.3731, + "step": 38600 + }, + { + "epoch": 4.16711590296496, + "grad_norm": 0.6489547491073608, + "learning_rate": 0.0003504479222881813, + "loss": 3.3858, + "step": 38650 + }, + { + "epoch": 4.172506738544475, + "grad_norm": 0.7513884902000427, + "learning_rate": 0.0003501241230437129, + "loss": 3.3517, + "step": 38700 + }, + { + "epoch": 4.177897574123989, + "grad_norm": 0.6607750058174133, + "learning_rate": 0.0003498003237992444, + "loss": 3.3763, + "step": 38750 + }, + { + "epoch": 4.183288409703504, + "grad_norm": 0.6463075280189514, + "learning_rate": 0.000349476524554776, + "loss": 3.3807, + "step": 38800 + }, + { + "epoch": 4.188679245283019, + "grad_norm": 0.7830615639686584, + "learning_rate": 0.00034915272531030756, + "loss": 3.3839, + "step": 38850 + }, + { + "epoch": 4.1940700808625335, + "grad_norm": 0.6614366769790649, + "learning_rate": 0.0003488289260658391, + "loss": 3.3912, + "step": 38900 + }, + { + "epoch": 4.199460916442049, + "grad_norm": 0.7006166577339172, + "learning_rate": 0.0003485051268213707, + "loss": 3.3844, + "step": 38950 + }, + { + "epoch": 4.204851752021563, + "grad_norm": 0.6897702813148499, + "learning_rate": 0.00034818132757690227, + "loss": 3.3636, + "step": 39000 + }, + { + "epoch": 4.204851752021563, + "eval_accuracy": 0.37732518538084087, + "eval_loss": 3.445204734802246, + "eval_runtime": 184.9796, + "eval_samples_per_second": 97.367, + "eval_steps_per_second": 6.087, + "step": 39000 + }, + { + "epoch": 4.210242587601078, + "grad_norm": 0.6401898264884949, + "learning_rate": 0.0003478575283324339, + "loss": 3.3922, + "step": 39050 + }, + { + "epoch": 4.215633423180593, + "grad_norm": 0.6454992294311523, + "learning_rate": 0.0003475337290879654, + "loss": 3.3863, + "step": 39100 + }, + { + "epoch": 4.2210242587601075, + "grad_norm": 0.6486014723777771, + "learning_rate": 0.00034720992984349703, + "loss": 3.395, + "step": 39150 + }, + { + "epoch": 4.226415094339623, + "grad_norm": 0.6316015720367432, + "learning_rate": 0.0003468861305990286, + "loss": 3.3762, + "step": 39200 + }, + { + "epoch": 4.231805929919138, + "grad_norm": 0.6304581761360168, + "learning_rate": 0.0003465623313545602, + "loss": 3.3712, + "step": 39250 + }, + { + "epoch": 4.237196765498652, + "grad_norm": 0.6496357917785645, + "learning_rate": 0.00034623853211009173, + "loss": 3.3843, + "step": 39300 + }, + { + "epoch": 4.242587601078167, + "grad_norm": 0.6726136207580566, + "learning_rate": 0.00034591473286562323, + "loss": 3.3732, + "step": 39350 + }, + { + "epoch": 4.247978436657682, + "grad_norm": 0.6343677043914795, + "learning_rate": 0.0003455909336211549, + "loss": 3.3922, + "step": 39400 + }, + { + "epoch": 4.2533692722371965, + "grad_norm": 0.6977464556694031, + "learning_rate": 0.0003452671343766864, + "loss": 3.3806, + "step": 39450 + }, + { + "epoch": 4.258760107816712, + "grad_norm": 0.6857298016548157, + "learning_rate": 0.000344943335132218, + "loss": 3.3852, + "step": 39500 + }, + { + "epoch": 4.264150943396227, + "grad_norm": 0.6643324494361877, + "learning_rate": 0.00034461953588774954, + "loss": 3.3899, + "step": 39550 + }, + { + "epoch": 4.269541778975741, + "grad_norm": 0.6354033946990967, + "learning_rate": 0.00034429573664328115, + "loss": 3.3981, + "step": 39600 + }, + { + "epoch": 4.274932614555256, + "grad_norm": 0.6460668444633484, + "learning_rate": 0.0003439719373988127, + "loss": 3.3938, + "step": 39650 + }, + { + "epoch": 4.280323450134771, + "grad_norm": 0.7137424945831299, + "learning_rate": 0.00034364813815434425, + "loss": 3.3745, + "step": 39700 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.7198200821876526, + "learning_rate": 0.00034332433890987585, + "loss": 3.3825, + "step": 39750 + }, + { + "epoch": 4.291105121293801, + "grad_norm": 0.6877724528312683, + "learning_rate": 0.0003430005396654074, + "loss": 3.383, + "step": 39800 + }, + { + "epoch": 4.296495956873315, + "grad_norm": 0.6581820249557495, + "learning_rate": 0.000342676740420939, + "loss": 3.3829, + "step": 39850 + }, + { + "epoch": 4.30188679245283, + "grad_norm": 0.7357942461967468, + "learning_rate": 0.00034235294117647056, + "loss": 3.3896, + "step": 39900 + }, + { + "epoch": 4.307277628032345, + "grad_norm": 0.6755037903785706, + "learning_rate": 0.00034202914193200216, + "loss": 3.3776, + "step": 39950 + }, + { + "epoch": 4.3126684636118595, + "grad_norm": 0.604346752166748, + "learning_rate": 0.0003417053426875337, + "loss": 3.3899, + "step": 40000 + }, + { + "epoch": 4.3126684636118595, + "eval_accuracy": 0.37746034957328206, + "eval_loss": 3.44191312789917, + "eval_runtime": 185.1596, + "eval_samples_per_second": 97.273, + "eval_steps_per_second": 6.081, + "step": 40000 + }, + { + "epoch": 4.318059299191375, + "grad_norm": 0.7762167453765869, + "learning_rate": 0.0003413815434430653, + "loss": 3.3884, + "step": 40050 + }, + { + "epoch": 4.32345013477089, + "grad_norm": 0.7179322242736816, + "learning_rate": 0.0003410577441985968, + "loss": 3.3853, + "step": 40100 + }, + { + "epoch": 4.328840970350404, + "grad_norm": 0.6719879508018494, + "learning_rate": 0.00034073394495412837, + "loss": 3.3954, + "step": 40150 + }, + { + "epoch": 4.334231805929919, + "grad_norm": 0.705895185470581, + "learning_rate": 0.00034041014570965997, + "loss": 3.3915, + "step": 40200 + }, + { + "epoch": 4.339622641509434, + "grad_norm": 0.6576878428459167, + "learning_rate": 0.0003400863464651915, + "loss": 3.378, + "step": 40250 + }, + { + "epoch": 4.345013477088949, + "grad_norm": 0.6394710540771484, + "learning_rate": 0.0003397625472207231, + "loss": 3.3928, + "step": 40300 + }, + { + "epoch": 4.350404312668464, + "grad_norm": 0.6182326078414917, + "learning_rate": 0.0003394387479762547, + "loss": 3.3743, + "step": 40350 + }, + { + "epoch": 4.355795148247978, + "grad_norm": 0.6548103094100952, + "learning_rate": 0.0003391149487317863, + "loss": 3.3867, + "step": 40400 + }, + { + "epoch": 4.361185983827493, + "grad_norm": 0.6793950796127319, + "learning_rate": 0.00033879114948731783, + "loss": 3.3844, + "step": 40450 + }, + { + "epoch": 4.366576819407008, + "grad_norm": 0.6479836702346802, + "learning_rate": 0.00033846735024284944, + "loss": 3.3977, + "step": 40500 + }, + { + "epoch": 4.3719676549865225, + "grad_norm": 0.6476645469665527, + "learning_rate": 0.000338143550998381, + "loss": 3.3889, + "step": 40550 + }, + { + "epoch": 4.377358490566038, + "grad_norm": 0.6995546221733093, + "learning_rate": 0.00033781975175391254, + "loss": 3.3794, + "step": 40600 + }, + { + "epoch": 4.382749326145553, + "grad_norm": 0.7507542371749878, + "learning_rate": 0.00033749595250944414, + "loss": 3.3889, + "step": 40650 + }, + { + "epoch": 4.388140161725067, + "grad_norm": 0.7036803364753723, + "learning_rate": 0.00033717215326497564, + "loss": 3.3894, + "step": 40700 + }, + { + "epoch": 4.393530997304582, + "grad_norm": 0.6556727886199951, + "learning_rate": 0.0003368483540205073, + "loss": 3.3865, + "step": 40750 + }, + { + "epoch": 4.398921832884097, + "grad_norm": 0.6941169500350952, + "learning_rate": 0.0003365245547760388, + "loss": 3.3765, + "step": 40800 + }, + { + "epoch": 4.404312668463612, + "grad_norm": 0.6608136892318726, + "learning_rate": 0.0003362007555315704, + "loss": 3.3915, + "step": 40850 + }, + { + "epoch": 4.409703504043127, + "grad_norm": 0.8930730223655701, + "learning_rate": 0.00033587695628710195, + "loss": 3.3998, + "step": 40900 + }, + { + "epoch": 4.415094339622642, + "grad_norm": 0.6588183045387268, + "learning_rate": 0.00033555315704263355, + "loss": 3.3953, + "step": 40950 + }, + { + "epoch": 4.420485175202156, + "grad_norm": 0.6629270315170288, + "learning_rate": 0.0003352293577981651, + "loss": 3.3969, + "step": 41000 + }, + { + "epoch": 4.420485175202156, + "eval_accuracy": 0.3781523598157725, + "eval_loss": 3.4341278076171875, + "eval_runtime": 185.2042, + "eval_samples_per_second": 97.249, + "eval_steps_per_second": 6.08, + "step": 41000 + }, + { + "epoch": 4.425876010781671, + "grad_norm": 0.6934705972671509, + "learning_rate": 0.00033490555855369665, + "loss": 3.402, + "step": 41050 + }, + { + "epoch": 4.431266846361186, + "grad_norm": 0.608988881111145, + "learning_rate": 0.00033458175930922826, + "loss": 3.3712, + "step": 41100 + }, + { + "epoch": 4.436657681940701, + "grad_norm": 0.6706856489181519, + "learning_rate": 0.0003342579600647598, + "loss": 3.3936, + "step": 41150 + }, + { + "epoch": 4.442048517520216, + "grad_norm": 0.6479840874671936, + "learning_rate": 0.0003339341608202914, + "loss": 3.4055, + "step": 41200 + }, + { + "epoch": 4.44743935309973, + "grad_norm": 0.680960476398468, + "learning_rate": 0.00033361036157582297, + "loss": 3.4043, + "step": 41250 + }, + { + "epoch": 4.452830188679245, + "grad_norm": 0.6902644634246826, + "learning_rate": 0.00033328656233135457, + "loss": 3.3727, + "step": 41300 + }, + { + "epoch": 4.45822102425876, + "grad_norm": 0.6928199529647827, + "learning_rate": 0.0003329627630868861, + "loss": 3.379, + "step": 41350 + }, + { + "epoch": 4.463611859838275, + "grad_norm": 0.7097663879394531, + "learning_rate": 0.0003326389638424177, + "loss": 3.3974, + "step": 41400 + }, + { + "epoch": 4.46900269541779, + "grad_norm": 0.6607456803321838, + "learning_rate": 0.0003323151645979492, + "loss": 3.4084, + "step": 41450 + }, + { + "epoch": 4.474393530997305, + "grad_norm": 0.7242568731307983, + "learning_rate": 0.00033199136535348077, + "loss": 3.4067, + "step": 41500 + }, + { + "epoch": 4.479784366576819, + "grad_norm": 0.6340367197990417, + "learning_rate": 0.0003316675661090124, + "loss": 3.3938, + "step": 41550 + }, + { + "epoch": 4.485175202156334, + "grad_norm": 0.6760000586509705, + "learning_rate": 0.00033134376686454393, + "loss": 3.3863, + "step": 41600 + }, + { + "epoch": 4.490566037735849, + "grad_norm": 0.6576944589614868, + "learning_rate": 0.00033101996762007553, + "loss": 3.3976, + "step": 41650 + }, + { + "epoch": 4.495956873315364, + "grad_norm": 0.6730291247367859, + "learning_rate": 0.0003306961683756071, + "loss": 3.3886, + "step": 41700 + }, + { + "epoch": 4.501347708894879, + "grad_norm": 0.685353696346283, + "learning_rate": 0.0003303723691311387, + "loss": 3.387, + "step": 41750 + }, + { + "epoch": 4.506738544474393, + "grad_norm": 0.9739151000976562, + "learning_rate": 0.00033004856988667024, + "loss": 3.4041, + "step": 41800 + }, + { + "epoch": 4.512129380053908, + "grad_norm": 0.6757647395133972, + "learning_rate": 0.0003297247706422018, + "loss": 3.3808, + "step": 41850 + }, + { + "epoch": 4.517520215633423, + "grad_norm": 0.6179267168045044, + "learning_rate": 0.0003294009713977334, + "loss": 3.3965, + "step": 41900 + }, + { + "epoch": 4.5229110512129385, + "grad_norm": 0.677100658416748, + "learning_rate": 0.00032907717215326494, + "loss": 3.3982, + "step": 41950 + }, + { + "epoch": 4.528301886792453, + "grad_norm": 0.6717151999473572, + "learning_rate": 0.00032875337290879655, + "loss": 3.3848, + "step": 42000 + }, + { + "epoch": 4.528301886792453, + "eval_accuracy": 0.37859935779624126, + "eval_loss": 3.430816888809204, + "eval_runtime": 185.1614, + "eval_samples_per_second": 97.272, + "eval_steps_per_second": 6.081, + "step": 42000 + }, + { + "epoch": 4.533692722371968, + "grad_norm": 0.647710919380188, + "learning_rate": 0.00032842957366432805, + "loss": 3.3855, + "step": 42050 + }, + { + "epoch": 4.539083557951482, + "grad_norm": 0.7008846998214722, + "learning_rate": 0.000328112250404749, + "loss": 3.3938, + "step": 42100 + }, + { + "epoch": 4.544474393530997, + "grad_norm": 0.6753333806991577, + "learning_rate": 0.00032778845116028066, + "loss": 3.3952, + "step": 42150 + }, + { + "epoch": 4.549865229110512, + "grad_norm": 0.6874195337295532, + "learning_rate": 0.00032746465191581215, + "loss": 3.3911, + "step": 42200 + }, + { + "epoch": 4.555256064690027, + "grad_norm": 0.6420570611953735, + "learning_rate": 0.0003271408526713437, + "loss": 3.4027, + "step": 42250 + }, + { + "epoch": 4.560646900269542, + "grad_norm": 0.7065884470939636, + "learning_rate": 0.0003268170534268753, + "loss": 3.3818, + "step": 42300 + }, + { + "epoch": 4.566037735849057, + "grad_norm": 0.6392266750335693, + "learning_rate": 0.00032649325418240686, + "loss": 3.4136, + "step": 42350 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 0.662189781665802, + "learning_rate": 0.00032616945493793846, + "loss": 3.374, + "step": 42400 + }, + { + "epoch": 4.576819407008086, + "grad_norm": 0.6861265301704407, + "learning_rate": 0.00032584565569347, + "loss": 3.3951, + "step": 42450 + }, + { + "epoch": 4.5822102425876015, + "grad_norm": 0.6285902857780457, + "learning_rate": 0.0003255218564490016, + "loss": 3.4016, + "step": 42500 + }, + { + "epoch": 4.587601078167116, + "grad_norm": 0.6859892010688782, + "learning_rate": 0.00032519805720453317, + "loss": 3.3806, + "step": 42550 + }, + { + "epoch": 4.592991913746631, + "grad_norm": 0.6380650401115417, + "learning_rate": 0.0003248742579600647, + "loss": 3.4063, + "step": 42600 + }, + { + "epoch": 4.598382749326145, + "grad_norm": 0.6524938344955444, + "learning_rate": 0.0003245504587155963, + "loss": 3.3911, + "step": 42650 + }, + { + "epoch": 4.60377358490566, + "grad_norm": 0.6308937072753906, + "learning_rate": 0.0003242266594711278, + "loss": 3.3892, + "step": 42700 + }, + { + "epoch": 4.609164420485175, + "grad_norm": 0.648926854133606, + "learning_rate": 0.0003239028602266595, + "loss": 3.3987, + "step": 42750 + }, + { + "epoch": 4.6145552560646905, + "grad_norm": 0.6324000954627991, + "learning_rate": 0.000323579060982191, + "loss": 3.3729, + "step": 42800 + }, + { + "epoch": 4.619946091644205, + "grad_norm": 0.6731909513473511, + "learning_rate": 0.0003232552617377226, + "loss": 3.4003, + "step": 42850 + }, + { + "epoch": 4.62533692722372, + "grad_norm": 0.6357996463775635, + "learning_rate": 0.00032293146249325413, + "loss": 3.3966, + "step": 42900 + }, + { + "epoch": 4.630727762803234, + "grad_norm": 0.7101162075996399, + "learning_rate": 0.00032260766324878574, + "loss": 3.3918, + "step": 42950 + }, + { + "epoch": 4.636118598382749, + "grad_norm": 0.7121390104293823, + "learning_rate": 0.0003222838640043173, + "loss": 3.3952, + "step": 43000 + }, + { + "epoch": 4.636118598382749, + "eval_accuracy": 0.3787855888459553, + "eval_loss": 3.424402952194214, + "eval_runtime": 185.282, + "eval_samples_per_second": 97.209, + "eval_steps_per_second": 6.077, + "step": 43000 + }, + { + "epoch": 4.6415094339622645, + "grad_norm": 0.6777333617210388, + "learning_rate": 0.00032196006475984884, + "loss": 3.3872, + "step": 43050 + }, + { + "epoch": 4.646900269541779, + "grad_norm": 0.6901153922080994, + "learning_rate": 0.00032163626551538044, + "loss": 3.3889, + "step": 43100 + }, + { + "epoch": 4.652291105121294, + "grad_norm": 0.6530250310897827, + "learning_rate": 0.000321312466270912, + "loss": 3.4072, + "step": 43150 + }, + { + "epoch": 4.657681940700809, + "grad_norm": 0.7110807299613953, + "learning_rate": 0.0003209886670264436, + "loss": 3.3743, + "step": 43200 + }, + { + "epoch": 4.663072776280323, + "grad_norm": 0.648343563079834, + "learning_rate": 0.00032066486778197515, + "loss": 3.396, + "step": 43250 + }, + { + "epoch": 4.668463611859838, + "grad_norm": 0.6666106581687927, + "learning_rate": 0.00032034106853750675, + "loss": 3.3839, + "step": 43300 + }, + { + "epoch": 4.6738544474393535, + "grad_norm": 0.7402759194374084, + "learning_rate": 0.0003200172692930383, + "loss": 3.3912, + "step": 43350 + }, + { + "epoch": 4.679245283018868, + "grad_norm": 0.6613388061523438, + "learning_rate": 0.0003196934700485699, + "loss": 3.3894, + "step": 43400 + }, + { + "epoch": 4.684636118598383, + "grad_norm": 0.6671817898750305, + "learning_rate": 0.0003193696708041014, + "loss": 3.3974, + "step": 43450 + }, + { + "epoch": 4.690026954177897, + "grad_norm": 0.6724131107330322, + "learning_rate": 0.00031904587155963296, + "loss": 3.3766, + "step": 43500 + }, + { + "epoch": 4.695417789757412, + "grad_norm": 0.6639029383659363, + "learning_rate": 0.00031872207231516456, + "loss": 3.3921, + "step": 43550 + }, + { + "epoch": 4.7008086253369274, + "grad_norm": 0.6928759217262268, + "learning_rate": 0.0003183982730706961, + "loss": 3.3984, + "step": 43600 + }, + { + "epoch": 4.706199460916442, + "grad_norm": 0.6988749504089355, + "learning_rate": 0.0003180744738262277, + "loss": 3.3954, + "step": 43650 + }, + { + "epoch": 4.711590296495957, + "grad_norm": 0.675839900970459, + "learning_rate": 0.00031775067458175927, + "loss": 3.3914, + "step": 43700 + }, + { + "epoch": 4.716981132075472, + "grad_norm": 0.6638389229774475, + "learning_rate": 0.00031742687533729087, + "loss": 3.4078, + "step": 43750 + }, + { + "epoch": 4.722371967654986, + "grad_norm": 0.6970890164375305, + "learning_rate": 0.0003171030760928224, + "loss": 3.3917, + "step": 43800 + }, + { + "epoch": 4.727762803234501, + "grad_norm": 0.6743645071983337, + "learning_rate": 0.000316779276848354, + "loss": 3.399, + "step": 43850 + }, + { + "epoch": 4.7331536388140165, + "grad_norm": 0.6509005427360535, + "learning_rate": 0.0003164554776038856, + "loss": 3.3884, + "step": 43900 + }, + { + "epoch": 4.738544474393531, + "grad_norm": 0.6444371938705444, + "learning_rate": 0.00031613167835941713, + "loss": 3.3879, + "step": 43950 + }, + { + "epoch": 4.743935309973046, + "grad_norm": 0.6728962063789368, + "learning_rate": 0.00031580787911494873, + "loss": 3.3875, + "step": 44000 + }, + { + "epoch": 4.743935309973046, + "eval_accuracy": 0.37932092162421727, + "eval_loss": 3.421056032180786, + "eval_runtime": 185.3808, + "eval_samples_per_second": 97.157, + "eval_steps_per_second": 6.074, + "step": 44000 + }, + { + "epoch": 4.74932614555256, + "grad_norm": 0.6364440321922302, + "learning_rate": 0.00031548407987048023, + "loss": 3.3854, + "step": 44050 + }, + { + "epoch": 4.754716981132075, + "grad_norm": 0.7157145738601685, + "learning_rate": 0.0003151602806260119, + "loss": 3.3942, + "step": 44100 + }, + { + "epoch": 4.7601078167115904, + "grad_norm": 0.7098649144172668, + "learning_rate": 0.0003148364813815434, + "loss": 3.3875, + "step": 44150 + }, + { + "epoch": 4.765498652291106, + "grad_norm": 0.691125214099884, + "learning_rate": 0.000314512682137075, + "loss": 3.3911, + "step": 44200 + }, + { + "epoch": 4.77088948787062, + "grad_norm": 0.6948944330215454, + "learning_rate": 0.00031418888289260654, + "loss": 3.3756, + "step": 44250 + }, + { + "epoch": 4.776280323450135, + "grad_norm": 0.6892076134681702, + "learning_rate": 0.0003138650836481381, + "loss": 3.3961, + "step": 44300 + }, + { + "epoch": 4.781671159029649, + "grad_norm": 0.7187123894691467, + "learning_rate": 0.0003135412844036697, + "loss": 3.3782, + "step": 44350 + }, + { + "epoch": 4.787061994609164, + "grad_norm": 0.6381692886352539, + "learning_rate": 0.00031321748515920124, + "loss": 3.4169, + "step": 44400 + }, + { + "epoch": 4.7924528301886795, + "grad_norm": 0.6556568145751953, + "learning_rate": 0.00031289368591473285, + "loss": 3.3752, + "step": 44450 + }, + { + "epoch": 4.797843665768194, + "grad_norm": 0.6677922010421753, + "learning_rate": 0.0003125698866702644, + "loss": 3.3898, + "step": 44500 + }, + { + "epoch": 4.803234501347709, + "grad_norm": 0.6631757616996765, + "learning_rate": 0.000312246087425796, + "loss": 3.3838, + "step": 44550 + }, + { + "epoch": 4.808625336927224, + "grad_norm": 0.6678897738456726, + "learning_rate": 0.00031192228818132756, + "loss": 3.3972, + "step": 44600 + }, + { + "epoch": 4.814016172506738, + "grad_norm": 0.7124047875404358, + "learning_rate": 0.00031159848893685916, + "loss": 3.3808, + "step": 44650 + }, + { + "epoch": 4.819407008086253, + "grad_norm": 0.6782823801040649, + "learning_rate": 0.0003112746896923907, + "loss": 3.3945, + "step": 44700 + }, + { + "epoch": 4.824797843665769, + "grad_norm": 0.687823474407196, + "learning_rate": 0.0003109508904479222, + "loss": 3.3879, + "step": 44750 + }, + { + "epoch": 4.830188679245283, + "grad_norm": 0.7118351459503174, + "learning_rate": 0.0003106270912034538, + "loss": 3.3857, + "step": 44800 + }, + { + "epoch": 4.835579514824798, + "grad_norm": 0.6740395426750183, + "learning_rate": 0.00031030329195898536, + "loss": 3.3894, + "step": 44850 + }, + { + "epoch": 4.840970350404312, + "grad_norm": 0.6710334420204163, + "learning_rate": 0.00030997949271451697, + "loss": 3.392, + "step": 44900 + }, + { + "epoch": 4.846361185983827, + "grad_norm": 0.6429509520530701, + "learning_rate": 0.0003096556934700485, + "loss": 3.3808, + "step": 44950 + }, + { + "epoch": 4.8517520215633425, + "grad_norm": 0.6663042306900024, + "learning_rate": 0.00030933837021046947, + "loss": 3.4071, + "step": 45000 + }, + { + "epoch": 4.8517520215633425, + "eval_accuracy": 0.3799037357144541, + "eval_loss": 3.415198802947998, + "eval_runtime": 185.3679, + "eval_samples_per_second": 97.164, + "eval_steps_per_second": 6.074, + "step": 45000 + }, + { + "epoch": 4.857142857142857, + "grad_norm": 0.7604882717132568, + "learning_rate": 0.0003090145709660011, + "loss": 3.377, + "step": 45050 + }, + { + "epoch": 4.862533692722372, + "grad_norm": 0.694174587726593, + "learning_rate": 0.0003086907717215326, + "loss": 3.3797, + "step": 45100 + }, + { + "epoch": 4.867924528301887, + "grad_norm": 0.690619945526123, + "learning_rate": 0.0003083669724770642, + "loss": 3.391, + "step": 45150 + }, + { + "epoch": 4.873315363881401, + "grad_norm": 0.7311848402023315, + "learning_rate": 0.0003080431732325958, + "loss": 3.3779, + "step": 45200 + }, + { + "epoch": 4.878706199460916, + "grad_norm": 0.7099789381027222, + "learning_rate": 0.00030771937398812733, + "loss": 3.3939, + "step": 45250 + }, + { + "epoch": 4.884097035040432, + "grad_norm": 0.7288787961006165, + "learning_rate": 0.00030739557474365894, + "loss": 3.3783, + "step": 45300 + }, + { + "epoch": 4.889487870619946, + "grad_norm": 0.7027914524078369, + "learning_rate": 0.0003070717754991905, + "loss": 3.3861, + "step": 45350 + }, + { + "epoch": 4.894878706199461, + "grad_norm": 0.6941841244697571, + "learning_rate": 0.0003067479762547221, + "loss": 3.3837, + "step": 45400 + }, + { + "epoch": 4.900269541778976, + "grad_norm": 0.756320059299469, + "learning_rate": 0.0003064241770102536, + "loss": 3.3929, + "step": 45450 + }, + { + "epoch": 4.90566037735849, + "grad_norm": 0.6624318361282349, + "learning_rate": 0.00030610037776578514, + "loss": 3.3948, + "step": 45500 + }, + { + "epoch": 4.9110512129380055, + "grad_norm": 0.6968894600868225, + "learning_rate": 0.00030577657852131674, + "loss": 3.3916, + "step": 45550 + }, + { + "epoch": 4.916442048517521, + "grad_norm": 0.6658734083175659, + "learning_rate": 0.0003054527792768483, + "loss": 3.3918, + "step": 45600 + }, + { + "epoch": 4.921832884097035, + "grad_norm": 0.6801126003265381, + "learning_rate": 0.0003051289800323799, + "loss": 3.4074, + "step": 45650 + }, + { + "epoch": 4.92722371967655, + "grad_norm": 0.679326057434082, + "learning_rate": 0.00030480518078791145, + "loss": 3.401, + "step": 45700 + }, + { + "epoch": 4.932614555256064, + "grad_norm": 0.6885223984718323, + "learning_rate": 0.00030448138154344305, + "loss": 3.3811, + "step": 45750 + }, + { + "epoch": 4.938005390835579, + "grad_norm": 0.7149697542190552, + "learning_rate": 0.0003041575822989746, + "loss": 3.4065, + "step": 45800 + }, + { + "epoch": 4.943396226415095, + "grad_norm": 0.6741778254508972, + "learning_rate": 0.0003038337830545062, + "loss": 3.3787, + "step": 45850 + }, + { + "epoch": 4.948787061994609, + "grad_norm": 0.6654149889945984, + "learning_rate": 0.00030350998381003776, + "loss": 3.3748, + "step": 45900 + }, + { + "epoch": 4.954177897574124, + "grad_norm": 0.682928740978241, + "learning_rate": 0.0003031861845655693, + "loss": 3.3726, + "step": 45950 + }, + { + "epoch": 4.959568733153639, + "grad_norm": 0.7227015495300293, + "learning_rate": 0.0003028623853211009, + "loss": 3.3742, + "step": 46000 + }, + { + "epoch": 4.959568733153639, + "eval_accuracy": 0.3803156388121588, + "eval_loss": 3.4095218181610107, + "eval_runtime": 185.3393, + "eval_samples_per_second": 97.179, + "eval_steps_per_second": 6.075, + "step": 46000 + }, + { + "epoch": 4.964959568733153, + "grad_norm": 0.6969720721244812, + "learning_rate": 0.00030253858607663247, + "loss": 3.3797, + "step": 46050 + }, + { + "epoch": 4.9703504043126685, + "grad_norm": 0.7063660621643066, + "learning_rate": 0.00030221478683216407, + "loss": 3.3897, + "step": 46100 + }, + { + "epoch": 4.975741239892184, + "grad_norm": 0.6650516986846924, + "learning_rate": 0.00030189098758769557, + "loss": 3.3888, + "step": 46150 + }, + { + "epoch": 4.981132075471698, + "grad_norm": 0.6772712469100952, + "learning_rate": 0.00030156718834322717, + "loss": 3.3944, + "step": 46200 + }, + { + "epoch": 4.986522911051213, + "grad_norm": 0.7059131860733032, + "learning_rate": 0.0003012433890987587, + "loss": 3.392, + "step": 46250 + }, + { + "epoch": 4.991913746630727, + "grad_norm": 0.7540570497512817, + "learning_rate": 0.0003009195898542903, + "loss": 3.3841, + "step": 46300 + }, + { + "epoch": 4.997304582210242, + "grad_norm": 0.7052427530288696, + "learning_rate": 0.0003005957906098219, + "loss": 3.3774, + "step": 46350 + }, + { + "epoch": 5.002695417789758, + "grad_norm": 0.7645816206932068, + "learning_rate": 0.00030027199136535343, + "loss": 3.3627, + "step": 46400 + }, + { + "epoch": 5.008086253369272, + "grad_norm": 0.6885704398155212, + "learning_rate": 0.00029994819212088503, + "loss": 3.2878, + "step": 46450 + }, + { + "epoch": 5.013477088948787, + "grad_norm": 1.085796594619751, + "learning_rate": 0.0002996243928764166, + "loss": 3.2952, + "step": 46500 + }, + { + "epoch": 5.018867924528302, + "grad_norm": 0.6787880659103394, + "learning_rate": 0.0002993005936319482, + "loss": 3.3063, + "step": 46550 + }, + { + "epoch": 5.024258760107816, + "grad_norm": 0.7390757203102112, + "learning_rate": 0.00029897679438747974, + "loss": 3.2968, + "step": 46600 + }, + { + "epoch": 5.0296495956873315, + "grad_norm": 0.7662341594696045, + "learning_rate": 0.0002986529951430113, + "loss": 3.2848, + "step": 46650 + }, + { + "epoch": 5.035040431266847, + "grad_norm": 0.6801024675369263, + "learning_rate": 0.0002983291958985429, + "loss": 3.2963, + "step": 46700 + }, + { + "epoch": 5.040431266846361, + "grad_norm": 0.6480717658996582, + "learning_rate": 0.00029800539665407444, + "loss": 3.2874, + "step": 46750 + }, + { + "epoch": 5.045822102425876, + "grad_norm": 0.7626944780349731, + "learning_rate": 0.000297681597409606, + "loss": 3.2959, + "step": 46800 + }, + { + "epoch": 5.051212938005391, + "grad_norm": 0.6716665625572205, + "learning_rate": 0.0002973577981651376, + "loss": 3.3083, + "step": 46850 + }, + { + "epoch": 5.056603773584905, + "grad_norm": 0.6842003464698792, + "learning_rate": 0.00029703399892066915, + "loss": 3.316, + "step": 46900 + }, + { + "epoch": 5.061994609164421, + "grad_norm": 0.6711278557777405, + "learning_rate": 0.00029671019967620076, + "loss": 3.3085, + "step": 46950 + }, + { + "epoch": 5.067385444743936, + "grad_norm": 0.6872760653495789, + "learning_rate": 0.0002963864004317323, + "loss": 3.3158, + "step": 47000 + }, + { + "epoch": 5.067385444743936, + "eval_accuracy": 0.380729606314732, + "eval_loss": 3.410980463027954, + "eval_runtime": 185.6161, + "eval_samples_per_second": 97.034, + "eval_steps_per_second": 6.066, + "step": 47000 + }, + { + "epoch": 5.07277628032345, + "grad_norm": 0.7116502523422241, + "learning_rate": 0.00029606260118726386, + "loss": 3.2866, + "step": 47050 + }, + { + "epoch": 5.078167115902965, + "grad_norm": 0.6567325592041016, + "learning_rate": 0.0002957388019427954, + "loss": 3.3009, + "step": 47100 + }, + { + "epoch": 5.083557951482479, + "grad_norm": 0.7238079309463501, + "learning_rate": 0.000295415002698327, + "loss": 3.3246, + "step": 47150 + }, + { + "epoch": 5.0889487870619945, + "grad_norm": 0.7564477324485779, + "learning_rate": 0.00029509120345385856, + "loss": 3.3181, + "step": 47200 + }, + { + "epoch": 5.09433962264151, + "grad_norm": 0.7536827921867371, + "learning_rate": 0.00029476740420939017, + "loss": 3.2986, + "step": 47250 + }, + { + "epoch": 5.099730458221024, + "grad_norm": 0.7181034684181213, + "learning_rate": 0.0002944436049649217, + "loss": 3.3121, + "step": 47300 + }, + { + "epoch": 5.105121293800539, + "grad_norm": 0.7820267081260681, + "learning_rate": 0.00029412628170534267, + "loss": 3.3151, + "step": 47350 + }, + { + "epoch": 5.110512129380054, + "grad_norm": 0.676846444606781, + "learning_rate": 0.0002938024824608742, + "loss": 3.3098, + "step": 47400 + }, + { + "epoch": 5.115902964959568, + "grad_norm": 0.7352178692817688, + "learning_rate": 0.00029347868321640577, + "loss": 3.3171, + "step": 47450 + }, + { + "epoch": 5.121293800539084, + "grad_norm": 0.7145463824272156, + "learning_rate": 0.0002931548839719374, + "loss": 3.3112, + "step": 47500 + }, + { + "epoch": 5.126684636118599, + "grad_norm": 0.6998820900917053, + "learning_rate": 0.0002928310847274689, + "loss": 3.3068, + "step": 47550 + }, + { + "epoch": 5.132075471698113, + "grad_norm": 0.6971082091331482, + "learning_rate": 0.00029250728548300053, + "loss": 3.3033, + "step": 47600 + }, + { + "epoch": 5.137466307277628, + "grad_norm": 0.6818746328353882, + "learning_rate": 0.0002921834862385321, + "loss": 3.3174, + "step": 47650 + }, + { + "epoch": 5.142857142857143, + "grad_norm": 0.6603133678436279, + "learning_rate": 0.0002918596869940637, + "loss": 3.32, + "step": 47700 + }, + { + "epoch": 5.1482479784366575, + "grad_norm": 0.720582902431488, + "learning_rate": 0.0002915358877495952, + "loss": 3.3214, + "step": 47750 + }, + { + "epoch": 5.153638814016173, + "grad_norm": 0.7110748291015625, + "learning_rate": 0.0002912120885051268, + "loss": 3.3299, + "step": 47800 + }, + { + "epoch": 5.159029649595688, + "grad_norm": 0.6903175115585327, + "learning_rate": 0.00029088828926065834, + "loss": 3.3209, + "step": 47850 + }, + { + "epoch": 5.164420485175202, + "grad_norm": 0.7338415384292603, + "learning_rate": 0.00029056449001618994, + "loss": 3.3033, + "step": 47900 + }, + { + "epoch": 5.169811320754717, + "grad_norm": 0.723596453666687, + "learning_rate": 0.0002902406907717215, + "loss": 3.3219, + "step": 47950 + }, + { + "epoch": 5.175202156334231, + "grad_norm": 0.7516893148422241, + "learning_rate": 0.0002899168915272531, + "loss": 3.3119, + "step": 48000 + }, + { + "epoch": 5.175202156334231, + "eval_accuracy": 0.3811521573954425, + "eval_loss": 3.411118268966675, + "eval_runtime": 185.4994, + "eval_samples_per_second": 97.095, + "eval_steps_per_second": 6.07, + "step": 48000 + }, + { + "epoch": 5.180592991913747, + "grad_norm": 0.7215257883071899, + "learning_rate": 0.00028959309228278465, + "loss": 3.3424, + "step": 48050 + }, + { + "epoch": 5.185983827493262, + "grad_norm": 0.7489564418792725, + "learning_rate": 0.00028926929303831625, + "loss": 3.3215, + "step": 48100 + }, + { + "epoch": 5.191374663072776, + "grad_norm": 0.6841164231300354, + "learning_rate": 0.0002889454937938478, + "loss": 3.3318, + "step": 48150 + }, + { + "epoch": 5.196765498652291, + "grad_norm": 0.6923016309738159, + "learning_rate": 0.00028862169454937935, + "loss": 3.3199, + "step": 48200 + }, + { + "epoch": 5.202156334231806, + "grad_norm": 0.7165120840072632, + "learning_rate": 0.0002882978953049109, + "loss": 3.3028, + "step": 48250 + }, + { + "epoch": 5.2075471698113205, + "grad_norm": 0.6918339729309082, + "learning_rate": 0.0002879740960604425, + "loss": 3.3329, + "step": 48300 + }, + { + "epoch": 5.212938005390836, + "grad_norm": 0.6800699830055237, + "learning_rate": 0.00028765029681597406, + "loss": 3.3145, + "step": 48350 + }, + { + "epoch": 5.218328840970351, + "grad_norm": 0.7082724571228027, + "learning_rate": 0.00028732649757150566, + "loss": 3.3271, + "step": 48400 + }, + { + "epoch": 5.223719676549865, + "grad_norm": 0.719948947429657, + "learning_rate": 0.0002870026983270372, + "loss": 3.3173, + "step": 48450 + }, + { + "epoch": 5.22911051212938, + "grad_norm": 0.6641173362731934, + "learning_rate": 0.00028667889908256877, + "loss": 3.317, + "step": 48500 + }, + { + "epoch": 5.234501347708895, + "grad_norm": 0.6939883828163147, + "learning_rate": 0.00028635509983810037, + "loss": 3.3035, + "step": 48550 + }, + { + "epoch": 5.2398921832884096, + "grad_norm": 0.695296049118042, + "learning_rate": 0.0002860313005936319, + "loss": 3.3286, + "step": 48600 + }, + { + "epoch": 5.245283018867925, + "grad_norm": 0.7294591069221497, + "learning_rate": 0.00028570750134916347, + "loss": 3.3226, + "step": 48650 + }, + { + "epoch": 5.250673854447439, + "grad_norm": 0.710701048374176, + "learning_rate": 0.0002853837021046951, + "loss": 3.3203, + "step": 48700 + }, + { + "epoch": 5.256064690026954, + "grad_norm": 0.7199147343635559, + "learning_rate": 0.00028505990286022663, + "loss": 3.311, + "step": 48750 + }, + { + "epoch": 5.261455525606469, + "grad_norm": 0.6639696359634399, + "learning_rate": 0.0002847361036157582, + "loss": 3.3255, + "step": 48800 + }, + { + "epoch": 5.2668463611859835, + "grad_norm": 0.6906806826591492, + "learning_rate": 0.0002844123043712898, + "loss": 3.3368, + "step": 48850 + }, + { + "epoch": 5.272237196765499, + "grad_norm": 0.6999244689941406, + "learning_rate": 0.00028408850512682133, + "loss": 3.3156, + "step": 48900 + }, + { + "epoch": 5.277628032345014, + "grad_norm": 0.7082576751708984, + "learning_rate": 0.00028376470588235294, + "loss": 3.3318, + "step": 48950 + }, + { + "epoch": 5.283018867924528, + "grad_norm": 0.7592863440513611, + "learning_rate": 0.0002834409066378845, + "loss": 3.3434, + "step": 49000 + }, + { + "epoch": 5.283018867924528, + "eval_accuracy": 0.38130264164506555, + "eval_loss": 3.4073922634124756, + "eval_runtime": 185.3808, + "eval_samples_per_second": 97.157, + "eval_steps_per_second": 6.074, + "step": 49000 + }, + { + "epoch": 5.288409703504043, + "grad_norm": 0.7438406944274902, + "learning_rate": 0.00028311710739341604, + "loss": 3.3356, + "step": 49050 + }, + { + "epoch": 5.293800539083558, + "grad_norm": 0.7723414301872253, + "learning_rate": 0.00028279330814894764, + "loss": 3.3378, + "step": 49100 + }, + { + "epoch": 5.2991913746630726, + "grad_norm": 0.7213961482048035, + "learning_rate": 0.0002824695089044792, + "loss": 3.3135, + "step": 49150 + }, + { + "epoch": 5.304582210242588, + "grad_norm": 0.6973984241485596, + "learning_rate": 0.00028214570966001075, + "loss": 3.3393, + "step": 49200 + }, + { + "epoch": 5.309973045822103, + "grad_norm": 0.6829140186309814, + "learning_rate": 0.00028182191041554235, + "loss": 3.327, + "step": 49250 + }, + { + "epoch": 5.315363881401617, + "grad_norm": 0.7045508027076721, + "learning_rate": 0.0002814981111710739, + "loss": 3.324, + "step": 49300 + }, + { + "epoch": 5.320754716981132, + "grad_norm": 0.6908060908317566, + "learning_rate": 0.0002811743119266055, + "loss": 3.3408, + "step": 49350 + }, + { + "epoch": 5.3261455525606465, + "grad_norm": 0.6872555613517761, + "learning_rate": 0.00028085051268213706, + "loss": 3.3154, + "step": 49400 + }, + { + "epoch": 5.331536388140162, + "grad_norm": 0.7134802341461182, + "learning_rate": 0.0002805267134376686, + "loss": 3.3139, + "step": 49450 + }, + { + "epoch": 5.336927223719677, + "grad_norm": 0.6903613805770874, + "learning_rate": 0.00028020291419320016, + "loss": 3.3408, + "step": 49500 + }, + { + "epoch": 5.342318059299191, + "grad_norm": 0.7270494103431702, + "learning_rate": 0.00027987911494873176, + "loss": 3.3396, + "step": 49550 + }, + { + "epoch": 5.347708894878706, + "grad_norm": 0.7505331635475159, + "learning_rate": 0.0002795553157042633, + "loss": 3.3326, + "step": 49600 + }, + { + "epoch": 5.353099730458221, + "grad_norm": 0.71560138463974, + "learning_rate": 0.0002792315164597949, + "loss": 3.3208, + "step": 49650 + }, + { + "epoch": 5.3584905660377355, + "grad_norm": 0.7327362895011902, + "learning_rate": 0.00027890771721532647, + "loss": 3.3467, + "step": 49700 + }, + { + "epoch": 5.363881401617251, + "grad_norm": 0.707087516784668, + "learning_rate": 0.00027858391797085807, + "loss": 3.3443, + "step": 49750 + }, + { + "epoch": 5.369272237196766, + "grad_norm": 0.6669909358024597, + "learning_rate": 0.0002782601187263896, + "loss": 3.3169, + "step": 49800 + }, + { + "epoch": 5.37466307277628, + "grad_norm": 0.6940752863883972, + "learning_rate": 0.0002779363194819212, + "loss": 3.3494, + "step": 49850 + }, + { + "epoch": 5.380053908355795, + "grad_norm": 0.7098177671432495, + "learning_rate": 0.0002776125202374527, + "loss": 3.3276, + "step": 49900 + }, + { + "epoch": 5.38544474393531, + "grad_norm": 0.7212302088737488, + "learning_rate": 0.00027728872099298433, + "loss": 3.3396, + "step": 49950 + }, + { + "epoch": 5.390835579514825, + "grad_norm": 0.7480193972587585, + "learning_rate": 0.0002769713977334053, + "loss": 3.3422, + "step": 50000 + }, + { + "epoch": 5.390835579514825, + "eval_accuracy": 0.3820361572907011, + "eval_loss": 3.4000582695007324, + "eval_runtime": 185.5499, + "eval_samples_per_second": 97.068, + "eval_steps_per_second": 6.068, + "step": 50000 + }, + { + "epoch": 5.39622641509434, + "grad_norm": 0.6794703602790833, + "learning_rate": 0.00027664759848893683, + "loss": 3.3215, + "step": 50050 + }, + { + "epoch": 5.401617250673855, + "grad_norm": 0.7536985874176025, + "learning_rate": 0.00027632379924446844, + "loss": 3.3188, + "step": 50100 + }, + { + "epoch": 5.407008086253369, + "grad_norm": 0.693673312664032, + "learning_rate": 0.000276, + "loss": 3.3313, + "step": 50150 + }, + { + "epoch": 5.412398921832884, + "grad_norm": 0.750146746635437, + "learning_rate": 0.00027567620075553154, + "loss": 3.3299, + "step": 50200 + }, + { + "epoch": 5.4177897574123985, + "grad_norm": 0.6878756880760193, + "learning_rate": 0.0002753524015110631, + "loss": 3.3373, + "step": 50250 + }, + { + "epoch": 5.423180592991914, + "grad_norm": 0.6811180710792542, + "learning_rate": 0.0002750286022665947, + "loss": 3.3215, + "step": 50300 + }, + { + "epoch": 5.428571428571429, + "grad_norm": 0.722919762134552, + "learning_rate": 0.00027470480302212624, + "loss": 3.3506, + "step": 50350 + }, + { + "epoch": 5.433962264150943, + "grad_norm": 0.7224785089492798, + "learning_rate": 0.0002743874797625472, + "loss": 3.3321, + "step": 50400 + }, + { + "epoch": 5.439353099730458, + "grad_norm": 0.7404599785804749, + "learning_rate": 0.0002740636805180788, + "loss": 3.3371, + "step": 50450 + }, + { + "epoch": 5.444743935309973, + "grad_norm": 0.7081565260887146, + "learning_rate": 0.00027373988127361035, + "loss": 3.325, + "step": 50500 + }, + { + "epoch": 5.450134770889488, + "grad_norm": 0.7478460669517517, + "learning_rate": 0.0002734160820291419, + "loss": 3.3324, + "step": 50550 + }, + { + "epoch": 5.455525606469003, + "grad_norm": 0.715099036693573, + "learning_rate": 0.00027309228278467345, + "loss": 3.3411, + "step": 50600 + }, + { + "epoch": 5.460916442048518, + "grad_norm": 0.6973506808280945, + "learning_rate": 0.00027276848354020506, + "loss": 3.3541, + "step": 50650 + }, + { + "epoch": 5.466307277628032, + "grad_norm": 0.800144612789154, + "learning_rate": 0.0002724446842957366, + "loss": 3.3159, + "step": 50700 + }, + { + "epoch": 5.471698113207547, + "grad_norm": 0.721748948097229, + "learning_rate": 0.0002721208850512682, + "loss": 3.3341, + "step": 50750 + }, + { + "epoch": 5.4770889487870615, + "grad_norm": 0.7428343296051025, + "learning_rate": 0.00027179708580679976, + "loss": 3.3349, + "step": 50800 + }, + { + "epoch": 5.482479784366577, + "grad_norm": 0.6495864391326904, + "learning_rate": 0.0002714732865623313, + "loss": 3.3282, + "step": 50850 + }, + { + "epoch": 5.487870619946092, + "grad_norm": 0.7069242000579834, + "learning_rate": 0.0002711494873178629, + "loss": 3.325, + "step": 50900 + }, + { + "epoch": 5.493261455525606, + "grad_norm": 0.6868419647216797, + "learning_rate": 0.00027082568807339447, + "loss": 3.3423, + "step": 50950 + }, + { + "epoch": 5.498652291105121, + "grad_norm": 0.7288803458213806, + "learning_rate": 0.000270501888828926, + "loss": 3.3445, + "step": 51000 + }, + { + "epoch": 5.498652291105121, + "eval_accuracy": 0.3819953038048829, + "eval_loss": 3.3993842601776123, + "eval_runtime": 185.485, + "eval_samples_per_second": 97.102, + "eval_steps_per_second": 6.071, + "step": 51000 + }, + { + "epoch": 5.504043126684636, + "grad_norm": 0.7231615781784058, + "learning_rate": 0.0002701780895844576, + "loss": 3.3506, + "step": 51050 + }, + { + "epoch": 5.509433962264151, + "grad_norm": 0.7433888912200928, + "learning_rate": 0.0002698542903399892, + "loss": 3.3371, + "step": 51100 + }, + { + "epoch": 5.514824797843666, + "grad_norm": 0.6915830969810486, + "learning_rate": 0.0002695304910955207, + "loss": 3.3342, + "step": 51150 + }, + { + "epoch": 5.520215633423181, + "grad_norm": 0.6907735466957092, + "learning_rate": 0.00026920669185105233, + "loss": 3.3171, + "step": 51200 + }, + { + "epoch": 5.525606469002695, + "grad_norm": 0.7327287197113037, + "learning_rate": 0.0002688828926065839, + "loss": 3.3385, + "step": 51250 + }, + { + "epoch": 5.53099730458221, + "grad_norm": 0.6716380715370178, + "learning_rate": 0.0002685590933621155, + "loss": 3.3296, + "step": 51300 + }, + { + "epoch": 5.536388140161725, + "grad_norm": 0.7345424890518188, + "learning_rate": 0.00026823529411764704, + "loss": 3.3215, + "step": 51350 + }, + { + "epoch": 5.54177897574124, + "grad_norm": 0.7785588502883911, + "learning_rate": 0.0002679114948731786, + "loss": 3.3123, + "step": 51400 + }, + { + "epoch": 5.547169811320755, + "grad_norm": 0.7506522536277771, + "learning_rate": 0.0002675876956287102, + "loss": 3.3115, + "step": 51450 + }, + { + "epoch": 5.55256064690027, + "grad_norm": 0.7358080744743347, + "learning_rate": 0.00026726389638424174, + "loss": 3.342, + "step": 51500 + }, + { + "epoch": 5.557951482479784, + "grad_norm": 0.7009307146072388, + "learning_rate": 0.0002669400971397733, + "loss": 3.3456, + "step": 51550 + }, + { + "epoch": 5.563342318059299, + "grad_norm": 0.6749496459960938, + "learning_rate": 0.0002666162978953049, + "loss": 3.3201, + "step": 51600 + }, + { + "epoch": 5.568733153638814, + "grad_norm": 0.7447863221168518, + "learning_rate": 0.00026629249865083645, + "loss": 3.336, + "step": 51650 + }, + { + "epoch": 5.574123989218329, + "grad_norm": 0.7776146531105042, + "learning_rate": 0.00026596869940636805, + "loss": 3.3398, + "step": 51700 + }, + { + "epoch": 5.579514824797844, + "grad_norm": 0.6787426471710205, + "learning_rate": 0.0002656449001618996, + "loss": 3.3314, + "step": 51750 + }, + { + "epoch": 5.584905660377358, + "grad_norm": 0.6856850385665894, + "learning_rate": 0.0002653211009174312, + "loss": 3.3232, + "step": 51800 + }, + { + "epoch": 5.590296495956873, + "grad_norm": 0.7217938899993896, + "learning_rate": 0.0002649973016729627, + "loss": 3.332, + "step": 51850 + }, + { + "epoch": 5.595687331536388, + "grad_norm": 0.7498999238014221, + "learning_rate": 0.0002646735024284943, + "loss": 3.3218, + "step": 51900 + }, + { + "epoch": 5.601078167115903, + "grad_norm": 0.7334463596343994, + "learning_rate": 0.00026434970318402586, + "loss": 3.339, + "step": 51950 + }, + { + "epoch": 5.606469002695418, + "grad_norm": 0.6884651780128479, + "learning_rate": 0.00026402590393955746, + "loss": 3.3169, + "step": 52000 + }, + { + "epoch": 5.606469002695418, + "eval_accuracy": 0.3826099531912494, + "eval_loss": 3.392883777618408, + "eval_runtime": 185.4171, + "eval_samples_per_second": 97.138, + "eval_steps_per_second": 6.073, + "step": 52000 + }, + { + "epoch": 5.611859838274933, + "grad_norm": 0.782909095287323, + "learning_rate": 0.000263702104695089, + "loss": 3.3343, + "step": 52050 + }, + { + "epoch": 5.617250673854447, + "grad_norm": 0.7251365184783936, + "learning_rate": 0.0002633783054506206, + "loss": 3.3271, + "step": 52100 + }, + { + "epoch": 5.622641509433962, + "grad_norm": 0.72756427526474, + "learning_rate": 0.00026305450620615217, + "loss": 3.3473, + "step": 52150 + }, + { + "epoch": 5.628032345013477, + "grad_norm": 0.7529804706573486, + "learning_rate": 0.0002627307069616837, + "loss": 3.3497, + "step": 52200 + }, + { + "epoch": 5.633423180592992, + "grad_norm": 0.6761258244514465, + "learning_rate": 0.0002624069077172153, + "loss": 3.3294, + "step": 52250 + }, + { + "epoch": 5.638814016172507, + "grad_norm": 0.7436754107475281, + "learning_rate": 0.0002620831084727469, + "loss": 3.3182, + "step": 52300 + }, + { + "epoch": 5.644204851752022, + "grad_norm": 0.7272395491600037, + "learning_rate": 0.0002617593092282784, + "loss": 3.3417, + "step": 52350 + }, + { + "epoch": 5.649595687331536, + "grad_norm": 0.6948999762535095, + "learning_rate": 0.00026143550998381003, + "loss": 3.3186, + "step": 52400 + }, + { + "epoch": 5.654986522911051, + "grad_norm": 0.7938417792320251, + "learning_rate": 0.0002611117107393416, + "loss": 3.3372, + "step": 52450 + }, + { + "epoch": 5.660377358490566, + "grad_norm": 0.7938036918640137, + "learning_rate": 0.00026078791149487313, + "loss": 3.3445, + "step": 52500 + }, + { + "epoch": 5.665768194070081, + "grad_norm": 0.669804036617279, + "learning_rate": 0.00026046411225040474, + "loss": 3.3305, + "step": 52550 + }, + { + "epoch": 5.671159029649596, + "grad_norm": 0.7671605348587036, + "learning_rate": 0.0002601403130059363, + "loss": 3.3441, + "step": 52600 + }, + { + "epoch": 5.67654986522911, + "grad_norm": 0.7261316180229187, + "learning_rate": 0.0002598165137614679, + "loss": 3.3439, + "step": 52650 + }, + { + "epoch": 5.681940700808625, + "grad_norm": 0.6810511946678162, + "learning_rate": 0.00025949271451699944, + "loss": 3.3326, + "step": 52700 + }, + { + "epoch": 5.6873315363881405, + "grad_norm": 0.7681779265403748, + "learning_rate": 0.000259168915272531, + "loss": 3.3468, + "step": 52750 + }, + { + "epoch": 5.692722371967655, + "grad_norm": 0.7268847227096558, + "learning_rate": 0.0002588451160280626, + "loss": 3.3262, + "step": 52800 + }, + { + "epoch": 5.69811320754717, + "grad_norm": 0.7206116914749146, + "learning_rate": 0.00025852131678359415, + "loss": 3.3247, + "step": 52850 + }, + { + "epoch": 5.703504043126685, + "grad_norm": 0.7164592146873474, + "learning_rate": 0.0002581975175391257, + "loss": 3.3254, + "step": 52900 + }, + { + "epoch": 5.708894878706199, + "grad_norm": 0.7586601972579956, + "learning_rate": 0.0002578737182946573, + "loss": 3.3434, + "step": 52950 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 0.786045491695404, + "learning_rate": 0.00025754991905018885, + "loss": 3.333, + "step": 53000 + }, + { + "epoch": 5.714285714285714, + "eval_accuracy": 0.38318516157933924, + "eval_loss": 3.3877861499786377, + "eval_runtime": 185.5745, + "eval_samples_per_second": 97.055, + "eval_steps_per_second": 6.068, + "step": 53000 + }, + { + "epoch": 5.719676549865229, + "grad_norm": 0.6914837956428528, + "learning_rate": 0.00025722611980572046, + "loss": 3.311, + "step": 53050 + }, + { + "epoch": 5.725067385444744, + "grad_norm": 0.767452597618103, + "learning_rate": 0.000256902320561252, + "loss": 3.3264, + "step": 53100 + }, + { + "epoch": 5.730458221024259, + "grad_norm": 0.7509741187095642, + "learning_rate": 0.00025657852131678356, + "loss": 3.333, + "step": 53150 + }, + { + "epoch": 5.735849056603773, + "grad_norm": 0.735542356967926, + "learning_rate": 0.0002562547220723151, + "loss": 3.3335, + "step": 53200 + }, + { + "epoch": 5.741239892183288, + "grad_norm": 0.709091067314148, + "learning_rate": 0.0002559309228278467, + "loss": 3.3245, + "step": 53250 + }, + { + "epoch": 5.7466307277628035, + "grad_norm": 0.741787850856781, + "learning_rate": 0.00025560712358337827, + "loss": 3.3504, + "step": 53300 + }, + { + "epoch": 5.752021563342318, + "grad_norm": 0.7668866515159607, + "learning_rate": 0.00025528332433890987, + "loss": 3.3277, + "step": 53350 + }, + { + "epoch": 5.757412398921833, + "grad_norm": 0.7090975046157837, + "learning_rate": 0.0002549595250944414, + "loss": 3.3134, + "step": 53400 + }, + { + "epoch": 5.762803234501348, + "grad_norm": 0.7142593860626221, + "learning_rate": 0.000254635725849973, + "loss": 3.3448, + "step": 53450 + }, + { + "epoch": 5.768194070080862, + "grad_norm": 0.7309287786483765, + "learning_rate": 0.0002543119266055046, + "loss": 3.3142, + "step": 53500 + }, + { + "epoch": 5.773584905660377, + "grad_norm": 0.7920538783073425, + "learning_rate": 0.00025398812736103613, + "loss": 3.3432, + "step": 53550 + }, + { + "epoch": 5.7789757412398925, + "grad_norm": 0.7171128392219543, + "learning_rate": 0.0002536643281165677, + "loss": 3.3163, + "step": 53600 + }, + { + "epoch": 5.784366576819407, + "grad_norm": 0.6814385056495667, + "learning_rate": 0.0002533405288720993, + "loss": 3.3494, + "step": 53650 + }, + { + "epoch": 5.789757412398922, + "grad_norm": 0.7790846228599548, + "learning_rate": 0.00025301672962763083, + "loss": 3.3204, + "step": 53700 + }, + { + "epoch": 5.795148247978437, + "grad_norm": 0.7661272287368774, + "learning_rate": 0.00025269293038316244, + "loss": 3.3356, + "step": 53750 + }, + { + "epoch": 5.800539083557951, + "grad_norm": 0.7544566988945007, + "learning_rate": 0.000252369131138694, + "loss": 3.3269, + "step": 53800 + }, + { + "epoch": 5.8059299191374665, + "grad_norm": 0.7239667177200317, + "learning_rate": 0.00025204533189422554, + "loss": 3.3316, + "step": 53850 + }, + { + "epoch": 5.811320754716981, + "grad_norm": 0.7095721364021301, + "learning_rate": 0.00025172153264975714, + "loss": 3.3212, + "step": 53900 + }, + { + "epoch": 5.816711590296496, + "grad_norm": 0.7116101384162903, + "learning_rate": 0.0002513977334052887, + "loss": 3.3163, + "step": 53950 + }, + { + "epoch": 5.822102425876011, + "grad_norm": 0.7057174444198608, + "learning_rate": 0.00025107393416082025, + "loss": 3.331, + "step": 54000 + }, + { + "epoch": 5.822102425876011, + "eval_accuracy": 0.3834898242767711, + "eval_loss": 3.384427070617676, + "eval_runtime": 185.8424, + "eval_samples_per_second": 96.915, + "eval_steps_per_second": 6.059, + "step": 54000 + }, + { + "epoch": 5.827493261455525, + "grad_norm": 0.7220545411109924, + "learning_rate": 0.00025075013491635185, + "loss": 3.3434, + "step": 54050 + }, + { + "epoch": 5.83288409703504, + "grad_norm": 0.6828851103782654, + "learning_rate": 0.0002504263356718834, + "loss": 3.3495, + "step": 54100 + }, + { + "epoch": 5.8382749326145555, + "grad_norm": 0.7387149333953857, + "learning_rate": 0.000250102536427415, + "loss": 3.323, + "step": 54150 + }, + { + "epoch": 5.84366576819407, + "grad_norm": 0.7202645540237427, + "learning_rate": 0.00024977873718294656, + "loss": 3.3336, + "step": 54200 + }, + { + "epoch": 5.849056603773585, + "grad_norm": 0.7070654630661011, + "learning_rate": 0.0002494549379384781, + "loss": 3.3618, + "step": 54250 + }, + { + "epoch": 5.8544474393531, + "grad_norm": 0.71637362241745, + "learning_rate": 0.0002491311386940097, + "loss": 3.3402, + "step": 54300 + }, + { + "epoch": 5.859838274932614, + "grad_norm": 0.6806088089942932, + "learning_rate": 0.00024880733944954126, + "loss": 3.3416, + "step": 54350 + }, + { + "epoch": 5.8652291105121295, + "grad_norm": 0.73921799659729, + "learning_rate": 0.0002484835402050728, + "loss": 3.3452, + "step": 54400 + }, + { + "epoch": 5.870619946091644, + "grad_norm": 0.7796598076820374, + "learning_rate": 0.0002481597409606044, + "loss": 3.3176, + "step": 54450 + }, + { + "epoch": 5.876010781671159, + "grad_norm": 0.7413299679756165, + "learning_rate": 0.00024783594171613597, + "loss": 3.3442, + "step": 54500 + }, + { + "epoch": 5.881401617250674, + "grad_norm": 0.722176194190979, + "learning_rate": 0.0002475121424716675, + "loss": 3.332, + "step": 54550 + }, + { + "epoch": 5.886792452830189, + "grad_norm": 0.6924837231636047, + "learning_rate": 0.0002471883432271991, + "loss": 3.3228, + "step": 54600 + }, + { + "epoch": 5.892183288409703, + "grad_norm": 0.7435639500617981, + "learning_rate": 0.0002468645439827307, + "loss": 3.3468, + "step": 54650 + }, + { + "epoch": 5.8975741239892185, + "grad_norm": 0.7494300007820129, + "learning_rate": 0.0002465407447382623, + "loss": 3.3424, + "step": 54700 + }, + { + "epoch": 5.902964959568733, + "grad_norm": 0.7710332274436951, + "learning_rate": 0.00024621694549379383, + "loss": 3.3272, + "step": 54750 + }, + { + "epoch": 5.908355795148248, + "grad_norm": 0.7281633615493774, + "learning_rate": 0.00024589314624932543, + "loss": 3.3583, + "step": 54800 + }, + { + "epoch": 5.913746630727763, + "grad_norm": 0.7332462072372437, + "learning_rate": 0.00024556934700485693, + "loss": 3.3256, + "step": 54850 + }, + { + "epoch": 5.919137466307277, + "grad_norm": 0.7332950234413147, + "learning_rate": 0.00024524554776038853, + "loss": 3.3547, + "step": 54900 + }, + { + "epoch": 5.9245283018867925, + "grad_norm": 0.7006910443305969, + "learning_rate": 0.0002449217485159201, + "loss": 3.3328, + "step": 54950 + }, + { + "epoch": 5.929919137466308, + "grad_norm": 0.8082262277603149, + "learning_rate": 0.0002445979492714517, + "loss": 3.3342, + "step": 55000 + }, + { + "epoch": 5.929919137466308, + "eval_accuracy": 0.38402233207994985, + "eval_loss": 3.380251884460449, + "eval_runtime": 185.4976, + "eval_samples_per_second": 97.096, + "eval_steps_per_second": 6.07, + "step": 55000 + }, + { + "epoch": 5.935309973045822, + "grad_norm": 0.7710868716239929, + "learning_rate": 0.00024427415002698324, + "loss": 3.3547, + "step": 55050 + }, + { + "epoch": 5.940700808625337, + "grad_norm": 0.7845757007598877, + "learning_rate": 0.00024395035078251482, + "loss": 3.3294, + "step": 55100 + }, + { + "epoch": 5.946091644204852, + "grad_norm": 0.7587760090827942, + "learning_rate": 0.0002436265515380464, + "loss": 3.3422, + "step": 55150 + }, + { + "epoch": 5.951482479784366, + "grad_norm": 0.7302002310752869, + "learning_rate": 0.00024330275229357797, + "loss": 3.3277, + "step": 55200 + }, + { + "epoch": 5.9568733153638815, + "grad_norm": 0.7525575757026672, + "learning_rate": 0.00024297895304910952, + "loss": 3.3126, + "step": 55250 + }, + { + "epoch": 5.962264150943396, + "grad_norm": 0.719638466835022, + "learning_rate": 0.00024266162978953048, + "loss": 3.3299, + "step": 55300 + }, + { + "epoch": 5.967654986522911, + "grad_norm": 0.7314834594726562, + "learning_rate": 0.00024233783054506203, + "loss": 3.3279, + "step": 55350 + }, + { + "epoch": 5.973045822102426, + "grad_norm": 0.6763395071029663, + "learning_rate": 0.0002420140313005936, + "loss": 3.3395, + "step": 55400 + }, + { + "epoch": 5.97843665768194, + "grad_norm": 0.7260298132896423, + "learning_rate": 0.00024169023205612518, + "loss": 3.3475, + "step": 55450 + }, + { + "epoch": 5.9838274932614555, + "grad_norm": 0.7525773644447327, + "learning_rate": 0.00024136643281165676, + "loss": 3.3146, + "step": 55500 + }, + { + "epoch": 5.989218328840971, + "grad_norm": 0.7207843065261841, + "learning_rate": 0.00024104263356718834, + "loss": 3.3359, + "step": 55550 + }, + { + "epoch": 5.994609164420485, + "grad_norm": 0.698942244052887, + "learning_rate": 0.0002407188343227199, + "loss": 3.349, + "step": 55600 + }, + { + "epoch": 6.0, + "grad_norm": 1.4941537380218506, + "learning_rate": 0.00024039503507825147, + "loss": 3.3174, + "step": 55650 + }, + { + "epoch": 6.005390835579515, + "grad_norm": 0.7332544326782227, + "learning_rate": 0.00024007123583378302, + "loss": 3.2308, + "step": 55700 + }, + { + "epoch": 6.010781671159029, + "grad_norm": 0.7470399737358093, + "learning_rate": 0.0002397474365893146, + "loss": 3.2343, + "step": 55750 + }, + { + "epoch": 6.0161725067385445, + "grad_norm": 0.7069420218467712, + "learning_rate": 0.00023942363734484617, + "loss": 3.2661, + "step": 55800 + }, + { + "epoch": 6.02156334231806, + "grad_norm": 0.7124414443969727, + "learning_rate": 0.00023909983810037775, + "loss": 3.2322, + "step": 55850 + }, + { + "epoch": 6.026954177897574, + "grad_norm": 0.7617751955986023, + "learning_rate": 0.00023877603885590933, + "loss": 3.2335, + "step": 55900 + }, + { + "epoch": 6.032345013477089, + "grad_norm": 0.7374067902565002, + "learning_rate": 0.0002384522396114409, + "loss": 3.2608, + "step": 55950 + }, + { + "epoch": 6.037735849056604, + "grad_norm": 0.7492246031761169, + "learning_rate": 0.00023812844036697248, + "loss": 3.2353, + "step": 56000 + }, + { + "epoch": 6.037735849056604, + "eval_accuracy": 0.384033957938946, + "eval_loss": 3.384352684020996, + "eval_runtime": 185.3605, + "eval_samples_per_second": 97.167, + "eval_steps_per_second": 6.075, + "step": 56000 + }, + { + "epoch": 6.0431266846361185, + "grad_norm": 0.7165188193321228, + "learning_rate": 0.000237804641122504, + "loss": 3.2517, + "step": 56050 + }, + { + "epoch": 6.048517520215634, + "grad_norm": 0.7443341612815857, + "learning_rate": 0.00023748084187803558, + "loss": 3.2432, + "step": 56100 + }, + { + "epoch": 6.053908355795148, + "grad_norm": 0.731981098651886, + "learning_rate": 0.00023715704263356716, + "loss": 3.2602, + "step": 56150 + }, + { + "epoch": 6.059299191374663, + "grad_norm": 0.7504057288169861, + "learning_rate": 0.00023683324338909874, + "loss": 3.2583, + "step": 56200 + }, + { + "epoch": 6.064690026954178, + "grad_norm": 0.7301583290100098, + "learning_rate": 0.00023650944414463032, + "loss": 3.2514, + "step": 56250 + }, + { + "epoch": 6.070080862533692, + "grad_norm": 0.7722079753875732, + "learning_rate": 0.0002361856449001619, + "loss": 3.2588, + "step": 56300 + }, + { + "epoch": 6.0754716981132075, + "grad_norm": 0.7549823522567749, + "learning_rate": 0.00023586184565569347, + "loss": 3.2598, + "step": 56350 + }, + { + "epoch": 6.080862533692723, + "grad_norm": 0.7237383723258972, + "learning_rate": 0.00023553804641122502, + "loss": 3.2725, + "step": 56400 + }, + { + "epoch": 6.086253369272237, + "grad_norm": 0.7340690493583679, + "learning_rate": 0.00023521424716675657, + "loss": 3.2511, + "step": 56450 + }, + { + "epoch": 6.091644204851752, + "grad_norm": 0.7521067261695862, + "learning_rate": 0.00023489044792228815, + "loss": 3.2467, + "step": 56500 + }, + { + "epoch": 6.097035040431267, + "grad_norm": 0.7757521271705627, + "learning_rate": 0.00023456664867781973, + "loss": 3.2494, + "step": 56550 + }, + { + "epoch": 6.1024258760107815, + "grad_norm": 0.7578301429748535, + "learning_rate": 0.0002342428494333513, + "loss": 3.2627, + "step": 56600 + }, + { + "epoch": 6.107816711590297, + "grad_norm": 0.7615374326705933, + "learning_rate": 0.00023391905018888288, + "loss": 3.2605, + "step": 56650 + }, + { + "epoch": 6.113207547169812, + "grad_norm": 0.7228667736053467, + "learning_rate": 0.00023359525094441443, + "loss": 3.2608, + "step": 56700 + }, + { + "epoch": 6.118598382749326, + "grad_norm": 0.7355426549911499, + "learning_rate": 0.000233271451699946, + "loss": 3.2673, + "step": 56750 + }, + { + "epoch": 6.123989218328841, + "grad_norm": 0.7165455222129822, + "learning_rate": 0.0002329476524554776, + "loss": 3.2662, + "step": 56800 + }, + { + "epoch": 6.129380053908355, + "grad_norm": 0.7375234961509705, + "learning_rate": 0.00023262385321100917, + "loss": 3.2589, + "step": 56850 + }, + { + "epoch": 6.1347708894878705, + "grad_norm": 0.74793541431427, + "learning_rate": 0.00023230005396654072, + "loss": 3.2684, + "step": 56900 + }, + { + "epoch": 6.140161725067386, + "grad_norm": 0.7644237279891968, + "learning_rate": 0.0002319762547220723, + "loss": 3.2657, + "step": 56950 + }, + { + "epoch": 6.1455525606469, + "grad_norm": 0.7800267934799194, + "learning_rate": 0.00023165245547760387, + "loss": 3.2567, + "step": 57000 + }, + { + "epoch": 6.1455525606469, + "eval_accuracy": 0.38427158180459625, + "eval_loss": 3.3837149143218994, + "eval_runtime": 185.459, + "eval_samples_per_second": 97.116, + "eval_steps_per_second": 6.071, + "step": 57000 + }, + { + "epoch": 6.150943396226415, + "grad_norm": 0.7396030426025391, + "learning_rate": 0.00023132865623313542, + "loss": 3.2724, + "step": 57050 + }, + { + "epoch": 6.15633423180593, + "grad_norm": 0.7594435811042786, + "learning_rate": 0.000231004856988667, + "loss": 3.2756, + "step": 57100 + }, + { + "epoch": 6.1617250673854445, + "grad_norm": 0.7593262195587158, + "learning_rate": 0.00023068105774419858, + "loss": 3.2731, + "step": 57150 + }, + { + "epoch": 6.16711590296496, + "grad_norm": 0.7153212428092957, + "learning_rate": 0.00023035725849973016, + "loss": 3.2591, + "step": 57200 + }, + { + "epoch": 6.172506738544475, + "grad_norm": 0.7407529354095459, + "learning_rate": 0.00023003345925526173, + "loss": 3.2671, + "step": 57250 + }, + { + "epoch": 6.177897574123989, + "grad_norm": 0.7409720420837402, + "learning_rate": 0.00022970966001079328, + "loss": 3.2746, + "step": 57300 + }, + { + "epoch": 6.183288409703504, + "grad_norm": 0.7745881676673889, + "learning_rate": 0.00022938586076632484, + "loss": 3.2796, + "step": 57350 + }, + { + "epoch": 6.188679245283019, + "grad_norm": 0.8050203919410706, + "learning_rate": 0.0002290620615218564, + "loss": 3.2638, + "step": 57400 + }, + { + "epoch": 6.1940700808625335, + "grad_norm": 0.7250150442123413, + "learning_rate": 0.000228738262277388, + "loss": 3.2607, + "step": 57450 + }, + { + "epoch": 6.199460916442049, + "grad_norm": 0.8042988777160645, + "learning_rate": 0.00022841446303291957, + "loss": 3.2757, + "step": 57500 + }, + { + "epoch": 6.204851752021563, + "grad_norm": 0.7667548060417175, + "learning_rate": 0.00022809066378845115, + "loss": 3.2992, + "step": 57550 + }, + { + "epoch": 6.210242587601078, + "grad_norm": 0.7277979850769043, + "learning_rate": 0.00022776686454398272, + "loss": 3.2697, + "step": 57600 + }, + { + "epoch": 6.215633423180593, + "grad_norm": 0.7964656949043274, + "learning_rate": 0.0002274430652995143, + "loss": 3.2737, + "step": 57650 + }, + { + "epoch": 6.2210242587601075, + "grad_norm": 0.7558398842811584, + "learning_rate": 0.00022711926605504588, + "loss": 3.2685, + "step": 57700 + }, + { + "epoch": 6.226415094339623, + "grad_norm": 0.7219595313072205, + "learning_rate": 0.0002267954668105774, + "loss": 3.2591, + "step": 57750 + }, + { + "epoch": 6.231805929919138, + "grad_norm": 0.7618371248245239, + "learning_rate": 0.00022647166756610898, + "loss": 3.2666, + "step": 57800 + }, + { + "epoch": 6.237196765498652, + "grad_norm": 0.7510839700698853, + "learning_rate": 0.00022614786832164056, + "loss": 3.2738, + "step": 57850 + }, + { + "epoch": 6.242587601078167, + "grad_norm": 0.736240565776825, + "learning_rate": 0.00022582406907717214, + "loss": 3.2727, + "step": 57900 + }, + { + "epoch": 6.247978436657682, + "grad_norm": 0.7617259621620178, + "learning_rate": 0.0002255002698327037, + "loss": 3.2813, + "step": 57950 + }, + { + "epoch": 6.2533692722371965, + "grad_norm": 0.7635646462440491, + "learning_rate": 0.0002251764705882353, + "loss": 3.2751, + "step": 58000 + }, + { + "epoch": 6.2533692722371965, + "eval_accuracy": 0.3847077144962836, + "eval_loss": 3.3810763359069824, + "eval_runtime": 185.3536, + "eval_samples_per_second": 97.171, + "eval_steps_per_second": 6.075, + "step": 58000 + }, + { + "epoch": 6.258760107816712, + "grad_norm": 0.7703520059585571, + "learning_rate": 0.00022485267134376687, + "loss": 3.2669, + "step": 58050 + }, + { + "epoch": 6.264150943396227, + "grad_norm": 0.7938615679740906, + "learning_rate": 0.00022452887209929842, + "loss": 3.2608, + "step": 58100 + }, + { + "epoch": 6.269541778975741, + "grad_norm": 0.7358196973800659, + "learning_rate": 0.00022420507285482997, + "loss": 3.2608, + "step": 58150 + }, + { + "epoch": 6.274932614555256, + "grad_norm": 0.7554171085357666, + "learning_rate": 0.00022388127361036155, + "loss": 3.2987, + "step": 58200 + }, + { + "epoch": 6.280323450134771, + "grad_norm": 0.7923507690429688, + "learning_rate": 0.00022355747436589312, + "loss": 3.2738, + "step": 58250 + }, + { + "epoch": 6.285714285714286, + "grad_norm": 0.7709658145904541, + "learning_rate": 0.0002232336751214247, + "loss": 3.2603, + "step": 58300 + }, + { + "epoch": 6.291105121293801, + "grad_norm": 0.7847428917884827, + "learning_rate": 0.00022290987587695628, + "loss": 3.2872, + "step": 58350 + }, + { + "epoch": 6.296495956873315, + "grad_norm": 0.7624788284301758, + "learning_rate": 0.00022258607663248783, + "loss": 3.2701, + "step": 58400 + }, + { + "epoch": 6.30188679245283, + "grad_norm": 0.7347487211227417, + "learning_rate": 0.0002222622773880194, + "loss": 3.2768, + "step": 58450 + }, + { + "epoch": 6.307277628032345, + "grad_norm": 0.8133580088615417, + "learning_rate": 0.00022193847814355099, + "loss": 3.2917, + "step": 58500 + }, + { + "epoch": 6.3126684636118595, + "grad_norm": 0.7164966464042664, + "learning_rate": 0.00022161467889908256, + "loss": 3.253, + "step": 58550 + }, + { + "epoch": 6.318059299191375, + "grad_norm": 0.7584646940231323, + "learning_rate": 0.00022129087965461411, + "loss": 3.2864, + "step": 58600 + }, + { + "epoch": 6.32345013477089, + "grad_norm": 0.7676268815994263, + "learning_rate": 0.0002209670804101457, + "loss": 3.2635, + "step": 58650 + }, + { + "epoch": 6.328840970350404, + "grad_norm": 0.7437969446182251, + "learning_rate": 0.00022064328116567724, + "loss": 3.2665, + "step": 58700 + }, + { + "epoch": 6.334231805929919, + "grad_norm": 0.7605310082435608, + "learning_rate": 0.00022031948192120882, + "loss": 3.2876, + "step": 58750 + }, + { + "epoch": 6.339622641509434, + "grad_norm": 0.7673757076263428, + "learning_rate": 0.0002199956826767404, + "loss": 3.2797, + "step": 58800 + }, + { + "epoch": 6.345013477088949, + "grad_norm": 0.7611026167869568, + "learning_rate": 0.00021967188343227198, + "loss": 3.2833, + "step": 58850 + }, + { + "epoch": 6.350404312668464, + "grad_norm": 0.7772629857063293, + "learning_rate": 0.00021934808418780355, + "loss": 3.2885, + "step": 58900 + }, + { + "epoch": 6.355795148247978, + "grad_norm": 0.7738526463508606, + "learning_rate": 0.00021902428494333513, + "loss": 3.2982, + "step": 58950 + }, + { + "epoch": 6.361185983827493, + "grad_norm": 0.766217052936554, + "learning_rate": 0.00021870048569886665, + "loss": 3.2877, + "step": 59000 + }, + { + "epoch": 6.361185983827493, + "eval_accuracy": 0.38495500846894937, + "eval_loss": 3.376774787902832, + "eval_runtime": 185.626, + "eval_samples_per_second": 97.028, + "eval_steps_per_second": 6.066, + "step": 59000 + }, + { + "epoch": 6.366576819407008, + "grad_norm": 0.7514162659645081, + "learning_rate": 0.00021837668645439823, + "loss": 3.285, + "step": 59050 + }, + { + "epoch": 6.3719676549865225, + "grad_norm": 0.7757066488265991, + "learning_rate": 0.0002180528872099298, + "loss": 3.2815, + "step": 59100 + }, + { + "epoch": 6.377358490566038, + "grad_norm": 0.7878845930099487, + "learning_rate": 0.0002177290879654614, + "loss": 3.2825, + "step": 59150 + }, + { + "epoch": 6.382749326145553, + "grad_norm": 0.7204424142837524, + "learning_rate": 0.00021740528872099296, + "loss": 3.2754, + "step": 59200 + }, + { + "epoch": 6.388140161725067, + "grad_norm": 0.776679515838623, + "learning_rate": 0.00021708148947652454, + "loss": 3.2768, + "step": 59250 + }, + { + "epoch": 6.393530997304582, + "grad_norm": 0.7729427218437195, + "learning_rate": 0.0002167641662169455, + "loss": 3.2716, + "step": 59300 + }, + { + "epoch": 6.398921832884097, + "grad_norm": 0.7658233642578125, + "learning_rate": 0.00021644036697247702, + "loss": 3.2639, + "step": 59350 + }, + { + "epoch": 6.404312668463612, + "grad_norm": 0.7380334138870239, + "learning_rate": 0.0002161165677280086, + "loss": 3.2524, + "step": 59400 + }, + { + "epoch": 6.409703504043127, + "grad_norm": 0.7924899458885193, + "learning_rate": 0.00021579276848354017, + "loss": 3.2818, + "step": 59450 + }, + { + "epoch": 6.415094339622642, + "grad_norm": 0.7978893518447876, + "learning_rate": 0.00021546896923907175, + "loss": 3.2748, + "step": 59500 + }, + { + "epoch": 6.420485175202156, + "grad_norm": 0.7716631889343262, + "learning_rate": 0.00021514516999460333, + "loss": 3.2888, + "step": 59550 + }, + { + "epoch": 6.425876010781671, + "grad_norm": 0.7399250864982605, + "learning_rate": 0.0002148213707501349, + "loss": 3.2837, + "step": 59600 + }, + { + "epoch": 6.431266846361186, + "grad_norm": 0.7455103397369385, + "learning_rate": 0.00021449757150566648, + "loss": 3.264, + "step": 59650 + }, + { + "epoch": 6.436657681940701, + "grad_norm": 0.7875425815582275, + "learning_rate": 0.00021417377226119806, + "loss": 3.2622, + "step": 59700 + }, + { + "epoch": 6.442048517520216, + "grad_norm": 0.7843931913375854, + "learning_rate": 0.0002138499730167296, + "loss": 3.2782, + "step": 59750 + }, + { + "epoch": 6.44743935309973, + "grad_norm": 0.7751355171203613, + "learning_rate": 0.00021352617377226116, + "loss": 3.2843, + "step": 59800 + }, + { + "epoch": 6.452830188679245, + "grad_norm": 0.7746478319168091, + "learning_rate": 0.00021320237452779274, + "loss": 3.2896, + "step": 59850 + }, + { + "epoch": 6.45822102425876, + "grad_norm": 0.8506944179534912, + "learning_rate": 0.00021287857528332432, + "loss": 3.2784, + "step": 59900 + }, + { + "epoch": 6.463611859838275, + "grad_norm": 0.7042997479438782, + "learning_rate": 0.0002125547760388559, + "loss": 3.2759, + "step": 59950 + }, + { + "epoch": 6.46900269541779, + "grad_norm": 0.8357415199279785, + "learning_rate": 0.00021223097679438747, + "loss": 3.2882, + "step": 60000 + }, + { + "epoch": 6.46900269541779, + "eval_accuracy": 0.38543384174554773, + "eval_loss": 3.373171806335449, + "eval_runtime": 185.3306, + "eval_samples_per_second": 97.183, + "eval_steps_per_second": 6.076, + "step": 60000 + }, + { + "epoch": 6.474393530997305, + "grad_norm": 0.7789021134376526, + "learning_rate": 0.00021190717754991905, + "loss": 3.2919, + "step": 60050 + }, + { + "epoch": 6.479784366576819, + "grad_norm": 0.7376289963722229, + "learning_rate": 0.0002115833783054506, + "loss": 3.2784, + "step": 60100 + }, + { + "epoch": 6.485175202156334, + "grad_norm": 0.7288296222686768, + "learning_rate": 0.00021125957906098218, + "loss": 3.2763, + "step": 60150 + }, + { + "epoch": 6.490566037735849, + "grad_norm": 0.7110522389411926, + "learning_rate": 0.00021093577981651373, + "loss": 3.2611, + "step": 60200 + }, + { + "epoch": 6.495956873315364, + "grad_norm": 0.7700715661048889, + "learning_rate": 0.0002106119805720453, + "loss": 3.2893, + "step": 60250 + }, + { + "epoch": 6.501347708894879, + "grad_norm": 0.7219817638397217, + "learning_rate": 0.00021028818132757689, + "loss": 3.2778, + "step": 60300 + }, + { + "epoch": 6.506738544474393, + "grad_norm": 0.7898648977279663, + "learning_rate": 0.00020996438208310846, + "loss": 3.2772, + "step": 60350 + }, + { + "epoch": 6.512129380053908, + "grad_norm": 0.7658487558364868, + "learning_rate": 0.00020964058283864001, + "loss": 3.2698, + "step": 60400 + }, + { + "epoch": 6.517520215633423, + "grad_norm": 0.8089845180511475, + "learning_rate": 0.0002093167835941716, + "loss": 3.2709, + "step": 60450 + }, + { + "epoch": 6.5229110512129385, + "grad_norm": 0.8010355830192566, + "learning_rate": 0.00020899298434970317, + "loss": 3.2784, + "step": 60500 + }, + { + "epoch": 6.528301886792453, + "grad_norm": 0.7700260877609253, + "learning_rate": 0.00020866918510523475, + "loss": 3.2692, + "step": 60550 + }, + { + "epoch": 6.533692722371968, + "grad_norm": 0.7774828672409058, + "learning_rate": 0.00020834538586076632, + "loss": 3.279, + "step": 60600 + }, + { + "epoch": 6.539083557951482, + "grad_norm": 0.7341523170471191, + "learning_rate": 0.00020802158661629787, + "loss": 3.2821, + "step": 60650 + }, + { + "epoch": 6.544474393530997, + "grad_norm": 0.7971969246864319, + "learning_rate": 0.00020769778737182943, + "loss": 3.2581, + "step": 60700 + }, + { + "epoch": 6.549865229110512, + "grad_norm": 0.8375870585441589, + "learning_rate": 0.000207373988127361, + "loss": 3.2855, + "step": 60750 + }, + { + "epoch": 6.555256064690027, + "grad_norm": 0.7822794318199158, + "learning_rate": 0.00020705018888289258, + "loss": 3.3007, + "step": 60800 + }, + { + "epoch": 6.560646900269542, + "grad_norm": 0.7843235731124878, + "learning_rate": 0.00020672638963842416, + "loss": 3.2775, + "step": 60850 + }, + { + "epoch": 6.566037735849057, + "grad_norm": 0.7439322471618652, + "learning_rate": 0.00020640259039395574, + "loss": 3.2741, + "step": 60900 + }, + { + "epoch": 6.571428571428571, + "grad_norm": 0.7677661180496216, + "learning_rate": 0.00020607879114948731, + "loss": 3.2948, + "step": 60950 + }, + { + "epoch": 6.576819407008086, + "grad_norm": 0.7387002110481262, + "learning_rate": 0.0002057549919050189, + "loss": 3.2801, + "step": 61000 + }, + { + "epoch": 6.576819407008086, + "eval_accuracy": 0.3860143741251405, + "eval_loss": 3.3682520389556885, + "eval_runtime": 185.5315, + "eval_samples_per_second": 97.078, + "eval_steps_per_second": 6.069, + "step": 61000 + }, + { + "epoch": 6.5822102425876015, + "grad_norm": 0.7914436459541321, + "learning_rate": 0.00020543119266055041, + "loss": 3.294, + "step": 61050 + }, + { + "epoch": 6.587601078167116, + "grad_norm": 0.7589644193649292, + "learning_rate": 0.000205107393416082, + "loss": 3.2632, + "step": 61100 + }, + { + "epoch": 6.592991913746631, + "grad_norm": 0.7871928811073303, + "learning_rate": 0.00020479007015650294, + "loss": 3.3004, + "step": 61150 + }, + { + "epoch": 6.598382749326145, + "grad_norm": 0.7957470417022705, + "learning_rate": 0.00020446627091203452, + "loss": 3.2818, + "step": 61200 + }, + { + "epoch": 6.60377358490566, + "grad_norm": 0.7232425212860107, + "learning_rate": 0.0002041424716675661, + "loss": 3.2687, + "step": 61250 + }, + { + "epoch": 6.609164420485175, + "grad_norm": 0.7499574422836304, + "learning_rate": 0.00020381867242309768, + "loss": 3.2972, + "step": 61300 + }, + { + "epoch": 6.6145552560646905, + "grad_norm": 0.7162529826164246, + "learning_rate": 0.00020349487317862926, + "loss": 3.2937, + "step": 61350 + }, + { + "epoch": 6.619946091644205, + "grad_norm": 0.8264064788818359, + "learning_rate": 0.00020317107393416078, + "loss": 3.2797, + "step": 61400 + }, + { + "epoch": 6.62533692722372, + "grad_norm": 0.7366076111793518, + "learning_rate": 0.00020284727468969236, + "loss": 3.2673, + "step": 61450 + }, + { + "epoch": 6.630727762803234, + "grad_norm": 0.8459667563438416, + "learning_rate": 0.00020252347544522393, + "loss": 3.2902, + "step": 61500 + }, + { + "epoch": 6.636118598382749, + "grad_norm": 0.7387397885322571, + "learning_rate": 0.0002021996762007555, + "loss": 3.2755, + "step": 61550 + }, + { + "epoch": 6.6415094339622645, + "grad_norm": 0.7981934547424316, + "learning_rate": 0.0002018758769562871, + "loss": 3.2769, + "step": 61600 + }, + { + "epoch": 6.646900269541779, + "grad_norm": 0.7583616971969604, + "learning_rate": 0.00020155207771181867, + "loss": 3.2867, + "step": 61650 + }, + { + "epoch": 6.652291105121294, + "grad_norm": 0.8009341359138489, + "learning_rate": 0.00020122827846735024, + "loss": 3.2969, + "step": 61700 + }, + { + "epoch": 6.657681940700809, + "grad_norm": 0.7538726329803467, + "learning_rate": 0.00020090447922288182, + "loss": 3.2853, + "step": 61750 + }, + { + "epoch": 6.663072776280323, + "grad_norm": 0.7500268816947937, + "learning_rate": 0.00020058067997841335, + "loss": 3.2796, + "step": 61800 + }, + { + "epoch": 6.668463611859838, + "grad_norm": 0.8226208686828613, + "learning_rate": 0.00020025688073394492, + "loss": 3.2772, + "step": 61850 + }, + { + "epoch": 6.6738544474393535, + "grad_norm": 0.7617080211639404, + "learning_rate": 0.0001999330814894765, + "loss": 3.272, + "step": 61900 + }, + { + "epoch": 6.679245283018868, + "grad_norm": 0.8388587236404419, + "learning_rate": 0.00019960928224500808, + "loss": 3.2727, + "step": 61950 + }, + { + "epoch": 6.684636118598383, + "grad_norm": 0.7575362920761108, + "learning_rate": 0.00019928548300053966, + "loss": 3.2991, + "step": 62000 + }, + { + "epoch": 6.684636118598383, + "eval_accuracy": 0.3863006744845317, + "eval_loss": 3.362423896789551, + "eval_runtime": 185.5003, + "eval_samples_per_second": 97.094, + "eval_steps_per_second": 6.07, + "step": 62000 + }, + { + "epoch": 6.690026954177897, + "grad_norm": 0.7498572468757629, + "learning_rate": 0.00019896168375607123, + "loss": 3.2829, + "step": 62050 + }, + { + "epoch": 6.695417789757412, + "grad_norm": 0.7894664406776428, + "learning_rate": 0.00019863788451160278, + "loss": 3.261, + "step": 62100 + }, + { + "epoch": 6.7008086253369274, + "grad_norm": 0.7989409565925598, + "learning_rate": 0.00019831408526713436, + "loss": 3.3027, + "step": 62150 + }, + { + "epoch": 6.706199460916442, + "grad_norm": 0.7582972645759583, + "learning_rate": 0.00019799028602266594, + "loss": 3.2954, + "step": 62200 + }, + { + "epoch": 6.711590296495957, + "grad_norm": 0.7609683871269226, + "learning_rate": 0.0001976664867781975, + "loss": 3.2792, + "step": 62250 + }, + { + "epoch": 6.716981132075472, + "grad_norm": 0.8025651574134827, + "learning_rate": 0.00019734268753372907, + "loss": 3.2795, + "step": 62300 + }, + { + "epoch": 6.722371967654986, + "grad_norm": 0.8627368807792664, + "learning_rate": 0.00019701888828926065, + "loss": 3.3019, + "step": 62350 + }, + { + "epoch": 6.727762803234501, + "grad_norm": 0.7837508916854858, + "learning_rate": 0.0001966950890447922, + "loss": 3.2843, + "step": 62400 + }, + { + "epoch": 6.7331536388140165, + "grad_norm": 0.7840617299079895, + "learning_rate": 0.00019637128980032377, + "loss": 3.284, + "step": 62450 + }, + { + "epoch": 6.738544474393531, + "grad_norm": 0.7620540857315063, + "learning_rate": 0.00019604749055585535, + "loss": 3.2904, + "step": 62500 + }, + { + "epoch": 6.743935309973046, + "grad_norm": 0.7956631183624268, + "learning_rate": 0.00019572369131138693, + "loss": 3.2867, + "step": 62550 + }, + { + "epoch": 6.74932614555256, + "grad_norm": 0.7693567276000977, + "learning_rate": 0.0001953998920669185, + "loss": 3.278, + "step": 62600 + }, + { + "epoch": 6.754716981132075, + "grad_norm": 0.7932519316673279, + "learning_rate": 0.00019507609282245006, + "loss": 3.2808, + "step": 62650 + }, + { + "epoch": 6.7601078167115904, + "grad_norm": 0.8055360913276672, + "learning_rate": 0.00019475229357798164, + "loss": 3.2907, + "step": 62700 + }, + { + "epoch": 6.765498652291106, + "grad_norm": 0.8191863298416138, + "learning_rate": 0.00019442849433351319, + "loss": 3.269, + "step": 62750 + }, + { + "epoch": 6.77088948787062, + "grad_norm": 0.812945544719696, + "learning_rate": 0.00019410469508904476, + "loss": 3.2704, + "step": 62800 + }, + { + "epoch": 6.776280323450135, + "grad_norm": 0.7781937718391418, + "learning_rate": 0.00019378089584457634, + "loss": 3.2891, + "step": 62850 + }, + { + "epoch": 6.781671159029649, + "grad_norm": 0.8601032495498657, + "learning_rate": 0.00019345709660010792, + "loss": 3.2863, + "step": 62900 + }, + { + "epoch": 6.787061994609164, + "grad_norm": 0.7767324447631836, + "learning_rate": 0.0001931332973556395, + "loss": 3.3008, + "step": 62950 + }, + { + "epoch": 6.7924528301886795, + "grad_norm": 0.8062757849693298, + "learning_rate": 0.00019280949811117107, + "loss": 3.2791, + "step": 63000 + }, + { + "epoch": 6.7924528301886795, + "eval_accuracy": 0.3865814335466443, + "eval_loss": 3.3601419925689697, + "eval_runtime": 185.3224, + "eval_samples_per_second": 97.187, + "eval_steps_per_second": 6.076, + "step": 63000 + }, + { + "epoch": 6.797843665768194, + "grad_norm": 0.7886756658554077, + "learning_rate": 0.00019248569886670265, + "loss": 3.2896, + "step": 63050 + }, + { + "epoch": 6.803234501347709, + "grad_norm": 0.7697591185569763, + "learning_rate": 0.00019216189962223418, + "loss": 3.2975, + "step": 63100 + }, + { + "epoch": 6.808625336927224, + "grad_norm": 0.7850983738899231, + "learning_rate": 0.00019183810037776575, + "loss": 3.2726, + "step": 63150 + }, + { + "epoch": 6.814016172506738, + "grad_norm": 0.7991440892219543, + "learning_rate": 0.00019151430113329733, + "loss": 3.2879, + "step": 63200 + }, + { + "epoch": 6.819407008086253, + "grad_norm": 0.7532088160514832, + "learning_rate": 0.0001911905018888289, + "loss": 3.2893, + "step": 63250 + }, + { + "epoch": 6.824797843665769, + "grad_norm": 0.8358055949211121, + "learning_rate": 0.00019086670264436049, + "loss": 3.2792, + "step": 63300 + }, + { + "epoch": 6.830188679245283, + "grad_norm": 0.7301905155181885, + "learning_rate": 0.00019054290339989206, + "loss": 3.2635, + "step": 63350 + }, + { + "epoch": 6.835579514824798, + "grad_norm": 0.7704817652702332, + "learning_rate": 0.00019021910415542364, + "loss": 3.2945, + "step": 63400 + }, + { + "epoch": 6.840970350404312, + "grad_norm": 0.8155561089515686, + "learning_rate": 0.0001898953049109552, + "loss": 3.2971, + "step": 63450 + }, + { + "epoch": 6.846361185983827, + "grad_norm": 0.7301574349403381, + "learning_rate": 0.00018957150566648677, + "loss": 3.2914, + "step": 63500 + }, + { + "epoch": 6.8517520215633425, + "grad_norm": 0.8053485155105591, + "learning_rate": 0.00018924770642201832, + "loss": 3.286, + "step": 63550 + }, + { + "epoch": 6.857142857142857, + "grad_norm": 0.7629830241203308, + "learning_rate": 0.0001889239071775499, + "loss": 3.2661, + "step": 63600 + }, + { + "epoch": 6.862533692722372, + "grad_norm": 0.7930283546447754, + "learning_rate": 0.00018860010793308148, + "loss": 3.268, + "step": 63650 + }, + { + "epoch": 6.867924528301887, + "grad_norm": 0.8023165464401245, + "learning_rate": 0.00018827630868861305, + "loss": 3.277, + "step": 63700 + }, + { + "epoch": 6.873315363881401, + "grad_norm": 0.7296940684318542, + "learning_rate": 0.0001879525094441446, + "loss": 3.2752, + "step": 63750 + }, + { + "epoch": 6.878706199460916, + "grad_norm": 0.7824319005012512, + "learning_rate": 0.00018762871019967618, + "loss": 3.267, + "step": 63800 + }, + { + "epoch": 6.884097035040432, + "grad_norm": 0.7985427379608154, + "learning_rate": 0.00018730491095520776, + "loss": 3.2845, + "step": 63850 + }, + { + "epoch": 6.889487870619946, + "grad_norm": 0.7951188087463379, + "learning_rate": 0.00018698111171073934, + "loss": 3.2824, + "step": 63900 + }, + { + "epoch": 6.894878706199461, + "grad_norm": 0.8034949898719788, + "learning_rate": 0.0001866573124662709, + "loss": 3.3008, + "step": 63950 + }, + { + "epoch": 6.900269541778976, + "grad_norm": 0.7574693560600281, + "learning_rate": 0.00018633351322180246, + "loss": 3.288, + "step": 64000 + }, + { + "epoch": 6.900269541778976, + "eval_accuracy": 0.38711002984586174, + "eval_loss": 3.3546271324157715, + "eval_runtime": 185.647, + "eval_samples_per_second": 97.017, + "eval_steps_per_second": 6.065, + "step": 64000 + }, + { + "epoch": 6.90566037735849, + "grad_norm": 0.7891746759414673, + "learning_rate": 0.00018600971397733404, + "loss": 3.2859, + "step": 64050 + }, + { + "epoch": 6.9110512129380055, + "grad_norm": 0.8314854502677917, + "learning_rate": 0.0001856859147328656, + "loss": 3.2927, + "step": 64100 + }, + { + "epoch": 6.916442048517521, + "grad_norm": 0.8023466467857361, + "learning_rate": 0.00018536211548839717, + "loss": 3.2871, + "step": 64150 + }, + { + "epoch": 6.921832884097035, + "grad_norm": 0.8005847334861755, + "learning_rate": 0.00018503831624392875, + "loss": 3.2775, + "step": 64200 + }, + { + "epoch": 6.92722371967655, + "grad_norm": 0.7765096426010132, + "learning_rate": 0.00018471451699946033, + "loss": 3.2639, + "step": 64250 + }, + { + "epoch": 6.932614555256064, + "grad_norm": 0.7770959138870239, + "learning_rate": 0.0001843907177549919, + "loss": 3.3046, + "step": 64300 + }, + { + "epoch": 6.938005390835579, + "grad_norm": 0.7714718580245972, + "learning_rate": 0.00018406691851052348, + "loss": 3.2832, + "step": 64350 + }, + { + "epoch": 6.943396226415095, + "grad_norm": 0.7529462575912476, + "learning_rate": 0.000183743119266055, + "loss": 3.2839, + "step": 64400 + }, + { + "epoch": 6.948787061994609, + "grad_norm": 0.7635588049888611, + "learning_rate": 0.00018342579600647596, + "loss": 3.2811, + "step": 64450 + }, + { + "epoch": 6.954177897574124, + "grad_norm": 0.7985997200012207, + "learning_rate": 0.00018310199676200753, + "loss": 3.2737, + "step": 64500 + }, + { + "epoch": 6.959568733153639, + "grad_norm": 0.7837223410606384, + "learning_rate": 0.0001827781975175391, + "loss": 3.2801, + "step": 64550 + }, + { + "epoch": 6.964959568733153, + "grad_norm": 0.8346897959709167, + "learning_rate": 0.0001824543982730707, + "loss": 3.273, + "step": 64600 + }, + { + "epoch": 6.9703504043126685, + "grad_norm": 0.788210391998291, + "learning_rate": 0.00018213059902860227, + "loss": 3.2838, + "step": 64650 + }, + { + "epoch": 6.975741239892184, + "grad_norm": 0.7630437612533569, + "learning_rate": 0.00018180679978413382, + "loss": 3.2784, + "step": 64700 + }, + { + "epoch": 6.981132075471698, + "grad_norm": 0.8454469442367554, + "learning_rate": 0.00018148300053966537, + "loss": 3.2904, + "step": 64750 + }, + { + "epoch": 6.986522911051213, + "grad_norm": 0.7735024094581604, + "learning_rate": 0.00018115920129519695, + "loss": 3.2774, + "step": 64800 + }, + { + "epoch": 6.991913746630727, + "grad_norm": 0.7661256790161133, + "learning_rate": 0.00018083540205072852, + "loss": 3.2721, + "step": 64850 + }, + { + "epoch": 6.997304582210242, + "grad_norm": 0.8172721862792969, + "learning_rate": 0.0001805116028062601, + "loss": 3.2823, + "step": 64900 + }, + { + "epoch": 7.002695417789758, + "grad_norm": 0.836434543132782, + "learning_rate": 0.00018018780356179168, + "loss": 3.2301, + "step": 64950 + }, + { + "epoch": 7.008086253369272, + "grad_norm": 0.7450481653213501, + "learning_rate": 0.00017986400431732326, + "loss": 3.1829, + "step": 65000 + }, + { + "epoch": 7.008086253369272, + "eval_accuracy": 0.3870916675078211, + "eval_loss": 3.3573193550109863, + "eval_runtime": 185.3036, + "eval_samples_per_second": 97.197, + "eval_steps_per_second": 6.077, + "step": 65000 + }, + { + "epoch": 7.013477088948787, + "grad_norm": 0.8692488074302673, + "learning_rate": 0.00017954020507285483, + "loss": 3.2036, + "step": 65050 + }, + { + "epoch": 7.018867924528302, + "grad_norm": 0.7994346022605896, + "learning_rate": 0.0001792164058283864, + "loss": 3.201, + "step": 65100 + }, + { + "epoch": 7.024258760107816, + "grad_norm": 0.8160102963447571, + "learning_rate": 0.00017889260658391794, + "loss": 3.2111, + "step": 65150 + }, + { + "epoch": 7.0296495956873315, + "grad_norm": 0.7847365736961365, + "learning_rate": 0.00017856880733944951, + "loss": 3.2034, + "step": 65200 + }, + { + "epoch": 7.035040431266847, + "grad_norm": 0.8384340405464172, + "learning_rate": 0.0001782450080949811, + "loss": 3.2032, + "step": 65250 + }, + { + "epoch": 7.040431266846361, + "grad_norm": 0.8204463124275208, + "learning_rate": 0.00017792120885051267, + "loss": 3.1974, + "step": 65300 + }, + { + "epoch": 7.045822102425876, + "grad_norm": 0.7891581654548645, + "learning_rate": 0.00017759740960604425, + "loss": 3.214, + "step": 65350 + }, + { + "epoch": 7.051212938005391, + "grad_norm": 0.7553804516792297, + "learning_rate": 0.00017727361036157582, + "loss": 3.1921, + "step": 65400 + }, + { + "epoch": 7.056603773584905, + "grad_norm": 0.7528859972953796, + "learning_rate": 0.00017694981111710737, + "loss": 3.1798, + "step": 65450 + }, + { + "epoch": 7.061994609164421, + "grad_norm": 0.8152026534080505, + "learning_rate": 0.00017662601187263895, + "loss": 3.1979, + "step": 65500 + }, + { + "epoch": 7.067385444743936, + "grad_norm": 0.8077825903892517, + "learning_rate": 0.0001763022126281705, + "loss": 3.1878, + "step": 65550 + }, + { + "epoch": 7.07277628032345, + "grad_norm": 0.8011118769645691, + "learning_rate": 0.00017597841338370208, + "loss": 3.2192, + "step": 65600 + }, + { + "epoch": 7.078167115902965, + "grad_norm": 0.7679276466369629, + "learning_rate": 0.00017565461413923366, + "loss": 3.213, + "step": 65650 + }, + { + "epoch": 7.083557951482479, + "grad_norm": 0.8511323928833008, + "learning_rate": 0.00017533081489476524, + "loss": 3.2056, + "step": 65700 + }, + { + "epoch": 7.0889487870619945, + "grad_norm": 0.8467673659324646, + "learning_rate": 0.0001750070156502968, + "loss": 3.1899, + "step": 65750 + }, + { + "epoch": 7.09433962264151, + "grad_norm": 0.7860401272773743, + "learning_rate": 0.00017468321640582836, + "loss": 3.2195, + "step": 65800 + }, + { + "epoch": 7.099730458221024, + "grad_norm": 0.8050251007080078, + "learning_rate": 0.00017435941716135994, + "loss": 3.2154, + "step": 65850 + }, + { + "epoch": 7.105121293800539, + "grad_norm": 0.835083544254303, + "learning_rate": 0.00017403561791689152, + "loss": 3.2303, + "step": 65900 + }, + { + "epoch": 7.110512129380054, + "grad_norm": 0.7602958083152771, + "learning_rate": 0.0001737118186724231, + "loss": 3.2124, + "step": 65950 + }, + { + "epoch": 7.115902964959568, + "grad_norm": 0.851472795009613, + "learning_rate": 0.00017338801942795465, + "loss": 3.2234, + "step": 66000 + }, + { + "epoch": 7.115902964959568, + "eval_accuracy": 0.387299629135098, + "eval_loss": 3.360718250274658, + "eval_runtime": 185.5923, + "eval_samples_per_second": 97.046, + "eval_steps_per_second": 6.067, + "step": 66000 + }, + { + "epoch": 7.121293800539084, + "grad_norm": 0.8110368251800537, + "learning_rate": 0.00017306422018348623, + "loss": 3.2182, + "step": 66050 + }, + { + "epoch": 7.126684636118599, + "grad_norm": 0.8128678798675537, + "learning_rate": 0.00017274042093901778, + "loss": 3.2171, + "step": 66100 + }, + { + "epoch": 7.132075471698113, + "grad_norm": 0.8244554400444031, + "learning_rate": 0.00017241662169454935, + "loss": 3.2187, + "step": 66150 + }, + { + "epoch": 7.137466307277628, + "grad_norm": 0.7652295827865601, + "learning_rate": 0.00017209282245008093, + "loss": 3.2137, + "step": 66200 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.840542733669281, + "learning_rate": 0.0001717690232056125, + "loss": 3.2205, + "step": 66250 + }, + { + "epoch": 7.1482479784366575, + "grad_norm": 0.795521080493927, + "learning_rate": 0.0001714452239611441, + "loss": 3.2147, + "step": 66300 + }, + { + "epoch": 7.153638814016173, + "grad_norm": 0.8073201775550842, + "learning_rate": 0.00017112142471667566, + "loss": 3.2236, + "step": 66350 + }, + { + "epoch": 7.159029649595688, + "grad_norm": 0.7793418765068054, + "learning_rate": 0.0001707976254722072, + "loss": 3.2051, + "step": 66400 + }, + { + "epoch": 7.164420485175202, + "grad_norm": 0.8074833154678345, + "learning_rate": 0.00017047382622773877, + "loss": 3.217, + "step": 66450 + }, + { + "epoch": 7.169811320754717, + "grad_norm": 0.7875019907951355, + "learning_rate": 0.00017015002698327034, + "loss": 3.2174, + "step": 66500 + }, + { + "epoch": 7.175202156334231, + "grad_norm": 0.7965167760848999, + "learning_rate": 0.00016982622773880192, + "loss": 3.22, + "step": 66550 + }, + { + "epoch": 7.180592991913747, + "grad_norm": 0.7714762687683105, + "learning_rate": 0.0001695024284943335, + "loss": 3.2062, + "step": 66600 + }, + { + "epoch": 7.185983827493262, + "grad_norm": 0.8042778372764587, + "learning_rate": 0.00016917862924986508, + "loss": 3.2157, + "step": 66650 + }, + { + "epoch": 7.191374663072776, + "grad_norm": 0.7633846402168274, + "learning_rate": 0.00016885483000539665, + "loss": 3.2174, + "step": 66700 + }, + { + "epoch": 7.196765498652291, + "grad_norm": 0.818200409412384, + "learning_rate": 0.00016853103076092823, + "loss": 3.2211, + "step": 66750 + }, + { + "epoch": 7.202156334231806, + "grad_norm": 0.8160050511360168, + "learning_rate": 0.00016820723151645978, + "loss": 3.2099, + "step": 66800 + }, + { + "epoch": 7.2075471698113205, + "grad_norm": 0.779509425163269, + "learning_rate": 0.0001678899082568807, + "loss": 3.2305, + "step": 66850 + }, + { + "epoch": 7.212938005390836, + "grad_norm": 0.7970914244651794, + "learning_rate": 0.00016756610901241228, + "loss": 3.2384, + "step": 66900 + }, + { + "epoch": 7.218328840970351, + "grad_norm": 0.8047239780426025, + "learning_rate": 0.00016724230976794386, + "loss": 3.2257, + "step": 66950 + }, + { + "epoch": 7.223719676549865, + "grad_norm": 0.7717013359069824, + "learning_rate": 0.00016691851052347544, + "loss": 3.2282, + "step": 67000 + }, + { + "epoch": 7.223719676549865, + "eval_accuracy": 0.38757376037105395, + "eval_loss": 3.3571853637695312, + "eval_runtime": 185.3414, + "eval_samples_per_second": 97.177, + "eval_steps_per_second": 6.075, + "step": 67000 + }, + { + "epoch": 7.22911051212938, + "grad_norm": 0.7892364263534546, + "learning_rate": 0.00016659471127900702, + "loss": 3.1987, + "step": 67050 + }, + { + "epoch": 7.234501347708895, + "grad_norm": 0.7793697118759155, + "learning_rate": 0.0001662709120345386, + "loss": 3.2414, + "step": 67100 + }, + { + "epoch": 7.2398921832884096, + "grad_norm": 0.8037189841270447, + "learning_rate": 0.00016594711279007015, + "loss": 3.2046, + "step": 67150 + }, + { + "epoch": 7.245283018867925, + "grad_norm": 0.8540467619895935, + "learning_rate": 0.0001656233135456017, + "loss": 3.224, + "step": 67200 + }, + { + "epoch": 7.250673854447439, + "grad_norm": 0.8086419701576233, + "learning_rate": 0.00016529951430113327, + "loss": 3.234, + "step": 67250 + }, + { + "epoch": 7.256064690026954, + "grad_norm": 0.7857149243354797, + "learning_rate": 0.00016497571505666485, + "loss": 3.237, + "step": 67300 + }, + { + "epoch": 7.261455525606469, + "grad_norm": 0.7541642785072327, + "learning_rate": 0.00016465191581219643, + "loss": 3.2163, + "step": 67350 + }, + { + "epoch": 7.2668463611859835, + "grad_norm": 0.7896484136581421, + "learning_rate": 0.000164328116567728, + "loss": 3.2406, + "step": 67400 + }, + { + "epoch": 7.272237196765499, + "grad_norm": 0.8298619389533997, + "learning_rate": 0.00016400431732325956, + "loss": 3.2366, + "step": 67450 + }, + { + "epoch": 7.277628032345014, + "grad_norm": 0.779678463935852, + "learning_rate": 0.00016368051807879114, + "loss": 3.2287, + "step": 67500 + }, + { + "epoch": 7.283018867924528, + "grad_norm": 0.8095333576202393, + "learning_rate": 0.0001633567188343227, + "loss": 3.2385, + "step": 67550 + }, + { + "epoch": 7.288409703504043, + "grad_norm": 0.837469756603241, + "learning_rate": 0.00016303291958985426, + "loss": 3.2236, + "step": 67600 + }, + { + "epoch": 7.293800539083558, + "grad_norm": 0.7788031697273254, + "learning_rate": 0.00016270912034538584, + "loss": 3.228, + "step": 67650 + }, + { + "epoch": 7.2991913746630726, + "grad_norm": 0.8133876919746399, + "learning_rate": 0.00016238532110091742, + "loss": 3.2086, + "step": 67700 + }, + { + "epoch": 7.304582210242588, + "grad_norm": 0.8191686272621155, + "learning_rate": 0.000162061521856449, + "loss": 3.2226, + "step": 67750 + }, + { + "epoch": 7.309973045822103, + "grad_norm": 0.814452588558197, + "learning_rate": 0.00016173772261198055, + "loss": 3.2222, + "step": 67800 + }, + { + "epoch": 7.315363881401617, + "grad_norm": 0.8741965889930725, + "learning_rate": 0.00016141392336751212, + "loss": 3.2219, + "step": 67850 + }, + { + "epoch": 7.320754716981132, + "grad_norm": 0.7747724056243896, + "learning_rate": 0.0001610901241230437, + "loss": 3.2371, + "step": 67900 + }, + { + "epoch": 7.3261455525606465, + "grad_norm": 0.7778286337852478, + "learning_rate": 0.00016076632487857528, + "loss": 3.213, + "step": 67950 + }, + { + "epoch": 7.331536388140162, + "grad_norm": 0.8529244065284729, + "learning_rate": 0.00016044252563410686, + "loss": 3.2267, + "step": 68000 + }, + { + "epoch": 7.331536388140162, + "eval_accuracy": 0.3877894363533657, + "eval_loss": 3.3537304401397705, + "eval_runtime": 185.5322, + "eval_samples_per_second": 97.077, + "eval_steps_per_second": 6.069, + "step": 68000 + }, + { + "epoch": 7.336927223719677, + "grad_norm": 0.7788181304931641, + "learning_rate": 0.0001601187263896384, + "loss": 3.2322, + "step": 68050 + }, + { + "epoch": 7.342318059299191, + "grad_norm": 0.826663076877594, + "learning_rate": 0.00015979492714516996, + "loss": 3.2372, + "step": 68100 + }, + { + "epoch": 7.347708894878706, + "grad_norm": 0.7754775881767273, + "learning_rate": 0.00015947112790070154, + "loss": 3.226, + "step": 68150 + }, + { + "epoch": 7.353099730458221, + "grad_norm": 0.7751636505126953, + "learning_rate": 0.00015914732865623311, + "loss": 3.2435, + "step": 68200 + }, + { + "epoch": 7.3584905660377355, + "grad_norm": 0.8280046582221985, + "learning_rate": 0.0001588235294117647, + "loss": 3.2315, + "step": 68250 + }, + { + "epoch": 7.363881401617251, + "grad_norm": 0.7916621565818787, + "learning_rate": 0.00015849973016729627, + "loss": 3.2189, + "step": 68300 + }, + { + "epoch": 7.369272237196766, + "grad_norm": 0.7694364190101624, + "learning_rate": 0.00015817593092282785, + "loss": 3.2306, + "step": 68350 + }, + { + "epoch": 7.37466307277628, + "grad_norm": 0.8083913326263428, + "learning_rate": 0.00015785213167835942, + "loss": 3.2246, + "step": 68400 + }, + { + "epoch": 7.380053908355795, + "grad_norm": 0.8321467041969299, + "learning_rate": 0.00015752833243389095, + "loss": 3.2072, + "step": 68450 + }, + { + "epoch": 7.38544474393531, + "grad_norm": 0.8279005885124207, + "learning_rate": 0.00015720453318942253, + "loss": 3.2285, + "step": 68500 + }, + { + "epoch": 7.390835579514825, + "grad_norm": 0.7874852418899536, + "learning_rate": 0.0001568807339449541, + "loss": 3.2297, + "step": 68550 + }, + { + "epoch": 7.39622641509434, + "grad_norm": 0.7745757699012756, + "learning_rate": 0.00015655693470048568, + "loss": 3.2251, + "step": 68600 + }, + { + "epoch": 7.401617250673855, + "grad_norm": 0.7697532176971436, + "learning_rate": 0.00015623313545601726, + "loss": 3.2258, + "step": 68650 + }, + { + "epoch": 7.407008086253369, + "grad_norm": 0.7562422752380371, + "learning_rate": 0.00015590933621154884, + "loss": 3.2357, + "step": 68700 + }, + { + "epoch": 7.412398921832884, + "grad_norm": 0.862626314163208, + "learning_rate": 0.00015558553696708041, + "loss": 3.2234, + "step": 68750 + }, + { + "epoch": 7.4177897574123985, + "grad_norm": 0.8310208916664124, + "learning_rate": 0.00015526173772261196, + "loss": 3.2323, + "step": 68800 + }, + { + "epoch": 7.423180592991914, + "grad_norm": 0.8276763558387756, + "learning_rate": 0.00015493793847814354, + "loss": 3.2361, + "step": 68850 + }, + { + "epoch": 7.428571428571429, + "grad_norm": 0.7850115299224854, + "learning_rate": 0.0001546141392336751, + "loss": 3.2284, + "step": 68900 + }, + { + "epoch": 7.433962264150943, + "grad_norm": 0.8095736503601074, + "learning_rate": 0.00015429033998920667, + "loss": 3.2325, + "step": 68950 + }, + { + "epoch": 7.439353099730458, + "grad_norm": 0.8068041205406189, + "learning_rate": 0.00015396654074473825, + "loss": 3.218, + "step": 69000 + }, + { + "epoch": 7.439353099730458, + "eval_accuracy": 0.3883189018756857, + "eval_loss": 3.3485703468322754, + "eval_runtime": 185.8026, + "eval_samples_per_second": 96.936, + "eval_steps_per_second": 6.06, + "step": 69000 + }, + { + "epoch": 7.444743935309973, + "grad_norm": 0.7911592125892639, + "learning_rate": 0.00015364274150026983, + "loss": 3.2181, + "step": 69050 + }, + { + "epoch": 7.450134770889488, + "grad_norm": 0.8405386805534363, + "learning_rate": 0.0001533189422558014, + "loss": 3.2278, + "step": 69100 + }, + { + "epoch": 7.455525606469003, + "grad_norm": 0.8172047138214111, + "learning_rate": 0.00015300161899622233, + "loss": 3.2291, + "step": 69150 + }, + { + "epoch": 7.460916442048518, + "grad_norm": 0.8096583485603333, + "learning_rate": 0.00015267781975175388, + "loss": 3.2192, + "step": 69200 + }, + { + "epoch": 7.466307277628032, + "grad_norm": 0.8990162014961243, + "learning_rate": 0.00015235402050728546, + "loss": 3.2372, + "step": 69250 + }, + { + "epoch": 7.471698113207547, + "grad_norm": 0.8064659237861633, + "learning_rate": 0.00015203022126281703, + "loss": 3.2242, + "step": 69300 + }, + { + "epoch": 7.4770889487870615, + "grad_norm": 0.7970738410949707, + "learning_rate": 0.0001517064220183486, + "loss": 3.2357, + "step": 69350 + }, + { + "epoch": 7.482479784366577, + "grad_norm": 0.78382807970047, + "learning_rate": 0.0001513826227738802, + "loss": 3.2216, + "step": 69400 + }, + { + "epoch": 7.487870619946092, + "grad_norm": 0.8397746086120605, + "learning_rate": 0.00015105882352941177, + "loss": 3.222, + "step": 69450 + }, + { + "epoch": 7.493261455525606, + "grad_norm": 0.8016370534896851, + "learning_rate": 0.00015073502428494332, + "loss": 3.2229, + "step": 69500 + }, + { + "epoch": 7.498652291105121, + "grad_norm": 0.8667572736740112, + "learning_rate": 0.0001504112250404749, + "loss": 3.2365, + "step": 69550 + }, + { + "epoch": 7.504043126684636, + "grad_norm": 0.7989738583564758, + "learning_rate": 0.00015008742579600647, + "loss": 3.2103, + "step": 69600 + }, + { + "epoch": 7.509433962264151, + "grad_norm": 0.8215728998184204, + "learning_rate": 0.00014976362655153802, + "loss": 3.2272, + "step": 69650 + }, + { + "epoch": 7.514824797843666, + "grad_norm": 0.7850006222724915, + "learning_rate": 0.0001494398273070696, + "loss": 3.2367, + "step": 69700 + }, + { + "epoch": 7.520215633423181, + "grad_norm": 0.8493239879608154, + "learning_rate": 0.00014911602806260118, + "loss": 3.2425, + "step": 69750 + }, + { + "epoch": 7.525606469002695, + "grad_norm": 0.8434281349182129, + "learning_rate": 0.00014879222881813273, + "loss": 3.2238, + "step": 69800 + }, + { + "epoch": 7.53099730458221, + "grad_norm": 0.8048841953277588, + "learning_rate": 0.0001484684295736643, + "loss": 3.2276, + "step": 69850 + }, + { + "epoch": 7.536388140161725, + "grad_norm": 0.8230905532836914, + "learning_rate": 0.00014814463032919589, + "loss": 3.2295, + "step": 69900 + }, + { + "epoch": 7.54177897574124, + "grad_norm": 0.855772078037262, + "learning_rate": 0.00014782083108472744, + "loss": 3.226, + "step": 69950 + }, + { + "epoch": 7.547169811320755, + "grad_norm": 0.8324517011642456, + "learning_rate": 0.00014749703184025901, + "loss": 3.2235, + "step": 70000 + }, + { + "epoch": 7.547169811320755, + "eval_accuracy": 0.3884776437447826, + "eval_loss": 3.3457727432250977, + "eval_runtime": 185.4004, + "eval_samples_per_second": 97.146, + "eval_steps_per_second": 6.073, + "step": 70000 + }, + { + "epoch": 7.55256064690027, + "grad_norm": 0.8509570956230164, + "learning_rate": 0.0001471732325957906, + "loss": 3.2324, + "step": 70050 + }, + { + "epoch": 7.557951482479784, + "grad_norm": 0.8302329182624817, + "learning_rate": 0.00014684943335132217, + "loss": 3.2349, + "step": 70100 + }, + { + "epoch": 7.563342318059299, + "grad_norm": 0.8065623641014099, + "learning_rate": 0.00014652563410685375, + "loss": 3.2328, + "step": 70150 + }, + { + "epoch": 7.568733153638814, + "grad_norm": 0.8313807249069214, + "learning_rate": 0.0001462018348623853, + "loss": 3.217, + "step": 70200 + }, + { + "epoch": 7.574123989218329, + "grad_norm": 0.8627298474311829, + "learning_rate": 0.00014587803561791687, + "loss": 3.2306, + "step": 70250 + }, + { + "epoch": 7.579514824797844, + "grad_norm": 0.7965494394302368, + "learning_rate": 0.00014555423637344845, + "loss": 3.2271, + "step": 70300 + }, + { + "epoch": 7.584905660377358, + "grad_norm": 0.8298637866973877, + "learning_rate": 0.00014523043712898003, + "loss": 3.2427, + "step": 70350 + }, + { + "epoch": 7.590296495956873, + "grad_norm": 0.8248470425605774, + "learning_rate": 0.00014490663788451158, + "loss": 3.2247, + "step": 70400 + }, + { + "epoch": 7.595687331536388, + "grad_norm": 0.787762463092804, + "learning_rate": 0.00014458283864004316, + "loss": 3.2358, + "step": 70450 + }, + { + "epoch": 7.601078167115903, + "grad_norm": 0.8552139401435852, + "learning_rate": 0.00014425903939557474, + "loss": 3.2397, + "step": 70500 + }, + { + "epoch": 7.606469002695418, + "grad_norm": 0.828631579875946, + "learning_rate": 0.00014393524015110631, + "loss": 3.2287, + "step": 70550 + }, + { + "epoch": 7.611859838274933, + "grad_norm": 0.8409473299980164, + "learning_rate": 0.00014361144090663786, + "loss": 3.2292, + "step": 70600 + }, + { + "epoch": 7.617250673854447, + "grad_norm": 0.7935662865638733, + "learning_rate": 0.00014328764166216944, + "loss": 3.2284, + "step": 70650 + }, + { + "epoch": 7.622641509433962, + "grad_norm": 0.7940767407417297, + "learning_rate": 0.00014296384241770102, + "loss": 3.2311, + "step": 70700 + }, + { + "epoch": 7.628032345013477, + "grad_norm": 0.8206398487091064, + "learning_rate": 0.0001426400431732326, + "loss": 3.2316, + "step": 70750 + }, + { + "epoch": 7.633423180592992, + "grad_norm": 0.857508659362793, + "learning_rate": 0.00014231624392876417, + "loss": 3.2439, + "step": 70800 + }, + { + "epoch": 7.638814016172507, + "grad_norm": 0.795734167098999, + "learning_rate": 0.00014199244468429573, + "loss": 3.2205, + "step": 70850 + }, + { + "epoch": 7.644204851752022, + "grad_norm": 0.8424890637397766, + "learning_rate": 0.0001416686454398273, + "loss": 3.2385, + "step": 70900 + }, + { + "epoch": 7.649595687331536, + "grad_norm": 0.824721097946167, + "learning_rate": 0.00014134484619535888, + "loss": 3.2192, + "step": 70950 + }, + { + "epoch": 7.654986522911051, + "grad_norm": 0.849852442741394, + "learning_rate": 0.00014102104695089043, + "loss": 3.2512, + "step": 71000 + }, + { + "epoch": 7.654986522911051, + "eval_accuracy": 0.3889413742699748, + "eval_loss": 3.342104911804199, + "eval_runtime": 185.5438, + "eval_samples_per_second": 97.071, + "eval_steps_per_second": 6.069, + "step": 71000 + }, + { + "epoch": 7.660377358490566, + "grad_norm": 0.7847785353660583, + "learning_rate": 0.000140697247706422, + "loss": 3.242, + "step": 71050 + }, + { + "epoch": 7.665768194070081, + "grad_norm": 0.8247793912887573, + "learning_rate": 0.0001403734484619536, + "loss": 3.2495, + "step": 71100 + }, + { + "epoch": 7.671159029649596, + "grad_norm": 0.8466809988021851, + "learning_rate": 0.00014004964921748514, + "loss": 3.2309, + "step": 71150 + }, + { + "epoch": 7.67654986522911, + "grad_norm": 0.8371172547340393, + "learning_rate": 0.00013972584997301671, + "loss": 3.2462, + "step": 71200 + }, + { + "epoch": 7.681940700808625, + "grad_norm": 0.7972482442855835, + "learning_rate": 0.0001394020507285483, + "loss": 3.2354, + "step": 71250 + }, + { + "epoch": 7.6873315363881405, + "grad_norm": 0.8721069097518921, + "learning_rate": 0.00013907825148407984, + "loss": 3.2327, + "step": 71300 + }, + { + "epoch": 7.692722371967655, + "grad_norm": 0.8658064603805542, + "learning_rate": 0.00013875445223961142, + "loss": 3.246, + "step": 71350 + }, + { + "epoch": 7.69811320754717, + "grad_norm": 0.8553919196128845, + "learning_rate": 0.000138430652995143, + "loss": 3.2504, + "step": 71400 + }, + { + "epoch": 7.703504043126685, + "grad_norm": 0.8206518888473511, + "learning_rate": 0.00013810685375067455, + "loss": 3.2292, + "step": 71450 + }, + { + "epoch": 7.708894878706199, + "grad_norm": 0.8005398511886597, + "learning_rate": 0.00013778305450620613, + "loss": 3.2377, + "step": 71500 + }, + { + "epoch": 7.714285714285714, + "grad_norm": 0.8112930059432983, + "learning_rate": 0.0001374592552617377, + "loss": 3.226, + "step": 71550 + }, + { + "epoch": 7.719676549865229, + "grad_norm": 0.8869747519493103, + "learning_rate": 0.00013713545601726928, + "loss": 3.22, + "step": 71600 + }, + { + "epoch": 7.725067385444744, + "grad_norm": 0.8616730570793152, + "learning_rate": 0.00013681165677280086, + "loss": 3.243, + "step": 71650 + }, + { + "epoch": 7.730458221024259, + "grad_norm": 0.8031706213951111, + "learning_rate": 0.0001364878575283324, + "loss": 3.2445, + "step": 71700 + }, + { + "epoch": 7.735849056603773, + "grad_norm": 0.8454236388206482, + "learning_rate": 0.000136164058283864, + "loss": 3.2418, + "step": 71750 + }, + { + "epoch": 7.741239892183288, + "grad_norm": 0.812794029712677, + "learning_rate": 0.00013584025903939557, + "loss": 3.2266, + "step": 71800 + }, + { + "epoch": 7.7466307277628035, + "grad_norm": 0.8233222365379333, + "learning_rate": 0.00013551645979492714, + "loss": 3.245, + "step": 71850 + }, + { + "epoch": 7.752021563342318, + "grad_norm": 0.7795098423957825, + "learning_rate": 0.0001351926605504587, + "loss": 3.244, + "step": 71900 + }, + { + "epoch": 7.757412398921833, + "grad_norm": 0.8167760968208313, + "learning_rate": 0.00013486886130599027, + "loss": 3.2451, + "step": 71950 + }, + { + "epoch": 7.762803234501348, + "grad_norm": 0.8600962162017822, + "learning_rate": 0.00013454506206152185, + "loss": 3.2503, + "step": 72000 + }, + { + "epoch": 7.762803234501348, + "eval_accuracy": 0.3895109327078983, + "eval_loss": 3.338362693786621, + "eval_runtime": 185.6422, + "eval_samples_per_second": 97.02, + "eval_steps_per_second": 6.065, + "step": 72000 + }, + { + "epoch": 7.768194070080862, + "grad_norm": 0.8626272082328796, + "learning_rate": 0.00013422126281705343, + "loss": 3.237, + "step": 72050 + }, + { + "epoch": 7.773584905660377, + "grad_norm": 0.8045924305915833, + "learning_rate": 0.00013389746357258498, + "loss": 3.2439, + "step": 72100 + }, + { + "epoch": 7.7789757412398925, + "grad_norm": 0.8124786019325256, + "learning_rate": 0.00013357366432811656, + "loss": 3.2279, + "step": 72150 + }, + { + "epoch": 7.784366576819407, + "grad_norm": 0.821742832660675, + "learning_rate": 0.00013324986508364813, + "loss": 3.2195, + "step": 72200 + }, + { + "epoch": 7.789757412398922, + "grad_norm": 0.8228744268417358, + "learning_rate": 0.0001329260658391797, + "loss": 3.2336, + "step": 72250 + }, + { + "epoch": 7.795148247978437, + "grad_norm": 0.813159704208374, + "learning_rate": 0.00013260226659471126, + "loss": 3.2222, + "step": 72300 + }, + { + "epoch": 7.800539083557951, + "grad_norm": 0.8435391187667847, + "learning_rate": 0.00013227846735024284, + "loss": 3.2063, + "step": 72350 + }, + { + "epoch": 7.8059299191374665, + "grad_norm": 0.8387914299964905, + "learning_rate": 0.00013195466810577442, + "loss": 3.2326, + "step": 72400 + }, + { + "epoch": 7.811320754716981, + "grad_norm": 0.8151735067367554, + "learning_rate": 0.000131630868861306, + "loss": 3.2368, + "step": 72450 + }, + { + "epoch": 7.816711590296496, + "grad_norm": 0.8179688453674316, + "learning_rate": 0.00013131354560172692, + "loss": 3.2313, + "step": 72500 + }, + { + "epoch": 7.822102425876011, + "grad_norm": 0.8553862571716309, + "learning_rate": 0.0001309897463572585, + "loss": 3.2186, + "step": 72550 + }, + { + "epoch": 7.827493261455525, + "grad_norm": 0.818985104560852, + "learning_rate": 0.00013066594711279007, + "loss": 3.2491, + "step": 72600 + }, + { + "epoch": 7.83288409703504, + "grad_norm": 0.8475792407989502, + "learning_rate": 0.00013034214786832162, + "loss": 3.2344, + "step": 72650 + }, + { + "epoch": 7.8382749326145555, + "grad_norm": 0.8516533970832825, + "learning_rate": 0.0001300183486238532, + "loss": 3.2391, + "step": 72700 + }, + { + "epoch": 7.84366576819407, + "grad_norm": 0.8353322148323059, + "learning_rate": 0.00012969454937938478, + "loss": 3.2081, + "step": 72750 + }, + { + "epoch": 7.849056603773585, + "grad_norm": 0.8211535811424255, + "learning_rate": 0.00012937075013491636, + "loss": 3.2394, + "step": 72800 + }, + { + "epoch": 7.8544474393531, + "grad_norm": 0.8574275970458984, + "learning_rate": 0.0001290469508904479, + "loss": 3.2372, + "step": 72850 + }, + { + "epoch": 7.859838274932614, + "grad_norm": 0.8538966178894043, + "learning_rate": 0.00012872315164597949, + "loss": 3.237, + "step": 72900 + }, + { + "epoch": 7.8652291105121295, + "grad_norm": 0.8543484210968018, + "learning_rate": 0.00012839935240151106, + "loss": 3.2408, + "step": 72950 + }, + { + "epoch": 7.870619946091644, + "grad_norm": 0.8760818243026733, + "learning_rate": 0.00012807555315704261, + "loss": 3.2303, + "step": 73000 + }, + { + "epoch": 7.870619946091644, + "eval_accuracy": 0.39012243116051826, + "eval_loss": 3.334336042404175, + "eval_runtime": 185.7846, + "eval_samples_per_second": 96.946, + "eval_steps_per_second": 6.061, + "step": 73000 + }, + { + "epoch": 7.876010781671159, + "grad_norm": 0.9315475821495056, + "learning_rate": 0.0001277517539125742, + "loss": 3.2402, + "step": 73050 + }, + { + "epoch": 7.881401617250674, + "grad_norm": 0.8197574615478516, + "learning_rate": 0.00012742795466810577, + "loss": 3.2364, + "step": 73100 + }, + { + "epoch": 7.886792452830189, + "grad_norm": 0.8208304643630981, + "learning_rate": 0.00012710415542363732, + "loss": 3.2348, + "step": 73150 + }, + { + "epoch": 7.892183288409703, + "grad_norm": 0.8868663907051086, + "learning_rate": 0.0001267803561791689, + "loss": 3.228, + "step": 73200 + }, + { + "epoch": 7.8975741239892185, + "grad_norm": 0.8241314888000488, + "learning_rate": 0.00012645655693470048, + "loss": 3.2348, + "step": 73250 + }, + { + "epoch": 7.902964959568733, + "grad_norm": 0.8251668810844421, + "learning_rate": 0.00012613275769023203, + "loss": 3.2396, + "step": 73300 + }, + { + "epoch": 7.908355795148248, + "grad_norm": 0.870736300945282, + "learning_rate": 0.0001258089584457636, + "loss": 3.2153, + "step": 73350 + }, + { + "epoch": 7.913746630727763, + "grad_norm": 0.8884828090667725, + "learning_rate": 0.00012548515920129518, + "loss": 3.2293, + "step": 73400 + }, + { + "epoch": 7.919137466307277, + "grad_norm": 0.8159066438674927, + "learning_rate": 0.00012516135995682676, + "loss": 3.2142, + "step": 73450 + }, + { + "epoch": 7.9245283018867925, + "grad_norm": 0.8352020978927612, + "learning_rate": 0.0001248375607123583, + "loss": 3.2314, + "step": 73500 + }, + { + "epoch": 7.929919137466308, + "grad_norm": 0.7894663214683533, + "learning_rate": 0.0001245137614678899, + "loss": 3.2215, + "step": 73550 + }, + { + "epoch": 7.935309973045822, + "grad_norm": 0.8059535026550293, + "learning_rate": 0.00012418996222342147, + "loss": 3.221, + "step": 73600 + }, + { + "epoch": 7.940700808625337, + "grad_norm": 0.8399400115013123, + "learning_rate": 0.00012386616297895304, + "loss": 3.2375, + "step": 73650 + }, + { + "epoch": 7.946091644204852, + "grad_norm": 0.8263852596282959, + "learning_rate": 0.0001235423637344846, + "loss": 3.2149, + "step": 73700 + }, + { + "epoch": 7.951482479784366, + "grad_norm": 0.8288939595222473, + "learning_rate": 0.00012321856449001617, + "loss": 3.2328, + "step": 73750 + }, + { + "epoch": 7.9568733153638815, + "grad_norm": 0.8177635073661804, + "learning_rate": 0.00012289476524554775, + "loss": 3.2332, + "step": 73800 + }, + { + "epoch": 7.962264150943396, + "grad_norm": 0.8735475540161133, + "learning_rate": 0.00012257096600107933, + "loss": 3.2179, + "step": 73850 + }, + { + "epoch": 7.967654986522911, + "grad_norm": 0.8652746677398682, + "learning_rate": 0.0001222471667566109, + "loss": 3.2224, + "step": 73900 + }, + { + "epoch": 7.973045822102426, + "grad_norm": 0.8226109743118286, + "learning_rate": 0.00012192336751214245, + "loss": 3.2273, + "step": 73950 + }, + { + "epoch": 7.97843665768194, + "grad_norm": 0.8543111681938171, + "learning_rate": 0.00012159956826767403, + "loss": 3.2506, + "step": 74000 + }, + { + "epoch": 7.97843665768194, + "eval_accuracy": 0.39014720401893993, + "eval_loss": 3.330275535583496, + "eval_runtime": 185.4384, + "eval_samples_per_second": 97.127, + "eval_steps_per_second": 6.072, + "step": 74000 + }, + { + "epoch": 7.9838274932614555, + "grad_norm": 0.9201006293296814, + "learning_rate": 0.00012127576902320561, + "loss": 3.2366, + "step": 74050 + }, + { + "epoch": 7.989218328840971, + "grad_norm": 0.8811962604522705, + "learning_rate": 0.00012095196977873717, + "loss": 3.2393, + "step": 74100 + }, + { + "epoch": 7.994609164420485, + "grad_norm": 0.8862497806549072, + "learning_rate": 0.00012062817053426874, + "loss": 3.2423, + "step": 74150 + }, + { + "epoch": 8.0, + "grad_norm": 1.6737746000289917, + "learning_rate": 0.00012030437128980032, + "loss": 3.2376, + "step": 74200 + }, + { + "epoch": 8.005390835579515, + "grad_norm": 0.7967222929000854, + "learning_rate": 0.00011998057204533188, + "loss": 3.1517, + "step": 74250 + }, + { + "epoch": 8.01078167115903, + "grad_norm": 0.8131229877471924, + "learning_rate": 0.00011965677280086346, + "loss": 3.162, + "step": 74300 + }, + { + "epoch": 8.016172506738544, + "grad_norm": 0.8384157419204712, + "learning_rate": 0.00011933297355639502, + "loss": 3.1598, + "step": 74350 + }, + { + "epoch": 8.021563342318059, + "grad_norm": 0.9127787947654724, + "learning_rate": 0.00011900917431192659, + "loss": 3.1583, + "step": 74400 + }, + { + "epoch": 8.026954177897574, + "grad_norm": 0.864522397518158, + "learning_rate": 0.00011868537506745816, + "loss": 3.1603, + "step": 74450 + }, + { + "epoch": 8.032345013477089, + "grad_norm": 0.8500173091888428, + "learning_rate": 0.00011836157582298974, + "loss": 3.1775, + "step": 74500 + }, + { + "epoch": 8.037735849056604, + "grad_norm": 0.8668968081474304, + "learning_rate": 0.00011803777657852132, + "loss": 3.1564, + "step": 74550 + }, + { + "epoch": 8.04312668463612, + "grad_norm": 0.8468078374862671, + "learning_rate": 0.00011771397733405287, + "loss": 3.1771, + "step": 74600 + }, + { + "epoch": 8.048517520215633, + "grad_norm": 0.8505526185035706, + "learning_rate": 0.00011739017808958445, + "loss": 3.1743, + "step": 74650 + }, + { + "epoch": 8.053908355795148, + "grad_norm": 0.8244675993919373, + "learning_rate": 0.00011706637884511602, + "loss": 3.1622, + "step": 74700 + }, + { + "epoch": 8.059299191374663, + "grad_norm": 0.8527902364730835, + "learning_rate": 0.0001167425796006476, + "loss": 3.1502, + "step": 74750 + }, + { + "epoch": 8.064690026954178, + "grad_norm": 0.8415079116821289, + "learning_rate": 0.00011641878035617915, + "loss": 3.1693, + "step": 74800 + }, + { + "epoch": 8.070080862533693, + "grad_norm": 0.8204153180122375, + "learning_rate": 0.00011609498111171073, + "loss": 3.15, + "step": 74850 + }, + { + "epoch": 8.075471698113208, + "grad_norm": 0.7880483269691467, + "learning_rate": 0.00011577118186724231, + "loss": 3.1494, + "step": 74900 + }, + { + "epoch": 8.080862533692722, + "grad_norm": 0.8183393478393555, + "learning_rate": 0.00011544738262277387, + "loss": 3.1641, + "step": 74950 + }, + { + "epoch": 8.086253369272237, + "grad_norm": 0.8524279594421387, + "learning_rate": 0.00011512358337830544, + "loss": 3.155, + "step": 75000 + }, + { + "epoch": 8.086253369272237, + "eval_accuracy": 0.39016502309254153, + "eval_loss": 3.3376200199127197, + "eval_runtime": 185.5538, + "eval_samples_per_second": 97.066, + "eval_steps_per_second": 6.068, + "step": 75000 + }, + { + "epoch": 8.091644204851752, + "grad_norm": 0.8844255805015564, + "learning_rate": 0.00011479978413383701, + "loss": 3.1615, + "step": 75050 + }, + { + "epoch": 8.097035040431267, + "grad_norm": 0.8732317090034485, + "learning_rate": 0.00011447598488936858, + "loss": 3.1484, + "step": 75100 + }, + { + "epoch": 8.102425876010782, + "grad_norm": 0.9299524426460266, + "learning_rate": 0.00011415218564490016, + "loss": 3.1726, + "step": 75150 + }, + { + "epoch": 8.107816711590296, + "grad_norm": 0.8411099910736084, + "learning_rate": 0.00011382838640043172, + "loss": 3.1661, + "step": 75200 + }, + { + "epoch": 8.11320754716981, + "grad_norm": 0.8532853126525879, + "learning_rate": 0.00011350458715596328, + "loss": 3.1714, + "step": 75250 + }, + { + "epoch": 8.118598382749326, + "grad_norm": 0.7946939468383789, + "learning_rate": 0.00011318078791149486, + "loss": 3.1793, + "step": 75300 + }, + { + "epoch": 8.123989218328841, + "grad_norm": 0.8610842823982239, + "learning_rate": 0.00011285698866702644, + "loss": 3.1826, + "step": 75350 + }, + { + "epoch": 8.129380053908356, + "grad_norm": 0.9045069217681885, + "learning_rate": 0.00011253318942255802, + "loss": 3.185, + "step": 75400 + }, + { + "epoch": 8.134770889487871, + "grad_norm": 0.8936116099357605, + "learning_rate": 0.00011220939017808957, + "loss": 3.1709, + "step": 75450 + }, + { + "epoch": 8.140161725067385, + "grad_norm": 0.8157985806465149, + "learning_rate": 0.00011188559093362115, + "loss": 3.1754, + "step": 75500 + }, + { + "epoch": 8.1455525606469, + "grad_norm": 0.8028213381767273, + "learning_rate": 0.00011156179168915272, + "loss": 3.1595, + "step": 75550 + }, + { + "epoch": 8.150943396226415, + "grad_norm": 0.8442425727844238, + "learning_rate": 0.00011123799244468429, + "loss": 3.158, + "step": 75600 + }, + { + "epoch": 8.15633423180593, + "grad_norm": 0.8477099537849426, + "learning_rate": 0.00011091419320021585, + "loss": 3.1646, + "step": 75650 + }, + { + "epoch": 8.161725067385445, + "grad_norm": 0.871978759765625, + "learning_rate": 0.00011059039395574743, + "loss": 3.1836, + "step": 75700 + }, + { + "epoch": 8.167115902964959, + "grad_norm": 0.8002774119377136, + "learning_rate": 0.00011026659471127899, + "loss": 3.1646, + "step": 75750 + }, + { + "epoch": 8.172506738544474, + "grad_norm": 0.8479560017585754, + "learning_rate": 0.00010994279546681057, + "loss": 3.1601, + "step": 75800 + }, + { + "epoch": 8.177897574123989, + "grad_norm": 0.8153694868087769, + "learning_rate": 0.00010961899622234213, + "loss": 3.1643, + "step": 75850 + }, + { + "epoch": 8.183288409703504, + "grad_norm": 0.844157874584198, + "learning_rate": 0.00010929519697787371, + "loss": 3.1668, + "step": 75900 + }, + { + "epoch": 8.18867924528302, + "grad_norm": 0.8492550253868103, + "learning_rate": 0.00010897139773340528, + "loss": 3.1761, + "step": 75950 + }, + { + "epoch": 8.194070080862534, + "grad_norm": 0.8232309222221375, + "learning_rate": 0.00010864759848893685, + "loss": 3.1552, + "step": 76000 + }, + { + "epoch": 8.194070080862534, + "eval_accuracy": 0.39022684658570794, + "eval_loss": 3.3350107669830322, + "eval_runtime": 185.73, + "eval_samples_per_second": 96.974, + "eval_steps_per_second": 6.063, + "step": 76000 + }, + { + "epoch": 8.199460916442048, + "grad_norm": 0.9264254570007324, + "learning_rate": 0.00010832379924446842, + "loss": 3.1785, + "step": 76050 + }, + { + "epoch": 8.204851752021563, + "grad_norm": 0.8779247403144836, + "learning_rate": 0.00010800647598488936, + "loss": 3.1992, + "step": 76100 + }, + { + "epoch": 8.210242587601078, + "grad_norm": 0.8053860068321228, + "learning_rate": 0.00010768267674042093, + "loss": 3.1669, + "step": 76150 + }, + { + "epoch": 8.215633423180593, + "grad_norm": 0.87197345495224, + "learning_rate": 0.0001073588774959525, + "loss": 3.1809, + "step": 76200 + }, + { + "epoch": 8.221024258760108, + "grad_norm": 0.8206743001937866, + "learning_rate": 0.00010703507825148406, + "loss": 3.1762, + "step": 76250 + }, + { + "epoch": 8.226415094339623, + "grad_norm": 0.8729113340377808, + "learning_rate": 0.00010671127900701564, + "loss": 3.159, + "step": 76300 + }, + { + "epoch": 8.231805929919137, + "grad_norm": 0.8449000716209412, + "learning_rate": 0.00010638747976254722, + "loss": 3.1766, + "step": 76350 + }, + { + "epoch": 8.237196765498652, + "grad_norm": 0.8317974209785461, + "learning_rate": 0.00010606368051807877, + "loss": 3.1728, + "step": 76400 + }, + { + "epoch": 8.242587601078167, + "grad_norm": 0.8439301252365112, + "learning_rate": 0.00010573988127361035, + "loss": 3.1729, + "step": 76450 + }, + { + "epoch": 8.247978436657682, + "grad_norm": 0.8755947351455688, + "learning_rate": 0.00010541608202914192, + "loss": 3.1605, + "step": 76500 + }, + { + "epoch": 8.253369272237197, + "grad_norm": 0.8506140112876892, + "learning_rate": 0.0001050922827846735, + "loss": 3.1887, + "step": 76550 + }, + { + "epoch": 8.25876010781671, + "grad_norm": 0.8570579290390015, + "learning_rate": 0.00010476848354020505, + "loss": 3.1638, + "step": 76600 + }, + { + "epoch": 8.264150943396226, + "grad_norm": 0.7894355654716492, + "learning_rate": 0.00010444468429573663, + "loss": 3.161, + "step": 76650 + }, + { + "epoch": 8.269541778975741, + "grad_norm": 0.8321079015731812, + "learning_rate": 0.00010412088505126821, + "loss": 3.1819, + "step": 76700 + }, + { + "epoch": 8.274932614555256, + "grad_norm": 0.8256025910377502, + "learning_rate": 0.00010379708580679979, + "loss": 3.173, + "step": 76750 + }, + { + "epoch": 8.280323450134771, + "grad_norm": 0.8556420207023621, + "learning_rate": 0.00010347328656233135, + "loss": 3.1784, + "step": 76800 + }, + { + "epoch": 8.285714285714286, + "grad_norm": 0.8610917925834656, + "learning_rate": 0.00010314948731786291, + "loss": 3.1877, + "step": 76850 + }, + { + "epoch": 8.2911051212938, + "grad_norm": 0.8405426740646362, + "learning_rate": 0.00010282568807339449, + "loss": 3.204, + "step": 76900 + }, + { + "epoch": 8.296495956873315, + "grad_norm": 0.9084919691085815, + "learning_rate": 0.00010250188882892606, + "loss": 3.1854, + "step": 76950 + }, + { + "epoch": 8.30188679245283, + "grad_norm": 0.8620514273643494, + "learning_rate": 0.00010217808958445763, + "loss": 3.1737, + "step": 77000 + }, + { + "epoch": 8.30188679245283, + "eval_accuracy": 0.3907469679596759, + "eval_loss": 3.330815076828003, + "eval_runtime": 185.4048, + "eval_samples_per_second": 97.144, + "eval_steps_per_second": 6.073, + "step": 77000 + }, + { + "epoch": 8.307277628032345, + "grad_norm": 0.8218554258346558, + "learning_rate": 0.0001018542903399892, + "loss": 3.1878, + "step": 77050 + }, + { + "epoch": 8.31266846361186, + "grad_norm": 0.8750684857368469, + "learning_rate": 0.00010153049109552076, + "loss": 3.1723, + "step": 77100 + }, + { + "epoch": 8.318059299191376, + "grad_norm": 0.8547766208648682, + "learning_rate": 0.00010120669185105234, + "loss": 3.1601, + "step": 77150 + }, + { + "epoch": 8.323450134770889, + "grad_norm": 0.9241974949836731, + "learning_rate": 0.00010088289260658392, + "loss": 3.1607, + "step": 77200 + }, + { + "epoch": 8.328840970350404, + "grad_norm": 0.8532211780548096, + "learning_rate": 0.00010055909336211547, + "loss": 3.1603, + "step": 77250 + }, + { + "epoch": 8.33423180592992, + "grad_norm": 0.8546267151832581, + "learning_rate": 0.00010023529411764704, + "loss": 3.1686, + "step": 77300 + }, + { + "epoch": 8.339622641509434, + "grad_norm": 0.8596141934394836, + "learning_rate": 9.991149487317862e-05, + "loss": 3.1831, + "step": 77350 + }, + { + "epoch": 8.34501347708895, + "grad_norm": 0.8430399298667908, + "learning_rate": 9.95876956287102e-05, + "loss": 3.1368, + "step": 77400 + }, + { + "epoch": 8.350404312668463, + "grad_norm": 0.8879104256629944, + "learning_rate": 9.926389638424175e-05, + "loss": 3.1817, + "step": 77450 + }, + { + "epoch": 8.355795148247978, + "grad_norm": 0.8396323919296265, + "learning_rate": 9.894009713977333e-05, + "loss": 3.1877, + "step": 77500 + }, + { + "epoch": 8.361185983827493, + "grad_norm": 0.9354804754257202, + "learning_rate": 9.86162978953049e-05, + "loss": 3.1739, + "step": 77550 + }, + { + "epoch": 8.366576819407008, + "grad_norm": 0.921036422252655, + "learning_rate": 9.829249865083647e-05, + "loss": 3.1888, + "step": 77600 + }, + { + "epoch": 8.371967654986523, + "grad_norm": 0.8595395088195801, + "learning_rate": 9.796869940636805e-05, + "loss": 3.1577, + "step": 77650 + }, + { + "epoch": 8.377358490566039, + "grad_norm": 0.8779202699661255, + "learning_rate": 9.764490016189961e-05, + "loss": 3.1973, + "step": 77700 + }, + { + "epoch": 8.382749326145552, + "grad_norm": 0.8880534768104553, + "learning_rate": 9.732110091743119e-05, + "loss": 3.1751, + "step": 77750 + }, + { + "epoch": 8.388140161725067, + "grad_norm": 0.9138519763946533, + "learning_rate": 9.699730167296275e-05, + "loss": 3.1703, + "step": 77800 + }, + { + "epoch": 8.393530997304582, + "grad_norm": 0.8722093105316162, + "learning_rate": 9.667350242849433e-05, + "loss": 3.1775, + "step": 77850 + }, + { + "epoch": 8.398921832884097, + "grad_norm": 0.8912443518638611, + "learning_rate": 9.63497031840259e-05, + "loss": 3.1734, + "step": 77900 + }, + { + "epoch": 8.404312668463612, + "grad_norm": 0.9036107063293457, + "learning_rate": 9.602590393955746e-05, + "loss": 3.1824, + "step": 77950 + }, + { + "epoch": 8.409703504043126, + "grad_norm": 0.8936976790428162, + "learning_rate": 9.570210469508904e-05, + "loss": 3.167, + "step": 78000 + }, + { + "epoch": 8.409703504043126, + "eval_accuracy": 0.390828892237088, + "eval_loss": 3.3293895721435547, + "eval_runtime": 185.6367, + "eval_samples_per_second": 97.023, + "eval_steps_per_second": 6.066, + "step": 78000 + }, + { + "epoch": 8.415094339622641, + "grad_norm": 0.8334117531776428, + "learning_rate": 9.537830545062061e-05, + "loss": 3.1807, + "step": 78050 + }, + { + "epoch": 8.420485175202156, + "grad_norm": 0.8583893179893494, + "learning_rate": 9.505450620615217e-05, + "loss": 3.1683, + "step": 78100 + }, + { + "epoch": 8.425876010781671, + "grad_norm": 0.8793879151344299, + "learning_rate": 9.473070696168374e-05, + "loss": 3.1779, + "step": 78150 + }, + { + "epoch": 8.431266846361186, + "grad_norm": 0.8737629055976868, + "learning_rate": 9.440690771721532e-05, + "loss": 3.1893, + "step": 78200 + }, + { + "epoch": 8.436657681940702, + "grad_norm": 0.8955116868019104, + "learning_rate": 9.40831084727469e-05, + "loss": 3.2123, + "step": 78250 + }, + { + "epoch": 8.442048517520215, + "grad_norm": 0.8622949123382568, + "learning_rate": 9.375930922827845e-05, + "loss": 3.1808, + "step": 78300 + }, + { + "epoch": 8.44743935309973, + "grad_norm": 0.8935747742652893, + "learning_rate": 9.343550998381003e-05, + "loss": 3.1764, + "step": 78350 + }, + { + "epoch": 8.452830188679245, + "grad_norm": 0.874409019947052, + "learning_rate": 9.31117107393416e-05, + "loss": 3.1797, + "step": 78400 + }, + { + "epoch": 8.45822102425876, + "grad_norm": 0.915778636932373, + "learning_rate": 9.278791149487317e-05, + "loss": 3.1915, + "step": 78450 + }, + { + "epoch": 8.463611859838275, + "grad_norm": 0.8673213124275208, + "learning_rate": 9.246411225040475e-05, + "loss": 3.1852, + "step": 78500 + }, + { + "epoch": 8.46900269541779, + "grad_norm": 0.8483467102050781, + "learning_rate": 9.214031300593631e-05, + "loss": 3.1768, + "step": 78550 + }, + { + "epoch": 8.474393530997304, + "grad_norm": 0.8365896344184875, + "learning_rate": 9.181651376146787e-05, + "loss": 3.1854, + "step": 78600 + }, + { + "epoch": 8.479784366576819, + "grad_norm": 0.8353489637374878, + "learning_rate": 9.149271451699945e-05, + "loss": 3.175, + "step": 78650 + }, + { + "epoch": 8.485175202156334, + "grad_norm": 0.8398521542549133, + "learning_rate": 9.116891527253103e-05, + "loss": 3.1768, + "step": 78700 + }, + { + "epoch": 8.49056603773585, + "grad_norm": 0.9163660407066345, + "learning_rate": 9.084511602806258e-05, + "loss": 3.187, + "step": 78750 + }, + { + "epoch": 8.495956873315365, + "grad_norm": 0.8956058621406555, + "learning_rate": 9.052131678359416e-05, + "loss": 3.1911, + "step": 78800 + }, + { + "epoch": 8.501347708894878, + "grad_norm": 0.8277040719985962, + "learning_rate": 9.019751753912574e-05, + "loss": 3.1885, + "step": 78850 + }, + { + "epoch": 8.506738544474393, + "grad_norm": 0.8544255495071411, + "learning_rate": 8.987371829465731e-05, + "loss": 3.1792, + "step": 78900 + }, + { + "epoch": 8.512129380053908, + "grad_norm": 0.8472200036048889, + "learning_rate": 8.954991905018886e-05, + "loss": 3.1771, + "step": 78950 + }, + { + "epoch": 8.517520215633423, + "grad_norm": 0.8413991928100586, + "learning_rate": 8.922611980572044e-05, + "loss": 3.1798, + "step": 79000 + }, + { + "epoch": 8.517520215633423, + "eval_accuracy": 0.39121558786481975, + "eval_loss": 3.324694871902466, + "eval_runtime": 185.6121, + "eval_samples_per_second": 97.036, + "eval_steps_per_second": 6.066, + "step": 79000 + }, + { + "epoch": 8.522911051212938, + "grad_norm": 0.855174720287323, + "learning_rate": 8.890232056125202e-05, + "loss": 3.1962, + "step": 79050 + }, + { + "epoch": 8.528301886792454, + "grad_norm": 0.9736406803131104, + "learning_rate": 8.85785213167836e-05, + "loss": 3.1796, + "step": 79100 + }, + { + "epoch": 8.533692722371967, + "grad_norm": 0.8685256242752075, + "learning_rate": 8.825472207231516e-05, + "loss": 3.1868, + "step": 79150 + }, + { + "epoch": 8.539083557951482, + "grad_norm": 0.9153226613998413, + "learning_rate": 8.793092282784672e-05, + "loss": 3.2056, + "step": 79200 + }, + { + "epoch": 8.544474393530997, + "grad_norm": 0.8617915511131287, + "learning_rate": 8.76071235833783e-05, + "loss": 3.1855, + "step": 79250 + }, + { + "epoch": 8.549865229110512, + "grad_norm": 0.8766077756881714, + "learning_rate": 8.728332433890987e-05, + "loss": 3.163, + "step": 79300 + }, + { + "epoch": 8.555256064690028, + "grad_norm": 0.8733969330787659, + "learning_rate": 8.695952509444144e-05, + "loss": 3.183, + "step": 79350 + }, + { + "epoch": 8.560646900269543, + "grad_norm": 0.9204221963882446, + "learning_rate": 8.663572584997301e-05, + "loss": 3.1797, + "step": 79400 + }, + { + "epoch": 8.566037735849056, + "grad_norm": 0.859405517578125, + "learning_rate": 8.631192660550457e-05, + "loss": 3.1643, + "step": 79450 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 0.8825210928916931, + "learning_rate": 8.598812736103615e-05, + "loss": 3.174, + "step": 79500 + }, + { + "epoch": 8.576819407008086, + "grad_norm": 0.9521044492721558, + "learning_rate": 8.566432811656773e-05, + "loss": 3.1751, + "step": 79550 + }, + { + "epoch": 8.582210242587601, + "grad_norm": 0.8510432243347168, + "learning_rate": 8.534052887209928e-05, + "loss": 3.1599, + "step": 79600 + }, + { + "epoch": 8.587601078167117, + "grad_norm": 0.8742794394493103, + "learning_rate": 8.501672962763086e-05, + "loss": 3.1736, + "step": 79650 + }, + { + "epoch": 8.59299191374663, + "grad_norm": 0.8450675010681152, + "learning_rate": 8.469293038316243e-05, + "loss": 3.1715, + "step": 79700 + }, + { + "epoch": 8.598382749326145, + "grad_norm": 0.8777676224708557, + "learning_rate": 8.436913113869401e-05, + "loss": 3.1811, + "step": 79750 + }, + { + "epoch": 8.60377358490566, + "grad_norm": 0.8752297759056091, + "learning_rate": 8.404533189422556e-05, + "loss": 3.1833, + "step": 79800 + }, + { + "epoch": 8.609164420485175, + "grad_norm": 0.8393634557723999, + "learning_rate": 8.372153264975714e-05, + "loss": 3.1636, + "step": 79850 + }, + { + "epoch": 8.61455525606469, + "grad_norm": 0.8718554377555847, + "learning_rate": 8.339773340528872e-05, + "loss": 3.2053, + "step": 79900 + }, + { + "epoch": 8.619946091644206, + "grad_norm": 0.8281534910202026, + "learning_rate": 8.307393416082028e-05, + "loss": 3.1834, + "step": 79950 + }, + { + "epoch": 8.625336927223719, + "grad_norm": 0.8530157804489136, + "learning_rate": 8.275013491635186e-05, + "loss": 3.1957, + "step": 80000 + }, + { + "epoch": 8.625336927223719, + "eval_accuracy": 0.3918415371515189, + "eval_loss": 3.3219223022460938, + "eval_runtime": 185.7657, + "eval_samples_per_second": 96.955, + "eval_steps_per_second": 6.061, + "step": 80000 + } + ], + "logging_steps": 50, + "max_steps": 92750, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.688553435136e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}