diff --git "a/checkpoint-30000/trainer_state.json" "b/checkpoint-30000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-30000/trainer_state.json" @@ -0,0 +1,4503 @@ +{ + "best_metric": 3.490234613418579, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M__8397/checkpoint-30000", + "epoch": 3.234501347708895, + "eval_steps": 1000, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005390835579514825, + "grad_norm": 1.9401150941848755, + "learning_rate": 0.0003, + "loss": 8.6351, + "step": 50 + }, + { + "epoch": 0.01078167115902965, + "grad_norm": 1.2623108625411987, + "learning_rate": 0.0006, + "loss": 6.8864, + "step": 100 + }, + { + "epoch": 0.016172506738544475, + "grad_norm": 3.276719808578491, + "learning_rate": 0.0005996762007555315, + "loss": 6.4636, + "step": 150 + }, + { + "epoch": 0.0215633423180593, + "grad_norm": 2.706618309020996, + "learning_rate": 0.000599352401511063, + "loss": 6.2398, + "step": 200 + }, + { + "epoch": 0.026954177897574125, + "grad_norm": 1.0366517305374146, + "learning_rate": 0.0005990286022665946, + "loss": 6.0879, + "step": 250 + }, + { + "epoch": 0.03234501347708895, + "grad_norm": 2.069812297821045, + "learning_rate": 0.0005987048030221263, + "loss": 5.9796, + "step": 300 + }, + { + "epoch": 0.03773584905660377, + "grad_norm": 1.1950629949569702, + "learning_rate": 0.0005983810037776578, + "loss": 5.8688, + "step": 350 + }, + { + "epoch": 0.0431266846361186, + "grad_norm": 1.0007357597351074, + "learning_rate": 0.0005980572045331894, + "loss": 5.8233, + "step": 400 + }, + { + "epoch": 0.04851752021563342, + "grad_norm": 2.0554916858673096, + "learning_rate": 0.0005977334052887209, + "loss": 5.7452, + "step": 450 + }, + { + "epoch": 0.05390835579514825, + "grad_norm": 1.996433973312378, + "learning_rate": 0.0005974096060442526, + "loss": 5.6484, + "step": 500 + }, + { + "epoch": 0.05929919137466307, + "grad_norm": 1.439592719078064, + "learning_rate": 0.0005970858067997841, + "loss": 5.5734, + "step": 550 + }, + { + "epoch": 0.0646900269541779, + "grad_norm": 1.0077263116836548, + "learning_rate": 0.0005967620075553157, + "loss": 5.4903, + "step": 600 + }, + { + "epoch": 0.07008086253369272, + "grad_norm": 1.460601568222046, + "learning_rate": 0.0005964382083108472, + "loss": 5.4273, + "step": 650 + }, + { + "epoch": 0.07547169811320754, + "grad_norm": 1.5479509830474854, + "learning_rate": 0.0005961144090663788, + "loss": 5.3803, + "step": 700 + }, + { + "epoch": 0.08086253369272237, + "grad_norm": 1.2044662237167358, + "learning_rate": 0.0005957906098219104, + "loss": 5.305, + "step": 750 + }, + { + "epoch": 0.0862533692722372, + "grad_norm": 1.4456267356872559, + "learning_rate": 0.0005954668105774419, + "loss": 5.2577, + "step": 800 + }, + { + "epoch": 0.09164420485175202, + "grad_norm": 1.4306107759475708, + "learning_rate": 0.0005951430113329735, + "loss": 5.2168, + "step": 850 + }, + { + "epoch": 0.09703504043126684, + "grad_norm": 1.5723680257797241, + "learning_rate": 0.0005948192120885051, + "loss": 5.1595, + "step": 900 + }, + { + "epoch": 0.10242587601078167, + "grad_norm": 1.4030689001083374, + "learning_rate": 0.0005944954128440366, + "loss": 5.1322, + "step": 950 + }, + { + "epoch": 0.1078167115902965, + "grad_norm": 0.9765119552612305, + "learning_rate": 0.0005941716135995682, + "loss": 5.0934, + "step": 1000 + }, + { + "epoch": 0.1078167115902965, + "eval_accuracy": 0.22754229014862737, + "eval_loss": 5.022522926330566, + "eval_runtime": 185.9242, + "eval_samples_per_second": 96.873, + "eval_steps_per_second": 6.056, + "step": 1000 + }, + { + "epoch": 0.11320754716981132, + "grad_norm": 0.8278682827949524, + "learning_rate": 0.0005938478143550997, + "loss": 5.0442, + "step": 1050 + }, + { + "epoch": 0.11859838274932614, + "grad_norm": 1.0204647779464722, + "learning_rate": 0.0005935240151106314, + "loss": 5.0209, + "step": 1100 + }, + { + "epoch": 0.12398921832884097, + "grad_norm": 1.004250407218933, + "learning_rate": 0.0005932002158661629, + "loss": 4.9915, + "step": 1150 + }, + { + "epoch": 0.1293800539083558, + "grad_norm": 1.2949087619781494, + "learning_rate": 0.0005928764166216945, + "loss": 4.9493, + "step": 1200 + }, + { + "epoch": 0.1347708894878706, + "grad_norm": 0.8809250593185425, + "learning_rate": 0.000592552617377226, + "loss": 4.9039, + "step": 1250 + }, + { + "epoch": 0.14016172506738545, + "grad_norm": 1.0270975828170776, + "learning_rate": 0.0005922288181327577, + "loss": 4.8792, + "step": 1300 + }, + { + "epoch": 0.14555256064690028, + "grad_norm": 0.9649767279624939, + "learning_rate": 0.0005919050188882893, + "loss": 4.8622, + "step": 1350 + }, + { + "epoch": 0.1509433962264151, + "grad_norm": 0.9902446866035461, + "learning_rate": 0.0005915812196438207, + "loss": 4.8481, + "step": 1400 + }, + { + "epoch": 0.15633423180592992, + "grad_norm": 0.7882956862449646, + "learning_rate": 0.0005912574203993524, + "loss": 4.7968, + "step": 1450 + }, + { + "epoch": 0.16172506738544473, + "grad_norm": 0.8941258788108826, + "learning_rate": 0.0005909336211548839, + "loss": 4.8149, + "step": 1500 + }, + { + "epoch": 0.16711590296495957, + "grad_norm": 1.3297462463378906, + "learning_rate": 0.0005906098219104155, + "loss": 4.7656, + "step": 1550 + }, + { + "epoch": 0.1725067385444744, + "grad_norm": 1.497783899307251, + "learning_rate": 0.000590286022665947, + "loss": 4.7642, + "step": 1600 + }, + { + "epoch": 0.1778975741239892, + "grad_norm": 1.0149500370025635, + "learning_rate": 0.0005899622234214787, + "loss": 4.7241, + "step": 1650 + }, + { + "epoch": 0.18328840970350405, + "grad_norm": 1.067275047302246, + "learning_rate": 0.0005896384241770102, + "loss": 4.7115, + "step": 1700 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 0.7510073781013489, + "learning_rate": 0.0005893146249325418, + "loss": 4.6734, + "step": 1750 + }, + { + "epoch": 0.1940700808625337, + "grad_norm": 1.0857677459716797, + "learning_rate": 0.0005889908256880733, + "loss": 4.6802, + "step": 1800 + }, + { + "epoch": 0.19946091644204852, + "grad_norm": 0.9117745161056519, + "learning_rate": 0.0005886670264436049, + "loss": 4.6481, + "step": 1850 + }, + { + "epoch": 0.20485175202156333, + "grad_norm": 0.9455132484436035, + "learning_rate": 0.0005883432271991365, + "loss": 4.6385, + "step": 1900 + }, + { + "epoch": 0.21024258760107817, + "grad_norm": 0.8344852924346924, + "learning_rate": 0.0005880194279546681, + "loss": 4.6113, + "step": 1950 + }, + { + "epoch": 0.215633423180593, + "grad_norm": 1.0357882976531982, + "learning_rate": 0.0005876956287101996, + "loss": 4.5937, + "step": 2000 + }, + { + "epoch": 0.215633423180593, + "eval_accuracy": 0.27015801715433524, + "eval_loss": 4.516484260559082, + "eval_runtime": 185.6539, + "eval_samples_per_second": 97.014, + "eval_steps_per_second": 6.065, + "step": 2000 + }, + { + "epoch": 0.2210242587601078, + "grad_norm": 0.6943921446800232, + "learning_rate": 0.0005873718294657312, + "loss": 4.5747, + "step": 2050 + }, + { + "epoch": 0.22641509433962265, + "grad_norm": 0.8243718147277832, + "learning_rate": 0.0005870480302212628, + "loss": 4.5582, + "step": 2100 + }, + { + "epoch": 0.23180592991913745, + "grad_norm": 1.0496394634246826, + "learning_rate": 0.0005867242309767943, + "loss": 4.5315, + "step": 2150 + }, + { + "epoch": 0.2371967654986523, + "grad_norm": 0.8335784077644348, + "learning_rate": 0.0005864004317323259, + "loss": 4.5086, + "step": 2200 + }, + { + "epoch": 0.24258760107816713, + "grad_norm": 0.8698639273643494, + "learning_rate": 0.0005860766324878575, + "loss": 4.4958, + "step": 2250 + }, + { + "epoch": 0.24797843665768193, + "grad_norm": 0.7829259634017944, + "learning_rate": 0.000585752833243389, + "loss": 4.5067, + "step": 2300 + }, + { + "epoch": 0.25336927223719674, + "grad_norm": 0.8259062767028809, + "learning_rate": 0.0005854290339989206, + "loss": 4.4569, + "step": 2350 + }, + { + "epoch": 0.2587601078167116, + "grad_norm": 0.8445196151733398, + "learning_rate": 0.0005851052347544521, + "loss": 4.4611, + "step": 2400 + }, + { + "epoch": 0.2641509433962264, + "grad_norm": 0.8523595929145813, + "learning_rate": 0.0005847814355099838, + "loss": 4.4402, + "step": 2450 + }, + { + "epoch": 0.2695417789757412, + "grad_norm": 0.8830829858779907, + "learning_rate": 0.0005844576362655154, + "loss": 4.4235, + "step": 2500 + }, + { + "epoch": 0.2749326145552561, + "grad_norm": 0.8365247845649719, + "learning_rate": 0.0005841338370210469, + "loss": 4.4101, + "step": 2550 + }, + { + "epoch": 0.2803234501347709, + "grad_norm": 0.853497326374054, + "learning_rate": 0.0005838100377765785, + "loss": 4.3826, + "step": 2600 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.9241225719451904, + "learning_rate": 0.0005834862385321101, + "loss": 4.3931, + "step": 2650 + }, + { + "epoch": 0.29110512129380056, + "grad_norm": 0.8265724778175354, + "learning_rate": 0.0005831624392876417, + "loss": 4.3887, + "step": 2700 + }, + { + "epoch": 0.29649595687331537, + "grad_norm": 0.8722224831581116, + "learning_rate": 0.0005828386400431731, + "loss": 4.3563, + "step": 2750 + }, + { + "epoch": 0.3018867924528302, + "grad_norm": 0.7775854468345642, + "learning_rate": 0.0005825148407987048, + "loss": 4.3271, + "step": 2800 + }, + { + "epoch": 0.30727762803234504, + "grad_norm": 0.7281326055526733, + "learning_rate": 0.0005821910415542363, + "loss": 4.3363, + "step": 2850 + }, + { + "epoch": 0.31266846361185985, + "grad_norm": 0.8523993492126465, + "learning_rate": 0.0005818672423097679, + "loss": 4.3284, + "step": 2900 + }, + { + "epoch": 0.31805929919137466, + "grad_norm": 0.8152425289154053, + "learning_rate": 0.0005815434430652994, + "loss": 4.313, + "step": 2950 + }, + { + "epoch": 0.32345013477088946, + "grad_norm": 0.9258395433425903, + "learning_rate": 0.0005812196438208311, + "loss": 4.3092, + "step": 3000 + }, + { + "epoch": 0.32345013477088946, + "eval_accuracy": 0.29783223363934697, + "eval_loss": 4.246051788330078, + "eval_runtime": 185.609, + "eval_samples_per_second": 97.037, + "eval_steps_per_second": 6.067, + "step": 3000 + }, + { + "epoch": 0.3288409703504043, + "grad_norm": 0.78269362449646, + "learning_rate": 0.0005808958445763626, + "loss": 4.304, + "step": 3050 + }, + { + "epoch": 0.33423180592991913, + "grad_norm": 0.7207179069519043, + "learning_rate": 0.0005805720453318942, + "loss": 4.3041, + "step": 3100 + }, + { + "epoch": 0.33962264150943394, + "grad_norm": 0.791796088218689, + "learning_rate": 0.0005802482460874257, + "loss": 4.2633, + "step": 3150 + }, + { + "epoch": 0.3450134770889488, + "grad_norm": 0.6603590250015259, + "learning_rate": 0.0005799244468429573, + "loss": 4.2613, + "step": 3200 + }, + { + "epoch": 0.3504043126684636, + "grad_norm": 0.922822892665863, + "learning_rate": 0.0005796006475984889, + "loss": 4.2617, + "step": 3250 + }, + { + "epoch": 0.3557951482479784, + "grad_norm": 0.7605053782463074, + "learning_rate": 0.0005792768483540205, + "loss": 4.2446, + "step": 3300 + }, + { + "epoch": 0.3611859838274933, + "grad_norm": 0.7679175734519958, + "learning_rate": 0.000578953049109552, + "loss": 4.26, + "step": 3350 + }, + { + "epoch": 0.3665768194070081, + "grad_norm": 0.7325921654701233, + "learning_rate": 0.0005786292498650836, + "loss": 4.2337, + "step": 3400 + }, + { + "epoch": 0.3719676549865229, + "grad_norm": 0.6152936816215515, + "learning_rate": 0.0005783054506206152, + "loss": 4.21, + "step": 3450 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 0.7148654460906982, + "learning_rate": 0.0005779816513761467, + "loss": 4.2203, + "step": 3500 + }, + { + "epoch": 0.38274932614555257, + "grad_norm": 0.8136675953865051, + "learning_rate": 0.0005776578521316782, + "loss": 4.2143, + "step": 3550 + }, + { + "epoch": 0.3881401617250674, + "grad_norm": 0.6423671245574951, + "learning_rate": 0.0005773340528872099, + "loss": 4.2164, + "step": 3600 + }, + { + "epoch": 0.3935309973045822, + "grad_norm": 0.8761276602745056, + "learning_rate": 0.0005770102536427414, + "loss": 4.1887, + "step": 3650 + }, + { + "epoch": 0.39892183288409705, + "grad_norm": 0.7816224694252014, + "learning_rate": 0.000576686454398273, + "loss": 4.1937, + "step": 3700 + }, + { + "epoch": 0.40431266846361186, + "grad_norm": 0.7834933400154114, + "learning_rate": 0.0005763626551538045, + "loss": 4.1965, + "step": 3750 + }, + { + "epoch": 0.40970350404312667, + "grad_norm": 0.6208221912384033, + "learning_rate": 0.0005760388559093362, + "loss": 4.1864, + "step": 3800 + }, + { + "epoch": 0.41509433962264153, + "grad_norm": 0.6872389316558838, + "learning_rate": 0.0005757150566648678, + "loss": 4.1732, + "step": 3850 + }, + { + "epoch": 0.42048517520215634, + "grad_norm": 0.7426034808158875, + "learning_rate": 0.0005753912574203993, + "loss": 4.1682, + "step": 3900 + }, + { + "epoch": 0.42587601078167114, + "grad_norm": 0.7201637625694275, + "learning_rate": 0.0005750674581759309, + "loss": 4.157, + "step": 3950 + }, + { + "epoch": 0.431266846361186, + "grad_norm": 0.7274760007858276, + "learning_rate": 0.0005747436589314624, + "loss": 4.1586, + "step": 4000 + }, + { + "epoch": 0.431266846361186, + "eval_accuracy": 0.3127459290207626, + "eval_loss": 4.088518142700195, + "eval_runtime": 185.6568, + "eval_samples_per_second": 97.012, + "eval_steps_per_second": 6.065, + "step": 4000 + }, + { + "epoch": 0.4366576819407008, + "grad_norm": 0.5464039444923401, + "learning_rate": 0.0005744198596869941, + "loss": 4.149, + "step": 4050 + }, + { + "epoch": 0.4420485175202156, + "grad_norm": 0.7344082593917847, + "learning_rate": 0.0005740960604425255, + "loss": 4.1532, + "step": 4100 + }, + { + "epoch": 0.4474393530997305, + "grad_norm": 0.6581411957740784, + "learning_rate": 0.0005737722611980572, + "loss": 4.1547, + "step": 4150 + }, + { + "epoch": 0.4528301886792453, + "grad_norm": 0.6639128923416138, + "learning_rate": 0.0005734484619535887, + "loss": 4.1367, + "step": 4200 + }, + { + "epoch": 0.4582210242587601, + "grad_norm": 0.734761655330658, + "learning_rate": 0.0005731246627091203, + "loss": 4.1221, + "step": 4250 + }, + { + "epoch": 0.4636118598382749, + "grad_norm": 0.8860589265823364, + "learning_rate": 0.0005728008634646518, + "loss": 4.1218, + "step": 4300 + }, + { + "epoch": 0.46900269541778977, + "grad_norm": 0.5995060205459595, + "learning_rate": 0.0005724770642201835, + "loss": 4.1167, + "step": 4350 + }, + { + "epoch": 0.4743935309973046, + "grad_norm": 0.7229450345039368, + "learning_rate": 0.000572153264975715, + "loss": 4.1148, + "step": 4400 + }, + { + "epoch": 0.4797843665768194, + "grad_norm": 0.731573224067688, + "learning_rate": 0.0005718294657312466, + "loss": 4.1018, + "step": 4450 + }, + { + "epoch": 0.48517520215633425, + "grad_norm": 0.7716799974441528, + "learning_rate": 0.0005715056664867781, + "loss": 4.109, + "step": 4500 + }, + { + "epoch": 0.49056603773584906, + "grad_norm": 0.542386531829834, + "learning_rate": 0.0005711818672423097, + "loss": 4.1017, + "step": 4550 + }, + { + "epoch": 0.49595687331536387, + "grad_norm": 0.794573187828064, + "learning_rate": 0.0005708580679978413, + "loss": 4.086, + "step": 4600 + }, + { + "epoch": 0.5013477088948787, + "grad_norm": 0.6180745959281921, + "learning_rate": 0.0005705342687533729, + "loss": 4.0787, + "step": 4650 + }, + { + "epoch": 0.5067385444743935, + "grad_norm": 0.6277428269386292, + "learning_rate": 0.0005702104695089044, + "loss": 4.0932, + "step": 4700 + }, + { + "epoch": 0.5121293800539084, + "grad_norm": 0.6428430676460266, + "learning_rate": 0.000569886670264436, + "loss": 4.0812, + "step": 4750 + }, + { + "epoch": 0.5175202156334232, + "grad_norm": 0.6423416137695312, + "learning_rate": 0.0005695628710199675, + "loss": 4.0791, + "step": 4800 + }, + { + "epoch": 0.522911051212938, + "grad_norm": 0.6095171570777893, + "learning_rate": 0.0005692390717754991, + "loss": 4.0655, + "step": 4850 + }, + { + "epoch": 0.5283018867924528, + "grad_norm": 0.6891219615936279, + "learning_rate": 0.0005689152725310306, + "loss": 4.0527, + "step": 4900 + }, + { + "epoch": 0.5336927223719676, + "grad_norm": 0.6701120734214783, + "learning_rate": 0.0005685914732865623, + "loss": 4.0714, + "step": 4950 + }, + { + "epoch": 0.5390835579514824, + "grad_norm": 0.5252517461776733, + "learning_rate": 0.0005682676740420939, + "loss": 4.0492, + "step": 5000 + }, + { + "epoch": 0.5390835579514824, + "eval_accuracy": 0.32119879773406573, + "eval_loss": 3.9914467334747314, + "eval_runtime": 185.5695, + "eval_samples_per_second": 97.058, + "eval_steps_per_second": 6.068, + "step": 5000 + }, + { + "epoch": 0.5444743935309974, + "grad_norm": 0.6187731623649597, + "learning_rate": 0.0005679438747976254, + "loss": 4.0591, + "step": 5050 + }, + { + "epoch": 0.5498652291105122, + "grad_norm": 0.6102878451347351, + "learning_rate": 0.000567620075553157, + "loss": 4.0513, + "step": 5100 + }, + { + "epoch": 0.555256064690027, + "grad_norm": 0.708706259727478, + "learning_rate": 0.0005672962763086886, + "loss": 4.0489, + "step": 5150 + }, + { + "epoch": 0.5606469002695418, + "grad_norm": 0.6461382508277893, + "learning_rate": 0.0005669724770642202, + "loss": 4.0661, + "step": 5200 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 0.7211845517158508, + "learning_rate": 0.0005666486778197517, + "loss": 4.0418, + "step": 5250 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.7692673802375793, + "learning_rate": 0.0005663248785752833, + "loss": 4.0325, + "step": 5300 + }, + { + "epoch": 0.5768194070080862, + "grad_norm": 0.5925557613372803, + "learning_rate": 0.0005660010793308148, + "loss": 4.0266, + "step": 5350 + }, + { + "epoch": 0.5822102425876011, + "grad_norm": 0.5735204815864563, + "learning_rate": 0.0005656772800863465, + "loss": 4.0372, + "step": 5400 + }, + { + "epoch": 0.5876010781671159, + "grad_norm": 0.5482949018478394, + "learning_rate": 0.0005653534808418779, + "loss": 4.036, + "step": 5450 + }, + { + "epoch": 0.5929919137466307, + "grad_norm": 0.7778071165084839, + "learning_rate": 0.0005650296815974096, + "loss": 4.0203, + "step": 5500 + }, + { + "epoch": 0.5983827493261455, + "grad_norm": 0.603145956993103, + "learning_rate": 0.0005647058823529411, + "loss": 4.009, + "step": 5550 + }, + { + "epoch": 0.6037735849056604, + "grad_norm": 0.5814369916915894, + "learning_rate": 0.0005643820831084727, + "loss": 4.0256, + "step": 5600 + }, + { + "epoch": 0.6091644204851752, + "grad_norm": 0.6685661673545837, + "learning_rate": 0.0005640582838640042, + "loss": 4.0055, + "step": 5650 + }, + { + "epoch": 0.6145552560646901, + "grad_norm": 0.5345515608787537, + "learning_rate": 0.0005637344846195358, + "loss": 4.0021, + "step": 5700 + }, + { + "epoch": 0.6199460916442049, + "grad_norm": 0.6599013209342957, + "learning_rate": 0.0005634106853750674, + "loss": 3.9773, + "step": 5750 + }, + { + "epoch": 0.6253369272237197, + "grad_norm": 0.6814241409301758, + "learning_rate": 0.000563086886130599, + "loss": 3.985, + "step": 5800 + }, + { + "epoch": 0.6307277628032345, + "grad_norm": 0.6166688203811646, + "learning_rate": 0.0005627630868861305, + "loss": 3.9948, + "step": 5850 + }, + { + "epoch": 0.6361185983827493, + "grad_norm": 0.6375031471252441, + "learning_rate": 0.0005624392876416621, + "loss": 3.995, + "step": 5900 + }, + { + "epoch": 0.6415094339622641, + "grad_norm": 0.6021779179573059, + "learning_rate": 0.0005621154883971937, + "loss": 3.9838, + "step": 5950 + }, + { + "epoch": 0.6469002695417789, + "grad_norm": 0.6350939273834229, + "learning_rate": 0.0005617916891527253, + "loss": 3.9962, + "step": 6000 + }, + { + "epoch": 0.6469002695417789, + "eval_accuracy": 0.3278974655736041, + "eval_loss": 3.919321060180664, + "eval_runtime": 185.6272, + "eval_samples_per_second": 97.028, + "eval_steps_per_second": 6.066, + "step": 6000 + }, + { + "epoch": 0.6522911051212938, + "grad_norm": 0.6063998937606812, + "learning_rate": 0.0005614678899082568, + "loss": 3.9623, + "step": 6050 + }, + { + "epoch": 0.6576819407008087, + "grad_norm": 0.6236124634742737, + "learning_rate": 0.0005611440906637884, + "loss": 3.9998, + "step": 6100 + }, + { + "epoch": 0.6630727762803235, + "grad_norm": 0.7479600310325623, + "learning_rate": 0.0005608267674042094, + "loss": 3.9792, + "step": 6150 + }, + { + "epoch": 0.6684636118598383, + "grad_norm": 0.5813738703727722, + "learning_rate": 0.0005605029681597409, + "loss": 3.953, + "step": 6200 + }, + { + "epoch": 0.6738544474393531, + "grad_norm": 0.5313798785209656, + "learning_rate": 0.0005601791689152725, + "loss": 3.972, + "step": 6250 + }, + { + "epoch": 0.6792452830188679, + "grad_norm": 0.6238154768943787, + "learning_rate": 0.000559855369670804, + "loss": 3.9769, + "step": 6300 + }, + { + "epoch": 0.6846361185983828, + "grad_norm": 0.5220109224319458, + "learning_rate": 0.0005595315704263357, + "loss": 3.9613, + "step": 6350 + }, + { + "epoch": 0.6900269541778976, + "grad_norm": 0.673324704170227, + "learning_rate": 0.0005592077711818672, + "loss": 3.9706, + "step": 6400 + }, + { + "epoch": 0.6954177897574124, + "grad_norm": 0.6563860177993774, + "learning_rate": 0.0005588839719373988, + "loss": 3.969, + "step": 6450 + }, + { + "epoch": 0.7008086253369272, + "grad_norm": 0.6665467023849487, + "learning_rate": 0.0005585601726929303, + "loss": 3.9606, + "step": 6500 + }, + { + "epoch": 0.706199460916442, + "grad_norm": 0.5975165367126465, + "learning_rate": 0.0005582363734484619, + "loss": 3.959, + "step": 6550 + }, + { + "epoch": 0.7115902964959568, + "grad_norm": 0.6370920538902283, + "learning_rate": 0.0005579125742039935, + "loss": 3.9606, + "step": 6600 + }, + { + "epoch": 0.7169811320754716, + "grad_norm": 0.6234825849533081, + "learning_rate": 0.0005575887749595251, + "loss": 3.97, + "step": 6650 + }, + { + "epoch": 0.7223719676549866, + "grad_norm": 0.6267138123512268, + "learning_rate": 0.0005572649757150566, + "loss": 3.9344, + "step": 6700 + }, + { + "epoch": 0.7277628032345014, + "grad_norm": 0.5611273050308228, + "learning_rate": 0.0005569411764705882, + "loss": 3.9631, + "step": 6750 + }, + { + "epoch": 0.7331536388140162, + "grad_norm": 0.6250568628311157, + "learning_rate": 0.0005566173772261198, + "loss": 3.9344, + "step": 6800 + }, + { + "epoch": 0.738544474393531, + "grad_norm": 0.5902653932571411, + "learning_rate": 0.0005562935779816513, + "loss": 3.9345, + "step": 6850 + }, + { + "epoch": 0.7439353099730458, + "grad_norm": 0.5684419274330139, + "learning_rate": 0.0005559697787371828, + "loss": 3.9367, + "step": 6900 + }, + { + "epoch": 0.7493261455525606, + "grad_norm": 0.5950194001197815, + "learning_rate": 0.0005556459794927145, + "loss": 3.9418, + "step": 6950 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 0.6283013820648193, + "learning_rate": 0.000555322180248246, + "loss": 3.9219, + "step": 7000 + }, + { + "epoch": 0.7547169811320755, + "eval_accuracy": 0.33346299244612526, + "eval_loss": 3.8631539344787598, + "eval_runtime": 185.3774, + "eval_samples_per_second": 97.159, + "eval_steps_per_second": 6.074, + "step": 7000 + }, + { + "epoch": 0.7601078167115903, + "grad_norm": 0.6419661641120911, + "learning_rate": 0.0005549983810037776, + "loss": 3.9341, + "step": 7050 + }, + { + "epoch": 0.7654986522911051, + "grad_norm": 0.8758557438850403, + "learning_rate": 0.0005546745817593091, + "loss": 3.9093, + "step": 7100 + }, + { + "epoch": 0.77088948787062, + "grad_norm": 0.6024784445762634, + "learning_rate": 0.0005543507825148408, + "loss": 3.9347, + "step": 7150 + }, + { + "epoch": 0.7762803234501348, + "grad_norm": 0.5663654208183289, + "learning_rate": 0.0005540269832703723, + "loss": 3.9072, + "step": 7200 + }, + { + "epoch": 0.7816711590296496, + "grad_norm": 0.6945359110832214, + "learning_rate": 0.0005537031840259039, + "loss": 3.899, + "step": 7250 + }, + { + "epoch": 0.7870619946091644, + "grad_norm": 0.5187634229660034, + "learning_rate": 0.0005533793847814354, + "loss": 3.9124, + "step": 7300 + }, + { + "epoch": 0.7924528301886793, + "grad_norm": 0.6671504378318787, + "learning_rate": 0.000553055585536967, + "loss": 3.9133, + "step": 7350 + }, + { + "epoch": 0.7978436657681941, + "grad_norm": 0.5317527055740356, + "learning_rate": 0.0005527317862924987, + "loss": 3.9103, + "step": 7400 + }, + { + "epoch": 0.8032345013477089, + "grad_norm": 0.567512571811676, + "learning_rate": 0.0005524079870480301, + "loss": 3.8879, + "step": 7450 + }, + { + "epoch": 0.8086253369272237, + "grad_norm": 0.5598046183586121, + "learning_rate": 0.0005520841878035618, + "loss": 3.9029, + "step": 7500 + }, + { + "epoch": 0.8140161725067385, + "grad_norm": 0.6032840013504028, + "learning_rate": 0.0005517603885590933, + "loss": 3.8964, + "step": 7550 + }, + { + "epoch": 0.8194070080862533, + "grad_norm": 0.5689701437950134, + "learning_rate": 0.0005514365893146249, + "loss": 3.8911, + "step": 7600 + }, + { + "epoch": 0.8247978436657682, + "grad_norm": 0.5858896374702454, + "learning_rate": 0.0005511127900701564, + "loss": 3.8974, + "step": 7650 + }, + { + "epoch": 0.8301886792452831, + "grad_norm": 0.6379143595695496, + "learning_rate": 0.000550788990825688, + "loss": 3.8913, + "step": 7700 + }, + { + "epoch": 0.8355795148247979, + "grad_norm": 0.6346825361251831, + "learning_rate": 0.0005504651915812196, + "loss": 3.8938, + "step": 7750 + }, + { + "epoch": 0.8409703504043127, + "grad_norm": 0.651748776435852, + "learning_rate": 0.0005501413923367512, + "loss": 3.8768, + "step": 7800 + }, + { + "epoch": 0.8463611859838275, + "grad_norm": 0.5529057383537292, + "learning_rate": 0.0005498175930922827, + "loss": 3.8814, + "step": 7850 + }, + { + "epoch": 0.8517520215633423, + "grad_norm": 0.5781732797622681, + "learning_rate": 0.0005494937938478143, + "loss": 3.8778, + "step": 7900 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.599770188331604, + "learning_rate": 0.0005491699946033459, + "loss": 3.9077, + "step": 7950 + }, + { + "epoch": 0.862533692722372, + "grad_norm": 0.6126531958580017, + "learning_rate": 0.0005488461953588775, + "loss": 3.8566, + "step": 8000 + }, + { + "epoch": 0.862533692722372, + "eval_accuracy": 0.3376529737589151, + "eval_loss": 3.815525531768799, + "eval_runtime": 186.958, + "eval_samples_per_second": 96.337, + "eval_steps_per_second": 6.023, + "step": 8000 + }, + { + "epoch": 0.8679245283018868, + "grad_norm": 0.6914803981781006, + "learning_rate": 0.000548522396114409, + "loss": 3.8954, + "step": 8050 + }, + { + "epoch": 0.8733153638814016, + "grad_norm": 0.6908184289932251, + "learning_rate": 0.0005481985968699406, + "loss": 3.8747, + "step": 8100 + }, + { + "epoch": 0.8787061994609164, + "grad_norm": 0.5247132778167725, + "learning_rate": 0.0005478747976254721, + "loss": 3.8785, + "step": 8150 + }, + { + "epoch": 0.8840970350404312, + "grad_norm": 0.585969090461731, + "learning_rate": 0.0005475509983810037, + "loss": 3.8734, + "step": 8200 + }, + { + "epoch": 0.889487870619946, + "grad_norm": 0.5993847250938416, + "learning_rate": 0.0005472271991365352, + "loss": 3.8685, + "step": 8250 + }, + { + "epoch": 0.894878706199461, + "grad_norm": 0.6115968227386475, + "learning_rate": 0.0005469033998920669, + "loss": 3.8867, + "step": 8300 + }, + { + "epoch": 0.9002695417789758, + "grad_norm": 0.5844521522521973, + "learning_rate": 0.0005465860766324878, + "loss": 3.8566, + "step": 8350 + }, + { + "epoch": 0.9056603773584906, + "grad_norm": 0.5750303864479065, + "learning_rate": 0.0005462622773880194, + "loss": 3.8639, + "step": 8400 + }, + { + "epoch": 0.9110512129380054, + "grad_norm": 0.5799904465675354, + "learning_rate": 0.000545938478143551, + "loss": 3.859, + "step": 8450 + }, + { + "epoch": 0.9164420485175202, + "grad_norm": 0.6240545511245728, + "learning_rate": 0.0005456146788990825, + "loss": 3.8648, + "step": 8500 + }, + { + "epoch": 0.921832884097035, + "grad_norm": 0.5578630566596985, + "learning_rate": 0.000545290879654614, + "loss": 3.8597, + "step": 8550 + }, + { + "epoch": 0.9272237196765498, + "grad_norm": 0.5732514262199402, + "learning_rate": 0.0005449670804101457, + "loss": 3.8652, + "step": 8600 + }, + { + "epoch": 0.9326145552560647, + "grad_norm": 0.614829957485199, + "learning_rate": 0.0005446432811656773, + "loss": 3.845, + "step": 8650 + }, + { + "epoch": 0.9380053908355795, + "grad_norm": 0.6199792623519897, + "learning_rate": 0.0005443194819212088, + "loss": 3.8484, + "step": 8700 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 0.5770998597145081, + "learning_rate": 0.0005439956826767404, + "loss": 3.8456, + "step": 8750 + }, + { + "epoch": 0.9487870619946092, + "grad_norm": 0.5756552815437317, + "learning_rate": 0.000543671883432272, + "loss": 3.8492, + "step": 8800 + }, + { + "epoch": 0.954177897574124, + "grad_norm": 0.5448219180107117, + "learning_rate": 0.0005433480841878035, + "loss": 3.8518, + "step": 8850 + }, + { + "epoch": 0.9595687331536388, + "grad_norm": 0.5587062239646912, + "learning_rate": 0.000543024284943335, + "loss": 3.8389, + "step": 8900 + }, + { + "epoch": 0.9649595687331537, + "grad_norm": 0.5609190464019775, + "learning_rate": 0.0005427004856988667, + "loss": 3.8303, + "step": 8950 + }, + { + "epoch": 0.9703504043126685, + "grad_norm": 0.6230307817459106, + "learning_rate": 0.0005423766864543982, + "loss": 3.856, + "step": 9000 + }, + { + "epoch": 0.9703504043126685, + "eval_accuracy": 0.34079347682830485, + "eval_loss": 3.7813315391540527, + "eval_runtime": 186.6989, + "eval_samples_per_second": 96.471, + "eval_steps_per_second": 6.031, + "step": 9000 + }, + { + "epoch": 0.9757412398921833, + "grad_norm": 0.6406134963035583, + "learning_rate": 0.0005420528872099298, + "loss": 3.871, + "step": 9050 + }, + { + "epoch": 0.9811320754716981, + "grad_norm": 0.49706608057022095, + "learning_rate": 0.0005417290879654613, + "loss": 3.8273, + "step": 9100 + }, + { + "epoch": 0.9865229110512129, + "grad_norm": 0.5765336751937866, + "learning_rate": 0.000541405288720993, + "loss": 3.8418, + "step": 9150 + }, + { + "epoch": 0.9919137466307277, + "grad_norm": 0.5545505881309509, + "learning_rate": 0.0005410814894765245, + "loss": 3.8285, + "step": 9200 + }, + { + "epoch": 0.9973045822102425, + "grad_norm": 0.5445045232772827, + "learning_rate": 0.0005407576902320561, + "loss": 3.8485, + "step": 9250 + }, + { + "epoch": 1.0026954177897573, + "grad_norm": 0.5650607347488403, + "learning_rate": 0.0005404338909875876, + "loss": 3.7963, + "step": 9300 + }, + { + "epoch": 1.0080862533692723, + "grad_norm": 0.5752312541007996, + "learning_rate": 0.0005401100917431192, + "loss": 3.7605, + "step": 9350 + }, + { + "epoch": 1.013477088948787, + "grad_norm": 0.5649228096008301, + "learning_rate": 0.0005397862924986508, + "loss": 3.7564, + "step": 9400 + }, + { + "epoch": 1.0188679245283019, + "grad_norm": 0.5715786218643188, + "learning_rate": 0.0005394624932541824, + "loss": 3.7554, + "step": 9450 + }, + { + "epoch": 1.0242587601078168, + "grad_norm": 0.5207133889198303, + "learning_rate": 0.0005391386940097139, + "loss": 3.7592, + "step": 9500 + }, + { + "epoch": 1.0296495956873315, + "grad_norm": 0.5603737235069275, + "learning_rate": 0.0005388148947652455, + "loss": 3.7704, + "step": 9550 + }, + { + "epoch": 1.0350404312668464, + "grad_norm": 0.6097581386566162, + "learning_rate": 0.000538491095520777, + "loss": 3.7686, + "step": 9600 + }, + { + "epoch": 1.0404312668463611, + "grad_norm": 0.6030071377754211, + "learning_rate": 0.0005381672962763086, + "loss": 3.7726, + "step": 9650 + }, + { + "epoch": 1.045822102425876, + "grad_norm": 0.546688437461853, + "learning_rate": 0.0005378434970318403, + "loss": 3.7567, + "step": 9700 + }, + { + "epoch": 1.0512129380053907, + "grad_norm": 0.5595793724060059, + "learning_rate": 0.0005375196977873718, + "loss": 3.755, + "step": 9750 + }, + { + "epoch": 1.0566037735849056, + "grad_norm": 0.5992977619171143, + "learning_rate": 0.0005371958985429034, + "loss": 3.7821, + "step": 9800 + }, + { + "epoch": 1.0619946091644206, + "grad_norm": 0.5221476554870605, + "learning_rate": 0.0005368720992984349, + "loss": 3.7845, + "step": 9850 + }, + { + "epoch": 1.0673854447439353, + "grad_norm": 0.5831781029701233, + "learning_rate": 0.0005365483000539665, + "loss": 3.772, + "step": 9900 + }, + { + "epoch": 1.0727762803234502, + "grad_norm": 0.5980172753334045, + "learning_rate": 0.0005362245008094981, + "loss": 3.7854, + "step": 9950 + }, + { + "epoch": 1.0781671159029649, + "grad_norm": 0.5590375661849976, + "learning_rate": 0.0005359007015650297, + "loss": 3.7649, + "step": 10000 + }, + { + "epoch": 1.0781671159029649, + "eval_accuracy": 0.34435424926776104, + "eval_loss": 3.7518138885498047, + "eval_runtime": 186.0381, + "eval_samples_per_second": 96.814, + "eval_steps_per_second": 6.053, + "step": 10000 + }, + { + "epoch": 1.0835579514824798, + "grad_norm": 0.5709946751594543, + "learning_rate": 0.0005355769023205612, + "loss": 3.7581, + "step": 10050 + }, + { + "epoch": 1.0889487870619945, + "grad_norm": 0.618277370929718, + "learning_rate": 0.0005352531030760928, + "loss": 3.7592, + "step": 10100 + }, + { + "epoch": 1.0943396226415094, + "grad_norm": 0.6374980211257935, + "learning_rate": 0.0005349293038316244, + "loss": 3.7555, + "step": 10150 + }, + { + "epoch": 1.0997304582210243, + "grad_norm": 0.5958013534545898, + "learning_rate": 0.0005346055045871559, + "loss": 3.7666, + "step": 10200 + }, + { + "epoch": 1.105121293800539, + "grad_norm": 0.6115665435791016, + "learning_rate": 0.0005342817053426874, + "loss": 3.7524, + "step": 10250 + }, + { + "epoch": 1.110512129380054, + "grad_norm": 0.585763692855835, + "learning_rate": 0.0005339579060982191, + "loss": 3.764, + "step": 10300 + }, + { + "epoch": 1.1159029649595686, + "grad_norm": 0.653211772441864, + "learning_rate": 0.0005336341068537506, + "loss": 3.7568, + "step": 10350 + }, + { + "epoch": 1.1212938005390836, + "grad_norm": 0.5737949013710022, + "learning_rate": 0.0005333103076092822, + "loss": 3.7638, + "step": 10400 + }, + { + "epoch": 1.1266846361185983, + "grad_norm": 0.5587084293365479, + "learning_rate": 0.0005329865083648137, + "loss": 3.7609, + "step": 10450 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 0.6286759972572327, + "learning_rate": 0.0005326627091203454, + "loss": 3.7635, + "step": 10500 + }, + { + "epoch": 1.137466307277628, + "grad_norm": 0.5914649367332458, + "learning_rate": 0.0005323453858607662, + "loss": 3.7659, + "step": 10550 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.574865460395813, + "learning_rate": 0.0005320215866162979, + "loss": 3.7552, + "step": 10600 + }, + { + "epoch": 1.1482479784366577, + "grad_norm": 0.570719838142395, + "learning_rate": 0.0005316977873718294, + "loss": 3.7441, + "step": 10650 + }, + { + "epoch": 1.1536388140161726, + "grad_norm": 0.5437813997268677, + "learning_rate": 0.000531373988127361, + "loss": 3.7548, + "step": 10700 + }, + { + "epoch": 1.1590296495956873, + "grad_norm": 0.575222909450531, + "learning_rate": 0.0005310501888828925, + "loss": 3.7359, + "step": 10750 + }, + { + "epoch": 1.1644204851752022, + "grad_norm": 0.5911815762519836, + "learning_rate": 0.0005307263896384242, + "loss": 3.7398, + "step": 10800 + }, + { + "epoch": 1.169811320754717, + "grad_norm": 0.5724892020225525, + "learning_rate": 0.0005304025903939556, + "loss": 3.7319, + "step": 10850 + }, + { + "epoch": 1.1752021563342319, + "grad_norm": 0.6011896729469299, + "learning_rate": 0.0005300787911494873, + "loss": 3.7465, + "step": 10900 + }, + { + "epoch": 1.1805929919137466, + "grad_norm": 0.5386224985122681, + "learning_rate": 0.0005297549919050189, + "loss": 3.7593, + "step": 10950 + }, + { + "epoch": 1.1859838274932615, + "grad_norm": 0.5424002408981323, + "learning_rate": 0.0005294311926605504, + "loss": 3.769, + "step": 11000 + }, + { + "epoch": 1.1859838274932615, + "eval_accuracy": 0.3473514391781235, + "eval_loss": 3.723193883895874, + "eval_runtime": 186.4484, + "eval_samples_per_second": 96.6, + "eval_steps_per_second": 6.039, + "step": 11000 + }, + { + "epoch": 1.1913746630727764, + "grad_norm": 0.5702701210975647, + "learning_rate": 0.000529107393416082, + "loss": 3.7567, + "step": 11050 + }, + { + "epoch": 1.196765498652291, + "grad_norm": 0.5534523129463196, + "learning_rate": 0.0005287835941716135, + "loss": 3.7412, + "step": 11100 + }, + { + "epoch": 1.202156334231806, + "grad_norm": 0.5500940084457397, + "learning_rate": 0.0005284597949271452, + "loss": 3.7407, + "step": 11150 + }, + { + "epoch": 1.2075471698113207, + "grad_norm": 0.5379878878593445, + "learning_rate": 0.0005281359956826767, + "loss": 3.7631, + "step": 11200 + }, + { + "epoch": 1.2129380053908356, + "grad_norm": 0.5732437372207642, + "learning_rate": 0.0005278121964382083, + "loss": 3.7498, + "step": 11250 + }, + { + "epoch": 1.2183288409703503, + "grad_norm": 0.5423194766044617, + "learning_rate": 0.0005274883971937398, + "loss": 3.7371, + "step": 11300 + }, + { + "epoch": 1.2237196765498652, + "grad_norm": 0.542545735836029, + "learning_rate": 0.0005271645979492714, + "loss": 3.7301, + "step": 11350 + }, + { + "epoch": 1.2291105121293802, + "grad_norm": 0.8030261397361755, + "learning_rate": 0.0005268472746896923, + "loss": 3.7452, + "step": 11400 + }, + { + "epoch": 1.2345013477088949, + "grad_norm": 0.5232232809066772, + "learning_rate": 0.000526523475445224, + "loss": 3.755, + "step": 11450 + }, + { + "epoch": 1.2398921832884098, + "grad_norm": 0.5808193683624268, + "learning_rate": 0.0005261996762007554, + "loss": 3.7452, + "step": 11500 + }, + { + "epoch": 1.2452830188679245, + "grad_norm": 0.5781920552253723, + "learning_rate": 0.0005258758769562871, + "loss": 3.7392, + "step": 11550 + }, + { + "epoch": 1.2506738544474394, + "grad_norm": 0.5637895464897156, + "learning_rate": 0.0005255520777118186, + "loss": 3.7512, + "step": 11600 + }, + { + "epoch": 1.256064690026954, + "grad_norm": 0.5855022668838501, + "learning_rate": 0.0005252282784673502, + "loss": 3.7318, + "step": 11650 + }, + { + "epoch": 1.261455525606469, + "grad_norm": 0.5060122013092041, + "learning_rate": 0.0005249044792228817, + "loss": 3.7424, + "step": 11700 + }, + { + "epoch": 1.266846361185984, + "grad_norm": 0.5578184723854065, + "learning_rate": 0.0005245806799784133, + "loss": 3.7421, + "step": 11750 + }, + { + "epoch": 1.2722371967654986, + "grad_norm": 0.5692754983901978, + "learning_rate": 0.0005242568807339449, + "loss": 3.7541, + "step": 11800 + }, + { + "epoch": 1.2776280323450135, + "grad_norm": 0.5293059349060059, + "learning_rate": 0.0005239330814894765, + "loss": 3.7389, + "step": 11850 + }, + { + "epoch": 1.2830188679245282, + "grad_norm": 0.5823394060134888, + "learning_rate": 0.000523609282245008, + "loss": 3.7336, + "step": 11900 + }, + { + "epoch": 1.2884097035040432, + "grad_norm": 0.5567950010299683, + "learning_rate": 0.0005232854830005396, + "loss": 3.7176, + "step": 11950 + }, + { + "epoch": 1.2938005390835579, + "grad_norm": 0.5795886516571045, + "learning_rate": 0.0005229616837560712, + "loss": 3.7288, + "step": 12000 + }, + { + "epoch": 1.2938005390835579, + "eval_accuracy": 0.34931164592718456, + "eval_loss": 3.697058916091919, + "eval_runtime": 186.8168, + "eval_samples_per_second": 96.41, + "eval_steps_per_second": 6.027, + "step": 12000 + }, + { + "epoch": 1.2991913746630728, + "grad_norm": 0.6153177618980408, + "learning_rate": 0.0005226378845116028, + "loss": 3.7453, + "step": 12050 + }, + { + "epoch": 1.3045822102425877, + "grad_norm": 0.6005005836486816, + "learning_rate": 0.0005223140852671344, + "loss": 3.7398, + "step": 12100 + }, + { + "epoch": 1.3099730458221024, + "grad_norm": 0.5839811563491821, + "learning_rate": 0.0005219902860226659, + "loss": 3.7295, + "step": 12150 + }, + { + "epoch": 1.3153638814016173, + "grad_norm": 0.54124516248703, + "learning_rate": 0.0005216664867781975, + "loss": 3.7451, + "step": 12200 + }, + { + "epoch": 1.320754716981132, + "grad_norm": 0.5872597098350525, + "learning_rate": 0.000521342687533729, + "loss": 3.7195, + "step": 12250 + }, + { + "epoch": 1.326145552560647, + "grad_norm": 0.6764320731163025, + "learning_rate": 0.0005210188882892606, + "loss": 3.7273, + "step": 12300 + }, + { + "epoch": 1.3315363881401616, + "grad_norm": 0.5127867460250854, + "learning_rate": 0.0005206950890447922, + "loss": 3.7354, + "step": 12350 + }, + { + "epoch": 1.3369272237196765, + "grad_norm": 0.5766441822052002, + "learning_rate": 0.0005203712898003238, + "loss": 3.7329, + "step": 12400 + }, + { + "epoch": 1.3423180592991915, + "grad_norm": 0.5768564343452454, + "learning_rate": 0.0005200474905558553, + "loss": 3.7196, + "step": 12450 + }, + { + "epoch": 1.3477088948787062, + "grad_norm": 0.5573644638061523, + "learning_rate": 0.0005197236913113869, + "loss": 3.7398, + "step": 12500 + }, + { + "epoch": 1.353099730458221, + "grad_norm": 0.519527018070221, + "learning_rate": 0.0005193998920669184, + "loss": 3.7141, + "step": 12550 + }, + { + "epoch": 1.3584905660377358, + "grad_norm": 0.6151216626167297, + "learning_rate": 0.0005190760928224501, + "loss": 3.7121, + "step": 12600 + }, + { + "epoch": 1.3638814016172507, + "grad_norm": 0.6031010746955872, + "learning_rate": 0.0005187522935779816, + "loss": 3.7035, + "step": 12650 + }, + { + "epoch": 1.3692722371967654, + "grad_norm": 0.534813642501831, + "learning_rate": 0.0005184284943335132, + "loss": 3.7193, + "step": 12700 + }, + { + "epoch": 1.3746630727762803, + "grad_norm": 0.5405371785163879, + "learning_rate": 0.0005181046950890447, + "loss": 3.7074, + "step": 12750 + }, + { + "epoch": 1.3800539083557952, + "grad_norm": 0.537695050239563, + "learning_rate": 0.0005177808958445764, + "loss": 3.7067, + "step": 12800 + }, + { + "epoch": 1.38544474393531, + "grad_norm": 0.6358603239059448, + "learning_rate": 0.0005174570966001078, + "loss": 3.7174, + "step": 12850 + }, + { + "epoch": 1.3908355795148248, + "grad_norm": 0.6034652590751648, + "learning_rate": 0.0005171332973556395, + "loss": 3.7168, + "step": 12900 + }, + { + "epoch": 1.3962264150943398, + "grad_norm": 0.553503692150116, + "learning_rate": 0.000516809498111171, + "loss": 3.7269, + "step": 12950 + }, + { + "epoch": 1.4016172506738545, + "grad_norm": 0.5765125155448914, + "learning_rate": 0.0005164856988667026, + "loss": 3.7086, + "step": 13000 + }, + { + "epoch": 1.4016172506738545, + "eval_accuracy": 0.35153598784652257, + "eval_loss": 3.674765110015869, + "eval_runtime": 186.3343, + "eval_samples_per_second": 96.66, + "eval_steps_per_second": 6.043, + "step": 13000 + }, + { + "epoch": 1.4070080862533692, + "grad_norm": 0.6072783470153809, + "learning_rate": 0.0005161618996222341, + "loss": 3.7063, + "step": 13050 + }, + { + "epoch": 1.412398921832884, + "grad_norm": 0.5731549859046936, + "learning_rate": 0.0005158381003777657, + "loss": 3.6997, + "step": 13100 + }, + { + "epoch": 1.417789757412399, + "grad_norm": 0.5856156945228577, + "learning_rate": 0.0005155143011332973, + "loss": 3.7057, + "step": 13150 + }, + { + "epoch": 1.4231805929919137, + "grad_norm": 0.566673755645752, + "learning_rate": 0.0005151905018888289, + "loss": 3.7061, + "step": 13200 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.527152419090271, + "learning_rate": 0.0005148667026443604, + "loss": 3.7152, + "step": 13250 + }, + { + "epoch": 1.4339622641509435, + "grad_norm": 0.5248644948005676, + "learning_rate": 0.000514542903399892, + "loss": 3.6882, + "step": 13300 + }, + { + "epoch": 1.4393530997304582, + "grad_norm": 0.5552476048469543, + "learning_rate": 0.0005142191041554237, + "loss": 3.708, + "step": 13350 + }, + { + "epoch": 1.444743935309973, + "grad_norm": 0.5420221090316772, + "learning_rate": 0.0005138953049109552, + "loss": 3.723, + "step": 13400 + }, + { + "epoch": 1.4501347708894878, + "grad_norm": 0.546251654624939, + "learning_rate": 0.0005135715056664868, + "loss": 3.7274, + "step": 13450 + }, + { + "epoch": 1.4555256064690028, + "grad_norm": 0.656278669834137, + "learning_rate": 0.0005132477064220183, + "loss": 3.719, + "step": 13500 + }, + { + "epoch": 1.4609164420485174, + "grad_norm": 0.5882102847099304, + "learning_rate": 0.0005129239071775499, + "loss": 3.7077, + "step": 13550 + }, + { + "epoch": 1.4663072776280324, + "grad_norm": 0.5904348492622375, + "learning_rate": 0.0005126001079330814, + "loss": 3.7233, + "step": 13600 + }, + { + "epoch": 1.4716981132075473, + "grad_norm": 0.5616297721862793, + "learning_rate": 0.000512276308688613, + "loss": 3.7126, + "step": 13650 + }, + { + "epoch": 1.477088948787062, + "grad_norm": 0.588398277759552, + "learning_rate": 0.0005119525094441446, + "loss": 3.6976, + "step": 13700 + }, + { + "epoch": 1.482479784366577, + "grad_norm": 0.6392883062362671, + "learning_rate": 0.0005116287101996762, + "loss": 3.701, + "step": 13750 + }, + { + "epoch": 1.4878706199460916, + "grad_norm": 0.5546556711196899, + "learning_rate": 0.0005113049109552077, + "loss": 3.7046, + "step": 13800 + }, + { + "epoch": 1.4932614555256065, + "grad_norm": 0.5846448540687561, + "learning_rate": 0.0005109811117107393, + "loss": 3.7087, + "step": 13850 + }, + { + "epoch": 1.4986522911051212, + "grad_norm": 0.6359493732452393, + "learning_rate": 0.0005106573124662708, + "loss": 3.6907, + "step": 13900 + }, + { + "epoch": 1.5040431266846361, + "grad_norm": 0.5580483078956604, + "learning_rate": 0.0005103335132218025, + "loss": 3.688, + "step": 13950 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 0.5794188976287842, + "learning_rate": 0.000510009713977334, + "loss": 3.703, + "step": 14000 + }, + { + "epoch": 1.509433962264151, + "eval_accuracy": 0.35349782438890087, + "eval_loss": 3.6583781242370605, + "eval_runtime": 185.6312, + "eval_samples_per_second": 97.026, + "eval_steps_per_second": 6.066, + "step": 14000 + }, + { + "epoch": 1.5148247978436657, + "grad_norm": 0.5856167078018188, + "learning_rate": 0.0005096859147328656, + "loss": 3.6999, + "step": 14050 + }, + { + "epoch": 1.5202156334231804, + "grad_norm": 0.6054800152778625, + "learning_rate": 0.0005093621154883971, + "loss": 3.6984, + "step": 14100 + }, + { + "epoch": 1.5256064690026954, + "grad_norm": 0.5391426086425781, + "learning_rate": 0.0005090383162439288, + "loss": 3.695, + "step": 14150 + }, + { + "epoch": 1.5309973045822103, + "grad_norm": 0.5351890921592712, + "learning_rate": 0.0005087145169994602, + "loss": 3.701, + "step": 14200 + }, + { + "epoch": 1.536388140161725, + "grad_norm": 0.5853469371795654, + "learning_rate": 0.0005083907177549918, + "loss": 3.7001, + "step": 14250 + }, + { + "epoch": 1.54177897574124, + "grad_norm": 0.5472508668899536, + "learning_rate": 0.0005080669185105234, + "loss": 3.697, + "step": 14300 + }, + { + "epoch": 1.5471698113207548, + "grad_norm": 0.5463733673095703, + "learning_rate": 0.000507743119266055, + "loss": 3.6871, + "step": 14350 + }, + { + "epoch": 1.5525606469002695, + "grad_norm": 0.549136757850647, + "learning_rate": 0.0005074193200215865, + "loss": 3.6895, + "step": 14400 + }, + { + "epoch": 1.5579514824797842, + "grad_norm": 0.5161502957344055, + "learning_rate": 0.0005070955207771181, + "loss": 3.693, + "step": 14450 + }, + { + "epoch": 1.5633423180592994, + "grad_norm": 0.5291047096252441, + "learning_rate": 0.0005067717215326498, + "loss": 3.6909, + "step": 14500 + }, + { + "epoch": 1.568733153638814, + "grad_norm": 0.5388016700744629, + "learning_rate": 0.0005064479222881813, + "loss": 3.6906, + "step": 14550 + }, + { + "epoch": 1.5741239892183287, + "grad_norm": 0.5703091621398926, + "learning_rate": 0.0005061241230437129, + "loss": 3.6751, + "step": 14600 + }, + { + "epoch": 1.5795148247978437, + "grad_norm": 0.5753449201583862, + "learning_rate": 0.0005058003237992444, + "loss": 3.6803, + "step": 14650 + }, + { + "epoch": 1.5849056603773586, + "grad_norm": 0.5543888807296753, + "learning_rate": 0.000505476524554776, + "loss": 3.6864, + "step": 14700 + }, + { + "epoch": 1.5902964959568733, + "grad_norm": 0.5601723790168762, + "learning_rate": 0.0005051527253103076, + "loss": 3.6654, + "step": 14750 + }, + { + "epoch": 1.595687331536388, + "grad_norm": 0.5522553324699402, + "learning_rate": 0.0005048289260658392, + "loss": 3.6576, + "step": 14800 + }, + { + "epoch": 1.6010781671159031, + "grad_norm": 0.5866943001747131, + "learning_rate": 0.0005045051268213707, + "loss": 3.6834, + "step": 14850 + }, + { + "epoch": 1.6064690026954178, + "grad_norm": 0.5801655650138855, + "learning_rate": 0.0005041813275769023, + "loss": 3.6865, + "step": 14900 + }, + { + "epoch": 1.6118598382749325, + "grad_norm": 0.5756445527076721, + "learning_rate": 0.0005038575283324338, + "loss": 3.6724, + "step": 14950 + }, + { + "epoch": 1.6172506738544474, + "grad_norm": 0.4960193932056427, + "learning_rate": 0.0005035337290879654, + "loss": 3.664, + "step": 15000 + }, + { + "epoch": 1.6172506738544474, + "eval_accuracy": 0.35552974204391946, + "eval_loss": 3.637425184249878, + "eval_runtime": 185.9925, + "eval_samples_per_second": 96.837, + "eval_steps_per_second": 6.054, + "step": 15000 + }, + { + "epoch": 1.6226415094339623, + "grad_norm": 0.5649005770683289, + "learning_rate": 0.000503209929843497, + "loss": 3.6632, + "step": 15050 + }, + { + "epoch": 1.628032345013477, + "grad_norm": 0.5567517876625061, + "learning_rate": 0.0005028861305990286, + "loss": 3.6753, + "step": 15100 + }, + { + "epoch": 1.633423180592992, + "grad_norm": 0.5907676219940186, + "learning_rate": 0.0005025623313545601, + "loss": 3.6967, + "step": 15150 + }, + { + "epoch": 1.6388140161725069, + "grad_norm": 0.5697629451751709, + "learning_rate": 0.0005022385321100917, + "loss": 3.6818, + "step": 15200 + }, + { + "epoch": 1.6442048517520216, + "grad_norm": 0.5234145522117615, + "learning_rate": 0.0005019147328656232, + "loss": 3.6737, + "step": 15250 + }, + { + "epoch": 1.6495956873315363, + "grad_norm": 0.5655122995376587, + "learning_rate": 0.0005015909336211549, + "loss": 3.6731, + "step": 15300 + }, + { + "epoch": 1.6549865229110512, + "grad_norm": 0.5556879639625549, + "learning_rate": 0.0005012671343766864, + "loss": 3.6583, + "step": 15350 + }, + { + "epoch": 1.6603773584905661, + "grad_norm": 0.5384864211082458, + "learning_rate": 0.000500943335132218, + "loss": 3.6766, + "step": 15400 + }, + { + "epoch": 1.6657681940700808, + "grad_norm": 0.6065835952758789, + "learning_rate": 0.0005006195358877495, + "loss": 3.6646, + "step": 15450 + }, + { + "epoch": 1.6711590296495957, + "grad_norm": 0.5641660690307617, + "learning_rate": 0.0005002957366432812, + "loss": 3.666, + "step": 15500 + }, + { + "epoch": 1.6765498652291106, + "grad_norm": 0.5256580710411072, + "learning_rate": 0.0004999719373988127, + "loss": 3.6515, + "step": 15550 + }, + { + "epoch": 1.6819407008086253, + "grad_norm": 0.5791158676147461, + "learning_rate": 0.0004996481381543442, + "loss": 3.6708, + "step": 15600 + }, + { + "epoch": 1.68733153638814, + "grad_norm": 0.5627703666687012, + "learning_rate": 0.0004993308148947651, + "loss": 3.6779, + "step": 15650 + }, + { + "epoch": 1.692722371967655, + "grad_norm": 0.6009765863418579, + "learning_rate": 0.0004990070156502968, + "loss": 3.6717, + "step": 15700 + }, + { + "epoch": 1.6981132075471699, + "grad_norm": 0.6165124177932739, + "learning_rate": 0.0004986832164058284, + "loss": 3.6558, + "step": 15750 + }, + { + "epoch": 1.7035040431266846, + "grad_norm": 0.573556661605835, + "learning_rate": 0.0004983594171613599, + "loss": 3.6659, + "step": 15800 + }, + { + "epoch": 1.7088948787061995, + "grad_norm": 0.5521446466445923, + "learning_rate": 0.0004980356179168915, + "loss": 3.6825, + "step": 15850 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.5886697769165039, + "learning_rate": 0.000497711818672423, + "loss": 3.6819, + "step": 15900 + }, + { + "epoch": 1.719676549865229, + "grad_norm": 0.6262629628181458, + "learning_rate": 0.0004973880194279547, + "loss": 3.6689, + "step": 15950 + }, + { + "epoch": 1.7250673854447438, + "grad_norm": 0.6005555391311646, + "learning_rate": 0.0004970642201834862, + "loss": 3.6639, + "step": 16000 + }, + { + "epoch": 1.7250673854447438, + "eval_accuracy": 0.3572108195241851, + "eval_loss": 3.6206066608428955, + "eval_runtime": 185.4999, + "eval_samples_per_second": 97.094, + "eval_steps_per_second": 6.07, + "step": 16000 + }, + { + "epoch": 1.7304582210242587, + "grad_norm": 0.5764271020889282, + "learning_rate": 0.0004967404209390178, + "loss": 3.6523, + "step": 16050 + }, + { + "epoch": 1.7358490566037736, + "grad_norm": 0.5741286277770996, + "learning_rate": 0.0004964166216945493, + "loss": 3.653, + "step": 16100 + }, + { + "epoch": 1.7412398921832883, + "grad_norm": 0.6296107172966003, + "learning_rate": 0.000496092822450081, + "loss": 3.6606, + "step": 16150 + }, + { + "epoch": 1.7466307277628033, + "grad_norm": 0.6621016263961792, + "learning_rate": 0.0004957690232056125, + "loss": 3.667, + "step": 16200 + }, + { + "epoch": 1.7520215633423182, + "grad_norm": 0.6163436770439148, + "learning_rate": 0.0004954452239611441, + "loss": 3.6514, + "step": 16250 + }, + { + "epoch": 1.7574123989218329, + "grad_norm": 0.5336316823959351, + "learning_rate": 0.0004951214247166756, + "loss": 3.645, + "step": 16300 + }, + { + "epoch": 1.7628032345013476, + "grad_norm": 0.5538309812545776, + "learning_rate": 0.0004947976254722072, + "loss": 3.6681, + "step": 16350 + }, + { + "epoch": 1.7681940700808625, + "grad_norm": 0.5325199365615845, + "learning_rate": 0.0004944738262277387, + "loss": 3.6466, + "step": 16400 + }, + { + "epoch": 1.7735849056603774, + "grad_norm": 0.576173722743988, + "learning_rate": 0.0004941500269832703, + "loss": 3.6569, + "step": 16450 + }, + { + "epoch": 1.778975741239892, + "grad_norm": 0.5102316737174988, + "learning_rate": 0.0004938262277388019, + "loss": 3.6425, + "step": 16500 + }, + { + "epoch": 1.784366576819407, + "grad_norm": 0.5195789933204651, + "learning_rate": 0.0004935024284943335, + "loss": 3.6499, + "step": 16550 + }, + { + "epoch": 1.789757412398922, + "grad_norm": 0.5175392031669617, + "learning_rate": 0.000493178629249865, + "loss": 3.6622, + "step": 16600 + }, + { + "epoch": 1.7951482479784366, + "grad_norm": 0.5237419605255127, + "learning_rate": 0.0004928548300053966, + "loss": 3.6531, + "step": 16650 + }, + { + "epoch": 1.8005390835579513, + "grad_norm": 0.6199319958686829, + "learning_rate": 0.0004925310307609282, + "loss": 3.66, + "step": 16700 + }, + { + "epoch": 1.8059299191374663, + "grad_norm": 0.5926080346107483, + "learning_rate": 0.0004922072315164598, + "loss": 3.659, + "step": 16750 + }, + { + "epoch": 1.8113207547169812, + "grad_norm": 0.5379118919372559, + "learning_rate": 0.0004918834322719913, + "loss": 3.6662, + "step": 16800 + }, + { + "epoch": 1.8167115902964959, + "grad_norm": 0.6137577295303345, + "learning_rate": 0.0004915596330275229, + "loss": 3.6746, + "step": 16850 + }, + { + "epoch": 1.8221024258760108, + "grad_norm": 0.5653969645500183, + "learning_rate": 0.0004912358337830544, + "loss": 3.6474, + "step": 16900 + }, + { + "epoch": 1.8274932614555257, + "grad_norm": 0.56174635887146, + "learning_rate": 0.000490912034538586, + "loss": 3.6592, + "step": 16950 + }, + { + "epoch": 1.8328840970350404, + "grad_norm": 0.5398900508880615, + "learning_rate": 0.0004905882352941175, + "loss": 3.6383, + "step": 17000 + }, + { + "epoch": 1.8328840970350404, + "eval_accuracy": 0.3587294609371681, + "eval_loss": 3.6051440238952637, + "eval_runtime": 185.6742, + "eval_samples_per_second": 97.003, + "eval_steps_per_second": 6.064, + "step": 17000 + }, + { + "epoch": 1.838274932614555, + "grad_norm": 0.5739971399307251, + "learning_rate": 0.0004902644360496492, + "loss": 3.6431, + "step": 17050 + }, + { + "epoch": 1.8436657681940702, + "grad_norm": 0.5605867505073547, + "learning_rate": 0.0004899406368051808, + "loss": 3.6362, + "step": 17100 + }, + { + "epoch": 1.849056603773585, + "grad_norm": 0.5735852718353271, + "learning_rate": 0.0004896168375607123, + "loss": 3.6416, + "step": 17150 + }, + { + "epoch": 1.8544474393530996, + "grad_norm": 0.5647099614143372, + "learning_rate": 0.0004892930383162439, + "loss": 3.6467, + "step": 17200 + }, + { + "epoch": 1.8598382749326146, + "grad_norm": 0.5226011276245117, + "learning_rate": 0.0004889692390717754, + "loss": 3.6534, + "step": 17250 + }, + { + "epoch": 1.8652291105121295, + "grad_norm": 0.5790987610816956, + "learning_rate": 0.0004886454398273071, + "loss": 3.6564, + "step": 17300 + }, + { + "epoch": 1.8706199460916442, + "grad_norm": 0.5586245059967041, + "learning_rate": 0.0004883216405828386, + "loss": 3.6568, + "step": 17350 + }, + { + "epoch": 1.8760107816711589, + "grad_norm": 0.5110759139060974, + "learning_rate": 0.00048799784133837017, + "loss": 3.6508, + "step": 17400 + }, + { + "epoch": 1.881401617250674, + "grad_norm": 0.5973682999610901, + "learning_rate": 0.0004876740420939017, + "loss": 3.6405, + "step": 17450 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 0.5328883528709412, + "learning_rate": 0.0004873502428494333, + "loss": 3.6423, + "step": 17500 + }, + { + "epoch": 1.8921832884097034, + "grad_norm": 0.6016663312911987, + "learning_rate": 0.0004870264436049649, + "loss": 3.6395, + "step": 17550 + }, + { + "epoch": 1.8975741239892183, + "grad_norm": 0.5924084186553955, + "learning_rate": 0.00048670264436049643, + "loss": 3.6529, + "step": 17600 + }, + { + "epoch": 1.9029649595687332, + "grad_norm": 0.5941994190216064, + "learning_rate": 0.00048637884511602803, + "loss": 3.6441, + "step": 17650 + }, + { + "epoch": 1.908355795148248, + "grad_norm": 0.5356640219688416, + "learning_rate": 0.0004860550458715596, + "loss": 3.638, + "step": 17700 + }, + { + "epoch": 1.9137466307277629, + "grad_norm": 0.6006718277931213, + "learning_rate": 0.0004857312466270912, + "loss": 3.6598, + "step": 17750 + }, + { + "epoch": 1.9191374663072778, + "grad_norm": 0.5845404863357544, + "learning_rate": 0.00048540744738262274, + "loss": 3.6449, + "step": 17800 + }, + { + "epoch": 1.9245283018867925, + "grad_norm": 0.5526080131530762, + "learning_rate": 0.00048508364813815434, + "loss": 3.6354, + "step": 17850 + }, + { + "epoch": 1.9299191374663072, + "grad_norm": 0.5421946048736572, + "learning_rate": 0.0004847663248785753, + "loss": 3.6465, + "step": 17900 + }, + { + "epoch": 1.935309973045822, + "grad_norm": 0.5944849848747253, + "learning_rate": 0.0004844425256341068, + "loss": 3.6269, + "step": 17950 + }, + { + "epoch": 1.940700808625337, + "grad_norm": 0.5479409694671631, + "learning_rate": 0.00048411872638963834, + "loss": 3.6321, + "step": 18000 + }, + { + "epoch": 1.940700808625337, + "eval_accuracy": 0.35996886442846787, + "eval_loss": 3.5911636352539062, + "eval_runtime": 185.6549, + "eval_samples_per_second": 97.013, + "eval_steps_per_second": 6.065, + "step": 18000 + }, + { + "epoch": 1.9460916442048517, + "grad_norm": 0.5909312963485718, + "learning_rate": 0.00048379492714516995, + "loss": 3.6343, + "step": 18050 + }, + { + "epoch": 1.9514824797843666, + "grad_norm": 0.5868159532546997, + "learning_rate": 0.0004834711279007015, + "loss": 3.6358, + "step": 18100 + }, + { + "epoch": 1.9568733153638815, + "grad_norm": 0.5208358764648438, + "learning_rate": 0.0004831473286562331, + "loss": 3.6319, + "step": 18150 + }, + { + "epoch": 1.9622641509433962, + "grad_norm": 0.5705773234367371, + "learning_rate": 0.00048282352941176465, + "loss": 3.6214, + "step": 18200 + }, + { + "epoch": 1.967654986522911, + "grad_norm": 0.5833699703216553, + "learning_rate": 0.00048249973016729626, + "loss": 3.6428, + "step": 18250 + }, + { + "epoch": 1.9730458221024259, + "grad_norm": 0.594480574131012, + "learning_rate": 0.0004821759309228278, + "loss": 3.6519, + "step": 18300 + }, + { + "epoch": 1.9784366576819408, + "grad_norm": 0.5162302851676941, + "learning_rate": 0.00048185213167835936, + "loss": 3.6368, + "step": 18350 + }, + { + "epoch": 1.9838274932614555, + "grad_norm": 0.6090728640556335, + "learning_rate": 0.00048152833243389096, + "loss": 3.6373, + "step": 18400 + }, + { + "epoch": 1.9892183288409704, + "grad_norm": 0.5224543809890747, + "learning_rate": 0.0004812045331894225, + "loss": 3.6452, + "step": 18450 + }, + { + "epoch": 1.9946091644204853, + "grad_norm": 0.6286558508872986, + "learning_rate": 0.0004808807339449541, + "loss": 3.6383, + "step": 18500 + }, + { + "epoch": 2.0, + "grad_norm": 1.1893088817596436, + "learning_rate": 0.0004805569347004856, + "loss": 3.6417, + "step": 18550 + }, + { + "epoch": 2.0053908355795147, + "grad_norm": 0.5484069585800171, + "learning_rate": 0.0004802331354560173, + "loss": 3.5707, + "step": 18600 + }, + { + "epoch": 2.01078167115903, + "grad_norm": 0.5843707323074341, + "learning_rate": 0.00047990933621154877, + "loss": 3.5399, + "step": 18650 + }, + { + "epoch": 2.0161725067385445, + "grad_norm": 0.5230047106742859, + "learning_rate": 0.0004795855369670804, + "loss": 3.5512, + "step": 18700 + }, + { + "epoch": 2.0215633423180592, + "grad_norm": 0.5233069658279419, + "learning_rate": 0.0004792617377226119, + "loss": 3.5319, + "step": 18750 + }, + { + "epoch": 2.026954177897574, + "grad_norm": 0.5597031712532043, + "learning_rate": 0.0004789379384781435, + "loss": 3.5518, + "step": 18800 + }, + { + "epoch": 2.032345013477089, + "grad_norm": 0.5443568825721741, + "learning_rate": 0.0004786141392336751, + "loss": 3.5586, + "step": 18850 + }, + { + "epoch": 2.0377358490566038, + "grad_norm": 0.5472878217697144, + "learning_rate": 0.00047829033998920663, + "loss": 3.5469, + "step": 18900 + }, + { + "epoch": 2.0431266846361185, + "grad_norm": 0.5626387000083923, + "learning_rate": 0.00047796654074473824, + "loss": 3.5408, + "step": 18950 + }, + { + "epoch": 2.0485175202156336, + "grad_norm": 0.5888155698776245, + "learning_rate": 0.0004776427415002698, + "loss": 3.5672, + "step": 19000 + }, + { + "epoch": 2.0485175202156336, + "eval_accuracy": 0.361499131700447, + "eval_loss": 3.5821456909179688, + "eval_runtime": 185.3913, + "eval_samples_per_second": 97.151, + "eval_steps_per_second": 6.074, + "step": 19000 + }, + { + "epoch": 2.0539083557951483, + "grad_norm": 0.5674854516983032, + "learning_rate": 0.0004773189422558014, + "loss": 3.5436, + "step": 19050 + }, + { + "epoch": 2.059299191374663, + "grad_norm": 0.5584875345230103, + "learning_rate": 0.00047699514301133294, + "loss": 3.567, + "step": 19100 + }, + { + "epoch": 2.0646900269541777, + "grad_norm": 0.5911595821380615, + "learning_rate": 0.00047667134376686455, + "loss": 3.5623, + "step": 19150 + }, + { + "epoch": 2.070080862533693, + "grad_norm": 0.5857378840446472, + "learning_rate": 0.0004763475445223961, + "loss": 3.5702, + "step": 19200 + }, + { + "epoch": 2.0754716981132075, + "grad_norm": 0.5929583311080933, + "learning_rate": 0.0004760237452779276, + "loss": 3.5495, + "step": 19250 + }, + { + "epoch": 2.0808625336927222, + "grad_norm": 0.5647702813148499, + "learning_rate": 0.0004756999460334592, + "loss": 3.5433, + "step": 19300 + }, + { + "epoch": 2.0862533692722374, + "grad_norm": 0.5857425332069397, + "learning_rate": 0.00047537614678899075, + "loss": 3.547, + "step": 19350 + }, + { + "epoch": 2.091644204851752, + "grad_norm": 0.5703374743461609, + "learning_rate": 0.00047505234754452235, + "loss": 3.5461, + "step": 19400 + }, + { + "epoch": 2.0970350404312668, + "grad_norm": 0.8500975966453552, + "learning_rate": 0.0004747285483000539, + "loss": 3.5538, + "step": 19450 + }, + { + "epoch": 2.1024258760107815, + "grad_norm": 0.6051983833312988, + "learning_rate": 0.0004744047490555855, + "loss": 3.5454, + "step": 19500 + }, + { + "epoch": 2.1078167115902966, + "grad_norm": 0.7593019008636475, + "learning_rate": 0.00047408094981111706, + "loss": 3.5555, + "step": 19550 + }, + { + "epoch": 2.1132075471698113, + "grad_norm": 0.5925678014755249, + "learning_rate": 0.000473763626551538, + "loss": 3.5514, + "step": 19600 + }, + { + "epoch": 2.118598382749326, + "grad_norm": 0.5761629343032837, + "learning_rate": 0.00047343982730706956, + "loss": 3.5705, + "step": 19650 + }, + { + "epoch": 2.123989218328841, + "grad_norm": 0.5896238088607788, + "learning_rate": 0.00047311602806260117, + "loss": 3.5423, + "step": 19700 + }, + { + "epoch": 2.129380053908356, + "grad_norm": 0.5506173968315125, + "learning_rate": 0.0004727922288181327, + "loss": 3.5619, + "step": 19750 + }, + { + "epoch": 2.1347708894878705, + "grad_norm": 0.5747953653335571, + "learning_rate": 0.0004724684295736643, + "loss": 3.5549, + "step": 19800 + }, + { + "epoch": 2.1401617250673857, + "grad_norm": 0.6055583953857422, + "learning_rate": 0.0004721446303291959, + "loss": 3.5466, + "step": 19850 + }, + { + "epoch": 2.1455525606469004, + "grad_norm": 0.5854771733283997, + "learning_rate": 0.0004718208310847275, + "loss": 3.5586, + "step": 19900 + }, + { + "epoch": 2.150943396226415, + "grad_norm": 0.5524225234985352, + "learning_rate": 0.000471497031840259, + "loss": 3.5506, + "step": 19950 + }, + { + "epoch": 2.1563342318059298, + "grad_norm": 0.6449847221374512, + "learning_rate": 0.0004711732325957905, + "loss": 3.5754, + "step": 20000 + }, + { + "epoch": 2.1563342318059298, + "eval_accuracy": 0.3623671596211969, + "eval_loss": 3.573341131210327, + "eval_runtime": 185.7691, + "eval_samples_per_second": 96.954, + "eval_steps_per_second": 6.061, + "step": 20000 + }, + { + "epoch": 2.161725067385445, + "grad_norm": 0.5967618823051453, + "learning_rate": 0.00047084943335132213, + "loss": 3.5752, + "step": 20050 + }, + { + "epoch": 2.1671159029649596, + "grad_norm": 0.5871074795722961, + "learning_rate": 0.0004705256341068537, + "loss": 3.5762, + "step": 20100 + }, + { + "epoch": 2.1725067385444743, + "grad_norm": 0.5428000688552856, + "learning_rate": 0.0004702018348623853, + "loss": 3.5692, + "step": 20150 + }, + { + "epoch": 2.177897574123989, + "grad_norm": 0.606965959072113, + "learning_rate": 0.00046987803561791684, + "loss": 3.5672, + "step": 20200 + }, + { + "epoch": 2.183288409703504, + "grad_norm": 0.5462236404418945, + "learning_rate": 0.00046955423637344844, + "loss": 3.5669, + "step": 20250 + }, + { + "epoch": 2.188679245283019, + "grad_norm": 0.5761914253234863, + "learning_rate": 0.00046923043712898, + "loss": 3.5573, + "step": 20300 + }, + { + "epoch": 2.1940700808625335, + "grad_norm": 0.5329501628875732, + "learning_rate": 0.0004689066378845116, + "loss": 3.5705, + "step": 20350 + }, + { + "epoch": 2.1994609164420487, + "grad_norm": 0.5812281966209412, + "learning_rate": 0.00046858283864004315, + "loss": 3.5734, + "step": 20400 + }, + { + "epoch": 2.2048517520215634, + "grad_norm": 0.5665324926376343, + "learning_rate": 0.0004682590393955747, + "loss": 3.5702, + "step": 20450 + }, + { + "epoch": 2.210242587601078, + "grad_norm": 0.5955948233604431, + "learning_rate": 0.0004679352401511063, + "loss": 3.5426, + "step": 20500 + }, + { + "epoch": 2.215633423180593, + "grad_norm": 0.5881284475326538, + "learning_rate": 0.0004676114409066378, + "loss": 3.5686, + "step": 20550 + }, + { + "epoch": 2.221024258760108, + "grad_norm": 0.6288923025131226, + "learning_rate": 0.00046728764166216946, + "loss": 3.5468, + "step": 20600 + }, + { + "epoch": 2.2264150943396226, + "grad_norm": 0.5576397776603699, + "learning_rate": 0.00046696384241770095, + "loss": 3.5532, + "step": 20650 + }, + { + "epoch": 2.2318059299191373, + "grad_norm": 0.5199394226074219, + "learning_rate": 0.00046664004317323256, + "loss": 3.551, + "step": 20700 + }, + { + "epoch": 2.2371967654986524, + "grad_norm": 0.5937060713768005, + "learning_rate": 0.0004663162439287641, + "loss": 3.5577, + "step": 20750 + }, + { + "epoch": 2.242587601078167, + "grad_norm": 0.6562036871910095, + "learning_rate": 0.00046599244468429566, + "loss": 3.5528, + "step": 20800 + }, + { + "epoch": 2.247978436657682, + "grad_norm": 0.565091609954834, + "learning_rate": 0.00046566864543982726, + "loss": 3.5683, + "step": 20850 + }, + { + "epoch": 2.2533692722371965, + "grad_norm": 0.5420801043510437, + "learning_rate": 0.0004653448461953588, + "loss": 3.5628, + "step": 20900 + }, + { + "epoch": 2.2587601078167117, + "grad_norm": 0.5261815786361694, + "learning_rate": 0.0004650210469508904, + "loss": 3.5508, + "step": 20950 + }, + { + "epoch": 2.2641509433962264, + "grad_norm": 0.5820636749267578, + "learning_rate": 0.00046469724770642197, + "loss": 3.5637, + "step": 21000 + }, + { + "epoch": 2.2641509433962264, + "eval_accuracy": 0.36372227843801913, + "eval_loss": 3.5613205432891846, + "eval_runtime": 185.2619, + "eval_samples_per_second": 97.219, + "eval_steps_per_second": 6.078, + "step": 21000 + }, + { + "epoch": 2.269541778975741, + "grad_norm": 0.5688766837120056, + "learning_rate": 0.0004643734484619536, + "loss": 3.5568, + "step": 21050 + }, + { + "epoch": 2.274932614555256, + "grad_norm": 0.5932535529136658, + "learning_rate": 0.0004640496492174851, + "loss": 3.5535, + "step": 21100 + }, + { + "epoch": 2.280323450134771, + "grad_norm": 0.5635634064674377, + "learning_rate": 0.00046372584997301673, + "loss": 3.5756, + "step": 21150 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.6333921551704407, + "learning_rate": 0.0004634020507285483, + "loss": 3.5627, + "step": 21200 + }, + { + "epoch": 2.2911051212938007, + "grad_norm": 0.5962139964103699, + "learning_rate": 0.0004630782514840798, + "loss": 3.571, + "step": 21250 + }, + { + "epoch": 2.2964959568733154, + "grad_norm": 0.601997435092926, + "learning_rate": 0.0004627544522396114, + "loss": 3.5379, + "step": 21300 + }, + { + "epoch": 2.30188679245283, + "grad_norm": 0.83173668384552, + "learning_rate": 0.00046243065299514293, + "loss": 3.559, + "step": 21350 + }, + { + "epoch": 2.3072776280323453, + "grad_norm": 0.550325334072113, + "learning_rate": 0.00046210685375067454, + "loss": 3.5504, + "step": 21400 + }, + { + "epoch": 2.31266846361186, + "grad_norm": 0.5884050130844116, + "learning_rate": 0.0004617830545062061, + "loss": 3.5759, + "step": 21450 + }, + { + "epoch": 2.3180592991913747, + "grad_norm": 0.5583473443984985, + "learning_rate": 0.0004614592552617377, + "loss": 3.5461, + "step": 21500 + }, + { + "epoch": 2.3234501347708894, + "grad_norm": 0.5997599363327026, + "learning_rate": 0.00046113545601726924, + "loss": 3.5626, + "step": 21550 + }, + { + "epoch": 2.3288409703504045, + "grad_norm": 0.5541661977767944, + "learning_rate": 0.00046081165677280085, + "loss": 3.5549, + "step": 21600 + }, + { + "epoch": 2.334231805929919, + "grad_norm": 0.6619417071342468, + "learning_rate": 0.0004604878575283324, + "loss": 3.5543, + "step": 21650 + }, + { + "epoch": 2.339622641509434, + "grad_norm": 0.5892545580863953, + "learning_rate": 0.00046016405828386395, + "loss": 3.5725, + "step": 21700 + }, + { + "epoch": 2.3450134770889486, + "grad_norm": 0.5789877772331238, + "learning_rate": 0.00045984025903939555, + "loss": 3.5618, + "step": 21750 + }, + { + "epoch": 2.3504043126684637, + "grad_norm": 0.5967502593994141, + "learning_rate": 0.0004595164597949271, + "loss": 3.5435, + "step": 21800 + }, + { + "epoch": 2.3557951482479784, + "grad_norm": 0.5737834572792053, + "learning_rate": 0.0004591926605504587, + "loss": 3.5549, + "step": 21850 + }, + { + "epoch": 2.361185983827493, + "grad_norm": 0.5817111134529114, + "learning_rate": 0.0004588688613059902, + "loss": 3.5489, + "step": 21900 + }, + { + "epoch": 2.3665768194070083, + "grad_norm": 0.5602347254753113, + "learning_rate": 0.00045854506206152186, + "loss": 3.5488, + "step": 21950 + }, + { + "epoch": 2.371967654986523, + "grad_norm": 0.5591195821762085, + "learning_rate": 0.00045822126281705336, + "loss": 3.5706, + "step": 22000 + }, + { + "epoch": 2.371967654986523, + "eval_accuracy": 0.3645871554250224, + "eval_loss": 3.5503523349761963, + "eval_runtime": 185.7321, + "eval_samples_per_second": 96.973, + "eval_steps_per_second": 6.062, + "step": 22000 + }, + { + "epoch": 2.3773584905660377, + "grad_norm": 0.5747423768043518, + "learning_rate": 0.00045789746357258497, + "loss": 3.5462, + "step": 22050 + }, + { + "epoch": 2.382749326145553, + "grad_norm": 0.6654125452041626, + "learning_rate": 0.0004575736643281165, + "loss": 3.5505, + "step": 22100 + }, + { + "epoch": 2.3881401617250675, + "grad_norm": 0.593645453453064, + "learning_rate": 0.00045724986508364807, + "loss": 3.5528, + "step": 22150 + }, + { + "epoch": 2.393530997304582, + "grad_norm": 0.5858222842216492, + "learning_rate": 0.00045692606583917967, + "loss": 3.5645, + "step": 22200 + }, + { + "epoch": 2.398921832884097, + "grad_norm": 0.6029301285743713, + "learning_rate": 0.0004566022665947112, + "loss": 3.5695, + "step": 22250 + }, + { + "epoch": 2.404312668463612, + "grad_norm": 0.584065854549408, + "learning_rate": 0.0004562784673502428, + "loss": 3.5708, + "step": 22300 + }, + { + "epoch": 2.4097035040431267, + "grad_norm": 0.5794286727905273, + "learning_rate": 0.0004559546681057744, + "loss": 3.5516, + "step": 22350 + }, + { + "epoch": 2.4150943396226414, + "grad_norm": 0.6180821657180786, + "learning_rate": 0.000455630868861306, + "loss": 3.5726, + "step": 22400 + }, + { + "epoch": 2.420485175202156, + "grad_norm": 0.5881420373916626, + "learning_rate": 0.00045530706961683753, + "loss": 3.5624, + "step": 22450 + }, + { + "epoch": 2.4258760107816713, + "grad_norm": 0.5598190426826477, + "learning_rate": 0.00045498327037236914, + "loss": 3.556, + "step": 22500 + }, + { + "epoch": 2.431266846361186, + "grad_norm": 0.5753728151321411, + "learning_rate": 0.0004546594711279007, + "loss": 3.5505, + "step": 22550 + }, + { + "epoch": 2.4366576819407006, + "grad_norm": 0.5574933886528015, + "learning_rate": 0.0004543356718834322, + "loss": 3.551, + "step": 22600 + }, + { + "epoch": 2.442048517520216, + "grad_norm": 0.5815372467041016, + "learning_rate": 0.0004540118726389638, + "loss": 3.5393, + "step": 22650 + }, + { + "epoch": 2.4474393530997305, + "grad_norm": 0.5866022706031799, + "learning_rate": 0.00045368807339449534, + "loss": 3.5423, + "step": 22700 + }, + { + "epoch": 2.452830188679245, + "grad_norm": 0.5673452615737915, + "learning_rate": 0.00045336427415002694, + "loss": 3.547, + "step": 22750 + }, + { + "epoch": 2.4582210242587603, + "grad_norm": 0.587451159954071, + "learning_rate": 0.0004530404749055585, + "loss": 3.5455, + "step": 22800 + }, + { + "epoch": 2.463611859838275, + "grad_norm": 0.6395877599716187, + "learning_rate": 0.0004527166756610901, + "loss": 3.5518, + "step": 22850 + }, + { + "epoch": 2.4690026954177897, + "grad_norm": 0.6238613724708557, + "learning_rate": 0.00045239287641662165, + "loss": 3.5595, + "step": 22900 + }, + { + "epoch": 2.4743935309973044, + "grad_norm": 0.6073823571205139, + "learning_rate": 0.0004520690771721532, + "loss": 3.5612, + "step": 22950 + }, + { + "epoch": 2.4797843665768196, + "grad_norm": 0.582822322845459, + "learning_rate": 0.0004517452779276848, + "loss": 3.5408, + "step": 23000 + }, + { + "epoch": 2.4797843665768196, + "eval_accuracy": 0.3659634615549684, + "eval_loss": 3.5402395725250244, + "eval_runtime": 185.4548, + "eval_samples_per_second": 97.118, + "eval_steps_per_second": 6.072, + "step": 23000 + }, + { + "epoch": 2.4851752021563343, + "grad_norm": 0.5883828401565552, + "learning_rate": 0.00045142147868321636, + "loss": 3.543, + "step": 23050 + }, + { + "epoch": 2.490566037735849, + "grad_norm": 0.5678288340568542, + "learning_rate": 0.00045109767943874796, + "loss": 3.5585, + "step": 23100 + }, + { + "epoch": 2.4959568733153636, + "grad_norm": 0.57308030128479, + "learning_rate": 0.0004507738801942795, + "loss": 3.5585, + "step": 23150 + }, + { + "epoch": 2.501347708894879, + "grad_norm": 0.6100690364837646, + "learning_rate": 0.0004504500809498111, + "loss": 3.5512, + "step": 23200 + }, + { + "epoch": 2.5067385444743935, + "grad_norm": 0.5841269493103027, + "learning_rate": 0.0004501262817053426, + "loss": 3.5446, + "step": 23250 + }, + { + "epoch": 2.512129380053908, + "grad_norm": 0.6322735548019409, + "learning_rate": 0.00044980248246087427, + "loss": 3.5466, + "step": 23300 + }, + { + "epoch": 2.5175202156334233, + "grad_norm": 0.6177630424499512, + "learning_rate": 0.00044947868321640577, + "loss": 3.5425, + "step": 23350 + }, + { + "epoch": 2.522911051212938, + "grad_norm": 0.6240397691726685, + "learning_rate": 0.0004491548839719373, + "loss": 3.5322, + "step": 23400 + }, + { + "epoch": 2.5283018867924527, + "grad_norm": 0.587017834186554, + "learning_rate": 0.0004488310847274689, + "loss": 3.5453, + "step": 23450 + }, + { + "epoch": 2.533692722371968, + "grad_norm": 0.5576516389846802, + "learning_rate": 0.0004485072854830005, + "loss": 3.5428, + "step": 23500 + }, + { + "epoch": 2.5390835579514826, + "grad_norm": 0.5701577663421631, + "learning_rate": 0.0004481834862385321, + "loss": 3.5514, + "step": 23550 + }, + { + "epoch": 2.5444743935309972, + "grad_norm": 0.5828137397766113, + "learning_rate": 0.00044785968699406363, + "loss": 3.5517, + "step": 23600 + }, + { + "epoch": 2.5498652291105124, + "grad_norm": 0.6262269020080566, + "learning_rate": 0.0004475423637344846, + "loss": 3.556, + "step": 23650 + }, + { + "epoch": 2.555256064690027, + "grad_norm": 0.5848754644393921, + "learning_rate": 0.00044721856449001613, + "loss": 3.5631, + "step": 23700 + }, + { + "epoch": 2.560646900269542, + "grad_norm": 0.5634021162986755, + "learning_rate": 0.00044689476524554774, + "loss": 3.5699, + "step": 23750 + }, + { + "epoch": 2.5660377358490565, + "grad_norm": 0.6959907412528992, + "learning_rate": 0.0004465709660010793, + "loss": 3.5458, + "step": 23800 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.5664647817611694, + "learning_rate": 0.0004462471667566109, + "loss": 3.5597, + "step": 23850 + }, + { + "epoch": 2.5768194070080863, + "grad_norm": 0.5963898301124573, + "learning_rate": 0.00044592336751214244, + "loss": 3.5427, + "step": 23900 + }, + { + "epoch": 2.582210242587601, + "grad_norm": 0.5585032105445862, + "learning_rate": 0.00044559956826767405, + "loss": 3.5489, + "step": 23950 + }, + { + "epoch": 2.5876010781671157, + "grad_norm": 0.6078963279724121, + "learning_rate": 0.00044527576902320554, + "loss": 3.532, + "step": 24000 + }, + { + "epoch": 2.5876010781671157, + "eval_accuracy": 0.36688788032449404, + "eval_loss": 3.531247854232788, + "eval_runtime": 185.6875, + "eval_samples_per_second": 96.996, + "eval_steps_per_second": 6.064, + "step": 24000 + }, + { + "epoch": 2.592991913746631, + "grad_norm": 0.5676623582839966, + "learning_rate": 0.00044495196977873715, + "loss": 3.5458, + "step": 24050 + }, + { + "epoch": 2.5983827493261455, + "grad_norm": 0.6343262195587158, + "learning_rate": 0.0004446281705342687, + "loss": 3.5379, + "step": 24100 + }, + { + "epoch": 2.6037735849056602, + "grad_norm": 0.5848167538642883, + "learning_rate": 0.00044430437128980025, + "loss": 3.5677, + "step": 24150 + }, + { + "epoch": 2.6091644204851754, + "grad_norm": 0.6168308854103088, + "learning_rate": 0.00044398057204533185, + "loss": 3.5298, + "step": 24200 + }, + { + "epoch": 2.61455525606469, + "grad_norm": 0.5785287022590637, + "learning_rate": 0.0004436567728008634, + "loss": 3.5411, + "step": 24250 + }, + { + "epoch": 2.6199460916442048, + "grad_norm": 0.5798066258430481, + "learning_rate": 0.000443332973556395, + "loss": 3.5491, + "step": 24300 + }, + { + "epoch": 2.62533692722372, + "grad_norm": 0.5612218379974365, + "learning_rate": 0.00044300917431192656, + "loss": 3.5649, + "step": 24350 + }, + { + "epoch": 2.6307277628032346, + "grad_norm": 0.5887892842292786, + "learning_rate": 0.00044268537506745816, + "loss": 3.5273, + "step": 24400 + }, + { + "epoch": 2.6361185983827493, + "grad_norm": 0.6022453308105469, + "learning_rate": 0.0004423615758229897, + "loss": 3.5447, + "step": 24450 + }, + { + "epoch": 2.641509433962264, + "grad_norm": 0.5853219032287598, + "learning_rate": 0.0004420377765785213, + "loss": 3.5591, + "step": 24500 + }, + { + "epoch": 2.6469002695417787, + "grad_norm": 0.5891228318214417, + "learning_rate": 0.00044171397733405287, + "loss": 3.5263, + "step": 24550 + }, + { + "epoch": 2.652291105121294, + "grad_norm": 0.6075050830841064, + "learning_rate": 0.00044139017808958437, + "loss": 3.5344, + "step": 24600 + }, + { + "epoch": 2.6576819407008085, + "grad_norm": 0.5453810095787048, + "learning_rate": 0.00044106637884511597, + "loss": 3.5361, + "step": 24650 + }, + { + "epoch": 2.6630727762803232, + "grad_norm": 0.6543773412704468, + "learning_rate": 0.0004407425796006475, + "loss": 3.5407, + "step": 24700 + }, + { + "epoch": 2.6684636118598384, + "grad_norm": 0.5871316194534302, + "learning_rate": 0.00044041878035617913, + "loss": 3.5404, + "step": 24750 + }, + { + "epoch": 2.673854447439353, + "grad_norm": 0.5868498086929321, + "learning_rate": 0.0004400949811117107, + "loss": 3.5162, + "step": 24800 + }, + { + "epoch": 2.6792452830188678, + "grad_norm": 0.5242236852645874, + "learning_rate": 0.0004397711818672423, + "loss": 3.5146, + "step": 24850 + }, + { + "epoch": 2.684636118598383, + "grad_norm": 0.5566399097442627, + "learning_rate": 0.00043944738262277383, + "loss": 3.5539, + "step": 24900 + }, + { + "epoch": 2.6900269541778976, + "grad_norm": 0.5947811603546143, + "learning_rate": 0.00043912358337830544, + "loss": 3.5379, + "step": 24950 + }, + { + "epoch": 2.6954177897574123, + "grad_norm": 0.5996220111846924, + "learning_rate": 0.000438799784133837, + "loss": 3.527, + "step": 25000 + }, + { + "epoch": 2.6954177897574123, + "eval_accuracy": 0.3672525194159994, + "eval_loss": 3.5217697620391846, + "eval_runtime": 185.7585, + "eval_samples_per_second": 96.959, + "eval_steps_per_second": 6.062, + "step": 25000 + }, + { + "epoch": 2.7008086253369274, + "grad_norm": 0.5733533501625061, + "learning_rate": 0.00043847598488936854, + "loss": 3.5383, + "step": 25050 + }, + { + "epoch": 2.706199460916442, + "grad_norm": 0.5578649044036865, + "learning_rate": 0.00043815218564490014, + "loss": 3.5233, + "step": 25100 + }, + { + "epoch": 2.711590296495957, + "grad_norm": 0.5936254858970642, + "learning_rate": 0.0004378283864004317, + "loss": 3.5137, + "step": 25150 + }, + { + "epoch": 2.7169811320754715, + "grad_norm": 0.591632068157196, + "learning_rate": 0.0004375045871559633, + "loss": 3.5193, + "step": 25200 + }, + { + "epoch": 2.7223719676549867, + "grad_norm": 0.6370254158973694, + "learning_rate": 0.00043718078791149485, + "loss": 3.5535, + "step": 25250 + }, + { + "epoch": 2.7277628032345014, + "grad_norm": 0.5756546258926392, + "learning_rate": 0.00043685698866702645, + "loss": 3.5101, + "step": 25300 + }, + { + "epoch": 2.733153638814016, + "grad_norm": 0.595561146736145, + "learning_rate": 0.00043653318942255795, + "loss": 3.5358, + "step": 25350 + }, + { + "epoch": 2.7385444743935308, + "grad_norm": 0.6051717400550842, + "learning_rate": 0.00043620939017808956, + "loss": 3.5376, + "step": 25400 + }, + { + "epoch": 2.743935309973046, + "grad_norm": 0.5473976731300354, + "learning_rate": 0.0004358855909336211, + "loss": 3.5087, + "step": 25450 + }, + { + "epoch": 2.7493261455525606, + "grad_norm": 0.6153759360313416, + "learning_rate": 0.00043556179168915266, + "loss": 3.5703, + "step": 25500 + }, + { + "epoch": 2.7547169811320753, + "grad_norm": 0.5839317440986633, + "learning_rate": 0.00043523799244468426, + "loss": 3.5587, + "step": 25550 + }, + { + "epoch": 2.7601078167115904, + "grad_norm": 0.6133059859275818, + "learning_rate": 0.0004349141932002158, + "loss": 3.5361, + "step": 25600 + }, + { + "epoch": 2.765498652291105, + "grad_norm": 0.6802798509597778, + "learning_rate": 0.0004345903939557474, + "loss": 3.5245, + "step": 25650 + }, + { + "epoch": 2.77088948787062, + "grad_norm": 0.5973533987998962, + "learning_rate": 0.00043426659471127897, + "loss": 3.5436, + "step": 25700 + }, + { + "epoch": 2.776280323450135, + "grad_norm": 0.6152960658073425, + "learning_rate": 0.0004339492714516999, + "loss": 3.5327, + "step": 25750 + }, + { + "epoch": 2.7816711590296497, + "grad_norm": 0.5712870359420776, + "learning_rate": 0.00043362547220723147, + "loss": 3.5337, + "step": 25800 + }, + { + "epoch": 2.7870619946091644, + "grad_norm": 0.5867605209350586, + "learning_rate": 0.0004333016729627631, + "loss": 3.5213, + "step": 25850 + }, + { + "epoch": 2.7924528301886795, + "grad_norm": 0.6075189113616943, + "learning_rate": 0.0004329778737182946, + "loss": 3.5308, + "step": 25900 + }, + { + "epoch": 2.797843665768194, + "grad_norm": 0.5805151462554932, + "learning_rate": 0.00043265407447382623, + "loss": 3.5207, + "step": 25950 + }, + { + "epoch": 2.803234501347709, + "grad_norm": 0.5881994962692261, + "learning_rate": 0.0004323302752293577, + "loss": 3.5333, + "step": 26000 + }, + { + "epoch": 2.803234501347709, + "eval_accuracy": 0.368511371774218, + "eval_loss": 3.51200795173645, + "eval_runtime": 185.141, + "eval_samples_per_second": 97.283, + "eval_steps_per_second": 6.082, + "step": 26000 + }, + { + "epoch": 2.8086253369272236, + "grad_norm": 0.5690694451332092, + "learning_rate": 0.00043200647598488933, + "loss": 3.5296, + "step": 26050 + }, + { + "epoch": 2.8140161725067383, + "grad_norm": 0.5980532169342041, + "learning_rate": 0.0004316826767404209, + "loss": 3.5044, + "step": 26100 + }, + { + "epoch": 2.8194070080862534, + "grad_norm": 0.5740624070167542, + "learning_rate": 0.0004313588774959525, + "loss": 3.5082, + "step": 26150 + }, + { + "epoch": 2.824797843665768, + "grad_norm": 0.6146470904350281, + "learning_rate": 0.00043103507825148404, + "loss": 3.532, + "step": 26200 + }, + { + "epoch": 2.830188679245283, + "grad_norm": 0.5921385288238525, + "learning_rate": 0.0004307112790070156, + "loss": 3.5445, + "step": 26250 + }, + { + "epoch": 2.835579514824798, + "grad_norm": 0.5627673268318176, + "learning_rate": 0.0004303874797625472, + "loss": 3.5604, + "step": 26300 + }, + { + "epoch": 2.8409703504043127, + "grad_norm": 0.6661468744277954, + "learning_rate": 0.00043006368051807874, + "loss": 3.5291, + "step": 26350 + }, + { + "epoch": 2.8463611859838274, + "grad_norm": 0.6103497743606567, + "learning_rate": 0.00042973988127361035, + "loss": 3.5391, + "step": 26400 + }, + { + "epoch": 2.8517520215633425, + "grad_norm": 0.6032962799072266, + "learning_rate": 0.0004294160820291419, + "loss": 3.545, + "step": 26450 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.5415673851966858, + "learning_rate": 0.0004290922827846735, + "loss": 3.5161, + "step": 26500 + }, + { + "epoch": 2.862533692722372, + "grad_norm": 0.5705322027206421, + "learning_rate": 0.00042876848354020505, + "loss": 3.5323, + "step": 26550 + }, + { + "epoch": 2.867924528301887, + "grad_norm": 0.5669793486595154, + "learning_rate": 0.00042844468429573655, + "loss": 3.5407, + "step": 26600 + }, + { + "epoch": 2.8733153638814017, + "grad_norm": 0.605086624622345, + "learning_rate": 0.00042812088505126815, + "loss": 3.5466, + "step": 26650 + }, + { + "epoch": 2.8787061994609164, + "grad_norm": 0.5618116855621338, + "learning_rate": 0.0004277970858067997, + "loss": 3.5318, + "step": 26700 + }, + { + "epoch": 2.884097035040431, + "grad_norm": 0.6186133027076721, + "learning_rate": 0.0004274732865623313, + "loss": 3.5432, + "step": 26750 + }, + { + "epoch": 2.889487870619946, + "grad_norm": 0.5849158763885498, + "learning_rate": 0.00042714948731786286, + "loss": 3.5203, + "step": 26800 + }, + { + "epoch": 2.894878706199461, + "grad_norm": 0.6805973052978516, + "learning_rate": 0.00042682568807339447, + "loss": 3.5437, + "step": 26850 + }, + { + "epoch": 2.9002695417789757, + "grad_norm": 0.6036573648452759, + "learning_rate": 0.000426501888828926, + "loss": 3.5082, + "step": 26900 + }, + { + "epoch": 2.9056603773584904, + "grad_norm": 0.6047782301902771, + "learning_rate": 0.0004261780895844576, + "loss": 3.5299, + "step": 26950 + }, + { + "epoch": 2.9110512129380055, + "grad_norm": 0.5614576935768127, + "learning_rate": 0.00042585429033998917, + "loss": 3.5442, + "step": 27000 + }, + { + "epoch": 2.9110512129380055, + "eval_accuracy": 0.3696261504031946, + "eval_loss": 3.5053839683532715, + "eval_runtime": 185.4428, + "eval_samples_per_second": 97.124, + "eval_steps_per_second": 6.072, + "step": 27000 + }, + { + "epoch": 2.91644204851752, + "grad_norm": 0.6214551329612732, + "learning_rate": 0.0004255304910955207, + "loss": 3.5288, + "step": 27050 + }, + { + "epoch": 2.921832884097035, + "grad_norm": 0.5811451077461243, + "learning_rate": 0.0004252066918510523, + "loss": 3.5338, + "step": 27100 + }, + { + "epoch": 2.92722371967655, + "grad_norm": 0.6257526874542236, + "learning_rate": 0.0004248828926065839, + "loss": 3.5296, + "step": 27150 + }, + { + "epoch": 2.9326145552560647, + "grad_norm": 0.6285461187362671, + "learning_rate": 0.0004245590933621155, + "loss": 3.5139, + "step": 27200 + }, + { + "epoch": 2.9380053908355794, + "grad_norm": 0.549082338809967, + "learning_rate": 0.00042423529411764703, + "loss": 3.5302, + "step": 27250 + }, + { + "epoch": 2.9433962264150946, + "grad_norm": 0.5633655786514282, + "learning_rate": 0.00042391149487317864, + "loss": 3.5334, + "step": 27300 + }, + { + "epoch": 2.9487870619946093, + "grad_norm": 0.5984169244766235, + "learning_rate": 0.00042358769562871013, + "loss": 3.5178, + "step": 27350 + }, + { + "epoch": 2.954177897574124, + "grad_norm": 0.6494591236114502, + "learning_rate": 0.00042326389638424174, + "loss": 3.5112, + "step": 27400 + }, + { + "epoch": 2.9595687331536387, + "grad_norm": 0.5794746279716492, + "learning_rate": 0.0004229400971397733, + "loss": 3.5172, + "step": 27450 + }, + { + "epoch": 2.964959568733154, + "grad_norm": 0.5832740068435669, + "learning_rate": 0.00042261629789530484, + "loss": 3.5293, + "step": 27500 + }, + { + "epoch": 2.9703504043126685, + "grad_norm": 0.5642590522766113, + "learning_rate": 0.00042229249865083644, + "loss": 3.5181, + "step": 27550 + }, + { + "epoch": 2.975741239892183, + "grad_norm": 0.5790451765060425, + "learning_rate": 0.000421968699406368, + "loss": 3.5252, + "step": 27600 + }, + { + "epoch": 2.981132075471698, + "grad_norm": 0.6082571744918823, + "learning_rate": 0.0004216449001618996, + "loss": 3.5057, + "step": 27650 + }, + { + "epoch": 2.986522911051213, + "grad_norm": 0.5873731374740601, + "learning_rate": 0.00042132110091743115, + "loss": 3.5168, + "step": 27700 + }, + { + "epoch": 2.9919137466307277, + "grad_norm": 0.6429144144058228, + "learning_rate": 0.00042099730167296275, + "loss": 3.5223, + "step": 27750 + }, + { + "epoch": 2.9973045822102424, + "grad_norm": 0.642909586429596, + "learning_rate": 0.00042067997841338365, + "loss": 3.5247, + "step": 27800 + }, + { + "epoch": 3.0026954177897576, + "grad_norm": 0.6168606281280518, + "learning_rate": 0.00042035617916891526, + "loss": 3.4874, + "step": 27850 + }, + { + "epoch": 3.0080862533692723, + "grad_norm": 0.6800719499588013, + "learning_rate": 0.0004200323799244468, + "loss": 3.4281, + "step": 27900 + }, + { + "epoch": 3.013477088948787, + "grad_norm": 0.57563316822052, + "learning_rate": 0.0004197085806799784, + "loss": 3.4331, + "step": 27950 + }, + { + "epoch": 3.018867924528302, + "grad_norm": 0.607589602470398, + "learning_rate": 0.0004193847814355099, + "loss": 3.4451, + "step": 28000 + }, + { + "epoch": 3.018867924528302, + "eval_accuracy": 0.3699707973633421, + "eval_loss": 3.5011649131774902, + "eval_runtime": 185.7113, + "eval_samples_per_second": 96.984, + "eval_steps_per_second": 6.063, + "step": 28000 + }, + { + "epoch": 3.024258760107817, + "grad_norm": 0.6603455543518066, + "learning_rate": 0.0004190609821910415, + "loss": 3.4317, + "step": 28050 + }, + { + "epoch": 3.0296495956873315, + "grad_norm": 0.63117516040802, + "learning_rate": 0.00041873718294657306, + "loss": 3.4264, + "step": 28100 + }, + { + "epoch": 3.035040431266846, + "grad_norm": 0.6193031072616577, + "learning_rate": 0.00041841338370210467, + "loss": 3.466, + "step": 28150 + }, + { + "epoch": 3.0404312668463613, + "grad_norm": 0.6265783905982971, + "learning_rate": 0.0004180895844576362, + "loss": 3.4467, + "step": 28200 + }, + { + "epoch": 3.045822102425876, + "grad_norm": 0.5731920003890991, + "learning_rate": 0.00041776578521316777, + "loss": 3.4543, + "step": 28250 + }, + { + "epoch": 3.0512129380053907, + "grad_norm": 0.6172985434532166, + "learning_rate": 0.0004174419859686994, + "loss": 3.4541, + "step": 28300 + }, + { + "epoch": 3.056603773584906, + "grad_norm": 0.5971702337265015, + "learning_rate": 0.0004171181867242309, + "loss": 3.4452, + "step": 28350 + }, + { + "epoch": 3.0619946091644206, + "grad_norm": 0.6083709001541138, + "learning_rate": 0.00041679438747976253, + "loss": 3.4237, + "step": 28400 + }, + { + "epoch": 3.0673854447439353, + "grad_norm": 0.595926821231842, + "learning_rate": 0.0004164705882352941, + "loss": 3.4655, + "step": 28450 + }, + { + "epoch": 3.07277628032345, + "grad_norm": 0.6328946352005005, + "learning_rate": 0.0004161467889908257, + "loss": 3.4581, + "step": 28500 + }, + { + "epoch": 3.078167115902965, + "grad_norm": 0.62444669008255, + "learning_rate": 0.00041582298974635724, + "loss": 3.4453, + "step": 28550 + }, + { + "epoch": 3.08355795148248, + "grad_norm": 0.5992231965065002, + "learning_rate": 0.00041549919050188884, + "loss": 3.442, + "step": 28600 + }, + { + "epoch": 3.0889487870619945, + "grad_norm": 0.6712383031845093, + "learning_rate": 0.00041517539125742034, + "loss": 3.4401, + "step": 28650 + }, + { + "epoch": 3.0943396226415096, + "grad_norm": 0.58417809009552, + "learning_rate": 0.0004148515920129519, + "loss": 3.4495, + "step": 28700 + }, + { + "epoch": 3.0997304582210243, + "grad_norm": 0.6253139972686768, + "learning_rate": 0.0004145277927684835, + "loss": 3.4385, + "step": 28750 + }, + { + "epoch": 3.105121293800539, + "grad_norm": 0.5860809683799744, + "learning_rate": 0.00041420399352401504, + "loss": 3.4427, + "step": 28800 + }, + { + "epoch": 3.1105121293800537, + "grad_norm": 0.6193458437919617, + "learning_rate": 0.00041388019427954665, + "loss": 3.4264, + "step": 28850 + }, + { + "epoch": 3.115902964959569, + "grad_norm": 0.5895789861679077, + "learning_rate": 0.0004135563950350782, + "loss": 3.4446, + "step": 28900 + }, + { + "epoch": 3.1212938005390836, + "grad_norm": 0.6106541156768799, + "learning_rate": 0.0004132325957906098, + "loss": 3.4618, + "step": 28950 + }, + { + "epoch": 3.1266846361185983, + "grad_norm": 0.6361096501350403, + "learning_rate": 0.00041290879654614135, + "loss": 3.4295, + "step": 29000 + }, + { + "epoch": 3.1266846361185983, + "eval_accuracy": 0.3709342225195931, + "eval_loss": 3.496896266937256, + "eval_runtime": 184.8429, + "eval_samples_per_second": 97.439, + "eval_steps_per_second": 6.092, + "step": 29000 + }, + { + "epoch": 3.1320754716981134, + "grad_norm": 0.665164589881897, + "learning_rate": 0.00041258499730167296, + "loss": 3.4525, + "step": 29050 + }, + { + "epoch": 3.137466307277628, + "grad_norm": 0.6416609883308411, + "learning_rate": 0.0004122611980572045, + "loss": 3.4413, + "step": 29100 + }, + { + "epoch": 3.142857142857143, + "grad_norm": 0.6141297817230225, + "learning_rate": 0.00041193739881273606, + "loss": 3.4591, + "step": 29150 + }, + { + "epoch": 3.1482479784366575, + "grad_norm": 0.5525941848754883, + "learning_rate": 0.00041161359956826766, + "loss": 3.4582, + "step": 29200 + }, + { + "epoch": 3.1536388140161726, + "grad_norm": 0.5938381552696228, + "learning_rate": 0.0004112898003237992, + "loss": 3.4456, + "step": 29250 + }, + { + "epoch": 3.1590296495956873, + "grad_norm": 0.6277801990509033, + "learning_rate": 0.0004109660010793308, + "loss": 3.4546, + "step": 29300 + }, + { + "epoch": 3.164420485175202, + "grad_norm": 0.6602087616920471, + "learning_rate": 0.0004106422018348623, + "loss": 3.4666, + "step": 29350 + }, + { + "epoch": 3.169811320754717, + "grad_norm": 0.5656922459602356, + "learning_rate": 0.0004103184025903939, + "loss": 3.458, + "step": 29400 + }, + { + "epoch": 3.175202156334232, + "grad_norm": 0.605190634727478, + "learning_rate": 0.00040999460334592547, + "loss": 3.4687, + "step": 29450 + }, + { + "epoch": 3.1805929919137466, + "grad_norm": 0.6249133944511414, + "learning_rate": 0.000409670804101457, + "loss": 3.465, + "step": 29500 + }, + { + "epoch": 3.1859838274932613, + "grad_norm": 0.5883227586746216, + "learning_rate": 0.00040934700485698863, + "loss": 3.4413, + "step": 29550 + }, + { + "epoch": 3.1913746630727764, + "grad_norm": 0.5973005294799805, + "learning_rate": 0.0004090232056125202, + "loss": 3.4628, + "step": 29600 + }, + { + "epoch": 3.196765498652291, + "grad_norm": 0.5894708633422852, + "learning_rate": 0.0004086994063680518, + "loss": 3.4505, + "step": 29650 + }, + { + "epoch": 3.202156334231806, + "grad_norm": 0.5822822451591492, + "learning_rate": 0.00040837560712358333, + "loss": 3.4559, + "step": 29700 + }, + { + "epoch": 3.207547169811321, + "grad_norm": 0.6240995526313782, + "learning_rate": 0.00040805180787911494, + "loss": 3.4638, + "step": 29750 + }, + { + "epoch": 3.2129380053908356, + "grad_norm": 0.5901066064834595, + "learning_rate": 0.0004077344846195359, + "loss": 3.4456, + "step": 29800 + }, + { + "epoch": 3.2183288409703503, + "grad_norm": 0.6369420289993286, + "learning_rate": 0.00040741068537506744, + "loss": 3.467, + "step": 29850 + }, + { + "epoch": 3.223719676549865, + "grad_norm": 0.6294394135475159, + "learning_rate": 0.000407086886130599, + "loss": 3.4735, + "step": 29900 + }, + { + "epoch": 3.22911051212938, + "grad_norm": 0.6184892654418945, + "learning_rate": 0.0004067630868861306, + "loss": 3.4679, + "step": 29950 + }, + { + "epoch": 3.234501347708895, + "grad_norm": 0.6455713510513306, + "learning_rate": 0.0004064392876416621, + "loss": 3.4506, + "step": 30000 + }, + { + "epoch": 3.234501347708895, + "eval_accuracy": 0.3713301536427891, + "eval_loss": 3.490234613418579, + "eval_runtime": 185.1295, + "eval_samples_per_second": 97.289, + "eval_steps_per_second": 6.082, + "step": 30000 + } + ], + "logging_steps": 50, + "max_steps": 92750, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.508207538176e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}