{ "best_metric": 3.3063955307006836, "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_low_500_495/checkpoint-90000", "epoch": 10.0, "eval_steps": 1000, "global_step": 92750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005390835579514825, "grad_norm": 1.9115898609161377, "learning_rate": 0.0003, "loss": 8.518, "step": 50 }, { "epoch": 0.01078167115902965, "grad_norm": 5.626335144042969, "learning_rate": 0.0006, "loss": 6.8964, "step": 100 }, { "epoch": 0.016172506738544475, "grad_norm": 1.5297762155532837, "learning_rate": 0.0005996762007555315, "loss": 6.4903, "step": 150 }, { "epoch": 0.0215633423180593, "grad_norm": 1.232740044593811, "learning_rate": 0.000599352401511063, "loss": 6.2567, "step": 200 }, { "epoch": 0.026954177897574125, "grad_norm": 0.7754726409912109, "learning_rate": 0.0005990286022665946, "loss": 6.0728, "step": 250 }, { "epoch": 0.03234501347708895, "grad_norm": 2.1544833183288574, "learning_rate": 0.0005987048030221263, "loss": 5.9666, "step": 300 }, { "epoch": 0.03773584905660377, "grad_norm": 0.9812532663345337, "learning_rate": 0.0005983810037776578, "loss": 5.8668, "step": 350 }, { "epoch": 0.0431266846361186, "grad_norm": 1.3231236934661865, "learning_rate": 0.0005980572045331894, "loss": 5.809, "step": 400 }, { "epoch": 0.04851752021563342, "grad_norm": 1.8271440267562866, "learning_rate": 0.0005977334052887209, "loss": 5.7373, "step": 450 }, { "epoch": 0.05390835579514825, "grad_norm": 1.6826151609420776, "learning_rate": 0.0005974096060442526, "loss": 5.6784, "step": 500 }, { "epoch": 0.05929919137466307, "grad_norm": 2.017390012741089, "learning_rate": 0.0005970858067997841, "loss": 5.566, "step": 550 }, { "epoch": 0.0646900269541779, "grad_norm": 2.0407137870788574, "learning_rate": 0.0005967620075553157, "loss": 5.5348, "step": 600 }, { "epoch": 0.07008086253369272, "grad_norm": 0.9982050061225891, "learning_rate": 0.0005964382083108472, 
"loss": 5.4331, "step": 650 }, { "epoch": 0.07547169811320754, "grad_norm": 1.1897433996200562, "learning_rate": 0.0005961144090663788, "loss": 5.4115, "step": 700 }, { "epoch": 0.08086253369272237, "grad_norm": 1.7157807350158691, "learning_rate": 0.0005957906098219104, "loss": 5.3354, "step": 750 }, { "epoch": 0.0862533692722372, "grad_norm": 1.4302008152008057, "learning_rate": 0.0005954668105774419, "loss": 5.2837, "step": 800 }, { "epoch": 0.09164420485175202, "grad_norm": 1.2498868703842163, "learning_rate": 0.0005951430113329735, "loss": 5.2505, "step": 850 }, { "epoch": 0.09703504043126684, "grad_norm": 0.8495532274246216, "learning_rate": 0.0005948192120885051, "loss": 5.1733, "step": 900 }, { "epoch": 0.10242587601078167, "grad_norm": 0.9718948602676392, "learning_rate": 0.0005944954128440366, "loss": 5.161, "step": 950 }, { "epoch": 0.1078167115902965, "grad_norm": 1.1483657360076904, "learning_rate": 0.0005941716135995682, "loss": 5.1041, "step": 1000 }, { "epoch": 0.1078167115902965, "eval_accuracy": 0.22633993729207233, "eval_loss": 5.030312538146973, "eval_runtime": 188.3882, "eval_samples_per_second": 95.606, "eval_steps_per_second": 5.977, "step": 1000 }, { "epoch": 0.11320754716981132, "grad_norm": 1.1805005073547363, "learning_rate": 0.0005938478143550997, "loss": 5.0661, "step": 1050 }, { "epoch": 0.11859838274932614, "grad_norm": 1.1717469692230225, "learning_rate": 0.0005935240151106314, "loss": 5.0246, "step": 1100 }, { "epoch": 0.12398921832884097, "grad_norm": 0.9578151702880859, "learning_rate": 0.0005932002158661629, "loss": 5.0052, "step": 1150 }, { "epoch": 0.1293800539083558, "grad_norm": 1.1234848499298096, "learning_rate": 0.0005928764166216945, "loss": 4.9837, "step": 1200 }, { "epoch": 0.1347708894878706, "grad_norm": 1.1127243041992188, "learning_rate": 0.000592552617377226, "loss": 4.9561, "step": 1250 }, { "epoch": 0.14016172506738545, "grad_norm": 1.002482295036316, "learning_rate": 0.0005922288181327577, "loss": 4.902, "step": 
1300 }, { "epoch": 0.14555256064690028, "grad_norm": 1.1132245063781738, "learning_rate": 0.0005919050188882893, "loss": 4.8863, "step": 1350 }, { "epoch": 0.1509433962264151, "grad_norm": 0.8769996762275696, "learning_rate": 0.0005915812196438207, "loss": 4.8603, "step": 1400 }, { "epoch": 0.15633423180592992, "grad_norm": 1.0863553285598755, "learning_rate": 0.0005912574203993524, "loss": 4.838, "step": 1450 }, { "epoch": 0.16172506738544473, "grad_norm": 0.9288261532783508, "learning_rate": 0.0005909336211548839, "loss": 4.8231, "step": 1500 }, { "epoch": 0.16711590296495957, "grad_norm": 1.0346804857254028, "learning_rate": 0.0005906098219104155, "loss": 4.7805, "step": 1550 }, { "epoch": 0.1725067385444744, "grad_norm": 0.972179651260376, "learning_rate": 0.000590286022665947, "loss": 4.7959, "step": 1600 }, { "epoch": 0.1778975741239892, "grad_norm": 1.0101567506790161, "learning_rate": 0.0005899622234214787, "loss": 4.7731, "step": 1650 }, { "epoch": 0.18328840970350405, "grad_norm": 0.979967474937439, "learning_rate": 0.0005896384241770102, "loss": 4.6941, "step": 1700 }, { "epoch": 0.18867924528301888, "grad_norm": 1.1450660228729248, "learning_rate": 0.0005893146249325418, "loss": 4.7073, "step": 1750 }, { "epoch": 0.1940700808625337, "grad_norm": 0.8868092894554138, "learning_rate": 0.0005889908256880733, "loss": 4.6979, "step": 1800 }, { "epoch": 0.19946091644204852, "grad_norm": 0.8870577216148376, "learning_rate": 0.0005886670264436049, "loss": 4.6546, "step": 1850 }, { "epoch": 0.20485175202156333, "grad_norm": 0.9839901924133301, "learning_rate": 0.0005883432271991365, "loss": 4.6524, "step": 1900 }, { "epoch": 0.21024258760107817, "grad_norm": 1.3535802364349365, "learning_rate": 0.0005880194279546681, "loss": 4.6183, "step": 1950 }, { "epoch": 0.215633423180593, "grad_norm": 1.1455411911010742, "learning_rate": 0.0005876956287101996, "loss": 4.5964, "step": 2000 }, { "epoch": 0.215633423180593, "eval_accuracy": 0.2674478881735786, "eval_loss": 
4.539289474487305, "eval_runtime": 187.7214, "eval_samples_per_second": 95.945, "eval_steps_per_second": 5.998, "step": 2000 }, { "epoch": 0.2210242587601078, "grad_norm": 0.9703343510627747, "learning_rate": 0.0005873718294657312, "loss": 4.6083, "step": 2050 }, { "epoch": 0.22641509433962265, "grad_norm": 0.934937059879303, "learning_rate": 0.0005870480302212628, "loss": 4.575, "step": 2100 }, { "epoch": 0.23180592991913745, "grad_norm": 1.039948582649231, "learning_rate": 0.0005867242309767943, "loss": 4.5189, "step": 2150 }, { "epoch": 0.2371967654986523, "grad_norm": 0.926210343837738, "learning_rate": 0.0005864004317323259, "loss": 4.5622, "step": 2200 }, { "epoch": 0.24258760107816713, "grad_norm": 0.7271756529808044, "learning_rate": 0.0005860766324878575, "loss": 4.4915, "step": 2250 }, { "epoch": 0.24797843665768193, "grad_norm": 1.1152825355529785, "learning_rate": 0.000585752833243389, "loss": 4.4756, "step": 2300 }, { "epoch": 0.25336927223719674, "grad_norm": 0.8933312296867371, "learning_rate": 0.0005854290339989206, "loss": 4.4727, "step": 2350 }, { "epoch": 0.2587601078167116, "grad_norm": 1.0869262218475342, "learning_rate": 0.0005851052347544521, "loss": 4.4607, "step": 2400 }, { "epoch": 0.2641509433962264, "grad_norm": 0.9118139147758484, "learning_rate": 0.0005847814355099838, "loss": 4.4666, "step": 2450 }, { "epoch": 0.2695417789757412, "grad_norm": 0.7588400840759277, "learning_rate": 0.0005844576362655154, "loss": 4.4622, "step": 2500 }, { "epoch": 0.2749326145552561, "grad_norm": 0.8877196311950684, "learning_rate": 0.0005841338370210469, "loss": 4.4204, "step": 2550 }, { "epoch": 0.2803234501347709, "grad_norm": 0.8958740234375, "learning_rate": 0.0005838100377765785, "loss": 4.4112, "step": 2600 }, { "epoch": 0.2857142857142857, "grad_norm": 1.173532247543335, "learning_rate": 0.0005834862385321101, "loss": 4.4104, "step": 2650 }, { "epoch": 0.29110512129380056, "grad_norm": 0.862187385559082, "learning_rate": 0.0005831624392876417, 
"loss": 4.4036, "step": 2700 }, { "epoch": 0.29649595687331537, "grad_norm": 0.8075447082519531, "learning_rate": 0.0005828386400431731, "loss": 4.3633, "step": 2750 }, { "epoch": 0.3018867924528302, "grad_norm": 0.8274509906768799, "learning_rate": 0.0005825148407987048, "loss": 4.4045, "step": 2800 }, { "epoch": 0.30727762803234504, "grad_norm": 0.7691844701766968, "learning_rate": 0.0005821910415542363, "loss": 4.3696, "step": 2850 }, { "epoch": 0.31266846361185985, "grad_norm": 0.7903671860694885, "learning_rate": 0.0005818672423097679, "loss": 4.3474, "step": 2900 }, { "epoch": 0.31805929919137466, "grad_norm": 0.871162474155426, "learning_rate": 0.0005815434430652994, "loss": 4.3491, "step": 2950 }, { "epoch": 0.32345013477088946, "grad_norm": 0.6972927451133728, "learning_rate": 0.0005812196438208311, "loss": 4.3455, "step": 3000 }, { "epoch": 0.32345013477088946, "eval_accuracy": 0.29693041467048675, "eval_loss": 4.249656677246094, "eval_runtime": 187.5787, "eval_samples_per_second": 96.018, "eval_steps_per_second": 6.003, "step": 3000 }, { "epoch": 0.3288409703504043, "grad_norm": 0.8769593834877014, "learning_rate": 0.0005808958445763626, "loss": 4.325, "step": 3050 }, { "epoch": 0.33423180592991913, "grad_norm": 0.7991501092910767, "learning_rate": 0.0005805720453318942, "loss": 4.3205, "step": 3100 }, { "epoch": 0.33962264150943394, "grad_norm": 0.7523728609085083, "learning_rate": 0.0005802482460874257, "loss": 4.2834, "step": 3150 }, { "epoch": 0.3450134770889488, "grad_norm": 0.854878842830658, "learning_rate": 0.0005799244468429573, "loss": 4.2919, "step": 3200 }, { "epoch": 0.3504043126684636, "grad_norm": 0.7155695557594299, "learning_rate": 0.0005796006475984889, "loss": 4.2661, "step": 3250 }, { "epoch": 0.3557951482479784, "grad_norm": 0.7300028204917908, "learning_rate": 0.0005792768483540205, "loss": 4.2884, "step": 3300 }, { "epoch": 0.3611859838274933, "grad_norm": 0.6207231879234314, "learning_rate": 0.000578953049109552, "loss": 4.2695, 
"step": 3350 }, { "epoch": 0.3665768194070081, "grad_norm": 0.8405777215957642, "learning_rate": 0.0005786292498650836, "loss": 4.2635, "step": 3400 }, { "epoch": 0.3719676549865229, "grad_norm": 0.6905649304389954, "learning_rate": 0.0005783054506206152, "loss": 4.2467, "step": 3450 }, { "epoch": 0.37735849056603776, "grad_norm": 0.7021420001983643, "learning_rate": 0.0005779816513761467, "loss": 4.226, "step": 3500 }, { "epoch": 0.38274932614555257, "grad_norm": 0.7447681427001953, "learning_rate": 0.0005776578521316782, "loss": 4.2315, "step": 3550 }, { "epoch": 0.3881401617250674, "grad_norm": 0.8126360177993774, "learning_rate": 0.0005773340528872099, "loss": 4.2338, "step": 3600 }, { "epoch": 0.3935309973045822, "grad_norm": 0.821212649345398, "learning_rate": 0.0005770102536427414, "loss": 4.2432, "step": 3650 }, { "epoch": 0.39892183288409705, "grad_norm": 0.7117862105369568, "learning_rate": 0.000576686454398273, "loss": 4.2093, "step": 3700 }, { "epoch": 0.40431266846361186, "grad_norm": 0.7709187269210815, "learning_rate": 0.0005763626551538045, "loss": 4.2057, "step": 3750 }, { "epoch": 0.40970350404312667, "grad_norm": 0.648059606552124, "learning_rate": 0.0005760388559093362, "loss": 4.2246, "step": 3800 }, { "epoch": 0.41509433962264153, "grad_norm": 0.7581032514572144, "learning_rate": 0.0005757150566648678, "loss": 4.2061, "step": 3850 }, { "epoch": 0.42048517520215634, "grad_norm": 1.0342339277267456, "learning_rate": 0.0005753912574203993, "loss": 4.1752, "step": 3900 }, { "epoch": 0.42587601078167114, "grad_norm": 0.6775522828102112, "learning_rate": 0.0005750674581759309, "loss": 4.1718, "step": 3950 }, { "epoch": 0.431266846361186, "grad_norm": 0.7176401615142822, "learning_rate": 0.0005747436589314624, "loss": 4.1807, "step": 4000 }, { "epoch": 0.431266846361186, "eval_accuracy": 0.31078854724678473, "eval_loss": 4.110689163208008, "eval_runtime": 187.6754, "eval_samples_per_second": 95.969, "eval_steps_per_second": 6.0, "step": 4000 }, { 
"epoch": 0.4366576819407008, "grad_norm": 0.6973474621772766, "learning_rate": 0.0005744198596869941, "loss": 4.175, "step": 4050 }, { "epoch": 0.4420485175202156, "grad_norm": 0.6163977384567261, "learning_rate": 0.0005740960604425255, "loss": 4.1802, "step": 4100 }, { "epoch": 0.4474393530997305, "grad_norm": 0.701683759689331, "learning_rate": 0.0005737722611980572, "loss": 4.1702, "step": 4150 }, { "epoch": 0.4528301886792453, "grad_norm": 0.648784339427948, "learning_rate": 0.0005734484619535887, "loss": 4.1583, "step": 4200 }, { "epoch": 0.4582210242587601, "grad_norm": 0.6678009629249573, "learning_rate": 0.0005731246627091203, "loss": 4.1472, "step": 4250 }, { "epoch": 0.4636118598382749, "grad_norm": 0.7954819202423096, "learning_rate": 0.0005728008634646518, "loss": 4.1426, "step": 4300 }, { "epoch": 0.46900269541778977, "grad_norm": 0.6935424208641052, "learning_rate": 0.0005724770642201835, "loss": 4.1495, "step": 4350 }, { "epoch": 0.4743935309973046, "grad_norm": 0.7789283394813538, "learning_rate": 0.000572153264975715, "loss": 4.1333, "step": 4400 }, { "epoch": 0.4797843665768194, "grad_norm": 0.7601833939552307, "learning_rate": 0.0005718294657312466, "loss": 4.1398, "step": 4450 }, { "epoch": 0.48517520215633425, "grad_norm": 0.68370121717453, "learning_rate": 0.0005715056664867781, "loss": 4.1212, "step": 4500 }, { "epoch": 0.49056603773584906, "grad_norm": 0.6254522204399109, "learning_rate": 0.0005711818672423097, "loss": 4.1102, "step": 4550 }, { "epoch": 0.49595687331536387, "grad_norm": 0.6852529644966125, "learning_rate": 0.0005708580679978413, "loss": 4.1179, "step": 4600 }, { "epoch": 0.5013477088948787, "grad_norm": 0.7181583046913147, "learning_rate": 0.0005705342687533729, "loss": 4.0984, "step": 4650 }, { "epoch": 0.5067385444743935, "grad_norm": 0.7251303791999817, "learning_rate": 0.0005702104695089044, "loss": 4.1042, "step": 4700 }, { "epoch": 0.5121293800539084, "grad_norm": 0.6099450588226318, "learning_rate": 
0.000569886670264436, "loss": 4.0865, "step": 4750 }, { "epoch": 0.5175202156334232, "grad_norm": 0.5896744132041931, "learning_rate": 0.0005695628710199675, "loss": 4.1034, "step": 4800 }, { "epoch": 0.522911051212938, "grad_norm": 0.6033713817596436, "learning_rate": 0.0005692390717754991, "loss": 4.095, "step": 4850 }, { "epoch": 0.5283018867924528, "grad_norm": 0.5704399347305298, "learning_rate": 0.0005689152725310306, "loss": 4.0999, "step": 4900 }, { "epoch": 0.5336927223719676, "grad_norm": 0.671288013458252, "learning_rate": 0.0005685914732865623, "loss": 4.0748, "step": 4950 }, { "epoch": 0.5390835579514824, "grad_norm": 0.6648839712142944, "learning_rate": 0.0005682676740420939, "loss": 4.0654, "step": 5000 }, { "epoch": 0.5390835579514824, "eval_accuracy": 0.3200598981639944, "eval_loss": 4.007607460021973, "eval_runtime": 187.3054, "eval_samples_per_second": 96.158, "eval_steps_per_second": 6.012, "step": 5000 }, { "epoch": 0.5444743935309974, "grad_norm": 0.5957515239715576, "learning_rate": 0.0005679438747976254, "loss": 4.0606, "step": 5050 }, { "epoch": 0.5498652291105122, "grad_norm": 0.7439665794372559, "learning_rate": 0.000567620075553157, "loss": 4.0673, "step": 5100 }, { "epoch": 0.555256064690027, "grad_norm": 0.7030623555183411, "learning_rate": 0.0005672962763086886, "loss": 4.0702, "step": 5150 }, { "epoch": 0.5606469002695418, "grad_norm": 0.6474356651306152, "learning_rate": 0.0005669724770642202, "loss": 4.0454, "step": 5200 }, { "epoch": 0.5660377358490566, "grad_norm": 0.6153950095176697, "learning_rate": 0.0005666486778197517, "loss": 4.0458, "step": 5250 }, { "epoch": 0.5714285714285714, "grad_norm": 0.6132007837295532, "learning_rate": 0.0005663248785752833, "loss": 4.0315, "step": 5300 }, { "epoch": 0.5768194070080862, "grad_norm": 0.6719494462013245, "learning_rate": 0.0005660010793308148, "loss": 4.0466, "step": 5350 }, { "epoch": 0.5822102425876011, "grad_norm": 0.6127015352249146, "learning_rate": 0.0005656772800863465, 
"loss": 4.0351, "step": 5400 }, { "epoch": 0.5876010781671159, "grad_norm": 0.6278625130653381, "learning_rate": 0.0005653534808418779, "loss": 4.0478, "step": 5450 }, { "epoch": 0.5929919137466307, "grad_norm": 0.6481402516365051, "learning_rate": 0.0005650296815974096, "loss": 4.0487, "step": 5500 }, { "epoch": 0.5983827493261455, "grad_norm": 0.6136019825935364, "learning_rate": 0.0005647058823529411, "loss": 4.0517, "step": 5550 }, { "epoch": 0.6037735849056604, "grad_norm": 0.6584049463272095, "learning_rate": 0.0005643820831084727, "loss": 4.0275, "step": 5600 }, { "epoch": 0.6091644204851752, "grad_norm": 0.6200391054153442, "learning_rate": 0.0005640582838640042, "loss": 4.0176, "step": 5650 }, { "epoch": 0.6145552560646901, "grad_norm": 0.6417002081871033, "learning_rate": 0.0005637344846195358, "loss": 4.0226, "step": 5700 }, { "epoch": 0.6199460916442049, "grad_norm": 0.6000820994377136, "learning_rate": 0.0005634106853750674, "loss": 4.0338, "step": 5750 }, { "epoch": 0.6253369272237197, "grad_norm": 0.6211546659469604, "learning_rate": 0.000563086886130599, "loss": 4.0222, "step": 5800 }, { "epoch": 0.6307277628032345, "grad_norm": 0.6056830286979675, "learning_rate": 0.0005627630868861305, "loss": 4.0358, "step": 5850 }, { "epoch": 0.6361185983827493, "grad_norm": 0.5934694409370422, "learning_rate": 0.0005624392876416621, "loss": 4.0042, "step": 5900 }, { "epoch": 0.6415094339622641, "grad_norm": 0.6150168180465698, "learning_rate": 0.0005621154883971937, "loss": 3.9998, "step": 5950 }, { "epoch": 0.6469002695417789, "grad_norm": 0.5978230237960815, "learning_rate": 0.0005617916891527253, "loss": 4.0111, "step": 6000 }, { "epoch": 0.6469002695417789, "eval_accuracy": 0.32617988072303283, "eval_loss": 3.932387351989746, "eval_runtime": 187.2479, "eval_samples_per_second": 96.188, "eval_steps_per_second": 6.013, "step": 6000 }, { "epoch": 0.6522911051212938, "grad_norm": 0.6879520416259766, "learning_rate": 0.0005614678899082568, "loss": 3.9816, 
"step": 6050 }, { "epoch": 0.6576819407008087, "grad_norm": 0.7578017711639404, "learning_rate": 0.0005611440906637884, "loss": 4.0126, "step": 6100 }, { "epoch": 0.6630727762803235, "grad_norm": 0.544524073600769, "learning_rate": 0.00056082029141932, "loss": 3.9986, "step": 6150 }, { "epoch": 0.6684636118598383, "grad_norm": 0.6755046248435974, "learning_rate": 0.0005604964921748515, "loss": 3.9759, "step": 6200 }, { "epoch": 0.6738544474393531, "grad_norm": 0.5876584053039551, "learning_rate": 0.000560172692930383, "loss": 3.991, "step": 6250 }, { "epoch": 0.6792452830188679, "grad_norm": 0.6401435732841492, "learning_rate": 0.0005598488936859147, "loss": 4.0086, "step": 6300 }, { "epoch": 0.6846361185983828, "grad_norm": 0.6608089208602905, "learning_rate": 0.0005595250944414463, "loss": 3.9701, "step": 6350 }, { "epoch": 0.6900269541778976, "grad_norm": 0.5429288148880005, "learning_rate": 0.0005592012951969778, "loss": 3.9757, "step": 6400 }, { "epoch": 0.6954177897574124, "grad_norm": 0.6896840929985046, "learning_rate": 0.0005588839719373988, "loss": 3.9599, "step": 6450 }, { "epoch": 0.7008086253369272, "grad_norm": 0.6430425643920898, "learning_rate": 0.0005585601726929303, "loss": 3.9727, "step": 6500 }, { "epoch": 0.706199460916442, "grad_norm": 0.6070827841758728, "learning_rate": 0.0005582363734484619, "loss": 3.9942, "step": 6550 }, { "epoch": 0.7115902964959568, "grad_norm": 0.6078307032585144, "learning_rate": 0.0005579125742039935, "loss": 3.9502, "step": 6600 }, { "epoch": 0.7169811320754716, "grad_norm": 0.7253237962722778, "learning_rate": 0.0005575887749595251, "loss": 3.9852, "step": 6650 }, { "epoch": 0.7223719676549866, "grad_norm": 0.6382951736450195, "learning_rate": 0.0005572649757150566, "loss": 3.9578, "step": 6700 }, { "epoch": 0.7277628032345014, "grad_norm": 0.5518078804016113, "learning_rate": 0.0005569411764705882, "loss": 3.9632, "step": 6750 }, { "epoch": 0.7331536388140162, "grad_norm": 0.6217591166496277, "learning_rate": 
0.0005566173772261198, "loss": 3.9522, "step": 6800 }, { "epoch": 0.738544474393531, "grad_norm": 0.761849582195282, "learning_rate": 0.0005562935779816513, "loss": 3.9322, "step": 6850 }, { "epoch": 0.7439353099730458, "grad_norm": 0.7228758335113525, "learning_rate": 0.0005559697787371828, "loss": 3.9589, "step": 6900 }, { "epoch": 0.7493261455525606, "grad_norm": 0.6004955768585205, "learning_rate": 0.0005556459794927145, "loss": 3.9557, "step": 6950 }, { "epoch": 0.7547169811320755, "grad_norm": 0.6903455257415771, "learning_rate": 0.000555322180248246, "loss": 3.9427, "step": 7000 }, { "epoch": 0.7547169811320755, "eval_accuracy": 0.33205235200362987, "eval_loss": 3.8775031566619873, "eval_runtime": 187.876, "eval_samples_per_second": 95.866, "eval_steps_per_second": 5.993, "step": 7000 }, { "epoch": 0.7601078167115903, "grad_norm": 0.6758421063423157, "learning_rate": 0.0005549983810037776, "loss": 3.9325, "step": 7050 }, { "epoch": 0.7654986522911051, "grad_norm": 0.5868051648139954, "learning_rate": 0.0005546745817593091, "loss": 3.9466, "step": 7100 }, { "epoch": 0.77088948787062, "grad_norm": 0.5360812544822693, "learning_rate": 0.0005543507825148408, "loss": 3.9455, "step": 7150 }, { "epoch": 0.7762803234501348, "grad_norm": 0.6635931134223938, "learning_rate": 0.0005540269832703723, "loss": 3.9448, "step": 7200 }, { "epoch": 0.7816711590296496, "grad_norm": 0.5423426628112793, "learning_rate": 0.0005537031840259039, "loss": 3.9006, "step": 7250 }, { "epoch": 0.7870619946091644, "grad_norm": 0.6123334169387817, "learning_rate": 0.0005533793847814354, "loss": 3.9403, "step": 7300 }, { "epoch": 0.7924528301886793, "grad_norm": 0.540355920791626, "learning_rate": 0.000553055585536967, "loss": 3.9251, "step": 7350 }, { "epoch": 0.7978436657681941, "grad_norm": 0.711990475654602, "learning_rate": 0.0005527317862924987, "loss": 3.9266, "step": 7400 }, { "epoch": 0.8032345013477089, "grad_norm": 0.5533379912376404, "learning_rate": 0.0005524079870480301, 
"loss": 3.9171, "step": 7450 }, { "epoch": 0.8086253369272237, "grad_norm": 0.5620821118354797, "learning_rate": 0.0005520841878035618, "loss": 3.9021, "step": 7500 }, { "epoch": 0.8140161725067385, "grad_norm": 0.6481254696846008, "learning_rate": 0.0005517603885590933, "loss": 3.9298, "step": 7550 }, { "epoch": 0.8194070080862533, "grad_norm": 0.5652502775192261, "learning_rate": 0.0005514365893146249, "loss": 3.9313, "step": 7600 }, { "epoch": 0.8247978436657682, "grad_norm": 0.5965263843536377, "learning_rate": 0.0005511127900701564, "loss": 3.9063, "step": 7650 }, { "epoch": 0.8301886792452831, "grad_norm": 0.5765863656997681, "learning_rate": 0.000550788990825688, "loss": 3.9115, "step": 7700 }, { "epoch": 0.8355795148247979, "grad_norm": 0.5533828139305115, "learning_rate": 0.0005504651915812196, "loss": 3.8928, "step": 7750 }, { "epoch": 0.8409703504043127, "grad_norm": 0.6085563898086548, "learning_rate": 0.0005501413923367512, "loss": 3.9219, "step": 7800 }, { "epoch": 0.8463611859838275, "grad_norm": 0.6730923652648926, "learning_rate": 0.0005498175930922827, "loss": 3.8986, "step": 7850 }, { "epoch": 0.8517520215633423, "grad_norm": 0.5520169138908386, "learning_rate": 0.0005494937938478143, "loss": 3.9013, "step": 7900 }, { "epoch": 0.8571428571428571, "grad_norm": 0.5863463878631592, "learning_rate": 0.0005491699946033459, "loss": 3.8983, "step": 7950 }, { "epoch": 0.862533692722372, "grad_norm": 0.6581799387931824, "learning_rate": 0.0005488461953588775, "loss": 3.8863, "step": 8000 }, { "epoch": 0.862533692722372, "eval_accuracy": 0.3363742379222265, "eval_loss": 3.829810619354248, "eval_runtime": 187.6184, "eval_samples_per_second": 95.998, "eval_steps_per_second": 6.002, "step": 8000 }, { "epoch": 0.8679245283018868, "grad_norm": 0.5409788489341736, "learning_rate": 0.000548522396114409, "loss": 3.8947, "step": 8050 }, { "epoch": 0.8733153638814016, "grad_norm": 0.6086958050727844, "learning_rate": 0.0005481985968699406, "loss": 3.8799, "step": 
8100 }, { "epoch": 0.8787061994609164, "grad_norm": 0.5412994623184204, "learning_rate": 0.0005478747976254721, "loss": 3.8916, "step": 8150 }, { "epoch": 0.8840970350404312, "grad_norm": 0.5382531881332397, "learning_rate": 0.0005475509983810037, "loss": 3.8804, "step": 8200 }, { "epoch": 0.889487870619946, "grad_norm": 0.6551288962364197, "learning_rate": 0.0005472271991365352, "loss": 3.8687, "step": 8250 }, { "epoch": 0.894878706199461, "grad_norm": 0.6076605319976807, "learning_rate": 0.0005469033998920669, "loss": 3.891, "step": 8300 }, { "epoch": 0.9002695417789758, "grad_norm": 0.6250050663948059, "learning_rate": 0.0005465796006475984, "loss": 3.8736, "step": 8350 }, { "epoch": 0.9056603773584906, "grad_norm": 0.6367249488830566, "learning_rate": 0.00054625580140313, "loss": 3.8892, "step": 8400 }, { "epoch": 0.9110512129380054, "grad_norm": 0.625838041305542, "learning_rate": 0.0005459320021586615, "loss": 3.8984, "step": 8450 }, { "epoch": 0.9164420485175202, "grad_norm": 0.6105564832687378, "learning_rate": 0.0005456082029141932, "loss": 3.8909, "step": 8500 }, { "epoch": 0.921832884097035, "grad_norm": 0.5936092138290405, "learning_rate": 0.0005452844036697248, "loss": 3.8709, "step": 8550 }, { "epoch": 0.9272237196765498, "grad_norm": 0.5503196716308594, "learning_rate": 0.0005449606044252563, "loss": 3.8656, "step": 8600 }, { "epoch": 0.9326145552560647, "grad_norm": 0.5261377096176147, "learning_rate": 0.0005446368051807879, "loss": 3.8574, "step": 8650 }, { "epoch": 0.9380053908355795, "grad_norm": 0.541580319404602, "learning_rate": 0.0005443130059363194, "loss": 3.8643, "step": 8700 }, { "epoch": 0.9433962264150944, "grad_norm": 0.559482216835022, "learning_rate": 0.0005439892066918511, "loss": 3.8616, "step": 8750 }, { "epoch": 0.9487870619946092, "grad_norm": 0.5329921245574951, "learning_rate": 0.0005436654074473825, "loss": 3.854, "step": 8800 }, { "epoch": 0.954177897574124, "grad_norm": 0.5369682312011719, "learning_rate": 
0.0005433480841878035, "loss": 3.8458, "step": 8850 }, { "epoch": 0.9595687331536388, "grad_norm": 0.5372455716133118, "learning_rate": 0.000543024284943335, "loss": 3.8639, "step": 8900 }, { "epoch": 0.9649595687331537, "grad_norm": 0.6307722926139832, "learning_rate": 0.0005427004856988667, "loss": 3.8671, "step": 8950 }, { "epoch": 0.9703504043126685, "grad_norm": 0.5844284296035767, "learning_rate": 0.0005423766864543982, "loss": 3.8554, "step": 9000 }, { "epoch": 0.9703504043126685, "eval_accuracy": 0.34037483725155565, "eval_loss": 3.7922003269195557, "eval_runtime": 187.0501, "eval_samples_per_second": 96.29, "eval_steps_per_second": 6.02, "step": 9000 }, { "epoch": 0.9757412398921833, "grad_norm": 0.5957661271095276, "learning_rate": 0.0005420528872099298, "loss": 3.8489, "step": 9050 }, { "epoch": 0.9811320754716981, "grad_norm": 0.5926182270050049, "learning_rate": 0.0005417290879654613, "loss": 3.8633, "step": 9100 }, { "epoch": 0.9865229110512129, "grad_norm": 0.548740565776825, "learning_rate": 0.000541405288720993, "loss": 3.8363, "step": 9150 }, { "epoch": 0.9919137466307277, "grad_norm": 0.5834896564483643, "learning_rate": 0.0005410814894765245, "loss": 3.8322, "step": 9200 }, { "epoch": 0.9973045822102425, "grad_norm": 0.5723360180854797, "learning_rate": 0.0005407576902320561, "loss": 3.8482, "step": 9250 }, { "epoch": 1.0026954177897573, "grad_norm": 0.5614904761314392, "learning_rate": 0.0005404338909875876, "loss": 3.8201, "step": 9300 }, { "epoch": 1.0080862533692723, "grad_norm": 0.6050702333450317, "learning_rate": 0.0005401100917431192, "loss": 3.7859, "step": 9350 }, { "epoch": 1.013477088948787, "grad_norm": 0.5851100087165833, "learning_rate": 0.0005397862924986508, "loss": 3.7818, "step": 9400 }, { "epoch": 1.0188679245283019, "grad_norm": 0.5718414783477783, "learning_rate": 0.0005394624932541824, "loss": 3.7795, "step": 9450 }, { "epoch": 1.0242587601078168, "grad_norm": 0.6160867810249329, "learning_rate": 0.0005391386940097139, 
"loss": 3.778, "step": 9500 }, { "epoch": 1.0296495956873315, "grad_norm": 0.5421329140663147, "learning_rate": 0.0005388148947652455, "loss": 3.7779, "step": 9550 }, { "epoch": 1.0350404312668464, "grad_norm": 0.5929830074310303, "learning_rate": 0.000538491095520777, "loss": 3.7659, "step": 9600 }, { "epoch": 1.0404312668463611, "grad_norm": 0.5713817477226257, "learning_rate": 0.0005381672962763086, "loss": 3.8062, "step": 9650 }, { "epoch": 1.045822102425876, "grad_norm": 0.6254969239234924, "learning_rate": 0.0005378434970318403, "loss": 3.7924, "step": 9700 }, { "epoch": 1.0512129380053907, "grad_norm": 0.5617794394493103, "learning_rate": 0.0005375196977873718, "loss": 3.7907, "step": 9750 }, { "epoch": 1.0566037735849056, "grad_norm": 0.5884292721748352, "learning_rate": 0.0005371958985429034, "loss": 3.7984, "step": 9800 }, { "epoch": 1.0619946091644206, "grad_norm": 0.5575320720672607, "learning_rate": 0.0005368720992984349, "loss": 3.7939, "step": 9850 }, { "epoch": 1.0673854447439353, "grad_norm": 0.5543467402458191, "learning_rate": 0.0005365483000539665, "loss": 3.785, "step": 9900 }, { "epoch": 1.0727762803234502, "grad_norm": 0.607307493686676, "learning_rate": 0.0005362245008094981, "loss": 3.7797, "step": 9950 }, { "epoch": 1.0781671159029649, "grad_norm": 0.6108971834182739, "learning_rate": 0.0005359007015650297, "loss": 3.7746, "step": 10000 }, { "epoch": 1.0781671159029649, "eval_accuracy": 0.3434026672762818, "eval_loss": 3.7587594985961914, "eval_runtime": 187.3528, "eval_samples_per_second": 96.134, "eval_steps_per_second": 6.01, "step": 10000 }, { "epoch": 1.0835579514824798, "grad_norm": 0.5519567728042603, "learning_rate": 0.0005355769023205612, "loss": 3.7767, "step": 10050 }, { "epoch": 1.0889487870619945, "grad_norm": 0.5885294079780579, "learning_rate": 0.0005352531030760928, "loss": 3.7858, "step": 10100 }, { "epoch": 1.0943396226415094, "grad_norm": 0.6101541519165039, "learning_rate": 0.0005349293038316244, "loss": 3.7783, "step": 
10150 }, { "epoch": 1.0997304582210243, "grad_norm": 0.5413455367088318, "learning_rate": 0.0005346055045871559, "loss": 3.777, "step": 10200 }, { "epoch": 1.105121293800539, "grad_norm": 0.5316413640975952, "learning_rate": 0.0005342817053426874, "loss": 3.7892, "step": 10250 }, { "epoch": 1.110512129380054, "grad_norm": 0.6017285585403442, "learning_rate": 0.0005339579060982191, "loss": 3.7736, "step": 10300 }, { "epoch": 1.1159029649595686, "grad_norm": 0.6177402138710022, "learning_rate": 0.0005336341068537506, "loss": 3.7604, "step": 10350 }, { "epoch": 1.1212938005390836, "grad_norm": 0.5573323965072632, "learning_rate": 0.0005333103076092822, "loss": 3.7756, "step": 10400 }, { "epoch": 1.1266846361185983, "grad_norm": 0.5655317306518555, "learning_rate": 0.0005329865083648137, "loss": 3.7654, "step": 10450 }, { "epoch": 1.1320754716981132, "grad_norm": 0.5832797884941101, "learning_rate": 0.0005326627091203454, "loss": 3.7555, "step": 10500 }, { "epoch": 1.137466307277628, "grad_norm": 0.5413033962249756, "learning_rate": 0.0005323389098758769, "loss": 3.7752, "step": 10550 }, { "epoch": 1.1428571428571428, "grad_norm": 0.6015472412109375, "learning_rate": 0.0005320151106314085, "loss": 3.7858, "step": 10600 }, { "epoch": 1.1482479784366577, "grad_norm": 0.6413048505783081, "learning_rate": 0.00053169131138694, "loss": 3.7643, "step": 10650 }, { "epoch": 1.1536388140161726, "grad_norm": 0.5632215738296509, "learning_rate": 0.0005313675121424716, "loss": 3.7595, "step": 10700 }, { "epoch": 1.1590296495956873, "grad_norm": 0.7078406810760498, "learning_rate": 0.0005310437128980032, "loss": 3.7473, "step": 10750 }, { "epoch": 1.1644204851752022, "grad_norm": 0.580906093120575, "learning_rate": 0.0005307199136535348, "loss": 3.7379, "step": 10800 }, { "epoch": 1.169811320754717, "grad_norm": 0.5415087938308716, "learning_rate": 0.0005303961144090663, "loss": 3.7567, "step": 10850 }, { "epoch": 1.1752021563342319, "grad_norm": 0.5719914436340332, "learning_rate": 
0.0005300723151645979, "loss": 3.7588, "step": 10900 }, { "epoch": 1.1805929919137466, "grad_norm": 0.6258513331413269, "learning_rate": 0.0005297549919050189, "loss": 3.7707, "step": 10950 }, { "epoch": 1.1859838274932615, "grad_norm": 0.6074648499488831, "learning_rate": 0.0005294311926605504, "loss": 3.7845, "step": 11000 }, { "epoch": 1.1859838274932615, "eval_accuracy": 0.3456640598303646, "eval_loss": 3.7324743270874023, "eval_runtime": 187.399, "eval_samples_per_second": 96.11, "eval_steps_per_second": 6.009, "step": 11000 }, { "epoch": 1.1913746630727764, "grad_norm": 0.532958447933197, "learning_rate": 0.000529107393416082, "loss": 3.7742, "step": 11050 }, { "epoch": 1.196765498652291, "grad_norm": 0.5758987069129944, "learning_rate": 0.0005287835941716135, "loss": 3.7677, "step": 11100 }, { "epoch": 1.202156334231806, "grad_norm": 0.5204159021377563, "learning_rate": 0.0005284597949271452, "loss": 3.7653, "step": 11150 }, { "epoch": 1.2075471698113207, "grad_norm": 0.5608921051025391, "learning_rate": 0.0005281359956826767, "loss": 3.7539, "step": 11200 }, { "epoch": 1.2129380053908356, "grad_norm": 0.537016749382019, "learning_rate": 0.0005278121964382083, "loss": 3.7451, "step": 11250 }, { "epoch": 1.2183288409703503, "grad_norm": 0.5913570523262024, "learning_rate": 0.0005274883971937398, "loss": 3.7612, "step": 11300 }, { "epoch": 1.2237196765498652, "grad_norm": 0.6099703907966614, "learning_rate": 0.0005271645979492714, "loss": 3.7478, "step": 11350 }, { "epoch": 1.2291105121293802, "grad_norm": 0.5816707015037537, "learning_rate": 0.000526840798704803, "loss": 3.7651, "step": 11400 }, { "epoch": 1.2345013477088949, "grad_norm": 0.5798138380050659, "learning_rate": 0.0005265169994603346, "loss": 3.7535, "step": 11450 }, { "epoch": 1.2398921832884098, "grad_norm": 0.6095981001853943, "learning_rate": 0.0005261932002158661, "loss": 3.7464, "step": 11500 }, { "epoch": 1.2452830188679245, "grad_norm": 0.48052334785461426, "learning_rate": 
0.0005258694009713977, "loss": 3.757, "step": 11550 }, { "epoch": 1.2506738544474394, "grad_norm": 0.5602318048477173, "learning_rate": 0.0005255456017269292, "loss": 3.7321, "step": 11600 }, { "epoch": 1.256064690026954, "grad_norm": 0.6862710118293762, "learning_rate": 0.0005252218024824608, "loss": 3.7624, "step": 11650 }, { "epoch": 1.261455525606469, "grad_norm": 0.680061399936676, "learning_rate": 0.0005248980032379924, "loss": 3.7615, "step": 11700 }, { "epoch": 1.266846361185984, "grad_norm": 0.5473019480705261, "learning_rate": 0.000524574203993524, "loss": 3.7522, "step": 11750 }, { "epoch": 1.2722371967654986, "grad_norm": 0.5845617055892944, "learning_rate": 0.0005242504047490555, "loss": 3.7444, "step": 11800 }, { "epoch": 1.2776280323450135, "grad_norm": 0.5994465351104736, "learning_rate": 0.0005239266055045871, "loss": 3.7533, "step": 11850 }, { "epoch": 1.2830188679245282, "grad_norm": 0.5354965329170227, "learning_rate": 0.0005236028062601186, "loss": 3.7394, "step": 11900 }, { "epoch": 1.2884097035040432, "grad_norm": 0.5937642455101013, "learning_rate": 0.0005232790070156503, "loss": 3.7542, "step": 11950 }, { "epoch": 1.2938005390835579, "grad_norm": 0.6336929202079773, "learning_rate": 0.0005229552077711818, "loss": 3.7478, "step": 12000 }, { "epoch": 1.2938005390835579, "eval_accuracy": 0.34786960480011075, "eval_loss": 3.708402633666992, "eval_runtime": 188.2113, "eval_samples_per_second": 95.696, "eval_steps_per_second": 5.983, "step": 12000 }, { "epoch": 1.2991913746630728, "grad_norm": 0.580945611000061, "learning_rate": 0.0005226314085267134, "loss": 3.7444, "step": 12050 }, { "epoch": 1.3045822102425877, "grad_norm": 0.5905938744544983, "learning_rate": 0.000522307609282245, "loss": 3.7524, "step": 12100 }, { "epoch": 1.3099730458221024, "grad_norm": 0.550325334072113, "learning_rate": 0.0005219838100377766, "loss": 3.7695, "step": 12150 }, { "epoch": 1.3153638814016173, "grad_norm": 0.5600820779800415, "learning_rate": 
0.000521660010793308, "loss": 3.7399, "step": 12200 }, { "epoch": 1.320754716981132, "grad_norm": 0.5471258759498596, "learning_rate": 0.0005213362115488396, "loss": 3.7442, "step": 12250 }, { "epoch": 1.326145552560647, "grad_norm": 0.5628847479820251, "learning_rate": 0.0005210124123043713, "loss": 3.7555, "step": 12300 }, { "epoch": 1.3315363881401616, "grad_norm": 0.592341959476471, "learning_rate": 0.0005206886130599028, "loss": 3.7435, "step": 12350 }, { "epoch": 1.3369272237196765, "grad_norm": 0.5691344738006592, "learning_rate": 0.0005203648138154344, "loss": 3.7308, "step": 12400 }, { "epoch": 1.3423180592991915, "grad_norm": 0.5798255801200867, "learning_rate": 0.0005200410145709659, "loss": 3.7462, "step": 12450 }, { "epoch": 1.3477088948787062, "grad_norm": 0.7156546711921692, "learning_rate": 0.0005197172153264976, "loss": 3.7264, "step": 12500 }, { "epoch": 1.353099730458221, "grad_norm": 0.5289100408554077, "learning_rate": 0.0005193934160820291, "loss": 3.7396, "step": 12550 }, { "epoch": 1.3584905660377358, "grad_norm": 0.5872498154640198, "learning_rate": 0.0005190696168375607, "loss": 3.7339, "step": 12600 }, { "epoch": 1.3638814016172507, "grad_norm": 0.6003397703170776, "learning_rate": 0.0005187458175930922, "loss": 3.7224, "step": 12650 }, { "epoch": 1.3692722371967654, "grad_norm": 0.5635182857513428, "learning_rate": 0.0005184220183486238, "loss": 3.7189, "step": 12700 }, { "epoch": 1.3746630727762803, "grad_norm": 0.5980514287948608, "learning_rate": 0.0005180982191041554, "loss": 3.7381, "step": 12750 }, { "epoch": 1.3800539083557952, "grad_norm": 0.5108292102813721, "learning_rate": 0.000517774419859687, "loss": 3.7194, "step": 12800 }, { "epoch": 1.38544474393531, "grad_norm": 0.6213290691375732, "learning_rate": 0.0005174506206152185, "loss": 3.7305, "step": 12850 }, { "epoch": 1.3908355795148248, "grad_norm": 0.512791633605957, "learning_rate": 0.0005171268213707501, "loss": 3.7317, "step": 12900 }, { "epoch": 1.3962264150943398, 
"grad_norm": 0.6446360349655151, "learning_rate": 0.0005168030221262816, "loss": 3.7234, "step": 12950 }, { "epoch": 1.4016172506738545, "grad_norm": 0.5144412517547607, "learning_rate": 0.0005164792228818132, "loss": 3.72, "step": 13000 }, { "epoch": 1.4016172506738545, "eval_accuracy": 0.3508476718022178, "eval_loss": 3.6830170154571533, "eval_runtime": 187.4709, "eval_samples_per_second": 96.074, "eval_steps_per_second": 6.006, "step": 13000 }, { "epoch": 1.4070080862533692, "grad_norm": 0.5337992310523987, "learning_rate": 0.0005161554236373448, "loss": 3.6965, "step": 13050 }, { "epoch": 1.412398921832884, "grad_norm": 0.5049533843994141, "learning_rate": 0.0005158316243928764, "loss": 3.719, "step": 13100 }, { "epoch": 1.417789757412399, "grad_norm": 0.563833475112915, "learning_rate": 0.0005155078251484079, "loss": 3.7227, "step": 13150 }, { "epoch": 1.4231805929919137, "grad_norm": 0.5263909101486206, "learning_rate": 0.0005151840259039395, "loss": 3.7043, "step": 13200 }, { "epoch": 1.4285714285714286, "grad_norm": 0.5385528206825256, "learning_rate": 0.000514860226659471, "loss": 3.7236, "step": 13250 }, { "epoch": 1.4339622641509435, "grad_norm": 0.5642369985580444, "learning_rate": 0.0005145364274150027, "loss": 3.718, "step": 13300 }, { "epoch": 1.4393530997304582, "grad_norm": 0.5917314887046814, "learning_rate": 0.0005142126281705343, "loss": 3.7219, "step": 13350 }, { "epoch": 1.444743935309973, "grad_norm": 0.5475001931190491, "learning_rate": 0.0005138888289260658, "loss": 3.712, "step": 13400 }, { "epoch": 1.4501347708894878, "grad_norm": 0.5014727711677551, "learning_rate": 0.0005135650296815974, "loss": 3.729, "step": 13450 }, { "epoch": 1.4555256064690028, "grad_norm": 0.6016579270362854, "learning_rate": 0.0005132412304371289, "loss": 3.7224, "step": 13500 }, { "epoch": 1.4609164420485174, "grad_norm": 0.5744902491569519, "learning_rate": 0.0005129174311926605, "loss": 3.6989, "step": 13550 }, { "epoch": 1.4663072776280324, "grad_norm": 
0.5305960178375244, "learning_rate": 0.000512593631948192, "loss": 3.7266, "step": 13600 }, { "epoch": 1.4716981132075473, "grad_norm": 0.5261050462722778, "learning_rate": 0.0005122698327037237, "loss": 3.7084, "step": 13650 }, { "epoch": 1.477088948787062, "grad_norm": 0.5880641341209412, "learning_rate": 0.0005119460334592552, "loss": 3.7263, "step": 13700 }, { "epoch": 1.482479784366577, "grad_norm": 0.5516208410263062, "learning_rate": 0.0005116222342147868, "loss": 3.6993, "step": 13750 }, { "epoch": 1.4878706199460916, "grad_norm": 0.6383761167526245, "learning_rate": 0.0005112984349703183, "loss": 3.7225, "step": 13800 }, { "epoch": 1.4932614555256065, "grad_norm": 0.5817267894744873, "learning_rate": 0.00051097463572585, "loss": 3.7083, "step": 13850 }, { "epoch": 1.4986522911051212, "grad_norm": 0.5393953323364258, "learning_rate": 0.0005106573124662708, "loss": 3.7188, "step": 13900 }, { "epoch": 1.5040431266846361, "grad_norm": 0.5401408076286316, "learning_rate": 0.0005103335132218025, "loss": 3.7074, "step": 13950 }, { "epoch": 1.509433962264151, "grad_norm": 0.5108155012130737, "learning_rate": 0.000510009713977334, "loss": 3.7062, "step": 14000 }, { "epoch": 1.509433962264151, "eval_accuracy": 0.3525950275440503, "eval_loss": 3.6650450229644775, "eval_runtime": 187.3211, "eval_samples_per_second": 96.15, "eval_steps_per_second": 6.011, "step": 14000 }, { "epoch": 1.5148247978436657, "grad_norm": 0.593525230884552, "learning_rate": 0.0005096859147328656, "loss": 3.7055, "step": 14050 }, { "epoch": 1.5202156334231804, "grad_norm": 0.5967386960983276, "learning_rate": 0.0005093621154883971, "loss": 3.7037, "step": 14100 }, { "epoch": 1.5256064690026954, "grad_norm": 0.5114768147468567, "learning_rate": 0.0005090383162439288, "loss": 3.6958, "step": 14150 }, { "epoch": 1.5309973045822103, "grad_norm": 0.6086651086807251, "learning_rate": 0.0005087145169994602, "loss": 3.6959, "step": 14200 }, { "epoch": 1.536388140161725, "grad_norm": 
0.5664010643959045, "learning_rate": 0.0005083907177549918, "loss": 3.7123, "step": 14250 }, { "epoch": 1.54177897574124, "grad_norm": 0.5594226121902466, "learning_rate": 0.0005080669185105234, "loss": 3.7111, "step": 14300 }, { "epoch": 1.5471698113207548, "grad_norm": 0.5915853381156921, "learning_rate": 0.000507743119266055, "loss": 3.7037, "step": 14350 }, { "epoch": 1.5525606469002695, "grad_norm": 0.5531720519065857, "learning_rate": 0.0005074193200215865, "loss": 3.7185, "step": 14400 }, { "epoch": 1.5579514824797842, "grad_norm": 0.5341070890426636, "learning_rate": 0.0005070955207771181, "loss": 3.7107, "step": 14450 }, { "epoch": 1.5633423180592994, "grad_norm": 0.6099830269813538, "learning_rate": 0.0005067717215326498, "loss": 3.6937, "step": 14500 }, { "epoch": 1.568733153638814, "grad_norm": 0.5606914162635803, "learning_rate": 0.0005064479222881813, "loss": 3.7077, "step": 14550 }, { "epoch": 1.5741239892183287, "grad_norm": 0.5850932598114014, "learning_rate": 0.0005061241230437129, "loss": 3.6935, "step": 14600 }, { "epoch": 1.5795148247978437, "grad_norm": 0.570080041885376, "learning_rate": 0.0005058003237992444, "loss": 3.6894, "step": 14650 }, { "epoch": 1.5849056603773586, "grad_norm": 0.5481992363929749, "learning_rate": 0.000505476524554776, "loss": 3.7065, "step": 14700 }, { "epoch": 1.5902964959568733, "grad_norm": 0.5915714502334595, "learning_rate": 0.0005051527253103076, "loss": 3.708, "step": 14750 }, { "epoch": 1.595687331536388, "grad_norm": 0.5506225228309631, "learning_rate": 0.0005048289260658392, "loss": 3.6768, "step": 14800 }, { "epoch": 1.6010781671159031, "grad_norm": 0.5602514147758484, "learning_rate": 0.0005045051268213707, "loss": 3.6965, "step": 14850 }, { "epoch": 1.6064690026954178, "grad_norm": 0.5985303521156311, "learning_rate": 0.0005041813275769023, "loss": 3.7044, "step": 14900 }, { "epoch": 1.6118598382749325, "grad_norm": 0.6018468141555786, "learning_rate": 0.0005038575283324338, "loss": 3.6936, "step": 14950 
}, { "epoch": 1.6172506738544474, "grad_norm": 0.6045594811439514, "learning_rate": 0.0005035337290879654, "loss": 3.6945, "step": 15000 }, { "epoch": 1.6172506738544474, "eval_accuracy": 0.3545141635015175, "eval_loss": 3.6467270851135254, "eval_runtime": 188.3924, "eval_samples_per_second": 95.604, "eval_steps_per_second": 5.977, "step": 15000 }, { "epoch": 1.6226415094339623, "grad_norm": 0.5498721599578857, "learning_rate": 0.000503209929843497, "loss": 3.6842, "step": 15050 }, { "epoch": 1.628032345013477, "grad_norm": 0.5301955342292786, "learning_rate": 0.0005028861305990286, "loss": 3.6793, "step": 15100 }, { "epoch": 1.633423180592992, "grad_norm": 0.5473493933677673, "learning_rate": 0.0005025623313545601, "loss": 3.6787, "step": 15150 }, { "epoch": 1.6388140161725069, "grad_norm": 0.5606446862220764, "learning_rate": 0.0005022385321100917, "loss": 3.6822, "step": 15200 }, { "epoch": 1.6442048517520216, "grad_norm": 0.5899948477745056, "learning_rate": 0.0005019147328656232, "loss": 3.6766, "step": 15250 }, { "epoch": 1.6495956873315363, "grad_norm": 0.5508028864860535, "learning_rate": 0.0005015909336211549, "loss": 3.6821, "step": 15300 }, { "epoch": 1.6549865229110512, "grad_norm": 0.540977954864502, "learning_rate": 0.0005012671343766864, "loss": 3.7041, "step": 15350 }, { "epoch": 1.6603773584905661, "grad_norm": 0.5751597881317139, "learning_rate": 0.000500943335132218, "loss": 3.695, "step": 15400 }, { "epoch": 1.6657681940700808, "grad_norm": 0.5815579891204834, "learning_rate": 0.0005006195358877495, "loss": 3.6627, "step": 15450 }, { "epoch": 1.6711590296495957, "grad_norm": 0.6284985542297363, "learning_rate": 0.0005002957366432812, "loss": 3.6786, "step": 15500 }, { "epoch": 1.6765498652291106, "grad_norm": 0.5556270480155945, "learning_rate": 0.0004999719373988127, "loss": 3.6775, "step": 15550 }, { "epoch": 1.6819407008086253, "grad_norm": 0.509715735912323, "learning_rate": 0.0004996481381543442, "loss": 3.6642, "step": 15600 }, { "epoch": 
1.68733153638814, "grad_norm": 0.5490276217460632, "learning_rate": 0.0004993243389098758, "loss": 3.6823, "step": 15650 }, { "epoch": 1.692722371967655, "grad_norm": 0.5493997931480408, "learning_rate": 0.0004990005396654074, "loss": 3.6706, "step": 15700 }, { "epoch": 1.6981132075471699, "grad_norm": 0.5856523513793945, "learning_rate": 0.000498676740420939, "loss": 3.6974, "step": 15750 }, { "epoch": 1.7035040431266846, "grad_norm": 0.5182335376739502, "learning_rate": 0.0004983529411764705, "loss": 3.6908, "step": 15800 }, { "epoch": 1.7088948787061995, "grad_norm": 0.5529457330703735, "learning_rate": 0.0004980291419320022, "loss": 3.6858, "step": 15850 }, { "epoch": 1.7142857142857144, "grad_norm": 0.5879248976707458, "learning_rate": 0.0004977053426875337, "loss": 3.6865, "step": 15900 }, { "epoch": 1.719676549865229, "grad_norm": 0.5359522700309753, "learning_rate": 0.0004973815434430653, "loss": 3.6724, "step": 15950 }, { "epoch": 1.7250673854447438, "grad_norm": 0.6166804432868958, "learning_rate": 0.0004970642201834862, "loss": 3.6696, "step": 16000 }, { "epoch": 1.7250673854447438, "eval_accuracy": 0.3563720192302573, "eval_loss": 3.6291186809539795, "eval_runtime": 187.9412, "eval_samples_per_second": 95.833, "eval_steps_per_second": 5.991, "step": 16000 }, { "epoch": 1.7304582210242587, "grad_norm": 0.5808438062667847, "learning_rate": 0.0004967404209390178, "loss": 3.6702, "step": 16050 }, { "epoch": 1.7358490566037736, "grad_norm": 0.5703431963920593, "learning_rate": 0.0004964166216945493, "loss": 3.658, "step": 16100 }, { "epoch": 1.7412398921832883, "grad_norm": 0.5545331239700317, "learning_rate": 0.000496092822450081, "loss": 3.68, "step": 16150 }, { "epoch": 1.7466307277628033, "grad_norm": 0.5666328072547913, "learning_rate": 0.0004957690232056125, "loss": 3.6912, "step": 16200 }, { "epoch": 1.7520215633423182, "grad_norm": 0.5478608012199402, "learning_rate": 0.0004954452239611441, "loss": 3.6673, "step": 16250 }, { "epoch": 
1.7574123989218329, "grad_norm": 0.5847337245941162, "learning_rate": 0.0004951214247166756, "loss": 3.6887, "step": 16300 }, { "epoch": 1.7628032345013476, "grad_norm": 0.6067107915878296, "learning_rate": 0.0004947976254722072, "loss": 3.6728, "step": 16350 }, { "epoch": 1.7681940700808625, "grad_norm": 0.5245669484138489, "learning_rate": 0.0004944738262277387, "loss": 3.6636, "step": 16400 }, { "epoch": 1.7735849056603774, "grad_norm": 0.5606017708778381, "learning_rate": 0.0004941500269832703, "loss": 3.6698, "step": 16450 }, { "epoch": 1.778975741239892, "grad_norm": 0.5453688502311707, "learning_rate": 0.0004938262277388019, "loss": 3.6553, "step": 16500 }, { "epoch": 1.784366576819407, "grad_norm": 0.6251377463340759, "learning_rate": 0.0004935024284943335, "loss": 3.684, "step": 16550 }, { "epoch": 1.789757412398922, "grad_norm": 0.6245383620262146, "learning_rate": 0.000493178629249865, "loss": 3.6669, "step": 16600 }, { "epoch": 1.7951482479784366, "grad_norm": 0.5404795408248901, "learning_rate": 0.0004928548300053966, "loss": 3.6722, "step": 16650 }, { "epoch": 1.8005390835579513, "grad_norm": 0.5152893662452698, "learning_rate": 0.0004925310307609282, "loss": 3.6736, "step": 16700 }, { "epoch": 1.8059299191374663, "grad_norm": 0.5853434801101685, "learning_rate": 0.0004922072315164598, "loss": 3.6629, "step": 16750 }, { "epoch": 1.8113207547169812, "grad_norm": 0.53773033618927, "learning_rate": 0.0004918834322719913, "loss": 3.6618, "step": 16800 }, { "epoch": 1.8167115902964959, "grad_norm": 0.6675792336463928, "learning_rate": 0.0004915596330275229, "loss": 3.6682, "step": 16850 }, { "epoch": 1.8221024258760108, "grad_norm": 0.5869772434234619, "learning_rate": 0.0004912423097679439, "loss": 3.6543, "step": 16900 }, { "epoch": 1.8274932614555257, "grad_norm": 0.5314184427261353, "learning_rate": 0.0004909185105234754, "loss": 3.6832, "step": 16950 }, { "epoch": 1.8328840970350404, "grad_norm": 0.5484595894813538, "learning_rate": 
0.000490594711279007, "loss": 3.6703, "step": 17000 }, { "epoch": 1.8328840970350404, "eval_accuracy": 0.35783242269537174, "eval_loss": 3.615173816680908, "eval_runtime": 187.71, "eval_samples_per_second": 95.951, "eval_steps_per_second": 5.999, "step": 17000 }, { "epoch": 1.838274932614555, "grad_norm": 0.5312723517417908, "learning_rate": 0.0004902709120345385, "loss": 3.6526, "step": 17050 }, { "epoch": 1.8436657681940702, "grad_norm": 0.5201370716094971, "learning_rate": 0.00048994711279007, "loss": 3.6655, "step": 17100 }, { "epoch": 1.849056603773585, "grad_norm": 0.541175127029419, "learning_rate": 0.0004896233135456017, "loss": 3.6618, "step": 17150 }, { "epoch": 1.8544474393530996, "grad_norm": 0.5954682230949402, "learning_rate": 0.0004892995143011333, "loss": 3.6487, "step": 17200 }, { "epoch": 1.8598382749326146, "grad_norm": 0.6293191909790039, "learning_rate": 0.0004889757150566648, "loss": 3.6673, "step": 17250 }, { "epoch": 1.8652291105121295, "grad_norm": 0.593071460723877, "learning_rate": 0.0004886519158121964, "loss": 3.6598, "step": 17300 }, { "epoch": 1.8706199460916442, "grad_norm": 0.6929914951324463, "learning_rate": 0.000488328116567728, "loss": 3.6619, "step": 17350 }, { "epoch": 1.8760107816711589, "grad_norm": 0.5749919414520264, "learning_rate": 0.0004880043173232595, "loss": 3.6562, "step": 17400 }, { "epoch": 1.881401617250674, "grad_norm": 0.5454095602035522, "learning_rate": 0.0004876805180787911, "loss": 3.6735, "step": 17450 }, { "epoch": 1.8867924528301887, "grad_norm": 0.5522634387016296, "learning_rate": 0.0004873567188343227, "loss": 3.6662, "step": 17500 }, { "epoch": 1.8921832884097034, "grad_norm": 0.5520308017730713, "learning_rate": 0.0004870329195898542, "loss": 3.6464, "step": 17550 }, { "epoch": 1.8975741239892183, "grad_norm": 0.596127986907959, "learning_rate": 0.00048670912034538583, "loss": 3.6552, "step": 17600 }, { "epoch": 1.9029649595687332, "grad_norm": 0.5734618306159973, "learning_rate": 
0.0004863853211009174, "loss": 3.6327, "step": 17650 }, { "epoch": 1.908355795148248, "grad_norm": 0.5969244837760925, "learning_rate": 0.000486061521856449, "loss": 3.6419, "step": 17700 }, { "epoch": 1.9137466307277629, "grad_norm": 0.5883053541183472, "learning_rate": 0.00048573772261198054, "loss": 3.6462, "step": 17750 }, { "epoch": 1.9191374663072778, "grad_norm": 0.544793426990509, "learning_rate": 0.00048541392336751214, "loss": 3.664, "step": 17800 }, { "epoch": 1.9245283018867925, "grad_norm": 0.5943015813827515, "learning_rate": 0.0004850901241230437, "loss": 3.665, "step": 17850 }, { "epoch": 1.9299191374663072, "grad_norm": 0.5140420198440552, "learning_rate": 0.0004847663248785753, "loss": 3.6708, "step": 17900 }, { "epoch": 1.935309973045822, "grad_norm": 0.5902069807052612, "learning_rate": 0.0004844425256341068, "loss": 3.6597, "step": 17950 }, { "epoch": 1.940700808625337, "grad_norm": 0.6027345657348633, "learning_rate": 0.00048411872638963834, "loss": 3.6467, "step": 18000 }, { "epoch": 1.940700808625337, "eval_accuracy": 0.3591877588179696, "eval_loss": 3.598599910736084, "eval_runtime": 187.872, "eval_samples_per_second": 95.868, "eval_steps_per_second": 5.993, "step": 18000 }, { "epoch": 1.9460916442048517, "grad_norm": 0.5818517208099365, "learning_rate": 0.00048379492714516995, "loss": 3.6332, "step": 18050 }, { "epoch": 1.9514824797843666, "grad_norm": 0.6805805563926697, "learning_rate": 0.0004834711279007015, "loss": 3.6435, "step": 18100 }, { "epoch": 1.9568733153638815, "grad_norm": 0.5603196620941162, "learning_rate": 0.0004831473286562331, "loss": 3.6487, "step": 18150 }, { "epoch": 1.9622641509433962, "grad_norm": 0.5356538891792297, "learning_rate": 0.00048282352941176465, "loss": 3.6499, "step": 18200 }, { "epoch": 1.967654986522911, "grad_norm": 0.5439290404319763, "learning_rate": 0.00048249973016729626, "loss": 3.6414, "step": 18250 }, { "epoch": 1.9730458221024259, "grad_norm": 0.5327361226081848, "learning_rate": 
0.0004821759309228278, "loss": 3.6649, "step": 18300 }, { "epoch": 1.9784366576819408, "grad_norm": 0.5530686378479004, "learning_rate": 0.00048185213167835936, "loss": 3.6452, "step": 18350 }, { "epoch": 1.9838274932614555, "grad_norm": 0.6126965284347534, "learning_rate": 0.00048152833243389096, "loss": 3.6502, "step": 18400 }, { "epoch": 1.9892183288409704, "grad_norm": 0.5579380989074707, "learning_rate": 0.0004812045331894225, "loss": 3.6557, "step": 18450 }, { "epoch": 1.9946091644204853, "grad_norm": 0.5861890912055969, "learning_rate": 0.0004808807339449541, "loss": 3.6357, "step": 18500 }, { "epoch": 2.0, "grad_norm": 1.1135197877883911, "learning_rate": 0.0004805569347004856, "loss": 3.6403, "step": 18550 }, { "epoch": 2.0053908355795147, "grad_norm": 0.5583805441856384, "learning_rate": 0.0004802331354560173, "loss": 3.536, "step": 18600 }, { "epoch": 2.01078167115903, "grad_norm": 0.5247920155525208, "learning_rate": 0.00047990933621154877, "loss": 3.5479, "step": 18650 }, { "epoch": 2.0161725067385445, "grad_norm": 0.5629153847694397, "learning_rate": 0.0004795855369670804, "loss": 3.5451, "step": 18700 }, { "epoch": 2.0215633423180592, "grad_norm": 0.6046398282051086, "learning_rate": 0.0004792617377226119, "loss": 3.5519, "step": 18750 }, { "epoch": 2.026954177897574, "grad_norm": 0.5622299313545227, "learning_rate": 0.0004789379384781435, "loss": 3.574, "step": 18800 }, { "epoch": 2.032345013477089, "grad_norm": 0.5278229117393494, "learning_rate": 0.0004786141392336751, "loss": 3.5721, "step": 18850 }, { "epoch": 2.0377358490566038, "grad_norm": 0.5192546248435974, "learning_rate": 0.00047829033998920663, "loss": 3.5541, "step": 18900 }, { "epoch": 2.0431266846361185, "grad_norm": 0.5911169052124023, "learning_rate": 0.00047796654074473824, "loss": 3.5595, "step": 18950 }, { "epoch": 2.0485175202156336, "grad_norm": 0.5658133625984192, "learning_rate": 0.0004776427415002698, "loss": 3.5905, "step": 19000 }, { "epoch": 2.0485175202156336, 
"eval_accuracy": 0.36072813080851546, "eval_loss": 3.588514566421509, "eval_runtime": 187.5876, "eval_samples_per_second": 96.014, "eval_steps_per_second": 6.003, "step": 19000 }, { "epoch": 2.0539083557951483, "grad_norm": 0.5861156582832336, "learning_rate": 0.0004773189422558014, "loss": 3.5654, "step": 19050 }, { "epoch": 2.059299191374663, "grad_norm": 0.5876423716545105, "learning_rate": 0.00047699514301133294, "loss": 3.559, "step": 19100 }, { "epoch": 2.0646900269541777, "grad_norm": 0.6080085635185242, "learning_rate": 0.00047667134376686455, "loss": 3.573, "step": 19150 }, { "epoch": 2.070080862533693, "grad_norm": 0.6167579293251038, "learning_rate": 0.0004763475445223961, "loss": 3.585, "step": 19200 }, { "epoch": 2.0754716981132075, "grad_norm": 0.5479595065116882, "learning_rate": 0.0004760237452779276, "loss": 3.568, "step": 19250 }, { "epoch": 2.0808625336927222, "grad_norm": 0.6446500420570374, "learning_rate": 0.0004756999460334592, "loss": 3.5579, "step": 19300 }, { "epoch": 2.0862533692722374, "grad_norm": 0.6173583269119263, "learning_rate": 0.00047537614678899075, "loss": 3.5745, "step": 19350 }, { "epoch": 2.091644204851752, "grad_norm": 0.5377396941184998, "learning_rate": 0.00047505234754452235, "loss": 3.5535, "step": 19400 }, { "epoch": 2.0970350404312668, "grad_norm": 0.5425545573234558, "learning_rate": 0.0004747285483000539, "loss": 3.5692, "step": 19450 }, { "epoch": 2.1024258760107815, "grad_norm": 0.6151023507118225, "learning_rate": 0.0004744047490555855, "loss": 3.5695, "step": 19500 }, { "epoch": 2.1078167115902966, "grad_norm": 0.6019276976585388, "learning_rate": 0.00047408094981111706, "loss": 3.5584, "step": 19550 }, { "epoch": 2.1132075471698113, "grad_norm": 0.5593970417976379, "learning_rate": 0.00047375715056664866, "loss": 3.5596, "step": 19600 }, { "epoch": 2.118598382749326, "grad_norm": 0.6009968519210815, "learning_rate": 0.0004734333513221802, "loss": 3.567, "step": 19650 }, { "epoch": 2.123989218328841, 
"grad_norm": 0.5649537444114685, "learning_rate": 0.00047310955207771177, "loss": 3.5717, "step": 19700 }, { "epoch": 2.129380053908356, "grad_norm": 0.5351967215538025, "learning_rate": 0.00047278575283324337, "loss": 3.5785, "step": 19750 }, { "epoch": 2.1347708894878705, "grad_norm": 0.5678059458732605, "learning_rate": 0.0004724619535887749, "loss": 3.5599, "step": 19800 }, { "epoch": 2.1401617250673857, "grad_norm": 0.5407218933105469, "learning_rate": 0.0004721381543443065, "loss": 3.5872, "step": 19850 }, { "epoch": 2.1455525606469004, "grad_norm": 0.5732417106628418, "learning_rate": 0.000471814355099838, "loss": 3.5882, "step": 19900 }, { "epoch": 2.150943396226415, "grad_norm": 0.6302648186683655, "learning_rate": 0.0004714905558553697, "loss": 3.5841, "step": 19950 }, { "epoch": 2.1563342318059298, "grad_norm": 0.5343143939971924, "learning_rate": 0.0004711667566109012, "loss": 3.5686, "step": 20000 }, { "epoch": 2.1563342318059298, "eval_accuracy": 0.36169535881584, "eval_loss": 3.582411050796509, "eval_runtime": 187.7427, "eval_samples_per_second": 95.934, "eval_steps_per_second": 5.998, "step": 20000 }, { "epoch": 2.161725067385445, "grad_norm": 0.5989320278167725, "learning_rate": 0.00047084295736643273, "loss": 3.5688, "step": 20050 }, { "epoch": 2.1671159029649596, "grad_norm": 0.5695555806159973, "learning_rate": 0.00047051915812196433, "loss": 3.5748, "step": 20100 }, { "epoch": 2.1725067385444743, "grad_norm": 0.5576919317245483, "learning_rate": 0.0004701953588774959, "loss": 3.5674, "step": 20150 }, { "epoch": 2.177897574123989, "grad_norm": 0.5610904097557068, "learning_rate": 0.0004698715596330275, "loss": 3.5857, "step": 20200 }, { "epoch": 2.183288409703504, "grad_norm": 0.5354993343353271, "learning_rate": 0.00046954776038855904, "loss": 3.5707, "step": 20250 }, { "epoch": 2.188679245283019, "grad_norm": 0.5753160715103149, "learning_rate": 0.00046922396114409064, "loss": 3.5748, "step": 20300 }, { "epoch": 2.1940700808625335, 
"grad_norm": 0.6492990851402283, "learning_rate": 0.0004689001618996222, "loss": 3.5586, "step": 20350 }, { "epoch": 2.1994609164420487, "grad_norm": 0.5679466128349304, "learning_rate": 0.0004685763626551538, "loss": 3.5652, "step": 20400 }, { "epoch": 2.2048517520215634, "grad_norm": 0.5694677233695984, "learning_rate": 0.00046825256341068535, "loss": 3.5668, "step": 20450 }, { "epoch": 2.210242587601078, "grad_norm": 0.5879806280136108, "learning_rate": 0.0004679287641662169, "loss": 3.5777, "step": 20500 }, { "epoch": 2.215633423180593, "grad_norm": 0.5413506627082825, "learning_rate": 0.0004676049649217485, "loss": 3.5756, "step": 20550 }, { "epoch": 2.221024258760108, "grad_norm": 0.5593597888946533, "learning_rate": 0.00046728116567728, "loss": 3.5686, "step": 20600 }, { "epoch": 2.2264150943396226, "grad_norm": 0.5434522032737732, "learning_rate": 0.0004669573664328116, "loss": 3.5702, "step": 20650 }, { "epoch": 2.2318059299191373, "grad_norm": 0.6712062954902649, "learning_rate": 0.00046663356718834316, "loss": 3.5654, "step": 20700 }, { "epoch": 2.2371967654986524, "grad_norm": 0.5769302248954773, "learning_rate": 0.00046630976794387476, "loss": 3.5653, "step": 20750 }, { "epoch": 2.242587601078167, "grad_norm": 0.599884569644928, "learning_rate": 0.0004659859686994063, "loss": 3.5716, "step": 20800 }, { "epoch": 2.247978436657682, "grad_norm": 0.6358910202980042, "learning_rate": 0.0004656621694549379, "loss": 3.567, "step": 20850 }, { "epoch": 2.2533692722371965, "grad_norm": 0.5754926800727844, "learning_rate": 0.0004653448461953588, "loss": 3.5622, "step": 20900 }, { "epoch": 2.2587601078167117, "grad_norm": 0.5771598219871521, "learning_rate": 0.0004650210469508904, "loss": 3.572, "step": 20950 }, { "epoch": 2.2641509433962264, "grad_norm": 0.6123298406600952, "learning_rate": 0.00046469724770642197, "loss": 3.5546, "step": 21000 }, { "epoch": 2.2641509433962264, "eval_accuracy": 0.3627602657693097, "eval_loss": 3.571512460708618, "eval_runtime": 
187.3542, "eval_samples_per_second": 96.133, "eval_steps_per_second": 6.01, "step": 21000 }, { "epoch": 2.269541778975741, "grad_norm": 0.5739647746086121, "learning_rate": 0.0004643734484619536, "loss": 3.5696, "step": 21050 }, { "epoch": 2.274932614555256, "grad_norm": 0.5520042777061462, "learning_rate": 0.0004640496492174851, "loss": 3.5743, "step": 21100 }, { "epoch": 2.280323450134771, "grad_norm": 0.5906243324279785, "learning_rate": 0.00046372584997301673, "loss": 3.5845, "step": 21150 }, { "epoch": 2.2857142857142856, "grad_norm": 0.5810971856117249, "learning_rate": 0.0004634020507285483, "loss": 3.5379, "step": 21200 }, { "epoch": 2.2911051212938007, "grad_norm": 0.565323531627655, "learning_rate": 0.0004630782514840798, "loss": 3.565, "step": 21250 }, { "epoch": 2.2964959568733154, "grad_norm": 0.5445725321769714, "learning_rate": 0.0004627544522396114, "loss": 3.5676, "step": 21300 }, { "epoch": 2.30188679245283, "grad_norm": 0.5527558326721191, "learning_rate": 0.00046243065299514293, "loss": 3.5603, "step": 21350 }, { "epoch": 2.3072776280323453, "grad_norm": 0.560587465763092, "learning_rate": 0.00046210685375067454, "loss": 3.5752, "step": 21400 }, { "epoch": 2.31266846361186, "grad_norm": 0.5325582027435303, "learning_rate": 0.0004617830545062061, "loss": 3.5719, "step": 21450 }, { "epoch": 2.3180592991913747, "grad_norm": 0.5584622621536255, "learning_rate": 0.0004614592552617377, "loss": 3.5749, "step": 21500 }, { "epoch": 2.3234501347708894, "grad_norm": 0.5946611166000366, "learning_rate": 0.00046113545601726924, "loss": 3.5561, "step": 21550 }, { "epoch": 2.3288409703504045, "grad_norm": 0.5582345128059387, "learning_rate": 0.00046081165677280085, "loss": 3.5775, "step": 21600 }, { "epoch": 2.334231805929919, "grad_norm": 0.5604034066200256, "learning_rate": 0.0004604878575283324, "loss": 3.573, "step": 21650 }, { "epoch": 2.339622641509434, "grad_norm": 0.5571855902671814, "learning_rate": 0.00046016405828386395, "loss": 3.5513, "step": 
21700 }, { "epoch": 2.3450134770889486, "grad_norm": 0.6498404145240784, "learning_rate": 0.00045984025903939555, "loss": 3.5687, "step": 21750 }, { "epoch": 2.3504043126684637, "grad_norm": 0.5699803829193115, "learning_rate": 0.0004595164597949271, "loss": 3.5887, "step": 21800 }, { "epoch": 2.3557951482479784, "grad_norm": 0.6102402806282043, "learning_rate": 0.0004591926605504587, "loss": 3.5741, "step": 21850 }, { "epoch": 2.361185983827493, "grad_norm": 0.6587836146354675, "learning_rate": 0.0004588688613059902, "loss": 3.5529, "step": 21900 }, { "epoch": 2.3665768194070083, "grad_norm": 0.6018949151039124, "learning_rate": 0.00045854506206152186, "loss": 3.5676, "step": 21950 }, { "epoch": 2.371967654986523, "grad_norm": 0.5756721496582031, "learning_rate": 0.00045822126281705336, "loss": 3.5576, "step": 22000 }, { "epoch": 2.371967654986523, "eval_accuracy": 0.363854726308265, "eval_loss": 3.5607616901397705, "eval_runtime": 187.5094, "eval_samples_per_second": 96.054, "eval_steps_per_second": 6.005, "step": 22000 }, { "epoch": 2.3773584905660377, "grad_norm": 0.5994178056716919, "learning_rate": 0.00045789746357258497, "loss": 3.5606, "step": 22050 }, { "epoch": 2.382749326145553, "grad_norm": 0.5385029911994934, "learning_rate": 0.0004575736643281165, "loss": 3.573, "step": 22100 }, { "epoch": 2.3881401617250675, "grad_norm": 0.5386703610420227, "learning_rate": 0.00045724986508364807, "loss": 3.576, "step": 22150 }, { "epoch": 2.393530997304582, "grad_norm": 0.5755857825279236, "learning_rate": 0.00045692606583917967, "loss": 3.5603, "step": 22200 }, { "epoch": 2.398921832884097, "grad_norm": 0.554608166217804, "learning_rate": 0.0004566022665947112, "loss": 3.5675, "step": 22250 }, { "epoch": 2.404312668463612, "grad_norm": 0.6008310317993164, "learning_rate": 0.0004562784673502428, "loss": 3.5556, "step": 22300 }, { "epoch": 2.4097035040431267, "grad_norm": 0.5571367144584656, "learning_rate": 0.0004559546681057744, "loss": 3.5784, "step": 22350 }, { 
"epoch": 2.4150943396226414, "grad_norm": 0.5885669589042664, "learning_rate": 0.000455630868861306, "loss": 3.5604, "step": 22400 }, { "epoch": 2.420485175202156, "grad_norm": 0.574574887752533, "learning_rate": 0.00045530706961683753, "loss": 3.563, "step": 22450 }, { "epoch": 2.4258760107816713, "grad_norm": 0.5961259603500366, "learning_rate": 0.00045498327037236914, "loss": 3.5716, "step": 22500 }, { "epoch": 2.431266846361186, "grad_norm": 0.5879604816436768, "learning_rate": 0.0004546594711279007, "loss": 3.5629, "step": 22550 }, { "epoch": 2.4366576819407006, "grad_norm": 0.6066700220108032, "learning_rate": 0.0004543356718834322, "loss": 3.5943, "step": 22600 }, { "epoch": 2.442048517520216, "grad_norm": 0.5698487758636475, "learning_rate": 0.0004540118726389638, "loss": 3.5546, "step": 22650 }, { "epoch": 2.4474393530997305, "grad_norm": 0.5691965818405151, "learning_rate": 0.00045368807339449534, "loss": 3.5457, "step": 22700 }, { "epoch": 2.452830188679245, "grad_norm": 0.5681477785110474, "learning_rate": 0.00045336427415002694, "loss": 3.5509, "step": 22750 }, { "epoch": 2.4582210242587603, "grad_norm": 0.5589949488639832, "learning_rate": 0.0004530404749055585, "loss": 3.5605, "step": 22800 }, { "epoch": 2.463611859838275, "grad_norm": 0.5612940192222595, "learning_rate": 0.0004527166756610901, "loss": 3.5464, "step": 22850 }, { "epoch": 2.4690026954177897, "grad_norm": 0.5747497081756592, "learning_rate": 0.00045239287641662165, "loss": 3.5627, "step": 22900 }, { "epoch": 2.4743935309973044, "grad_norm": 0.5754920840263367, "learning_rate": 0.0004520690771721532, "loss": 3.5591, "step": 22950 }, { "epoch": 2.4797843665768196, "grad_norm": 0.5711870193481445, "learning_rate": 0.0004517452779276848, "loss": 3.5526, "step": 23000 }, { "epoch": 2.4797843665768196, "eval_accuracy": 0.3650906529071547, "eval_loss": 3.546846389770508, "eval_runtime": 188.1187, "eval_samples_per_second": 95.743, "eval_steps_per_second": 5.986, "step": 23000 }, { "epoch": 
2.4851752021563343, "grad_norm": 0.5261489152908325, "learning_rate": 0.00045142147868321636, "loss": 3.5626, "step": 23050 }, { "epoch": 2.490566037735849, "grad_norm": 0.6213911771774292, "learning_rate": 0.0004511041554236373, "loss": 3.566, "step": 23100 }, { "epoch": 2.4959568733153636, "grad_norm": 0.5985024571418762, "learning_rate": 0.0004507803561791689, "loss": 3.5628, "step": 23150 }, { "epoch": 2.501347708894879, "grad_norm": 0.576748251914978, "learning_rate": 0.00045045655693470046, "loss": 3.5601, "step": 23200 }, { "epoch": 2.5067385444743935, "grad_norm": 0.6227774024009705, "learning_rate": 0.00045013275769023207, "loss": 3.5594, "step": 23250 }, { "epoch": 2.512129380053908, "grad_norm": 0.5366291403770447, "learning_rate": 0.00044980895844576356, "loss": 3.5443, "step": 23300 }, { "epoch": 2.5175202156334233, "grad_norm": 0.6307509541511536, "learning_rate": 0.0004494851592012951, "loss": 3.5499, "step": 23350 }, { "epoch": 2.522911051212938, "grad_norm": 0.5795412063598633, "learning_rate": 0.0004491613599568267, "loss": 3.5467, "step": 23400 }, { "epoch": 2.5283018867924527, "grad_norm": 0.5431268215179443, "learning_rate": 0.00044883756071235827, "loss": 3.5585, "step": 23450 }, { "epoch": 2.533692722371968, "grad_norm": 0.5200484395027161, "learning_rate": 0.0004485137614678899, "loss": 3.5626, "step": 23500 }, { "epoch": 2.5390835579514826, "grad_norm": 0.5646335482597351, "learning_rate": 0.00044819643820831083, "loss": 3.5698, "step": 23550 }, { "epoch": 2.5444743935309972, "grad_norm": 0.5931826829910278, "learning_rate": 0.0004478726389638424, "loss": 3.5731, "step": 23600 }, { "epoch": 2.5498652291105124, "grad_norm": 0.5869390964508057, "learning_rate": 0.00044754883971937393, "loss": 3.5585, "step": 23650 }, { "epoch": 2.555256064690027, "grad_norm": 0.5838721394538879, "learning_rate": 0.00044722504047490553, "loss": 3.5635, "step": 23700 }, { "epoch": 2.560646900269542, "grad_norm": 0.5717689394950867, "learning_rate": 
0.0004469012412304371, "loss": 3.5562, "step": 23750 }, { "epoch": 2.5660377358490565, "grad_norm": 0.588871419429779, "learning_rate": 0.0004465774419859687, "loss": 3.5684, "step": 23800 }, { "epoch": 2.571428571428571, "grad_norm": 0.6461533904075623, "learning_rate": 0.00044625364274150024, "loss": 3.5753, "step": 23850 }, { "epoch": 2.5768194070080863, "grad_norm": 0.5436891913414001, "learning_rate": 0.00044592984349703184, "loss": 3.5734, "step": 23900 }, { "epoch": 2.582210242587601, "grad_norm": 0.625299334526062, "learning_rate": 0.00044560604425256334, "loss": 3.5485, "step": 23950 }, { "epoch": 2.5876010781671157, "grad_norm": 0.5579541325569153, "learning_rate": 0.000445282245008095, "loss": 3.5451, "step": 24000 }, { "epoch": 2.5876010781671157, "eval_accuracy": 0.36589175064901086, "eval_loss": 3.5402700901031494, "eval_runtime": 187.5335, "eval_samples_per_second": 96.042, "eval_steps_per_second": 6.004, "step": 24000 }, { "epoch": 2.592991913746631, "grad_norm": 0.6185333132743835, "learning_rate": 0.0004449584457636265, "loss": 3.5467, "step": 24050 }, { "epoch": 2.5983827493261455, "grad_norm": 0.5775592923164368, "learning_rate": 0.00044463464651915805, "loss": 3.5458, "step": 24100 }, { "epoch": 2.6037735849056602, "grad_norm": 0.5688097476959229, "learning_rate": 0.00044431084727468965, "loss": 3.55, "step": 24150 }, { "epoch": 2.6091644204851754, "grad_norm": 0.6222105026245117, "learning_rate": 0.0004439870480302212, "loss": 3.565, "step": 24200 }, { "epoch": 2.61455525606469, "grad_norm": 0.5622762441635132, "learning_rate": 0.0004436632487857528, "loss": 3.5693, "step": 24250 }, { "epoch": 2.6199460916442048, "grad_norm": 0.6001819968223572, "learning_rate": 0.00044333944954128436, "loss": 3.5543, "step": 24300 }, { "epoch": 2.62533692722372, "grad_norm": 0.6119269728660583, "learning_rate": 0.00044301565029681596, "loss": 3.5469, "step": 24350 }, { "epoch": 2.6307277628032346, "grad_norm": 0.5447987914085388, "learning_rate": 
0.0004426918510523475, "loss": 3.5599, "step": 24400 }, { "epoch": 2.6361185983827493, "grad_norm": 0.5981472134590149, "learning_rate": 0.0004423680518078791, "loss": 3.5324, "step": 24450 }, { "epoch": 2.641509433962264, "grad_norm": 0.6249266266822815, "learning_rate": 0.00044204425256341067, "loss": 3.5395, "step": 24500 }, { "epoch": 2.6469002695417787, "grad_norm": 0.6205341815948486, "learning_rate": 0.0004417204533189422, "loss": 3.5569, "step": 24550 }, { "epoch": 2.652291105121294, "grad_norm": 0.609792172908783, "learning_rate": 0.0004413966540744738, "loss": 3.5346, "step": 24600 }, { "epoch": 2.6576819407008085, "grad_norm": 0.5270373821258545, "learning_rate": 0.0004410728548300053, "loss": 3.5518, "step": 24650 }, { "epoch": 2.6630727762803232, "grad_norm": 0.5638793706893921, "learning_rate": 0.0004407490555855369, "loss": 3.5521, "step": 24700 }, { "epoch": 2.6684636118598384, "grad_norm": 0.6222891211509705, "learning_rate": 0.0004404252563410685, "loss": 3.5552, "step": 24750 }, { "epoch": 2.673854447439353, "grad_norm": 0.5870048999786377, "learning_rate": 0.0004401014570966001, "loss": 3.5535, "step": 24800 }, { "epoch": 2.6792452830188678, "grad_norm": 0.5811405181884766, "learning_rate": 0.00043977765785213163, "loss": 3.5552, "step": 24850 }, { "epoch": 2.684636118598383, "grad_norm": 0.6036756038665771, "learning_rate": 0.0004394538586076632, "loss": 3.5722, "step": 24900 }, { "epoch": 2.6900269541778976, "grad_norm": 0.6059266328811646, "learning_rate": 0.0004391300593631948, "loss": 3.561, "step": 24950 }, { "epoch": 2.6954177897574123, "grad_norm": 0.559899091720581, "learning_rate": 0.00043880626011872634, "loss": 3.555, "step": 25000 }, { "epoch": 2.6954177897574123, "eval_accuracy": 0.36689939753060236, "eval_loss": 3.527742385864258, "eval_runtime": 187.8058, "eval_samples_per_second": 95.902, "eval_steps_per_second": 5.996, "step": 25000 }, { "epoch": 2.7008086253369274, "grad_norm": 0.5817413330078125, "learning_rate": 
0.00043848246087425794, "loss": 3.5455, "step": 25050 }, { "epoch": 2.706199460916442, "grad_norm": 0.5311758518218994, "learning_rate": 0.0004381586616297895, "loss": 3.5554, "step": 25100 }, { "epoch": 2.711590296495957, "grad_norm": 0.6347346305847168, "learning_rate": 0.0004378348623853211, "loss": 3.5307, "step": 25150 }, { "epoch": 2.7169811320754715, "grad_norm": 0.6582555770874023, "learning_rate": 0.00043751106314085265, "loss": 3.552, "step": 25200 }, { "epoch": 2.7223719676549867, "grad_norm": 0.6206474304199219, "learning_rate": 0.00043718726389638425, "loss": 3.5568, "step": 25250 }, { "epoch": 2.7277628032345014, "grad_norm": 0.5440530776977539, "learning_rate": 0.00043686346465191575, "loss": 3.5593, "step": 25300 }, { "epoch": 2.733153638814016, "grad_norm": 0.5995965600013733, "learning_rate": 0.0004365396654074473, "loss": 3.5496, "step": 25350 }, { "epoch": 2.7385444743935308, "grad_norm": 0.7029544115066528, "learning_rate": 0.0004362158661629789, "loss": 3.5409, "step": 25400 }, { "epoch": 2.743935309973046, "grad_norm": 0.5936011075973511, "learning_rate": 0.00043589206691851045, "loss": 3.5428, "step": 25450 }, { "epoch": 2.7493261455525606, "grad_norm": 0.5993081331253052, "learning_rate": 0.00043556826767404206, "loss": 3.5572, "step": 25500 }, { "epoch": 2.7547169811320753, "grad_norm": 0.5754674077033997, "learning_rate": 0.0004352444684295736, "loss": 3.5503, "step": 25550 }, { "epoch": 2.7601078167115904, "grad_norm": 0.5702727437019348, "learning_rate": 0.0004349206691851052, "loss": 3.5517, "step": 25600 }, { "epoch": 2.765498652291105, "grad_norm": 0.597646951675415, "learning_rate": 0.00043459686994063676, "loss": 3.544, "step": 25650 }, { "epoch": 2.77088948787062, "grad_norm": 0.6668535470962524, "learning_rate": 0.00043427307069616837, "loss": 3.5493, "step": 25700 }, { "epoch": 2.776280323450135, "grad_norm": 0.564940333366394, "learning_rate": 0.0004339492714516999, "loss": 3.5378, "step": 25750 }, { "epoch": 
2.7816711590296497, "grad_norm": 0.5661826729774475, "learning_rate": 0.00043362547220723147, "loss": 3.5621, "step": 25800 }, { "epoch": 2.7870619946091644, "grad_norm": 0.5790637135505676, "learning_rate": 0.0004333016729627631, "loss": 3.5346, "step": 25850 }, { "epoch": 2.7924528301886795, "grad_norm": 0.5834683775901794, "learning_rate": 0.0004329778737182946, "loss": 3.5441, "step": 25900 }, { "epoch": 2.797843665768194, "grad_norm": 0.6243922114372253, "learning_rate": 0.00043265407447382623, "loss": 3.5435, "step": 25950 }, { "epoch": 2.803234501347709, "grad_norm": 0.5399969816207886, "learning_rate": 0.0004323302752293577, "loss": 3.5566, "step": 26000 }, { "epoch": 2.803234501347709, "eval_accuracy": 0.3679905984829232, "eval_loss": 3.520369529724121, "eval_runtime": 187.7249, "eval_samples_per_second": 95.944, "eval_steps_per_second": 5.998, "step": 26000 }, { "epoch": 2.8086253369272236, "grad_norm": 0.6081315279006958, "learning_rate": 0.00043200647598488933, "loss": 3.5645, "step": 26050 }, { "epoch": 2.8140161725067383, "grad_norm": 0.6176601648330688, "learning_rate": 0.0004316826767404209, "loss": 3.5486, "step": 26100 }, { "epoch": 2.8194070080862534, "grad_norm": 0.6038433313369751, "learning_rate": 0.0004313588774959525, "loss": 3.542, "step": 26150 }, { "epoch": 2.824797843665768, "grad_norm": 0.5484240055084229, "learning_rate": 0.00043103507825148404, "loss": 3.5549, "step": 26200 }, { "epoch": 2.830188679245283, "grad_norm": 0.5879635214805603, "learning_rate": 0.0004307112790070156, "loss": 3.5369, "step": 26250 }, { "epoch": 2.835579514824798, "grad_norm": 0.6630586981773376, "learning_rate": 0.0004303874797625472, "loss": 3.5354, "step": 26300 }, { "epoch": 2.8409703504043127, "grad_norm": 0.6347656846046448, "learning_rate": 0.00043006368051807874, "loss": 3.5574, "step": 26350 }, { "epoch": 2.8463611859838274, "grad_norm": 0.5696792006492615, "learning_rate": 0.00042973988127361035, "loss": 3.531, "step": 26400 }, { "epoch": 
2.8517520215633425, "grad_norm": 0.5842821598052979, "learning_rate": 0.0004294160820291419, "loss": 3.5532, "step": 26450 }, { "epoch": 2.857142857142857, "grad_norm": 0.5837120413780212, "learning_rate": 0.0004290922827846735, "loss": 3.5602, "step": 26500 }, { "epoch": 2.862533692722372, "grad_norm": 0.652936577796936, "learning_rate": 0.00042876848354020505, "loss": 3.5401, "step": 26550 }, { "epoch": 2.867924528301887, "grad_norm": 0.5488174557685852, "learning_rate": 0.00042844468429573655, "loss": 3.5421, "step": 26600 }, { "epoch": 2.8733153638814017, "grad_norm": 0.5535524487495422, "learning_rate": 0.00042812088505126815, "loss": 3.5271, "step": 26650 }, { "epoch": 2.8787061994609164, "grad_norm": 0.6259840130805969, "learning_rate": 0.0004277970858067997, "loss": 3.5667, "step": 26700 }, { "epoch": 2.884097035040431, "grad_norm": 0.626735270023346, "learning_rate": 0.0004274732865623313, "loss": 3.5335, "step": 26750 }, { "epoch": 2.889487870619946, "grad_norm": 0.565473198890686, "learning_rate": 0.00042714948731786286, "loss": 3.5181, "step": 26800 }, { "epoch": 2.894878706199461, "grad_norm": 0.6019836664199829, "learning_rate": 0.00042682568807339447, "loss": 3.5218, "step": 26850 }, { "epoch": 2.9002695417789757, "grad_norm": 0.5418981909751892, "learning_rate": 0.000426501888828926, "loss": 3.5398, "step": 26900 }, { "epoch": 2.9056603773584904, "grad_norm": 0.6377326250076294, "learning_rate": 0.0004261780895844576, "loss": 3.5597, "step": 26950 }, { "epoch": 2.9110512129380055, "grad_norm": 0.5947117805480957, "learning_rate": 0.00042585429033998917, "loss": 3.5438, "step": 27000 }, { "epoch": 2.9110512129380055, "eval_accuracy": 0.3689144739880097, "eval_loss": 3.5099987983703613, "eval_runtime": 187.8098, "eval_samples_per_second": 95.9, "eval_steps_per_second": 5.995, "step": 27000 }, { "epoch": 2.91644204851752, "grad_norm": 0.6234594583511353, "learning_rate": 0.0004255304910955207, "loss": 3.5303, "step": 27050 }, { "epoch": 
2.921832884097035, "grad_norm": 0.5786582827568054, "learning_rate": 0.0004252066918510523, "loss": 3.5387, "step": 27100 }, { "epoch": 2.92722371967655, "grad_norm": 0.6039722561836243, "learning_rate": 0.0004248828926065839, "loss": 3.5334, "step": 27150 }, { "epoch": 2.9326145552560647, "grad_norm": 0.608992338180542, "learning_rate": 0.0004245590933621155, "loss": 3.544, "step": 27200 }, { "epoch": 2.9380053908355794, "grad_norm": 0.5596150755882263, "learning_rate": 0.00042423529411764703, "loss": 3.5481, "step": 27250 }, { "epoch": 2.9433962264150946, "grad_norm": 0.5880177617073059, "learning_rate": 0.00042391149487317864, "loss": 3.5136, "step": 27300 }, { "epoch": 2.9487870619946093, "grad_norm": 0.5926826596260071, "learning_rate": 0.00042358769562871013, "loss": 3.5327, "step": 27350 }, { "epoch": 2.954177897574124, "grad_norm": 0.5690999031066895, "learning_rate": 0.00042326389638424174, "loss": 3.5271, "step": 27400 }, { "epoch": 2.9595687331536387, "grad_norm": 0.5594161748886108, "learning_rate": 0.0004229400971397733, "loss": 3.5225, "step": 27450 }, { "epoch": 2.964959568733154, "grad_norm": 0.5708371996879578, "learning_rate": 0.00042261629789530484, "loss": 3.5475, "step": 27500 }, { "epoch": 2.9703504043126685, "grad_norm": 0.561190664768219, "learning_rate": 0.00042229249865083644, "loss": 3.5487, "step": 27550 }, { "epoch": 2.975741239892183, "grad_norm": 0.6065704226493835, "learning_rate": 0.0004219751753912574, "loss": 3.5301, "step": 27600 }, { "epoch": 2.981132075471698, "grad_norm": 0.6116462349891663, "learning_rate": 0.00042165137614678895, "loss": 3.5354, "step": 27650 }, { "epoch": 2.986522911051213, "grad_norm": 0.5894355177879333, "learning_rate": 0.00042132757690232055, "loss": 3.5255, "step": 27700 }, { "epoch": 2.9919137466307277, "grad_norm": 0.6621140837669373, "learning_rate": 0.0004210037776578521, "loss": 3.5259, "step": 27750 }, { "epoch": 2.9973045822102424, "grad_norm": 0.5993552207946777, "learning_rate": 
0.00042067997841338365, "loss": 3.533, "step": 27800 }, { "epoch": 3.0026954177897576, "grad_norm": 0.5765702724456787, "learning_rate": 0.00042035617916891526, "loss": 3.4951, "step": 27850 }, { "epoch": 3.0080862533692723, "grad_norm": 0.6206012964248657, "learning_rate": 0.0004200323799244468, "loss": 3.4419, "step": 27900 }, { "epoch": 3.013477088948787, "grad_norm": 0.5946252942085266, "learning_rate": 0.0004197085806799784, "loss": 3.4422, "step": 27950 }, { "epoch": 3.018867924528302, "grad_norm": 0.5602282881736755, "learning_rate": 0.0004193847814355099, "loss": 3.4421, "step": 28000 }, { "epoch": 3.018867924528302, "eval_accuracy": 0.36948424973170885, "eval_loss": 3.5056018829345703, "eval_runtime": 187.3106, "eval_samples_per_second": 96.156, "eval_steps_per_second": 6.011, "step": 28000 }, { "epoch": 3.024258760107817, "grad_norm": 0.565204381942749, "learning_rate": 0.0004190609821910415, "loss": 3.4405, "step": 28050 }, { "epoch": 3.0296495956873315, "grad_norm": 0.5876172184944153, "learning_rate": 0.00041873718294657306, "loss": 3.4473, "step": 28100 }, { "epoch": 3.035040431266846, "grad_norm": 0.6169458031654358, "learning_rate": 0.00041841338370210467, "loss": 3.4312, "step": 28150 }, { "epoch": 3.0404312668463613, "grad_norm": 0.5794179439544678, "learning_rate": 0.0004180895844576362, "loss": 3.4471, "step": 28200 }, { "epoch": 3.045822102425876, "grad_norm": 0.5454350709915161, "learning_rate": 0.00041776578521316777, "loss": 3.4392, "step": 28250 }, { "epoch": 3.0512129380053907, "grad_norm": 0.5963560938835144, "learning_rate": 0.0004174419859686994, "loss": 3.4641, "step": 28300 }, { "epoch": 3.056603773584906, "grad_norm": 0.6112507581710815, "learning_rate": 0.0004171181867242309, "loss": 3.4487, "step": 28350 }, { "epoch": 3.0619946091644206, "grad_norm": 0.5972319841384888, "learning_rate": 0.00041679438747976253, "loss": 3.4692, "step": 28400 }, { "epoch": 3.0673854447439353, "grad_norm": 0.5812078714370728, "learning_rate": 
0.0004164705882352941, "loss": 3.4462, "step": 28450 }, { "epoch": 3.07277628032345, "grad_norm": 0.6155223250389099, "learning_rate": 0.0004161467889908257, "loss": 3.4396, "step": 28500 }, { "epoch": 3.078167115902965, "grad_norm": 0.6197593212127686, "learning_rate": 0.00041582298974635724, "loss": 3.4435, "step": 28550 }, { "epoch": 3.08355795148248, "grad_norm": 0.6040982604026794, "learning_rate": 0.00041549919050188884, "loss": 3.4621, "step": 28600 }, { "epoch": 3.0889487870619945, "grad_norm": 0.7128732204437256, "learning_rate": 0.00041517539125742034, "loss": 3.4482, "step": 28650 }, { "epoch": 3.0943396226415096, "grad_norm": 0.5997539162635803, "learning_rate": 0.0004148515920129519, "loss": 3.4704, "step": 28700 }, { "epoch": 3.0997304582210243, "grad_norm": 0.6413683891296387, "learning_rate": 0.0004145277927684835, "loss": 3.4344, "step": 28750 }, { "epoch": 3.105121293800539, "grad_norm": 0.5954497456550598, "learning_rate": 0.00041420399352401504, "loss": 3.465, "step": 28800 }, { "epoch": 3.1105121293800537, "grad_norm": 0.6719018220901489, "learning_rate": 0.00041388019427954665, "loss": 3.4504, "step": 28850 }, { "epoch": 3.115902964959569, "grad_norm": 0.5712775588035583, "learning_rate": 0.0004135563950350782, "loss": 3.4676, "step": 28900 }, { "epoch": 3.1212938005390836, "grad_norm": 0.5765361785888672, "learning_rate": 0.0004132325957906098, "loss": 3.4634, "step": 28950 }, { "epoch": 3.1266846361185983, "grad_norm": 0.6526620388031006, "learning_rate": 0.00041290879654614135, "loss": 3.4609, "step": 29000 }, { "epoch": 3.1266846361185983, "eval_accuracy": 0.37030783862134264, "eval_loss": 3.5025389194488525, "eval_runtime": 187.8717, "eval_samples_per_second": 95.869, "eval_steps_per_second": 5.993, "step": 29000 }, { "epoch": 3.1320754716981134, "grad_norm": 0.6089078187942505, "learning_rate": 0.00041258499730167296, "loss": 3.4653, "step": 29050 }, { "epoch": 3.137466307277628, "grad_norm": 0.5929785370826721, "learning_rate": 
0.0004122611980572045, "loss": 3.4622, "step": 29100 }, { "epoch": 3.142857142857143, "grad_norm": 0.6185963749885559, "learning_rate": 0.00041193739881273606, "loss": 3.4663, "step": 29150 }, { "epoch": 3.1482479784366575, "grad_norm": 0.5839598774909973, "learning_rate": 0.000411620075553157, "loss": 3.4628, "step": 29200 }, { "epoch": 3.1536388140161726, "grad_norm": 0.6440391540527344, "learning_rate": 0.0004112962763086886, "loss": 3.4815, "step": 29250 }, { "epoch": 3.1590296495956873, "grad_norm": 0.655301570892334, "learning_rate": 0.0004109724770642201, "loss": 3.4671, "step": 29300 }, { "epoch": 3.164420485175202, "grad_norm": 0.5958247780799866, "learning_rate": 0.00041064867781975177, "loss": 3.4627, "step": 29350 }, { "epoch": 3.169811320754717, "grad_norm": 0.581174910068512, "learning_rate": 0.00041032487857528327, "loss": 3.4552, "step": 29400 }, { "epoch": 3.175202156334232, "grad_norm": 0.6023386120796204, "learning_rate": 0.0004100010793308148, "loss": 3.4664, "step": 29450 }, { "epoch": 3.1805929919137466, "grad_norm": 0.6177906394004822, "learning_rate": 0.0004096772800863464, "loss": 3.4619, "step": 29500 }, { "epoch": 3.1859838274932613, "grad_norm": 0.6565192937850952, "learning_rate": 0.000409353480841878, "loss": 3.4737, "step": 29550 }, { "epoch": 3.1913746630727764, "grad_norm": 0.5951600670814514, "learning_rate": 0.0004090296815974096, "loss": 3.4712, "step": 29600 }, { "epoch": 3.196765498652291, "grad_norm": 0.636186957359314, "learning_rate": 0.00040870588235294113, "loss": 3.4576, "step": 29650 }, { "epoch": 3.202156334231806, "grad_norm": 0.6101711988449097, "learning_rate": 0.00040838208310847273, "loss": 3.4893, "step": 29700 }, { "epoch": 3.207547169811321, "grad_norm": 0.634506344795227, "learning_rate": 0.0004080582838640043, "loss": 3.459, "step": 29750 }, { "epoch": 3.2129380053908356, "grad_norm": 0.6397202610969543, "learning_rate": 0.0004077344846195359, "loss": 3.4702, "step": 29800 }, { "epoch": 3.2183288409703503, 
"grad_norm": 0.6291985511779785, "learning_rate": 0.00040741068537506744, "loss": 3.4467, "step": 29850 }, { "epoch": 3.223719676549865, "grad_norm": 0.5848795771598816, "learning_rate": 0.000407086886130599, "loss": 3.4579, "step": 29900 }, { "epoch": 3.22911051212938, "grad_norm": 0.5925343036651611, "learning_rate": 0.0004067630868861306, "loss": 3.4721, "step": 29950 }, { "epoch": 3.234501347708895, "grad_norm": 0.5578032732009888, "learning_rate": 0.0004064392876416621, "loss": 3.4737, "step": 30000 }, { "epoch": 3.234501347708895, "eval_accuracy": 0.37095921268379045, "eval_loss": 3.4953773021698, "eval_runtime": 187.3567, "eval_samples_per_second": 96.132, "eval_steps_per_second": 6.01, "step": 30000 }, { "epoch": 3.2398921832884096, "grad_norm": 0.6164473295211792, "learning_rate": 0.0004061154883971937, "loss": 3.4777, "step": 30050 }, { "epoch": 3.2452830188679247, "grad_norm": 0.5678409337997437, "learning_rate": 0.00040579168915272525, "loss": 3.4606, "step": 30100 }, { "epoch": 3.2506738544474394, "grad_norm": 0.6322740912437439, "learning_rate": 0.00040546788990825685, "loss": 3.4631, "step": 30150 }, { "epoch": 3.256064690026954, "grad_norm": 0.5954024791717529, "learning_rate": 0.0004051440906637884, "loss": 3.4663, "step": 30200 }, { "epoch": 3.2614555256064692, "grad_norm": 0.5793306231498718, "learning_rate": 0.00040482029141931995, "loss": 3.4458, "step": 30250 }, { "epoch": 3.266846361185984, "grad_norm": 0.5884698629379272, "learning_rate": 0.00040449649217485156, "loss": 3.479, "step": 30300 }, { "epoch": 3.2722371967654986, "grad_norm": 0.5904477834701538, "learning_rate": 0.0004041726929303831, "loss": 3.4627, "step": 30350 }, { "epoch": 3.2776280323450133, "grad_norm": 0.5925611853599548, "learning_rate": 0.0004038488936859147, "loss": 3.465, "step": 30400 }, { "epoch": 3.2830188679245285, "grad_norm": 0.6218023300170898, "learning_rate": 0.00040352509444144626, "loss": 3.4659, "step": 30450 }, { "epoch": 3.288409703504043, "grad_norm": 
0.5947563648223877, "learning_rate": 0.00040320129519697787, "loss": 3.4958, "step": 30500 }, { "epoch": 3.293800539083558, "grad_norm": 0.6579247713088989, "learning_rate": 0.0004028774959525094, "loss": 3.4644, "step": 30550 }, { "epoch": 3.2991913746630726, "grad_norm": 0.5691560506820679, "learning_rate": 0.000402553696708041, "loss": 3.4724, "step": 30600 }, { "epoch": 3.3045822102425877, "grad_norm": 0.5954251885414124, "learning_rate": 0.0004022298974635726, "loss": 3.4599, "step": 30650 }, { "epoch": 3.3099730458221024, "grad_norm": 0.6152036190032959, "learning_rate": 0.00040190609821910407, "loss": 3.4639, "step": 30700 }, { "epoch": 3.315363881401617, "grad_norm": 0.5858497023582458, "learning_rate": 0.0004015822989746357, "loss": 3.454, "step": 30750 }, { "epoch": 3.3207547169811322, "grad_norm": 0.5813567042350769, "learning_rate": 0.0004012584997301672, "loss": 3.4685, "step": 30800 }, { "epoch": 3.326145552560647, "grad_norm": 0.6456325054168701, "learning_rate": 0.00040093470048569883, "loss": 3.4794, "step": 30850 }, { "epoch": 3.3315363881401616, "grad_norm": 0.6200717687606812, "learning_rate": 0.0004006109012412304, "loss": 3.4679, "step": 30900 }, { "epoch": 3.3369272237196768, "grad_norm": 0.6692876815795898, "learning_rate": 0.000400287101996762, "loss": 3.4739, "step": 30950 }, { "epoch": 3.3423180592991915, "grad_norm": 0.5814414024353027, "learning_rate": 0.00039996330275229354, "loss": 3.4623, "step": 31000 }, { "epoch": 3.3423180592991915, "eval_accuracy": 0.3720433511984033, "eval_loss": 3.4902048110961914, "eval_runtime": 187.701, "eval_samples_per_second": 95.956, "eval_steps_per_second": 5.999, "step": 31000 }, { "epoch": 3.347708894878706, "grad_norm": 0.6048328876495361, "learning_rate": 0.00039963950350782514, "loss": 3.473, "step": 31050 }, { "epoch": 3.353099730458221, "grad_norm": 0.6048550009727478, "learning_rate": 0.0003993157042633567, "loss": 3.4611, "step": 31100 }, { "epoch": 3.358490566037736, "grad_norm": 
0.6270948648452759, "learning_rate": 0.00039899190501888824, "loss": 3.4633, "step": 31150 }, { "epoch": 3.3638814016172507, "grad_norm": 0.6166711449623108, "learning_rate": 0.00039866810577441985, "loss": 3.473, "step": 31200 }, { "epoch": 3.3692722371967654, "grad_norm": 0.6020999550819397, "learning_rate": 0.0003983443065299514, "loss": 3.4617, "step": 31250 }, { "epoch": 3.37466307277628, "grad_norm": 0.6348170042037964, "learning_rate": 0.00039802698327037235, "loss": 3.4698, "step": 31300 }, { "epoch": 3.3800539083557952, "grad_norm": 0.6470772624015808, "learning_rate": 0.00039770318402590396, "loss": 3.4731, "step": 31350 }, { "epoch": 3.38544474393531, "grad_norm": 0.6075937747955322, "learning_rate": 0.00039737938478143545, "loss": 3.4858, "step": 31400 }, { "epoch": 3.3908355795148246, "grad_norm": 0.6555103063583374, "learning_rate": 0.000397055585536967, "loss": 3.4644, "step": 31450 }, { "epoch": 3.3962264150943398, "grad_norm": 0.5855467915534973, "learning_rate": 0.0003967317862924986, "loss": 3.4624, "step": 31500 }, { "epoch": 3.4016172506738545, "grad_norm": 0.6008960008621216, "learning_rate": 0.00039640798704803016, "loss": 3.449, "step": 31550 }, { "epoch": 3.407008086253369, "grad_norm": 0.6120670437812805, "learning_rate": 0.00039608418780356176, "loss": 3.4585, "step": 31600 }, { "epoch": 3.4123989218328843, "grad_norm": 0.6167988777160645, "learning_rate": 0.0003957603885590933, "loss": 3.4748, "step": 31650 }, { "epoch": 3.417789757412399, "grad_norm": 0.6113810539245605, "learning_rate": 0.0003954365893146249, "loss": 3.4616, "step": 31700 }, { "epoch": 3.4231805929919137, "grad_norm": 0.6468194723129272, "learning_rate": 0.00039511279007015647, "loss": 3.4704, "step": 31750 }, { "epoch": 3.4285714285714284, "grad_norm": 0.6060384511947632, "learning_rate": 0.00039478899082568807, "loss": 3.4692, "step": 31800 }, { "epoch": 3.4339622641509435, "grad_norm": 0.5738762021064758, "learning_rate": 0.0003944651915812196, "loss": 3.4586, 
"step": 31850 }, { "epoch": 3.439353099730458, "grad_norm": 0.596107542514801, "learning_rate": 0.0003941413923367512, "loss": 3.459, "step": 31900 }, { "epoch": 3.444743935309973, "grad_norm": 0.5924981832504272, "learning_rate": 0.0003938175930922828, "loss": 3.4679, "step": 31950 }, { "epoch": 3.450134770889488, "grad_norm": 0.6087164282798767, "learning_rate": 0.0003934937938478143, "loss": 3.4713, "step": 32000 }, { "epoch": 3.450134770889488, "eval_accuracy": 0.3723299775164579, "eval_loss": 3.4831252098083496, "eval_runtime": 187.6862, "eval_samples_per_second": 95.963, "eval_steps_per_second": 5.999, "step": 32000 }, { "epoch": 3.4555256064690028, "grad_norm": 0.6510775089263916, "learning_rate": 0.0003931699946033459, "loss": 3.4671, "step": 32050 }, { "epoch": 3.4609164420485174, "grad_norm": 0.5941176414489746, "learning_rate": 0.00039284619535887743, "loss": 3.4686, "step": 32100 }, { "epoch": 3.466307277628032, "grad_norm": 0.5818312764167786, "learning_rate": 0.00039252239611440904, "loss": 3.4657, "step": 32150 }, { "epoch": 3.4716981132075473, "grad_norm": 0.6189693808555603, "learning_rate": 0.0003921985968699406, "loss": 3.4633, "step": 32200 }, { "epoch": 3.477088948787062, "grad_norm": 0.6840120553970337, "learning_rate": 0.0003918747976254722, "loss": 3.4742, "step": 32250 }, { "epoch": 3.4824797843665767, "grad_norm": 0.6052665710449219, "learning_rate": 0.00039155099838100374, "loss": 3.4845, "step": 32300 }, { "epoch": 3.487870619946092, "grad_norm": 0.6465663909912109, "learning_rate": 0.0003912271991365353, "loss": 3.4689, "step": 32350 }, { "epoch": 3.4932614555256065, "grad_norm": 0.5961518883705139, "learning_rate": 0.0003909033998920669, "loss": 3.4694, "step": 32400 }, { "epoch": 3.498652291105121, "grad_norm": 0.6497889161109924, "learning_rate": 0.00039057960064759845, "loss": 3.4529, "step": 32450 }, { "epoch": 3.5040431266846364, "grad_norm": 0.5722944736480713, "learning_rate": 0.00039025580140313005, "loss": 3.4766, "step": 
32500 }, { "epoch": 3.509433962264151, "grad_norm": 0.6770778894424438, "learning_rate": 0.0003899320021586616, "loss": 3.4527, "step": 32550 }, { "epoch": 3.5148247978436657, "grad_norm": 0.6125558018684387, "learning_rate": 0.0003896082029141932, "loss": 3.4736, "step": 32600 }, { "epoch": 3.5202156334231804, "grad_norm": 0.6196644306182861, "learning_rate": 0.00038928440366972476, "loss": 3.4602, "step": 32650 }, { "epoch": 3.525606469002695, "grad_norm": 0.6249521374702454, "learning_rate": 0.00038896060442525636, "loss": 3.4555, "step": 32700 }, { "epoch": 3.5309973045822103, "grad_norm": 0.6278482675552368, "learning_rate": 0.00038863680518078786, "loss": 3.4667, "step": 32750 }, { "epoch": 3.536388140161725, "grad_norm": 0.6120947003364563, "learning_rate": 0.0003883130059363194, "loss": 3.4728, "step": 32800 }, { "epoch": 3.5417789757412397, "grad_norm": 0.5920261740684509, "learning_rate": 0.000387989206691851, "loss": 3.458, "step": 32850 }, { "epoch": 3.547169811320755, "grad_norm": 0.576420247554779, "learning_rate": 0.00038766540744738256, "loss": 3.4604, "step": 32900 }, { "epoch": 3.5525606469002695, "grad_norm": 0.6335041522979736, "learning_rate": 0.00038734160820291417, "loss": 3.4826, "step": 32950 }, { "epoch": 3.557951482479784, "grad_norm": 0.6456249356269836, "learning_rate": 0.0003870178089584457, "loss": 3.4599, "step": 33000 }, { "epoch": 3.557951482479784, "eval_accuracy": 0.3727679573072381, "eval_loss": 3.4776175022125244, "eval_runtime": 187.4986, "eval_samples_per_second": 96.059, "eval_steps_per_second": 6.005, "step": 33000 }, { "epoch": 3.5633423180592994, "grad_norm": 0.5974683165550232, "learning_rate": 0.0003866940097139773, "loss": 3.4603, "step": 33050 }, { "epoch": 3.568733153638814, "grad_norm": 0.6262832283973694, "learning_rate": 0.0003863702104695089, "loss": 3.4828, "step": 33100 }, { "epoch": 3.5741239892183287, "grad_norm": 0.5947827100753784, "learning_rate": 0.0003860464112250404, "loss": 3.458, "step": 33150 }, { 
"epoch": 3.579514824797844, "grad_norm": 0.5985409617424011, "learning_rate": 0.00038572261198057203, "loss": 3.4769, "step": 33200 }, { "epoch": 3.5849056603773586, "grad_norm": 0.5783484578132629, "learning_rate": 0.0003853988127361036, "loss": 3.4725, "step": 33250 }, { "epoch": 3.5902964959568733, "grad_norm": 0.6530685424804688, "learning_rate": 0.0003850750134916352, "loss": 3.4968, "step": 33300 }, { "epoch": 3.595687331536388, "grad_norm": 0.6270087957382202, "learning_rate": 0.0003847512142471667, "loss": 3.4611, "step": 33350 }, { "epoch": 3.601078167115903, "grad_norm": 0.6156905889511108, "learning_rate": 0.0003844274150026983, "loss": 3.4747, "step": 33400 }, { "epoch": 3.606469002695418, "grad_norm": 0.5912847518920898, "learning_rate": 0.00038410361575822984, "loss": 3.4828, "step": 33450 }, { "epoch": 3.6118598382749325, "grad_norm": 0.5977141857147217, "learning_rate": 0.00038377981651376144, "loss": 3.4765, "step": 33500 }, { "epoch": 3.617250673854447, "grad_norm": 0.6528952121734619, "learning_rate": 0.000383456017269293, "loss": 3.4641, "step": 33550 }, { "epoch": 3.6226415094339623, "grad_norm": 0.6034547090530396, "learning_rate": 0.00038313221802482454, "loss": 3.4617, "step": 33600 }, { "epoch": 3.628032345013477, "grad_norm": 0.6089150309562683, "learning_rate": 0.00038280841878035615, "loss": 3.4533, "step": 33650 }, { "epoch": 3.6334231805929917, "grad_norm": 0.5918501615524292, "learning_rate": 0.0003824846195358877, "loss": 3.4497, "step": 33700 }, { "epoch": 3.638814016172507, "grad_norm": 0.594428539276123, "learning_rate": 0.0003821608202914193, "loss": 3.4731, "step": 33750 }, { "epoch": 3.6442048517520216, "grad_norm": 0.5788894891738892, "learning_rate": 0.00038183702104695085, "loss": 3.4525, "step": 33800 }, { "epoch": 3.6495956873315363, "grad_norm": 0.6797850728034973, "learning_rate": 0.00038151322180248246, "loss": 3.4659, "step": 33850 }, { "epoch": 3.6549865229110514, "grad_norm": 0.6085013747215271, "learning_rate": 
0.000381189422558014, "loss": 3.4651, "step": 33900 }, { "epoch": 3.660377358490566, "grad_norm": 0.6374878287315369, "learning_rate": 0.0003808656233135456, "loss": 3.4603, "step": 33950 }, { "epoch": 3.665768194070081, "grad_norm": 0.6357040405273438, "learning_rate": 0.00038054182406907716, "loss": 3.4813, "step": 34000 }, { "epoch": 3.665768194070081, "eval_accuracy": 0.3736759694907037, "eval_loss": 3.4704673290252686, "eval_runtime": 187.7895, "eval_samples_per_second": 95.911, "eval_steps_per_second": 5.996, "step": 34000 }, { "epoch": 3.671159029649596, "grad_norm": 0.5881034731864929, "learning_rate": 0.00038021802482460866, "loss": 3.4617, "step": 34050 }, { "epoch": 3.6765498652291106, "grad_norm": 0.5796710848808289, "learning_rate": 0.00037989422558014027, "loss": 3.4691, "step": 34100 }, { "epoch": 3.6819407008086253, "grad_norm": 0.642318606376648, "learning_rate": 0.0003795704263356718, "loss": 3.4757, "step": 34150 }, { "epoch": 3.68733153638814, "grad_norm": 0.6038513779640198, "learning_rate": 0.0003792466270912034, "loss": 3.4749, "step": 34200 }, { "epoch": 3.6927223719676547, "grad_norm": 0.5752274990081787, "learning_rate": 0.00037892282784673497, "loss": 3.4663, "step": 34250 }, { "epoch": 3.69811320754717, "grad_norm": 0.6258960962295532, "learning_rate": 0.0003785990286022666, "loss": 3.4774, "step": 34300 }, { "epoch": 3.7035040431266846, "grad_norm": 0.6572497487068176, "learning_rate": 0.00037827522935779813, "loss": 3.4861, "step": 34350 }, { "epoch": 3.7088948787061993, "grad_norm": 0.6318001747131348, "learning_rate": 0.00037795143011332973, "loss": 3.4592, "step": 34400 }, { "epoch": 3.7142857142857144, "grad_norm": 0.6450494527816772, "learning_rate": 0.0003776276308688613, "loss": 3.4438, "step": 34450 }, { "epoch": 3.719676549865229, "grad_norm": 0.6240411400794983, "learning_rate": 0.00037730383162439283, "loss": 3.4636, "step": 34500 }, { "epoch": 3.725067385444744, "grad_norm": 0.6308651566505432, "learning_rate": 
0.00037698003237992444, "loss": 3.4799, "step": 34550 }, { "epoch": 3.730458221024259, "grad_norm": 0.6322066187858582, "learning_rate": 0.000376656233135456, "loss": 3.4733, "step": 34600 }, { "epoch": 3.7358490566037736, "grad_norm": 0.6787533164024353, "learning_rate": 0.0003763324338909876, "loss": 3.4466, "step": 34650 }, { "epoch": 3.7412398921832883, "grad_norm": 0.6830219030380249, "learning_rate": 0.0003760086346465191, "loss": 3.4894, "step": 34700 }, { "epoch": 3.7466307277628035, "grad_norm": 0.6646257638931274, "learning_rate": 0.0003756848354020507, "loss": 3.4516, "step": 34750 }, { "epoch": 3.752021563342318, "grad_norm": 0.6488728523254395, "learning_rate": 0.00037536103615758224, "loss": 3.4828, "step": 34800 }, { "epoch": 3.757412398921833, "grad_norm": 0.6173174381256104, "learning_rate": 0.0003750372369131138, "loss": 3.4949, "step": 34850 }, { "epoch": 3.7628032345013476, "grad_norm": 0.6098762154579163, "learning_rate": 0.0003747134376686454, "loss": 3.4806, "step": 34900 }, { "epoch": 3.7681940700808623, "grad_norm": 0.5716989040374756, "learning_rate": 0.00037438963842417695, "loss": 3.4584, "step": 34950 }, { "epoch": 3.7735849056603774, "grad_norm": 0.6457527875900269, "learning_rate": 0.00037406583917970856, "loss": 3.4595, "step": 35000 }, { "epoch": 3.7735849056603774, "eval_accuracy": 0.37413861348701777, "eval_loss": 3.4643301963806152, "eval_runtime": 187.4007, "eval_samples_per_second": 96.11, "eval_steps_per_second": 6.009, "step": 35000 }, { "epoch": 3.778975741239892, "grad_norm": 0.6491796970367432, "learning_rate": 0.0003737420399352401, "loss": 3.4837, "step": 35050 }, { "epoch": 3.784366576819407, "grad_norm": 0.689035177230835, "learning_rate": 0.0003734182406907717, "loss": 3.4489, "step": 35100 }, { "epoch": 3.789757412398922, "grad_norm": 0.6304520964622498, "learning_rate": 0.00037309444144630326, "loss": 3.4715, "step": 35150 }, { "epoch": 3.7951482479784366, "grad_norm": 0.6018555760383606, "learning_rate": 
0.00037277064220183487, "loss": 3.4543, "step": 35200 }, { "epoch": 3.8005390835579513, "grad_norm": 0.582009494304657, "learning_rate": 0.0003724468429573664, "loss": 3.4806, "step": 35250 }, { "epoch": 3.8059299191374665, "grad_norm": 0.6774134039878845, "learning_rate": 0.0003721230437128979, "loss": 3.469, "step": 35300 }, { "epoch": 3.811320754716981, "grad_norm": 0.5913166999816895, "learning_rate": 0.00037179924446842957, "loss": 3.47, "step": 35350 }, { "epoch": 3.816711590296496, "grad_norm": 0.6116369962692261, "learning_rate": 0.00037147544522396107, "loss": 3.4878, "step": 35400 }, { "epoch": 3.822102425876011, "grad_norm": 0.5959073305130005, "learning_rate": 0.000371158121964382, "loss": 3.4448, "step": 35450 }, { "epoch": 3.8274932614555257, "grad_norm": 0.611961841583252, "learning_rate": 0.0003708343227199136, "loss": 3.4809, "step": 35500 }, { "epoch": 3.8328840970350404, "grad_norm": 0.6396837830543518, "learning_rate": 0.0003705105234754452, "loss": 3.4641, "step": 35550 }, { "epoch": 3.838274932614555, "grad_norm": 0.5907486081123352, "learning_rate": 0.0003701867242309768, "loss": 3.4513, "step": 35600 }, { "epoch": 3.8436657681940702, "grad_norm": 0.6286391019821167, "learning_rate": 0.00036986292498650833, "loss": 3.4649, "step": 35650 }, { "epoch": 3.849056603773585, "grad_norm": 0.629386842250824, "learning_rate": 0.0003695391257420399, "loss": 3.4621, "step": 35700 }, { "epoch": 3.8544474393530996, "grad_norm": 0.6501761674880981, "learning_rate": 0.0003692153264975715, "loss": 3.4423, "step": 35750 }, { "epoch": 3.8598382749326143, "grad_norm": 0.6401487588882446, "learning_rate": 0.00036889152725310304, "loss": 3.4644, "step": 35800 }, { "epoch": 3.8652291105121295, "grad_norm": 0.6554639935493469, "learning_rate": 0.00036856772800863464, "loss": 3.4676, "step": 35850 }, { "epoch": 3.870619946091644, "grad_norm": 0.59914231300354, "learning_rate": 0.0003682439287641662, "loss": 3.4596, "step": 35900 }, { "epoch": 3.876010781671159, 
"grad_norm": 0.5814793705940247, "learning_rate": 0.0003679201295196978, "loss": 3.4636, "step": 35950 }, { "epoch": 3.881401617250674, "grad_norm": 0.6307700872421265, "learning_rate": 0.00036759633027522935, "loss": 3.4446, "step": 36000 }, { "epoch": 3.881401617250674, "eval_accuracy": 0.3749434054270596, "eval_loss": 3.458733320236206, "eval_runtime": 187.388, "eval_samples_per_second": 96.116, "eval_steps_per_second": 6.009, "step": 36000 }, { "epoch": 3.8867924528301887, "grad_norm": 0.6277196407318115, "learning_rate": 0.00036727253103076084, "loss": 3.4476, "step": 36050 }, { "epoch": 3.8921832884097034, "grad_norm": 0.6114755272865295, "learning_rate": 0.00036694873178629245, "loss": 3.4725, "step": 36100 }, { "epoch": 3.8975741239892185, "grad_norm": 0.6390984654426575, "learning_rate": 0.000366624932541824, "loss": 3.4624, "step": 36150 }, { "epoch": 3.9029649595687332, "grad_norm": 0.6099748611450195, "learning_rate": 0.0003663011332973556, "loss": 3.4424, "step": 36200 }, { "epoch": 3.908355795148248, "grad_norm": 0.6574003100395203, "learning_rate": 0.00036597733405288715, "loss": 3.4488, "step": 36250 }, { "epoch": 3.913746630727763, "grad_norm": 0.6434478163719177, "learning_rate": 0.00036565353480841876, "loss": 3.461, "step": 36300 }, { "epoch": 3.9191374663072778, "grad_norm": 0.5764980912208557, "learning_rate": 0.0003653297355639503, "loss": 3.4538, "step": 36350 }, { "epoch": 3.9245283018867925, "grad_norm": 0.6057455539703369, "learning_rate": 0.0003650059363194819, "loss": 3.4548, "step": 36400 }, { "epoch": 3.929919137466307, "grad_norm": 0.6364848613739014, "learning_rate": 0.00036468213707501347, "loss": 3.462, "step": 36450 }, { "epoch": 3.935309973045822, "grad_norm": 0.5842701196670532, "learning_rate": 0.000364358337830545, "loss": 3.4663, "step": 36500 }, { "epoch": 3.940700808625337, "grad_norm": 0.5927199125289917, "learning_rate": 0.0003640345385860766, "loss": 3.4543, "step": 36550 }, { "epoch": 3.9460916442048517, "grad_norm": 
0.6089248657226562, "learning_rate": 0.00036371073934160817, "loss": 3.466, "step": 36600 }, { "epoch": 3.9514824797843664, "grad_norm": 0.6188242435455322, "learning_rate": 0.0003633869400971398, "loss": 3.4595, "step": 36650 }, { "epoch": 3.9568733153638815, "grad_norm": 0.6130086183547974, "learning_rate": 0.00036306314085267127, "loss": 3.4574, "step": 36700 }, { "epoch": 3.9622641509433962, "grad_norm": 0.6104381084442139, "learning_rate": 0.00036273934160820293, "loss": 3.4509, "step": 36750 }, { "epoch": 3.967654986522911, "grad_norm": 0.7232850790023804, "learning_rate": 0.00036241554236373443, "loss": 3.4471, "step": 36800 }, { "epoch": 3.973045822102426, "grad_norm": 0.6882140636444092, "learning_rate": 0.00036209174311926603, "loss": 3.4459, "step": 36850 }, { "epoch": 3.9784366576819408, "grad_norm": 0.6311055421829224, "learning_rate": 0.0003617679438747976, "loss": 3.4611, "step": 36900 }, { "epoch": 3.9838274932614555, "grad_norm": 0.6199382543563843, "learning_rate": 0.00036144414463032913, "loss": 3.4723, "step": 36950 }, { "epoch": 3.9892183288409706, "grad_norm": 0.6460267901420593, "learning_rate": 0.00036112034538586074, "loss": 3.467, "step": 37000 }, { "epoch": 3.9892183288409706, "eval_accuracy": 0.3754612450903834, "eval_loss": 3.453521251678467, "eval_runtime": 187.8709, "eval_samples_per_second": 95.869, "eval_steps_per_second": 5.993, "step": 37000 }, { "epoch": 3.9946091644204853, "grad_norm": 0.6030475497245789, "learning_rate": 0.0003607965461413923, "loss": 3.4608, "step": 37050 }, { "epoch": 4.0, "grad_norm": 1.1987624168395996, "learning_rate": 0.0003604727468969239, "loss": 3.4534, "step": 37100 }, { "epoch": 4.005390835579515, "grad_norm": 0.6259908676147461, "learning_rate": 0.00036014894765245544, "loss": 3.3673, "step": 37150 }, { "epoch": 4.010781671159029, "grad_norm": 0.604569673538208, "learning_rate": 0.00035982514840798705, "loss": 3.3673, "step": 37200 }, { "epoch": 4.0161725067385445, "grad_norm": 0.6428099274635315, 
"learning_rate": 0.0003595013491635186, "loss": 3.3624, "step": 37250 }, { "epoch": 4.02156334231806, "grad_norm": 0.6633650064468384, "learning_rate": 0.0003591775499190502, "loss": 3.3465, "step": 37300 }, { "epoch": 4.026954177897574, "grad_norm": 0.6475902795791626, "learning_rate": 0.00035885375067458176, "loss": 3.3721, "step": 37350 }, { "epoch": 4.032345013477089, "grad_norm": 0.6463242173194885, "learning_rate": 0.00035852995143011325, "loss": 3.3786, "step": 37400 }, { "epoch": 4.037735849056604, "grad_norm": 0.6179232597351074, "learning_rate": 0.0003582126281705342, "loss": 3.3648, "step": 37450 }, { "epoch": 4.0431266846361185, "grad_norm": 0.6580100059509277, "learning_rate": 0.0003578888289260658, "loss": 3.3807, "step": 37500 }, { "epoch": 4.048517520215634, "grad_norm": 0.6432563662528992, "learning_rate": 0.00035756502968159736, "loss": 3.3707, "step": 37550 }, { "epoch": 4.053908355795148, "grad_norm": 0.6162436604499817, "learning_rate": 0.00035724123043712896, "loss": 3.366, "step": 37600 }, { "epoch": 4.059299191374663, "grad_norm": 0.6134903430938721, "learning_rate": 0.0003569174311926605, "loss": 3.3818, "step": 37650 }, { "epoch": 4.064690026954178, "grad_norm": 0.6512436270713806, "learning_rate": 0.00035659363194819206, "loss": 3.3842, "step": 37700 }, { "epoch": 4.070080862533692, "grad_norm": 0.6385947465896606, "learning_rate": 0.00035626983270372367, "loss": 3.3619, "step": 37750 }, { "epoch": 4.0754716981132075, "grad_norm": 0.6357753276824951, "learning_rate": 0.0003559460334592552, "loss": 3.3616, "step": 37800 }, { "epoch": 4.080862533692723, "grad_norm": 0.6429207921028137, "learning_rate": 0.0003556222342147868, "loss": 3.375, "step": 37850 }, { "epoch": 4.086253369272237, "grad_norm": 0.635795533657074, "learning_rate": 0.0003552984349703184, "loss": 3.3842, "step": 37900 }, { "epoch": 4.091644204851752, "grad_norm": 0.6620546579360962, "learning_rate": 0.00035497463572585, "loss": 3.3943, "step": 37950 }, { "epoch": 
4.097035040431267, "grad_norm": 0.6397520899772644, "learning_rate": 0.00035465083648138153, "loss": 3.3801, "step": 38000 }, { "epoch": 4.097035040431267, "eval_accuracy": 0.3764028310161837, "eval_loss": 3.453479766845703, "eval_runtime": 187.2183, "eval_samples_per_second": 96.203, "eval_steps_per_second": 6.014, "step": 38000 }, { "epoch": 4.1024258760107815, "grad_norm": 0.6575223207473755, "learning_rate": 0.00035432703723691314, "loss": 3.3747, "step": 38050 }, { "epoch": 4.107816711590297, "grad_norm": 0.64016193151474, "learning_rate": 0.00035400323799244463, "loss": 3.3831, "step": 38100 }, { "epoch": 4.113207547169812, "grad_norm": 0.6374653577804565, "learning_rate": 0.0003536794387479762, "loss": 3.3751, "step": 38150 }, { "epoch": 4.118598382749326, "grad_norm": 0.6173773407936096, "learning_rate": 0.0003533556395035078, "loss": 3.3993, "step": 38200 }, { "epoch": 4.123989218328841, "grad_norm": 0.6391666531562805, "learning_rate": 0.00035303184025903934, "loss": 3.3912, "step": 38250 }, { "epoch": 4.129380053908355, "grad_norm": 0.6431846022605896, "learning_rate": 0.00035270804101457094, "loss": 3.3915, "step": 38300 }, { "epoch": 4.1347708894878705, "grad_norm": 0.5880658626556396, "learning_rate": 0.0003523842417701025, "loss": 3.382, "step": 38350 }, { "epoch": 4.140161725067386, "grad_norm": 0.6336498856544495, "learning_rate": 0.0003520604425256341, "loss": 3.3767, "step": 38400 }, { "epoch": 4.1455525606469, "grad_norm": 0.6082850098609924, "learning_rate": 0.00035173664328116565, "loss": 3.3863, "step": 38450 }, { "epoch": 4.150943396226415, "grad_norm": 0.6710603833198547, "learning_rate": 0.00035141284403669725, "loss": 3.3936, "step": 38500 }, { "epoch": 4.15633423180593, "grad_norm": 0.6536480188369751, "learning_rate": 0.0003510890447922288, "loss": 3.3814, "step": 38550 }, { "epoch": 4.1617250673854445, "grad_norm": 0.6300691366195679, "learning_rate": 0.00035076524554776035, "loss": 3.3792, "step": 38600 }, { "epoch": 4.16711590296496, 
"grad_norm": 0.6134623289108276, "learning_rate": 0.00035044144630329196, "loss": 3.4067, "step": 38650 }, { "epoch": 4.172506738544475, "grad_norm": 0.6804368495941162, "learning_rate": 0.00035011764705882346, "loss": 3.384, "step": 38700 }, { "epoch": 4.177897574123989, "grad_norm": 0.6068801879882812, "learning_rate": 0.0003497938478143551, "loss": 3.3772, "step": 38750 }, { "epoch": 4.183288409703504, "grad_norm": 0.7131064534187317, "learning_rate": 0.0003494700485698866, "loss": 3.3989, "step": 38800 }, { "epoch": 4.188679245283019, "grad_norm": 0.6442277431488037, "learning_rate": 0.0003491462493254182, "loss": 3.4037, "step": 38850 }, { "epoch": 4.1940700808625335, "grad_norm": 0.6614530682563782, "learning_rate": 0.00034882245008094977, "loss": 3.4017, "step": 38900 }, { "epoch": 4.199460916442049, "grad_norm": 0.6263593435287476, "learning_rate": 0.0003484986508364813, "loss": 3.3943, "step": 38950 }, { "epoch": 4.204851752021563, "grad_norm": 0.6600920557975769, "learning_rate": 0.0003481748515920129, "loss": 3.3901, "step": 39000 }, { "epoch": 4.204851752021563, "eval_accuracy": 0.37629906750832093, "eval_loss": 3.448453664779663, "eval_runtime": 187.5563, "eval_samples_per_second": 96.03, "eval_steps_per_second": 6.004, "step": 39000 }, { "epoch": 4.210242587601078, "grad_norm": 0.627390444278717, "learning_rate": 0.00034785105234754447, "loss": 3.3867, "step": 39050 }, { "epoch": 4.215633423180593, "grad_norm": 0.6027140021324158, "learning_rate": 0.0003475272531030761, "loss": 3.3901, "step": 39100 }, { "epoch": 4.2210242587601075, "grad_norm": 0.6500639319419861, "learning_rate": 0.00034720345385860763, "loss": 3.3859, "step": 39150 }, { "epoch": 4.226415094339623, "grad_norm": 0.6324947476387024, "learning_rate": 0.00034687965461413923, "loss": 3.383, "step": 39200 }, { "epoch": 4.231805929919138, "grad_norm": 0.7030044794082642, "learning_rate": 0.0003465558553696708, "loss": 3.3804, "step": 39250 }, { "epoch": 4.237196765498652, "grad_norm": 
0.656147301197052, "learning_rate": 0.0003462320561252024, "loss": 3.3941, "step": 39300 }, { "epoch": 4.242587601078167, "grad_norm": 0.6377809047698975, "learning_rate": 0.00034590825688073394, "loss": 3.3942, "step": 39350 }, { "epoch": 4.247978436657682, "grad_norm": 0.6354027986526489, "learning_rate": 0.00034558445763626543, "loss": 3.3987, "step": 39400 }, { "epoch": 4.2533692722371965, "grad_norm": 0.6445314884185791, "learning_rate": 0.00034526065839179704, "loss": 3.3887, "step": 39450 }, { "epoch": 4.258760107816712, "grad_norm": 0.6906786561012268, "learning_rate": 0.0003449368591473286, "loss": 3.3807, "step": 39500 }, { "epoch": 4.264150943396227, "grad_norm": 0.6557918787002563, "learning_rate": 0.00034461953588774954, "loss": 3.3852, "step": 39550 }, { "epoch": 4.269541778975741, "grad_norm": 0.6619495749473572, "learning_rate": 0.0003443022126281705, "loss": 3.3829, "step": 39600 }, { "epoch": 4.274932614555256, "grad_norm": 0.700074315071106, "learning_rate": 0.00034397841338370204, "loss": 3.4102, "step": 39650 }, { "epoch": 4.280323450134771, "grad_norm": 0.6916738748550415, "learning_rate": 0.00034365461413923365, "loss": 3.3953, "step": 39700 }, { "epoch": 4.285714285714286, "grad_norm": 0.6402020454406738, "learning_rate": 0.0003433308148947652, "loss": 3.3878, "step": 39750 }, { "epoch": 4.291105121293801, "grad_norm": 0.5897455811500549, "learning_rate": 0.0003430070156502968, "loss": 3.4044, "step": 39800 }, { "epoch": 4.296495956873315, "grad_norm": 0.6505072712898254, "learning_rate": 0.00034268321640582835, "loss": 3.4052, "step": 39850 }, { "epoch": 4.30188679245283, "grad_norm": 0.6143913269042969, "learning_rate": 0.00034235941716135996, "loss": 3.3976, "step": 39900 }, { "epoch": 4.307277628032345, "grad_norm": 0.6959649920463562, "learning_rate": 0.0003420356179168915, "loss": 3.4007, "step": 39950 }, { "epoch": 4.3126684636118595, "grad_norm": 0.6691548228263855, "learning_rate": 0.0003417118186724231, "loss": 3.3924, "step": 
40000 }, { "epoch": 4.3126684636118595, "eval_accuracy": 0.3769364253482407, "eval_loss": 3.444800853729248, "eval_runtime": 187.924, "eval_samples_per_second": 95.842, "eval_steps_per_second": 5.992, "step": 40000 }, { "epoch": 4.318059299191375, "grad_norm": 0.6325005888938904, "learning_rate": 0.00034138801942795467, "loss": 3.3868, "step": 40050 }, { "epoch": 4.32345013477089, "grad_norm": 0.6606913208961487, "learning_rate": 0.00034106422018348616, "loss": 3.4124, "step": 40100 }, { "epoch": 4.328840970350404, "grad_norm": 0.6430675983428955, "learning_rate": 0.00034074042093901777, "loss": 3.4218, "step": 40150 }, { "epoch": 4.334231805929919, "grad_norm": 0.653377115726471, "learning_rate": 0.0003404166216945493, "loss": 3.3903, "step": 40200 }, { "epoch": 4.339622641509434, "grad_norm": 0.6559277176856995, "learning_rate": 0.0003400928224500809, "loss": 3.3963, "step": 40250 }, { "epoch": 4.345013477088949, "grad_norm": 0.6817251443862915, "learning_rate": 0.00033976902320561247, "loss": 3.4016, "step": 40300 }, { "epoch": 4.350404312668464, "grad_norm": 0.664612889289856, "learning_rate": 0.0003394452239611441, "loss": 3.3943, "step": 40350 }, { "epoch": 4.355795148247978, "grad_norm": 0.6640458703041077, "learning_rate": 0.00033912142471667563, "loss": 3.4109, "step": 40400 }, { "epoch": 4.361185983827493, "grad_norm": 0.6599859595298767, "learning_rate": 0.00033879762547220723, "loss": 3.38, "step": 40450 }, { "epoch": 4.366576819407008, "grad_norm": 0.6857253909111023, "learning_rate": 0.0003384738262277388, "loss": 3.3843, "step": 40500 }, { "epoch": 4.3719676549865225, "grad_norm": 0.6355632543563843, "learning_rate": 0.00033815002698327033, "loss": 3.407, "step": 40550 }, { "epoch": 4.377358490566038, "grad_norm": 0.6515060663223267, "learning_rate": 0.00033782622773880194, "loss": 3.3921, "step": 40600 }, { "epoch": 4.382749326145553, "grad_norm": 0.651151716709137, "learning_rate": 0.0003375024284943335, "loss": 3.3834, "step": 40650 }, { "epoch": 
4.388140161725067, "grad_norm": 0.6231512427330017, "learning_rate": 0.0003371786292498651, "loss": 3.4116, "step": 40700 }, { "epoch": 4.393530997304582, "grad_norm": 0.7003295421600342, "learning_rate": 0.0003368548300053966, "loss": 3.4094, "step": 40750 }, { "epoch": 4.398921832884097, "grad_norm": 0.6507589817047119, "learning_rate": 0.0003365310307609282, "loss": 3.3824, "step": 40800 }, { "epoch": 4.404312668463612, "grad_norm": 0.6417136788368225, "learning_rate": 0.00033620723151645975, "loss": 3.4028, "step": 40850 }, { "epoch": 4.409703504043127, "grad_norm": 0.6464463472366333, "learning_rate": 0.0003358834322719913, "loss": 3.3901, "step": 40900 }, { "epoch": 4.415094339622642, "grad_norm": 0.672114372253418, "learning_rate": 0.0003355596330275229, "loss": 3.3962, "step": 40950 }, { "epoch": 4.420485175202156, "grad_norm": 0.6453450918197632, "learning_rate": 0.00033523583378305445, "loss": 3.4093, "step": 41000 }, { "epoch": 4.420485175202156, "eval_accuracy": 0.37747805999399586, "eval_loss": 3.4390106201171875, "eval_runtime": 187.6862, "eval_samples_per_second": 95.963, "eval_steps_per_second": 5.999, "step": 41000 }, { "epoch": 4.425876010781671, "grad_norm": 0.6747152209281921, "learning_rate": 0.00033491203453858606, "loss": 3.4099, "step": 41050 }, { "epoch": 4.431266846361186, "grad_norm": 0.6520073413848877, "learning_rate": 0.0003345882352941176, "loss": 3.3923, "step": 41100 }, { "epoch": 4.436657681940701, "grad_norm": 0.6549708843231201, "learning_rate": 0.0003342644360496492, "loss": 3.3711, "step": 41150 }, { "epoch": 4.442048517520216, "grad_norm": 0.6869304776191711, "learning_rate": 0.00033394063680518076, "loss": 3.4071, "step": 41200 }, { "epoch": 4.44743935309973, "grad_norm": 0.6499251127243042, "learning_rate": 0.00033361683756071237, "loss": 3.3953, "step": 41250 }, { "epoch": 4.452830188679245, "grad_norm": 0.6444708108901978, "learning_rate": 0.0003332930383162439, "loss": 3.3924, "step": 41300 }, { "epoch": 4.45822102425876, 
"grad_norm": 0.6329517364501953, "learning_rate": 0.0003329692390717754, "loss": 3.3836, "step": 41350 }, { "epoch": 4.463611859838275, "grad_norm": 0.6541862487792969, "learning_rate": 0.0003326454398273071, "loss": 3.395, "step": 41400 }, { "epoch": 4.46900269541779, "grad_norm": 0.6906949281692505, "learning_rate": 0.00033232164058283857, "loss": 3.3827, "step": 41450 }, { "epoch": 4.474393530997305, "grad_norm": 0.6574482321739197, "learning_rate": 0.0003319978413383702, "loss": 3.3845, "step": 41500 }, { "epoch": 4.479784366576819, "grad_norm": 0.6679044961929321, "learning_rate": 0.0003316740420939017, "loss": 3.4135, "step": 41550 }, { "epoch": 4.485175202156334, "grad_norm": 0.6667996644973755, "learning_rate": 0.00033135024284943333, "loss": 3.4046, "step": 41600 }, { "epoch": 4.490566037735849, "grad_norm": 0.6718025207519531, "learning_rate": 0.0003310264436049649, "loss": 3.41, "step": 41650 }, { "epoch": 4.495956873315364, "grad_norm": 0.6717631220817566, "learning_rate": 0.0003307026443604965, "loss": 3.4114, "step": 41700 }, { "epoch": 4.501347708894879, "grad_norm": 0.6933983564376831, "learning_rate": 0.00033037884511602804, "loss": 3.3954, "step": 41750 }, { "epoch": 4.506738544474393, "grad_norm": 0.6075118780136108, "learning_rate": 0.0003300550458715596, "loss": 3.4072, "step": 41800 }, { "epoch": 4.512129380053908, "grad_norm": 0.68716961145401, "learning_rate": 0.0003297312466270912, "loss": 3.3843, "step": 41850 }, { "epoch": 4.517520215633423, "grad_norm": 0.6730256676673889, "learning_rate": 0.00032940744738262274, "loss": 3.4068, "step": 41900 }, { "epoch": 4.5229110512129385, "grad_norm": 0.7278045415878296, "learning_rate": 0.00032908364813815435, "loss": 3.4097, "step": 41950 }, { "epoch": 4.528301886792453, "grad_norm": 0.6512318849563599, "learning_rate": 0.0003287598488936859, "loss": 3.4104, "step": 42000 }, { "epoch": 4.528301886792453, "eval_accuracy": 0.378027300341898, "eval_loss": 3.4342195987701416, "eval_runtime": 187.6655, 
"eval_samples_per_second": 95.974, "eval_steps_per_second": 6.0, "step": 42000 }, { "epoch": 4.533692722371968, "grad_norm": 0.6605228185653687, "learning_rate": 0.0003284360496492175, "loss": 3.3834, "step": 42050 }, { "epoch": 4.539083557951482, "grad_norm": 0.7046045064926147, "learning_rate": 0.000328112250404749, "loss": 3.394, "step": 42100 }, { "epoch": 4.544474393530997, "grad_norm": 0.6316480040550232, "learning_rate": 0.00032778845116028066, "loss": 3.3978, "step": 42150 }, { "epoch": 4.549865229110512, "grad_norm": 0.6550145149230957, "learning_rate": 0.00032746465191581215, "loss": 3.4099, "step": 42200 }, { "epoch": 4.555256064690027, "grad_norm": 0.6940279006958008, "learning_rate": 0.0003271408526713437, "loss": 3.4054, "step": 42250 }, { "epoch": 4.560646900269542, "grad_norm": 0.6692448854446411, "learning_rate": 0.0003268170534268753, "loss": 3.3962, "step": 42300 }, { "epoch": 4.566037735849057, "grad_norm": 0.6523816585540771, "learning_rate": 0.00032649325418240686, "loss": 3.4179, "step": 42350 }, { "epoch": 4.571428571428571, "grad_norm": 0.6672971844673157, "learning_rate": 0.00032616945493793846, "loss": 3.396, "step": 42400 }, { "epoch": 4.576819407008086, "grad_norm": 0.6267552971839905, "learning_rate": 0.00032584565569347, "loss": 3.4071, "step": 42450 }, { "epoch": 4.5822102425876015, "grad_norm": 0.6981790661811829, "learning_rate": 0.0003255218564490016, "loss": 3.3912, "step": 42500 }, { "epoch": 4.587601078167116, "grad_norm": 0.646481454372406, "learning_rate": 0.00032519805720453317, "loss": 3.4016, "step": 42550 }, { "epoch": 4.592991913746631, "grad_norm": 0.658094048500061, "learning_rate": 0.0003248742579600647, "loss": 3.3834, "step": 42600 }, { "epoch": 4.598382749326145, "grad_norm": 0.701236367225647, "learning_rate": 0.0003245504587155963, "loss": 3.4091, "step": 42650 }, { "epoch": 4.60377358490566, "grad_norm": 0.6524823904037476, "learning_rate": 0.0003242266594711278, "loss": 3.4031, "step": 42700 }, { "epoch": 
4.609164420485175, "grad_norm": 0.6430251002311707, "learning_rate": 0.0003239028602266595, "loss": 3.3937, "step": 42750 }, { "epoch": 4.6145552560646905, "grad_norm": 0.6301037073135376, "learning_rate": 0.000323579060982191, "loss": 3.395, "step": 42800 }, { "epoch": 4.619946091644205, "grad_norm": 0.6488710641860962, "learning_rate": 0.0003232552617377226, "loss": 3.4024, "step": 42850 }, { "epoch": 4.62533692722372, "grad_norm": 0.664051353931427, "learning_rate": 0.00032293146249325413, "loss": 3.4154, "step": 42900 }, { "epoch": 4.630727762803234, "grad_norm": 0.6523007750511169, "learning_rate": 0.00032260766324878574, "loss": 3.424, "step": 42950 }, { "epoch": 4.636118598382749, "grad_norm": 0.6787187457084656, "learning_rate": 0.0003222838640043173, "loss": 3.3984, "step": 43000 }, { "epoch": 4.636118598382749, "eval_accuracy": 0.37819299599581513, "eval_loss": 3.4289870262145996, "eval_runtime": 187.3727, "eval_samples_per_second": 96.124, "eval_steps_per_second": 6.009, "step": 43000 }, { "epoch": 4.6415094339622645, "grad_norm": 0.6364569067955017, "learning_rate": 0.00032196006475984884, "loss": 3.4042, "step": 43050 }, { "epoch": 4.646900269541779, "grad_norm": 0.6747838854789734, "learning_rate": 0.00032163626551538044, "loss": 3.397, "step": 43100 }, { "epoch": 4.652291105121294, "grad_norm": 0.6599672436714172, "learning_rate": 0.000321312466270912, "loss": 3.4032, "step": 43150 }, { "epoch": 4.657681940700809, "grad_norm": 0.6930491328239441, "learning_rate": 0.0003209886670264436, "loss": 3.4026, "step": 43200 }, { "epoch": 4.663072776280323, "grad_norm": 0.6434908509254456, "learning_rate": 0.00032066486778197515, "loss": 3.3898, "step": 43250 }, { "epoch": 4.668463611859838, "grad_norm": 0.6763002276420593, "learning_rate": 0.00032034106853750675, "loss": 3.3911, "step": 43300 }, { "epoch": 4.6738544474393535, "grad_norm": 0.6328909993171692, "learning_rate": 0.0003200172692930383, "loss": 3.3884, "step": 43350 }, { "epoch": 4.679245283018868, 
"grad_norm": 0.6524956822395325, "learning_rate": 0.0003196934700485699, "loss": 3.4142, "step": 43400 }, { "epoch": 4.684636118598383, "grad_norm": 0.6790773272514343, "learning_rate": 0.0003193696708041014, "loss": 3.4084, "step": 43450 }, { "epoch": 4.690026954177897, "grad_norm": 0.6767542958259583, "learning_rate": 0.00031904587155963296, "loss": 3.4182, "step": 43500 }, { "epoch": 4.695417789757412, "grad_norm": 0.6552374362945557, "learning_rate": 0.00031872207231516456, "loss": 3.383, "step": 43550 }, { "epoch": 4.7008086253369274, "grad_norm": 0.6805891394615173, "learning_rate": 0.0003184047490555855, "loss": 3.3967, "step": 43600 }, { "epoch": 4.706199460916442, "grad_norm": 0.6635441780090332, "learning_rate": 0.00031808094981111706, "loss": 3.4145, "step": 43650 }, { "epoch": 4.711590296495957, "grad_norm": 0.6911014914512634, "learning_rate": 0.00031775715056664867, "loss": 3.4043, "step": 43700 }, { "epoch": 4.716981132075472, "grad_norm": 0.6306861639022827, "learning_rate": 0.0003174333513221802, "loss": 3.3861, "step": 43750 }, { "epoch": 4.722371967654986, "grad_norm": 0.6720971465110779, "learning_rate": 0.00031710955207771177, "loss": 3.3955, "step": 43800 }, { "epoch": 4.727762803234501, "grad_norm": 0.681067168712616, "learning_rate": 0.0003167857528332434, "loss": 3.4104, "step": 43850 }, { "epoch": 4.7331536388140165, "grad_norm": 0.6770942807197571, "learning_rate": 0.0003164619535887749, "loss": 3.3891, "step": 43900 }, { "epoch": 4.738544474393531, "grad_norm": 0.6591625809669495, "learning_rate": 0.00031613815434430653, "loss": 3.4002, "step": 43950 }, { "epoch": 4.743935309973046, "grad_norm": 0.6355513334274292, "learning_rate": 0.0003158143550998381, "loss": 3.403, "step": 44000 }, { "epoch": 4.743935309973046, "eval_accuracy": 0.37904559520649533, "eval_loss": 3.4251604080200195, "eval_runtime": 187.4268, "eval_samples_per_second": 96.096, "eval_steps_per_second": 6.008, "step": 44000 }, { "epoch": 4.74932614555256, "grad_norm": 
0.6216215491294861, "learning_rate": 0.0003154905558553697, "loss": 3.4181, "step": 44050 }, { "epoch": 4.754716981132075, "grad_norm": 0.7213981747627258, "learning_rate": 0.0003151667566109012, "loss": 3.3885, "step": 44100 }, { "epoch": 4.7601078167115904, "grad_norm": 0.6748440265655518, "learning_rate": 0.00031484295736643284, "loss": 3.3973, "step": 44150 }, { "epoch": 4.765498652291106, "grad_norm": 0.6230536103248596, "learning_rate": 0.00031451915812196434, "loss": 3.3889, "step": 44200 }, { "epoch": 4.77088948787062, "grad_norm": 0.6862159371376038, "learning_rate": 0.0003141953588774959, "loss": 3.4072, "step": 44250 }, { "epoch": 4.776280323450135, "grad_norm": 0.6609549522399902, "learning_rate": 0.0003138715596330275, "loss": 3.3927, "step": 44300 }, { "epoch": 4.781671159029649, "grad_norm": 0.7261171340942383, "learning_rate": 0.00031354776038855904, "loss": 3.3898, "step": 44350 }, { "epoch": 4.787061994609164, "grad_norm": 0.683654248714447, "learning_rate": 0.00031322396114409065, "loss": 3.3972, "step": 44400 }, { "epoch": 4.7924528301886795, "grad_norm": 0.6835164427757263, "learning_rate": 0.0003129001618996222, "loss": 3.4126, "step": 44450 }, { "epoch": 4.797843665768194, "grad_norm": 0.6999883055686951, "learning_rate": 0.0003125763626551538, "loss": 3.4072, "step": 44500 }, { "epoch": 4.803234501347709, "grad_norm": 0.660194456577301, "learning_rate": 0.00031225256341068535, "loss": 3.4001, "step": 44550 }, { "epoch": 4.808625336927224, "grad_norm": 0.662414014339447, "learning_rate": 0.00031192876416621696, "loss": 3.4085, "step": 44600 }, { "epoch": 4.814016172506738, "grad_norm": 0.6617624759674072, "learning_rate": 0.0003116049649217485, "loss": 3.4029, "step": 44650 }, { "epoch": 4.819407008086253, "grad_norm": 0.6734949350357056, "learning_rate": 0.00031128116567728, "loss": 3.3927, "step": 44700 }, { "epoch": 4.824797843665769, "grad_norm": 0.6460137367248535, "learning_rate": 0.00031095736643281166, "loss": 3.41, "step": 44750 }, { 
"epoch": 4.830188679245283, "grad_norm": 0.6399081945419312, "learning_rate": 0.00031063356718834316, "loss": 3.4198, "step": 44800 }, { "epoch": 4.835579514824798, "grad_norm": 0.671139121055603, "learning_rate": 0.00031030976794387476, "loss": 3.4087, "step": 44850 }, { "epoch": 4.840970350404312, "grad_norm": 0.6836479902267456, "learning_rate": 0.0003099859686994063, "loss": 3.4142, "step": 44900 }, { "epoch": 4.846361185983827, "grad_norm": 0.6696195006370544, "learning_rate": 0.0003096621694549379, "loss": 3.4148, "step": 44950 }, { "epoch": 4.8517520215633425, "grad_norm": 0.6819624304771423, "learning_rate": 0.00030933837021046947, "loss": 3.4082, "step": 45000 }, { "epoch": 4.8517520215633425, "eval_accuracy": 0.37946542996501054, "eval_loss": 3.4183225631713867, "eval_runtime": 187.3214, "eval_samples_per_second": 96.15, "eval_steps_per_second": 6.011, "step": 45000 }, { "epoch": 4.857142857142857, "grad_norm": 0.7019557356834412, "learning_rate": 0.0003090145709660011, "loss": 3.3969, "step": 45050 }, { "epoch": 4.862533692722372, "grad_norm": 0.6515421271324158, "learning_rate": 0.0003086907717215326, "loss": 3.4006, "step": 45100 }, { "epoch": 4.867924528301887, "grad_norm": 0.6652536392211914, "learning_rate": 0.0003083669724770642, "loss": 3.407, "step": 45150 }, { "epoch": 4.873315363881401, "grad_norm": 0.712742269039154, "learning_rate": 0.0003080431732325958, "loss": 3.3875, "step": 45200 }, { "epoch": 4.878706199460916, "grad_norm": 0.6716951727867126, "learning_rate": 0.00030771937398812733, "loss": 3.3971, "step": 45250 }, { "epoch": 4.884097035040432, "grad_norm": 0.657441258430481, "learning_rate": 0.00030739557474365894, "loss": 3.3839, "step": 45300 }, { "epoch": 4.889487870619946, "grad_norm": 0.6646153926849365, "learning_rate": 0.0003070717754991905, "loss": 3.3969, "step": 45350 }, { "epoch": 4.894878706199461, "grad_norm": 0.7000280022621155, "learning_rate": 0.0003067479762547221, "loss": 3.4016, "step": 45400 }, { "epoch": 
4.900269541778976, "grad_norm": 0.6623254418373108, "learning_rate": 0.0003064241770102536, "loss": 3.4024, "step": 45450 }, { "epoch": 4.90566037735849, "grad_norm": 0.7038528919219971, "learning_rate": 0.00030610037776578514, "loss": 3.3923, "step": 45500 }, { "epoch": 4.9110512129380055, "grad_norm": 0.6824002265930176, "learning_rate": 0.00030577657852131674, "loss": 3.3806, "step": 45550 }, { "epoch": 4.916442048517521, "grad_norm": 0.6446487307548523, "learning_rate": 0.0003054592552617377, "loss": 3.3889, "step": 45600 }, { "epoch": 4.921832884097035, "grad_norm": 0.7261145710945129, "learning_rate": 0.00030513545601726925, "loss": 3.3935, "step": 45650 }, { "epoch": 4.92722371967655, "grad_norm": 0.651084303855896, "learning_rate": 0.00030481165677280085, "loss": 3.4054, "step": 45700 }, { "epoch": 4.932614555256064, "grad_norm": 0.6604491472244263, "learning_rate": 0.0003044878575283324, "loss": 3.4026, "step": 45750 }, { "epoch": 4.938005390835579, "grad_norm": 0.7495591044425964, "learning_rate": 0.000304164058283864, "loss": 3.3887, "step": 45800 }, { "epoch": 4.943396226415095, "grad_norm": 0.6877288818359375, "learning_rate": 0.00030384025903939556, "loss": 3.3952, "step": 45850 }, { "epoch": 4.948787061994609, "grad_norm": 0.6563786864280701, "learning_rate": 0.0003035164597949271, "loss": 3.3979, "step": 45900 }, { "epoch": 4.954177897574124, "grad_norm": 0.68060302734375, "learning_rate": 0.0003031926605504587, "loss": 3.3982, "step": 45950 }, { "epoch": 4.959568733153639, "grad_norm": 0.6740385293960571, "learning_rate": 0.00030286886130599026, "loss": 3.3902, "step": 46000 }, { "epoch": 4.959568733153639, "eval_accuracy": 0.3798636427988506, "eval_loss": 3.415046215057373, "eval_runtime": 188.2864, "eval_samples_per_second": 95.657, "eval_steps_per_second": 5.98, "step": 46000 }, { "epoch": 4.964959568733153, "grad_norm": 0.6443136930465698, "learning_rate": 0.00030254506206152187, "loss": 3.4066, "step": 46050 }, { "epoch": 4.9703504043126685, 
"grad_norm": 0.676317036151886, "learning_rate": 0.00030222126281705336, "loss": 3.3889, "step": 46100 }, { "epoch": 4.975741239892184, "grad_norm": 0.6700965762138367, "learning_rate": 0.000301897463572585, "loss": 3.4058, "step": 46150 }, { "epoch": 4.981132075471698, "grad_norm": 0.7073339223861694, "learning_rate": 0.0003015736643281165, "loss": 3.3953, "step": 46200 }, { "epoch": 4.986522911051213, "grad_norm": 0.6577731966972351, "learning_rate": 0.00030124986508364807, "loss": 3.4054, "step": 46250 }, { "epoch": 4.991913746630727, "grad_norm": 0.6806015968322754, "learning_rate": 0.0003009260658391797, "loss": 3.4067, "step": 46300 }, { "epoch": 4.997304582210242, "grad_norm": 0.6785401105880737, "learning_rate": 0.0003006022665947112, "loss": 3.3915, "step": 46350 }, { "epoch": 5.002695417789758, "grad_norm": 0.6428429484367371, "learning_rate": 0.00030027846735024283, "loss": 3.3601, "step": 46400 }, { "epoch": 5.008086253369272, "grad_norm": 0.6469174027442932, "learning_rate": 0.00029995466810577443, "loss": 3.2972, "step": 46450 }, { "epoch": 5.013477088948787, "grad_norm": 0.6720959544181824, "learning_rate": 0.00029963086886130593, "loss": 3.2985, "step": 46500 }, { "epoch": 5.018867924528302, "grad_norm": 0.714445173740387, "learning_rate": 0.00029930706961683754, "loss": 3.297, "step": 46550 }, { "epoch": 5.024258760107816, "grad_norm": 0.7507009506225586, "learning_rate": 0.0002989832703723691, "loss": 3.3071, "step": 46600 }, { "epoch": 5.0296495956873315, "grad_norm": 0.7144675254821777, "learning_rate": 0.0002986594711279007, "loss": 3.3031, "step": 46650 }, { "epoch": 5.035040431266847, "grad_norm": 0.6817595362663269, "learning_rate": 0.00029833567188343224, "loss": 3.3211, "step": 46700 }, { "epoch": 5.040431266846361, "grad_norm": 0.7118050456047058, "learning_rate": 0.00029801187263896385, "loss": 3.3085, "step": 46750 }, { "epoch": 5.045822102425876, "grad_norm": 0.6556680202484131, "learning_rate": 0.0002976880733944954, "loss": 3.3258, 
"step": 46800 }, { "epoch": 5.051212938005391, "grad_norm": 0.7107027769088745, "learning_rate": 0.00029736427415002695, "loss": 3.3078, "step": 46850 }, { "epoch": 5.056603773584905, "grad_norm": 0.6754008531570435, "learning_rate": 0.0002970404749055585, "loss": 3.3179, "step": 46900 }, { "epoch": 5.061994609164421, "grad_norm": 0.6981600522994995, "learning_rate": 0.0002967166756610901, "loss": 3.3276, "step": 46950 }, { "epoch": 5.067385444743936, "grad_norm": 0.7493944764137268, "learning_rate": 0.00029639287641662165, "loss": 3.315, "step": 47000 }, { "epoch": 5.067385444743936, "eval_accuracy": 0.38014038170411407, "eval_loss": 3.420238494873047, "eval_runtime": 187.6122, "eval_samples_per_second": 96.001, "eval_steps_per_second": 6.002, "step": 47000 }, { "epoch": 5.07277628032345, "grad_norm": 0.6746612787246704, "learning_rate": 0.00029606907717215326, "loss": 3.32, "step": 47050 }, { "epoch": 5.078167115902965, "grad_norm": 0.736753523349762, "learning_rate": 0.0002957452779276848, "loss": 3.3211, "step": 47100 }, { "epoch": 5.083557951482479, "grad_norm": 0.7640523314476013, "learning_rate": 0.00029542147868321636, "loss": 3.3097, "step": 47150 }, { "epoch": 5.0889487870619945, "grad_norm": 0.6825003623962402, "learning_rate": 0.00029509767943874796, "loss": 3.3206, "step": 47200 }, { "epoch": 5.09433962264151, "grad_norm": 0.6508895754814148, "learning_rate": 0.0002947738801942795, "loss": 3.3091, "step": 47250 }, { "epoch": 5.099730458221024, "grad_norm": 0.6962198615074158, "learning_rate": 0.00029445655693470047, "loss": 3.3217, "step": 47300 }, { "epoch": 5.105121293800539, "grad_norm": 0.7021613121032715, "learning_rate": 0.000294132757690232, "loss": 3.3286, "step": 47350 }, { "epoch": 5.110512129380054, "grad_norm": 0.6529552936553955, "learning_rate": 0.0002938089584457636, "loss": 3.3382, "step": 47400 }, { "epoch": 5.115902964959568, "grad_norm": 0.7062910795211792, "learning_rate": 0.00029348515920129517, "loss": 3.3252, "step": 47450 }, { 
"epoch": 5.121293800539084, "grad_norm": 0.6736778616905212, "learning_rate": 0.0002931613599568267, "loss": 3.3304, "step": 47500 }, { "epoch": 5.126684636118599, "grad_norm": 0.6905795931816101, "learning_rate": 0.00029283756071235833, "loss": 3.3278, "step": 47550 }, { "epoch": 5.132075471698113, "grad_norm": 0.7116023898124695, "learning_rate": 0.0002925137614678899, "loss": 3.3291, "step": 47600 }, { "epoch": 5.137466307277628, "grad_norm": 0.7711155414581299, "learning_rate": 0.00029218996222342143, "loss": 3.3198, "step": 47650 }, { "epoch": 5.142857142857143, "grad_norm": 0.7277513742446899, "learning_rate": 0.00029186616297895303, "loss": 3.3177, "step": 47700 }, { "epoch": 5.1482479784366575, "grad_norm": 0.6594894528388977, "learning_rate": 0.000291548839719374, "loss": 3.3157, "step": 47750 }, { "epoch": 5.153638814016173, "grad_norm": 0.7083530426025391, "learning_rate": 0.00029122504047490554, "loss": 3.3343, "step": 47800 }, { "epoch": 5.159029649595688, "grad_norm": 0.7079737782478333, "learning_rate": 0.0002909012412304371, "loss": 3.3067, "step": 47850 }, { "epoch": 5.164420485175202, "grad_norm": 0.743047833442688, "learning_rate": 0.0002905774419859687, "loss": 3.3431, "step": 47900 }, { "epoch": 5.169811320754717, "grad_norm": 0.6552191972732544, "learning_rate": 0.00029025364274150024, "loss": 3.3267, "step": 47950 }, { "epoch": 5.175202156334231, "grad_norm": 0.6602107286453247, "learning_rate": 0.0002899298434970318, "loss": 3.3194, "step": 48000 }, { "epoch": 5.175202156334231, "eval_accuracy": 0.38059954880801805, "eval_loss": 3.4138906002044678, "eval_runtime": 187.9002, "eval_samples_per_second": 95.854, "eval_steps_per_second": 5.993, "step": 48000 }, { "epoch": 5.180592991913747, "grad_norm": 0.6958052515983582, "learning_rate": 0.0002896060442525634, "loss": 3.3332, "step": 48050 }, { "epoch": 5.185983827493262, "grad_norm": 0.6811221241950989, "learning_rate": 0.00028928224500809495, "loss": 3.3202, "step": 48100 }, { "epoch": 
5.191374663072776, "grad_norm": 0.6962025165557861, "learning_rate": 0.0002889584457636265, "loss": 3.3349, "step": 48150 }, { "epoch": 5.196765498652291, "grad_norm": 0.6717342138290405, "learning_rate": 0.0002886346465191581, "loss": 3.3358, "step": 48200 }, { "epoch": 5.202156334231806, "grad_norm": 0.6936809420585632, "learning_rate": 0.00028831084727468965, "loss": 3.3275, "step": 48250 }, { "epoch": 5.2075471698113205, "grad_norm": 0.7409997582435608, "learning_rate": 0.00028798704803022126, "loss": 3.3286, "step": 48300 }, { "epoch": 5.212938005390836, "grad_norm": 0.7106682658195496, "learning_rate": 0.0002876632487857528, "loss": 3.332, "step": 48350 }, { "epoch": 5.218328840970351, "grad_norm": 0.65801602602005, "learning_rate": 0.0002873394495412844, "loss": 3.3419, "step": 48400 }, { "epoch": 5.223719676549865, "grad_norm": 0.7524127960205078, "learning_rate": 0.0002870156502968159, "loss": 3.3331, "step": 48450 }, { "epoch": 5.22911051212938, "grad_norm": 0.7451915144920349, "learning_rate": 0.0002866918510523475, "loss": 3.3196, "step": 48500 }, { "epoch": 5.234501347708895, "grad_norm": 0.7274192571640015, "learning_rate": 0.00028636805180787907, "loss": 3.3301, "step": 48550 }, { "epoch": 5.2398921832884096, "grad_norm": 0.6847207546234131, "learning_rate": 0.00028604425256341067, "loss": 3.3537, "step": 48600 }, { "epoch": 5.245283018867925, "grad_norm": 0.6851732730865479, "learning_rate": 0.0002857204533189422, "loss": 3.3343, "step": 48650 }, { "epoch": 5.250673854447439, "grad_norm": 0.7094375491142273, "learning_rate": 0.0002853966540744738, "loss": 3.3359, "step": 48700 }, { "epoch": 5.256064690026954, "grad_norm": 0.7639798521995544, "learning_rate": 0.0002850728548300054, "loss": 3.3449, "step": 48750 }, { "epoch": 5.261455525606469, "grad_norm": 0.6960095167160034, "learning_rate": 0.000284749055585537, "loss": 3.3278, "step": 48800 }, { "epoch": 5.2668463611859835, "grad_norm": 0.6844643950462341, "learning_rate": 0.0002844252563410685, 
"loss": 3.3229, "step": 48850 }, { "epoch": 5.272237196765499, "grad_norm": 0.7330331802368164, "learning_rate": 0.0002841014570966001, "loss": 3.338, "step": 48900 }, { "epoch": 5.277628032345014, "grad_norm": 0.7339238524436951, "learning_rate": 0.00028377765785213163, "loss": 3.3371, "step": 48950 }, { "epoch": 5.283018867924528, "grad_norm": 0.7294909358024597, "learning_rate": 0.00028345385860766324, "loss": 3.3434, "step": 49000 }, { "epoch": 5.283018867924528, "eval_accuracy": 0.38087085506889085, "eval_loss": 3.410123586654663, "eval_runtime": 187.9534, "eval_samples_per_second": 95.827, "eval_steps_per_second": 5.991, "step": 49000 }, { "epoch": 5.288409703504043, "grad_norm": 0.7706004977226257, "learning_rate": 0.0002831300593631948, "loss": 3.3418, "step": 49050 }, { "epoch": 5.293800539083558, "grad_norm": 0.7382911443710327, "learning_rate": 0.0002828062601187264, "loss": 3.354, "step": 49100 }, { "epoch": 5.2991913746630726, "grad_norm": 0.7077482342720032, "learning_rate": 0.00028248246087425794, "loss": 3.3486, "step": 49150 }, { "epoch": 5.304582210242588, "grad_norm": 0.7349934577941895, "learning_rate": 0.0002821586616297895, "loss": 3.3568, "step": 49200 }, { "epoch": 5.309973045822103, "grad_norm": 0.6874622702598572, "learning_rate": 0.0002818348623853211, "loss": 3.3349, "step": 49250 }, { "epoch": 5.315363881401617, "grad_norm": 0.7308844327926636, "learning_rate": 0.00028151106314085265, "loss": 3.3386, "step": 49300 }, { "epoch": 5.320754716981132, "grad_norm": 0.7208496928215027, "learning_rate": 0.0002811872638963842, "loss": 3.3406, "step": 49350 }, { "epoch": 5.3261455525606465, "grad_norm": 0.6990295052528381, "learning_rate": 0.0002808634646519158, "loss": 3.3326, "step": 49400 }, { "epoch": 5.331536388140162, "grad_norm": 0.6737046241760254, "learning_rate": 0.00028053966540744736, "loss": 3.3159, "step": 49450 }, { "epoch": 5.336927223719677, "grad_norm": 0.725805938243866, "learning_rate": 0.0002802158661629789, "loss": 3.3448, 
"step": 49500 }, { "epoch": 5.342318059299191, "grad_norm": 0.6815341711044312, "learning_rate": 0.0002798920669185105, "loss": 3.3416, "step": 49550 }, { "epoch": 5.347708894878706, "grad_norm": 0.7110295295715332, "learning_rate": 0.00027956826767404206, "loss": 3.3343, "step": 49600 }, { "epoch": 5.353099730458221, "grad_norm": 0.6776632070541382, "learning_rate": 0.00027924446842957367, "loss": 3.3388, "step": 49650 }, { "epoch": 5.3584905660377355, "grad_norm": 0.7187811136245728, "learning_rate": 0.0002789206691851052, "loss": 3.3225, "step": 49700 }, { "epoch": 5.363881401617251, "grad_norm": 0.7040181756019592, "learning_rate": 0.00027859686994063677, "loss": 3.3609, "step": 49750 }, { "epoch": 5.369272237196766, "grad_norm": 0.7089135646820068, "learning_rate": 0.0002782730706961683, "loss": 3.3371, "step": 49800 }, { "epoch": 5.37466307277628, "grad_norm": 0.7521438002586365, "learning_rate": 0.0002779492714516999, "loss": 3.3469, "step": 49850 }, { "epoch": 5.380053908355795, "grad_norm": 0.7226114869117737, "learning_rate": 0.00027762547220723147, "loss": 3.3395, "step": 49900 }, { "epoch": 5.38544474393531, "grad_norm": 0.6697371602058411, "learning_rate": 0.0002773016729627631, "loss": 3.3519, "step": 49950 }, { "epoch": 5.390835579514825, "grad_norm": 0.7129895091056824, "learning_rate": 0.00027697787371829463, "loss": 3.3462, "step": 50000 }, { "epoch": 5.390835579514825, "eval_accuracy": 0.38111282505005367, "eval_loss": 3.410655975341797, "eval_runtime": 187.6619, "eval_samples_per_second": 95.976, "eval_steps_per_second": 6.0, "step": 50000 }, { "epoch": 5.39622641509434, "grad_norm": 0.7076137661933899, "learning_rate": 0.00027665407447382623, "loss": 3.3291, "step": 50050 }, { "epoch": 5.401617250673855, "grad_norm": 0.7562270760536194, "learning_rate": 0.0002763302752293578, "loss": 3.354, "step": 50100 }, { "epoch": 5.407008086253369, "grad_norm": 0.6961677074432373, "learning_rate": 0.00027600647598488933, "loss": 3.3319, "step": 50150 }, { 
"epoch": 5.412398921832884, "grad_norm": 0.677556574344635, "learning_rate": 0.0002756826767404209, "loss": 3.3385, "step": 50200 }, { "epoch": 5.4177897574123985, "grad_norm": 0.7727808356285095, "learning_rate": 0.0002753588774959525, "loss": 3.3574, "step": 50250 }, { "epoch": 5.423180592991914, "grad_norm": 0.7122824788093567, "learning_rate": 0.00027503507825148404, "loss": 3.322, "step": 50300 }, { "epoch": 5.428571428571429, "grad_norm": 0.7861427664756775, "learning_rate": 0.00027471127900701564, "loss": 3.3397, "step": 50350 }, { "epoch": 5.433962264150943, "grad_norm": 0.6940593123435974, "learning_rate": 0.0002743874797625472, "loss": 3.3523, "step": 50400 }, { "epoch": 5.439353099730458, "grad_norm": 0.7210487723350525, "learning_rate": 0.0002740636805180788, "loss": 3.3354, "step": 50450 }, { "epoch": 5.444743935309973, "grad_norm": 0.7028635740280151, "learning_rate": 0.00027373988127361035, "loss": 3.3486, "step": 50500 }, { "epoch": 5.450134770889488, "grad_norm": 0.7137354016304016, "learning_rate": 0.0002734160820291419, "loss": 3.346, "step": 50550 }, { "epoch": 5.455525606469003, "grad_norm": 0.6994820237159729, "learning_rate": 0.00027309228278467345, "loss": 3.3351, "step": 50600 }, { "epoch": 5.460916442048518, "grad_norm": 0.6950395703315735, "learning_rate": 0.00027276848354020506, "loss": 3.3469, "step": 50650 }, { "epoch": 5.466307277628032, "grad_norm": 0.7250814437866211, "learning_rate": 0.0002724446842957366, "loss": 3.3623, "step": 50700 }, { "epoch": 5.471698113207547, "grad_norm": 0.7469569444656372, "learning_rate": 0.0002721208850512682, "loss": 3.3645, "step": 50750 }, { "epoch": 5.4770889487870615, "grad_norm": 0.7123376131057739, "learning_rate": 0.00027179708580679976, "loss": 3.3449, "step": 50800 }, { "epoch": 5.482479784366577, "grad_norm": 0.6993563771247864, "learning_rate": 0.0002714732865623313, "loss": 3.3459, "step": 50850 }, { "epoch": 5.487870619946092, "grad_norm": 0.6928504109382629, "learning_rate": 
0.0002711494873178629, "loss": 3.3604, "step": 50900 }, { "epoch": 5.493261455525606, "grad_norm": 0.708638608455658, "learning_rate": 0.00027082568807339447, "loss": 3.3473, "step": 50950 }, { "epoch": 5.498652291105121, "grad_norm": 0.6820019483566284, "learning_rate": 0.000270501888828926, "loss": 3.3481, "step": 51000 }, { "epoch": 5.498652291105121, "eval_accuracy": 0.38180364011077816, "eval_loss": 3.4029934406280518, "eval_runtime": 187.9527, "eval_samples_per_second": 95.827, "eval_steps_per_second": 5.991, "step": 51000 }, { "epoch": 5.504043126684636, "grad_norm": 0.7395426630973816, "learning_rate": 0.0002701780895844576, "loss": 3.3385, "step": 51050 }, { "epoch": 5.509433962264151, "grad_norm": 0.7084470391273499, "learning_rate": 0.0002698542903399892, "loss": 3.3299, "step": 51100 }, { "epoch": 5.514824797843666, "grad_norm": 0.7226378917694092, "learning_rate": 0.0002695304910955207, "loss": 3.3308, "step": 51150 }, { "epoch": 5.520215633423181, "grad_norm": 0.7400267124176025, "learning_rate": 0.00026920669185105233, "loss": 3.3319, "step": 51200 }, { "epoch": 5.525606469002695, "grad_norm": 0.6931690573692322, "learning_rate": 0.0002688828926065839, "loss": 3.3606, "step": 51250 }, { "epoch": 5.53099730458221, "grad_norm": 0.7145960927009583, "learning_rate": 0.0002685590933621155, "loss": 3.3511, "step": 51300 }, { "epoch": 5.536388140161725, "grad_norm": 0.7469211220741272, "learning_rate": 0.00026823529411764704, "loss": 3.352, "step": 51350 }, { "epoch": 5.54177897574124, "grad_norm": 0.7150426506996155, "learning_rate": 0.0002679114948731786, "loss": 3.3304, "step": 51400 }, { "epoch": 5.547169811320755, "grad_norm": 0.6985902190208435, "learning_rate": 0.0002675876956287102, "loss": 3.3355, "step": 51450 }, { "epoch": 5.55256064690027, "grad_norm": 0.7522373795509338, "learning_rate": 0.00026726389638424174, "loss": 3.3749, "step": 51500 }, { "epoch": 5.557951482479784, "grad_norm": 0.7140241861343384, "learning_rate": 0.0002669400971397733, 
"loss": 3.3353, "step": 51550 }, { "epoch": 5.563342318059299, "grad_norm": 0.7518497705459595, "learning_rate": 0.0002666162978953049, "loss": 3.35, "step": 51600 }, { "epoch": 5.568733153638814, "grad_norm": 0.6793280839920044, "learning_rate": 0.00026629249865083645, "loss": 3.336, "step": 51650 }, { "epoch": 5.574123989218329, "grad_norm": 0.7399218678474426, "learning_rate": 0.00026596869940636805, "loss": 3.3401, "step": 51700 }, { "epoch": 5.579514824797844, "grad_norm": 0.6799671649932861, "learning_rate": 0.0002656449001618996, "loss": 3.3521, "step": 51750 }, { "epoch": 5.584905660377358, "grad_norm": 0.7114550471305847, "learning_rate": 0.0002653211009174312, "loss": 3.3258, "step": 51800 }, { "epoch": 5.590296495956873, "grad_norm": 0.6648984551429749, "learning_rate": 0.0002649973016729627, "loss": 3.3234, "step": 51850 }, { "epoch": 5.595687331536388, "grad_norm": 0.6796683073043823, "learning_rate": 0.0002646735024284943, "loss": 3.3763, "step": 51900 }, { "epoch": 5.601078167115903, "grad_norm": 0.7021110653877258, "learning_rate": 0.00026434970318402586, "loss": 3.3425, "step": 51950 }, { "epoch": 5.606469002695418, "grad_norm": 0.6809260249137878, "learning_rate": 0.00026402590393955746, "loss": 3.3563, "step": 52000 }, { "epoch": 5.606469002695418, "eval_accuracy": 0.382167409979181, "eval_loss": 3.3986570835113525, "eval_runtime": 187.5942, "eval_samples_per_second": 96.01, "eval_steps_per_second": 6.002, "step": 52000 }, { "epoch": 5.611859838274933, "grad_norm": 0.7304422855377197, "learning_rate": 0.000263702104695089, "loss": 3.3322, "step": 52050 }, { "epoch": 5.617250673854447, "grad_norm": 0.7089975476264954, "learning_rate": 0.0002633783054506206, "loss": 3.3526, "step": 52100 }, { "epoch": 5.622641509433962, "grad_norm": 0.7025209665298462, "learning_rate": 0.00026305450620615217, "loss": 3.326, "step": 52150 }, { "epoch": 5.628032345013477, "grad_norm": 0.7471228837966919, "learning_rate": 0.0002627307069616837, "loss": 3.3258, "step": 
52200 }, { "epoch": 5.633423180592992, "grad_norm": 0.7315310835838318, "learning_rate": 0.0002624069077172153, "loss": 3.3439, "step": 52250 }, { "epoch": 5.638814016172507, "grad_norm": 0.7284122705459595, "learning_rate": 0.0002620831084727469, "loss": 3.3585, "step": 52300 }, { "epoch": 5.644204851752022, "grad_norm": 0.7069854736328125, "learning_rate": 0.0002617593092282784, "loss": 3.3298, "step": 52350 }, { "epoch": 5.649595687331536, "grad_norm": 0.7223303318023682, "learning_rate": 0.00026143550998381003, "loss": 3.3243, "step": 52400 }, { "epoch": 5.654986522911051, "grad_norm": 0.7994519472122192, "learning_rate": 0.0002611117107393416, "loss": 3.3368, "step": 52450 }, { "epoch": 5.660377358490566, "grad_norm": 0.7556917667388916, "learning_rate": 0.00026078791149487313, "loss": 3.3491, "step": 52500 }, { "epoch": 5.665768194070081, "grad_norm": 0.7107183933258057, "learning_rate": 0.00026046411225040474, "loss": 3.3432, "step": 52550 }, { "epoch": 5.671159029649596, "grad_norm": 0.8043556809425354, "learning_rate": 0.0002601403130059363, "loss": 3.3448, "step": 52600 }, { "epoch": 5.67654986522911, "grad_norm": 0.7135907411575317, "learning_rate": 0.0002598165137614679, "loss": 3.3574, "step": 52650 }, { "epoch": 5.681940700808625, "grad_norm": 0.738898515701294, "learning_rate": 0.00025949271451699944, "loss": 3.3589, "step": 52700 }, { "epoch": 5.6873315363881405, "grad_norm": 0.7450215220451355, "learning_rate": 0.000259168915272531, "loss": 3.3521, "step": 52750 }, { "epoch": 5.692722371967655, "grad_norm": 0.7098891735076904, "learning_rate": 0.0002588451160280626, "loss": 3.332, "step": 52800 }, { "epoch": 5.69811320754717, "grad_norm": 0.7791381478309631, "learning_rate": 0.00025852131678359415, "loss": 3.3358, "step": 52850 }, { "epoch": 5.703504043126685, "grad_norm": 0.7195214629173279, "learning_rate": 0.0002581975175391257, "loss": 3.3575, "step": 52900 }, { "epoch": 5.708894878706199, "grad_norm": 0.7167583703994751, "learning_rate": 
0.0002578737182946573, "loss": 3.3528, "step": 52950 }, { "epoch": 5.714285714285714, "grad_norm": 0.6789204478263855, "learning_rate": 0.00025754991905018885, "loss": 3.3483, "step": 53000 }, { "epoch": 5.714285714285714, "eval_accuracy": 0.38240546845638257, "eval_loss": 3.3940911293029785, "eval_runtime": 188.0896, "eval_samples_per_second": 95.758, "eval_steps_per_second": 5.987, "step": 53000 }, { "epoch": 5.719676549865229, "grad_norm": 0.7385661602020264, "learning_rate": 0.00025722611980572046, "loss": 3.3309, "step": 53050 }, { "epoch": 5.725067385444744, "grad_norm": 0.7592161297798157, "learning_rate": 0.000256902320561252, "loss": 3.3538, "step": 53100 }, { "epoch": 5.730458221024259, "grad_norm": 0.7699127197265625, "learning_rate": 0.00025657852131678356, "loss": 3.3546, "step": 53150 }, { "epoch": 5.735849056603773, "grad_norm": 0.7535634636878967, "learning_rate": 0.0002562547220723151, "loss": 3.3488, "step": 53200 }, { "epoch": 5.741239892183288, "grad_norm": 0.7500416040420532, "learning_rate": 0.0002559309228278467, "loss": 3.3408, "step": 53250 }, { "epoch": 5.7466307277628035, "grad_norm": 0.70904141664505, "learning_rate": 0.00025560712358337827, "loss": 3.3645, "step": 53300 }, { "epoch": 5.752021563342318, "grad_norm": 0.739777147769928, "learning_rate": 0.00025528332433890987, "loss": 3.3288, "step": 53350 }, { "epoch": 5.757412398921833, "grad_norm": 0.7218491435050964, "learning_rate": 0.0002549595250944414, "loss": 3.3466, "step": 53400 }, { "epoch": 5.762803234501348, "grad_norm": 0.7896709442138672, "learning_rate": 0.000254635725849973, "loss": 3.3365, "step": 53450 }, { "epoch": 5.768194070080862, "grad_norm": 0.7408962845802307, "learning_rate": 0.0002543119266055046, "loss": 3.349, "step": 53500 }, { "epoch": 5.773584905660377, "grad_norm": 0.7032009959220886, "learning_rate": 0.00025398812736103613, "loss": 3.3334, "step": 53550 }, { "epoch": 5.7789757412398925, "grad_norm": 0.807627260684967, "learning_rate": 
0.0002536643281165677, "loss": 3.3306, "step": 53600 }, { "epoch": 5.784366576819407, "grad_norm": 0.6804621815681458, "learning_rate": 0.0002533405288720993, "loss": 3.3277, "step": 53650 }, { "epoch": 5.789757412398922, "grad_norm": 0.7353481650352478, "learning_rate": 0.00025301672962763083, "loss": 3.338, "step": 53700 }, { "epoch": 5.795148247978437, "grad_norm": 0.7213873267173767, "learning_rate": 0.0002526994063680518, "loss": 3.3453, "step": 53750 }, { "epoch": 5.800539083557951, "grad_norm": 0.751306414604187, "learning_rate": 0.0002523756071235834, "loss": 3.3533, "step": 53800 }, { "epoch": 5.8059299191374665, "grad_norm": 0.6616135239601135, "learning_rate": 0.00025205180787911494, "loss": 3.3369, "step": 53850 }, { "epoch": 5.811320754716981, "grad_norm": 0.7167936563491821, "learning_rate": 0.0002517280086346465, "loss": 3.3414, "step": 53900 }, { "epoch": 5.816711590296496, "grad_norm": 0.7028331160545349, "learning_rate": 0.00025140420939017804, "loss": 3.3411, "step": 53950 }, { "epoch": 5.822102425876011, "grad_norm": 0.6635711193084717, "learning_rate": 0.00025108041014570965, "loss": 3.3368, "step": 54000 }, { "epoch": 5.822102425876011, "eval_accuracy": 0.38277771325003496, "eval_loss": 3.389596700668335, "eval_runtime": 187.815, "eval_samples_per_second": 95.898, "eval_steps_per_second": 5.995, "step": 54000 }, { "epoch": 5.827493261455525, "grad_norm": 0.7287641167640686, "learning_rate": 0.0002507566109012412, "loss": 3.3552, "step": 54050 }, { "epoch": 5.83288409703504, "grad_norm": 0.7763833999633789, "learning_rate": 0.0002504328116567728, "loss": 3.3402, "step": 54100 }, { "epoch": 5.8382749326145555, "grad_norm": 0.7085495591163635, "learning_rate": 0.00025010901241230435, "loss": 3.3601, "step": 54150 }, { "epoch": 5.84366576819407, "grad_norm": 0.724402904510498, "learning_rate": 0.0002497852131678359, "loss": 3.3646, "step": 54200 }, { "epoch": 5.849056603773585, "grad_norm": 0.7037736773490906, "learning_rate": 
0.0002494614139233675, "loss": 3.3403, "step": 54250 }, { "epoch": 5.8544474393531, "grad_norm": 0.751995325088501, "learning_rate": 0.00024913761467889906, "loss": 3.3494, "step": 54300 }, { "epoch": 5.859838274932614, "grad_norm": 0.749474287033081, "learning_rate": 0.0002488138154344306, "loss": 3.352, "step": 54350 }, { "epoch": 5.8652291105121295, "grad_norm": 0.7056989073753357, "learning_rate": 0.0002484900161899622, "loss": 3.3472, "step": 54400 }, { "epoch": 5.870619946091644, "grad_norm": 0.7020218372344971, "learning_rate": 0.00024816621694549376, "loss": 3.3353, "step": 54450 }, { "epoch": 5.876010781671159, "grad_norm": 0.7736152410507202, "learning_rate": 0.00024784241770102537, "loss": 3.3473, "step": 54500 }, { "epoch": 5.881401617250674, "grad_norm": 0.7925509214401245, "learning_rate": 0.0002475186184565569, "loss": 3.3463, "step": 54550 }, { "epoch": 5.886792452830189, "grad_norm": 0.7352725267410278, "learning_rate": 0.00024719481921208847, "loss": 3.346, "step": 54600 }, { "epoch": 5.892183288409703, "grad_norm": 0.7285502552986145, "learning_rate": 0.0002468710199676201, "loss": 3.3522, "step": 54650 }, { "epoch": 5.8975741239892185, "grad_norm": 0.7065455913543701, "learning_rate": 0.0002465472207231516, "loss": 3.3408, "step": 54700 }, { "epoch": 5.902964959568733, "grad_norm": 0.8250967264175415, "learning_rate": 0.0002462234214786832, "loss": 3.3413, "step": 54750 }, { "epoch": 5.908355795148248, "grad_norm": 0.7509498596191406, "learning_rate": 0.0002458996222342148, "loss": 3.339, "step": 54800 }, { "epoch": 5.913746630727763, "grad_norm": 0.6860405802726746, "learning_rate": 0.00024557582298974633, "loss": 3.3402, "step": 54850 }, { "epoch": 5.919137466307277, "grad_norm": 0.746910035610199, "learning_rate": 0.0002452520237452779, "loss": 3.3429, "step": 54900 }, { "epoch": 5.9245283018867925, "grad_norm": 0.8592144250869751, "learning_rate": 0.0002449282245008095, "loss": 3.3498, "step": 54950 }, { "epoch": 5.929919137466308, 
"grad_norm": 0.7327863574028015, "learning_rate": 0.00024460442525634104, "loss": 3.3578, "step": 55000 }, { "epoch": 5.929919137466308, "eval_accuracy": 0.38393095500129787, "eval_loss": 3.3858537673950195, "eval_runtime": 188.1736, "eval_samples_per_second": 95.715, "eval_steps_per_second": 5.984, "step": 55000 }, { "epoch": 5.935309973045822, "grad_norm": 0.7130724787712097, "learning_rate": 0.00024428062601187264, "loss": 3.3619, "step": 55050 }, { "epoch": 5.940700808625337, "grad_norm": 0.7520925998687744, "learning_rate": 0.0002439568267674042, "loss": 3.3506, "step": 55100 }, { "epoch": 5.946091644204852, "grad_norm": 0.7618809938430786, "learning_rate": 0.00024363302752293574, "loss": 3.3459, "step": 55150 }, { "epoch": 5.951482479784366, "grad_norm": 0.757305383682251, "learning_rate": 0.00024330922827846732, "loss": 3.3539, "step": 55200 }, { "epoch": 5.9568733153638815, "grad_norm": 0.7665929198265076, "learning_rate": 0.0002429854290339989, "loss": 3.3594, "step": 55250 }, { "epoch": 5.962264150943396, "grad_norm": 0.7202193737030029, "learning_rate": 0.00024266162978953048, "loss": 3.3256, "step": 55300 }, { "epoch": 5.967654986522911, "grad_norm": 0.7906439304351807, "learning_rate": 0.00024233783054506203, "loss": 3.3373, "step": 55350 }, { "epoch": 5.973045822102426, "grad_norm": 0.6932725310325623, "learning_rate": 0.0002420140313005936, "loss": 3.3375, "step": 55400 }, { "epoch": 5.97843665768194, "grad_norm": 0.7889389991760254, "learning_rate": 0.00024169023205612518, "loss": 3.3301, "step": 55450 }, { "epoch": 5.9838274932614555, "grad_norm": 0.7442350387573242, "learning_rate": 0.00024136643281165676, "loss": 3.351, "step": 55500 }, { "epoch": 5.989218328840971, "grad_norm": 0.7058565616607666, "learning_rate": 0.00024104263356718834, "loss": 3.3421, "step": 55550 }, { "epoch": 5.994609164420485, "grad_norm": 0.7452341914176941, "learning_rate": 0.0002407188343227199, "loss": 3.3257, "step": 55600 }, { "epoch": 6.0, "grad_norm": 
1.5780168771743774, "learning_rate": 0.00024039503507825147, "loss": 3.341, "step": 55650 }, { "epoch": 6.005390835579515, "grad_norm": 0.7813258767127991, "learning_rate": 0.00024007123583378302, "loss": 3.2441, "step": 55700 }, { "epoch": 6.010781671159029, "grad_norm": 0.7337445020675659, "learning_rate": 0.0002397474365893146, "loss": 3.2507, "step": 55750 }, { "epoch": 6.0161725067385445, "grad_norm": 0.7185764312744141, "learning_rate": 0.00023943011332973555, "loss": 3.2637, "step": 55800 }, { "epoch": 6.02156334231806, "grad_norm": 0.7511305212974548, "learning_rate": 0.00023910631408526712, "loss": 3.2531, "step": 55850 }, { "epoch": 6.026954177897574, "grad_norm": 0.7478423714637756, "learning_rate": 0.0002387825148407987, "loss": 3.2628, "step": 55900 }, { "epoch": 6.032345013477089, "grad_norm": 0.7264851927757263, "learning_rate": 0.00023845871559633025, "loss": 3.2552, "step": 55950 }, { "epoch": 6.037735849056604, "grad_norm": 0.7317351698875427, "learning_rate": 0.0002381349163518618, "loss": 3.2562, "step": 56000 }, { "epoch": 6.037735849056604, "eval_accuracy": 0.3837084338870538, "eval_loss": 3.387885570526123, "eval_runtime": 187.6852, "eval_samples_per_second": 95.964, "eval_steps_per_second": 5.999, "step": 56000 }, { "epoch": 6.0431266846361185, "grad_norm": 0.8035656213760376, "learning_rate": 0.00023781111710739338, "loss": 3.2558, "step": 56050 }, { "epoch": 6.048517520215634, "grad_norm": 0.7192986011505127, "learning_rate": 0.00023748731786292496, "loss": 3.2742, "step": 56100 }, { "epoch": 6.053908355795148, "grad_norm": 0.6871985197067261, "learning_rate": 0.00023716351861845654, "loss": 3.2518, "step": 56150 }, { "epoch": 6.059299191374663, "grad_norm": 0.6761716604232788, "learning_rate": 0.0002368397193739881, "loss": 3.2607, "step": 56200 }, { "epoch": 6.064690026954178, "grad_norm": 0.7683160305023193, "learning_rate": 0.0002365159201295197, "loss": 3.2775, "step": 56250 }, { "epoch": 6.070080862533692, "grad_norm": 
0.7152441143989563, "learning_rate": 0.00023619212088505127, "loss": 3.2585, "step": 56300 }, { "epoch": 6.0754716981132075, "grad_norm": 0.7848925590515137, "learning_rate": 0.0002358683216405828, "loss": 3.2735, "step": 56350 }, { "epoch": 6.080862533692723, "grad_norm": 0.7362651228904724, "learning_rate": 0.00023554452239611437, "loss": 3.273, "step": 56400 }, { "epoch": 6.086253369272237, "grad_norm": 0.7312922477722168, "learning_rate": 0.00023522072315164595, "loss": 3.2636, "step": 56450 }, { "epoch": 6.091644204851752, "grad_norm": 0.7509818077087402, "learning_rate": 0.0002349033998920669, "loss": 3.2747, "step": 56500 }, { "epoch": 6.097035040431267, "grad_norm": 0.7297114729881287, "learning_rate": 0.00023457960064759848, "loss": 3.2905, "step": 56550 }, { "epoch": 6.1024258760107815, "grad_norm": 0.7270781993865967, "learning_rate": 0.00023425580140313005, "loss": 3.2893, "step": 56600 }, { "epoch": 6.107816711590297, "grad_norm": 0.7570849657058716, "learning_rate": 0.00023393200215866163, "loss": 3.2655, "step": 56650 }, { "epoch": 6.113207547169812, "grad_norm": 0.7342410087585449, "learning_rate": 0.00023360820291419316, "loss": 3.2849, "step": 56700 }, { "epoch": 6.118598382749326, "grad_norm": 0.7107300758361816, "learning_rate": 0.00023328440366972473, "loss": 3.2713, "step": 56750 }, { "epoch": 6.123989218328841, "grad_norm": 0.73293536901474, "learning_rate": 0.0002329606044252563, "loss": 3.2649, "step": 56800 }, { "epoch": 6.129380053908355, "grad_norm": 0.7688845992088318, "learning_rate": 0.0002326368051807879, "loss": 3.2661, "step": 56850 }, { "epoch": 6.1347708894878705, "grad_norm": 0.7595694065093994, "learning_rate": 0.00023231300593631947, "loss": 3.2667, "step": 56900 }, { "epoch": 6.140161725067386, "grad_norm": 0.7559347152709961, "learning_rate": 0.00023198920669185104, "loss": 3.2798, "step": 56950 }, { "epoch": 6.1455525606469, "grad_norm": 0.7553902268409729, "learning_rate": 0.00023166540744738262, "loss": 3.2748, "step": 
57000 }, { "epoch": 6.1455525606469, "eval_accuracy": 0.3838181733037464, "eval_loss": 3.3863282203674316, "eval_runtime": 187.9204, "eval_samples_per_second": 95.844, "eval_steps_per_second": 5.992, "step": 57000 }, { "epoch": 6.150943396226415, "grad_norm": 0.7465497851371765, "learning_rate": 0.0002313416082029142, "loss": 3.2845, "step": 57050 }, { "epoch": 6.15633423180593, "grad_norm": 0.7718679308891296, "learning_rate": 0.00023101780895844572, "loss": 3.2673, "step": 57100 }, { "epoch": 6.1617250673854445, "grad_norm": 0.8121668696403503, "learning_rate": 0.0002306940097139773, "loss": 3.276, "step": 57150 }, { "epoch": 6.16711590296496, "grad_norm": 0.7367902994155884, "learning_rate": 0.00023037021046950888, "loss": 3.2705, "step": 57200 }, { "epoch": 6.172506738544475, "grad_norm": 0.7376331686973572, "learning_rate": 0.00023004641122504046, "loss": 3.2723, "step": 57250 }, { "epoch": 6.177897574123989, "grad_norm": 0.8236753940582275, "learning_rate": 0.00022972261198057203, "loss": 3.2745, "step": 57300 }, { "epoch": 6.183288409703504, "grad_norm": 0.8473188877105713, "learning_rate": 0.0002293988127361036, "loss": 3.2814, "step": 57350 }, { "epoch": 6.188679245283019, "grad_norm": 0.7933148741722107, "learning_rate": 0.00022907501349163516, "loss": 3.2874, "step": 57400 }, { "epoch": 6.1940700808625335, "grad_norm": 0.7893574833869934, "learning_rate": 0.00022875121424716674, "loss": 3.3015, "step": 57450 }, { "epoch": 6.199460916442049, "grad_norm": 0.7557196021080017, "learning_rate": 0.00022842741500269832, "loss": 3.2545, "step": 57500 }, { "epoch": 6.204851752021563, "grad_norm": 0.7800120115280151, "learning_rate": 0.00022810361575822987, "loss": 3.3058, "step": 57550 }, { "epoch": 6.210242587601078, "grad_norm": 0.7379505038261414, "learning_rate": 0.00022777981651376145, "loss": 3.2753, "step": 57600 }, { "epoch": 6.215633423180593, "grad_norm": 0.7617130279541016, "learning_rate": 0.00022745601726929302, "loss": 3.2784, "step": 57650 }, { 
"epoch": 6.2210242587601075, "grad_norm": 0.7396572828292847, "learning_rate": 0.00022713221802482457, "loss": 3.2981, "step": 57700 }, { "epoch": 6.226415094339623, "grad_norm": 0.7271841764450073, "learning_rate": 0.00022680841878035615, "loss": 3.2814, "step": 57750 }, { "epoch": 6.231805929919138, "grad_norm": 0.7972626686096191, "learning_rate": 0.00022648461953588773, "loss": 3.2878, "step": 57800 }, { "epoch": 6.237196765498652, "grad_norm": 0.7392578125, "learning_rate": 0.0002261608202914193, "loss": 3.2872, "step": 57850 }, { "epoch": 6.242587601078167, "grad_norm": 0.7566655874252319, "learning_rate": 0.00022583702104695088, "loss": 3.287, "step": 57900 }, { "epoch": 6.247978436657682, "grad_norm": 0.7734372615814209, "learning_rate": 0.00022551322180248246, "loss": 3.2726, "step": 57950 }, { "epoch": 6.2533692722371965, "grad_norm": 0.7498614192008972, "learning_rate": 0.00022518942255801399, "loss": 3.292, "step": 58000 }, { "epoch": 6.2533692722371965, "eval_accuracy": 0.3842432234008767, "eval_loss": 3.3843281269073486, "eval_runtime": 187.9288, "eval_samples_per_second": 95.84, "eval_steps_per_second": 5.992, "step": 58000 }, { "epoch": 6.258760107816712, "grad_norm": 0.7295949459075928, "learning_rate": 0.00022486562331354556, "loss": 3.282, "step": 58050 }, { "epoch": 6.264150943396227, "grad_norm": 0.7549505829811096, "learning_rate": 0.00022454182406907714, "loss": 3.2869, "step": 58100 }, { "epoch": 6.269541778975741, "grad_norm": 0.7437057495117188, "learning_rate": 0.00022421802482460872, "loss": 3.2805, "step": 58150 }, { "epoch": 6.274932614555256, "grad_norm": 0.7324735522270203, "learning_rate": 0.0002238942255801403, "loss": 3.279, "step": 58200 }, { "epoch": 6.280323450134771, "grad_norm": 0.8076735734939575, "learning_rate": 0.00022357042633567187, "loss": 3.2862, "step": 58250 }, { "epoch": 6.285714285714286, "grad_norm": 0.747568666934967, "learning_rate": 0.00022324662709120345, "loss": 3.2721, "step": 58300 }, { "epoch": 
6.291105121293801, "grad_norm": 0.7982004284858704, "learning_rate": 0.00022292282784673503, "loss": 3.3009, "step": 58350 }, { "epoch": 6.296495956873315, "grad_norm": 0.8082500696182251, "learning_rate": 0.00022259902860226655, "loss": 3.2826, "step": 58400 }, { "epoch": 6.30188679245283, "grad_norm": 0.8358715176582336, "learning_rate": 0.00022227522935779813, "loss": 3.2843, "step": 58450 }, { "epoch": 6.307277628032345, "grad_norm": 0.8117343187332153, "learning_rate": 0.0002219514301133297, "loss": 3.275, "step": 58500 }, { "epoch": 6.3126684636118595, "grad_norm": 0.7375354170799255, "learning_rate": 0.00022162763086886129, "loss": 3.285, "step": 58550 }, { "epoch": 6.318059299191375, "grad_norm": 0.7584719061851501, "learning_rate": 0.00022130383162439286, "loss": 3.3029, "step": 58600 }, { "epoch": 6.32345013477089, "grad_norm": 0.7963485717773438, "learning_rate": 0.00022098003237992444, "loss": 3.279, "step": 58650 }, { "epoch": 6.328840970350404, "grad_norm": 0.7515376806259155, "learning_rate": 0.00022065623313545602, "loss": 3.292, "step": 58700 }, { "epoch": 6.334231805929919, "grad_norm": 0.7710037231445312, "learning_rate": 0.00022033243389098757, "loss": 3.2945, "step": 58750 }, { "epoch": 6.339622641509434, "grad_norm": 0.7963765859603882, "learning_rate": 0.00022000863464651915, "loss": 3.296, "step": 58800 }, { "epoch": 6.345013477088949, "grad_norm": 0.7372424006462097, "learning_rate": 0.0002196848354020507, "loss": 3.28, "step": 58850 }, { "epoch": 6.350404312668464, "grad_norm": 0.7944062352180481, "learning_rate": 0.00021936103615758227, "loss": 3.2815, "step": 58900 }, { "epoch": 6.355795148247978, "grad_norm": 0.7355282306671143, "learning_rate": 0.00021903723691311385, "loss": 3.2831, "step": 58950 }, { "epoch": 6.361185983827493, "grad_norm": 0.7471590042114258, "learning_rate": 0.00021871343766864543, "loss": 3.2756, "step": 59000 }, { "epoch": 6.361185983827493, "eval_accuracy": 0.3848307095652896, "eval_loss": 3.3796348571777344, 
"eval_runtime": 187.8514, "eval_samples_per_second": 95.879, "eval_steps_per_second": 5.994, "step": 59000 }, { "epoch": 6.366576819407008, "grad_norm": 0.7613081932067871, "learning_rate": 0.00021838963842417698, "loss": 3.2832, "step": 59050 }, { "epoch": 6.3719676549865225, "grad_norm": 0.7622696757316589, "learning_rate": 0.00021806583917970856, "loss": 3.2789, "step": 59100 }, { "epoch": 6.377358490566038, "grad_norm": 0.72235506772995, "learning_rate": 0.00021774203993524014, "loss": 3.2904, "step": 59150 }, { "epoch": 6.382749326145553, "grad_norm": 0.8166873455047607, "learning_rate": 0.00021741824069077171, "loss": 3.2955, "step": 59200 }, { "epoch": 6.388140161725067, "grad_norm": 0.7647126913070679, "learning_rate": 0.00021709444144630326, "loss": 3.2857, "step": 59250 }, { "epoch": 6.393530997304582, "grad_norm": 0.7704625129699707, "learning_rate": 0.00021677064220183484, "loss": 3.3068, "step": 59300 }, { "epoch": 6.398921832884097, "grad_norm": 0.7548606395721436, "learning_rate": 0.00021644684295736642, "loss": 3.2837, "step": 59350 }, { "epoch": 6.404312668463612, "grad_norm": 0.695271909236908, "learning_rate": 0.00021612304371289797, "loss": 3.2807, "step": 59400 }, { "epoch": 6.409703504043127, "grad_norm": 0.7588513493537903, "learning_rate": 0.00021579924446842955, "loss": 3.2976, "step": 59450 }, { "epoch": 6.415094339622642, "grad_norm": 0.8067427277565002, "learning_rate": 0.00021547544522396113, "loss": 3.2828, "step": 59500 }, { "epoch": 6.420485175202156, "grad_norm": 0.777260959148407, "learning_rate": 0.0002151516459794927, "loss": 3.2736, "step": 59550 }, { "epoch": 6.425876010781671, "grad_norm": 0.7520415186882019, "learning_rate": 0.00021482784673502428, "loss": 3.2852, "step": 59600 }, { "epoch": 6.431266846361186, "grad_norm": 0.7825960516929626, "learning_rate": 0.00021450404749055586, "loss": 3.2913, "step": 59650 }, { "epoch": 6.436657681940701, "grad_norm": 0.7923210859298706, "learning_rate": 0.00021418024824608738, "loss": 
3.2861, "step": 59700 }, { "epoch": 6.442048517520216, "grad_norm": 0.7896773815155029, "learning_rate": 0.00021385644900161896, "loss": 3.2976, "step": 59750 }, { "epoch": 6.44743935309973, "grad_norm": 0.7635443210601807, "learning_rate": 0.00021353264975715054, "loss": 3.2877, "step": 59800 }, { "epoch": 6.452830188679245, "grad_norm": 0.7932662963867188, "learning_rate": 0.00021320885051268211, "loss": 3.2946, "step": 59850 }, { "epoch": 6.45822102425876, "grad_norm": 0.7560425996780396, "learning_rate": 0.0002128850512682137, "loss": 3.2832, "step": 59900 }, { "epoch": 6.463611859838275, "grad_norm": 0.7501300573348999, "learning_rate": 0.00021256125202374527, "loss": 3.2952, "step": 59950 }, { "epoch": 6.46900269541779, "grad_norm": 0.7904677987098694, "learning_rate": 0.00021223745277927685, "loss": 3.2928, "step": 60000 }, { "epoch": 6.46900269541779, "eval_accuracy": 0.384727380668978, "eval_loss": 3.3773369789123535, "eval_runtime": 188.1838, "eval_samples_per_second": 95.71, "eval_steps_per_second": 5.984, "step": 60000 }, { "epoch": 6.474393530997305, "grad_norm": 0.7343600392341614, "learning_rate": 0.00021191365353480843, "loss": 3.3014, "step": 60050 }, { "epoch": 6.479784366576819, "grad_norm": 0.777730405330658, "learning_rate": 0.00021158985429033995, "loss": 3.2978, "step": 60100 }, { "epoch": 6.485175202156334, "grad_norm": 0.7713602185249329, "learning_rate": 0.00021126605504587153, "loss": 3.2898, "step": 60150 }, { "epoch": 6.490566037735849, "grad_norm": 0.8683859705924988, "learning_rate": 0.0002109422558014031, "loss": 3.3135, "step": 60200 }, { "epoch": 6.495956873315364, "grad_norm": 0.7600846290588379, "learning_rate": 0.00021061845655693468, "loss": 3.2873, "step": 60250 }, { "epoch": 6.501347708894879, "grad_norm": 0.7918633818626404, "learning_rate": 0.00021029465731246626, "loss": 3.2922, "step": 60300 }, { "epoch": 6.506738544474393, "grad_norm": 0.7769505381584167, "learning_rate": 0.0002099773340528872, "loss": 3.2941, "step": 
60350 }, { "epoch": 6.512129380053908, "grad_norm": 0.8223928809165955, "learning_rate": 0.0002096535348084188, "loss": 3.2921, "step": 60400 }, { "epoch": 6.517520215633423, "grad_norm": 0.7309505939483643, "learning_rate": 0.0002093297355639503, "loss": 3.3065, "step": 60450 }, { "epoch": 6.5229110512129385, "grad_norm": 0.7776194214820862, "learning_rate": 0.0002090059363194819, "loss": 3.2934, "step": 60500 }, { "epoch": 6.528301886792453, "grad_norm": 0.7898682951927185, "learning_rate": 0.00020868213707501347, "loss": 3.3116, "step": 60550 }, { "epoch": 6.533692722371968, "grad_norm": 0.8079066872596741, "learning_rate": 0.00020835833783054505, "loss": 3.3073, "step": 60600 }, { "epoch": 6.539083557951482, "grad_norm": 0.8752650022506714, "learning_rate": 0.00020803453858607662, "loss": 3.2962, "step": 60650 }, { "epoch": 6.544474393530997, "grad_norm": 0.8422349691390991, "learning_rate": 0.0002077107393416082, "loss": 3.3121, "step": 60700 }, { "epoch": 6.549865229110512, "grad_norm": 0.7801697850227356, "learning_rate": 0.00020738694009713975, "loss": 3.2964, "step": 60750 }, { "epoch": 6.555256064690027, "grad_norm": 0.732362687587738, "learning_rate": 0.00020706314085267133, "loss": 3.2881, "step": 60800 }, { "epoch": 6.560646900269542, "grad_norm": 0.8021662831306458, "learning_rate": 0.00020673934160820288, "loss": 3.3067, "step": 60850 }, { "epoch": 6.566037735849057, "grad_norm": 0.8009557127952576, "learning_rate": 0.00020641554236373446, "loss": 3.2876, "step": 60900 }, { "epoch": 6.571428571428571, "grad_norm": 0.7953367233276367, "learning_rate": 0.00020609174311926604, "loss": 3.2895, "step": 60950 }, { "epoch": 6.576819407008086, "grad_norm": 0.7843896150588989, "learning_rate": 0.0002057679438747976, "loss": 3.2836, "step": 61000 }, { "epoch": 6.576819407008086, "eval_accuracy": 0.3854018977965303, "eval_loss": 3.373260498046875, "eval_runtime": 187.8909, "eval_samples_per_second": 95.859, "eval_steps_per_second": 5.993, "step": 61000 }, { 
"epoch": 6.5822102425876015, "grad_norm": 0.7954122424125671, "learning_rate": 0.00020544414463032916, "loss": 3.2847, "step": 61050 }, { "epoch": 6.587601078167116, "grad_norm": 0.7902364730834961, "learning_rate": 0.00020512034538586074, "loss": 3.2728, "step": 61100 }, { "epoch": 6.592991913746631, "grad_norm": 0.7611752152442932, "learning_rate": 0.00020479654614139232, "loss": 3.2865, "step": 61150 }, { "epoch": 6.598382749326145, "grad_norm": 0.8043197989463806, "learning_rate": 0.0002044727468969239, "loss": 3.2878, "step": 61200 }, { "epoch": 6.60377358490566, "grad_norm": 0.7516340613365173, "learning_rate": 0.00020414894765245547, "loss": 3.2894, "step": 61250 }, { "epoch": 6.609164420485175, "grad_norm": 0.7979243397712708, "learning_rate": 0.00020382514840798702, "loss": 3.2967, "step": 61300 }, { "epoch": 6.6145552560646905, "grad_norm": 0.7496867775917053, "learning_rate": 0.0002035013491635186, "loss": 3.2942, "step": 61350 }, { "epoch": 6.619946091644205, "grad_norm": 0.8551615476608276, "learning_rate": 0.00020317754991905015, "loss": 3.2784, "step": 61400 }, { "epoch": 6.62533692722372, "grad_norm": 0.7860953211784363, "learning_rate": 0.00020285375067458173, "loss": 3.3065, "step": 61450 }, { "epoch": 6.630727762803234, "grad_norm": 0.8140352964401245, "learning_rate": 0.0002025299514301133, "loss": 3.2856, "step": 61500 }, { "epoch": 6.636118598382749, "grad_norm": 0.7997155785560608, "learning_rate": 0.00020220615218564489, "loss": 3.3001, "step": 61550 }, { "epoch": 6.6415094339622645, "grad_norm": 0.7644395232200623, "learning_rate": 0.00020188235294117646, "loss": 3.2859, "step": 61600 }, { "epoch": 6.646900269541779, "grad_norm": 0.7517040371894836, "learning_rate": 0.00020155855369670804, "loss": 3.2825, "step": 61650 }, { "epoch": 6.652291105121294, "grad_norm": 0.8023728132247925, "learning_rate": 0.00020123475445223956, "loss": 3.3109, "step": 61700 }, { "epoch": 6.657681940700809, "grad_norm": 0.7633260488510132, "learning_rate": 
0.00020091095520777114, "loss": 3.2998, "step": 61750 }, { "epoch": 6.663072776280323, "grad_norm": 0.8267096281051636, "learning_rate": 0.00020058715596330272, "loss": 3.2899, "step": 61800 }, { "epoch": 6.668463611859838, "grad_norm": 0.8023743629455566, "learning_rate": 0.0002002633567188343, "loss": 3.2966, "step": 61850 }, { "epoch": 6.6738544474393535, "grad_norm": 0.8002311587333679, "learning_rate": 0.00019993955747436588, "loss": 3.2937, "step": 61900 }, { "epoch": 6.679245283018868, "grad_norm": 0.8148394823074341, "learning_rate": 0.00019961575822989745, "loss": 3.2936, "step": 61950 }, { "epoch": 6.684636118598383, "grad_norm": 0.80229651927948, "learning_rate": 0.00019929195898542903, "loss": 3.307, "step": 62000 }, { "epoch": 6.684636118598383, "eval_accuracy": 0.3859653716727362, "eval_loss": 3.3676891326904297, "eval_runtime": 188.1942, "eval_samples_per_second": 95.704, "eval_steps_per_second": 5.983, "step": 62000 }, { "epoch": 6.690026954177897, "grad_norm": 0.7992780208587646, "learning_rate": 0.0001989681597409606, "loss": 3.2901, "step": 62050 }, { "epoch": 6.695417789757412, "grad_norm": 0.7275341749191284, "learning_rate": 0.00019864436049649216, "loss": 3.2889, "step": 62100 }, { "epoch": 6.7008086253369274, "grad_norm": 0.8156207203865051, "learning_rate": 0.0001983205612520237, "loss": 3.2942, "step": 62150 }, { "epoch": 6.706199460916442, "grad_norm": 0.8169457912445068, "learning_rate": 0.0001979967620075553, "loss": 3.299, "step": 62200 }, { "epoch": 6.711590296495957, "grad_norm": 0.7171157002449036, "learning_rate": 0.00019767296276308686, "loss": 3.2987, "step": 62250 }, { "epoch": 6.716981132075472, "grad_norm": 0.7815842628479004, "learning_rate": 0.00019734916351861844, "loss": 3.2905, "step": 62300 }, { "epoch": 6.722371967654986, "grad_norm": 0.8042939305305481, "learning_rate": 0.00019702536427415002, "loss": 3.2956, "step": 62350 }, { "epoch": 6.727762803234501, "grad_norm": 0.8041614294052124, "learning_rate": 
0.0001967015650296816, "loss": 3.2962, "step": 62400 }, { "epoch": 6.7331536388140165, "grad_norm": 0.7848486304283142, "learning_rate": 0.00019637776578521315, "loss": 3.2944, "step": 62450 }, { "epoch": 6.738544474393531, "grad_norm": 0.8133097290992737, "learning_rate": 0.00019605396654074473, "loss": 3.2887, "step": 62500 }, { "epoch": 6.743935309973046, "grad_norm": 0.7522289752960205, "learning_rate": 0.0001957301672962763, "loss": 3.3056, "step": 62550 }, { "epoch": 6.74932614555256, "grad_norm": 0.7786284685134888, "learning_rate": 0.00019540636805180785, "loss": 3.2838, "step": 62600 }, { "epoch": 6.754716981132075, "grad_norm": 0.7982814311981201, "learning_rate": 0.00019508256880733943, "loss": 3.2938, "step": 62650 }, { "epoch": 6.7601078167115904, "grad_norm": 0.8202456831932068, "learning_rate": 0.000194758769562871, "loss": 3.2877, "step": 62700 }, { "epoch": 6.765498652291106, "grad_norm": 0.7827909588813782, "learning_rate": 0.00019443497031840256, "loss": 3.2874, "step": 62750 }, { "epoch": 6.77088948787062, "grad_norm": 0.7439596056938171, "learning_rate": 0.00019411117107393414, "loss": 3.2878, "step": 62800 }, { "epoch": 6.776280323450135, "grad_norm": 0.8092724084854126, "learning_rate": 0.00019378737182946572, "loss": 3.2849, "step": 62850 }, { "epoch": 6.781671159029649, "grad_norm": 0.7997114658355713, "learning_rate": 0.0001934635725849973, "loss": 3.3103, "step": 62900 }, { "epoch": 6.787061994609164, "grad_norm": null, "learning_rate": 0.00019314624932541822, "loss": 3.2921, "step": 62950 }, { "epoch": 6.7924528301886795, "grad_norm": 0.7939057946205139, "learning_rate": 0.0001928224500809498, "loss": 3.2797, "step": 63000 }, { "epoch": 6.7924528301886795, "eval_accuracy": 0.3862258126448275, "eval_loss": 3.364175796508789, "eval_runtime": 187.876, "eval_samples_per_second": 95.866, "eval_steps_per_second": 5.993, "step": 63000 }, { "epoch": 6.797843665768194, "grad_norm": 0.7783874869346619, "learning_rate": 0.00019249865083648137, 
"loss": 3.3023, "step": 63050 }, { "epoch": 6.803234501347709, "grad_norm": 0.8127438426017761, "learning_rate": 0.00019217485159201292, "loss": 3.2982, "step": 63100 }, { "epoch": 6.808625336927224, "grad_norm": 0.8036924600601196, "learning_rate": 0.0001918510523475445, "loss": 3.2927, "step": 63150 }, { "epoch": 6.814016172506738, "grad_norm": 0.7932625412940979, "learning_rate": 0.00019152725310307608, "loss": 3.3087, "step": 63200 }, { "epoch": 6.819407008086253, "grad_norm": 0.9488426446914673, "learning_rate": 0.00019120345385860766, "loss": 3.2917, "step": 63250 }, { "epoch": 6.824797843665769, "grad_norm": 0.8007382154464722, "learning_rate": 0.00019087965461413923, "loss": 3.2951, "step": 63300 }, { "epoch": 6.830188679245283, "grad_norm": 0.7953688502311707, "learning_rate": 0.00019055585536967079, "loss": 3.2765, "step": 63350 }, { "epoch": 6.835579514824798, "grad_norm": 0.7954296469688416, "learning_rate": 0.00019023205612520234, "loss": 3.2922, "step": 63400 }, { "epoch": 6.840970350404312, "grad_norm": 0.7817032337188721, "learning_rate": 0.00018990825688073391, "loss": 3.2839, "step": 63450 }, { "epoch": 6.846361185983827, "grad_norm": 0.7928920984268188, "learning_rate": 0.0001895844576362655, "loss": 3.3096, "step": 63500 }, { "epoch": 6.8517520215633425, "grad_norm": 0.824849009513855, "learning_rate": 0.00018926065839179707, "loss": 3.2859, "step": 63550 }, { "epoch": 6.857142857142857, "grad_norm": 0.8251289129257202, "learning_rate": 0.00018893685914732865, "loss": 3.3139, "step": 63600 }, { "epoch": 6.862533692722372, "grad_norm": 0.7916178107261658, "learning_rate": 0.00018861305990286022, "loss": 3.2982, "step": 63650 }, { "epoch": 6.867924528301887, "grad_norm": 0.8265122771263123, "learning_rate": 0.0001882892606583918, "loss": 3.298, "step": 63700 }, { "epoch": 6.873315363881401, "grad_norm": 0.7769911289215088, "learning_rate": 0.00018796546141392333, "loss": 3.3156, "step": 63750 }, { "epoch": 6.878706199460916, "grad_norm": 
0.8272818326950073, "learning_rate": 0.0001876416621694549, "loss": 3.3094, "step": 63800 }, { "epoch": 6.884097035040432, "grad_norm": 0.8231576085090637, "learning_rate": 0.00018731786292498648, "loss": 3.2944, "step": 63850 }, { "epoch": 6.889487870619946, "grad_norm": 0.8012822866439819, "learning_rate": 0.00018699406368051806, "loss": 3.2897, "step": 63900 }, { "epoch": 6.894878706199461, "grad_norm": 0.8373528122901917, "learning_rate": 0.00018667026443604964, "loss": 3.2976, "step": 63950 }, { "epoch": 6.900269541778976, "grad_norm": 0.8032182455062866, "learning_rate": 0.00018634646519158121, "loss": 3.2845, "step": 64000 }, { "epoch": 6.900269541778976, "eval_accuracy": 0.386607401586832, "eval_loss": 3.3606295585632324, "eval_runtime": 187.9629, "eval_samples_per_second": 95.822, "eval_steps_per_second": 5.991, "step": 64000 }, { "epoch": 6.90566037735849, "grad_norm": 0.7420918345451355, "learning_rate": 0.0001860226659471128, "loss": 3.2917, "step": 64050 }, { "epoch": 6.9110512129380055, "grad_norm": 0.833095908164978, "learning_rate": 0.00018569886670264434, "loss": 3.294, "step": 64100 }, { "epoch": 6.916442048517521, "grad_norm": 0.8235822319984436, "learning_rate": 0.00018537506745817592, "loss": 3.3031, "step": 64150 }, { "epoch": 6.921832884097035, "grad_norm": 0.8346628546714783, "learning_rate": 0.00018505126821370747, "loss": 3.2886, "step": 64200 }, { "epoch": 6.92722371967655, "grad_norm": 0.7743715643882751, "learning_rate": 0.00018472746896923905, "loss": 3.3067, "step": 64250 }, { "epoch": 6.932614555256064, "grad_norm": 0.8426298499107361, "learning_rate": 0.00018440366972477063, "loss": 3.2944, "step": 64300 }, { "epoch": 6.938005390835579, "grad_norm": 0.772252082824707, "learning_rate": 0.0001840798704803022, "loss": 3.2894, "step": 64350 }, { "epoch": 6.943396226415095, "grad_norm": 0.8040913939476013, "learning_rate": 0.00018375607123583378, "loss": 3.3017, "step": 64400 }, { "epoch": 6.948787061994609, "grad_norm": 
0.7839882373809814, "learning_rate": 0.00018343227199136533, "loss": 3.2766, "step": 64450 }, { "epoch": 6.954177897574124, "grad_norm": 0.7890709042549133, "learning_rate": 0.0001831084727468969, "loss": 3.3068, "step": 64500 }, { "epoch": 6.959568733153639, "grad_norm": 0.7926402688026428, "learning_rate": 0.0001827846735024285, "loss": 3.2977, "step": 64550 }, { "epoch": 6.964959568733153, "grad_norm": 0.7957824468612671, "learning_rate": 0.00018246087425796004, "loss": 3.2825, "step": 64600 }, { "epoch": 6.9703504043126685, "grad_norm": 0.8079566359519958, "learning_rate": 0.00018213707501349162, "loss": 3.2973, "step": 64650 }, { "epoch": 6.975741239892184, "grad_norm": 0.7776445746421814, "learning_rate": 0.0001818132757690232, "loss": 3.3011, "step": 64700 }, { "epoch": 6.981132075471698, "grad_norm": 0.80714350938797, "learning_rate": 0.00018148947652455474, "loss": 3.2876, "step": 64750 }, { "epoch": 6.986522911051213, "grad_norm": 0.8431934118270874, "learning_rate": 0.00018116567728008632, "loss": 3.2964, "step": 64800 }, { "epoch": 6.991913746630727, "grad_norm": 0.8190756440162659, "learning_rate": 0.0001808418780356179, "loss": 3.2698, "step": 64850 }, { "epoch": 6.997304582210242, "grad_norm": 0.7817225456237793, "learning_rate": 0.00018051807879114948, "loss": 3.3162, "step": 64900 }, { "epoch": 7.002695417789758, "grad_norm": 0.7827392220497131, "learning_rate": 0.00018019427954668105, "loss": 3.2638, "step": 64950 }, { "epoch": 7.008086253369272, "grad_norm": 0.8535230159759521, "learning_rate": 0.00017987048030221263, "loss": 3.2128, "step": 65000 }, { "epoch": 7.008086253369272, "eval_accuracy": 0.3870300613204303, "eval_loss": 3.3620991706848145, "eval_runtime": 188.0973, "eval_samples_per_second": 95.754, "eval_steps_per_second": 5.986, "step": 65000 }, { "epoch": 7.013477088948787, "grad_norm": 0.8091498613357544, "learning_rate": 0.00017954668105774416, "loss": 3.2167, "step": 65050 }, { "epoch": 7.018867924528302, "grad_norm": 
0.7643722891807556, "learning_rate": 0.00017922288181327573, "loss": 3.2169, "step": 65100 }, { "epoch": 7.024258760107816, "grad_norm": 0.8369283080101013, "learning_rate": 0.0001788990825688073, "loss": 3.2205, "step": 65150 }, { "epoch": 7.0296495956873315, "grad_norm": 0.7840744256973267, "learning_rate": 0.0001785752833243389, "loss": 3.2284, "step": 65200 }, { "epoch": 7.035040431266847, "grad_norm": 0.7983671426773071, "learning_rate": 0.00017825148407987047, "loss": 3.2039, "step": 65250 }, { "epoch": 7.040431266846361, "grad_norm": 0.8109228014945984, "learning_rate": 0.00017793416082029142, "loss": 3.2186, "step": 65300 }, { "epoch": 7.045822102425876, "grad_norm": 0.7853065133094788, "learning_rate": 0.000177610361575823, "loss": 3.2151, "step": 65350 }, { "epoch": 7.051212938005391, "grad_norm": 0.7788589596748352, "learning_rate": 0.00017728656233135452, "loss": 3.2145, "step": 65400 }, { "epoch": 7.056603773584905, "grad_norm": 0.7826195955276489, "learning_rate": 0.0001769627630868861, "loss": 3.2041, "step": 65450 }, { "epoch": 7.061994609164421, "grad_norm": 0.7999874949455261, "learning_rate": 0.00017663896384241767, "loss": 3.2177, "step": 65500 }, { "epoch": 7.067385444743936, "grad_norm": 0.834130048751831, "learning_rate": 0.00017631516459794925, "loss": 3.203, "step": 65550 }, { "epoch": 7.07277628032345, "grad_norm": 0.8355494737625122, "learning_rate": 0.00017599136535348083, "loss": 3.196, "step": 65600 }, { "epoch": 7.078167115902965, "grad_norm": 0.7943632006645203, "learning_rate": 0.0001756675661090124, "loss": 3.2143, "step": 65650 }, { "epoch": 7.083557951482479, "grad_norm": 0.7961003184318542, "learning_rate": 0.00017534376686454398, "loss": 3.2406, "step": 65700 }, { "epoch": 7.0889487870619945, "grad_norm": 0.8276134133338928, "learning_rate": 0.00017501996762007556, "loss": 3.2242, "step": 65750 }, { "epoch": 7.09433962264151, "grad_norm": 0.8373634815216064, "learning_rate": 0.00017469616837560709, "loss": 3.2149, "step": 65800 
}, { "epoch": 7.099730458221024, "grad_norm": 0.7836037874221802, "learning_rate": 0.00017437236913113866, "loss": 3.2282, "step": 65850 }, { "epoch": 7.105121293800539, "grad_norm": 0.8280946016311646, "learning_rate": 0.00017404856988667024, "loss": 3.2394, "step": 65900 }, { "epoch": 7.110512129380054, "grad_norm": 0.8327875137329102, "learning_rate": 0.00017372477064220182, "loss": 3.2372, "step": 65950 }, { "epoch": 7.115902964959568, "grad_norm": 0.84897780418396, "learning_rate": 0.0001734009713977334, "loss": 3.2066, "step": 66000 }, { "epoch": 7.115902964959568, "eval_accuracy": 0.38692760164722123, "eval_loss": 3.3632514476776123, "eval_runtime": 188.3038, "eval_samples_per_second": 95.649, "eval_steps_per_second": 5.98, "step": 66000 }, { "epoch": 7.121293800539084, "grad_norm": 0.8668311238288879, "learning_rate": 0.00017307717215326497, "loss": 3.2588, "step": 66050 }, { "epoch": 7.126684636118599, "grad_norm": 0.8419384360313416, "learning_rate": 0.00017275337290879655, "loss": 3.2144, "step": 66100 }, { "epoch": 7.132075471698113, "grad_norm": 0.846095621585846, "learning_rate": 0.0001724295736643281, "loss": 3.2171, "step": 66150 }, { "epoch": 7.137466307277628, "grad_norm": 0.8568212389945984, "learning_rate": 0.00017210577441985968, "loss": 3.2174, "step": 66200 }, { "epoch": 7.142857142857143, "grad_norm": 0.8674870133399963, "learning_rate": 0.00017178197517539123, "loss": 3.2317, "step": 66250 }, { "epoch": 7.1482479784366575, "grad_norm": 0.7653921246528625, "learning_rate": 0.0001714581759309228, "loss": 3.2348, "step": 66300 }, { "epoch": 7.153638814016173, "grad_norm": 0.8856856822967529, "learning_rate": 0.00017113437668645439, "loss": 3.2316, "step": 66350 }, { "epoch": 7.159029649595688, "grad_norm": 0.7898850440979004, "learning_rate": 0.00017081057744198596, "loss": 3.24, "step": 66400 }, { "epoch": 7.164420485175202, "grad_norm": 0.7871878147125244, "learning_rate": 0.00017048677819751751, "loss": 3.2378, "step": 66450 }, { "epoch": 
7.169811320754717, "grad_norm": 0.8268200159072876, "learning_rate": 0.0001701629789530491, "loss": 3.2243, "step": 66500 }, { "epoch": 7.175202156334231, "grad_norm": 0.829271674156189, "learning_rate": 0.00016983917970858067, "loss": 3.2261, "step": 66550 }, { "epoch": 7.180592991913747, "grad_norm": 0.8218183517456055, "learning_rate": 0.00016951538046411225, "loss": 3.2282, "step": 66600 }, { "epoch": 7.185983827493262, "grad_norm": 0.8347243666648865, "learning_rate": 0.0001691915812196438, "loss": 3.2301, "step": 66650 }, { "epoch": 7.191374663072776, "grad_norm": 0.8447914123535156, "learning_rate": 0.00016886778197517538, "loss": 3.2272, "step": 66700 }, { "epoch": 7.196765498652291, "grad_norm": 0.854339599609375, "learning_rate": 0.00016854398273070693, "loss": 3.2207, "step": 66750 }, { "epoch": 7.202156334231806, "grad_norm": 0.8469423651695251, "learning_rate": 0.0001682201834862385, "loss": 3.2379, "step": 66800 }, { "epoch": 7.2075471698113205, "grad_norm": 0.8873926401138306, "learning_rate": 0.00016789638424177008, "loss": 3.2445, "step": 66850 }, { "epoch": 7.212938005390836, "grad_norm": 0.8708266615867615, "learning_rate": 0.00016757258499730166, "loss": 3.2234, "step": 66900 }, { "epoch": 7.218328840970351, "grad_norm": 0.8092944622039795, "learning_rate": 0.00016724878575283324, "loss": 3.244, "step": 66950 }, { "epoch": 7.223719676549865, "grad_norm": 0.7969817519187927, "learning_rate": 0.00016692498650836481, "loss": 3.2329, "step": 67000 }, { "epoch": 7.223719676549865, "eval_accuracy": 0.38752866942261094, "eval_loss": 3.360783100128174, "eval_runtime": 187.837, "eval_samples_per_second": 95.886, "eval_steps_per_second": 5.995, "step": 67000 }, { "epoch": 7.22911051212938, "grad_norm": 0.872371256351471, "learning_rate": 0.0001666011872638964, "loss": 3.2582, "step": 67050 }, { "epoch": 7.234501347708895, "grad_norm": 0.7985860109329224, "learning_rate": 0.00016627738801942792, "loss": 3.2338, "step": 67100 }, { "epoch": 
7.2398921832884096, "grad_norm": 0.8462668657302856, "learning_rate": 0.0001659535887749595, "loss": 3.2476, "step": 67150 }, { "epoch": 7.245283018867925, "grad_norm": 0.8263183832168579, "learning_rate": 0.00016562978953049107, "loss": 3.2315, "step": 67200 }, { "epoch": 7.250673854447439, "grad_norm": 0.7963122725486755, "learning_rate": 0.00016530599028602265, "loss": 3.2401, "step": 67250 }, { "epoch": 7.256064690026954, "grad_norm": 0.8212612867355347, "learning_rate": 0.00016498219104155423, "loss": 3.2396, "step": 67300 }, { "epoch": 7.261455525606469, "grad_norm": 0.8274388313293457, "learning_rate": 0.0001646583917970858, "loss": 3.2335, "step": 67350 }, { "epoch": 7.2668463611859835, "grad_norm": 0.8153033256530762, "learning_rate": 0.00016433459255261738, "loss": 3.2378, "step": 67400 }, { "epoch": 7.272237196765499, "grad_norm": 0.8221491575241089, "learning_rate": 0.00016401079330814896, "loss": 3.2338, "step": 67450 }, { "epoch": 7.277628032345014, "grad_norm": 0.8421692252159119, "learning_rate": 0.00016368699406368048, "loss": 3.2443, "step": 67500 }, { "epoch": 7.283018867924528, "grad_norm": 0.8362700343132019, "learning_rate": 0.00016336319481921206, "loss": 3.2484, "step": 67550 }, { "epoch": 7.288409703504043, "grad_norm": 0.7969926595687866, "learning_rate": 0.00016303939557474364, "loss": 3.2195, "step": 67600 }, { "epoch": 7.293800539083558, "grad_norm": 0.7820971012115479, "learning_rate": 0.00016271559633027522, "loss": 3.2425, "step": 67650 }, { "epoch": 7.2991913746630726, "grad_norm": 0.8669634461402893, "learning_rate": 0.0001623917970858068, "loss": 3.2312, "step": 67700 }, { "epoch": 7.304582210242588, "grad_norm": 0.8178690075874329, "learning_rate": 0.00016206799784133837, "loss": 3.2396, "step": 67750 }, { "epoch": 7.309973045822103, "grad_norm": 0.8719444870948792, "learning_rate": 0.00016174419859686992, "loss": 3.2307, "step": 67800 }, { "epoch": 7.315363881401617, "grad_norm": 0.7770711779594421, "learning_rate": 
0.0001614203993524015, "loss": 3.24, "step": 67850 }, { "epoch": 7.320754716981132, "grad_norm": 0.8245651721954346, "learning_rate": 0.00016109660010793308, "loss": 3.2262, "step": 67900 }, { "epoch": 7.3261455525606465, "grad_norm": 0.8216030597686768, "learning_rate": 0.00016077280086346463, "loss": 3.2282, "step": 67950 }, { "epoch": 7.331536388140162, "grad_norm": 0.838242769241333, "learning_rate": 0.0001604490016189962, "loss": 3.2334, "step": 68000 }, { "epoch": 7.331536388140162, "eval_accuracy": 0.3875798449327716, "eval_loss": 3.357121706008911, "eval_runtime": 188.4467, "eval_samples_per_second": 95.576, "eval_steps_per_second": 5.975, "step": 68000 }, { "epoch": 7.336927223719677, "grad_norm": 0.8874527215957642, "learning_rate": 0.00016012520237452778, "loss": 3.2394, "step": 68050 }, { "epoch": 7.342318059299191, "grad_norm": 0.8234398365020752, "learning_rate": 0.00015980140313005933, "loss": 3.2681, "step": 68100 }, { "epoch": 7.347708894878706, "grad_norm": 0.9195815324783325, "learning_rate": 0.0001594776038855909, "loss": 3.2316, "step": 68150 }, { "epoch": 7.353099730458221, "grad_norm": 0.8223610520362854, "learning_rate": 0.0001591538046411225, "loss": 3.2292, "step": 68200 }, { "epoch": 7.3584905660377355, "grad_norm": 0.8694055676460266, "learning_rate": 0.00015883000539665407, "loss": 3.2589, "step": 68250 }, { "epoch": 7.363881401617251, "grad_norm": 0.8673613667488098, "learning_rate": 0.00015850620615218564, "loss": 3.2354, "step": 68300 }, { "epoch": 7.369272237196766, "grad_norm": 0.8228337168693542, "learning_rate": 0.0001581824069077172, "loss": 3.2454, "step": 68350 }, { "epoch": 7.37466307277628, "grad_norm": 0.8655876517295837, "learning_rate": 0.00015785860766324877, "loss": 3.2482, "step": 68400 }, { "epoch": 7.380053908355795, "grad_norm": 0.8178161978721619, "learning_rate": 0.00015753480841878032, "loss": 3.2314, "step": 68450 }, { "epoch": 7.38544474393531, "grad_norm": 0.8654986023902893, "learning_rate": 
0.0001572110091743119, "loss": 3.2173, "step": 68500 }, { "epoch": 7.390835579514825, "grad_norm": 0.866359293460846, "learning_rate": 0.00015688720992984348, "loss": 3.2428, "step": 68550 }, { "epoch": 7.39622641509434, "grad_norm": 0.8526694178581238, "learning_rate": 0.00015656341068537506, "loss": 3.2247, "step": 68600 }, { "epoch": 7.401617250673855, "grad_norm": 0.8770955801010132, "learning_rate": 0.00015623961144090663, "loss": 3.2474, "step": 68650 }, { "epoch": 7.407008086253369, "grad_norm": 0.8251200318336487, "learning_rate": 0.0001559158121964382, "loss": 3.2239, "step": 68700 }, { "epoch": 7.412398921832884, "grad_norm": 0.886862576007843, "learning_rate": 0.0001555920129519698, "loss": 3.2334, "step": 68750 }, { "epoch": 7.4177897574123985, "grad_norm": 0.8815854787826538, "learning_rate": 0.0001552682137075013, "loss": 3.2502, "step": 68800 }, { "epoch": 7.423180592991914, "grad_norm": 0.8504962921142578, "learning_rate": 0.0001549444144630329, "loss": 3.2438, "step": 68850 }, { "epoch": 7.428571428571429, "grad_norm": 0.8565604090690613, "learning_rate": 0.00015462061521856447, "loss": 3.2513, "step": 68900 }, { "epoch": 7.433962264150943, "grad_norm": 0.8494173884391785, "learning_rate": 0.00015429681597409605, "loss": 3.2412, "step": 68950 }, { "epoch": 7.439353099730458, "grad_norm": 0.8011876344680786, "learning_rate": 0.000153979492714517, "loss": 3.2301, "step": 69000 }, { "epoch": 7.439353099730458, "eval_accuracy": 0.3875929919321971, "eval_loss": 3.354153871536255, "eval_runtime": 188.1144, "eval_samples_per_second": 95.745, "eval_steps_per_second": 5.986, "step": 69000 }, { "epoch": 7.444743935309973, "grad_norm": 0.8193632364273071, "learning_rate": 0.00015365569347004858, "loss": 3.2271, "step": 69050 }, { "epoch": 7.450134770889488, "grad_norm": 0.8151710629463196, "learning_rate": 0.00015333189422558015, "loss": 3.2332, "step": 69100 }, { "epoch": 7.455525606469003, "grad_norm": 0.8996269702911377, "learning_rate": 
0.00015300809498111168, "loss": 3.2432, "step": 69150 }, { "epoch": 7.460916442048518, "grad_norm": 0.9261552691459656, "learning_rate": 0.00015268429573664325, "loss": 3.2588, "step": 69200 }, { "epoch": 7.466307277628032, "grad_norm": 0.8579587936401367, "learning_rate": 0.00015236049649217483, "loss": 3.2349, "step": 69250 }, { "epoch": 7.471698113207547, "grad_norm": 0.8887305855751038, "learning_rate": 0.0001520366972477064, "loss": 3.2629, "step": 69300 }, { "epoch": 7.4770889487870615, "grad_norm": 0.9070491194725037, "learning_rate": 0.000151712898003238, "loss": 3.2475, "step": 69350 }, { "epoch": 7.482479784366577, "grad_norm": 0.8597719073295593, "learning_rate": 0.00015138909875876956, "loss": 3.2416, "step": 69400 }, { "epoch": 7.487870619946092, "grad_norm": 0.8557640314102173, "learning_rate": 0.00015106529951430114, "loss": 3.2421, "step": 69450 }, { "epoch": 7.493261455525606, "grad_norm": 0.8627281188964844, "learning_rate": 0.0001507415002698327, "loss": 3.255, "step": 69500 }, { "epoch": 7.498652291105121, "grad_norm": 0.8331064581871033, "learning_rate": 0.00015041770102536424, "loss": 3.2324, "step": 69550 }, { "epoch": 7.504043126684636, "grad_norm": 0.9102504253387451, "learning_rate": 0.00015009390178089582, "loss": 3.2418, "step": 69600 }, { "epoch": 7.509433962264151, "grad_norm": 0.8683810234069824, "learning_rate": 0.0001497701025364274, "loss": 3.2566, "step": 69650 }, { "epoch": 7.514824797843666, "grad_norm": 0.8473634123802185, "learning_rate": 0.00014944630329195898, "loss": 3.2508, "step": 69700 }, { "epoch": 7.520215633423181, "grad_norm": 0.8372735381126404, "learning_rate": 0.00014912250404749055, "loss": 3.2712, "step": 69750 }, { "epoch": 7.525606469002695, "grad_norm": 0.8659421801567078, "learning_rate": 0.0001487987048030221, "loss": 3.2557, "step": 69800 }, { "epoch": 7.53099730458221, "grad_norm": 0.8479022979736328, "learning_rate": 0.00014847490555855368, "loss": 3.2343, "step": 69850 }, { "epoch": 7.536388140161725, 
"grad_norm": 0.8474248051643372, "learning_rate": 0.00014815110631408526, "loss": 3.2239, "step": 69900 }, { "epoch": 7.54177897574124, "grad_norm": 0.9101057648658752, "learning_rate": 0.0001478273070696168, "loss": 3.2595, "step": 69950 }, { "epoch": 7.547169811320755, "grad_norm": 0.8913754224777222, "learning_rate": 0.0001475035078251484, "loss": 3.2397, "step": 70000 }, { "epoch": 7.547169811320755, "eval_accuracy": 0.3883309623462331, "eval_loss": 3.3493452072143555, "eval_runtime": 188.5729, "eval_samples_per_second": 95.512, "eval_steps_per_second": 5.971, "step": 70000 }, { "epoch": 7.55256064690027, "grad_norm": 0.8284294009208679, "learning_rate": 0.00014717970858067997, "loss": 3.2311, "step": 70050 }, { "epoch": 7.557951482479784, "grad_norm": 0.8397430777549744, "learning_rate": 0.00014685590933621154, "loss": 3.2495, "step": 70100 }, { "epoch": 7.563342318059299, "grad_norm": 0.8663442730903625, "learning_rate": 0.0001465321100917431, "loss": 3.2463, "step": 70150 }, { "epoch": 7.568733153638814, "grad_norm": 0.8732253313064575, "learning_rate": 0.00014620831084727467, "loss": 3.2396, "step": 70200 }, { "epoch": 7.574123989218329, "grad_norm": 0.8437363505363464, "learning_rate": 0.00014588451160280625, "loss": 3.2518, "step": 70250 }, { "epoch": 7.579514824797844, "grad_norm": 0.8276694416999817, "learning_rate": 0.0001455607123583378, "loss": 3.2439, "step": 70300 }, { "epoch": 7.584905660377358, "grad_norm": 0.8312061429023743, "learning_rate": 0.00014523691311386938, "loss": 3.2447, "step": 70350 }, { "epoch": 7.590296495956873, "grad_norm": 0.8033319711685181, "learning_rate": 0.00014491311386940096, "loss": 3.2543, "step": 70400 }, { "epoch": 7.595687331536388, "grad_norm": 0.8969256281852722, "learning_rate": 0.00014458931462493253, "loss": 3.2566, "step": 70450 }, { "epoch": 7.601078167115903, "grad_norm": 0.8511197566986084, "learning_rate": 0.00014426551538046408, "loss": 3.2485, "step": 70500 }, { "epoch": 7.606469002695418, "grad_norm": 
0.8295609354972839, "learning_rate": 0.00014394171613599566, "loss": 3.2324, "step": 70550 }, { "epoch": 7.611859838274933, "grad_norm": 0.8153634071350098, "learning_rate": 0.00014361791689152724, "loss": 3.2491, "step": 70600 }, { "epoch": 7.617250673854447, "grad_norm": 0.8315316438674927, "learning_rate": 0.00014329411764705882, "loss": 3.229, "step": 70650 }, { "epoch": 7.622641509433962, "grad_norm": 0.857572615146637, "learning_rate": 0.0001429703184025904, "loss": 3.2302, "step": 70700 }, { "epoch": 7.628032345013477, "grad_norm": 0.8279692530632019, "learning_rate": 0.00014264651915812194, "loss": 3.2476, "step": 70750 }, { "epoch": 7.633423180592992, "grad_norm": 0.8981974124908447, "learning_rate": 0.00014232271991365352, "loss": 3.2547, "step": 70800 }, { "epoch": 7.638814016172507, "grad_norm": 0.8265762329101562, "learning_rate": 0.0001419989206691851, "loss": 3.2457, "step": 70850 }, { "epoch": 7.644204851752022, "grad_norm": 0.871878445148468, "learning_rate": 0.00014167512142471668, "loss": 3.2525, "step": 70900 }, { "epoch": 7.649595687331536, "grad_norm": 0.9873601198196411, "learning_rate": 0.00014135132218024823, "loss": 3.2456, "step": 70950 }, { "epoch": 7.654986522911051, "grad_norm": 0.8204500675201416, "learning_rate": 0.0001410275229357798, "loss": 3.2356, "step": 71000 }, { "epoch": 7.654986522911051, "eval_accuracy": 0.3886576815798912, "eval_loss": 3.3460803031921387, "eval_runtime": 187.9483, "eval_samples_per_second": 95.83, "eval_steps_per_second": 5.991, "step": 71000 }, { "epoch": 7.660377358490566, "grad_norm": 0.8447879552841187, "learning_rate": 0.00014070372369131138, "loss": 3.2571, "step": 71050 }, { "epoch": 7.665768194070081, "grad_norm": 0.8823720812797546, "learning_rate": 0.00014037992444684296, "loss": 3.251, "step": 71100 }, { "epoch": 7.671159029649596, "grad_norm": 0.8513026833534241, "learning_rate": 0.0001400561252023745, "loss": 3.2542, "step": 71150 }, { "epoch": 7.67654986522911, "grad_norm": 
0.8223856687545776, "learning_rate": 0.0001397323259579061, "loss": 3.2377, "step": 71200 }, { "epoch": 7.681940700808625, "grad_norm": 0.8093695044517517, "learning_rate": 0.00013940852671343767, "loss": 3.2353, "step": 71250 }, { "epoch": 7.6873315363881405, "grad_norm": 0.855767011642456, "learning_rate": 0.00013908472746896924, "loss": 3.2416, "step": 71300 }, { "epoch": 7.692722371967655, "grad_norm": 0.861933708190918, "learning_rate": 0.0001387609282245008, "loss": 3.2258, "step": 71350 }, { "epoch": 7.69811320754717, "grad_norm": 0.8508812785148621, "learning_rate": 0.00013843712898003237, "loss": 3.2479, "step": 71400 }, { "epoch": 7.703504043126685, "grad_norm": 0.9000187516212463, "learning_rate": 0.00013811332973556395, "loss": 3.2494, "step": 71450 }, { "epoch": 7.708894878706199, "grad_norm": 0.8213217258453369, "learning_rate": 0.0001377895304910955, "loss": 3.234, "step": 71500 }, { "epoch": 7.714285714285714, "grad_norm": 0.8820887804031372, "learning_rate": 0.00013746573124662708, "loss": 3.263, "step": 71550 }, { "epoch": 7.719676549865229, "grad_norm": 0.8610178828239441, "learning_rate": 0.00013714193200215866, "loss": 3.2413, "step": 71600 }, { "epoch": 7.725067385444744, "grad_norm": 0.8472573161125183, "learning_rate": 0.0001368181327576902, "loss": 3.2256, "step": 71650 }, { "epoch": 7.730458221024259, "grad_norm": 0.9511776566505432, "learning_rate": 0.00013649433351322178, "loss": 3.2522, "step": 71700 }, { "epoch": 7.735849056603773, "grad_norm": 0.8717077374458313, "learning_rate": 0.00013617701025364274, "loss": 3.2526, "step": 71750 }, { "epoch": 7.741239892183288, "grad_norm": 0.8659278750419617, "learning_rate": 0.0001358532110091743, "loss": 3.2549, "step": 71800 }, { "epoch": 7.7466307277628035, "grad_norm": 0.8259217739105225, "learning_rate": 0.00013552941176470587, "loss": 3.2458, "step": 71850 }, { "epoch": 7.752021563342318, "grad_norm": 0.906169593334198, "learning_rate": 0.00013520561252023744, "loss": 3.2155, "step": 71900 
}, { "epoch": 7.757412398921833, "grad_norm": 0.8461496829986572, "learning_rate": 0.00013488181327576902, "loss": 3.2572, "step": 71950 }, { "epoch": 7.762803234501348, "grad_norm": 0.9183528423309326, "learning_rate": 0.00013455801403130057, "loss": 3.239, "step": 72000 }, { "epoch": 7.762803234501348, "eval_accuracy": 0.38920985555576443, "eval_loss": 3.3422865867614746, "eval_runtime": 188.2266, "eval_samples_per_second": 95.688, "eval_steps_per_second": 5.982, "step": 72000 }, { "epoch": 7.768194070080862, "grad_norm": 0.8566349148750305, "learning_rate": 0.00013423421478683215, "loss": 3.2618, "step": 72050 }, { "epoch": 7.773584905660377, "grad_norm": 0.8473686575889587, "learning_rate": 0.00013391041554236373, "loss": 3.2447, "step": 72100 }, { "epoch": 7.7789757412398925, "grad_norm": 0.8755962252616882, "learning_rate": 0.00013358661629789528, "loss": 3.2386, "step": 72150 }, { "epoch": 7.784366576819407, "grad_norm": 0.832993745803833, "learning_rate": 0.00013326281705342685, "loss": 3.2337, "step": 72200 }, { "epoch": 7.789757412398922, "grad_norm": 0.8647584915161133, "learning_rate": 0.00013293901780895843, "loss": 3.2494, "step": 72250 }, { "epoch": 7.795148247978437, "grad_norm": 0.877619206905365, "learning_rate": 0.00013261521856449, "loss": 3.245, "step": 72300 }, { "epoch": 7.800539083557951, "grad_norm": 0.920470118522644, "learning_rate": 0.00013229141932002156, "loss": 3.233, "step": 72350 }, { "epoch": 7.8059299191374665, "grad_norm": 0.9275642037391663, "learning_rate": 0.00013196762007555314, "loss": 3.2381, "step": 72400 }, { "epoch": 7.811320754716981, "grad_norm": 0.8567812442779541, "learning_rate": 0.00013164382083108472, "loss": 3.2538, "step": 72450 }, { "epoch": 7.816711590296496, "grad_norm": 0.8508996367454529, "learning_rate": 0.0001313200215866163, "loss": 3.2415, "step": 72500 }, { "epoch": 7.822102425876011, "grad_norm": 0.8709658980369568, "learning_rate": 0.00013099622234214784, "loss": 3.2361, "step": 72550 }, { "epoch": 
7.827493261455525, "grad_norm": 0.8481534123420715, "learning_rate": 0.00013067242309767942, "loss": 3.2271, "step": 72600 }, { "epoch": 7.83288409703504, "grad_norm": 0.8258869647979736, "learning_rate": 0.000130348623853211, "loss": 3.2287, "step": 72650 }, { "epoch": 7.8382749326145555, "grad_norm": 0.8969262838363647, "learning_rate": 0.00013002482460874258, "loss": 3.2518, "step": 72700 }, { "epoch": 7.84366576819407, "grad_norm": 0.8625231981277466, "learning_rate": 0.00012970102536427413, "loss": 3.2441, "step": 72750 }, { "epoch": 7.849056603773585, "grad_norm": 0.8450524210929871, "learning_rate": 0.0001293772261198057, "loss": 3.2201, "step": 72800 }, { "epoch": 7.8544474393531, "grad_norm": 0.8817840814590454, "learning_rate": 0.00012905342687533728, "loss": 3.2418, "step": 72850 }, { "epoch": 7.859838274932614, "grad_norm": 0.8885464668273926, "learning_rate": 0.00012872962763086886, "loss": 3.241, "step": 72900 }, { "epoch": 7.8652291105121295, "grad_norm": 0.8640483021736145, "learning_rate": 0.00012840582838640044, "loss": 3.2513, "step": 72950 }, { "epoch": 7.870619946091644, "grad_norm": 0.8919004797935486, "learning_rate": 0.000128082029141932, "loss": 3.248, "step": 73000 }, { "epoch": 7.870619946091644, "eval_accuracy": 0.38931839979069105, "eval_loss": 3.3391408920288086, "eval_runtime": 187.7588, "eval_samples_per_second": 95.926, "eval_steps_per_second": 5.997, "step": 73000 }, { "epoch": 7.876010781671159, "grad_norm": 0.9472377300262451, "learning_rate": 0.00012775822989746357, "loss": 3.2351, "step": 73050 }, { "epoch": 7.881401617250674, "grad_norm": 0.8974602222442627, "learning_rate": 0.00012743443065299514, "loss": 3.2414, "step": 73100 }, { "epoch": 7.886792452830189, "grad_norm": 0.9496487379074097, "learning_rate": 0.00012711063140852672, "loss": 3.254, "step": 73150 }, { "epoch": 7.892183288409703, "grad_norm": 0.8661764860153198, "learning_rate": 0.00012678683216405827, "loss": 3.2198, "step": 73200 }, { "epoch": 
7.8975741239892185, "grad_norm": 0.8774340748786926, "learning_rate": 0.00012646303291958985, "loss": 3.2429, "step": 73250 }, { "epoch": 7.902964959568733, "grad_norm": 0.8722697496414185, "learning_rate": 0.00012613923367512143, "loss": 3.2337, "step": 73300 }, { "epoch": 7.908355795148248, "grad_norm": 0.8887828588485718, "learning_rate": 0.00012581543443065298, "loss": 3.2708, "step": 73350 }, { "epoch": 7.913746630727763, "grad_norm": 0.8771435618400574, "learning_rate": 0.00012549163518618456, "loss": 3.2568, "step": 73400 }, { "epoch": 7.919137466307277, "grad_norm": 0.8421612977981567, "learning_rate": 0.00012516783594171613, "loss": 3.2556, "step": 73450 }, { "epoch": 7.9245283018867925, "grad_norm": 0.9187849760055542, "learning_rate": 0.00012484403669724768, "loss": 3.2609, "step": 73500 }, { "epoch": 7.929919137466308, "grad_norm": 0.8759550452232361, "learning_rate": 0.00012452023745277926, "loss": 3.2343, "step": 73550 }, { "epoch": 7.935309973045822, "grad_norm": 0.8831597566604614, "learning_rate": 0.00012419643820831084, "loss": 3.256, "step": 73600 }, { "epoch": 7.940700808625337, "grad_norm": 0.8703159093856812, "learning_rate": 0.0001238726389638424, "loss": 3.2427, "step": 73650 }, { "epoch": 7.946091644204852, "grad_norm": 0.8806517720222473, "learning_rate": 0.00012354883971937397, "loss": 3.2337, "step": 73700 }, { "epoch": 7.951482479784366, "grad_norm": 0.8828615546226501, "learning_rate": 0.00012322504047490555, "loss": 3.2424, "step": 73750 }, { "epoch": 7.9568733153638815, "grad_norm": 0.8587687611579895, "learning_rate": 0.0001229077172153265, "loss": 3.2401, "step": 73800 }, { "epoch": 7.962264150943396, "grad_norm": 0.8520889282226562, "learning_rate": 0.00012258391797085805, "loss": 3.2413, "step": 73850 }, { "epoch": 7.967654986522911, "grad_norm": 0.8795452117919922, "learning_rate": 0.00012226011872638963, "loss": 3.237, "step": 73900 }, { "epoch": 7.973045822102426, "grad_norm": 0.8479523658752441, "learning_rate": 
0.00012193631948192119, "loss": 3.2574, "step": 73950 }, { "epoch": 7.97843665768194, "grad_norm": 0.8569572567939758, "learning_rate": 0.00012161252023745277, "loss": 3.2583, "step": 74000 }, { "epoch": 7.97843665768194, "eval_accuracy": 0.3900485471968044, "eval_loss": 3.3358373641967773, "eval_runtime": 188.1945, "eval_samples_per_second": 95.704, "eval_steps_per_second": 5.983, "step": 74000 }, { "epoch": 7.9838274932614555, "grad_norm": 0.8261906504631042, "learning_rate": 0.00012128872099298435, "loss": 3.2584, "step": 74050 }, { "epoch": 7.989218328840971, "grad_norm": 0.9081093072891235, "learning_rate": 0.00012096492174851591, "loss": 3.2528, "step": 74100 }, { "epoch": 7.994609164420485, "grad_norm": 0.8433132767677307, "learning_rate": 0.00012064112250404749, "loss": 3.2545, "step": 74150 }, { "epoch": 8.0, "grad_norm": 1.7616924047470093, "learning_rate": 0.00012031732325957905, "loss": 3.2255, "step": 74200 }, { "epoch": 8.005390835579515, "grad_norm": 0.8508578538894653, "learning_rate": 0.00011999352401511062, "loss": 3.1774, "step": 74250 }, { "epoch": 8.01078167115903, "grad_norm": 0.87180495262146, "learning_rate": 0.00011966972477064219, "loss": 3.1611, "step": 74300 }, { "epoch": 8.016172506738544, "grad_norm": 0.8701367974281311, "learning_rate": 0.00011934592552617377, "loss": 3.1618, "step": 74350 }, { "epoch": 8.021563342318059, "grad_norm": 0.9527740478515625, "learning_rate": 0.00011902212628170532, "loss": 3.1657, "step": 74400 }, { "epoch": 8.026954177897574, "grad_norm": 0.8816781640052795, "learning_rate": 0.0001186983270372369, "loss": 3.165, "step": 74450 }, { "epoch": 8.032345013477089, "grad_norm": 0.8772432804107666, "learning_rate": 0.00011837452779276848, "loss": 3.169, "step": 74500 }, { "epoch": 8.037735849056604, "grad_norm": 0.9046964049339294, "learning_rate": 0.00011805072854830005, "loss": 3.1862, "step": 74550 }, { "epoch": 8.04312668463612, "grad_norm": 0.850584089756012, "learning_rate": 0.0001177269293038316, "loss": 
3.1822, "step": 74600 }, { "epoch": 8.048517520215633, "grad_norm": 0.8665777444839478, "learning_rate": 0.00011740313005936318, "loss": 3.1756, "step": 74650 }, { "epoch": 8.053908355795148, "grad_norm": 0.8615607619285583, "learning_rate": 0.00011707933081489476, "loss": 3.1659, "step": 74700 }, { "epoch": 8.059299191374663, "grad_norm": 0.8437297940254211, "learning_rate": 0.00011675553157042632, "loss": 3.1962, "step": 74750 }, { "epoch": 8.064690026954178, "grad_norm": 0.876714825630188, "learning_rate": 0.00011643173232595789, "loss": 3.1722, "step": 74800 }, { "epoch": 8.070080862533693, "grad_norm": 0.8553579449653625, "learning_rate": 0.00011610793308148947, "loss": 3.1779, "step": 74850 }, { "epoch": 8.075471698113208, "grad_norm": 0.8584737777709961, "learning_rate": 0.00011578413383702104, "loss": 3.1723, "step": 74900 }, { "epoch": 8.080862533692722, "grad_norm": 0.9048159718513489, "learning_rate": 0.00011546033459255261, "loss": 3.1733, "step": 74950 }, { "epoch": 8.086253369272237, "grad_norm": 0.8661808371543884, "learning_rate": 0.00011513653534808419, "loss": 3.1717, "step": 75000 }, { "epoch": 8.086253369272237, "eval_accuracy": 0.3897854985554055, "eval_loss": 3.341247320175171, "eval_runtime": 187.5859, "eval_samples_per_second": 96.015, "eval_steps_per_second": 6.003, "step": 75000 }, { "epoch": 8.091644204851752, "grad_norm": 0.8615157008171082, "learning_rate": 0.00011481273610361575, "loss": 3.1795, "step": 75050 }, { "epoch": 8.097035040431267, "grad_norm": 0.8480276465415955, "learning_rate": 0.00011448893685914731, "loss": 3.1897, "step": 75100 }, { "epoch": 8.102425876010782, "grad_norm": 0.8777347207069397, "learning_rate": 0.00011416513761467889, "loss": 3.1793, "step": 75150 }, { "epoch": 8.107816711590296, "grad_norm": 0.8581756353378296, "learning_rate": 0.00011384133837021047, "loss": 3.1799, "step": 75200 }, { "epoch": 8.11320754716981, "grad_norm": 0.897616446018219, "learning_rate": 0.00011351753912574202, "loss": 3.1801, 
"step": 75250 }, { "epoch": 8.118598382749326, "grad_norm": 0.8755094408988953, "learning_rate": 0.0001131937398812736, "loss": 3.1826, "step": 75300 }, { "epoch": 8.123989218328841, "grad_norm": 0.8945389986038208, "learning_rate": 0.00011286994063680517, "loss": 3.1673, "step": 75350 }, { "epoch": 8.129380053908356, "grad_norm": 0.9047781825065613, "learning_rate": 0.00011254614139233675, "loss": 3.1784, "step": 75400 }, { "epoch": 8.134770889487871, "grad_norm": 0.8923010230064392, "learning_rate": 0.0001122223421478683, "loss": 3.1798, "step": 75450 }, { "epoch": 8.140161725067385, "grad_norm": 0.9317930936813354, "learning_rate": 0.00011189854290339988, "loss": 3.181, "step": 75500 }, { "epoch": 8.1455525606469, "grad_norm": 0.8782373666763306, "learning_rate": 0.00011157474365893146, "loss": 3.1832, "step": 75550 }, { "epoch": 8.150943396226415, "grad_norm": 0.9229250550270081, "learning_rate": 0.00011125094441446302, "loss": 3.1852, "step": 75600 }, { "epoch": 8.15633423180593, "grad_norm": 0.9482505917549133, "learning_rate": 0.00011092714516999459, "loss": 3.1697, "step": 75650 }, { "epoch": 8.161725067385445, "grad_norm": 0.8892439007759094, "learning_rate": 0.00011060334592552616, "loss": 3.1789, "step": 75700 }, { "epoch": 8.167115902964959, "grad_norm": 0.8948121666908264, "learning_rate": 0.00011027954668105773, "loss": 3.1997, "step": 75750 }, { "epoch": 8.172506738544474, "grad_norm": 0.9063841104507446, "learning_rate": 0.0001099557474365893, "loss": 3.2041, "step": 75800 }, { "epoch": 8.177897574123989, "grad_norm": 0.8902472853660583, "learning_rate": 0.00010963194819212088, "loss": 3.1912, "step": 75850 }, { "epoch": 8.183288409703504, "grad_norm": 0.9260648488998413, "learning_rate": 0.00010930814894765243, "loss": 3.1748, "step": 75900 }, { "epoch": 8.18867924528302, "grad_norm": 0.9551469683647156, "learning_rate": 0.00010898434970318401, "loss": 3.1831, "step": 75950 }, { "epoch": 8.194070080862534, "grad_norm": 0.9048367738723755, 
"learning_rate": 0.00010866055045871559, "loss": 3.194, "step": 76000 }, { "epoch": 8.194070080862534, "eval_accuracy": 0.3899166425909976, "eval_loss": 3.3387255668640137, "eval_runtime": 187.7379, "eval_samples_per_second": 95.937, "eval_steps_per_second": 5.998, "step": 76000 }, { "epoch": 8.199460916442048, "grad_norm": 0.8931266665458679, "learning_rate": 0.00010833675121424717, "loss": 3.184, "step": 76050 }, { "epoch": 8.204851752021563, "grad_norm": 0.8630874752998352, "learning_rate": 0.00010801295196977872, "loss": 3.1901, "step": 76100 }, { "epoch": 8.210242587601078, "grad_norm": 0.8952473998069763, "learning_rate": 0.0001076891527253103, "loss": 3.2001, "step": 76150 }, { "epoch": 8.215633423180593, "grad_norm": 0.8969290256500244, "learning_rate": 0.00010737182946573123, "loss": 3.192, "step": 76200 }, { "epoch": 8.221024258760108, "grad_norm": 0.8658696413040161, "learning_rate": 0.0001070480302212628, "loss": 3.1861, "step": 76250 }, { "epoch": 8.226415094339623, "grad_norm": 0.8696344494819641, "learning_rate": 0.00010672423097679438, "loss": 3.1845, "step": 76300 }, { "epoch": 8.231805929919137, "grad_norm": 0.9221002459526062, "learning_rate": 0.00010640043173232595, "loss": 3.1888, "step": 76350 }, { "epoch": 8.237196765498652, "grad_norm": 0.9323232769966125, "learning_rate": 0.00010607663248785753, "loss": 3.1993, "step": 76400 }, { "epoch": 8.242587601078167, "grad_norm": 0.9110686182975769, "learning_rate": 0.00010575283324338908, "loss": 3.2035, "step": 76450 }, { "epoch": 8.247978436657682, "grad_norm": 0.9169686436653137, "learning_rate": 0.00010542903399892066, "loss": 3.1789, "step": 76500 }, { "epoch": 8.253369272237197, "grad_norm": null, "learning_rate": 0.0001051117107393416, "loss": 3.1813, "step": 76550 }, { "epoch": 8.25876010781671, "grad_norm": 0.8762148022651672, "learning_rate": 0.00010478791149487316, "loss": 3.1851, "step": 76600 }, { "epoch": 8.264150943396226, "grad_norm": 0.8535869121551514, "learning_rate": 
0.00010446411225040474, "loss": 3.1942, "step": 76650 }, { "epoch": 8.269541778975741, "grad_norm": 0.9197494983673096, "learning_rate": 0.00010414031300593632, "loss": 3.1874, "step": 76700 }, { "epoch": 8.274932614555256, "grad_norm": 0.8315194249153137, "learning_rate": 0.00010381651376146787, "loss": 3.1813, "step": 76750 }, { "epoch": 8.280323450134771, "grad_norm": 0.9296077489852905, "learning_rate": 0.00010349271451699945, "loss": 3.1992, "step": 76800 }, { "epoch": 8.285714285714286, "grad_norm": 0.9207568168640137, "learning_rate": 0.00010316891527253102, "loss": 3.1844, "step": 76850 }, { "epoch": 8.2911051212938, "grad_norm": 0.8942183256149292, "learning_rate": 0.0001028451160280626, "loss": 3.1912, "step": 76900 }, { "epoch": 8.296495956873315, "grad_norm": 0.8278528451919556, "learning_rate": 0.00010252131678359416, "loss": 3.1987, "step": 76950 }, { "epoch": 8.30188679245283, "grad_norm": 0.8854562044143677, "learning_rate": 0.00010219751753912573, "loss": 3.1902, "step": 77000 }, { "epoch": 8.30188679245283, "eval_accuracy": 0.39015491837397476, "eval_loss": 3.336681365966797, "eval_runtime": 187.7838, "eval_samples_per_second": 95.914, "eval_steps_per_second": 5.996, "step": 77000 }, { "epoch": 8.307277628032345, "grad_norm": 0.8753238320350647, "learning_rate": 0.0001018737182946573, "loss": 3.2079, "step": 77050 }, { "epoch": 8.31266846361186, "grad_norm": 0.867482602596283, "learning_rate": 0.00010154991905018887, "loss": 3.1959, "step": 77100 }, { "epoch": 8.318059299191376, "grad_norm": 0.8500292301177979, "learning_rate": 0.00010122611980572045, "loss": 3.1798, "step": 77150 }, { "epoch": 8.323450134770889, "grad_norm": 0.8923583030700684, "learning_rate": 0.00010090232056125201, "loss": 3.1884, "step": 77200 }, { "epoch": 8.328840970350404, "grad_norm": 0.9104676246643066, "learning_rate": 0.00010057852131678359, "loss": 3.1968, "step": 77250 }, { "epoch": 8.33423180592992, "grad_norm": 0.9469248056411743, "learning_rate": 
0.00010025472207231515, "loss": 3.1925, "step": 77300 }, { "epoch": 8.339622641509434, "grad_norm": 0.9284824728965759, "learning_rate": 9.993092282784673e-05, "loss": 3.1953, "step": 77350 }, { "epoch": 8.34501347708895, "grad_norm": 0.9235027432441711, "learning_rate": 9.96071235833783e-05, "loss": 3.1919, "step": 77400 }, { "epoch": 8.350404312668463, "grad_norm": 0.8894093036651611, "learning_rate": 9.928332433890986e-05, "loss": 3.1901, "step": 77450 }, { "epoch": 8.355795148247978, "grad_norm": 0.9287856817245483, "learning_rate": 9.895952509444144e-05, "loss": 3.208, "step": 77500 }, { "epoch": 8.361185983827493, "grad_norm": 0.8841333985328674, "learning_rate": 9.863572584997302e-05, "loss": 3.1928, "step": 77550 }, { "epoch": 8.366576819407008, "grad_norm": 0.8921537399291992, "learning_rate": 9.831192660550457e-05, "loss": 3.203, "step": 77600 }, { "epoch": 8.371967654986523, "grad_norm": 0.8815743327140808, "learning_rate": 9.798812736103614e-05, "loss": 3.1849, "step": 77650 }, { "epoch": 8.377358490566039, "grad_norm": 0.8852102756500244, "learning_rate": 9.766432811656772e-05, "loss": 3.1901, "step": 77700 }, { "epoch": 8.382749326145552, "grad_norm": 0.8962900042533875, "learning_rate": 9.73405288720993e-05, "loss": 3.1786, "step": 77750 }, { "epoch": 8.388140161725067, "grad_norm": 0.9113929271697998, "learning_rate": 9.701672962763086e-05, "loss": 3.1965, "step": 77800 }, { "epoch": 8.393530997304582, "grad_norm": 0.8877599239349365, "learning_rate": 9.669293038316243e-05, "loss": 3.1914, "step": 77850 }, { "epoch": 8.398921832884097, "grad_norm": 1.0134741067886353, "learning_rate": 9.6369131138694e-05, "loss": 3.1801, "step": 77900 }, { "epoch": 8.404312668463612, "grad_norm": 0.8980883955955505, "learning_rate": 9.604533189422557e-05, "loss": 3.1971, "step": 77950 }, { "epoch": 8.409703504043126, "grad_norm": 0.9101238250732422, "learning_rate": 9.572153264975715e-05, "loss": 3.1837, "step": 78000 }, { "epoch": 8.409703504043126, 
"eval_accuracy": 0.3904747924757006, "eval_loss": 3.333543539047241, "eval_runtime": 188.0712, "eval_samples_per_second": 95.767, "eval_steps_per_second": 5.987, "step": 78000 }, { "epoch": 8.415094339622641, "grad_norm": 0.8676778674125671, "learning_rate": 9.539773340528871e-05, "loss": 3.2043, "step": 78050 }, { "epoch": 8.420485175202156, "grad_norm": 0.9355192184448242, "learning_rate": 9.507393416082027e-05, "loss": 3.1883, "step": 78100 }, { "epoch": 8.425876010781671, "grad_norm": 0.9380497336387634, "learning_rate": 9.475013491635185e-05, "loss": 3.1893, "step": 78150 }, { "epoch": 8.431266846361186, "grad_norm": 0.8921291828155518, "learning_rate": 9.442633567188343e-05, "loss": 3.1987, "step": 78200 }, { "epoch": 8.436657681940702, "grad_norm": 0.952408492565155, "learning_rate": 9.410253642741498e-05, "loss": 3.2036, "step": 78250 }, { "epoch": 8.442048517520215, "grad_norm": 0.9249347448348999, "learning_rate": 9.377873718294656e-05, "loss": 3.1883, "step": 78300 }, { "epoch": 8.44743935309973, "grad_norm": 0.9118902683258057, "learning_rate": 9.345493793847814e-05, "loss": 3.2047, "step": 78350 }, { "epoch": 8.452830188679245, "grad_norm": 0.929966390132904, "learning_rate": 9.313113869400971e-05, "loss": 3.1827, "step": 78400 }, { "epoch": 8.45822102425876, "grad_norm": 0.9814649224281311, "learning_rate": 9.280733944954126e-05, "loss": 3.1744, "step": 78450 }, { "epoch": 8.463611859838275, "grad_norm": 0.9400845766067505, "learning_rate": 9.248354020507284e-05, "loss": 3.2012, "step": 78500 }, { "epoch": 8.46900269541779, "grad_norm": 0.9196180701255798, "learning_rate": 9.215974096060442e-05, "loss": 3.1851, "step": 78550 }, { "epoch": 8.474393530997304, "grad_norm": 0.8978266716003418, "learning_rate": 9.1835941716136e-05, "loss": 3.1822, "step": 78600 }, { "epoch": 8.479784366576819, "grad_norm": 0.8619605302810669, "learning_rate": 9.151214247166756e-05, "loss": 3.1947, "step": 78650 }, { "epoch": 8.485175202156334, "grad_norm": 
0.9299235939979553, "learning_rate": 9.118834322719913e-05, "loss": 3.2065, "step": 78700 }, { "epoch": 8.49056603773585, "grad_norm": 0.8792457580566406, "learning_rate": 9.08645439827307e-05, "loss": 3.2227, "step": 78750 }, { "epoch": 8.495956873315365, "grad_norm": 0.9111963510513306, "learning_rate": 9.054074473826227e-05, "loss": 3.1876, "step": 78800 }, { "epoch": 8.501347708894878, "grad_norm": 0.912173330783844, "learning_rate": 9.021694549379385e-05, "loss": 3.1958, "step": 78850 }, { "epoch": 8.506738544474393, "grad_norm": 0.8932207822799683, "learning_rate": 8.989314624932541e-05, "loss": 3.2019, "step": 78900 }, { "epoch": 8.512129380053908, "grad_norm": 0.8717414736747742, "learning_rate": 8.956934700485697e-05, "loss": 3.1856, "step": 78950 }, { "epoch": 8.517520215633423, "grad_norm": 0.9076499342918396, "learning_rate": 8.924554776038855e-05, "loss": 3.1819, "step": 79000 }, { "epoch": 8.517520215633423, "eval_accuracy": 0.3909775293876182, "eval_loss": 3.330061435699463, "eval_runtime": 187.9001, "eval_samples_per_second": 95.854, "eval_steps_per_second": 5.993, "step": 79000 }, { "epoch": 8.522911051212938, "grad_norm": 0.9383403062820435, "learning_rate": 8.892174851592013e-05, "loss": 3.1745, "step": 79050 }, { "epoch": 8.528301886792454, "grad_norm": 0.8300234079360962, "learning_rate": 8.859794927145168e-05, "loss": 3.1902, "step": 79100 }, { "epoch": 8.533692722371967, "grad_norm": 0.9023022055625916, "learning_rate": 8.827415002698326e-05, "loss": 3.1957, "step": 79150 }, { "epoch": 8.539083557951482, "grad_norm": 0.8733766078948975, "learning_rate": 8.795035078251483e-05, "loss": 3.1877, "step": 79200 }, { "epoch": 8.544474393530997, "grad_norm": 0.8842893242835999, "learning_rate": 8.762655153804641e-05, "loss": 3.1868, "step": 79250 }, { "epoch": 8.549865229110512, "grad_norm": 0.8878859281539917, "learning_rate": 8.730275229357798e-05, "loss": 3.1857, "step": 79300 }, { "epoch": 8.555256064690028, "grad_norm": 0.9339536428451538, 
"learning_rate": 8.697895304910954e-05, "loss": 3.1905, "step": 79350 }, { "epoch": 8.560646900269543, "grad_norm": 0.9216422438621521, "learning_rate": 8.665515380464112e-05, "loss": 3.1944, "step": 79400 }, { "epoch": 8.566037735849056, "grad_norm": 0.915511965751648, "learning_rate": 8.633135456017268e-05, "loss": 3.2111, "step": 79450 }, { "epoch": 8.571428571428571, "grad_norm": 0.8784403204917908, "learning_rate": 8.600755531570426e-05, "loss": 3.1864, "step": 79500 }, { "epoch": 8.576819407008086, "grad_norm": 0.8907803893089294, "learning_rate": 8.568375607123582e-05, "loss": 3.1804, "step": 79550 }, { "epoch": 8.582210242587601, "grad_norm": 0.8826505541801453, "learning_rate": 8.53599568267674e-05, "loss": 3.1909, "step": 79600 }, { "epoch": 8.587601078167117, "grad_norm": 0.871987521648407, "learning_rate": 8.503615758229897e-05, "loss": 3.1889, "step": 79650 }, { "epoch": 8.59299191374663, "grad_norm": 0.9072272777557373, "learning_rate": 8.471235833783054e-05, "loss": 3.204, "step": 79700 }, { "epoch": 8.598382749326145, "grad_norm": 0.9233155846595764, "learning_rate": 8.438855909336211e-05, "loss": 3.2003, "step": 79750 }, { "epoch": 8.60377358490566, "grad_norm": 0.9233723878860474, "learning_rate": 8.406475984889367e-05, "loss": 3.2135, "step": 79800 }, { "epoch": 8.609164420485175, "grad_norm": 0.9055242538452148, "learning_rate": 8.374096060442525e-05, "loss": 3.1896, "step": 79850 }, { "epoch": 8.61455525606469, "grad_norm": 0.8707368969917297, "learning_rate": 8.341716135995683e-05, "loss": 3.1881, "step": 79900 }, { "epoch": 8.619946091644206, "grad_norm": 0.9200837016105652, "learning_rate": 8.309336211548838e-05, "loss": 3.2068, "step": 79950 }, { "epoch": 8.625336927223719, "grad_norm": 0.9371185898780823, "learning_rate": 8.276956287101996e-05, "loss": 3.217, "step": 80000 }, { "epoch": 8.625336927223719, "eval_accuracy": 0.39134336366088956, "eval_loss": 3.3252391815185547, "eval_runtime": 187.9935, "eval_samples_per_second": 95.807, 
"eval_steps_per_second": 5.99, "step": 80000 }, { "epoch": 8.630727762803234, "grad_norm": 0.9155548810958862, "learning_rate": 8.244576362655153e-05, "loss": 3.1843, "step": 80050 }, { "epoch": 8.63611859838275, "grad_norm": 0.9367262125015259, "learning_rate": 8.212196438208311e-05, "loss": 3.1933, "step": 80100 }, { "epoch": 8.641509433962264, "grad_norm": 0.9463507533073425, "learning_rate": 8.179816513761467e-05, "loss": 3.1984, "step": 80150 }, { "epoch": 8.64690026954178, "grad_norm": 0.978238046169281, "learning_rate": 8.147436589314624e-05, "loss": 3.2138, "step": 80200 }, { "epoch": 8.652291105121293, "grad_norm": 0.9424788355827332, "learning_rate": 8.115056664867782e-05, "loss": 3.1938, "step": 80250 }, { "epoch": 8.657681940700808, "grad_norm": 0.9349570870399475, "learning_rate": 8.082676740420938e-05, "loss": 3.2039, "step": 80300 }, { "epoch": 8.663072776280323, "grad_norm": 0.9339366555213928, "learning_rate": 8.050296815974096e-05, "loss": 3.1935, "step": 80350 }, { "epoch": 8.668463611859838, "grad_norm": 0.9126559495925903, "learning_rate": 8.017916891527252e-05, "loss": 3.2023, "step": 80400 }, { "epoch": 8.673854447439354, "grad_norm": 0.9444116353988647, "learning_rate": 7.985536967080409e-05, "loss": 3.1947, "step": 80450 }, { "epoch": 8.679245283018869, "grad_norm": 0.9091424345970154, "learning_rate": 7.953157042633566e-05, "loss": 3.1969, "step": 80500 }, { "epoch": 8.684636118598382, "grad_norm": 0.8752775192260742, "learning_rate": 7.920777118186724e-05, "loss": 3.1877, "step": 80550 }, { "epoch": 8.690026954177897, "grad_norm": 0.8782562017440796, "learning_rate": 7.888397193739879e-05, "loss": 3.1763, "step": 80600 }, { "epoch": 8.695417789757412, "grad_norm": 0.8987087607383728, "learning_rate": 7.856017269293037e-05, "loss": 3.195, "step": 80650 }, { "epoch": 8.700808625336927, "grad_norm": 0.9534454345703125, "learning_rate": 7.824284943335132e-05, "loss": 3.1834, "step": 80700 }, { "epoch": 8.706199460916443, "grad_norm": 
1.0178718566894531, "learning_rate": 7.791905018888289e-05, "loss": 3.1851, "step": 80750 }, { "epoch": 8.711590296495956, "grad_norm": 0.9170519709587097, "learning_rate": 7.759525094441445e-05, "loss": 3.1812, "step": 80800 }, { "epoch": 8.716981132075471, "grad_norm": 0.8797026872634888, "learning_rate": 7.727145169994603e-05, "loss": 3.1884, "step": 80850 }, { "epoch": 8.722371967654986, "grad_norm": 0.9378693103790283, "learning_rate": 7.69476524554776e-05, "loss": 3.2125, "step": 80900 }, { "epoch": 8.727762803234501, "grad_norm": 0.9277364015579224, "learning_rate": 7.662385321100916e-05, "loss": 3.1871, "step": 80950 }, { "epoch": 8.733153638814017, "grad_norm": 0.9785609841346741, "learning_rate": 7.630005396654073e-05, "loss": 3.1705, "step": 81000 }, { "epoch": 8.733153638814017, "eval_accuracy": 0.39164357158992097, "eval_loss": 3.32247257232666, "eval_runtime": 187.8542, "eval_samples_per_second": 95.878, "eval_steps_per_second": 5.994, "step": 81000 }, { "epoch": 8.738544474393532, "grad_norm": 0.973168671131134, "learning_rate": 7.597625472207231e-05, "loss": 3.1917, "step": 81050 }, { "epoch": 8.743935309973045, "grad_norm": 0.9081736207008362, "learning_rate": 7.565245547760389e-05, "loss": 3.1902, "step": 81100 }, { "epoch": 8.74932614555256, "grad_norm": 0.9135347008705139, "learning_rate": 7.532865623313544e-05, "loss": 3.1826, "step": 81150 }, { "epoch": 8.754716981132075, "grad_norm": 0.8944445252418518, "learning_rate": 7.500485698866702e-05, "loss": 3.2065, "step": 81200 }, { "epoch": 8.76010781671159, "grad_norm": 0.919346034526825, "learning_rate": 7.46810577441986e-05, "loss": 3.1938, "step": 81250 }, { "epoch": 8.765498652291106, "grad_norm": 0.8800582885742188, "learning_rate": 7.435725849973016e-05, "loss": 3.1885, "step": 81300 }, { "epoch": 8.77088948787062, "grad_norm": 0.9364388585090637, "learning_rate": 7.403345925526174e-05, "loss": 3.1847, "step": 81350 }, { "epoch": 8.776280323450134, "grad_norm": 0.9442126154899597, 
"learning_rate": 7.37096600107933e-05, "loss": 3.19, "step": 81400 }, { "epoch": 8.78167115902965, "grad_norm": 0.9176348447799683, "learning_rate": 7.338586076632488e-05, "loss": 3.1993, "step": 81450 }, { "epoch": 8.787061994609164, "grad_norm": 0.9438028931617737, "learning_rate": 7.306206152185644e-05, "loss": 3.2149, "step": 81500 }, { "epoch": 8.79245283018868, "grad_norm": 0.8861234188079834, "learning_rate": 7.273826227738801e-05, "loss": 3.1937, "step": 81550 }, { "epoch": 8.797843665768195, "grad_norm": 0.8872656226158142, "learning_rate": 7.241446303291958e-05, "loss": 3.1829, "step": 81600 }, { "epoch": 8.80323450134771, "grad_norm": 0.8715137839317322, "learning_rate": 7.209066378845115e-05, "loss": 3.1958, "step": 81650 }, { "epoch": 8.808625336927223, "grad_norm": 0.9722141623497009, "learning_rate": 7.176686454398273e-05, "loss": 3.1777, "step": 81700 }, { "epoch": 8.814016172506738, "grad_norm": 0.961341917514801, "learning_rate": 7.144306529951429e-05, "loss": 3.1831, "step": 81750 }, { "epoch": 8.819407008086253, "grad_norm": 0.8844859600067139, "learning_rate": 7.111926605504587e-05, "loss": 3.1845, "step": 81800 }, { "epoch": 8.824797843665769, "grad_norm": 0.9381985068321228, "learning_rate": 7.079546681057743e-05, "loss": 3.2038, "step": 81850 }, { "epoch": 8.830188679245284, "grad_norm": 0.9069252014160156, "learning_rate": 7.047166756610901e-05, "loss": 3.1925, "step": 81900 }, { "epoch": 8.835579514824797, "grad_norm": 0.9448679089546204, "learning_rate": 7.015434430652993e-05, "loss": 3.2069, "step": 81950 }, { "epoch": 8.840970350404312, "grad_norm": 0.9136323928833008, "learning_rate": 6.983054506206151e-05, "loss": 3.1868, "step": 82000 }, { "epoch": 8.840970350404312, "eval_accuracy": 0.3918973847358556, "eval_loss": 3.3201887607574463, "eval_runtime": 188.2231, "eval_samples_per_second": 95.69, "eval_steps_per_second": 5.982, "step": 82000 }, { "epoch": 8.846361185983827, "grad_norm": 0.893839418888092, "learning_rate": 
6.950674581759309e-05, "loss": 3.1889, "step": 82050 }, { "epoch": 8.851752021563343, "grad_norm": 0.9275647401809692, "learning_rate": 6.918294657312465e-05, "loss": 3.1863, "step": 82100 }, { "epoch": 8.857142857142858, "grad_norm": 0.848946213722229, "learning_rate": 6.885914732865623e-05, "loss": 3.1965, "step": 82150 }, { "epoch": 8.862533692722373, "grad_norm": 0.9524142742156982, "learning_rate": 6.85353480841878e-05, "loss": 3.1925, "step": 82200 }, { "epoch": 8.867924528301886, "grad_norm": 0.9237378835678101, "learning_rate": 6.821154883971937e-05, "loss": 3.2198, "step": 82250 }, { "epoch": 8.873315363881401, "grad_norm": 0.9178951978683472, "learning_rate": 6.788774959525094e-05, "loss": 3.2063, "step": 82300 }, { "epoch": 8.878706199460916, "grad_norm": 0.9111886024475098, "learning_rate": 6.756395035078252e-05, "loss": 3.1763, "step": 82350 }, { "epoch": 8.884097035040432, "grad_norm": 0.8813912868499756, "learning_rate": 6.724015110631408e-05, "loss": 3.1891, "step": 82400 }, { "epoch": 8.889487870619947, "grad_norm": 0.9190998077392578, "learning_rate": 6.691635186184566e-05, "loss": 3.1971, "step": 82450 }, { "epoch": 8.89487870619946, "grad_norm": 0.963971734046936, "learning_rate": 6.659255261737722e-05, "loss": 3.199, "step": 82500 }, { "epoch": 8.900269541778975, "grad_norm": 0.9044879078865051, "learning_rate": 6.626875337290879e-05, "loss": 3.2037, "step": 82550 }, { "epoch": 8.90566037735849, "grad_norm": 0.9314145445823669, "learning_rate": 6.594495412844036e-05, "loss": 3.1844, "step": 82600 }, { "epoch": 8.911051212938006, "grad_norm": 0.9088249206542969, "learning_rate": 6.562115488397193e-05, "loss": 3.1967, "step": 82650 }, { "epoch": 8.91644204851752, "grad_norm": 0.9422711730003357, "learning_rate": 6.52973556395035e-05, "loss": 3.1891, "step": 82700 }, { "epoch": 8.921832884097036, "grad_norm": 0.9077197909355164, "learning_rate": 6.497355639503507e-05, "loss": 3.2024, "step": 82750 }, { "epoch": 8.92722371967655, "grad_norm": 
0.9061179757118225, "learning_rate": 6.464975715056663e-05, "loss": 3.1981, "step": 82800 }, { "epoch": 8.932614555256064, "grad_norm": 0.9592944979667664, "learning_rate": 6.432595790609821e-05, "loss": 3.204, "step": 82850 }, { "epoch": 8.93800539083558, "grad_norm": 0.8956493139266968, "learning_rate": 6.400215866162979e-05, "loss": 3.2006, "step": 82900 }, { "epoch": 8.943396226415095, "grad_norm": 0.8995712995529175, "learning_rate": 6.367835941716135e-05, "loss": 3.18, "step": 82950 }, { "epoch": 8.94878706199461, "grad_norm": 0.96676105260849, "learning_rate": 6.335456017269293e-05, "loss": 3.2008, "step": 83000 }, { "epoch": 8.94878706199461, "eval_accuracy": 0.39247530944614084, "eval_loss": 3.3157753944396973, "eval_runtime": 187.7013, "eval_samples_per_second": 95.956, "eval_steps_per_second": 5.999, "step": 83000 }, { "epoch": 8.954177897574123, "grad_norm": 0.9426843523979187, "learning_rate": 6.30307609282245e-05, "loss": 3.1781, "step": 83050 }, { "epoch": 8.959568733153638, "grad_norm": 0.8954929113388062, "learning_rate": 6.270696168375607e-05, "loss": 3.1773, "step": 83100 }, { "epoch": 8.964959568733153, "grad_norm": 0.9750449657440186, "learning_rate": 6.238316243928764e-05, "loss": 3.199, "step": 83150 }, { "epoch": 8.970350404312669, "grad_norm": 0.9364018440246582, "learning_rate": 6.205936319481921e-05, "loss": 3.2079, "step": 83200 }, { "epoch": 8.975741239892184, "grad_norm": 0.8858622908592224, "learning_rate": 6.173556395035078e-05, "loss": 3.1922, "step": 83250 }, { "epoch": 8.981132075471699, "grad_norm": 1.0074647665023804, "learning_rate": 6.141176470588236e-05, "loss": 3.1942, "step": 83300 }, { "epoch": 8.986522911051212, "grad_norm": 0.9643685817718506, "learning_rate": 6.108796546141392e-05, "loss": 3.1766, "step": 83350 }, { "epoch": 8.991913746630727, "grad_norm": 0.9266127943992615, "learning_rate": 6.076416621694549e-05, "loss": 3.1933, "step": 83400 }, { "epoch": 8.997304582210242, "grad_norm": 0.9136055707931519, 
"learning_rate": 6.0440366972477055e-05, "loss": 3.2052, "step": 83450 }, { "epoch": 9.002695417789758, "grad_norm": 0.9789246916770935, "learning_rate": 6.011656772800863e-05, "loss": 3.1685, "step": 83500 }, { "epoch": 9.008086253369273, "grad_norm": 0.9144122004508972, "learning_rate": 5.9792768483540197e-05, "loss": 3.1438, "step": 83550 }, { "epoch": 9.013477088948788, "grad_norm": 0.9275460243225098, "learning_rate": 5.946896923907177e-05, "loss": 3.1252, "step": 83600 }, { "epoch": 9.018867924528301, "grad_norm": 1.0540989637374878, "learning_rate": 5.9145169994603345e-05, "loss": 3.1482, "step": 83650 }, { "epoch": 9.024258760107816, "grad_norm": 0.8946913480758667, "learning_rate": 5.882137075013491e-05, "loss": 3.1316, "step": 83700 }, { "epoch": 9.029649595687331, "grad_norm": 0.9386449456214905, "learning_rate": 5.849757150566649e-05, "loss": 3.1166, "step": 83750 }, { "epoch": 9.035040431266847, "grad_norm": 0.916728675365448, "learning_rate": 5.817377226119805e-05, "loss": 3.1525, "step": 83800 }, { "epoch": 9.040431266846362, "grad_norm": 0.890503466129303, "learning_rate": 5.784997301672963e-05, "loss": 3.1461, "step": 83850 }, { "epoch": 9.045822102425875, "grad_norm": 0.8731235861778259, "learning_rate": 5.752617377226119e-05, "loss": 3.1499, "step": 83900 }, { "epoch": 9.05121293800539, "grad_norm": 0.9045177698135376, "learning_rate": 5.7202374527792764e-05, "loss": 3.1331, "step": 83950 }, { "epoch": 9.056603773584905, "grad_norm": 0.9173175692558289, "learning_rate": 5.6878575283324335e-05, "loss": 3.1378, "step": 84000 }, { "epoch": 9.056603773584905, "eval_accuracy": 0.39228864378487555, "eval_loss": 3.3212382793426514, "eval_runtime": 188.0841, "eval_samples_per_second": 95.76, "eval_steps_per_second": 5.987, "step": 84000 }, { "epoch": 9.06199460916442, "grad_norm": 0.9891923069953918, "learning_rate": 5.6554776038855905e-05, "loss": 3.1346, "step": 84050 }, { "epoch": 9.067385444743936, "grad_norm": 0.9098547697067261, "learning_rate": 
5.623097679438747e-05, "loss": 3.14, "step": 84100 }, { "epoch": 9.07277628032345, "grad_norm": 0.9232341647148132, "learning_rate": 5.590717754991905e-05, "loss": 3.1318, "step": 84150 }, { "epoch": 9.078167115902964, "grad_norm": 0.9636071920394897, "learning_rate": 5.558337830545061e-05, "loss": 3.1412, "step": 84200 }, { "epoch": 9.08355795148248, "grad_norm": 0.9031469821929932, "learning_rate": 5.525957906098219e-05, "loss": 3.1383, "step": 84250 }, { "epoch": 9.088948787061994, "grad_norm": 0.9007047414779663, "learning_rate": 5.493577981651375e-05, "loss": 3.1277, "step": 84300 }, { "epoch": 9.09433962264151, "grad_norm": 0.9313275814056396, "learning_rate": 5.4611980572045324e-05, "loss": 3.1248, "step": 84350 }, { "epoch": 9.099730458221025, "grad_norm": 0.8984010219573975, "learning_rate": 5.4288181327576895e-05, "loss": 3.1446, "step": 84400 }, { "epoch": 9.10512129380054, "grad_norm": 0.9281125664710999, "learning_rate": 5.3964382083108466e-05, "loss": 3.1514, "step": 84450 }, { "epoch": 9.110512129380053, "grad_norm": 0.921140730381012, "learning_rate": 5.3640582838640043e-05, "loss": 3.1314, "step": 84500 }, { "epoch": 9.115902964959568, "grad_norm": 0.9595467448234558, "learning_rate": 5.331678359417161e-05, "loss": 3.1323, "step": 84550 }, { "epoch": 9.121293800539084, "grad_norm": 0.9736076593399048, "learning_rate": 5.2992984349703185e-05, "loss": 3.1573, "step": 84600 }, { "epoch": 9.126684636118599, "grad_norm": 0.9062066078186035, "learning_rate": 5.266918510523475e-05, "loss": 3.1342, "step": 84650 }, { "epoch": 9.132075471698114, "grad_norm": 0.8909493684768677, "learning_rate": 5.234538586076632e-05, "loss": 3.1348, "step": 84700 }, { "epoch": 9.137466307277627, "grad_norm": 0.9599141478538513, "learning_rate": 5.202158661629789e-05, "loss": 3.1318, "step": 84750 }, { "epoch": 9.142857142857142, "grad_norm": 0.8932557106018066, "learning_rate": 5.169778737182946e-05, "loss": 3.132, "step": 84800 }, { "epoch": 9.148247978436657, "grad_norm": 
0.9026200771331787, "learning_rate": 5.1373988127361026e-05, "loss": 3.1385, "step": 84850 }, { "epoch": 9.153638814016173, "grad_norm": 0.9215351939201355, "learning_rate": 5.1050188882892604e-05, "loss": 3.1445, "step": 84900 }, { "epoch": 9.159029649595688, "grad_norm": 0.9547321200370789, "learning_rate": 5.072638963842417e-05, "loss": 3.1533, "step": 84950 }, { "epoch": 9.164420485175203, "grad_norm": 0.9079762101173401, "learning_rate": 5.0402590393955746e-05, "loss": 3.1322, "step": 85000 }, { "epoch": 9.164420485175203, "eval_accuracy": 0.3925814633175356, "eval_loss": 3.317584276199341, "eval_runtime": 187.6367, "eval_samples_per_second": 95.989, "eval_steps_per_second": 6.001, "step": 85000 }, { "epoch": 9.169811320754716, "grad_norm": 0.8956434726715088, "learning_rate": 5.007879114948731e-05, "loss": 3.1324, "step": 85050 }, { "epoch": 9.175202156334231, "grad_norm": 0.9152560234069824, "learning_rate": 4.975499190501889e-05, "loss": 3.1568, "step": 85100 }, { "epoch": 9.180592991913747, "grad_norm": 0.915358304977417, "learning_rate": 4.943119266055045e-05, "loss": 3.1344, "step": 85150 }, { "epoch": 9.185983827493262, "grad_norm": 0.9076443910598755, "learning_rate": 4.910739341608202e-05, "loss": 3.1544, "step": 85200 }, { "epoch": 9.191374663072777, "grad_norm": 0.8821456432342529, "learning_rate": 4.87835941716136e-05, "loss": 3.1362, "step": 85250 }, { "epoch": 9.19676549865229, "grad_norm": 0.9051779508590698, "learning_rate": 4.8459794927145164e-05, "loss": 3.1339, "step": 85300 }, { "epoch": 9.202156334231805, "grad_norm": 0.9335005283355713, "learning_rate": 4.813599568267674e-05, "loss": 3.1469, "step": 85350 }, { "epoch": 9.20754716981132, "grad_norm": 0.9518271684646606, "learning_rate": 4.7812196438208306e-05, "loss": 3.1175, "step": 85400 }, { "epoch": 9.212938005390836, "grad_norm": 0.9518353343009949, "learning_rate": 4.748839719373988e-05, "loss": 3.1383, "step": 85450 }, { "epoch": 9.21832884097035, "grad_norm": 0.9310569763183594, 
"learning_rate": 4.716459794927145e-05, "loss": 3.16, "step": 85500 }, { "epoch": 9.223719676549866, "grad_norm": 0.98565274477005, "learning_rate": 4.684079870480302e-05, "loss": 3.1408, "step": 85550 }, { "epoch": 9.22911051212938, "grad_norm": 0.9924389123916626, "learning_rate": 4.651699946033459e-05, "loss": 3.153, "step": 85600 }, { "epoch": 9.234501347708894, "grad_norm": 0.9255833029747009, "learning_rate": 4.619320021586616e-05, "loss": 3.1516, "step": 85650 }, { "epoch": 9.23989218328841, "grad_norm": 0.9540524482727051, "learning_rate": 4.5869400971397725e-05, "loss": 3.1385, "step": 85700 }, { "epoch": 9.245283018867925, "grad_norm": 0.9115455150604248, "learning_rate": 4.55456017269293e-05, "loss": 3.1441, "step": 85750 }, { "epoch": 9.25067385444744, "grad_norm": 0.8959429264068604, "learning_rate": 4.5221802482460866e-05, "loss": 3.1475, "step": 85800 }, { "epoch": 9.256064690026955, "grad_norm": 0.9074015021324158, "learning_rate": 4.4898003237992444e-05, "loss": 3.144, "step": 85850 }, { "epoch": 9.261455525606468, "grad_norm": 0.954766571521759, "learning_rate": 4.457420399352401e-05, "loss": 3.1538, "step": 85900 }, { "epoch": 9.266846361185983, "grad_norm": 0.9908513426780701, "learning_rate": 4.425040474905558e-05, "loss": 3.1401, "step": 85950 }, { "epoch": 9.272237196765499, "grad_norm": 0.9178355932235718, "learning_rate": 4.392660550458716e-05, "loss": 3.1334, "step": 86000 }, { "epoch": 9.272237196765499, "eval_accuracy": 0.3929567503920468, "eval_loss": 3.315417766571045, "eval_runtime": 187.4351, "eval_samples_per_second": 96.092, "eval_steps_per_second": 6.007, "step": 86000 }, { "epoch": 9.277628032345014, "grad_norm": 0.9122679233551025, "learning_rate": 4.360280626011872e-05, "loss": 3.1163, "step": 86050 }, { "epoch": 9.283018867924529, "grad_norm": 0.9035681486129761, "learning_rate": 4.32790070156503e-05, "loss": 3.1341, "step": 86100 }, { "epoch": 9.288409703504042, "grad_norm": 0.9279851913452148, "learning_rate": 
4.295520777118186e-05, "loss": 3.153, "step": 86150 }, { "epoch": 9.293800539083557, "grad_norm": 0.9015566110610962, "learning_rate": 4.263140852671344e-05, "loss": 3.1309, "step": 86200 }, { "epoch": 9.299191374663073, "grad_norm": 0.8954885601997375, "learning_rate": 4.2307609282245004e-05, "loss": 3.1584, "step": 86250 }, { "epoch": 9.304582210242588, "grad_norm": 0.9186733961105347, "learning_rate": 4.1983810037776575e-05, "loss": 3.1386, "step": 86300 }, { "epoch": 9.309973045822103, "grad_norm": 0.9036773443222046, "learning_rate": 4.1660010793308146e-05, "loss": 3.1344, "step": 86350 }, { "epoch": 9.315363881401618, "grad_norm": 0.9160073399543762, "learning_rate": 4.133621154883972e-05, "loss": 3.1418, "step": 86400 }, { "epoch": 9.320754716981131, "grad_norm": 0.9248919486999512, "learning_rate": 4.101241230437128e-05, "loss": 3.1488, "step": 86450 }, { "epoch": 9.326145552560646, "grad_norm": 0.957439124584198, "learning_rate": 4.068861305990286e-05, "loss": 3.1424, "step": 86500 }, { "epoch": 9.331536388140162, "grad_norm": 0.9187906980514526, "learning_rate": 4.036481381543442e-05, "loss": 3.1554, "step": 86550 }, { "epoch": 9.336927223719677, "grad_norm": 0.9415502548217773, "learning_rate": 4.0041014570966e-05, "loss": 3.143, "step": 86600 }, { "epoch": 9.342318059299192, "grad_norm": 0.9612598419189453, "learning_rate": 3.9717215326497565e-05, "loss": 3.1304, "step": 86650 }, { "epoch": 9.347708894878707, "grad_norm": 0.8706883788108826, "learning_rate": 3.9393416082029136e-05, "loss": 3.134, "step": 86700 }, { "epoch": 9.35309973045822, "grad_norm": 0.9136846661567688, "learning_rate": 3.9069616837560706e-05, "loss": 3.1408, "step": 86750 }, { "epoch": 9.358490566037736, "grad_norm": 0.9092503786087036, "learning_rate": 3.874581759309228e-05, "loss": 3.153, "step": 86800 }, { "epoch": 9.36388140161725, "grad_norm": 0.9336559176445007, "learning_rate": 3.8422018348623855e-05, "loss": 3.1576, "step": 86850 }, { "epoch": 9.369272237196766, 
"grad_norm": 0.9446649551391602, "learning_rate": 3.809821910415542e-05, "loss": 3.1343, "step": 86900 }, { "epoch": 9.374663072776281, "grad_norm": 0.9409492611885071, "learning_rate": 3.7774419859687e-05, "loss": 3.1445, "step": 86950 }, { "epoch": 9.380053908355794, "grad_norm": 0.9114211201667786, "learning_rate": 3.745062061521856e-05, "loss": 3.1351, "step": 87000 }, { "epoch": 9.380053908355794, "eval_accuracy": 0.3931550419123082, "eval_loss": 3.313524007797241, "eval_runtime": 187.7613, "eval_samples_per_second": 95.925, "eval_steps_per_second": 5.997, "step": 87000 }, { "epoch": 9.38544474393531, "grad_norm": 0.9090732932090759, "learning_rate": 3.712682137075013e-05, "loss": 3.1364, "step": 87050 }, { "epoch": 9.390835579514825, "grad_norm": 0.9086277484893799, "learning_rate": 3.68030221262817e-05, "loss": 3.1433, "step": 87100 }, { "epoch": 9.39622641509434, "grad_norm": 0.957598090171814, "learning_rate": 3.648569886670264e-05, "loss": 3.1458, "step": 87150 }, { "epoch": 9.401617250673855, "grad_norm": 0.9030936360359192, "learning_rate": 3.616189962223421e-05, "loss": 3.1413, "step": 87200 }, { "epoch": 9.40700808625337, "grad_norm": 0.8740924000740051, "learning_rate": 3.583810037776578e-05, "loss": 3.1287, "step": 87250 }, { "epoch": 9.412398921832883, "grad_norm": 0.9102224707603455, "learning_rate": 3.5514301133297354e-05, "loss": 3.1572, "step": 87300 }, { "epoch": 9.417789757412399, "grad_norm": 0.9368909001350403, "learning_rate": 3.5190501888828925e-05, "loss": 3.1524, "step": 87350 }, { "epoch": 9.423180592991914, "grad_norm": 0.9046692848205566, "learning_rate": 3.4866702644360496e-05, "loss": 3.1581, "step": 87400 }, { "epoch": 9.428571428571429, "grad_norm": 0.9296504259109497, "learning_rate": 3.4542903399892067e-05, "loss": 3.148, "step": 87450 }, { "epoch": 9.433962264150944, "grad_norm": 0.9260674118995667, "learning_rate": 3.421910415542363e-05, "loss": 3.1497, "step": 87500 }, { "epoch": 9.439353099730457, "grad_norm": 
0.9342174530029297, "learning_rate": 3.38953049109552e-05, "loss": 3.1412, "step": 87550 }, { "epoch": 9.444743935309972, "grad_norm": 0.9455352425575256, "learning_rate": 3.357150566648677e-05, "loss": 3.126, "step": 87600 }, { "epoch": 9.450134770889488, "grad_norm": 0.9465023279190063, "learning_rate": 3.324770642201834e-05, "loss": 3.134, "step": 87650 }, { "epoch": 9.455525606469003, "grad_norm": 0.8920009136199951, "learning_rate": 3.2923907177549914e-05, "loss": 3.1474, "step": 87700 }, { "epoch": 9.460916442048518, "grad_norm": 0.9538445472717285, "learning_rate": 3.2600107933081485e-05, "loss": 3.1486, "step": 87750 }, { "epoch": 9.466307277628033, "grad_norm": 0.889605700969696, "learning_rate": 3.2276308688613056e-05, "loss": 3.1442, "step": 87800 }, { "epoch": 9.471698113207546, "grad_norm": 0.9327829480171204, "learning_rate": 3.195250944414463e-05, "loss": 3.1462, "step": 87850 }, { "epoch": 9.477088948787062, "grad_norm": 0.9338760375976562, "learning_rate": 3.16287101996762e-05, "loss": 3.1455, "step": 87900 }, { "epoch": 9.482479784366577, "grad_norm": 0.9253023862838745, "learning_rate": 3.130491095520777e-05, "loss": 3.141, "step": 87950 }, { "epoch": 9.487870619946092, "grad_norm": 0.922599196434021, "learning_rate": 3.098111171073934e-05, "loss": 3.1321, "step": 88000 }, { "epoch": 9.487870619946092, "eval_accuracy": 0.3934250443385272, "eval_loss": 3.311544895172119, "eval_runtime": 187.9489, "eval_samples_per_second": 95.829, "eval_steps_per_second": 5.991, "step": 88000 }, { "epoch": 9.493261455525607, "grad_norm": 0.9280849099159241, "learning_rate": 3.065731246627091e-05, "loss": 3.1346, "step": 88050 }, { "epoch": 9.498652291105122, "grad_norm": 0.9111308455467224, "learning_rate": 3.033351322180248e-05, "loss": 3.1326, "step": 88100 }, { "epoch": 9.504043126684635, "grad_norm": 0.878240704536438, "learning_rate": 3.000971397733405e-05, "loss": 3.1318, "step": 88150 }, { "epoch": 9.50943396226415, "grad_norm": 0.9397717118263245, 
"learning_rate": 2.968591473286562e-05, "loss": 3.1424, "step": 88200 }, { "epoch": 9.514824797843666, "grad_norm": 0.9439541697502136, "learning_rate": 2.936211548839719e-05, "loss": 3.1412, "step": 88250 }, { "epoch": 9.520215633423181, "grad_norm": 0.9720891714096069, "learning_rate": 2.903831624392876e-05, "loss": 3.1292, "step": 88300 }, { "epoch": 9.525606469002696, "grad_norm": 0.9237875938415527, "learning_rate": 2.8714516999460332e-05, "loss": 3.1326, "step": 88350 }, { "epoch": 9.530997304582211, "grad_norm": 0.9764357209205627, "learning_rate": 2.83907177549919e-05, "loss": 3.1372, "step": 88400 }, { "epoch": 9.536388140161725, "grad_norm": 0.942274272441864, "learning_rate": 2.806691851052347e-05, "loss": 3.1463, "step": 88450 }, { "epoch": 9.54177897574124, "grad_norm": 0.9024509787559509, "learning_rate": 2.774311926605504e-05, "loss": 3.1422, "step": 88500 }, { "epoch": 9.547169811320755, "grad_norm": 0.923398494720459, "learning_rate": 2.7419320021586613e-05, "loss": 3.1545, "step": 88550 }, { "epoch": 9.55256064690027, "grad_norm": 0.9231415390968323, "learning_rate": 2.7095520777118187e-05, "loss": 3.1488, "step": 88600 }, { "epoch": 9.557951482479785, "grad_norm": 0.9691388010978699, "learning_rate": 2.6771721532649758e-05, "loss": 3.1497, "step": 88650 }, { "epoch": 9.563342318059298, "grad_norm": 0.9140695929527283, "learning_rate": 2.6447922288181325e-05, "loss": 3.143, "step": 88700 }, { "epoch": 9.568733153638814, "grad_norm": 0.9069567322731018, "learning_rate": 2.6124123043712896e-05, "loss": 3.1387, "step": 88750 }, { "epoch": 9.574123989218329, "grad_norm": 0.9632625579833984, "learning_rate": 2.5800323799244467e-05, "loss": 3.13, "step": 88800 }, { "epoch": 9.579514824797844, "grad_norm": 0.8916159868240356, "learning_rate": 2.5476524554776038e-05, "loss": 3.15, "step": 88850 }, { "epoch": 9.584905660377359, "grad_norm": 0.9191704392433167, "learning_rate": 2.515272531030761e-05, "loss": 3.1395, "step": 88900 }, { "epoch": 
9.590296495956874, "grad_norm": 0.8881397843360901, "learning_rate": 2.4828926065839176e-05, "loss": 3.123, "step": 88950 }, { "epoch": 9.595687331536388, "grad_norm": 0.9439510703086853, "learning_rate": 2.4505126821370747e-05, "loss": 3.1353, "step": 89000 }, { "epoch": 9.595687331536388, "eval_accuracy": 0.39363930783329737, "eval_loss": 3.308457136154175, "eval_runtime": 187.5014, "eval_samples_per_second": 96.058, "eval_steps_per_second": 6.005, "step": 89000 }, { "epoch": 9.601078167115903, "grad_norm": 0.9280842542648315, "learning_rate": 2.4181327576902318e-05, "loss": 3.1481, "step": 89050 }, { "epoch": 9.606469002695418, "grad_norm": 0.9251762628555298, "learning_rate": 2.385752833243389e-05, "loss": 3.1365, "step": 89100 }, { "epoch": 9.611859838274933, "grad_norm": 0.9192915558815002, "learning_rate": 2.353372908796546e-05, "loss": 3.1542, "step": 89150 }, { "epoch": 9.617250673854448, "grad_norm": 0.9379941821098328, "learning_rate": 2.3209929843497027e-05, "loss": 3.1426, "step": 89200 }, { "epoch": 9.622641509433961, "grad_norm": 0.9859703779220581, "learning_rate": 2.289260658391797e-05, "loss": 3.1537, "step": 89250 }, { "epoch": 9.628032345013477, "grad_norm": 0.9222863912582397, "learning_rate": 2.256880733944954e-05, "loss": 3.1382, "step": 89300 }, { "epoch": 9.633423180592992, "grad_norm": 0.9080533981323242, "learning_rate": 2.224500809498111e-05, "loss": 3.1555, "step": 89350 }, { "epoch": 9.638814016172507, "grad_norm": 0.926685631275177, "learning_rate": 2.1921208850512682e-05, "loss": 3.1466, "step": 89400 }, { "epoch": 9.644204851752022, "grad_norm": 0.9160860180854797, "learning_rate": 2.1597409606044253e-05, "loss": 3.1422, "step": 89450 }, { "epoch": 9.649595687331537, "grad_norm": 0.9045087695121765, "learning_rate": 2.127361036157582e-05, "loss": 3.1423, "step": 89500 }, { "epoch": 9.65498652291105, "grad_norm": 0.9281613826751709, "learning_rate": 2.094981111710739e-05, "loss": 3.1433, "step": 89550 }, { "epoch": 9.660377358490566, 
"grad_norm": 0.9077588319778442, "learning_rate": 2.0626011872638962e-05, "loss": 3.1219, "step": 89600 }, { "epoch": 9.66576819407008, "grad_norm": 0.9211140871047974, "learning_rate": 2.0302212628170533e-05, "loss": 3.1379, "step": 89650 }, { "epoch": 9.671159029649596, "grad_norm": 0.9394590854644775, "learning_rate": 1.99784133837021e-05, "loss": 3.1425, "step": 89700 }, { "epoch": 9.676549865229111, "grad_norm": 0.9185537099838257, "learning_rate": 1.965461413923367e-05, "loss": 3.1399, "step": 89750 }, { "epoch": 9.681940700808624, "grad_norm": 0.8864555358886719, "learning_rate": 1.9330814894765242e-05, "loss": 3.1365, "step": 89800 }, { "epoch": 9.68733153638814, "grad_norm": 0.952918529510498, "learning_rate": 1.9007015650296813e-05, "loss": 3.1575, "step": 89850 }, { "epoch": 9.692722371967655, "grad_norm": 0.9361217617988586, "learning_rate": 1.8683216405828384e-05, "loss": 3.1383, "step": 89900 }, { "epoch": 9.69811320754717, "grad_norm": 0.9423272013664246, "learning_rate": 1.8359417161359955e-05, "loss": 3.1508, "step": 89950 }, { "epoch": 9.703504043126685, "grad_norm": 0.9519898295402527, "learning_rate": 1.8035617916891526e-05, "loss": 3.1414, "step": 90000 }, { "epoch": 9.703504043126685, "eval_accuracy": 0.3940164420069014, "eval_loss": 3.3063955307006836, "eval_runtime": 187.5748, "eval_samples_per_second": 96.02, "eval_steps_per_second": 6.003, "step": 90000 }, { "epoch": 9.7088948787062, "grad_norm": 0.9187700748443604, "learning_rate": 1.7711818672423097e-05, "loss": 3.1495, "step": 90050 }, { "epoch": 9.714285714285714, "grad_norm": 0.9302745461463928, "learning_rate": 1.7388019427954664e-05, "loss": 3.1235, "step": 90100 }, { "epoch": 9.719676549865229, "grad_norm": 0.9535933136940002, "learning_rate": 1.706422018348624e-05, "loss": 3.1415, "step": 90150 }, { "epoch": 9.725067385444744, "grad_norm": 0.8942263722419739, "learning_rate": 1.674042093901781e-05, "loss": 3.1423, "step": 90200 }, { "epoch": 9.730458221024259, "grad_norm": 
0.9255458116531372, "learning_rate": 1.6416621694549377e-05, "loss": 3.1483, "step": 90250 }, { "epoch": 9.735849056603774, "grad_norm": 0.9040615558624268, "learning_rate": 1.6092822450080948e-05, "loss": 3.1325, "step": 90300 }, { "epoch": 9.74123989218329, "grad_norm": 0.9275330901145935, "learning_rate": 1.576902320561252e-05, "loss": 3.1358, "step": 90350 }, { "epoch": 9.746630727762803, "grad_norm": 0.9373171925544739, "learning_rate": 1.544522396114409e-05, "loss": 3.1712, "step": 90400 }, { "epoch": 9.752021563342318, "grad_norm": 0.9148626923561096, "learning_rate": 1.5121424716675659e-05, "loss": 3.1346, "step": 90450 }, { "epoch": 9.757412398921833, "grad_norm": 0.9298881888389587, "learning_rate": 1.479762547220723e-05, "loss": 3.1401, "step": 90500 }, { "epoch": 9.762803234501348, "grad_norm": 0.9553585052490234, "learning_rate": 1.4473826227738802e-05, "loss": 3.1409, "step": 90550 }, { "epoch": 9.768194070080863, "grad_norm": 0.8911868333816528, "learning_rate": 1.4150026983270371e-05, "loss": 3.1391, "step": 90600 }, { "epoch": 9.773584905660378, "grad_norm": 0.9505174160003662, "learning_rate": 1.3826227738801942e-05, "loss": 3.1435, "step": 90650 }, { "epoch": 9.778975741239892, "grad_norm": 0.8940786123275757, "learning_rate": 1.3502428494333512e-05, "loss": 3.1435, "step": 90700 }, { "epoch": 9.784366576819407, "grad_norm": 0.9654335379600525, "learning_rate": 1.3178629249865082e-05, "loss": 3.136, "step": 90750 }, { "epoch": 9.789757412398922, "grad_norm": 0.9177462458610535, "learning_rate": 1.2854830005396653e-05, "loss": 3.1599, "step": 90800 }, { "epoch": 9.795148247978437, "grad_norm": 0.8937206268310547, "learning_rate": 1.2531030760928222e-05, "loss": 3.1504, "step": 90850 }, { "epoch": 9.800539083557952, "grad_norm": 0.9319292902946472, "learning_rate": 1.2207231516459793e-05, "loss": 3.1356, "step": 90900 }, { "epoch": 9.805929919137466, "grad_norm": 0.9177371859550476, "learning_rate": 1.1883432271991366e-05, "loss": 3.1507, "step": 
90950 }, { "epoch": 9.81132075471698, "grad_norm": 0.9243341088294983, "learning_rate": 1.1559633027522935e-05, "loss": 3.1511, "step": 91000 }, { "epoch": 9.81132075471698, "eval_accuracy": 0.3941807251732769, "eval_loss": 3.304288148880005, "eval_runtime": 187.6044, "eval_samples_per_second": 96.005, "eval_steps_per_second": 6.002, "step": 91000 }, { "epoch": 9.816711590296496, "grad_norm": 0.9366071224212646, "learning_rate": 1.1235833783054506e-05, "loss": 3.1432, "step": 91050 }, { "epoch": 9.822102425876011, "grad_norm": 0.8996028900146484, "learning_rate": 1.0912034538586075e-05, "loss": 3.1392, "step": 91100 }, { "epoch": 9.827493261455526, "grad_norm": 0.9392869472503662, "learning_rate": 1.0588235294117646e-05, "loss": 3.1376, "step": 91150 }, { "epoch": 9.832884097035041, "grad_norm": 0.9204728007316589, "learning_rate": 1.0264436049649217e-05, "loss": 3.1242, "step": 91200 }, { "epoch": 9.838274932614555, "grad_norm": 0.9263073801994324, "learning_rate": 9.940636805180786e-06, "loss": 3.13, "step": 91250 }, { "epoch": 9.84366576819407, "grad_norm": 0.9406017661094666, "learning_rate": 9.616837560712357e-06, "loss": 3.125, "step": 91300 }, { "epoch": 9.849056603773585, "grad_norm": 0.9088371992111206, "learning_rate": 9.293038316243928e-06, "loss": 3.1387, "step": 91350 }, { "epoch": 9.8544474393531, "grad_norm": 0.9465605616569519, "learning_rate": 8.975715056664867e-06, "loss": 3.1628, "step": 91400 }, { "epoch": 9.859838274932615, "grad_norm": 0.9424158334732056, "learning_rate": 8.651915812196437e-06, "loss": 3.1449, "step": 91450 }, { "epoch": 9.865229110512129, "grad_norm": 0.9331150054931641, "learning_rate": 8.328116567728008e-06, "loss": 3.1452, "step": 91500 }, { "epoch": 9.870619946091644, "grad_norm": 0.8824761509895325, "learning_rate": 8.004317323259577e-06, "loss": 3.1565, "step": 91550 }, { "epoch": 9.876010781671159, "grad_norm": 0.8807875514030457, "learning_rate": 7.680518078791148e-06, "loss": 3.1295, "step": 91600 }, { "epoch": 
9.881401617250674, "grad_norm": 0.9300699830055237, "learning_rate": 7.356718834322719e-06, "loss": 3.1535, "step": 91650 }, { "epoch": 9.88679245283019, "grad_norm": 0.9522417783737183, "learning_rate": 7.03291958985429e-06, "loss": 3.1476, "step": 91700 }, { "epoch": 9.892183288409704, "grad_norm": 0.9019232392311096, "learning_rate": 6.70912034538586e-06, "loss": 3.1431, "step": 91750 }, { "epoch": 9.897574123989218, "grad_norm": 0.91133052110672, "learning_rate": 6.38532110091743e-06, "loss": 3.1496, "step": 91800 }, { "epoch": 9.902964959568733, "grad_norm": 0.9428941011428833, "learning_rate": 6.061521856449001e-06, "loss": 3.1511, "step": 91850 }, { "epoch": 9.908355795148248, "grad_norm": 0.9570835828781128, "learning_rate": 5.737722611980571e-06, "loss": 3.145, "step": 91900 }, { "epoch": 9.913746630727763, "grad_norm": 0.907947838306427, "learning_rate": 5.413923367512142e-06, "loss": 3.1555, "step": 91950 }, { "epoch": 9.919137466307278, "grad_norm": 0.8969404101371765, "learning_rate": 5.090124123043712e-06, "loss": 3.1522, "step": 92000 }, { "epoch": 9.919137466307278, "eval_accuracy": 0.3943509842484822, "eval_loss": 3.303070068359375, "eval_runtime": 188.1629, "eval_samples_per_second": 95.72, "eval_steps_per_second": 5.984, "step": 92000 }, { "epoch": 9.924528301886792, "grad_norm": 0.9321413040161133, "learning_rate": 4.766324878575283e-06, "loss": 3.1493, "step": 92050 }, { "epoch": 9.929919137466307, "grad_norm": 0.8739159107208252, "learning_rate": 4.442525634106853e-06, "loss": 3.1489, "step": 92100 }, { "epoch": 9.935309973045822, "grad_norm": 0.8782985210418701, "learning_rate": 4.118726389638424e-06, "loss": 3.1392, "step": 92150 }, { "epoch": 9.940700808625337, "grad_norm": 0.905799925327301, "learning_rate": 3.7949271451699944e-06, "loss": 3.1412, "step": 92200 }, { "epoch": 9.946091644204852, "grad_norm": 0.933990478515625, "learning_rate": 3.4711279007015644e-06, "loss": 3.1318, "step": 92250 }, { "epoch": 9.951482479784367, "grad_norm": 
0.951168417930603, "learning_rate": 3.1473286562331353e-06, "loss": 3.1352, "step": 92300 }, { "epoch": 9.95687331536388, "grad_norm": 0.935908854007721, "learning_rate": 2.8235294117647054e-06, "loss": 3.1434, "step": 92350 }, { "epoch": 9.962264150943396, "grad_norm": 0.8934540748596191, "learning_rate": 2.4997301672962763e-06, "loss": 3.1592, "step": 92400 }, { "epoch": 9.967654986522911, "grad_norm": 0.9462904930114746, "learning_rate": 2.1759309228278467e-06, "loss": 3.1313, "step": 92450 }, { "epoch": 9.973045822102426, "grad_norm": 0.9313557147979736, "learning_rate": 1.852131678359417e-06, "loss": 3.132, "step": 92500 }, { "epoch": 9.978436657681941, "grad_norm": 0.9732421636581421, "learning_rate": 1.5283324338909875e-06, "loss": 3.1402, "step": 92550 }, { "epoch": 9.983827493261456, "grad_norm": 0.8972058892250061, "learning_rate": 1.204533189422558e-06, "loss": 3.1391, "step": 92600 }, { "epoch": 9.98921832884097, "grad_norm": 0.9335411190986633, "learning_rate": 8.807339449541284e-07, "loss": 3.1346, "step": 92650 }, { "epoch": 9.994609164420485, "grad_norm": 0.9858599305152893, "learning_rate": 5.569347004856989e-07, "loss": 3.1294, "step": 92700 }, { "epoch": 10.0, "grad_norm": 1.9937326908111572, "learning_rate": 2.3313545601726927e-07, "loss": 3.139, "step": 92750 }, { "epoch": 10.0, "step": 92750, "total_flos": 7.75449427968e+17, "train_loss": 3.467074710979616, "train_runtime": 80468.1749, "train_samples_per_second": 36.881, "train_steps_per_second": 1.153 } ], "logging_steps": 50, "max_steps": 92750, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.75449427968e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }