{ "best_metric": 3.3018898963928223, "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_low_2000_8397/checkpoint-90000", "epoch": 10.0, "eval_steps": 1000, "global_step": 92750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005390835579514825, "grad_norm": 1.5813435316085815, "learning_rate": 0.0003, "loss": 8.6318, "step": 50 }, { "epoch": 0.01078167115902965, "grad_norm": 1.0082982778549194, "learning_rate": 0.0006, "loss": 6.8697, "step": 100 }, { "epoch": 0.016172506738544475, "grad_norm": 2.0049550533294678, "learning_rate": 0.0005996762007555315, "loss": 6.4573, "step": 150 }, { "epoch": 0.0215633423180593, "grad_norm": 2.044867515563965, "learning_rate": 0.000599352401511063, "loss": 6.239, "step": 200 }, { "epoch": 0.026954177897574125, "grad_norm": 1.7265796661376953, "learning_rate": 0.0005990286022665946, "loss": 6.0844, "step": 250 }, { "epoch": 0.03234501347708895, "grad_norm": 3.774665594100952, "learning_rate": 0.0005987048030221263, "loss": 5.9817, "step": 300 }, { "epoch": 0.03773584905660377, "grad_norm": 1.1093494892120361, "learning_rate": 0.0005983810037776578, "loss": 5.8816, "step": 350 }, { "epoch": 0.0431266846361186, "grad_norm": 1.401320457458496, "learning_rate": 0.0005980572045331894, "loss": 5.8369, "step": 400 }, { "epoch": 0.04851752021563342, "grad_norm": 1.4539614915847778, "learning_rate": 0.0005977334052887209, "loss": 5.756, "step": 450 }, { "epoch": 0.05390835579514825, "grad_norm": 1.237868309020996, "learning_rate": 0.0005974096060442526, "loss": 5.6613, "step": 500 }, { "epoch": 0.05929919137466307, "grad_norm": 1.7472739219665527, "learning_rate": 0.0005970858067997841, "loss": 5.5746, "step": 550 }, { "epoch": 0.0646900269541779, "grad_norm": 1.1309614181518555, "learning_rate": 0.0005967620075553157, "loss": 5.5044, "step": 600 }, { "epoch": 0.07008086253369272, "grad_norm": 1.2399049997329712, "learning_rate": 0.0005964382083108472, "loss": 5.4345, "step": 650 }, { "epoch": 0.07547169811320754, "grad_norm": 1.7309972047805786, "learning_rate": 0.0005961144090663788, "loss": 5.386, "step": 700 }, { "epoch": 0.08086253369272237, "grad_norm": 1.1939959526062012, "learning_rate": 0.0005957906098219104, "loss": 5.3088, "step": 750 }, { "epoch": 0.0862533692722372, "grad_norm": 1.6205377578735352, "learning_rate": 0.0005954668105774419, "loss": 5.2622, "step": 800 }, { "epoch": 0.09164420485175202, "grad_norm": 1.3945573568344116, "learning_rate": 0.0005951430113329735, "loss": 5.2251, "step": 850 }, { "epoch": 0.09703504043126684, "grad_norm": 1.3046308755874634, "learning_rate": 0.0005948192120885051, "loss": 5.1654, "step": 900 }, { "epoch": 0.10242587601078167, "grad_norm": 1.2131174802780151, "learning_rate": 0.0005944954128440366, "loss": 5.1448, "step": 950 }, { "epoch": 0.1078167115902965, "grad_norm": 0.9675126075744629, "learning_rate": 0.0005941716135995682, "loss": 5.1035, "step": 1000 }, { "epoch": 0.1078167115902965, "eval_accuracy": 0.22749035406825205, "eval_loss": 5.018069267272949, "eval_runtime": 184.6816, "eval_samples_per_second": 97.525, "eval_steps_per_second": 6.097, "step": 1000 }, { "epoch": 0.11320754716981132, "grad_norm": 1.0149263143539429, "learning_rate": 0.0005938478143550997, "loss": 5.0486, "step": 1050 }, { "epoch": 0.11859838274932614, "grad_norm": 1.3123433589935303, "learning_rate": 0.0005935240151106314, "loss": 5.0207, "step": 1100 }, { "epoch": 0.12398921832884097, "grad_norm": 0.9050843715667725, "learning_rate": 0.0005932002158661629, "loss": 4.9928, "step": 1150 }, { "epoch": 0.1293800539083558, "grad_norm": 1.274807095527649, "learning_rate": 0.0005928764166216945, "loss": 4.9535, "step": 1200 }, { "epoch": 0.1347708894878706, "grad_norm": 1.00777006149292, "learning_rate": 0.000592552617377226, "loss": 4.9137, "step": 1250 }, { "epoch": 0.14016172506738545, "grad_norm": 0.8887598514556885, "learning_rate": 0.0005922288181327577, "loss": 4.8818, "step": 1300 }, { "epoch": 0.14555256064690028, "grad_norm": 0.9829823970794678, "learning_rate": 0.0005919050188882893, "loss": 4.8648, "step": 1350 }, { "epoch": 0.1509433962264151, "grad_norm": 0.9974054098129272, "learning_rate": 0.0005915812196438207, "loss": 4.8498, "step": 1400 }, { "epoch": 0.15633423180592992, "grad_norm": 1.0419261455535889, "learning_rate": 0.0005912574203993524, "loss": 4.8005, "step": 1450 }, { "epoch": 0.16172506738544473, "grad_norm": 1.1737334728240967, "learning_rate": 0.0005909336211548839, "loss": 4.8138, "step": 1500 }, { "epoch": 0.16711590296495957, "grad_norm": 1.0961012840270996, "learning_rate": 0.0005906098219104155, "loss": 4.7659, "step": 1550 }, { "epoch": 0.1725067385444744, "grad_norm": 1.1323438882827759, "learning_rate": 0.000590286022665947, "loss": 4.7663, "step": 1600 }, { "epoch": 0.1778975741239892, "grad_norm": 0.8736843466758728, "learning_rate": 0.0005899622234214787, "loss": 4.7157, "step": 1650 }, { "epoch": 0.18328840970350405, "grad_norm": 0.7891756892204285, "learning_rate": 0.0005896384241770102, "loss": 4.7099, "step": 1700 }, { "epoch": 0.18867924528301888, "grad_norm": 1.0397777557373047, "learning_rate": 0.0005893146249325418, "loss": 4.6729, "step": 1750 }, { "epoch": 0.1940700808625337, "grad_norm": 1.151475429534912, "learning_rate": 0.0005889908256880733, "loss": 4.6804, "step": 1800 }, { "epoch": 0.19946091644204852, "grad_norm": 1.5584900379180908, "learning_rate": 0.0005886670264436049, "loss": 4.6534, "step": 1850 }, { "epoch": 0.20485175202156333, "grad_norm": 0.7592549920082092, "learning_rate": 0.0005883432271991365, "loss": 4.6368, "step": 1900 }, { "epoch": 0.21024258760107817, "grad_norm": 1.132956862449646, "learning_rate": 0.0005880194279546681, "loss": 4.6073, "step": 1950 }, { "epoch": 0.215633423180593, "grad_norm": 0.904139518737793, "learning_rate": 0.0005876956287101996, "loss": 4.5922, "step": 2000 }, { "epoch": 0.215633423180593, "eval_accuracy": 0.26994136329603313, "eval_loss": 4.514337062835693, "eval_runtime": 183.9245, "eval_samples_per_second": 97.926, "eval_steps_per_second": 6.122, "step": 2000 }, { "epoch": 0.2210242587601078, "grad_norm": 0.8630588054656982, "learning_rate": 0.0005873718294657312, "loss": 4.5736, "step": 2050 }, { "epoch": 0.22641509433962265, "grad_norm": 0.751631498336792, "learning_rate": 0.0005870480302212628, "loss": 4.5577, "step": 2100 }, { "epoch": 0.23180592991913745, "grad_norm": 0.9096190333366394, "learning_rate": 0.0005867242309767943, "loss": 4.5327, "step": 2150 }, { "epoch": 0.2371967654986523, "grad_norm": 0.9275792241096497, "learning_rate": 0.0005864004317323259, "loss": 4.513, "step": 2200 }, { "epoch": 0.24258760107816713, "grad_norm": 0.8501319885253906, "learning_rate": 0.0005860766324878575, "loss": 4.4957, "step": 2250 }, { "epoch": 0.24797843665768193, "grad_norm": 0.8335550427436829, "learning_rate": 0.000585752833243389, "loss": 4.505, "step": 2300 }, { "epoch": 0.25336927223719674, "grad_norm": 0.7996338605880737, "learning_rate": 0.0005854290339989206, "loss": 4.4554, "step": 2350 }, { "epoch": 0.2587601078167116, "grad_norm": 1.0249171257019043, "learning_rate": 0.0005851052347544521, "loss": 4.4629, "step": 2400 }, { "epoch": 0.2641509433962264, "grad_norm": 0.9536342620849609, "learning_rate": 0.0005847814355099838, "loss": 4.4412, "step": 2450 }, { "epoch": 0.2695417789757412, "grad_norm": 1.2684301137924194, "learning_rate": 0.0005844576362655154, "loss": 4.4274, "step": 2500 }, { "epoch": 0.2749326145552561, "grad_norm": 0.8013315796852112, "learning_rate": 0.0005841338370210469, "loss": 4.4099, "step": 2550 }, { "epoch": 0.2803234501347709, "grad_norm": 0.8502827882766724, "learning_rate": 0.0005838100377765785, "loss": 4.3802, "step": 2600 }, { "epoch": 0.2857142857142857, "grad_norm": 0.8461282849311829, "learning_rate": 0.0005834862385321101, "loss": 4.3947, "step": 2650 }, { "epoch": 0.29110512129380056, "grad_norm": 1.0242016315460205, "learning_rate": 0.0005831624392876417, "loss": 4.3976, "step": 2700 }, { "epoch": 0.29649595687331537, "grad_norm": 0.8631992936134338, "learning_rate": 0.0005828386400431731, "loss": 4.3521, "step": 2750 }, { "epoch": 0.3018867924528302, "grad_norm": 0.7447966933250427, "learning_rate": 0.0005825148407987048, "loss": 4.3323, "step": 2800 }, { "epoch": 0.30727762803234504, "grad_norm": 0.7234294414520264, "learning_rate": 0.0005821910415542363, "loss": 4.3364, "step": 2850 }, { "epoch": 0.31266846361185985, "grad_norm": 0.8805587887763977, "learning_rate": 0.0005818672423097679, "loss": 4.3307, "step": 2900 }, { "epoch": 0.31805929919137466, "grad_norm": 0.8215044140815735, "learning_rate": 0.0005815434430652994, "loss": 4.3141, "step": 2950 }, { "epoch": 0.32345013477088946, "grad_norm": 0.7359305620193481, "learning_rate": 0.0005812196438208311, "loss": 4.311, "step": 3000 }, { "epoch": 0.32345013477088946, "eval_accuracy": 0.298171882566655, "eval_loss": 4.2451090812683105, "eval_runtime": 183.7967, "eval_samples_per_second": 97.994, "eval_steps_per_second": 6.126, "step": 3000 }, { "epoch": 0.3288409703504043, "grad_norm": 0.770511269569397, "learning_rate": 0.0005808958445763626, "loss": 4.3027, "step": 3050 }, { "epoch": 0.33423180592991913, "grad_norm": 0.677880048751831, "learning_rate": 0.0005805720453318942, "loss": 4.3083, "step": 3100 }, { "epoch": 0.33962264150943394, "grad_norm": 0.6858367919921875, "learning_rate": 0.0005802482460874257, "loss": 4.2657, "step": 3150 }, { "epoch": 0.3450134770889488, "grad_norm": 0.6485952138900757, "learning_rate": 0.0005799244468429573, "loss": 4.261, "step": 3200 }, { "epoch": 0.3504043126684636, "grad_norm": 0.706744372844696, "learning_rate": 0.0005796006475984889, "loss": 4.2609, "step": 3250 }, { "epoch": 0.3557951482479784, "grad_norm": 0.7495291233062744, "learning_rate": 0.0005792768483540205, "loss": 4.2458, "step": 3300 }, { "epoch": 0.3611859838274933, "grad_norm": 0.6684705018997192, "learning_rate": 0.000578953049109552, "loss": 4.2625, "step": 3350 }, { "epoch": 0.3665768194070081, "grad_norm": 0.6015046834945679, "learning_rate": 0.0005786292498650836, "loss": 4.2344, "step": 3400 }, { "epoch": 0.3719676549865229, "grad_norm": 0.6746869087219238, "learning_rate": 0.0005783054506206152, "loss": 4.2115, "step": 3450 }, { "epoch": 0.37735849056603776, "grad_norm": 0.6800693273544312, "learning_rate": 0.0005779816513761467, "loss": 4.2238, "step": 3500 }, { "epoch": 0.38274932614555257, "grad_norm": 0.8763014674186707, "learning_rate": 0.0005776578521316782, "loss": 4.2176, "step": 3550 }, { "epoch": 0.3881401617250674, "grad_norm": 0.6071361899375916, "learning_rate": 0.0005773340528872099, "loss": 4.2172, "step": 3600 }, { "epoch": 0.3935309973045822, "grad_norm": 0.8424677848815918, "learning_rate": 0.0005770102536427414, "loss": 4.1909, "step": 3650 }, { "epoch": 0.39892183288409705, "grad_norm": 0.7024722099304199, "learning_rate": 0.000576686454398273, "loss": 4.193, "step": 3700 }, { "epoch": 0.40431266846361186, "grad_norm": 0.6678298711776733, "learning_rate": 0.0005763626551538045, "loss": 4.1954, "step": 3750 }, { "epoch": 0.40970350404312667, "grad_norm": 0.591792643070221, "learning_rate": 0.0005760388559093362, "loss": 4.1897, "step": 3800 }, { "epoch": 0.41509433962264153, "grad_norm": 0.7370854020118713, "learning_rate": 0.0005757150566648678, "loss": 4.1703, "step": 3850 }, { "epoch": 0.42048517520215634, "grad_norm": 0.8055949807167053, "learning_rate": 0.0005753912574203993, "loss": 4.1666, "step": 3900 }, { "epoch": 0.42587601078167114, "grad_norm": 0.6214135885238647, "learning_rate": 0.0005750674581759309, "loss": 4.1586, "step": 3950 }, { "epoch": 0.431266846361186, "grad_norm": 0.7452210783958435, "learning_rate": 0.0005747436589314624, "loss": 4.159, "step": 4000 }, { "epoch": 0.431266846361186, "eval_accuracy": 0.31298409615085193, "eval_loss": 4.08732271194458, "eval_runtime": 183.8347, "eval_samples_per_second": 97.974, "eval_steps_per_second": 6.125, "step": 4000 }, { "epoch": 0.4366576819407008, "grad_norm": 0.5454065203666687, "learning_rate": 0.0005744198596869941, "loss": 4.1509, "step": 4050 }, { "epoch": 0.4420485175202156, "grad_norm": 0.6480352282524109, "learning_rate": 0.0005740960604425255, "loss": 4.1515, "step": 4100 }, { "epoch": 0.4474393530997305, "grad_norm": 0.7491263151168823, "learning_rate": 0.0005737722611980572, "loss": 4.1543, "step": 4150 }, { "epoch": 0.4528301886792453, "grad_norm": 0.5414714217185974, "learning_rate": 0.0005734484619535887, "loss": 4.1397, "step": 4200 }, { "epoch": 0.4582210242587601, "grad_norm": 0.735364556312561, "learning_rate": 0.0005731246627091203, "loss": 4.1231, "step": 4250 }, { "epoch": 0.4636118598382749, "grad_norm": 0.7785477042198181, "learning_rate": 0.0005728008634646518, "loss": 4.1242, "step": 4300 }, { "epoch": 0.46900269541778977, "grad_norm": 0.6108700633049011, "learning_rate": 0.0005724770642201835, "loss": 4.118, "step": 4350 }, { "epoch": 0.4743935309973046, "grad_norm": 0.6415695548057556, "learning_rate": 0.000572153264975715, "loss": 4.1178, "step": 4400 }, { "epoch": 0.4797843665768194, "grad_norm": 0.8770371675491333, "learning_rate": 0.0005718294657312466, "loss": 4.1028, "step": 4450 }, { "epoch": 0.48517520215633425, "grad_norm": 0.7115625143051147, "learning_rate": 0.0005715056664867781, "loss": 4.1109, "step": 4500 }, { "epoch": 0.49056603773584906, "grad_norm": 0.660191535949707, "learning_rate": 0.0005711818672423097, "loss": 4.1011, "step": 4550 }, { "epoch": 0.49595687331536387, "grad_norm": 0.6192442178726196, "learning_rate": 0.0005708580679978413, "loss": 4.0857, "step": 4600 }, { "epoch": 0.5013477088948787, "grad_norm": 0.6783645153045654, "learning_rate": 0.0005705342687533729, "loss": 4.078, "step": 4650 }, { "epoch": 0.5067385444743935, "grad_norm": 0.6928367018699646, "learning_rate": 0.0005702104695089044, "loss": 4.0956, "step": 4700 }, { "epoch": 0.5121293800539084, "grad_norm": 0.6265180110931396, "learning_rate": 0.000569886670264436, "loss": 4.0822, "step": 4750 }, { "epoch": 0.5175202156334232, "grad_norm": 0.7510149478912354, "learning_rate": 0.0005695628710199675, "loss": 4.0755, "step": 4800 }, { "epoch": 0.522911051212938, "grad_norm": 0.7354061603546143, "learning_rate": 0.0005692390717754991, "loss": 4.0655, "step": 4850 }, { "epoch": 0.5283018867924528, "grad_norm": 0.6540889739990234, "learning_rate": 0.0005689152725310306, "loss": 4.055, "step": 4900 }, { "epoch": 0.5336927223719676, "grad_norm": 0.674555778503418, "learning_rate": 0.0005685914732865623, "loss": 4.0696, "step": 4950 }, { "epoch": 0.5390835579514824, "grad_norm": 0.4996623992919922, "learning_rate": 0.0005682676740420939, "loss": 4.0472, "step": 5000 }, { "epoch": 0.5390835579514824, "eval_accuracy": 0.3215982057496718, "eval_loss": 3.9894087314605713, "eval_runtime": 183.8499, "eval_samples_per_second": 97.966, "eval_steps_per_second": 6.125, "step": 5000 }, { "epoch": 0.5444743935309974, "grad_norm": 0.7127335071563721, "learning_rate": 0.0005679438747976254, "loss": 4.0568, "step": 5050 }, { "epoch": 0.5498652291105122, "grad_norm": 0.6820212602615356, "learning_rate": 0.000567620075553157, "loss": 4.0516, "step": 5100 }, { "epoch": 0.555256064690027, "grad_norm": 0.7278046011924744, "learning_rate": 0.0005672962763086886, "loss": 4.0498, "step": 5150 }, { "epoch": 0.5606469002695418, "grad_norm": 0.624863862991333, "learning_rate": 0.0005669724770642202, "loss": 4.0671, "step": 5200 }, { "epoch": 0.5660377358490566, "grad_norm": 0.6197010278701782, "learning_rate": 0.0005666486778197517, "loss": 4.041, "step": 5250 }, { "epoch": 0.5714285714285714, "grad_norm": 0.7419002056121826, "learning_rate": 0.0005663248785752833, "loss": 4.0326, "step": 5300 }, { "epoch": 0.5768194070080862, "grad_norm": 0.6294003129005432, "learning_rate": 0.0005660010793308148, "loss": 4.0263, "step": 5350 }, { "epoch": 0.5822102425876011, "grad_norm": 0.5749978423118591, "learning_rate": 0.0005656772800863465, "loss": 4.0359, "step": 5400 }, { "epoch": 0.5876010781671159, "grad_norm": 0.5415307879447937, "learning_rate": 0.0005653534808418779, "loss": 4.0382, "step": 5450 }, { "epoch": 0.5929919137466307, "grad_norm": 0.6727957725524902, "learning_rate": 0.0005650296815974096, "loss": 4.0181, "step": 5500 }, { "epoch": 0.5983827493261455, "grad_norm": 0.6418060660362244, "learning_rate": 0.0005647058823529411, "loss": 4.0069, "step": 5550 }, { "epoch": 0.6037735849056604, "grad_norm": 0.7099882960319519, "learning_rate": 0.0005643820831084727, "loss": 4.0226, "step": 5600 }, { "epoch": 0.6091644204851752, "grad_norm": 0.6498609185218811, "learning_rate": 0.0005640582838640042, "loss": 4.0046, "step": 5650 }, { "epoch": 0.6145552560646901, "grad_norm": 0.5713142156600952, "learning_rate": 0.0005637344846195358, "loss": 4.0043, "step": 5700 }, { "epoch": 0.6199460916442049, "grad_norm": 0.6617730259895325, "learning_rate": 0.0005634106853750674, "loss": 3.979, "step": 5750 }, { "epoch": 0.6253369272237197, "grad_norm": 0.6460627317428589, "learning_rate": 0.000563086886130599, "loss": 3.9846, "step": 5800 }, { "epoch": 0.6307277628032345, "grad_norm": 0.6782198548316956, "learning_rate": 0.0005627630868861305, "loss": 3.9947, "step": 5850 }, { "epoch": 0.6361185983827493, "grad_norm": 0.5800774097442627, "learning_rate": 0.0005624392876416621, "loss": 3.9922, "step": 5900 }, { "epoch": 0.6415094339622641, "grad_norm": 0.676412045955658, "learning_rate": 0.0005621154883971937, "loss": 3.9791, "step": 5950 }, { "epoch": 0.6469002695417789, "grad_norm": 0.583685040473938, "learning_rate": 0.0005617916891527253, "loss": 3.9931, "step": 6000 }, { "epoch": 0.6469002695417789, "eval_accuracy": 0.32843508006251015, "eval_loss": 3.9174036979675293, "eval_runtime": 183.9133, "eval_samples_per_second": 97.932, "eval_steps_per_second": 6.122, "step": 6000 }, { "epoch": 0.6522911051212938, "grad_norm": 0.5780828595161438, "learning_rate": 0.0005614678899082568, "loss": 3.9663, "step": 6050 }, { "epoch": 0.6576819407008087, "grad_norm": 0.5652017593383789, "learning_rate": 0.0005611440906637884, "loss": 3.9971, "step": 6100 }, { "epoch": 0.6630727762803235, "grad_norm": 0.8341758847236633, "learning_rate": 0.00056082029141932, "loss": 3.9793, "step": 6150 }, { "epoch": 0.6684636118598383, "grad_norm": 0.578648567199707, "learning_rate": 0.0005604964921748515, "loss": 3.9499, "step": 6200 }, { "epoch": 0.6738544474393531, "grad_norm": 0.59043949842453, "learning_rate": 0.000560172692930383, "loss": 3.9715, "step": 6250 }, { "epoch": 0.6792452830188679, "grad_norm": 0.6629766225814819, "learning_rate": 0.0005598488936859147, "loss": 3.9749, "step": 6300 }, { "epoch": 0.6846361185983828, "grad_norm": 0.5541216135025024, "learning_rate": 0.0005595250944414463, "loss": 3.9608, "step": 6350 }, { "epoch": 0.6900269541778976, "grad_norm": 0.6767361164093018, "learning_rate": 0.0005592012951969778, "loss": 3.9694, "step": 6400 }, { "epoch": 0.6954177897574124, "grad_norm": 0.6981496810913086, "learning_rate": 0.0005588774959525094, "loss": 3.9655, "step": 6450 }, { "epoch": 0.7008086253369272, "grad_norm": 0.5768890380859375, "learning_rate": 0.000558553696708041, "loss": 3.96, "step": 6500 }, { "epoch": 0.706199460916442, "grad_norm": 0.6919478178024292, "learning_rate": 0.0005582298974635726, "loss": 3.958, "step": 6550 }, { "epoch": 0.7115902964959568, "grad_norm": 0.5978683829307556, "learning_rate": 0.0005579060982191041, "loss": 3.9587, "step": 6600 }, { "epoch": 0.7169811320754716, "grad_norm": 0.6285462379455566, "learning_rate": 0.0005575822989746357, "loss": 3.9703, "step": 6650 }, { "epoch": 0.7223719676549866, "grad_norm": 0.7304993867874146, "learning_rate": 0.0005572584997301672, "loss": 3.9348, "step": 6700 }, { "epoch": 0.7277628032345014, "grad_norm": 0.5624628663063049, "learning_rate": 0.0005569347004856989, "loss": 3.9602, "step": 6750 }, { "epoch": 0.7331536388140162, "grad_norm": 0.6064887642860413, "learning_rate": 0.0005566109012412303, "loss": 3.9331, "step": 6800 }, { "epoch": 0.738544474393531, "grad_norm": 0.5876811146736145, "learning_rate": 0.000556287101996762, "loss": 3.9323, "step": 6850 }, { "epoch": 0.7439353099730458, "grad_norm": 0.5871797204017639, "learning_rate": 0.0005559633027522935, "loss": 3.9329, "step": 6900 }, { "epoch": 0.7493261455525606, "grad_norm": 0.6020776629447937, "learning_rate": 0.0005556395035078251, "loss": 3.9362, "step": 6950 }, { "epoch": 0.7547169811320755, "grad_norm": 0.6208156943321228, "learning_rate": 0.0005553157042633566, "loss": 3.9191, "step": 7000 }, { "epoch": 0.7547169811320755, "eval_accuracy": 0.33344278300899177, "eval_loss": 3.8618791103363037, "eval_runtime": 183.7498, "eval_samples_per_second": 98.019, "eval_steps_per_second": 6.128, "step": 7000 }, { "epoch": 0.7601078167115903, "grad_norm": 0.6798801422119141, "learning_rate": 0.0005549919050188882, "loss": 3.9331, "step": 7050 }, { "epoch": 0.7654986522911051, "grad_norm": 1.4628221988677979, "learning_rate": 0.0005546681057744198, "loss": 3.906, "step": 7100 }, { "epoch": 0.77088948787062, "grad_norm": 0.6019647121429443, "learning_rate": 0.0005543443065299514, "loss": 3.9351, "step": 7150 }, { "epoch": 0.7762803234501348, "grad_norm": 0.5555400252342224, "learning_rate": 0.000554020507285483, "loss": 3.9058, "step": 7200 }, { "epoch": 0.7816711590296496, "grad_norm": 0.6362798810005188, "learning_rate": 0.0005536967080410145, "loss": 3.8967, "step": 7250 }, { "epoch": 0.7870619946091644, "grad_norm": 0.5722786784172058, "learning_rate": 0.0005533729087965462, "loss": 3.9111, "step": 7300 }, { "epoch": 0.7924528301886793, "grad_norm": 0.580388069152832, "learning_rate": 0.0005530491095520777, "loss": 3.9099, "step": 7350 }, { "epoch": 0.7978436657681941, "grad_norm": 0.5334943532943726, "learning_rate": 0.0005527253103076093, "loss": 3.9072, "step": 7400 }, { "epoch": 0.8032345013477089, "grad_norm": 0.6169307827949524, "learning_rate": 0.0005524015110631408, "loss": 3.8832, "step": 7450 }, { "epoch": 0.8086253369272237, "grad_norm": 0.6007163524627686, "learning_rate": 0.0005520777118186724, "loss": 3.9024, "step": 7500 }, { "epoch": 0.8140161725067385, "grad_norm": 0.5735055804252625, "learning_rate": 0.0005517539125742039, "loss": 3.8979, "step": 7550 }, { "epoch": 0.8194070080862533, "grad_norm": 0.5585691928863525, "learning_rate": 0.0005514365893146249, "loss": 3.8902, "step": 7600 }, { "epoch": 0.8247978436657682, "grad_norm": 0.5613856315612793, "learning_rate": 0.0005511127900701564, "loss": 3.8952, "step": 7650 }, { "epoch": 0.8301886792452831, "grad_norm": 0.5988004803657532, "learning_rate": 0.000550788990825688, "loss": 3.8894, "step": 7700 }, { "epoch": 0.8355795148247979, "grad_norm": 0.5915306210517883, "learning_rate": 0.0005504651915812196, "loss": 3.8922, "step": 7750 }, { "epoch": 0.8409703504043127, "grad_norm": 0.7122664451599121, "learning_rate": 0.0005501413923367512, "loss": 3.8766, "step": 7800 }, { "epoch": 0.8463611859838275, "grad_norm": 0.5410688519477844, "learning_rate": 0.0005498175930922827, "loss": 3.8794, "step": 7850 }, { "epoch": 0.8517520215633423, "grad_norm": 0.5719017386436462, "learning_rate": 0.0005494937938478143, "loss": 3.8727, "step": 7900 }, { "epoch": 0.8571428571428571, "grad_norm": 0.570475697517395, "learning_rate": 0.0005491699946033459, "loss": 3.9063, "step": 7950 }, { "epoch": 0.862533692722372, "grad_norm": 0.5836682915687561, "learning_rate": 0.0005488461953588775, "loss": 3.8542, "step": 8000 }, { "epoch": 0.862533692722372, "eval_accuracy": 0.33794155582895036, "eval_loss": 3.816067695617676, "eval_runtime": 183.5803, "eval_samples_per_second": 98.11, "eval_steps_per_second": 6.134, "step": 8000 }, { "epoch": 0.8679245283018868, "grad_norm": 0.6498636603355408, "learning_rate": 0.000548522396114409, "loss": 3.895, "step": 8050 }, { "epoch": 0.8733153638814016, "grad_norm": 0.6250914931297302, "learning_rate": 0.0005481985968699406, "loss": 3.8735, "step": 8100 }, { "epoch": 0.8787061994609164, "grad_norm": 0.5511185526847839, "learning_rate": 0.0005478747976254721, "loss": 3.8758, "step": 8150 }, { "epoch": 0.8840970350404312, "grad_norm": 0.6440311074256897, "learning_rate": 0.0005475509983810037, "loss": 3.869, "step": 8200 }, { "epoch": 0.889487870619946, "grad_norm": 0.5398238301277161, "learning_rate": 0.0005472271991365352, "loss": 3.866, "step": 8250 }, { "epoch": 0.894878706199461, "grad_norm": 0.5822187066078186, "learning_rate": 0.0005469033998920669, "loss": 3.8864, "step": 8300 }, { "epoch": 0.9002695417789758, "grad_norm": 0.5405433773994446, "learning_rate": 0.0005465796006475984, "loss": 3.857, "step": 8350 }, { "epoch": 0.9056603773584906, "grad_norm": 0.5735152959823608, "learning_rate": 0.00054625580140313, "loss": 3.8628, "step": 8400 }, { "epoch": 0.9110512129380054, "grad_norm": 0.5623797178268433, "learning_rate": 0.0005459320021586615, "loss": 3.858, "step": 8450 }, { "epoch": 0.9164420485175202, "grad_norm": 0.5893810391426086, "learning_rate": 0.0005456082029141932, "loss": 3.8638, "step": 8500 }, { "epoch": 0.921832884097035, "grad_norm": 0.6306450366973877, "learning_rate": 0.0005452844036697248, "loss": 3.8565, "step": 8550 }, { "epoch": 0.9272237196765498, "grad_norm": 0.5499542951583862, "learning_rate": 0.0005449606044252563, "loss": 3.8639, "step": 8600 }, { "epoch": 0.9326145552560647, "grad_norm": 0.563671886920929, "learning_rate": 0.0005446368051807879, "loss": 3.844, "step": 8650 }, { "epoch": 0.9380053908355795, "grad_norm": 0.5789870619773865, "learning_rate": 0.0005443130059363194, "loss": 3.8492, "step": 8700 }, { "epoch": 0.9433962264150944, "grad_norm": 0.5480877757072449, "learning_rate": 0.0005439892066918511, "loss": 3.8458, "step": 8750 }, { "epoch": 0.9487870619946092, "grad_norm": 0.5352542996406555, "learning_rate": 0.0005436654074473825, "loss": 3.8487, "step": 8800 }, { "epoch": 0.954177897574124, "grad_norm": 0.5493354797363281, "learning_rate": 0.0005433416082029142, "loss": 3.8454, "step": 8850 }, { "epoch": 0.9595687331536388, "grad_norm": 0.5753820538520813, "learning_rate": 0.0005430178089584457, "loss": 3.8372, "step": 8900 }, { "epoch": 0.9649595687331537, "grad_norm": 0.5328227877616882, "learning_rate": 0.0005426940097139773, "loss": 3.8276, "step": 8950 }, { "epoch": 0.9703504043126685, "grad_norm": 0.5667588114738464, "learning_rate": 0.0005423702104695088, "loss": 3.8531, "step": 9000 }, { "epoch": 0.9703504043126685, "eval_accuracy": 0.34165379039401994, "eval_loss": 3.7784414291381836, "eval_runtime": 184.2031, "eval_samples_per_second": 97.778, "eval_steps_per_second": 6.113, "step": 9000 }, { "epoch": 0.9757412398921833, "grad_norm": 0.6010326743125916, "learning_rate": 0.0005420464112250404, "loss": 3.8678, "step": 9050 }, { "epoch": 0.9811320754716981, "grad_norm": 0.5293521285057068, "learning_rate": 0.000541722611980572, "loss": 3.8241, "step": 9100 }, { "epoch": 0.9865229110512129, "grad_norm": 0.5834252238273621, "learning_rate": 0.0005413988127361036, "loss": 3.8378, "step": 9150 }, { "epoch": 0.9919137466307277, "grad_norm": 0.5734151005744934, "learning_rate": 0.0005410750134916351, "loss": 3.8295, "step": 9200 }, { "epoch": 0.9973045822102425, "grad_norm": 0.5477326512336731, "learning_rate": 0.0005407512142471667, "loss": 3.8451, "step": 9250 }, { "epoch": 1.0026954177897573, "grad_norm": 0.5776128172874451, "learning_rate": 0.0005404274150026983, "loss": 3.7938, "step": 9300 }, { "epoch": 1.0080862533692723, "grad_norm": 0.5947125554084778, "learning_rate": 0.0005401036157582299, "loss": 3.7605, "step": 9350 }, { "epoch": 1.013477088948787, "grad_norm": 0.6069636940956116, "learning_rate": 0.0005397798165137614, "loss": 3.7525, "step": 9400 }, { "epoch": 1.0188679245283019, "grad_norm": 0.5255810618400574, "learning_rate": 0.000539456017269293, "loss": 3.7525, "step": 9450 }, { "epoch": 1.0242587601078168, "grad_norm": 0.546853244304657, "learning_rate": 0.0005391322180248245, "loss": 3.7576, "step": 9500 }, { "epoch": 1.0296495956873315, "grad_norm": 0.5615027546882629, "learning_rate": 0.0005388084187803561, "loss": 3.7679, "step": 9550 }, { "epoch": 1.0350404312668464, "grad_norm": 0.579430341720581, "learning_rate": 0.0005384846195358876, "loss": 3.7661, "step": 9600 }, { "epoch": 1.0404312668463611, "grad_norm": 0.6099395751953125, "learning_rate": 0.0005381608202914193, "loss": 3.7707, "step": 9650 }, { "epoch": 1.045822102425876, "grad_norm": 0.5670583248138428, "learning_rate": 0.0005378370210469509, "loss": 3.7544, "step": 9700 }, { "epoch": 1.0512129380053907, "grad_norm": 0.5511836409568787, "learning_rate": 0.0005375132218024824, "loss": 3.7525, "step": 9750 }, { "epoch": 1.0566037735849056, "grad_norm": 0.5288443565368652, "learning_rate": 0.000537189422558014, "loss": 3.7805, "step": 9800 }, { "epoch": 1.0619946091644206, "grad_norm": 0.5281191468238831, "learning_rate": 0.0005368656233135455, "loss": 3.7818, "step": 9850 }, { "epoch": 1.0673854447439353, "grad_norm": 0.5530230402946472, "learning_rate": 0.0005365483000539665, "loss": 3.7735, "step": 9900 }, { "epoch": 1.0727762803234502, "grad_norm": 0.6312490701675415, "learning_rate": 0.0005362245008094981, "loss": 3.7825, "step": 9950 }, { "epoch": 1.0781671159029649, "grad_norm": 0.5409021377563477, "learning_rate": 0.0005359007015650297, "loss": 3.7613, "step": 10000 }, { "epoch": 1.0781671159029649, "eval_accuracy": 0.3451733833889944, "eval_loss": 3.7442734241485596, "eval_runtime": 183.7504, "eval_samples_per_second": 98.019, "eval_steps_per_second": 6.128, "step": 10000 }, { "epoch": 1.0835579514824798, "grad_norm": 0.5455986857414246, "learning_rate": 0.0005355769023205612, "loss": 3.7552, "step": 10050 }, { "epoch": 1.0889487870619945, "grad_norm": 0.5736761093139648, "learning_rate": 0.0005352531030760928, "loss": 3.7591, "step": 10100 }, { "epoch": 1.0943396226415094, "grad_norm": 0.5683985948562622, "learning_rate": 0.0005349293038316244, "loss": 3.7521, "step": 10150 }, { "epoch": 1.0997304582210243, "grad_norm": 0.5910122394561768, "learning_rate": 0.0005346055045871559, "loss": 3.7653, "step": 10200 }, { "epoch": 1.105121293800539, "grad_norm": 0.6161583065986633, "learning_rate": 0.0005342817053426874, "loss": 3.753, "step": 10250 }, { "epoch": 1.110512129380054, "grad_norm": 0.5718348026275635, "learning_rate": 0.0005339579060982191, "loss": 3.7615, "step": 10300 }, { "epoch": 1.1159029649595686, "grad_norm": 0.576435387134552, "learning_rate": 0.0005336341068537506, "loss": 3.7567, "step": 10350 }, { "epoch": 1.1212938005390836, "grad_norm": 0.5745424628257751, "learning_rate": 0.0005333103076092822, "loss": 3.7612, "step": 10400 }, { "epoch": 1.1266846361185983, "grad_norm": 0.5627392530441284, "learning_rate": 0.0005329865083648137, "loss": 3.757, "step": 10450 }, { "epoch": 1.1320754716981132, "grad_norm": 0.6117731332778931, "learning_rate": 0.0005326627091203454, "loss": 3.7597, "step": 10500 }, { "epoch": 1.137466307277628, "grad_norm": 0.6170853972434998, "learning_rate": 0.0005323389098758769, "loss": 3.7656, "step": 10550 }, { "epoch": 1.1428571428571428, "grad_norm": 0.5776093006134033, "learning_rate": 0.0005320151106314085, "loss": 3.7554, "step": 10600 }, { "epoch": 1.1482479784366577, "grad_norm": 0.6169910430908203, "learning_rate": 0.00053169131138694, "loss": 3.7413, "step": 10650 }, { "epoch": 1.1536388140161726, "grad_norm": 0.5566491484642029, "learning_rate": 0.0005313675121424716, "loss": 3.7528, "step": 10700 }, { "epoch": 1.1590296495956873, "grad_norm": 0.5667610168457031, "learning_rate": 0.0005310437128980032, "loss": 3.7331, "step": 10750 }, { "epoch": 1.1644204851752022, "grad_norm": 0.5442368984222412, "learning_rate": 0.0005307199136535348, "loss": 3.7391, "step": 10800 }, { "epoch": 1.169811320754717, "grad_norm": 0.5643858313560486, "learning_rate": 0.0005303961144090663, "loss": 3.7309, "step": 10850 }, { "epoch": 1.1752021563342319, "grad_norm": 0.6005975604057312, "learning_rate": 0.0005300723151645979, "loss": 3.7448, "step": 10900 }, { "epoch": 1.1805929919137466, "grad_norm": 0.5089237093925476, "learning_rate": 0.0005297485159201295, "loss": 3.7585, "step": 10950 }, { "epoch": 1.1859838274932615, "grad_norm": 0.5960420370101929, "learning_rate": 0.000529424716675661, "loss": 3.7685, "step": 11000 }, { "epoch": 1.1859838274932615, "eval_accuracy": 0.34775128180528075, "eval_loss": 3.720940589904785, "eval_runtime": 183.5903, "eval_samples_per_second": 98.104, "eval_steps_per_second": 6.133, "step": 11000 }, { "epoch": 1.1913746630727764, "grad_norm": 0.575962245464325, "learning_rate": 0.0005291009174311926, "loss": 3.755, "step": 11050 }, { "epoch": 1.196765498652291, "grad_norm": 0.523404598236084, "learning_rate": 0.0005287771181867242, "loss": 3.7415, "step": 11100 }, { "epoch": 1.202156334231806, "grad_norm": 0.6159588694572449, "learning_rate": 0.0005284533189422558, "loss": 3.7365, "step": 11150 }, { "epoch": 1.2075471698113207, "grad_norm": 0.49879810214042664, "learning_rate": 0.0005281295196977873, "loss": 3.7591, "step": 11200 }, { "epoch": 1.2129380053908356, "grad_norm": 0.5774774551391602, "learning_rate": 0.0005278057204533189, "loss": 3.7483, "step": 11250 }, { "epoch": 1.2183288409703503, "grad_norm": 0.5057991743087769, "learning_rate": 0.0005274819212088505, "loss": 3.732, "step": 11300 }, { "epoch": 1.2237196765498652, "grad_norm": 0.5313456654548645, "learning_rate": 0.0005271581219643821, "loss": 3.7282, "step": 11350 }, { "epoch": 1.2291105121293802, "grad_norm": 0.7374264597892761, "learning_rate": 0.0005268343227199136, "loss": 3.7426, "step": 11400 }, { "epoch": 1.2345013477088949, "grad_norm": 0.5699669718742371, "learning_rate": 0.0005265105234754452, "loss": 3.7535, "step": 11450 }, { "epoch": 1.2398921832884098, "grad_norm": 0.567206084728241, "learning_rate": 0.0005261867242309767, "loss": 3.7431, "step": 11500 }, { "epoch": 1.2452830188679245, "grad_norm": 0.5753964781761169, "learning_rate": 0.0005258629249865083, "loss": 3.7371, "step": 11550 }, { "epoch": 1.2506738544474394, "grad_norm": 0.5284551978111267, "learning_rate": 0.0005255391257420398, "loss": 3.7509, "step": 11600 }, { "epoch": 1.256064690026954, "grad_norm": 0.5363852381706238, "learning_rate": 0.0005252153264975715, "loss": 3.7284, "step": 11650 }, { "epoch": 1.261455525606469, "grad_norm": 0.5310810804367065, "learning_rate": 0.000524891527253103, "loss": 3.7414, "step": 11700 }, { "epoch": 1.266846361185984, "grad_norm": 0.5255619287490845, "learning_rate": 0.0005245677280086346, "loss": 3.7409, "step": 11750 }, { "epoch": 1.2722371967654986, "grad_norm": 0.6119208931922913, "learning_rate": 0.0005242439287641661, "loss": 3.7545, "step": 11800 }, { "epoch": 1.2776280323450135, "grad_norm": 0.6038151979446411, "learning_rate": 0.0005239201295196978, "loss": 3.7357, "step": 11850 }, { "epoch": 1.2830188679245282, "grad_norm": 0.5870354175567627, "learning_rate": 0.0005235963302752293, "loss": 3.7318, "step": 11900 }, { "epoch": 1.2884097035040432, "grad_norm": 0.6072551012039185, "learning_rate": 0.0005232725310307609, "loss": 3.7151, "step": 11950 }, { "epoch": 1.2938005390835579, "grad_norm": 0.5396766662597656, "learning_rate": 0.0005229487317862924, "loss": 3.7287, "step": 12000 }, { "epoch": 1.2938005390835579, "eval_accuracy": 0.3499499816430946, "eval_loss": 3.6942965984344482, "eval_runtime": 183.9882, "eval_samples_per_second": 97.892, "eval_steps_per_second": 6.12, "step": 12000 }, { "epoch": 1.2991913746630728, "grad_norm": 0.5627809166908264, "learning_rate": 0.000522624932541824, "loss": 3.7433, "step": 12050 }, { "epoch": 1.3045822102425877, "grad_norm": 0.5790122151374817, "learning_rate": 0.0005223011332973557, "loss": 3.7371, "step": 12100 }, { "epoch": 1.3099730458221024, "grad_norm": 0.5617051720619202, "learning_rate": 0.0005219773340528872, "loss": 3.725, "step": 12150 }, { "epoch": 1.3153638814016173, "grad_norm": 0.5151467323303223, "learning_rate": 0.0005216535348084188, "loss": 3.7435, "step": 12200 }, { "epoch": 1.320754716981132, "grad_norm": 0.571425199508667, "learning_rate": 0.0005213297355639503, "loss": 3.7168, "step": 12250 }, { "epoch": 1.326145552560647, "grad_norm": 0.554076611995697, "learning_rate": 0.0005210059363194819, "loss": 3.7248, "step": 12300 }, { "epoch": 1.3315363881401616, "grad_norm": 0.5426397919654846, "learning_rate": 0.0005206821370750134, "loss": 3.733, "step": 12350 }, { "epoch": 1.3369272237196765, "grad_norm": 0.5451375246047974, "learning_rate": 0.000520358337830545, "loss": 3.7304, "step": 12400 }, { "epoch": 1.3423180592991915, "grad_norm": 0.5756955146789551, "learning_rate": 0.0005200345385860766, "loss": 3.7188, "step": 12450 }, { "epoch": 1.3477088948787062, "grad_norm": 0.5725782513618469, "learning_rate": 0.0005197107393416082, "loss": 3.7372, "step": 12500 }, { "epoch": 1.353099730458221, "grad_norm": 0.5362164974212646, "learning_rate": 0.0005193869400971397, "loss": 3.7099, "step": 12550 }, { "epoch": 1.3584905660377358, "grad_norm": 0.5761519074440002, "learning_rate": 0.0005190631408526713, "loss": 3.7107, "step": 12600 }, { "epoch": 1.3638814016172507, "grad_norm": 0.6134148240089417, "learning_rate": 0.0005187393416082029, "loss": 3.7015, "step": 12650 }, { "epoch": 1.3692722371967654, "grad_norm": 0.556781530380249, "learning_rate": 0.0005184220183486238, "loss": 3.7163, "step": 12700 }, { "epoch": 1.3746630727762803, "grad_norm": 0.534551739692688, "learning_rate": 0.0005180982191041554, "loss": 3.7077, "step": 12750 }, { "epoch": 1.3800539083557952, "grad_norm": 0.5834203362464905, "learning_rate": 0.000517774419859687, "loss": 3.7058, "step": 12800 }, { "epoch": 1.38544474393531, "grad_norm": 0.5853343605995178, "learning_rate": 0.0005174506206152185, "loss": 3.7172, "step": 12850 }, { "epoch": 1.3908355795148248, "grad_norm": 0.5919023156166077, "learning_rate": 0.0005171268213707501, "loss": 3.7155, "step": 12900 }, { "epoch": 1.3962264150943398, "grad_norm": 0.5478957295417786, "learning_rate": 0.0005168030221262816, "loss": 3.724, "step": 12950 }, { "epoch": 1.4016172506738545, "grad_norm": 0.5379932522773743, "learning_rate": 0.0005164792228818132, "loss": 3.7051, "step": 13000 }, { "epoch": 1.4016172506738545, "eval_accuracy": 0.3524520403436865, "eval_loss": 3.672461986541748, "eval_runtime": 183.6622, "eval_samples_per_second": 98.066, "eval_steps_per_second": 6.131, "step": 13000 }, { "epoch": 1.4070080862533692, "grad_norm": 0.5632694363594055, "learning_rate": 0.0005161554236373448, "loss": 3.7037, "step": 13050 }, { "epoch": 1.412398921832884, "grad_norm": 0.5602201223373413, "learning_rate": 0.0005158316243928764, "loss": 3.698, "step": 13100 }, { "epoch": 1.417789757412399, "grad_norm": 0.6008055806159973, "learning_rate": 0.0005155078251484079, "loss": 3.7036, "step": 13150 }, { "epoch": 1.4231805929919137, "grad_norm": 0.5000778436660767, "learning_rate": 0.0005151840259039395, "loss": 3.7021, "step": 13200 }, { "epoch": 1.4285714285714286, "grad_norm": 0.5265121459960938, "learning_rate": 0.000514860226659471, "loss": 3.7143, "step": 13250 }, { "epoch": 1.4339622641509435, "grad_norm": 0.5470620393753052, "learning_rate": 0.0005145364274150027, "loss": 3.6872, "step": 13300 }, { "epoch": 1.4393530997304582, "grad_norm": 0.5715850591659546, "learning_rate": 0.0005142126281705343, "loss": 3.7053, "step": 13350 }, { "epoch": 1.444743935309973, "grad_norm": 0.5808437466621399, "learning_rate": 0.0005138888289260658, "loss": 3.7218, "step": 13400 }, { "epoch": 1.4501347708894878, "grad_norm": 0.5598849058151245, "learning_rate": 0.0005135650296815974, "loss": 3.7246, "step": 13450 }, { "epoch": 1.4555256064690028, "grad_norm": 0.6316524744033813, "learning_rate": 0.0005132412304371289, "loss": 3.7159, "step": 13500 }, { "epoch": 1.4609164420485174, "grad_norm": 0.6849909424781799, "learning_rate": 0.0005129174311926605, "loss": 3.7046, "step": 13550 }, { "epoch": 1.4663072776280324, "grad_norm": 0.6671061515808105, "learning_rate": 0.000512593631948192, "loss": 3.7215, "step": 13600 }, { "epoch": 1.4716981132075473, "grad_norm": 0.5725294947624207, "learning_rate": 0.0005122698327037237, "loss": 3.7105, "step": 13650 }, { "epoch": 1.477088948787062, "grad_norm": 0.6049585342407227, "learning_rate": 0.0005119460334592552, "loss": 3.6986, "step": 13700 }, { "epoch": 1.482479784366577, "grad_norm": 0.582939624786377, "learning_rate": 0.0005116222342147868, "loss": 3.699, "step": 13750 }, { "epoch": 1.4878706199460916, "grad_norm": 0.510508120059967, "learning_rate": 0.0005112984349703183, "loss": 3.7006, "step": 13800 }, { "epoch": 1.4932614555256065, "grad_norm": 0.5556900501251221, "learning_rate": 0.00051097463572585, "loss": 3.7061, "step": 13850 }, { "epoch": 1.4986522911051212, "grad_norm": 0.6585386395454407, "learning_rate": 0.0005106508364813815, "loss": 3.6874, "step": 13900 }, { "epoch": 1.5040431266846361, "grad_norm": 0.5924018025398254, "learning_rate": 0.0005103270372369131, "loss": 3.688, "step": 13950 }, { "epoch": 1.509433962264151, "grad_norm": 0.5659307241439819, "learning_rate": 0.0005100032379924446, "loss": 3.7032, "step": 14000 }, { "epoch": 1.509433962264151, "eval_accuracy": 0.3539072284701858, "eval_loss": 3.6559207439422607, "eval_runtime": 183.7429, "eval_samples_per_second": 98.023, "eval_steps_per_second": 6.128, "step": 14000 }, { "epoch": 1.5148247978436657, "grad_norm": 0.5375577807426453, "learning_rate": 0.0005096794387479762, "loss": 3.7013, "step": 14050 }, { "epoch": 1.5202156334231804, "grad_norm": 0.631818950176239, "learning_rate": 0.0005093556395035078, "loss": 3.6999, "step": 14100 }, { "epoch": 1.5256064690026954, "grad_norm": 0.56558758020401, "learning_rate": 0.0005090318402590394, "loss": 3.6931, "step": 14150 }, { "epoch": 1.5309973045822103, "grad_norm": 0.5545461773872375, "learning_rate": 0.0005087080410145709, "loss": 3.6975, "step": 14200 }, { "epoch": 1.536388140161725, "grad_norm": 0.5795785784721375, "learning_rate": 0.0005083842417701025, "loss": 3.6976, "step": 14250 }, { "epoch": 1.54177897574124, "grad_norm": 0.5446183681488037, "learning_rate": 0.000508060442525634, "loss": 3.6944, "step": 14300 }, { "epoch": 1.5471698113207548, "grad_norm": 0.5753372311592102, "learning_rate": 0.0005077366432811656, "loss": 3.6866, "step": 14350 }, { "epoch": 1.5525606469002695, "grad_norm": 0.5649062991142273, "learning_rate": 0.0005074128440366971, "loss": 3.6891, "step": 14400 }, { "epoch": 1.5579514824797842, "grad_norm": 0.5094801783561707, "learning_rate": 0.0005070890447922288, "loss": 3.6906, "step": 14450 }, { "epoch": 1.5633423180592994, "grad_norm": 0.5297936797142029, "learning_rate": 0.0005067652455477604, "loss": 3.6888, "step": 14500 }, { "epoch": 1.568733153638814, "grad_norm": 0.537732720375061, "learning_rate": 0.0005064414463032919, "loss": 3.6874, "step": 14550 }, { "epoch": 1.5741239892183287, "grad_norm": 0.5864264369010925, "learning_rate": 0.0005061176470588235, "loss": 3.6742, "step": 14600 }, { "epoch": 1.5795148247978437, "grad_norm": 0.5446892380714417, "learning_rate": 0.0005057938478143551, "loss": 3.6769, "step": 14650 }, { "epoch": 1.5849056603773586, "grad_norm": 0.5335270762443542, "learning_rate": 0.0005054700485698867, "loss": 3.6846, "step": 14700 }, { "epoch": 1.5902964959568733, "grad_norm": 0.5628997087478638, "learning_rate": 0.0005051527253103076, "loss": 3.664, "step": 14750 }, { "epoch": 1.595687331536388, "grad_norm": 0.5562835931777954, "learning_rate": 0.0005048289260658392, "loss": 3.6578, "step": 14800 }, { "epoch": 1.6010781671159031, "grad_norm": 0.5934857726097107, "learning_rate": 0.0005045051268213707, "loss": 3.6825, "step": 14850 }, { "epoch": 1.6064690026954178, "grad_norm": 0.5580174922943115, "learning_rate": 0.0005041813275769023, "loss": 3.6859, "step": 14900 }, { "epoch": 1.6118598382749325, "grad_norm": 0.5808007121086121, "learning_rate": 0.0005038575283324338, "loss": 3.6708, "step": 14950 }, { "epoch": 1.6172506738544474, "grad_norm": 0.504628598690033, "learning_rate": 0.0005035337290879654, "loss": 3.6641, "step": 15000 }, { "epoch": 1.6172506738544474, "eval_accuracy": 0.3560253078652413, "eval_loss": 3.6341910362243652, "eval_runtime": 183.7265, "eval_samples_per_second": 98.032, "eval_steps_per_second": 6.129, "step": 15000 }, { "epoch": 1.6226415094339623, "grad_norm": 0.5475382208824158, "learning_rate": 0.000503209929843497, "loss": 3.6619, "step": 15050 }, { "epoch": 1.628032345013477, "grad_norm": 0.561851978302002, "learning_rate": 0.0005028861305990286, "loss": 3.6737, "step": 15100 }, { "epoch": 1.633423180592992, "grad_norm": 0.5445704460144043, "learning_rate": 0.0005025623313545601, "loss": 3.6936, "step": 15150 }, { "epoch": 1.6388140161725069, "grad_norm": 0.5345889329910278, "learning_rate": 0.0005022385321100917, "loss": 3.6804, "step": 15200 }, { "epoch": 1.6442048517520216, "grad_norm": 0.4933413863182068, "learning_rate": 0.0005019147328656232, "loss": 3.6721, "step": 15250 }, { "epoch": 1.6495956873315363, "grad_norm": 0.5393181443214417, "learning_rate": 0.0005015909336211549, "loss": 3.6716, "step": 15300 }, { "epoch": 1.6549865229110512, "grad_norm": 0.5043426752090454, "learning_rate": 0.0005012671343766864, "loss": 3.6568, "step": 15350 }, { "epoch": 1.6603773584905661, "grad_norm": 0.5442577004432678, "learning_rate": 0.000500943335132218, "loss": 3.675, "step": 15400 }, { "epoch": 1.6657681940700808, "grad_norm": 0.5932238101959229, "learning_rate": 0.0005006195358877495, "loss": 3.6634, "step": 15450 }, { "epoch": 1.6711590296495957, "grad_norm": 0.5122867226600647, "learning_rate": 0.0005002957366432812, "loss": 3.6645, "step": 15500 }, { "epoch": 1.6765498652291106, "grad_norm": 0.5891242027282715, "learning_rate": 0.0004999719373988127, "loss": 3.651, "step": 15550 }, { "epoch": 1.6819407008086253, "grad_norm": 0.570210874080658, "learning_rate": 0.0004996481381543442, "loss": 3.6675, "step": 15600 }, { "epoch": 1.68733153638814, "grad_norm": 0.5468628406524658, "learning_rate": 0.0004993243389098758, "loss": 3.6734, "step": 15650 }, { "epoch": 1.692722371967655, "grad_norm": 0.6194903254508972, "learning_rate": 0.0004990005396654074, "loss": 3.6724, "step": 15700 }, { "epoch": 1.6981132075471699, "grad_norm": 0.5616596937179565, "learning_rate": 0.000498676740420939, "loss": 3.6535, "step": 15750 }, { "epoch": 1.7035040431266846, "grad_norm": 0.5665090084075928, "learning_rate": 0.0004983529411764705, "loss": 3.6628, "step": 15800 }, { "epoch": 1.7088948787061995, "grad_norm": 0.5654871463775635, "learning_rate": 0.0004980291419320022, "loss": 3.6803, "step": 15850 }, { "epoch": 1.7142857142857144, "grad_norm": 0.5348817110061646, "learning_rate": 0.0004977053426875337, "loss": 3.6793, "step": 15900 }, { "epoch": 1.719676549865229, "grad_norm": 0.556675136089325, "learning_rate": 0.0004973815434430653, "loss": 3.6665, "step": 15950 }, { "epoch": 1.7250673854447438, "grad_norm": 0.5713870525360107, "learning_rate": 0.0004970577441985968, "loss": 3.6638, "step": 16000 }, { "epoch": 1.7250673854447438, "eval_accuracy": 0.35771866312183, "eval_loss": 3.618290662765503, "eval_runtime": 183.7921, "eval_samples_per_second": 97.997, "eval_steps_per_second": 6.126, "step": 16000 }, { "epoch": 1.7304582210242587, "grad_norm": 0.5948731899261475, "learning_rate": 0.0004967339449541284, "loss": 3.6498, "step": 16050 }, { "epoch": 1.7358490566037736, "grad_norm": 0.5521835684776306, "learning_rate": 0.00049641014570966, "loss": 3.6514, "step": 16100 }, { "epoch": 1.7412398921832883, "grad_norm": 0.6063022613525391, "learning_rate": 0.0004960863464651916, "loss": 3.6574, "step": 16150 }, { "epoch": 1.7466307277628033, "grad_norm": 0.6571112871170044, "learning_rate": 0.0004957625472207231, "loss": 3.6673, "step": 16200 }, { "epoch": 1.7520215633423182, "grad_norm": 0.5960469841957092, "learning_rate": 0.0004954387479762547, "loss": 3.65, "step": 16250 }, { "epoch": 1.7574123989218329, "grad_norm": 0.518771231174469, "learning_rate": 0.0004951149487317862, "loss": 3.6426, "step": 16300 }, { "epoch": 1.7628032345013476, "grad_norm": 0.5418721437454224, "learning_rate": 0.0004947911494873178, "loss": 3.6663, "step": 16350 }, { "epoch": 1.7681940700808625, "grad_norm": 0.4927101135253906, "learning_rate": 0.0004944673502428493, "loss": 3.646, "step": 16400 }, { "epoch": 1.7735849056603774, "grad_norm": 0.5151129364967346, "learning_rate": 0.000494143550998381, "loss": 3.6555, "step": 16450 }, { "epoch": 1.778975741239892, "grad_norm": 0.5475391149520874, "learning_rate": 0.0004938197517539125, "loss": 3.6402, "step": 16500 }, { "epoch": 1.784366576819407, "grad_norm": 0.5400313138961792, "learning_rate": 0.0004934959525094441, "loss": 3.6485, "step": 16550 }, { "epoch": 1.789757412398922, "grad_norm": 0.5347439646720886, "learning_rate": 0.0004931721532649756, "loss": 3.6607, "step": 16600 }, { "epoch": 1.7951482479784366, "grad_norm": 0.5058249235153198, "learning_rate": 0.0004928483540205073, "loss": 3.6536, "step": 16650 }, { "epoch": 1.8005390835579513, "grad_norm": 0.5410025119781494, "learning_rate": 0.0004925245547760388, "loss": 3.6574, "step": 16700 }, { "epoch": 1.8059299191374663, "grad_norm": 0.5334374904632568, "learning_rate": 0.0004922072315164598, "loss": 3.6566, "step": 16750 }, { "epoch": 1.8113207547169812, "grad_norm": 0.5640711784362793, "learning_rate": 0.0004918834322719913, "loss": 3.6645, "step": 16800 }, { "epoch": 1.8167115902964959, "grad_norm": 0.5393502116203308, "learning_rate": 0.0004915596330275229, "loss": 3.6718, "step": 16850 }, { "epoch": 1.8221024258760108, "grad_norm": 0.5351586937904358, "learning_rate": 0.0004912358337830544, "loss": 3.6445, "step": 16900 }, { "epoch": 1.8274932614555257, "grad_norm": 0.5266234874725342, "learning_rate": 0.000490912034538586, "loss": 3.6559, "step": 16950 }, { "epoch": 1.8328840970350404, "grad_norm": 0.5003353357315063, "learning_rate": 0.0004905882352941175, "loss": 3.6368, "step": 17000 }, { "epoch": 1.8328840970350404, "eval_accuracy": 0.3592136182052694, "eval_loss": 3.602701187133789, "eval_runtime": 183.6818, "eval_samples_per_second": 98.055, "eval_steps_per_second": 6.13, "step": 17000 }, { "epoch": 1.838274932614555, "grad_norm": 0.5567882657051086, "learning_rate": 0.0004902644360496492, "loss": 3.6407, "step": 17050 }, { "epoch": 1.8436657681940702, "grad_norm": 0.5742266178131104, "learning_rate": 0.0004899406368051808, "loss": 3.6327, "step": 17100 }, { "epoch": 1.849056603773585, "grad_norm": 0.5474602580070496, "learning_rate": 0.0004896168375607123, "loss": 3.6396, "step": 17150 }, { "epoch": 1.8544474393530996, "grad_norm": 0.5230289697647095, "learning_rate": 0.0004892930383162439, "loss": 3.6459, "step": 17200 }, { "epoch": 1.8598382749326146, "grad_norm": 0.4841998517513275, "learning_rate": 0.0004889692390717754, "loss": 3.6515, "step": 17250 }, { "epoch": 1.8652291105121295, "grad_norm": 0.5501573085784912, "learning_rate": 0.0004886454398273071, "loss": 3.6547, "step": 17300 }, { "epoch": 1.8706199460916442, "grad_norm": 0.543190598487854, "learning_rate": 0.0004883216405828386, "loss": 3.6554, "step": 17350 }, { "epoch": 1.8760107816711589, "grad_norm": 0.5470271706581116, "learning_rate": 0.00048799784133837017, "loss": 3.6487, "step": 17400 }, { "epoch": 1.881401617250674, "grad_norm": 0.5897737145423889, "learning_rate": 0.0004876740420939017, "loss": 3.6389, "step": 17450 }, { "epoch": 1.8867924528301887, "grad_norm": 0.5379341244697571, "learning_rate": 0.0004873502428494333, "loss": 3.6411, "step": 17500 }, { "epoch": 1.8921832884097034, "grad_norm": 0.5637924671173096, "learning_rate": 0.0004870264436049649, "loss": 3.6377, "step": 17550 }, { "epoch": 1.8975741239892183, "grad_norm": 0.5570976734161377, "learning_rate": 0.00048670264436049643, "loss": 3.6509, "step": 17600 }, { "epoch": 1.9029649595687332, "grad_norm": 0.6049391031265259, "learning_rate": 0.00048637884511602803, "loss": 3.6425, "step": 17650 }, { "epoch": 1.908355795148248, "grad_norm": 0.5467279553413391, "learning_rate": 0.0004860550458715596, "loss": 3.6374, "step": 17700 }, { "epoch": 1.9137466307277629, "grad_norm": 0.5799625515937805, "learning_rate": 0.0004857312466270912, "loss": 3.6579, "step": 17750 }, { "epoch": 1.9191374663072778, "grad_norm": 0.5544312000274658, "learning_rate": 0.00048540744738262274, "loss": 3.6415, "step": 17800 }, { "epoch": 1.9245283018867925, "grad_norm": 0.577680766582489, "learning_rate": 0.00048508364813815434, "loss": 3.6342, "step": 17850 }, { "epoch": 1.9299191374663072, "grad_norm": 0.5421233177185059, "learning_rate": 0.00048475984889368584, "loss": 3.6444, "step": 17900 }, { "epoch": 1.935309973045822, "grad_norm": 0.5379252433776855, "learning_rate": 0.0004844360496492175, "loss": 3.6261, "step": 17950 }, { "epoch": 1.940700808625337, "grad_norm": 0.5494712591171265, "learning_rate": 0.000484112250404749, "loss": 3.6291, "step": 18000 }, { "epoch": 1.940700808625337, "eval_accuracy": 0.3600362292189129, "eval_loss": 3.5892419815063477, "eval_runtime": 184.0541, "eval_samples_per_second": 97.857, "eval_steps_per_second": 6.118, "step": 18000 }, { "epoch": 1.9460916442048517, "grad_norm": 0.5733945369720459, "learning_rate": 0.00048378845116028055, "loss": 3.6336, "step": 18050 }, { "epoch": 1.9514824797843666, "grad_norm": 0.567290186882019, "learning_rate": 0.00048346465191581215, "loss": 3.633, "step": 18100 }, { "epoch": 1.9568733153638815, "grad_norm": 0.5201252698898315, "learning_rate": 0.0004831408526713437, "loss": 3.6308, "step": 18150 }, { "epoch": 1.9622641509433962, "grad_norm": 0.5754126310348511, "learning_rate": 0.0004828170534268753, "loss": 3.6186, "step": 18200 }, { "epoch": 1.967654986522911, "grad_norm": 0.5907006859779358, "learning_rate": 0.00048249325418240686, "loss": 3.6398, "step": 18250 }, { "epoch": 1.9730458221024259, "grad_norm": 0.6127999424934387, "learning_rate": 0.00048216945493793846, "loss": 3.6509, "step": 18300 }, { "epoch": 1.9784366576819408, "grad_norm": 0.5465409159660339, "learning_rate": 0.00048184565569347, "loss": 3.6337, "step": 18350 }, { "epoch": 1.9838274932614555, "grad_norm": 0.6550977230072021, "learning_rate": 0.0004815218564490016, "loss": 3.636, "step": 18400 }, { "epoch": 1.9892183288409704, "grad_norm": 0.5740667581558228, "learning_rate": 0.00048119805720453317, "loss": 3.6441, "step": 18450 }, { "epoch": 1.9946091644204853, "grad_norm": 0.5781065225601196, "learning_rate": 0.0004808742579600647, "loss": 3.6359, "step": 18500 }, { "epoch": 2.0, "grad_norm": 1.1652530431747437, "learning_rate": 0.0004805504587155963, "loss": 3.6409, "step": 18550 }, { "epoch": 2.0053908355795147, "grad_norm": 0.5303277373313904, "learning_rate": 0.0004802266594711278, "loss": 3.5688, "step": 18600 }, { "epoch": 2.01078167115903, "grad_norm": 0.5572181344032288, "learning_rate": 0.0004799028602266594, "loss": 3.5383, "step": 18650 }, { "epoch": 2.0161725067385445, "grad_norm": 0.577308177947998, "learning_rate": 0.000479579060982191, "loss": 3.5517, "step": 18700 }, { "epoch": 2.0215633423180592, "grad_norm": 0.5191822648048401, "learning_rate": 0.0004792552617377226, "loss": 3.5297, "step": 18750 }, { "epoch": 2.026954177897574, "grad_norm": 0.5460506081581116, "learning_rate": 0.00047893146249325413, "loss": 3.5495, "step": 18800 }, { "epoch": 2.032345013477089, "grad_norm": 0.5392601490020752, "learning_rate": 0.0004786141392336751, "loss": 3.5583, "step": 18850 }, { "epoch": 2.0377358490566038, "grad_norm": 0.5567241311073303, "learning_rate": 0.00047829033998920663, "loss": 3.5447, "step": 18900 }, { "epoch": 2.0431266846361185, "grad_norm": 0.5814675092697144, "learning_rate": 0.00047796654074473824, "loss": 3.539, "step": 18950 }, { "epoch": 2.0485175202156336, "grad_norm": 0.6219574213027954, "learning_rate": 0.0004776427415002698, "loss": 3.5668, "step": 19000 }, { "epoch": 2.0485175202156336, "eval_accuracy": 0.3623585760430596, "eval_loss": 3.5776376724243164, "eval_runtime": 183.6098, "eval_samples_per_second": 98.094, "eval_steps_per_second": 6.133, "step": 19000 }, { "epoch": 2.0539083557951483, "grad_norm": 0.5591341257095337, "learning_rate": 0.0004773189422558014, "loss": 3.5422, "step": 19050 }, { "epoch": 2.059299191374663, "grad_norm": 0.5593909025192261, "learning_rate": 0.00047699514301133294, "loss": 3.566, "step": 19100 }, { "epoch": 2.0646900269541777, "grad_norm": 0.5888485908508301, "learning_rate": 0.00047667134376686455, "loss": 3.5615, "step": 19150 }, { "epoch": 2.070080862533693, "grad_norm": 0.5862798094749451, "learning_rate": 0.0004763475445223961, "loss": 3.5691, "step": 19200 }, { "epoch": 2.0754716981132075, "grad_norm": 0.5456637740135193, "learning_rate": 0.0004760237452779276, "loss": 3.5466, "step": 19250 }, { "epoch": 2.0808625336927222, "grad_norm": 0.5669352412223816, "learning_rate": 0.0004756999460334592, "loss": 3.544, "step": 19300 }, { "epoch": 2.0862533692722374, "grad_norm": 0.6276587247848511, "learning_rate": 0.00047537614678899075, "loss": 3.5469, "step": 19350 }, { "epoch": 2.091644204851752, "grad_norm": 0.5461747646331787, "learning_rate": 0.00047505234754452235, "loss": 3.5435, "step": 19400 }, { "epoch": 2.0970350404312668, "grad_norm": 0.7402114868164062, "learning_rate": 0.0004747285483000539, "loss": 3.5519, "step": 19450 }, { "epoch": 2.1024258760107815, "grad_norm": 0.6080474853515625, "learning_rate": 0.0004744047490555855, "loss": 3.5453, "step": 19500 }, { "epoch": 2.1078167115902966, "grad_norm": 0.5156586170196533, "learning_rate": 0.00047408094981111706, "loss": 3.5511, "step": 19550 }, { "epoch": 2.1132075471698113, "grad_norm": 0.6247926354408264, "learning_rate": 0.00047375715056664866, "loss": 3.548, "step": 19600 }, { "epoch": 2.118598382749326, "grad_norm": 0.5468766093254089, "learning_rate": 0.0004734333513221802, "loss": 3.5704, "step": 19650 }, { "epoch": 2.123989218328841, "grad_norm": 0.5999529957771301, "learning_rate": 0.00047310955207771177, "loss": 3.5414, "step": 19700 }, { "epoch": 2.129380053908356, "grad_norm": 0.5577241778373718, "learning_rate": 0.00047278575283324337, "loss": 3.561, "step": 19750 }, { "epoch": 2.1347708894878705, "grad_norm": 0.5632361769676208, "learning_rate": 0.0004724619535887749, "loss": 3.5539, "step": 19800 }, { "epoch": 2.1401617250673857, "grad_norm": 0.5894713401794434, "learning_rate": 0.0004721381543443065, "loss": 3.5463, "step": 19850 }, { "epoch": 2.1455525606469004, "grad_norm": 0.5929577946662903, "learning_rate": 0.000471814355099838, "loss": 3.5588, "step": 19900 }, { "epoch": 2.150943396226415, "grad_norm": 0.5735053420066833, "learning_rate": 0.0004714905558553697, "loss": 3.5507, "step": 19950 }, { "epoch": 2.1563342318059298, "grad_norm": 0.572102427482605, "learning_rate": 0.0004711667566109012, "loss": 3.5744, "step": 20000 }, { "epoch": 2.1563342318059298, "eval_accuracy": 0.3623309782095547, "eval_loss": 3.5741710662841797, "eval_runtime": 184.0834, "eval_samples_per_second": 97.842, "eval_steps_per_second": 6.117, "step": 20000 }, { "epoch": 2.161725067385445, "grad_norm": 0.5751939415931702, "learning_rate": 0.00047084295736643273, "loss": 3.5742, "step": 20050 }, { "epoch": 2.1671159029649596, "grad_norm": 0.5660809874534607, "learning_rate": 0.00047051915812196433, "loss": 3.5769, "step": 20100 }, { "epoch": 2.1725067385444743, "grad_norm": 0.5410208702087402, "learning_rate": 0.0004701953588774959, "loss": 3.57, "step": 20150 }, { "epoch": 2.177897574123989, "grad_norm": 0.5733333826065063, "learning_rate": 0.0004698715596330275, "loss": 3.5658, "step": 20200 }, { "epoch": 2.183288409703504, "grad_norm": 0.6063035130500793, "learning_rate": 0.00046954776038855904, "loss": 3.5663, "step": 20250 }, { "epoch": 2.188679245283019, "grad_norm": 0.5596222281455994, "learning_rate": 0.00046922396114409064, "loss": 3.5569, "step": 20300 }, { "epoch": 2.1940700808625335, "grad_norm": 0.5413583517074585, "learning_rate": 0.0004689001618996222, "loss": 3.5729, "step": 20350 }, { "epoch": 2.1994609164420487, "grad_norm": 0.550186276435852, "learning_rate": 0.0004685763626551538, "loss": 3.5729, "step": 20400 }, { "epoch": 2.2048517520215634, "grad_norm": 0.6196731925010681, "learning_rate": 0.00046825256341068535, "loss": 3.5705, "step": 20450 }, { "epoch": 2.210242587601078, "grad_norm": 0.5624878406524658, "learning_rate": 0.0004679287641662169, "loss": 3.5411, "step": 20500 }, { "epoch": 2.215633423180593, "grad_norm": 0.55267733335495, "learning_rate": 0.0004676049649217485, "loss": 3.5695, "step": 20550 }, { "epoch": 2.221024258760108, "grad_norm": 0.6480850577354431, "learning_rate": 0.00046728116567728, "loss": 3.5455, "step": 20600 }, { "epoch": 2.2264150943396226, "grad_norm": 0.561083197593689, "learning_rate": 0.0004669573664328116, "loss": 3.5533, "step": 20650 }, { "epoch": 2.2318059299191373, "grad_norm": 0.5583511590957642, "learning_rate": 0.00046663356718834316, "loss": 3.5504, "step": 20700 }, { "epoch": 2.2371967654986524, "grad_norm": 0.6693071126937866, "learning_rate": 0.00046630976794387476, "loss": 3.5581, "step": 20750 }, { "epoch": 2.242587601078167, "grad_norm": 0.59278404712677, "learning_rate": 0.0004659859686994063, "loss": 3.5522, "step": 20800 }, { "epoch": 2.247978436657682, "grad_norm": 0.5818943977355957, "learning_rate": 0.0004656621694549379, "loss": 3.5674, "step": 20850 }, { "epoch": 2.2533692722371965, "grad_norm": 0.5376104116439819, "learning_rate": 0.00046533837021046947, "loss": 3.5601, "step": 20900 }, { "epoch": 2.2587601078167117, "grad_norm": 0.5341477990150452, "learning_rate": 0.000465014570966001, "loss": 3.5495, "step": 20950 }, { "epoch": 2.2641509433962264, "grad_norm": 0.5756168961524963, "learning_rate": 0.0004646907717215326, "loss": 3.5621, "step": 21000 }, { "epoch": 2.2641509433962264, "eval_accuracy": 0.3640750743647527, "eval_loss": 3.559407949447632, "eval_runtime": 183.7723, "eval_samples_per_second": 98.007, "eval_steps_per_second": 6.127, "step": 21000 }, { "epoch": 2.269541778975741, "grad_norm": 0.5583627223968506, "learning_rate": 0.0004643734484619536, "loss": 3.5553, "step": 21050 }, { "epoch": 2.274932614555256, "grad_norm": 0.6221109628677368, "learning_rate": 0.0004640496492174851, "loss": 3.5521, "step": 21100 }, { "epoch": 2.280323450134771, "grad_norm": 0.5557717680931091, "learning_rate": 0.00046372584997301673, "loss": 3.5767, "step": 21150 }, { "epoch": 2.2857142857142856, "grad_norm": 0.5979974269866943, "learning_rate": 0.0004634020507285483, "loss": 3.5602, "step": 21200 }, { "epoch": 2.2911051212938007, "grad_norm": 0.6088611483573914, "learning_rate": 0.0004630782514840798, "loss": 3.5701, "step": 21250 }, { "epoch": 2.2964959568733154, "grad_norm": 0.561540961265564, "learning_rate": 0.0004627544522396114, "loss": 3.5368, "step": 21300 }, { "epoch": 2.30188679245283, "grad_norm": 0.6222971677780151, "learning_rate": 0.00046243065299514293, "loss": 3.559, "step": 21350 }, { "epoch": 2.3072776280323453, "grad_norm": 0.5412083864212036, "learning_rate": 0.00046210685375067454, "loss": 3.5516, "step": 21400 }, { "epoch": 2.31266846361186, "grad_norm": 0.5675464868545532, "learning_rate": 0.0004617830545062061, "loss": 3.5741, "step": 21450 }, { "epoch": 2.3180592991913747, "grad_norm": 0.5713619589805603, "learning_rate": 0.0004614592552617377, "loss": 3.5468, "step": 21500 }, { "epoch": 2.3234501347708894, "grad_norm": 0.5480716824531555, "learning_rate": 0.00046113545601726924, "loss": 3.5626, "step": 21550 }, { "epoch": 2.3288409703504045, "grad_norm": 0.6332092881202698, "learning_rate": 0.00046081165677280085, "loss": 3.5526, "step": 21600 }, { "epoch": 2.334231805929919, "grad_norm": 0.6008698344230652, "learning_rate": 0.0004604878575283324, "loss": 3.5542, "step": 21650 }, { "epoch": 2.339622641509434, "grad_norm": 0.6072392463684082, "learning_rate": 0.00046016405828386395, "loss": 3.5722, "step": 21700 }, { "epoch": 2.3450134770889486, "grad_norm": 0.5895165801048279, "learning_rate": 0.00045984025903939555, "loss": 3.5609, "step": 21750 }, { "epoch": 2.3504043126684637, "grad_norm": 0.6073868274688721, "learning_rate": 0.0004595164597949271, "loss": 3.5427, "step": 21800 }, { "epoch": 2.3557951482479784, "grad_norm": 0.5244640111923218, "learning_rate": 0.0004591926605504587, "loss": 3.5541, "step": 21850 }, { "epoch": 2.361185983827493, "grad_norm": 0.5883101224899292, "learning_rate": 0.0004588688613059902, "loss": 3.5479, "step": 21900 }, { "epoch": 2.3665768194070083, "grad_norm": 0.5923961997032166, "learning_rate": 0.00045854506206152186, "loss": 3.5506, "step": 21950 }, { "epoch": 2.371967654986523, "grad_norm": 0.5549956560134888, "learning_rate": 0.00045822126281705336, "loss": 3.5697, "step": 22000 }, { "epoch": 2.371967654986523, "eval_accuracy": 0.3651554100282921, "eval_loss": 3.5489094257354736, "eval_runtime": 183.5303, "eval_samples_per_second": 98.136, "eval_steps_per_second": 6.135, "step": 22000 }, { "epoch": 2.3773584905660377, "grad_norm": 0.5528952479362488, "learning_rate": 0.00045789746357258497, "loss": 3.5474, "step": 22050 }, { "epoch": 2.382749326145553, "grad_norm": 0.6946583390235901, "learning_rate": 0.0004575736643281165, "loss": 3.5487, "step": 22100 }, { "epoch": 2.3881401617250675, "grad_norm": 0.5659127235412598, "learning_rate": 0.00045724986508364807, "loss": 3.5513, "step": 22150 }, { "epoch": 2.393530997304582, "grad_norm": 0.5336350798606873, "learning_rate": 0.00045692606583917967, "loss": 3.5623, "step": 22200 }, { "epoch": 2.398921832884097, "grad_norm": 0.5657646656036377, "learning_rate": 0.0004566022665947112, "loss": 3.5658, "step": 22250 }, { "epoch": 2.404312668463612, "grad_norm": 0.5422862768173218, "learning_rate": 0.0004562849433351322, "loss": 3.5705, "step": 22300 }, { "epoch": 2.4097035040431267, "grad_norm": 0.5550121068954468, "learning_rate": 0.0004559611440906638, "loss": 3.5507, "step": 22350 }, { "epoch": 2.4150943396226414, "grad_norm": 0.5688802003860474, "learning_rate": 0.00045563734484619533, "loss": 3.5711, "step": 22400 }, { "epoch": 2.420485175202156, "grad_norm": 0.5675523281097412, "learning_rate": 0.0004553135456017269, "loss": 3.5616, "step": 22450 }, { "epoch": 2.4258760107816713, "grad_norm": 0.5429262518882751, "learning_rate": 0.0004549897463572585, "loss": 3.5557, "step": 22500 }, { "epoch": 2.431266846361186, "grad_norm": 0.6132704615592957, "learning_rate": 0.00045466594711279, "loss": 3.5504, "step": 22550 }, { "epoch": 2.4366576819407006, "grad_norm": 0.5489976406097412, "learning_rate": 0.00045434214786832164, "loss": 3.5518, "step": 22600 }, { "epoch": 2.442048517520216, "grad_norm": 0.5603680610656738, "learning_rate": 0.00045401834862385314, "loss": 3.5373, "step": 22650 }, { "epoch": 2.4474393530997305, "grad_norm": 0.5937761068344116, "learning_rate": 0.00045369454937938474, "loss": 3.5433, "step": 22700 }, { "epoch": 2.452830188679245, "grad_norm": 0.5349526405334473, "learning_rate": 0.0004533707501349163, "loss": 3.5468, "step": 22750 }, { "epoch": 2.4582210242587603, "grad_norm": 0.5487583875656128, "learning_rate": 0.0004530469508904479, "loss": 3.5434, "step": 22800 }, { "epoch": 2.463611859838275, "grad_norm": 0.6188079714775085, "learning_rate": 0.00045272315164597945, "loss": 3.5518, "step": 22850 }, { "epoch": 2.4690026954177897, "grad_norm": 0.6644785404205322, "learning_rate": 0.000452399352401511, "loss": 3.5607, "step": 22900 }, { "epoch": 2.4743935309973044, "grad_norm": 0.579646646976471, "learning_rate": 0.0004520755531570426, "loss": 3.5622, "step": 22950 }, { "epoch": 2.4797843665768196, "grad_norm": 0.5342908501625061, "learning_rate": 0.00045175175391257415, "loss": 3.537, "step": 23000 }, { "epoch": 2.4797843665768196, "eval_accuracy": 0.36602680618856426, "eval_loss": 3.5380289554595947, "eval_runtime": 183.7529, "eval_samples_per_second": 98.017, "eval_steps_per_second": 6.128, "step": 23000 }, { "epoch": 2.4851752021563343, "grad_norm": 0.570767343044281, "learning_rate": 0.00045142795466810576, "loss": 3.5417, "step": 23050 }, { "epoch": 2.490566037735849, "grad_norm": 0.5754830837249756, "learning_rate": 0.0004511041554236373, "loss": 3.5566, "step": 23100 }, { "epoch": 2.4959568733153636, "grad_norm": 0.5624446868896484, "learning_rate": 0.0004507803561791689, "loss": 3.5579, "step": 23150 }, { "epoch": 2.501347708894879, "grad_norm": 0.6118834614753723, "learning_rate": 0.00045045655693470046, "loss": 3.5519, "step": 23200 }, { "epoch": 2.5067385444743935, "grad_norm": 0.5770419836044312, "learning_rate": 0.00045013275769023207, "loss": 3.5446, "step": 23250 }, { "epoch": 2.512129380053908, "grad_norm": 0.5695980787277222, "learning_rate": 0.00044980895844576356, "loss": 3.544, "step": 23300 }, { "epoch": 2.5175202156334233, "grad_norm": 0.6304803490638733, "learning_rate": 0.0004494851592012951, "loss": 3.5394, "step": 23350 }, { "epoch": 2.522911051212938, "grad_norm": 0.5704509019851685, "learning_rate": 0.0004491613599568267, "loss": 3.5319, "step": 23400 }, { "epoch": 2.5283018867924527, "grad_norm": 0.5579874515533447, "learning_rate": 0.00044883756071235827, "loss": 3.5454, "step": 23450 }, { "epoch": 2.533692722371968, "grad_norm": 0.5578489899635315, "learning_rate": 0.0004485137614678899, "loss": 3.5406, "step": 23500 }, { "epoch": 2.5390835579514826, "grad_norm": 0.5366498231887817, "learning_rate": 0.0004481899622234214, "loss": 3.5523, "step": 23550 }, { "epoch": 2.5444743935309972, "grad_norm": 0.5812051892280579, "learning_rate": 0.00044786616297895303, "loss": 3.5521, "step": 23600 }, { "epoch": 2.5498652291105124, "grad_norm": 0.577279806137085, "learning_rate": 0.0004475423637344846, "loss": 3.5546, "step": 23650 }, { "epoch": 2.555256064690027, "grad_norm": 0.5395628213882446, "learning_rate": 0.00044721856449001613, "loss": 3.5627, "step": 23700 }, { "epoch": 2.560646900269542, "grad_norm": 0.5482959151268005, "learning_rate": 0.00044689476524554774, "loss": 3.5664, "step": 23750 }, { "epoch": 2.5660377358490565, "grad_norm": 0.6769027709960938, "learning_rate": 0.0004465709660010793, "loss": 3.5453, "step": 23800 }, { "epoch": 2.571428571428571, "grad_norm": 0.5909700989723206, "learning_rate": 0.0004462471667566109, "loss": 3.5611, "step": 23850 }, { "epoch": 2.5768194070080863, "grad_norm": 0.6435213088989258, "learning_rate": 0.00044592336751214244, "loss": 3.5433, "step": 23900 }, { "epoch": 2.582210242587601, "grad_norm": 0.5850532650947571, "learning_rate": 0.00044559956826767405, "loss": 3.5489, "step": 23950 }, { "epoch": 2.5876010781671157, "grad_norm": 0.663982093334198, "learning_rate": 0.00044527576902320554, "loss": 3.5319, "step": 24000 }, { "epoch": 2.5876010781671157, "eval_accuracy": 0.36721959759099165, "eval_loss": 3.5314929485321045, "eval_runtime": 183.664, "eval_samples_per_second": 98.065, "eval_steps_per_second": 6.131, "step": 24000 }, { "epoch": 2.592991913746631, "grad_norm": 0.5680175423622131, "learning_rate": 0.00044495196977873715, "loss": 3.5434, "step": 24050 }, { "epoch": 2.5983827493261455, "grad_norm": 0.6317371129989624, "learning_rate": 0.0004446281705342687, "loss": 3.538, "step": 24100 }, { "epoch": 2.6037735849056602, "grad_norm": 0.5761317610740662, "learning_rate": 0.00044430437128980025, "loss": 3.5633, "step": 24150 }, { "epoch": 2.6091644204851754, "grad_norm": 0.5837670564651489, "learning_rate": 0.00044398057204533185, "loss": 3.5289, "step": 24200 }, { "epoch": 2.61455525606469, "grad_norm": 0.5544631481170654, "learning_rate": 0.0004436567728008634, "loss": 3.54, "step": 24250 }, { "epoch": 2.6199460916442048, "grad_norm": 0.5587742924690247, "learning_rate": 0.000443332973556395, "loss": 3.5479, "step": 24300 }, { "epoch": 2.62533692722372, "grad_norm": 0.5495783090591431, "learning_rate": 0.00044300917431192656, "loss": 3.5642, "step": 24350 }, { "epoch": 2.6307277628032346, "grad_norm": 0.5933278203010559, "learning_rate": 0.00044268537506745816, "loss": 3.526, "step": 24400 }, { "epoch": 2.6361185983827493, "grad_norm": 0.5730457305908203, "learning_rate": 0.0004423615758229897, "loss": 3.546, "step": 24450 }, { "epoch": 2.641509433962264, "grad_norm": 0.5949668288230896, "learning_rate": 0.0004420377765785213, "loss": 3.5588, "step": 24500 }, { "epoch": 2.6469002695417787, "grad_norm": 0.5814875364303589, "learning_rate": 0.00044171397733405287, "loss": 3.5262, "step": 24550 }, { "epoch": 2.652291105121294, "grad_norm": 0.5717437267303467, "learning_rate": 0.00044139017808958437, "loss": 3.5318, "step": 24600 }, { "epoch": 2.6576819407008085, "grad_norm": 0.5098128914833069, "learning_rate": 0.00044106637884511597, "loss": 3.5355, "step": 24650 }, { "epoch": 2.6630727762803232, "grad_norm": 0.5785273909568787, "learning_rate": 0.0004407425796006475, "loss": 3.5384, "step": 24700 }, { "epoch": 2.6684636118598384, "grad_norm": 0.5735189318656921, "learning_rate": 0.0004404252563410685, "loss": 3.5379, "step": 24750 }, { "epoch": 2.673854447439353, "grad_norm": 0.5718660354614258, "learning_rate": 0.0004401014570966001, "loss": 3.5167, "step": 24800 }, { "epoch": 2.6792452830188678, "grad_norm": 0.5400899052619934, "learning_rate": 0.00043977765785213163, "loss": 3.5144, "step": 24850 }, { "epoch": 2.684636118598383, "grad_norm": 0.5528421998023987, "learning_rate": 0.0004394538586076632, "loss": 3.5536, "step": 24900 }, { "epoch": 2.6900269541778976, "grad_norm": 0.5778202414512634, "learning_rate": 0.0004391300593631948, "loss": 3.536, "step": 24950 }, { "epoch": 2.6954177897574123, "grad_norm": 0.5615628957748413, "learning_rate": 0.00043880626011872634, "loss": 3.5265, "step": 25000 }, { "epoch": 2.6954177897574123, "eval_accuracy": 0.36784598148924214, "eval_loss": 3.5187320709228516, "eval_runtime": 183.9267, "eval_samples_per_second": 97.925, "eval_steps_per_second": 6.122, "step": 25000 }, { "epoch": 2.7008086253369274, "grad_norm": 0.5722801685333252, "learning_rate": 0.00043848246087425794, "loss": 3.5369, "step": 25050 }, { "epoch": 2.706199460916442, "grad_norm": 0.5277543067932129, "learning_rate": 0.0004381586616297895, "loss": 3.5222, "step": 25100 }, { "epoch": 2.711590296495957, "grad_norm": 0.5880123972892761, "learning_rate": 0.0004378348623853211, "loss": 3.5126, "step": 25150 }, { "epoch": 2.7169811320754715, "grad_norm": 0.5826494097709656, "learning_rate": 0.00043751106314085265, "loss": 3.5171, "step": 25200 }, { "epoch": 2.7223719676549867, "grad_norm": 0.5825526118278503, "learning_rate": 0.00043718726389638425, "loss": 3.5533, "step": 25250 }, { "epoch": 2.7277628032345014, "grad_norm": 0.5669980049133301, "learning_rate": 0.00043686346465191575, "loss": 3.5104, "step": 25300 }, { "epoch": 2.733153638814016, "grad_norm": 0.6107063889503479, "learning_rate": 0.0004365396654074473, "loss": 3.5351, "step": 25350 }, { "epoch": 2.7385444743935308, "grad_norm": 0.6292389035224915, "learning_rate": 0.0004362158661629789, "loss": 3.5373, "step": 25400 }, { "epoch": 2.743935309973046, "grad_norm": 0.5547637343406677, "learning_rate": 0.00043589206691851045, "loss": 3.5098, "step": 25450 }, { "epoch": 2.7493261455525606, "grad_norm": 0.5594532489776611, "learning_rate": 0.00043556826767404206, "loss": 3.5693, "step": 25500 }, { "epoch": 2.7547169811320753, "grad_norm": 0.5568592548370361, "learning_rate": 0.0004352444684295736, "loss": 3.5577, "step": 25550 }, { "epoch": 2.7601078167115904, "grad_norm": 0.5955683588981628, "learning_rate": 0.0004349206691851052, "loss": 3.5345, "step": 25600 }, { "epoch": 2.765498652291105, "grad_norm": 0.6182993054389954, "learning_rate": 0.00043459686994063676, "loss": 3.5233, "step": 25650 }, { "epoch": 2.77088948787062, "grad_norm": 0.5633643865585327, "learning_rate": 0.00043427307069616837, "loss": 3.5408, "step": 25700 }, { "epoch": 2.776280323450135, "grad_norm": 0.5655704140663147, "learning_rate": 0.0004339492714516999, "loss": 3.5331, "step": 25750 }, { "epoch": 2.7816711590296497, "grad_norm": 0.560209333896637, "learning_rate": 0.00043362547220723147, "loss": 3.5319, "step": 25800 }, { "epoch": 2.7870619946091644, "grad_norm": 0.5956665873527527, "learning_rate": 0.0004333016729627631, "loss": 3.5201, "step": 25850 }, { "epoch": 2.7924528301886795, "grad_norm": 0.586514949798584, "learning_rate": 0.0004329778737182946, "loss": 3.5297, "step": 25900 }, { "epoch": 2.797843665768194, "grad_norm": 0.5690777897834778, "learning_rate": 0.00043265407447382623, "loss": 3.5196, "step": 25950 }, { "epoch": 2.803234501347709, "grad_norm": 0.5629533529281616, "learning_rate": 0.0004323302752293577, "loss": 3.5313, "step": 26000 }, { "epoch": 2.803234501347709, "eval_accuracy": 0.36869542976617575, "eval_loss": 3.510869026184082, "eval_runtime": 183.6283, "eval_samples_per_second": 98.084, "eval_steps_per_second": 6.132, "step": 26000 }, { "epoch": 2.8086253369272236, "grad_norm": 0.5801641345024109, "learning_rate": 0.00043200647598488933, "loss": 3.5313, "step": 26050 }, { "epoch": 2.8140161725067383, "grad_norm": 0.5631428956985474, "learning_rate": 0.0004316826767404209, "loss": 3.503, "step": 26100 }, { "epoch": 2.8194070080862534, "grad_norm": 0.5923662185668945, "learning_rate": 0.0004313588774959525, "loss": 3.5071, "step": 26150 }, { "epoch": 2.824797843665768, "grad_norm": 0.5873034000396729, "learning_rate": 0.00043103507825148404, "loss": 3.5307, "step": 26200 }, { "epoch": 2.830188679245283, "grad_norm": 0.5601552724838257, "learning_rate": 0.0004307112790070156, "loss": 3.5429, "step": 26250 }, { "epoch": 2.835579514824798, "grad_norm": 0.5437904596328735, "learning_rate": 0.0004303874797625472, "loss": 3.5588, "step": 26300 }, { "epoch": 2.8409703504043127, "grad_norm": 0.7464920878410339, "learning_rate": 0.00043006368051807874, "loss": 3.529, "step": 26350 }, { "epoch": 2.8463611859838274, "grad_norm": 0.5931203365325928, "learning_rate": 0.00042973988127361035, "loss": 3.5378, "step": 26400 }, { "epoch": 2.8517520215633425, "grad_norm": 0.6171684265136719, "learning_rate": 0.0004294160820291419, "loss": 3.5438, "step": 26450 }, { "epoch": 2.857142857142857, "grad_norm": 0.5586252808570862, "learning_rate": 0.0004290922827846735, "loss": 3.5151, "step": 26500 }, { "epoch": 2.862533692722372, "grad_norm": 0.5578598976135254, "learning_rate": 0.00042876848354020505, "loss": 3.5306, "step": 26550 }, { "epoch": 2.867924528301887, "grad_norm": 0.5919521450996399, "learning_rate": 0.00042844468429573655, "loss": 3.5387, "step": 26600 }, { "epoch": 2.8733153638814017, "grad_norm": 0.5993890762329102, "learning_rate": 0.00042812088505126815, "loss": 3.5445, "step": 26650 }, { "epoch": 2.8787061994609164, "grad_norm": 0.5829864144325256, "learning_rate": 0.0004277970858067997, "loss": 3.5301, "step": 26700 }, { "epoch": 2.884097035040431, "grad_norm": 0.6206557154655457, "learning_rate": 0.0004274732865623313, "loss": 3.5432, "step": 26750 }, { "epoch": 2.889487870619946, "grad_norm": 0.5877953171730042, "learning_rate": 0.00042714948731786286, "loss": 3.5212, "step": 26800 }, { "epoch": 2.894878706199461, "grad_norm": 0.6274372935295105, "learning_rate": 0.00042682568807339447, "loss": 3.5414, "step": 26850 }, { "epoch": 2.9002695417789757, "grad_norm": 0.5919215679168701, "learning_rate": 0.000426501888828926, "loss": 3.5075, "step": 26900 }, { "epoch": 2.9056603773584904, "grad_norm": 0.5824446678161621, "learning_rate": 0.0004261780895844576, "loss": 3.5275, "step": 26950 }, { "epoch": 2.9110512129380055, "grad_norm": 0.5630289316177368, "learning_rate": 0.00042585429033998917, "loss": 3.5433, "step": 27000 }, { "epoch": 2.9110512129380055, "eval_accuracy": 0.3697433868691464, "eval_loss": 3.5023281574249268, "eval_runtime": 183.854, "eval_samples_per_second": 97.964, "eval_steps_per_second": 6.124, "step": 27000 }, { "epoch": 2.91644204851752, "grad_norm": 0.5719215273857117, "learning_rate": 0.0004255304910955207, "loss": 3.5288, "step": 27050 }, { "epoch": 2.921832884097035, "grad_norm": 0.5788906216621399, "learning_rate": 0.0004252066918510523, "loss": 3.5314, "step": 27100 }, { "epoch": 2.92722371967655, "grad_norm": 0.6185005903244019, "learning_rate": 0.0004248828926065839, "loss": 3.5273, "step": 27150 }, { "epoch": 2.9326145552560647, "grad_norm": 0.6660767793655396, "learning_rate": 0.0004245590933621155, "loss": 3.516, "step": 27200 }, { "epoch": 2.9380053908355794, "grad_norm": 0.5727997422218323, "learning_rate": 0.00042423529411764703, "loss": 3.5295, "step": 27250 }, { "epoch": 2.9433962264150946, "grad_norm": 0.5961682200431824, "learning_rate": 0.00042391149487317864, "loss": 3.5323, "step": 27300 }, { "epoch": 2.9487870619946093, "grad_norm": 0.6300620436668396, "learning_rate": 0.00042358769562871013, "loss": 3.5177, "step": 27350 }, { "epoch": 2.954177897574124, "grad_norm": 0.6043277978897095, "learning_rate": 0.00042326389638424174, "loss": 3.5079, "step": 27400 }, { "epoch": 2.9595687331536387, "grad_norm": 0.5596343874931335, "learning_rate": 0.0004229400971397733, "loss": 3.5159, "step": 27450 }, { "epoch": 2.964959568733154, "grad_norm": 0.5775852203369141, "learning_rate": 0.00042261629789530484, "loss": 3.5263, "step": 27500 }, { "epoch": 2.9703504043126685, "grad_norm": 0.5495153069496155, "learning_rate": 0.00042229249865083644, "loss": 3.5161, "step": 27550 }, { "epoch": 2.975741239892183, "grad_norm": 0.5802893042564392, "learning_rate": 0.000421968699406368, "loss": 3.5265, "step": 27600 }, { "epoch": 2.981132075471698, "grad_norm": 0.6191282272338867, "learning_rate": 0.0004216449001618996, "loss": 3.5039, "step": 27650 }, { "epoch": 2.986522911051213, "grad_norm": 0.5658198595046997, "learning_rate": 0.00042132110091743115, "loss": 3.5147, "step": 27700 }, { "epoch": 2.9919137466307277, "grad_norm": 0.6845032572746277, "learning_rate": 0.00042099730167296275, "loss": 3.521, "step": 27750 }, { "epoch": 2.9973045822102424, "grad_norm": 0.5844449400901794, "learning_rate": 0.0004206735024284943, "loss": 3.5218, "step": 27800 }, { "epoch": 3.0026954177897576, "grad_norm": 0.5795214772224426, "learning_rate": 0.0004203497031840259, "loss": 3.4854, "step": 27850 }, { "epoch": 3.0080862533692723, "grad_norm": 0.7003495693206787, "learning_rate": 0.00042002590393955746, "loss": 3.4273, "step": 27900 }, { "epoch": 3.013477088948787, "grad_norm": 0.549725353717804, "learning_rate": 0.00041970210469508896, "loss": 3.431, "step": 27950 }, { "epoch": 3.018867924528302, "grad_norm": 0.5706000328063965, "learning_rate": 0.00041937830545062056, "loss": 3.4434, "step": 28000 }, { "epoch": 3.018867924528302, "eval_accuracy": 0.37032381059585134, "eval_loss": 3.499523639678955, "eval_runtime": 183.7469, "eval_samples_per_second": 98.021, "eval_steps_per_second": 6.128, "step": 28000 }, { "epoch": 3.024258760107817, "grad_norm": 0.6284692883491516, "learning_rate": 0.0004190545062061521, "loss": 3.4299, "step": 28050 }, { "epoch": 3.0296495956873315, "grad_norm": 0.6581982374191284, "learning_rate": 0.0004187307069616837, "loss": 3.4238, "step": 28100 }, { "epoch": 3.035040431266846, "grad_norm": 0.6106514930725098, "learning_rate": 0.00041840690771721527, "loss": 3.4642, "step": 28150 }, { "epoch": 3.0404312668463613, "grad_norm": 0.5879032611846924, "learning_rate": 0.00041808310847274687, "loss": 3.4451, "step": 28200 }, { "epoch": 3.045822102425876, "grad_norm": 0.5639373660087585, "learning_rate": 0.0004177593092282784, "loss": 3.4552, "step": 28250 }, { "epoch": 3.0512129380053907, "grad_norm": 0.593923807144165, "learning_rate": 0.00041743550998381, "loss": 3.4516, "step": 28300 }, { "epoch": 3.056603773584906, "grad_norm": 0.6003124117851257, "learning_rate": 0.0004171117107393416, "loss": 3.4434, "step": 28350 }, { "epoch": 3.0619946091644206, "grad_norm": 0.5810577869415283, "learning_rate": 0.00041678791149487313, "loss": 3.42, "step": 28400 }, { "epoch": 3.0673854447439353, "grad_norm": 0.606003999710083, "learning_rate": 0.00041646411225040473, "loss": 3.4639, "step": 28450 }, { "epoch": 3.07277628032345, "grad_norm": 0.6627290844917297, "learning_rate": 0.0004161403130059363, "loss": 3.4576, "step": 28500 }, { "epoch": 3.078167115902965, "grad_norm": 0.615278959274292, "learning_rate": 0.0004158165137614679, "loss": 3.4439, "step": 28550 }, { "epoch": 3.08355795148248, "grad_norm": 0.5780900716781616, "learning_rate": 0.00041549271451699944, "loss": 3.4401, "step": 28600 }, { "epoch": 3.0889487870619945, "grad_norm": 0.6600579023361206, "learning_rate": 0.00041516891527253104, "loss": 3.4401, "step": 28650 }, { "epoch": 3.0943396226415096, "grad_norm": 0.5386844277381897, "learning_rate": 0.00041484511602806254, "loss": 3.45, "step": 28700 }, { "epoch": 3.0997304582210243, "grad_norm": 0.5708300471305847, "learning_rate": 0.0004145213167835941, "loss": 3.4373, "step": 28750 }, { "epoch": 3.105121293800539, "grad_norm": 0.5719059109687805, "learning_rate": 0.0004141975175391257, "loss": 3.4413, "step": 28800 }, { "epoch": 3.1105121293800537, "grad_norm": 0.5947633385658264, "learning_rate": 0.00041387371829465725, "loss": 3.4252, "step": 28850 }, { "epoch": 3.115902964959569, "grad_norm": 0.6137687563896179, "learning_rate": 0.00041354991905018885, "loss": 3.4438, "step": 28900 }, { "epoch": 3.1212938005390836, "grad_norm": 0.6112726330757141, "learning_rate": 0.0004132261198057204, "loss": 3.4593, "step": 28950 }, { "epoch": 3.1266846361185983, "grad_norm": 0.6146498918533325, "learning_rate": 0.000412902320561252, "loss": 3.4286, "step": 29000 }, { "epoch": 3.1266846361185983, "eval_accuracy": 0.371247034183611, "eval_loss": 3.4940013885498047, "eval_runtime": 183.9217, "eval_samples_per_second": 97.928, "eval_steps_per_second": 6.122, "step": 29000 }, { "epoch": 3.1320754716981134, "grad_norm": 0.6162520051002502, "learning_rate": 0.00041258499730167296, "loss": 3.4511, "step": 29050 }, { "epoch": 3.137466307277628, "grad_norm": 0.6008721590042114, "learning_rate": 0.0004122611980572045, "loss": 3.4405, "step": 29100 }, { "epoch": 3.142857142857143, "grad_norm": 0.5806031227111816, "learning_rate": 0.00041193739881273606, "loss": 3.4575, "step": 29150 }, { "epoch": 3.1482479784366575, "grad_norm": 0.5788300037384033, "learning_rate": 0.00041161359956826766, "loss": 3.4583, "step": 29200 }, { "epoch": 3.1536388140161726, "grad_norm": 0.6230771541595459, "learning_rate": 0.0004112898003237992, "loss": 3.4449, "step": 29250 }, { "epoch": 3.1590296495956873, "grad_norm": 0.5821077227592468, "learning_rate": 0.0004109660010793308, "loss": 3.4518, "step": 29300 }, { "epoch": 3.164420485175202, "grad_norm": 0.6511071920394897, "learning_rate": 0.0004106422018348623, "loss": 3.4639, "step": 29350 }, { "epoch": 3.169811320754717, "grad_norm": 0.5678673982620239, "learning_rate": 0.0004103184025903939, "loss": 3.4581, "step": 29400 }, { "epoch": 3.175202156334232, "grad_norm": 0.593774139881134, "learning_rate": 0.00040999460334592547, "loss": 3.4671, "step": 29450 }, { "epoch": 3.1805929919137466, "grad_norm": 0.5744243860244751, "learning_rate": 0.000409670804101457, "loss": 3.4629, "step": 29500 }, { "epoch": 3.1859838274932613, "grad_norm": 0.8164382576942444, "learning_rate": 0.00040934700485698863, "loss": 3.4392, "step": 29550 }, { "epoch": 3.1913746630727764, "grad_norm": 0.6126202940940857, "learning_rate": 0.0004090232056125202, "loss": 3.4606, "step": 29600 }, { "epoch": 3.196765498652291, "grad_norm": 0.5741614103317261, "learning_rate": 0.0004086994063680518, "loss": 3.4472, "step": 29650 }, { "epoch": 3.202156334231806, "grad_norm": 0.5594901442527771, "learning_rate": 0.00040837560712358333, "loss": 3.4545, "step": 29700 }, { "epoch": 3.207547169811321, "grad_norm": 0.5645290613174438, "learning_rate": 0.00040805180787911494, "loss": 3.4618, "step": 29750 }, { "epoch": 3.2129380053908356, "grad_norm": 0.5808700919151306, "learning_rate": 0.0004077280086346465, "loss": 3.4444, "step": 29800 }, { "epoch": 3.2183288409703503, "grad_norm": 0.6301125288009644, "learning_rate": 0.0004074042093901781, "loss": 3.4651, "step": 29850 }, { "epoch": 3.223719676549865, "grad_norm": 0.5941428542137146, "learning_rate": 0.00040708041014570964, "loss": 3.4735, "step": 29900 }, { "epoch": 3.22911051212938, "grad_norm": 0.5769610404968262, "learning_rate": 0.00040675661090124114, "loss": 3.4638, "step": 29950 }, { "epoch": 3.234501347708895, "grad_norm": 0.6222258806228638, "learning_rate": 0.0004064328116567728, "loss": 3.4506, "step": 30000 }, { "epoch": 3.234501347708895, "eval_accuracy": 0.3719082956588499, "eval_loss": 3.4874448776245117, "eval_runtime": 183.6565, "eval_samples_per_second": 98.069, "eval_steps_per_second": 6.131, "step": 30000 }, { "epoch": 3.2398921832884096, "grad_norm": 0.6240214109420776, "learning_rate": 0.0004061090124123043, "loss": 3.4467, "step": 30050 }, { "epoch": 3.2452830188679247, "grad_norm": 0.608329713344574, "learning_rate": 0.0004057852131678359, "loss": 3.4485, "step": 30100 }, { "epoch": 3.2506738544474394, "grad_norm": 0.5454065799713135, "learning_rate": 0.00040546141392336745, "loss": 3.4499, "step": 30150 }, { "epoch": 3.256064690026954, "grad_norm": 0.667070209980011, "learning_rate": 0.00040513761467889906, "loss": 3.4693, "step": 30200 }, { "epoch": 3.2614555256064692, "grad_norm": 0.6167458295822144, "learning_rate": 0.0004048138154344306, "loss": 3.4631, "step": 30250 }, { "epoch": 3.266846361185984, "grad_norm": 0.7352801561355591, "learning_rate": 0.0004044900161899622, "loss": 3.4654, "step": 30300 }, { "epoch": 3.2722371967654986, "grad_norm": 0.6545478701591492, "learning_rate": 0.00040416621694549376, "loss": 3.4401, "step": 30350 }, { "epoch": 3.2776280323450133, "grad_norm": 0.6410787105560303, "learning_rate": 0.0004038424177010253, "loss": 3.4697, "step": 30400 }, { "epoch": 3.2830188679245285, "grad_norm": 0.5979618430137634, "learning_rate": 0.0004035186184565569, "loss": 3.4568, "step": 30450 }, { "epoch": 3.288409703504043, "grad_norm": 0.6123748421669006, "learning_rate": 0.00040319481921208847, "loss": 3.4417, "step": 30500 }, { "epoch": 3.293800539083558, "grad_norm": 0.607407808303833, "learning_rate": 0.00040287101996762007, "loss": 3.4708, "step": 30550 }, { "epoch": 3.2991913746630726, "grad_norm": 0.7021310329437256, "learning_rate": 0.0004025472207231516, "loss": 3.4618, "step": 30600 }, { "epoch": 3.3045822102425877, "grad_norm": 0.6325151920318604, "learning_rate": 0.00040222342147868323, "loss": 3.4408, "step": 30650 }, { "epoch": 3.3099730458221024, "grad_norm": 0.6190793514251709, "learning_rate": 0.0004018996222342147, "loss": 3.4592, "step": 30700 }, { "epoch": 3.315363881401617, "grad_norm": 0.5823326110839844, "learning_rate": 0.00040157582298974633, "loss": 3.4269, "step": 30750 }, { "epoch": 3.3207547169811322, "grad_norm": 0.5816695690155029, "learning_rate": 0.0004012520237452779, "loss": 3.4634, "step": 30800 }, { "epoch": 3.326145552560647, "grad_norm": 0.6027852892875671, "learning_rate": 0.00040092822450080943, "loss": 3.4441, "step": 30850 }, { "epoch": 3.3315363881401616, "grad_norm": 0.6045222878456116, "learning_rate": 0.00040060442525634103, "loss": 3.4673, "step": 30900 }, { "epoch": 3.3369272237196768, "grad_norm": 0.597585141658783, "learning_rate": 0.0004002806260118726, "loss": 3.4441, "step": 30950 }, { "epoch": 3.3423180592991915, "grad_norm": 0.6446326375007629, "learning_rate": 0.0003999568267674042, "loss": 3.4563, "step": 31000 }, { "epoch": 3.3423180592991915, "eval_accuracy": 0.3723845212661408, "eval_loss": 3.4811089038848877, "eval_runtime": 183.6383, "eval_samples_per_second": 98.079, "eval_steps_per_second": 6.132, "step": 31000 }, { "epoch": 3.347708894878706, "grad_norm": 0.6195496916770935, "learning_rate": 0.00039963302752293574, "loss": 3.4571, "step": 31050 }, { "epoch": 3.353099730458221, "grad_norm": 0.6132037043571472, "learning_rate": 0.00039930922827846735, "loss": 3.4524, "step": 31100 }, { "epoch": 3.358490566037736, "grad_norm": 0.5746621489524841, "learning_rate": 0.0003989854290339989, "loss": 3.463, "step": 31150 }, { "epoch": 3.3638814016172507, "grad_norm": 0.596961498260498, "learning_rate": 0.00039866810577441985, "loss": 3.4495, "step": 31200 }, { "epoch": 3.3692722371967654, "grad_norm": 0.5609466433525085, "learning_rate": 0.0003983443065299514, "loss": 3.4515, "step": 31250 }, { "epoch": 3.37466307277628, "grad_norm": 0.5683972835540771, "learning_rate": 0.000398020507285483, "loss": 3.4771, "step": 31300 }, { "epoch": 3.3800539083557952, "grad_norm": 0.6360358595848083, "learning_rate": 0.0003976967080410145, "loss": 3.4486, "step": 31350 }, { "epoch": 3.38544474393531, "grad_norm": 0.6113670468330383, "learning_rate": 0.0003973729087965461, "loss": 3.448, "step": 31400 }, { "epoch": 3.3908355795148246, "grad_norm": 0.5796744227409363, "learning_rate": 0.00039704910955207765, "loss": 3.4534, "step": 31450 }, { "epoch": 3.3962264150943398, "grad_norm": 0.5753939151763916, "learning_rate": 0.00039672531030760926, "loss": 3.4462, "step": 31500 }, { "epoch": 3.4016172506738545, "grad_norm": 0.6340164542198181, "learning_rate": 0.0003964015110631408, "loss": 3.4674, "step": 31550 }, { "epoch": 3.407008086253369, "grad_norm": 0.5975732207298279, "learning_rate": 0.00039607771181867236, "loss": 3.445, "step": 31600 }, { "epoch": 3.4123989218328843, "grad_norm": 0.5981366634368896, "learning_rate": 0.00039575391257420397, "loss": 3.4462, "step": 31650 }, { "epoch": 3.417789757412399, "grad_norm": 0.6517634987831116, "learning_rate": 0.0003954301133297355, "loss": 3.4725, "step": 31700 }, { "epoch": 3.4231805929919137, "grad_norm": 0.6406164169311523, "learning_rate": 0.0003951063140852671, "loss": 3.4464, "step": 31750 }, { "epoch": 3.4285714285714284, "grad_norm": 0.6119227409362793, "learning_rate": 0.00039478251484079867, "loss": 3.4483, "step": 31800 }, { "epoch": 3.4339622641509435, "grad_norm": 0.6088457107543945, "learning_rate": 0.0003944587155963303, "loss": 3.4796, "step": 31850 }, { "epoch": 3.439353099730458, "grad_norm": 0.6567962169647217, "learning_rate": 0.0003941349163518618, "loss": 3.4615, "step": 31900 }, { "epoch": 3.444743935309973, "grad_norm": 0.6715599894523621, "learning_rate": 0.00039381111710739343, "loss": 3.4414, "step": 31950 }, { "epoch": 3.450134770889488, "grad_norm": 0.6421747803688049, "learning_rate": 0.000393487317862925, "loss": 3.4568, "step": 32000 }, { "epoch": 3.450134770889488, "eval_accuracy": 0.3734006430729818, "eval_loss": 3.473811388015747, "eval_runtime": 183.9, "eval_samples_per_second": 97.939, "eval_steps_per_second": 6.123, "step": 32000 }, { "epoch": 3.4555256064690028, "grad_norm": 0.593186616897583, "learning_rate": 0.0003931635186184565, "loss": 3.4593, "step": 32050 }, { "epoch": 3.4609164420485174, "grad_norm": 0.697148859500885, "learning_rate": 0.0003928397193739881, "loss": 3.453, "step": 32100 }, { "epoch": 3.466307277628032, "grad_norm": 0.6392279863357544, "learning_rate": 0.00039251592012951963, "loss": 3.4662, "step": 32150 }, { "epoch": 3.4716981132075473, "grad_norm": 0.5713638663291931, "learning_rate": 0.00039219212088505124, "loss": 3.4484, "step": 32200 }, { "epoch": 3.477088948787062, "grad_norm": 0.6339240074157715, "learning_rate": 0.0003918683216405828, "loss": 3.4653, "step": 32250 }, { "epoch": 3.4824797843665767, "grad_norm": 0.6048734784126282, "learning_rate": 0.0003915445223961144, "loss": 3.4757, "step": 32300 }, { "epoch": 3.487870619946092, "grad_norm": 0.616313099861145, "learning_rate": 0.00039122072315164594, "loss": 3.4397, "step": 32350 }, { "epoch": 3.4932614555256065, "grad_norm": 0.6406932473182678, "learning_rate": 0.0003908969239071775, "loss": 3.4706, "step": 32400 }, { "epoch": 3.498652291105121, "grad_norm": 0.6056227087974548, "learning_rate": 0.0003905731246627091, "loss": 3.4449, "step": 32450 }, { "epoch": 3.5040431266846364, "grad_norm": 0.5830230712890625, "learning_rate": 0.00039024932541824065, "loss": 3.4704, "step": 32500 }, { "epoch": 3.509433962264151, "grad_norm": 0.5504217147827148, "learning_rate": 0.00038992552617377225, "loss": 3.4591, "step": 32550 }, { "epoch": 3.5148247978436657, "grad_norm": 0.586064875125885, "learning_rate": 0.0003896017269293038, "loss": 3.4541, "step": 32600 }, { "epoch": 3.5202156334231804, "grad_norm": 0.6093249917030334, "learning_rate": 0.0003892779276848354, "loss": 3.4655, "step": 32650 }, { "epoch": 3.525606469002695, "grad_norm": 0.6133269667625427, "learning_rate": 0.0003889541284403669, "loss": 3.4392, "step": 32700 }, { "epoch": 3.5309973045822103, "grad_norm": 0.6184484958648682, "learning_rate": 0.0003886303291958985, "loss": 3.4513, "step": 32750 }, { "epoch": 3.536388140161725, "grad_norm": 0.6110262870788574, "learning_rate": 0.00038830652995143006, "loss": 3.4623, "step": 32800 }, { "epoch": 3.5417789757412397, "grad_norm": 0.6329076886177063, "learning_rate": 0.0003879827307069616, "loss": 3.4245, "step": 32850 }, { "epoch": 3.547169811320755, "grad_norm": 0.6302708387374878, "learning_rate": 0.0003876589314624932, "loss": 3.4471, "step": 32900 }, { "epoch": 3.5525606469002695, "grad_norm": 0.6204167008399963, "learning_rate": 0.00038733513221802477, "loss": 3.4626, "step": 32950 }, { "epoch": 3.557951482479784, "grad_norm": 0.635398805141449, "learning_rate": 0.00038701133297355637, "loss": 3.4606, "step": 33000 }, { "epoch": 3.557951482479784, "eval_accuracy": 0.37382395472390706, "eval_loss": 3.468949556350708, "eval_runtime": 183.5943, "eval_samples_per_second": 98.102, "eval_steps_per_second": 6.133, "step": 33000 }, { "epoch": 3.5633423180592994, "grad_norm": 0.5870746374130249, "learning_rate": 0.0003866875337290879, "loss": 3.4628, "step": 33050 }, { "epoch": 3.568733153638814, "grad_norm": 0.6441285014152527, "learning_rate": 0.00038636373448461953, "loss": 3.4496, "step": 33100 }, { "epoch": 3.5741239892183287, "grad_norm": 0.6544851660728455, "learning_rate": 0.0003860399352401511, "loss": 3.4598, "step": 33150 }, { "epoch": 3.579514824797844, "grad_norm": 0.601290225982666, "learning_rate": 0.0003857161359956827, "loss": 3.4587, "step": 33200 }, { "epoch": 3.5849056603773586, "grad_norm": 0.6271600127220154, "learning_rate": 0.0003853988127361036, "loss": 3.4506, "step": 33250 }, { "epoch": 3.5902964959568733, "grad_norm": 0.6283750534057617, "learning_rate": 0.0003850750134916352, "loss": 3.4413, "step": 33300 }, { "epoch": 3.595687331536388, "grad_norm": 0.5692456364631653, "learning_rate": 0.0003847512142471667, "loss": 3.4524, "step": 33350 }, { "epoch": 3.601078167115903, "grad_norm": 0.6084824204444885, "learning_rate": 0.0003844274150026983, "loss": 3.4519, "step": 33400 }, { "epoch": 3.606469002695418, "grad_norm": 0.6203867197036743, "learning_rate": 0.00038410361575822984, "loss": 3.4636, "step": 33450 }, { "epoch": 3.6118598382749325, "grad_norm": 0.5996057987213135, "learning_rate": 0.00038377981651376144, "loss": 3.4679, "step": 33500 }, { "epoch": 3.617250673854447, "grad_norm": 0.5934234857559204, "learning_rate": 0.000383456017269293, "loss": 3.4491, "step": 33550 }, { "epoch": 3.6226415094339623, "grad_norm": 0.6062605381011963, "learning_rate": 0.00038313221802482454, "loss": 3.451, "step": 33600 }, { "epoch": 3.628032345013477, "grad_norm": 0.6698931455612183, "learning_rate": 0.00038280841878035615, "loss": 3.4472, "step": 33650 }, { "epoch": 3.6334231805929917, "grad_norm": 0.6485372185707092, "learning_rate": 0.0003824846195358877, "loss": 3.4337, "step": 33700 }, { "epoch": 3.638814016172507, "grad_norm": 0.6247066259384155, "learning_rate": 0.0003821608202914193, "loss": 3.4591, "step": 33750 }, { "epoch": 3.6442048517520216, "grad_norm": 0.6120551228523254, "learning_rate": 0.00038183702104695085, "loss": 3.4388, "step": 33800 }, { "epoch": 3.6495956873315363, "grad_norm": 0.6113671660423279, "learning_rate": 0.00038151322180248246, "loss": 3.4713, "step": 33850 }, { "epoch": 3.6549865229110514, "grad_norm": 0.5842575430870056, "learning_rate": 0.000381189422558014, "loss": 3.4447, "step": 33900 }, { "epoch": 3.660377358490566, "grad_norm": 0.6196116209030151, "learning_rate": 0.0003808656233135456, "loss": 3.4459, "step": 33950 }, { "epoch": 3.665768194070081, "grad_norm": 0.6134396195411682, "learning_rate": 0.00038054182406907716, "loss": 3.4583, "step": 34000 }, { "epoch": 3.665768194070081, "eval_accuracy": 0.3740601661020157, "eval_loss": 3.4624133110046387, "eval_runtime": 184.0257, "eval_samples_per_second": 97.872, "eval_steps_per_second": 6.119, "step": 34000 }, { "epoch": 3.671159029649596, "grad_norm": 0.5997918248176575, "learning_rate": 0.00038021802482460866, "loss": 3.4519, "step": 34050 }, { "epoch": 3.6765498652291106, "grad_norm": 0.5810784101486206, "learning_rate": 0.00037989422558014027, "loss": 3.4694, "step": 34100 }, { "epoch": 3.6819407008086253, "grad_norm": 0.6275520324707031, "learning_rate": 0.0003795704263356718, "loss": 3.4657, "step": 34150 }, { "epoch": 3.68733153638814, "grad_norm": 0.6059765815734863, "learning_rate": 0.0003792466270912034, "loss": 3.4458, "step": 34200 }, { "epoch": 3.6927223719676547, "grad_norm": 0.6004732251167297, "learning_rate": 0.00037892282784673497, "loss": 3.4397, "step": 34250 }, { "epoch": 3.69811320754717, "grad_norm": 0.6046389937400818, "learning_rate": 0.0003785990286022666, "loss": 3.4392, "step": 34300 }, { "epoch": 3.7035040431266846, "grad_norm": 0.6276936531066895, "learning_rate": 0.00037827522935779813, "loss": 3.4455, "step": 34350 }, { "epoch": 3.7088948787061993, "grad_norm": 0.5531956553459167, "learning_rate": 0.00037795143011332973, "loss": 3.4406, "step": 34400 }, { "epoch": 3.7142857142857144, "grad_norm": 0.576456606388092, "learning_rate": 0.0003776276308688613, "loss": 3.4619, "step": 34450 }, { "epoch": 3.719676549865229, "grad_norm": 0.5860700011253357, "learning_rate": 0.00037730383162439283, "loss": 3.4433, "step": 34500 }, { "epoch": 3.725067385444744, "grad_norm": 0.6148476004600525, "learning_rate": 0.00037698003237992444, "loss": 3.4385, "step": 34550 }, { "epoch": 3.730458221024259, "grad_norm": 0.6846517324447632, "learning_rate": 0.000376656233135456, "loss": 3.4564, "step": 34600 }, { "epoch": 3.7358490566037736, "grad_norm": 0.6337952613830566, "learning_rate": 0.0003763324338909876, "loss": 3.4555, "step": 34650 }, { "epoch": 3.7412398921832883, "grad_norm": 0.6312965154647827, "learning_rate": 0.0003760086346465191, "loss": 3.4656, "step": 34700 }, { "epoch": 3.7466307277628035, "grad_norm": 0.5984307527542114, "learning_rate": 0.0003756848354020507, "loss": 3.4617, "step": 34750 }, { "epoch": 3.752021563342318, "grad_norm": 0.5848837494850159, "learning_rate": 0.00037536103615758224, "loss": 3.4411, "step": 34800 }, { "epoch": 3.757412398921833, "grad_norm": 0.6520708799362183, "learning_rate": 0.0003750372369131138, "loss": 3.4426, "step": 34850 }, { "epoch": 3.7628032345013476, "grad_norm": 0.5814857482910156, "learning_rate": 0.0003747134376686454, "loss": 3.4595, "step": 34900 }, { "epoch": 3.7681940700808623, "grad_norm": 0.6383914351463318, "learning_rate": 0.00037438963842417695, "loss": 3.4509, "step": 34950 }, { "epoch": 3.7735849056603774, "grad_norm": 0.6446661353111267, "learning_rate": 0.00037406583917970856, "loss": 3.459, "step": 35000 }, { "epoch": 3.7735849056603774, "eval_accuracy": 0.3753658478548823, "eval_loss": 3.4548258781433105, "eval_runtime": 183.6058, "eval_samples_per_second": 98.096, "eval_steps_per_second": 6.133, "step": 35000 }, { "epoch": 3.778975741239892, "grad_norm": 0.6061877012252808, "learning_rate": 0.0003737420399352401, "loss": 3.4709, "step": 35050 }, { "epoch": 3.784366576819407, "grad_norm": 0.6421599984169006, "learning_rate": 0.0003734182406907717, "loss": 3.4359, "step": 35100 }, { "epoch": 3.789757412398922, "grad_norm": 0.6605613231658936, "learning_rate": 0.00037309444144630326, "loss": 3.457, "step": 35150 }, { "epoch": 3.7951482479784366, "grad_norm": 0.6076908111572266, "learning_rate": 0.00037277064220183487, "loss": 3.4568, "step": 35200 }, { "epoch": 3.8005390835579513, "grad_norm": 0.611754298210144, "learning_rate": 0.0003724468429573664, "loss": 3.4381, "step": 35250 }, { "epoch": 3.8059299191374665, "grad_norm": 0.6356371641159058, "learning_rate": 0.00037212951969778737, "loss": 3.4514, "step": 35300 }, { "epoch": 3.811320754716981, "grad_norm": 0.671958863735199, "learning_rate": 0.00037180572045331887, "loss": 3.4529, "step": 35350 }, { "epoch": 3.816711590296496, "grad_norm": 0.6135779023170471, "learning_rate": 0.00037148192120885047, "loss": 3.4536, "step": 35400 }, { "epoch": 3.822102425876011, "grad_norm": 0.683573305606842, "learning_rate": 0.000371158121964382, "loss": 3.4466, "step": 35450 }, { "epoch": 3.8274932614555257, "grad_norm": 0.5822563171386719, "learning_rate": 0.0003708343227199136, "loss": 3.4516, "step": 35500 }, { "epoch": 3.8328840970350404, "grad_norm": 0.6443066596984863, "learning_rate": 0.0003705105234754452, "loss": 3.4611, "step": 35550 }, { "epoch": 3.838274932614555, "grad_norm": 0.642926037311554, "learning_rate": 0.0003701867242309768, "loss": 3.4675, "step": 35600 }, { "epoch": 3.8436657681940702, "grad_norm": 0.6716592311859131, "learning_rate": 0.00036986292498650833, "loss": 3.4654, "step": 35650 }, { "epoch": 3.849056603773585, "grad_norm": 0.5893973112106323, "learning_rate": 0.0003695391257420399, "loss": 3.4669, "step": 35700 }, { "epoch": 3.8544474393530996, "grad_norm": 0.6722517013549805, "learning_rate": 0.0003692153264975715, "loss": 3.4484, "step": 35750 }, { "epoch": 3.8598382749326143, "grad_norm": 0.6753908395767212, "learning_rate": 0.00036889152725310304, "loss": 3.4549, "step": 35800 }, { "epoch": 3.8652291105121295, "grad_norm": 0.6143805980682373, "learning_rate": 0.00036856772800863464, "loss": 3.4301, "step": 35850 }, { "epoch": 3.870619946091644, "grad_norm": 0.6016655564308167, "learning_rate": 0.0003682439287641662, "loss": 3.4456, "step": 35900 }, { "epoch": 3.876010781671159, "grad_norm": 0.6141828298568726, "learning_rate": 0.0003679201295196978, "loss": 3.4712, "step": 35950 }, { "epoch": 3.881401617250674, "grad_norm": 0.5878786444664001, "learning_rate": 0.00036759633027522935, "loss": 3.4485, "step": 36000 }, { "epoch": 3.881401617250674, "eval_accuracy": 0.3757107121208055, "eval_loss": 3.449120044708252, "eval_runtime": 184.1528, "eval_samples_per_second": 97.805, "eval_steps_per_second": 6.114, "step": 36000 }, { "epoch": 3.8867924528301887, "grad_norm": 0.6707215309143066, "learning_rate": 0.00036727253103076084, "loss": 3.4586, "step": 36050 }, { "epoch": 3.8921832884097034, "grad_norm": 0.6566523909568787, "learning_rate": 0.00036694873178629245, "loss": 3.4523, "step": 36100 }, { "epoch": 3.8975741239892185, "grad_norm": 0.5984740257263184, "learning_rate": 0.000366624932541824, "loss": 3.4565, "step": 36150 }, { "epoch": 3.9029649595687332, "grad_norm": 0.5919893383979797, "learning_rate": 0.0003663011332973556, "loss": 3.4475, "step": 36200 }, { "epoch": 3.908355795148248, "grad_norm": 0.5963663458824158, "learning_rate": 0.00036597733405288715, "loss": 3.4628, "step": 36250 }, { "epoch": 3.913746630727763, "grad_norm": 0.6338704824447632, "learning_rate": 0.00036565353480841876, "loss": 3.441, "step": 36300 }, { "epoch": 3.9191374663072778, "grad_norm": 0.6194099187850952, "learning_rate": 0.0003653297355639503, "loss": 3.4647, "step": 36350 }, { "epoch": 3.9245283018867925, "grad_norm": 0.6529048681259155, "learning_rate": 0.0003650059363194819, "loss": 3.465, "step": 36400 }, { "epoch": 3.929919137466307, "grad_norm": 0.632722020149231, "learning_rate": 0.00036468213707501347, "loss": 3.4408, "step": 36450 }, { "epoch": 3.935309973045822, "grad_norm": 0.6227572560310364, "learning_rate": 0.000364358337830545, "loss": 3.4502, "step": 36500 }, { "epoch": 3.940700808625337, "grad_norm": 0.6491719484329224, "learning_rate": 0.0003640345385860766, "loss": 3.4509, "step": 36550 }, { "epoch": 3.9460916442048517, "grad_norm": 0.5961635112762451, "learning_rate": 0.00036371073934160817, "loss": 3.4478, "step": 36600 }, { "epoch": 3.9514824797843664, "grad_norm": 0.6824967265129089, "learning_rate": 0.0003633869400971398, "loss": 3.4401, "step": 36650 }, { "epoch": 3.9568733153638815, "grad_norm": 0.6491476893424988, "learning_rate": 0.00036306314085267127, "loss": 3.4488, "step": 36700 }, { "epoch": 3.9622641509433962, "grad_norm": 0.6334487199783325, "learning_rate": 0.00036273934160820293, "loss": 3.4413, "step": 36750 }, { "epoch": 3.967654986522911, "grad_norm": 0.6231988668441772, "learning_rate": 0.00036241554236373443, "loss": 3.4347, "step": 36800 }, { "epoch": 3.973045822102426, "grad_norm": 0.6323678493499756, "learning_rate": 0.00036209174311926603, "loss": 3.4487, "step": 36850 }, { "epoch": 3.9784366576819408, "grad_norm": 0.6366806030273438, "learning_rate": 0.0003617679438747976, "loss": 3.4424, "step": 36900 }, { "epoch": 3.9838274932614555, "grad_norm": 0.6505522131919861, "learning_rate": 0.00036144414463032913, "loss": 3.4522, "step": 36950 }, { "epoch": 3.9892183288409706, "grad_norm": 0.6528891921043396, "learning_rate": 0.00036112034538586074, "loss": 3.4463, "step": 37000 }, { "epoch": 3.9892183288409706, "eval_accuracy": 0.3762563669234098, "eval_loss": 3.4432127475738525, "eval_runtime": 183.5509, "eval_samples_per_second": 98.125, "eval_steps_per_second": 6.135, "step": 37000 }, { "epoch": 3.9946091644204853, "grad_norm": 0.6302518248558044, "learning_rate": 0.0003607965461413923, "loss": 3.4597, "step": 37050 }, { "epoch": 4.0, "grad_norm": 1.2225829362869263, "learning_rate": 0.0003604727468969239, "loss": 3.4339, "step": 37100 }, { "epoch": 4.005390835579515, "grad_norm": 0.6290598511695862, "learning_rate": 0.00036014894765245544, "loss": 3.3275, "step": 37150 }, { "epoch": 4.010781671159029, "grad_norm": 0.6681774258613586, "learning_rate": 0.00035982514840798705, "loss": 3.3563, "step": 37200 }, { "epoch": 4.0161725067385445, "grad_norm": 0.6569148898124695, "learning_rate": 0.0003595013491635186, "loss": 3.3489, "step": 37250 }, { "epoch": 4.02156334231806, "grad_norm": 0.5988460183143616, "learning_rate": 0.0003591775499190502, "loss": 3.3536, "step": 37300 }, { "epoch": 4.026954177897574, "grad_norm": 0.6734127998352051, "learning_rate": 0.00035886022665947105, "loss": 3.3418, "step": 37350 }, { "epoch": 4.032345013477089, "grad_norm": 0.6245633363723755, "learning_rate": 0.0003585364274150027, "loss": 3.361, "step": 37400 }, { "epoch": 4.037735849056604, "grad_norm": 0.6742115616798401, "learning_rate": 0.0003582126281705342, "loss": 3.3584, "step": 37450 }, { "epoch": 4.0431266846361185, "grad_norm": 0.5911452770233154, "learning_rate": 0.0003578888289260658, "loss": 3.3485, "step": 37500 }, { "epoch": 4.048517520215634, "grad_norm": 0.6306824088096619, "learning_rate": 0.00035756502968159736, "loss": 3.3633, "step": 37550 }, { "epoch": 4.053908355795148, "grad_norm": 0.6810359358787537, "learning_rate": 0.00035724123043712896, "loss": 3.3582, "step": 37600 }, { "epoch": 4.059299191374663, "grad_norm": 0.6800394654273987, "learning_rate": 0.0003569174311926605, "loss": 3.3723, "step": 37650 }, { "epoch": 4.064690026954178, "grad_norm": 0.6404582262039185, "learning_rate": 0.00035659363194819206, "loss": 3.3558, "step": 37700 }, { "epoch": 4.070080862533692, "grad_norm": 0.7083536386489868, "learning_rate": 0.00035626983270372367, "loss": 3.3775, "step": 37750 }, { "epoch": 4.0754716981132075, "grad_norm": 0.6967944502830505, "learning_rate": 0.0003559460334592552, "loss": 3.3606, "step": 37800 }, { "epoch": 4.080862533692723, "grad_norm": 0.6590530872344971, "learning_rate": 0.0003556222342147868, "loss": 3.3595, "step": 37850 }, { "epoch": 4.086253369272237, "grad_norm": 0.6163628101348877, "learning_rate": 0.0003552984349703184, "loss": 3.3587, "step": 37900 }, { "epoch": 4.091644204851752, "grad_norm": 0.6308746337890625, "learning_rate": 0.00035497463572585, "loss": 3.3714, "step": 37950 }, { "epoch": 4.097035040431267, "grad_norm": 0.6695423722267151, "learning_rate": 0.00035465083648138153, "loss": 3.3631, "step": 38000 }, { "epoch": 4.097035040431267, "eval_accuracy": 0.37669934474702943, "eval_loss": 3.446185827255249, "eval_runtime": 183.7978, "eval_samples_per_second": 97.994, "eval_steps_per_second": 6.126, "step": 38000 }, { "epoch": 4.1024258760107815, "grad_norm": 0.6689504384994507, "learning_rate": 0.00035432703723691314, "loss": 3.3665, "step": 38050 }, { "epoch": 4.107816711590297, "grad_norm": 0.6203464865684509, "learning_rate": 0.00035400323799244463, "loss": 3.3684, "step": 38100 }, { "epoch": 4.113207547169812, "grad_norm": 0.6366583704948425, "learning_rate": 0.0003536794387479762, "loss": 3.3796, "step": 38150 }, { "epoch": 4.118598382749326, "grad_norm": 0.6430602669715881, "learning_rate": 0.0003533556395035078, "loss": 3.3595, "step": 38200 }, { "epoch": 4.123989218328841, "grad_norm": 0.6412113904953003, "learning_rate": 0.00035303184025903934, "loss": 3.3632, "step": 38250 }, { "epoch": 4.129380053908355, "grad_norm": 0.654319167137146, "learning_rate": 0.00035270804101457094, "loss": 3.3759, "step": 38300 }, { "epoch": 4.1347708894878705, "grad_norm": 0.6605011224746704, "learning_rate": 0.0003523842417701025, "loss": 3.3537, "step": 38350 }, { "epoch": 4.140161725067386, "grad_norm": 0.601024866104126, "learning_rate": 0.0003520604425256341, "loss": 3.3774, "step": 38400 }, { "epoch": 4.1455525606469, "grad_norm": 0.6261166930198669, "learning_rate": 0.00035173664328116565, "loss": 3.3613, "step": 38450 }, { "epoch": 4.150943396226415, "grad_norm": 0.6527787446975708, "learning_rate": 0.00035141284403669725, "loss": 3.3736, "step": 38500 }, { "epoch": 4.15633423180593, "grad_norm": 0.6282969117164612, "learning_rate": 0.0003510890447922288, "loss": 3.3998, "step": 38550 }, { "epoch": 4.1617250673854445, "grad_norm": 0.6190927028656006, "learning_rate": 0.00035077172153264976, "loss": 3.3699, "step": 38600 }, { "epoch": 4.16711590296496, "grad_norm": 0.6161026358604431, "learning_rate": 0.0003504479222881813, "loss": 3.3852, "step": 38650 }, { "epoch": 4.172506738544475, "grad_norm": 0.7209351658821106, "learning_rate": 0.0003501241230437129, "loss": 3.3512, "step": 38700 }, { "epoch": 4.177897574123989, "grad_norm": 0.625611424446106, "learning_rate": 0.0003498003237992444, "loss": 3.3763, "step": 38750 }, { "epoch": 4.183288409703504, "grad_norm": 0.6233698129653931, "learning_rate": 0.000349476524554776, "loss": 3.3798, "step": 38800 }, { "epoch": 4.188679245283019, "grad_norm": 0.6900334358215332, "learning_rate": 0.00034915272531030756, "loss": 3.3821, "step": 38850 }, { "epoch": 4.1940700808625335, "grad_norm": 0.6348626613616943, "learning_rate": 0.0003488289260658391, "loss": 3.3889, "step": 38900 }, { "epoch": 4.199460916442049, "grad_norm": 0.6387774348258972, "learning_rate": 0.0003485051268213707, "loss": 3.3821, "step": 38950 }, { "epoch": 4.204851752021563, "grad_norm": 0.6410725712776184, "learning_rate": 0.00034818132757690227, "loss": 3.3635, "step": 39000 }, { "epoch": 4.204851752021563, "eval_accuracy": 0.3774232989385373, "eval_loss": 3.443272113800049, "eval_runtime": 183.5709, "eval_samples_per_second": 98.115, "eval_steps_per_second": 6.134, "step": 39000 }, { "epoch": 4.210242587601078, "grad_norm": 0.6588302850723267, "learning_rate": 0.0003478575283324339, "loss": 3.3933, "step": 39050 }, { "epoch": 4.215633423180593, "grad_norm": 0.6347865462303162, "learning_rate": 0.0003475337290879654, "loss": 3.3851, "step": 39100 }, { "epoch": 4.2210242587601075, "grad_norm": 0.6326488852500916, "learning_rate": 0.00034720992984349703, "loss": 3.3934, "step": 39150 }, { "epoch": 4.226415094339623, "grad_norm": 0.6076704263687134, "learning_rate": 0.0003468861305990286, "loss": 3.3747, "step": 39200 }, { "epoch": 4.231805929919138, "grad_norm": 0.6273050308227539, "learning_rate": 0.0003465623313545602, "loss": 3.3727, "step": 39250 }, { "epoch": 4.237196765498652, "grad_norm": 0.6443808674812317, "learning_rate": 0.00034623853211009173, "loss": 3.384, "step": 39300 }, { "epoch": 4.242587601078167, "grad_norm": 0.6146422624588013, "learning_rate": 0.00034591473286562323, "loss": 3.3702, "step": 39350 }, { "epoch": 4.247978436657682, "grad_norm": 0.6165491938591003, "learning_rate": 0.0003455909336211549, "loss": 3.3923, "step": 39400 }, { "epoch": 4.2533692722371965, "grad_norm": 0.6656982898712158, "learning_rate": 0.0003452671343766864, "loss": 3.3772, "step": 39450 }, { "epoch": 4.258760107816712, "grad_norm": 0.6853116750717163, "learning_rate": 0.000344943335132218, "loss": 3.3865, "step": 39500 }, { "epoch": 4.264150943396227, "grad_norm": 0.6323865056037903, "learning_rate": 0.00034461953588774954, "loss": 3.3894, "step": 39550 }, { "epoch": 4.269541778975741, "grad_norm": 0.6815229654312134, "learning_rate": 0.00034429573664328115, "loss": 3.3976, "step": 39600 }, { "epoch": 4.274932614555256, "grad_norm": 0.6554942727088928, "learning_rate": 0.0003439719373988127, "loss": 3.3929, "step": 39650 }, { "epoch": 4.280323450134771, "grad_norm": 0.7081801891326904, "learning_rate": 0.00034364813815434425, "loss": 3.3724, "step": 39700 }, { "epoch": 4.285714285714286, "grad_norm": 0.6656792163848877, "learning_rate": 0.00034332433890987585, "loss": 3.383, "step": 39750 }, { "epoch": 4.291105121293801, "grad_norm": 0.6686912775039673, "learning_rate": 0.0003430005396654074, "loss": 3.3819, "step": 39800 }, { "epoch": 4.296495956873315, "grad_norm": 0.6758848428726196, "learning_rate": 0.000342676740420939, "loss": 3.3823, "step": 39850 }, { "epoch": 4.30188679245283, "grad_norm": 0.691242516040802, "learning_rate": 0.00034235294117647056, "loss": 3.3879, "step": 39900 }, { "epoch": 4.307277628032345, "grad_norm": 0.6893483400344849, "learning_rate": 0.00034202914193200216, "loss": 3.3772, "step": 39950 }, { "epoch": 4.3126684636118595, "grad_norm": 0.5819874405860901, "learning_rate": 0.0003417053426875337, "loss": 3.3887, "step": 40000 }, { "epoch": 4.3126684636118595, "eval_accuracy": 0.3775175009922725, "eval_loss": 3.4382338523864746, "eval_runtime": 184.2231, "eval_samples_per_second": 97.767, "eval_steps_per_second": 6.112, "step": 40000 }, { "epoch": 4.318059299191375, "grad_norm": 0.6500782370567322, "learning_rate": 0.0003413815434430653, "loss": 3.3864, "step": 40050 }, { "epoch": 4.32345013477089, "grad_norm": 0.6664853692054749, "learning_rate": 0.0003410577441985968, "loss": 3.3832, "step": 40100 }, { "epoch": 4.328840970350404, "grad_norm": 0.6425719857215881, "learning_rate": 0.00034073394495412837, "loss": 3.3954, "step": 40150 }, { "epoch": 4.334231805929919, "grad_norm": 0.6592696905136108, "learning_rate": 0.00034041014570965997, "loss": 3.3912, "step": 40200 }, { "epoch": 4.339622641509434, "grad_norm": 0.6312816739082336, "learning_rate": 0.0003400863464651915, "loss": 3.3784, "step": 40250 }, { "epoch": 4.345013477088949, "grad_norm": 0.637238085269928, "learning_rate": 0.0003397625472207231, "loss": 3.3924, "step": 40300 }, { "epoch": 4.350404312668464, "grad_norm": 0.5931002497673035, "learning_rate": 0.0003394387479762547, "loss": 3.3745, "step": 40350 }, { "epoch": 4.355795148247978, "grad_norm": 0.6598531603813171, "learning_rate": 0.0003391149487317863, "loss": 3.3874, "step": 40400 }, { "epoch": 4.361185983827493, "grad_norm": 0.6362660527229309, "learning_rate": 0.00033879114948731783, "loss": 3.3833, "step": 40450 }, { "epoch": 4.366576819407008, "grad_norm": 0.623309850692749, "learning_rate": 0.00033846735024284944, "loss": 3.3982, "step": 40500 }, { "epoch": 4.3719676549865225, "grad_norm": 0.6290128231048584, "learning_rate": 0.000338143550998381, "loss": 3.3897, "step": 40550 }, { "epoch": 4.377358490566038, "grad_norm": 0.6916012167930603, "learning_rate": 0.00033781975175391254, "loss": 3.3787, "step": 40600 }, { "epoch": 4.382749326145553, "grad_norm": 0.6463659405708313, "learning_rate": 0.00033749595250944414, "loss": 3.3871, "step": 40650 }, { "epoch": 4.388140161725067, "grad_norm": 0.675667941570282, "learning_rate": 0.00033717215326497564, "loss": 3.3882, "step": 40700 }, { "epoch": 4.393530997304582, "grad_norm": 0.6485020518302917, "learning_rate": 0.0003368483540205073, "loss": 3.3871, "step": 40750 }, { "epoch": 4.398921832884097, "grad_norm": 0.7107369303703308, "learning_rate": 0.0003365245547760388, "loss": 3.3755, "step": 40800 }, { "epoch": 4.404312668463612, "grad_norm": 0.6821770668029785, "learning_rate": 0.0003362007555315704, "loss": 3.3906, "step": 40850 }, { "epoch": 4.409703504043127, "grad_norm": 0.6535558104515076, "learning_rate": 0.00033587695628710195, "loss": 3.3996, "step": 40900 }, { "epoch": 4.415094339622642, "grad_norm": 0.6308371424674988, "learning_rate": 0.00033555315704263355, "loss": 3.3959, "step": 40950 }, { "epoch": 4.420485175202156, "grad_norm": 0.5909251570701599, "learning_rate": 0.0003352293577981651, "loss": 3.392, "step": 41000 }, { "epoch": 4.420485175202156, "eval_accuracy": 0.37843866017516364, "eval_loss": 3.4319839477539062, "eval_runtime": 183.6983, "eval_samples_per_second": 98.047, "eval_steps_per_second": 6.13, "step": 41000 }, { "epoch": 4.425876010781671, "grad_norm": 0.6647576689720154, "learning_rate": 0.00033490555855369665, "loss": 3.4006, "step": 41050 }, { "epoch": 4.431266846361186, "grad_norm": 0.6007199883460999, "learning_rate": 0.00033458175930922826, "loss": 3.3697, "step": 41100 }, { "epoch": 4.436657681940701, "grad_norm": 0.6572251319885254, "learning_rate": 0.0003342579600647598, "loss": 3.3942, "step": 41150 }, { "epoch": 4.442048517520216, "grad_norm": 0.6462733745574951, "learning_rate": 0.0003339341608202914, "loss": 3.406, "step": 41200 }, { "epoch": 4.44743935309973, "grad_norm": 0.7077494859695435, "learning_rate": 0.00033361036157582297, "loss": 3.4034, "step": 41250 }, { "epoch": 4.452830188679245, "grad_norm": 0.6289212703704834, "learning_rate": 0.00033328656233135457, "loss": 3.3715, "step": 41300 }, { "epoch": 4.45822102425876, "grad_norm": 0.6613530516624451, "learning_rate": 0.0003329627630868861, "loss": 3.3796, "step": 41350 }, { "epoch": 4.463611859838275, "grad_norm": 0.653528094291687, "learning_rate": 0.0003326389638424177, "loss": 3.397, "step": 41400 }, { "epoch": 4.46900269541779, "grad_norm": 0.6634072065353394, "learning_rate": 0.0003323151645979492, "loss": 3.4074, "step": 41450 }, { "epoch": 4.474393530997305, "grad_norm": 0.643779456615448, "learning_rate": 0.00033199136535348077, "loss": 3.4053, "step": 41500 }, { "epoch": 4.479784366576819, "grad_norm": 0.6200483441352844, "learning_rate": 0.0003316675661090124, "loss": 3.3927, "step": 41550 }, { "epoch": 4.485175202156334, "grad_norm": 0.7240083813667297, "learning_rate": 0.00033134376686454393, "loss": 3.3854, "step": 41600 }, { "epoch": 4.490566037735849, "grad_norm": 0.6716325283050537, "learning_rate": 0.00033101996762007553, "loss": 3.398, "step": 41650 }, { "epoch": 4.495956873315364, "grad_norm": 0.676361083984375, "learning_rate": 0.0003306961683756071, "loss": 3.3881, "step": 41700 }, { "epoch": 4.501347708894879, "grad_norm": 0.6593371629714966, "learning_rate": 0.0003303723691311387, "loss": 3.3887, "step": 41750 }, { "epoch": 4.506738544474393, "grad_norm": 0.7852563261985779, "learning_rate": 0.00033004856988667024, "loss": 3.4041, "step": 41800 }, { "epoch": 4.512129380053908, "grad_norm": 0.651554524898529, "learning_rate": 0.0003297247706422018, "loss": 3.3813, "step": 41850 }, { "epoch": 4.517520215633423, "grad_norm": 0.61871337890625, "learning_rate": 0.0003294009713977334, "loss": 3.3981, "step": 41900 }, { "epoch": 4.5229110512129385, "grad_norm": 0.7037910223007202, "learning_rate": 0.00032907717215326494, "loss": 3.3993, "step": 41950 }, { "epoch": 4.528301886792453, "grad_norm": 0.6677039265632629, "learning_rate": 0.00032875337290879655, "loss": 3.3866, "step": 42000 }, { "epoch": 4.528301886792453, "eval_accuracy": 0.37861880666316006, "eval_loss": 3.4264838695526123, "eval_runtime": 183.841, "eval_samples_per_second": 97.971, "eval_steps_per_second": 6.125, "step": 42000 }, { "epoch": 4.533692722371968, "grad_norm": 0.6433460116386414, "learning_rate": 0.00032842957366432805, "loss": 3.385, "step": 42050 }, { "epoch": 4.539083557951482, "grad_norm": 0.6643568277359009, "learning_rate": 0.0003281057744198597, "loss": 3.3937, "step": 42100 }, { "epoch": 4.544474393530997, "grad_norm": 0.6829423308372498, "learning_rate": 0.0003277819751753912, "loss": 3.393, "step": 42150 }, { "epoch": 4.549865229110512, "grad_norm": 0.6842221021652222, "learning_rate": 0.0003274581759309228, "loss": 3.3902, "step": 42200 }, { "epoch": 4.555256064690027, "grad_norm": 0.6358245015144348, "learning_rate": 0.00032713437668645436, "loss": 3.4025, "step": 42250 }, { "epoch": 4.560646900269542, "grad_norm": 0.7178597450256348, "learning_rate": 0.0003268105774419859, "loss": 3.38, "step": 42300 }, { "epoch": 4.566037735849057, "grad_norm": 0.6716541051864624, "learning_rate": 0.0003264867781975175, "loss": 3.4133, "step": 42350 }, { "epoch": 4.571428571428571, "grad_norm": 0.6570898294448853, "learning_rate": 0.00032616297895304906, "loss": 3.3732, "step": 42400 }, { "epoch": 4.576819407008086, "grad_norm": 0.6482057571411133, "learning_rate": 0.00032583917970858067, "loss": 3.3954, "step": 42450 }, { "epoch": 4.5822102425876015, "grad_norm": 0.6238645315170288, "learning_rate": 0.0003255153804641122, "loss": 3.4011, "step": 42500 }, { "epoch": 4.587601078167116, "grad_norm": 0.6898269653320312, "learning_rate": 0.0003251915812196438, "loss": 3.3792, "step": 42550 }, { "epoch": 4.592991913746631, "grad_norm": 0.6453298926353455, "learning_rate": 0.00032486778197517537, "loss": 3.4067, "step": 42600 }, { "epoch": 4.598382749326145, "grad_norm": 0.6815885901451111, "learning_rate": 0.000324543982730707, "loss": 3.3896, "step": 42650 }, { "epoch": 4.60377358490566, "grad_norm": 0.624555766582489, "learning_rate": 0.0003242266594711278, "loss": 3.3894, "step": 42700 }, { "epoch": 4.609164420485175, "grad_norm": 0.6440547704696655, "learning_rate": 0.0003239028602266595, "loss": 3.3971, "step": 42750 }, { "epoch": 4.6145552560646905, "grad_norm": 0.6493710875511169, "learning_rate": 0.000323579060982191, "loss": 3.3733, "step": 42800 }, { "epoch": 4.619946091644205, "grad_norm": 0.6572422981262207, "learning_rate": 0.0003232552617377226, "loss": 3.3984, "step": 42850 }, { "epoch": 4.62533692722372, "grad_norm": 0.6159889698028564, "learning_rate": 0.00032293146249325413, "loss": 3.3971, "step": 42900 }, { "epoch": 4.630727762803234, "grad_norm": 0.7039486169815063, "learning_rate": 0.00032260766324878574, "loss": 3.3921, "step": 42950 }, { "epoch": 4.636118598382749, "grad_norm": 0.681053876876831, "learning_rate": 0.0003222838640043173, "loss": 3.3958, "step": 43000 }, { "epoch": 4.636118598382749, "eval_accuracy": 0.3791799988287219, "eval_loss": 3.422306776046753, "eval_runtime": 183.9806, "eval_samples_per_second": 97.896, "eval_steps_per_second": 6.12, "step": 43000 }, { "epoch": 4.6415094339622645, "grad_norm": 0.6886916160583496, "learning_rate": 0.00032196006475984884, "loss": 3.3878, "step": 43050 }, { "epoch": 4.646900269541779, "grad_norm": 0.6496508121490479, "learning_rate": 0.00032163626551538044, "loss": 3.3883, "step": 43100 }, { "epoch": 4.652291105121294, "grad_norm": 0.6063190698623657, "learning_rate": 0.000321312466270912, "loss": 3.406, "step": 43150 }, { "epoch": 4.657681940700809, "grad_norm": 0.6703405380249023, "learning_rate": 0.00032099514301133295, "loss": 3.3759, "step": 43200 }, { "epoch": 4.663072776280323, "grad_norm": 0.6515152454376221, "learning_rate": 0.00032067134376686455, "loss": 3.3939, "step": 43250 }, { "epoch": 4.668463611859838, "grad_norm": 0.6611012816429138, "learning_rate": 0.0003203475445223961, "loss": 3.3818, "step": 43300 }, { "epoch": 4.6738544474393535, "grad_norm": 0.7121708393096924, "learning_rate": 0.0003200237452779277, "loss": 3.3905, "step": 43350 }, { "epoch": 4.679245283018868, "grad_norm": 0.6578240394592285, "learning_rate": 0.00031969994603345926, "loss": 3.3888, "step": 43400 }, { "epoch": 4.684636118598383, "grad_norm": 0.646023154258728, "learning_rate": 0.00031937614678899075, "loss": 3.3989, "step": 43450 }, { "epoch": 4.690026954177897, "grad_norm": 0.6373788714408875, "learning_rate": 0.00031905234754452236, "loss": 3.3742, "step": 43500 }, { "epoch": 4.695417789757412, "grad_norm": 0.6402143239974976, "learning_rate": 0.0003187285483000539, "loss": 3.3918, "step": 43550 }, { "epoch": 4.7008086253369274, "grad_norm": 0.7260469794273376, "learning_rate": 0.0003184047490555855, "loss": 3.4, "step": 43600 }, { "epoch": 4.706199460916442, "grad_norm": 0.657831072807312, "learning_rate": 0.00031808094981111706, "loss": 3.3965, "step": 43650 }, { "epoch": 4.711590296495957, "grad_norm": 0.6721468567848206, "learning_rate": 0.00031775715056664867, "loss": 3.3898, "step": 43700 }, { "epoch": 4.716981132075472, "grad_norm": 0.6404152512550354, "learning_rate": 0.0003174333513221802, "loss": 3.407, "step": 43750 }, { "epoch": 4.722371967654986, "grad_norm": 0.6717345118522644, "learning_rate": 0.00031710955207771177, "loss": 3.3918, "step": 43800 }, { "epoch": 4.727762803234501, "grad_norm": 0.6559435725212097, "learning_rate": 0.0003167857528332434, "loss": 3.3983, "step": 43850 }, { "epoch": 4.7331536388140165, "grad_norm": 0.6558781862258911, "learning_rate": 0.0003164619535887749, "loss": 3.3887, "step": 43900 }, { "epoch": 4.738544474393531, "grad_norm": 0.6264235377311707, "learning_rate": 0.00031613815434430653, "loss": 3.3874, "step": 43950 }, { "epoch": 4.743935309973046, "grad_norm": 0.6540461778640747, "learning_rate": 0.0003158143550998381, "loss": 3.3875, "step": 44000 }, { "epoch": 4.743935309973046, "eval_accuracy": 0.3794974825669158, "eval_loss": 3.418121099472046, "eval_runtime": 183.5372, "eval_samples_per_second": 98.133, "eval_steps_per_second": 6.135, "step": 44000 }, { "epoch": 4.74932614555256, "grad_norm": 0.6550005078315735, "learning_rate": 0.0003154905558553697, "loss": 3.3855, "step": 44050 }, { "epoch": 4.754716981132075, "grad_norm": 0.6489050984382629, "learning_rate": 0.0003151667566109012, "loss": 3.3913, "step": 44100 }, { "epoch": 4.7601078167115904, "grad_norm": 0.7171805500984192, "learning_rate": 0.00031484295736643284, "loss": 3.386, "step": 44150 }, { "epoch": 4.765498652291106, "grad_norm": 0.7200286388397217, "learning_rate": 0.00031451915812196434, "loss": 3.3915, "step": 44200 }, { "epoch": 4.77088948787062, "grad_norm": 0.6571013927459717, "learning_rate": 0.0003141953588774959, "loss": 3.3739, "step": 44250 }, { "epoch": 4.776280323450135, "grad_norm": 0.7125272750854492, "learning_rate": 0.0003138715596330275, "loss": 3.3972, "step": 44300 }, { "epoch": 4.781671159029649, "grad_norm": 0.6634894013404846, "learning_rate": 0.00031354776038855904, "loss": 3.3768, "step": 44350 }, { "epoch": 4.787061994609164, "grad_norm": 0.6476519703865051, "learning_rate": 0.00031322396114409065, "loss": 3.4184, "step": 44400 }, { "epoch": 4.7924528301886795, "grad_norm": 0.6488552689552307, "learning_rate": 0.0003129001618996222, "loss": 3.3751, "step": 44450 }, { "epoch": 4.797843665768194, "grad_norm": 0.6651248335838318, "learning_rate": 0.0003125763626551538, "loss": 3.3902, "step": 44500 }, { "epoch": 4.803234501347709, "grad_norm": 0.6538376212120056, "learning_rate": 0.00031225256341068535, "loss": 3.3826, "step": 44550 }, { "epoch": 4.808625336927224, "grad_norm": 0.6490738391876221, "learning_rate": 0.00031192876416621696, "loss": 3.3966, "step": 44600 }, { "epoch": 4.814016172506738, "grad_norm": 0.7015892267227173, "learning_rate": 0.0003116049649217485, "loss": 3.3805, "step": 44650 }, { "epoch": 4.819407008086253, "grad_norm": 0.6848888993263245, "learning_rate": 0.00031128116567728, "loss": 3.3929, "step": 44700 }, { "epoch": 4.824797843665769, "grad_norm": 0.722858726978302, "learning_rate": 0.00031095736643281166, "loss": 3.3867, "step": 44750 }, { "epoch": 4.830188679245283, "grad_norm": 0.6747796535491943, "learning_rate": 0.00031063356718834316, "loss": 3.386, "step": 44800 }, { "epoch": 4.835579514824798, "grad_norm": 0.6656998991966248, "learning_rate": 0.00031030976794387476, "loss": 3.3898, "step": 44850 }, { "epoch": 4.840970350404312, "grad_norm": 0.6500997543334961, "learning_rate": 0.0003099859686994063, "loss": 3.3885, "step": 44900 }, { "epoch": 4.846361185983827, "grad_norm": 0.6383869647979736, "learning_rate": 0.0003096621694549379, "loss": 3.3816, "step": 44950 }, { "epoch": 4.8517520215633425, "grad_norm": 0.6764130592346191, "learning_rate": 0.00030933837021046947, "loss": 3.4055, "step": 45000 }, { "epoch": 4.8517520215633425, "eval_accuracy": 0.3802766324254334, "eval_loss": 3.4115288257598877, "eval_runtime": 184.1198, "eval_samples_per_second": 97.822, "eval_steps_per_second": 6.116, "step": 45000 }, { "epoch": 4.857142857142857, "grad_norm": 0.7333670854568481, "learning_rate": 0.0003090145709660011, "loss": 3.3761, "step": 45050 }, { "epoch": 4.862533692722372, "grad_norm": 0.666333794593811, "learning_rate": 0.0003086907717215326, "loss": 3.3802, "step": 45100 }, { "epoch": 4.867924528301887, "grad_norm": 0.6561823487281799, "learning_rate": 0.0003083669724770642, "loss": 3.3913, "step": 45150 }, { "epoch": 4.873315363881401, "grad_norm": 0.6639450192451477, "learning_rate": 0.0003080431732325958, "loss": 3.3775, "step": 45200 }, { "epoch": 4.878706199460916, "grad_norm": 0.6635633707046509, "learning_rate": 0.00030771937398812733, "loss": 3.394, "step": 45250 }, { "epoch": 4.884097035040432, "grad_norm": 0.6871430277824402, "learning_rate": 0.00030739557474365894, "loss": 3.3784, "step": 45300 }, { "epoch": 4.889487870619946, "grad_norm": 0.6663231253623962, "learning_rate": 0.0003070717754991905, "loss": 3.3862, "step": 45350 }, { "epoch": 4.894878706199461, "grad_norm": 0.6640026569366455, "learning_rate": 0.0003067479762547221, "loss": 3.3841, "step": 45400 }, { "epoch": 4.900269541778976, "grad_norm": 0.7088777422904968, "learning_rate": 0.0003064241770102536, "loss": 3.3933, "step": 45450 }, { "epoch": 4.90566037735849, "grad_norm": 0.6727973222732544, "learning_rate": 0.00030610037776578514, "loss": 3.3949, "step": 45500 }, { "epoch": 4.9110512129380055, "grad_norm": 0.6286665201187134, "learning_rate": 0.00030577657852131674, "loss": 3.394, "step": 45550 }, { "epoch": 4.916442048517521, "grad_norm": 0.623975396156311, "learning_rate": 0.0003054527792768483, "loss": 3.3913, "step": 45600 }, { "epoch": 4.921832884097035, "grad_norm": 0.6779341101646423, "learning_rate": 0.0003051289800323799, "loss": 3.406, "step": 45650 }, { "epoch": 4.92722371967655, "grad_norm": 0.6502783894538879, "learning_rate": 0.00030480518078791145, "loss": 3.402, "step": 45700 }, { "epoch": 4.932614555256064, "grad_norm": 0.656953752040863, "learning_rate": 0.00030448138154344305, "loss": 3.3801, "step": 45750 }, { "epoch": 4.938005390835579, "grad_norm": 0.6836448311805725, "learning_rate": 0.0003041575822989746, "loss": 3.4076, "step": 45800 }, { "epoch": 4.943396226415095, "grad_norm": 0.669844925403595, "learning_rate": 0.0003038337830545062, "loss": 3.3775, "step": 45850 }, { "epoch": 4.948787061994609, "grad_norm": 0.6585819721221924, "learning_rate": 0.00030350998381003776, "loss": 3.3735, "step": 45900 }, { "epoch": 4.954177897574124, "grad_norm": 0.6468183398246765, "learning_rate": 0.0003031861845655693, "loss": 3.3727, "step": 45950 }, { "epoch": 4.959568733153639, "grad_norm": 0.6753085255622864, "learning_rate": 0.0003028623853211009, "loss": 3.3721, "step": 46000 }, { "epoch": 4.959568733153639, "eval_accuracy": 0.3807414494795038, "eval_loss": 3.407444953918457, "eval_runtime": 183.8672, "eval_samples_per_second": 97.957, "eval_steps_per_second": 6.124, "step": 46000 }, { "epoch": 4.964959568733153, "grad_norm": 0.6388835906982422, "learning_rate": 0.00030253858607663247, "loss": 3.3795, "step": 46050 }, { "epoch": 4.9703504043126685, "grad_norm": 0.6600316166877747, "learning_rate": 0.00030221478683216407, "loss": 3.3892, "step": 46100 }, { "epoch": 4.975741239892184, "grad_norm": 0.6317592263221741, "learning_rate": 0.00030189098758769557, "loss": 3.388, "step": 46150 }, { "epoch": 4.981132075471698, "grad_norm": 0.670013964176178, "learning_rate": 0.00030156718834322717, "loss": 3.3928, "step": 46200 }, { "epoch": 4.986522911051213, "grad_norm": 0.65596604347229, "learning_rate": 0.0003012433890987587, "loss": 3.3911, "step": 46250 }, { "epoch": 4.991913746630727, "grad_norm": 0.6848031878471375, "learning_rate": 0.0003009195898542903, "loss": 3.3845, "step": 46300 }, { "epoch": 4.997304582210242, "grad_norm": 0.6780948638916016, "learning_rate": 0.0003005957906098219, "loss": 3.3775, "step": 46350 }, { "epoch": 5.002695417789758, "grad_norm": 0.7113416790962219, "learning_rate": 0.00030027199136535343, "loss": 3.3608, "step": 46400 }, { "epoch": 5.008086253369272, "grad_norm": 0.641831636428833, "learning_rate": 0.00029994819212088503, "loss": 3.2879, "step": 46450 }, { "epoch": 5.013477088948787, "grad_norm": 0.6682975888252258, "learning_rate": 0.0002996243928764166, "loss": 3.295, "step": 46500 }, { "epoch": 5.018867924528302, "grad_norm": 0.6478630900382996, "learning_rate": 0.0002993005936319482, "loss": 3.3073, "step": 46550 }, { "epoch": 5.024258760107816, "grad_norm": 0.6761943697929382, "learning_rate": 0.00029897679438747974, "loss": 3.2986, "step": 46600 }, { "epoch": 5.0296495956873315, "grad_norm": 0.7165064215660095, "learning_rate": 0.0002986529951430113, "loss": 3.286, "step": 46650 }, { "epoch": 5.035040431266847, "grad_norm": 0.6782529354095459, "learning_rate": 0.0002983291958985429, "loss": 3.297, "step": 46700 }, { "epoch": 5.040431266846361, "grad_norm": 0.6282607316970825, "learning_rate": 0.00029800539665407444, "loss": 3.2874, "step": 46750 }, { "epoch": 5.045822102425876, "grad_norm": 0.7574204802513123, "learning_rate": 0.000297681597409606, "loss": 3.296, "step": 46800 }, { "epoch": 5.051212938005391, "grad_norm": 0.6350822448730469, "learning_rate": 0.0002973577981651376, "loss": 3.3101, "step": 46850 }, { "epoch": 5.056603773584905, "grad_norm": 0.648804783821106, "learning_rate": 0.00029703399892066915, "loss": 3.3177, "step": 46900 }, { "epoch": 5.061994609164421, "grad_norm": 0.6287740468978882, "learning_rate": 0.00029671019967620076, "loss": 3.3077, "step": 46950 }, { "epoch": 5.067385444743936, "grad_norm": 0.671330988407135, "learning_rate": 0.0002963864004317323, "loss": 3.3139, "step": 47000 }, { "epoch": 5.067385444743936, "eval_accuracy": 0.3809589725609084, "eval_loss": 3.4086132049560547, "eval_runtime": 183.6167, "eval_samples_per_second": 98.09, "eval_steps_per_second": 6.132, "step": 47000 }, { "epoch": 5.07277628032345, "grad_norm": 0.6910858750343323, "learning_rate": 0.00029606260118726386, "loss": 3.2859, "step": 47050 }, { "epoch": 5.078167115902965, "grad_norm": 0.643031120300293, "learning_rate": 0.0002957388019427954, "loss": 3.2986, "step": 47100 }, { "epoch": 5.083557951482479, "grad_norm": 0.7260094881057739, "learning_rate": 0.000295415002698327, "loss": 3.3228, "step": 47150 }, { "epoch": 5.0889487870619945, "grad_norm": 0.6842207908630371, "learning_rate": 0.00029509120345385856, "loss": 3.3173, "step": 47200 }, { "epoch": 5.09433962264151, "grad_norm": 0.6923130750656128, "learning_rate": 0.0002947738801942795, "loss": 3.3, "step": 47250 }, { "epoch": 5.099730458221024, "grad_norm": 0.652860701084137, "learning_rate": 0.0002944500809498111, "loss": 3.3124, "step": 47300 }, { "epoch": 5.105121293800539, "grad_norm": 0.7349758744239807, "learning_rate": 0.00029412628170534267, "loss": 3.314, "step": 47350 }, { "epoch": 5.110512129380054, "grad_norm": 0.6797593235969543, "learning_rate": 0.0002938024824608742, "loss": 3.3092, "step": 47400 }, { "epoch": 5.115902964959568, "grad_norm": 0.7050797939300537, "learning_rate": 0.00029347868321640577, "loss": 3.316, "step": 47450 }, { "epoch": 5.121293800539084, "grad_norm": 0.7074447870254517, "learning_rate": 0.0002931548839719374, "loss": 3.3116, "step": 47500 }, { "epoch": 5.126684636118599, "grad_norm": 0.7114849090576172, "learning_rate": 0.0002928310847274689, "loss": 3.3062, "step": 47550 }, { "epoch": 5.132075471698113, "grad_norm": 0.7198069095611572, "learning_rate": 0.00029250728548300053, "loss": 3.3031, "step": 47600 }, { "epoch": 5.137466307277628, "grad_norm": 0.6557970643043518, "learning_rate": 0.0002921834862385321, "loss": 3.3167, "step": 47650 }, { "epoch": 5.142857142857143, "grad_norm": 0.6805382370948792, "learning_rate": 0.0002918596869940637, "loss": 3.3198, "step": 47700 }, { "epoch": 5.1482479784366575, "grad_norm": 0.6633416414260864, "learning_rate": 0.0002915358877495952, "loss": 3.3215, "step": 47750 }, { "epoch": 5.153638814016173, "grad_norm": 0.6749739646911621, "learning_rate": 0.0002912120885051268, "loss": 3.3283, "step": 47800 }, { "epoch": 5.159029649595688, "grad_norm": 0.7563609480857849, "learning_rate": 0.00029088828926065834, "loss": 3.3213, "step": 47850 }, { "epoch": 5.164420485175202, "grad_norm": 0.7092325687408447, "learning_rate": 0.00029056449001618994, "loss": 3.3062, "step": 47900 }, { "epoch": 5.169811320754717, "grad_norm": 0.7057313919067383, "learning_rate": 0.0002902406907717215, "loss": 3.3222, "step": 47950 }, { "epoch": 5.175202156334231, "grad_norm": 0.6930835247039795, "learning_rate": 0.0002899168915272531, "loss": 3.3123, "step": 48000 }, { "epoch": 5.175202156334231, "eval_accuracy": 0.3812406008461235, "eval_loss": 3.4109530448913574, "eval_runtime": 183.8217, "eval_samples_per_second": 97.981, "eval_steps_per_second": 6.126, "step": 48000 }, { "epoch": 5.180592991913747, "grad_norm": 0.6858886480331421, "learning_rate": 0.00028959309228278465, "loss": 3.3438, "step": 48050 }, { "epoch": 5.185983827493262, "grad_norm": 0.7007202506065369, "learning_rate": 0.00028926929303831625, "loss": 3.3228, "step": 48100 }, { "epoch": 5.191374663072776, "grad_norm": 0.6837854981422424, "learning_rate": 0.0002889454937938478, "loss": 3.3293, "step": 48150 }, { "epoch": 5.196765498652291, "grad_norm": 0.6684266924858093, "learning_rate": 0.00028862169454937935, "loss": 3.3229, "step": 48200 }, { "epoch": 5.202156334231806, "grad_norm": 0.6720783710479736, "learning_rate": 0.0002882978953049109, "loss": 3.3037, "step": 48250 }, { "epoch": 5.2075471698113205, "grad_norm": 0.652512788772583, "learning_rate": 0.0002879740960604425, "loss": 3.3299, "step": 48300 }, { "epoch": 5.212938005390836, "grad_norm": 0.6807644367218018, "learning_rate": 0.00028765029681597406, "loss": 3.317, "step": 48350 }, { "epoch": 5.218328840970351, "grad_norm": 0.6457434892654419, "learning_rate": 0.00028732649757150566, "loss": 3.3284, "step": 48400 }, { "epoch": 5.223719676549865, "grad_norm": 0.7819386720657349, "learning_rate": 0.0002870026983270372, "loss": 3.3175, "step": 48450 }, { "epoch": 5.22911051212938, "grad_norm": 0.6470807790756226, "learning_rate": 0.00028667889908256877, "loss": 3.3181, "step": 48500 }, { "epoch": 5.234501347708895, "grad_norm": 0.6543563604354858, "learning_rate": 0.00028635509983810037, "loss": 3.3024, "step": 48550 }, { "epoch": 5.2398921832884096, "grad_norm": 0.6741393208503723, "learning_rate": 0.0002860313005936319, "loss": 3.3287, "step": 48600 }, { "epoch": 5.245283018867925, "grad_norm": 0.6699310541152954, "learning_rate": 0.00028570750134916347, "loss": 3.3222, "step": 48650 }, { "epoch": 5.250673854447439, "grad_norm": 0.6818548440933228, "learning_rate": 0.0002853837021046951, "loss": 3.3207, "step": 48700 }, { "epoch": 5.256064690026954, "grad_norm": 0.6772786974906921, "learning_rate": 0.00028505990286022663, "loss": 3.3115, "step": 48750 }, { "epoch": 5.261455525606469, "grad_norm": 0.6387109756469727, "learning_rate": 0.0002847361036157582, "loss": 3.3251, "step": 48800 }, { "epoch": 5.2668463611859835, "grad_norm": 0.6841785907745361, "learning_rate": 0.0002844123043712898, "loss": 3.3394, "step": 48850 }, { "epoch": 5.272237196765499, "grad_norm": 0.6936202049255371, "learning_rate": 0.00028408850512682133, "loss": 3.3173, "step": 48900 }, { "epoch": 5.277628032345014, "grad_norm": 0.6657775044441223, "learning_rate": 0.00028376470588235294, "loss": 3.3317, "step": 48950 }, { "epoch": 5.283018867924528, "grad_norm": 0.6841873526573181, "learning_rate": 0.0002834409066378845, "loss": 3.3424, "step": 49000 }, { "epoch": 5.283018867924528, "eval_accuracy": 0.38162577533342584, "eval_loss": 3.40484619140625, "eval_runtime": 183.6772, "eval_samples_per_second": 98.058, "eval_steps_per_second": 6.13, "step": 49000 }, { "epoch": 5.288409703504043, "grad_norm": 0.6730915307998657, "learning_rate": 0.00028311710739341604, "loss": 3.336, "step": 49050 }, { "epoch": 5.293800539083558, "grad_norm": 0.7052405476570129, "learning_rate": 0.00028279330814894764, "loss": 3.3385, "step": 49100 }, { "epoch": 5.2991913746630726, "grad_norm": 0.6705338954925537, "learning_rate": 0.0002824695089044792, "loss": 3.3141, "step": 49150 }, { "epoch": 5.304582210242588, "grad_norm": 0.6564133763313293, "learning_rate": 0.00028214570966001075, "loss": 3.3388, "step": 49200 }, { "epoch": 5.309973045822103, "grad_norm": 0.6649319529533386, "learning_rate": 0.0002818283864004317, "loss": 3.3264, "step": 49250 }, { "epoch": 5.315363881401617, "grad_norm": 0.7245952486991882, "learning_rate": 0.0002815045871559633, "loss": 3.3236, "step": 49300 }, { "epoch": 5.320754716981132, "grad_norm": 0.6721333861351013, "learning_rate": 0.00028118078791149485, "loss": 3.3403, "step": 49350 }, { "epoch": 5.3261455525606465, "grad_norm": 0.6484825015068054, "learning_rate": 0.0002808569886670264, "loss": 3.3165, "step": 49400 }, { "epoch": 5.331536388140162, "grad_norm": 0.6736767292022705, "learning_rate": 0.00028053318942255795, "loss": 3.316, "step": 49450 }, { "epoch": 5.336927223719677, "grad_norm": 0.6363009810447693, "learning_rate": 0.00028020939017808956, "loss": 3.343, "step": 49500 }, { "epoch": 5.342318059299191, "grad_norm": 0.698854923248291, "learning_rate": 0.0002798855909336211, "loss": 3.3392, "step": 49550 }, { "epoch": 5.347708894878706, "grad_norm": 0.6946191787719727, "learning_rate": 0.0002795617916891527, "loss": 3.3336, "step": 49600 }, { "epoch": 5.353099730458221, "grad_norm": 0.6818442940711975, "learning_rate": 0.00027923799244468426, "loss": 3.321, "step": 49650 }, { "epoch": 5.3584905660377355, "grad_norm": 0.6966042518615723, "learning_rate": 0.00027891419320021587, "loss": 3.3464, "step": 49700 }, { "epoch": 5.363881401617251, "grad_norm": 0.6865735054016113, "learning_rate": 0.0002785903939557474, "loss": 3.3428, "step": 49750 }, { "epoch": 5.369272237196766, "grad_norm": 0.6621612310409546, "learning_rate": 0.00027826659471127897, "loss": 3.3162, "step": 49800 }, { "epoch": 5.37466307277628, "grad_norm": 0.6775622963905334, "learning_rate": 0.0002779427954668105, "loss": 3.348, "step": 49850 }, { "epoch": 5.380053908355795, "grad_norm": 0.6511054039001465, "learning_rate": 0.0002776189962223421, "loss": 3.3274, "step": 49900 }, { "epoch": 5.38544474393531, "grad_norm": 0.7020261883735657, "learning_rate": 0.0002772951969778737, "loss": 3.3409, "step": 49950 }, { "epoch": 5.390835579514825, "grad_norm": 0.6863080263137817, "learning_rate": 0.0002769713977334053, "loss": 3.3402, "step": 50000 }, { "epoch": 5.390835579514825, "eval_accuracy": 0.3820112757793916, "eval_loss": 3.4008445739746094, "eval_runtime": 183.6696, "eval_samples_per_second": 98.062, "eval_steps_per_second": 6.131, "step": 50000 }, { "epoch": 5.39622641509434, "grad_norm": 0.6434275507926941, "learning_rate": 0.00027664759848893683, "loss": 3.3234, "step": 50050 }, { "epoch": 5.401617250673855, "grad_norm": 0.7132904529571533, "learning_rate": 0.00027632379924446844, "loss": 3.3187, "step": 50100 }, { "epoch": 5.407008086253369, "grad_norm": 0.6604653000831604, "learning_rate": 0.000276, "loss": 3.3325, "step": 50150 }, { "epoch": 5.412398921832884, "grad_norm": 0.702043890953064, "learning_rate": 0.00027567620075553154, "loss": 3.33, "step": 50200 }, { "epoch": 5.4177897574123985, "grad_norm": 0.6745196580886841, "learning_rate": 0.0002753524015110631, "loss": 3.3392, "step": 50250 }, { "epoch": 5.423180592991914, "grad_norm": 0.6715883612632751, "learning_rate": 0.0002750286022665947, "loss": 3.3213, "step": 50300 }, { "epoch": 5.428571428571429, "grad_norm": 0.6874634623527527, "learning_rate": 0.00027470480302212624, "loss": 3.3519, "step": 50350 }, { "epoch": 5.433962264150943, "grad_norm": 0.6819542050361633, "learning_rate": 0.00027438100377765785, "loss": 3.3322, "step": 50400 }, { "epoch": 5.439353099730458, "grad_norm": 0.7074691653251648, "learning_rate": 0.0002740572045331894, "loss": 3.3365, "step": 50450 }, { "epoch": 5.444743935309973, "grad_norm": 0.7091408967971802, "learning_rate": 0.00027373340528872095, "loss": 3.3268, "step": 50500 }, { "epoch": 5.450134770889488, "grad_norm": 0.6710211634635925, "learning_rate": 0.00027340960604425255, "loss": 3.3326, "step": 50550 }, { "epoch": 5.455525606469003, "grad_norm": 0.6949871182441711, "learning_rate": 0.0002730858067997841, "loss": 3.3433, "step": 50600 }, { "epoch": 5.460916442048518, "grad_norm": 0.7077759504318237, "learning_rate": 0.00027276200755531565, "loss": 3.3538, "step": 50650 }, { "epoch": 5.466307277628032, "grad_norm": 0.7271976470947266, "learning_rate": 0.00027243820831084726, "loss": 3.3146, "step": 50700 }, { "epoch": 5.471698113207547, "grad_norm": 0.6720749735832214, "learning_rate": 0.0002721144090663788, "loss": 3.3346, "step": 50750 }, { "epoch": 5.4770889487870615, "grad_norm": 0.6647154092788696, "learning_rate": 0.00027179060982191036, "loss": 3.3343, "step": 50800 }, { "epoch": 5.482479784366577, "grad_norm": 0.633385419845581, "learning_rate": 0.00027146681057744197, "loss": 3.3266, "step": 50850 }, { "epoch": 5.487870619946092, "grad_norm": 0.7008586525917053, "learning_rate": 0.0002711430113329735, "loss": 3.3252, "step": 50900 }, { "epoch": 5.493261455525606, "grad_norm": 0.6773853898048401, "learning_rate": 0.0002708192120885051, "loss": 3.3414, "step": 50950 }, { "epoch": 5.498652291105121, "grad_norm": 0.6934987306594849, "learning_rate": 0.00027049541284403667, "loss": 3.3439, "step": 51000 }, { "epoch": 5.498652291105121, "eval_accuracy": 0.38229887997343653, "eval_loss": 3.3964803218841553, "eval_runtime": 184.2643, "eval_samples_per_second": 97.745, "eval_steps_per_second": 6.111, "step": 51000 }, { "epoch": 5.504043126684636, "grad_norm": 0.6530951857566833, "learning_rate": 0.0002701716135995683, "loss": 3.3507, "step": 51050 }, { "epoch": 5.509433962264151, "grad_norm": 0.6792340874671936, "learning_rate": 0.0002698478143550998, "loss": 3.3366, "step": 51100 }, { "epoch": 5.514824797843666, "grad_norm": 0.6732293367385864, "learning_rate": 0.0002695240151106314, "loss": 3.3349, "step": 51150 }, { "epoch": 5.520215633423181, "grad_norm": 0.6773748993873596, "learning_rate": 0.00026920021586616293, "loss": 3.3177, "step": 51200 }, { "epoch": 5.525606469002695, "grad_norm": 0.6929314136505127, "learning_rate": 0.00026887641662169453, "loss": 3.338, "step": 51250 }, { "epoch": 5.53099730458221, "grad_norm": 0.6713197231292725, "learning_rate": 0.0002685590933621155, "loss": 3.33, "step": 51300 }, { "epoch": 5.536388140161725, "grad_norm": 0.6788355708122253, "learning_rate": 0.00026823529411764704, "loss": 3.3213, "step": 51350 }, { "epoch": 5.54177897574124, "grad_norm": 0.7311290502548218, "learning_rate": 0.0002679114948731786, "loss": 3.3131, "step": 51400 }, { "epoch": 5.547169811320755, "grad_norm": 0.6731926202774048, "learning_rate": 0.0002675876956287102, "loss": 3.3122, "step": 51450 }, { "epoch": 5.55256064690027, "grad_norm": 0.7098152041435242, "learning_rate": 0.00026726389638424174, "loss": 3.3427, "step": 51500 }, { "epoch": 5.557951482479784, "grad_norm": 0.7369858026504517, "learning_rate": 0.0002669400971397733, "loss": 3.3459, "step": 51550 }, { "epoch": 5.563342318059299, "grad_norm": 0.6638474464416504, "learning_rate": 0.0002666162978953049, "loss": 3.3208, "step": 51600 }, { "epoch": 5.568733153638814, "grad_norm": 0.6835918426513672, "learning_rate": 0.00026629249865083645, "loss": 3.3362, "step": 51650 }, { "epoch": 5.574123989218329, "grad_norm": 0.7660952806472778, "learning_rate": 0.00026596869940636805, "loss": 3.3392, "step": 51700 }, { "epoch": 5.579514824797844, "grad_norm": 0.6250906586647034, "learning_rate": 0.0002656449001618996, "loss": 3.3312, "step": 51750 }, { "epoch": 5.584905660377358, "grad_norm": 0.6872050762176514, "learning_rate": 0.0002653211009174312, "loss": 3.3224, "step": 51800 }, { "epoch": 5.590296495956873, "grad_norm": 0.7721320986747742, "learning_rate": 0.0002649973016729627, "loss": 3.333, "step": 51850 }, { "epoch": 5.595687331536388, "grad_norm": 0.6676695346832275, "learning_rate": 0.0002646735024284943, "loss": 3.3208, "step": 51900 }, { "epoch": 5.601078167115903, "grad_norm": 0.7069310545921326, "learning_rate": 0.00026434970318402586, "loss": 3.3394, "step": 51950 }, { "epoch": 5.606469002695418, "grad_norm": 0.6760640144348145, "learning_rate": 0.00026402590393955746, "loss": 3.3189, "step": 52000 }, { "epoch": 5.606469002695418, "eval_accuracy": 0.3827234954590155, "eval_loss": 3.3922245502471924, "eval_runtime": 183.5334, "eval_samples_per_second": 98.135, "eval_steps_per_second": 6.135, "step": 52000 }, { "epoch": 5.611859838274933, "grad_norm": 0.7278321981430054, "learning_rate": 0.000263702104695089, "loss": 3.335, "step": 52050 }, { "epoch": 5.617250673854447, "grad_norm": 0.7073381543159485, "learning_rate": 0.0002633783054506206, "loss": 3.3279, "step": 52100 }, { "epoch": 5.622641509433962, "grad_norm": 0.7076802253723145, "learning_rate": 0.00026305450620615217, "loss": 3.3472, "step": 52150 }, { "epoch": 5.628032345013477, "grad_norm": 0.7682281732559204, "learning_rate": 0.0002627307069616837, "loss": 3.3518, "step": 52200 }, { "epoch": 5.633423180592992, "grad_norm": 0.6639510989189148, "learning_rate": 0.0002624069077172153, "loss": 3.3299, "step": 52250 }, { "epoch": 5.638814016172507, "grad_norm": 0.6877766847610474, "learning_rate": 0.0002620831084727469, "loss": 3.3161, "step": 52300 }, { "epoch": 5.644204851752022, "grad_norm": 0.6921620965003967, "learning_rate": 0.0002617593092282784, "loss": 3.3437, "step": 52350 }, { "epoch": 5.649595687331536, "grad_norm": 0.6771333813667297, "learning_rate": 0.00026143550998381003, "loss": 3.3196, "step": 52400 }, { "epoch": 5.654986522911051, "grad_norm": 0.7312127351760864, "learning_rate": 0.0002611117107393416, "loss": 3.3373, "step": 52450 }, { "epoch": 5.660377358490566, "grad_norm": 0.7787958383560181, "learning_rate": 0.00026078791149487313, "loss": 3.3444, "step": 52500 }, { "epoch": 5.665768194070081, "grad_norm": 0.6783169507980347, "learning_rate": 0.00026046411225040474, "loss": 3.3308, "step": 52550 }, { "epoch": 5.671159029649596, "grad_norm": 0.6774412393569946, "learning_rate": 0.0002601403130059363, "loss": 3.3452, "step": 52600 }, { "epoch": 5.67654986522911, "grad_norm": 0.7152056097984314, "learning_rate": 0.0002598165137614679, "loss": 3.3428, "step": 52650 }, { "epoch": 5.681940700808625, "grad_norm": 0.7162232398986816, "learning_rate": 0.00025949271451699944, "loss": 3.3342, "step": 52700 }, { "epoch": 5.6873315363881405, "grad_norm": 0.7322828769683838, "learning_rate": 0.000259168915272531, "loss": 3.3471, "step": 52750 }, { "epoch": 5.692722371967655, "grad_norm": 0.7306454181671143, "learning_rate": 0.0002588451160280626, "loss": 3.3263, "step": 52800 }, { "epoch": 5.69811320754717, "grad_norm": 0.7096463441848755, "learning_rate": 0.00025852131678359415, "loss": 3.326, "step": 52850 }, { "epoch": 5.703504043126685, "grad_norm": 0.7072291374206543, "learning_rate": 0.0002581975175391257, "loss": 3.3271, "step": 52900 }, { "epoch": 5.708894878706199, "grad_norm": 0.7425927519798279, "learning_rate": 0.0002578737182946573, "loss": 3.343, "step": 52950 }, { "epoch": 5.714285714285714, "grad_norm": 0.7667298316955566, "learning_rate": 0.00025754991905018885, "loss": 3.3341, "step": 53000 }, { "epoch": 5.714285714285714, "eval_accuracy": 0.38343777954350794, "eval_loss": 3.385892391204834, "eval_runtime": 183.7702, "eval_samples_per_second": 98.008, "eval_steps_per_second": 6.127, "step": 53000 }, { "epoch": 5.719676549865229, "grad_norm": 0.666398286819458, "learning_rate": 0.00025722611980572046, "loss": 3.3137, "step": 53050 }, { "epoch": 5.725067385444744, "grad_norm": 0.7412814497947693, "learning_rate": 0.000256902320561252, "loss": 3.327, "step": 53100 }, { "epoch": 5.730458221024259, "grad_norm": 0.7019506692886353, "learning_rate": 0.00025657852131678356, "loss": 3.3345, "step": 53150 }, { "epoch": 5.735849056603773, "grad_norm": 0.7068491578102112, "learning_rate": 0.0002562547220723151, "loss": 3.3335, "step": 53200 }, { "epoch": 5.741239892183288, "grad_norm": 0.6603403091430664, "learning_rate": 0.0002559309228278467, "loss": 3.3237, "step": 53250 }, { "epoch": 5.7466307277628035, "grad_norm": 0.6823389530181885, "learning_rate": 0.00025561359956826767, "loss": 3.3499, "step": 53300 }, { "epoch": 5.752021563342318, "grad_norm": 0.6933403611183167, "learning_rate": 0.0002552898003237992, "loss": 3.327, "step": 53350 }, { "epoch": 5.757412398921833, "grad_norm": 0.6974020004272461, "learning_rate": 0.0002549660010793308, "loss": 3.3137, "step": 53400 }, { "epoch": 5.762803234501348, "grad_norm": 0.6720824241638184, "learning_rate": 0.0002546422018348624, "loss": 3.3461, "step": 53450 }, { "epoch": 5.768194070080862, "grad_norm": 0.7010537385940552, "learning_rate": 0.0002543184025903939, "loss": 3.3161, "step": 53500 }, { "epoch": 5.773584905660377, "grad_norm": 0.6914909482002258, "learning_rate": 0.0002539946033459255, "loss": 3.3437, "step": 53550 }, { "epoch": 5.7789757412398925, "grad_norm": 0.6911107301712036, "learning_rate": 0.0002536708041014571, "loss": 3.3169, "step": 53600 }, { "epoch": 5.784366576819407, "grad_norm": 0.6994060277938843, "learning_rate": 0.00025334700485698863, "loss": 3.3496, "step": 53650 }, { "epoch": 5.789757412398922, "grad_norm": 0.7192985415458679, "learning_rate": 0.00025302320561252023, "loss": 3.3223, "step": 53700 }, { "epoch": 5.795148247978437, "grad_norm": 0.7265368103981018, "learning_rate": 0.0002526994063680518, "loss": 3.3361, "step": 53750 }, { "epoch": 5.800539083557951, "grad_norm": 0.7237759232521057, "learning_rate": 0.0002523756071235834, "loss": 3.329, "step": 53800 }, { "epoch": 5.8059299191374665, "grad_norm": 0.698350191116333, "learning_rate": 0.00025205180787911494, "loss": 3.3318, "step": 53850 }, { "epoch": 5.811320754716981, "grad_norm": 0.6901752948760986, "learning_rate": 0.0002517280086346465, "loss": 3.3252, "step": 53900 }, { "epoch": 5.816711590296496, "grad_norm": 0.6650895476341248, "learning_rate": 0.00025140420939017804, "loss": 3.3167, "step": 53950 }, { "epoch": 5.822102425876011, "grad_norm": 0.6690970063209534, "learning_rate": 0.00025108041014570965, "loss": 3.3311, "step": 54000 }, { "epoch": 5.822102425876011, "eval_accuracy": 0.384017551352886, "eval_loss": 3.3810160160064697, "eval_runtime": 183.9693, "eval_samples_per_second": 97.902, "eval_steps_per_second": 6.121, "step": 54000 }, { "epoch": 5.827493261455525, "grad_norm": 0.6725430488586426, "learning_rate": 0.0002507566109012412, "loss": 3.3441, "step": 54050 }, { "epoch": 5.83288409703504, "grad_norm": 0.6652386784553528, "learning_rate": 0.0002504328116567728, "loss": 3.3496, "step": 54100 }, { "epoch": 5.8382749326145555, "grad_norm": 0.7626335620880127, "learning_rate": 0.00025010901241230435, "loss": 3.3252, "step": 54150 }, { "epoch": 5.84366576819407, "grad_norm": 0.6746981143951416, "learning_rate": 0.0002497852131678359, "loss": 3.3347, "step": 54200 }, { "epoch": 5.849056603773585, "grad_norm": 0.7410542368888855, "learning_rate": 0.0002494614139233675, "loss": 3.3627, "step": 54250 }, { "epoch": 5.8544474393531, "grad_norm": 0.6656700372695923, "learning_rate": 0.00024913761467889906, "loss": 3.3415, "step": 54300 }, { "epoch": 5.859838274932614, "grad_norm": 0.6493524312973022, "learning_rate": 0.0002488138154344306, "loss": 3.3432, "step": 54350 }, { "epoch": 5.8652291105121295, "grad_norm": 0.7260926961898804, "learning_rate": 0.0002484900161899622, "loss": 3.3449, "step": 54400 }, { "epoch": 5.870619946091644, "grad_norm": 0.7246960401535034, "learning_rate": 0.00024816621694549376, "loss": 3.3164, "step": 54450 }, { "epoch": 5.876010781671159, "grad_norm": 0.7035079002380371, "learning_rate": 0.00024784241770102537, "loss": 3.3455, "step": 54500 }, { "epoch": 5.881401617250674, "grad_norm": 0.6615075469017029, "learning_rate": 0.0002475186184565569, "loss": 3.332, "step": 54550 }, { "epoch": 5.886792452830189, "grad_norm": 0.6515076160430908, "learning_rate": 0.00024719481921208847, "loss": 3.3244, "step": 54600 }, { "epoch": 5.892183288409703, "grad_norm": 0.6946157813072205, "learning_rate": 0.0002468710199676201, "loss": 3.3463, "step": 54650 }, { "epoch": 5.8975741239892185, "grad_norm": 0.6916877627372742, "learning_rate": 0.0002465472207231516, "loss": 3.3441, "step": 54700 }, { "epoch": 5.902964959568733, "grad_norm": 0.7089888453483582, "learning_rate": 0.0002462234214786832, "loss": 3.3255, "step": 54750 }, { "epoch": 5.908355795148248, "grad_norm": 0.6872110366821289, "learning_rate": 0.0002458996222342148, "loss": 3.3591, "step": 54800 }, { "epoch": 5.913746630727763, "grad_norm": 0.692903459072113, "learning_rate": 0.00024557582298974633, "loss": 3.3261, "step": 54850 }, { "epoch": 5.919137466307277, "grad_norm": 0.696246862411499, "learning_rate": 0.0002452520237452779, "loss": 3.3555, "step": 54900 }, { "epoch": 5.9245283018867925, "grad_norm": 0.6855026483535767, "learning_rate": 0.0002449282245008095, "loss": 3.3346, "step": 54950 }, { "epoch": 5.929919137466308, "grad_norm": 0.736362099647522, "learning_rate": 0.00024460442525634104, "loss": 3.3347, "step": 55000 }, { "epoch": 5.929919137466308, "eval_accuracy": 0.38405981732624583, "eval_loss": 3.3812179565429688, "eval_runtime": 183.7282, "eval_samples_per_second": 98.031, "eval_steps_per_second": 6.129, "step": 55000 }, { "epoch": 5.935309973045822, "grad_norm": 0.702866792678833, "learning_rate": 0.00024428062601187264, "loss": 3.3561, "step": 55050 }, { "epoch": 5.940700808625337, "grad_norm": 0.7051444053649902, "learning_rate": 0.0002439568267674042, "loss": 3.3295, "step": 55100 }, { "epoch": 5.946091644204852, "grad_norm": 0.7245047092437744, "learning_rate": 0.00024363302752293574, "loss": 3.3433, "step": 55150 }, { "epoch": 5.951482479784366, "grad_norm": 0.7037590742111206, "learning_rate": 0.00024330922827846732, "loss": 3.3271, "step": 55200 }, { "epoch": 5.9568733153638815, "grad_norm": 0.6878694295883179, "learning_rate": 0.0002429854290339989, "loss": 3.3147, "step": 55250 }, { "epoch": 5.962264150943396, "grad_norm": 0.6708479523658752, "learning_rate": 0.00024266810577441985, "loss": 3.3289, "step": 55300 }, { "epoch": 5.967654986522911, "grad_norm": 0.701370358467102, "learning_rate": 0.00024234430652995143, "loss": 3.3266, "step": 55350 }, { "epoch": 5.973045822102426, "grad_norm": 0.6717433929443359, "learning_rate": 0.00024202050728548298, "loss": 3.3393, "step": 55400 }, { "epoch": 5.97843665768194, "grad_norm": 0.6901810169219971, "learning_rate": 0.00024169670804101456, "loss": 3.3462, "step": 55450 }, { "epoch": 5.9838274932614555, "grad_norm": 0.6869121789932251, "learning_rate": 0.0002413729087965461, "loss": 3.3139, "step": 55500 }, { "epoch": 5.989218328840971, "grad_norm": 0.7031119465827942, "learning_rate": 0.00024104910955207768, "loss": 3.336, "step": 55550 }, { "epoch": 5.994609164420485, "grad_norm": 0.6958100199699402, "learning_rate": 0.00024072531030760926, "loss": 3.3504, "step": 55600 }, { "epoch": 6.0, "grad_norm": 1.451558232307434, "learning_rate": 0.00024040151106314084, "loss": 3.3206, "step": 55650 }, { "epoch": 6.005390835579515, "grad_norm": 0.6931526064872742, "learning_rate": 0.0002400777118186724, "loss": 3.2332, "step": 55700 }, { "epoch": 6.010781671159029, "grad_norm": 0.7126358151435852, "learning_rate": 0.00023975391257420397, "loss": 3.2352, "step": 55750 }, { "epoch": 6.0161725067385445, "grad_norm": 0.7339010238647461, "learning_rate": 0.00023943011332973555, "loss": 3.2684, "step": 55800 }, { "epoch": 6.02156334231806, "grad_norm": 0.7038446068763733, "learning_rate": 0.00023910631408526712, "loss": 3.2338, "step": 55850 }, { "epoch": 6.026954177897574, "grad_norm": 0.7177115082740784, "learning_rate": 0.0002387825148407987, "loss": 3.2345, "step": 55900 }, { "epoch": 6.032345013477089, "grad_norm": 0.7070705890655518, "learning_rate": 0.00023845871559633025, "loss": 3.2623, "step": 55950 }, { "epoch": 6.037735849056604, "grad_norm": 0.7214942574501038, "learning_rate": 0.0002381349163518618, "loss": 3.2357, "step": 56000 }, { "epoch": 6.037735849056604, "eval_accuracy": 0.38422996774856333, "eval_loss": 3.3826565742492676, "eval_runtime": 183.9778, "eval_samples_per_second": 97.898, "eval_steps_per_second": 6.12, "step": 56000 }, { "epoch": 6.0431266846361185, "grad_norm": 0.6994975209236145, "learning_rate": 0.00023781111710739338, "loss": 3.2527, "step": 56050 }, { "epoch": 6.048517520215634, "grad_norm": 0.6794683933258057, "learning_rate": 0.00023748731786292496, "loss": 3.2424, "step": 56100 }, { "epoch": 6.053908355795148, "grad_norm": 0.6963556408882141, "learning_rate": 0.00023716351861845654, "loss": 3.2607, "step": 56150 }, { "epoch": 6.059299191374663, "grad_norm": 0.673784613609314, "learning_rate": 0.0002368397193739881, "loss": 3.2584, "step": 56200 }, { "epoch": 6.064690026954178, "grad_norm": 0.7150440812110901, "learning_rate": 0.0002365159201295197, "loss": 3.2551, "step": 56250 }, { "epoch": 6.070080862533692, "grad_norm": 0.7197666764259338, "learning_rate": 0.00023619212088505127, "loss": 3.2584, "step": 56300 }, { "epoch": 6.0754716981132075, "grad_norm": 0.6947667002677917, "learning_rate": 0.0002358683216405828, "loss": 3.2609, "step": 56350 }, { "epoch": 6.080862533692723, "grad_norm": 0.7144685983657837, "learning_rate": 0.00023554452239611437, "loss": 3.2739, "step": 56400 }, { "epoch": 6.086253369272237, "grad_norm": 0.7323464155197144, "learning_rate": 0.00023522072315164595, "loss": 3.2539, "step": 56450 }, { "epoch": 6.091644204851752, "grad_norm": 0.7219662070274353, "learning_rate": 0.00023489692390717752, "loss": 3.2484, "step": 56500 }, { "epoch": 6.097035040431267, "grad_norm": 0.7247258424758911, "learning_rate": 0.0002345731246627091, "loss": 3.2491, "step": 56550 }, { "epoch": 6.1024258760107815, "grad_norm": 0.761326014995575, "learning_rate": 0.00023424932541824068, "loss": 3.2653, "step": 56600 }, { "epoch": 6.107816711590297, "grad_norm": 0.706102728843689, "learning_rate": 0.00023392552617377226, "loss": 3.2608, "step": 56650 }, { "epoch": 6.113207547169812, "grad_norm": 0.6789826154708862, "learning_rate": 0.00023360172692930384, "loss": 3.2632, "step": 56700 }, { "epoch": 6.118598382749326, "grad_norm": 0.7313393354415894, "learning_rate": 0.00023327792768483539, "loss": 3.2674, "step": 56750 }, { "epoch": 6.123989218328841, "grad_norm": 0.7755120992660522, "learning_rate": 0.0002329606044252563, "loss": 3.2678, "step": 56800 }, { "epoch": 6.129380053908355, "grad_norm": 0.6921339631080627, "learning_rate": 0.0002326368051807879, "loss": 3.2611, "step": 56850 }, { "epoch": 6.1347708894878705, "grad_norm": 0.6921306252479553, "learning_rate": 0.00023231300593631947, "loss": 3.2698, "step": 56900 }, { "epoch": 6.140161725067386, "grad_norm": 0.7285143733024597, "learning_rate": 0.00023198920669185104, "loss": 3.2675, "step": 56950 }, { "epoch": 6.1455525606469, "grad_norm": 0.7773783802986145, "learning_rate": 0.00023166540744738262, "loss": 3.2576, "step": 57000 }, { "epoch": 6.1455525606469, "eval_accuracy": 0.3845002961334457, "eval_loss": 3.383111000061035, "eval_runtime": 185.0654, "eval_samples_per_second": 97.322, "eval_steps_per_second": 6.084, "step": 57000 }, { "epoch": 6.150943396226415, "grad_norm": 0.724994957447052, "learning_rate": 0.0002313416082029142, "loss": 3.2746, "step": 57050 }, { "epoch": 6.15633423180593, "grad_norm": 0.69971764087677, "learning_rate": 0.00023101780895844572, "loss": 3.278, "step": 57100 }, { "epoch": 6.1617250673854445, "grad_norm": 0.7013365626335144, "learning_rate": 0.0002306940097139773, "loss": 3.2754, "step": 57150 }, { "epoch": 6.16711590296496, "grad_norm": 0.7439168095588684, "learning_rate": 0.00023037021046950888, "loss": 3.2606, "step": 57200 }, { "epoch": 6.172506738544475, "grad_norm": 0.6989354491233826, "learning_rate": 0.00023004641122504046, "loss": 3.2701, "step": 57250 }, { "epoch": 6.177897574123989, "grad_norm": 0.7361764311790466, "learning_rate": 0.00022972261198057203, "loss": 3.2753, "step": 57300 }, { "epoch": 6.183288409703504, "grad_norm": 0.7835276126861572, "learning_rate": 0.0002293988127361036, "loss": 3.281, "step": 57350 }, { "epoch": 6.188679245283019, "grad_norm": 0.7380451560020447, "learning_rate": 0.00022907501349163516, "loss": 3.2676, "step": 57400 }, { "epoch": 6.1940700808625335, "grad_norm": 0.7116109132766724, "learning_rate": 0.00022875121424716674, "loss": 3.2601, "step": 57450 }, { "epoch": 6.199460916442049, "grad_norm": 0.7472301125526428, "learning_rate": 0.00022842741500269832, "loss": 3.2762, "step": 57500 }, { "epoch": 6.204851752021563, "grad_norm": 0.7332467436790466, "learning_rate": 0.00022810361575822987, "loss": 3.2991, "step": 57550 }, { "epoch": 6.210242587601078, "grad_norm": 0.6753149032592773, "learning_rate": 0.00022777981651376145, "loss": 3.2702, "step": 57600 }, { "epoch": 6.215633423180593, "grad_norm": 0.7721616625785828, "learning_rate": 0.00022745601726929302, "loss": 3.2752, "step": 57650 }, { "epoch": 6.2210242587601075, "grad_norm": 0.7246688604354858, "learning_rate": 0.00022713221802482457, "loss": 3.2696, "step": 57700 }, { "epoch": 6.226415094339623, "grad_norm": 0.6795578002929688, "learning_rate": 0.00022680841878035615, "loss": 3.2596, "step": 57750 }, { "epoch": 6.231805929919138, "grad_norm": 0.7370179891586304, "learning_rate": 0.00022648461953588773, "loss": 3.2688, "step": 57800 }, { "epoch": 6.237196765498652, "grad_norm": 0.6983503699302673, "learning_rate": 0.0002261608202914193, "loss": 3.2753, "step": 57850 }, { "epoch": 6.242587601078167, "grad_norm": 0.7345255613327026, "learning_rate": 0.00022583702104695088, "loss": 3.2752, "step": 57900 }, { "epoch": 6.247978436657682, "grad_norm": 0.7167491316795349, "learning_rate": 0.00022551322180248246, "loss": 3.2838, "step": 57950 }, { "epoch": 6.2533692722371965, "grad_norm": 0.7173773646354675, "learning_rate": 0.00022518942255801399, "loss": 3.2761, "step": 58000 }, { "epoch": 6.2533692722371965, "eval_accuracy": 0.38490709254542316, "eval_loss": 3.379185914993286, "eval_runtime": 184.7659, "eval_samples_per_second": 97.48, "eval_steps_per_second": 6.094, "step": 58000 }, { "epoch": 6.258760107816712, "grad_norm": 0.7806297540664673, "learning_rate": 0.00022486562331354556, "loss": 3.268, "step": 58050 }, { "epoch": 6.264150943396227, "grad_norm": 0.7050454616546631, "learning_rate": 0.00022454182406907714, "loss": 3.2624, "step": 58100 }, { "epoch": 6.269541778975741, "grad_norm": 0.6826767325401306, "learning_rate": 0.00022421802482460872, "loss": 3.2622, "step": 58150 }, { "epoch": 6.274932614555256, "grad_norm": 0.7257795333862305, "learning_rate": 0.0002238942255801403, "loss": 3.2999, "step": 58200 }, { "epoch": 6.280323450134771, "grad_norm": 0.7288961410522461, "learning_rate": 0.00022357042633567187, "loss": 3.2753, "step": 58250 }, { "epoch": 6.285714285714286, "grad_norm": 0.7374154925346375, "learning_rate": 0.00022324662709120345, "loss": 3.262, "step": 58300 }, { "epoch": 6.291105121293801, "grad_norm": 0.7150074243545532, "learning_rate": 0.00022292282784673503, "loss": 3.2877, "step": 58350 }, { "epoch": 6.296495956873315, "grad_norm": 0.7542361617088318, "learning_rate": 0.00022259902860226655, "loss": 3.2719, "step": 58400 }, { "epoch": 6.30188679245283, "grad_norm": 0.7025394439697266, "learning_rate": 0.00022227522935779813, "loss": 3.2783, "step": 58450 }, { "epoch": 6.307277628032345, "grad_norm": 0.7773345708847046, "learning_rate": 0.0002219514301133297, "loss": 3.2935, "step": 58500 }, { "epoch": 6.3126684636118595, "grad_norm": 0.7307718396186829, "learning_rate": 0.00022162763086886129, "loss": 3.2527, "step": 58550 }, { "epoch": 6.318059299191375, "grad_norm": 0.7027918696403503, "learning_rate": 0.00022130383162439286, "loss": 3.2868, "step": 58600 }, { "epoch": 6.32345013477089, "grad_norm": 0.7043139934539795, "learning_rate": 0.00022098003237992444, "loss": 3.2659, "step": 58650 }, { "epoch": 6.328840970350404, "grad_norm": 0.6800243854522705, "learning_rate": 0.00022065623313545602, "loss": 3.2702, "step": 58700 }, { "epoch": 6.334231805929919, "grad_norm": 0.7141492962837219, "learning_rate": 0.00022033243389098757, "loss": 3.2907, "step": 58750 }, { "epoch": 6.339622641509434, "grad_norm": 0.7111701369285583, "learning_rate": 0.00022000863464651915, "loss": 3.2788, "step": 58800 }, { "epoch": 6.345013477088949, "grad_norm": 0.7029032111167908, "learning_rate": 0.0002196848354020507, "loss": 3.2843, "step": 58850 }, { "epoch": 6.350404312668464, "grad_norm": 0.7716480493545532, "learning_rate": 0.00021936103615758227, "loss": 3.29, "step": 58900 }, { "epoch": 6.355795148247978, "grad_norm": 0.6854252815246582, "learning_rate": 0.00021903723691311385, "loss": 3.3005, "step": 58950 }, { "epoch": 6.361185983827493, "grad_norm": 0.6938695311546326, "learning_rate": 0.00021871343766864543, "loss": 3.289, "step": 59000 }, { "epoch": 6.361185983827493, "eval_accuracy": 0.3852709710667139, "eval_loss": 3.37485408782959, "eval_runtime": 185.1638, "eval_samples_per_second": 97.271, "eval_steps_per_second": 6.081, "step": 59000 }, { "epoch": 6.366576819407008, "grad_norm": 0.7637062668800354, "learning_rate": 0.00021838963842417698, "loss": 3.2879, "step": 59050 }, { "epoch": 6.3719676549865225, "grad_norm": 0.71692955493927, "learning_rate": 0.00021806583917970856, "loss": 3.2819, "step": 59100 }, { "epoch": 6.377358490566038, "grad_norm": 0.7055894732475281, "learning_rate": 0.00021774203993524014, "loss": 3.2831, "step": 59150 }, { "epoch": 6.382749326145553, "grad_norm": 0.6954150199890137, "learning_rate": 0.00021741824069077171, "loss": 3.2762, "step": 59200 }, { "epoch": 6.388140161725067, "grad_norm": 0.7507545948028564, "learning_rate": 0.00021709444144630326, "loss": 3.2795, "step": 59250 }, { "epoch": 6.393530997304582, "grad_norm": 0.7440924644470215, "learning_rate": 0.00021677064220183484, "loss": 3.2736, "step": 59300 }, { "epoch": 6.398921832884097, "grad_norm": 0.7490218281745911, "learning_rate": 0.00021644684295736642, "loss": 3.2633, "step": 59350 }, { "epoch": 6.404312668463612, "grad_norm": 0.7519710659980774, "learning_rate": 0.00021612304371289797, "loss": 3.2532, "step": 59400 }, { "epoch": 6.409703504043127, "grad_norm": 0.7268811464309692, "learning_rate": 0.00021579924446842955, "loss": 3.2831, "step": 59450 }, { "epoch": 6.415094339622642, "grad_norm": 0.7365671396255493, "learning_rate": 0.00021547544522396113, "loss": 3.2734, "step": 59500 }, { "epoch": 6.420485175202156, "grad_norm": 0.784694492816925, "learning_rate": 0.0002151516459794927, "loss": 3.2898, "step": 59550 }, { "epoch": 6.425876010781671, "grad_norm": 0.7644874453544617, "learning_rate": 0.00021482784673502428, "loss": 3.2836, "step": 59600 }, { "epoch": 6.431266846361186, "grad_norm": 0.7010469436645508, "learning_rate": 0.00021450404749055586, "loss": 3.2646, "step": 59650 }, { "epoch": 6.436657681940701, "grad_norm": 0.7214942574501038, "learning_rate": 0.00021418024824608738, "loss": 3.2619, "step": 59700 }, { "epoch": 6.442048517520216, "grad_norm": 0.7231842875480652, "learning_rate": 0.00021385644900161896, "loss": 3.278, "step": 59750 }, { "epoch": 6.44743935309973, "grad_norm": 0.728193998336792, "learning_rate": 0.00021353264975715054, "loss": 3.2837, "step": 59800 }, { "epoch": 6.452830188679245, "grad_norm": 0.768265426158905, "learning_rate": 0.00021320885051268211, "loss": 3.2929, "step": 59850 }, { "epoch": 6.45822102425876, "grad_norm": 0.7742714285850525, "learning_rate": 0.0002128850512682137, "loss": 3.2794, "step": 59900 }, { "epoch": 6.463611859838275, "grad_norm": 0.6836204528808594, "learning_rate": 0.00021256125202374527, "loss": 3.2765, "step": 59950 }, { "epoch": 6.46900269541779, "grad_norm": 0.9617937207221985, "learning_rate": 0.00021223745277927685, "loss": 3.2893, "step": 60000 }, { "epoch": 6.46900269541779, "eval_accuracy": 0.38564234663726377, "eval_loss": 3.3716087341308594, "eval_runtime": 184.7027, "eval_samples_per_second": 97.513, "eval_steps_per_second": 6.096, "step": 60000 }, { "epoch": 6.474393530997305, "grad_norm": 0.7370874881744385, "learning_rate": 0.00021191365353480843, "loss": 3.2907, "step": 60050 }, { "epoch": 6.479784366576819, "grad_norm": 0.7189035415649414, "learning_rate": 0.00021158985429033995, "loss": 3.2801, "step": 60100 }, { "epoch": 6.485175202156334, "grad_norm": 0.7072881460189819, "learning_rate": 0.00021126605504587153, "loss": 3.2779, "step": 60150 }, { "epoch": 6.490566037735849, "grad_norm": 0.711500883102417, "learning_rate": 0.0002109422558014031, "loss": 3.2626, "step": 60200 }, { "epoch": 6.495956873315364, "grad_norm": 0.7228755950927734, "learning_rate": 0.00021061845655693468, "loss": 3.2895, "step": 60250 }, { "epoch": 6.501347708894879, "grad_norm": 0.7080203890800476, "learning_rate": 0.00021029465731246626, "loss": 3.2776, "step": 60300 }, { "epoch": 6.506738544474393, "grad_norm": 0.7516891360282898, "learning_rate": 0.00020997085806799784, "loss": 3.2753, "step": 60350 }, { "epoch": 6.512129380053908, "grad_norm": 0.6993798017501831, "learning_rate": 0.0002096470588235294, "loss": 3.2688, "step": 60400 }, { "epoch": 6.517520215633423, "grad_norm": 0.7494493126869202, "learning_rate": 0.00020932325957906097, "loss": 3.2716, "step": 60450 }, { "epoch": 6.5229110512129385, "grad_norm": 0.7326953411102295, "learning_rate": 0.00020899946033459254, "loss": 3.2797, "step": 60500 }, { "epoch": 6.528301886792453, "grad_norm": 0.7331379652023315, "learning_rate": 0.0002086756610901241, "loss": 3.272, "step": 60550 }, { "epoch": 6.533692722371968, "grad_norm": 0.7465304136276245, "learning_rate": 0.00020835186184565567, "loss": 3.2809, "step": 60600 }, { "epoch": 6.539083557951482, "grad_norm": 0.6983660459518433, "learning_rate": 0.00020802806260118725, "loss": 3.283, "step": 60650 }, { "epoch": 6.544474393530997, "grad_norm": 0.7236700654029846, "learning_rate": 0.00020770426335671883, "loss": 3.2601, "step": 60700 }, { "epoch": 6.549865229110512, "grad_norm": 0.794366717338562, "learning_rate": 0.00020738694009713975, "loss": 3.2861, "step": 60750 }, { "epoch": 6.555256064690027, "grad_norm": 0.7264977097511292, "learning_rate": 0.00020706314085267133, "loss": 3.3007, "step": 60800 }, { "epoch": 6.560646900269542, "grad_norm": 0.7221018075942993, "learning_rate": 0.00020673934160820288, "loss": 3.2797, "step": 60850 }, { "epoch": 6.566037735849057, "grad_norm": 0.7276595830917358, "learning_rate": 0.00020641554236373446, "loss": 3.2759, "step": 60900 }, { "epoch": 6.571428571428571, "grad_norm": 0.7512069940567017, "learning_rate": 0.00020609174311926604, "loss": 3.2959, "step": 60950 }, { "epoch": 6.576819407008086, "grad_norm": 0.7264222502708435, "learning_rate": 0.0002057679438747976, "loss": 3.2819, "step": 61000 }, { "epoch": 6.576819407008086, "eval_accuracy": 0.38633196651622226, "eval_loss": 3.366112470626831, "eval_runtime": 184.763, "eval_samples_per_second": 97.482, "eval_steps_per_second": 6.094, "step": 61000 }, { "epoch": 6.5822102425876015, "grad_norm": 0.7526605725288391, "learning_rate": 0.00020544414463032916, "loss": 3.2952, "step": 61050 }, { "epoch": 6.587601078167116, "grad_norm": 0.7065187692642212, "learning_rate": 0.00020512034538586074, "loss": 3.263, "step": 61100 }, { "epoch": 6.592991913746631, "grad_norm": 0.767744779586792, "learning_rate": 0.00020479654614139232, "loss": 3.3016, "step": 61150 }, { "epoch": 6.598382749326145, "grad_norm": 0.7380595803260803, "learning_rate": 0.0002044727468969239, "loss": 3.2851, "step": 61200 }, { "epoch": 6.60377358490566, "grad_norm": 0.6881290674209595, "learning_rate": 0.00020414894765245547, "loss": 3.2706, "step": 61250 }, { "epoch": 6.609164420485175, "grad_norm": 0.6972199082374573, "learning_rate": 0.00020382514840798702, "loss": 3.2987, "step": 61300 }, { "epoch": 6.6145552560646905, "grad_norm": 0.7250159382820129, "learning_rate": 0.0002035013491635186, "loss": 3.2942, "step": 61350 }, { "epoch": 6.619946091644205, "grad_norm": 0.7611603736877441, "learning_rate": 0.00020317754991905015, "loss": 3.2798, "step": 61400 }, { "epoch": 6.62533692722372, "grad_norm": 0.7625829577445984, "learning_rate": 0.00020285375067458173, "loss": 3.2658, "step": 61450 }, { "epoch": 6.630727762803234, "grad_norm": 0.7896705865859985, "learning_rate": 0.0002025299514301133, "loss": 3.2919, "step": 61500 }, { "epoch": 6.636118598382749, "grad_norm": 0.75810706615448, "learning_rate": 0.00020220615218564489, "loss": 3.278, "step": 61550 }, { "epoch": 6.6415094339622645, "grad_norm": 0.7625492215156555, "learning_rate": 0.00020188235294117646, "loss": 3.2772, "step": 61600 }, { "epoch": 6.646900269541779, "grad_norm": 0.7140247821807861, "learning_rate": 0.00020155855369670804, "loss": 3.2875, "step": 61650 }, { "epoch": 6.652291105121294, "grad_norm": 0.7512028217315674, "learning_rate": 0.00020123475445223956, "loss": 3.2969, "step": 61700 }, { "epoch": 6.657681940700809, "grad_norm": 0.7267550826072693, "learning_rate": 0.00020091095520777114, "loss": 3.2861, "step": 61750 }, { "epoch": 6.663072776280323, "grad_norm": 0.7511851787567139, "learning_rate": 0.00020058715596330272, "loss": 3.2806, "step": 61800 }, { "epoch": 6.668463611859838, "grad_norm": 0.777556836605072, "learning_rate": 0.0002002633567188343, "loss": 3.2767, "step": 61850 }, { "epoch": 6.6738544474393535, "grad_norm": 0.7368999123573303, "learning_rate": 0.00019993955747436588, "loss": 3.274, "step": 61900 }, { "epoch": 6.679245283018868, "grad_norm": 0.7757512331008911, "learning_rate": 0.00019961575822989745, "loss": 3.2747, "step": 61950 }, { "epoch": 6.684636118598383, "grad_norm": 0.7147890329360962, "learning_rate": 0.00019929195898542903, "loss": 3.2998, "step": 62000 }, { "epoch": 6.684636118598383, "eval_accuracy": 0.38667759135236013, "eval_loss": 3.3609557151794434, "eval_runtime": 185.0014, "eval_samples_per_second": 97.356, "eval_steps_per_second": 6.086, "step": 62000 }, { "epoch": 6.690026954177897, "grad_norm": 0.7304677367210388, "learning_rate": 0.0001989681597409606, "loss": 3.2825, "step": 62050 }, { "epoch": 6.695417789757412, "grad_norm": 0.7841406464576721, "learning_rate": 0.00019864436049649216, "loss": 3.2635, "step": 62100 }, { "epoch": 6.7008086253369274, "grad_norm": 0.7494974732398987, "learning_rate": 0.0001983205612520237, "loss": 3.3044, "step": 62150 }, { "epoch": 6.706199460916442, "grad_norm": 0.7478502988815308, "learning_rate": 0.0001979967620075553, "loss": 3.2954, "step": 62200 }, { "epoch": 6.711590296495957, "grad_norm": 0.7201170325279236, "learning_rate": 0.00019767296276308686, "loss": 3.2798, "step": 62250 }, { "epoch": 6.716981132075472, "grad_norm": 0.7911655902862549, "learning_rate": 0.00019734916351861844, "loss": 3.2812, "step": 62300 }, { "epoch": 6.722371967654986, "grad_norm": 0.8429829478263855, "learning_rate": 0.00019702536427415002, "loss": 3.3016, "step": 62350 }, { "epoch": 6.727762803234501, "grad_norm": 0.7752325534820557, "learning_rate": 0.0001967015650296816, "loss": 3.2843, "step": 62400 }, { "epoch": 6.7331536388140165, "grad_norm": 0.7346392273902893, "learning_rate": 0.00019637776578521315, "loss": 3.2862, "step": 62450 }, { "epoch": 6.738544474393531, "grad_norm": 0.7346991300582886, "learning_rate": 0.00019605396654074473, "loss": 3.2908, "step": 62500 }, { "epoch": 6.743935309973046, "grad_norm": 0.7638818025588989, "learning_rate": 0.0001957301672962763, "loss": 3.2882, "step": 62550 }, { "epoch": 6.74932614555256, "grad_norm": 0.7539122104644775, "learning_rate": 0.00019540636805180785, "loss": 3.2801, "step": 62600 }, { "epoch": 6.754716981132075, "grad_norm": 0.7578554153442383, "learning_rate": 0.00019508256880733943, "loss": 3.2811, "step": 62650 }, { "epoch": 6.7601078167115904, "grad_norm": 0.7557489275932312, "learning_rate": 0.000194758769562871, "loss": 3.2902, "step": 62700 }, { "epoch": 6.765498652291106, "grad_norm": 0.7585029006004333, "learning_rate": 0.00019443497031840256, "loss": 3.2708, "step": 62750 }, { "epoch": 6.77088948787062, "grad_norm": 0.7600674629211426, "learning_rate": 0.00019411117107393414, "loss": 3.2731, "step": 62800 }, { "epoch": 6.776280323450135, "grad_norm": 0.7798412442207336, "learning_rate": 0.00019378737182946572, "loss": 3.2889, "step": 62850 }, { "epoch": 6.781671159029649, "grad_norm": 0.8409514427185059, "learning_rate": 0.0001934635725849973, "loss": 3.2864, "step": 62900 }, { "epoch": 6.787061994609164, "grad_norm": 0.7428311705589294, "learning_rate": 0.00019313977334052887, "loss": 3.3036, "step": 62950 }, { "epoch": 6.7924528301886795, "grad_norm": 0.7958593964576721, "learning_rate": 0.00019281597409606042, "loss": 3.2773, "step": 63000 }, { "epoch": 6.7924528301886795, "eval_accuracy": 0.38693759771290015, "eval_loss": 3.3576364517211914, "eval_runtime": 185.2969, "eval_samples_per_second": 97.201, "eval_steps_per_second": 6.077, "step": 63000 }, { "epoch": 6.797843665768194, "grad_norm": 0.757053017616272, "learning_rate": 0.00019249217485159197, "loss": 3.291, "step": 63050 }, { "epoch": 6.803234501347709, "grad_norm": 0.703620433807373, "learning_rate": 0.00019216837560712355, "loss": 3.2964, "step": 63100 }, { "epoch": 6.808625336927224, "grad_norm": 0.7836263179779053, "learning_rate": 0.00019184457636265513, "loss": 3.2749, "step": 63150 }, { "epoch": 6.814016172506738, "grad_norm": 0.7800487279891968, "learning_rate": 0.0001915207771181867, "loss": 3.2889, "step": 63200 }, { "epoch": 6.819407008086253, "grad_norm": 0.7118497490882874, "learning_rate": 0.00019119697787371828, "loss": 3.2898, "step": 63250 }, { "epoch": 6.824797843665769, "grad_norm": 0.7383244037628174, "learning_rate": 0.00019087317862924986, "loss": 3.2806, "step": 63300 }, { "epoch": 6.830188679245283, "grad_norm": 0.8963047862052917, "learning_rate": 0.00019054937938478144, "loss": 3.2644, "step": 63350 }, { "epoch": 6.835579514824798, "grad_norm": 0.7486133575439453, "learning_rate": 0.00019022558014031302, "loss": 3.295, "step": 63400 }, { "epoch": 6.840970350404312, "grad_norm": 0.7660999894142151, "learning_rate": 0.00018990178089584454, "loss": 3.2999, "step": 63450 }, { "epoch": 6.846361185983827, "grad_norm": 0.7520319223403931, "learning_rate": 0.00018957798165137612, "loss": 3.2928, "step": 63500 }, { "epoch": 6.8517520215633425, "grad_norm": 0.7498881220817566, "learning_rate": 0.0001892541824069077, "loss": 3.2875, "step": 63550 }, { "epoch": 6.857142857142857, "grad_norm": 0.7376906275749207, "learning_rate": 0.00018893038316243927, "loss": 3.2675, "step": 63600 }, { "epoch": 6.862533692722372, "grad_norm": 0.7770941853523254, "learning_rate": 0.00018860658391797085, "loss": 3.2695, "step": 63650 }, { "epoch": 6.867924528301887, "grad_norm": 0.7603805661201477, "learning_rate": 0.00018828278467350243, "loss": 3.278, "step": 63700 }, { "epoch": 6.873315363881401, "grad_norm": 0.7244340181350708, "learning_rate": 0.000187958985429034, "loss": 3.2774, "step": 63750 }, { "epoch": 6.878706199460916, "grad_norm": 0.8087451457977295, "learning_rate": 0.00018763518618456556, "loss": 3.2684, "step": 63800 }, { "epoch": 6.884097035040432, "grad_norm": 0.7538607120513916, "learning_rate": 0.0001873113869400971, "loss": 3.2879, "step": 63850 }, { "epoch": 6.889487870619946, "grad_norm": 0.7293424606323242, "learning_rate": 0.00018698758769562868, "loss": 3.2841, "step": 63900 }, { "epoch": 6.894878706199461, "grad_norm": 0.8110401034355164, "learning_rate": 0.00018666378845116026, "loss": 3.302, "step": 63950 }, { "epoch": 6.900269541778976, "grad_norm": 0.7437310814857483, "learning_rate": 0.00018633998920669184, "loss": 3.2882, "step": 64000 }, { "epoch": 6.900269541778976, "eval_accuracy": 0.38752508387731305, "eval_loss": 3.353229522705078, "eval_runtime": 184.6958, "eval_samples_per_second": 97.517, "eval_steps_per_second": 6.097, "step": 64000 }, { "epoch": 6.90566037735849, "grad_norm": 0.7894251346588135, "learning_rate": 0.00018601618996222342, "loss": 3.2894, "step": 64050 }, { "epoch": 6.9110512129380055, "grad_norm": 0.7877970933914185, "learning_rate": 0.00018569239071775497, "loss": 3.2938, "step": 64100 }, { "epoch": 6.916442048517521, "grad_norm": 0.7726397514343262, "learning_rate": 0.00018536859147328655, "loss": 3.2889, "step": 64150 }, { "epoch": 6.921832884097035, "grad_norm": 0.7730385065078735, "learning_rate": 0.00018504479222881812, "loss": 3.2796, "step": 64200 }, { "epoch": 6.92722371967655, "grad_norm": 0.7642452716827393, "learning_rate": 0.0001847209929843497, "loss": 3.264, "step": 64250 }, { "epoch": 6.932614555256064, "grad_norm": 0.7660939693450928, "learning_rate": 0.00018439719373988125, "loss": 3.306, "step": 64300 }, { "epoch": 6.938005390835579, "grad_norm": 0.7493078112602234, "learning_rate": 0.00018407339449541283, "loss": 3.2839, "step": 64350 }, { "epoch": 6.943396226415095, "grad_norm": 0.7475608587265015, "learning_rate": 0.00018374959525094438, "loss": 3.2844, "step": 64400 }, { "epoch": 6.948787061994609, "grad_norm": 0.7330486178398132, "learning_rate": 0.00018342579600647596, "loss": 3.2808, "step": 64450 }, { "epoch": 6.954177897574124, "grad_norm": 1.2176107168197632, "learning_rate": 0.00018310199676200753, "loss": 3.2776, "step": 64500 }, { "epoch": 6.959568733153639, "grad_norm": 0.710422933101654, "learning_rate": 0.0001827781975175391, "loss": 3.2817, "step": 64550 }, { "epoch": 6.964959568733153, "grad_norm": 0.7552564144134521, "learning_rate": 0.0001824543982730707, "loss": 3.2731, "step": 64600 }, { "epoch": 6.9703504043126685, "grad_norm": 0.7585906386375427, "learning_rate": 0.00018213059902860227, "loss": 3.2843, "step": 64650 }, { "epoch": 6.975741239892184, "grad_norm": 0.7237086892127991, "learning_rate": 0.00018180679978413382, "loss": 3.2802, "step": 64700 }, { "epoch": 6.981132075471698, "grad_norm": 0.8057036995887756, "learning_rate": 0.00018148300053966537, "loss": 3.2895, "step": 64750 }, { "epoch": 6.986522911051213, "grad_norm": 0.7535995244979858, "learning_rate": 0.00018116567728008632, "loss": 3.277, "step": 64800 }, { "epoch": 6.991913746630727, "grad_norm": 0.7500599026679993, "learning_rate": 0.0001808418780356179, "loss": 3.2713, "step": 64850 }, { "epoch": 6.997304582210242, "grad_norm": 0.8092964291572571, "learning_rate": 0.00018051807879114948, "loss": 3.2817, "step": 64900 }, { "epoch": 7.002695417789758, "grad_norm": 0.7631925940513611, "learning_rate": 0.0001802007555315704, "loss": 3.2317, "step": 64950 }, { "epoch": 7.008086253369272, "grad_norm": 0.7095732092857361, "learning_rate": 0.00017987695628710198, "loss": 3.1839, "step": 65000 }, { "epoch": 7.008086253369272, "eval_accuracy": 0.3875124801423266, "eval_loss": 3.354387044906616, "eval_runtime": 184.7817, "eval_samples_per_second": 97.472, "eval_steps_per_second": 6.094, "step": 65000 }, { "epoch": 7.013477088948787, "grad_norm": 0.9158305525779724, "learning_rate": 0.00017955315704263356, "loss": 3.2062, "step": 65050 }, { "epoch": 7.018867924528302, "grad_norm": 0.7549683451652527, "learning_rate": 0.0001792293577981651, "loss": 3.2018, "step": 65100 }, { "epoch": 7.024258760107816, "grad_norm": 0.7751328349113464, "learning_rate": 0.00017890555855369668, "loss": 3.2119, "step": 65150 }, { "epoch": 7.0296495956873315, "grad_norm": 0.7429438829421997, "learning_rate": 0.00017858175930922826, "loss": 3.2053, "step": 65200 }, { "epoch": 7.035040431266847, "grad_norm": 0.7653865218162537, "learning_rate": 0.00017825796006475984, "loss": 3.2041, "step": 65250 }, { "epoch": 7.040431266846361, "grad_norm": 0.7375260591506958, "learning_rate": 0.00017793416082029142, "loss": 3.1978, "step": 65300 }, { "epoch": 7.045822102425876, "grad_norm": 0.7559249997138977, "learning_rate": 0.000177610361575823, "loss": 3.2147, "step": 65350 }, { "epoch": 7.051212938005391, "grad_norm": 0.7759532332420349, "learning_rate": 0.00017728656233135452, "loss": 3.1921, "step": 65400 }, { "epoch": 7.056603773584905, "grad_norm": 0.7644091844558716, "learning_rate": 0.0001769627630868861, "loss": 3.1814, "step": 65450 }, { "epoch": 7.061994609164421, "grad_norm": 0.7937024235725403, "learning_rate": 0.00017663896384241767, "loss": 3.1995, "step": 65500 }, { "epoch": 7.067385444743936, "grad_norm": 0.7552359104156494, "learning_rate": 0.00017631516459794925, "loss": 3.1924, "step": 65550 }, { "epoch": 7.07277628032345, "grad_norm": 0.7388563752174377, "learning_rate": 0.00017599136535348083, "loss": 3.2205, "step": 65600 }, { "epoch": 7.078167115902965, "grad_norm": 0.7597045302391052, "learning_rate": 0.0001756675661090124, "loss": 3.2121, "step": 65650 }, { "epoch": 7.083557951482479, "grad_norm": 0.8127196431159973, "learning_rate": 0.00017534376686454398, "loss": 3.2061, "step": 65700 }, { "epoch": 7.0889487870619945, "grad_norm": 0.7847875356674194, "learning_rate": 0.00017501996762007556, "loss": 3.1891, "step": 65750 }, { "epoch": 7.09433962264151, "grad_norm": 0.7454573512077332, "learning_rate": 0.00017469616837560709, "loss": 3.2224, "step": 65800 }, { "epoch": 7.099730458221024, "grad_norm": 0.739266574382782, "learning_rate": 0.00017437236913113866, "loss": 3.2168, "step": 65850 }, { "epoch": 7.105121293800539, "grad_norm": 0.7397430539131165, "learning_rate": 0.00017404856988667024, "loss": 3.2299, "step": 65900 }, { "epoch": 7.110512129380054, "grad_norm": 0.7771823406219482, "learning_rate": 0.00017372477064220182, "loss": 3.2127, "step": 65950 }, { "epoch": 7.115902964959568, "grad_norm": 0.8010558485984802, "learning_rate": 0.0001734009713977334, "loss": 3.2252, "step": 66000 }, { "epoch": 7.115902964959568, "eval_accuracy": 0.3877516251484063, "eval_loss": 3.3572838306427, "eval_runtime": 184.947, "eval_samples_per_second": 97.385, "eval_steps_per_second": 6.088, "step": 66000 }, { "epoch": 7.121293800539084, "grad_norm": 0.7524279952049255, "learning_rate": 0.00017307717215326497, "loss": 3.2191, "step": 66050 }, { "epoch": 7.126684636118599, "grad_norm": 0.7754007577896118, "learning_rate": 0.00017275337290879655, "loss": 3.2188, "step": 66100 }, { "epoch": 7.132075471698113, "grad_norm": 0.8016265630722046, "learning_rate": 0.0001724295736643281, "loss": 3.2193, "step": 66150 }, { "epoch": 7.137466307277628, "grad_norm": 0.7195385098457336, "learning_rate": 0.00017210577441985968, "loss": 3.2144, "step": 66200 }, { "epoch": 7.142857142857143, "grad_norm": 0.7985889911651611, "learning_rate": 0.00017178197517539123, "loss": 3.2224, "step": 66250 }, { "epoch": 7.1482479784366575, "grad_norm": 0.7573408484458923, "learning_rate": 0.0001714581759309228, "loss": 3.2186, "step": 66300 }, { "epoch": 7.153638814016173, "grad_norm": 0.8037809729576111, "learning_rate": 0.00017113437668645439, "loss": 3.2227, "step": 66350 }, { "epoch": 7.159029649595688, "grad_norm": 0.7795762419700623, "learning_rate": 0.00017081057744198596, "loss": 3.2077, "step": 66400 }, { "epoch": 7.164420485175202, "grad_norm": 0.7938854098320007, "learning_rate": 0.00017048677819751751, "loss": 3.2172, "step": 66450 }, { "epoch": 7.169811320754717, "grad_norm": 0.7674042582511902, "learning_rate": 0.0001701629789530491, "loss": 3.2177, "step": 66500 }, { "epoch": 7.175202156334231, "grad_norm": 0.782508134841919, "learning_rate": 0.00016983917970858067, "loss": 3.2215, "step": 66550 }, { "epoch": 7.180592991913747, "grad_norm": 0.7263006567955017, "learning_rate": 0.00016951538046411225, "loss": 3.2079, "step": 66600 }, { "epoch": 7.185983827493262, "grad_norm": 0.7729055881500244, "learning_rate": 0.0001691915812196438, "loss": 3.215, "step": 66650 }, { "epoch": 7.191374663072776, "grad_norm": 0.7278570532798767, "learning_rate": 0.00016886778197517538, "loss": 3.2179, "step": 66700 }, { "epoch": 7.196765498652291, "grad_norm": 0.7499024271965027, "learning_rate": 0.00016854398273070693, "loss": 3.2216, "step": 66750 }, { "epoch": 7.202156334231806, "grad_norm": 0.8059422373771667, "learning_rate": 0.0001682201834862385, "loss": 3.211, "step": 66800 }, { "epoch": 7.2075471698113205, "grad_norm": 0.7714557647705078, "learning_rate": 0.00016789638424177008, "loss": 3.2301, "step": 66850 }, { "epoch": 7.212938005390836, "grad_norm": 0.7622218728065491, "learning_rate": 0.00016757258499730166, "loss": 3.2388, "step": 66900 }, { "epoch": 7.218328840970351, "grad_norm": 0.7789103984832764, "learning_rate": 0.00016724878575283324, "loss": 3.227, "step": 66950 }, { "epoch": 7.223719676549865, "grad_norm": 0.7892781496047974, "learning_rate": 0.00016692498650836481, "loss": 3.2308, "step": 67000 }, { "epoch": 7.223719676549865, "eval_accuracy": 0.388054006135194, "eval_loss": 3.3542799949645996, "eval_runtime": 184.5856, "eval_samples_per_second": 97.575, "eval_steps_per_second": 6.1, "step": 67000 }, { "epoch": 7.22911051212938, "grad_norm": 0.772681474685669, "learning_rate": 0.0001666011872638964, "loss": 3.1994, "step": 67050 }, { "epoch": 7.234501347708895, "grad_norm": 0.7405024766921997, "learning_rate": 0.00016627738801942792, "loss": 3.2434, "step": 67100 }, { "epoch": 7.2398921832884096, "grad_norm": 0.7997522354125977, "learning_rate": 0.00016596006475984887, "loss": 3.2065, "step": 67150 }, { "epoch": 7.245283018867925, "grad_norm": 0.8065983057022095, "learning_rate": 0.00016563626551538045, "loss": 3.2268, "step": 67200 }, { "epoch": 7.250673854447439, "grad_norm": 0.7845258116722107, "learning_rate": 0.00016531246627091202, "loss": 3.2351, "step": 67250 }, { "epoch": 7.256064690026954, "grad_norm": 0.7305331826210022, "learning_rate": 0.0001649886670264436, "loss": 3.237, "step": 67300 }, { "epoch": 7.261455525606469, "grad_norm": 0.7165262699127197, "learning_rate": 0.00016466486778197518, "loss": 3.2162, "step": 67350 }, { "epoch": 7.2668463611859835, "grad_norm": 0.7781831622123718, "learning_rate": 0.0001643410685375067, "loss": 3.2421, "step": 67400 }, { "epoch": 7.272237196765499, "grad_norm": 0.8223204016685486, "learning_rate": 0.00016401726929303828, "loss": 3.2393, "step": 67450 }, { "epoch": 7.277628032345014, "grad_norm": 0.7840092778205872, "learning_rate": 0.00016369347004856986, "loss": 3.229, "step": 67500 }, { "epoch": 7.283018867924528, "grad_norm": 0.7795689702033997, "learning_rate": 0.00016336967080410143, "loss": 3.2406, "step": 67550 }, { "epoch": 7.288409703504043, "grad_norm": 0.8028538823127747, "learning_rate": 0.000163045871559633, "loss": 3.2256, "step": 67600 }, { "epoch": 7.293800539083558, "grad_norm": 0.7559919953346252, "learning_rate": 0.0001627220723151646, "loss": 3.228, "step": 67650 }, { "epoch": 7.2991913746630726, "grad_norm": 0.7705023884773254, "learning_rate": 0.00016239827307069617, "loss": 3.2099, "step": 67700 }, { "epoch": 7.304582210242588, "grad_norm": 0.7943682670593262, "learning_rate": 0.00016207447382622775, "loss": 3.2247, "step": 67750 }, { "epoch": 7.309973045822103, "grad_norm": 0.7587892413139343, "learning_rate": 0.00016175067458175932, "loss": 3.2219, "step": 67800 }, { "epoch": 7.315363881401617, "grad_norm": 0.8514966368675232, "learning_rate": 0.00016142687533729085, "loss": 3.2224, "step": 67850 }, { "epoch": 7.320754716981132, "grad_norm": 0.7471888065338135, "learning_rate": 0.00016110307609282242, "loss": 3.237, "step": 67900 }, { "epoch": 7.3261455525606465, "grad_norm": 0.7487116456031799, "learning_rate": 0.000160779276848354, "loss": 3.2122, "step": 67950 }, { "epoch": 7.331536388140162, "grad_norm": 0.8473713397979736, "learning_rate": 0.00016045547760388558, "loss": 3.2282, "step": 68000 }, { "epoch": 7.331536388140162, "eval_accuracy": 0.3882555572420898, "eval_loss": 3.352534532546997, "eval_runtime": 184.9997, "eval_samples_per_second": 97.357, "eval_steps_per_second": 6.086, "step": 68000 }, { "epoch": 7.336927223719677, "grad_norm": 0.7282635569572449, "learning_rate": 0.00016013167835941716, "loss": 3.2332, "step": 68050 }, { "epoch": 7.342318059299191, "grad_norm": 0.7940888404846191, "learning_rate": 0.00015980787911494873, "loss": 3.2392, "step": 68100 }, { "epoch": 7.347708894878706, "grad_norm": 0.8099002242088318, "learning_rate": 0.00015948407987048029, "loss": 3.2271, "step": 68150 }, { "epoch": 7.353099730458221, "grad_norm": 0.7751094102859497, "learning_rate": 0.00015916028062601186, "loss": 3.2459, "step": 68200 }, { "epoch": 7.3584905660377355, "grad_norm": 0.7973043918609619, "learning_rate": 0.00015883648138154344, "loss": 3.2324, "step": 68250 }, { "epoch": 7.363881401617251, "grad_norm": 0.787239670753479, "learning_rate": 0.000158512682137075, "loss": 3.2199, "step": 68300 }, { "epoch": 7.369272237196766, "grad_norm": 0.7251448631286621, "learning_rate": 0.00015818888289260657, "loss": 3.2329, "step": 68350 }, { "epoch": 7.37466307277628, "grad_norm": 0.7860444188117981, "learning_rate": 0.00015786508364813815, "loss": 3.225, "step": 68400 }, { "epoch": 7.380053908355795, "grad_norm": 0.8149095773696899, "learning_rate": 0.0001575412844036697, "loss": 3.2095, "step": 68450 }, { "epoch": 7.38544474393531, "grad_norm": 0.8455824255943298, "learning_rate": 0.00015721748515920127, "loss": 3.2296, "step": 68500 }, { "epoch": 7.390835579514825, "grad_norm": 0.7530245780944824, "learning_rate": 0.00015689368591473285, "loss": 3.2297, "step": 68550 }, { "epoch": 7.39622641509434, "grad_norm": 0.7616867423057556, "learning_rate": 0.00015656988667026443, "loss": 3.2266, "step": 68600 }, { "epoch": 7.401617250673855, "grad_norm": 0.7562954425811768, "learning_rate": 0.000156246087425796, "loss": 3.2275, "step": 68650 }, { "epoch": 7.407008086253369, "grad_norm": 0.7328892350196838, "learning_rate": 0.00015592228818132756, "loss": 3.2381, "step": 68700 }, { "epoch": 7.412398921832884, "grad_norm": 0.8182516694068909, "learning_rate": 0.0001555984889368591, "loss": 3.2225, "step": 68750 }, { "epoch": 7.4177897574123985, "grad_norm": 0.8202837109565735, "learning_rate": 0.0001552746896923907, "loss": 3.2338, "step": 68800 }, { "epoch": 7.423180592991914, "grad_norm": 0.775603711605072, "learning_rate": 0.00015495089044792226, "loss": 3.2351, "step": 68850 }, { "epoch": 7.428571428571429, "grad_norm": 0.776694118976593, "learning_rate": 0.00015462709120345384, "loss": 3.23, "step": 68900 }, { "epoch": 7.433962264150943, "grad_norm": 0.786277711391449, "learning_rate": 0.00015430329195898542, "loss": 3.2334, "step": 68950 }, { "epoch": 7.439353099730458, "grad_norm": 0.7692110538482666, "learning_rate": 0.000153979492714517, "loss": 3.2182, "step": 69000 }, { "epoch": 7.439353099730458, "eval_accuracy": 0.388615632912307, "eval_loss": 3.346757173538208, "eval_runtime": 184.8049, "eval_samples_per_second": 97.46, "eval_steps_per_second": 6.093, "step": 69000 }, { "epoch": 7.444743935309973, "grad_norm": 0.8124752640724182, "learning_rate": 0.00015365569347004858, "loss": 3.2203, "step": 69050 }, { "epoch": 7.450134770889488, "grad_norm": 0.7867438793182373, "learning_rate": 0.00015333189422558015, "loss": 3.2298, "step": 69100 }, { "epoch": 7.455525606469003, "grad_norm": 0.7936617136001587, "learning_rate": 0.00015300809498111168, "loss": 3.2292, "step": 69150 }, { "epoch": 7.460916442048518, "grad_norm": 0.776745617389679, "learning_rate": 0.00015268429573664325, "loss": 3.2207, "step": 69200 }, { "epoch": 7.466307277628032, "grad_norm": 0.798880934715271, "learning_rate": 0.00015236049649217483, "loss": 3.2378, "step": 69250 }, { "epoch": 7.471698113207547, "grad_norm": 0.7821260690689087, "learning_rate": 0.0001520366972477064, "loss": 3.2253, "step": 69300 }, { "epoch": 7.4770889487870615, "grad_norm": 0.767823338508606, "learning_rate": 0.000151712898003238, "loss": 3.2376, "step": 69350 }, { "epoch": 7.482479784366577, "grad_norm": 0.7894257307052612, "learning_rate": 0.00015138909875876956, "loss": 3.2231, "step": 69400 }, { "epoch": 7.487870619946092, "grad_norm": 0.8050634860992432, "learning_rate": 0.00015106529951430114, "loss": 3.2228, "step": 69450 }, { "epoch": 7.493261455525606, "grad_norm": 0.7465404868125916, "learning_rate": 0.0001507415002698327, "loss": 3.2237, "step": 69500 }, { "epoch": 7.498652291105121, "grad_norm": 0.8405658006668091, "learning_rate": 0.00015041770102536424, "loss": 3.2376, "step": 69550 }, { "epoch": 7.504043126684636, "grad_norm": 0.7720454335212708, "learning_rate": 0.00015009390178089582, "loss": 3.2108, "step": 69600 }, { "epoch": 7.509433962264151, "grad_norm": 0.7862969636917114, "learning_rate": 0.0001497701025364274, "loss": 3.2289, "step": 69650 }, { "epoch": 7.514824797843666, "grad_norm": 0.761938214302063, "learning_rate": 0.00014944630329195898, "loss": 3.238, "step": 69700 }, { "epoch": 7.520215633423181, "grad_norm": 0.8123347163200378, "learning_rate": 0.00014912250404749055, "loss": 3.244, "step": 69750 }, { "epoch": 7.525606469002695, "grad_norm": 0.8718035817146301, "learning_rate": 0.0001487987048030221, "loss": 3.2249, "step": 69800 }, { "epoch": 7.53099730458221, "grad_norm": 0.755567193031311, "learning_rate": 0.00014847490555855368, "loss": 3.227, "step": 69850 }, { "epoch": 7.536388140161725, "grad_norm": 0.7703545093536377, "learning_rate": 0.00014815110631408526, "loss": 3.2319, "step": 69900 }, { "epoch": 7.54177897574124, "grad_norm": 0.7938928604125977, "learning_rate": 0.0001478273070696168, "loss": 3.2262, "step": 69950 }, { "epoch": 7.547169811320755, "grad_norm": 0.7993608713150024, "learning_rate": 0.0001475035078251484, "loss": 3.2238, "step": 70000 }, { "epoch": 7.547169811320755, "eval_accuracy": 0.3888174013249785, "eval_loss": 3.3437108993530273, "eval_runtime": 184.8174, "eval_samples_per_second": 97.453, "eval_steps_per_second": 6.093, "step": 70000 }, { "epoch": 7.55256064690027, "grad_norm": 0.78631591796875, "learning_rate": 0.00014717970858067997, "loss": 3.2313, "step": 70050 }, { "epoch": 7.557951482479784, "grad_norm": 0.7872461080551147, "learning_rate": 0.00014685590933621154, "loss": 3.2356, "step": 70100 }, { "epoch": 7.563342318059299, "grad_norm": 0.7788323760032654, "learning_rate": 0.0001465321100917431, "loss": 3.2341, "step": 70150 }, { "epoch": 7.568733153638814, "grad_norm": 0.8200633525848389, "learning_rate": 0.00014620831084727467, "loss": 3.2179, "step": 70200 }, { "epoch": 7.574123989218329, "grad_norm": 0.806707501411438, "learning_rate": 0.00014588451160280625, "loss": 3.2329, "step": 70250 }, { "epoch": 7.579514824797844, "grad_norm": 0.7861495614051819, "learning_rate": 0.0001455607123583378, "loss": 3.2287, "step": 70300 }, { "epoch": 7.584905660377358, "grad_norm": 0.8309231996536255, "learning_rate": 0.00014523691311386938, "loss": 3.2446, "step": 70350 }, { "epoch": 7.590296495956873, "grad_norm": 0.7764883637428284, "learning_rate": 0.00014491311386940096, "loss": 3.2258, "step": 70400 }, { "epoch": 7.595687331536388, "grad_norm": 0.7940296530723572, "learning_rate": 0.00014458931462493253, "loss": 3.2383, "step": 70450 }, { "epoch": 7.601078167115903, "grad_norm": 0.8480539321899414, "learning_rate": 0.00014426551538046408, "loss": 3.2404, "step": 70500 }, { "epoch": 7.606469002695418, "grad_norm": 0.7697724103927612, "learning_rate": 0.00014394171613599566, "loss": 3.2288, "step": 70550 }, { "epoch": 7.611859838274933, "grad_norm": 0.7845086455345154, "learning_rate": 0.00014361791689152724, "loss": 3.2292, "step": 70600 }, { "epoch": 7.617250673854447, "grad_norm": 0.7915938496589661, "learning_rate": 0.00014329411764705882, "loss": 3.2291, "step": 70650 }, { "epoch": 7.622641509433962, "grad_norm": 0.7984021306037903, "learning_rate": 0.0001429703184025904, "loss": 3.2314, "step": 70700 }, { "epoch": 7.628032345013477, "grad_norm": 0.8050603866577148, "learning_rate": 0.00014264651915812194, "loss": 3.2322, "step": 70750 }, { "epoch": 7.633423180592992, "grad_norm": 0.8283957839012146, "learning_rate": 0.00014232271991365352, "loss": 3.2444, "step": 70800 }, { "epoch": 7.638814016172507, "grad_norm": 0.7875947952270508, "learning_rate": 0.0001419989206691851, "loss": 3.2222, "step": 70850 }, { "epoch": 7.644204851752022, "grad_norm": 0.8083146810531616, "learning_rate": 0.00014168159740960602, "loss": 3.2393, "step": 70900 }, { "epoch": 7.649595687331536, "grad_norm": 0.7859264612197876, "learning_rate": 0.0001413577981651376, "loss": 3.2201, "step": 70950 }, { "epoch": 7.654986522911051, "grad_norm": 0.7985530495643616, "learning_rate": 0.00014103399892066918, "loss": 3.2513, "step": 71000 }, { "epoch": 7.654986522911051, "eval_accuracy": 0.38947138305673384, "eval_loss": 3.3401072025299072, "eval_runtime": 184.8335, "eval_samples_per_second": 97.444, "eval_steps_per_second": 6.092, "step": 71000 }, { "epoch": 7.660377358490566, "grad_norm": 0.7884781956672668, "learning_rate": 0.00014071019967620073, "loss": 3.2415, "step": 71050 }, { "epoch": 7.665768194070081, "grad_norm": 0.7903490662574768, "learning_rate": 0.0001403864004317323, "loss": 3.2499, "step": 71100 }, { "epoch": 7.671159029649596, "grad_norm": 0.7995983958244324, "learning_rate": 0.00014006260118726389, "loss": 3.233, "step": 71150 }, { "epoch": 7.67654986522911, "grad_norm": 0.7693688273429871, "learning_rate": 0.00013973880194279546, "loss": 3.2488, "step": 71200 }, { "epoch": 7.681940700808625, "grad_norm": 0.7897132039070129, "learning_rate": 0.00013941500269832704, "loss": 3.2375, "step": 71250 }, { "epoch": 7.6873315363881405, "grad_norm": 0.8603083491325378, "learning_rate": 0.0001390912034538586, "loss": 3.2326, "step": 71300 }, { "epoch": 7.692722371967655, "grad_norm": 0.7997660636901855, "learning_rate": 0.00013876740420939017, "loss": 3.2474, "step": 71350 }, { "epoch": 7.69811320754717, "grad_norm": 0.8124864101409912, "learning_rate": 0.00013844360496492175, "loss": 3.2522, "step": 71400 }, { "epoch": 7.703504043126685, "grad_norm": 0.7789794206619263, "learning_rate": 0.00013811980572045333, "loss": 3.2292, "step": 71450 }, { "epoch": 7.708894878706199, "grad_norm": 0.8284264206886292, "learning_rate": 0.00013779600647598488, "loss": 3.2384, "step": 71500 }, { "epoch": 7.714285714285714, "grad_norm": 0.7541694045066833, "learning_rate": 0.00013747220723151645, "loss": 3.2273, "step": 71550 }, { "epoch": 7.719676549865229, "grad_norm": 0.8304938673973083, "learning_rate": 0.00013714840798704803, "loss": 3.2218, "step": 71600 }, { "epoch": 7.725067385444744, "grad_norm": 0.8099672198295593, "learning_rate": 0.00013682460874257958, "loss": 3.244, "step": 71650 }, { "epoch": 7.730458221024259, "grad_norm": 0.8018233180046082, "learning_rate": 0.00013650080949811116, "loss": 3.2449, "step": 71700 }, { "epoch": 7.735849056603773, "grad_norm": 0.8082437515258789, "learning_rate": 0.00013617701025364274, "loss": 3.2408, "step": 71750 }, { "epoch": 7.741239892183288, "grad_norm": 0.7693942785263062, "learning_rate": 0.0001358532110091743, "loss": 3.2298, "step": 71800 }, { "epoch": 7.7466307277628035, "grad_norm": 0.7721502780914307, "learning_rate": 0.00013552941176470587, "loss": 3.2468, "step": 71850 }, { "epoch": 7.752021563342318, "grad_norm": 0.76530522108078, "learning_rate": 0.00013520561252023744, "loss": 3.2439, "step": 71900 }, { "epoch": 7.757412398921833, "grad_norm": 0.786440908908844, "learning_rate": 0.00013488181327576902, "loss": 3.244, "step": 71950 }, { "epoch": 7.762803234501348, "grad_norm": 0.7962386012077332, "learning_rate": 0.00013455801403130057, "loss": 3.2518, "step": 72000 }, { "epoch": 7.762803234501348, "eval_accuracy": 0.39001149656205963, "eval_loss": 3.3362839221954346, "eval_runtime": 184.9207, "eval_samples_per_second": 97.399, "eval_steps_per_second": 6.089, "step": 72000 }, { "epoch": 7.768194070080862, "grad_norm": 0.8243362903594971, "learning_rate": 0.00013423421478683215, "loss": 3.2376, "step": 72050 }, { "epoch": 7.773584905660377, "grad_norm": 0.7997215390205383, "learning_rate": 0.00013391041554236373, "loss": 3.2442, "step": 72100 }, { "epoch": 7.7789757412398925, "grad_norm": 0.8042483329772949, "learning_rate": 0.00013358661629789528, "loss": 3.2313, "step": 72150 }, { "epoch": 7.784366576819407, "grad_norm": 0.7405474781990051, "learning_rate": 0.00013326281705342685, "loss": 3.2229, "step": 72200 }, { "epoch": 7.789757412398922, "grad_norm": 0.784062385559082, "learning_rate": 0.00013293901780895843, "loss": 3.2333, "step": 72250 }, { "epoch": 7.795148247978437, "grad_norm": 0.8437650203704834, "learning_rate": 0.00013261521856449, "loss": 3.2217, "step": 72300 }, { "epoch": 7.800539083557951, "grad_norm": 0.7900862097740173, "learning_rate": 0.00013229141932002156, "loss": 3.2094, "step": 72350 }, { "epoch": 7.8059299191374665, "grad_norm": 0.80324786901474, "learning_rate": 0.00013196762007555314, "loss": 3.2356, "step": 72400 }, { "epoch": 7.811320754716981, "grad_norm": 0.8037505149841309, "learning_rate": 0.00013164382083108472, "loss": 3.2382, "step": 72450 }, { "epoch": 7.816711590296496, "grad_norm": 0.8118758797645569, "learning_rate": 0.0001313200215866163, "loss": 3.2327, "step": 72500 }, { "epoch": 7.822102425876011, "grad_norm": 0.8497474789619446, "learning_rate": 0.00013099622234214784, "loss": 3.2207, "step": 72550 }, { "epoch": 7.827493261455525, "grad_norm": 0.7857134938240051, "learning_rate": 0.00013067242309767942, "loss": 3.2506, "step": 72600 }, { "epoch": 7.83288409703504, "grad_norm": 0.8757586479187012, "learning_rate": 0.000130348623853211, "loss": 3.2352, "step": 72650 }, { "epoch": 7.8382749326145555, "grad_norm": 0.7962746024131775, "learning_rate": 0.00013002482460874258, "loss": 3.2401, "step": 72700 }, { "epoch": 7.84366576819407, "grad_norm": 0.8165475726127625, "learning_rate": 0.00012970102536427413, "loss": 3.2086, "step": 72750 }, { "epoch": 7.849056603773585, "grad_norm": 0.7961111068725586, "learning_rate": 0.0001293772261198057, "loss": 3.2404, "step": 72800 }, { "epoch": 7.8544474393531, "grad_norm": 0.8070231676101685, "learning_rate": 0.00012905342687533728, "loss": 3.2371, "step": 72850 }, { "epoch": 7.859838274932614, "grad_norm": 0.7886551022529602, "learning_rate": 0.00012872962763086886, "loss": 3.2402, "step": 72900 }, { "epoch": 7.8652291105121295, "grad_norm": 0.8020292520523071, "learning_rate": 0.00012840582838640044, "loss": 3.2398, "step": 72950 }, { "epoch": 7.870619946091644, "grad_norm": 0.8239029049873352, "learning_rate": 0.000128082029141932, "loss": 3.2309, "step": 73000 }, { "epoch": 7.870619946091644, "eval_accuracy": 0.39013916370524165, "eval_loss": 3.332185983657837, "eval_runtime": 184.799, "eval_samples_per_second": 97.463, "eval_steps_per_second": 6.093, "step": 73000 }, { "epoch": 7.876010781671159, "grad_norm": 0.850242018699646, "learning_rate": 0.00012775822989746357, "loss": 3.2415, "step": 73050 }, { "epoch": 7.881401617250674, "grad_norm": 0.7743697166442871, "learning_rate": 0.0001274409066378845, "loss": 3.2358, "step": 73100 }, { "epoch": 7.886792452830189, "grad_norm": 0.7658908367156982, "learning_rate": 0.00012711710739341607, "loss": 3.2371, "step": 73150 }, { "epoch": 7.892183288409703, "grad_norm": 0.8552940487861633, "learning_rate": 0.00012679330814894765, "loss": 3.2308, "step": 73200 }, { "epoch": 7.8975741239892185, "grad_norm": 0.791025698184967, "learning_rate": 0.00012646950890447922, "loss": 3.2368, "step": 73250 }, { "epoch": 7.902964959568733, "grad_norm": 0.8062611222267151, "learning_rate": 0.00012614570966001077, "loss": 3.2394, "step": 73300 }, { "epoch": 7.908355795148248, "grad_norm": 0.8666378259658813, "learning_rate": 0.00012582191041554235, "loss": 3.2175, "step": 73350 }, { "epoch": 7.913746630727763, "grad_norm": 0.8134450316429138, "learning_rate": 0.00012549811117107393, "loss": 3.2308, "step": 73400 }, { "epoch": 7.919137466307277, "grad_norm": 0.822311282157898, "learning_rate": 0.0001251743119266055, "loss": 3.2146, "step": 73450 }, { "epoch": 7.9245283018867925, "grad_norm": 0.8420822620391846, "learning_rate": 0.00012485051268213706, "loss": 3.2317, "step": 73500 }, { "epoch": 7.929919137466308, "grad_norm": 0.7920177578926086, "learning_rate": 0.00012452671343766864, "loss": 3.2235, "step": 73550 }, { "epoch": 7.935309973045822, "grad_norm": 0.8194290995597839, "learning_rate": 0.00012420291419320021, "loss": 3.2235, "step": 73600 }, { "epoch": 7.940700808625337, "grad_norm": 0.8023219704627991, "learning_rate": 0.0001238791149487318, "loss": 3.2397, "step": 73650 }, { "epoch": 7.946091644204852, "grad_norm": 1.2114663124084473, "learning_rate": 0.00012355531570426334, "loss": 3.2168, "step": 73700 }, { "epoch": 7.951482479784366, "grad_norm": 0.8042489290237427, "learning_rate": 0.00012323151645979492, "loss": 3.2347, "step": 73750 }, { "epoch": 7.9568733153638815, "grad_norm": 0.8316463232040405, "learning_rate": 0.0001229077172153265, "loss": 3.2324, "step": 73800 }, { "epoch": 7.962264150943396, "grad_norm": 0.8015411496162415, "learning_rate": 0.00012258391797085805, "loss": 3.221, "step": 73850 }, { "epoch": 7.967654986522911, "grad_norm": 0.8200330138206482, "learning_rate": 0.00012226011872638963, "loss": 3.2256, "step": 73900 }, { "epoch": 7.973045822102426, "grad_norm": 0.7811427712440491, "learning_rate": 0.00012193631948192119, "loss": 3.228, "step": 73950 }, { "epoch": 7.97843665768194, "grad_norm": 0.8133488297462463, "learning_rate": 0.00012161252023745277, "loss": 3.2519, "step": 74000 }, { "epoch": 7.97843665768194, "eval_accuracy": 0.39057507909115335, "eval_loss": 3.328582525253296, "eval_runtime": 185.1358, "eval_samples_per_second": 97.285, "eval_steps_per_second": 6.082, "step": 74000 }, { "epoch": 7.9838274932614555, "grad_norm": 0.8701046109199524, "learning_rate": 0.00012128872099298435, "loss": 3.2368, "step": 74050 }, { "epoch": 7.989218328840971, "grad_norm": 0.8377940058708191, "learning_rate": 0.00012096492174851591, "loss": 3.2394, "step": 74100 }, { "epoch": 7.994609164420485, "grad_norm": 0.8556270599365234, "learning_rate": 0.00012064112250404749, "loss": 3.2404, "step": 74150 }, { "epoch": 8.0, "grad_norm": 1.6344754695892334, "learning_rate": 0.00012031732325957905, "loss": 3.2383, "step": 74200 }, { "epoch": 8.005390835579515, "grad_norm": 0.801658034324646, "learning_rate": 0.00011999352401511062, "loss": 3.1543, "step": 74250 }, { "epoch": 8.01078167115903, "grad_norm": 0.7476630210876465, "learning_rate": 0.00011966972477064219, "loss": 3.1637, "step": 74300 }, { "epoch": 8.016172506738544, "grad_norm": 0.8232449293136597, "learning_rate": 0.00011934592552617377, "loss": 3.1622, "step": 74350 }, { "epoch": 8.021563342318059, "grad_norm": 0.8696622252464294, "learning_rate": 0.00011902212628170532, "loss": 3.1583, "step": 74400 }, { "epoch": 8.026954177897574, "grad_norm": 0.8294352293014526, "learning_rate": 0.0001186983270372369, "loss": 3.1607, "step": 74450 }, { "epoch": 8.032345013477089, "grad_norm": 0.7915950417518616, "learning_rate": 0.00011837452779276848, "loss": 3.1799, "step": 74500 }, { "epoch": 8.037735849056604, "grad_norm": 0.8357707858085632, "learning_rate": 0.00011805072854830005, "loss": 3.158, "step": 74550 }, { "epoch": 8.04312668463612, "grad_norm": 0.8107123970985413, "learning_rate": 0.0001177269293038316, "loss": 3.1785, "step": 74600 }, { "epoch": 8.048517520215633, "grad_norm": 0.8061670064926147, "learning_rate": 0.00011740313005936318, "loss": 3.175, "step": 74650 }, { "epoch": 8.053908355795148, "grad_norm": 0.7983211874961853, "learning_rate": 0.00011707933081489476, "loss": 3.1642, "step": 74700 }, { "epoch": 8.059299191374663, "grad_norm": 0.8365211486816406, "learning_rate": 0.00011675553157042632, "loss": 3.1511, "step": 74750 }, { "epoch": 8.064690026954178, "grad_norm": 0.8603684306144714, "learning_rate": 0.00011643173232595789, "loss": 3.1714, "step": 74800 }, { "epoch": 8.070080862533693, "grad_norm": 0.7773607969284058, "learning_rate": 0.00011610793308148947, "loss": 3.1512, "step": 74850 }, { "epoch": 8.075471698113208, "grad_norm": 0.7770172953605652, "learning_rate": 0.00011578413383702104, "loss": 3.152, "step": 74900 }, { "epoch": 8.080862533692722, "grad_norm": 0.78849858045578, "learning_rate": 0.00011546033459255261, "loss": 3.1676, "step": 74950 }, { "epoch": 8.086253369272237, "grad_norm": 0.814720630645752, "learning_rate": 0.00011513653534808419, "loss": 3.1583, "step": 75000 }, { "epoch": 8.086253369272237, "eval_accuracy": 0.3906881867473682, "eval_loss": 3.3347113132476807, "eval_runtime": 184.6754, "eval_samples_per_second": 97.528, "eval_steps_per_second": 6.097, "step": 75000 }, { "epoch": 8.091644204851752, "grad_norm": 0.8267271518707275, "learning_rate": 0.00011481273610361575, "loss": 3.165, "step": 75050 }, { "epoch": 8.097035040431267, "grad_norm": 0.8477714657783508, "learning_rate": 0.00011448893685914731, "loss": 3.1511, "step": 75100 }, { "epoch": 8.102425876010782, "grad_norm": 0.7959654331207275, "learning_rate": 0.00011416513761467889, "loss": 3.1745, "step": 75150 }, { "epoch": 8.107816711590296, "grad_norm": 0.8167853951454163, "learning_rate": 0.00011384133837021047, "loss": 3.1673, "step": 75200 }, { "epoch": 8.11320754716981, "grad_norm": 0.7871206402778625, "learning_rate": 0.00011351753912574202, "loss": 3.1734, "step": 75250 }, { "epoch": 8.118598382749326, "grad_norm": 0.7872896194458008, "learning_rate": 0.0001131937398812736, "loss": 3.1801, "step": 75300 }, { "epoch": 8.123989218328841, "grad_norm": 0.7880840301513672, "learning_rate": 0.00011286994063680517, "loss": 3.1839, "step": 75350 }, { "epoch": 8.129380053908356, "grad_norm": 0.883547306060791, "learning_rate": 0.00011254614139233675, "loss": 3.1856, "step": 75400 }, { "epoch": 8.134770889487871, "grad_norm": 0.8173453211784363, "learning_rate": 0.0001122223421478683, "loss": 3.1721, "step": 75450 }, { "epoch": 8.140161725067385, "grad_norm": 0.7894411087036133, "learning_rate": 0.00011189854290339988, "loss": 3.1769, "step": 75500 }, { "epoch": 8.1455525606469, "grad_norm": 0.7918045520782471, "learning_rate": 0.00011157474365893146, "loss": 3.1614, "step": 75550 }, { "epoch": 8.150943396226415, "grad_norm": 0.825491726398468, "learning_rate": 0.00011125094441446302, "loss": 3.1587, "step": 75600 }, { "epoch": 8.15633423180593, "grad_norm": 0.8395944833755493, "learning_rate": 0.00011092714516999459, "loss": 3.1649, "step": 75650 }, { "epoch": 8.161725067385445, "grad_norm": 0.8326493501663208, "learning_rate": 0.00011060334592552616, "loss": 3.1851, "step": 75700 }, { "epoch": 8.167115902964959, "grad_norm": 0.795487642288208, "learning_rate": 0.00011027954668105773, "loss": 3.1673, "step": 75750 }, { "epoch": 8.172506738544474, "grad_norm": 0.8097192049026489, "learning_rate": 0.0001099557474365893, "loss": 3.1614, "step": 75800 }, { "epoch": 8.177897574123989, "grad_norm": 0.8157842755317688, "learning_rate": 0.00010963194819212088, "loss": 3.1657, "step": 75850 }, { "epoch": 8.183288409703504, "grad_norm": 0.7984054088592529, "learning_rate": 0.00010930814894765243, "loss": 3.1681, "step": 75900 }, { "epoch": 8.18867924528302, "grad_norm": 0.8114879727363586, "learning_rate": 0.00010898434970318401, "loss": 3.1797, "step": 75950 }, { "epoch": 8.194070080862534, "grad_norm": 0.7779016494750977, "learning_rate": 0.00010866055045871559, "loss": 3.1571, "step": 76000 }, { "epoch": 8.194070080862534, "eval_accuracy": 0.3906270151715287, "eval_loss": 3.3333170413970947, "eval_runtime": 185.1654, "eval_samples_per_second": 97.27, "eval_steps_per_second": 6.081, "step": 76000 }, { "epoch": 8.199460916442048, "grad_norm": 0.8344588279724121, "learning_rate": 0.00010833675121424717, "loss": 3.1813, "step": 76050 }, { "epoch": 8.204851752021563, "grad_norm": 0.8400987386703491, "learning_rate": 0.00010801295196977872, "loss": 3.1983, "step": 76100 }, { "epoch": 8.210242587601078, "grad_norm": 0.7751808762550354, "learning_rate": 0.0001076891527253103, "loss": 3.1688, "step": 76150 }, { "epoch": 8.215633423180593, "grad_norm": 0.8149611353874207, "learning_rate": 0.00010736535348084187, "loss": 3.1826, "step": 76200 }, { "epoch": 8.221024258760108, "grad_norm": 0.8001996874809265, "learning_rate": 0.00010704155423637345, "loss": 3.1785, "step": 76250 }, { "epoch": 8.226415094339623, "grad_norm": 0.8465657830238342, "learning_rate": 0.000106717754991905, "loss": 3.1606, "step": 76300 }, { "epoch": 8.231805929919137, "grad_norm": 0.8427553772926331, "learning_rate": 0.00010639395574743658, "loss": 3.1798, "step": 76350 }, { "epoch": 8.237196765498652, "grad_norm": 0.7900227308273315, "learning_rate": 0.00010607015650296816, "loss": 3.1745, "step": 76400 }, { "epoch": 8.242587601078167, "grad_norm": 0.7699751257896423, "learning_rate": 0.00010574635725849972, "loss": 3.175, "step": 76450 }, { "epoch": 8.247978436657682, "grad_norm": 0.8397601246833801, "learning_rate": 0.00010542255801403128, "loss": 3.164, "step": 76500 }, { "epoch": 8.253369272237197, "grad_norm": 0.874344527721405, "learning_rate": 0.00010509875876956286, "loss": 3.1902, "step": 76550 }, { "epoch": 8.25876010781671, "grad_norm": 0.7956122159957886, "learning_rate": 0.00010477495952509443, "loss": 3.1647, "step": 76600 }, { "epoch": 8.264150943396226, "grad_norm": 0.7708241939544678, "learning_rate": 0.000104451160280626, "loss": 3.1618, "step": 76650 }, { "epoch": 8.269541778975741, "grad_norm": 0.8957728743553162, "learning_rate": 0.00010412736103615758, "loss": 3.1832, "step": 76700 }, { "epoch": 8.274932614555256, "grad_norm": 0.8150855898857117, "learning_rate": 0.00010380356179168913, "loss": 3.1755, "step": 76750 }, { "epoch": 8.280323450134771, "grad_norm": 0.8756967186927795, "learning_rate": 0.00010347976254722071, "loss": 3.1797, "step": 76800 }, { "epoch": 8.285714285714286, "grad_norm": 0.7948899865150452, "learning_rate": 0.00010315596330275229, "loss": 3.1907, "step": 76850 }, { "epoch": 8.2911051212938, "grad_norm": 0.8002045154571533, "learning_rate": 0.00010283216405828387, "loss": 3.2053, "step": 76900 }, { "epoch": 8.296495956873315, "grad_norm": 0.8174263834953308, "learning_rate": 0.00010250836481381542, "loss": 3.1878, "step": 76950 }, { "epoch": 8.30188679245283, "grad_norm": 0.7678391933441162, "learning_rate": 0.000102184565569347, "loss": 3.1771, "step": 77000 }, { "epoch": 8.30188679245283, "eval_accuracy": 0.3910068656673281, "eval_loss": 3.3296449184417725, "eval_runtime": 184.8006, "eval_samples_per_second": 97.462, "eval_steps_per_second": 6.093, "step": 77000 }, { "epoch": 8.307277628032345, "grad_norm": 0.8229707479476929, "learning_rate": 0.00010186724230976793, "loss": 3.1903, "step": 77050 }, { "epoch": 8.31266846361186, "grad_norm": 0.8444199562072754, "learning_rate": 0.0001015434430652995, "loss": 3.1739, "step": 77100 }, { "epoch": 8.318059299191376, "grad_norm": 0.8374699354171753, "learning_rate": 0.00010121964382083107, "loss": 3.1627, "step": 77150 }, { "epoch": 8.323450134770889, "grad_norm": 0.8763797283172607, "learning_rate": 0.00010089584457636265, "loss": 3.1634, "step": 77200 }, { "epoch": 8.328840970350404, "grad_norm": 0.844142496585846, "learning_rate": 0.00010057204533189423, "loss": 3.1628, "step": 77250 }, { "epoch": 8.33423180592992, "grad_norm": 0.8463607430458069, "learning_rate": 0.00010024824608742578, "loss": 3.1713, "step": 77300 }, { "epoch": 8.339622641509434, "grad_norm": 0.816167414188385, "learning_rate": 9.992444684295736e-05, "loss": 3.1854, "step": 77350 }, { "epoch": 8.34501347708895, "grad_norm": 0.868462860584259, "learning_rate": 9.960064759848894e-05, "loss": 3.1387, "step": 77400 }, { "epoch": 8.350404312668463, "grad_norm": 0.8351553082466125, "learning_rate": 9.92768483540205e-05, "loss": 3.1847, "step": 77450 }, { "epoch": 8.355795148247978, "grad_norm": 0.8340010046958923, "learning_rate": 9.895304910955206e-05, "loss": 3.1897, "step": 77500 }, { "epoch": 8.361185983827493, "grad_norm": 0.8664101362228394, "learning_rate": 9.862924986508364e-05, "loss": 3.1756, "step": 77550 }, { "epoch": 8.366576819407008, "grad_norm": 0.9245196580886841, "learning_rate": 9.83054506206152e-05, "loss": 3.1933, "step": 77600 }, { "epoch": 8.371967654986523, "grad_norm": 0.8184807300567627, "learning_rate": 9.798165137614678e-05, "loss": 3.1595, "step": 77650 }, { "epoch": 8.377358490566039, "grad_norm": 0.8540685176849365, "learning_rate": 9.765785213167835e-05, "loss": 3.1975, "step": 77700 }, { "epoch": 8.382749326145552, "grad_norm": 0.8300887942314148, "learning_rate": 9.733405288720992e-05, "loss": 3.1769, "step": 77750 }, { "epoch": 8.388140161725067, "grad_norm": 0.8534173369407654, "learning_rate": 9.701025364274149e-05, "loss": 3.1714, "step": 77800 }, { "epoch": 8.393530997304582, "grad_norm": 0.8436583280563354, "learning_rate": 9.668645439827307e-05, "loss": 3.1783, "step": 77850 }, { "epoch": 8.398921832884097, "grad_norm": 0.8904436230659485, "learning_rate": 9.636265515380464e-05, "loss": 3.176, "step": 77900 }, { "epoch": 8.404312668463612, "grad_norm": 0.9166279435157776, "learning_rate": 9.60388559093362e-05, "loss": 3.1848, "step": 77950 }, { "epoch": 8.409703504043126, "grad_norm": 0.8319751024246216, "learning_rate": 9.571505666486777e-05, "loss": 3.1673, "step": 78000 }, { "epoch": 8.409703504043126, "eval_accuracy": 0.3911980947498816, "eval_loss": 3.3280577659606934, "eval_runtime": 184.9642, "eval_samples_per_second": 97.376, "eval_steps_per_second": 6.088, "step": 78000 }, { "epoch": 8.415094339622641, "grad_norm": 0.8120921850204468, "learning_rate": 9.539125742039935e-05, "loss": 3.1822, "step": 78050 }, { "epoch": 8.420485175202156, "grad_norm": 0.8287458419799805, "learning_rate": 9.506745817593093e-05, "loss": 3.1704, "step": 78100 }, { "epoch": 8.425876010781671, "grad_norm": 0.8519405722618103, "learning_rate": 9.474365893146248e-05, "loss": 3.1791, "step": 78150 }, { "epoch": 8.431266846361186, "grad_norm": 0.9082578420639038, "learning_rate": 9.441985968699406e-05, "loss": 3.1903, "step": 78200 }, { "epoch": 8.436657681940702, "grad_norm": 0.8822596669197083, "learning_rate": 9.409606044252563e-05, "loss": 3.2137, "step": 78250 }, { "epoch": 8.442048517520215, "grad_norm": 0.8615216016769409, "learning_rate": 9.37722611980572e-05, "loss": 3.1818, "step": 78300 }, { "epoch": 8.44743935309973, "grad_norm": 0.8587093353271484, "learning_rate": 9.344846195358876e-05, "loss": 3.1776, "step": 78350 }, { "epoch": 8.452830188679245, "grad_norm": 0.8063041567802429, "learning_rate": 9.312466270912034e-05, "loss": 3.1846, "step": 78400 }, { "epoch": 8.45822102425876, "grad_norm": 0.8895376324653625, "learning_rate": 9.28008634646519e-05, "loss": 3.1933, "step": 78450 }, { "epoch": 8.463611859838275, "grad_norm": 0.8046665787696838, "learning_rate": 9.247706422018348e-05, "loss": 3.1872, "step": 78500 }, { "epoch": 8.46900269541779, "grad_norm": 0.8373346924781799, "learning_rate": 9.215326497571505e-05, "loss": 3.1793, "step": 78550 }, { "epoch": 8.474393530997304, "grad_norm": 0.8134214878082275, "learning_rate": 9.182946573124661e-05, "loss": 3.1889, "step": 78600 }, { "epoch": 8.479784366576819, "grad_norm": 0.8032200932502747, "learning_rate": 9.150566648677819e-05, "loss": 3.1765, "step": 78650 }, { "epoch": 8.485175202156334, "grad_norm": 0.8005744218826294, "learning_rate": 9.118186724230976e-05, "loss": 3.1795, "step": 78700 }, { "epoch": 8.49056603773585, "grad_norm": 0.8798218965530396, "learning_rate": 9.085806799784134e-05, "loss": 3.189, "step": 78750 }, { "epoch": 8.495956873315365, "grad_norm": 0.8683854937553406, "learning_rate": 9.053426875337289e-05, "loss": 3.1934, "step": 78800 }, { "epoch": 8.501347708894878, "grad_norm": 0.813085675239563, "learning_rate": 9.021046950890447e-05, "loss": 3.1887, "step": 78850 }, { "epoch": 8.506738544474393, "grad_norm": 0.8143117427825928, "learning_rate": 8.988667026443605e-05, "loss": 3.1808, "step": 78900 }, { "epoch": 8.512129380053908, "grad_norm": 0.8620724678039551, "learning_rate": 8.956287101996761e-05, "loss": 3.1792, "step": 78950 }, { "epoch": 8.517520215633423, "grad_norm": 0.8598374724388123, "learning_rate": 8.923907177549918e-05, "loss": 3.1806, "step": 79000 }, { "epoch": 8.517520215633423, "eval_accuracy": 0.3915852249891646, "eval_loss": 3.322683095932007, "eval_runtime": 184.9558, "eval_samples_per_second": 97.38, "eval_steps_per_second": 6.088, "step": 79000 }, { "epoch": 8.522911051212938, "grad_norm": 0.8073253035545349, "learning_rate": 8.891527253103075e-05, "loss": 3.1988, "step": 79050 }, { "epoch": 8.528301886792454, "grad_norm": 0.8984984755516052, "learning_rate": 8.859147328656233e-05, "loss": 3.1804, "step": 79100 }, { "epoch": 8.533692722371967, "grad_norm": 0.843517541885376, "learning_rate": 8.82676740420939e-05, "loss": 3.1896, "step": 79150 }, { "epoch": 8.539083557951482, "grad_norm": 0.8529115319252014, "learning_rate": 8.794387479762546e-05, "loss": 3.2095, "step": 79200 }, { "epoch": 8.544474393530997, "grad_norm": 0.8642679452896118, "learning_rate": 8.762007555315704e-05, "loss": 3.1868, "step": 79250 }, { "epoch": 8.549865229110512, "grad_norm": 0.8819208145141602, "learning_rate": 8.72962763086886e-05, "loss": 3.1621, "step": 79300 }, { "epoch": 8.555256064690028, "grad_norm": 0.8184266686439514, "learning_rate": 8.697247706422018e-05, "loss": 3.1841, "step": 79350 }, { "epoch": 8.560646900269543, "grad_norm": 0.9021525979042053, "learning_rate": 8.664867781975174e-05, "loss": 3.1821, "step": 79400 }, { "epoch": 8.566037735849056, "grad_norm": 0.823969841003418, "learning_rate": 8.632487857528331e-05, "loss": 3.1666, "step": 79450 }, { "epoch": 8.571428571428571, "grad_norm": 0.8354356288909912, "learning_rate": 8.600107933081489e-05, "loss": 3.1763, "step": 79500 }, { "epoch": 8.576819407008086, "grad_norm": 0.9359567165374756, "learning_rate": 8.567728008634646e-05, "loss": 3.176, "step": 79550 }, { "epoch": 8.582210242587601, "grad_norm": 0.8543683290481567, "learning_rate": 8.535348084187804e-05, "loss": 3.1616, "step": 79600 }, { "epoch": 8.587601078167117, "grad_norm": 0.8099145293235779, "learning_rate": 8.502968159740959e-05, "loss": 3.1743, "step": 79650 }, { "epoch": 8.59299191374663, "grad_norm": 0.8237737417221069, "learning_rate": 8.470588235294117e-05, "loss": 3.173, "step": 79700 }, { "epoch": 8.598382749326145, "grad_norm": 0.8273354768753052, "learning_rate": 8.438208310847275e-05, "loss": 3.1819, "step": 79750 }, { "epoch": 8.60377358490566, "grad_norm": 0.8201135396957397, "learning_rate": 8.405828386400431e-05, "loss": 3.1865, "step": 79800 }, { "epoch": 8.609164420485175, "grad_norm": 0.835292637348175, "learning_rate": 8.373448461953587e-05, "loss": 3.165, "step": 79850 }, { "epoch": 8.61455525606469, "grad_norm": 0.8473345041275024, "learning_rate": 8.341068537506745e-05, "loss": 3.2078, "step": 79900 }, { "epoch": 8.619946091644206, "grad_norm": 0.8289116024971008, "learning_rate": 8.308688613059902e-05, "loss": 3.1844, "step": 79950 }, { "epoch": 8.625336927223719, "grad_norm": 0.8131082653999329, "learning_rate": 8.27630868861306e-05, "loss": 3.1973, "step": 80000 }, { "epoch": 8.625336927223719, "eval_accuracy": 0.39217618804598753, "eval_loss": 3.3202123641967773, "eval_runtime": 184.6219, "eval_samples_per_second": 97.556, "eval_steps_per_second": 6.099, "step": 80000 }, { "epoch": 8.630727762803234, "grad_norm": 0.8290890455245972, "learning_rate": 8.243928764166216e-05, "loss": 3.1902, "step": 80050 }, { "epoch": 8.63611859838275, "grad_norm": 0.8842519521713257, "learning_rate": 8.211548839719374e-05, "loss": 3.1727, "step": 80100 }, { "epoch": 8.641509433962264, "grad_norm": 0.8551476001739502, "learning_rate": 8.179816513761467e-05, "loss": 3.1815, "step": 80150 }, { "epoch": 8.64690026954178, "grad_norm": 0.8610032796859741, "learning_rate": 8.147436589314624e-05, "loss": 3.1982, "step": 80200 }, { "epoch": 8.652291105121293, "grad_norm": 0.8614827394485474, "learning_rate": 8.115056664867782e-05, "loss": 3.1849, "step": 80250 }, { "epoch": 8.657681940700808, "grad_norm": 0.8478200435638428, "learning_rate": 8.082676740420938e-05, "loss": 3.1728, "step": 80300 }, { "epoch": 8.663072776280323, "grad_norm": 0.8627998232841492, "learning_rate": 8.050296815974096e-05, "loss": 3.1857, "step": 80350 }, { "epoch": 8.668463611859838, "grad_norm": 0.8678106069564819, "learning_rate": 8.017916891527252e-05, "loss": 3.1834, "step": 80400 }, { "epoch": 8.673854447439354, "grad_norm": 0.8390142321586609, "learning_rate": 7.985536967080409e-05, "loss": 3.1827, "step": 80450 }, { "epoch": 8.679245283018869, "grad_norm": 0.8010692000389099, "learning_rate": 7.953157042633566e-05, "loss": 3.1803, "step": 80500 }, { "epoch": 8.684636118598382, "grad_norm": 0.9025766253471375, "learning_rate": 7.920777118186724e-05, "loss": 3.1869, "step": 80550 }, { "epoch": 8.690026954177897, "grad_norm": 0.83486008644104, "learning_rate": 7.888397193739879e-05, "loss": 3.1919, "step": 80600 }, { "epoch": 8.695417789757412, "grad_norm": 0.8118559122085571, "learning_rate": 7.856017269293037e-05, "loss": 3.1851, "step": 80650 }, { "epoch": 8.700808625336927, "grad_norm": 0.8715183734893799, "learning_rate": 7.823637344846195e-05, "loss": 3.1723, "step": 80700 }, { "epoch": 8.706199460916443, "grad_norm": 0.867709219455719, "learning_rate": 7.791257420399353e-05, "loss": 3.1792, "step": 80750 }, { "epoch": 8.711590296495956, "grad_norm": 0.8244636654853821, "learning_rate": 7.758877495952508e-05, "loss": 3.1918, "step": 80800 }, { "epoch": 8.716981132075471, "grad_norm": 0.8116188049316406, "learning_rate": 7.726497571505665e-05, "loss": 3.1863, "step": 80850 }, { "epoch": 8.722371967654986, "grad_norm": 0.8385442495346069, "learning_rate": 7.694117647058823e-05, "loss": 3.1733, "step": 80900 }, { "epoch": 8.727762803234501, "grad_norm": 0.8450207710266113, "learning_rate": 7.661737722611981e-05, "loss": 3.1752, "step": 80950 }, { "epoch": 8.733153638814017, "grad_norm": 0.855089545249939, "learning_rate": 7.629357798165137e-05, "loss": 3.1893, "step": 81000 }, { "epoch": 8.733153638814017, "eval_accuracy": 0.3925210523119107, "eval_loss": 3.317495346069336, "eval_runtime": 184.9457, "eval_samples_per_second": 97.385, "eval_steps_per_second": 6.088, "step": 81000 }, { "epoch": 8.738544474393532, "grad_norm": 0.841830849647522, "learning_rate": 7.596977873718294e-05, "loss": 3.1854, "step": 81050 }, { "epoch": 8.743935309973045, "grad_norm": 0.8838796019554138, "learning_rate": 7.564597949271451e-05, "loss": 3.1856, "step": 81100 }, { "epoch": 8.74932614555256, "grad_norm": 0.8529191613197327, "learning_rate": 7.532218024824608e-05, "loss": 3.1826, "step": 81150 }, { "epoch": 8.754716981132075, "grad_norm": 0.8259932398796082, "learning_rate": 7.499838100377764e-05, "loss": 3.1631, "step": 81200 }, { "epoch": 8.76010781671159, "grad_norm": 0.9050982594490051, "learning_rate": 7.467458175930922e-05, "loss": 3.1785, "step": 81250 }, { "epoch": 8.765498652291106, "grad_norm": 0.8278824687004089, "learning_rate": 7.435078251484078e-05, "loss": 3.1802, "step": 81300 }, { "epoch": 8.77088948787062, "grad_norm": 0.8377477526664734, "learning_rate": 7.402698327037236e-05, "loss": 3.1874, "step": 81350 }, { "epoch": 8.776280323450134, "grad_norm": 0.8223797678947449, "learning_rate": 7.370318402590393e-05, "loss": 3.1866, "step": 81400 }, { "epoch": 8.78167115902965, "grad_norm": 0.8468875288963318, "learning_rate": 7.33793847814355e-05, "loss": 3.1894, "step": 81450 }, { "epoch": 8.787061994609164, "grad_norm": 0.8402231931686401, "learning_rate": 7.305558553696707e-05, "loss": 3.198, "step": 81500 }, { "epoch": 8.79245283018868, "grad_norm": 0.965043842792511, "learning_rate": 7.273178629249865e-05, "loss": 3.1848, "step": 81550 }, { "epoch": 8.797843665768195, "grad_norm": 0.8373719453811646, "learning_rate": 7.240798704803021e-05, "loss": 3.1881, "step": 81600 }, { "epoch": 8.80323450134771, "grad_norm": 0.859524130821228, "learning_rate": 7.208418780356179e-05, "loss": 3.1844, "step": 81650 }, { "epoch": 8.808625336927223, "grad_norm": 0.87333744764328, "learning_rate": 7.176038855909337e-05, "loss": 3.1909, "step": 81700 }, { "epoch": 8.814016172506738, "grad_norm": 0.8479647636413574, "learning_rate": 7.143658931462493e-05, "loss": 3.1834, "step": 81750 }, { "epoch": 8.819407008086253, "grad_norm": 0.8975802659988403, "learning_rate": 7.11127900701565e-05, "loss": 3.1805, "step": 81800 }, { "epoch": 8.824797843665769, "grad_norm": 0.8739588260650635, "learning_rate": 7.078899082568807e-05, "loss": 3.1803, "step": 81850 }, { "epoch": 8.830188679245284, "grad_norm": 0.8485525250434875, "learning_rate": 7.046519158121964e-05, "loss": 3.1731, "step": 81900 }, { "epoch": 8.835579514824797, "grad_norm": 0.8152867555618286, "learning_rate": 7.014139233675121e-05, "loss": 3.2022, "step": 81950 }, { "epoch": 8.840970350404312, "grad_norm": 0.8222323060035706, "learning_rate": 6.981759309228278e-05, "loss": 3.1843, "step": 82000 }, { "epoch": 8.840970350404312, "eval_accuracy": 0.3926712106028703, "eval_loss": 3.3136684894561768, "eval_runtime": 184.9068, "eval_samples_per_second": 97.406, "eval_steps_per_second": 6.09, "step": 82000 }, { "epoch": 8.846361185983827, "grad_norm": 0.8213887214660645, "learning_rate": 6.949379384781434e-05, "loss": 3.1873, "step": 82050 }, { "epoch": 8.851752021563343, "grad_norm": 0.8546909093856812, "learning_rate": 6.916999460334592e-05, "loss": 3.1781, "step": 82100 }, { "epoch": 8.857142857142858, "grad_norm": 0.8814563751220703, "learning_rate": 6.884619535887748e-05, "loss": 3.1857, "step": 82150 }, { "epoch": 8.862533692722373, "grad_norm": 0.8545170426368713, "learning_rate": 6.852239611440906e-05, "loss": 3.1847, "step": 82200 }, { "epoch": 8.867924528301886, "grad_norm": 0.8242815732955933, "learning_rate": 6.819859686994062e-05, "loss": 3.1869, "step": 82250 }, { "epoch": 8.873315363881401, "grad_norm": 0.8673837184906006, "learning_rate": 6.78747976254722e-05, "loss": 3.1932, "step": 82300 }, { "epoch": 8.878706199460916, "grad_norm": 0.8401963114738464, "learning_rate": 6.755747436589314e-05, "loss": 3.1965, "step": 82350 }, { "epoch": 8.884097035040432, "grad_norm": 0.8353580236434937, "learning_rate": 6.72336751214247e-05, "loss": 3.172, "step": 82400 }, { "epoch": 8.889487870619947, "grad_norm": 0.8120338320732117, "learning_rate": 6.690987587695628e-05, "loss": 3.1932, "step": 82450 }, { "epoch": 8.89487870619946, "grad_norm": 0.8129428029060364, "learning_rate": 6.658607663248785e-05, "loss": 3.1704, "step": 82500 }, { "epoch": 8.900269541778975, "grad_norm": 0.8249984979629517, "learning_rate": 6.626227738801942e-05, "loss": 3.1927, "step": 82550 }, { "epoch": 8.90566037735849, "grad_norm": 0.8213885426521301, "learning_rate": 6.593847814355099e-05, "loss": 3.1605, "step": 82600 }, { "epoch": 8.911051212938006, "grad_norm": 0.8929885029792786, "learning_rate": 6.561467889908257e-05, "loss": 3.1752, "step": 82650 }, { "epoch": 8.91644204851752, "grad_norm": 0.8327133655548096, "learning_rate": 6.529087965461413e-05, "loss": 3.1778, "step": 82700 }, { "epoch": 8.921832884097036, "grad_norm": 0.8178892731666565, "learning_rate": 6.496708041014571e-05, "loss": 3.1733, "step": 82750 }, { "epoch": 8.92722371967655, "grad_norm": 0.8787632584571838, "learning_rate": 6.464328116567727e-05, "loss": 3.1881, "step": 82800 }, { "epoch": 8.932614555256064, "grad_norm": 0.8857139945030212, "learning_rate": 6.431948192120885e-05, "loss": 3.1853, "step": 82850 }, { "epoch": 8.93800539083558, "grad_norm": 0.8443358540534973, "learning_rate": 6.399568267674041e-05, "loss": 3.1897, "step": 82900 }, { "epoch": 8.943396226415095, "grad_norm": 0.7980090379714966, "learning_rate": 6.367188343227199e-05, "loss": 3.1898, "step": 82950 }, { "epoch": 8.94878706199461, "grad_norm": 0.9037806391716003, "learning_rate": 6.334808418780356e-05, "loss": 3.1679, "step": 83000 }, { "epoch": 8.94878706199461, "eval_accuracy": 0.3930123806706078, "eval_loss": 3.31054425239563, "eval_runtime": 184.7094, "eval_samples_per_second": 97.51, "eval_steps_per_second": 6.096, "step": 83000 }, { "epoch": 8.954177897574123, "grad_norm": 0.8617427945137024, "learning_rate": 6.302428494333512e-05, "loss": 3.1946, "step": 83050 }, { "epoch": 8.959568733153638, "grad_norm": 0.8506762385368347, "learning_rate": 6.27004856988667e-05, "loss": 3.181, "step": 83100 }, { "epoch": 8.964959568733153, "grad_norm": 0.8383913040161133, "learning_rate": 6.237668645439826e-05, "loss": 3.1833, "step": 83150 }, { "epoch": 8.970350404312669, "grad_norm": 0.8611315488815308, "learning_rate": 6.205288720992984e-05, "loss": 3.1827, "step": 83200 }, { "epoch": 8.975741239892184, "grad_norm": 0.8510012626647949, "learning_rate": 6.17290879654614e-05, "loss": 3.1884, "step": 83250 }, { "epoch": 8.981132075471699, "grad_norm": 0.826678991317749, "learning_rate": 6.140528872099298e-05, "loss": 3.1951, "step": 83300 }, { "epoch": 8.986522911051212, "grad_norm": 0.8059219121932983, "learning_rate": 6.108148947652455e-05, "loss": 3.1925, "step": 83350 }, { "epoch": 8.991913746630727, "grad_norm": 0.8477720618247986, "learning_rate": 6.075769023205612e-05, "loss": 3.1788, "step": 83400 }, { "epoch": 8.997304582210242, "grad_norm": 0.8244584798812866, "learning_rate": 6.043389098758769e-05, "loss": 3.1788, "step": 83450 }, { "epoch": 9.002695417789758, "grad_norm": 0.8307106494903564, "learning_rate": 6.0110091743119265e-05, "loss": 3.155, "step": 83500 }, { "epoch": 9.008086253369273, "grad_norm": 0.880568265914917, "learning_rate": 5.978629249865083e-05, "loss": 3.1189, "step": 83550 }, { "epoch": 9.013477088948788, "grad_norm": 0.8291214108467102, "learning_rate": 5.94624932541824e-05, "loss": 3.1419, "step": 83600 }, { "epoch": 9.018867924528301, "grad_norm": 0.814835250377655, "learning_rate": 5.913869400971397e-05, "loss": 3.1524, "step": 83650 }, { "epoch": 9.024258760107816, "grad_norm": 0.8376071453094482, "learning_rate": 5.881489476524554e-05, "loss": 3.1034, "step": 83700 }, { "epoch": 9.029649595687331, "grad_norm": 0.8205394744873047, "learning_rate": 5.849109552077711e-05, "loss": 3.1062, "step": 83750 }, { "epoch": 9.035040431266847, "grad_norm": 0.8897798657417297, "learning_rate": 5.8167296276308683e-05, "loss": 3.1204, "step": 83800 }, { "epoch": 9.040431266846362, "grad_norm": 0.8899309039115906, "learning_rate": 5.784349703184026e-05, "loss": 3.133, "step": 83850 }, { "epoch": 9.045822102425875, "grad_norm": 0.8886575698852539, "learning_rate": 5.7519697787371825e-05, "loss": 3.1176, "step": 83900 }, { "epoch": 9.05121293800539, "grad_norm": 0.8485926389694214, "learning_rate": 5.7195898542903396e-05, "loss": 3.102, "step": 83950 }, { "epoch": 9.056603773584905, "grad_norm": 0.8616915941238403, "learning_rate": 5.687209929843497e-05, "loss": 3.1293, "step": 84000 }, { "epoch": 9.056603773584905, "eval_accuracy": 0.3930330247192925, "eval_loss": 3.3129842281341553, "eval_runtime": 184.7976, "eval_samples_per_second": 97.463, "eval_steps_per_second": 6.093, "step": 84000 }, { "epoch": 9.06199460916442, "grad_norm": 0.8321478962898254, "learning_rate": 5.654830005396654e-05, "loss": 3.127, "step": 84050 }, { "epoch": 9.067385444743936, "grad_norm": 0.883215069770813, "learning_rate": 5.62245008094981e-05, "loss": 3.1293, "step": 84100 }, { "epoch": 9.07277628032345, "grad_norm": 0.8449257612228394, "learning_rate": 5.590070156502968e-05, "loss": 3.1404, "step": 84150 }, { "epoch": 9.078167115902964, "grad_norm": 0.8433477878570557, "learning_rate": 5.5576902320561244e-05, "loss": 3.117, "step": 84200 }, { "epoch": 9.08355795148248, "grad_norm": 0.8466358780860901, "learning_rate": 5.525310307609282e-05, "loss": 3.1382, "step": 84250 }, { "epoch": 9.088948787061994, "grad_norm": 0.8381649255752563, "learning_rate": 5.4929303831624386e-05, "loss": 3.1199, "step": 84300 }, { "epoch": 9.09433962264151, "grad_norm": 0.840488076210022, "learning_rate": 5.460550458715596e-05, "loss": 3.1297, "step": 84350 }, { "epoch": 9.099730458221025, "grad_norm": 0.8325525522232056, "learning_rate": 5.428170534268753e-05, "loss": 3.0981, "step": 84400 }, { "epoch": 9.10512129380054, "grad_norm": 0.8756667971611023, "learning_rate": 5.39579060982191e-05, "loss": 3.111, "step": 84450 }, { "epoch": 9.110512129380053, "grad_norm": 0.8472950458526611, "learning_rate": 5.363410685375067e-05, "loss": 3.1322, "step": 84500 }, { "epoch": 9.115902964959568, "grad_norm": 0.818108081817627, "learning_rate": 5.331030760928224e-05, "loss": 3.1101, "step": 84550 }, { "epoch": 9.121293800539084, "grad_norm": 0.8659102916717529, "learning_rate": 5.2986508364813804e-05, "loss": 3.1253, "step": 84600 }, { "epoch": 9.126684636118599, "grad_norm": 0.8552064895629883, "learning_rate": 5.266270912034538e-05, "loss": 3.1252, "step": 84650 }, { "epoch": 9.132075471698114, "grad_norm": 0.8596740365028381, "learning_rate": 5.233890987587695e-05, "loss": 3.1311, "step": 84700 }, { "epoch": 9.137466307277627, "grad_norm": 0.8284486532211304, "learning_rate": 5.2015110631408524e-05, "loss": 3.1365, "step": 84750 }, { "epoch": 9.142857142857142, "grad_norm": 0.8481780886650085, "learning_rate": 5.1691311386940094e-05, "loss": 3.131, "step": 84800 }, { "epoch": 9.148247978436657, "grad_norm": 0.824164092540741, "learning_rate": 5.136751214247166e-05, "loss": 3.1193, "step": 84850 }, { "epoch": 9.153638814016173, "grad_norm": 0.851373016834259, "learning_rate": 5.1050188882892604e-05, "loss": 3.1248, "step": 84900 }, { "epoch": 9.159029649595688, "grad_norm": 0.8321095705032349, "learning_rate": 5.072638963842417e-05, "loss": 3.1121, "step": 84950 }, { "epoch": 9.164420485175203, "grad_norm": 0.7981374263763428, "learning_rate": 5.0402590393955746e-05, "loss": 3.1214, "step": 85000 }, { "epoch": 9.164420485175203, "eval_accuracy": 0.39314189491288265, "eval_loss": 3.31256365776062, "eval_runtime": 184.9292, "eval_samples_per_second": 97.394, "eval_steps_per_second": 6.089, "step": 85000 }, { "epoch": 9.169811320754716, "grad_norm": 0.8480749726295471, "learning_rate": 5.007879114948731e-05, "loss": 3.1318, "step": 85050 }, { "epoch": 9.175202156334231, "grad_norm": 0.8506357073783875, "learning_rate": 4.975499190501889e-05, "loss": 3.1311, "step": 85100 }, { "epoch": 9.180592991913747, "grad_norm": 0.864138662815094, "learning_rate": 4.943119266055045e-05, "loss": 3.1381, "step": 85150 }, { "epoch": 9.185983827493262, "grad_norm": 0.853118360042572, "learning_rate": 4.910739341608202e-05, "loss": 3.1358, "step": 85200 }, { "epoch": 9.191374663072777, "grad_norm": 0.8572016358375549, "learning_rate": 4.87835941716136e-05, "loss": 3.1298, "step": 85250 }, { "epoch": 9.19676549865229, "grad_norm": 0.8740936517715454, "learning_rate": 4.8459794927145164e-05, "loss": 3.1435, "step": 85300 }, { "epoch": 9.202156334231805, "grad_norm": 0.8369245529174805, "learning_rate": 4.813599568267674e-05, "loss": 3.1389, "step": 85350 }, { "epoch": 9.20754716981132, "grad_norm": 0.8236960768699646, "learning_rate": 4.7812196438208306e-05, "loss": 3.123, "step": 85400 }, { "epoch": 9.212938005390836, "grad_norm": 0.8511966466903687, "learning_rate": 4.748839719373988e-05, "loss": 3.124, "step": 85450 }, { "epoch": 9.21832884097035, "grad_norm": 0.8575359582901001, "learning_rate": 4.716459794927145e-05, "loss": 3.1427, "step": 85500 }, { "epoch": 9.223719676549866, "grad_norm": 0.8404532670974731, "learning_rate": 4.684079870480302e-05, "loss": 3.1437, "step": 85550 }, { "epoch": 9.22911051212938, "grad_norm": 0.9211980104446411, "learning_rate": 4.651699946033459e-05, "loss": 3.1337, "step": 85600 }, { "epoch": 9.234501347708894, "grad_norm": 0.820880115032196, "learning_rate": 4.619320021586616e-05, "loss": 3.1308, "step": 85650 }, { "epoch": 9.23989218328841, "grad_norm": 0.8520514369010925, "learning_rate": 4.5869400971397725e-05, "loss": 3.1308, "step": 85700 }, { "epoch": 9.245283018867925, "grad_norm": 0.8518171906471252, "learning_rate": 4.55456017269293e-05, "loss": 3.1324, "step": 85750 }, { "epoch": 9.25067385444744, "grad_norm": 0.8360389471054077, "learning_rate": 4.5221802482460866e-05, "loss": 3.1236, "step": 85800 }, { "epoch": 9.256064690026955, "grad_norm": 0.8417567610740662, "learning_rate": 4.4898003237992444e-05, "loss": 3.1152, "step": 85850 }, { "epoch": 9.261455525606468, "grad_norm": 0.8565623164176941, "learning_rate": 4.457420399352401e-05, "loss": 3.1323, "step": 85900 }, { "epoch": 9.266846361185983, "grad_norm": 0.8710352182388306, "learning_rate": 4.425040474905558e-05, "loss": 3.1216, "step": 85950 }, { "epoch": 9.272237196765499, "grad_norm": 0.8659188151359558, "learning_rate": 4.392660550458716e-05, "loss": 3.1466, "step": 86000 }, { "epoch": 9.272237196765499, "eval_accuracy": 0.3936142090162122, "eval_loss": 3.3102121353149414, "eval_runtime": 184.7159, "eval_samples_per_second": 97.507, "eval_steps_per_second": 6.096, "step": 86000 }, { "epoch": 9.277628032345014, "grad_norm": 0.8267226219177246, "learning_rate": 4.360280626011872e-05, "loss": 3.1328, "step": 86050 }, { "epoch": 9.283018867924529, "grad_norm": 0.848037838935852, "learning_rate": 4.32790070156503e-05, "loss": 3.1186, "step": 86100 }, { "epoch": 9.288409703504042, "grad_norm": 0.841865062713623, "learning_rate": 4.295520777118186e-05, "loss": 3.1225, "step": 86150 }, { "epoch": 9.293800539083557, "grad_norm": 0.8285828232765198, "learning_rate": 4.263788451160281e-05, "loss": 3.1311, "step": 86200 }, { "epoch": 9.299191374663073, "grad_norm": 0.8383690714836121, "learning_rate": 4.231408526713437e-05, "loss": 3.1298, "step": 86250 }, { "epoch": 9.304582210242588, "grad_norm": 0.8436566591262817, "learning_rate": 4.199028602266594e-05, "loss": 3.129, "step": 86300 }, { "epoch": 9.309973045822103, "grad_norm": 0.8680644631385803, "learning_rate": 4.1666486778197514e-05, "loss": 3.1218, "step": 86350 }, { "epoch": 9.315363881401618, "grad_norm": 0.8822382092475891, "learning_rate": 4.1342687533729085e-05, "loss": 3.1494, "step": 86400 }, { "epoch": 9.320754716981131, "grad_norm": 0.8971915245056152, "learning_rate": 4.101888828926065e-05, "loss": 3.1238, "step": 86450 }, { "epoch": 9.326145552560646, "grad_norm": 0.8807439208030701, "learning_rate": 4.0695089044792226e-05, "loss": 3.1397, "step": 86500 }, { "epoch": 9.331536388140162, "grad_norm": 0.8657439351081848, "learning_rate": 4.037128980032379e-05, "loss": 3.1451, "step": 86550 }, { "epoch": 9.336927223719677, "grad_norm": 0.8694212436676025, "learning_rate": 4.004749055585537e-05, "loss": 3.1326, "step": 86600 }, { "epoch": 9.342318059299192, "grad_norm": 0.868144690990448, "learning_rate": 3.972369131138694e-05, "loss": 3.1345, "step": 86650 }, { "epoch": 9.347708894878707, "grad_norm": 0.9292327761650085, "learning_rate": 3.93998920669185e-05, "loss": 3.1448, "step": 86700 }, { "epoch": 9.35309973045822, "grad_norm": 0.8244055509567261, "learning_rate": 3.907609282245008e-05, "loss": 3.1568, "step": 86750 }, { "epoch": 9.358490566037736, "grad_norm": 0.8296806216239929, "learning_rate": 3.8752293577981645e-05, "loss": 3.1391, "step": 86800 }, { "epoch": 9.36388140161725, "grad_norm": 0.8584550023078918, "learning_rate": 3.842849433351322e-05, "loss": 3.1285, "step": 86850 }, { "epoch": 9.369272237196766, "grad_norm": 0.936661422252655, "learning_rate": 3.810469508904479e-05, "loss": 3.1412, "step": 86900 }, { "epoch": 9.374663072776281, "grad_norm": 0.8722965717315674, "learning_rate": 3.7780895844576364e-05, "loss": 3.1186, "step": 86950 }, { "epoch": 9.380053908355794, "grad_norm": 0.8549427390098572, "learning_rate": 3.745709660010793e-05, "loss": 3.1247, "step": 87000 }, { "epoch": 9.380053908355794, "eval_accuracy": 0.3936190983961639, "eval_loss": 3.3095529079437256, "eval_runtime": 184.7197, "eval_samples_per_second": 97.504, "eval_steps_per_second": 6.096, "step": 87000 }, { "epoch": 9.38544474393531, "grad_norm": 0.8127095699310303, "learning_rate": 3.71332973556395e-05, "loss": 3.1343, "step": 87050 }, { "epoch": 9.390835579514825, "grad_norm": 0.8294982314109802, "learning_rate": 3.680949811117107e-05, "loss": 3.1216, "step": 87100 }, { "epoch": 9.39622641509434, "grad_norm": 0.8491564393043518, "learning_rate": 3.648569886670264e-05, "loss": 3.1247, "step": 87150 }, { "epoch": 9.401617250673855, "grad_norm": 0.8749399781227112, "learning_rate": 3.616189962223421e-05, "loss": 3.1505, "step": 87200 }, { "epoch": 9.40700808625337, "grad_norm": 0.8547865152359009, "learning_rate": 3.583810037776578e-05, "loss": 3.1383, "step": 87250 }, { "epoch": 9.412398921832883, "grad_norm": 0.8318867683410645, "learning_rate": 3.5514301133297354e-05, "loss": 3.1398, "step": 87300 }, { "epoch": 9.417789757412399, "grad_norm": 0.8504889607429504, "learning_rate": 3.5190501888828925e-05, "loss": 3.1407, "step": 87350 }, { "epoch": 9.423180592991914, "grad_norm": 0.8165703415870667, "learning_rate": 3.4866702644360496e-05, "loss": 3.1343, "step": 87400 }, { "epoch": 9.428571428571429, "grad_norm": 0.8537405729293823, "learning_rate": 3.4542903399892067e-05, "loss": 3.142, "step": 87450 }, { "epoch": 9.433962264150944, "grad_norm": 0.8769167065620422, "learning_rate": 3.421910415542363e-05, "loss": 3.1425, "step": 87500 }, { "epoch": 9.439353099730457, "grad_norm": 0.815278172492981, "learning_rate": 3.38953049109552e-05, "loss": 3.1374, "step": 87550 }, { "epoch": 9.444743935309972, "grad_norm": 0.8766458630561829, "learning_rate": 3.357150566648677e-05, "loss": 3.1123, "step": 87600 }, { "epoch": 9.450134770889488, "grad_norm": 0.8410953283309937, "learning_rate": 3.324770642201834e-05, "loss": 3.1393, "step": 87650 }, { "epoch": 9.455525606469003, "grad_norm": 0.8328889012336731, "learning_rate": 3.2923907177549914e-05, "loss": 3.134, "step": 87700 }, { "epoch": 9.460916442048518, "grad_norm": 0.9297603368759155, "learning_rate": 3.2600107933081485e-05, "loss": 3.1313, "step": 87750 }, { "epoch": 9.466307277628033, "grad_norm": 0.8465941548347473, "learning_rate": 3.2276308688613056e-05, "loss": 3.1189, "step": 87800 }, { "epoch": 9.471698113207546, "grad_norm": 0.8548228144645691, "learning_rate": 3.195250944414463e-05, "loss": 3.1317, "step": 87850 }, { "epoch": 9.477088948787062, "grad_norm": 0.8719295263290405, "learning_rate": 3.16287101996762e-05, "loss": 3.119, "step": 87900 }, { "epoch": 9.482479784366577, "grad_norm": 0.8304482102394104, "learning_rate": 3.130491095520777e-05, "loss": 3.1184, "step": 87950 }, { "epoch": 9.487870619946092, "grad_norm": 0.8402472734451294, "learning_rate": 3.098111171073934e-05, "loss": 3.132, "step": 88000 }, { "epoch": 9.487870619946092, "eval_accuracy": 0.3939127871519264, "eval_loss": 3.3066344261169434, "eval_runtime": 185.0557, "eval_samples_per_second": 97.327, "eval_steps_per_second": 6.085, "step": 88000 }, { "epoch": 9.493261455525607, "grad_norm": 0.8302611112594604, "learning_rate": 3.065731246627091e-05, "loss": 3.1315, "step": 88050 }, { "epoch": 9.498652291105122, "grad_norm": 0.8773887753486633, "learning_rate": 3.033351322180248e-05, "loss": 3.1278, "step": 88100 }, { "epoch": 9.504043126684635, "grad_norm": 0.8220431804656982, "learning_rate": 3.000971397733405e-05, "loss": 3.1538, "step": 88150 }, { "epoch": 9.50943396226415, "grad_norm": 0.8599470257759094, "learning_rate": 2.968591473286562e-05, "loss": 3.1354, "step": 88200 }, { "epoch": 9.514824797843666, "grad_norm": 0.8439090251922607, "learning_rate": 2.936211548839719e-05, "loss": 3.1212, "step": 88250 }, { "epoch": 9.520215633423181, "grad_norm": 0.9031540751457214, "learning_rate": 2.903831624392876e-05, "loss": 3.141, "step": 88300 }, { "epoch": 9.525606469002696, "grad_norm": 0.8797536492347717, "learning_rate": 2.8714516999460332e-05, "loss": 3.1318, "step": 88350 }, { "epoch": 9.530997304582211, "grad_norm": 0.8907245397567749, "learning_rate": 2.83907177549919e-05, "loss": 3.1219, "step": 88400 }, { "epoch": 9.536388140161725, "grad_norm": 0.8620273470878601, "learning_rate": 2.806691851052347e-05, "loss": 3.1271, "step": 88450 }, { "epoch": 9.54177897574124, "grad_norm": 0.8356373310089111, "learning_rate": 2.774311926605504e-05, "loss": 3.1419, "step": 88500 }, { "epoch": 9.547169811320755, "grad_norm": 0.8548893928527832, "learning_rate": 2.7419320021586613e-05, "loss": 3.1212, "step": 88550 }, { "epoch": 9.55256064690027, "grad_norm": 0.8545761704444885, "learning_rate": 2.7095520777118187e-05, "loss": 3.1353, "step": 88600 }, { "epoch": 9.557951482479785, "grad_norm": 0.8439458608627319, "learning_rate": 2.6771721532649758e-05, "loss": 3.1304, "step": 88650 }, { "epoch": 9.563342318059298, "grad_norm": 0.8117093443870544, "learning_rate": 2.6447922288181325e-05, "loss": 3.1321, "step": 88700 }, { "epoch": 9.568733153638814, "grad_norm": 0.8481546640396118, "learning_rate": 2.6124123043712896e-05, "loss": 3.1397, "step": 88750 }, { "epoch": 9.574123989218329, "grad_norm": 0.8625066876411438, "learning_rate": 2.5800323799244467e-05, "loss": 3.1234, "step": 88800 }, { "epoch": 9.579514824797844, "grad_norm": 0.8870933651924133, "learning_rate": 2.5476524554776038e-05, "loss": 3.132, "step": 88850 }, { "epoch": 9.584905660377359, "grad_norm": 0.8924064636230469, "learning_rate": 2.515272531030761e-05, "loss": 3.156, "step": 88900 }, { "epoch": 9.590296495956874, "grad_norm": 0.912037193775177, "learning_rate": 2.4828926065839176e-05, "loss": 3.1337, "step": 88950 }, { "epoch": 9.595687331536388, "grad_norm": 0.8235248923301697, "learning_rate": 2.4505126821370747e-05, "loss": 3.1121, "step": 89000 }, { "epoch": 9.595687331536388, "eval_accuracy": 0.3942704724586117, "eval_loss": 3.304278612136841, "eval_runtime": 184.915, "eval_samples_per_second": 97.402, "eval_steps_per_second": 6.089, "step": 89000 }, { "epoch": 9.601078167115903, "grad_norm": 0.8647744655609131, "learning_rate": 2.4181327576902318e-05, "loss": 3.1443, "step": 89050 }, { "epoch": 9.606469002695418, "grad_norm": 0.879808783531189, "learning_rate": 2.385752833243389e-05, "loss": 3.1359, "step": 89100 }, { "epoch": 9.611859838274933, "grad_norm": 0.891708254814148, "learning_rate": 2.353372908796546e-05, "loss": 3.1443, "step": 89150 }, { "epoch": 9.617250673854448, "grad_norm": 0.9006338119506836, "learning_rate": 2.3209929843497027e-05, "loss": 3.1291, "step": 89200 }, { "epoch": 9.622641509433961, "grad_norm": 0.854828417301178, "learning_rate": 2.2886130599028598e-05, "loss": 3.135, "step": 89250 }, { "epoch": 9.628032345013477, "grad_norm": 0.8300639986991882, "learning_rate": 2.256233135456017e-05, "loss": 3.113, "step": 89300 }, { "epoch": 9.633423180592992, "grad_norm": 0.815534234046936, "learning_rate": 2.2238532110091743e-05, "loss": 3.1478, "step": 89350 }, { "epoch": 9.638814016172507, "grad_norm": 0.8253652453422546, "learning_rate": 2.1914732865623314e-05, "loss": 3.1443, "step": 89400 }, { "epoch": 9.644204851752022, "grad_norm": 0.8685674667358398, "learning_rate": 2.1590933621154885e-05, "loss": 3.1428, "step": 89450 }, { "epoch": 9.649595687331537, "grad_norm": 0.8638211488723755, "learning_rate": 2.1267134376686453e-05, "loss": 3.1376, "step": 89500 }, { "epoch": 9.65498652291105, "grad_norm": 0.8781023621559143, "learning_rate": 2.0943335132218024e-05, "loss": 3.1339, "step": 89550 }, { "epoch": 9.660377358490566, "grad_norm": 0.8183006644248962, "learning_rate": 2.0619535887749594e-05, "loss": 3.1411, "step": 89600 }, { "epoch": 9.66576819407008, "grad_norm": 0.8766468167304993, "learning_rate": 2.0295736643281165e-05, "loss": 3.1244, "step": 89650 }, { "epoch": 9.671159029649596, "grad_norm": 0.8593804240226746, "learning_rate": 1.9971937398812736e-05, "loss": 3.1434, "step": 89700 }, { "epoch": 9.676549865229111, "grad_norm": 0.8419660329818726, "learning_rate": 1.9648138154344304e-05, "loss": 3.1206, "step": 89750 }, { "epoch": 9.681940700808624, "grad_norm": 0.8620070219039917, "learning_rate": 1.9324338909875875e-05, "loss": 3.1244, "step": 89800 }, { "epoch": 9.68733153638814, "grad_norm": 0.8452453017234802, "learning_rate": 1.9000539665407446e-05, "loss": 3.1357, "step": 89850 }, { "epoch": 9.692722371967655, "grad_norm": 0.8461737036705017, "learning_rate": 1.8676740420939016e-05, "loss": 3.1241, "step": 89900 }, { "epoch": 9.69811320754717, "grad_norm": 0.9039965867996216, "learning_rate": 1.8352941176470587e-05, "loss": 3.1415, "step": 89950 }, { "epoch": 9.703504043126685, "grad_norm": 0.8599136471748352, "learning_rate": 1.8029141932002158e-05, "loss": 3.1384, "step": 90000 }, { "epoch": 9.703504043126685, "eval_accuracy": 0.3945016858038809, "eval_loss": 3.3018898963928223, "eval_runtime": 184.7082, "eval_samples_per_second": 97.511, "eval_steps_per_second": 6.096, "step": 90000 }, { "epoch": 9.7088948787062, "grad_norm": 0.8381307721138, "learning_rate": 1.770534268753373e-05, "loss": 3.1231, "step": 90050 }, { "epoch": 9.714285714285714, "grad_norm": 0.8417419791221619, "learning_rate": 1.7381543443065297e-05, "loss": 3.1217, "step": 90100 }, { "epoch": 9.719676549865229, "grad_norm": 0.8387212753295898, "learning_rate": 1.7057744198596867e-05, "loss": 3.1338, "step": 90150 }, { "epoch": 9.725067385444744, "grad_norm": 0.8630876541137695, "learning_rate": 1.673394495412844e-05, "loss": 3.1339, "step": 90200 }, { "epoch": 9.730458221024259, "grad_norm": 0.836463987827301, "learning_rate": 1.641014570966001e-05, "loss": 3.1234, "step": 90250 }, { "epoch": 9.735849056603774, "grad_norm": 0.872067928314209, "learning_rate": 1.608634646519158e-05, "loss": 3.1269, "step": 90300 }, { "epoch": 9.74123989218329, "grad_norm": 0.8544020056724548, "learning_rate": 1.576254722072315e-05, "loss": 3.1358, "step": 90350 }, { "epoch": 9.746630727762803, "grad_norm": 0.8575515747070312, "learning_rate": 1.5438747976254722e-05, "loss": 3.1237, "step": 90400 }, { "epoch": 9.752021563342318, "grad_norm": 0.849241316318512, "learning_rate": 1.5114948731786291e-05, "loss": 3.1376, "step": 90450 }, { "epoch": 9.757412398921833, "grad_norm": 0.8438499569892883, "learning_rate": 1.4791149487317862e-05, "loss": 3.1372, "step": 90500 }, { "epoch": 9.762803234501348, "grad_norm": 0.9010724425315857, "learning_rate": 1.4467350242849431e-05, "loss": 3.1429, "step": 90550 }, { "epoch": 9.768194070080863, "grad_norm": 0.8614817261695862, "learning_rate": 1.4143550998381002e-05, "loss": 3.1199, "step": 90600 }, { "epoch": 9.773584905660378, "grad_norm": 0.8398626446723938, "learning_rate": 1.3819751753912573e-05, "loss": 3.1367, "step": 90650 }, { "epoch": 9.778975741239892, "grad_norm": 0.8385025262832642, "learning_rate": 1.3495952509444142e-05, "loss": 3.1227, "step": 90700 }, { "epoch": 9.784366576819407, "grad_norm": 0.8450053930282593, "learning_rate": 1.3172153264975715e-05, "loss": 3.1295, "step": 90750 }, { "epoch": 9.789757412398922, "grad_norm": 0.852215588092804, "learning_rate": 1.2848354020507286e-05, "loss": 3.1309, "step": 90800 }, { "epoch": 9.795148247978437, "grad_norm": 0.8445461392402649, "learning_rate": 1.2524554776038855e-05, "loss": 3.1305, "step": 90850 }, { "epoch": 9.800539083557952, "grad_norm": 0.8316650986671448, "learning_rate": 1.2200755531570426e-05, "loss": 3.1438, "step": 90900 }, { "epoch": 9.805929919137466, "grad_norm": 0.8489454984664917, "learning_rate": 1.1876956287101995e-05, "loss": 3.1357, "step": 90950 }, { "epoch": 9.81132075471698, "grad_norm": 0.8464455008506775, "learning_rate": 1.1553157042633566e-05, "loss": 3.1282, "step": 91000 }, { "epoch": 9.81132075471698, "eval_accuracy": 0.39473615873578455, "eval_loss": 3.2998392581939697, "eval_runtime": 184.8391, "eval_samples_per_second": 97.441, "eval_steps_per_second": 6.092, "step": 91000 }, { "epoch": 9.816711590296496, "grad_norm": 0.8439268469810486, "learning_rate": 1.1229357798165137e-05, "loss": 3.1357, "step": 91050 }, { "epoch": 9.822102425876011, "grad_norm": 0.8137937784194946, "learning_rate": 1.0905558553696706e-05, "loss": 3.119, "step": 91100 }, { "epoch": 9.827493261455526, "grad_norm": 0.8356099128723145, "learning_rate": 1.0581759309228278e-05, "loss": 3.1498, "step": 91150 }, { "epoch": 9.832884097035041, "grad_norm": 0.8601353168487549, "learning_rate": 1.025796006475985e-05, "loss": 3.1239, "step": 91200 }, { "epoch": 9.838274932614555, "grad_norm": 0.8289896249771118, "learning_rate": 9.934160820291419e-06, "loss": 3.1346, "step": 91250 }, { "epoch": 9.84366576819407, "grad_norm": 0.8336220383644104, "learning_rate": 9.61036157582299e-06, "loss": 3.134, "step": 91300 }, { "epoch": 9.849056603773585, "grad_norm": 0.8082159161567688, "learning_rate": 9.286562331354559e-06, "loss": 3.1424, "step": 91350 }, { "epoch": 9.8544474393531, "grad_norm": 0.8767507076263428, "learning_rate": 8.96276308688613e-06, "loss": 3.1319, "step": 91400 }, { "epoch": 9.859838274932615, "grad_norm": 0.8523329496383667, "learning_rate": 8.6389638424177e-06, "loss": 3.1263, "step": 91450 }, { "epoch": 9.865229110512129, "grad_norm": 0.8412721157073975, "learning_rate": 8.315164597949271e-06, "loss": 3.1274, "step": 91500 }, { "epoch": 9.870619946091644, "grad_norm": 0.8239290118217468, "learning_rate": 7.99136535348084e-06, "loss": 3.1269, "step": 91550 }, { "epoch": 9.876010781671159, "grad_norm": 0.8275209665298462, "learning_rate": 7.667566109012411e-06, "loss": 3.1202, "step": 91600 }, { "epoch": 9.881401617250674, "grad_norm": 0.8666613101959229, "learning_rate": 7.343766864543982e-06, "loss": 3.1307, "step": 91650 }, { "epoch": 9.88679245283019, "grad_norm": 0.8908191323280334, "learning_rate": 7.019967620075552e-06, "loss": 3.1524, "step": 91700 }, { "epoch": 9.892183288409704, "grad_norm": 0.8692740797996521, "learning_rate": 6.702644360496492e-06, "loss": 3.1283, "step": 91750 }, { "epoch": 9.897574123989218, "grad_norm": 0.8492558002471924, "learning_rate": 6.378845116028062e-06, "loss": 3.1457, "step": 91800 }, { "epoch": 9.902964959568733, "grad_norm": 0.8426559567451477, "learning_rate": 6.055045871559633e-06, "loss": 3.1189, "step": 91850 }, { "epoch": 9.908355795148248, "grad_norm": 0.8520486354827881, "learning_rate": 5.731246627091203e-06, "loss": 3.1194, "step": 91900 }, { "epoch": 9.913746630727763, "grad_norm": 0.8753141164779663, "learning_rate": 5.407447382622774e-06, "loss": 3.1268, "step": 91950 }, { "epoch": 9.919137466307278, "grad_norm": 0.8538247346878052, "learning_rate": 5.083648138154344e-06, "loss": 3.1198, "step": 92000 }, { "epoch": 9.919137466307278, "eval_accuracy": 0.3948759950024018, "eval_loss": 3.298466920852661, "eval_runtime": 185.0411, "eval_samples_per_second": 97.335, "eval_steps_per_second": 6.085, "step": 92000 }, { "epoch": 9.924528301886792, "grad_norm": 0.8613267540931702, "learning_rate": 4.7598488936859145e-06, "loss": 3.1107, "step": 92050 }, { "epoch": 9.929919137466307, "grad_norm": 0.8902148604393005, "learning_rate": 4.4360496492174846e-06, "loss": 3.1319, "step": 92100 }, { "epoch": 9.935309973045822, "grad_norm": 0.9252253770828247, "learning_rate": 4.112250404749055e-06, "loss": 3.1135, "step": 92150 }, { "epoch": 9.940700808625337, "grad_norm": 0.8792473673820496, "learning_rate": 3.788451160280626e-06, "loss": 3.1454, "step": 92200 }, { "epoch": 9.946091644204852, "grad_norm": 0.8873490691184998, "learning_rate": 3.464651915812196e-06, "loss": 3.1286, "step": 92250 }, { "epoch": 9.951482479784367, "grad_norm": 0.8667479753494263, "learning_rate": 3.1408526713437664e-06, "loss": 3.1376, "step": 92300 }, { "epoch": 9.95687331536388, "grad_norm": 0.8077408075332642, "learning_rate": 2.817053426875337e-06, "loss": 3.1289, "step": 92350 }, { "epoch": 9.962264150943396, "grad_norm": 0.8791921734809875, "learning_rate": 2.4932541824069074e-06, "loss": 3.1286, "step": 92400 }, { "epoch": 9.967654986522911, "grad_norm": 0.9087523818016052, "learning_rate": 2.169454937938478e-06, "loss": 3.1201, "step": 92450 }, { "epoch": 9.973045822102426, "grad_norm": 0.828881561756134, "learning_rate": 1.8456556934700485e-06, "loss": 3.1279, "step": 92500 }, { "epoch": 9.978436657681941, "grad_norm": 0.8658260703086853, "learning_rate": 1.521856449001619e-06, "loss": 3.1078, "step": 92550 }, { "epoch": 9.983827493261456, "grad_norm": 0.8751649260520935, "learning_rate": 1.1980572045331894e-06, "loss": 3.1215, "step": 92600 }, { "epoch": 9.98921832884097, "grad_norm": 0.8642011284828186, "learning_rate": 8.742579600647598e-07, "loss": 3.1244, "step": 92650 }, { "epoch": 9.994609164420485, "grad_norm": 0.8841090798377991, "learning_rate": 5.504587155963303e-07, "loss": 3.1176, "step": 92700 }, { "epoch": 10.0, "grad_norm": 1.7963483333587646, "learning_rate": 2.2665947112790068e-07, "loss": 3.1182, "step": 92750 }, { "epoch": 10.0, "step": 92750, "total_flos": 7.75449427968e+17, "train_loss": 3.455072832380022, "train_runtime": 79721.2665, "train_samples_per_second": 37.227, "train_steps_per_second": 1.163 } ], "logging_steps": 50, "max_steps": 92750, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.75449427968e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }