{ "best_metric": 3.301514148712158, "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_495/checkpoint-90000", "epoch": 10.0, "eval_steps": 1000, "global_step": 92910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005381552039608223, "grad_norm": 1.2837527990341187, "learning_rate": 0.00028799999999999995, "loss": 8.8816, "step": 50 }, { "epoch": 0.010763104079216447, "grad_norm": 1.5149996280670166, "learning_rate": 0.000588, "loss": 6.9347, "step": 100 }, { "epoch": 0.01614465611882467, "grad_norm": 3.8635504245758057, "learning_rate": 0.000599689688611141, "loss": 6.5162, "step": 150 }, { "epoch": 0.021526208158432893, "grad_norm": 2.2935287952423096, "learning_rate": 0.0005993664475810796, "loss": 6.2792, "step": 200 }, { "epoch": 0.026907760198041114, "grad_norm": 0.673748254776001, "learning_rate": 0.0005990432065510182, "loss": 6.1014, "step": 250 }, { "epoch": 0.03228931223764934, "grad_norm": 1.4223030805587769, "learning_rate": 0.0005987199655209567, "loss": 5.9959, "step": 300 }, { "epoch": 0.03767086427725756, "grad_norm": 1.1047555208206177, "learning_rate": 0.0005983967244908953, "loss": 5.8796, "step": 350 }, { "epoch": 0.04305241631686579, "grad_norm": 1.4528274536132812, "learning_rate": 0.0005980734834608338, "loss": 5.7982, "step": 400 }, { "epoch": 0.048433968356474004, "grad_norm": 2.160820245742798, "learning_rate": 0.0005977502424307725, "loss": 5.7395, "step": 450 }, { "epoch": 0.05381552039608223, "grad_norm": 1.033282995223999, "learning_rate": 0.0005974270014007111, "loss": 5.681, "step": 500 }, { "epoch": 0.05919707243569045, "grad_norm": 1.3398876190185547, "learning_rate": 0.0005971037603706497, "loss": 5.6023, "step": 550 }, { "epoch": 0.06457862447529868, "grad_norm": 1.4797229766845703, "learning_rate": 0.0005967805193405882, "loss": 5.5184, "step": 600 }, { "epoch": 0.0699601765149069, "grad_norm": 1.2783616781234741, "learning_rate": 0.0005964572783105269, "loss": 5.44, "step": 650 }, { "epoch": 0.07534172855451512, "grad_norm": 1.311514973640442, "learning_rate": 0.0005961340372804654, "loss": 5.3904, "step": 700 }, { "epoch": 0.08072328059412334, "grad_norm": 1.440206527709961, "learning_rate": 0.000595810796250404, "loss": 5.325, "step": 750 }, { "epoch": 0.08610483263373157, "grad_norm": 1.1301391124725342, "learning_rate": 0.0005954875552203426, "loss": 5.2693, "step": 800 }, { "epoch": 0.09148638467333979, "grad_norm": 1.1538249254226685, "learning_rate": 0.0005951643141902811, "loss": 5.225, "step": 850 }, { "epoch": 0.09686793671294801, "grad_norm": 1.1800639629364014, "learning_rate": 0.0005948410731602198, "loss": 5.1712, "step": 900 }, { "epoch": 0.10224948875255624, "grad_norm": 1.4021689891815186, "learning_rate": 0.0005945178321301583, "loss": 5.1358, "step": 950 }, { "epoch": 0.10763104079216446, "grad_norm": 1.0665432214736938, "learning_rate": 0.000594194591100097, "loss": 5.1185, "step": 1000 }, { "epoch": 0.10763104079216446, "eval_accuracy": 0.22684452130308277, "eval_loss": 5.0283942222595215, "eval_runtime": 183.7619, "eval_samples_per_second": 98.013, "eval_steps_per_second": 6.127, "step": 1000 }, { "epoch": 0.11301259283177269, "grad_norm": 1.2405089139938354, "learning_rate": 0.0005938713500700355, "loss": 5.0458, "step": 1050 }, { "epoch": 0.1183941448713809, "grad_norm": 1.3682039976119995, "learning_rate": 0.000593548109039974, "loss": 5.0129, "step": 1100 }, { "epoch": 0.12377569691098914, "grad_norm": 0.9749146699905396, "learning_rate": 0.0005932248680099127, "loss": 5.0004, "step": 1150 }, { "epoch": 0.12915724895059735, "grad_norm": 1.3824317455291748, "learning_rate": 0.0005929016269798512, "loss": 4.9522, "step": 1200 }, { "epoch": 0.13453880099020557, "grad_norm": 1.1913814544677734, "learning_rate": 0.0005925783859497898, "loss": 4.9708, "step": 1250 }, { "epoch": 0.1399203530298138, "grad_norm": 1.5464386940002441, "learning_rate": 0.0005922551449197284, "loss": 4.9231, "step": 1300 }, { "epoch": 0.14530190506942203, "grad_norm": 0.9591012597084045, "learning_rate": 0.0005919319038896671, "loss": 4.8803, "step": 1350 }, { "epoch": 0.15068345710903025, "grad_norm": 0.8420179486274719, "learning_rate": 0.0005916086628596056, "loss": 4.8637, "step": 1400 }, { "epoch": 0.15606500914863847, "grad_norm": 0.9842941761016846, "learning_rate": 0.0005912854218295442, "loss": 4.8474, "step": 1450 }, { "epoch": 0.16144656118824668, "grad_norm": 0.8151698708534241, "learning_rate": 0.0005909621807994827, "loss": 4.7987, "step": 1500 }, { "epoch": 0.1668281132278549, "grad_norm": 0.9097293615341187, "learning_rate": 0.0005906389397694213, "loss": 4.7904, "step": 1550 }, { "epoch": 0.17220966526746315, "grad_norm": 1.1878386735916138, "learning_rate": 0.00059031569873936, "loss": 4.7364, "step": 1600 }, { "epoch": 0.17759121730707136, "grad_norm": 1.1684958934783936, "learning_rate": 0.0005899924577092985, "loss": 4.7345, "step": 1650 }, { "epoch": 0.18297276934667958, "grad_norm": 0.7626133561134338, "learning_rate": 0.0005896692166792371, "loss": 4.7318, "step": 1700 }, { "epoch": 0.1883543213862878, "grad_norm": 1.020124912261963, "learning_rate": 0.0005893459756491757, "loss": 4.6724, "step": 1750 }, { "epoch": 0.19373587342589602, "grad_norm": 0.8802762627601624, "learning_rate": 0.0005890227346191143, "loss": 4.6678, "step": 1800 }, { "epoch": 0.19911742546550426, "grad_norm": 0.9750682711601257, "learning_rate": 0.0005886994935890529, "loss": 4.6395, "step": 1850 }, { "epoch": 0.20449897750511248, "grad_norm": 1.1132116317749023, "learning_rate": 0.0005883762525589915, "loss": 4.6198, "step": 1900 }, { "epoch": 0.2098805295447207, "grad_norm": 0.8680851459503174, "learning_rate": 0.00058805301152893, "loss": 4.6078, "step": 1950 }, { "epoch": 0.2152620815843289, "grad_norm": 0.961650550365448, "learning_rate": 0.0005877297704988686, "loss": 4.5798, "step": 2000 }, { "epoch": 0.2152620815843289, "eval_accuracy": 0.27055231848421396, "eval_loss": 4.5154547691345215, "eval_runtime": 180.811, "eval_samples_per_second": 99.612, "eval_steps_per_second": 6.227, "step": 2000 }, { "epoch": 0.22064363362393713, "grad_norm": 0.765781581401825, "learning_rate": 0.0005874065294688072, "loss": 4.5771, "step": 2050 }, { "epoch": 0.22602518566354537, "grad_norm": 1.058321475982666, "learning_rate": 0.0005870832884387457, "loss": 4.5435, "step": 2100 }, { "epoch": 0.2314067377031536, "grad_norm": 0.7259834408760071, "learning_rate": 0.0005867600474086844, "loss": 4.5295, "step": 2150 }, { "epoch": 0.2367882897427618, "grad_norm": 0.9045870304107666, "learning_rate": 0.0005864368063786229, "loss": 4.5164, "step": 2200 }, { "epoch": 0.24216984178237003, "grad_norm": 0.695756196975708, "learning_rate": 0.0005861135653485616, "loss": 4.4846, "step": 2250 }, { "epoch": 0.24755139382197827, "grad_norm": 1.038356065750122, "learning_rate": 0.0005857903243185001, "loss": 4.4839, "step": 2300 }, { "epoch": 0.2529329458615865, "grad_norm": 0.8050019145011902, "learning_rate": 0.0005854670832884386, "loss": 4.4633, "step": 2350 }, { "epoch": 0.2583144979011947, "grad_norm": 0.9252821803092957, "learning_rate": 0.0005851438422583773, "loss": 4.4508, "step": 2400 }, { "epoch": 0.2636960499408029, "grad_norm": 0.9051347374916077, "learning_rate": 0.0005848206012283159, "loss": 4.4359, "step": 2450 }, { "epoch": 0.26907760198041114, "grad_norm": 0.890594482421875, "learning_rate": 0.0005844973601982545, "loss": 4.4382, "step": 2500 }, { "epoch": 0.27445915402001936, "grad_norm": 0.897269070148468, "learning_rate": 0.000584174119168193, "loss": 4.4075, "step": 2550 }, { "epoch": 0.2798407060596276, "grad_norm": 0.8940602540969849, "learning_rate": 0.0005838508781381316, "loss": 4.3954, "step": 2600 }, { "epoch": 0.2852222580992358, "grad_norm": 0.8267364501953125, "learning_rate": 0.0005835276371080702, "loss": 4.388, "step": 2650 }, { "epoch": 0.29060381013884407, "grad_norm": 0.6600859761238098, "learning_rate": 0.0005832043960780088, "loss": 4.3691, "step": 2700 }, { "epoch": 0.2959853621784523, "grad_norm": 0.8835673332214355, "learning_rate": 0.0005828811550479474, "loss": 4.3444, "step": 2750 }, { "epoch": 0.3013669142180605, "grad_norm": 0.6922082304954529, "learning_rate": 0.0005825579140178859, "loss": 4.3532, "step": 2800 }, { "epoch": 0.3067484662576687, "grad_norm": 0.7541009783744812, "learning_rate": 0.0005822346729878246, "loss": 4.3299, "step": 2850 }, { "epoch": 0.31213001829727693, "grad_norm": 0.7759166359901428, "learning_rate": 0.0005819114319577631, "loss": 4.3374, "step": 2900 }, { "epoch": 0.31751157033688515, "grad_norm": 1.1995506286621094, "learning_rate": 0.0005815881909277017, "loss": 4.3214, "step": 2950 }, { "epoch": 0.32289312237649337, "grad_norm": 0.7463189959526062, "learning_rate": 0.0005812649498976403, "loss": 4.3387, "step": 3000 }, { "epoch": 0.32289312237649337, "eval_accuracy": 0.29824424538993943, "eval_loss": 4.238636016845703, "eval_runtime": 181.3245, "eval_samples_per_second": 99.33, "eval_steps_per_second": 6.21, "step": 3000 }, { "epoch": 0.3282746744161016, "grad_norm": 0.7469370365142822, "learning_rate": 0.0005809417088675789, "loss": 4.293, "step": 3050 }, { "epoch": 0.3336562264557098, "grad_norm": 0.8371293544769287, "learning_rate": 0.0005806184678375175, "loss": 4.3008, "step": 3100 }, { "epoch": 0.3390377784953181, "grad_norm": 0.6738422513008118, "learning_rate": 0.000580295226807456, "loss": 4.2936, "step": 3150 }, { "epoch": 0.3444193305349263, "grad_norm": 0.8029556274414062, "learning_rate": 0.0005799719857773946, "loss": 4.2804, "step": 3200 }, { "epoch": 0.3498008825745345, "grad_norm": 0.7457656264305115, "learning_rate": 0.0005796487447473331, "loss": 4.2732, "step": 3250 }, { "epoch": 0.35518243461414273, "grad_norm": 0.699137270450592, "learning_rate": 0.0005793255037172718, "loss": 4.2464, "step": 3300 }, { "epoch": 0.36056398665375095, "grad_norm": 0.7366440892219543, "learning_rate": 0.0005790022626872104, "loss": 4.2612, "step": 3350 }, { "epoch": 0.36594553869335916, "grad_norm": 0.7930153012275696, "learning_rate": 0.000578679021657149, "loss": 4.2381, "step": 3400 }, { "epoch": 0.3713270907329674, "grad_norm": 0.6190389394760132, "learning_rate": 0.0005783557806270875, "loss": 4.218, "step": 3450 }, { "epoch": 0.3767086427725756, "grad_norm": 0.7235623598098755, "learning_rate": 0.0005780325395970262, "loss": 4.2333, "step": 3500 }, { "epoch": 0.3820901948121838, "grad_norm": 0.6598897576332092, "learning_rate": 0.0005777092985669647, "loss": 4.2311, "step": 3550 }, { "epoch": 0.38747174685179203, "grad_norm": 0.7026280760765076, "learning_rate": 0.0005773860575369033, "loss": 4.2126, "step": 3600 }, { "epoch": 0.3928532988914003, "grad_norm": 0.9066165685653687, "learning_rate": 0.0005770628165068419, "loss": 4.1823, "step": 3650 }, { "epoch": 0.3982348509310085, "grad_norm": 0.6496531963348389, "learning_rate": 0.0005767395754767804, "loss": 4.1756, "step": 3700 }, { "epoch": 0.40361640297061674, "grad_norm": 0.8745374083518982, "learning_rate": 0.0005764163344467191, "loss": 4.225, "step": 3750 }, { "epoch": 0.40899795501022496, "grad_norm": 0.7930644154548645, "learning_rate": 0.0005760930934166576, "loss": 4.1885, "step": 3800 }, { "epoch": 0.4143795070498332, "grad_norm": 0.781657338142395, "learning_rate": 0.0005757698523865963, "loss": 4.1779, "step": 3850 }, { "epoch": 0.4197610590894414, "grad_norm": 0.5834868550300598, "learning_rate": 0.0005754466113565348, "loss": 4.172, "step": 3900 }, { "epoch": 0.4251426111290496, "grad_norm": 0.6679921746253967, "learning_rate": 0.0005751233703264733, "loss": 4.1775, "step": 3950 }, { "epoch": 0.4305241631686578, "grad_norm": 0.6265010833740234, "learning_rate": 0.000574800129296412, "loss": 4.1699, "step": 4000 }, { "epoch": 0.4305241631686578, "eval_accuracy": 0.31248146789182213, "eval_loss": 4.09141206741333, "eval_runtime": 181.1997, "eval_samples_per_second": 99.399, "eval_steps_per_second": 6.214, "step": 4000 }, { "epoch": 0.43590571520826604, "grad_norm": 0.7085737586021423, "learning_rate": 0.0005744768882663505, "loss": 4.1584, "step": 4050 }, { "epoch": 0.44128726724787426, "grad_norm": 0.6993206143379211, "learning_rate": 0.0005741536472362891, "loss": 4.1495, "step": 4100 }, { "epoch": 0.44666881928748253, "grad_norm": 0.7425188422203064, "learning_rate": 0.0005738304062062277, "loss": 4.1402, "step": 4150 }, { "epoch": 0.45205037132709075, "grad_norm": 0.72447669506073, "learning_rate": 0.0005735071651761664, "loss": 4.145, "step": 4200 }, { "epoch": 0.45743192336669897, "grad_norm": 0.6373715996742249, "learning_rate": 0.0005731839241461049, "loss": 4.1479, "step": 4250 }, { "epoch": 0.4628134754063072, "grad_norm": 0.7162675261497498, "learning_rate": 0.0005728606831160435, "loss": 4.1218, "step": 4300 }, { "epoch": 0.4681950274459154, "grad_norm": 0.6584669351577759, "learning_rate": 0.000572537442085982, "loss": 4.1373, "step": 4350 }, { "epoch": 0.4735765794855236, "grad_norm": 0.7650042176246643, "learning_rate": 0.0005722142010559206, "loss": 4.1174, "step": 4400 }, { "epoch": 0.47895813152513184, "grad_norm": 0.6704964637756348, "learning_rate": 0.0005718909600258593, "loss": 4.1131, "step": 4450 }, { "epoch": 0.48433968356474005, "grad_norm": 0.6296616196632385, "learning_rate": 0.0005715677189957978, "loss": 4.1163, "step": 4500 }, { "epoch": 0.48972123560434827, "grad_norm": 0.6592406630516052, "learning_rate": 0.0005712444779657364, "loss": 4.1115, "step": 4550 }, { "epoch": 0.49510278764395654, "grad_norm": 0.6917868852615356, "learning_rate": 0.000570921236935675, "loss": 4.1011, "step": 4600 }, { "epoch": 0.5004843396835648, "grad_norm": 0.6076876521110535, "learning_rate": 0.0005705979959056136, "loss": 4.0946, "step": 4650 }, { "epoch": 0.505865891723173, "grad_norm": 0.7738022804260254, "learning_rate": 0.0005702747548755522, "loss": 4.0802, "step": 4700 }, { "epoch": 0.5112474437627812, "grad_norm": 0.566720187664032, "learning_rate": 0.0005699515138454908, "loss": 4.0647, "step": 4750 }, { "epoch": 0.5166289958023894, "grad_norm": 0.6442734003067017, "learning_rate": 0.0005696282728154293, "loss": 4.0752, "step": 4800 }, { "epoch": 0.5220105478419976, "grad_norm": 0.6389538049697876, "learning_rate": 0.0005693050317853679, "loss": 4.0793, "step": 4850 }, { "epoch": 0.5273920998816058, "grad_norm": 0.614249587059021, "learning_rate": 0.0005689817907553065, "loss": 4.0488, "step": 4900 }, { "epoch": 0.5327736519212141, "grad_norm": 0.6224474310874939, "learning_rate": 0.000568658549725245, "loss": 4.0715, "step": 4950 }, { "epoch": 0.5381552039608223, "grad_norm": 0.7582871317863464, "learning_rate": 0.0005683353086951837, "loss": 4.0481, "step": 5000 }, { "epoch": 0.5381552039608223, "eval_accuracy": 0.32132353124927676, "eval_loss": 3.9923758506774902, "eval_runtime": 181.276, "eval_samples_per_second": 99.357, "eval_steps_per_second": 6.212, "step": 5000 }, { "epoch": 0.5435367560004305, "grad_norm": 0.5739832520484924, "learning_rate": 0.0005680120676651222, "loss": 4.0529, "step": 5050 }, { "epoch": 0.5489183080400387, "grad_norm": 0.7246623635292053, "learning_rate": 0.0005676888266350609, "loss": 4.0543, "step": 5100 }, { "epoch": 0.5542998600796469, "grad_norm": 0.644861102104187, "learning_rate": 0.0005673655856049994, "loss": 4.0439, "step": 5150 }, { "epoch": 0.5596814121192552, "grad_norm": 0.7004746198654175, "learning_rate": 0.0005670423445749379, "loss": 4.0376, "step": 5200 }, { "epoch": 0.5650629641588634, "grad_norm": 0.632524311542511, "learning_rate": 0.0005667191035448766, "loss": 4.032, "step": 5250 }, { "epoch": 0.5704445161984716, "grad_norm": 0.5797837972640991, "learning_rate": 0.0005663958625148152, "loss": 4.0324, "step": 5300 }, { "epoch": 0.5758260682380799, "grad_norm": 0.6302703619003296, "learning_rate": 0.0005660726214847538, "loss": 4.0344, "step": 5350 }, { "epoch": 0.5812076202776881, "grad_norm": 0.6832019686698914, "learning_rate": 0.0005657493804546923, "loss": 4.0161, "step": 5400 }, { "epoch": 0.5865891723172963, "grad_norm": 0.5395603179931641, "learning_rate": 0.0005654261394246309, "loss": 4.0286, "step": 5450 }, { "epoch": 0.5919707243569046, "grad_norm": 0.6312322020530701, "learning_rate": 0.0005651028983945695, "loss": 4.0048, "step": 5500 }, { "epoch": 0.5973522763965128, "grad_norm": 0.6085729002952576, "learning_rate": 0.000564779657364508, "loss": 4.0161, "step": 5550 }, { "epoch": 0.602733828436121, "grad_norm": 0.6463218331336975, "learning_rate": 0.0005644564163344467, "loss": 3.9968, "step": 5600 }, { "epoch": 0.6081153804757292, "grad_norm": 0.6449352502822876, "learning_rate": 0.0005641331753043852, "loss": 4.0241, "step": 5650 }, { "epoch": 0.6134969325153374, "grad_norm": 0.6060164570808411, "learning_rate": 0.0005638099342743239, "loss": 3.9932, "step": 5700 }, { "epoch": 0.6188784845549457, "grad_norm": 0.6284982562065125, "learning_rate": 0.0005634866932442624, "loss": 4.0173, "step": 5750 }, { "epoch": 0.6242600365945539, "grad_norm": 0.6938678026199341, "learning_rate": 0.000563163452214201, "loss": 4.0105, "step": 5800 }, { "epoch": 0.6296415886341621, "grad_norm": 0.6271978616714478, "learning_rate": 0.0005628402111841396, "loss": 4.0062, "step": 5850 }, { "epoch": 0.6350231406737703, "grad_norm": 0.5207304358482361, "learning_rate": 0.0005625169701540782, "loss": 3.9621, "step": 5900 }, { "epoch": 0.6404046927133785, "grad_norm": 0.6998205780982971, "learning_rate": 0.0005621937291240168, "loss": 3.9976, "step": 5950 }, { "epoch": 0.6457862447529867, "grad_norm": 0.6199911236763, "learning_rate": 0.0005618704880939553, "loss": 4.0019, "step": 6000 }, { "epoch": 0.6457862447529867, "eval_accuracy": 0.32811672710121376, "eval_loss": 3.920245409011841, "eval_runtime": 181.1614, "eval_samples_per_second": 99.42, "eval_steps_per_second": 6.215, "step": 6000 }, { "epoch": 0.651167796792595, "grad_norm": 0.5323458313941956, "learning_rate": 0.0005615472470638939, "loss": 3.9823, "step": 6050 }, { "epoch": 0.6565493488322032, "grad_norm": 0.7789810299873352, "learning_rate": 0.0005612240060338325, "loss": 3.9929, "step": 6100 }, { "epoch": 0.6619309008718114, "grad_norm": 0.5490685701370239, "learning_rate": 0.0005609007650037711, "loss": 3.9736, "step": 6150 }, { "epoch": 0.6673124529114196, "grad_norm": 0.6046157479286194, "learning_rate": 0.0005605775239737097, "loss": 3.9811, "step": 6200 }, { "epoch": 0.6726940049510278, "grad_norm": 0.562800407409668, "learning_rate": 0.0005602542829436483, "loss": 3.9609, "step": 6250 }, { "epoch": 0.6780755569906362, "grad_norm": 0.6234575510025024, "learning_rate": 0.0005599310419135868, "loss": 3.9628, "step": 6300 }, { "epoch": 0.6834571090302444, "grad_norm": 0.660358190536499, "learning_rate": 0.0005596078008835255, "loss": 3.9779, "step": 6350 }, { "epoch": 0.6888386610698526, "grad_norm": 0.5322813391685486, "learning_rate": 0.000559284559853464, "loss": 3.9642, "step": 6400 }, { "epoch": 0.6942202131094608, "grad_norm": 0.6975204944610596, "learning_rate": 0.0005589613188234026, "loss": 3.9443, "step": 6450 }, { "epoch": 0.699601765149069, "grad_norm": 0.6204344630241394, "learning_rate": 0.0005586380777933412, "loss": 3.9501, "step": 6500 }, { "epoch": 0.7049833171886772, "grad_norm": 0.5647640228271484, "learning_rate": 0.0005583148367632797, "loss": 3.9594, "step": 6550 }, { "epoch": 0.7103648692282855, "grad_norm": 0.7194311022758484, "learning_rate": 0.0005579915957332184, "loss": 3.9391, "step": 6600 }, { "epoch": 0.7157464212678937, "grad_norm": 0.6102573275566101, "learning_rate": 0.0005576683547031569, "loss": 3.935, "step": 6650 }, { "epoch": 0.7211279733075019, "grad_norm": 0.6759886145591736, "learning_rate": 0.0005573451136730956, "loss": 3.9674, "step": 6700 }, { "epoch": 0.7265095253471101, "grad_norm": 0.6923319697380066, "learning_rate": 0.0005570218726430341, "loss": 3.9456, "step": 6750 }, { "epoch": 0.7318910773867183, "grad_norm": 0.592879056930542, "learning_rate": 0.0005566986316129728, "loss": 3.9607, "step": 6800 }, { "epoch": 0.7372726294263265, "grad_norm": 0.6049233675003052, "learning_rate": 0.0005563753905829113, "loss": 3.9329, "step": 6850 }, { "epoch": 0.7426541814659348, "grad_norm": 0.5757265090942383, "learning_rate": 0.0005560521495528498, "loss": 3.9367, "step": 6900 }, { "epoch": 0.748035733505543, "grad_norm": 0.5787569880485535, "learning_rate": 0.0005557289085227884, "loss": 3.9432, "step": 6950 }, { "epoch": 0.7534172855451512, "grad_norm": 0.6775709390640259, "learning_rate": 0.000555405667492727, "loss": 3.9366, "step": 7000 }, { "epoch": 0.7534172855451512, "eval_accuracy": 0.33399778195994817, "eval_loss": 3.8597476482391357, "eval_runtime": 181.4913, "eval_samples_per_second": 99.239, "eval_steps_per_second": 6.204, "step": 7000 }, { "epoch": 0.7587988375847594, "grad_norm": 0.600846529006958, "learning_rate": 0.0005550824264626657, "loss": 3.9292, "step": 7050 }, { "epoch": 0.7641803896243676, "grad_norm": 0.7947003245353699, "learning_rate": 0.0005547591854326042, "loss": 3.9306, "step": 7100 }, { "epoch": 0.7695619416639758, "grad_norm": 0.6536075472831726, "learning_rate": 0.0005544359444025428, "loss": 3.923, "step": 7150 }, { "epoch": 0.7749434937035841, "grad_norm": 0.5992883443832397, "learning_rate": 0.0005541127033724813, "loss": 3.927, "step": 7200 }, { "epoch": 0.7803250457431924, "grad_norm": 0.5310186147689819, "learning_rate": 0.0005537894623424199, "loss": 3.9144, "step": 7250 }, { "epoch": 0.7857065977828006, "grad_norm": 0.5524613261222839, "learning_rate": 0.0005534662213123586, "loss": 3.8913, "step": 7300 }, { "epoch": 0.7910881498224088, "grad_norm": 0.5931475162506104, "learning_rate": 0.0005531429802822971, "loss": 3.9173, "step": 7350 }, { "epoch": 0.796469701862017, "grad_norm": 0.6450363993644714, "learning_rate": 0.0005528197392522357, "loss": 3.9086, "step": 7400 }, { "epoch": 0.8018512539016253, "grad_norm": 0.6539124846458435, "learning_rate": 0.0005524964982221743, "loss": 3.9094, "step": 7450 }, { "epoch": 0.8072328059412335, "grad_norm": 0.7083090543746948, "learning_rate": 0.0005521732571921129, "loss": 3.9112, "step": 7500 }, { "epoch": 0.8126143579808417, "grad_norm": 0.6583232879638672, "learning_rate": 0.0005518500161620514, "loss": 3.8951, "step": 7550 }, { "epoch": 0.8179959100204499, "grad_norm": 0.5080458521842957, "learning_rate": 0.00055152677513199, "loss": 3.8707, "step": 7600 }, { "epoch": 0.8233774620600581, "grad_norm": 0.6174957156181335, "learning_rate": 0.0005512035341019286, "loss": 3.9096, "step": 7650 }, { "epoch": 0.8287590140996663, "grad_norm": 0.5554836392402649, "learning_rate": 0.0005508802930718672, "loss": 3.8794, "step": 7700 }, { "epoch": 0.8341405661392746, "grad_norm": 0.5927399396896362, "learning_rate": 0.0005505570520418058, "loss": 3.8899, "step": 7750 }, { "epoch": 0.8395221181788828, "grad_norm": 0.6369514465332031, "learning_rate": 0.0005502338110117443, "loss": 3.894, "step": 7800 }, { "epoch": 0.844903670218491, "grad_norm": 0.5553373098373413, "learning_rate": 0.000549910569981683, "loss": 3.871, "step": 7850 }, { "epoch": 0.8502852222580992, "grad_norm": 0.6265012621879578, "learning_rate": 0.0005495873289516215, "loss": 3.8954, "step": 7900 }, { "epoch": 0.8556667742977074, "grad_norm": 0.5350773930549622, "learning_rate": 0.0005492640879215602, "loss": 3.8793, "step": 7950 }, { "epoch": 0.8610483263373157, "grad_norm": 0.566001296043396, "learning_rate": 0.0005489408468914987, "loss": 3.8759, "step": 8000 }, { "epoch": 0.8610483263373157, "eval_accuracy": 0.3379087426568304, "eval_loss": 3.8164145946502686, "eval_runtime": 179.7462, "eval_samples_per_second": 100.202, "eval_steps_per_second": 6.264, "step": 8000 }, { "epoch": 0.8664298783769239, "grad_norm": 0.5853043794631958, "learning_rate": 0.0005486176058614372, "loss": 3.883, "step": 8050 }, { "epoch": 0.8718114304165321, "grad_norm": 0.5731094479560852, "learning_rate": 0.0005482943648313759, "loss": 3.9002, "step": 8100 }, { "epoch": 0.8771929824561403, "grad_norm": 0.7093199491500854, "learning_rate": 0.0005479711238013145, "loss": 3.8633, "step": 8150 }, { "epoch": 0.8825745344957485, "grad_norm": 0.5111085772514343, "learning_rate": 0.0005476478827712531, "loss": 3.8737, "step": 8200 }, { "epoch": 0.8879560865353568, "grad_norm": 0.6130983233451843, "learning_rate": 0.0005473246417411916, "loss": 3.8601, "step": 8250 }, { "epoch": 0.8933376385749651, "grad_norm": 0.5903039574623108, "learning_rate": 0.0005470014007111302, "loss": 3.8517, "step": 8300 }, { "epoch": 0.8987191906145733, "grad_norm": 0.5702494382858276, "learning_rate": 0.0005466781596810688, "loss": 3.8744, "step": 8350 }, { "epoch": 0.9041007426541815, "grad_norm": 0.5253993272781372, "learning_rate": 0.0005463549186510073, "loss": 3.861, "step": 8400 }, { "epoch": 0.9094822946937897, "grad_norm": 0.5987130999565125, "learning_rate": 0.000546031677620946, "loss": 3.861, "step": 8450 }, { "epoch": 0.9148638467333979, "grad_norm": 0.6229389309883118, "learning_rate": 0.0005457084365908845, "loss": 3.8491, "step": 8500 }, { "epoch": 0.9202453987730062, "grad_norm": 0.5428225994110107, "learning_rate": 0.0005453851955608232, "loss": 3.8548, "step": 8550 }, { "epoch": 0.9256269508126144, "grad_norm": 0.6359056830406189, "learning_rate": 0.0005450619545307617, "loss": 3.8263, "step": 8600 }, { "epoch": 0.9310085028522226, "grad_norm": 0.5836291909217834, "learning_rate": 0.0005447387135007003, "loss": 3.8549, "step": 8650 }, { "epoch": 0.9363900548918308, "grad_norm": 0.5731891393661499, "learning_rate": 0.0005444154724706389, "loss": 3.8527, "step": 8700 }, { "epoch": 0.941771606931439, "grad_norm": 0.5091255903244019, "learning_rate": 0.0005440922314405775, "loss": 3.8348, "step": 8750 }, { "epoch": 0.9471531589710472, "grad_norm": 0.6273380517959595, "learning_rate": 0.0005437689904105161, "loss": 3.8499, "step": 8800 }, { "epoch": 0.9525347110106555, "grad_norm": 0.5405938625335693, "learning_rate": 0.0005434457493804546, "loss": 3.8546, "step": 8850 }, { "epoch": 0.9579162630502637, "grad_norm": 0.628632128238678, "learning_rate": 0.0005431225083503932, "loss": 3.8536, "step": 8900 }, { "epoch": 0.9632978150898719, "grad_norm": 0.5118025541305542, "learning_rate": 0.0005427992673203318, "loss": 3.8352, "step": 8950 }, { "epoch": 0.9686793671294801, "grad_norm": 0.6539580821990967, "learning_rate": 0.0005424760262902704, "loss": 3.868, "step": 9000 }, { "epoch": 0.9686793671294801, "eval_accuracy": 0.341995177767533, "eval_loss": 3.7746593952178955, "eval_runtime": 178.9687, "eval_samples_per_second": 100.638, "eval_steps_per_second": 6.292, "step": 9000 }, { "epoch": 0.9740609191690883, "grad_norm": 0.5451881289482117, "learning_rate": 0.000542152785260209, "loss": 3.8354, "step": 9050 }, { "epoch": 0.9794424712086965, "grad_norm": 0.6332874298095703, "learning_rate": 0.0005418295442301476, "loss": 3.8133, "step": 9100 }, { "epoch": 0.9848240232483048, "grad_norm": 0.5308850407600403, "learning_rate": 0.0005415063032000861, "loss": 3.8335, "step": 9150 }, { "epoch": 0.9902055752879131, "grad_norm": 0.5831335783004761, "learning_rate": 0.0005411830621700248, "loss": 3.85, "step": 9200 }, { "epoch": 0.9955871273275213, "grad_norm": 0.5904872417449951, "learning_rate": 0.0005408598211399633, "loss": 3.8349, "step": 9250 }, { "epoch": 1.0009686793671295, "grad_norm": 0.5540934205055237, "learning_rate": 0.0005405365801099019, "loss": 3.8278, "step": 9300 }, { "epoch": 1.0063502314067376, "grad_norm": 0.56868976354599, "learning_rate": 0.0005402133390798405, "loss": 3.7578, "step": 9350 }, { "epoch": 1.011731783446346, "grad_norm": 0.6353201270103455, "learning_rate": 0.000539890098049779, "loss": 3.758, "step": 9400 }, { "epoch": 1.017113335485954, "grad_norm": 0.6136249899864197, "learning_rate": 0.0005395668570197177, "loss": 3.7775, "step": 9450 }, { "epoch": 1.0224948875255624, "grad_norm": 0.6076843738555908, "learning_rate": 0.0005392436159896562, "loss": 3.7493, "step": 9500 }, { "epoch": 1.0278764395651705, "grad_norm": 0.5511475801467896, "learning_rate": 0.0005389203749595948, "loss": 3.7807, "step": 9550 }, { "epoch": 1.0332579916047788, "grad_norm": 0.6536751389503479, "learning_rate": 0.0005385971339295334, "loss": 3.7464, "step": 9600 }, { "epoch": 1.0386395436443872, "grad_norm": 0.5289199948310852, "learning_rate": 0.000538273892899472, "loss": 3.7723, "step": 9650 }, { "epoch": 1.0440210956839953, "grad_norm": 0.5578382611274719, "learning_rate": 0.0005379506518694106, "loss": 3.7609, "step": 9700 }, { "epoch": 1.0494026477236036, "grad_norm": 0.6024869084358215, "learning_rate": 0.0005376274108393491, "loss": 3.7518, "step": 9750 }, { "epoch": 1.0547841997632117, "grad_norm": 0.5720689296722412, "learning_rate": 0.0005373041698092877, "loss": 3.7627, "step": 9800 }, { "epoch": 1.06016575180282, "grad_norm": 0.6048834323883057, "learning_rate": 0.0005369809287792263, "loss": 3.7701, "step": 9850 }, { "epoch": 1.0655473038424281, "grad_norm": 0.6051644086837769, "learning_rate": 0.000536657687749165, "loss": 3.7416, "step": 9900 }, { "epoch": 1.0709288558820365, "grad_norm": 0.49157214164733887, "learning_rate": 0.0005363344467191035, "loss": 3.756, "step": 9950 }, { "epoch": 1.0763104079216446, "grad_norm": 0.5985872149467468, "learning_rate": 0.000536011205689042, "loss": 3.7487, "step": 10000 }, { "epoch": 1.0763104079216446, "eval_accuracy": 0.34483232197414476, "eval_loss": 3.7465176582336426, "eval_runtime": 178.9663, "eval_samples_per_second": 100.639, "eval_steps_per_second": 6.292, "step": 10000 }, { "epoch": 1.081691959961253, "grad_norm": 0.6235678791999817, "learning_rate": 0.0005356879646589807, "loss": 3.7469, "step": 10050 }, { "epoch": 1.087073512000861, "grad_norm": 0.6059385538101196, "learning_rate": 0.0005353647236289192, "loss": 3.7727, "step": 10100 }, { "epoch": 1.0924550640404693, "grad_norm": 0.5322882533073425, "learning_rate": 0.0005350414825988579, "loss": 3.752, "step": 10150 }, { "epoch": 1.0978366160800774, "grad_norm": 0.5794306397438049, "learning_rate": 0.0005347182415687964, "loss": 3.7532, "step": 10200 }, { "epoch": 1.1032181681196858, "grad_norm": 0.5910801291465759, "learning_rate": 0.000534395000538735, "loss": 3.7556, "step": 10250 }, { "epoch": 1.1085997201592939, "grad_norm": 0.6253539323806763, "learning_rate": 0.0005340717595086736, "loss": 3.7568, "step": 10300 }, { "epoch": 1.1139812721989022, "grad_norm": 0.5157849788665771, "learning_rate": 0.0005337485184786122, "loss": 3.7716, "step": 10350 }, { "epoch": 1.1193628242385103, "grad_norm": 0.6016262173652649, "learning_rate": 0.0005334252774485507, "loss": 3.7357, "step": 10400 }, { "epoch": 1.1247443762781186, "grad_norm": 0.579675018787384, "learning_rate": 0.0005331020364184894, "loss": 3.7609, "step": 10450 }, { "epoch": 1.1301259283177267, "grad_norm": 0.560275673866272, "learning_rate": 0.0005327787953884279, "loss": 3.7441, "step": 10500 }, { "epoch": 1.135507480357335, "grad_norm": 0.6928154230117798, "learning_rate": 0.0005324555543583665, "loss": 3.7657, "step": 10550 }, { "epoch": 1.1408890323969434, "grad_norm": 0.658794105052948, "learning_rate": 0.0005321323133283051, "loss": 3.7411, "step": 10600 }, { "epoch": 1.1462705844365515, "grad_norm": 0.5493429899215698, "learning_rate": 0.0005318090722982436, "loss": 3.7338, "step": 10650 }, { "epoch": 1.1516521364761596, "grad_norm": 0.550923228263855, "learning_rate": 0.0005314858312681823, "loss": 3.7536, "step": 10700 }, { "epoch": 1.157033688515768, "grad_norm": 0.5759270191192627, "learning_rate": 0.0005311625902381209, "loss": 3.7412, "step": 10750 }, { "epoch": 1.1624152405553763, "grad_norm": 0.59690922498703, "learning_rate": 0.0005308393492080595, "loss": 3.7677, "step": 10800 }, { "epoch": 1.1677967925949844, "grad_norm": 0.579595685005188, "learning_rate": 0.000530516108177998, "loss": 3.7422, "step": 10850 }, { "epoch": 1.1731783446345927, "grad_norm": 0.5605522990226746, "learning_rate": 0.0005301928671479365, "loss": 3.7655, "step": 10900 }, { "epoch": 1.1785598966742008, "grad_norm": 0.49691706895828247, "learning_rate": 0.0005298696261178752, "loss": 3.7477, "step": 10950 }, { "epoch": 1.1839414487138091, "grad_norm": 0.5915963649749756, "learning_rate": 0.0005295463850878138, "loss": 3.7571, "step": 11000 }, { "epoch": 1.1839414487138091, "eval_accuracy": 0.3470917587762469, "eval_loss": 3.7183022499084473, "eval_runtime": 178.9408, "eval_samples_per_second": 100.653, "eval_steps_per_second": 6.293, "step": 11000 }, { "epoch": 1.1893230007534172, "grad_norm": 0.6167376637458801, "learning_rate": 0.0005292231440577524, "loss": 3.7491, "step": 11050 }, { "epoch": 1.1947045527930256, "grad_norm": 0.5490695238113403, "learning_rate": 0.0005288999030276909, "loss": 3.7556, "step": 11100 }, { "epoch": 1.2000861048326337, "grad_norm": 0.5550501942634583, "learning_rate": 0.0005285831268182307, "loss": 3.7355, "step": 11150 }, { "epoch": 1.205467656872242, "grad_norm": 0.5252893567085266, "learning_rate": 0.0005282598857881694, "loss": 3.75, "step": 11200 }, { "epoch": 1.21084920891185, "grad_norm": 0.6135852336883545, "learning_rate": 0.0005279366447581079, "loss": 3.7647, "step": 11250 }, { "epoch": 1.2162307609514584, "grad_norm": 0.5655341744422913, "learning_rate": 0.0005276134037280465, "loss": 3.7508, "step": 11300 }, { "epoch": 1.2216123129910665, "grad_norm": 0.640620768070221, "learning_rate": 0.0005272901626979851, "loss": 3.7323, "step": 11350 }, { "epoch": 1.2269938650306749, "grad_norm": 0.5826889276504517, "learning_rate": 0.0005269669216679236, "loss": 3.7508, "step": 11400 }, { "epoch": 1.232375417070283, "grad_norm": 0.6063902974128723, "learning_rate": 0.0005266436806378623, "loss": 3.7142, "step": 11450 }, { "epoch": 1.2377569691098913, "grad_norm": 0.5887851119041443, "learning_rate": 0.0005263204396078008, "loss": 3.7454, "step": 11500 }, { "epoch": 1.2431385211494996, "grad_norm": 0.5676994323730469, "learning_rate": 0.0005259971985777394, "loss": 3.7398, "step": 11550 }, { "epoch": 1.2485200731891077, "grad_norm": 0.5710114240646362, "learning_rate": 0.000525673957547678, "loss": 3.7415, "step": 11600 }, { "epoch": 1.2539016252287158, "grad_norm": 0.5706628561019897, "learning_rate": 0.0005253507165176167, "loss": 3.7503, "step": 11650 }, { "epoch": 1.2592831772683242, "grad_norm": 0.5831769108772278, "learning_rate": 0.0005250274754875552, "loss": 3.7363, "step": 11700 }, { "epoch": 1.2646647293079325, "grad_norm": 0.547321081161499, "learning_rate": 0.0005247042344574938, "loss": 3.7223, "step": 11750 }, { "epoch": 1.2700462813475406, "grad_norm": 0.5827503204345703, "learning_rate": 0.0005243809934274323, "loss": 3.74, "step": 11800 }, { "epoch": 1.275427833387149, "grad_norm": 0.5403709411621094, "learning_rate": 0.0005240577523973709, "loss": 3.736, "step": 11850 }, { "epoch": 1.280809385426757, "grad_norm": 0.5563663840293884, "learning_rate": 0.0005237345113673095, "loss": 3.7202, "step": 11900 }, { "epoch": 1.2861909374663654, "grad_norm": 0.5416322946548462, "learning_rate": 0.0005234112703372481, "loss": 3.7263, "step": 11950 }, { "epoch": 1.2915724895059735, "grad_norm": 0.5695447325706482, "learning_rate": 0.0005230880293071867, "loss": 3.7338, "step": 12000 }, { "epoch": 1.2915724895059735, "eval_accuracy": 0.3501622893858841, "eval_loss": 3.6940038204193115, "eval_runtime": 178.9986, "eval_samples_per_second": 100.621, "eval_steps_per_second": 6.291, "step": 12000 }, { "epoch": 1.2969540415455818, "grad_norm": 0.6381962299346924, "learning_rate": 0.0005227647882771253, "loss": 3.731, "step": 12050 }, { "epoch": 1.30233559358519, "grad_norm": 0.5522515177726746, "learning_rate": 0.0005224415472470639, "loss": 3.7184, "step": 12100 }, { "epoch": 1.3077171456247982, "grad_norm": 0.6615414023399353, "learning_rate": 0.0005221183062170024, "loss": 3.7341, "step": 12150 }, { "epoch": 1.3130986976644063, "grad_norm": 0.6087273955345154, "learning_rate": 0.0005217950651869409, "loss": 3.7104, "step": 12200 }, { "epoch": 1.3184802497040147, "grad_norm": 0.5587693452835083, "learning_rate": 0.0005214718241568796, "loss": 3.7384, "step": 12250 }, { "epoch": 1.3238618017436228, "grad_norm": 0.5303893089294434, "learning_rate": 0.0005211485831268182, "loss": 3.739, "step": 12300 }, { "epoch": 1.329243353783231, "grad_norm": 0.5861002802848816, "learning_rate": 0.0005208253420967568, "loss": 3.7087, "step": 12350 }, { "epoch": 1.3346249058228392, "grad_norm": 0.5741501450538635, "learning_rate": 0.0005205021010666953, "loss": 3.7136, "step": 12400 }, { "epoch": 1.3400064578624475, "grad_norm": 0.5382627844810486, "learning_rate": 0.0005201788600366339, "loss": 3.7229, "step": 12450 }, { "epoch": 1.3453880099020559, "grad_norm": 0.5657112002372742, "learning_rate": 0.0005198556190065725, "loss": 3.7083, "step": 12500 }, { "epoch": 1.350769561941664, "grad_norm": 0.6373938322067261, "learning_rate": 0.0005195323779765112, "loss": 3.7256, "step": 12550 }, { "epoch": 1.356151113981272, "grad_norm": 0.5914209485054016, "learning_rate": 0.0005192091369464497, "loss": 3.7075, "step": 12600 }, { "epoch": 1.3615326660208804, "grad_norm": 0.5015832781791687, "learning_rate": 0.0005188858959163882, "loss": 3.7254, "step": 12650 }, { "epoch": 1.3669142180604887, "grad_norm": 0.5955027937889099, "learning_rate": 0.0005185626548863269, "loss": 3.7225, "step": 12700 }, { "epoch": 1.3722957701000968, "grad_norm": 0.5259278416633606, "learning_rate": 0.0005182394138562654, "loss": 3.7237, "step": 12750 }, { "epoch": 1.3776773221397052, "grad_norm": 0.5889491438865662, "learning_rate": 0.0005179161728262041, "loss": 3.7347, "step": 12800 }, { "epoch": 1.3830588741793133, "grad_norm": 0.5814417600631714, "learning_rate": 0.0005175929317961426, "loss": 3.7086, "step": 12850 }, { "epoch": 1.3884404262189216, "grad_norm": 0.5897478461265564, "learning_rate": 0.0005172696907660812, "loss": 3.7287, "step": 12900 }, { "epoch": 1.3938219782585297, "grad_norm": 0.5687185525894165, "learning_rate": 0.0005169464497360198, "loss": 3.7013, "step": 12950 }, { "epoch": 1.399203530298138, "grad_norm": 0.5773078203201294, "learning_rate": 0.0005166232087059583, "loss": 3.7185, "step": 13000 }, { "epoch": 1.399203530298138, "eval_accuracy": 0.35234914605892614, "eval_loss": 3.6752188205718994, "eval_runtime": 179.3536, "eval_samples_per_second": 100.422, "eval_steps_per_second": 6.278, "step": 13000 }, { "epoch": 1.4045850823377461, "grad_norm": 0.5312627553939819, "learning_rate": 0.0005162999676758969, "loss": 3.7118, "step": 13050 }, { "epoch": 1.4099666343773545, "grad_norm": 0.5687617063522339, "learning_rate": 0.0005159767266458355, "loss": 3.7301, "step": 13100 }, { "epoch": 1.4153481864169626, "grad_norm": 0.7186458110809326, "learning_rate": 0.0005156599504363753, "loss": 3.7048, "step": 13150 }, { "epoch": 1.420729738456571, "grad_norm": 0.5419722199440002, "learning_rate": 0.000515336709406314, "loss": 3.7085, "step": 13200 }, { "epoch": 1.426111290496179, "grad_norm": 0.5562013983726501, "learning_rate": 0.0005150134683762525, "loss": 3.7139, "step": 13250 }, { "epoch": 1.4314928425357873, "grad_norm": 0.637855589389801, "learning_rate": 0.0005146902273461911, "loss": 3.7102, "step": 13300 }, { "epoch": 1.4368743945753955, "grad_norm": 0.5790863037109375, "learning_rate": 0.0005143669863161297, "loss": 3.6912, "step": 13350 }, { "epoch": 1.4422559466150038, "grad_norm": 0.5652977228164673, "learning_rate": 0.0005140437452860683, "loss": 3.7058, "step": 13400 }, { "epoch": 1.447637498654612, "grad_norm": 0.5649129748344421, "learning_rate": 0.0005137205042560069, "loss": 3.7058, "step": 13450 }, { "epoch": 1.4530190506942202, "grad_norm": 0.5907796025276184, "learning_rate": 0.0005133972632259455, "loss": 3.6956, "step": 13500 }, { "epoch": 1.4584006027338283, "grad_norm": 0.5557147860527039, "learning_rate": 0.000513074022195884, "loss": 3.6873, "step": 13550 }, { "epoch": 1.4637821547734367, "grad_norm": 0.5033633708953857, "learning_rate": 0.0005127507811658226, "loss": 3.6974, "step": 13600 }, { "epoch": 1.469163706813045, "grad_norm": 0.594517707824707, "learning_rate": 0.0005124275401357612, "loss": 3.7054, "step": 13650 }, { "epoch": 1.474545258852653, "grad_norm": 0.557863712310791, "learning_rate": 0.0005121042991056997, "loss": 3.7019, "step": 13700 }, { "epoch": 1.4799268108922612, "grad_norm": 0.5355758666992188, "learning_rate": 0.0005117810580756384, "loss": 3.6927, "step": 13750 }, { "epoch": 1.4853083629318695, "grad_norm": 0.5377559065818787, "learning_rate": 0.0005114578170455769, "loss": 3.6882, "step": 13800 }, { "epoch": 1.4906899149714778, "grad_norm": 0.6003853678703308, "learning_rate": 0.0005111345760155156, "loss": 3.7148, "step": 13850 }, { "epoch": 1.496071467011086, "grad_norm": 0.5111503005027771, "learning_rate": 0.0005108113349854541, "loss": 3.695, "step": 13900 }, { "epoch": 1.501453019050694, "grad_norm": 0.5682783722877502, "learning_rate": 0.0005104880939553926, "loss": 3.6882, "step": 13950 }, { "epoch": 1.5068345710903024, "grad_norm": 0.5791579484939575, "learning_rate": 0.0005101648529253313, "loss": 3.6864, "step": 14000 }, { "epoch": 1.5068345710903024, "eval_accuracy": 0.3540987835114027, "eval_loss": 3.6519079208374023, "eval_runtime": 179.0958, "eval_samples_per_second": 100.566, "eval_steps_per_second": 6.287, "step": 14000 }, { "epoch": 1.5122161231299107, "grad_norm": 0.5820406675338745, "learning_rate": 0.0005098416118952699, "loss": 3.691, "step": 14050 }, { "epoch": 1.5175976751695188, "grad_norm": 0.5601463317871094, "learning_rate": 0.0005095183708652085, "loss": 3.6892, "step": 14100 }, { "epoch": 1.5229792272091272, "grad_norm": 0.5960391759872437, "learning_rate": 0.000509195129835147, "loss": 3.6978, "step": 14150 }, { "epoch": 1.5283607792487355, "grad_norm": 0.5785766839981079, "learning_rate": 0.0005088718888050856, "loss": 3.6912, "step": 14200 }, { "epoch": 1.5337423312883436, "grad_norm": 0.5855419635772705, "learning_rate": 0.0005085486477750242, "loss": 3.6761, "step": 14250 }, { "epoch": 1.5391238833279517, "grad_norm": 0.5645076036453247, "learning_rate": 0.0005082254067449629, "loss": 3.6715, "step": 14300 }, { "epoch": 1.54450543536756, "grad_norm": 0.4859999716281891, "learning_rate": 0.0005079021657149014, "loss": 3.6828, "step": 14350 }, { "epoch": 1.5498869874071683, "grad_norm": 0.5878191590309143, "learning_rate": 0.0005075789246848399, "loss": 3.6857, "step": 14400 }, { "epoch": 1.5552685394467765, "grad_norm": 0.5775585770606995, "learning_rate": 0.0005072556836547785, "loss": 3.6733, "step": 14450 }, { "epoch": 1.5606500914863846, "grad_norm": 0.6529456377029419, "learning_rate": 0.0005069324426247171, "loss": 3.6879, "step": 14500 }, { "epoch": 1.566031643525993, "grad_norm": 0.5500629544258118, "learning_rate": 0.0005066092015946557, "loss": 3.6819, "step": 14550 }, { "epoch": 1.5714131955656012, "grad_norm": 0.5275052785873413, "learning_rate": 0.0005062859605645943, "loss": 3.694, "step": 14600 }, { "epoch": 1.5767947476052093, "grad_norm": 0.6165966987609863, "learning_rate": 0.0005059627195345329, "loss": 3.6683, "step": 14650 }, { "epoch": 1.5821762996448174, "grad_norm": 0.5942190289497375, "learning_rate": 0.0005056394785044715, "loss": 3.682, "step": 14700 }, { "epoch": 1.5875578516844258, "grad_norm": 0.5476101040840149, "learning_rate": 0.00050531623747441, "loss": 3.7049, "step": 14750 }, { "epoch": 1.592939403724034, "grad_norm": 0.5422455072402954, "learning_rate": 0.0005049929964443486, "loss": 3.6912, "step": 14800 }, { "epoch": 1.5983209557636422, "grad_norm": 0.5225234031677246, "learning_rate": 0.0005046697554142871, "loss": 3.6757, "step": 14850 }, { "epoch": 1.6037025078032503, "grad_norm": 0.6003244519233704, "learning_rate": 0.0005043465143842258, "loss": 3.6728, "step": 14900 }, { "epoch": 1.6090840598428586, "grad_norm": 0.5976955890655518, "learning_rate": 0.0005040232733541644, "loss": 3.6692, "step": 14950 }, { "epoch": 1.614465611882467, "grad_norm": 0.6137278079986572, "learning_rate": 0.000503700032324103, "loss": 3.6783, "step": 15000 }, { "epoch": 1.614465611882467, "eval_accuracy": 0.35586439293838806, "eval_loss": 3.632730722427368, "eval_runtime": 178.9737, "eval_samples_per_second": 100.635, "eval_steps_per_second": 6.291, "step": 15000 }, { "epoch": 1.619847163922075, "grad_norm": 0.5619004368782043, "learning_rate": 0.0005033767912940415, "loss": 3.6571, "step": 15050 }, { "epoch": 1.6252287159616834, "grad_norm": 0.5776040554046631, "learning_rate": 0.0005030535502639802, "loss": 3.688, "step": 15100 }, { "epoch": 1.6306102680012917, "grad_norm": 0.5769944190979004, "learning_rate": 0.0005027303092339187, "loss": 3.6596, "step": 15150 }, { "epoch": 1.6359918200408998, "grad_norm": 0.547518253326416, "learning_rate": 0.0005024135330244585, "loss": 3.6813, "step": 15200 }, { "epoch": 1.641373372080508, "grad_norm": 0.5963606834411621, "learning_rate": 0.0005020902919943972, "loss": 3.6733, "step": 15250 }, { "epoch": 1.6467549241201163, "grad_norm": 0.5078262090682983, "learning_rate": 0.0005017670509643357, "loss": 3.6892, "step": 15300 }, { "epoch": 1.6521364761597246, "grad_norm": 0.5686724185943604, "learning_rate": 0.0005014438099342743, "loss": 3.6721, "step": 15350 }, { "epoch": 1.6575180281993327, "grad_norm": 0.5834243297576904, "learning_rate": 0.0005011205689042129, "loss": 3.6635, "step": 15400 }, { "epoch": 1.6628995802389408, "grad_norm": 0.5421767830848694, "learning_rate": 0.0005007973278741514, "loss": 3.6618, "step": 15450 }, { "epoch": 1.6682811322785491, "grad_norm": 0.546318769454956, "learning_rate": 0.00050047408684409, "loss": 3.6866, "step": 15500 }, { "epoch": 1.6736626843181575, "grad_norm": 0.5625678300857544, "learning_rate": 0.0005001508458140286, "loss": 3.67, "step": 15550 }, { "epoch": 1.6790442363577656, "grad_norm": 0.5252569913864136, "learning_rate": 0.0004998276047839673, "loss": 3.6774, "step": 15600 }, { "epoch": 1.6844257883973737, "grad_norm": 0.5419048070907593, "learning_rate": 0.0004995043637539058, "loss": 3.6742, "step": 15650 }, { "epoch": 1.689807340436982, "grad_norm": 0.5557122826576233, "learning_rate": 0.0004991811227238443, "loss": 3.6612, "step": 15700 }, { "epoch": 1.6951888924765903, "grad_norm": 0.588685929775238, "learning_rate": 0.0004988578816937829, "loss": 3.6471, "step": 15750 }, { "epoch": 1.7005704445161984, "grad_norm": 0.5404671430587769, "learning_rate": 0.0004985346406637215, "loss": 3.6745, "step": 15800 }, { "epoch": 1.7059519965558065, "grad_norm": 0.5492653846740723, "learning_rate": 0.0004982113996336602, "loss": 3.6696, "step": 15850 }, { "epoch": 1.7113335485954149, "grad_norm": 0.5558483600616455, "learning_rate": 0.0004978881586035987, "loss": 3.6613, "step": 15900 }, { "epoch": 1.7167151006350232, "grad_norm": 0.5276104211807251, "learning_rate": 0.0004975649175735373, "loss": 3.6524, "step": 15950 }, { "epoch": 1.7220966526746313, "grad_norm": 0.6016148328781128, "learning_rate": 0.0004972416765434759, "loss": 3.6678, "step": 16000 }, { "epoch": 1.7220966526746313, "eval_accuracy": 0.35738705450822017, "eval_loss": 3.619539976119995, "eval_runtime": 179.3517, "eval_samples_per_second": 100.423, "eval_steps_per_second": 6.278, "step": 16000 }, { "epoch": 1.7274782047142396, "grad_norm": 0.5481765270233154, "learning_rate": 0.0004969184355134145, "loss": 3.6751, "step": 16050 }, { "epoch": 1.732859756753848, "grad_norm": 0.5532946586608887, "learning_rate": 0.0004965951944833531, "loss": 3.6738, "step": 16100 }, { "epoch": 1.738241308793456, "grad_norm": 0.5443522930145264, "learning_rate": 0.0004962719534532916, "loss": 3.6568, "step": 16150 }, { "epoch": 1.7436228608330642, "grad_norm": 0.6046338081359863, "learning_rate": 0.0004959487124232302, "loss": 3.6623, "step": 16200 }, { "epoch": 1.7490044128726725, "grad_norm": 0.5878551006317139, "learning_rate": 0.0004956254713931688, "loss": 3.6728, "step": 16250 }, { "epoch": 1.7543859649122808, "grad_norm": 0.5456554293632507, "learning_rate": 0.0004953022303631074, "loss": 3.6478, "step": 16300 }, { "epoch": 1.759767516951889, "grad_norm": 0.5853443741798401, "learning_rate": 0.0004949789893330459, "loss": 3.6518, "step": 16350 }, { "epoch": 1.765149068991497, "grad_norm": 0.5492565631866455, "learning_rate": 0.0004946557483029846, "loss": 3.6614, "step": 16400 }, { "epoch": 1.7705306210311054, "grad_norm": 0.5802302360534668, "learning_rate": 0.0004943325072729231, "loss": 3.6504, "step": 16450 }, { "epoch": 1.7759121730707137, "grad_norm": 0.5635486245155334, "learning_rate": 0.0004940092662428617, "loss": 3.6599, "step": 16500 }, { "epoch": 1.7812937251103218, "grad_norm": 0.5809216499328613, "learning_rate": 0.0004936860252128003, "loss": 3.647, "step": 16550 }, { "epoch": 1.78667527714993, "grad_norm": 0.6346895694732666, "learning_rate": 0.0004933627841827388, "loss": 3.6313, "step": 16600 }, { "epoch": 1.7920568291895382, "grad_norm": 0.6176491379737854, "learning_rate": 0.0004930395431526775, "loss": 3.6595, "step": 16650 }, { "epoch": 1.7974383812291466, "grad_norm": 0.5374283194541931, "learning_rate": 0.0004927163021226161, "loss": 3.674, "step": 16700 }, { "epoch": 1.8028199332687547, "grad_norm": 0.5539432764053345, "learning_rate": 0.0004923930610925547, "loss": 3.6739, "step": 16750 }, { "epoch": 1.8082014853083628, "grad_norm": 0.5157961845397949, "learning_rate": 0.0004920698200624932, "loss": 3.6467, "step": 16800 }, { "epoch": 1.813583037347971, "grad_norm": 0.5923105478286743, "learning_rate": 0.0004917465790324317, "loss": 3.6798, "step": 16850 }, { "epoch": 1.8189645893875794, "grad_norm": 0.5784060955047607, "learning_rate": 0.0004914233380023704, "loss": 3.6693, "step": 16900 }, { "epoch": 1.8243461414271875, "grad_norm": 0.5685577392578125, "learning_rate": 0.0004911000969723089, "loss": 3.6522, "step": 16950 }, { "epoch": 1.8297276934667959, "grad_norm": 0.5154936909675598, "learning_rate": 0.0004907768559422476, "loss": 3.6386, "step": 17000 }, { "epoch": 1.8297276934667959, "eval_accuracy": 0.3590139141974664, "eval_loss": 3.6005563735961914, "eval_runtime": 182.6628, "eval_samples_per_second": 98.602, "eval_steps_per_second": 6.164, "step": 17000 }, { "epoch": 1.8351092455064042, "grad_norm": 0.5483383536338806, "learning_rate": 0.0004904536149121861, "loss": 3.654, "step": 17050 }, { "epoch": 1.8404907975460123, "grad_norm": 0.563372790813446, "learning_rate": 0.0004901303738821248, "loss": 3.6598, "step": 17100 }, { "epoch": 1.8458723495856204, "grad_norm": 0.5781205892562866, "learning_rate": 0.0004898071328520633, "loss": 3.6501, "step": 17150 }, { "epoch": 1.8512539016252287, "grad_norm": 0.5907060503959656, "learning_rate": 0.0004894838918220019, "loss": 3.6404, "step": 17200 }, { "epoch": 1.856635453664837, "grad_norm": 0.5685167908668518, "learning_rate": 0.0004891606507919405, "loss": 3.6296, "step": 17250 }, { "epoch": 1.8620170057044452, "grad_norm": 0.6219983696937561, "learning_rate": 0.000488837409761879, "loss": 3.6522, "step": 17300 }, { "epoch": 1.8673985577440533, "grad_norm": 0.6225223541259766, "learning_rate": 0.0004885141687318177, "loss": 3.6491, "step": 17350 }, { "epoch": 1.8727801097836616, "grad_norm": 0.5667858123779297, "learning_rate": 0.00048819092770175623, "loss": 3.6426, "step": 17400 }, { "epoch": 1.87816166182327, "grad_norm": 0.5822393298149109, "learning_rate": 0.0004878676866716948, "loss": 3.6589, "step": 17450 }, { "epoch": 1.883543213862878, "grad_norm": 0.5308080315589905, "learning_rate": 0.00048754444564163337, "loss": 3.6459, "step": 17500 }, { "epoch": 1.8889247659024861, "grad_norm": 0.5627791881561279, "learning_rate": 0.000487221204611572, "loss": 3.6394, "step": 17550 }, { "epoch": 1.8943063179420945, "grad_norm": 0.528186023235321, "learning_rate": 0.00048689796358151056, "loss": 3.6365, "step": 17600 }, { "epoch": 1.8996878699817028, "grad_norm": 0.5638163685798645, "learning_rate": 0.00048657472255144915, "loss": 3.6258, "step": 17650 }, { "epoch": 1.905069422021311, "grad_norm": 0.5592211484909058, "learning_rate": 0.00048625794634198896, "loss": 3.6379, "step": 17700 }, { "epoch": 1.910450974060919, "grad_norm": 0.5550535321235657, "learning_rate": 0.00048593470531192756, "loss": 3.6288, "step": 17750 }, { "epoch": 1.9158325261005273, "grad_norm": 0.5201454162597656, "learning_rate": 0.00048561146428186615, "loss": 3.6343, "step": 17800 }, { "epoch": 1.9212140781401357, "grad_norm": 0.5514869689941406, "learning_rate": 0.0004852882232518047, "loss": 3.6373, "step": 17850 }, { "epoch": 1.9265956301797438, "grad_norm": 0.5841929316520691, "learning_rate": 0.00048496498222174334, "loss": 3.6414, "step": 17900 }, { "epoch": 1.931977182219352, "grad_norm": 0.5401586294174194, "learning_rate": 0.00048464174119168193, "loss": 3.6548, "step": 17950 }, { "epoch": 1.9373587342589604, "grad_norm": 0.5513443946838379, "learning_rate": 0.0004843185001616205, "loss": 3.6326, "step": 18000 }, { "epoch": 1.9373587342589604, "eval_accuracy": 0.36038424441858263, "eval_loss": 3.5883660316467285, "eval_runtime": 183.1957, "eval_samples_per_second": 98.316, "eval_steps_per_second": 6.146, "step": 18000 }, { "epoch": 1.9427402862985685, "grad_norm": 0.632521390914917, "learning_rate": 0.00048399525913155907, "loss": 3.6209, "step": 18050 }, { "epoch": 1.9481218383381766, "grad_norm": 0.5382032990455627, "learning_rate": 0.0004836720181014976, "loss": 3.6386, "step": 18100 }, { "epoch": 1.953503390377785, "grad_norm": 0.5061617493629456, "learning_rate": 0.0004833487770714362, "loss": 3.6401, "step": 18150 }, { "epoch": 1.9588849424173933, "grad_norm": 0.5783522129058838, "learning_rate": 0.00048302553604137485, "loss": 3.6428, "step": 18200 }, { "epoch": 1.9642664944570014, "grad_norm": 0.6195810437202454, "learning_rate": 0.0004827022950113134, "loss": 3.6187, "step": 18250 }, { "epoch": 1.9696480464966095, "grad_norm": 0.5604776740074158, "learning_rate": 0.000482379053981252, "loss": 3.6415, "step": 18300 }, { "epoch": 1.9750295985362178, "grad_norm": 0.5185155868530273, "learning_rate": 0.0004820558129511906, "loss": 3.6327, "step": 18350 }, { "epoch": 1.9804111505758262, "grad_norm": 0.6525479555130005, "learning_rate": 0.0004817325719211291, "loss": 3.6585, "step": 18400 }, { "epoch": 1.9857927026154343, "grad_norm": 0.5597861409187317, "learning_rate": 0.0004814093308910677, "loss": 3.6256, "step": 18450 }, { "epoch": 1.9911742546550424, "grad_norm": 0.5498181581497192, "learning_rate": 0.00048108608986100637, "loss": 3.6316, "step": 18500 }, { "epoch": 1.9965558066946507, "grad_norm": 0.5421364307403564, "learning_rate": 0.0004807628488309449, "loss": 3.6267, "step": 18550 }, { "epoch": 2.001937358734259, "grad_norm": 0.5200116038322449, "learning_rate": 0.0004804396078008835, "loss": 3.6031, "step": 18600 }, { "epoch": 2.007318910773867, "grad_norm": 0.5705385804176331, "learning_rate": 0.00048011636677082204, "loss": 3.5411, "step": 18650 }, { "epoch": 2.0127004628134753, "grad_norm": 0.5364378094673157, "learning_rate": 0.00047979312574076064, "loss": 3.5445, "step": 18700 }, { "epoch": 2.018082014853084, "grad_norm": 0.5557453632354736, "learning_rate": 0.0004794698847106992, "loss": 3.5594, "step": 18750 }, { "epoch": 2.023463566892692, "grad_norm": 0.5713274478912354, "learning_rate": 0.0004791466436806378, "loss": 3.5587, "step": 18800 }, { "epoch": 2.0288451189323, "grad_norm": 0.6158965826034546, "learning_rate": 0.0004788234026505764, "loss": 3.5378, "step": 18850 }, { "epoch": 2.034226670971908, "grad_norm": 0.572852611541748, "learning_rate": 0.00047850016162051496, "loss": 3.5545, "step": 18900 }, { "epoch": 2.0396082230115167, "grad_norm": 0.508951723575592, "learning_rate": 0.00047817692059045356, "loss": 3.5317, "step": 18950 }, { "epoch": 2.044989775051125, "grad_norm": 0.6339460015296936, "learning_rate": 0.00047785367956039215, "loss": 3.5578, "step": 19000 }, { "epoch": 2.044989775051125, "eval_accuracy": 0.36163223148801976, "eval_loss": 3.578120470046997, "eval_runtime": 182.344, "eval_samples_per_second": 98.775, "eval_steps_per_second": 6.175, "step": 19000 }, { "epoch": 2.050371327090733, "grad_norm": 0.5592548847198486, "learning_rate": 0.00047753043853033075, "loss": 3.5847, "step": 19050 }, { "epoch": 2.055752879130341, "grad_norm": 0.5482358932495117, "learning_rate": 0.00047720719750026934, "loss": 3.5428, "step": 19100 }, { "epoch": 2.0611344311699495, "grad_norm": 0.5582023859024048, "learning_rate": 0.00047688395647020793, "loss": 3.568, "step": 19150 }, { "epoch": 2.0665159832095576, "grad_norm": 0.5522531867027283, "learning_rate": 0.0004765607154401465, "loss": 3.5402, "step": 19200 }, { "epoch": 2.0718975352491658, "grad_norm": 0.5672833919525146, "learning_rate": 0.00047623747441008507, "loss": 3.5641, "step": 19250 }, { "epoch": 2.0772790872887743, "grad_norm": 0.5249935984611511, "learning_rate": 0.0004759142333800236, "loss": 3.5564, "step": 19300 }, { "epoch": 2.0826606393283824, "grad_norm": 0.5762811303138733, "learning_rate": 0.00047559099234996226, "loss": 3.542, "step": 19350 }, { "epoch": 2.0880421913679905, "grad_norm": 0.5417709946632385, "learning_rate": 0.00047526775131990085, "loss": 3.5579, "step": 19400 }, { "epoch": 2.0934237434075986, "grad_norm": 0.5435495376586914, "learning_rate": 0.0004749445102898394, "loss": 3.5733, "step": 19450 }, { "epoch": 2.098805295447207, "grad_norm": 0.6130183339118958, "learning_rate": 0.000474621269259778, "loss": 3.5351, "step": 19500 }, { "epoch": 2.1041868474868153, "grad_norm": 0.5624420642852783, "learning_rate": 0.0004742980282297166, "loss": 3.557, "step": 19550 }, { "epoch": 2.1095683995264234, "grad_norm": 0.548542320728302, "learning_rate": 0.0004739747871996551, "loss": 3.5597, "step": 19600 }, { "epoch": 2.1149499515660315, "grad_norm": 0.6133421063423157, "learning_rate": 0.00047365154616959377, "loss": 3.5479, "step": 19650 }, { "epoch": 2.12033150360564, "grad_norm": 0.5370809435844421, "learning_rate": 0.00047332830513953237, "loss": 3.5669, "step": 19700 }, { "epoch": 2.125713055645248, "grad_norm": 0.5745679140090942, "learning_rate": 0.0004730050641094709, "loss": 3.5413, "step": 19750 }, { "epoch": 2.1310946076848563, "grad_norm": 0.6464185118675232, "learning_rate": 0.0004726818230794095, "loss": 3.5614, "step": 19800 }, { "epoch": 2.1364761597244644, "grad_norm": 0.6298586130142212, "learning_rate": 0.0004723650468699493, "loss": 3.5592, "step": 19850 }, { "epoch": 2.141857711764073, "grad_norm": 0.5500888228416443, "learning_rate": 0.0004720482706604891, "loss": 3.557, "step": 19900 }, { "epoch": 2.147239263803681, "grad_norm": 0.5739681720733643, "learning_rate": 0.0004717250296304277, "loss": 3.5703, "step": 19950 }, { "epoch": 2.152620815843289, "grad_norm": 0.5937275290489197, "learning_rate": 0.0004714017886003663, "loss": 3.5739, "step": 20000 }, { "epoch": 2.152620815843289, "eval_accuracy": 0.36332993286012105, "eval_loss": 3.56706166267395, "eval_runtime": 179.8702, "eval_samples_per_second": 100.133, "eval_steps_per_second": 6.26, "step": 20000 }, { "epoch": 2.1580023678828972, "grad_norm": 0.5616446733474731, "learning_rate": 0.00047107854757030485, "loss": 3.5788, "step": 20050 }, { "epoch": 2.163383919922506, "grad_norm": 0.5950763821601868, "learning_rate": 0.0004707553065402435, "loss": 3.5544, "step": 20100 }, { "epoch": 2.168765471962114, "grad_norm": 0.586762011051178, "learning_rate": 0.0004704320655101821, "loss": 3.5593, "step": 20150 }, { "epoch": 2.174147024001722, "grad_norm": 0.5497268438339233, "learning_rate": 0.00047010882448012063, "loss": 3.5502, "step": 20200 }, { "epoch": 2.1795285760413305, "grad_norm": 0.6361690759658813, "learning_rate": 0.0004697855834500592, "loss": 3.5526, "step": 20250 }, { "epoch": 2.1849101280809387, "grad_norm": 0.5467177033424377, "learning_rate": 0.00046946234241999776, "loss": 3.5691, "step": 20300 }, { "epoch": 2.1902916801205468, "grad_norm": 0.5798904895782471, "learning_rate": 0.00046913910138993636, "loss": 3.5546, "step": 20350 }, { "epoch": 2.195673232160155, "grad_norm": 0.5986142754554749, "learning_rate": 0.000468815860359875, "loss": 3.5545, "step": 20400 }, { "epoch": 2.2010547841997634, "grad_norm": 0.5662965178489685, "learning_rate": 0.00046849261932981355, "loss": 3.5701, "step": 20450 }, { "epoch": 2.2064363362393715, "grad_norm": 0.5928505063056946, "learning_rate": 0.00046816937829975214, "loss": 3.5488, "step": 20500 }, { "epoch": 2.2118178882789796, "grad_norm": 0.592729389667511, "learning_rate": 0.00046784613726969074, "loss": 3.5633, "step": 20550 }, { "epoch": 2.2171994403185877, "grad_norm": 0.6153143048286438, "learning_rate": 0.0004675228962396293, "loss": 3.5494, "step": 20600 }, { "epoch": 2.2225809923581963, "grad_norm": 0.569177508354187, "learning_rate": 0.0004671996552095679, "loss": 3.5509, "step": 20650 }, { "epoch": 2.2279625443978044, "grad_norm": 0.5724853873252869, "learning_rate": 0.0004668764141795065, "loss": 3.5543, "step": 20700 }, { "epoch": 2.2333440964374125, "grad_norm": 0.5677852034568787, "learning_rate": 0.00046655317314944506, "loss": 3.5382, "step": 20750 }, { "epoch": 2.2387256484770206, "grad_norm": 0.6422692537307739, "learning_rate": 0.00046622993211938366, "loss": 3.5493, "step": 20800 }, { "epoch": 2.244107200516629, "grad_norm": 0.5729255676269531, "learning_rate": 0.0004659066910893222, "loss": 3.5507, "step": 20850 }, { "epoch": 2.2494887525562373, "grad_norm": 0.7229714393615723, "learning_rate": 0.0004655834500592608, "loss": 3.5629, "step": 20900 }, { "epoch": 2.2548703045958454, "grad_norm": 0.5660877227783203, "learning_rate": 0.00046526020902919944, "loss": 3.526, "step": 20950 }, { "epoch": 2.2602518566354535, "grad_norm": 0.616032600402832, "learning_rate": 0.000464936967999138, "loss": 3.5597, "step": 21000 }, { "epoch": 2.2602518566354535, "eval_accuracy": 0.3637671520806865, "eval_loss": 3.5601515769958496, "eval_runtime": 179.4785, "eval_samples_per_second": 100.352, "eval_steps_per_second": 6.274, "step": 21000 }, { "epoch": 2.265633408675062, "grad_norm": 0.546413004398346, "learning_rate": 0.0004646137269690766, "loss": 3.5763, "step": 21050 }, { "epoch": 2.27101496071467, "grad_norm": 0.5391570329666138, "learning_rate": 0.00046429048593901517, "loss": 3.5192, "step": 21100 }, { "epoch": 2.2763965127542782, "grad_norm": 0.5423991680145264, "learning_rate": 0.0004639672449089537, "loss": 3.5526, "step": 21150 }, { "epoch": 2.281778064793887, "grad_norm": 0.6076175570487976, "learning_rate": 0.0004636440038788923, "loss": 3.559, "step": 21200 }, { "epoch": 2.287159616833495, "grad_norm": 0.5601520538330078, "learning_rate": 0.00046332076284883095, "loss": 3.5698, "step": 21250 }, { "epoch": 2.292541168873103, "grad_norm": 0.5304701924324036, "learning_rate": 0.0004629975218187695, "loss": 3.5464, "step": 21300 }, { "epoch": 2.297922720912711, "grad_norm": 0.5973435640335083, "learning_rate": 0.0004626742807887081, "loss": 3.5339, "step": 21350 }, { "epoch": 2.303304272952319, "grad_norm": 0.5656226277351379, "learning_rate": 0.00046235103975864663, "loss": 3.5591, "step": 21400 }, { "epoch": 2.3086858249919278, "grad_norm": 0.5647458434104919, "learning_rate": 0.0004620277987285852, "loss": 3.5547, "step": 21450 }, { "epoch": 2.314067377031536, "grad_norm": 0.5909318923950195, "learning_rate": 0.0004617045576985239, "loss": 3.5391, "step": 21500 }, { "epoch": 2.319448929071144, "grad_norm": 0.6222209334373474, "learning_rate": 0.0004613813166684624, "loss": 3.5373, "step": 21550 }, { "epoch": 2.3248304811107525, "grad_norm": 0.5309708714485168, "learning_rate": 0.000461058075638401, "loss": 3.5349, "step": 21600 }, { "epoch": 2.3302120331503606, "grad_norm": 0.6044318675994873, "learning_rate": 0.00046073483460833955, "loss": 3.5298, "step": 21650 }, { "epoch": 2.3355935851899687, "grad_norm": 0.5997243523597717, "learning_rate": 0.00046041159357827814, "loss": 3.551, "step": 21700 }, { "epoch": 2.340975137229577, "grad_norm": 0.5723425149917603, "learning_rate": 0.00046008835254821674, "loss": 3.5645, "step": 21750 }, { "epoch": 2.3463566892691854, "grad_norm": 0.5611083507537842, "learning_rate": 0.0004597651115181554, "loss": 3.5489, "step": 21800 }, { "epoch": 2.3517382413087935, "grad_norm": 0.5662772059440613, "learning_rate": 0.00045944187048809393, "loss": 3.5535, "step": 21850 }, { "epoch": 2.3571197933484016, "grad_norm": 0.5749315023422241, "learning_rate": 0.0004591186294580325, "loss": 3.5401, "step": 21900 }, { "epoch": 2.3625013453880097, "grad_norm": 0.5483359694480896, "learning_rate": 0.00045879538842797106, "loss": 3.553, "step": 21950 }, { "epoch": 2.3678828974276183, "grad_norm": 0.5702618360519409, "learning_rate": 0.00045847214739790966, "loss": 3.5387, "step": 22000 }, { "epoch": 2.3678828974276183, "eval_accuracy": 0.3652301632151085, "eval_loss": 3.5506792068481445, "eval_runtime": 179.5022, "eval_samples_per_second": 100.339, "eval_steps_per_second": 6.273, "step": 22000 }, { "epoch": 2.3732644494672264, "grad_norm": 0.5987721681594849, "learning_rate": 0.0004581489063678482, "loss": 3.5629, "step": 22050 }, { "epoch": 2.3786460015068345, "grad_norm": 0.5932543873786926, "learning_rate": 0.00045782566533778685, "loss": 3.5549, "step": 22100 }, { "epoch": 2.384027553546443, "grad_norm": 0.5876075625419617, "learning_rate": 0.00045750242430772544, "loss": 3.5467, "step": 22150 }, { "epoch": 2.389409105586051, "grad_norm": 0.5543049573898315, "learning_rate": 0.000457179183277664, "loss": 3.5546, "step": 22200 }, { "epoch": 2.3947906576256592, "grad_norm": 0.5559258460998535, "learning_rate": 0.0004568559422476026, "loss": 3.5553, "step": 22250 }, { "epoch": 2.4001722096652673, "grad_norm": 0.5804566740989685, "learning_rate": 0.00045653270121754117, "loss": 3.5514, "step": 22300 }, { "epoch": 2.4055537617048754, "grad_norm": 0.6004949808120728, "learning_rate": 0.0004562094601874797, "loss": 3.5641, "step": 22350 }, { "epoch": 2.410935313744484, "grad_norm": 0.563329815864563, "learning_rate": 0.00045588621915741836, "loss": 3.5434, "step": 22400 }, { "epoch": 2.416316865784092, "grad_norm": 0.581366777420044, "learning_rate": 0.00045556297812735696, "loss": 3.5442, "step": 22450 }, { "epoch": 2.4216984178237, "grad_norm": 0.6063306331634521, "learning_rate": 0.0004552397370972955, "loss": 3.5368, "step": 22500 }, { "epoch": 2.4270799698633088, "grad_norm": 0.6148455142974854, "learning_rate": 0.0004549164960672341, "loss": 3.545, "step": 22550 }, { "epoch": 2.432461521902917, "grad_norm": 0.5941827893257141, "learning_rate": 0.0004545997198577739, "loss": 3.5557, "step": 22600 }, { "epoch": 2.437843073942525, "grad_norm": 0.5602982640266418, "learning_rate": 0.0004542764788277125, "loss": 3.5344, "step": 22650 }, { "epoch": 2.443224625982133, "grad_norm": 0.5603272318840027, "learning_rate": 0.00045395323779765103, "loss": 3.5663, "step": 22700 }, { "epoch": 2.4486061780217416, "grad_norm": 0.5608507990837097, "learning_rate": 0.0004536299967675897, "loss": 3.5418, "step": 22750 }, { "epoch": 2.4539877300613497, "grad_norm": 0.6365243196487427, "learning_rate": 0.0004533067557375283, "loss": 3.5495, "step": 22800 }, { "epoch": 2.459369282100958, "grad_norm": 0.5514809489250183, "learning_rate": 0.0004529835147074668, "loss": 3.5499, "step": 22850 }, { "epoch": 2.464750834140566, "grad_norm": 0.5582593679428101, "learning_rate": 0.0004526602736774054, "loss": 3.5552, "step": 22900 }, { "epoch": 2.4701323861801745, "grad_norm": 0.5393312573432922, "learning_rate": 0.00045233703264734395, "loss": 3.5355, "step": 22950 }, { "epoch": 2.4755139382197826, "grad_norm": 0.5282938480377197, "learning_rate": 0.00045201379161728255, "loss": 3.5421, "step": 23000 }, { "epoch": 2.4755139382197826, "eval_accuracy": 0.36613448120038844, "eval_loss": 3.537445068359375, "eval_runtime": 179.0202, "eval_samples_per_second": 100.609, "eval_steps_per_second": 6.29, "step": 23000 }, { "epoch": 2.4808954902593907, "grad_norm": 0.5586053133010864, "learning_rate": 0.0004516905505872212, "loss": 3.5589, "step": 23050 }, { "epoch": 2.4862770422989993, "grad_norm": 0.5327692031860352, "learning_rate": 0.00045136730955715973, "loss": 3.5308, "step": 23100 }, { "epoch": 2.4916585943386074, "grad_norm": 0.5942808389663696, "learning_rate": 0.00045104406852709833, "loss": 3.5534, "step": 23150 }, { "epoch": 2.4970401463782155, "grad_norm": 0.5541889667510986, "learning_rate": 0.0004507208274970369, "loss": 3.5468, "step": 23200 }, { "epoch": 2.5024216984178236, "grad_norm": 0.5596364140510559, "learning_rate": 0.00045039758646697546, "loss": 3.5618, "step": 23250 }, { "epoch": 2.5078032504574317, "grad_norm": 0.6264579892158508, "learning_rate": 0.0004500743454369141, "loss": 3.5583, "step": 23300 }, { "epoch": 2.5131848024970402, "grad_norm": 0.5607829093933105, "learning_rate": 0.0004497511044068527, "loss": 3.5352, "step": 23350 }, { "epoch": 2.5185663545366483, "grad_norm": 0.5586680173873901, "learning_rate": 0.00044942786337679125, "loss": 3.5583, "step": 23400 }, { "epoch": 2.5239479065762565, "grad_norm": 0.5748319625854492, "learning_rate": 0.00044910462234672984, "loss": 3.5533, "step": 23450 }, { "epoch": 2.529329458615865, "grad_norm": 0.6069074273109436, "learning_rate": 0.0004487813813166684, "loss": 3.5411, "step": 23500 }, { "epoch": 2.534711010655473, "grad_norm": 0.6651023626327515, "learning_rate": 0.000448458140286607, "loss": 3.5325, "step": 23550 }, { "epoch": 2.540092562695081, "grad_norm": 0.5722082257270813, "learning_rate": 0.00044813489925654563, "loss": 3.538, "step": 23600 }, { "epoch": 2.5454741147346893, "grad_norm": 0.5829025506973267, "learning_rate": 0.00044781165822648417, "loss": 3.5338, "step": 23650 }, { "epoch": 2.550855666774298, "grad_norm": 0.6493865251541138, "learning_rate": 0.00044748841719642276, "loss": 3.5337, "step": 23700 }, { "epoch": 2.556237218813906, "grad_norm": 0.624065101146698, "learning_rate": 0.00044716517616636136, "loss": 3.5621, "step": 23750 }, { "epoch": 2.561618770853514, "grad_norm": 0.5480876564979553, "learning_rate": 0.0004468419351362999, "loss": 3.5341, "step": 23800 }, { "epoch": 2.567000322893122, "grad_norm": 0.6361683011054993, "learning_rate": 0.0004465186941062385, "loss": 3.5563, "step": 23850 }, { "epoch": 2.5723818749327307, "grad_norm": 0.5823378562927246, "learning_rate": 0.00044619545307617714, "loss": 3.5492, "step": 23900 }, { "epoch": 2.577763426972339, "grad_norm": 0.6135628819465637, "learning_rate": 0.0004458722120461157, "loss": 3.5398, "step": 23950 }, { "epoch": 2.583144979011947, "grad_norm": 0.6044685244560242, "learning_rate": 0.0004455489710160543, "loss": 3.5439, "step": 24000 }, { "epoch": 2.583144979011947, "eval_accuracy": 0.36710627262900114, "eval_loss": 3.529404401779175, "eval_runtime": 179.2864, "eval_samples_per_second": 100.459, "eval_steps_per_second": 6.28, "step": 24000 }, { "epoch": 2.5885265310515555, "grad_norm": 0.6280840635299683, "learning_rate": 0.0004452257299859928, "loss": 3.5257, "step": 24050 }, { "epoch": 2.5939080830911636, "grad_norm": 0.5950229167938232, "learning_rate": 0.0004449024889559314, "loss": 3.5257, "step": 24100 }, { "epoch": 2.5992896351307717, "grad_norm": 0.5847386121749878, "learning_rate": 0.00044457924792587, "loss": 3.532, "step": 24150 }, { "epoch": 2.60467118717038, "grad_norm": 0.5832902789115906, "learning_rate": 0.0004442560068958086, "loss": 3.5449, "step": 24200 }, { "epoch": 2.610052739209988, "grad_norm": 0.5737943053245544, "learning_rate": 0.0004439327658657472, "loss": 3.5354, "step": 24250 }, { "epoch": 2.6154342912495965, "grad_norm": 0.6369310021400452, "learning_rate": 0.0004436095248356858, "loss": 3.5432, "step": 24300 }, { "epoch": 2.6208158432892046, "grad_norm": 0.5807034969329834, "learning_rate": 0.00044328628380562433, "loss": 3.5404, "step": 24350 }, { "epoch": 2.6261973953288127, "grad_norm": 0.5453730821609497, "learning_rate": 0.0004429630427755629, "loss": 3.5349, "step": 24400 }, { "epoch": 2.6315789473684212, "grad_norm": 0.5745566487312317, "learning_rate": 0.0004426398017455016, "loss": 3.5345, "step": 24450 }, { "epoch": 2.6369604994080293, "grad_norm": 0.6166813373565674, "learning_rate": 0.0004423165607154401, "loss": 3.5383, "step": 24500 }, { "epoch": 2.6423420514476375, "grad_norm": 0.5567078590393066, "learning_rate": 0.0004419933196853787, "loss": 3.5399, "step": 24550 }, { "epoch": 2.6477236034872456, "grad_norm": 0.5739792585372925, "learning_rate": 0.00044167007865531725, "loss": 3.5616, "step": 24600 }, { "epoch": 2.653105155526854, "grad_norm": 0.6040252447128296, "learning_rate": 0.00044134683762525584, "loss": 3.5345, "step": 24650 }, { "epoch": 2.658486707566462, "grad_norm": 0.5815364122390747, "learning_rate": 0.00044102359659519444, "loss": 3.5555, "step": 24700 }, { "epoch": 2.6638682596060703, "grad_norm": 0.6047835350036621, "learning_rate": 0.00044070035556513303, "loss": 3.538, "step": 24750 }, { "epoch": 2.6692498116456784, "grad_norm": 0.6201872229576111, "learning_rate": 0.00044037711453507163, "loss": 3.5486, "step": 24800 }, { "epoch": 2.674631363685287, "grad_norm": 0.5501798987388611, "learning_rate": 0.0004400538735050102, "loss": 3.5435, "step": 24850 }, { "epoch": 2.680012915724895, "grad_norm": 0.5451369881629944, "learning_rate": 0.00043973063247494876, "loss": 3.5211, "step": 24900 }, { "epoch": 2.685394467764503, "grad_norm": 0.5624337792396545, "learning_rate": 0.00043940739144488736, "loss": 3.5436, "step": 24950 }, { "epoch": 2.6907760198041117, "grad_norm": 0.6210815906524658, "learning_rate": 0.0004390841504148259, "loss": 3.551, "step": 25000 }, { "epoch": 2.6907760198041117, "eval_accuracy": 0.36806002767823665, "eval_loss": 3.5205116271972656, "eval_runtime": 179.8438, "eval_samples_per_second": 100.148, "eval_steps_per_second": 6.261, "step": 25000 }, { "epoch": 2.69615757184372, "grad_norm": 0.609665036201477, "learning_rate": 0.00043876090938476455, "loss": 3.5318, "step": 25050 }, { "epoch": 2.701539123883328, "grad_norm": 0.5681210160255432, "learning_rate": 0.00043843766835470314, "loss": 3.5439, "step": 25100 }, { "epoch": 2.706920675922936, "grad_norm": 0.5603126287460327, "learning_rate": 0.0004381144273246417, "loss": 3.5464, "step": 25150 }, { "epoch": 2.712302227962544, "grad_norm": 0.5702228546142578, "learning_rate": 0.0004377911862945803, "loss": 3.5436, "step": 25200 }, { "epoch": 2.7176837800021527, "grad_norm": 0.6090660095214844, "learning_rate": 0.00043746794526451887, "loss": 3.5304, "step": 25250 }, { "epoch": 2.723065332041761, "grad_norm": 0.5577659606933594, "learning_rate": 0.00043714470423445747, "loss": 3.5416, "step": 25300 }, { "epoch": 2.728446884081369, "grad_norm": 0.5535686016082764, "learning_rate": 0.00043682146320439606, "loss": 3.537, "step": 25350 }, { "epoch": 2.7338284361209775, "grad_norm": 0.5997450351715088, "learning_rate": 0.00043649822217433466, "loss": 3.5411, "step": 25400 }, { "epoch": 2.7392099881605856, "grad_norm": 0.5895914435386658, "learning_rate": 0.0004361749811442732, "loss": 3.5324, "step": 25450 }, { "epoch": 2.7445915402001937, "grad_norm": 0.5619504451751709, "learning_rate": 0.0004358517401142118, "loss": 3.537, "step": 25500 }, { "epoch": 2.749973092239802, "grad_norm": 0.6418375968933105, "learning_rate": 0.00043552849908415033, "loss": 3.542, "step": 25550 }, { "epoch": 2.7553546442794103, "grad_norm": 0.5967657566070557, "learning_rate": 0.000435205258054089, "loss": 3.5547, "step": 25600 }, { "epoch": 2.7607361963190185, "grad_norm": 0.6191643476486206, "learning_rate": 0.0004348820170240276, "loss": 3.5349, "step": 25650 }, { "epoch": 2.7661177483586266, "grad_norm": 0.6163985729217529, "learning_rate": 0.0004345587759939661, "loss": 3.535, "step": 25700 }, { "epoch": 2.7714993003982347, "grad_norm": 0.5719308257102966, "learning_rate": 0.0004342355349639047, "loss": 3.5499, "step": 25750 }, { "epoch": 2.776880852437843, "grad_norm": 0.6114965677261353, "learning_rate": 0.0004339122939338433, "loss": 3.5296, "step": 25800 }, { "epoch": 2.7822624044774513, "grad_norm": 0.5850511789321899, "learning_rate": 0.00043358905290378184, "loss": 3.534, "step": 25850 }, { "epoch": 2.7876439565170594, "grad_norm": 0.6050160527229309, "learning_rate": 0.0004332658118737205, "loss": 3.5341, "step": 25900 }, { "epoch": 2.793025508556668, "grad_norm": 0.5997806787490845, "learning_rate": 0.0004329425708436591, "loss": 3.5615, "step": 25950 }, { "epoch": 2.798407060596276, "grad_norm": 0.5498060584068298, "learning_rate": 0.00043261932981359763, "loss": 3.5305, "step": 26000 }, { "epoch": 2.798407060596276, "eval_accuracy": 0.3686556628092356, "eval_loss": 3.511131763458252, "eval_runtime": 179.1968, "eval_samples_per_second": 100.51, "eval_steps_per_second": 6.284, "step": 26000 }, { "epoch": 2.803788612635884, "grad_norm": 0.5489454865455627, "learning_rate": 0.0004322960887835362, "loss": 3.5271, "step": 26050 }, { "epoch": 2.8091701646754923, "grad_norm": 0.5742686986923218, "learning_rate": 0.00043197284775347476, "loss": 3.543, "step": 26100 }, { "epoch": 2.8145517167151004, "grad_norm": 0.5467936396598816, "learning_rate": 0.00043164960672341336, "loss": 3.539, "step": 26150 }, { "epoch": 2.819933268754709, "grad_norm": 0.5823588371276855, "learning_rate": 0.000431326365693352, "loss": 3.543, "step": 26200 }, { "epoch": 2.825314820794317, "grad_norm": 0.5678799748420715, "learning_rate": 0.00043100312466329055, "loss": 3.5221, "step": 26250 }, { "epoch": 2.830696372833925, "grad_norm": 0.6485151052474976, "learning_rate": 0.00043067988363322914, "loss": 3.5214, "step": 26300 }, { "epoch": 2.8360779248735337, "grad_norm": 0.7046056985855103, "learning_rate": 0.00043035664260316774, "loss": 3.5218, "step": 26350 }, { "epoch": 2.841459476913142, "grad_norm": 0.5689581632614136, "learning_rate": 0.0004300334015731063, "loss": 3.5213, "step": 26400 }, { "epoch": 2.84684102895275, "grad_norm": 0.5881664156913757, "learning_rate": 0.0004297101605430449, "loss": 3.5253, "step": 26450 }, { "epoch": 2.852222580992358, "grad_norm": 0.6149146556854248, "learning_rate": 0.0004293869195129835, "loss": 3.5354, "step": 26500 }, { "epoch": 2.857604133031966, "grad_norm": 0.5535168647766113, "learning_rate": 0.00042906367848292206, "loss": 3.5316, "step": 26550 }, { "epoch": 2.8629856850715747, "grad_norm": 0.5905988216400146, "learning_rate": 0.00042874690227346187, "loss": 3.5534, "step": 26600 }, { "epoch": 2.868367237111183, "grad_norm": 0.5926728248596191, "learning_rate": 0.00042842366124340046, "loss": 3.5224, "step": 26650 }, { "epoch": 2.873748789150791, "grad_norm": 0.6075058579444885, "learning_rate": 0.00042810042021333906, "loss": 3.5317, "step": 26700 }, { "epoch": 2.8791303411903995, "grad_norm": 0.5303025245666504, "learning_rate": 0.0004277771791832776, "loss": 3.5231, "step": 26750 }, { "epoch": 2.8845118932300076, "grad_norm": 0.6336898803710938, "learning_rate": 0.0004274539381532162, "loss": 3.5145, "step": 26800 }, { "epoch": 2.8898934452696157, "grad_norm": 0.6171728372573853, "learning_rate": 0.00042713069712315484, "loss": 3.5337, "step": 26850 }, { "epoch": 2.895274997309224, "grad_norm": 0.5975232720375061, "learning_rate": 0.0004268074560930934, "loss": 3.5473, "step": 26900 }, { "epoch": 2.9006565493488323, "grad_norm": 0.6211768984794617, "learning_rate": 0.000426484215063032, "loss": 3.5236, "step": 26950 }, { "epoch": 2.9060381013884404, "grad_norm": 0.5851085782051086, "learning_rate": 0.0004261609740329705, "loss": 3.5287, "step": 27000 }, { "epoch": 2.9060381013884404, "eval_accuracy": 0.3698442167490382, "eval_loss": 3.5010669231414795, "eval_runtime": 179.3282, "eval_samples_per_second": 100.436, "eval_steps_per_second": 6.279, "step": 27000 }, { "epoch": 2.9114196534280485, "grad_norm": 0.6108700633049011, "learning_rate": 0.0004258377330029091, "loss": 3.5186, "step": 27050 }, { "epoch": 2.9168012054676566, "grad_norm": 0.5464538931846619, "learning_rate": 0.00042551449197284776, "loss": 3.5305, "step": 27100 }, { "epoch": 2.922182757507265, "grad_norm": 0.6180171966552734, "learning_rate": 0.0004251912509427863, "loss": 3.5396, "step": 27150 }, { "epoch": 2.9275643095468733, "grad_norm": 0.5598770380020142, "learning_rate": 0.00042487447473332616, "loss": 3.5229, "step": 27200 }, { "epoch": 2.9329458615864814, "grad_norm": 0.5802903771400452, "learning_rate": 0.0004245512337032647, "loss": 3.5343, "step": 27250 }, { "epoch": 2.93832741362609, "grad_norm": 0.5852136611938477, "learning_rate": 0.0004242279926732033, "loss": 3.5192, "step": 27300 }, { "epoch": 2.943708965665698, "grad_norm": 0.6180029511451721, "learning_rate": 0.00042390475164314184, "loss": 3.5109, "step": 27350 }, { "epoch": 2.949090517705306, "grad_norm": 0.5679557919502258, "learning_rate": 0.00042358151061308043, "loss": 3.501, "step": 27400 }, { "epoch": 2.9544720697449143, "grad_norm": 0.5848575830459595, "learning_rate": 0.000423258269583019, "loss": 3.5109, "step": 27450 }, { "epoch": 2.9598536217845224, "grad_norm": 0.5937060713768005, "learning_rate": 0.0004229350285529576, "loss": 3.5339, "step": 27500 }, { "epoch": 2.965235173824131, "grad_norm": 0.6204190254211426, "learning_rate": 0.0004226117875228962, "loss": 3.5337, "step": 27550 }, { "epoch": 2.970616725863739, "grad_norm": 0.6267488598823547, "learning_rate": 0.0004222885464928348, "loss": 3.5302, "step": 27600 }, { "epoch": 2.975998277903347, "grad_norm": 0.6439998149871826, "learning_rate": 0.00042196530546277335, "loss": 3.544, "step": 27650 }, { "epoch": 2.9813798299429557, "grad_norm": 0.5691748857498169, "learning_rate": 0.00042164206443271195, "loss": 3.5132, "step": 27700 }, { "epoch": 2.986761381982564, "grad_norm": 0.5572154521942139, "learning_rate": 0.0004213188234026505, "loss": 3.5259, "step": 27750 }, { "epoch": 2.992142934022172, "grad_norm": 0.5671584010124207, "learning_rate": 0.00042099558237258914, "loss": 3.5239, "step": 27800 }, { "epoch": 2.9975244860617805, "grad_norm": 0.5981967449188232, "learning_rate": 0.00042067234134252773, "loss": 3.5162, "step": 27850 }, { "epoch": 3.0029060381013886, "grad_norm": 0.6473037004470825, "learning_rate": 0.00042034910031246627, "loss": 3.4669, "step": 27900 }, { "epoch": 3.0082875901409967, "grad_norm": 0.550574541091919, "learning_rate": 0.00042002585928240486, "loss": 3.4256, "step": 27950 }, { "epoch": 3.0136691421806048, "grad_norm": 0.656923770904541, "learning_rate": 0.00041970261825234346, "loss": 3.448, "step": 28000 }, { "epoch": 3.0136691421806048, "eval_accuracy": 0.37074321074281524, "eval_loss": 3.4990992546081543, "eval_runtime": 178.84, "eval_samples_per_second": 100.71, "eval_steps_per_second": 6.296, "step": 28000 }, { "epoch": 3.0190506942202133, "grad_norm": 0.620670735836029, "learning_rate": 0.00041937937722228205, "loss": 3.429, "step": 28050 }, { "epoch": 3.0244322462598214, "grad_norm": 0.7033103704452515, "learning_rate": 0.00041905613619222065, "loss": 3.4355, "step": 28100 }, { "epoch": 3.0298137982994295, "grad_norm": 0.5861430168151855, "learning_rate": 0.00041873289516215924, "loss": 3.4417, "step": 28150 }, { "epoch": 3.0351953503390376, "grad_norm": 0.761894702911377, "learning_rate": 0.0004184096541320978, "loss": 3.4445, "step": 28200 }, { "epoch": 3.040576902378646, "grad_norm": 0.5738074779510498, "learning_rate": 0.0004180864131020364, "loss": 3.4452, "step": 28250 }, { "epoch": 3.0459584544182543, "grad_norm": 0.6064544320106506, "learning_rate": 0.0004177631720719749, "loss": 3.438, "step": 28300 }, { "epoch": 3.0513400064578624, "grad_norm": 0.6525137424468994, "learning_rate": 0.00041743993104191357, "loss": 3.4415, "step": 28350 }, { "epoch": 3.0567215584974705, "grad_norm": 0.590854287147522, "learning_rate": 0.00041711669001185216, "loss": 3.4301, "step": 28400 }, { "epoch": 3.062103110537079, "grad_norm": 0.5857189297676086, "learning_rate": 0.0004167934489817907, "loss": 3.4297, "step": 28450 }, { "epoch": 3.067484662576687, "grad_norm": 0.6011194586753845, "learning_rate": 0.0004164702079517293, "loss": 3.4315, "step": 28500 }, { "epoch": 3.0728662146162953, "grad_norm": 0.5845563411712646, "learning_rate": 0.0004161469669216679, "loss": 3.4628, "step": 28550 }, { "epoch": 3.0782477666559034, "grad_norm": 0.581007182598114, "learning_rate": 0.00041582372589160643, "loss": 3.4393, "step": 28600 }, { "epoch": 3.083629318695512, "grad_norm": 0.5793245434761047, "learning_rate": 0.0004155004848615451, "loss": 3.435, "step": 28650 }, { "epoch": 3.08901087073512, "grad_norm": 0.6015080809593201, "learning_rate": 0.0004151772438314837, "loss": 3.4343, "step": 28700 }, { "epoch": 3.094392422774728, "grad_norm": 0.6130943894386292, "learning_rate": 0.0004148540028014222, "loss": 3.4579, "step": 28750 }, { "epoch": 3.0997739748143363, "grad_norm": 0.6099987626075745, "learning_rate": 0.0004145307617713608, "loss": 3.4259, "step": 28800 }, { "epoch": 3.105155526853945, "grad_norm": 0.5792219638824463, "learning_rate": 0.00041420752074129935, "loss": 3.4132, "step": 28850 }, { "epoch": 3.110537078893553, "grad_norm": 0.5792203545570374, "learning_rate": 0.000413884279711238, "loss": 3.4499, "step": 28900 }, { "epoch": 3.115918630933161, "grad_norm": 0.6480391621589661, "learning_rate": 0.0004135610386811766, "loss": 3.4545, "step": 28950 }, { "epoch": 3.121300182972769, "grad_norm": 0.584260880947113, "learning_rate": 0.00041323779765111514, "loss": 3.4403, "step": 29000 }, { "epoch": 3.121300182972769, "eval_accuracy": 0.3711630455013304, "eval_loss": 3.4935173988342285, "eval_runtime": 179.0537, "eval_samples_per_second": 100.59, "eval_steps_per_second": 6.289, "step": 29000 }, { "epoch": 3.1266817350123777, "grad_norm": 0.6889036893844604, "learning_rate": 0.00041291455662105373, "loss": 3.4449, "step": 29050 }, { "epoch": 3.132063287051986, "grad_norm": 0.5993597507476807, "learning_rate": 0.0004125913155909923, "loss": 3.4489, "step": 29100 }, { "epoch": 3.137444839091594, "grad_norm": 0.5783586502075195, "learning_rate": 0.00041226807456093087, "loss": 3.4628, "step": 29150 }, { "epoch": 3.1428263911312024, "grad_norm": 0.6329275965690613, "learning_rate": 0.0004119448335308695, "loss": 3.4342, "step": 29200 }, { "epoch": 3.1482079431708105, "grad_norm": 0.5965296626091003, "learning_rate": 0.0004116215925008081, "loss": 3.4469, "step": 29250 }, { "epoch": 3.1535894952104186, "grad_norm": 0.5802229046821594, "learning_rate": 0.00041129835147074665, "loss": 3.4513, "step": 29300 }, { "epoch": 3.1589710472500268, "grad_norm": 0.6095985770225525, "learning_rate": 0.00041097511044068524, "loss": 3.4548, "step": 29350 }, { "epoch": 3.1643525992896353, "grad_norm": 0.6009272933006287, "learning_rate": 0.0004106518694106238, "loss": 3.4591, "step": 29400 }, { "epoch": 3.1697341513292434, "grad_norm": 0.6169276237487793, "learning_rate": 0.0004103286283805624, "loss": 3.4587, "step": 29450 }, { "epoch": 3.1751157033688515, "grad_norm": 0.6713859438896179, "learning_rate": 0.00041000538735050103, "loss": 3.4482, "step": 29500 }, { "epoch": 3.1804972554084596, "grad_norm": 0.5751778483390808, "learning_rate": 0.00040968214632043957, "loss": 3.4539, "step": 29550 }, { "epoch": 3.185878807448068, "grad_norm": 0.6202540397644043, "learning_rate": 0.00040935890529037816, "loss": 3.4528, "step": 29600 }, { "epoch": 3.1912603594876763, "grad_norm": 0.6211392879486084, "learning_rate": 0.00040903566426031676, "loss": 3.4564, "step": 29650 }, { "epoch": 3.1966419115272844, "grad_norm": 0.6369031071662903, "learning_rate": 0.0004087124232302553, "loss": 3.4653, "step": 29700 }, { "epoch": 3.2020234635668925, "grad_norm": 0.6040171384811401, "learning_rate": 0.0004083891822001939, "loss": 3.458, "step": 29750 }, { "epoch": 3.207405015606501, "grad_norm": 0.6595917344093323, "learning_rate": 0.00040806594117013254, "loss": 3.453, "step": 29800 }, { "epoch": 3.212786567646109, "grad_norm": 0.6135461330413818, "learning_rate": 0.0004077427001400711, "loss": 3.4446, "step": 29850 }, { "epoch": 3.2181681196857173, "grad_norm": 0.6126970648765564, "learning_rate": 0.0004074194591100097, "loss": 3.4321, "step": 29900 }, { "epoch": 3.2235496717253254, "grad_norm": 0.6935044527053833, "learning_rate": 0.0004070962180799482, "loss": 3.4546, "step": 29950 }, { "epoch": 3.228931223764934, "grad_norm": 0.6535180807113647, "learning_rate": 0.0004067729770498868, "loss": 3.4615, "step": 30000 }, { "epoch": 3.228931223764934, "eval_accuracy": 0.3719571894583664, "eval_loss": 3.48696231842041, "eval_runtime": 179.286, "eval_samples_per_second": 100.46, "eval_steps_per_second": 6.28, "step": 30000 }, { "epoch": 3.234312775804542, "grad_norm": 0.619732141494751, "learning_rate": 0.00040644973601982546, "loss": 3.4334, "step": 30050 }, { "epoch": 3.23969432784415, "grad_norm": 0.6059393882751465, "learning_rate": 0.000406126494989764, "loss": 3.4488, "step": 30100 }, { "epoch": 3.2450758798837587, "grad_norm": 0.6176928877830505, "learning_rate": 0.0004058032539597026, "loss": 3.4561, "step": 30150 }, { "epoch": 3.250457431923367, "grad_norm": 0.5531356334686279, "learning_rate": 0.00040548001292964114, "loss": 3.4478, "step": 30200 }, { "epoch": 3.255838983962975, "grad_norm": 0.6354610919952393, "learning_rate": 0.00040515677189957973, "loss": 3.4416, "step": 30250 }, { "epoch": 3.261220536002583, "grad_norm": 0.6155176758766174, "learning_rate": 0.0004048335308695183, "loss": 3.4479, "step": 30300 }, { "epoch": 3.2666020880421915, "grad_norm": 0.6497113704681396, "learning_rate": 0.0004045102898394569, "loss": 3.4545, "step": 30350 }, { "epoch": 3.2719836400817996, "grad_norm": 0.6843705177307129, "learning_rate": 0.0004041870488093955, "loss": 3.4663, "step": 30400 }, { "epoch": 3.2773651921214078, "grad_norm": 0.5940167903900146, "learning_rate": 0.0004038638077793341, "loss": 3.4506, "step": 30450 }, { "epoch": 3.282746744161016, "grad_norm": 0.5947675704956055, "learning_rate": 0.00040354056674927265, "loss": 3.4674, "step": 30500 }, { "epoch": 3.2881282962006244, "grad_norm": 0.5861588716506958, "learning_rate": 0.00040321732571921124, "loss": 3.4518, "step": 30550 }, { "epoch": 3.2935098482402325, "grad_norm": 0.5792436003684998, "learning_rate": 0.0004028940846891498, "loss": 3.4557, "step": 30600 }, { "epoch": 3.2988914002798406, "grad_norm": 0.718202531337738, "learning_rate": 0.00040257084365908843, "loss": 3.4538, "step": 30650 }, { "epoch": 3.304272952319449, "grad_norm": 0.707562267780304, "learning_rate": 0.00040224760262902703, "loss": 3.4609, "step": 30700 }, { "epoch": 3.3096545043590573, "grad_norm": 0.6532739400863647, "learning_rate": 0.00040192436159896557, "loss": 3.4803, "step": 30750 }, { "epoch": 3.3150360563986654, "grad_norm": 0.5957037210464478, "learning_rate": 0.00040160112056890416, "loss": 3.4379, "step": 30800 }, { "epoch": 3.3204176084382735, "grad_norm": 0.5771480798721313, "learning_rate": 0.00040127787953884276, "loss": 3.4441, "step": 30850 }, { "epoch": 3.3257991604778816, "grad_norm": 0.5710120797157288, "learning_rate": 0.00040095463850878135, "loss": 3.4541, "step": 30900 }, { "epoch": 3.33118071251749, "grad_norm": 0.5772055387496948, "learning_rate": 0.00040063139747871995, "loss": 3.4533, "step": 30950 }, { "epoch": 3.3365622645570983, "grad_norm": 0.5999658703804016, "learning_rate": 0.00040030815644865854, "loss": 3.4563, "step": 31000 }, { "epoch": 3.3365622645570983, "eval_accuracy": 0.3728993186486058, "eval_loss": 3.479647397994995, "eval_runtime": 179.0896, "eval_samples_per_second": 100.57, "eval_steps_per_second": 6.287, "step": 31000 }, { "epoch": 3.3419438165967064, "grad_norm": 0.6468915939331055, "learning_rate": 0.0003999849154185971, "loss": 3.4626, "step": 31050 }, { "epoch": 3.347325368636315, "grad_norm": 0.585728108882904, "learning_rate": 0.0003996616743885357, "loss": 3.4419, "step": 31100 }, { "epoch": 3.352706920675923, "grad_norm": 0.6033294796943665, "learning_rate": 0.0003993384333584742, "loss": 3.4547, "step": 31150 }, { "epoch": 3.358088472715531, "grad_norm": 0.590506374835968, "learning_rate": 0.0003990216571490141, "loss": 3.4594, "step": 31200 }, { "epoch": 3.3634700247551392, "grad_norm": 0.5833864808082581, "learning_rate": 0.0003986984161189526, "loss": 3.4588, "step": 31250 }, { "epoch": 3.368851576794748, "grad_norm": 0.6163685321807861, "learning_rate": 0.00039837517508889127, "loss": 3.4569, "step": 31300 }, { "epoch": 3.374233128834356, "grad_norm": 0.5709459781646729, "learning_rate": 0.00039805193405882986, "loss": 3.4552, "step": 31350 }, { "epoch": 3.379614680873964, "grad_norm": 0.6267880201339722, "learning_rate": 0.0003977286930287684, "loss": 3.4539, "step": 31400 }, { "epoch": 3.384996232913572, "grad_norm": 0.6592099070549011, "learning_rate": 0.000397405451998707, "loss": 3.4596, "step": 31450 }, { "epoch": 3.3903777849531807, "grad_norm": 0.5931367874145508, "learning_rate": 0.00039708221096864554, "loss": 3.4444, "step": 31500 }, { "epoch": 3.3957593369927888, "grad_norm": 0.5845255851745605, "learning_rate": 0.00039675896993858413, "loss": 3.447, "step": 31550 }, { "epoch": 3.401140889032397, "grad_norm": 0.58004230260849, "learning_rate": 0.0003964357289085228, "loss": 3.4497, "step": 31600 }, { "epoch": 3.4065224410720054, "grad_norm": 0.6078285574913025, "learning_rate": 0.0003961124878784613, "loss": 3.4546, "step": 31650 }, { "epoch": 3.4119039931116135, "grad_norm": 0.6139497756958008, "learning_rate": 0.0003957892468483999, "loss": 3.4651, "step": 31700 }, { "epoch": 3.4172855451512216, "grad_norm": 0.5797174572944641, "learning_rate": 0.0003954660058183385, "loss": 3.479, "step": 31750 }, { "epoch": 3.4226670971908297, "grad_norm": 0.5693271160125732, "learning_rate": 0.00039514276478827705, "loss": 3.4581, "step": 31800 }, { "epoch": 3.428048649230438, "grad_norm": 0.6098562479019165, "learning_rate": 0.0003948195237582157, "loss": 3.4598, "step": 31850 }, { "epoch": 3.4334302012700464, "grad_norm": 0.6037207841873169, "learning_rate": 0.0003944962827281543, "loss": 3.4733, "step": 31900 }, { "epoch": 3.4388117533096545, "grad_norm": 0.652802586555481, "learning_rate": 0.00039417304169809284, "loss": 3.4701, "step": 31950 }, { "epoch": 3.4441933053492626, "grad_norm": 0.6588259935379028, "learning_rate": 0.00039384980066803143, "loss": 3.4406, "step": 32000 }, { "epoch": 3.4441933053492626, "eval_accuracy": 0.3734812635157402, "eval_loss": 3.4747695922851562, "eval_runtime": 179.1046, "eval_samples_per_second": 100.561, "eval_steps_per_second": 6.287, "step": 32000 }, { "epoch": 3.449574857388871, "grad_norm": 0.5948982834815979, "learning_rate": 0.00039352655963796997, "loss": 3.4522, "step": 32050 }, { "epoch": 3.4549564094284793, "grad_norm": 0.5928410291671753, "learning_rate": 0.00039320331860790857, "loss": 3.44, "step": 32100 }, { "epoch": 3.4603379614680874, "grad_norm": 0.6897343993186951, "learning_rate": 0.0003928800775778472, "loss": 3.4647, "step": 32150 }, { "epoch": 3.4657195135076955, "grad_norm": 0.6423550844192505, "learning_rate": 0.00039255683654778576, "loss": 3.4764, "step": 32200 }, { "epoch": 3.471101065547304, "grad_norm": 0.6164859533309937, "learning_rate": 0.00039223359551772435, "loss": 3.4429, "step": 32250 }, { "epoch": 3.476482617586912, "grad_norm": 0.6273153424263, "learning_rate": 0.00039191035448766294, "loss": 3.4514, "step": 32300 }, { "epoch": 3.4818641696265202, "grad_norm": 0.6922532916069031, "learning_rate": 0.0003915871134576015, "loss": 3.4654, "step": 32350 }, { "epoch": 3.4872457216661283, "grad_norm": 0.6379693150520325, "learning_rate": 0.0003912638724275401, "loss": 3.4567, "step": 32400 }, { "epoch": 3.492627273705737, "grad_norm": 0.5990399718284607, "learning_rate": 0.00039094063139747873, "loss": 3.4531, "step": 32450 }, { "epoch": 3.498008825745345, "grad_norm": 0.6209031343460083, "learning_rate": 0.00039061739036741727, "loss": 3.4531, "step": 32500 }, { "epoch": 3.503390377784953, "grad_norm": 0.6115899085998535, "learning_rate": 0.00039029414933735586, "loss": 3.4524, "step": 32550 }, { "epoch": 3.5087719298245617, "grad_norm": 0.6001299619674683, "learning_rate": 0.0003899709083072944, "loss": 3.4439, "step": 32600 }, { "epoch": 3.5141534818641698, "grad_norm": 0.5546129941940308, "learning_rate": 0.000389647667277233, "loss": 3.4577, "step": 32650 }, { "epoch": 3.519535033903778, "grad_norm": 0.6230340600013733, "learning_rate": 0.00038932442624717165, "loss": 3.4563, "step": 32700 }, { "epoch": 3.524916585943386, "grad_norm": 0.5751449465751648, "learning_rate": 0.0003890011852171102, "loss": 3.4683, "step": 32750 }, { "epoch": 3.530298137982994, "grad_norm": 0.5927337408065796, "learning_rate": 0.0003886779441870488, "loss": 3.4581, "step": 32800 }, { "epoch": 3.5356796900226026, "grad_norm": 0.6218268871307373, "learning_rate": 0.0003883547031569874, "loss": 3.4406, "step": 32850 }, { "epoch": 3.5410612420622107, "grad_norm": 0.6192695498466492, "learning_rate": 0.0003880314621269259, "loss": 3.4409, "step": 32900 }, { "epoch": 3.546442794101819, "grad_norm": 0.6310914158821106, "learning_rate": 0.0003877082210968645, "loss": 3.4578, "step": 32950 }, { "epoch": 3.5518243461414274, "grad_norm": 0.6243562698364258, "learning_rate": 0.00038738498006680316, "loss": 3.4574, "step": 33000 }, { "epoch": 3.5518243461414274, "eval_accuracy": 0.37389370987788395, "eval_loss": 3.4683125019073486, "eval_runtime": 179.3203, "eval_samples_per_second": 100.44, "eval_steps_per_second": 6.279, "step": 33000 }, { "epoch": 3.5572058981810355, "grad_norm": 0.6324516534805298, "learning_rate": 0.0003870617390367417, "loss": 3.4369, "step": 33050 }, { "epoch": 3.5625874502206436, "grad_norm": 0.5811484456062317, "learning_rate": 0.0003867384980066803, "loss": 3.4627, "step": 33100 }, { "epoch": 3.5679690022602517, "grad_norm": 0.7062387466430664, "learning_rate": 0.00038641525697661884, "loss": 3.4627, "step": 33150 }, { "epoch": 3.57335055429986, "grad_norm": 0.6775194406509399, "learning_rate": 0.00038609201594655743, "loss": 3.4499, "step": 33200 }, { "epoch": 3.5787321063394684, "grad_norm": 0.6260322332382202, "learning_rate": 0.000385768774916496, "loss": 3.4538, "step": 33250 }, { "epoch": 3.5841136583790765, "grad_norm": 0.6278160214424133, "learning_rate": 0.0003854455338864346, "loss": 3.4616, "step": 33300 }, { "epoch": 3.5894952104186846, "grad_norm": 0.5945996046066284, "learning_rate": 0.0003851222928563732, "loss": 3.4556, "step": 33350 }, { "epoch": 3.594876762458293, "grad_norm": 0.6463197469711304, "learning_rate": 0.0003847990518263118, "loss": 3.4489, "step": 33400 }, { "epoch": 3.6002583144979012, "grad_norm": 0.5937623381614685, "learning_rate": 0.00038447581079625035, "loss": 3.457, "step": 33450 }, { "epoch": 3.6056398665375093, "grad_norm": 0.5999521017074585, "learning_rate": 0.00038415256976618895, "loss": 3.4546, "step": 33500 }, { "epoch": 3.611021418577118, "grad_norm": 0.6468307375907898, "learning_rate": 0.0003838293287361275, "loss": 3.4781, "step": 33550 }, { "epoch": 3.616402970616726, "grad_norm": 0.6306170225143433, "learning_rate": 0.00038350608770606613, "loss": 3.4605, "step": 33600 }, { "epoch": 3.621784522656334, "grad_norm": 0.5937240123748779, "learning_rate": 0.00038318284667600473, "loss": 3.4531, "step": 33650 }, { "epoch": 3.627166074695942, "grad_norm": 0.6180366277694702, "learning_rate": 0.00038285960564594327, "loss": 3.4572, "step": 33700 }, { "epoch": 3.6325476267355503, "grad_norm": 0.6059223413467407, "learning_rate": 0.00038253636461588186, "loss": 3.437, "step": 33750 }, { "epoch": 3.637929178775159, "grad_norm": 0.6537757515907288, "learning_rate": 0.00038221312358582046, "loss": 3.4549, "step": 33800 }, { "epoch": 3.643310730814767, "grad_norm": 0.640531063079834, "learning_rate": 0.00038188988255575905, "loss": 3.4548, "step": 33850 }, { "epoch": 3.648692282854375, "grad_norm": 0.6328907608985901, "learning_rate": 0.00038156664152569765, "loss": 3.4805, "step": 33900 }, { "epoch": 3.6540738348939836, "grad_norm": 0.6357612013816833, "learning_rate": 0.00038124340049563624, "loss": 3.4324, "step": 33950 }, { "epoch": 3.6594553869335917, "grad_norm": 0.6222441792488098, "learning_rate": 0.0003809201594655748, "loss": 3.4647, "step": 34000 }, { "epoch": 3.6594553869335917, "eval_accuracy": 0.37435081257691943, "eval_loss": 3.4609298706054688, "eval_runtime": 179.0439, "eval_samples_per_second": 100.595, "eval_steps_per_second": 6.289, "step": 34000 }, { "epoch": 3.6648369389732, "grad_norm": 0.6288761496543884, "learning_rate": 0.0003805969184355134, "loss": 3.4532, "step": 34050 }, { "epoch": 3.670218491012808, "grad_norm": 0.6488564014434814, "learning_rate": 0.0003802736774054519, "loss": 3.4541, "step": 34100 }, { "epoch": 3.675600043052416, "grad_norm": 0.5783025026321411, "learning_rate": 0.00037995043637539057, "loss": 3.4668, "step": 34150 }, { "epoch": 3.6809815950920246, "grad_norm": 0.6275387406349182, "learning_rate": 0.00037962719534532916, "loss": 3.4485, "step": 34200 }, { "epoch": 3.6863631471316327, "grad_norm": 0.6633421182632446, "learning_rate": 0.0003793039543152677, "loss": 3.4531, "step": 34250 }, { "epoch": 3.691744699171241, "grad_norm": 0.5747531056404114, "learning_rate": 0.0003789807132852063, "loss": 3.4494, "step": 34300 }, { "epoch": 3.6971262512108494, "grad_norm": 0.6330657601356506, "learning_rate": 0.0003786574722551449, "loss": 3.4509, "step": 34350 }, { "epoch": 3.7025078032504575, "grad_norm": 0.6217444539070129, "learning_rate": 0.00037833423122508343, "loss": 3.4681, "step": 34400 }, { "epoch": 3.7078893552900656, "grad_norm": 0.6288900375366211, "learning_rate": 0.0003780109901950221, "loss": 3.4492, "step": 34450 }, { "epoch": 3.713270907329674, "grad_norm": 0.6115100383758545, "learning_rate": 0.0003776877491649607, "loss": 3.457, "step": 34500 }, { "epoch": 3.7186524593692822, "grad_norm": 0.6019355654716492, "learning_rate": 0.0003773645081348992, "loss": 3.4513, "step": 34550 }, { "epoch": 3.7240340114088903, "grad_norm": 0.6346986889839172, "learning_rate": 0.0003770412671048378, "loss": 3.4432, "step": 34600 }, { "epoch": 3.7294155634484984, "grad_norm": 0.672346830368042, "learning_rate": 0.00037671802607477635, "loss": 3.4673, "step": 34650 }, { "epoch": 3.7347971154881066, "grad_norm": 0.6043028831481934, "learning_rate": 0.000376394785044715, "loss": 3.4596, "step": 34700 }, { "epoch": 3.740178667527715, "grad_norm": 0.5926814079284668, "learning_rate": 0.0003760715440146536, "loss": 3.4503, "step": 34750 }, { "epoch": 3.745560219567323, "grad_norm": 0.5881823897361755, "learning_rate": 0.00037574830298459214, "loss": 3.4753, "step": 34800 }, { "epoch": 3.7509417716069313, "grad_norm": 0.666195809841156, "learning_rate": 0.00037542506195453073, "loss": 3.4442, "step": 34850 }, { "epoch": 3.75632332364654, "grad_norm": 0.6226490139961243, "learning_rate": 0.0003751018209244693, "loss": 3.4731, "step": 34900 }, { "epoch": 3.761704875686148, "grad_norm": 0.6833046674728394, "learning_rate": 0.00037477857989440787, "loss": 3.4444, "step": 34950 }, { "epoch": 3.767086427725756, "grad_norm": 0.6438870429992676, "learning_rate": 0.0003744553388643465, "loss": 3.4859, "step": 35000 }, { "epoch": 3.767086427725756, "eval_accuracy": 0.37514006715400383, "eval_loss": 3.4551682472229004, "eval_runtime": 179.0275, "eval_samples_per_second": 100.605, "eval_steps_per_second": 6.29, "step": 35000 }, { "epoch": 3.772467979765364, "grad_norm": 0.659283459186554, "learning_rate": 0.0003741320978342851, "loss": 3.4453, "step": 35050 }, { "epoch": 3.7778495318049723, "grad_norm": 0.696498453617096, "learning_rate": 0.00037380885680422365, "loss": 3.429, "step": 35100 }, { "epoch": 3.783231083844581, "grad_norm": 0.6495005488395691, "learning_rate": 0.00037348561577416224, "loss": 3.4386, "step": 35150 }, { "epoch": 3.788612635884189, "grad_norm": 0.6730723977088928, "learning_rate": 0.0003731623747441008, "loss": 3.4539, "step": 35200 }, { "epoch": 3.793994187923797, "grad_norm": 0.6224913597106934, "learning_rate": 0.00037284559853464064, "loss": 3.4536, "step": 35250 }, { "epoch": 3.7993757399634056, "grad_norm": 0.6458625793457031, "learning_rate": 0.0003725223575045792, "loss": 3.4476, "step": 35300 }, { "epoch": 3.8047572920030137, "grad_norm": 0.6062401533126831, "learning_rate": 0.0003721991164745178, "loss": 3.431, "step": 35350 }, { "epoch": 3.810138844042622, "grad_norm": 0.6031923890113831, "learning_rate": 0.00037187587544445643, "loss": 3.4511, "step": 35400 }, { "epoch": 3.8155203960822304, "grad_norm": 0.7520219087600708, "learning_rate": 0.00037155263441439497, "loss": 3.4427, "step": 35450 }, { "epoch": 3.8209019481218385, "grad_norm": 0.6550789475440979, "learning_rate": 0.00037122939338433356, "loss": 3.4508, "step": 35500 }, { "epoch": 3.8262835001614466, "grad_norm": 0.604832112789154, "learning_rate": 0.0003709061523542721, "loss": 3.4375, "step": 35550 }, { "epoch": 3.8316650522010547, "grad_norm": 0.6706380844116211, "learning_rate": 0.0003705829113242107, "loss": 3.449, "step": 35600 }, { "epoch": 3.837046604240663, "grad_norm": 0.6066935062408447, "learning_rate": 0.00037025967029414935, "loss": 3.4552, "step": 35650 }, { "epoch": 3.8424281562802713, "grad_norm": 0.5741865038871765, "learning_rate": 0.0003699364292640879, "loss": 3.4315, "step": 35700 }, { "epoch": 3.8478097083198795, "grad_norm": 0.6105890274047852, "learning_rate": 0.0003696131882340265, "loss": 3.4611, "step": 35750 }, { "epoch": 3.8531912603594876, "grad_norm": 0.6041397452354431, "learning_rate": 0.0003692899472039651, "loss": 3.4443, "step": 35800 }, { "epoch": 3.858572812399096, "grad_norm": 0.5912279486656189, "learning_rate": 0.0003689667061739036, "loss": 3.4535, "step": 35850 }, { "epoch": 3.863954364438704, "grad_norm": 0.620097815990448, "learning_rate": 0.0003686434651438422, "loss": 3.4531, "step": 35900 }, { "epoch": 3.8693359164783123, "grad_norm": 0.5963624715805054, "learning_rate": 0.00036832022411378086, "loss": 3.4493, "step": 35950 }, { "epoch": 3.8747174685179204, "grad_norm": 0.6551373600959778, "learning_rate": 0.0003679969830837194, "loss": 3.4496, "step": 36000 }, { "epoch": 3.8747174685179204, "eval_accuracy": 0.37571505823631807, "eval_loss": 3.4498140811920166, "eval_runtime": 179.3248, "eval_samples_per_second": 100.438, "eval_steps_per_second": 6.279, "step": 36000 }, { "epoch": 3.8800990205575285, "grad_norm": 0.6709862947463989, "learning_rate": 0.000367673742053658, "loss": 3.4286, "step": 36050 }, { "epoch": 3.885480572597137, "grad_norm": 0.6202583909034729, "learning_rate": 0.00036735050102359654, "loss": 3.4538, "step": 36100 }, { "epoch": 3.890862124636745, "grad_norm": 0.5883134007453918, "learning_rate": 0.00036702725999353513, "loss": 3.4411, "step": 36150 }, { "epoch": 3.8962436766763533, "grad_norm": 0.6494138836860657, "learning_rate": 0.0003667040189634737, "loss": 3.4612, "step": 36200 }, { "epoch": 3.901625228715962, "grad_norm": 0.6044232845306396, "learning_rate": 0.0003663807779334123, "loss": 3.4286, "step": 36250 }, { "epoch": 3.90700678075557, "grad_norm": 0.6023014783859253, "learning_rate": 0.0003660575369033509, "loss": 3.4456, "step": 36300 }, { "epoch": 3.912388332795178, "grad_norm": 1.1901336908340454, "learning_rate": 0.0003657342958732895, "loss": 3.4404, "step": 36350 }, { "epoch": 3.9177698848347866, "grad_norm": 0.6059907674789429, "learning_rate": 0.00036541105484322805, "loss": 3.4501, "step": 36400 }, { "epoch": 3.9231514368743947, "grad_norm": 0.6489651203155518, "learning_rate": 0.00036508781381316665, "loss": 3.4375, "step": 36450 }, { "epoch": 3.928532988914003, "grad_norm": 0.6621146202087402, "learning_rate": 0.0003647645727831053, "loss": 3.4413, "step": 36500 }, { "epoch": 3.933914540953611, "grad_norm": 0.6240432262420654, "learning_rate": 0.00036444133175304384, "loss": 3.4458, "step": 36550 }, { "epoch": 3.939296092993219, "grad_norm": 0.5703659057617188, "learning_rate": 0.00036411809072298243, "loss": 3.459, "step": 36600 }, { "epoch": 3.9446776450328276, "grad_norm": 0.5848928689956665, "learning_rate": 0.00036379484969292097, "loss": 3.4437, "step": 36650 }, { "epoch": 3.9500591970724357, "grad_norm": 0.6029636859893799, "learning_rate": 0.00036347160866285956, "loss": 3.4499, "step": 36700 }, { "epoch": 3.955440749112044, "grad_norm": 0.6592400670051575, "learning_rate": 0.0003631483676327981, "loss": 3.4583, "step": 36750 }, { "epoch": 3.9608223011516523, "grad_norm": 0.6309558749198914, "learning_rate": 0.00036282512660273675, "loss": 3.4388, "step": 36800 }, { "epoch": 3.9662038531912605, "grad_norm": 0.6201919913291931, "learning_rate": 0.00036250188557267535, "loss": 3.4422, "step": 36850 }, { "epoch": 3.9715854052308686, "grad_norm": 0.6117024421691895, "learning_rate": 0.0003621786445426139, "loss": 3.4464, "step": 36900 }, { "epoch": 3.9769669572704767, "grad_norm": 0.6090784072875977, "learning_rate": 0.0003618554035125525, "loss": 3.4537, "step": 36950 }, { "epoch": 3.9823485093100848, "grad_norm": 0.6393431425094604, "learning_rate": 0.0003615321624824911, "loss": 3.4556, "step": 37000 }, { "epoch": 3.9823485093100848, "eval_accuracy": 0.3763014578718528, "eval_loss": 3.4423341751098633, "eval_runtime": 179.0373, "eval_samples_per_second": 100.599, "eval_steps_per_second": 6.289, "step": 37000 }, { "epoch": 3.9877300613496933, "grad_norm": 0.6844801902770996, "learning_rate": 0.0003612089214524296, "loss": 3.4586, "step": 37050 }, { "epoch": 3.9931116133893014, "grad_norm": 0.6407906413078308, "learning_rate": 0.00036088568042236827, "loss": 3.4415, "step": 37100 }, { "epoch": 3.9984931654289095, "grad_norm": 0.6470766663551331, "learning_rate": 0.00036056243939230686, "loss": 3.4393, "step": 37150 }, { "epoch": 4.003874717468518, "grad_norm": 0.6062134504318237, "learning_rate": 0.0003602391983622454, "loss": 3.3765, "step": 37200 }, { "epoch": 4.009256269508126, "grad_norm": 0.599993884563446, "learning_rate": 0.00035992242215278526, "loss": 3.3677, "step": 37250 }, { "epoch": 4.014637821547734, "grad_norm": 0.6825929284095764, "learning_rate": 0.0003595991811227238, "loss": 3.348, "step": 37300 }, { "epoch": 4.020019373587343, "grad_norm": 0.6646528840065002, "learning_rate": 0.0003592759400926624, "loss": 3.3601, "step": 37350 }, { "epoch": 4.0254009256269505, "grad_norm": 0.6230810880661011, "learning_rate": 0.00035895269906260094, "loss": 3.3544, "step": 37400 }, { "epoch": 4.030782477666559, "grad_norm": 0.6473919153213501, "learning_rate": 0.0003586294580325396, "loss": 3.3651, "step": 37450 }, { "epoch": 4.036164029706168, "grad_norm": 0.6319429278373718, "learning_rate": 0.00035831268182307934, "loss": 3.3484, "step": 37500 }, { "epoch": 4.041545581745775, "grad_norm": 0.6236182451248169, "learning_rate": 0.000357989440793018, "loss": 3.3391, "step": 37550 }, { "epoch": 4.046927133785384, "grad_norm": 0.664544939994812, "learning_rate": 0.0003576661997629566, "loss": 3.3503, "step": 37600 }, { "epoch": 4.0523086858249915, "grad_norm": 0.6256778240203857, "learning_rate": 0.0003573429587328951, "loss": 3.3679, "step": 37650 }, { "epoch": 4.0576902378646, "grad_norm": 0.616200864315033, "learning_rate": 0.0003570197177028337, "loss": 3.3674, "step": 37700 }, { "epoch": 4.063071789904209, "grad_norm": 0.6404024958610535, "learning_rate": 0.00035669647667277226, "loss": 3.3791, "step": 37750 }, { "epoch": 4.068453341943816, "grad_norm": 0.6022494435310364, "learning_rate": 0.00035637323564271085, "loss": 3.3543, "step": 37800 }, { "epoch": 4.073834893983425, "grad_norm": 0.6449595093727112, "learning_rate": 0.0003560499946126495, "loss": 3.3718, "step": 37850 }, { "epoch": 4.079216446023033, "grad_norm": 0.6162230968475342, "learning_rate": 0.00035572675358258804, "loss": 3.365, "step": 37900 }, { "epoch": 4.084597998062641, "grad_norm": 0.5845982432365417, "learning_rate": 0.00035540351255252664, "loss": 3.3627, "step": 37950 }, { "epoch": 4.08997955010225, "grad_norm": 0.6631813049316406, "learning_rate": 0.00035508027152246523, "loss": 3.3659, "step": 38000 }, { "epoch": 4.08997955010225, "eval_accuracy": 0.37662448290732525, "eval_loss": 3.4459056854248047, "eval_runtime": 179.0185, "eval_samples_per_second": 100.61, "eval_steps_per_second": 6.29, "step": 38000 }, { "epoch": 4.095361102141858, "grad_norm": 0.6242915391921997, "learning_rate": 0.0003547570304924038, "loss": 3.3691, "step": 38050 }, { "epoch": 4.100742654181466, "grad_norm": 0.6871652603149414, "learning_rate": 0.0003544337894623424, "loss": 3.3887, "step": 38100 }, { "epoch": 4.106124206221074, "grad_norm": 0.6138073801994324, "learning_rate": 0.000354110548432281, "loss": 3.3747, "step": 38150 }, { "epoch": 4.111505758260682, "grad_norm": 0.6306424736976624, "learning_rate": 0.00035378730740221956, "loss": 3.3801, "step": 38200 }, { "epoch": 4.1168873103002905, "grad_norm": 0.647663414478302, "learning_rate": 0.00035346406637215815, "loss": 3.3536, "step": 38250 }, { "epoch": 4.122268862339899, "grad_norm": 0.6308625936508179, "learning_rate": 0.0003531408253420967, "loss": 3.3774, "step": 38300 }, { "epoch": 4.127650414379507, "grad_norm": 0.636153519153595, "learning_rate": 0.0003528175843120353, "loss": 3.3664, "step": 38350 }, { "epoch": 4.133031966419115, "grad_norm": 0.5801687836647034, "learning_rate": 0.00035249434328197394, "loss": 3.3771, "step": 38400 }, { "epoch": 4.138413518458724, "grad_norm": 0.5874185562133789, "learning_rate": 0.0003521711022519125, "loss": 3.3719, "step": 38450 }, { "epoch": 4.1437950704983315, "grad_norm": 0.6609805226325989, "learning_rate": 0.00035184786122185107, "loss": 3.3727, "step": 38500 }, { "epoch": 4.14917662253794, "grad_norm": 0.6710774302482605, "learning_rate": 0.00035152462019178967, "loss": 3.3762, "step": 38550 }, { "epoch": 4.154558174577549, "grad_norm": 0.6188481450080872, "learning_rate": 0.0003512013791617282, "loss": 3.3715, "step": 38600 }, { "epoch": 4.159939726617156, "grad_norm": 0.6339542865753174, "learning_rate": 0.0003508781381316668, "loss": 3.3594, "step": 38650 }, { "epoch": 4.165321278656765, "grad_norm": 0.680853009223938, "learning_rate": 0.00035055489710160545, "loss": 3.3658, "step": 38700 }, { "epoch": 4.1707028306963725, "grad_norm": 0.6483212113380432, "learning_rate": 0.000350231656071544, "loss": 3.3674, "step": 38750 }, { "epoch": 4.176084382735981, "grad_norm": 0.6421170234680176, "learning_rate": 0.0003499084150414826, "loss": 3.3749, "step": 38800 }, { "epoch": 4.18146593477559, "grad_norm": 0.6279784440994263, "learning_rate": 0.0003495851740114211, "loss": 3.3742, "step": 38850 }, { "epoch": 4.186847486815197, "grad_norm": 0.6636990904808044, "learning_rate": 0.0003492619329813597, "loss": 3.3885, "step": 38900 }, { "epoch": 4.192229038854806, "grad_norm": 0.6253726482391357, "learning_rate": 0.0003489386919512983, "loss": 3.3746, "step": 38950 }, { "epoch": 4.197610590894414, "grad_norm": 0.6464277505874634, "learning_rate": 0.0003486154509212369, "loss": 3.376, "step": 39000 }, { "epoch": 4.197610590894414, "eval_accuracy": 0.37757954179121456, "eval_loss": 3.4406824111938477, "eval_runtime": 178.994, "eval_samples_per_second": 100.623, "eval_steps_per_second": 6.291, "step": 39000 }, { "epoch": 4.202992142934022, "grad_norm": 0.6213078498840332, "learning_rate": 0.0003482922098911755, "loss": 3.3774, "step": 39050 }, { "epoch": 4.208373694973631, "grad_norm": 0.7072701454162598, "learning_rate": 0.0003479689688611141, "loss": 3.3771, "step": 39100 }, { "epoch": 4.213755247013238, "grad_norm": 0.6380259990692139, "learning_rate": 0.00034764572783105264, "loss": 3.3821, "step": 39150 }, { "epoch": 4.219136799052847, "grad_norm": 0.6575095653533936, "learning_rate": 0.00034732248680099123, "loss": 3.3728, "step": 39200 }, { "epoch": 4.224518351092455, "grad_norm": 0.6727277636528015, "learning_rate": 0.0003469992457709299, "loss": 3.3735, "step": 39250 }, { "epoch": 4.229899903132063, "grad_norm": 0.6541663408279419, "learning_rate": 0.0003466760047408684, "loss": 3.3874, "step": 39300 }, { "epoch": 4.2352814551716715, "grad_norm": 0.6344107389450073, "learning_rate": 0.000346352763710807, "loss": 3.4072, "step": 39350 }, { "epoch": 4.24066300721128, "grad_norm": 0.6421983242034912, "learning_rate": 0.00034602952268074556, "loss": 3.3878, "step": 39400 }, { "epoch": 4.246044559250888, "grad_norm": 0.6348140835762024, "learning_rate": 0.00034570628165068415, "loss": 3.3976, "step": 39450 }, { "epoch": 4.251426111290496, "grad_norm": 0.6559439301490784, "learning_rate": 0.0003453830406206227, "loss": 3.3817, "step": 39500 }, { "epoch": 4.256807663330104, "grad_norm": 0.6733367443084717, "learning_rate": 0.00034505979959056134, "loss": 3.3824, "step": 39550 }, { "epoch": 4.2621892153697125, "grad_norm": 0.6131154894828796, "learning_rate": 0.00034473655856049994, "loss": 3.3716, "step": 39600 }, { "epoch": 4.267570767409321, "grad_norm": 0.6350231170654297, "learning_rate": 0.0003444133175304385, "loss": 3.3547, "step": 39650 }, { "epoch": 4.272952319448929, "grad_norm": 0.6846705675125122, "learning_rate": 0.00034409007650037707, "loss": 3.3829, "step": 39700 }, { "epoch": 4.278333871488537, "grad_norm": 0.6620013117790222, "learning_rate": 0.00034376683547031567, "loss": 3.3971, "step": 39750 }, { "epoch": 4.283715423528146, "grad_norm": 0.6244055032730103, "learning_rate": 0.0003434435944402542, "loss": 3.3613, "step": 39800 }, { "epoch": 4.2890969755677535, "grad_norm": 0.6607896685600281, "learning_rate": 0.00034312035341019286, "loss": 3.367, "step": 39850 }, { "epoch": 4.294478527607362, "grad_norm": 0.6466280817985535, "learning_rate": 0.00034279711238013145, "loss": 3.3783, "step": 39900 }, { "epoch": 4.299860079646971, "grad_norm": 0.6921299695968628, "learning_rate": 0.00034247387135007, "loss": 3.3904, "step": 39950 }, { "epoch": 4.305241631686578, "grad_norm": 0.6485236883163452, "learning_rate": 0.0003421506303200086, "loss": 3.3755, "step": 40000 }, { "epoch": 4.305241631686578, "eval_accuracy": 0.37784726250678946, "eval_loss": 3.434654951095581, "eval_runtime": 179.4052, "eval_samples_per_second": 100.393, "eval_steps_per_second": 6.276, "step": 40000 }, { "epoch": 4.310623183726187, "grad_norm": 0.6441997289657593, "learning_rate": 0.0003418273892899471, "loss": 3.3875, "step": 40050 }, { "epoch": 4.3160047357657945, "grad_norm": 0.6506465673446655, "learning_rate": 0.0003415041482598858, "loss": 3.3937, "step": 40100 }, { "epoch": 4.321386287805403, "grad_norm": 0.6414850354194641, "learning_rate": 0.00034118090722982437, "loss": 3.3867, "step": 40150 }, { "epoch": 4.326767839845012, "grad_norm": 0.6698691844940186, "learning_rate": 0.0003408576661997629, "loss": 3.3669, "step": 40200 }, { "epoch": 4.332149391884619, "grad_norm": 0.6112655401229858, "learning_rate": 0.0003405344251697015, "loss": 3.3832, "step": 40250 }, { "epoch": 4.337530943924228, "grad_norm": 0.6444694399833679, "learning_rate": 0.0003402111841396401, "loss": 3.3952, "step": 40300 }, { "epoch": 4.342912495963836, "grad_norm": 0.637163519859314, "learning_rate": 0.00033988794310957864, "loss": 3.377, "step": 40350 }, { "epoch": 4.348294048003444, "grad_norm": 0.6567486524581909, "learning_rate": 0.0003395647020795173, "loss": 3.377, "step": 40400 }, { "epoch": 4.3536756000430525, "grad_norm": 0.6155647039413452, "learning_rate": 0.0003392414610494559, "loss": 3.3735, "step": 40450 }, { "epoch": 4.359057152082661, "grad_norm": 0.6506165862083435, "learning_rate": 0.0003389182200193944, "loss": 3.4027, "step": 40500 }, { "epoch": 4.364438704122269, "grad_norm": 0.6013103127479553, "learning_rate": 0.000338594978989333, "loss": 3.3847, "step": 40550 }, { "epoch": 4.369820256161877, "grad_norm": 0.6560415625572205, "learning_rate": 0.00033827173795927156, "loss": 3.3801, "step": 40600 }, { "epoch": 4.375201808201485, "grad_norm": 0.6578083038330078, "learning_rate": 0.00033794849692921015, "loss": 3.3883, "step": 40650 }, { "epoch": 4.3805833602410935, "grad_norm": 0.6615694761276245, "learning_rate": 0.0003376252558991488, "loss": 3.3674, "step": 40700 }, { "epoch": 4.385964912280702, "grad_norm": 0.6453083753585815, "learning_rate": 0.00033730201486908734, "loss": 3.385, "step": 40750 }, { "epoch": 4.39134646432031, "grad_norm": 0.592745840549469, "learning_rate": 0.00033697877383902594, "loss": 3.3716, "step": 40800 }, { "epoch": 4.396728016359918, "grad_norm": 0.626979649066925, "learning_rate": 0.000336668462450167, "loss": 3.3785, "step": 40850 }, { "epoch": 4.402109568399527, "grad_norm": 0.6453368067741394, "learning_rate": 0.0003363452214201056, "loss": 3.379, "step": 40900 }, { "epoch": 4.4074911204391345, "grad_norm": 0.6402779221534729, "learning_rate": 0.00033602198039004415, "loss": 3.3919, "step": 40950 }, { "epoch": 4.412872672478743, "grad_norm": 0.6731301546096802, "learning_rate": 0.00033569873935998274, "loss": 3.4051, "step": 41000 }, { "epoch": 4.412872672478743, "eval_accuracy": 0.3784006316644286, "eval_loss": 3.429743766784668, "eval_runtime": 178.9978, "eval_samples_per_second": 100.621, "eval_steps_per_second": 6.291, "step": 41000 }, { "epoch": 4.418254224518351, "grad_norm": 0.6004547476768494, "learning_rate": 0.0003353754983299213, "loss": 3.4034, "step": 41050 }, { "epoch": 4.423635776557959, "grad_norm": 0.6192396283149719, "learning_rate": 0.0003350522572998599, "loss": 3.4108, "step": 41100 }, { "epoch": 4.429017328597568, "grad_norm": 0.665625274181366, "learning_rate": 0.0003347290162697985, "loss": 3.3965, "step": 41150 }, { "epoch": 4.4343988806371755, "grad_norm": 0.6514542102813721, "learning_rate": 0.00033440577523973706, "loss": 3.3935, "step": 41200 }, { "epoch": 4.439780432676784, "grad_norm": 0.6218470931053162, "learning_rate": 0.00033408253420967566, "loss": 3.3888, "step": 41250 }, { "epoch": 4.445161984716393, "grad_norm": 0.6087473034858704, "learning_rate": 0.00033375929317961425, "loss": 3.3706, "step": 41300 }, { "epoch": 4.450543536756, "grad_norm": 0.6203783750534058, "learning_rate": 0.0003334360521495528, "loss": 3.3829, "step": 41350 }, { "epoch": 4.455925088795609, "grad_norm": 0.647840678691864, "learning_rate": 0.0003331128111194914, "loss": 3.3909, "step": 41400 }, { "epoch": 4.461306640835216, "grad_norm": 0.6511422991752625, "learning_rate": 0.00033278957008943004, "loss": 3.3799, "step": 41450 }, { "epoch": 4.466688192874825, "grad_norm": 0.6809494495391846, "learning_rate": 0.0003324663290593686, "loss": 3.399, "step": 41500 }, { "epoch": 4.4720697449144335, "grad_norm": 0.6122585535049438, "learning_rate": 0.00033214308802930717, "loss": 3.3768, "step": 41550 }, { "epoch": 4.477451296954041, "grad_norm": 0.6239863634109497, "learning_rate": 0.0003318198469992457, "loss": 3.3881, "step": 41600 }, { "epoch": 4.48283284899365, "grad_norm": 0.5899037718772888, "learning_rate": 0.0003314966059691843, "loss": 3.3914, "step": 41650 }, { "epoch": 4.488214401033258, "grad_norm": 0.6268954277038574, "learning_rate": 0.00033117336493912296, "loss": 3.3969, "step": 41700 }, { "epoch": 4.493595953072866, "grad_norm": 0.6434704065322876, "learning_rate": 0.0003308501239090615, "loss": 3.3989, "step": 41750 }, { "epoch": 4.4989775051124745, "grad_norm": 0.642910361289978, "learning_rate": 0.0003305268828790001, "loss": 3.367, "step": 41800 }, { "epoch": 4.504359057152083, "grad_norm": 0.622340738773346, "learning_rate": 0.0003302036418489387, "loss": 3.3788, "step": 41850 }, { "epoch": 4.509740609191691, "grad_norm": 0.663396954536438, "learning_rate": 0.0003298804008188772, "loss": 3.4013, "step": 41900 }, { "epoch": 4.515122161231299, "grad_norm": 0.6161965131759644, "learning_rate": 0.0003295571597888158, "loss": 3.3932, "step": 41950 }, { "epoch": 4.520503713270907, "grad_norm": 0.6699892282485962, "learning_rate": 0.00032923391875875447, "loss": 3.3824, "step": 42000 }, { "epoch": 4.520503713270907, "eval_accuracy": 0.378954435433619, "eval_loss": 3.425100564956665, "eval_runtime": 179.5384, "eval_samples_per_second": 100.318, "eval_steps_per_second": 6.272, "step": 42000 }, { "epoch": 4.5258852653105155, "grad_norm": 0.6648452281951904, "learning_rate": 0.000328910677728693, "loss": 3.3917, "step": 42050 }, { "epoch": 4.531266817350124, "grad_norm": 0.7076273560523987, "learning_rate": 0.0003285874366986316, "loss": 3.3749, "step": 42100 }, { "epoch": 4.536648369389732, "grad_norm": 0.6585971117019653, "learning_rate": 0.00032826419566857015, "loss": 3.4023, "step": 42150 }, { "epoch": 4.54202992142934, "grad_norm": 0.6582302451133728, "learning_rate": 0.00032794095463850874, "loss": 3.3844, "step": 42200 }, { "epoch": 4.547411473468949, "grad_norm": 0.6505902409553528, "learning_rate": 0.0003276177136084473, "loss": 3.3838, "step": 42250 }, { "epoch": 4.5527930255085565, "grad_norm": 0.6097198724746704, "learning_rate": 0.00032729447257838593, "loss": 3.3831, "step": 42300 }, { "epoch": 4.558174577548165, "grad_norm": 0.6444432735443115, "learning_rate": 0.0003269712315483245, "loss": 3.3908, "step": 42350 }, { "epoch": 4.563556129587774, "grad_norm": 0.6576614379882812, "learning_rate": 0.00032664799051826306, "loss": 3.3699, "step": 42400 }, { "epoch": 4.568937681627381, "grad_norm": 0.646975040435791, "learning_rate": 0.00032632474948820166, "loss": 3.3979, "step": 42450 }, { "epoch": 4.57431923366699, "grad_norm": 0.6581142544746399, "learning_rate": 0.00032600150845814025, "loss": 3.3949, "step": 42500 }, { "epoch": 4.579700785706597, "grad_norm": 0.6320644617080688, "learning_rate": 0.0003256782674280788, "loss": 3.3851, "step": 42550 }, { "epoch": 4.585082337746206, "grad_norm": 0.6291263699531555, "learning_rate": 0.00032535502639801744, "loss": 3.3822, "step": 42600 }, { "epoch": 4.5904638897858145, "grad_norm": 0.6159578561782837, "learning_rate": 0.00032503178536795604, "loss": 3.3705, "step": 42650 }, { "epoch": 4.595845441825422, "grad_norm": 0.6452188491821289, "learning_rate": 0.0003247085443378946, "loss": 3.403, "step": 42700 }, { "epoch": 4.601226993865031, "grad_norm": 0.6549307703971863, "learning_rate": 0.0003243853033078332, "loss": 3.393, "step": 42750 }, { "epoch": 4.606608545904638, "grad_norm": 0.6222548484802246, "learning_rate": 0.0003240620622777717, "loss": 3.393, "step": 42800 }, { "epoch": 4.611990097944247, "grad_norm": 0.686937689781189, "learning_rate": 0.00032373882124771036, "loss": 3.3727, "step": 42850 }, { "epoch": 4.6173716499838555, "grad_norm": 0.6424272656440735, "learning_rate": 0.00032341558021764896, "loss": 3.3925, "step": 42900 }, { "epoch": 4.622753202023463, "grad_norm": 0.6390585899353027, "learning_rate": 0.0003230923391875875, "loss": 3.3799, "step": 42950 }, { "epoch": 4.628134754063072, "grad_norm": 0.6410428285598755, "learning_rate": 0.0003227690981575261, "loss": 3.3968, "step": 43000 }, { "epoch": 4.628134754063072, "eval_accuracy": 0.37920401111692886, "eval_loss": 3.4200704097747803, "eval_runtime": 178.8905, "eval_samples_per_second": 100.682, "eval_steps_per_second": 6.294, "step": 43000 }, { "epoch": 4.63351630610268, "grad_norm": 0.708107590675354, "learning_rate": 0.0003224458571274647, "loss": 3.3855, "step": 43050 }, { "epoch": 4.638897858142288, "grad_norm": 0.620175838470459, "learning_rate": 0.00032212261609740323, "loss": 3.4038, "step": 43100 }, { "epoch": 4.6442794101818965, "grad_norm": 0.6275815367698669, "learning_rate": 0.0003217993750673419, "loss": 3.3983, "step": 43150 }, { "epoch": 4.649660962221505, "grad_norm": 0.6273365616798401, "learning_rate": 0.00032147613403728047, "loss": 3.3947, "step": 43200 }, { "epoch": 4.655042514261113, "grad_norm": 0.658158004283905, "learning_rate": 0.000321152893007219, "loss": 3.4089, "step": 43250 }, { "epoch": 4.660424066300721, "grad_norm": 0.7174252271652222, "learning_rate": 0.0003208296519771576, "loss": 3.3896, "step": 43300 }, { "epoch": 4.665805618340329, "grad_norm": 0.6721243262290955, "learning_rate": 0.00032050641094709615, "loss": 3.383, "step": 43350 }, { "epoch": 4.6711871703799375, "grad_norm": 0.6401117444038391, "learning_rate": 0.00032018316991703474, "loss": 3.3873, "step": 43400 }, { "epoch": 4.676568722419546, "grad_norm": 0.6724918484687805, "learning_rate": 0.0003198599288869734, "loss": 3.3841, "step": 43450 }, { "epoch": 4.681950274459154, "grad_norm": 0.6670373678207397, "learning_rate": 0.00031953668785691193, "loss": 3.4003, "step": 43500 }, { "epoch": 4.687331826498762, "grad_norm": 0.6416376829147339, "learning_rate": 0.0003192134468268505, "loss": 3.3737, "step": 43550 }, { "epoch": 4.692713378538371, "grad_norm": 0.6145159006118774, "learning_rate": 0.0003188902057967891, "loss": 3.3842, "step": 43600 }, { "epoch": 4.6980949305779784, "grad_norm": 0.7071024179458618, "learning_rate": 0.00031856696476672766, "loss": 3.3662, "step": 43650 }, { "epoch": 4.703476482617587, "grad_norm": 0.6578840613365173, "learning_rate": 0.0003182437237366663, "loss": 3.3783, "step": 43700 }, { "epoch": 4.7088580346571955, "grad_norm": 0.6362777352333069, "learning_rate": 0.0003179204827066049, "loss": 3.3855, "step": 43750 }, { "epoch": 4.714239586696803, "grad_norm": 0.642004132270813, "learning_rate": 0.00031759724167654344, "loss": 3.3849, "step": 43800 }, { "epoch": 4.719621138736412, "grad_norm": 0.6391999125480652, "learning_rate": 0.00031727400064648204, "loss": 3.3797, "step": 43850 }, { "epoch": 4.725002690776019, "grad_norm": 0.6582598686218262, "learning_rate": 0.0003169507596164206, "loss": 3.371, "step": 43900 }, { "epoch": 4.730384242815628, "grad_norm": 0.6339578628540039, "learning_rate": 0.0003166275185863592, "loss": 3.3863, "step": 43950 }, { "epoch": 4.7357657948552365, "grad_norm": 0.6599995493888855, "learning_rate": 0.0003163042775562978, "loss": 3.404, "step": 44000 }, { "epoch": 4.7357657948552365, "eval_accuracy": 0.37974923130798194, "eval_loss": 3.415609836578369, "eval_runtime": 180.7107, "eval_samples_per_second": 99.668, "eval_steps_per_second": 6.231, "step": 44000 }, { "epoch": 4.741147346894844, "grad_norm": 0.6366297006607056, "learning_rate": 0.00031598103652623636, "loss": 3.3845, "step": 44050 }, { "epoch": 4.746528898934453, "grad_norm": 0.6110204458236694, "learning_rate": 0.00031565779549617496, "loss": 3.3879, "step": 44100 }, { "epoch": 4.751910450974061, "grad_norm": 0.6180295944213867, "learning_rate": 0.00031533455446611355, "loss": 3.3808, "step": 44150 }, { "epoch": 4.757292003013669, "grad_norm": 0.6570284962654114, "learning_rate": 0.0003150113134360521, "loss": 3.3805, "step": 44200 }, { "epoch": 4.7626735550532775, "grad_norm": 0.6676671504974365, "learning_rate": 0.0003146880724059907, "loss": 3.3978, "step": 44250 }, { "epoch": 4.768055107092886, "grad_norm": 0.6489489078521729, "learning_rate": 0.00031436483137592934, "loss": 3.4011, "step": 44300 }, { "epoch": 4.773436659132494, "grad_norm": 0.6341047286987305, "learning_rate": 0.0003140415903458679, "loss": 3.3793, "step": 44350 }, { "epoch": 4.778818211172102, "grad_norm": 0.7046522498130798, "learning_rate": 0.00031371834931580647, "loss": 3.3835, "step": 44400 }, { "epoch": 4.78419976321171, "grad_norm": 0.69482421875, "learning_rate": 0.000313395108285745, "loss": 3.4168, "step": 44450 }, { "epoch": 4.7895813152513185, "grad_norm": 0.6584940552711487, "learning_rate": 0.0003130718672556836, "loss": 3.3873, "step": 44500 }, { "epoch": 4.794962867290927, "grad_norm": 0.6360692977905273, "learning_rate": 0.0003127486262256222, "loss": 3.3943, "step": 44550 }, { "epoch": 4.800344419330535, "grad_norm": 0.6465499997138977, "learning_rate": 0.0003124253851955608, "loss": 3.3921, "step": 44600 }, { "epoch": 4.805725971370143, "grad_norm": 0.6448742151260376, "learning_rate": 0.0003121021441654994, "loss": 3.3871, "step": 44650 }, { "epoch": 4.811107523409751, "grad_norm": 0.6658437252044678, "learning_rate": 0.000311778903135438, "loss": 3.3711, "step": 44700 }, { "epoch": 4.8164890754493594, "grad_norm": 0.6406242847442627, "learning_rate": 0.0003114556621053765, "loss": 3.37, "step": 44750 }, { "epoch": 4.821870627488968, "grad_norm": 0.649381160736084, "learning_rate": 0.0003111324210753151, "loss": 3.4078, "step": 44800 }, { "epoch": 4.827252179528576, "grad_norm": 0.6473474502563477, "learning_rate": 0.00031080918004525377, "loss": 3.3874, "step": 44850 }, { "epoch": 4.832633731568184, "grad_norm": 0.6915144324302673, "learning_rate": 0.0003104859390151923, "loss": 3.3673, "step": 44900 }, { "epoch": 4.838015283607793, "grad_norm": 0.6494570374488831, "learning_rate": 0.0003101626979851309, "loss": 3.3784, "step": 44950 }, { "epoch": 4.8433968356474, "grad_norm": 0.7485749125480652, "learning_rate": 0.00030983945695506945, "loss": 3.3898, "step": 45000 }, { "epoch": 4.8433968356474, "eval_accuracy": 0.38026554983087635, "eval_loss": 3.410443067550659, "eval_runtime": 183.3344, "eval_samples_per_second": 98.241, "eval_steps_per_second": 6.142, "step": 45000 }, { "epoch": 4.848778387687009, "grad_norm": 0.7347899675369263, "learning_rate": 0.00030951621592500804, "loss": 3.3883, "step": 45050 }, { "epoch": 4.8541599397266175, "grad_norm": 0.7700813412666321, "learning_rate": 0.00030919297489494663, "loss": 3.3862, "step": 45100 }, { "epoch": 4.859541491766225, "grad_norm": 0.6585997939109802, "learning_rate": 0.00030886973386488523, "loss": 3.3757, "step": 45150 }, { "epoch": 4.864923043805834, "grad_norm": 0.6966983079910278, "learning_rate": 0.0003085464928348238, "loss": 3.3622, "step": 45200 }, { "epoch": 4.870304595845441, "grad_norm": 0.6641828417778015, "learning_rate": 0.0003082232518047624, "loss": 3.3832, "step": 45250 }, { "epoch": 4.87568614788505, "grad_norm": 0.6917539238929749, "learning_rate": 0.00030790001077470096, "loss": 3.3792, "step": 45300 }, { "epoch": 4.8810676999246585, "grad_norm": 0.7116442322731018, "learning_rate": 0.00030757676974463955, "loss": 3.388, "step": 45350 }, { "epoch": 4.886449251964266, "grad_norm": 0.671840250492096, "learning_rate": 0.0003072535287145781, "loss": 3.3899, "step": 45400 }, { "epoch": 4.891830804003875, "grad_norm": 0.6766582131385803, "learning_rate": 0.00030693028768451674, "loss": 3.3824, "step": 45450 }, { "epoch": 4.897212356043483, "grad_norm": 0.6944827437400818, "learning_rate": 0.00030660704665445534, "loss": 3.3976, "step": 45500 }, { "epoch": 4.902593908083091, "grad_norm": 0.6937603950500488, "learning_rate": 0.0003062838056243939, "loss": 3.3742, "step": 45550 }, { "epoch": 4.9079754601226995, "grad_norm": 0.6783007979393005, "learning_rate": 0.00030596056459433247, "loss": 3.3957, "step": 45600 }, { "epoch": 4.913357012162308, "grad_norm": 0.6475757360458374, "learning_rate": 0.00030563732356427107, "loss": 3.3943, "step": 45650 }, { "epoch": 4.918738564201916, "grad_norm": 0.734342098236084, "learning_rate": 0.00030531408253420966, "loss": 3.3746, "step": 45700 }, { "epoch": 4.924120116241524, "grad_norm": 0.6630499958992004, "learning_rate": 0.00030499084150414826, "loss": 3.4035, "step": 45750 }, { "epoch": 4.929501668281132, "grad_norm": 0.6387501955032349, "learning_rate": 0.00030466760047408685, "loss": 3.3798, "step": 45800 }, { "epoch": 4.9348832203207404, "grad_norm": 0.6920491456985474, "learning_rate": 0.0003043443594440254, "loss": 3.3884, "step": 45850 }, { "epoch": 4.940264772360349, "grad_norm": 0.6548166275024414, "learning_rate": 0.000304021118413964, "loss": 3.378, "step": 45900 }, { "epoch": 4.945646324399957, "grad_norm": 0.6863994598388672, "learning_rate": 0.0003036978773839025, "loss": 3.3926, "step": 45950 }, { "epoch": 4.951027876439565, "grad_norm": 0.7059321999549866, "learning_rate": 0.0003033746363538412, "loss": 3.3912, "step": 46000 }, { "epoch": 4.951027876439565, "eval_accuracy": 0.3807130910757842, "eval_loss": 3.4059853553771973, "eval_runtime": 182.4738, "eval_samples_per_second": 98.705, "eval_steps_per_second": 6.171, "step": 46000 }, { "epoch": 4.956409428479174, "grad_norm": 0.6990537047386169, "learning_rate": 0.00030305139532377977, "loss": 3.3663, "step": 46050 }, { "epoch": 4.961790980518781, "grad_norm": 0.6453987956047058, "learning_rate": 0.0003027281542937183, "loss": 3.3885, "step": 46100 }, { "epoch": 4.96717253255839, "grad_norm": 0.707766056060791, "learning_rate": 0.0003024049132636569, "loss": 3.3785, "step": 46150 }, { "epoch": 4.9725540845979985, "grad_norm": 0.6762568354606628, "learning_rate": 0.0003020816722335955, "loss": 3.3855, "step": 46200 }, { "epoch": 4.977935636637606, "grad_norm": 0.7340874075889587, "learning_rate": 0.00030175843120353404, "loss": 3.3836, "step": 46250 }, { "epoch": 4.983317188677215, "grad_norm": 0.6631291508674622, "learning_rate": 0.0003014351901734727, "loss": 3.3923, "step": 46300 }, { "epoch": 4.988698740716822, "grad_norm": 0.6882035136222839, "learning_rate": 0.0003011119491434113, "loss": 3.3714, "step": 46350 }, { "epoch": 4.994080292756431, "grad_norm": 0.7139785289764404, "learning_rate": 0.0003007887081133498, "loss": 3.3729, "step": 46400 }, { "epoch": 4.9994618447960395, "grad_norm": 0.6610046625137329, "learning_rate": 0.0003004654670832884, "loss": 3.3688, "step": 46450 }, { "epoch": 5.004843396835647, "grad_norm": 0.7078374028205872, "learning_rate": 0.00030014222605322696, "loss": 3.2984, "step": 46500 }, { "epoch": 5.010224948875256, "grad_norm": 0.6555793285369873, "learning_rate": 0.00029981898502316555, "loss": 3.3013, "step": 46550 }, { "epoch": 5.015606500914864, "grad_norm": 0.7276795506477356, "learning_rate": 0.00029949574399310415, "loss": 3.2854, "step": 46600 }, { "epoch": 5.020988052954472, "grad_norm": 0.6936183571815491, "learning_rate": 0.00029917250296304274, "loss": 3.2863, "step": 46650 }, { "epoch": 5.0263696049940805, "grad_norm": 0.6473773121833801, "learning_rate": 0.00029884926193298134, "loss": 3.2803, "step": 46700 }, { "epoch": 5.031751157033688, "grad_norm": 0.668242871761322, "learning_rate": 0.0002985260209029199, "loss": 3.2833, "step": 46750 }, { "epoch": 5.037132709073297, "grad_norm": 0.720012903213501, "learning_rate": 0.00029820277987285853, "loss": 3.2926, "step": 46800 }, { "epoch": 5.042514261112905, "grad_norm": 0.7183952927589417, "learning_rate": 0.00029787953884279707, "loss": 3.2884, "step": 46850 }, { "epoch": 5.047895813152513, "grad_norm": 0.6796324849128723, "learning_rate": 0.00029756276263333693, "loss": 3.3159, "step": 46900 }, { "epoch": 5.0532773651921215, "grad_norm": 0.6587697863578796, "learning_rate": 0.00029723952160327547, "loss": 3.2951, "step": 46950 }, { "epoch": 5.05865891723173, "grad_norm": 0.704316258430481, "learning_rate": 0.00029691628057321406, "loss": 3.2951, "step": 47000 }, { "epoch": 5.05865891723173, "eval_accuracy": 0.3809405015699799, "eval_loss": 3.4095516204833984, "eval_runtime": 182.1072, "eval_samples_per_second": 98.903, "eval_steps_per_second": 6.183, "step": 47000 }, { "epoch": 5.064040469271338, "grad_norm": 0.7102801203727722, "learning_rate": 0.00029659303954315266, "loss": 3.3029, "step": 47050 }, { "epoch": 5.069422021310946, "grad_norm": 0.6769624352455139, "learning_rate": 0.00029626979851309125, "loss": 3.314, "step": 47100 }, { "epoch": 5.074803573350554, "grad_norm": 0.6973090767860413, "learning_rate": 0.00029594655748302985, "loss": 3.3063, "step": 47150 }, { "epoch": 5.080185125390162, "grad_norm": 0.685106635093689, "learning_rate": 0.0002956233164529684, "loss": 3.3115, "step": 47200 }, { "epoch": 5.085566677429771, "grad_norm": 0.6684597730636597, "learning_rate": 0.000295300075422907, "loss": 3.3155, "step": 47250 }, { "epoch": 5.090948229469379, "grad_norm": 0.7032604217529297, "learning_rate": 0.0002949768343928456, "loss": 3.3285, "step": 47300 }, { "epoch": 5.096329781508987, "grad_norm": 0.6905900835990906, "learning_rate": 0.00029465359336278417, "loss": 3.3124, "step": 47350 }, { "epoch": 5.101711333548596, "grad_norm": 0.6917199492454529, "learning_rate": 0.0002943303523327227, "loss": 3.3038, "step": 47400 }, { "epoch": 5.107092885588203, "grad_norm": 0.685508668422699, "learning_rate": 0.00029400711130266136, "loss": 3.2876, "step": 47450 }, { "epoch": 5.112474437627812, "grad_norm": 0.6823153495788574, "learning_rate": 0.0002936838702725999, "loss": 3.3271, "step": 47500 }, { "epoch": 5.1178559896674205, "grad_norm": 0.733137845993042, "learning_rate": 0.0002933606292425385, "loss": 3.323, "step": 47550 }, { "epoch": 5.123237541707028, "grad_norm": 0.6484814882278442, "learning_rate": 0.0002930438530330783, "loss": 3.3273, "step": 47600 }, { "epoch": 5.128619093746637, "grad_norm": 0.7186241745948792, "learning_rate": 0.0002927206120030169, "loss": 3.3091, "step": 47650 }, { "epoch": 5.134000645786244, "grad_norm": 0.6843365430831909, "learning_rate": 0.0002923973709729555, "loss": 3.3132, "step": 47700 }, { "epoch": 5.139382197825853, "grad_norm": 0.7353106141090393, "learning_rate": 0.00029207412994289403, "loss": 3.3153, "step": 47750 }, { "epoch": 5.1447637498654615, "grad_norm": 0.6848884224891663, "learning_rate": 0.0002917508889128327, "loss": 3.3066, "step": 47800 }, { "epoch": 5.150145301905069, "grad_norm": 0.6536684036254883, "learning_rate": 0.0002914276478827712, "loss": 3.3267, "step": 47850 }, { "epoch": 5.155526853944678, "grad_norm": 0.7789492011070251, "learning_rate": 0.0002911044068527098, "loss": 3.3082, "step": 47900 }, { "epoch": 5.160908405984286, "grad_norm": 0.6255111694335938, "learning_rate": 0.0002907811658226484, "loss": 3.2982, "step": 47950 }, { "epoch": 5.166289958023894, "grad_norm": 0.6895121335983276, "learning_rate": 0.000290457924792587, "loss": 3.3121, "step": 48000 }, { "epoch": 5.166289958023894, "eval_accuracy": 0.3814385664077215, "eval_loss": 3.407867908477783, "eval_runtime": 183.4619, "eval_samples_per_second": 98.173, "eval_steps_per_second": 6.138, "step": 48000 }, { "epoch": 5.1716715100635025, "grad_norm": 0.691476047039032, "learning_rate": 0.00029013468376252555, "loss": 3.2897, "step": 48050 }, { "epoch": 5.17705306210311, "grad_norm": 0.6488911509513855, "learning_rate": 0.00028981144273246414, "loss": 3.3308, "step": 48100 }, { "epoch": 5.182434614142719, "grad_norm": 0.718095064163208, "learning_rate": 0.00028948820170240274, "loss": 3.3256, "step": 48150 }, { "epoch": 5.187816166182327, "grad_norm": 0.762520968914032, "learning_rate": 0.00028916496067234133, "loss": 3.3103, "step": 48200 }, { "epoch": 5.193197718221935, "grad_norm": 0.6988745927810669, "learning_rate": 0.0002888417196422799, "loss": 3.2996, "step": 48250 }, { "epoch": 5.198579270261543, "grad_norm": 0.6673710346221924, "learning_rate": 0.00028851847861221847, "loss": 3.307, "step": 48300 }, { "epoch": 5.203960822301152, "grad_norm": 0.7387024164199829, "learning_rate": 0.00028819523758215706, "loss": 3.3147, "step": 48350 }, { "epoch": 5.20934237434076, "grad_norm": 0.6830635070800781, "learning_rate": 0.00028787199655209566, "loss": 3.3077, "step": 48400 }, { "epoch": 5.214723926380368, "grad_norm": 0.778832733631134, "learning_rate": 0.00028754875552203425, "loss": 3.317, "step": 48450 }, { "epoch": 5.220105478419977, "grad_norm": 0.7117233872413635, "learning_rate": 0.0002872255144919728, "loss": 3.3447, "step": 48500 }, { "epoch": 5.225487030459584, "grad_norm": 0.7413156032562256, "learning_rate": 0.00028690227346191144, "loss": 3.2984, "step": 48550 }, { "epoch": 5.230868582499193, "grad_norm": 0.733216404914856, "learning_rate": 0.00028657903243185, "loss": 3.3121, "step": 48600 }, { "epoch": 5.236250134538801, "grad_norm": 0.737923800945282, "learning_rate": 0.0002862557914017886, "loss": 3.3335, "step": 48650 }, { "epoch": 5.241631686578409, "grad_norm": 0.6947996020317078, "learning_rate": 0.00028593255037172717, "loss": 3.3339, "step": 48700 }, { "epoch": 5.247013238618018, "grad_norm": 0.7261170744895935, "learning_rate": 0.00028560930934166576, "loss": 3.3188, "step": 48750 }, { "epoch": 5.252394790657625, "grad_norm": 0.6556313037872314, "learning_rate": 0.00028528606831160436, "loss": 3.3253, "step": 48800 }, { "epoch": 5.257776342697234, "grad_norm": 0.6677282452583313, "learning_rate": 0.0002849628272815429, "loss": 3.3282, "step": 48850 }, { "epoch": 5.2631578947368425, "grad_norm": 0.7024506330490112, "learning_rate": 0.0002846395862514815, "loss": 3.3332, "step": 48900 }, { "epoch": 5.26853944677645, "grad_norm": 0.714886486530304, "learning_rate": 0.0002843163452214201, "loss": 3.3301, "step": 48950 }, { "epoch": 5.273920998816059, "grad_norm": 0.7136442065238953, "learning_rate": 0.0002839931041913587, "loss": 3.318, "step": 49000 }, { "epoch": 5.273920998816059, "eval_accuracy": 0.38172845231241054, "eval_loss": 3.4046378135681152, "eval_runtime": 185.9322, "eval_samples_per_second": 96.869, "eval_steps_per_second": 6.056, "step": 49000 }, { "epoch": 5.279302550855666, "grad_norm": 0.7311763763427734, "learning_rate": 0.0002836698631612972, "loss": 3.3058, "step": 49050 }, { "epoch": 5.284684102895275, "grad_norm": 0.6605286598205566, "learning_rate": 0.00028334662213123587, "loss": 3.3052, "step": 49100 }, { "epoch": 5.2900656549348835, "grad_norm": 0.6947770118713379, "learning_rate": 0.0002830233811011744, "loss": 3.3165, "step": 49150 }, { "epoch": 5.295447206974491, "grad_norm": 0.7277875542640686, "learning_rate": 0.000282700140071113, "loss": 3.304, "step": 49200 }, { "epoch": 5.3008287590141, "grad_norm": 0.6745076775550842, "learning_rate": 0.0002823768990410516, "loss": 3.3241, "step": 49250 }, { "epoch": 5.306210311053708, "grad_norm": 0.7036528587341309, "learning_rate": 0.00028205365801099014, "loss": 3.3194, "step": 49300 }, { "epoch": 5.311591863093316, "grad_norm": 0.7196166515350342, "learning_rate": 0.00028173041698092874, "loss": 3.3236, "step": 49350 }, { "epoch": 5.316973415132924, "grad_norm": 0.7171549201011658, "learning_rate": 0.00028140717595086733, "loss": 3.3313, "step": 49400 }, { "epoch": 5.322354967172533, "grad_norm": 0.7267553806304932, "learning_rate": 0.0002810839349208059, "loss": 3.3521, "step": 49450 }, { "epoch": 5.327736519212141, "grad_norm": 0.694525957107544, "learning_rate": 0.00028076069389074447, "loss": 3.3368, "step": 49500 }, { "epoch": 5.333118071251749, "grad_norm": 0.6676056981086731, "learning_rate": 0.0002804374528606831, "loss": 3.3081, "step": 49550 }, { "epoch": 5.338499623291357, "grad_norm": 0.7265917658805847, "learning_rate": 0.00028011421183062166, "loss": 3.3277, "step": 49600 }, { "epoch": 5.343881175330965, "grad_norm": 0.6560418605804443, "learning_rate": 0.00027979097080056025, "loss": 3.3412, "step": 49650 }, { "epoch": 5.349262727370574, "grad_norm": 0.7159461379051208, "learning_rate": 0.00027946772977049885, "loss": 3.3103, "step": 49700 }, { "epoch": 5.354644279410182, "grad_norm": 0.7556479573249817, "learning_rate": 0.00027914448874043744, "loss": 3.3244, "step": 49750 }, { "epoch": 5.36002583144979, "grad_norm": 0.7696731090545654, "learning_rate": 0.00027882124771037603, "loss": 3.3257, "step": 49800 }, { "epoch": 5.365407383489399, "grad_norm": 0.7160837650299072, "learning_rate": 0.0002784980066803146, "loss": 3.3288, "step": 49850 }, { "epoch": 5.370788935529006, "grad_norm": 0.6964970231056213, "learning_rate": 0.00027817476565025317, "loss": 3.3312, "step": 49900 }, { "epoch": 5.376170487568615, "grad_norm": 0.7204991579055786, "learning_rate": 0.00027785152462019176, "loss": 3.3356, "step": 49950 }, { "epoch": 5.3815520396082235, "grad_norm": 0.6679324507713318, "learning_rate": 0.00027752828359013036, "loss": 3.3299, "step": 50000 }, { "epoch": 5.3815520396082235, "eval_accuracy": 0.38207755404095844, "eval_loss": 3.3999838829040527, "eval_runtime": 185.5503, "eval_samples_per_second": 97.068, "eval_steps_per_second": 6.068, "step": 50000 }, { "epoch": 5.386933591647831, "grad_norm": 0.7059887647628784, "learning_rate": 0.0002772050425600689, "loss": 3.3145, "step": 50050 }, { "epoch": 5.39231514368744, "grad_norm": 0.7049746513366699, "learning_rate": 0.00027688180153000755, "loss": 3.3248, "step": 50100 }, { "epoch": 5.397696695727047, "grad_norm": 0.7025423049926758, "learning_rate": 0.0002765585604999461, "loss": 3.3303, "step": 50150 }, { "epoch": 5.403078247766656, "grad_norm": 0.6935784816741943, "learning_rate": 0.0002762353194698847, "loss": 3.3289, "step": 50200 }, { "epoch": 5.4084597998062645, "grad_norm": 0.6921148300170898, "learning_rate": 0.0002759120784398233, "loss": 3.328, "step": 50250 }, { "epoch": 5.413841351845872, "grad_norm": 0.7081592679023743, "learning_rate": 0.00027558883740976187, "loss": 3.3491, "step": 50300 }, { "epoch": 5.419222903885481, "grad_norm": 0.6685131192207336, "learning_rate": 0.0002752655963797004, "loss": 3.3293, "step": 50350 }, { "epoch": 5.424604455925088, "grad_norm": 0.7033519744873047, "learning_rate": 0.000274942355349639, "loss": 3.329, "step": 50400 }, { "epoch": 5.429986007964697, "grad_norm": 0.7400333881378174, "learning_rate": 0.0002746191143195776, "loss": 3.3361, "step": 50450 }, { "epoch": 5.435367560004305, "grad_norm": 0.7403768301010132, "learning_rate": 0.0002742958732895162, "loss": 3.336, "step": 50500 }, { "epoch": 5.440749112043913, "grad_norm": 0.6520399451255798, "learning_rate": 0.0002739726322594548, "loss": 3.3277, "step": 50550 }, { "epoch": 5.446130664083522, "grad_norm": 0.6986637711524963, "learning_rate": 0.00027364939122939333, "loss": 3.3338, "step": 50600 }, { "epoch": 5.45151221612313, "grad_norm": 0.7400344014167786, "learning_rate": 0.0002733261501993319, "loss": 3.3249, "step": 50650 }, { "epoch": 5.456893768162738, "grad_norm": 0.7693380117416382, "learning_rate": 0.0002730029091692705, "loss": 3.3294, "step": 50700 }, { "epoch": 5.462275320202346, "grad_norm": 0.7421239614486694, "learning_rate": 0.0002726796681392091, "loss": 3.3212, "step": 50750 }, { "epoch": 5.467656872241955, "grad_norm": 0.7280569672584534, "learning_rate": 0.0002723564271091477, "loss": 3.3327, "step": 50800 }, { "epoch": 5.473038424281563, "grad_norm": 0.7103368043899536, "learning_rate": 0.0002720331860790863, "loss": 3.346, "step": 50850 }, { "epoch": 5.478419976321171, "grad_norm": 0.7020702362060547, "learning_rate": 0.00027170994504902485, "loss": 3.3336, "step": 50900 }, { "epoch": 5.483801528360779, "grad_norm": 0.6580615043640137, "learning_rate": 0.00027138670401896344, "loss": 3.3426, "step": 50950 }, { "epoch": 5.489183080400387, "grad_norm": 0.6923621892929077, "learning_rate": 0.00027106346298890204, "loss": 3.3203, "step": 51000 }, { "epoch": 5.489183080400387, "eval_accuracy": 0.3825792044239979, "eval_loss": 3.395419120788574, "eval_runtime": 183.9392, "eval_samples_per_second": 97.918, "eval_steps_per_second": 6.122, "step": 51000 }, { "epoch": 5.494564632439996, "grad_norm": 0.7564615607261658, "learning_rate": 0.00027074022195884063, "loss": 3.3356, "step": 51050 }, { "epoch": 5.499946184479604, "grad_norm": 0.7157580852508545, "learning_rate": 0.0002704169809287792, "loss": 3.3146, "step": 51100 }, { "epoch": 5.505327736519212, "grad_norm": 0.668010950088501, "learning_rate": 0.00027009373989871776, "loss": 3.3243, "step": 51150 }, { "epoch": 5.510709288558821, "grad_norm": 0.7341917157173157, "learning_rate": 0.00026977049886865636, "loss": 3.3405, "step": 51200 }, { "epoch": 5.516090840598428, "grad_norm": 0.6746595501899719, "learning_rate": 0.00026944725783859495, "loss": 3.3325, "step": 51250 }, { "epoch": 5.521472392638037, "grad_norm": 0.6800440549850464, "learning_rate": 0.00026912401680853355, "loss": 3.3435, "step": 51300 }, { "epoch": 5.5268539446776455, "grad_norm": 0.6775352954864502, "learning_rate": 0.0002688007757784721, "loss": 3.3471, "step": 51350 }, { "epoch": 5.532235496717253, "grad_norm": 0.702453076839447, "learning_rate": 0.00026847753474841074, "loss": 3.3362, "step": 51400 }, { "epoch": 5.537617048756862, "grad_norm": 0.6794137954711914, "learning_rate": 0.0002681542937183493, "loss": 3.3212, "step": 51450 }, { "epoch": 5.542998600796469, "grad_norm": 0.7172088027000427, "learning_rate": 0.0002678310526882879, "loss": 3.3113, "step": 51500 }, { "epoch": 5.548380152836078, "grad_norm": 0.7116549015045166, "learning_rate": 0.00026750781165822647, "loss": 3.3282, "step": 51550 }, { "epoch": 5.553761704875686, "grad_norm": 0.7780585289001465, "learning_rate": 0.0002671910354487663, "loss": 3.3387, "step": 51600 }, { "epoch": 5.559143256915294, "grad_norm": 0.6774951219558716, "learning_rate": 0.00026686779441870487, "loss": 3.33, "step": 51650 }, { "epoch": 5.564524808954903, "grad_norm": 0.7718983888626099, "learning_rate": 0.0002665445533886434, "loss": 3.3254, "step": 51700 }, { "epoch": 5.569906360994511, "grad_norm": 0.712583601474762, "learning_rate": 0.00026622131235858206, "loss": 3.3213, "step": 51750 }, { "epoch": 5.575287913034119, "grad_norm": 0.6779072284698486, "learning_rate": 0.0002658980713285206, "loss": 3.3523, "step": 51800 }, { "epoch": 5.580669465073727, "grad_norm": 0.6965441107749939, "learning_rate": 0.0002655748302984592, "loss": 3.3396, "step": 51850 }, { "epoch": 5.586051017113336, "grad_norm": 0.6819996237754822, "learning_rate": 0.0002652515892683978, "loss": 3.329, "step": 51900 }, { "epoch": 5.591432569152944, "grad_norm": 0.7118018865585327, "learning_rate": 0.0002649283482383364, "loss": 3.3442, "step": 51950 }, { "epoch": 5.596814121192552, "grad_norm": 0.7522190809249878, "learning_rate": 0.0002646051072082749, "loss": 3.3414, "step": 52000 }, { "epoch": 5.596814121192552, "eval_accuracy": 0.38301251214060206, "eval_loss": 3.3925588130950928, "eval_runtime": 200.0832, "eval_samples_per_second": 90.018, "eval_steps_per_second": 5.628, "step": 52000 }, { "epoch": 5.60219567323216, "grad_norm": 0.7399505376815796, "learning_rate": 0.0002642818661782135, "loss": 3.336, "step": 52050 }, { "epoch": 5.607577225271768, "grad_norm": 0.7187677621841431, "learning_rate": 0.0002639586251481521, "loss": 3.3313, "step": 52100 }, { "epoch": 5.612958777311377, "grad_norm": 0.7134539484977722, "learning_rate": 0.0002636353841180907, "loss": 3.3282, "step": 52150 }, { "epoch": 5.618340329350985, "grad_norm": 0.700485348701477, "learning_rate": 0.0002633121430880293, "loss": 3.3433, "step": 52200 }, { "epoch": 5.623721881390593, "grad_norm": 0.7902025580406189, "learning_rate": 0.00026298890205796784, "loss": 3.3292, "step": 52250 }, { "epoch": 5.629103433430201, "grad_norm": 0.7196438908576965, "learning_rate": 0.00026266566102790644, "loss": 3.333, "step": 52300 }, { "epoch": 5.634484985469809, "grad_norm": 0.7103811502456665, "learning_rate": 0.00026234241999784503, "loss": 3.3273, "step": 52350 }, { "epoch": 5.639866537509418, "grad_norm": 0.7162445783615112, "learning_rate": 0.0002620191789677836, "loss": 3.3133, "step": 52400 }, { "epoch": 5.645248089549026, "grad_norm": 0.6804549694061279, "learning_rate": 0.00026169593793772217, "loss": 3.3273, "step": 52450 }, { "epoch": 5.650629641588634, "grad_norm": 0.7342245578765869, "learning_rate": 0.0002613726969076608, "loss": 3.3272, "step": 52500 }, { "epoch": 5.656011193628243, "grad_norm": 0.7277638912200928, "learning_rate": 0.00026104945587759936, "loss": 3.3345, "step": 52550 }, { "epoch": 5.66139274566785, "grad_norm": 0.7011197805404663, "learning_rate": 0.00026072621484753795, "loss": 3.3152, "step": 52600 }, { "epoch": 5.666774297707459, "grad_norm": 0.7422172427177429, "learning_rate": 0.00026040297381747655, "loss": 3.3492, "step": 52650 }, { "epoch": 5.672155849747067, "grad_norm": 0.6664391160011292, "learning_rate": 0.00026007973278741514, "loss": 3.3361, "step": 52700 }, { "epoch": 5.677537401786675, "grad_norm": 0.6835415363311768, "learning_rate": 0.00025975649175735373, "loss": 3.3284, "step": 52750 }, { "epoch": 5.682918953826284, "grad_norm": 0.7129110097885132, "learning_rate": 0.0002594332507272923, "loss": 3.3495, "step": 52800 }, { "epoch": 5.688300505865891, "grad_norm": 0.7200473546981812, "learning_rate": 0.00025911000969723087, "loss": 3.3403, "step": 52850 }, { "epoch": 5.6936820579055, "grad_norm": 0.716098427772522, "learning_rate": 0.00025878676866716946, "loss": 3.3266, "step": 52900 }, { "epoch": 5.699063609945108, "grad_norm": 0.7075340151786804, "learning_rate": 0.00025846352763710806, "loss": 3.3493, "step": 52950 }, { "epoch": 5.704445161984716, "grad_norm": 0.7516505718231201, "learning_rate": 0.0002581402866070466, "loss": 3.3442, "step": 53000 }, { "epoch": 5.704445161984716, "eval_accuracy": 0.38337019744728734, "eval_loss": 3.387094497680664, "eval_runtime": 188.9664, "eval_samples_per_second": 95.313, "eval_steps_per_second": 5.959, "step": 53000 }, { "epoch": 5.709826714024325, "grad_norm": 0.7055996656417847, "learning_rate": 0.00025781704557698525, "loss": 3.3359, "step": 53050 }, { "epoch": 5.715208266063933, "grad_norm": 0.739187479019165, "learning_rate": 0.000257500269367525, "loss": 3.3171, "step": 53100 }, { "epoch": 5.720589818103541, "grad_norm": 0.7215222716331482, "learning_rate": 0.0002571770283374636, "loss": 3.3471, "step": 53150 }, { "epoch": 5.725971370143149, "grad_norm": 0.6919649243354797, "learning_rate": 0.0002568537873074022, "loss": 3.3146, "step": 53200 }, { "epoch": 5.731352922182758, "grad_norm": 0.7388262748718262, "learning_rate": 0.0002565305462773408, "loss": 3.3336, "step": 53250 }, { "epoch": 5.736734474222366, "grad_norm": 0.6789129376411438, "learning_rate": 0.0002562073052472794, "loss": 3.3352, "step": 53300 }, { "epoch": 5.742116026261974, "grad_norm": 0.6696457266807556, "learning_rate": 0.0002558840642172179, "loss": 3.3141, "step": 53350 }, { "epoch": 5.747497578301582, "grad_norm": 0.7184023857116699, "learning_rate": 0.00025556082318715657, "loss": 3.3503, "step": 53400 }, { "epoch": 5.75287913034119, "grad_norm": 0.7339393496513367, "learning_rate": 0.0002552375821570951, "loss": 3.325, "step": 53450 }, { "epoch": 5.758260682380799, "grad_norm": 0.7395399212837219, "learning_rate": 0.0002549143411270337, "loss": 3.3382, "step": 53500 }, { "epoch": 5.763642234420407, "grad_norm": 0.6828871965408325, "learning_rate": 0.0002545911000969723, "loss": 3.315, "step": 53550 }, { "epoch": 5.769023786460015, "grad_norm": 0.7149345278739929, "learning_rate": 0.0002542678590669109, "loss": 3.3451, "step": 53600 }, { "epoch": 5.774405338499624, "grad_norm": 0.8174017667770386, "learning_rate": 0.00025394461803684943, "loss": 3.3367, "step": 53650 }, { "epoch": 5.779786890539231, "grad_norm": 0.8184425234794617, "learning_rate": 0.00025362137700678803, "loss": 3.3355, "step": 53700 }, { "epoch": 5.78516844257884, "grad_norm": 0.710602879524231, "learning_rate": 0.0002532981359767266, "loss": 3.3368, "step": 53750 }, { "epoch": 5.790549994618448, "grad_norm": 0.6902764439582825, "learning_rate": 0.0002529748949466652, "loss": 3.3518, "step": 53800 }, { "epoch": 5.795931546658056, "grad_norm": 0.7848208546638489, "learning_rate": 0.0002526516539166038, "loss": 3.3374, "step": 53850 }, { "epoch": 5.801313098697665, "grad_norm": 0.6792861819267273, "learning_rate": 0.00025232841288654235, "loss": 3.3248, "step": 53900 }, { "epoch": 5.806694650737272, "grad_norm": 0.6952259540557861, "learning_rate": 0.00025200517185648095, "loss": 3.3276, "step": 53950 }, { "epoch": 5.812076202776881, "grad_norm": 0.7168992161750793, "learning_rate": 0.00025168193082641954, "loss": 3.3466, "step": 54000 }, { "epoch": 5.812076202776881, "eval_accuracy": 0.38378622935472895, "eval_loss": 3.3818042278289795, "eval_runtime": 185.4192, "eval_samples_per_second": 97.137, "eval_steps_per_second": 6.073, "step": 54000 }, { "epoch": 5.817457754816489, "grad_norm": 0.702364981174469, "learning_rate": 0.00025135868979635814, "loss": 3.3416, "step": 54050 }, { "epoch": 5.822839306856097, "grad_norm": 0.7311719655990601, "learning_rate": 0.0002510354487662967, "loss": 3.3433, "step": 54100 }, { "epoch": 5.828220858895706, "grad_norm": 0.7476702332496643, "learning_rate": 0.0002507122077362353, "loss": 3.3212, "step": 54150 }, { "epoch": 5.833602410935313, "grad_norm": 0.6840526461601257, "learning_rate": 0.00025038896670617387, "loss": 3.3328, "step": 54200 }, { "epoch": 5.838983962974922, "grad_norm": 0.7292221188545227, "learning_rate": 0.00025006572567611246, "loss": 3.3161, "step": 54250 }, { "epoch": 5.84436551501453, "grad_norm": 0.7330601215362549, "learning_rate": 0.00024974248464605106, "loss": 3.3535, "step": 54300 }, { "epoch": 5.849747067054138, "grad_norm": 0.7055369019508362, "learning_rate": 0.00024941924361598965, "loss": 3.3419, "step": 54350 }, { "epoch": 5.855128619093747, "grad_norm": 0.7380827069282532, "learning_rate": 0.00024909600258592825, "loss": 3.326, "step": 54400 }, { "epoch": 5.860510171133355, "grad_norm": 0.7359794974327087, "learning_rate": 0.0002487727615558668, "loss": 3.3285, "step": 54450 }, { "epoch": 5.865891723172963, "grad_norm": 0.7361670732498169, "learning_rate": 0.0002484495205258054, "loss": 3.3419, "step": 54500 }, { "epoch": 5.871273275212571, "grad_norm": 0.690818190574646, "learning_rate": 0.000248126279495744, "loss": 3.3314, "step": 54550 }, { "epoch": 5.87665482725218, "grad_norm": 0.7662556767463684, "learning_rate": 0.00024780303846568257, "loss": 3.345, "step": 54600 }, { "epoch": 5.882036379291788, "grad_norm": 0.7552100419998169, "learning_rate": 0.0002474797974356211, "loss": 3.3404, "step": 54650 }, { "epoch": 5.887417931331396, "grad_norm": 0.7591714262962341, "learning_rate": 0.00024715655640555976, "loss": 3.3363, "step": 54700 }, { "epoch": 5.892799483371004, "grad_norm": 0.695042073726654, "learning_rate": 0.0002468333153754983, "loss": 3.3432, "step": 54750 }, { "epoch": 5.898181035410612, "grad_norm": 0.7628441452980042, "learning_rate": 0.0002465100743454369, "loss": 3.3404, "step": 54800 }, { "epoch": 5.903562587450221, "grad_norm": 0.708487331867218, "learning_rate": 0.0002461868333153755, "loss": 3.323, "step": 54850 }, { "epoch": 5.9089441394898286, "grad_norm": 0.7083818316459656, "learning_rate": 0.0002458635922853141, "loss": 3.3385, "step": 54900 }, { "epoch": 5.914325691529437, "grad_norm": 0.6774075627326965, "learning_rate": 0.0002455403512552526, "loss": 3.3205, "step": 54950 }, { "epoch": 5.919707243569046, "grad_norm": 0.7627544403076172, "learning_rate": 0.0002452171102251912, "loss": 3.3158, "step": 55000 }, { "epoch": 5.919707243569046, "eval_accuracy": 0.3840134225431491, "eval_loss": 3.3804023265838623, "eval_runtime": 200.1322, "eval_samples_per_second": 89.996, "eval_steps_per_second": 5.626, "step": 55000 }, { "epoch": 5.925088795608653, "grad_norm": 0.7071161866188049, "learning_rate": 0.0002448938691951298, "loss": 3.3494, "step": 55050 }, { "epoch": 5.930470347648262, "grad_norm": 0.6669203639030457, "learning_rate": 0.0002445706281650684, "loss": 3.3279, "step": 55100 }, { "epoch": 5.93585189968787, "grad_norm": 0.7078651785850525, "learning_rate": 0.000244247387135007, "loss": 3.3279, "step": 55150 }, { "epoch": 5.941233451727478, "grad_norm": 0.7112964987754822, "learning_rate": 0.00024392414610494557, "loss": 3.3294, "step": 55200 }, { "epoch": 5.946615003767087, "grad_norm": 0.6864575743675232, "learning_rate": 0.00024360090507488414, "loss": 3.3398, "step": 55250 }, { "epoch": 5.951996555806694, "grad_norm": 0.7129098773002625, "learning_rate": 0.00024327766404482273, "loss": 3.3439, "step": 55300 }, { "epoch": 5.957378107846303, "grad_norm": 0.7185387015342712, "learning_rate": 0.0002429544230147613, "loss": 3.3439, "step": 55350 }, { "epoch": 5.962759659885911, "grad_norm": 0.7282950282096863, "learning_rate": 0.00024263118198469992, "loss": 3.3519, "step": 55400 }, { "epoch": 5.968141211925519, "grad_norm": 0.7358017563819885, "learning_rate": 0.0002423079409546385, "loss": 3.3289, "step": 55450 }, { "epoch": 5.973522763965128, "grad_norm": 0.7266455292701721, "learning_rate": 0.00024198469992457706, "loss": 3.332, "step": 55500 }, { "epoch": 5.978904316004736, "grad_norm": 0.7006992697715759, "learning_rate": 0.00024166145889451568, "loss": 3.3241, "step": 55550 }, { "epoch": 5.984285868044344, "grad_norm": 0.7283947467803955, "learning_rate": 0.00024133821786445425, "loss": 3.3531, "step": 55600 }, { "epoch": 5.989667420083952, "grad_norm": 0.7370797991752625, "learning_rate": 0.0002410149768343928, "loss": 3.3443, "step": 55650 }, { "epoch": 5.995048972123561, "grad_norm": 0.7191126942634583, "learning_rate": 0.0002406917358043314, "loss": 3.3158, "step": 55700 }, { "epoch": 6.000430524163169, "grad_norm": 0.7115250825881958, "learning_rate": 0.00024036849477427, "loss": 3.3082, "step": 55750 }, { "epoch": 6.005812076202777, "grad_norm": 0.7569565176963806, "learning_rate": 0.00024004525374420857, "loss": 3.2241, "step": 55800 }, { "epoch": 6.011193628242385, "grad_norm": 0.7155330777168274, "learning_rate": 0.00023972201271414716, "loss": 3.2566, "step": 55850 }, { "epoch": 6.016575180281993, "grad_norm": 0.7080894708633423, "learning_rate": 0.00023939877168408573, "loss": 3.2555, "step": 55900 }, { "epoch": 6.021956732321602, "grad_norm": 0.6885237693786621, "learning_rate": 0.00023907553065402433, "loss": 3.263, "step": 55950 }, { "epoch": 6.0273382843612096, "grad_norm": 0.7117237448692322, "learning_rate": 0.00023875228962396292, "loss": 3.2221, "step": 56000 }, { "epoch": 6.0273382843612096, "eval_accuracy": 0.3842926604648323, "eval_loss": 3.382725238800049, "eval_runtime": 186.5559, "eval_samples_per_second": 96.545, "eval_steps_per_second": 6.036, "step": 56000 }, { "epoch": 6.032719836400818, "grad_norm": 0.7262279987335205, "learning_rate": 0.0002384290485939015, "loss": 3.2502, "step": 56050 }, { "epoch": 6.038101388440427, "grad_norm": 0.7350507974624634, "learning_rate": 0.00023810580756384006, "loss": 3.2486, "step": 56100 }, { "epoch": 6.043482940480034, "grad_norm": 0.7523810863494873, "learning_rate": 0.00023778256653377868, "loss": 3.2522, "step": 56150 }, { "epoch": 6.048864492519643, "grad_norm": 0.7505652904510498, "learning_rate": 0.00023745932550371725, "loss": 3.2574, "step": 56200 }, { "epoch": 6.0542460445592505, "grad_norm": 0.7597776651382446, "learning_rate": 0.00023713608447365584, "loss": 3.2487, "step": 56250 }, { "epoch": 6.059627596598859, "grad_norm": 0.7254223227500916, "learning_rate": 0.00023681284344359444, "loss": 3.2471, "step": 56300 }, { "epoch": 6.065009148638468, "grad_norm": 0.741804301738739, "learning_rate": 0.000236489602413533, "loss": 3.2517, "step": 56350 }, { "epoch": 6.070390700678075, "grad_norm": 0.7750810980796814, "learning_rate": 0.0002361663613834716, "loss": 3.2595, "step": 56400 }, { "epoch": 6.075772252717684, "grad_norm": 0.7423557043075562, "learning_rate": 0.00023584312035341017, "loss": 3.2446, "step": 56450 }, { "epoch": 6.081153804757292, "grad_norm": 0.7267886996269226, "learning_rate": 0.00023551987932334876, "loss": 3.2457, "step": 56500 }, { "epoch": 6.0865353567969, "grad_norm": 0.7142068147659302, "learning_rate": 0.00023519663829328735, "loss": 3.2624, "step": 56550 }, { "epoch": 6.091916908836509, "grad_norm": 0.7558031678199768, "learning_rate": 0.00023487339726322592, "loss": 3.2485, "step": 56600 }, { "epoch": 6.097298460876116, "grad_norm": 0.7324398756027222, "learning_rate": 0.0002345501562331645, "loss": 3.2484, "step": 56650 }, { "epoch": 6.102680012915725, "grad_norm": 0.720934271812439, "learning_rate": 0.0002342269152031031, "loss": 3.2449, "step": 56700 }, { "epoch": 6.108061564955333, "grad_norm": 0.7175422310829163, "learning_rate": 0.00023390367417304168, "loss": 3.2482, "step": 56750 }, { "epoch": 6.113443116994941, "grad_norm": 0.7684024572372437, "learning_rate": 0.00023358043314298025, "loss": 3.2574, "step": 56800 }, { "epoch": 6.11882466903455, "grad_norm": 0.7317076325416565, "learning_rate": 0.00023325719211291887, "loss": 3.2622, "step": 56850 }, { "epoch": 6.124206221074158, "grad_norm": 0.7210121750831604, "learning_rate": 0.00023293395108285744, "loss": 3.2546, "step": 56900 }, { "epoch": 6.129587773113766, "grad_norm": 0.7663288712501526, "learning_rate": 0.000232610710052796, "loss": 3.2664, "step": 56950 }, { "epoch": 6.134969325153374, "grad_norm": 0.7994782328605652, "learning_rate": 0.0002322874690227346, "loss": 3.2598, "step": 57000 }, { "epoch": 6.134969325153374, "eval_accuracy": 0.38437925681642043, "eval_loss": 3.3824102878570557, "eval_runtime": 191.6177, "eval_samples_per_second": 93.994, "eval_steps_per_second": 5.876, "step": 57000 }, { "epoch": 6.140350877192983, "grad_norm": 0.7230924963951111, "learning_rate": 0.0002319642279926732, "loss": 3.261, "step": 57050 }, { "epoch": 6.1457324292325906, "grad_norm": 0.7568264007568359, "learning_rate": 0.000231647451783213, "loss": 3.2668, "step": 57100 }, { "epoch": 6.151113981272199, "grad_norm": 0.7552214860916138, "learning_rate": 0.00023132421075315157, "loss": 3.266, "step": 57150 }, { "epoch": 6.156495533311807, "grad_norm": 0.7440437078475952, "learning_rate": 0.0002310009697230902, "loss": 3.2646, "step": 57200 }, { "epoch": 6.161877085351415, "grad_norm": 0.721347451210022, "learning_rate": 0.00023067772869302876, "loss": 3.2607, "step": 57250 }, { "epoch": 6.167258637391024, "grad_norm": 0.7305224537849426, "learning_rate": 0.00023035448766296732, "loss": 3.2804, "step": 57300 }, { "epoch": 6.1726401894306315, "grad_norm": 0.7204300761222839, "learning_rate": 0.00023003124663290592, "loss": 3.2567, "step": 57350 }, { "epoch": 6.17802174147024, "grad_norm": 0.7423768043518066, "learning_rate": 0.0002297080056028445, "loss": 3.2606, "step": 57400 }, { "epoch": 6.183403293509849, "grad_norm": 0.7213789224624634, "learning_rate": 0.00022938476457278308, "loss": 3.247, "step": 57450 }, { "epoch": 6.188784845549456, "grad_norm": 0.7615440487861633, "learning_rate": 0.00022906152354272168, "loss": 3.2705, "step": 57500 }, { "epoch": 6.194166397589065, "grad_norm": 0.7375072240829468, "learning_rate": 0.00022873828251266024, "loss": 3.2749, "step": 57550 }, { "epoch": 6.1995479496286725, "grad_norm": 0.7319751381874084, "learning_rate": 0.00022841504148259884, "loss": 3.2663, "step": 57600 }, { "epoch": 6.204929501668281, "grad_norm": 0.7472988963127136, "learning_rate": 0.00022809180045253743, "loss": 3.2722, "step": 57650 }, { "epoch": 6.21031105370789, "grad_norm": 0.7064111232757568, "learning_rate": 0.000227768559422476, "loss": 3.2948, "step": 57700 }, { "epoch": 6.215692605747497, "grad_norm": 0.7327179312705994, "learning_rate": 0.00022744531839241457, "loss": 3.2755, "step": 57750 }, { "epoch": 6.221074157787106, "grad_norm": 0.7835708856582642, "learning_rate": 0.0002271220773623532, "loss": 3.2493, "step": 57800 }, { "epoch": 6.226455709826714, "grad_norm": 0.7523908615112305, "learning_rate": 0.00022679883633229176, "loss": 3.2708, "step": 57850 }, { "epoch": 6.231837261866322, "grad_norm": 0.7339306473731995, "learning_rate": 0.00022647559530223032, "loss": 3.2754, "step": 57900 }, { "epoch": 6.237218813905931, "grad_norm": 0.7209348082542419, "learning_rate": 0.00022615235427216895, "loss": 3.2695, "step": 57950 }, { "epoch": 6.242600365945538, "grad_norm": 0.7288614511489868, "learning_rate": 0.0002258291132421075, "loss": 3.274, "step": 58000 }, { "epoch": 6.242600365945538, "eval_accuracy": 0.38512472427971556, "eval_loss": 3.378944158554077, "eval_runtime": 186.7187, "eval_samples_per_second": 96.461, "eval_steps_per_second": 6.03, "step": 58000 }, { "epoch": 6.247981917985147, "grad_norm": 0.7308927178382874, "learning_rate": 0.0002255058722120461, "loss": 3.2747, "step": 58050 }, { "epoch": 6.253363470024755, "grad_norm": 0.7706624865531921, "learning_rate": 0.00022518263118198468, "loss": 3.2845, "step": 58100 }, { "epoch": 6.258745022064363, "grad_norm": 0.7144868969917297, "learning_rate": 0.00022485939015192327, "loss": 3.3109, "step": 58150 }, { "epoch": 6.264126574103972, "grad_norm": 0.7081130743026733, "learning_rate": 0.00022453614912186186, "loss": 3.289, "step": 58200 }, { "epoch": 6.26950812614358, "grad_norm": 0.7932727336883545, "learning_rate": 0.00022421290809180043, "loss": 3.264, "step": 58250 }, { "epoch": 6.274889678183188, "grad_norm": 0.7837817668914795, "learning_rate": 0.000223889667061739, "loss": 3.2689, "step": 58300 }, { "epoch": 6.280271230222796, "grad_norm": 0.7259349226951599, "learning_rate": 0.00022356642603167762, "loss": 3.2607, "step": 58350 }, { "epoch": 6.285652782262405, "grad_norm": 0.7716546654701233, "learning_rate": 0.0002232431850016162, "loss": 3.2775, "step": 58400 }, { "epoch": 6.2910343343020125, "grad_norm": 0.7826303243637085, "learning_rate": 0.00022291994397155476, "loss": 3.2898, "step": 58450 }, { "epoch": 6.296415886341621, "grad_norm": 0.7641787528991699, "learning_rate": 0.00022259670294149338, "loss": 3.2681, "step": 58500 }, { "epoch": 6.301797438381229, "grad_norm": 0.7516437768936157, "learning_rate": 0.00022227346191143195, "loss": 3.2658, "step": 58550 }, { "epoch": 6.307178990420837, "grad_norm": 0.7420710921287537, "learning_rate": 0.00022195022088137051, "loss": 3.2688, "step": 58600 }, { "epoch": 6.312560542460446, "grad_norm": 0.7336184978485107, "learning_rate": 0.0002216269798513091, "loss": 3.2825, "step": 58650 }, { "epoch": 6.3179420945000535, "grad_norm": 0.7269745469093323, "learning_rate": 0.00022130373882124768, "loss": 3.2911, "step": 58700 }, { "epoch": 6.323323646539662, "grad_norm": 0.733131468296051, "learning_rate": 0.00022098049779118627, "loss": 3.2773, "step": 58750 }, { "epoch": 6.328705198579271, "grad_norm": 0.7320564985275269, "learning_rate": 0.00022065725676112487, "loss": 3.2701, "step": 58800 }, { "epoch": 6.334086750618878, "grad_norm": 0.7288387417793274, "learning_rate": 0.00022033401573106343, "loss": 3.2697, "step": 58850 }, { "epoch": 6.339468302658487, "grad_norm": 0.7281746864318848, "learning_rate": 0.000220010774701002, "loss": 3.281, "step": 58900 }, { "epoch": 6.344849854698095, "grad_norm": 0.8159940242767334, "learning_rate": 0.00021968753367094062, "loss": 3.2708, "step": 58950 }, { "epoch": 6.350231406737703, "grad_norm": 0.7718303799629211, "learning_rate": 0.0002193642926408792, "loss": 3.2676, "step": 59000 }, { "epoch": 6.350231406737703, "eval_accuracy": 0.38559377879641066, "eval_loss": 3.372860908508301, "eval_runtime": 189.2704, "eval_samples_per_second": 95.16, "eval_steps_per_second": 5.949, "step": 59000 }, { "epoch": 6.355612958777312, "grad_norm": 0.7953489422798157, "learning_rate": 0.00021904105161081778, "loss": 3.2538, "step": 59050 }, { "epoch": 6.360994510816919, "grad_norm": 0.7005577683448792, "learning_rate": 0.0002187242754013576, "loss": 3.2733, "step": 59100 }, { "epoch": 6.366376062856528, "grad_norm": 0.7291739583015442, "learning_rate": 0.00021840103437129619, "loss": 3.2857, "step": 59150 }, { "epoch": 6.371757614896136, "grad_norm": 0.8576701283454895, "learning_rate": 0.00021807779334123475, "loss": 3.2795, "step": 59200 }, { "epoch": 6.377139166935744, "grad_norm": 0.7578076720237732, "learning_rate": 0.00021775455231117335, "loss": 3.2589, "step": 59250 }, { "epoch": 6.382520718975353, "grad_norm": 0.699042558670044, "learning_rate": 0.00021743131128111194, "loss": 3.2874, "step": 59300 }, { "epoch": 6.387902271014961, "grad_norm": 0.7538677453994751, "learning_rate": 0.0002171080702510505, "loss": 3.282, "step": 59350 }, { "epoch": 6.393283823054569, "grad_norm": 0.7306122779846191, "learning_rate": 0.00021678482922098908, "loss": 3.2769, "step": 59400 }, { "epoch": 6.398665375094177, "grad_norm": 0.7715863585472107, "learning_rate": 0.0002164615881909277, "loss": 3.2577, "step": 59450 }, { "epoch": 6.404046927133785, "grad_norm": 0.7828499674797058, "learning_rate": 0.00021613834716086627, "loss": 3.2838, "step": 59500 }, { "epoch": 6.4094284791733935, "grad_norm": 0.7628936767578125, "learning_rate": 0.00021581510613080483, "loss": 3.2736, "step": 59550 }, { "epoch": 6.414810031213002, "grad_norm": 0.8166696429252625, "learning_rate": 0.00021549186510074346, "loss": 3.2829, "step": 59600 }, { "epoch": 6.42019158325261, "grad_norm": 0.7255755066871643, "learning_rate": 0.00021516862407068202, "loss": 3.2819, "step": 59650 }, { "epoch": 6.425573135292218, "grad_norm": 0.7925482392311096, "learning_rate": 0.0002148453830406206, "loss": 3.302, "step": 59700 }, { "epoch": 6.430954687331827, "grad_norm": 0.7258265614509583, "learning_rate": 0.00021452214201055919, "loss": 3.2691, "step": 59750 }, { "epoch": 6.4363362393714345, "grad_norm": 0.7687867879867554, "learning_rate": 0.00021419890098049778, "loss": 3.2862, "step": 59800 }, { "epoch": 6.441717791411043, "grad_norm": 0.7652358412742615, "learning_rate": 0.00021387565995043638, "loss": 3.293, "step": 59850 }, { "epoch": 6.447099343450651, "grad_norm": 0.739409327507019, "learning_rate": 0.00021355241892037494, "loss": 3.2833, "step": 59900 }, { "epoch": 6.452480895490259, "grad_norm": 0.733664870262146, "learning_rate": 0.0002132291778903135, "loss": 3.2796, "step": 59950 }, { "epoch": 6.457862447529868, "grad_norm": 0.9308944344520569, "learning_rate": 0.00021290593686025213, "loss": 3.2854, "step": 60000 }, { "epoch": 6.457862447529868, "eval_accuracy": 0.3857741425901827, "eval_loss": 3.370995283126831, "eval_runtime": 189.5205, "eval_samples_per_second": 95.035, "eval_steps_per_second": 5.941, "step": 60000 }, { "epoch": 6.4632439995694755, "grad_norm": 0.8035341501235962, "learning_rate": 0.0002125826958301907, "loss": 3.2629, "step": 60050 }, { "epoch": 6.468625551609084, "grad_norm": 0.7408281564712524, "learning_rate": 0.00021225945480012927, "loss": 3.2811, "step": 60100 }, { "epoch": 6.474007103648693, "grad_norm": 0.7610222697257996, "learning_rate": 0.00021193621377006786, "loss": 3.2728, "step": 60150 }, { "epoch": 6.4793886556883, "grad_norm": 0.7922364473342896, "learning_rate": 0.00021161297274000646, "loss": 3.2679, "step": 60200 }, { "epoch": 6.484770207727909, "grad_norm": 0.7946208715438843, "learning_rate": 0.00021128973170994502, "loss": 3.2618, "step": 60250 }, { "epoch": 6.490151759767517, "grad_norm": 0.7420467138290405, "learning_rate": 0.00021096649067988362, "loss": 3.2625, "step": 60300 }, { "epoch": 6.495533311807125, "grad_norm": 0.7789103388786316, "learning_rate": 0.00021064324964982219, "loss": 3.286, "step": 60350 }, { "epoch": 6.500914863846734, "grad_norm": 0.7576149702072144, "learning_rate": 0.00021032000861976078, "loss": 3.2835, "step": 60400 }, { "epoch": 6.506296415886341, "grad_norm": 0.7600129842758179, "learning_rate": 0.00020999676758969938, "loss": 3.2629, "step": 60450 }, { "epoch": 6.51167796792595, "grad_norm": 0.7913733720779419, "learning_rate": 0.00020967352655963794, "loss": 3.2719, "step": 60500 }, { "epoch": 6.517059519965558, "grad_norm": 0.7463458776473999, "learning_rate": 0.0002093502855295765, "loss": 3.2867, "step": 60550 }, { "epoch": 6.522441072005166, "grad_norm": 0.7545099258422852, "learning_rate": 0.00020902704449951513, "loss": 3.2887, "step": 60600 }, { "epoch": 6.5278226240447745, "grad_norm": 0.7538031339645386, "learning_rate": 0.0002087038034694537, "loss": 3.2802, "step": 60650 }, { "epoch": 6.533204176084383, "grad_norm": 0.7591269016265869, "learning_rate": 0.00020838056243939227, "loss": 3.251, "step": 60700 }, { "epoch": 6.538585728123991, "grad_norm": 0.7781814336776733, "learning_rate": 0.0002080573214093309, "loss": 3.2822, "step": 60750 }, { "epoch": 6.543967280163599, "grad_norm": 0.745883047580719, "learning_rate": 0.00020773408037926946, "loss": 3.2792, "step": 60800 }, { "epoch": 6.549348832203208, "grad_norm": 0.7403407096862793, "learning_rate": 0.00020741083934920805, "loss": 3.3001, "step": 60850 }, { "epoch": 6.5547303842428155, "grad_norm": 0.7606509923934937, "learning_rate": 0.00020708759831914662, "loss": 3.2974, "step": 60900 }, { "epoch": 6.560111936282424, "grad_norm": 0.7597663998603821, "learning_rate": 0.00020676435728908521, "loss": 3.2874, "step": 60950 }, { "epoch": 6.565493488322032, "grad_norm": 0.7940024137496948, "learning_rate": 0.0002064411162590238, "loss": 3.2864, "step": 61000 }, { "epoch": 6.565493488322032, "eval_accuracy": 0.3859547236897304, "eval_loss": 3.3682405948638916, "eval_runtime": 189.1949, "eval_samples_per_second": 95.198, "eval_steps_per_second": 5.952, "step": 61000 }, { "epoch": 6.57087504036164, "grad_norm": 0.7270107865333557, "learning_rate": 0.00020611787522896238, "loss": 3.2818, "step": 61050 }, { "epoch": 6.576256592401249, "grad_norm": 0.7495406866073608, "learning_rate": 0.00020579463419890094, "loss": 3.2956, "step": 61100 }, { "epoch": 6.5816381444408565, "grad_norm": 0.825792670249939, "learning_rate": 0.00020547785798944078, "loss": 3.2882, "step": 61150 }, { "epoch": 6.587019696480465, "grad_norm": 0.7263872623443604, "learning_rate": 0.00020515461695937934, "loss": 3.2876, "step": 61200 }, { "epoch": 6.592401248520073, "grad_norm": 0.7494966983795166, "learning_rate": 0.00020483137592931797, "loss": 3.272, "step": 61250 }, { "epoch": 6.597782800559681, "grad_norm": 0.7126851081848145, "learning_rate": 0.00020450813489925653, "loss": 3.2948, "step": 61300 }, { "epoch": 6.60316435259929, "grad_norm": 0.7981864213943481, "learning_rate": 0.0002041848938691951, "loss": 3.2767, "step": 61350 }, { "epoch": 6.608545904638898, "grad_norm": 0.7677425742149353, "learning_rate": 0.0002038616528391337, "loss": 3.2844, "step": 61400 }, { "epoch": 6.613927456678506, "grad_norm": 0.7435486912727356, "learning_rate": 0.00020353841180907226, "loss": 3.2593, "step": 61450 }, { "epoch": 6.619309008718115, "grad_norm": 0.824278712272644, "learning_rate": 0.00020321517077901086, "loss": 3.2904, "step": 61500 }, { "epoch": 6.624690560757722, "grad_norm": 0.7523923516273499, "learning_rate": 0.00020289192974894945, "loss": 3.2578, "step": 61550 }, { "epoch": 6.630072112797331, "grad_norm": 0.7673538327217102, "learning_rate": 0.00020256868871888802, "loss": 3.3023, "step": 61600 }, { "epoch": 6.635453664836939, "grad_norm": 0.7364659905433655, "learning_rate": 0.00020224544768882664, "loss": 3.2894, "step": 61650 }, { "epoch": 6.640835216876547, "grad_norm": 0.7490333914756775, "learning_rate": 0.0002019222066587652, "loss": 3.2939, "step": 61700 }, { "epoch": 6.6462167689161555, "grad_norm": 0.7625733017921448, "learning_rate": 0.00020159896562870378, "loss": 3.2909, "step": 61750 }, { "epoch": 6.651598320955763, "grad_norm": 0.7619541883468628, "learning_rate": 0.00020127572459864237, "loss": 3.2739, "step": 61800 }, { "epoch": 6.656979872995372, "grad_norm": 0.7490652799606323, "learning_rate": 0.00020095248356858097, "loss": 3.2761, "step": 61850 }, { "epoch": 6.66236142503498, "grad_norm": 0.7691114544868469, "learning_rate": 0.00020062924253851953, "loss": 3.2678, "step": 61900 }, { "epoch": 6.667742977074588, "grad_norm": 0.7397304177284241, "learning_rate": 0.00020030600150845813, "loss": 3.2868, "step": 61950 }, { "epoch": 6.6731245291141965, "grad_norm": 0.7724108695983887, "learning_rate": 0.0001999827604783967, "loss": 3.2844, "step": 62000 }, { "epoch": 6.6731245291141965, "eval_accuracy": 0.3865379723915185, "eval_loss": 3.3638100624084473, "eval_runtime": 191.2519, "eval_samples_per_second": 94.174, "eval_steps_per_second": 5.888, "step": 62000 }, { "epoch": 6.678506081153805, "grad_norm": 0.7569757103919983, "learning_rate": 0.0001996595194483353, "loss": 3.2948, "step": 62050 }, { "epoch": 6.683887633193413, "grad_norm": 0.7932971715927124, "learning_rate": 0.00019933627841827389, "loss": 3.2844, "step": 62100 }, { "epoch": 6.689269185233021, "grad_norm": 0.7220522165298462, "learning_rate": 0.00019901303738821245, "loss": 3.2874, "step": 62150 }, { "epoch": 6.69465073727263, "grad_norm": 0.7374184131622314, "learning_rate": 0.00019868979635815102, "loss": 3.2787, "step": 62200 }, { "epoch": 6.7000322893122375, "grad_norm": 0.7741689682006836, "learning_rate": 0.00019837302014869085, "loss": 3.2896, "step": 62250 }, { "epoch": 6.705413841351846, "grad_norm": 0.7424774169921875, "learning_rate": 0.00019804977911862942, "loss": 3.2957, "step": 62300 }, { "epoch": 6.710795393391454, "grad_norm": 0.7883257865905762, "learning_rate": 0.00019772653808856804, "loss": 3.3029, "step": 62350 }, { "epoch": 6.716176945431062, "grad_norm": 0.757195234298706, "learning_rate": 0.0001974032970585066, "loss": 3.2799, "step": 62400 }, { "epoch": 6.721558497470671, "grad_norm": 0.7731918692588806, "learning_rate": 0.00019708005602844518, "loss": 3.2875, "step": 62450 }, { "epoch": 6.7269400495102785, "grad_norm": 0.7180660367012024, "learning_rate": 0.00019675681499838377, "loss": 3.2893, "step": 62500 }, { "epoch": 6.732321601549887, "grad_norm": 0.7762702703475952, "learning_rate": 0.00019643357396832237, "loss": 3.28, "step": 62550 }, { "epoch": 6.737703153589496, "grad_norm": 0.7744964361190796, "learning_rate": 0.00019611033293826096, "loss": 3.2863, "step": 62600 }, { "epoch": 6.743084705629103, "grad_norm": 0.8226206302642822, "learning_rate": 0.00019578709190819953, "loss": 3.2793, "step": 62650 }, { "epoch": 6.748466257668712, "grad_norm": 0.813818633556366, "learning_rate": 0.0001954638508781381, "loss": 3.2688, "step": 62700 }, { "epoch": 6.75384780970832, "grad_norm": 0.7814973592758179, "learning_rate": 0.00019514060984807672, "loss": 3.2817, "step": 62750 }, { "epoch": 6.759229361747928, "grad_norm": 0.755522608757019, "learning_rate": 0.0001948173688180153, "loss": 3.2886, "step": 62800 }, { "epoch": 6.7646109137875365, "grad_norm": 0.7371417880058289, "learning_rate": 0.00019449412778795386, "loss": 3.2922, "step": 62850 }, { "epoch": 6.769992465827144, "grad_norm": 0.829099714756012, "learning_rate": 0.00019417088675789245, "loss": 3.2693, "step": 62900 }, { "epoch": 6.775374017866753, "grad_norm": 0.7393680810928345, "learning_rate": 0.00019384764572783104, "loss": 3.2789, "step": 62950 }, { "epoch": 6.780755569906361, "grad_norm": 0.7941797375679016, "learning_rate": 0.0001935244046977696, "loss": 3.2749, "step": 63000 }, { "epoch": 6.780755569906361, "eval_accuracy": 0.38678733076905275, "eval_loss": 3.358473539352417, "eval_runtime": 183.8407, "eval_samples_per_second": 97.971, "eval_steps_per_second": 6.125, "step": 63000 }, { "epoch": 6.786137121945969, "grad_norm": 0.8067751526832581, "learning_rate": 0.0001932011636677082, "loss": 3.2727, "step": 63050 }, { "epoch": 6.7915186739855775, "grad_norm": 0.7249636650085449, "learning_rate": 0.00019287792263764677, "loss": 3.2811, "step": 63100 }, { "epoch": 6.796900226025185, "grad_norm": 0.8218013048171997, "learning_rate": 0.00019255468160758537, "loss": 3.2913, "step": 63150 }, { "epoch": 6.802281778064794, "grad_norm": 0.7430583238601685, "learning_rate": 0.00019223144057752396, "loss": 3.2848, "step": 63200 }, { "epoch": 6.807663330104402, "grad_norm": 0.7417072653770447, "learning_rate": 0.00019190819954746253, "loss": 3.2855, "step": 63250 }, { "epoch": 6.813044882144011, "grad_norm": 0.7472231388092041, "learning_rate": 0.0001915849585174011, "loss": 3.2809, "step": 63300 }, { "epoch": 6.8184264341836185, "grad_norm": 0.8350476026535034, "learning_rate": 0.00019126171748733972, "loss": 3.2885, "step": 63350 }, { "epoch": 6.823807986223227, "grad_norm": 0.7260509729385376, "learning_rate": 0.0001909384764572783, "loss": 3.2901, "step": 63400 }, { "epoch": 6.829189538262835, "grad_norm": 0.742171585559845, "learning_rate": 0.00019061523542721688, "loss": 3.2887, "step": 63450 }, { "epoch": 6.834571090302443, "grad_norm": 0.7622074484825134, "learning_rate": 0.00019029199439715548, "loss": 3.2763, "step": 63500 }, { "epoch": 6.839952642342052, "grad_norm": 0.7722497582435608, "learning_rate": 0.00018996875336709404, "loss": 3.2793, "step": 63550 }, { "epoch": 6.8453341943816595, "grad_norm": 0.7499434947967529, "learning_rate": 0.00018964551233703264, "loss": 3.286, "step": 63600 }, { "epoch": 6.850715746421268, "grad_norm": 0.7652062177658081, "learning_rate": 0.0001893222713069712, "loss": 3.2974, "step": 63650 }, { "epoch": 6.856097298460876, "grad_norm": 0.7807160019874573, "learning_rate": 0.0001889990302769098, "loss": 3.2688, "step": 63700 }, { "epoch": 6.861478850500484, "grad_norm": 0.7585758566856384, "learning_rate": 0.0001886757892468484, "loss": 3.2742, "step": 63750 }, { "epoch": 6.866860402540093, "grad_norm": 0.8624706268310547, "learning_rate": 0.00018835254821678696, "loss": 3.2897, "step": 63800 }, { "epoch": 6.8722419545797, "grad_norm": 0.7231000065803528, "learning_rate": 0.00018802930718672553, "loss": 3.2769, "step": 63850 }, { "epoch": 6.877623506619309, "grad_norm": 0.7677696943283081, "learning_rate": 0.00018770606615666415, "loss": 3.2972, "step": 63900 }, { "epoch": 6.8830050586589175, "grad_norm": 0.7870625853538513, "learning_rate": 0.00018738282512660272, "loss": 3.2948, "step": 63950 }, { "epoch": 6.888386610698525, "grad_norm": 0.8059346079826355, "learning_rate": 0.0001870595840965413, "loss": 3.2759, "step": 64000 }, { "epoch": 6.888386610698525, "eval_accuracy": 0.3872708361198272, "eval_loss": 3.3542022705078125, "eval_runtime": 191.9315, "eval_samples_per_second": 93.841, "eval_steps_per_second": 5.867, "step": 64000 }, { "epoch": 6.893768162738134, "grad_norm": 0.7675033211708069, "learning_rate": 0.0001867363430664799, "loss": 3.2783, "step": 64050 }, { "epoch": 6.899149714777742, "grad_norm": 0.8100468516349792, "learning_rate": 0.00018641310203641848, "loss": 3.2831, "step": 64100 }, { "epoch": 6.90453126681735, "grad_norm": 0.7860651612281799, "learning_rate": 0.00018608986100635705, "loss": 3.2847, "step": 64150 }, { "epoch": 6.9099128188569585, "grad_norm": 0.8007771968841553, "learning_rate": 0.00018576661997629564, "loss": 3.2828, "step": 64200 }, { "epoch": 6.915294370896566, "grad_norm": 0.7722747325897217, "learning_rate": 0.00018544337894623423, "loss": 3.2771, "step": 64250 }, { "epoch": 6.920675922936175, "grad_norm": 0.7721737623214722, "learning_rate": 0.0001851201379161728, "loss": 3.2788, "step": 64300 }, { "epoch": 6.926057474975783, "grad_norm": 0.7347503304481506, "learning_rate": 0.0001847968968861114, "loss": 3.2833, "step": 64350 }, { "epoch": 6.931439027015391, "grad_norm": 0.8173298239707947, "learning_rate": 0.00018447365585604996, "loss": 3.2794, "step": 64400 }, { "epoch": 6.9368205790549995, "grad_norm": 0.799849808216095, "learning_rate": 0.00018415041482598859, "loss": 3.2842, "step": 64450 }, { "epoch": 6.942202131094608, "grad_norm": 0.7882811427116394, "learning_rate": 0.00018382717379592715, "loss": 3.2744, "step": 64500 }, { "epoch": 6.947583683134216, "grad_norm": 0.7814345955848694, "learning_rate": 0.00018350393276586572, "loss": 3.2842, "step": 64550 }, { "epoch": 6.952965235173824, "grad_norm": 0.7994155883789062, "learning_rate": 0.00018318069173580434, "loss": 3.2837, "step": 64600 }, { "epoch": 6.958346787213433, "grad_norm": 0.7689683437347412, "learning_rate": 0.0001828574507057429, "loss": 3.2896, "step": 64650 }, { "epoch": 6.9637283392530405, "grad_norm": 0.757786750793457, "learning_rate": 0.00018253420967568148, "loss": 3.2799, "step": 64700 }, { "epoch": 6.969109891292649, "grad_norm": 0.8227623701095581, "learning_rate": 0.00018221096864562007, "loss": 3.2847, "step": 64750 }, { "epoch": 6.974491443332257, "grad_norm": 0.7690378427505493, "learning_rate": 0.00018188772761555867, "loss": 3.2809, "step": 64800 }, { "epoch": 6.979872995371865, "grad_norm": 0.7968791723251343, "learning_rate": 0.00018156448658549723, "loss": 3.2829, "step": 64850 }, { "epoch": 6.985254547411474, "grad_norm": 0.8313443064689636, "learning_rate": 0.00018124124555543583, "loss": 3.2758, "step": 64900 }, { "epoch": 6.990636099451081, "grad_norm": 0.7663018107414246, "learning_rate": 0.0001809180045253744, "loss": 3.2756, "step": 64950 }, { "epoch": 6.99601765149069, "grad_norm": 0.7925686240196228, "learning_rate": 0.00018059476349531296, "loss": 3.2937, "step": 65000 }, { "epoch": 6.99601765149069, "eval_accuracy": 0.38779812858439083, "eval_loss": 3.350365400314331, "eval_runtime": 179.2457, "eval_samples_per_second": 100.482, "eval_steps_per_second": 6.282, "step": 65000 }, { "epoch": 7.0013992035302985, "grad_norm": 0.7466768026351929, "learning_rate": 0.00018027152246525159, "loss": 3.2677, "step": 65050 }, { "epoch": 7.006780755569906, "grad_norm": 0.7414117455482483, "learning_rate": 0.00017994828143519015, "loss": 3.1873, "step": 65100 }, { "epoch": 7.012162307609515, "grad_norm": 0.7512695789337158, "learning_rate": 0.00017962504040512872, "loss": 3.2051, "step": 65150 }, { "epoch": 7.017543859649122, "grad_norm": 0.761222779750824, "learning_rate": 0.00017930179937506734, "loss": 3.1992, "step": 65200 }, { "epoch": 7.022925411688731, "grad_norm": 0.8188371062278748, "learning_rate": 0.0001789785583450059, "loss": 3.2136, "step": 65250 }, { "epoch": 7.0283069637283395, "grad_norm": 0.8293972611427307, "learning_rate": 0.00017865531731494448, "loss": 3.2026, "step": 65300 }, { "epoch": 7.033688515767947, "grad_norm": 0.7585407495498657, "learning_rate": 0.00017833207628488307, "loss": 3.1916, "step": 65350 }, { "epoch": 7.039070067807556, "grad_norm": 0.7732641100883484, "learning_rate": 0.00017800883525482167, "loss": 3.1814, "step": 65400 }, { "epoch": 7.044451619847164, "grad_norm": 0.7659725546836853, "learning_rate": 0.00017768559422476026, "loss": 3.1805, "step": 65450 }, { "epoch": 7.049833171886772, "grad_norm": 0.85804283618927, "learning_rate": 0.00017736235319469883, "loss": 3.2066, "step": 65500 }, { "epoch": 7.0552147239263805, "grad_norm": 0.7600443959236145, "learning_rate": 0.0001770391121646374, "loss": 3.2134, "step": 65550 }, { "epoch": 7.060596275965988, "grad_norm": 0.9107420444488525, "learning_rate": 0.00017671587113457602, "loss": 3.1905, "step": 65600 }, { "epoch": 7.065977828005597, "grad_norm": 0.8022816181182861, "learning_rate": 0.0001763926301045146, "loss": 3.2074, "step": 65650 }, { "epoch": 7.071359380045205, "grad_norm": 0.7785311937332153, "learning_rate": 0.00017606938907445315, "loss": 3.1951, "step": 65700 }, { "epoch": 7.076740932084813, "grad_norm": 0.788573682308197, "learning_rate": 0.00017574614804439178, "loss": 3.2086, "step": 65750 }, { "epoch": 7.0821224841244215, "grad_norm": 0.7947928309440613, "learning_rate": 0.00017542290701433034, "loss": 3.1812, "step": 65800 }, { "epoch": 7.08750403616403, "grad_norm": 0.7900101542472839, "learning_rate": 0.0001750996659842689, "loss": 3.2148, "step": 65850 }, { "epoch": 7.092885588203638, "grad_norm": 0.7751492261886597, "learning_rate": 0.0001747764249542075, "loss": 3.2141, "step": 65900 }, { "epoch": 7.098267140243246, "grad_norm": 0.7915271520614624, "learning_rate": 0.0001744531839241461, "loss": 3.1943, "step": 65950 }, { "epoch": 7.103648692282855, "grad_norm": 0.8676785826683044, "learning_rate": 0.00017412994289408467, "loss": 3.1982, "step": 66000 }, { "epoch": 7.103648692282855, "eval_accuracy": 0.3875537682396961, "eval_loss": 3.3568191528320312, "eval_runtime": 179.0246, "eval_samples_per_second": 100.606, "eval_steps_per_second": 6.29, "step": 66000 }, { "epoch": 7.109030244322462, "grad_norm": 0.7964807152748108, "learning_rate": 0.00017380670186402326, "loss": 3.1976, "step": 66050 }, { "epoch": 7.114411796362071, "grad_norm": 0.8339501619338989, "learning_rate": 0.00017348346083396183, "loss": 3.1998, "step": 66100 }, { "epoch": 7.119793348401679, "grad_norm": 0.7632756233215332, "learning_rate": 0.00017316021980390042, "loss": 3.2123, "step": 66150 }, { "epoch": 7.125174900441287, "grad_norm": 0.8226380348205566, "learning_rate": 0.00017283697877383902, "loss": 3.2096, "step": 66200 }, { "epoch": 7.130556452480896, "grad_norm": 0.7625822424888611, "learning_rate": 0.00017252020256437885, "loss": 3.2044, "step": 66250 }, { "epoch": 7.135938004520503, "grad_norm": 0.8822200298309326, "learning_rate": 0.00017219696153431742, "loss": 3.2125, "step": 66300 }, { "epoch": 7.141319556560112, "grad_norm": 0.7756070494651794, "learning_rate": 0.000171873720504256, "loss": 3.2148, "step": 66350 }, { "epoch": 7.1467011085997205, "grad_norm": 0.8165068626403809, "learning_rate": 0.00017155047947419458, "loss": 3.2027, "step": 66400 }, { "epoch": 7.152082660639328, "grad_norm": 0.8383119106292725, "learning_rate": 0.00017122723844413315, "loss": 3.2115, "step": 66450 }, { "epoch": 7.157464212678937, "grad_norm": 0.82657390832901, "learning_rate": 0.00017090399741407175, "loss": 3.2219, "step": 66500 }, { "epoch": 7.162845764718545, "grad_norm": 0.8515119552612305, "learning_rate": 0.00017058075638401034, "loss": 3.2207, "step": 66550 }, { "epoch": 7.168227316758153, "grad_norm": 0.7827452421188354, "learning_rate": 0.0001702575153539489, "loss": 3.2322, "step": 66600 }, { "epoch": 7.1736088687977615, "grad_norm": 0.840316116809845, "learning_rate": 0.00016993427432388747, "loss": 3.2247, "step": 66650 }, { "epoch": 7.178990420837369, "grad_norm": 0.7921803593635559, "learning_rate": 0.0001696110332938261, "loss": 3.2177, "step": 66700 }, { "epoch": 7.184371972876978, "grad_norm": 0.8684450387954712, "learning_rate": 0.00016928779226376466, "loss": 3.2118, "step": 66750 }, { "epoch": 7.189753524916586, "grad_norm": 0.7981023788452148, "learning_rate": 0.00016896455123370323, "loss": 3.2196, "step": 66800 }, { "epoch": 7.195135076956194, "grad_norm": 0.7679049968719482, "learning_rate": 0.00016864131020364185, "loss": 3.2201, "step": 66850 }, { "epoch": 7.2005166289958025, "grad_norm": 0.7706493139266968, "learning_rate": 0.00016831806917358042, "loss": 3.2194, "step": 66900 }, { "epoch": 7.205898181035411, "grad_norm": 0.7626670002937317, "learning_rate": 0.000167994828143519, "loss": 3.2079, "step": 66950 }, { "epoch": 7.211279733075019, "grad_norm": 0.819564938545227, "learning_rate": 0.00016767158711345758, "loss": 3.2259, "step": 67000 }, { "epoch": 7.211279733075019, "eval_accuracy": 0.38785745306113756, "eval_loss": 3.353729248046875, "eval_runtime": 179.222, "eval_samples_per_second": 100.495, "eval_steps_per_second": 6.283, "step": 67000 }, { "epoch": 7.216661285114627, "grad_norm": 0.774648904800415, "learning_rate": 0.00016734834608339618, "loss": 3.2139, "step": 67050 }, { "epoch": 7.222042837154235, "grad_norm": 0.822987973690033, "learning_rate": 0.00016702510505333475, "loss": 3.2234, "step": 67100 }, { "epoch": 7.2274243891938434, "grad_norm": 0.8166954517364502, "learning_rate": 0.00016670186402327334, "loss": 3.2314, "step": 67150 }, { "epoch": 7.232805941233452, "grad_norm": 0.8326581716537476, "learning_rate": 0.0001663786229932119, "loss": 3.2154, "step": 67200 }, { "epoch": 7.23818749327306, "grad_norm": 0.8265476822853088, "learning_rate": 0.00016605538196315053, "loss": 3.2145, "step": 67250 }, { "epoch": 7.243569045312668, "grad_norm": 0.7868024110794067, "learning_rate": 0.0001657321409330891, "loss": 3.2171, "step": 67300 }, { "epoch": 7.248950597352277, "grad_norm": 0.7723427414894104, "learning_rate": 0.00016540889990302766, "loss": 3.2378, "step": 67350 }, { "epoch": 7.254332149391884, "grad_norm": 0.8476577401161194, "learning_rate": 0.00016508565887296629, "loss": 3.2249, "step": 67400 }, { "epoch": 7.259713701431493, "grad_norm": 0.8280165195465088, "learning_rate": 0.00016476241784290485, "loss": 3.2132, "step": 67450 }, { "epoch": 7.265095253471101, "grad_norm": 0.762355387210846, "learning_rate": 0.00016443917681284342, "loss": 3.2243, "step": 67500 }, { "epoch": 7.270476805510709, "grad_norm": 0.7618349194526672, "learning_rate": 0.00016411593578278202, "loss": 3.2179, "step": 67550 }, { "epoch": 7.275858357550318, "grad_norm": 0.8076784610748291, "learning_rate": 0.00016379915957332182, "loss": 3.2319, "step": 67600 }, { "epoch": 7.281239909589925, "grad_norm": 0.8296750783920288, "learning_rate": 0.00016347591854326042, "loss": 3.2194, "step": 67650 }, { "epoch": 7.286621461629534, "grad_norm": 0.8115049004554749, "learning_rate": 0.00016315267751319898, "loss": 3.2324, "step": 67700 }, { "epoch": 7.2920030136691425, "grad_norm": 0.8424254059791565, "learning_rate": 0.00016282943648313755, "loss": 3.2201, "step": 67750 }, { "epoch": 7.29738456570875, "grad_norm": 0.7592359781265259, "learning_rate": 0.00016250619545307617, "loss": 3.2463, "step": 67800 }, { "epoch": 7.302766117748359, "grad_norm": 0.8043787479400635, "learning_rate": 0.00016218295442301474, "loss": 3.2215, "step": 67850 }, { "epoch": 7.308147669787967, "grad_norm": 0.8411751389503479, "learning_rate": 0.0001618597133929533, "loss": 3.2257, "step": 67900 }, { "epoch": 7.313529221827575, "grad_norm": 0.7985913753509521, "learning_rate": 0.00016153647236289193, "loss": 3.22, "step": 67950 }, { "epoch": 7.3189107738671835, "grad_norm": 0.8440186381340027, "learning_rate": 0.0001612132313328305, "loss": 3.2147, "step": 68000 }, { "epoch": 7.3189107738671835, "eval_accuracy": 0.3882622937211343, "eval_loss": 3.3515677452087402, "eval_runtime": 179.2443, "eval_samples_per_second": 100.483, "eval_steps_per_second": 6.282, "step": 68000 }, { "epoch": 7.324292325906791, "grad_norm": 0.8085800409317017, "learning_rate": 0.0001608899903027691, "loss": 3.2128, "step": 68050 }, { "epoch": 7.3296738779464, "grad_norm": 0.8044326305389404, "learning_rate": 0.00016056674927270766, "loss": 3.2181, "step": 68100 }, { "epoch": 7.335055429986008, "grad_norm": 0.8221646547317505, "learning_rate": 0.00016024350824264626, "loss": 3.2044, "step": 68150 }, { "epoch": 7.340436982025616, "grad_norm": 0.8378121256828308, "learning_rate": 0.00015992026721258485, "loss": 3.234, "step": 68200 }, { "epoch": 7.3458185340652244, "grad_norm": 0.8055646419525146, "learning_rate": 0.00015959702618252342, "loss": 3.2368, "step": 68250 }, { "epoch": 7.351200086104833, "grad_norm": 0.8302401900291443, "learning_rate": 0.00015927378515246199, "loss": 3.2315, "step": 68300 }, { "epoch": 7.356581638144441, "grad_norm": 0.8142952919006348, "learning_rate": 0.0001589505441224006, "loss": 3.2306, "step": 68350 }, { "epoch": 7.361963190184049, "grad_norm": 0.8154958486557007, "learning_rate": 0.00015862730309233917, "loss": 3.2365, "step": 68400 }, { "epoch": 7.367344742223658, "grad_norm": 0.7811055779457092, "learning_rate": 0.00015830406206227774, "loss": 3.2484, "step": 68450 }, { "epoch": 7.372726294263265, "grad_norm": 0.8077459335327148, "learning_rate": 0.00015798082103221636, "loss": 3.2218, "step": 68500 }, { "epoch": 7.378107846302874, "grad_norm": 0.7829011082649231, "learning_rate": 0.00015765758000215493, "loss": 3.2213, "step": 68550 }, { "epoch": 7.383489398342482, "grad_norm": 0.8035743236541748, "learning_rate": 0.0001573343389720935, "loss": 3.2415, "step": 68600 }, { "epoch": 7.38887095038209, "grad_norm": 0.8412837386131287, "learning_rate": 0.0001570110979420321, "loss": 3.215, "step": 68650 }, { "epoch": 7.394252502421699, "grad_norm": 0.8717572689056396, "learning_rate": 0.0001566878569119707, "loss": 3.2316, "step": 68700 }, { "epoch": 7.399634054461306, "grad_norm": 0.8612834215164185, "learning_rate": 0.00015636461588190926, "loss": 3.2321, "step": 68750 }, { "epoch": 7.405015606500915, "grad_norm": 0.7927626967430115, "learning_rate": 0.00015604137485184785, "loss": 3.2364, "step": 68800 }, { "epoch": 7.4103971585405235, "grad_norm": 0.8288998007774353, "learning_rate": 0.00015571813382178642, "loss": 3.2182, "step": 68850 }, { "epoch": 7.415778710580131, "grad_norm": 0.8665758967399597, "learning_rate": 0.000155394892791725, "loss": 3.2401, "step": 68900 }, { "epoch": 7.42116026261974, "grad_norm": 0.8682220578193665, "learning_rate": 0.0001550716517616636, "loss": 3.2315, "step": 68950 }, { "epoch": 7.426541814659347, "grad_norm": 0.8869706392288208, "learning_rate": 0.00015474841073160217, "loss": 3.2513, "step": 69000 }, { "epoch": 7.426541814659347, "eval_accuracy": 0.3883139038428462, "eval_loss": 3.3499770164489746, "eval_runtime": 179.2818, "eval_samples_per_second": 100.462, "eval_steps_per_second": 6.281, "step": 69000 }, { "epoch": 7.431923366698956, "grad_norm": 0.8428071141242981, "learning_rate": 0.0001544251697015408, "loss": 3.2343, "step": 69050 }, { "epoch": 7.4373049187385645, "grad_norm": 0.804516613483429, "learning_rate": 0.00015410192867147936, "loss": 3.2305, "step": 69100 }, { "epoch": 7.442686470778172, "grad_norm": 0.8072082996368408, "learning_rate": 0.00015377868764141793, "loss": 3.2158, "step": 69150 }, { "epoch": 7.448068022817781, "grad_norm": 0.8010827302932739, "learning_rate": 0.00015345544661135653, "loss": 3.2347, "step": 69200 }, { "epoch": 7.453449574857389, "grad_norm": 0.8006973266601562, "learning_rate": 0.00015313220558129512, "loss": 3.2442, "step": 69250 }, { "epoch": 7.458831126896997, "grad_norm": 0.7692738771438599, "learning_rate": 0.0001528089645512337, "loss": 3.2361, "step": 69300 }, { "epoch": 7.4642126789366054, "grad_norm": 0.82619309425354, "learning_rate": 0.00015248572352117228, "loss": 3.221, "step": 69350 }, { "epoch": 7.469594230976213, "grad_norm": 0.8146241903305054, "learning_rate": 0.00015216248249111085, "loss": 3.2214, "step": 69400 }, { "epoch": 7.474975783015822, "grad_norm": 0.8031963109970093, "learning_rate": 0.00015183924146104945, "loss": 3.2264, "step": 69450 }, { "epoch": 7.48035733505543, "grad_norm": 0.8314658999443054, "learning_rate": 0.00015151600043098804, "loss": 3.2235, "step": 69500 }, { "epoch": 7.485738887095038, "grad_norm": 0.8674782514572144, "learning_rate": 0.0001511927594009266, "loss": 3.2231, "step": 69550 }, { "epoch": 7.491120439134646, "grad_norm": 0.8414506316184998, "learning_rate": 0.00015086951837086518, "loss": 3.2222, "step": 69600 }, { "epoch": 7.496501991174255, "grad_norm": 0.8061162829399109, "learning_rate": 0.0001505462773408038, "loss": 3.2285, "step": 69650 }, { "epoch": 7.501883543213863, "grad_norm": 0.8615111112594604, "learning_rate": 0.00015022303631074236, "loss": 3.2363, "step": 69700 }, { "epoch": 7.507265095253471, "grad_norm": 0.8201255798339844, "learning_rate": 0.00014989979528068096, "loss": 3.2321, "step": 69750 }, { "epoch": 7.51264664729308, "grad_norm": 0.7630177140235901, "learning_rate": 0.00014957655425061953, "loss": 3.2417, "step": 69800 }, { "epoch": 7.518028199332687, "grad_norm": 0.8191272020339966, "learning_rate": 0.00014925331322055812, "loss": 3.2231, "step": 69850 }, { "epoch": 7.523409751372296, "grad_norm": 0.8036609292030334, "learning_rate": 0.0001489300721904967, "loss": 3.2342, "step": 69900 }, { "epoch": 7.528791303411904, "grad_norm": 0.8153430819511414, "learning_rate": 0.00014860683116043528, "loss": 3.224, "step": 69950 }, { "epoch": 7.534172855451512, "grad_norm": 0.837643027305603, "learning_rate": 0.00014828359013037385, "loss": 3.2262, "step": 70000 }, { "epoch": 7.534172855451512, "eval_accuracy": 0.38884782413356656, "eval_loss": 3.3439652919769287, "eval_runtime": 179.3165, "eval_samples_per_second": 100.442, "eval_steps_per_second": 6.279, "step": 70000 }, { "epoch": 7.539554407491121, "grad_norm": 0.7882395386695862, "learning_rate": 0.00014796034910031245, "loss": 3.2289, "step": 70050 }, { "epoch": 7.544935959530728, "grad_norm": 0.8111067414283752, "learning_rate": 0.00014763710807025104, "loss": 3.2415, "step": 70100 }, { "epoch": 7.550317511570337, "grad_norm": 0.8321049213409424, "learning_rate": 0.0001473138670401896, "loss": 3.2244, "step": 70150 }, { "epoch": 7.5556990636099455, "grad_norm": 0.8700869679450989, "learning_rate": 0.0001469906260101282, "loss": 3.2436, "step": 70200 }, { "epoch": 7.561080615649553, "grad_norm": 0.8332130908966064, "learning_rate": 0.0001466673849800668, "loss": 3.236, "step": 70250 }, { "epoch": 7.566462167689162, "grad_norm": 0.8263179063796997, "learning_rate": 0.0001463441439500054, "loss": 3.2416, "step": 70300 }, { "epoch": 7.57184371972877, "grad_norm": 0.7900987267494202, "learning_rate": 0.00014602090291994396, "loss": 3.239, "step": 70350 }, { "epoch": 7.577225271768378, "grad_norm": 0.7908528447151184, "learning_rate": 0.00014569766188988255, "loss": 3.2437, "step": 70400 }, { "epoch": 7.5826068238079865, "grad_norm": 0.8256222605705261, "learning_rate": 0.00014537442085982112, "loss": 3.2404, "step": 70450 }, { "epoch": 7.587988375847594, "grad_norm": 0.8238363862037659, "learning_rate": 0.00014505117982975972, "loss": 3.2413, "step": 70500 }, { "epoch": 7.593369927887203, "grad_norm": 0.8361555337905884, "learning_rate": 0.00014472793879969828, "loss": 3.2401, "step": 70550 }, { "epoch": 7.598751479926811, "grad_norm": 0.8410863876342773, "learning_rate": 0.00014440469776963688, "loss": 3.2278, "step": 70600 }, { "epoch": 7.604133031966419, "grad_norm": 0.8186968564987183, "learning_rate": 0.00014408145673957545, "loss": 3.2183, "step": 70650 }, { "epoch": 7.609514584006027, "grad_norm": 0.8294521570205688, "learning_rate": 0.00014375821570951404, "loss": 3.2322, "step": 70700 }, { "epoch": 7.614896136045635, "grad_norm": 0.7877250909805298, "learning_rate": 0.00014343497467945264, "loss": 3.2444, "step": 70750 }, { "epoch": 7.620277688085244, "grad_norm": 0.812689483165741, "learning_rate": 0.00014311173364939123, "loss": 3.2376, "step": 70800 }, { "epoch": 7.625659240124852, "grad_norm": 0.8702561855316162, "learning_rate": 0.0001427884926193298, "loss": 3.2253, "step": 70850 }, { "epoch": 7.63104079216446, "grad_norm": 0.8528902530670166, "learning_rate": 0.0001424652515892684, "loss": 3.2467, "step": 70900 }, { "epoch": 7.636422344204068, "grad_norm": 0.8240679502487183, "learning_rate": 0.000142142010559207, "loss": 3.2224, "step": 70950 }, { "epoch": 7.641803896243677, "grad_norm": 0.8918917179107666, "learning_rate": 0.00014181876952914555, "loss": 3.2326, "step": 71000 }, { "epoch": 7.641803896243677, "eval_accuracy": 0.38928830294076644, "eval_loss": 3.341247081756592, "eval_runtime": 179.268, "eval_samples_per_second": 100.47, "eval_steps_per_second": 6.281, "step": 71000 }, { "epoch": 7.647185448283285, "grad_norm": 0.9188471436500549, "learning_rate": 0.00014149552849908415, "loss": 3.2322, "step": 71050 }, { "epoch": 7.652567000322893, "grad_norm": 0.8203415870666504, "learning_rate": 0.00014117228746902272, "loss": 3.2332, "step": 71100 }, { "epoch": 7.657948552362502, "grad_norm": 0.834678590297699, "learning_rate": 0.0001408490464389613, "loss": 3.2491, "step": 71150 }, { "epoch": 7.663330104402109, "grad_norm": 0.8428419828414917, "learning_rate": 0.00014052580540889988, "loss": 3.2281, "step": 71200 }, { "epoch": 7.668711656441718, "grad_norm": 0.8586658835411072, "learning_rate": 0.00014020256437883847, "loss": 3.2245, "step": 71250 }, { "epoch": 7.674093208481326, "grad_norm": 0.8209629058837891, "learning_rate": 0.00013987932334877707, "loss": 3.228, "step": 71300 }, { "epoch": 7.679474760520934, "grad_norm": 0.8002371788024902, "learning_rate": 0.00013955608231871564, "loss": 3.2264, "step": 71350 }, { "epoch": 7.684856312560543, "grad_norm": 0.8161579370498657, "learning_rate": 0.00013923284128865423, "loss": 3.2163, "step": 71400 }, { "epoch": 7.69023786460015, "grad_norm": 0.8561053276062012, "learning_rate": 0.00013890960025859283, "loss": 3.2391, "step": 71450 }, { "epoch": 7.695619416639759, "grad_norm": 0.8303159475326538, "learning_rate": 0.00013859282404913263, "loss": 3.2233, "step": 71500 }, { "epoch": 7.7010009686793675, "grad_norm": 0.8900315761566162, "learning_rate": 0.0001382695830190712, "loss": 3.2274, "step": 71550 }, { "epoch": 7.706382520718975, "grad_norm": 0.8540375828742981, "learning_rate": 0.0001379463419890098, "loss": 3.2291, "step": 71600 }, { "epoch": 7.711764072758584, "grad_norm": 0.8394600749015808, "learning_rate": 0.00013762310095894836, "loss": 3.2354, "step": 71650 }, { "epoch": 7.717145624798192, "grad_norm": 0.8473957180976868, "learning_rate": 0.00013729985992888696, "loss": 3.239, "step": 71700 }, { "epoch": 7.7225271768378, "grad_norm": 0.8300439715385437, "learning_rate": 0.00013697661889882555, "loss": 3.2442, "step": 71750 }, { "epoch": 7.727908728877408, "grad_norm": 0.812078595161438, "learning_rate": 0.00013665337786876412, "loss": 3.2443, "step": 71800 }, { "epoch": 7.733290280917016, "grad_norm": 0.8100917935371399, "learning_rate": 0.0001363301368387027, "loss": 3.2227, "step": 71850 }, { "epoch": 7.738671832956625, "grad_norm": 0.8426954746246338, "learning_rate": 0.0001360068958086413, "loss": 3.2113, "step": 71900 }, { "epoch": 7.744053384996233, "grad_norm": 0.8209559321403503, "learning_rate": 0.00013568365477857988, "loss": 3.2343, "step": 71950 }, { "epoch": 7.749434937035841, "grad_norm": 0.8225012421607971, "learning_rate": 0.00013536041374851847, "loss": 3.242, "step": 72000 }, { "epoch": 7.749434937035841, "eval_accuracy": 0.3897854985554055, "eval_loss": 3.3368544578552246, "eval_runtime": 179.1985, "eval_samples_per_second": 100.509, "eval_steps_per_second": 6.284, "step": 72000 }, { "epoch": 7.754816489075449, "grad_norm": 0.8724761605262756, "learning_rate": 0.00013503717271845706, "loss": 3.2443, "step": 72050 }, { "epoch": 7.760198041115058, "grad_norm": 0.8532405495643616, "learning_rate": 0.00013471393168839563, "loss": 3.2327, "step": 72100 }, { "epoch": 7.765579593154666, "grad_norm": 0.8478601574897766, "learning_rate": 0.00013439069065833423, "loss": 3.2408, "step": 72150 }, { "epoch": 7.770961145194274, "grad_norm": 0.8687472939491272, "learning_rate": 0.0001340674496282728, "loss": 3.2419, "step": 72200 }, { "epoch": 7.776342697233883, "grad_norm": 0.8549054265022278, "learning_rate": 0.0001337442085982114, "loss": 3.235, "step": 72250 }, { "epoch": 7.78172424927349, "grad_norm": 0.8640989661216736, "learning_rate": 0.00013342096756814996, "loss": 3.2354, "step": 72300 }, { "epoch": 7.787105801313099, "grad_norm": 0.8307853937149048, "learning_rate": 0.00013309772653808855, "loss": 3.2265, "step": 72350 }, { "epoch": 7.792487353352707, "grad_norm": 0.8140857219696045, "learning_rate": 0.00013277448550802715, "loss": 3.2454, "step": 72400 }, { "epoch": 7.797868905392315, "grad_norm": 0.8205322623252869, "learning_rate": 0.0001324512444779657, "loss": 3.2445, "step": 72450 }, { "epoch": 7.803250457431924, "grad_norm": 0.8552926182746887, "learning_rate": 0.0001321280034479043, "loss": 3.2242, "step": 72500 }, { "epoch": 7.808632009471531, "grad_norm": 0.7958500385284424, "learning_rate": 0.0001318047624178429, "loss": 3.2331, "step": 72550 }, { "epoch": 7.81401356151114, "grad_norm": 0.9368075728416443, "learning_rate": 0.0001314815213877815, "loss": 3.2311, "step": 72600 }, { "epoch": 7.819395113550748, "grad_norm": 0.852423906326294, "learning_rate": 0.00013115828035772007, "loss": 3.2386, "step": 72650 }, { "epoch": 7.824776665590356, "grad_norm": 0.8643367290496826, "learning_rate": 0.00013083503932765866, "loss": 3.2349, "step": 72700 }, { "epoch": 7.830158217629965, "grad_norm": 0.8430445194244385, "learning_rate": 0.00013051179829759723, "loss": 3.23, "step": 72750 }, { "epoch": 7.835539769669572, "grad_norm": 0.8127181529998779, "learning_rate": 0.00013018855726753582, "loss": 3.242, "step": 72800 }, { "epoch": 7.840921321709181, "grad_norm": 0.8073041439056396, "learning_rate": 0.0001298653162374744, "loss": 3.2286, "step": 72850 }, { "epoch": 7.846302873748789, "grad_norm": 0.8760915398597717, "learning_rate": 0.00012954207520741298, "loss": 3.2477, "step": 72900 }, { "epoch": 7.851684425788397, "grad_norm": 0.8761159777641296, "learning_rate": 0.00012921883417735155, "loss": 3.2429, "step": 72950 }, { "epoch": 7.857065977828006, "grad_norm": 0.8289068341255188, "learning_rate": 0.00012889559314729015, "loss": 3.2394, "step": 73000 }, { "epoch": 7.857065977828006, "eval_accuracy": 0.39016361060499993, "eval_loss": 3.3336265087127686, "eval_runtime": 179.3407, "eval_samples_per_second": 100.429, "eval_steps_per_second": 6.279, "step": 73000 }, { "epoch": 7.862447529867614, "grad_norm": 0.8425275087356567, "learning_rate": 0.00012857235211722874, "loss": 3.2496, "step": 73050 }, { "epoch": 7.867829081907222, "grad_norm": 0.8188695311546326, "learning_rate": 0.00012824911108716734, "loss": 3.2372, "step": 73100 }, { "epoch": 7.87321063394683, "grad_norm": 0.8350045084953308, "learning_rate": 0.0001279258700571059, "loss": 3.2357, "step": 73150 }, { "epoch": 7.878592185986438, "grad_norm": 0.8477059006690979, "learning_rate": 0.0001276026290270445, "loss": 3.2387, "step": 73200 }, { "epoch": 7.883973738026047, "grad_norm": 0.8441916108131409, "learning_rate": 0.00012727938799698307, "loss": 3.2293, "step": 73250 }, { "epoch": 7.889355290065655, "grad_norm": 0.8447609543800354, "learning_rate": 0.00012695614696692166, "loss": 3.2421, "step": 73300 }, { "epoch": 7.894736842105263, "grad_norm": 0.8122196197509766, "learning_rate": 0.00012663290593686023, "loss": 3.2626, "step": 73350 }, { "epoch": 7.900118394144871, "grad_norm": 0.8014147877693176, "learning_rate": 0.00012630966490679882, "loss": 3.2293, "step": 73400 }, { "epoch": 7.90549994618448, "grad_norm": 0.8616482615470886, "learning_rate": 0.0001259864238767374, "loss": 3.2435, "step": 73450 }, { "epoch": 7.910881498224088, "grad_norm": 0.864067018032074, "learning_rate": 0.00012566318284667598, "loss": 3.2424, "step": 73500 }, { "epoch": 7.916263050263696, "grad_norm": 0.7892735004425049, "learning_rate": 0.00012533994181661458, "loss": 3.2339, "step": 73550 }, { "epoch": 7.921644602303305, "grad_norm": 0.800003707408905, "learning_rate": 0.00012501670078655317, "loss": 3.2413, "step": 73600 }, { "epoch": 7.927026154342912, "grad_norm": 0.8988176584243774, "learning_rate": 0.00012469345975649174, "loss": 3.2335, "step": 73650 }, { "epoch": 7.932407706382521, "grad_norm": 0.7975991368293762, "learning_rate": 0.00012437021872643034, "loss": 3.2144, "step": 73700 }, { "epoch": 7.937789258422129, "grad_norm": 0.8801944851875305, "learning_rate": 0.00012405344251697014, "loss": 3.2155, "step": 73750 }, { "epoch": 7.943170810461737, "grad_norm": 0.8546746373176575, "learning_rate": 0.00012373020148690874, "loss": 3.2406, "step": 73800 }, { "epoch": 7.948552362501346, "grad_norm": 0.836940348148346, "learning_rate": 0.0001234069604568473, "loss": 3.2232, "step": 73850 }, { "epoch": 7.953933914540953, "grad_norm": 0.8503960967063904, "learning_rate": 0.0001230837194267859, "loss": 3.2315, "step": 73900 }, { "epoch": 7.959315466580562, "grad_norm": 0.9259964227676392, "learning_rate": 0.00012276047839672447, "loss": 3.2268, "step": 73950 }, { "epoch": 7.96469701862017, "grad_norm": 0.848434567451477, "learning_rate": 0.00012243723736666306, "loss": 3.2251, "step": 74000 }, { "epoch": 7.96469701862017, "eval_accuracy": 0.3905919202887646, "eval_loss": 3.3294827938079834, "eval_runtime": 179.029, "eval_samples_per_second": 100.604, "eval_steps_per_second": 6.289, "step": 74000 }, { "epoch": 7.970078570659778, "grad_norm": 0.8327162265777588, "learning_rate": 0.00012211399633660166, "loss": 3.2243, "step": 74050 }, { "epoch": 7.975460122699387, "grad_norm": 0.8550471067428589, "learning_rate": 0.00012179075530654022, "loss": 3.231, "step": 74100 }, { "epoch": 7.980841674738995, "grad_norm": 0.8318188190460205, "learning_rate": 0.00012146751427647882, "loss": 3.2195, "step": 74150 }, { "epoch": 7.986223226778603, "grad_norm": 0.8346085548400879, "learning_rate": 0.0001211442732464174, "loss": 3.2558, "step": 74200 }, { "epoch": 7.991604778818211, "grad_norm": 0.870581328868866, "learning_rate": 0.00012082103221635598, "loss": 3.2239, "step": 74250 }, { "epoch": 7.996986330857819, "grad_norm": 0.8395835757255554, "learning_rate": 0.00012049779118629456, "loss": 3.2399, "step": 74300 }, { "epoch": 8.002367882897428, "grad_norm": 0.8109171390533447, "learning_rate": 0.00012017455015623316, "loss": 3.1927, "step": 74350 }, { "epoch": 8.007749434937036, "grad_norm": 0.8452024459838867, "learning_rate": 0.00011985130912617175, "loss": 3.1479, "step": 74400 }, { "epoch": 8.013130986976645, "grad_norm": 0.8645105957984924, "learning_rate": 0.00011952806809611032, "loss": 3.152, "step": 74450 }, { "epoch": 8.018512539016251, "grad_norm": 0.8408581018447876, "learning_rate": 0.00011920482706604891, "loss": 3.1642, "step": 74500 }, { "epoch": 8.02389409105586, "grad_norm": 0.830633819103241, "learning_rate": 0.0001188815860359875, "loss": 3.1516, "step": 74550 }, { "epoch": 8.029275643095469, "grad_norm": 0.8915413022041321, "learning_rate": 0.00011855834500592608, "loss": 3.1711, "step": 74600 }, { "epoch": 8.034657195135077, "grad_norm": 0.8216363787651062, "learning_rate": 0.00011823510397586466, "loss": 3.1643, "step": 74650 }, { "epoch": 8.040038747174686, "grad_norm": 0.8459712266921997, "learning_rate": 0.00011791186294580325, "loss": 3.1667, "step": 74700 }, { "epoch": 8.045420299214294, "grad_norm": 0.8982348442077637, "learning_rate": 0.00011758862191574182, "loss": 3.16, "step": 74750 }, { "epoch": 8.050801851253901, "grad_norm": 0.8435618281364441, "learning_rate": 0.00011726538088568041, "loss": 3.1483, "step": 74800 }, { "epoch": 8.05618340329351, "grad_norm": 0.8488997220993042, "learning_rate": 0.000116942139855619, "loss": 3.15, "step": 74850 }, { "epoch": 8.061564955333118, "grad_norm": 0.8723399639129639, "learning_rate": 0.00011661889882555759, "loss": 3.1584, "step": 74900 }, { "epoch": 8.066946507372727, "grad_norm": 0.8550832271575928, "learning_rate": 0.00011629565779549616, "loss": 3.1725, "step": 74950 }, { "epoch": 8.072328059412335, "grad_norm": 0.8233960866928101, "learning_rate": 0.00011597241676543475, "loss": 3.1612, "step": 75000 }, { "epoch": 8.072328059412335, "eval_accuracy": 0.3904815289547451, "eval_loss": 3.3351383209228516, "eval_runtime": 179.2437, "eval_samples_per_second": 100.483, "eval_steps_per_second": 6.282, "step": 75000 }, { "epoch": 8.077709611451942, "grad_norm": 0.8487289547920227, "learning_rate": 0.00011564917573537335, "loss": 3.1645, "step": 75050 }, { "epoch": 8.08309116349155, "grad_norm": 0.8253041505813599, "learning_rate": 0.00011532593470531191, "loss": 3.1504, "step": 75100 }, { "epoch": 8.088472715531159, "grad_norm": 0.8512349128723145, "learning_rate": 0.0001150026936752505, "loss": 3.1535, "step": 75150 }, { "epoch": 8.093854267570768, "grad_norm": 0.8260550498962402, "learning_rate": 0.00011467945264518909, "loss": 3.1583, "step": 75200 }, { "epoch": 8.099235819610376, "grad_norm": 0.8708517551422119, "learning_rate": 0.00011435621161512766, "loss": 3.1757, "step": 75250 }, { "epoch": 8.104617371649983, "grad_norm": 0.9398133158683777, "learning_rate": 0.00011403297058506625, "loss": 3.1491, "step": 75300 }, { "epoch": 8.109998923689592, "grad_norm": 0.8403450846672058, "learning_rate": 0.00011370972955500485, "loss": 3.159, "step": 75350 }, { "epoch": 8.1153804757292, "grad_norm": 0.8401364684104919, "learning_rate": 0.00011338648852494343, "loss": 3.1786, "step": 75400 }, { "epoch": 8.120762027768809, "grad_norm": 0.8833390474319458, "learning_rate": 0.00011306324749488201, "loss": 3.1712, "step": 75450 }, { "epoch": 8.126143579808417, "grad_norm": 0.866776704788208, "learning_rate": 0.00011274000646482059, "loss": 3.1578, "step": 75500 }, { "epoch": 8.131525131848026, "grad_norm": 0.8826317191123962, "learning_rate": 0.00011241676543475918, "loss": 3.1767, "step": 75550 }, { "epoch": 8.136906683887632, "grad_norm": 0.8661509156227112, "learning_rate": 0.00011209352440469775, "loss": 3.1897, "step": 75600 }, { "epoch": 8.142288235927241, "grad_norm": 0.8728544116020203, "learning_rate": 0.00011177028337463635, "loss": 3.1767, "step": 75650 }, { "epoch": 8.14766978796685, "grad_norm": 0.868950366973877, "learning_rate": 0.00011144704234457493, "loss": 3.1763, "step": 75700 }, { "epoch": 8.153051340006458, "grad_norm": 0.8438011407852173, "learning_rate": 0.00011112380131451351, "loss": 3.1782, "step": 75750 }, { "epoch": 8.158432892046067, "grad_norm": 0.9406909942626953, "learning_rate": 0.00011080056028445209, "loss": 3.1721, "step": 75800 }, { "epoch": 8.163814444085673, "grad_norm": 0.8312217593193054, "learning_rate": 0.00011047731925439068, "loss": 3.1815, "step": 75850 }, { "epoch": 8.169195996125282, "grad_norm": 0.8989969491958618, "learning_rate": 0.00011015407822432928, "loss": 3.1747, "step": 75900 }, { "epoch": 8.17457754816489, "grad_norm": 0.8258314728736877, "learning_rate": 0.00010983083719426785, "loss": 3.1706, "step": 75950 }, { "epoch": 8.1799591002045, "grad_norm": 0.927180826663971, "learning_rate": 0.00010950759616420644, "loss": 3.1731, "step": 76000 }, { "epoch": 8.1799591002045, "eval_accuracy": 0.390790211809026, "eval_loss": 3.3343276977539062, "eval_runtime": 179.2381, "eval_samples_per_second": 100.486, "eval_steps_per_second": 6.282, "step": 76000 }, { "epoch": 8.185340652244108, "grad_norm": 0.8896839618682861, "learning_rate": 0.00010918435513414502, "loss": 3.1869, "step": 76050 }, { "epoch": 8.190722204283716, "grad_norm": 0.8260973691940308, "learning_rate": 0.00010886111410408359, "loss": 3.163, "step": 76100 }, { "epoch": 8.196103756323323, "grad_norm": 0.8701211810112, "learning_rate": 0.00010853787307402218, "loss": 3.1757, "step": 76150 }, { "epoch": 8.201485308362932, "grad_norm": 0.8848615288734436, "learning_rate": 0.00010821463204396078, "loss": 3.1815, "step": 76200 }, { "epoch": 8.20686686040254, "grad_norm": 0.9153262972831726, "learning_rate": 0.00010789139101389935, "loss": 3.1656, "step": 76250 }, { "epoch": 8.212248412442149, "grad_norm": 0.8344658017158508, "learning_rate": 0.00010756814998383794, "loss": 3.1613, "step": 76300 }, { "epoch": 8.217629964481757, "grad_norm": 0.8095808625221252, "learning_rate": 0.00010724490895377652, "loss": 3.1623, "step": 76350 }, { "epoch": 8.223011516521364, "grad_norm": 0.832159161567688, "learning_rate": 0.00010692166792371512, "loss": 3.1755, "step": 76400 }, { "epoch": 8.228393068560973, "grad_norm": 0.8940076231956482, "learning_rate": 0.00010659842689365368, "loss": 3.1735, "step": 76450 }, { "epoch": 8.233774620600581, "grad_norm": 0.8475741744041443, "learning_rate": 0.00010627518586359228, "loss": 3.1722, "step": 76500 }, { "epoch": 8.23915617264019, "grad_norm": 0.875935435295105, "learning_rate": 0.00010595194483353086, "loss": 3.1751, "step": 76550 }, { "epoch": 8.244537724679798, "grad_norm": 0.8820996880531311, "learning_rate": 0.00010562870380346944, "loss": 3.1791, "step": 76600 }, { "epoch": 8.249919276719407, "grad_norm": 0.9192360639572144, "learning_rate": 0.00010530546277340802, "loss": 3.1746, "step": 76650 }, { "epoch": 8.255300828759013, "grad_norm": 0.9355788826942444, "learning_rate": 0.00010498222174334662, "loss": 3.1643, "step": 76700 }, { "epoch": 8.260682380798622, "grad_norm": 0.8911601901054382, "learning_rate": 0.00010465898071328519, "loss": 3.1752, "step": 76750 }, { "epoch": 8.26606393283823, "grad_norm": 1.0382132530212402, "learning_rate": 0.00010433573968322378, "loss": 3.1726, "step": 76800 }, { "epoch": 8.27144548487784, "grad_norm": 0.8596870303153992, "learning_rate": 0.00010401249865316237, "loss": 3.1892, "step": 76850 }, { "epoch": 8.276827036917448, "grad_norm": 0.8965983986854553, "learning_rate": 0.00010368925762310096, "loss": 3.1845, "step": 76900 }, { "epoch": 8.282208588957054, "grad_norm": 0.8177803754806519, "learning_rate": 0.00010336601659303954, "loss": 3.1748, "step": 76950 }, { "epoch": 8.287590140996663, "grad_norm": 0.8794350028038025, "learning_rate": 0.00010304277556297812, "loss": 3.1844, "step": 77000 }, { "epoch": 8.287590140996663, "eval_accuracy": 0.3909625352890998, "eval_loss": 3.330594301223755, "eval_runtime": 179.3195, "eval_samples_per_second": 100.441, "eval_steps_per_second": 6.279, "step": 77000 }, { "epoch": 8.292971693036272, "grad_norm": 0.8605765104293823, "learning_rate": 0.00010271953453291671, "loss": 3.1799, "step": 77050 }, { "epoch": 8.29835324507588, "grad_norm": 0.8603830337524414, "learning_rate": 0.00010239629350285528, "loss": 3.1891, "step": 77100 }, { "epoch": 8.303734797115489, "grad_norm": 0.8687741160392761, "learning_rate": 0.00010207305247279387, "loss": 3.1723, "step": 77150 }, { "epoch": 8.309116349155097, "grad_norm": 0.9100622534751892, "learning_rate": 0.00010174981144273246, "loss": 3.1788, "step": 77200 }, { "epoch": 8.314497901194704, "grad_norm": 0.9056679606437683, "learning_rate": 0.00010142657041267104, "loss": 3.1763, "step": 77250 }, { "epoch": 8.319879453234313, "grad_norm": 0.8805974721908569, "learning_rate": 0.00010110332938260962, "loss": 3.1736, "step": 77300 }, { "epoch": 8.325261005273921, "grad_norm": 0.8837348818778992, "learning_rate": 0.00010078008835254821, "loss": 3.178, "step": 77350 }, { "epoch": 8.33064255731353, "grad_norm": 0.981016218662262, "learning_rate": 0.0001004568473224868, "loss": 3.18, "step": 77400 }, { "epoch": 8.336024109353138, "grad_norm": 0.9569023847579956, "learning_rate": 0.00010013360629242537, "loss": 3.1678, "step": 77450 }, { "epoch": 8.341405661392745, "grad_norm": 0.9140768051147461, "learning_rate": 9.981036526236396e-05, "loss": 3.17, "step": 77500 }, { "epoch": 8.346787213432354, "grad_norm": 0.8736293911933899, "learning_rate": 9.948712423230255e-05, "loss": 3.1683, "step": 77550 }, { "epoch": 8.352168765471962, "grad_norm": 0.8517115116119385, "learning_rate": 9.916388320224112e-05, "loss": 3.199, "step": 77600 }, { "epoch": 8.35755031751157, "grad_norm": 0.8828803300857544, "learning_rate": 9.884064217217971e-05, "loss": 3.1824, "step": 77650 }, { "epoch": 8.36293186955118, "grad_norm": 0.840035617351532, "learning_rate": 9.851740114211831e-05, "loss": 3.1762, "step": 77700 }, { "epoch": 8.368313421590786, "grad_norm": 0.9291461110115051, "learning_rate": 9.819416011205688e-05, "loss": 3.1746, "step": 77750 }, { "epoch": 8.373694973630395, "grad_norm": 0.8464294075965881, "learning_rate": 9.78773839025967e-05, "loss": 3.1668, "step": 77800 }, { "epoch": 8.379076525670003, "grad_norm": 0.9900839924812317, "learning_rate": 9.755414287253529e-05, "loss": 3.1755, "step": 77850 }, { "epoch": 8.384458077709612, "grad_norm": 0.8731220960617065, "learning_rate": 9.723090184247386e-05, "loss": 3.1788, "step": 77900 }, { "epoch": 8.38983962974922, "grad_norm": 0.8841520547866821, "learning_rate": 9.690766081241245e-05, "loss": 3.1859, "step": 77950 }, { "epoch": 8.395221181788829, "grad_norm": 0.9122378826141357, "learning_rate": 9.658441978235103e-05, "loss": 3.1734, "step": 78000 }, { "epoch": 8.395221181788829, "eval_accuracy": 0.39124590202051995, "eval_loss": 3.3288753032684326, "eval_runtime": 179.1559, "eval_samples_per_second": 100.533, "eval_steps_per_second": 6.285, "step": 78000 }, { "epoch": 8.400602733828435, "grad_norm": 0.8795517086982727, "learning_rate": 9.626117875228961e-05, "loss": 3.1788, "step": 78050 }, { "epoch": 8.405984285868044, "grad_norm": 0.921349048614502, "learning_rate": 9.59379377222282e-05, "loss": 3.1791, "step": 78100 }, { "epoch": 8.411365837907653, "grad_norm": 0.920566976070404, "learning_rate": 9.561469669216679e-05, "loss": 3.1871, "step": 78150 }, { "epoch": 8.416747389947261, "grad_norm": 0.8672985434532166, "learning_rate": 9.529145566210537e-05, "loss": 3.1715, "step": 78200 }, { "epoch": 8.42212894198687, "grad_norm": 0.9214320778846741, "learning_rate": 9.496821463204395e-05, "loss": 3.1768, "step": 78250 }, { "epoch": 8.427510494026476, "grad_norm": 0.9051466584205627, "learning_rate": 9.464497360198253e-05, "loss": 3.1766, "step": 78300 }, { "epoch": 8.432892046066085, "grad_norm": 0.9038311243057251, "learning_rate": 9.432173257192113e-05, "loss": 3.1974, "step": 78350 }, { "epoch": 8.438273598105694, "grad_norm": 0.848071277141571, "learning_rate": 9.39984915418597e-05, "loss": 3.1722, "step": 78400 }, { "epoch": 8.443655150145302, "grad_norm": 0.8110135197639465, "learning_rate": 9.367525051179829e-05, "loss": 3.1781, "step": 78450 }, { "epoch": 8.44903670218491, "grad_norm": 0.8690963983535767, "learning_rate": 9.335200948173688e-05, "loss": 3.1759, "step": 78500 }, { "epoch": 8.45441825422452, "grad_norm": 0.8520603775978088, "learning_rate": 9.302876845167545e-05, "loss": 3.1921, "step": 78550 }, { "epoch": 8.459799806264126, "grad_norm": 0.8720440864562988, "learning_rate": 9.270552742161403e-05, "loss": 3.1896, "step": 78600 }, { "epoch": 8.465181358303735, "grad_norm": 0.9057508707046509, "learning_rate": 9.238228639155263e-05, "loss": 3.1796, "step": 78650 }, { "epoch": 8.470562910343343, "grad_norm": 0.894538938999176, "learning_rate": 9.205904536149122e-05, "loss": 3.1792, "step": 78700 }, { "epoch": 8.475944462382952, "grad_norm": 0.8490887880325317, "learning_rate": 9.173580433142979e-05, "loss": 3.1702, "step": 78750 }, { "epoch": 8.48132601442256, "grad_norm": 0.8573673963546753, "learning_rate": 9.141256330136838e-05, "loss": 3.1964, "step": 78800 }, { "epoch": 8.486707566462167, "grad_norm": 0.8426880240440369, "learning_rate": 9.108932227130697e-05, "loss": 3.181, "step": 78850 }, { "epoch": 8.492089118501776, "grad_norm": 0.893245279788971, "learning_rate": 9.076608124124555e-05, "loss": 3.1698, "step": 78900 }, { "epoch": 8.497470670541384, "grad_norm": 0.8832472562789917, "learning_rate": 9.044284021118413e-05, "loss": 3.1562, "step": 78950 }, { "epoch": 8.502852222580993, "grad_norm": 0.8996571898460388, "learning_rate": 9.011959918112272e-05, "loss": 3.1778, "step": 79000 }, { "epoch": 8.502852222580993, "eval_accuracy": 0.3916138006986598, "eval_loss": 3.324289560317993, "eval_runtime": 179.3247, "eval_samples_per_second": 100.438, "eval_steps_per_second": 6.279, "step": 79000 }, { "epoch": 8.508233774620601, "grad_norm": 0.8849459290504456, "learning_rate": 8.979635815106129e-05, "loss": 3.1795, "step": 79050 }, { "epoch": 8.513615326660208, "grad_norm": 0.8930593729019165, "learning_rate": 8.947311712099989e-05, "loss": 3.1814, "step": 79100 }, { "epoch": 8.518996878699816, "grad_norm": 0.8527243733406067, "learning_rate": 8.914987609093847e-05, "loss": 3.1858, "step": 79150 }, { "epoch": 8.524378430739425, "grad_norm": 0.9225887060165405, "learning_rate": 8.882663506087706e-05, "loss": 3.1935, "step": 79200 }, { "epoch": 8.529759982779034, "grad_norm": 0.9130849242210388, "learning_rate": 8.850339403081563e-05, "loss": 3.1808, "step": 79250 }, { "epoch": 8.535141534818642, "grad_norm": 0.9363095760345459, "learning_rate": 8.818015300075422e-05, "loss": 3.1824, "step": 79300 }, { "epoch": 8.54052308685825, "grad_norm": 0.9057344794273376, "learning_rate": 8.785691197069282e-05, "loss": 3.1788, "step": 79350 }, { "epoch": 8.545904638897857, "grad_norm": 0.8579224944114685, "learning_rate": 8.753367094063139e-05, "loss": 3.1908, "step": 79400 }, { "epoch": 8.551286190937466, "grad_norm": 0.9359004497528076, "learning_rate": 8.721042991056998e-05, "loss": 3.1648, "step": 79450 }, { "epoch": 8.556667742977075, "grad_norm": 0.8773726224899292, "learning_rate": 8.688718888050856e-05, "loss": 3.1892, "step": 79500 }, { "epoch": 8.562049295016683, "grad_norm": 0.8513708710670471, "learning_rate": 8.656394785044713e-05, "loss": 3.1951, "step": 79550 }, { "epoch": 8.567430847056292, "grad_norm": 0.8982970714569092, "learning_rate": 8.624070682038572e-05, "loss": 3.1756, "step": 79600 }, { "epoch": 8.572812399095898, "grad_norm": 0.8402177691459656, "learning_rate": 8.591746579032432e-05, "loss": 3.1707, "step": 79650 }, { "epoch": 8.578193951135507, "grad_norm": 0.8468958735466003, "learning_rate": 8.55942247602629e-05, "loss": 3.1854, "step": 79700 }, { "epoch": 8.583575503175116, "grad_norm": 0.8655433058738708, "learning_rate": 8.527098373020148e-05, "loss": 3.1686, "step": 79750 }, { "epoch": 8.588957055214724, "grad_norm": 0.8573452234268188, "learning_rate": 8.49542075207413e-05, "loss": 3.188, "step": 79800 }, { "epoch": 8.594338607254333, "grad_norm": 0.8844240307807922, "learning_rate": 8.463096649067987e-05, "loss": 3.175, "step": 79850 }, { "epoch": 8.599720159293941, "grad_norm": 0.8560530543327332, "learning_rate": 8.430772546061846e-05, "loss": 3.1862, "step": 79900 }, { "epoch": 8.605101711333548, "grad_norm": 0.9046381711959839, "learning_rate": 8.398448443055704e-05, "loss": 3.1704, "step": 79950 }, { "epoch": 8.610483263373157, "grad_norm": 0.9058465957641602, "learning_rate": 8.366124340049564e-05, "loss": 3.1943, "step": 80000 }, { "epoch": 8.610483263373157, "eval_accuracy": 0.3920069068467726, "eval_loss": 3.3214492797851562, "eval_runtime": 179.3139, "eval_samples_per_second": 100.444, "eval_steps_per_second": 6.279, "step": 80000 }, { "epoch": 8.615864815412765, "grad_norm": 0.8372693657875061, "learning_rate": 8.33380023704342e-05, "loss": 3.1987, "step": 80050 }, { "epoch": 8.621246367452374, "grad_norm": 0.851773202419281, "learning_rate": 8.30147613403728e-05, "loss": 3.1737, "step": 80100 }, { "epoch": 8.626627919491982, "grad_norm": 0.8636631369590759, "learning_rate": 8.269152031031138e-05, "loss": 3.1681, "step": 80150 }, { "epoch": 8.632009471531589, "grad_norm": 0.9294169545173645, "learning_rate": 8.236827928024996e-05, "loss": 3.175, "step": 80200 }, { "epoch": 8.637391023571197, "grad_norm": 0.8787388801574707, "learning_rate": 8.204503825018854e-05, "loss": 3.1673, "step": 80250 }, { "epoch": 8.642772575610806, "grad_norm": 0.9393197298049927, "learning_rate": 8.172179722012714e-05, "loss": 3.1781, "step": 80300 }, { "epoch": 8.648154127650415, "grad_norm": 0.8477320075035095, "learning_rate": 8.13985561900657e-05, "loss": 3.1866, "step": 80350 }, { "epoch": 8.653535679690023, "grad_norm": 0.8306042551994324, "learning_rate": 8.10753151600043e-05, "loss": 3.2008, "step": 80400 }, { "epoch": 8.658917231729632, "grad_norm": 0.8581592440605164, "learning_rate": 8.07520741299429e-05, "loss": 3.1938, "step": 80450 }, { "epoch": 8.664298783769238, "grad_norm": 0.9003025889396667, "learning_rate": 8.042883309988148e-05, "loss": 3.2003, "step": 80500 }, { "epoch": 8.669680335808847, "grad_norm": 0.8702462315559387, "learning_rate": 8.010559206982006e-05, "loss": 3.1826, "step": 80550 }, { "epoch": 8.675061887848456, "grad_norm": 0.9237415194511414, "learning_rate": 7.978235103975864e-05, "loss": 3.1745, "step": 80600 }, { "epoch": 8.680443439888064, "grad_norm": 0.8819509148597717, "learning_rate": 7.945911000969723e-05, "loss": 3.2113, "step": 80650 }, { "epoch": 8.685824991927673, "grad_norm": 0.8889045119285583, "learning_rate": 7.91358689796358e-05, "loss": 3.1955, "step": 80700 }, { "epoch": 8.69120654396728, "grad_norm": 0.8410822153091431, "learning_rate": 7.88126279495744e-05, "loss": 3.1846, "step": 80750 }, { "epoch": 8.696588096006888, "grad_norm": 0.8435183167457581, "learning_rate": 7.848938691951298e-05, "loss": 3.1723, "step": 80800 }, { "epoch": 8.701969648046497, "grad_norm": 0.8416341543197632, "learning_rate": 7.816614588945156e-05, "loss": 3.1905, "step": 80850 }, { "epoch": 8.707351200086105, "grad_norm": 0.8567610383033752, "learning_rate": 7.784290485939014e-05, "loss": 3.1796, "step": 80900 }, { "epoch": 8.712732752125714, "grad_norm": 0.8820018768310547, "learning_rate": 7.751966382932873e-05, "loss": 3.1988, "step": 80950 }, { "epoch": 8.718114304165322, "grad_norm": 0.9903234839439392, "learning_rate": 7.719642279926731e-05, "loss": 3.172, "step": 81000 }, { "epoch": 8.718114304165322, "eval_accuracy": 0.3921916167560572, "eval_loss": 3.3188509941101074, "eval_runtime": 179.3424, "eval_samples_per_second": 100.428, "eval_steps_per_second": 6.278, "step": 81000 }, { "epoch": 8.723495856204929, "grad_norm": 0.8541567921638489, "learning_rate": 7.68731817692059e-05, "loss": 3.1764, "step": 81050 }, { "epoch": 8.728877408244538, "grad_norm": 0.8884470462799072, "learning_rate": 7.654994073914448e-05, "loss": 3.1947, "step": 81100 }, { "epoch": 8.734258960284146, "grad_norm": 0.9023048877716064, "learning_rate": 7.622669970908307e-05, "loss": 3.1933, "step": 81150 }, { "epoch": 8.739640512323755, "grad_norm": 0.8709390163421631, "learning_rate": 7.590345867902164e-05, "loss": 3.1878, "step": 81200 }, { "epoch": 8.745022064363363, "grad_norm": 0.8721332550048828, "learning_rate": 7.558021764896023e-05, "loss": 3.1814, "step": 81250 }, { "epoch": 8.75040361640297, "grad_norm": 0.8687638640403748, "learning_rate": 7.525697661889883e-05, "loss": 3.1815, "step": 81300 }, { "epoch": 8.755785168442578, "grad_norm": 0.8232962489128113, "learning_rate": 7.493373558883741e-05, "loss": 3.1965, "step": 81350 }, { "epoch": 8.761166720482187, "grad_norm": 0.9516159296035767, "learning_rate": 7.461049455877599e-05, "loss": 3.177, "step": 81400 }, { "epoch": 8.766548272521796, "grad_norm": 0.8718137741088867, "learning_rate": 7.428725352871457e-05, "loss": 3.1737, "step": 81450 }, { "epoch": 8.771929824561404, "grad_norm": 0.9087874889373779, "learning_rate": 7.396401249865315e-05, "loss": 3.1779, "step": 81500 }, { "epoch": 8.777311376601011, "grad_norm": 0.8435814380645752, "learning_rate": 7.364077146859173e-05, "loss": 3.1815, "step": 81550 }, { "epoch": 8.78269292864062, "grad_norm": 0.9062038064002991, "learning_rate": 7.331753043853033e-05, "loss": 3.1745, "step": 81600 }, { "epoch": 8.788074480680228, "grad_norm": 0.8888091444969177, "learning_rate": 7.299428940846891e-05, "loss": 3.1747, "step": 81650 }, { "epoch": 8.793456032719837, "grad_norm": 0.8553709983825684, "learning_rate": 7.267751319900872e-05, "loss": 3.171, "step": 81700 }, { "epoch": 8.798837584759445, "grad_norm": 0.9256872534751892, "learning_rate": 7.235427216894731e-05, "loss": 3.1893, "step": 81750 }, { "epoch": 8.804219136799054, "grad_norm": 0.8422934412956238, "learning_rate": 7.203103113888589e-05, "loss": 3.1736, "step": 81800 }, { "epoch": 8.80960068883866, "grad_norm": 0.9026160836219788, "learning_rate": 7.170779010882447e-05, "loss": 3.1941, "step": 81850 }, { "epoch": 8.814982240878269, "grad_norm": 0.9136305451393127, "learning_rate": 7.138454907876305e-05, "loss": 3.1815, "step": 81900 }, { "epoch": 8.820363792917878, "grad_norm": 0.8973026871681213, "learning_rate": 7.106130804870164e-05, "loss": 3.1921, "step": 81950 }, { "epoch": 8.825745344957486, "grad_norm": 0.9524576663970947, "learning_rate": 7.073806701864023e-05, "loss": 3.183, "step": 82000 }, { "epoch": 8.825745344957486, "eval_accuracy": 0.39272325533613345, "eval_loss": 3.3131747245788574, "eval_runtime": 178.9237, "eval_samples_per_second": 100.663, "eval_steps_per_second": 6.293, "step": 82000 }, { "epoch": 8.831126896997095, "grad_norm": 0.8254840970039368, "learning_rate": 7.041482598857881e-05, "loss": 3.1827, "step": 82050 }, { "epoch": 8.836508449036701, "grad_norm": 0.9229602813720703, "learning_rate": 7.009158495851739e-05, "loss": 3.1795, "step": 82100 }, { "epoch": 8.84189000107631, "grad_norm": 0.8392314910888672, "learning_rate": 6.976834392845599e-05, "loss": 3.1832, "step": 82150 }, { "epoch": 8.847271553115919, "grad_norm": 0.8893462419509888, "learning_rate": 6.944510289839457e-05, "loss": 3.1912, "step": 82200 }, { "epoch": 8.852653105155527, "grad_norm": 0.9405621886253357, "learning_rate": 6.912186186833315e-05, "loss": 3.2126, "step": 82250 }, { "epoch": 8.858034657195136, "grad_norm": 0.907820463180542, "learning_rate": 6.879862083827173e-05, "loss": 3.1976, "step": 82300 }, { "epoch": 8.863416209234742, "grad_norm": 0.8853485584259033, "learning_rate": 6.847537980821031e-05, "loss": 3.195, "step": 82350 }, { "epoch": 8.868797761274351, "grad_norm": 0.9410234093666077, "learning_rate": 6.81521387781489e-05, "loss": 3.1846, "step": 82400 }, { "epoch": 8.87417931331396, "grad_norm": 0.9512822031974792, "learning_rate": 6.782889774808749e-05, "loss": 3.1878, "step": 82450 }, { "epoch": 8.879560865353568, "grad_norm": 0.8625288605690002, "learning_rate": 6.750565671802607e-05, "loss": 3.1966, "step": 82500 }, { "epoch": 8.884942417393177, "grad_norm": 0.8903507590293884, "learning_rate": 6.718241568796465e-05, "loss": 3.1821, "step": 82550 }, { "epoch": 8.890323969432785, "grad_norm": 0.8716627955436707, "learning_rate": 6.685917465790323e-05, "loss": 3.1615, "step": 82600 }, { "epoch": 8.895705521472392, "grad_norm": 0.8321760296821594, "learning_rate": 6.653593362784182e-05, "loss": 3.1666, "step": 82650 }, { "epoch": 8.901087073512, "grad_norm": 0.8693181872367859, "learning_rate": 6.62126925977804e-05, "loss": 3.1736, "step": 82700 }, { "epoch": 8.906468625551609, "grad_norm": 0.913057804107666, "learning_rate": 6.588945156771899e-05, "loss": 3.1825, "step": 82750 }, { "epoch": 8.911850177591218, "grad_norm": 0.8574439287185669, "learning_rate": 6.556621053765757e-05, "loss": 3.1762, "step": 82800 }, { "epoch": 8.917231729630826, "grad_norm": 0.8727285265922546, "learning_rate": 6.524296950759615e-05, "loss": 3.1624, "step": 82850 }, { "epoch": 8.922613281670433, "grad_norm": 0.8932309150695801, "learning_rate": 6.491972847753474e-05, "loss": 3.187, "step": 82900 }, { "epoch": 8.927994833710041, "grad_norm": 0.8489239811897278, "learning_rate": 6.459648744747333e-05, "loss": 3.1765, "step": 82950 }, { "epoch": 8.93337638574965, "grad_norm": 0.8805703520774841, "learning_rate": 6.427324641741192e-05, "loss": 3.1623, "step": 83000 }, { "epoch": 8.93337638574965, "eval_accuracy": 0.3930642080980953, "eval_loss": 3.311410427093506, "eval_runtime": 179.2517, "eval_samples_per_second": 100.479, "eval_steps_per_second": 6.282, "step": 83000 }, { "epoch": 8.938757937789259, "grad_norm": 0.9038493037223816, "learning_rate": 6.39500053873505e-05, "loss": 3.1784, "step": 83050 }, { "epoch": 8.944139489828867, "grad_norm": 0.8922439813613892, "learning_rate": 6.362676435728908e-05, "loss": 3.185, "step": 83100 }, { "epoch": 8.949521041868476, "grad_norm": 0.8802945613861084, "learning_rate": 6.330352332722766e-05, "loss": 3.179, "step": 83150 }, { "epoch": 8.954902593908082, "grad_norm": 0.8873797655105591, "learning_rate": 6.298028229716624e-05, "loss": 3.198, "step": 83200 }, { "epoch": 8.960284145947691, "grad_norm": 0.8683014512062073, "learning_rate": 6.265704126710484e-05, "loss": 3.1793, "step": 83250 }, { "epoch": 8.9656656979873, "grad_norm": 0.8842764496803284, "learning_rate": 6.233380023704342e-05, "loss": 3.1674, "step": 83300 }, { "epoch": 8.971047250026908, "grad_norm": 0.9601628184318542, "learning_rate": 6.2010559206982e-05, "loss": 3.1691, "step": 83350 }, { "epoch": 8.976428802066517, "grad_norm": 0.9314997792243958, "learning_rate": 6.168731817692058e-05, "loss": 3.1754, "step": 83400 }, { "epoch": 8.981810354106123, "grad_norm": 0.9326640367507935, "learning_rate": 6.136407714685916e-05, "loss": 3.2023, "step": 83450 }, { "epoch": 8.987191906145732, "grad_norm": 0.8711418509483337, "learning_rate": 6.104083611679776e-05, "loss": 3.1784, "step": 83500 }, { "epoch": 8.99257345818534, "grad_norm": 0.8587135672569275, "learning_rate": 6.071759508673634e-05, "loss": 3.2018, "step": 83550 }, { "epoch": 8.997955010224949, "grad_norm": 0.9234573841094971, "learning_rate": 6.039435405667492e-05, "loss": 3.1767, "step": 83600 }, { "epoch": 9.003336562264558, "grad_norm": 0.8794888257980347, "learning_rate": 6.007111302661351e-05, "loss": 3.1481, "step": 83650 }, { "epoch": 9.008718114304166, "grad_norm": 0.8656247854232788, "learning_rate": 5.974787199655209e-05, "loss": 3.123, "step": 83700 }, { "epoch": 9.014099666343773, "grad_norm": 0.8666937947273254, "learning_rate": 5.942463096649068e-05, "loss": 3.1186, "step": 83750 }, { "epoch": 9.019481218383381, "grad_norm": 0.8951687216758728, "learning_rate": 5.910138993642926e-05, "loss": 3.1233, "step": 83800 }, { "epoch": 9.02486277042299, "grad_norm": 0.8769862651824951, "learning_rate": 5.877814890636784e-05, "loss": 3.1218, "step": 83850 }, { "epoch": 9.030244322462599, "grad_norm": 0.8752743601799011, "learning_rate": 5.845490787630643e-05, "loss": 3.115, "step": 83900 }, { "epoch": 9.035625874502207, "grad_norm": 0.8764790296554565, "learning_rate": 5.813166684624501e-05, "loss": 3.1174, "step": 83950 }, { "epoch": 9.041007426541814, "grad_norm": 0.8726732134819031, "learning_rate": 5.7808425816183596e-05, "loss": 3.1248, "step": 84000 }, { "epoch": 9.041007426541814, "eval_accuracy": 0.3929663118461745, "eval_loss": 3.3138480186462402, "eval_runtime": 179.2237, "eval_samples_per_second": 100.495, "eval_steps_per_second": 6.283, "step": 84000 }, { "epoch": 9.046388978581422, "grad_norm": 0.8422228097915649, "learning_rate": 5.748518478612218e-05, "loss": 3.1182, "step": 84050 }, { "epoch": 9.051770530621031, "grad_norm": 0.9051665663719177, "learning_rate": 5.716194375606076e-05, "loss": 3.1208, "step": 84100 }, { "epoch": 9.05715208266064, "grad_norm": 0.8868678212165833, "learning_rate": 5.6838702725999346e-05, "loss": 3.1158, "step": 84150 }, { "epoch": 9.062533634700248, "grad_norm": 0.8721837997436523, "learning_rate": 5.651546169593793e-05, "loss": 3.1164, "step": 84200 }, { "epoch": 9.067915186739857, "grad_norm": 0.8881808519363403, "learning_rate": 5.619222066587652e-05, "loss": 3.1168, "step": 84250 }, { "epoch": 9.073296738779463, "grad_norm": 0.8928927183151245, "learning_rate": 5.58689796358151e-05, "loss": 3.1271, "step": 84300 }, { "epoch": 9.078678290819072, "grad_norm": 0.8700510263442993, "learning_rate": 5.554573860575369e-05, "loss": 3.1413, "step": 84350 }, { "epoch": 9.08405984285868, "grad_norm": 0.9075739979743958, "learning_rate": 5.522249757569227e-05, "loss": 3.1263, "step": 84400 }, { "epoch": 9.089441394898289, "grad_norm": 0.8832513093948364, "learning_rate": 5.489925654563085e-05, "loss": 3.1139, "step": 84450 }, { "epoch": 9.094822946937898, "grad_norm": 0.8535157442092896, "learning_rate": 5.457601551556944e-05, "loss": 3.1316, "step": 84500 }, { "epoch": 9.100204498977504, "grad_norm": 0.8854033946990967, "learning_rate": 5.425277448550802e-05, "loss": 3.1223, "step": 84550 }, { "epoch": 9.105586051017113, "grad_norm": 0.8909099102020264, "learning_rate": 5.392953345544661e-05, "loss": 3.1212, "step": 84600 }, { "epoch": 9.110967603056721, "grad_norm": 0.862286388874054, "learning_rate": 5.360629242538519e-05, "loss": 3.1178, "step": 84650 }, { "epoch": 9.11634915509633, "grad_norm": 0.8904756903648376, "learning_rate": 5.328305139532377e-05, "loss": 3.1226, "step": 84700 }, { "epoch": 9.121730707135939, "grad_norm": 0.8563750982284546, "learning_rate": 5.295981036526236e-05, "loss": 3.1087, "step": 84750 }, { "epoch": 9.127112259175545, "grad_norm": 0.8712977170944214, "learning_rate": 5.263656933520094e-05, "loss": 3.1361, "step": 84800 }, { "epoch": 9.132493811215154, "grad_norm": 0.8745869994163513, "learning_rate": 5.231332830513953e-05, "loss": 3.1236, "step": 84850 }, { "epoch": 9.137875363254762, "grad_norm": 0.8896132111549377, "learning_rate": 5.199008727507811e-05, "loss": 3.1296, "step": 84900 }, { "epoch": 9.143256915294371, "grad_norm": 0.9015775918960571, "learning_rate": 5.166684624501669e-05, "loss": 3.1169, "step": 84950 }, { "epoch": 9.14863846733398, "grad_norm": 0.84309321641922, "learning_rate": 5.1343605214955286e-05, "loss": 3.1268, "step": 85000 }, { "epoch": 9.14863846733398, "eval_accuracy": 0.39307268302334486, "eval_loss": 3.313570737838745, "eval_runtime": 179.0955, "eval_samples_per_second": 100.566, "eval_steps_per_second": 6.287, "step": 85000 }, { "epoch": 9.154020019373588, "grad_norm": 0.893982470035553, "learning_rate": 5.102036418489387e-05, "loss": 3.1286, "step": 85050 }, { "epoch": 9.159401571413195, "grad_norm": 0.8679631352424622, "learning_rate": 5.0697123154832455e-05, "loss": 3.1442, "step": 85100 }, { "epoch": 9.164783123452803, "grad_norm": 0.901399552822113, "learning_rate": 5.0373882124771036e-05, "loss": 3.13, "step": 85150 }, { "epoch": 9.170164675492412, "grad_norm": 0.8721624612808228, "learning_rate": 5.005064109470962e-05, "loss": 3.1227, "step": 85200 }, { "epoch": 9.17554622753202, "grad_norm": 0.8802534341812134, "learning_rate": 4.9727400064648205e-05, "loss": 3.1197, "step": 85250 }, { "epoch": 9.180927779571629, "grad_norm": 0.8530134558677673, "learning_rate": 4.9404159034586786e-05, "loss": 3.1277, "step": 85300 }, { "epoch": 9.186309331611236, "grad_norm": 0.9548059701919556, "learning_rate": 4.9080918004525374e-05, "loss": 3.1363, "step": 85350 }, { "epoch": 9.191690883650844, "grad_norm": 0.8883795142173767, "learning_rate": 4.8757676974463955e-05, "loss": 3.1206, "step": 85400 }, { "epoch": 9.197072435690453, "grad_norm": 0.9452154636383057, "learning_rate": 4.8434435944402536e-05, "loss": 3.1301, "step": 85450 }, { "epoch": 9.202453987730062, "grad_norm": 0.84602290391922, "learning_rate": 4.8111194914341124e-05, "loss": 3.1332, "step": 85500 }, { "epoch": 9.20783553976967, "grad_norm": 0.8860790729522705, "learning_rate": 4.7787953884279705e-05, "loss": 3.109, "step": 85550 }, { "epoch": 9.213217091809279, "grad_norm": 0.928162157535553, "learning_rate": 4.746471285421829e-05, "loss": 3.125, "step": 85600 }, { "epoch": 9.218598643848885, "grad_norm": 0.9089372754096985, "learning_rate": 4.7141471824156874e-05, "loss": 3.1375, "step": 85650 }, { "epoch": 9.223980195888494, "grad_norm": 0.8874738812446594, "learning_rate": 4.682469561469669e-05, "loss": 3.1166, "step": 85700 }, { "epoch": 9.229361747928102, "grad_norm": 0.9361822605133057, "learning_rate": 4.650145458463527e-05, "loss": 3.1335, "step": 85750 }, { "epoch": 9.234743299967711, "grad_norm": 0.9551217555999756, "learning_rate": 4.6178213554573856e-05, "loss": 3.1439, "step": 85800 }, { "epoch": 9.24012485200732, "grad_norm": 0.8974487781524658, "learning_rate": 4.585497252451244e-05, "loss": 3.1352, "step": 85850 }, { "epoch": 9.245506404046926, "grad_norm": 0.8875505328178406, "learning_rate": 4.553173149445102e-05, "loss": 3.1341, "step": 85900 }, { "epoch": 9.250887956086535, "grad_norm": 0.8941646814346313, "learning_rate": 4.520849046438961e-05, "loss": 3.1149, "step": 85950 }, { "epoch": 9.256269508126143, "grad_norm": 0.873041033744812, "learning_rate": 4.488524943432819e-05, "loss": 3.1174, "step": 86000 }, { "epoch": 9.256269508126143, "eval_accuracy": 0.3934670930061114, "eval_loss": 3.311234712600708, "eval_runtime": 179.4331, "eval_samples_per_second": 100.377, "eval_steps_per_second": 6.275, "step": 86000 }, { "epoch": 9.261651060165752, "grad_norm": 0.9252175092697144, "learning_rate": 4.456200840426678e-05, "loss": 3.1298, "step": 86050 }, { "epoch": 9.26703261220536, "grad_norm": 0.8891488313674927, "learning_rate": 4.4238767374205363e-05, "loss": 3.118, "step": 86100 }, { "epoch": 9.272414164244967, "grad_norm": 0.880336582660675, "learning_rate": 4.3915526344143945e-05, "loss": 3.1297, "step": 86150 }, { "epoch": 9.277795716284576, "grad_norm": 0.917701005935669, "learning_rate": 4.359228531408253e-05, "loss": 3.13, "step": 86200 }, { "epoch": 9.283177268324184, "grad_norm": 0.8821280598640442, "learning_rate": 4.3269044284021114e-05, "loss": 3.1461, "step": 86250 }, { "epoch": 9.288558820363793, "grad_norm": 0.8745120763778687, "learning_rate": 4.29458032539597e-05, "loss": 3.1313, "step": 86300 }, { "epoch": 9.293940372403402, "grad_norm": 0.8729362487792969, "learning_rate": 4.262256222389828e-05, "loss": 3.128, "step": 86350 }, { "epoch": 9.29932192444301, "grad_norm": 0.8997379541397095, "learning_rate": 4.229932119383687e-05, "loss": 3.1428, "step": 86400 }, { "epoch": 9.304703476482617, "grad_norm": 0.8468843698501587, "learning_rate": 4.197608016377545e-05, "loss": 3.1149, "step": 86450 }, { "epoch": 9.310085028522225, "grad_norm": 0.9072235822677612, "learning_rate": 4.165283913371403e-05, "loss": 3.1408, "step": 86500 }, { "epoch": 9.315466580561834, "grad_norm": 0.871537446975708, "learning_rate": 4.132959810365262e-05, "loss": 3.1476, "step": 86550 }, { "epoch": 9.320848132601443, "grad_norm": 0.8553623557090759, "learning_rate": 4.10063570735912e-05, "loss": 3.134, "step": 86600 }, { "epoch": 9.326229684641051, "grad_norm": 0.8827656507492065, "learning_rate": 4.068311604352979e-05, "loss": 3.1345, "step": 86650 }, { "epoch": 9.331611236680658, "grad_norm": 0.9275478720664978, "learning_rate": 4.035987501346837e-05, "loss": 3.1266, "step": 86700 }, { "epoch": 9.336992788720266, "grad_norm": 0.8909872174263, "learning_rate": 4.003663398340695e-05, "loss": 3.124, "step": 86750 }, { "epoch": 9.342374340759875, "grad_norm": 0.8755511045455933, "learning_rate": 3.9713392953345546e-05, "loss": 3.1287, "step": 86800 }, { "epoch": 9.347755892799483, "grad_norm": 0.944328784942627, "learning_rate": 3.939015192328413e-05, "loss": 3.1187, "step": 86850 }, { "epoch": 9.353137444839092, "grad_norm": 0.9090091586112976, "learning_rate": 3.9066910893222715e-05, "loss": 3.1165, "step": 86900 }, { "epoch": 9.3585189968787, "grad_norm": 0.8840731382369995, "learning_rate": 3.8743669863161296e-05, "loss": 3.1364, "step": 86950 }, { "epoch": 9.363900548918307, "grad_norm": 0.8947219252586365, "learning_rate": 3.842042883309988e-05, "loss": 3.1296, "step": 87000 }, { "epoch": 9.363900548918307, "eval_accuracy": 0.3936966765580634, "eval_loss": 3.3083441257476807, "eval_runtime": 179.1237, "eval_samples_per_second": 100.551, "eval_steps_per_second": 6.286, "step": 87000 }, { "epoch": 9.369282100957916, "grad_norm": 0.8991856575012207, "learning_rate": 3.8097187803038465e-05, "loss": 3.1312, "step": 87050 }, { "epoch": 9.374663652997524, "grad_norm": 0.8899461627006531, "learning_rate": 3.7773946772977047e-05, "loss": 3.1328, "step": 87100 }, { "epoch": 9.380045205037133, "grad_norm": 0.8527735471725464, "learning_rate": 3.745070574291563e-05, "loss": 3.1389, "step": 87150 }, { "epoch": 9.385426757076742, "grad_norm": 0.9347635507583618, "learning_rate": 3.7127464712854216e-05, "loss": 3.149, "step": 87200 }, { "epoch": 9.390808309116348, "grad_norm": 0.8766456246376038, "learning_rate": 3.6804223682792803e-05, "loss": 3.1307, "step": 87250 }, { "epoch": 9.396189861155957, "grad_norm": 0.8890450596809387, "learning_rate": 3.6480982652731385e-05, "loss": 3.139, "step": 87300 }, { "epoch": 9.401571413195565, "grad_norm": 0.8748633861541748, "learning_rate": 3.6157741622669966e-05, "loss": 3.1314, "step": 87350 }, { "epoch": 9.406952965235174, "grad_norm": 0.8773558139801025, "learning_rate": 3.5834500592608554e-05, "loss": 3.1432, "step": 87400 }, { "epoch": 9.412334517274783, "grad_norm": 0.9081812500953674, "learning_rate": 3.5511259562547135e-05, "loss": 3.1529, "step": 87450 }, { "epoch": 9.417716069314391, "grad_norm": 0.883679211139679, "learning_rate": 3.518801853248572e-05, "loss": 3.121, "step": 87500 }, { "epoch": 9.423097621353998, "grad_norm": 0.8753542900085449, "learning_rate": 3.4864777502424304e-05, "loss": 3.1457, "step": 87550 }, { "epoch": 9.428479173393606, "grad_norm": 0.8767165541648865, "learning_rate": 3.4541536472362885e-05, "loss": 3.1574, "step": 87600 }, { "epoch": 9.433860725433215, "grad_norm": 0.9559025764465332, "learning_rate": 3.421829544230147e-05, "loss": 3.1416, "step": 87650 }, { "epoch": 9.439242277472824, "grad_norm": 0.8750273585319519, "learning_rate": 3.3901519232841286e-05, "loss": 3.1225, "step": 87700 }, { "epoch": 9.444623829512432, "grad_norm": 0.9168546199798584, "learning_rate": 3.3578278202779874e-05, "loss": 3.1216, "step": 87750 }, { "epoch": 9.450005381552039, "grad_norm": 0.8629895448684692, "learning_rate": 3.3255037172718455e-05, "loss": 3.1191, "step": 87800 }, { "epoch": 9.455386933591647, "grad_norm": 0.8742921948432922, "learning_rate": 3.2931796142657036e-05, "loss": 3.1403, "step": 87850 }, { "epoch": 9.460768485631256, "grad_norm": 0.9085738062858582, "learning_rate": 3.2608555112595624e-05, "loss": 3.1228, "step": 87900 }, { "epoch": 9.466150037670864, "grad_norm": 0.8764187097549438, "learning_rate": 3.2285314082534205e-05, "loss": 3.1336, "step": 87950 }, { "epoch": 9.471531589710473, "grad_norm": 0.8770283460617065, "learning_rate": 3.196207305247279e-05, "loss": 3.1201, "step": 88000 }, { "epoch": 9.471531589710473, "eval_accuracy": 0.3938910565743635, "eval_loss": 3.3071236610412598, "eval_runtime": 179.2724, "eval_samples_per_second": 100.467, "eval_steps_per_second": 6.281, "step": 88000 }, { "epoch": 9.476913141750082, "grad_norm": 0.8615807294845581, "learning_rate": 3.1638832022411374e-05, "loss": 3.1407, "step": 88050 }, { "epoch": 9.482294693789688, "grad_norm": 0.9012972116470337, "learning_rate": 3.1315590992349955e-05, "loss": 3.1261, "step": 88100 }, { "epoch": 9.487676245829297, "grad_norm": 0.9230588674545288, "learning_rate": 3.099234996228854e-05, "loss": 3.1169, "step": 88150 }, { "epoch": 9.493057797868905, "grad_norm": 0.9576355218887329, "learning_rate": 3.066910893222713e-05, "loss": 3.1357, "step": 88200 }, { "epoch": 9.498439349908514, "grad_norm": 0.8960409164428711, "learning_rate": 3.0345867902165712e-05, "loss": 3.1261, "step": 88250 }, { "epoch": 9.503820901948123, "grad_norm": 0.8987086415290833, "learning_rate": 3.00226268721043e-05, "loss": 3.1353, "step": 88300 }, { "epoch": 9.50920245398773, "grad_norm": 0.8619529008865356, "learning_rate": 2.9699385842042878e-05, "loss": 3.1268, "step": 88350 }, { "epoch": 9.514584006027338, "grad_norm": 0.8624460101127625, "learning_rate": 2.9376144811981465e-05, "loss": 3.1297, "step": 88400 }, { "epoch": 9.519965558066946, "grad_norm": 0.9188991189002991, "learning_rate": 2.905290378192005e-05, "loss": 3.1352, "step": 88450 }, { "epoch": 9.525347110106555, "grad_norm": 0.9187957048416138, "learning_rate": 2.8729662751858634e-05, "loss": 3.1134, "step": 88500 }, { "epoch": 9.530728662146164, "grad_norm": 0.9001445174217224, "learning_rate": 2.840642172179722e-05, "loss": 3.1202, "step": 88550 }, { "epoch": 9.536110214185772, "grad_norm": 0.8827770352363586, "learning_rate": 2.80831806917358e-05, "loss": 3.1317, "step": 88600 }, { "epoch": 9.541491766225379, "grad_norm": 0.8941860795021057, "learning_rate": 2.7759939661674384e-05, "loss": 3.1365, "step": 88650 }, { "epoch": 9.546873318264987, "grad_norm": 0.8406631350517273, "learning_rate": 2.743669863161297e-05, "loss": 3.1199, "step": 88700 }, { "epoch": 9.552254870304596, "grad_norm": 0.8941790461540222, "learning_rate": 2.7113457601551557e-05, "loss": 3.1334, "step": 88750 }, { "epoch": 9.557636422344205, "grad_norm": 0.8630831241607666, "learning_rate": 2.679021657149014e-05, "loss": 3.1302, "step": 88800 }, { "epoch": 9.563017974383813, "grad_norm": 0.9254708886146545, "learning_rate": 2.6466975541428722e-05, "loss": 3.1282, "step": 88850 }, { "epoch": 9.56839952642342, "grad_norm": 0.8795071244239807, "learning_rate": 2.6143734511367307e-05, "loss": 3.1302, "step": 88900 }, { "epoch": 9.573781078463028, "grad_norm": 0.9612981081008911, "learning_rate": 2.582049348130589e-05, "loss": 3.1315, "step": 88950 }, { "epoch": 9.579162630502637, "grad_norm": 0.8750224113464355, "learning_rate": 2.5497252451244476e-05, "loss": 3.1288, "step": 89000 }, { "epoch": 9.579162630502637, "eval_accuracy": 0.39411955359743733, "eval_loss": 3.304368734359741, "eval_runtime": 179.2616, "eval_samples_per_second": 100.473, "eval_steps_per_second": 6.281, "step": 89000 }, { "epoch": 9.584544182542245, "grad_norm": 0.9232970476150513, "learning_rate": 2.517401142118306e-05, "loss": 3.1317, "step": 89050 }, { "epoch": 9.589925734581854, "grad_norm": 0.8880868554115295, "learning_rate": 2.485077039112164e-05, "loss": 3.1186, "step": 89100 }, { "epoch": 9.59530728662146, "grad_norm": 0.8806546330451965, "learning_rate": 2.4527529361060226e-05, "loss": 3.1405, "step": 89150 }, { "epoch": 9.60068883866107, "grad_norm": 0.892593264579773, "learning_rate": 2.4204288330998814e-05, "loss": 3.1235, "step": 89200 }, { "epoch": 9.606070390700678, "grad_norm": 0.9136896133422852, "learning_rate": 2.38810473009374e-05, "loss": 3.1225, "step": 89250 }, { "epoch": 9.611451942740286, "grad_norm": 0.8949507474899292, "learning_rate": 2.3557806270875983e-05, "loss": 3.1267, "step": 89300 }, { "epoch": 9.616833494779895, "grad_norm": 0.8627079129219055, "learning_rate": 2.3241030061415796e-05, "loss": 3.1545, "step": 89350 }, { "epoch": 9.622215046819504, "grad_norm": 0.8995786905288696, "learning_rate": 2.2917789031354377e-05, "loss": 3.1376, "step": 89400 }, { "epoch": 9.62759659885911, "grad_norm": 0.8639857769012451, "learning_rate": 2.2594548001292962e-05, "loss": 3.1401, "step": 89450 }, { "epoch": 9.632978150898719, "grad_norm": 0.8793827295303345, "learning_rate": 2.2271306971231546e-05, "loss": 3.1427, "step": 89500 }, { "epoch": 9.638359702938327, "grad_norm": 0.9110519886016846, "learning_rate": 2.194806594117013e-05, "loss": 3.123, "step": 89550 }, { "epoch": 9.643741254977936, "grad_norm": 0.9195792078971863, "learning_rate": 2.1624824911108715e-05, "loss": 3.1333, "step": 89600 }, { "epoch": 9.649122807017545, "grad_norm": 0.9242751598358154, "learning_rate": 2.1301583881047296e-05, "loss": 3.1175, "step": 89650 }, { "epoch": 9.654504359057151, "grad_norm": 0.9064347147941589, "learning_rate": 2.097834285098588e-05, "loss": 3.1332, "step": 89700 }, { "epoch": 9.65988591109676, "grad_norm": 0.9124810099601746, "learning_rate": 2.065510182092447e-05, "loss": 3.1335, "step": 89750 }, { "epoch": 9.665267463136368, "grad_norm": 0.8664395213127136, "learning_rate": 2.0331860790863053e-05, "loss": 3.1163, "step": 89800 }, { "epoch": 9.670649015175977, "grad_norm": 0.8817176222801208, "learning_rate": 2.0008619760801638e-05, "loss": 3.126, "step": 89850 }, { "epoch": 9.676030567215586, "grad_norm": 0.9068438410758972, "learning_rate": 1.968537873074022e-05, "loss": 3.0999, "step": 89900 }, { "epoch": 9.681412119255192, "grad_norm": 0.870971143245697, "learning_rate": 1.9362137700678803e-05, "loss": 3.137, "step": 89950 }, { "epoch": 9.6867936712948, "grad_norm": 0.8804100751876831, "learning_rate": 1.9038896670617388e-05, "loss": 3.1367, "step": 90000 }, { "epoch": 9.6867936712948, "eval_accuracy": 0.39440596260971633, "eval_loss": 3.301514148712158, "eval_runtime": 179.5464, "eval_samples_per_second": 100.314, "eval_steps_per_second": 6.271, "step": 90000 }, { "epoch": 9.69217522333441, "grad_norm": 0.8757747411727905, "learning_rate": 1.8715655640555972e-05, "loss": 3.1169, "step": 90050 }, { "epoch": 9.697556775374018, "grad_norm": 0.9005279541015625, "learning_rate": 1.8392414610494557e-05, "loss": 3.1265, "step": 90100 }, { "epoch": 9.702938327413626, "grad_norm": 0.8953185677528381, "learning_rate": 1.806917358043314e-05, "loss": 3.1257, "step": 90150 }, { "epoch": 9.708319879453235, "grad_norm": 0.8756401538848877, "learning_rate": 1.7745932550371726e-05, "loss": 3.136, "step": 90200 }, { "epoch": 9.713701431492842, "grad_norm": 0.9136323928833008, "learning_rate": 1.742269152031031e-05, "loss": 3.1446, "step": 90250 }, { "epoch": 9.71908298353245, "grad_norm": 0.8698214888572693, "learning_rate": 1.7099450490248895e-05, "loss": 3.1222, "step": 90300 }, { "epoch": 9.724464535572059, "grad_norm": 0.9343684911727905, "learning_rate": 1.677620946018748e-05, "loss": 3.144, "step": 90350 }, { "epoch": 9.729846087611667, "grad_norm": 0.911002516746521, "learning_rate": 1.6452968430126064e-05, "loss": 3.127, "step": 90400 }, { "epoch": 9.735227639651276, "grad_norm": 0.8995512127876282, "learning_rate": 1.6129727400064645e-05, "loss": 3.098, "step": 90450 }, { "epoch": 9.740609191690883, "grad_norm": 0.9500533938407898, "learning_rate": 1.580648637000323e-05, "loss": 3.1302, "step": 90500 }, { "epoch": 9.745990743730491, "grad_norm": 0.8776048421859741, "learning_rate": 1.5483245339941817e-05, "loss": 3.12, "step": 90550 }, { "epoch": 9.7513722957701, "grad_norm": 0.897566556930542, "learning_rate": 1.5160004309880398e-05, "loss": 3.136, "step": 90600 }, { "epoch": 9.756753847809708, "grad_norm": 0.8855232000350952, "learning_rate": 1.4836763279818985e-05, "loss": 3.1295, "step": 90650 }, { "epoch": 9.762135399849317, "grad_norm": 0.8825761079788208, "learning_rate": 1.4513522249757567e-05, "loss": 3.1294, "step": 90700 }, { "epoch": 9.767516951888926, "grad_norm": 0.9207885265350342, "learning_rate": 1.4190281219696152e-05, "loss": 3.1158, "step": 90750 }, { "epoch": 9.772898503928532, "grad_norm": 0.9126257300376892, "learning_rate": 1.3867040189634736e-05, "loss": 3.1438, "step": 90800 }, { "epoch": 9.77828005596814, "grad_norm": 0.9926032423973083, "learning_rate": 1.3543799159573321e-05, "loss": 3.1172, "step": 90850 }, { "epoch": 9.78366160800775, "grad_norm": 0.9141454100608826, "learning_rate": 1.3220558129511905e-05, "loss": 3.1364, "step": 90900 }, { "epoch": 9.789043160047358, "grad_norm": 0.9459318518638611, "learning_rate": 1.2897317099450488e-05, "loss": 3.1291, "step": 90950 }, { "epoch": 9.794424712086967, "grad_norm": 0.8875244855880737, "learning_rate": 1.2574076069389073e-05, "loss": 3.1399, "step": 91000 }, { "epoch": 9.794424712086967, "eval_accuracy": 0.3946021897251093, "eval_loss": 3.2999260425567627, "eval_runtime": 179.157, "eval_samples_per_second": 100.532, "eval_steps_per_second": 6.285, "step": 91000 }, { "epoch": 9.799806264126573, "grad_norm": 0.9031006693840027, "learning_rate": 1.2250835039327659e-05, "loss": 3.1216, "step": 91050 }, { "epoch": 9.805187816166182, "grad_norm": 0.8972072601318359, "learning_rate": 1.1927594009266242e-05, "loss": 3.124, "step": 91100 }, { "epoch": 9.81056936820579, "grad_norm": 0.8891579508781433, "learning_rate": 1.1604352979204826e-05, "loss": 3.1269, "step": 91150 }, { "epoch": 9.815950920245399, "grad_norm": 0.8733758330345154, "learning_rate": 1.1281111949143409e-05, "loss": 3.139, "step": 91200 }, { "epoch": 9.821332472285007, "grad_norm": 0.8693758249282837, "learning_rate": 1.0957870919081995e-05, "loss": 3.1331, "step": 91250 }, { "epoch": 9.826714024324616, "grad_norm": 0.9038960933685303, "learning_rate": 1.063462988902058e-05, "loss": 3.1206, "step": 91300 }, { "epoch": 9.832095576364223, "grad_norm": 0.8685023784637451, "learning_rate": 1.0311388858959162e-05, "loss": 3.128, "step": 91350 }, { "epoch": 9.837477128403831, "grad_norm": 0.9239031672477722, "learning_rate": 9.988147828897747e-06, "loss": 3.1368, "step": 91400 }, { "epoch": 9.84285868044344, "grad_norm": 0.8665789365768433, "learning_rate": 9.66490679883633e-06, "loss": 3.1224, "step": 91450 }, { "epoch": 9.848240232483048, "grad_norm": 0.8577331304550171, "learning_rate": 9.341665768774916e-06, "loss": 3.1408, "step": 91500 }, { "epoch": 9.853621784522657, "grad_norm": 0.8790497779846191, "learning_rate": 9.018424738713499e-06, "loss": 3.126, "step": 91550 }, { "epoch": 9.859003336562264, "grad_norm": 0.9997568726539612, "learning_rate": 8.695183708652085e-06, "loss": 3.1386, "step": 91600 }, { "epoch": 9.864384888601872, "grad_norm": 0.9710144996643066, "learning_rate": 8.37194267859067e-06, "loss": 3.1364, "step": 91650 }, { "epoch": 9.869766440641481, "grad_norm": 0.8693020343780518, "learning_rate": 8.048701648529252e-06, "loss": 3.1195, "step": 91700 }, { "epoch": 9.87514799268109, "grad_norm": 0.857377290725708, "learning_rate": 7.725460618467837e-06, "loss": 3.1236, "step": 91750 }, { "epoch": 9.880529544720698, "grad_norm": 0.8829137682914734, "learning_rate": 7.40221958840642e-06, "loss": 3.1073, "step": 91800 }, { "epoch": 9.885911096760307, "grad_norm": 0.8725801706314087, "learning_rate": 7.078978558345006e-06, "loss": 3.1332, "step": 91850 }, { "epoch": 9.891292648799913, "grad_norm": 0.8576915264129639, "learning_rate": 6.75573752828359e-06, "loss": 3.1341, "step": 91900 }, { "epoch": 9.896674200839522, "grad_norm": 0.8982592821121216, "learning_rate": 6.432496498222174e-06, "loss": 3.1293, "step": 91950 }, { "epoch": 9.90205575287913, "grad_norm": 0.8765159249305725, "learning_rate": 6.1092554681607575e-06, "loss": 3.1335, "step": 92000 }, { "epoch": 9.90205575287913, "eval_accuracy": 0.3949138062073612, "eval_loss": 3.2980058193206787, "eval_runtime": 179.2356, "eval_samples_per_second": 100.488, "eval_steps_per_second": 6.282, "step": 92000 }, { "epoch": 9.907437304918739, "grad_norm": 0.8775045275688171, "learning_rate": 5.786014438099342e-06, "loss": 3.1425, "step": 92050 }, { "epoch": 9.912818856958348, "grad_norm": 0.9498361945152283, "learning_rate": 5.4627734080379264e-06, "loss": 3.1392, "step": 92100 }, { "epoch": 9.918200408997954, "grad_norm": 0.8609201312065125, "learning_rate": 5.139532377976511e-06, "loss": 3.1347, "step": 92150 }, { "epoch": 9.923581961037563, "grad_norm": 0.8858768939971924, "learning_rate": 4.816291347915095e-06, "loss": 3.1238, "step": 92200 }, { "epoch": 9.928963513077171, "grad_norm": 0.9281898736953735, "learning_rate": 4.493050317853679e-06, "loss": 3.1501, "step": 92250 }, { "epoch": 9.93434506511678, "grad_norm": 0.8417092561721802, "learning_rate": 4.169809287792264e-06, "loss": 3.1356, "step": 92300 }, { "epoch": 9.939726617156388, "grad_norm": 0.8472110033035278, "learning_rate": 3.846568257730847e-06, "loss": 3.1233, "step": 92350 }, { "epoch": 9.945108169195997, "grad_norm": 0.8769899606704712, "learning_rate": 3.523327227669432e-06, "loss": 3.1339, "step": 92400 }, { "epoch": 9.950489721235604, "grad_norm": 0.8860825896263123, "learning_rate": 3.2000861976080162e-06, "loss": 3.1236, "step": 92450 }, { "epoch": 9.955871273275212, "grad_norm": 0.8979299068450928, "learning_rate": 2.8768451675466007e-06, "loss": 3.129, "step": 92500 }, { "epoch": 9.961252825314821, "grad_norm": 0.8497779965400696, "learning_rate": 2.5536041374851848e-06, "loss": 3.1383, "step": 92550 }, { "epoch": 9.96663437735443, "grad_norm": 0.9217830896377563, "learning_rate": 2.230363107423769e-06, "loss": 3.1084, "step": 92600 }, { "epoch": 9.972015929394038, "grad_norm": 0.9171563386917114, "learning_rate": 1.9071220773623531e-06, "loss": 3.1382, "step": 92650 }, { "epoch": 9.977397481433645, "grad_norm": 0.9279325604438782, "learning_rate": 1.5838810473009372e-06, "loss": 3.1447, "step": 92700 }, { "epoch": 9.982779033473253, "grad_norm": 0.8588573336601257, "learning_rate": 1.2606400172395215e-06, "loss": 3.1446, "step": 92750 }, { "epoch": 9.988160585512862, "grad_norm": 0.8988666534423828, "learning_rate": 9.373989871781058e-07, "loss": 3.1185, "step": 92800 }, { "epoch": 9.99354213755247, "grad_norm": 0.9189948439598083, "learning_rate": 6.1415795711669e-07, "loss": 3.1271, "step": 92850 }, { "epoch": 9.998923689592079, "grad_norm": 0.9704410433769226, "learning_rate": 2.909169270552742e-07, "loss": 3.1122, "step": 92900 }, { "epoch": 10.0, "step": 92910, "total_flos": 7.7681859821568e+17, "train_loss": 3.4541932165449496, "train_runtime": 79417.511, "train_samples_per_second": 37.435, "train_steps_per_second": 1.17 } ], "logging_steps": 50, "max_steps": 92910, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.7681859821568e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }