{ "best_metric": 3.3028476238250732, "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_495/checkpoint-90000", "epoch": 10.0, "eval_steps": 1000, "global_step": 92910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005381552039608223, "grad_norm": 1.6117655038833618, "learning_rate": 0.00027, "loss": 8.7843, "step": 50 }, { "epoch": 0.010763104079216447, "grad_norm": 1.1765116453170776, "learning_rate": 0.00057, "loss": 6.9711, "step": 100 }, { "epoch": 0.01614465611882467, "grad_norm": 1.6633868217468262, "learning_rate": 0.0005997090830729447, "loss": 6.5373, "step": 150 }, { "epoch": 0.021526208158432893, "grad_norm": 1.8836766481399536, "learning_rate": 0.0005993858420428833, "loss": 6.2429, "step": 200 }, { "epoch": 0.026907760198041114, "grad_norm": 1.5037431716918945, "learning_rate": 0.0005990626010128218, "loss": 6.0966, "step": 250 }, { "epoch": 0.03228931223764934, "grad_norm": 2.2753920555114746, "learning_rate": 0.0005987393599827604, "loss": 5.953, "step": 300 }, { "epoch": 0.03767086427725756, "grad_norm": 3.0884439945220947, "learning_rate": 0.000598416118952699, "loss": 5.869, "step": 350 }, { "epoch": 0.04305241631686579, "grad_norm": 1.4899078607559204, "learning_rate": 0.0005980928779226376, "loss": 5.8032, "step": 400 }, { "epoch": 0.048433968356474004, "grad_norm": 1.125209093093872, "learning_rate": 0.0005977696368925762, "loss": 5.7355, "step": 450 }, { "epoch": 0.05381552039608223, "grad_norm": 1.960442066192627, "learning_rate": 0.0005974463958625148, "loss": 5.6583, "step": 500 }, { "epoch": 0.05919707243569045, "grad_norm": 1.044176697731018, "learning_rate": 0.0005971231548324534, "loss": 5.5711, "step": 550 }, { "epoch": 0.06457862447529868, "grad_norm": 1.6373082399368286, "learning_rate": 0.0005967999138023919, "loss": 5.5099, "step": 600 }, { "epoch": 0.0699601765149069, "grad_norm": 1.8287220001220703, "learning_rate": 0.0005964766727723304, "loss": 5.4242, "step": 650 }, { "epoch": 0.07534172855451512, "grad_norm": 1.3936927318572998, "learning_rate": 0.0005961534317422691, "loss": 5.3571, "step": 700 }, { "epoch": 0.08072328059412334, "grad_norm": 0.9875015020370483, "learning_rate": 0.0005958301907122077, "loss": 5.3186, "step": 750 }, { "epoch": 0.08610483263373157, "grad_norm": 1.0579822063446045, "learning_rate": 0.0005955069496821463, "loss": 5.2725, "step": 800 }, { "epoch": 0.09148638467333979, "grad_norm": 1.3142889738082886, "learning_rate": 0.0005951837086520848, "loss": 5.2145, "step": 850 }, { "epoch": 0.09686793671294801, "grad_norm": 1.207129716873169, "learning_rate": 0.0005948604676220235, "loss": 5.1809, "step": 900 }, { "epoch": 0.10224948875255624, "grad_norm": 1.3603378534317017, "learning_rate": 0.000594537226591962, "loss": 5.1166, "step": 950 }, { "epoch": 0.10763104079216446, "grad_norm": 1.0100822448730469, "learning_rate": 0.0005942139855619007, "loss": 5.0872, "step": 1000 }, { "epoch": 0.10763104079216446, "eval_accuracy": 0.22757434275053265, "eval_loss": 5.0165934562683105, "eval_runtime": 186.2978, "eval_samples_per_second": 96.679, "eval_steps_per_second": 6.044, "step": 1000 }, { "epoch": 0.11301259283177269, "grad_norm": 0.9639580249786377, "learning_rate": 0.0005938907445318392, "loss": 5.0357, "step": 1050 }, { "epoch": 0.1183941448713809, "grad_norm": 0.8208778500556946, "learning_rate": 0.0005935675035017777, "loss": 5.0074, "step": 1100 }, { "epoch": 0.12377569691098914, "grad_norm": 0.9588396549224854, "learning_rate": 0.0005932442624717164, "loss": 4.9928, "step": 1150 }, { "epoch": 0.12915724895059735, "grad_norm": 0.8537307977676392, "learning_rate": 0.0005929210214416549, "loss": 4.9625, "step": 1200 }, { "epoch": 0.13453880099020557, "grad_norm": 1.1293931007385254, "learning_rate": 0.0005925977804115936, "loss": 4.9296, "step": 1250 }, { "epoch": 0.1399203530298138, "grad_norm": 1.039612054824829, "learning_rate": 0.0005922745393815321, "loss": 4.904, "step": 1300 }, { "epoch": 0.14530190506942203, "grad_norm": 0.986505925655365, "learning_rate": 0.0005919512983514707, "loss": 4.8739, "step": 1350 }, { "epoch": 0.15068345710903025, "grad_norm": 0.9047093391418457, "learning_rate": 0.0005916280573214093, "loss": 4.8174, "step": 1400 }, { "epoch": 0.15606500914863847, "grad_norm": 0.770999014377594, "learning_rate": 0.0005913048162913478, "loss": 4.8181, "step": 1450 }, { "epoch": 0.16144656118824668, "grad_norm": 0.9082644581794739, "learning_rate": 0.0005909815752612864, "loss": 4.8036, "step": 1500 }, { "epoch": 0.1668281132278549, "grad_norm": 0.8358648419380188, "learning_rate": 0.000590658334231225, "loss": 4.7798, "step": 1550 }, { "epoch": 0.17220966526746315, "grad_norm": 1.0487140417099, "learning_rate": 0.0005903350932011637, "loss": 4.7364, "step": 1600 }, { "epoch": 0.17759121730707136, "grad_norm": 1.1815866231918335, "learning_rate": 0.0005900118521711022, "loss": 4.7392, "step": 1650 }, { "epoch": 0.18297276934667958, "grad_norm": 0.9304591417312622, "learning_rate": 0.0005896886111410408, "loss": 4.7118, "step": 1700 }, { "epoch": 0.1883543213862878, "grad_norm": 1.0020699501037598, "learning_rate": 0.0005893653701109793, "loss": 4.6978, "step": 1750 }, { "epoch": 0.19373587342589602, "grad_norm": 0.8822704553604126, "learning_rate": 0.000589042129080918, "loss": 4.6769, "step": 1800 }, { "epoch": 0.19911742546550426, "grad_norm": 0.944671630859375, "learning_rate": 0.0005887188880508566, "loss": 4.6617, "step": 1850 }, { "epoch": 0.20449897750511248, "grad_norm": 1.0103189945220947, "learning_rate": 0.0005883956470207951, "loss": 4.6347, "step": 1900 }, { "epoch": 0.2098805295447207, "grad_norm": 0.9612853527069092, "learning_rate": 0.0005880724059907337, "loss": 4.5904, "step": 1950 }, { "epoch": 0.2152620815843289, "grad_norm": 0.9868743419647217, "learning_rate": 0.0005877491649606723, "loss": 4.58, "step": 2000 }, { "epoch": 0.2152620815843289, "eval_accuracy": 0.27043877621644785, "eval_loss": 4.506998538970947, "eval_runtime": 184.7948, "eval_samples_per_second": 97.465, "eval_steps_per_second": 6.093, "step": 2000 }, { "epoch": 0.22064363362393713, "grad_norm": 0.8025125861167908, "learning_rate": 0.0005874259239306109, "loss": 4.5503, "step": 2050 }, { "epoch": 0.22602518566354537, "grad_norm": 1.0225203037261963, "learning_rate": 0.0005871026829005494, "loss": 4.5634, "step": 2100 }, { "epoch": 0.2314067377031536, "grad_norm": 0.9046829342842102, "learning_rate": 0.0005867794418704881, "loss": 4.5213, "step": 2150 }, { "epoch": 0.2367882897427618, "grad_norm": 0.7576098442077637, "learning_rate": 0.0005864562008404266, "loss": 4.5007, "step": 2200 }, { "epoch": 0.24216984178237003, "grad_norm": 0.9998608231544495, "learning_rate": 0.0005861329598103653, "loss": 4.4993, "step": 2250 }, { "epoch": 0.24755139382197827, "grad_norm": 1.168628215789795, "learning_rate": 0.0005858097187803038, "loss": 4.4896, "step": 2300 }, { "epoch": 0.2529329458615865, "grad_norm": 0.8030685782432556, "learning_rate": 0.0005854864777502423, "loss": 4.4601, "step": 2350 }, { "epoch": 0.2583144979011947, "grad_norm": 1.0477194786071777, "learning_rate": 0.000585163236720181, "loss": 4.4508, "step": 2400 }, { "epoch": 0.2636960499408029, "grad_norm": 0.86094731092453, "learning_rate": 0.0005848399956901195, "loss": 4.428, "step": 2450 }, { "epoch": 0.26907760198041114, "grad_norm": 0.9053926467895508, "learning_rate": 0.0005845167546600582, "loss": 4.4063, "step": 2500 }, { "epoch": 0.27445915402001936, "grad_norm": 0.7576990127563477, "learning_rate": 0.0005841935136299967, "loss": 4.4131, "step": 2550 }, { "epoch": 0.2798407060596276, "grad_norm": 0.7213003635406494, "learning_rate": 0.0005838702725999353, "loss": 4.3988, "step": 2600 }, { "epoch": 0.2852222580992358, "grad_norm": 0.9967924356460571, "learning_rate": 0.0005835470315698739, "loss": 4.3945, "step": 2650 }, { "epoch": 0.29060381013884407, "grad_norm": 0.7419942617416382, "learning_rate": 0.0005832237905398125, "loss": 4.3636, "step": 2700 }, { "epoch": 0.2959853621784523, "grad_norm": 0.7668341994285583, "learning_rate": 0.0005829005495097511, "loss": 4.3658, "step": 2750 }, { "epoch": 0.3013669142180605, "grad_norm": 0.7742299437522888, "learning_rate": 0.0005825773084796896, "loss": 4.334, "step": 2800 }, { "epoch": 0.3067484662576687, "grad_norm": 0.9014106392860413, "learning_rate": 0.0005822540674496282, "loss": 4.327, "step": 2850 }, { "epoch": 0.31213001829727693, "grad_norm": 0.6771566867828369, "learning_rate": 0.0005819308264195668, "loss": 4.3486, "step": 2900 }, { "epoch": 0.31751157033688515, "grad_norm": 0.712129533290863, "learning_rate": 0.0005816075853895054, "loss": 4.3111, "step": 2950 }, { "epoch": 0.32289312237649337, "grad_norm": 0.6686953902244568, "learning_rate": 0.000581284344359444, "loss": 4.3079, "step": 3000 }, { "epoch": 0.32289312237649337, "eval_accuracy": 0.2981868766651734, "eval_loss": 4.241795063018799, "eval_runtime": 184.6997, "eval_samples_per_second": 97.515, "eval_steps_per_second": 6.096, "step": 3000 }, { "epoch": 0.3282746744161016, "grad_norm": 0.6886458992958069, "learning_rate": 0.0005809611033293825, "loss": 4.3161, "step": 3050 }, { "epoch": 0.3336562264557098, "grad_norm": 0.7012606859207153, "learning_rate": 0.0005806378622993211, "loss": 4.2908, "step": 3100 }, { "epoch": 0.3390377784953181, "grad_norm": 0.6854538321495056, "learning_rate": 0.0005803146212692597, "loss": 4.2775, "step": 3150 }, { "epoch": 0.3444193305349263, "grad_norm": 0.8339408040046692, "learning_rate": 0.0005799913802391983, "loss": 4.27, "step": 3200 }, { "epoch": 0.3498008825745345, "grad_norm": 0.8958104848861694, "learning_rate": 0.0005796681392091369, "loss": 4.2535, "step": 3250 }, { "epoch": 0.35518243461414273, "grad_norm": 0.671660304069519, "learning_rate": 0.0005793448981790755, "loss": 4.2516, "step": 3300 }, { "epoch": 0.36056398665375095, "grad_norm": 0.7408727407455444, "learning_rate": 0.0005790216571490141, "loss": 4.2658, "step": 3350 }, { "epoch": 0.36594553869335916, "grad_norm": 0.6920570731163025, "learning_rate": 0.0005786984161189527, "loss": 4.2325, "step": 3400 }, { "epoch": 0.3713270907329674, "grad_norm": 0.8909915685653687, "learning_rate": 0.0005783751750888912, "loss": 4.2163, "step": 3450 }, { "epoch": 0.3767086427725756, "grad_norm": 0.8377681970596313, "learning_rate": 0.0005780519340588297, "loss": 4.2289, "step": 3500 }, { "epoch": 0.3820901948121838, "grad_norm": 0.6133842468261719, "learning_rate": 0.0005777286930287684, "loss": 4.216, "step": 3550 }, { "epoch": 0.38747174685179203, "grad_norm": 0.5939428806304932, "learning_rate": 0.000577405451998707, "loss": 4.2227, "step": 3600 }, { "epoch": 0.3928532988914003, "grad_norm": 0.6408166885375977, "learning_rate": 0.0005770822109686456, "loss": 4.2049, "step": 3650 }, { "epoch": 0.3982348509310085, "grad_norm": 0.819424033164978, "learning_rate": 0.0005767589699385841, "loss": 4.2005, "step": 3700 }, { "epoch": 0.40361640297061674, "grad_norm": 0.6583958864212036, "learning_rate": 0.0005764357289085228, "loss": 4.1926, "step": 3750 }, { "epoch": 0.40899795501022496, "grad_norm": 0.6472761631011963, "learning_rate": 0.0005761124878784613, "loss": 4.1791, "step": 3800 }, { "epoch": 0.4143795070498332, "grad_norm": 0.6922594308853149, "learning_rate": 0.0005757892468484, "loss": 4.1849, "step": 3850 }, { "epoch": 0.4197610590894414, "grad_norm": 0.6930834054946899, "learning_rate": 0.0005754660058183385, "loss": 4.1757, "step": 3900 }, { "epoch": 0.4251426111290496, "grad_norm": 0.6437042355537415, "learning_rate": 0.000575142764788277, "loss": 4.1714, "step": 3950 }, { "epoch": 0.4305241631686578, "grad_norm": 0.7084268927574158, "learning_rate": 0.0005748195237582157, "loss": 4.1579, "step": 4000 }, { "epoch": 0.4305241631686578, "eval_accuracy": 0.31272235134410686, "eval_loss": 4.0872697830200195, "eval_runtime": 184.7092, "eval_samples_per_second": 97.51, "eval_steps_per_second": 6.096, "step": 4000 }, { "epoch": 0.43590571520826604, "grad_norm": 0.6244553327560425, "learning_rate": 0.0005744962827281542, "loss": 4.1348, "step": 4050 }, { "epoch": 0.44128726724787426, "grad_norm": 0.7042115330696106, "learning_rate": 0.0005741730416980928, "loss": 4.1397, "step": 4100 }, { "epoch": 0.44666881928748253, "grad_norm": 0.7811130881309509, "learning_rate": 0.0005738498006680314, "loss": 4.1573, "step": 4150 }, { "epoch": 0.45205037132709075, "grad_norm": 0.6470036506652832, "learning_rate": 0.00057352655963797, "loss": 4.1357, "step": 4200 }, { "epoch": 0.45743192336669897, "grad_norm": 0.6716099977493286, "learning_rate": 0.0005732033186079086, "loss": 4.1327, "step": 4250 }, { "epoch": 0.4628134754063072, "grad_norm": 0.7920870184898376, "learning_rate": 0.0005728800775778471, "loss": 4.1263, "step": 4300 }, { "epoch": 0.4681950274459154, "grad_norm": 0.7230188846588135, "learning_rate": 0.0005725568365477857, "loss": 4.1104, "step": 4350 }, { "epoch": 0.4735765794855236, "grad_norm": 0.7376595735549927, "learning_rate": 0.0005722335955177243, "loss": 4.1103, "step": 4400 }, { "epoch": 0.47895813152513184, "grad_norm": 1.1194459199905396, "learning_rate": 0.000571910354487663, "loss": 4.1222, "step": 4450 }, { "epoch": 0.48433968356474005, "grad_norm": 0.6437053680419922, "learning_rate": 0.0005715871134576015, "loss": 4.1131, "step": 4500 }, { "epoch": 0.48972123560434827, "grad_norm": 0.7767078876495361, "learning_rate": 0.0005712638724275401, "loss": 4.1017, "step": 4550 }, { "epoch": 0.49510278764395654, "grad_norm": 0.6522454023361206, "learning_rate": 0.0005709406313974786, "loss": 4.0979, "step": 4600 }, { "epoch": 0.5004843396835648, "grad_norm": 0.6566357612609863, "learning_rate": 0.0005706173903674173, "loss": 4.0735, "step": 4650 }, { "epoch": 0.505865891723173, "grad_norm": 0.5966187715530396, "learning_rate": 0.0005702941493373559, "loss": 4.0895, "step": 4700 }, { "epoch": 0.5112474437627812, "grad_norm": 0.7359771132469177, "learning_rate": 0.0005699709083072944, "loss": 4.0868, "step": 4750 }, { "epoch": 0.5166289958023894, "grad_norm": 0.7156068086624146, "learning_rate": 0.000569647667277233, "loss": 4.0897, "step": 4800 }, { "epoch": 0.5220105478419976, "grad_norm": 0.6234465837478638, "learning_rate": 0.0005693244262471716, "loss": 4.0549, "step": 4850 }, { "epoch": 0.5273920998816058, "grad_norm": 0.7085710167884827, "learning_rate": 0.0005690011852171102, "loss": 4.0802, "step": 4900 }, { "epoch": 0.5327736519212141, "grad_norm": 0.5667790174484253, "learning_rate": 0.0005686779441870487, "loss": 4.0628, "step": 4950 }, { "epoch": 0.5381552039608223, "grad_norm": 0.6275297999382019, "learning_rate": 0.0005683547031569874, "loss": 4.0589, "step": 5000 }, { "epoch": 0.5381552039608223, "eval_accuracy": 0.3213402637940002, "eval_loss": 3.990799903869629, "eval_runtime": 184.6417, "eval_samples_per_second": 97.546, "eval_steps_per_second": 6.098, "step": 5000 }, { "epoch": 0.5435367560004305, "grad_norm": 0.6828199625015259, "learning_rate": 0.0005680314621269259, "loss": 4.056, "step": 5050 }, { "epoch": 0.5489183080400387, "grad_norm": 0.6843883991241455, "learning_rate": 0.0005677082210968646, "loss": 4.0418, "step": 5100 }, { "epoch": 0.5542998600796469, "grad_norm": 0.5970234274864197, "learning_rate": 0.0005673849800668031, "loss": 4.0339, "step": 5150 }, { "epoch": 0.5596814121192552, "grad_norm": 0.6066529154777527, "learning_rate": 0.0005670617390367416, "loss": 4.0554, "step": 5200 }, { "epoch": 0.5650629641588634, "grad_norm": 0.7181238532066345, "learning_rate": 0.0005667384980066803, "loss": 4.0289, "step": 5250 }, { "epoch": 0.5704445161984716, "grad_norm": 0.6789528131484985, "learning_rate": 0.0005664152569766188, "loss": 4.0306, "step": 5300 }, { "epoch": 0.5758260682380799, "grad_norm": 0.6828908920288086, "learning_rate": 0.0005660920159465575, "loss": 4.0383, "step": 5350 }, { "epoch": 0.5812076202776881, "grad_norm": 0.5343970656394958, "learning_rate": 0.000565768774916496, "loss": 4.0472, "step": 5400 }, { "epoch": 0.5865891723172963, "grad_norm": 0.6703925728797913, "learning_rate": 0.0005654455338864346, "loss": 4.0139, "step": 5450 }, { "epoch": 0.5919707243569046, "grad_norm": 0.5589005351066589, "learning_rate": 0.0005651222928563732, "loss": 4.0171, "step": 5500 }, { "epoch": 0.5973522763965128, "grad_norm": 0.6419985890388489, "learning_rate": 0.0005647990518263118, "loss": 3.994, "step": 5550 }, { "epoch": 0.602733828436121, "grad_norm": 0.5866629481315613, "learning_rate": 0.0005644758107962504, "loss": 4.0104, "step": 5600 }, { "epoch": 0.6081153804757292, "grad_norm": 0.9215855002403259, "learning_rate": 0.0005641525697661889, "loss": 4.0096, "step": 5650 }, { "epoch": 0.6134969325153374, "grad_norm": 0.5779905319213867, "learning_rate": 0.0005638293287361275, "loss": 3.9754, "step": 5700 }, { "epoch": 0.6188784845549457, "grad_norm": 0.6425080299377441, "learning_rate": 0.0005635060877060661, "loss": 3.9917, "step": 5750 }, { "epoch": 0.6242600365945539, "grad_norm": 0.6887379884719849, "learning_rate": 0.0005631828466760047, "loss": 3.9941, "step": 5800 }, { "epoch": 0.6296415886341621, "grad_norm": 0.6834486722946167, "learning_rate": 0.0005628596056459433, "loss": 3.9953, "step": 5850 }, { "epoch": 0.6350231406737703, "grad_norm": 0.7504608631134033, "learning_rate": 0.0005625363646158818, "loss": 3.9892, "step": 5900 }, { "epoch": 0.6404046927133785, "grad_norm": 0.6510689854621887, "learning_rate": 0.0005622131235858205, "loss": 3.9722, "step": 5950 }, { "epoch": 0.6457862447529867, "grad_norm": 0.5909599661827087, "learning_rate": 0.000561889882555759, "loss": 3.9673, "step": 6000 }, { "epoch": 0.6457862447529867, "eval_accuracy": 0.32834229049631664, "eval_loss": 3.9165666103363037, "eval_runtime": 184.5514, "eval_samples_per_second": 97.593, "eval_steps_per_second": 6.101, "step": 6000 }, { "epoch": 0.651167796792595, "grad_norm": 0.6396195292472839, "learning_rate": 0.0005615666415256976, "loss": 3.997, "step": 6050 }, { "epoch": 0.6565493488322032, "grad_norm": 0.5541490316390991, "learning_rate": 0.0005612434004956361, "loss": 3.981, "step": 6100 }, { "epoch": 0.6619309008718114, "grad_norm": 0.6580690145492554, "learning_rate": 0.0005609201594655748, "loss": 3.9881, "step": 6150 }, { "epoch": 0.6673124529114196, "grad_norm": 0.6880823373794556, "learning_rate": 0.0005605969184355134, "loss": 3.9573, "step": 6200 }, { "epoch": 0.6726940049510278, "grad_norm": 0.6356266140937805, "learning_rate": 0.000560273677405452, "loss": 3.9975, "step": 6250 }, { "epoch": 0.6780755569906362, "grad_norm": 0.5227646827697754, "learning_rate": 0.0005599504363753905, "loss": 3.9533, "step": 6300 }, { "epoch": 0.6834571090302444, "grad_norm": 0.6362395882606506, "learning_rate": 0.0005596271953453291, "loss": 3.9563, "step": 6350 }, { "epoch": 0.6888386610698526, "grad_norm": 0.5801821947097778, "learning_rate": 0.0005593039543152677, "loss": 3.9428, "step": 6400 }, { "epoch": 0.6942202131094608, "grad_norm": 0.6077560782432556, "learning_rate": 0.0005589807132852063, "loss": 3.9554, "step": 6450 }, { "epoch": 0.699601765149069, "grad_norm": 0.5769425630569458, "learning_rate": 0.0005586574722551449, "loss": 3.9447, "step": 6500 }, { "epoch": 0.7049833171886772, "grad_norm": 0.6278015971183777, "learning_rate": 0.0005583342312250834, "loss": 3.9678, "step": 6550 }, { "epoch": 0.7103648692282855, "grad_norm": 0.5199827551841736, "learning_rate": 0.0005580109901950221, "loss": 3.9618, "step": 6600 }, { "epoch": 0.7157464212678937, "grad_norm": 0.659668505191803, "learning_rate": 0.0005576877491649606, "loss": 3.9206, "step": 6650 }, { "epoch": 0.7211279733075019, "grad_norm": 0.6791085004806519, "learning_rate": 0.0005573645081348993, "loss": 3.9427, "step": 6700 }, { "epoch": 0.7265095253471101, "grad_norm": 0.6757652759552002, "learning_rate": 0.0005570412671048378, "loss": 3.9381, "step": 6750 }, { "epoch": 0.7318910773867183, "grad_norm": 0.5604714155197144, "learning_rate": 0.0005567180260747763, "loss": 3.9385, "step": 6800 }, { "epoch": 0.7372726294263265, "grad_norm": 0.6604406237602234, "learning_rate": 0.000556394785044715, "loss": 3.9272, "step": 6850 }, { "epoch": 0.7426541814659348, "grad_norm": 0.564445972442627, "learning_rate": 0.0005560715440146535, "loss": 3.9496, "step": 6900 }, { "epoch": 0.748035733505543, "grad_norm": 0.5737416744232178, "learning_rate": 0.0005557483029845921, "loss": 3.9387, "step": 6950 }, { "epoch": 0.7534172855451512, "grad_norm": 0.5588909983634949, "learning_rate": 0.0005554250619545307, "loss": 3.9074, "step": 7000 }, { "epoch": 0.7534172855451512, "eval_accuracy": 0.33360815270424543, "eval_loss": 3.859344720840454, "eval_runtime": 184.9069, "eval_samples_per_second": 97.406, "eval_steps_per_second": 6.09, "step": 7000 }, { "epoch": 0.7587988375847594, "grad_norm": 0.6285735964775085, "learning_rate": 0.0005551018209244694, "loss": 3.9333, "step": 7050 }, { "epoch": 0.7641803896243676, "grad_norm": 0.5597401261329651, "learning_rate": 0.0005547785798944079, "loss": 3.9209, "step": 7100 }, { "epoch": 0.7695619416639758, "grad_norm": 0.578063428401947, "learning_rate": 0.0005544553388643464, "loss": 3.9312, "step": 7150 }, { "epoch": 0.7749434937035841, "grad_norm": 0.5833026170730591, "learning_rate": 0.000554132097834285, "loss": 3.9328, "step": 7200 }, { "epoch": 0.7803250457431924, "grad_norm": 0.518353283405304, "learning_rate": 0.0005538088568042236, "loss": 3.9029, "step": 7250 }, { "epoch": 0.7857065977828006, "grad_norm": 0.6059916019439697, "learning_rate": 0.0005534856157741623, "loss": 3.9199, "step": 7300 }, { "epoch": 0.7910881498224088, "grad_norm": 0.5463325381278992, "learning_rate": 0.0005531623747441008, "loss": 3.9054, "step": 7350 }, { "epoch": 0.796469701862017, "grad_norm": 0.6463024616241455, "learning_rate": 0.0005528391337140394, "loss": 3.9191, "step": 7400 }, { "epoch": 0.8018512539016253, "grad_norm": 0.6024007797241211, "learning_rate": 0.0005525158926839779, "loss": 3.9284, "step": 7450 }, { "epoch": 0.8072328059412335, "grad_norm": 0.5554441213607788, "learning_rate": 0.0005521926516539166, "loss": 3.9015, "step": 7500 }, { "epoch": 0.8126143579808417, "grad_norm": 0.5887565612792969, "learning_rate": 0.0005518694106238552, "loss": 3.9024, "step": 7550 }, { "epoch": 0.8179959100204499, "grad_norm": 0.575228214263916, "learning_rate": 0.0005515461695937937, "loss": 3.8981, "step": 7600 }, { "epoch": 0.8233774620600581, "grad_norm": 0.6179103255271912, "learning_rate": 0.0005512229285637323, "loss": 3.9081, "step": 7650 }, { "epoch": 0.8287590140996663, "grad_norm": 0.6597625017166138, "learning_rate": 0.0005508996875336709, "loss": 3.8958, "step": 7700 }, { "epoch": 0.8341405661392746, "grad_norm": 0.5965509414672852, "learning_rate": 0.0005505764465036095, "loss": 3.8879, "step": 7750 }, { "epoch": 0.8395221181788828, "grad_norm": 0.512361466884613, "learning_rate": 0.000550253205473548, "loss": 3.8809, "step": 7800 }, { "epoch": 0.844903670218491, "grad_norm": 0.6104485988616943, "learning_rate": 0.0005499299644434867, "loss": 3.8852, "step": 7850 }, { "epoch": 0.8502852222580992, "grad_norm": 0.6456380486488342, "learning_rate": 0.0005496067234134252, "loss": 3.8783, "step": 7900 }, { "epoch": 0.8556667742977074, "grad_norm": 0.5954079627990723, "learning_rate": 0.0005492834823833639, "loss": 3.889, "step": 7950 }, { "epoch": 0.8610483263373157, "grad_norm": 0.6212036609649658, "learning_rate": 0.0005489602413533024, "loss": 3.8712, "step": 8000 }, { "epoch": 0.8610483263373157, "eval_accuracy": 0.33805890094779, "eval_loss": 3.8169474601745605, "eval_runtime": 184.4499, "eval_samples_per_second": 97.647, "eval_steps_per_second": 6.105, "step": 8000 }, { "epoch": 0.8664298783769239, "grad_norm": 0.5757076144218445, "learning_rate": 0.0005486370003232409, "loss": 3.8725, "step": 8050 }, { "epoch": 0.8718114304165321, "grad_norm": 0.6085401177406311, "learning_rate": 0.0005483137592931796, "loss": 3.8905, "step": 8100 }, { "epoch": 0.8771929824561403, "grad_norm": 0.5095037817955017, "learning_rate": 0.0005479905182631181, "loss": 3.8799, "step": 8150 }, { "epoch": 0.8825745344957485, "grad_norm": 0.5414556860923767, "learning_rate": 0.0005476672772330568, "loss": 3.8747, "step": 8200 }, { "epoch": 0.8879560865353568, "grad_norm": 0.5936631560325623, "learning_rate": 0.0005473440362029953, "loss": 3.863, "step": 8250 }, { "epoch": 0.8933376385749651, "grad_norm": 0.5667993426322937, "learning_rate": 0.0005470207951729339, "loss": 3.8607, "step": 8300 }, { "epoch": 0.8987191906145733, "grad_norm": 0.5451300144195557, "learning_rate": 0.0005466975541428725, "loss": 3.8857, "step": 8350 }, { "epoch": 0.9041007426541815, "grad_norm": 0.5745592713356018, "learning_rate": 0.0005463743131128111, "loss": 3.8554, "step": 8400 }, { "epoch": 0.9094822946937897, "grad_norm": 0.5426753759384155, "learning_rate": 0.0005460510720827497, "loss": 3.8752, "step": 8450 }, { "epoch": 0.9148638467333979, "grad_norm": 0.508082389831543, "learning_rate": 0.0005457278310526882, "loss": 3.8584, "step": 8500 }, { "epoch": 0.9202453987730062, "grad_norm": 0.532146155834198, "learning_rate": 0.0005454045900226268, "loss": 3.8476, "step": 8550 }, { "epoch": 0.9256269508126144, "grad_norm": 0.5778146386146545, "learning_rate": 0.0005450813489925654, "loss": 3.8612, "step": 8600 }, { "epoch": 0.9310085028522226, "grad_norm": 0.5984019637107849, "learning_rate": 0.000544758107962504, "loss": 3.8478, "step": 8650 }, { "epoch": 0.9363900548918308, "grad_norm": 0.609585165977478, "learning_rate": 0.0005444348669324426, "loss": 3.8541, "step": 8700 }, { "epoch": 0.941771606931439, "grad_norm": 0.5877447724342346, "learning_rate": 0.0005441116259023811, "loss": 3.8298, "step": 8750 }, { "epoch": 0.9471531589710472, "grad_norm": 0.6004450917243958, "learning_rate": 0.0005437883848723198, "loss": 3.8393, "step": 8800 }, { "epoch": 0.9525347110106555, "grad_norm": 0.5448794960975647, "learning_rate": 0.0005434651438422583, "loss": 3.8515, "step": 8850 }, { "epoch": 0.9579162630502637, "grad_norm": 0.5065131783485413, "learning_rate": 0.0005431419028121969, "loss": 3.8478, "step": 8900 }, { "epoch": 0.9632978150898719, "grad_norm": 0.6335159540176392, "learning_rate": 0.0005428186617821354, "loss": 3.8411, "step": 8950 }, { "epoch": 0.9686793671294801, "grad_norm": 0.5555748343467712, "learning_rate": 0.0005424954207520741, "loss": 3.8516, "step": 9000 }, { "epoch": 0.9686793671294801, "eval_accuracy": 0.34156469502601206, "eval_loss": 3.775557279586792, "eval_runtime": 184.197, "eval_samples_per_second": 97.781, "eval_steps_per_second": 6.113, "step": 9000 }, { "epoch": 0.9740609191690883, "grad_norm": 0.6176546812057495, "learning_rate": 0.0005421721797220127, "loss": 3.8444, "step": 9050 }, { "epoch": 0.9794424712086965, "grad_norm": 0.5457547307014465, "learning_rate": 0.0005418489386919513, "loss": 3.8305, "step": 9100 }, { "epoch": 0.9848240232483048, "grad_norm": 0.5831869840621948, "learning_rate": 0.0005415256976618898, "loss": 3.8196, "step": 9150 }, { "epoch": 0.9902055752879131, "grad_norm": 0.5737512111663818, "learning_rate": 0.0005412024566318284, "loss": 3.8124, "step": 9200 }, { "epoch": 0.9955871273275213, "grad_norm": 0.5983269810676575, "learning_rate": 0.000540879215601767, "loss": 3.8368, "step": 9250 }, { "epoch": 1.0009686793671295, "grad_norm": 0.5368288159370422, "learning_rate": 0.0005405559745717056, "loss": 3.8048, "step": 9300 }, { "epoch": 1.0063502314067376, "grad_norm": 0.5906519889831543, "learning_rate": 0.0005402327335416442, "loss": 3.7763, "step": 9350 }, { "epoch": 1.011731783446346, "grad_norm": 0.6060296893119812, "learning_rate": 0.0005399094925115827, "loss": 3.7644, "step": 9400 }, { "epoch": 1.017113335485954, "grad_norm": 0.5626549124717712, "learning_rate": 0.0005395862514815214, "loss": 3.7658, "step": 9450 }, { "epoch": 1.0224948875255624, "grad_norm": 0.521769642829895, "learning_rate": 0.0005392630104514599, "loss": 3.7667, "step": 9500 }, { "epoch": 1.0278764395651705, "grad_norm": 0.5677669048309326, "learning_rate": 0.0005389397694213986, "loss": 3.7803, "step": 9550 }, { "epoch": 1.0332579916047788, "grad_norm": 0.5613917708396912, "learning_rate": 0.0005386165283913371, "loss": 3.7582, "step": 9600 }, { "epoch": 1.0386395436443872, "grad_norm": 0.5857552289962769, "learning_rate": 0.0005382932873612756, "loss": 3.7806, "step": 9650 }, { "epoch": 1.0440210956839953, "grad_norm": 0.5657854080200195, "learning_rate": 0.0005379700463312143, "loss": 3.7687, "step": 9700 }, { "epoch": 1.0494026477236036, "grad_norm": 0.580058217048645, "learning_rate": 0.0005376468053011528, "loss": 3.7603, "step": 9750 }, { "epoch": 1.0547841997632117, "grad_norm": 0.5268446207046509, "learning_rate": 0.0005373235642710914, "loss": 3.7591, "step": 9800 }, { "epoch": 1.06016575180282, "grad_norm": 0.5513815879821777, "learning_rate": 0.00053700032324103, "loss": 3.7748, "step": 9850 }, { "epoch": 1.0655473038424281, "grad_norm": 0.5946319699287415, "learning_rate": 0.0005366770822109687, "loss": 3.7816, "step": 9900 }, { "epoch": 1.0709288558820365, "grad_norm": 0.5772368311882019, "learning_rate": 0.0005363538411809072, "loss": 3.754, "step": 9950 }, { "epoch": 1.0763104079216446, "grad_norm": 0.6033820509910583, "learning_rate": 0.0005360306001508457, "loss": 3.7639, "step": 10000 }, { "epoch": 1.0763104079216446, "eval_accuracy": 0.345006709859087, "eval_loss": 3.7440133094787598, "eval_runtime": 184.605, "eval_samples_per_second": 97.565, "eval_steps_per_second": 6.1, "step": 10000 }, { "epoch": 1.081691959961253, "grad_norm": 0.7267680764198303, "learning_rate": 0.0005357073591207843, "loss": 3.7537, "step": 10050 }, { "epoch": 1.087073512000861, "grad_norm": 0.5396670699119568, "learning_rate": 0.0005353841180907229, "loss": 3.7696, "step": 10100 }, { "epoch": 1.0924550640404693, "grad_norm": 0.5720505118370056, "learning_rate": 0.0005350608770606616, "loss": 3.7771, "step": 10150 }, { "epoch": 1.0978366160800774, "grad_norm": 0.5750023722648621, "learning_rate": 0.0005347376360306001, "loss": 3.7454, "step": 10200 }, { "epoch": 1.1032181681196858, "grad_norm": 0.6659409403800964, "learning_rate": 0.0005344143950005387, "loss": 3.7663, "step": 10250 }, { "epoch": 1.1085997201592939, "grad_norm": 0.6214229464530945, "learning_rate": 0.0005340911539704773, "loss": 3.7545, "step": 10300 }, { "epoch": 1.1139812721989022, "grad_norm": 0.5536146759986877, "learning_rate": 0.0005337679129404159, "loss": 3.7517, "step": 10350 }, { "epoch": 1.1193628242385103, "grad_norm": 0.6057895421981812, "learning_rate": 0.0005334446719103545, "loss": 3.7618, "step": 10400 }, { "epoch": 1.1247443762781186, "grad_norm": 0.5203273296356201, "learning_rate": 0.000533121430880293, "loss": 3.7583, "step": 10450 }, { "epoch": 1.1301259283177267, "grad_norm": 0.4910421371459961, "learning_rate": 0.0005327981898502316, "loss": 3.7713, "step": 10500 }, { "epoch": 1.135507480357335, "grad_norm": 0.5717787742614746, "learning_rate": 0.0005324749488201702, "loss": 3.7681, "step": 10550 }, { "epoch": 1.1408890323969434, "grad_norm": 0.5272240042686462, "learning_rate": 0.0005321517077901088, "loss": 3.7644, "step": 10600 }, { "epoch": 1.1462705844365515, "grad_norm": 0.6392913460731506, "learning_rate": 0.0005318284667600473, "loss": 3.7694, "step": 10650 }, { "epoch": 1.1516521364761596, "grad_norm": 0.5354857444763184, "learning_rate": 0.000531505225729986, "loss": 3.742, "step": 10700 }, { "epoch": 1.157033688515768, "grad_norm": 0.7330885529518127, "learning_rate": 0.0005311819846999245, "loss": 3.7568, "step": 10750 }, { "epoch": 1.1624152405553763, "grad_norm": 0.5588583946228027, "learning_rate": 0.0005308587436698631, "loss": 3.7659, "step": 10800 }, { "epoch": 1.1677967925949844, "grad_norm": 0.534525990486145, "learning_rate": 0.0005305355026398017, "loss": 3.7399, "step": 10850 }, { "epoch": 1.1731783446345927, "grad_norm": 0.541637659072876, "learning_rate": 0.0005302122616097402, "loss": 3.7469, "step": 10900 }, { "epoch": 1.1785598966742008, "grad_norm": 0.5589224100112915, "learning_rate": 0.0005298890205796789, "loss": 3.7605, "step": 10950 }, { "epoch": 1.1839414487138091, "grad_norm": 0.5767397880554199, "learning_rate": 0.0005295657795496175, "loss": 3.7414, "step": 11000 }, { "epoch": 1.1839414487138091, "eval_accuracy": 0.34713293822072855, "eval_loss": 3.7198851108551025, "eval_runtime": 184.6204, "eval_samples_per_second": 97.557, "eval_steps_per_second": 6.099, "step": 11000 }, { "epoch": 1.1893230007534172, "grad_norm": 0.5567173957824707, "learning_rate": 0.0005292425385195561, "loss": 3.7439, "step": 11050 }, { "epoch": 1.1947045527930256, "grad_norm": 0.5579226016998291, "learning_rate": 0.0005289192974894946, "loss": 3.7435, "step": 11100 }, { "epoch": 1.2000861048326337, "grad_norm": 0.61114102602005, "learning_rate": 0.0005285960564594331, "loss": 3.7653, "step": 11150 }, { "epoch": 1.205467656872242, "grad_norm": 0.6080530881881714, "learning_rate": 0.0005282728154293718, "loss": 3.7311, "step": 11200 }, { "epoch": 1.21084920891185, "grad_norm": 0.5481367111206055, "learning_rate": 0.0005279495743993104, "loss": 3.7429, "step": 11250 }, { "epoch": 1.2162307609514584, "grad_norm": 0.6586524248123169, "learning_rate": 0.000527626333369249, "loss": 3.7476, "step": 11300 }, { "epoch": 1.2216123129910665, "grad_norm": 0.5334315299987793, "learning_rate": 0.0005273030923391875, "loss": 3.7436, "step": 11350 }, { "epoch": 1.2269938650306749, "grad_norm": 0.5702027082443237, "learning_rate": 0.0005269798513091261, "loss": 3.7232, "step": 11400 }, { "epoch": 1.232375417070283, "grad_norm": 0.5503053665161133, "learning_rate": 0.0005266566102790647, "loss": 3.7325, "step": 11450 }, { "epoch": 1.2377569691098913, "grad_norm": 0.6727051138877869, "learning_rate": 0.0005263333692490033, "loss": 3.7457, "step": 11500 }, { "epoch": 1.2431385211494996, "grad_norm": 0.5632249116897583, "learning_rate": 0.0005260101282189419, "loss": 3.7434, "step": 11550 }, { "epoch": 1.2485200731891077, "grad_norm": 0.6183726191520691, "learning_rate": 0.0005256868871888804, "loss": 3.7454, "step": 11600 }, { "epoch": 1.2539016252287158, "grad_norm": 0.6172442436218262, "learning_rate": 0.0005253636461588191, "loss": 3.7201, "step": 11650 }, { "epoch": 1.2592831772683242, "grad_norm": 0.5206655859947205, "learning_rate": 0.0005250404051287576, "loss": 3.7412, "step": 11700 }, { "epoch": 1.2646647293079325, "grad_norm": 0.6109122633934021, "learning_rate": 0.0005247171640986962, "loss": 3.7149, "step": 11750 }, { "epoch": 1.2700462813475406, "grad_norm": 0.5555276870727539, "learning_rate": 0.0005243939230686347, "loss": 3.7154, "step": 11800 }, { "epoch": 1.275427833387149, "grad_norm": 0.5312607884407043, "learning_rate": 0.0005240706820385734, "loss": 3.7459, "step": 11850 }, { "epoch": 1.280809385426757, "grad_norm": 0.552590548992157, "learning_rate": 0.000523747441008512, "loss": 3.713, "step": 11900 }, { "epoch": 1.2861909374663654, "grad_norm": 0.5426647663116455, "learning_rate": 0.0005234241999784506, "loss": 3.7425, "step": 11950 }, { "epoch": 1.2915724895059735, "grad_norm": 0.5763972997665405, "learning_rate": 0.0005231009589483891, "loss": 3.7364, "step": 12000 }, { "epoch": 1.2915724895059735, "eval_accuracy": 0.3500625460348704, "eval_loss": 3.695915460586548, "eval_runtime": 184.4233, "eval_samples_per_second": 97.661, "eval_steps_per_second": 6.106, "step": 12000 }, { "epoch": 1.2969540415455818, "grad_norm": 0.6403552889823914, "learning_rate": 0.0005227777179183277, "loss": 3.7176, "step": 12050 }, { "epoch": 1.30233559358519, "grad_norm": 0.5373594164848328, "learning_rate": 0.0005224544768882663, "loss": 3.7174, "step": 12100 }, { "epoch": 1.3077171456247982, "grad_norm": 0.5955021977424622, "learning_rate": 0.0005221312358582049, "loss": 3.72, "step": 12150 }, { "epoch": 1.3130986976644063, "grad_norm": 0.6005784869194031, "learning_rate": 0.0005218079948281435, "loss": 3.7168, "step": 12200 }, { "epoch": 1.3184802497040147, "grad_norm": 0.5965345501899719, "learning_rate": 0.000521484753798082, "loss": 3.7286, "step": 12250 }, { "epoch": 1.3238618017436228, "grad_norm": 0.4936142861843109, "learning_rate": 0.0005211615127680207, "loss": 3.7257, "step": 12300 }, { "epoch": 1.329243353783231, "grad_norm": 0.5646725296974182, "learning_rate": 0.0005208382717379592, "loss": 3.7049, "step": 12350 }, { "epoch": 1.3346249058228392, "grad_norm": 0.5552993416786194, "learning_rate": 0.0005205150307078979, "loss": 3.7232, "step": 12400 }, { "epoch": 1.3400064578624475, "grad_norm": 0.5514169931411743, "learning_rate": 0.0005201917896778364, "loss": 3.7065, "step": 12450 }, { "epoch": 1.3453880099020559, "grad_norm": 0.5545753240585327, "learning_rate": 0.0005198685486477749, "loss": 3.7162, "step": 12500 }, { "epoch": 1.350769561941664, "grad_norm": 0.5531725287437439, "learning_rate": 0.0005195453076177136, "loss": 3.7265, "step": 12550 }, { "epoch": 1.356151113981272, "grad_norm": 0.504911482334137, "learning_rate": 0.0005192220665876521, "loss": 3.7103, "step": 12600 }, { "epoch": 1.3615326660208804, "grad_norm": 0.5750048160552979, "learning_rate": 0.0005188988255575907, "loss": 3.715, "step": 12650 }, { "epoch": 1.3669142180604887, "grad_norm": 0.5439777970314026, "learning_rate": 0.0005185755845275293, "loss": 3.7145, "step": 12700 }, { "epoch": 1.3722957701000968, "grad_norm": 0.5391664505004883, "learning_rate": 0.000518252343497468, "loss": 3.7083, "step": 12750 }, { "epoch": 1.3776773221397052, "grad_norm": 0.5579096078872681, "learning_rate": 0.0005179291024674065, "loss": 3.7188, "step": 12800 }, { "epoch": 1.3830588741793133, "grad_norm": 0.563324511051178, "learning_rate": 0.000517605861437345, "loss": 3.7051, "step": 12850 }, { "epoch": 1.3884404262189216, "grad_norm": 0.5446837544441223, "learning_rate": 0.0005172826204072836, "loss": 3.7164, "step": 12900 }, { "epoch": 1.3938219782585297, "grad_norm": 0.5987839102745056, "learning_rate": 0.0005169593793772222, "loss": 3.7137, "step": 12950 }, { "epoch": 1.399203530298138, "grad_norm": 0.5936101078987122, "learning_rate": 0.0005166361383471609, "loss": 3.7086, "step": 13000 }, { "epoch": 1.399203530298138, "eval_accuracy": 0.3522383201133554, "eval_loss": 3.673328161239624, "eval_runtime": 184.7267, "eval_samples_per_second": 97.501, "eval_steps_per_second": 6.095, "step": 13000 }, { "epoch": 1.4045850823377461, "grad_norm": 0.6273950934410095, "learning_rate": 0.0005163128973170994, "loss": 3.7156, "step": 13050 }, { "epoch": 1.4099666343773545, "grad_norm": 0.5783877968788147, "learning_rate": 0.000515989656287038, "loss": 3.702, "step": 13100 }, { "epoch": 1.4153481864169626, "grad_norm": 0.5784569382667542, "learning_rate": 0.0005156664152569766, "loss": 3.7103, "step": 13150 }, { "epoch": 1.420729738456571, "grad_norm": 0.5583294034004211, "learning_rate": 0.0005153431742269152, "loss": 3.7069, "step": 13200 }, { "epoch": 1.426111290496179, "grad_norm": 0.5216031670570374, "learning_rate": 0.0005150199331968538, "loss": 3.7013, "step": 13250 }, { "epoch": 1.4314928425357873, "grad_norm": 0.6091950535774231, "learning_rate": 0.0005146966921667923, "loss": 3.7041, "step": 13300 }, { "epoch": 1.4368743945753955, "grad_norm": 0.5405470728874207, "learning_rate": 0.0005143734511367309, "loss": 3.7233, "step": 13350 }, { "epoch": 1.4422559466150038, "grad_norm": 0.5653330683708191, "learning_rate": 0.0005140502101066695, "loss": 3.7113, "step": 13400 }, { "epoch": 1.447637498654612, "grad_norm": 0.5083488821983337, "learning_rate": 0.0005137269690766081, "loss": 3.6946, "step": 13450 }, { "epoch": 1.4530190506942202, "grad_norm": 0.5432257652282715, "learning_rate": 0.0005134037280465466, "loss": 3.6909, "step": 13500 }, { "epoch": 1.4584006027338283, "grad_norm": 0.5383495092391968, "learning_rate": 0.0005130804870164853, "loss": 3.6824, "step": 13550 }, { "epoch": 1.4637821547734367, "grad_norm": 0.5217844247817993, "learning_rate": 0.0005127572459864238, "loss": 3.6961, "step": 13600 }, { "epoch": 1.469163706813045, "grad_norm": 0.5690218210220337, "learning_rate": 0.0005124340049563624, "loss": 3.6898, "step": 13650 }, { "epoch": 1.474545258852653, "grad_norm": 0.536909282207489, "learning_rate": 0.000512110763926301, "loss": 3.6847, "step": 13700 }, { "epoch": 1.4799268108922612, "grad_norm": 0.5096193552017212, "learning_rate": 0.0005117875228962395, "loss": 3.6682, "step": 13750 }, { "epoch": 1.4853083629318695, "grad_norm": 0.5519959330558777, "learning_rate": 0.0005114642818661782, "loss": 3.7007, "step": 13800 }, { "epoch": 1.4906899149714778, "grad_norm": 0.6047645807266235, "learning_rate": 0.0005111410408361168, "loss": 3.6784, "step": 13850 }, { "epoch": 1.496071467011086, "grad_norm": 0.5431356430053711, "learning_rate": 0.0005108177998060554, "loss": 3.6863, "step": 13900 }, { "epoch": 1.501453019050694, "grad_norm": 0.5571900606155396, "learning_rate": 0.0005104945587759939, "loss": 3.7115, "step": 13950 }, { "epoch": 1.5068345710903024, "grad_norm": 0.5458298921585083, "learning_rate": 0.0005101713177459324, "loss": 3.6937, "step": 14000 }, { "epoch": 1.5068345710903024, "eval_accuracy": 0.353437413383276, "eval_loss": 3.6556448936462402, "eval_runtime": 184.6362, "eval_samples_per_second": 97.549, "eval_steps_per_second": 6.098, "step": 14000 }, { "epoch": 1.5122161231299107, "grad_norm": 0.5262465476989746, "learning_rate": 0.0005098480767158711, "loss": 3.7033, "step": 14050 }, { "epoch": 1.5175976751695188, "grad_norm": 0.5345038175582886, "learning_rate": 0.0005095248356858097, "loss": 3.6971, "step": 14100 }, { "epoch": 1.5229792272091272, "grad_norm": 0.5168197751045227, "learning_rate": 0.0005092015946557483, "loss": 3.6759, "step": 14150 }, { "epoch": 1.5283607792487355, "grad_norm": 0.5598461627960205, "learning_rate": 0.0005088783536256868, "loss": 3.6936, "step": 14200 }, { "epoch": 1.5337423312883436, "grad_norm": 0.616462230682373, "learning_rate": 0.0005085551125956255, "loss": 3.6958, "step": 14250 }, { "epoch": 1.5391238833279517, "grad_norm": 0.5687780380249023, "learning_rate": 0.000508231871565564, "loss": 3.6718, "step": 14300 }, { "epoch": 1.54450543536756, "grad_norm": 0.6172040104866028, "learning_rate": 0.0005079086305355026, "loss": 3.6722, "step": 14350 }, { "epoch": 1.5498869874071683, "grad_norm": 0.5774914622306824, "learning_rate": 0.0005075853895054412, "loss": 3.7067, "step": 14400 }, { "epoch": 1.5552685394467765, "grad_norm": 0.5606103539466858, "learning_rate": 0.0005072621484753797, "loss": 3.6953, "step": 14450 }, { "epoch": 1.5606500914863846, "grad_norm": 0.6137360334396362, "learning_rate": 0.0005069389074453184, "loss": 3.6851, "step": 14500 }, { "epoch": 1.566031643525993, "grad_norm": 0.5306119322776794, "learning_rate": 0.0005066156664152569, "loss": 3.6717, "step": 14550 }, { "epoch": 1.5714131955656012, "grad_norm": 0.5361549258232117, "learning_rate": 0.0005062924253851955, "loss": 3.6858, "step": 14600 }, { "epoch": 1.5767947476052093, "grad_norm": 0.5695285201072693, "learning_rate": 0.0005059691843551341, "loss": 3.6793, "step": 14650 }, { "epoch": 1.5821762996448174, "grad_norm": 0.6082088351249695, "learning_rate": 0.0005056459433250727, "loss": 3.6823, "step": 14700 }, { "epoch": 1.5875578516844258, "grad_norm": 0.5494985580444336, "learning_rate": 0.0005053227022950113, "loss": 3.6773, "step": 14750 }, { "epoch": 1.592939403724034, "grad_norm": 0.5614803433418274, "learning_rate": 0.0005049994612649499, "loss": 3.6898, "step": 14800 }, { "epoch": 1.5983209557636422, "grad_norm": 0.5661324858665466, "learning_rate": 0.0005046762202348884, "loss": 3.6868, "step": 14850 }, { "epoch": 1.6037025078032503, "grad_norm": 0.5200116634368896, "learning_rate": 0.000504352979204827, "loss": 3.6836, "step": 14900 }, { "epoch": 1.6090840598428586, "grad_norm": 0.5705900192260742, "learning_rate": 0.0005040297381747656, "loss": 3.6905, "step": 14950 }, { "epoch": 1.614465611882467, "grad_norm": 0.5745868682861328, "learning_rate": 0.0005037064971447042, "loss": 3.6986, "step": 15000 }, { "epoch": 1.614465611882467, "eval_accuracy": 0.35545531481576653, "eval_loss": 3.6349782943725586, "eval_runtime": 184.312, "eval_samples_per_second": 97.72, "eval_steps_per_second": 6.109, "step": 15000 }, { "epoch": 1.619847163922075, "grad_norm": 0.5304009318351746, "learning_rate": 0.0005033832561146428, "loss": 3.6643, "step": 15050 }, { "epoch": 1.6252287159616834, "grad_norm": 0.5392767190933228, "learning_rate": 0.0005030600150845813, "loss": 3.6664, "step": 15100 }, { "epoch": 1.6306102680012917, "grad_norm": 0.5564727187156677, "learning_rate": 0.00050273677405452, "loss": 3.6673, "step": 15150 }, { "epoch": 1.6359918200408998, "grad_norm": 0.5869589447975159, "learning_rate": 0.0005024135330244585, "loss": 3.6861, "step": 15200 }, { "epoch": 1.641373372080508, "grad_norm": 0.6014476418495178, "learning_rate": 0.0005020902919943972, "loss": 3.6682, "step": 15250 }, { "epoch": 1.6467549241201163, "grad_norm": 0.5469924211502075, "learning_rate": 0.0005017670509643357, "loss": 3.6743, "step": 15300 }, { "epoch": 1.6521364761597246, "grad_norm": 0.5648680925369263, "learning_rate": 0.0005014438099342743, "loss": 3.6877, "step": 15350 }, { "epoch": 1.6575180281993327, "grad_norm": 0.6184208989143372, "learning_rate": 0.0005011205689042129, "loss": 3.6477, "step": 15400 }, { "epoch": 1.6628995802389408, "grad_norm": 0.575512707233429, "learning_rate": 0.0005007973278741514, "loss": 3.6665, "step": 15450 }, { "epoch": 1.6682811322785491, "grad_norm": 0.5355293154716492, "learning_rate": 0.00050047408684409, "loss": 3.68, "step": 15500 }, { "epoch": 1.6736626843181575, "grad_norm": 0.5851496458053589, "learning_rate": 0.0005001508458140286, "loss": 3.6668, "step": 15550 }, { "epoch": 1.6790442363577656, "grad_norm": 0.5289725661277771, "learning_rate": 0.0004998276047839673, "loss": 3.6951, "step": 15600 }, { "epoch": 1.6844257883973737, "grad_norm": 0.6031738519668579, "learning_rate": 0.0004995043637539058, "loss": 3.6694, "step": 15650 }, { "epoch": 1.689807340436982, "grad_norm": 0.567891001701355, "learning_rate": 0.0004991811227238443, "loss": 3.6925, "step": 15700 }, { "epoch": 1.6951888924765903, "grad_norm": 0.524456262588501, "learning_rate": 0.0004988578816937829, "loss": 3.6733, "step": 15750 }, { "epoch": 1.7005704445161984, "grad_norm": 0.6047706604003906, "learning_rate": 0.0004985346406637215, "loss": 3.6716, "step": 15800 }, { "epoch": 1.7059519965558065, "grad_norm": 0.6086586117744446, "learning_rate": 0.0004982113996336602, "loss": 3.6652, "step": 15850 }, { "epoch": 1.7113335485954149, "grad_norm": 0.5751634836196899, "learning_rate": 0.0004978881586035987, "loss": 3.6706, "step": 15900 }, { "epoch": 1.7167151006350232, "grad_norm": 0.5148400664329529, "learning_rate": 0.0004975649175735373, "loss": 3.6692, "step": 15950 }, { "epoch": 1.7220966526746313, "grad_norm": 0.5664333701133728, "learning_rate": 0.0004972416765434759, "loss": 3.6646, "step": 16000 }, { "epoch": 1.7220966526746313, "eval_accuracy": 0.3574298637460191, "eval_loss": 3.618809461593628, "eval_runtime": 184.6411, "eval_samples_per_second": 97.546, "eval_steps_per_second": 6.098, "step": 16000 }, { "epoch": 1.7274782047142396, "grad_norm": 0.5310845971107483, "learning_rate": 0.0004969184355134145, "loss": 3.6796, "step": 16050 }, { "epoch": 1.732859756753848, "grad_norm": 0.5434026122093201, "learning_rate": 0.0004966016593039543, "loss": 3.6685, "step": 16100 }, { "epoch": 1.738241308793456, "grad_norm": 0.5860040783882141, "learning_rate": 0.0004962784182738928, "loss": 3.6586, "step": 16150 }, { "epoch": 1.7436228608330642, "grad_norm": 0.5336900949478149, "learning_rate": 0.0004959551772438314, "loss": 3.675, "step": 16200 }, { "epoch": 1.7490044128726725, "grad_norm": 0.5451406836509705, "learning_rate": 0.0004956319362137701, "loss": 3.6633, "step": 16250 }, { "epoch": 1.7543859649122808, "grad_norm": 0.5304573178291321, "learning_rate": 0.0004953086951837086, "loss": 3.6377, "step": 16300 }, { "epoch": 1.759767516951889, "grad_norm": 0.521602988243103, "learning_rate": 0.0004949854541536472, "loss": 3.6885, "step": 16350 }, { "epoch": 1.765149068991497, "grad_norm": 0.5319018363952637, "learning_rate": 0.0004946622131235857, "loss": 3.6523, "step": 16400 }, { "epoch": 1.7705306210311054, "grad_norm": 0.6209896802902222, "learning_rate": 0.0004943389720935244, "loss": 3.6639, "step": 16450 }, { "epoch": 1.7759121730707137, "grad_norm": 0.5356787443161011, "learning_rate": 0.000494015731063463, "loss": 3.6537, "step": 16500 }, { "epoch": 1.7812937251103218, "grad_norm": 0.5571577548980713, "learning_rate": 0.0004936924900334016, "loss": 3.6549, "step": 16550 }, { "epoch": 1.78667527714993, "grad_norm": 0.5454883575439453, "learning_rate": 0.0004933692490033401, "loss": 3.6586, "step": 16600 }, { "epoch": 1.7920568291895382, "grad_norm": 0.5444821715354919, "learning_rate": 0.0004930460079732786, "loss": 3.6702, "step": 16650 }, { "epoch": 1.7974383812291466, "grad_norm": 0.5578252673149109, "learning_rate": 0.0004927227669432173, "loss": 3.6416, "step": 16700 }, { "epoch": 1.8028199332687547, "grad_norm": 0.5404045581817627, "learning_rate": 0.0004923995259131558, "loss": 3.6455, "step": 16750 }, { "epoch": 1.8082014853083628, "grad_norm": 0.5403972268104553, "learning_rate": 0.0004920762848830945, "loss": 3.6536, "step": 16800 }, { "epoch": 1.813583037347971, "grad_norm": 0.6036391258239746, "learning_rate": 0.000491753043853033, "loss": 3.6643, "step": 16850 }, { "epoch": 1.8189645893875794, "grad_norm": 0.5716856122016907, "learning_rate": 0.0004914298028229717, "loss": 3.6373, "step": 16900 }, { "epoch": 1.8243461414271875, "grad_norm": 0.5589890480041504, "learning_rate": 0.0004911065617929102, "loss": 3.6553, "step": 16950 }, { "epoch": 1.8297276934667959, "grad_norm": 0.5614646673202515, "learning_rate": 0.0004907833207628487, "loss": 3.6434, "step": 17000 }, { "epoch": 1.8297276934667959, "eval_accuracy": 0.3586415607509262, "eval_loss": 3.6042091846466064, "eval_runtime": 184.6067, "eval_samples_per_second": 97.564, "eval_steps_per_second": 6.099, "step": 17000 }, { "epoch": 1.8351092455064042, "grad_norm": 0.5553935170173645, "learning_rate": 0.0004904600797327874, "loss": 3.6565, "step": 17050 }, { "epoch": 1.8404907975460123, "grad_norm": 0.5318084359169006, "learning_rate": 0.0004901368387027259, "loss": 3.6334, "step": 17100 }, { "epoch": 1.8458723495856204, "grad_norm": 0.5458089113235474, "learning_rate": 0.0004898135976726646, "loss": 3.6462, "step": 17150 }, { "epoch": 1.8512539016252287, "grad_norm": 0.5222024321556091, "learning_rate": 0.0004894903566426031, "loss": 3.6486, "step": 17200 }, { "epoch": 1.856635453664837, "grad_norm": 0.576395571231842, "learning_rate": 0.0004891671156125417, "loss": 3.6422, "step": 17250 }, { "epoch": 1.8620170057044452, "grad_norm": 0.5254101157188416, "learning_rate": 0.0004888438745824803, "loss": 3.6532, "step": 17300 }, { "epoch": 1.8673985577440533, "grad_norm": 0.580644965171814, "learning_rate": 0.0004885206335524189, "loss": 3.6338, "step": 17350 }, { "epoch": 1.8727801097836616, "grad_norm": 0.5833187699317932, "learning_rate": 0.0004881973925223575, "loss": 3.6468, "step": 17400 }, { "epoch": 1.87816166182327, "grad_norm": 0.5698775053024292, "learning_rate": 0.00048787415149229604, "loss": 3.6259, "step": 17450 }, { "epoch": 1.883543213862878, "grad_norm": 0.5299211740493774, "learning_rate": 0.00048755091046223464, "loss": 3.6593, "step": 17500 }, { "epoch": 1.8889247659024861, "grad_norm": 0.5283573865890503, "learning_rate": 0.0004872276694321732, "loss": 3.6396, "step": 17550 }, { "epoch": 1.8943063179420945, "grad_norm": 0.5362534523010254, "learning_rate": 0.00048690442840211177, "loss": 3.6368, "step": 17600 }, { "epoch": 1.8996878699817028, "grad_norm": 0.6080977916717529, "learning_rate": 0.0004865811873720504, "loss": 3.6426, "step": 17650 }, { "epoch": 1.905069422021311, "grad_norm": 0.5486099123954773, "learning_rate": 0.00048625794634198896, "loss": 3.6298, "step": 17700 }, { "epoch": 1.910450974060919, "grad_norm": 0.556928813457489, "learning_rate": 0.00048593470531192756, "loss": 3.6388, "step": 17750 }, { "epoch": 1.9158325261005273, "grad_norm": 0.5231663584709167, "learning_rate": 0.00048561146428186615, "loss": 3.6384, "step": 17800 }, { "epoch": 1.9212140781401357, "grad_norm": 0.5433329343795776, "learning_rate": 0.0004852882232518047, "loss": 3.6283, "step": 17850 }, { "epoch": 1.9265956301797438, "grad_norm": 0.5465906262397766, "learning_rate": 0.00048496498222174334, "loss": 3.6332, "step": 17900 }, { "epoch": 1.931977182219352, "grad_norm": 0.5724872946739197, "learning_rate": 0.00048464174119168193, "loss": 3.6462, "step": 17950 }, { "epoch": 1.9373587342589604, "grad_norm": 0.6602355241775513, "learning_rate": 0.0004843185001616205, "loss": 3.6473, "step": 18000 }, { "epoch": 1.9373587342589604, "eval_accuracy": 0.3605781898233315, "eval_loss": 3.587989091873169, "eval_runtime": 184.6487, "eval_samples_per_second": 97.542, "eval_steps_per_second": 6.098, "step": 18000 }, { "epoch": 1.9427402862985685, "grad_norm": 0.5606858134269714, "learning_rate": 0.00048399525913155907, "loss": 3.6358, "step": 18050 }, { "epoch": 1.9481218383381766, "grad_norm": 0.739033579826355, "learning_rate": 0.0004836720181014976, "loss": 3.6298, "step": 18100 }, { "epoch": 1.953503390377785, "grad_norm": 0.5382635593414307, "learning_rate": 0.0004833487770714362, "loss": 3.626, "step": 18150 }, { "epoch": 1.9588849424173933, "grad_norm": 0.5662149786949158, "learning_rate": 0.00048302553604137485, "loss": 3.6201, "step": 18200 }, { "epoch": 1.9642664944570014, "grad_norm": 0.5744447708129883, "learning_rate": 0.0004827087598319146, "loss": 3.6442, "step": 18250 }, { "epoch": 1.9696480464966095, "grad_norm": 0.5152103304862976, "learning_rate": 0.00048238551880185325, "loss": 3.64, "step": 18300 }, { "epoch": 1.9750295985362178, "grad_norm": 0.5561021566390991, "learning_rate": 0.0004820622777717918, "loss": 3.6278, "step": 18350 }, { "epoch": 1.9804111505758262, "grad_norm": 0.5693721771240234, "learning_rate": 0.0004817390367417304, "loss": 3.6472, "step": 18400 }, { "epoch": 1.9857927026154343, "grad_norm": 0.5853719115257263, "learning_rate": 0.00048141579571166893, "loss": 3.6207, "step": 18450 }, { "epoch": 1.9911742546550424, "grad_norm": 0.597014844417572, "learning_rate": 0.0004810925546816075, "loss": 3.6292, "step": 18500 }, { "epoch": 1.9965558066946507, "grad_norm": 0.5561902523040771, "learning_rate": 0.0004807693136515461, "loss": 3.648, "step": 18550 }, { "epoch": 2.001937358734259, "grad_norm": 0.5626040101051331, "learning_rate": 0.0004804460726214847, "loss": 3.5959, "step": 18600 }, { "epoch": 2.007318910773867, "grad_norm": 0.5984902381896973, "learning_rate": 0.0004801228315914233, "loss": 3.539, "step": 18650 }, { "epoch": 2.0127004628134753, "grad_norm": 0.5392407178878784, "learning_rate": 0.0004797995905613619, "loss": 3.531, "step": 18700 }, { "epoch": 2.018082014853084, "grad_norm": 0.6213756799697876, "learning_rate": 0.00047947634953130044, "loss": 3.5341, "step": 18750 }, { "epoch": 2.023463566892692, "grad_norm": 0.5914905667304993, "learning_rate": 0.00047915310850123904, "loss": 3.5401, "step": 18800 }, { "epoch": 2.0288451189323, "grad_norm": 0.5456693172454834, "learning_rate": 0.0004788298674711777, "loss": 3.5331, "step": 18850 }, { "epoch": 2.034226670971908, "grad_norm": 0.5378903150558472, "learning_rate": 0.00047850662644111623, "loss": 3.5555, "step": 18900 }, { "epoch": 2.0396082230115167, "grad_norm": 0.5830462574958801, "learning_rate": 0.0004781833854110548, "loss": 3.532, "step": 18950 }, { "epoch": 2.044989775051125, "grad_norm": 0.5927799940109253, "learning_rate": 0.00047786014438099336, "loss": 3.5583, "step": 19000 }, { "epoch": 2.044989775051125, "eval_accuracy": 0.3620152329175658, "eval_loss": 3.5778863430023193, "eval_runtime": 184.4238, "eval_samples_per_second": 97.661, "eval_steps_per_second": 6.106, "step": 19000 }, { "epoch": 2.050371327090733, "grad_norm": 0.6135937571525574, "learning_rate": 0.00047753690335093196, "loss": 3.5692, "step": 19050 }, { "epoch": 2.055752879130341, "grad_norm": 0.5517251491546631, "learning_rate": 0.00047721366232087055, "loss": 3.5511, "step": 19100 }, { "epoch": 2.0611344311699495, "grad_norm": 0.5605584383010864, "learning_rate": 0.00047689042129080915, "loss": 3.5399, "step": 19150 }, { "epoch": 2.0665159832095576, "grad_norm": 0.54792720079422, "learning_rate": 0.00047656718026074774, "loss": 3.5455, "step": 19200 }, { "epoch": 2.0718975352491658, "grad_norm": 0.594642162322998, "learning_rate": 0.00047624393923068634, "loss": 3.5548, "step": 19250 }, { "epoch": 2.0772790872887743, "grad_norm": 0.5518810153007507, "learning_rate": 0.0004759206982006249, "loss": 3.5538, "step": 19300 }, { "epoch": 2.0826606393283824, "grad_norm": 0.5460236072540283, "learning_rate": 0.00047559745717056347, "loss": 3.5546, "step": 19350 }, { "epoch": 2.0880421913679905, "grad_norm": 0.5818831920623779, "learning_rate": 0.000475274216140502, "loss": 3.5441, "step": 19400 }, { "epoch": 2.0934237434075986, "grad_norm": 0.5418691635131836, "learning_rate": 0.00047495097511044066, "loss": 3.552, "step": 19450 }, { "epoch": 2.098805295447207, "grad_norm": 0.6234689354896545, "learning_rate": 0.00047462773408037925, "loss": 3.5575, "step": 19500 }, { "epoch": 2.1041868474868153, "grad_norm": 0.5577658414840698, "learning_rate": 0.0004743044930503178, "loss": 3.5691, "step": 19550 }, { "epoch": 2.1095683995264234, "grad_norm": 0.5367577075958252, "learning_rate": 0.0004739812520202564, "loss": 3.5516, "step": 19600 }, { "epoch": 2.1149499515660315, "grad_norm": 0.5605822205543518, "learning_rate": 0.000473658010990195, "loss": 3.5661, "step": 19650 }, { "epoch": 2.12033150360564, "grad_norm": 0.5611740946769714, "learning_rate": 0.0004733347699601336, "loss": 3.5445, "step": 19700 }, { "epoch": 2.125713055645248, "grad_norm": 0.5580589771270752, "learning_rate": 0.0004730115289300722, "loss": 3.5638, "step": 19750 }, { "epoch": 2.1310946076848563, "grad_norm": 0.5551881790161133, "learning_rate": 0.00047268828790001077, "loss": 3.5791, "step": 19800 }, { "epoch": 2.1364761597244644, "grad_norm": 0.5823877453804016, "learning_rate": 0.0004723650468699493, "loss": 3.5641, "step": 19850 }, { "epoch": 2.141857711764073, "grad_norm": 0.5327476263046265, "learning_rate": 0.0004720418058398879, "loss": 3.5788, "step": 19900 }, { "epoch": 2.147239263803681, "grad_norm": 0.5645259618759155, "learning_rate": 0.00047171856480982644, "loss": 3.5569, "step": 19950 }, { "epoch": 2.152620815843289, "grad_norm": 0.6214436292648315, "learning_rate": 0.0004713953237797651, "loss": 3.5672, "step": 20000 }, { "epoch": 2.152620815843289, "eval_accuracy": 0.3629375872822229, "eval_loss": 3.5702061653137207, "eval_runtime": 184.4838, "eval_samples_per_second": 97.629, "eval_steps_per_second": 6.104, "step": 20000 }, { "epoch": 2.1580023678828972, "grad_norm": 0.6375493407249451, "learning_rate": 0.0004710720827497037, "loss": 3.5479, "step": 20050 }, { "epoch": 2.163383919922506, "grad_norm": 0.5471000671386719, "learning_rate": 0.00047074884171964223, "loss": 3.5683, "step": 20100 }, { "epoch": 2.168765471962114, "grad_norm": 0.6272624135017395, "learning_rate": 0.0004704256006895808, "loss": 3.5552, "step": 20150 }, { "epoch": 2.174147024001722, "grad_norm": 0.5281191468238831, "learning_rate": 0.00047010235965951936, "loss": 3.5606, "step": 20200 }, { "epoch": 2.1795285760413305, "grad_norm": 0.5559777021408081, "learning_rate": 0.00046977911862945796, "loss": 3.5424, "step": 20250 }, { "epoch": 2.1849101280809387, "grad_norm": 0.5731501579284668, "learning_rate": 0.0004694558775993966, "loss": 3.5593, "step": 20300 }, { "epoch": 2.1902916801205468, "grad_norm": 0.5684100389480591, "learning_rate": 0.00046913263656933515, "loss": 3.5506, "step": 20350 }, { "epoch": 2.195673232160155, "grad_norm": 0.627325177192688, "learning_rate": 0.00046880939553927374, "loss": 3.5475, "step": 20400 }, { "epoch": 2.2010547841997634, "grad_norm": 0.5675097703933716, "learning_rate": 0.00046848615450921234, "loss": 3.5461, "step": 20450 }, { "epoch": 2.2064363362393715, "grad_norm": 0.5912321209907532, "learning_rate": 0.0004681629134791509, "loss": 3.5582, "step": 20500 }, { "epoch": 2.2118178882789796, "grad_norm": 0.5270475149154663, "learning_rate": 0.00046783967244908947, "loss": 3.575, "step": 20550 }, { "epoch": 2.2171994403185877, "grad_norm": 0.5744220614433289, "learning_rate": 0.0004675164314190281, "loss": 3.559, "step": 20600 }, { "epoch": 2.2225809923581963, "grad_norm": 0.5762168765068054, "learning_rate": 0.00046719319038896666, "loss": 3.5585, "step": 20650 }, { "epoch": 2.2279625443978044, "grad_norm": 0.6091347932815552, "learning_rate": 0.00046686994935890526, "loss": 3.5631, "step": 20700 }, { "epoch": 2.2333440964374125, "grad_norm": 0.5728710889816284, "learning_rate": 0.0004665467083288438, "loss": 3.5612, "step": 20750 }, { "epoch": 2.2387256484770206, "grad_norm": 0.5855892896652222, "learning_rate": 0.0004662234672987824, "loss": 3.5564, "step": 20800 }, { "epoch": 2.244107200516629, "grad_norm": 0.5865923166275024, "learning_rate": 0.0004659066910893222, "loss": 3.5502, "step": 20850 }, { "epoch": 2.2494887525562373, "grad_norm": 0.5471289753913879, "learning_rate": 0.0004655834500592608, "loss": 3.5732, "step": 20900 }, { "epoch": 2.2548703045958454, "grad_norm": 0.5742563009262085, "learning_rate": 0.00046526020902919944, "loss": 3.5591, "step": 20950 }, { "epoch": 2.2602518566354535, "grad_norm": 0.5915543437004089, "learning_rate": 0.000464936967999138, "loss": 3.5534, "step": 21000 }, { "epoch": 2.2602518566354535, "eval_accuracy": 0.3639646830307332, "eval_loss": 3.5602052211761475, "eval_runtime": 184.9594, "eval_samples_per_second": 97.378, "eval_steps_per_second": 6.088, "step": 21000 }, { "epoch": 2.265633408675062, "grad_norm": 0.6229991316795349, "learning_rate": 0.0004646137269690766, "loss": 3.5646, "step": 21050 }, { "epoch": 2.27101496071467, "grad_norm": 0.5470733642578125, "learning_rate": 0.00046429048593901517, "loss": 3.5405, "step": 21100 }, { "epoch": 2.2763965127542782, "grad_norm": 0.5559133291244507, "learning_rate": 0.0004639672449089537, "loss": 3.5681, "step": 21150 }, { "epoch": 2.281778064793887, "grad_norm": 0.586371123790741, "learning_rate": 0.0004636440038788923, "loss": 3.5481, "step": 21200 }, { "epoch": 2.287159616833495, "grad_norm": 0.5637801885604858, "learning_rate": 0.00046332076284883095, "loss": 3.5569, "step": 21250 }, { "epoch": 2.292541168873103, "grad_norm": 0.6041868925094604, "learning_rate": 0.0004629975218187695, "loss": 3.5539, "step": 21300 }, { "epoch": 2.297922720912711, "grad_norm": 0.5667281746864319, "learning_rate": 0.0004626742807887081, "loss": 3.5323, "step": 21350 }, { "epoch": 2.303304272952319, "grad_norm": 0.5741965770721436, "learning_rate": 0.00046235103975864663, "loss": 3.5524, "step": 21400 }, { "epoch": 2.3086858249919278, "grad_norm": 0.5632966160774231, "learning_rate": 0.0004620277987285852, "loss": 3.5519, "step": 21450 }, { "epoch": 2.314067377031536, "grad_norm": 0.5545016527175903, "learning_rate": 0.0004617045576985239, "loss": 3.5487, "step": 21500 }, { "epoch": 2.319448929071144, "grad_norm": 0.5945175290107727, "learning_rate": 0.0004613813166684624, "loss": 3.5649, "step": 21550 }, { "epoch": 2.3248304811107525, "grad_norm": 0.5715595483779907, "learning_rate": 0.000461058075638401, "loss": 3.5639, "step": 21600 }, { "epoch": 2.3302120331503606, "grad_norm": 0.6180649399757385, "learning_rate": 0.00046073483460833955, "loss": 3.5566, "step": 21650 }, { "epoch": 2.3355935851899687, "grad_norm": 0.5411754250526428, "learning_rate": 0.00046041159357827814, "loss": 3.5633, "step": 21700 }, { "epoch": 2.340975137229577, "grad_norm": 0.5503979325294495, "learning_rate": 0.00046008835254821674, "loss": 3.5429, "step": 21750 }, { "epoch": 2.3463566892691854, "grad_norm": 0.5738011002540588, "learning_rate": 0.0004597651115181554, "loss": 3.5402, "step": 21800 }, { "epoch": 2.3517382413087935, "grad_norm": 0.5915544033050537, "learning_rate": 0.00045944187048809393, "loss": 3.5577, "step": 21850 }, { "epoch": 2.3571197933484016, "grad_norm": 0.6151456236839294, "learning_rate": 0.0004591186294580325, "loss": 3.5619, "step": 21900 }, { "epoch": 2.3625013453880097, "grad_norm": 0.5948041081428528, "learning_rate": 0.00045879538842797106, "loss": 3.5498, "step": 21950 }, { "epoch": 2.3678828974276183, "grad_norm": 0.597334086894989, "learning_rate": 0.00045847214739790966, "loss": 3.5551, "step": 22000 }, { "epoch": 2.3678828974276183, "eval_accuracy": 0.3649270216581061, "eval_loss": 3.5502870082855225, "eval_runtime": 186.0038, "eval_samples_per_second": 96.831, "eval_steps_per_second": 6.054, "step": 22000 }, { "epoch": 2.3732644494672264, "grad_norm": 0.556607186794281, "learning_rate": 0.0004581489063678482, "loss": 3.5555, "step": 22050 }, { "epoch": 2.3786460015068345, "grad_norm": 0.6205542087554932, "learning_rate": 0.00045782566533778685, "loss": 3.5576, "step": 22100 }, { "epoch": 2.384027553546443, "grad_norm": 0.5172244906425476, "learning_rate": 0.00045750242430772544, "loss": 3.5658, "step": 22150 }, { "epoch": 2.389409105586051, "grad_norm": 0.5946823358535767, "learning_rate": 0.000457179183277664, "loss": 3.5521, "step": 22200 }, { "epoch": 2.3947906576256592, "grad_norm": 0.600932240486145, "learning_rate": 0.0004568559422476026, "loss": 3.5612, "step": 22250 }, { "epoch": 2.4001722096652673, "grad_norm": 0.6086694598197937, "learning_rate": 0.00045653270121754117, "loss": 3.5581, "step": 22300 }, { "epoch": 2.4055537617048754, "grad_norm": 0.6031166911125183, "learning_rate": 0.0004562094601874797, "loss": 3.5667, "step": 22350 }, { "epoch": 2.410935313744484, "grad_norm": 0.5419747233390808, "learning_rate": 0.00045589268397801957, "loss": 3.5316, "step": 22400 }, { "epoch": 2.416316865784092, "grad_norm": 0.5568996667861938, "learning_rate": 0.00045556944294795817, "loss": 3.5659, "step": 22450 }, { "epoch": 2.4216984178237, "grad_norm": 0.5609857439994812, "learning_rate": 0.00045524620191789676, "loss": 3.5513, "step": 22500 }, { "epoch": 2.4270799698633088, "grad_norm": 0.5434156656265259, "learning_rate": 0.00045492296088783536, "loss": 3.5403, "step": 22550 }, { "epoch": 2.432461521902917, "grad_norm": 0.5359591841697693, "learning_rate": 0.0004545997198577739, "loss": 3.5464, "step": 22600 }, { "epoch": 2.437843073942525, "grad_norm": 0.6035705208778381, "learning_rate": 0.0004542764788277125, "loss": 3.5432, "step": 22650 }, { "epoch": 2.443224625982133, "grad_norm": 0.5715034008026123, "learning_rate": 0.00045395323779765103, "loss": 3.5508, "step": 22700 }, { "epoch": 2.4486061780217416, "grad_norm": 0.5985683798789978, "learning_rate": 0.0004536299967675897, "loss": 3.5508, "step": 22750 }, { "epoch": 2.4539877300613497, "grad_norm": 0.5577922463417053, "learning_rate": 0.0004533067557375283, "loss": 3.5565, "step": 22800 }, { "epoch": 2.459369282100958, "grad_norm": 0.6181262731552124, "learning_rate": 0.0004529835147074668, "loss": 3.5475, "step": 22850 }, { "epoch": 2.464750834140566, "grad_norm": 0.5955373644828796, "learning_rate": 0.0004526602736774054, "loss": 3.5359, "step": 22900 }, { "epoch": 2.4701323861801745, "grad_norm": 0.5582594275474548, "learning_rate": 0.00045233703264734395, "loss": 3.5376, "step": 22950 }, { "epoch": 2.4755139382197826, "grad_norm": 0.5990350246429443, "learning_rate": 0.00045201379161728255, "loss": 3.5504, "step": 23000 }, { "epoch": 2.4755139382197826, "eval_accuracy": 0.36597867295926245, "eval_loss": 3.541978597640991, "eval_runtime": 184.7351, "eval_samples_per_second": 97.496, "eval_steps_per_second": 6.095, "step": 23000 }, { "epoch": 2.4808954902593907, "grad_norm": 0.5420169234275818, "learning_rate": 0.0004516905505872212, "loss": 3.5606, "step": 23050 }, { "epoch": 2.4862770422989993, "grad_norm": 0.5671257972717285, "learning_rate": 0.00045136730955715973, "loss": 3.548, "step": 23100 }, { "epoch": 2.4916585943386074, "grad_norm": 0.6008259057998657, "learning_rate": 0.00045104406852709833, "loss": 3.5426, "step": 23150 }, { "epoch": 2.4970401463782155, "grad_norm": 0.5630355477333069, "learning_rate": 0.0004507208274970369, "loss": 3.5474, "step": 23200 }, { "epoch": 2.5024216984178236, "grad_norm": 0.6546486020088196, "learning_rate": 0.00045039758646697546, "loss": 3.5549, "step": 23250 }, { "epoch": 2.5078032504574317, "grad_norm": 0.5610313415527344, "learning_rate": 0.0004500743454369141, "loss": 3.5606, "step": 23300 }, { "epoch": 2.5131848024970402, "grad_norm": 0.6008824706077576, "learning_rate": 0.0004497511044068527, "loss": 3.554, "step": 23350 }, { "epoch": 2.5185663545366483, "grad_norm": 0.5663953423500061, "learning_rate": 0.00044942786337679125, "loss": 3.5343, "step": 23400 }, { "epoch": 2.5239479065762565, "grad_norm": 0.5768746733665466, "learning_rate": 0.00044910462234672984, "loss": 3.5481, "step": 23450 }, { "epoch": 2.529329458615865, "grad_norm": 0.5716328620910645, "learning_rate": 0.0004487813813166684, "loss": 3.5617, "step": 23500 }, { "epoch": 2.534711010655473, "grad_norm": 0.5698726773262024, "learning_rate": 0.000448458140286607, "loss": 3.5434, "step": 23550 }, { "epoch": 2.540092562695081, "grad_norm": 0.6069528460502625, "learning_rate": 0.00044813489925654563, "loss": 3.55, "step": 23600 }, { "epoch": 2.5454741147346893, "grad_norm": 0.5537393689155579, "learning_rate": 0.00044781165822648417, "loss": 3.5412, "step": 23650 }, { "epoch": 2.550855666774298, "grad_norm": 0.6079427003860474, "learning_rate": 0.00044748841719642276, "loss": 3.55, "step": 23700 }, { "epoch": 2.556237218813906, "grad_norm": 0.5689899921417236, "learning_rate": 0.00044716517616636136, "loss": 3.5378, "step": 23750 }, { "epoch": 2.561618770853514, "grad_norm": 0.5739214420318604, "learning_rate": 0.0004468419351362999, "loss": 3.5636, "step": 23800 }, { "epoch": 2.567000322893122, "grad_norm": 0.581843912601471, "learning_rate": 0.0004465186941062385, "loss": 3.5358, "step": 23850 }, { "epoch": 2.5723818749327307, "grad_norm": 0.5732669234275818, "learning_rate": 0.00044619545307617714, "loss": 3.5419, "step": 23900 }, { "epoch": 2.577763426972339, "grad_norm": 0.572491466999054, "learning_rate": 0.0004458722120461157, "loss": 3.5357, "step": 23950 }, { "epoch": 2.583144979011947, "grad_norm": 0.5753400325775146, "learning_rate": 0.0004455489710160543, "loss": 3.5292, "step": 24000 }, { "epoch": 2.583144979011947, "eval_accuracy": 0.36699979279894296, "eval_loss": 3.5284852981567383, "eval_runtime": 184.7189, "eval_samples_per_second": 97.505, "eval_steps_per_second": 6.096, "step": 24000 }, { "epoch": 2.5885265310515555, "grad_norm": 0.5776293873786926, "learning_rate": 0.0004452257299859928, "loss": 3.5573, "step": 24050 }, { "epoch": 2.5939080830911636, "grad_norm": 0.5396984815597534, "learning_rate": 0.0004449024889559314, "loss": 3.5348, "step": 24100 }, { "epoch": 2.5992896351307717, "grad_norm": 0.5629790425300598, "learning_rate": 0.00044457924792587, "loss": 3.5311, "step": 24150 }, { "epoch": 2.60467118717038, "grad_norm": 0.6055911779403687, "learning_rate": 0.0004442560068958086, "loss": 3.5521, "step": 24200 }, { "epoch": 2.610052739209988, "grad_norm": 0.6158533692359924, "learning_rate": 0.0004439327658657472, "loss": 3.5443, "step": 24250 }, { "epoch": 2.6154342912495965, "grad_norm": 0.5898953080177307, "learning_rate": 0.0004436095248356858, "loss": 3.5279, "step": 24300 }, { "epoch": 2.6208158432892046, "grad_norm": 0.6070363521575928, "learning_rate": 0.00044328628380562433, "loss": 3.5328, "step": 24350 }, { "epoch": 2.6261973953288127, "grad_norm": 0.6255342364311218, "learning_rate": 0.0004429630427755629, "loss": 3.5308, "step": 24400 }, { "epoch": 2.6315789473684212, "grad_norm": 0.5635362863540649, "learning_rate": 0.00044264626656610273, "loss": 3.5503, "step": 24450 }, { "epoch": 2.6369604994080293, "grad_norm": 0.5545345544815063, "learning_rate": 0.0004423230255360413, "loss": 3.5512, "step": 24500 }, { "epoch": 2.6423420514476375, "grad_norm": 0.5683971047401428, "learning_rate": 0.00044199978450598, "loss": 3.5455, "step": 24550 }, { "epoch": 2.6477236034872456, "grad_norm": 0.5963163375854492, "learning_rate": 0.0004416765434759185, "loss": 3.5487, "step": 24600 }, { "epoch": 2.653105155526854, "grad_norm": 0.5661613941192627, "learning_rate": 0.0004413533024458571, "loss": 3.5521, "step": 24650 }, { "epoch": 2.658486707566462, "grad_norm": 0.5891050100326538, "learning_rate": 0.00044103006141579565, "loss": 3.5376, "step": 24700 }, { "epoch": 2.6638682596060703, "grad_norm": 0.6227092146873474, "learning_rate": 0.00044070682038573425, "loss": 3.5448, "step": 24750 }, { "epoch": 2.6692498116456784, "grad_norm": 0.602344810962677, "learning_rate": 0.0004403835793556728, "loss": 3.5597, "step": 24800 }, { "epoch": 2.674631363685287, "grad_norm": 0.5948516726493835, "learning_rate": 0.00044006033832561143, "loss": 3.5432, "step": 24850 }, { "epoch": 2.680012915724895, "grad_norm": 0.562466561794281, "learning_rate": 0.00043973709729555003, "loss": 3.5447, "step": 24900 }, { "epoch": 2.685394467764503, "grad_norm": 0.5580047369003296, "learning_rate": 0.00043941385626548857, "loss": 3.5319, "step": 24950 }, { "epoch": 2.6907760198041117, "grad_norm": 0.6161678433418274, "learning_rate": 0.00043909061523542716, "loss": 3.548, "step": 25000 }, { "epoch": 2.6907760198041117, "eval_accuracy": 0.3677566688154586, "eval_loss": 3.519853353500366, "eval_runtime": 184.6274, "eval_samples_per_second": 97.553, "eval_steps_per_second": 6.099, "step": 25000 }, { "epoch": 2.69615757184372, "grad_norm": 0.5613181591033936, "learning_rate": 0.00043876737420536576, "loss": 3.5378, "step": 25050 }, { "epoch": 2.701539123883328, "grad_norm": 0.5858383178710938, "learning_rate": 0.00043844413317530435, "loss": 3.543, "step": 25100 }, { "epoch": 2.706920675922936, "grad_norm": 0.584028959274292, "learning_rate": 0.00043812089214524295, "loss": 3.5279, "step": 25150 }, { "epoch": 2.712302227962544, "grad_norm": 0.6211503744125366, "learning_rate": 0.00043779765111518154, "loss": 3.542, "step": 25200 }, { "epoch": 2.7176837800021527, "grad_norm": 0.6113668084144592, "learning_rate": 0.0004374744100851201, "loss": 3.5454, "step": 25250 }, { "epoch": 2.723065332041761, "grad_norm": 0.6018054485321045, "learning_rate": 0.0004371511690550587, "loss": 3.5472, "step": 25300 }, { "epoch": 2.728446884081369, "grad_norm": 0.581635057926178, "learning_rate": 0.0004368279280249972, "loss": 3.5521, "step": 25350 }, { "epoch": 2.7338284361209775, "grad_norm": 0.6109009385108948, "learning_rate": 0.00043650468699493587, "loss": 3.5456, "step": 25400 }, { "epoch": 2.7392099881605856, "grad_norm": 0.6068916320800781, "learning_rate": 0.00043618144596487446, "loss": 3.5174, "step": 25450 }, { "epoch": 2.7445915402001937, "grad_norm": 0.5705057978630066, "learning_rate": 0.000435858204934813, "loss": 3.5223, "step": 25500 }, { "epoch": 2.749973092239802, "grad_norm": 0.5605146288871765, "learning_rate": 0.0004355349639047516, "loss": 3.5414, "step": 25550 }, { "epoch": 2.7553546442794103, "grad_norm": 0.5750584602355957, "learning_rate": 0.0004352117228746902, "loss": 3.5424, "step": 25600 }, { "epoch": 2.7607361963190185, "grad_norm": 0.58180171251297, "learning_rate": 0.00043488848184462873, "loss": 3.5355, "step": 25650 }, { "epoch": 2.7661177483586266, "grad_norm": 0.6068442463874817, "learning_rate": 0.0004345652408145674, "loss": 3.5318, "step": 25700 }, { "epoch": 2.7714993003982347, "grad_norm": 0.6122245192527771, "learning_rate": 0.000434241999784506, "loss": 3.5256, "step": 25750 }, { "epoch": 2.776880852437843, "grad_norm": 0.5642405152320862, "learning_rate": 0.0004339187587544445, "loss": 3.5298, "step": 25800 }, { "epoch": 2.7822624044774513, "grad_norm": 0.604735791683197, "learning_rate": 0.0004335955177243831, "loss": 3.5435, "step": 25850 }, { "epoch": 2.7876439565170594, "grad_norm": 0.5870072841644287, "learning_rate": 0.00043327227669432165, "loss": 3.5304, "step": 25900 }, { "epoch": 2.793025508556668, "grad_norm": 0.5905956029891968, "learning_rate": 0.00043294903566426025, "loss": 3.5234, "step": 25950 }, { "epoch": 2.798407060596276, "grad_norm": 0.5809804201126099, "learning_rate": 0.0004326257946341989, "loss": 3.5277, "step": 26000 }, { "epoch": 2.798407060596276, "eval_accuracy": 0.368913713417795, "eval_loss": 3.5110578536987305, "eval_runtime": 184.5749, "eval_samples_per_second": 97.581, "eval_steps_per_second": 6.101, "step": 26000 }, { "epoch": 2.803788612635884, "grad_norm": 0.5987585186958313, "learning_rate": 0.00043230255360413744, "loss": 3.5359, "step": 26050 }, { "epoch": 2.8091701646754923, "grad_norm": 0.6006620526313782, "learning_rate": 0.00043197931257407603, "loss": 3.534, "step": 26100 }, { "epoch": 2.8145517167151004, "grad_norm": 0.5650146007537842, "learning_rate": 0.0004316560715440146, "loss": 3.5355, "step": 26150 }, { "epoch": 2.819933268754709, "grad_norm": 0.6045055985450745, "learning_rate": 0.00043133283051395317, "loss": 3.5055, "step": 26200 }, { "epoch": 2.825314820794317, "grad_norm": 0.5808995962142944, "learning_rate": 0.0004310095894838918, "loss": 3.5506, "step": 26250 }, { "epoch": 2.830696372833925, "grad_norm": 0.6251314878463745, "learning_rate": 0.0004306863484538304, "loss": 3.5199, "step": 26300 }, { "epoch": 2.8360779248735337, "grad_norm": 0.5796584486961365, "learning_rate": 0.00043036310742376895, "loss": 3.5251, "step": 26350 }, { "epoch": 2.841459476913142, "grad_norm": 0.5469850301742554, "learning_rate": 0.00043003986639370754, "loss": 3.524, "step": 26400 }, { "epoch": 2.84684102895275, "grad_norm": 0.5464411377906799, "learning_rate": 0.0004297166253636461, "loss": 3.5052, "step": 26450 }, { "epoch": 2.852222580992358, "grad_norm": 0.6001213788986206, "learning_rate": 0.0004293933843335847, "loss": 3.5216, "step": 26500 }, { "epoch": 2.857604133031966, "grad_norm": 0.6024080514907837, "learning_rate": 0.00042907014330352333, "loss": 3.5236, "step": 26550 }, { "epoch": 2.8629856850715747, "grad_norm": 0.5663939118385315, "learning_rate": 0.00042874690227346187, "loss": 3.5461, "step": 26600 }, { "epoch": 2.868367237111183, "grad_norm": 0.5653455853462219, "learning_rate": 0.00042842366124340046, "loss": 3.5291, "step": 26650 }, { "epoch": 2.873748789150791, "grad_norm": 0.5584545135498047, "learning_rate": 0.00042810042021333906, "loss": 3.5283, "step": 26700 }, { "epoch": 2.8791303411903995, "grad_norm": 0.5881627798080444, "learning_rate": 0.00042778364400387886, "loss": 3.5353, "step": 26750 }, { "epoch": 2.8845118932300076, "grad_norm": 0.5636775493621826, "learning_rate": 0.0004274604029738174, "loss": 3.5406, "step": 26800 }, { "epoch": 2.8898934452696157, "grad_norm": 0.6478109359741211, "learning_rate": 0.000427137161943756, "loss": 3.5313, "step": 26850 }, { "epoch": 2.895274997309224, "grad_norm": 0.6400372385978699, "learning_rate": 0.00042681392091369465, "loss": 3.5297, "step": 26900 }, { "epoch": 2.9006565493488323, "grad_norm": 0.6088976263999939, "learning_rate": 0.0004264906798836332, "loss": 3.5215, "step": 26950 }, { "epoch": 2.9060381013884404, "grad_norm": 0.6216518878936768, "learning_rate": 0.0004261674388535718, "loss": 3.5187, "step": 27000 }, { "epoch": 2.9060381013884404, "eval_accuracy": 0.3694821853268404, "eval_loss": 3.503814697265625, "eval_runtime": 184.8292, "eval_samples_per_second": 97.447, "eval_steps_per_second": 6.092, "step": 27000 }, { "epoch": 2.9114196534280485, "grad_norm": 0.5948754549026489, "learning_rate": 0.0004258441978235104, "loss": 3.5347, "step": 27050 }, { "epoch": 2.9168012054676566, "grad_norm": 0.5998795032501221, "learning_rate": 0.0004255209567934489, "loss": 3.5336, "step": 27100 }, { "epoch": 2.922182757507265, "grad_norm": 0.6143434643745422, "learning_rate": 0.0004251977157633875, "loss": 3.5265, "step": 27150 }, { "epoch": 2.9275643095468733, "grad_norm": 0.590839147567749, "learning_rate": 0.00042487447473332616, "loss": 3.5134, "step": 27200 }, { "epoch": 2.9329458615864814, "grad_norm": 0.5512373447418213, "learning_rate": 0.0004245512337032647, "loss": 3.5192, "step": 27250 }, { "epoch": 2.93832741362609, "grad_norm": 0.636211633682251, "learning_rate": 0.0004242279926732033, "loss": 3.527, "step": 27300 }, { "epoch": 2.943708965665698, "grad_norm": 0.5445491671562195, "learning_rate": 0.00042390475164314184, "loss": 3.5407, "step": 27350 }, { "epoch": 2.949090517705306, "grad_norm": 0.5808195471763611, "learning_rate": 0.00042358151061308043, "loss": 3.5187, "step": 27400 }, { "epoch": 2.9544720697449143, "grad_norm": 0.5917678475379944, "learning_rate": 0.000423258269583019, "loss": 3.5374, "step": 27450 }, { "epoch": 2.9598536217845224, "grad_norm": 0.6088602542877197, "learning_rate": 0.0004229350285529576, "loss": 3.5279, "step": 27500 }, { "epoch": 2.965235173824131, "grad_norm": 0.6586233377456665, "learning_rate": 0.0004226117875228962, "loss": 3.5174, "step": 27550 }, { "epoch": 2.970616725863739, "grad_norm": 0.5706504583358765, "learning_rate": 0.0004222885464928348, "loss": 3.5181, "step": 27600 }, { "epoch": 2.975998277903347, "grad_norm": 0.5670300722122192, "learning_rate": 0.00042196530546277335, "loss": 3.539, "step": 27650 }, { "epoch": 2.9813798299429557, "grad_norm": 0.5491836667060852, "learning_rate": 0.00042164206443271195, "loss": 3.5378, "step": 27700 }, { "epoch": 2.986761381982564, "grad_norm": 0.607046902179718, "learning_rate": 0.0004213188234026505, "loss": 3.5157, "step": 27750 }, { "epoch": 2.992142934022172, "grad_norm": 0.5921494364738464, "learning_rate": 0.00042099558237258914, "loss": 3.5087, "step": 27800 }, { "epoch": 2.9975244860617805, "grad_norm": 0.5658209323883057, "learning_rate": 0.00042067234134252773, "loss": 3.5393, "step": 27850 }, { "epoch": 3.0029060381013886, "grad_norm": 0.5574458837509155, "learning_rate": 0.00042034910031246627, "loss": 3.4678, "step": 27900 }, { "epoch": 3.0082875901409967, "grad_norm": 0.6002983450889587, "learning_rate": 0.00042002585928240486, "loss": 3.4162, "step": 27950 }, { "epoch": 3.0136691421806048, "grad_norm": 0.5765467286109924, "learning_rate": 0.00041970261825234346, "loss": 3.4414, "step": 28000 }, { "epoch": 3.0136691421806048, "eval_accuracy": 0.37054198559458285, "eval_loss": 3.4985103607177734, "eval_runtime": 184.6194, "eval_samples_per_second": 97.557, "eval_steps_per_second": 6.099, "step": 28000 }, { "epoch": 3.0190506942202133, "grad_norm": 0.6787931323051453, "learning_rate": 0.00041937937722228205, "loss": 3.4392, "step": 28050 }, { "epoch": 3.0244322462598214, "grad_norm": 0.5360280871391296, "learning_rate": 0.00041905613619222065, "loss": 3.4451, "step": 28100 }, { "epoch": 3.0298137982994295, "grad_norm": 0.6370947957038879, "learning_rate": 0.00041873289516215924, "loss": 3.4504, "step": 28150 }, { "epoch": 3.0351953503390376, "grad_norm": 0.6381440162658691, "learning_rate": 0.0004184096541320978, "loss": 3.4473, "step": 28200 }, { "epoch": 3.040576902378646, "grad_norm": 0.5641018152236938, "learning_rate": 0.0004180864131020364, "loss": 3.4285, "step": 28250 }, { "epoch": 3.0459584544182543, "grad_norm": 0.573603630065918, "learning_rate": 0.0004177631720719749, "loss": 3.4438, "step": 28300 }, { "epoch": 3.0513400064578624, "grad_norm": 0.635581374168396, "learning_rate": 0.00041743993104191357, "loss": 3.4441, "step": 28350 }, { "epoch": 3.0567215584974705, "grad_norm": 0.6056490540504456, "learning_rate": 0.00041711669001185216, "loss": 3.4447, "step": 28400 }, { "epoch": 3.062103110537079, "grad_norm": 0.6397773623466492, "learning_rate": 0.0004167934489817907, "loss": 3.4359, "step": 28450 }, { "epoch": 3.067484662576687, "grad_norm": 0.570362389087677, "learning_rate": 0.0004164702079517293, "loss": 3.4428, "step": 28500 }, { "epoch": 3.0728662146162953, "grad_norm": 0.6196599006652832, "learning_rate": 0.0004161469669216679, "loss": 3.4492, "step": 28550 }, { "epoch": 3.0782477666559034, "grad_norm": 0.5860263705253601, "learning_rate": 0.00041582372589160643, "loss": 3.4482, "step": 28600 }, { "epoch": 3.083629318695512, "grad_norm": 0.6199707388877869, "learning_rate": 0.0004155004848615451, "loss": 3.451, "step": 28650 }, { "epoch": 3.08901087073512, "grad_norm": 0.6375261545181274, "learning_rate": 0.0004151772438314837, "loss": 3.4516, "step": 28700 }, { "epoch": 3.094392422774728, "grad_norm": 0.5758550763130188, "learning_rate": 0.0004148540028014222, "loss": 3.4535, "step": 28750 }, { "epoch": 3.0997739748143363, "grad_norm": 0.6565601825714111, "learning_rate": 0.0004145307617713608, "loss": 3.4571, "step": 28800 }, { "epoch": 3.105155526853945, "grad_norm": 0.5880964994430542, "learning_rate": 0.00041420752074129935, "loss": 3.432, "step": 28850 }, { "epoch": 3.110537078893553, "grad_norm": 0.585135281085968, "learning_rate": 0.000413884279711238, "loss": 3.4393, "step": 28900 }, { "epoch": 3.115918630933161, "grad_norm": 0.5878077745437622, "learning_rate": 0.0004135610386811766, "loss": 3.457, "step": 28950 }, { "epoch": 3.121300182972769, "grad_norm": 0.5695430636405945, "learning_rate": 0.0004132442624717164, "loss": 3.4285, "step": 29000 }, { "epoch": 3.121300182972769, "eval_accuracy": 0.3709822470960071, "eval_loss": 3.4938888549804688, "eval_runtime": 184.3826, "eval_samples_per_second": 97.683, "eval_steps_per_second": 6.107, "step": 29000 }, { "epoch": 3.1266817350123777, "grad_norm": 0.5889701843261719, "learning_rate": 0.000412921021441655, "loss": 3.4346, "step": 29050 }, { "epoch": 3.132063287051986, "grad_norm": 0.5934194326400757, "learning_rate": 0.00041259778041159354, "loss": 3.4426, "step": 29100 }, { "epoch": 3.137444839091594, "grad_norm": 0.5477867126464844, "learning_rate": 0.00041227453938153213, "loss": 3.4467, "step": 29150 }, { "epoch": 3.1428263911312024, "grad_norm": 0.5779726505279541, "learning_rate": 0.00041195129835147067, "loss": 3.4411, "step": 29200 }, { "epoch": 3.1482079431708105, "grad_norm": 0.5577905178070068, "learning_rate": 0.00041162805732140927, "loss": 3.4518, "step": 29250 }, { "epoch": 3.1535894952104186, "grad_norm": 0.5991485714912415, "learning_rate": 0.0004113048162913479, "loss": 3.4528, "step": 29300 }, { "epoch": 3.1589710472500268, "grad_norm": 0.5515627264976501, "learning_rate": 0.00041098157526128646, "loss": 3.4553, "step": 29350 }, { "epoch": 3.1643525992896353, "grad_norm": 0.577649712562561, "learning_rate": 0.00041065833423122505, "loss": 3.4708, "step": 29400 }, { "epoch": 3.1697341513292434, "grad_norm": 0.609976589679718, "learning_rate": 0.00041033509320116365, "loss": 3.4677, "step": 29450 }, { "epoch": 3.1751157033688515, "grad_norm": 0.5896977186203003, "learning_rate": 0.0004100118521711022, "loss": 3.4464, "step": 29500 }, { "epoch": 3.1804972554084596, "grad_norm": 0.5586709976196289, "learning_rate": 0.0004096886111410408, "loss": 3.4503, "step": 29550 }, { "epoch": 3.185878807448068, "grad_norm": 0.6799648404121399, "learning_rate": 0.00040936537011097943, "loss": 3.4442, "step": 29600 }, { "epoch": 3.1912603594876763, "grad_norm": 0.5831512808799744, "learning_rate": 0.00040904212908091797, "loss": 3.4351, "step": 29650 }, { "epoch": 3.1966419115272844, "grad_norm": 0.6388946771621704, "learning_rate": 0.00040871888805085656, "loss": 3.4333, "step": 29700 }, { "epoch": 3.2020234635668925, "grad_norm": 0.6149418950080872, "learning_rate": 0.0004083956470207951, "loss": 3.4522, "step": 29750 }, { "epoch": 3.207405015606501, "grad_norm": 0.5800194144248962, "learning_rate": 0.0004080724059907337, "loss": 3.4605, "step": 29800 }, { "epoch": 3.212786567646109, "grad_norm": 0.665179967880249, "learning_rate": 0.00040774916496067235, "loss": 3.4525, "step": 29850 }, { "epoch": 3.2181681196857173, "grad_norm": 0.6393986344337463, "learning_rate": 0.0004074259239306109, "loss": 3.4625, "step": 29900 }, { "epoch": 3.2235496717253254, "grad_norm": 0.5729492902755737, "learning_rate": 0.0004071026829005495, "loss": 3.448, "step": 29950 }, { "epoch": 3.228931223764934, "grad_norm": 0.6174540519714355, "learning_rate": 0.0004067794418704881, "loss": 3.4394, "step": 30000 }, { "epoch": 3.228931223764934, "eval_accuracy": 0.3717323866334783, "eval_loss": 3.488069772720337, "eval_runtime": 184.6952, "eval_samples_per_second": 97.517, "eval_steps_per_second": 6.097, "step": 30000 }, { "epoch": 3.234312775804542, "grad_norm": 0.5699408650398254, "learning_rate": 0.0004064562008404266, "loss": 3.4601, "step": 30050 }, { "epoch": 3.23969432784415, "grad_norm": 0.5962291955947876, "learning_rate": 0.0004061329598103652, "loss": 3.4724, "step": 30100 }, { "epoch": 3.2450758798837587, "grad_norm": 0.6102081537246704, "learning_rate": 0.00040580971878030386, "loss": 3.4567, "step": 30150 }, { "epoch": 3.250457431923367, "grad_norm": 0.6508827805519104, "learning_rate": 0.0004054864777502424, "loss": 3.4537, "step": 30200 }, { "epoch": 3.255838983962975, "grad_norm": 0.6434182524681091, "learning_rate": 0.000405163236720181, "loss": 3.4557, "step": 30250 }, { "epoch": 3.261220536002583, "grad_norm": 0.5960790514945984, "learning_rate": 0.00040483999569011954, "loss": 3.4562, "step": 30300 }, { "epoch": 3.2666020880421915, "grad_norm": 0.5768137574195862, "learning_rate": 0.00040451675466005813, "loss": 3.4538, "step": 30350 }, { "epoch": 3.2719836400817996, "grad_norm": 0.6380401849746704, "learning_rate": 0.0004041935136299967, "loss": 3.4476, "step": 30400 }, { "epoch": 3.2773651921214078, "grad_norm": 0.6057765483856201, "learning_rate": 0.0004038702725999353, "loss": 3.4356, "step": 30450 }, { "epoch": 3.282746744161016, "grad_norm": 0.5699383020401001, "learning_rate": 0.0004035470315698739, "loss": 3.452, "step": 30500 }, { "epoch": 3.2881282962006244, "grad_norm": 0.5979689359664917, "learning_rate": 0.0004032237905398125, "loss": 3.4618, "step": 30550 }, { "epoch": 3.2935098482402325, "grad_norm": 0.6368375420570374, "learning_rate": 0.00040290054950975105, "loss": 3.4628, "step": 30600 }, { "epoch": 3.2988914002798406, "grad_norm": 0.6144343614578247, "learning_rate": 0.00040257730847968965, "loss": 3.4395, "step": 30650 }, { "epoch": 3.304272952319449, "grad_norm": 0.5661136507987976, "learning_rate": 0.0004022540674496283, "loss": 3.469, "step": 30700 }, { "epoch": 3.3096545043590573, "grad_norm": 0.6017364263534546, "learning_rate": 0.00040193082641956684, "loss": 3.4608, "step": 30750 }, { "epoch": 3.3150360563986654, "grad_norm": 0.5940915942192078, "learning_rate": 0.00040160758538950543, "loss": 3.4437, "step": 30800 }, { "epoch": 3.3204176084382735, "grad_norm": 0.6294538974761963, "learning_rate": 0.00040128434435944397, "loss": 3.4343, "step": 30850 }, { "epoch": 3.3257991604778816, "grad_norm": 0.650331437587738, "learning_rate": 0.00040096110332938257, "loss": 3.4636, "step": 30900 }, { "epoch": 3.33118071251749, "grad_norm": 0.6346154808998108, "learning_rate": 0.00040063786229932116, "loss": 3.4786, "step": 30950 }, { "epoch": 3.3365622645570983, "grad_norm": 0.5942608714103699, "learning_rate": 0.00040031462126925975, "loss": 3.4575, "step": 31000 }, { "epoch": 3.3365622645570983, "eval_accuracy": 0.3724254834048469, "eval_loss": 3.4845597743988037, "eval_runtime": 184.6678, "eval_samples_per_second": 97.532, "eval_steps_per_second": 6.097, "step": 31000 }, { "epoch": 3.3419438165967064, "grad_norm": 0.5669004321098328, "learning_rate": 0.00039999138023919835, "loss": 3.4639, "step": 31050 }, { "epoch": 3.347325368636315, "grad_norm": 0.6496381759643555, "learning_rate": 0.00039966813920913694, "loss": 3.457, "step": 31100 }, { "epoch": 3.352706920675923, "grad_norm": 0.5670893788337708, "learning_rate": 0.0003993448981790755, "loss": 3.4657, "step": 31150 }, { "epoch": 3.358088472715531, "grad_norm": 0.608622670173645, "learning_rate": 0.0003990216571490141, "loss": 3.437, "step": 31200 }, { "epoch": 3.3634700247551392, "grad_norm": 0.5956699848175049, "learning_rate": 0.0003986984161189526, "loss": 3.463, "step": 31250 }, { "epoch": 3.368851576794748, "grad_norm": 0.6540201306343079, "learning_rate": 0.00039837517508889127, "loss": 3.4603, "step": 31300 }, { "epoch": 3.374233128834356, "grad_norm": 0.5587558150291443, "learning_rate": 0.000398058398879431, "loss": 3.4485, "step": 31350 }, { "epoch": 3.379614680873964, "grad_norm": 0.5995318293571472, "learning_rate": 0.00039773515784936967, "loss": 3.4373, "step": 31400 }, { "epoch": 3.384996232913572, "grad_norm": 0.60701984167099, "learning_rate": 0.00039741191681930826, "loss": 3.4547, "step": 31450 }, { "epoch": 3.3903777849531807, "grad_norm": 0.5675340890884399, "learning_rate": 0.0003970886757892468, "loss": 3.4381, "step": 31500 }, { "epoch": 3.3957593369927888, "grad_norm": 0.5904266238212585, "learning_rate": 0.0003967654347591854, "loss": 3.4733, "step": 31550 }, { "epoch": 3.401140889032397, "grad_norm": 0.5990027189254761, "learning_rate": 0.00039644219372912394, "loss": 3.4597, "step": 31600 }, { "epoch": 3.4065224410720054, "grad_norm": 0.6386702656745911, "learning_rate": 0.0003961189526990626, "loss": 3.4413, "step": 31650 }, { "epoch": 3.4119039931116135, "grad_norm": 0.6855025887489319, "learning_rate": 0.0003957957116690012, "loss": 3.4663, "step": 31700 }, { "epoch": 3.4172855451512216, "grad_norm": 0.6087505221366882, "learning_rate": 0.0003954724706389397, "loss": 3.4613, "step": 31750 }, { "epoch": 3.4226670971908297, "grad_norm": 0.6036788821220398, "learning_rate": 0.0003951492296088783, "loss": 3.45, "step": 31800 }, { "epoch": 3.428048649230438, "grad_norm": 0.6118823289871216, "learning_rate": 0.0003948259885788169, "loss": 3.4542, "step": 31850 }, { "epoch": 3.4334302012700464, "grad_norm": 0.6175610423088074, "learning_rate": 0.00039450274754875545, "loss": 3.4705, "step": 31900 }, { "epoch": 3.4388117533096545, "grad_norm": 0.5950233340263367, "learning_rate": 0.0003941795065186941, "loss": 3.4619, "step": 31950 }, { "epoch": 3.4441933053492626, "grad_norm": 0.6266686320304871, "learning_rate": 0.0003938562654886327, "loss": 3.4513, "step": 32000 }, { "epoch": 3.4441933053492626, "eval_accuracy": 0.37286737469958836, "eval_loss": 3.4757885932922363, "eval_runtime": 184.4894, "eval_samples_per_second": 97.626, "eval_steps_per_second": 6.103, "step": 32000 }, { "epoch": 3.449574857388871, "grad_norm": 0.5964539647102356, "learning_rate": 0.00039353302445857124, "loss": 3.4683, "step": 32050 }, { "epoch": 3.4549564094284793, "grad_norm": 0.66469806432724, "learning_rate": 0.00039320978342850983, "loss": 3.4752, "step": 32100 }, { "epoch": 3.4603379614680874, "grad_norm": 0.5947219729423523, "learning_rate": 0.00039288654239844837, "loss": 3.4626, "step": 32150 }, { "epoch": 3.4657195135076955, "grad_norm": 0.6041039824485779, "learning_rate": 0.00039256330136838697, "loss": 3.4448, "step": 32200 }, { "epoch": 3.471101065547304, "grad_norm": 0.6173006296157837, "learning_rate": 0.0003922400603383256, "loss": 3.4376, "step": 32250 }, { "epoch": 3.476482617586912, "grad_norm": 0.6498045921325684, "learning_rate": 0.00039191681930826416, "loss": 3.4638, "step": 32300 }, { "epoch": 3.4818641696265202, "grad_norm": 0.6071352958679199, "learning_rate": 0.00039159357827820275, "loss": 3.4619, "step": 32350 }, { "epoch": 3.4872457216661283, "grad_norm": 0.6316366195678711, "learning_rate": 0.00039127033724814135, "loss": 3.448, "step": 32400 }, { "epoch": 3.492627273705737, "grad_norm": 0.6110300421714783, "learning_rate": 0.0003909470962180799, "loss": 3.4594, "step": 32450 }, { "epoch": 3.498008825745345, "grad_norm": 0.5853701829910278, "learning_rate": 0.00039062385518801854, "loss": 3.4595, "step": 32500 }, { "epoch": 3.503390377784953, "grad_norm": 0.7893177270889282, "learning_rate": 0.00039030061415795713, "loss": 3.4567, "step": 32550 }, { "epoch": 3.5087719298245617, "grad_norm": 0.5927507877349854, "learning_rate": 0.00038997737312789567, "loss": 3.4584, "step": 32600 }, { "epoch": 3.5141534818641698, "grad_norm": 0.5924676060676575, "learning_rate": 0.00038965413209783426, "loss": 3.4384, "step": 32650 }, { "epoch": 3.519535033903778, "grad_norm": 0.5994710922241211, "learning_rate": 0.0003893308910677728, "loss": 3.4538, "step": 32700 }, { "epoch": 3.524916585943386, "grad_norm": 0.6486222743988037, "learning_rate": 0.0003890076500377114, "loss": 3.4661, "step": 32750 }, { "epoch": 3.530298137982994, "grad_norm": 0.636140763759613, "learning_rate": 0.00038868440900765005, "loss": 3.4545, "step": 32800 }, { "epoch": 3.5356796900226026, "grad_norm": 0.6097226142883301, "learning_rate": 0.0003883611679775886, "loss": 3.4571, "step": 32850 }, { "epoch": 3.5410612420622107, "grad_norm": 0.5975202322006226, "learning_rate": 0.0003880379269475272, "loss": 3.45, "step": 32900 }, { "epoch": 3.546442794101819, "grad_norm": 0.6173543334007263, "learning_rate": 0.0003877146859174657, "loss": 3.4763, "step": 32950 }, { "epoch": 3.5518243461414274, "grad_norm": 0.5743898153305054, "learning_rate": 0.0003873914448874043, "loss": 3.466, "step": 33000 }, { "epoch": 3.5518243461414274, "eval_accuracy": 0.3738963175471915, "eval_loss": 3.4691905975341797, "eval_runtime": 184.6686, "eval_samples_per_second": 97.531, "eval_steps_per_second": 6.097, "step": 33000 }, { "epoch": 3.5572058981810355, "grad_norm": 0.5753702521324158, "learning_rate": 0.0003870682038573429, "loss": 3.447, "step": 33050 }, { "epoch": 3.5625874502206436, "grad_norm": 0.57344651222229, "learning_rate": 0.0003867449628272815, "loss": 3.4531, "step": 33100 }, { "epoch": 3.5679690022602517, "grad_norm": 0.596196711063385, "learning_rate": 0.0003864217217972201, "loss": 3.466, "step": 33150 }, { "epoch": 3.57335055429986, "grad_norm": 0.5894469618797302, "learning_rate": 0.0003860984807671587, "loss": 3.4502, "step": 33200 }, { "epoch": 3.5787321063394684, "grad_norm": 0.5605970621109009, "learning_rate": 0.00038577523973709724, "loss": 3.4534, "step": 33250 }, { "epoch": 3.5841136583790765, "grad_norm": 0.6303330659866333, "learning_rate": 0.00038545199870703583, "loss": 3.4537, "step": 33300 }, { "epoch": 3.5894952104186846, "grad_norm": 0.6184437274932861, "learning_rate": 0.0003851287576769744, "loss": 3.4538, "step": 33350 }, { "epoch": 3.594876762458293, "grad_norm": 0.5937089323997498, "learning_rate": 0.000384805516646913, "loss": 3.4745, "step": 33400 }, { "epoch": 3.6002583144979012, "grad_norm": 0.650280773639679, "learning_rate": 0.0003844822756168516, "loss": 3.4664, "step": 33450 }, { "epoch": 3.6056398665375093, "grad_norm": 0.6183555722236633, "learning_rate": 0.00038415903458679016, "loss": 3.4595, "step": 33500 }, { "epoch": 3.611021418577118, "grad_norm": 0.6294432878494263, "learning_rate": 0.00038383579355672875, "loss": 3.4687, "step": 33550 }, { "epoch": 3.616402970616726, "grad_norm": 0.5563703775405884, "learning_rate": 0.00038351255252666735, "loss": 3.4539, "step": 33600 }, { "epoch": 3.621784522656334, "grad_norm": 0.5843515992164612, "learning_rate": 0.00038318931149660594, "loss": 3.4602, "step": 33650 }, { "epoch": 3.627166074695942, "grad_norm": 0.6656185388565063, "learning_rate": 0.00038286607046654454, "loss": 3.4504, "step": 33700 }, { "epoch": 3.6325476267355503, "grad_norm": 0.5866097211837769, "learning_rate": 0.00038254282943648313, "loss": 3.4492, "step": 33750 }, { "epoch": 3.637929178775159, "grad_norm": 0.6325012445449829, "learning_rate": 0.00038221958840642167, "loss": 3.4465, "step": 33800 }, { "epoch": 3.643310730814767, "grad_norm": 0.5878468751907349, "learning_rate": 0.00038189634737636027, "loss": 3.4465, "step": 33850 }, { "epoch": 3.648692282854375, "grad_norm": 0.6447891592979431, "learning_rate": 0.0003815731063462988, "loss": 3.4554, "step": 33900 }, { "epoch": 3.6540738348939836, "grad_norm": 0.6783122420310974, "learning_rate": 0.00038124986531623745, "loss": 3.4506, "step": 33950 }, { "epoch": 3.6594553869335917, "grad_norm": 0.5637625455856323, "learning_rate": 0.00038092662428617605, "loss": 3.4664, "step": 34000 }, { "epoch": 3.6594553869335917, "eval_accuracy": 0.3743114802315306, "eval_loss": 3.4632701873779297, "eval_runtime": 184.7686, "eval_samples_per_second": 97.479, "eval_steps_per_second": 6.094, "step": 34000 }, { "epoch": 3.6648369389732, "grad_norm": 0.6158130764961243, "learning_rate": 0.0003806033832561146, "loss": 3.4482, "step": 34050 }, { "epoch": 3.670218491012808, "grad_norm": 0.6033890843391418, "learning_rate": 0.0003802801422260532, "loss": 3.4567, "step": 34100 }, { "epoch": 3.675600043052416, "grad_norm": 0.6929822564125061, "learning_rate": 0.0003799569011959918, "loss": 3.4441, "step": 34150 }, { "epoch": 3.6809815950920246, "grad_norm": 0.6433200240135193, "learning_rate": 0.0003796336601659303, "loss": 3.4583, "step": 34200 }, { "epoch": 3.6863631471316327, "grad_norm": 0.6610816717147827, "learning_rate": 0.00037931041913586897, "loss": 3.4545, "step": 34250 }, { "epoch": 3.691744699171241, "grad_norm": 0.6418637037277222, "learning_rate": 0.00037898717810580756, "loss": 3.467, "step": 34300 }, { "epoch": 3.6971262512108494, "grad_norm": 0.587744414806366, "learning_rate": 0.0003786639370757461, "loss": 3.446, "step": 34350 }, { "epoch": 3.7025078032504575, "grad_norm": 0.5943315029144287, "learning_rate": 0.0003783406960456847, "loss": 3.453, "step": 34400 }, { "epoch": 3.7078893552900656, "grad_norm": 0.6527094841003418, "learning_rate": 0.00037801745501562324, "loss": 3.4449, "step": 34450 }, { "epoch": 3.713270907329674, "grad_norm": 0.5982084274291992, "learning_rate": 0.0003776942139855619, "loss": 3.4604, "step": 34500 }, { "epoch": 3.7186524593692822, "grad_norm": 0.6434234380722046, "learning_rate": 0.0003773709729555005, "loss": 3.462, "step": 34550 }, { "epoch": 3.7240340114088903, "grad_norm": 0.6087839603424072, "learning_rate": 0.000377047731925439, "loss": 3.4495, "step": 34600 }, { "epoch": 3.7294155634484984, "grad_norm": 0.6338579058647156, "learning_rate": 0.0003767244908953776, "loss": 3.4548, "step": 34650 }, { "epoch": 3.7347971154881066, "grad_norm": 0.6043111085891724, "learning_rate": 0.0003764012498653162, "loss": 3.4783, "step": 34700 }, { "epoch": 3.740178667527715, "grad_norm": 0.6296453475952148, "learning_rate": 0.00037607800883525475, "loss": 3.4469, "step": 34750 }, { "epoch": 3.745560219567323, "grad_norm": 0.5895748734474182, "learning_rate": 0.0003757547678051934, "loss": 3.4689, "step": 34800 }, { "epoch": 3.7509417716069313, "grad_norm": 0.6216940879821777, "learning_rate": 0.000375431526775132, "loss": 3.4502, "step": 34850 }, { "epoch": 3.75632332364654, "grad_norm": 0.6479626893997192, "learning_rate": 0.00037510828574507054, "loss": 3.4286, "step": 34900 }, { "epoch": 3.761704875686148, "grad_norm": 0.626582682132721, "learning_rate": 0.00037478504471500913, "loss": 3.4464, "step": 34950 }, { "epoch": 3.767086427725756, "grad_norm": 0.6230822205543518, "learning_rate": 0.00037446180368494767, "loss": 3.4389, "step": 35000 }, { "epoch": 3.767086427725756, "eval_accuracy": 0.37504858142246406, "eval_loss": 3.4572198390960693, "eval_runtime": 184.5056, "eval_samples_per_second": 97.618, "eval_steps_per_second": 6.103, "step": 35000 }, { "epoch": 3.772467979765364, "grad_norm": 0.6130627393722534, "learning_rate": 0.00037413856265488627, "loss": 3.4359, "step": 35050 }, { "epoch": 3.7778495318049723, "grad_norm": 0.5983439683914185, "learning_rate": 0.0003738153216248249, "loss": 3.4625, "step": 35100 }, { "epoch": 3.783231083844581, "grad_norm": 0.6246692538261414, "learning_rate": 0.00037349208059476346, "loss": 3.449, "step": 35150 }, { "epoch": 3.788612635884189, "grad_norm": 0.6247369647026062, "learning_rate": 0.00037316883956470205, "loss": 3.4509, "step": 35200 }, { "epoch": 3.793994187923797, "grad_norm": 0.5881553888320923, "learning_rate": 0.00037284559853464064, "loss": 3.4625, "step": 35250 }, { "epoch": 3.7993757399634056, "grad_norm": 0.621890127658844, "learning_rate": 0.0003725223575045792, "loss": 3.4556, "step": 35300 }, { "epoch": 3.8047572920030137, "grad_norm": 0.6440849304199219, "learning_rate": 0.000372205581295119, "loss": 3.4437, "step": 35350 }, { "epoch": 3.810138844042622, "grad_norm": 0.6532849669456482, "learning_rate": 0.0003718823402650576, "loss": 3.4493, "step": 35400 }, { "epoch": 3.8155203960822304, "grad_norm": 0.704378068447113, "learning_rate": 0.00037155909923499624, "loss": 3.4462, "step": 35450 }, { "epoch": 3.8209019481218385, "grad_norm": 0.6049141883850098, "learning_rate": 0.0003712358582049348, "loss": 3.4795, "step": 35500 }, { "epoch": 3.8262835001614466, "grad_norm": 0.6370956897735596, "learning_rate": 0.00037091261717487337, "loss": 3.4381, "step": 35550 }, { "epoch": 3.8316650522010547, "grad_norm": 0.6178144216537476, "learning_rate": 0.00037058937614481197, "loss": 3.4739, "step": 35600 }, { "epoch": 3.837046604240663, "grad_norm": 0.6254431009292603, "learning_rate": 0.0003702661351147505, "loss": 3.4459, "step": 35650 }, { "epoch": 3.8424281562802713, "grad_norm": 0.6104562878608704, "learning_rate": 0.0003699428940846891, "loss": 3.4574, "step": 35700 }, { "epoch": 3.8478097083198795, "grad_norm": 0.6314630508422852, "learning_rate": 0.00036961965305462775, "loss": 3.4543, "step": 35750 }, { "epoch": 3.8531912603594876, "grad_norm": 0.6032555103302002, "learning_rate": 0.0003692964120245663, "loss": 3.4393, "step": 35800 }, { "epoch": 3.858572812399096, "grad_norm": 0.5797463655471802, "learning_rate": 0.0003689731709945049, "loss": 3.4418, "step": 35850 }, { "epoch": 3.863954364438704, "grad_norm": 0.6190691590309143, "learning_rate": 0.0003686499299644434, "loss": 3.4475, "step": 35900 }, { "epoch": 3.8693359164783123, "grad_norm": 0.6184253692626953, "learning_rate": 0.000368326688934382, "loss": 3.4371, "step": 35950 }, { "epoch": 3.8747174685179204, "grad_norm": 0.6029491424560547, "learning_rate": 0.0003680034479043206, "loss": 3.4297, "step": 36000 }, { "epoch": 3.8747174685179204, "eval_accuracy": 0.3752638227932245, "eval_loss": 3.45257568359375, "eval_runtime": 184.636, "eval_samples_per_second": 97.549, "eval_steps_per_second": 6.098, "step": 36000 }, { "epoch": 3.8800990205575285, "grad_norm": 0.6468108296394348, "learning_rate": 0.0003676802068742592, "loss": 3.4645, "step": 36050 }, { "epoch": 3.885480572597137, "grad_norm": 0.6638432741165161, "learning_rate": 0.0003673569658441978, "loss": 3.4543, "step": 36100 }, { "epoch": 3.890862124636745, "grad_norm": 0.5957279205322266, "learning_rate": 0.0003670337248141364, "loss": 3.4544, "step": 36150 }, { "epoch": 3.8962436766763533, "grad_norm": 0.6390054821968079, "learning_rate": 0.00036671048378407494, "loss": 3.4491, "step": 36200 }, { "epoch": 3.901625228715962, "grad_norm": 0.6385354399681091, "learning_rate": 0.00036638724275401353, "loss": 3.4407, "step": 36250 }, { "epoch": 3.90700678075557, "grad_norm": 0.6172071099281311, "learning_rate": 0.0003660640017239522, "loss": 3.4678, "step": 36300 }, { "epoch": 3.912388332795178, "grad_norm": 0.6054326891899109, "learning_rate": 0.0003657407606938907, "loss": 3.4324, "step": 36350 }, { "epoch": 3.9177698848347866, "grad_norm": 0.617918074131012, "learning_rate": 0.0003654175196638293, "loss": 3.4563, "step": 36400 }, { "epoch": 3.9231514368743947, "grad_norm": 0.6514351963996887, "learning_rate": 0.00036509427863376786, "loss": 3.4482, "step": 36450 }, { "epoch": 3.928532988914003, "grad_norm": 0.6361662149429321, "learning_rate": 0.00036477103760370645, "loss": 3.4561, "step": 36500 }, { "epoch": 3.933914540953611, "grad_norm": 0.5907837152481079, "learning_rate": 0.00036444779657364505, "loss": 3.451, "step": 36550 }, { "epoch": 3.939296092993219, "grad_norm": 0.6405068635940552, "learning_rate": 0.00036412455554358364, "loss": 3.455, "step": 36600 }, { "epoch": 3.9446776450328276, "grad_norm": 0.5964227914810181, "learning_rate": 0.00036380131451352224, "loss": 3.4496, "step": 36650 }, { "epoch": 3.9500591970724357, "grad_norm": 0.5983424782752991, "learning_rate": 0.00036347807348346083, "loss": 3.4556, "step": 36700 }, { "epoch": 3.955440749112044, "grad_norm": 0.6061781048774719, "learning_rate": 0.00036315483245339937, "loss": 3.4668, "step": 36750 }, { "epoch": 3.9608223011516523, "grad_norm": 0.619894802570343, "learning_rate": 0.00036283159142333797, "loss": 3.4392, "step": 36800 }, { "epoch": 3.9662038531912605, "grad_norm": 0.6279895901679993, "learning_rate": 0.0003625083503932765, "loss": 3.4514, "step": 36850 }, { "epoch": 3.9715854052308686, "grad_norm": 0.5655210018157959, "learning_rate": 0.00036218510936321516, "loss": 3.4708, "step": 36900 }, { "epoch": 3.9769669572704767, "grad_norm": 0.5848953127861023, "learning_rate": 0.00036186186833315375, "loss": 3.465, "step": 36950 }, { "epoch": 3.9823485093100848, "grad_norm": 0.6551256775856018, "learning_rate": 0.0003615386273030923, "loss": 3.445, "step": 37000 }, { "epoch": 3.9823485093100848, "eval_accuracy": 0.37610197116982547, "eval_loss": 3.4445886611938477, "eval_runtime": 187.3097, "eval_samples_per_second": 96.156, "eval_steps_per_second": 6.011, "step": 37000 }, { "epoch": 3.9877300613496933, "grad_norm": 0.6091813445091248, "learning_rate": 0.0003612153862730309, "loss": 3.451, "step": 37050 }, { "epoch": 3.9931116133893014, "grad_norm": 0.6060930490493774, "learning_rate": 0.0003608921452429695, "loss": 3.4552, "step": 37100 }, { "epoch": 3.9984931654289095, "grad_norm": 0.6171631813049316, "learning_rate": 0.000360568904212908, "loss": 3.4441, "step": 37150 }, { "epoch": 4.003874717468518, "grad_norm": 0.6074433326721191, "learning_rate": 0.00036024566318284667, "loss": 3.3806, "step": 37200 }, { "epoch": 4.009256269508126, "grad_norm": 0.5943333506584167, "learning_rate": 0.00035992242215278526, "loss": 3.3569, "step": 37250 }, { "epoch": 4.014637821547734, "grad_norm": 0.637510359287262, "learning_rate": 0.0003595991811227238, "loss": 3.3698, "step": 37300 }, { "epoch": 4.020019373587343, "grad_norm": 0.6431918740272522, "learning_rate": 0.0003592759400926624, "loss": 3.3724, "step": 37350 }, { "epoch": 4.0254009256269505, "grad_norm": 0.7091398239135742, "learning_rate": 0.00035895269906260094, "loss": 3.3644, "step": 37400 }, { "epoch": 4.030782477666559, "grad_norm": 0.6210770010948181, "learning_rate": 0.0003586359228531408, "loss": 3.3404, "step": 37450 }, { "epoch": 4.036164029706168, "grad_norm": 0.6259356141090393, "learning_rate": 0.00035831268182307934, "loss": 3.3691, "step": 37500 }, { "epoch": 4.041545581745775, "grad_norm": 0.6482320427894592, "learning_rate": 0.000357989440793018, "loss": 3.3458, "step": 37550 }, { "epoch": 4.046927133785384, "grad_norm": 0.6331608891487122, "learning_rate": 0.0003576661997629566, "loss": 3.3768, "step": 37600 }, { "epoch": 4.0523086858249915, "grad_norm": 0.6756312847137451, "learning_rate": 0.0003573429587328951, "loss": 3.3593, "step": 37650 }, { "epoch": 4.0576902378646, "grad_norm": 0.5940220355987549, "learning_rate": 0.0003570197177028337, "loss": 3.3723, "step": 37700 }, { "epoch": 4.063071789904209, "grad_norm": 0.604108989238739, "learning_rate": 0.00035669647667277226, "loss": 3.3676, "step": 37750 }, { "epoch": 4.068453341943816, "grad_norm": 0.6163511872291565, "learning_rate": 0.00035637323564271085, "loss": 3.3558, "step": 37800 }, { "epoch": 4.073834893983425, "grad_norm": 0.6365783214569092, "learning_rate": 0.0003560499946126495, "loss": 3.3668, "step": 37850 }, { "epoch": 4.079216446023033, "grad_norm": 0.6292728185653687, "learning_rate": 0.00035572675358258804, "loss": 3.3648, "step": 37900 }, { "epoch": 4.084597998062641, "grad_norm": 0.6514345407485962, "learning_rate": 0.00035540351255252664, "loss": 3.3643, "step": 37950 }, { "epoch": 4.08997955010225, "grad_norm": 0.6355460286140442, "learning_rate": 0.00035508027152246523, "loss": 3.354, "step": 38000 }, { "epoch": 4.08997955010225, "eval_accuracy": 0.3767296589027297, "eval_loss": 3.4461004734039307, "eval_runtime": 186.8586, "eval_samples_per_second": 96.388, "eval_steps_per_second": 6.026, "step": 38000 }, { "epoch": 4.095361102141858, "grad_norm": 0.6700498461723328, "learning_rate": 0.0003547570304924038, "loss": 3.3655, "step": 38050 }, { "epoch": 4.100742654181466, "grad_norm": 0.663998544216156, "learning_rate": 0.0003544337894623424, "loss": 3.3594, "step": 38100 }, { "epoch": 4.106124206221074, "grad_norm": 0.6124699711799622, "learning_rate": 0.000354110548432281, "loss": 3.3701, "step": 38150 }, { "epoch": 4.111505758260682, "grad_norm": 0.6116810441017151, "learning_rate": 0.00035378730740221956, "loss": 3.3662, "step": 38200 }, { "epoch": 4.1168873103002905, "grad_norm": 0.6466190218925476, "learning_rate": 0.00035346406637215815, "loss": 3.3641, "step": 38250 }, { "epoch": 4.122268862339899, "grad_norm": 0.5948997139930725, "learning_rate": 0.0003531408253420967, "loss": 3.3756, "step": 38300 }, { "epoch": 4.127650414379507, "grad_norm": 0.6248114705085754, "learning_rate": 0.0003528175843120353, "loss": 3.3627, "step": 38350 }, { "epoch": 4.133031966419115, "grad_norm": 0.6819987297058105, "learning_rate": 0.00035249434328197394, "loss": 3.3872, "step": 38400 }, { "epoch": 4.138413518458724, "grad_norm": 0.6541435122489929, "learning_rate": 0.0003521711022519125, "loss": 3.3846, "step": 38450 }, { "epoch": 4.1437950704983315, "grad_norm": 0.6173901557922363, "learning_rate": 0.00035184786122185107, "loss": 3.3829, "step": 38500 }, { "epoch": 4.14917662253794, "grad_norm": 0.6310703158378601, "learning_rate": 0.00035152462019178967, "loss": 3.3788, "step": 38550 }, { "epoch": 4.154558174577549, "grad_norm": 0.6219739317893982, "learning_rate": 0.0003512013791617282, "loss": 3.3957, "step": 38600 }, { "epoch": 4.159939726617156, "grad_norm": 0.6413523554801941, "learning_rate": 0.0003508781381316668, "loss": 3.3756, "step": 38650 }, { "epoch": 4.165321278656765, "grad_norm": 0.6500468254089355, "learning_rate": 0.00035055489710160545, "loss": 3.3924, "step": 38700 }, { "epoch": 4.1707028306963725, "grad_norm": 0.6107872724533081, "learning_rate": 0.000350231656071544, "loss": 3.3748, "step": 38750 }, { "epoch": 4.176084382735981, "grad_norm": 0.6339595317840576, "learning_rate": 0.0003499084150414826, "loss": 3.3773, "step": 38800 }, { "epoch": 4.18146593477559, "grad_norm": 0.6154005527496338, "learning_rate": 0.0003495851740114211, "loss": 3.3888, "step": 38850 }, { "epoch": 4.186847486815197, "grad_norm": 0.5948972702026367, "learning_rate": 0.0003492619329813597, "loss": 3.3617, "step": 38900 }, { "epoch": 4.192229038854806, "grad_norm": 0.6268037557601929, "learning_rate": 0.0003489386919512983, "loss": 3.3808, "step": 38950 }, { "epoch": 4.197610590894414, "grad_norm": 0.6886900067329407, "learning_rate": 0.0003486154509212369, "loss": 3.3701, "step": 39000 }, { "epoch": 4.197610590894414, "eval_accuracy": 0.3767333531009154, "eval_loss": 3.4467148780822754, "eval_runtime": 186.912, "eval_samples_per_second": 96.361, "eval_steps_per_second": 6.024, "step": 39000 }, { "epoch": 4.202992142934022, "grad_norm": 0.6147679686546326, "learning_rate": 0.0003482922098911755, "loss": 3.3867, "step": 39050 }, { "epoch": 4.208373694973631, "grad_norm": 0.6290424466133118, "learning_rate": 0.0003479689688611141, "loss": 3.375, "step": 39100 }, { "epoch": 4.213755247013238, "grad_norm": 0.6457464098930359, "learning_rate": 0.00034764572783105264, "loss": 3.3837, "step": 39150 }, { "epoch": 4.219136799052847, "grad_norm": 0.646575391292572, "learning_rate": 0.00034732248680099123, "loss": 3.3858, "step": 39200 }, { "epoch": 4.224518351092455, "grad_norm": 0.6269925236701965, "learning_rate": 0.0003469992457709299, "loss": 3.3909, "step": 39250 }, { "epoch": 4.229899903132063, "grad_norm": 0.6284508109092712, "learning_rate": 0.0003466760047408684, "loss": 3.3784, "step": 39300 }, { "epoch": 4.2352814551716715, "grad_norm": 0.6222100257873535, "learning_rate": 0.000346352763710807, "loss": 3.3784, "step": 39350 }, { "epoch": 4.24066300721128, "grad_norm": 0.708451509475708, "learning_rate": 0.00034602952268074556, "loss": 3.3835, "step": 39400 }, { "epoch": 4.246044559250888, "grad_norm": 0.6076235175132751, "learning_rate": 0.00034570628165068415, "loss": 3.3924, "step": 39450 }, { "epoch": 4.251426111290496, "grad_norm": 0.6589393615722656, "learning_rate": 0.0003453830406206227, "loss": 3.3961, "step": 39500 }, { "epoch": 4.256807663330104, "grad_norm": 0.6510894298553467, "learning_rate": 0.00034505979959056134, "loss": 3.3551, "step": 39550 }, { "epoch": 4.2621892153697125, "grad_norm": 0.8012067675590515, "learning_rate": 0.00034473655856049994, "loss": 3.3709, "step": 39600 }, { "epoch": 4.267570767409321, "grad_norm": 0.6146559119224548, "learning_rate": 0.0003444133175304385, "loss": 3.4107, "step": 39650 }, { "epoch": 4.272952319448929, "grad_norm": 0.6020640134811401, "learning_rate": 0.00034409007650037707, "loss": 3.3753, "step": 39700 }, { "epoch": 4.278333871488537, "grad_norm": 0.6155546307563782, "learning_rate": 0.00034376683547031567, "loss": 3.3717, "step": 39750 }, { "epoch": 4.283715423528146, "grad_norm": 0.6239294409751892, "learning_rate": 0.0003434435944402542, "loss": 3.3639, "step": 39800 }, { "epoch": 4.2890969755677535, "grad_norm": 0.6434022188186646, "learning_rate": 0.00034312035341019286, "loss": 3.3988, "step": 39850 }, { "epoch": 4.294478527607362, "grad_norm": 0.6435459852218628, "learning_rate": 0.00034279711238013145, "loss": 3.3989, "step": 39900 }, { "epoch": 4.299860079646971, "grad_norm": 0.6500222086906433, "learning_rate": 0.00034248033617067126, "loss": 3.397, "step": 39950 }, { "epoch": 4.305241631686578, "grad_norm": 0.622305691242218, "learning_rate": 0.00034215709514060985, "loss": 3.3786, "step": 40000 }, { "epoch": 4.305241631686578, "eval_accuracy": 0.377357346635634, "eval_loss": 3.438131093978882, "eval_runtime": 186.8572, "eval_samples_per_second": 96.389, "eval_steps_per_second": 6.026, "step": 40000 }, { "epoch": 4.310623183726187, "grad_norm": 0.6466324329376221, "learning_rate": 0.0003418338541105484, "loss": 3.3885, "step": 40050 }, { "epoch": 4.3160047357657945, "grad_norm": 0.632236897945404, "learning_rate": 0.000341510613080487, "loss": 3.3859, "step": 40100 }, { "epoch": 4.321386287805403, "grad_norm": 0.6783816814422607, "learning_rate": 0.0003411873720504255, "loss": 3.37, "step": 40150 }, { "epoch": 4.326767839845012, "grad_norm": 0.632381796836853, "learning_rate": 0.0003408641310203642, "loss": 3.3859, "step": 40200 }, { "epoch": 4.332149391884619, "grad_norm": 0.6269997358322144, "learning_rate": 0.00034054088999030277, "loss": 3.3862, "step": 40250 }, { "epoch": 4.337530943924228, "grad_norm": 0.6661253571510315, "learning_rate": 0.0003402176489602413, "loss": 3.3827, "step": 40300 }, { "epoch": 4.342912495963836, "grad_norm": 0.6136159896850586, "learning_rate": 0.0003398944079301799, "loss": 3.3986, "step": 40350 }, { "epoch": 4.348294048003444, "grad_norm": 0.6653974056243896, "learning_rate": 0.0003395711669001185, "loss": 3.3938, "step": 40400 }, { "epoch": 4.3536756000430525, "grad_norm": 0.6155456900596619, "learning_rate": 0.00033924792587005704, "loss": 3.393, "step": 40450 }, { "epoch": 4.359057152082661, "grad_norm": 0.690609872341156, "learning_rate": 0.0003389246848399957, "loss": 3.3828, "step": 40500 }, { "epoch": 4.364438704122269, "grad_norm": 0.6541188955307007, "learning_rate": 0.0003386014438099343, "loss": 3.3799, "step": 40550 }, { "epoch": 4.369820256161877, "grad_norm": 0.6513378620147705, "learning_rate": 0.0003382782027798728, "loss": 3.391, "step": 40600 }, { "epoch": 4.375201808201485, "grad_norm": 0.6599308252334595, "learning_rate": 0.0003379549617498114, "loss": 3.3743, "step": 40650 }, { "epoch": 4.3805833602410935, "grad_norm": 0.6568775773048401, "learning_rate": 0.00033763172071974996, "loss": 3.3868, "step": 40700 }, { "epoch": 4.385964912280702, "grad_norm": 0.6643113493919373, "learning_rate": 0.00033730847968968855, "loss": 3.3817, "step": 40750 }, { "epoch": 4.39134646432031, "grad_norm": 0.6479328870773315, "learning_rate": 0.0003369852386596272, "loss": 3.3893, "step": 40800 }, { "epoch": 4.396728016359918, "grad_norm": 0.6810503602027893, "learning_rate": 0.00033666199762956574, "loss": 3.3667, "step": 40850 }, { "epoch": 4.402109568399527, "grad_norm": 0.652211606502533, "learning_rate": 0.00033633875659950434, "loss": 3.3764, "step": 40900 }, { "epoch": 4.4074911204391345, "grad_norm": 0.6235228180885315, "learning_rate": 0.0003360155155694429, "loss": 3.3965, "step": 40950 }, { "epoch": 4.412872672478743, "grad_norm": 0.6516085863113403, "learning_rate": 0.0003356922745393815, "loss": 3.3923, "step": 41000 }, { "epoch": 4.412872672478743, "eval_accuracy": 0.3779175609252054, "eval_loss": 3.4350180625915527, "eval_runtime": 185.3058, "eval_samples_per_second": 97.196, "eval_steps_per_second": 6.076, "step": 41000 }, { "epoch": 4.418254224518351, "grad_norm": 0.653671383857727, "learning_rate": 0.0003353690335093201, "loss": 3.3929, "step": 41050 }, { "epoch": 4.423635776557959, "grad_norm": 0.6062602996826172, "learning_rate": 0.0003350457924792587, "loss": 3.3809, "step": 41100 }, { "epoch": 4.429017328597568, "grad_norm": 0.6373082995414734, "learning_rate": 0.00033472255144919726, "loss": 3.3912, "step": 41150 }, { "epoch": 4.4343988806371755, "grad_norm": 0.6125858426094055, "learning_rate": 0.00033439931041913585, "loss": 3.3907, "step": 41200 }, { "epoch": 4.439780432676784, "grad_norm": 0.6293478608131409, "learning_rate": 0.0003340760693890744, "loss": 3.3564, "step": 41250 }, { "epoch": 4.445161984716393, "grad_norm": 0.6728382706642151, "learning_rate": 0.000333752828359013, "loss": 3.4139, "step": 41300 }, { "epoch": 4.450543536756, "grad_norm": 0.6386088728904724, "learning_rate": 0.00033342958732895164, "loss": 3.4263, "step": 41350 }, { "epoch": 4.455925088795609, "grad_norm": 0.6368845105171204, "learning_rate": 0.0003331063462988902, "loss": 3.3761, "step": 41400 }, { "epoch": 4.461306640835216, "grad_norm": 0.6361759305000305, "learning_rate": 0.00033278310526882877, "loss": 3.3773, "step": 41450 }, { "epoch": 4.466688192874825, "grad_norm": 0.6400294899940491, "learning_rate": 0.0003324598642387673, "loss": 3.3874, "step": 41500 }, { "epoch": 4.4720697449144335, "grad_norm": 0.6286135911941528, "learning_rate": 0.0003321366232087059, "loss": 3.3928, "step": 41550 }, { "epoch": 4.477451296954041, "grad_norm": 0.6225272417068481, "learning_rate": 0.0003318133821786445, "loss": 3.3863, "step": 41600 }, { "epoch": 4.48283284899365, "grad_norm": 0.6586177349090576, "learning_rate": 0.0003314901411485831, "loss": 3.3986, "step": 41650 }, { "epoch": 4.488214401033258, "grad_norm": 0.6606486439704895, "learning_rate": 0.0003311669001185217, "loss": 3.3888, "step": 41700 }, { "epoch": 4.493595953072866, "grad_norm": 0.657923698425293, "learning_rate": 0.0003308436590884603, "loss": 3.3853, "step": 41750 }, { "epoch": 4.4989775051124745, "grad_norm": 0.6433077454566956, "learning_rate": 0.0003305204180583988, "loss": 3.388, "step": 41800 }, { "epoch": 4.504359057152083, "grad_norm": 0.6714504361152649, "learning_rate": 0.0003301971770283374, "loss": 3.3915, "step": 41850 }, { "epoch": 4.509740609191691, "grad_norm": 0.703129231929779, "learning_rate": 0.00032987393599827607, "loss": 3.3991, "step": 41900 }, { "epoch": 4.515122161231299, "grad_norm": 0.6598409414291382, "learning_rate": 0.0003295506949682146, "loss": 3.4052, "step": 41950 }, { "epoch": 4.520503713270907, "grad_norm": 0.6891419291496277, "learning_rate": 0.0003292274539381532, "loss": 3.389, "step": 42000 }, { "epoch": 4.520503713270907, "eval_accuracy": 0.37835901760839563, "eval_loss": 3.4275496006011963, "eval_runtime": 184.4745, "eval_samples_per_second": 97.634, "eval_steps_per_second": 6.104, "step": 42000 }, { "epoch": 4.5258852653105155, "grad_norm": 0.6308417916297913, "learning_rate": 0.00032890421290809174, "loss": 3.3936, "step": 42050 }, { "epoch": 4.531266817350124, "grad_norm": 0.640397310256958, "learning_rate": 0.00032858097187803034, "loss": 3.4059, "step": 42100 }, { "epoch": 4.536648369389732, "grad_norm": 0.6476185917854309, "learning_rate": 0.00032825773084796893, "loss": 3.3851, "step": 42150 }, { "epoch": 4.54202992142934, "grad_norm": 0.6753180027008057, "learning_rate": 0.00032793448981790753, "loss": 3.4025, "step": 42200 }, { "epoch": 4.547411473468949, "grad_norm": 0.664780855178833, "learning_rate": 0.0003276112487878461, "loss": 3.3951, "step": 42250 }, { "epoch": 4.5527930255085565, "grad_norm": 0.6564720869064331, "learning_rate": 0.0003272880077577847, "loss": 3.3786, "step": 42300 }, { "epoch": 4.558174577548165, "grad_norm": 0.6598086357116699, "learning_rate": 0.00032696476672772326, "loss": 3.402, "step": 42350 }, { "epoch": 4.563556129587774, "grad_norm": 0.6517024040222168, "learning_rate": 0.00032664152569766185, "loss": 3.3968, "step": 42400 }, { "epoch": 4.568937681627381, "grad_norm": 0.6901940703392029, "learning_rate": 0.0003263182846676004, "loss": 3.3889, "step": 42450 }, { "epoch": 4.57431923366699, "grad_norm": 0.6727946400642395, "learning_rate": 0.00032599504363753904, "loss": 3.4006, "step": 42500 }, { "epoch": 4.579700785706597, "grad_norm": 0.6285117864608765, "learning_rate": 0.00032567180260747764, "loss": 3.3855, "step": 42550 }, { "epoch": 4.585082337746206, "grad_norm": 0.6336430907249451, "learning_rate": 0.0003253485615774162, "loss": 3.399, "step": 42600 }, { "epoch": 4.5904638897858145, "grad_norm": 0.6821553111076355, "learning_rate": 0.00032502532054735477, "loss": 3.3826, "step": 42650 }, { "epoch": 4.595845441825422, "grad_norm": 0.6781034469604492, "learning_rate": 0.00032470207951729337, "loss": 3.3837, "step": 42700 }, { "epoch": 4.601226993865031, "grad_norm": 0.6807826161384583, "learning_rate": 0.0003243788384872319, "loss": 3.3979, "step": 42750 }, { "epoch": 4.606608545904638, "grad_norm": 0.6595771908760071, "learning_rate": 0.0003240620622777717, "loss": 3.3888, "step": 42800 }, { "epoch": 4.611990097944247, "grad_norm": 0.6954243779182434, "learning_rate": 0.00032373882124771036, "loss": 3.3703, "step": 42850 }, { "epoch": 4.6173716499838555, "grad_norm": 0.6635729670524597, "learning_rate": 0.00032341558021764896, "loss": 3.3973, "step": 42900 }, { "epoch": 4.622753202023463, "grad_norm": 0.7169069051742554, "learning_rate": 0.0003230923391875875, "loss": 3.4055, "step": 42950 }, { "epoch": 4.628134754063072, "grad_norm": 0.643351137638092, "learning_rate": 0.0003227690981575261, "loss": 3.3754, "step": 43000 }, { "epoch": 4.628134754063072, "eval_accuracy": 0.3789961581425398, "eval_loss": 3.423278331756592, "eval_runtime": 184.6106, "eval_samples_per_second": 97.562, "eval_steps_per_second": 6.099, "step": 43000 }, { "epoch": 4.63351630610268, "grad_norm": 0.6247761845588684, "learning_rate": 0.0003224458571274647, "loss": 3.3895, "step": 43050 }, { "epoch": 4.638897858142288, "grad_norm": 0.6495500802993774, "learning_rate": 0.00032212261609740323, "loss": 3.3633, "step": 43100 }, { "epoch": 4.6442794101818965, "grad_norm": 0.6213272213935852, "learning_rate": 0.0003217993750673419, "loss": 3.3898, "step": 43150 }, { "epoch": 4.649660962221505, "grad_norm": 0.6968953609466553, "learning_rate": 0.00032147613403728047, "loss": 3.4009, "step": 43200 }, { "epoch": 4.655042514261113, "grad_norm": 0.6720950603485107, "learning_rate": 0.000321152893007219, "loss": 3.3992, "step": 43250 }, { "epoch": 4.660424066300721, "grad_norm": 0.648715078830719, "learning_rate": 0.0003208296519771576, "loss": 3.3778, "step": 43300 }, { "epoch": 4.665805618340329, "grad_norm": 0.6420325636863708, "learning_rate": 0.00032050641094709615, "loss": 3.397, "step": 43350 }, { "epoch": 4.6711871703799375, "grad_norm": 0.6525468230247498, "learning_rate": 0.00032018316991703474, "loss": 3.3925, "step": 43400 }, { "epoch": 4.676568722419546, "grad_norm": 0.6904116272926331, "learning_rate": 0.0003198599288869734, "loss": 3.3921, "step": 43450 }, { "epoch": 4.681950274459154, "grad_norm": 0.6783482432365417, "learning_rate": 0.00031953668785691193, "loss": 3.4008, "step": 43500 }, { "epoch": 4.687331826498762, "grad_norm": 0.6485925316810608, "learning_rate": 0.0003192134468268505, "loss": 3.3889, "step": 43550 }, { "epoch": 4.692713378538371, "grad_norm": 0.6691424250602722, "learning_rate": 0.0003188902057967891, "loss": 3.3749, "step": 43600 }, { "epoch": 4.6980949305779784, "grad_norm": 0.6782438158988953, "learning_rate": 0.00031856696476672766, "loss": 3.3979, "step": 43650 }, { "epoch": 4.703476482617587, "grad_norm": 0.7157396674156189, "learning_rate": 0.0003182437237366663, "loss": 3.3879, "step": 43700 }, { "epoch": 4.7088580346571955, "grad_norm": 0.6502918004989624, "learning_rate": 0.0003179204827066049, "loss": 3.3739, "step": 43750 }, { "epoch": 4.714239586696803, "grad_norm": 0.6235714554786682, "learning_rate": 0.00031759724167654344, "loss": 3.3888, "step": 43800 }, { "epoch": 4.719621138736412, "grad_norm": 0.6773161292076111, "learning_rate": 0.00031727400064648204, "loss": 3.3858, "step": 43850 }, { "epoch": 4.725002690776019, "grad_norm": 0.6831490397453308, "learning_rate": 0.0003169507596164206, "loss": 3.3861, "step": 43900 }, { "epoch": 4.730384242815628, "grad_norm": 0.644305944442749, "learning_rate": 0.0003166275185863592, "loss": 3.3901, "step": 43950 }, { "epoch": 4.7357657948552365, "grad_norm": 0.7171474695205688, "learning_rate": 0.0003163042775562978, "loss": 3.3811, "step": 44000 }, { "epoch": 4.7357657948552365, "eval_accuracy": 0.3795620223822776, "eval_loss": 3.4181690216064453, "eval_runtime": 184.9367, "eval_samples_per_second": 97.39, "eval_steps_per_second": 6.089, "step": 44000 }, { "epoch": 4.741147346894844, "grad_norm": 0.7104071974754333, "learning_rate": 0.00031598103652623636, "loss": 3.3849, "step": 44050 }, { "epoch": 4.746528898934453, "grad_norm": 0.6331132650375366, "learning_rate": 0.00031565779549617496, "loss": 3.382, "step": 44100 }, { "epoch": 4.751910450974061, "grad_norm": 0.669050395488739, "learning_rate": 0.00031533455446611355, "loss": 3.3709, "step": 44150 }, { "epoch": 4.757292003013669, "grad_norm": 0.6390455365180969, "learning_rate": 0.0003150113134360521, "loss": 3.403, "step": 44200 }, { "epoch": 4.7626735550532775, "grad_norm": 0.6818135380744934, "learning_rate": 0.0003146880724059907, "loss": 3.3829, "step": 44250 }, { "epoch": 4.768055107092886, "grad_norm": 0.6637402772903442, "learning_rate": 0.00031436483137592934, "loss": 3.4032, "step": 44300 }, { "epoch": 4.773436659132494, "grad_norm": 0.674086332321167, "learning_rate": 0.0003140415903458679, "loss": 3.3966, "step": 44350 }, { "epoch": 4.778818211172102, "grad_norm": 0.6924407482147217, "learning_rate": 0.00031371834931580647, "loss": 3.3863, "step": 44400 }, { "epoch": 4.78419976321171, "grad_norm": 0.6802992224693298, "learning_rate": 0.000313395108285745, "loss": 3.4059, "step": 44450 }, { "epoch": 4.7895813152513185, "grad_norm": 0.6734175682067871, "learning_rate": 0.0003130718672556836, "loss": 3.3743, "step": 44500 }, { "epoch": 4.794962867290927, "grad_norm": 0.6906740069389343, "learning_rate": 0.0003127486262256222, "loss": 3.3817, "step": 44550 }, { "epoch": 4.800344419330535, "grad_norm": 0.6398283839225769, "learning_rate": 0.0003124253851955608, "loss": 3.3937, "step": 44600 }, { "epoch": 4.805725971370143, "grad_norm": 0.6402740478515625, "learning_rate": 0.0003121021441654994, "loss": 3.4015, "step": 44650 }, { "epoch": 4.811107523409751, "grad_norm": 0.6882855296134949, "learning_rate": 0.000311778903135438, "loss": 3.3789, "step": 44700 }, { "epoch": 4.8164890754493594, "grad_norm": 0.6392636895179749, "learning_rate": 0.0003114556621053765, "loss": 3.3849, "step": 44750 }, { "epoch": 4.821870627488968, "grad_norm": 0.6478724479675293, "learning_rate": 0.0003111324210753151, "loss": 3.4083, "step": 44800 }, { "epoch": 4.827252179528576, "grad_norm": 0.6727092862129211, "learning_rate": 0.00031080918004525377, "loss": 3.4042, "step": 44850 }, { "epoch": 4.832633731568184, "grad_norm": 0.681175947189331, "learning_rate": 0.0003104859390151923, "loss": 3.3854, "step": 44900 }, { "epoch": 4.838015283607793, "grad_norm": 0.6760862469673157, "learning_rate": 0.0003101626979851309, "loss": 3.3831, "step": 44950 }, { "epoch": 4.8433968356474, "grad_norm": 0.6785559058189392, "learning_rate": 0.00030983945695506945, "loss": 3.3894, "step": 45000 }, { "epoch": 4.8433968356474, "eval_accuracy": 0.38030520813492863, "eval_loss": 3.4138073921203613, "eval_runtime": 184.496, "eval_samples_per_second": 97.623, "eval_steps_per_second": 6.103, "step": 45000 }, { "epoch": 4.848778387687009, "grad_norm": 0.6968111991882324, "learning_rate": 0.00030951621592500804, "loss": 3.3986, "step": 45050 }, { "epoch": 4.8541599397266175, "grad_norm": 0.6569320559501648, "learning_rate": 0.00030919297489494663, "loss": 3.3936, "step": 45100 }, { "epoch": 4.859541491766225, "grad_norm": 0.6438539624214172, "learning_rate": 0.00030886973386488523, "loss": 3.3937, "step": 45150 }, { "epoch": 4.864923043805834, "grad_norm": 0.6866356730461121, "learning_rate": 0.0003085464928348238, "loss": 3.3841, "step": 45200 }, { "epoch": 4.870304595845441, "grad_norm": 0.6964440941810608, "learning_rate": 0.0003082232518047624, "loss": 3.404, "step": 45250 }, { "epoch": 4.87568614788505, "grad_norm": 0.6882241368293762, "learning_rate": 0.00030790001077470096, "loss": 3.3983, "step": 45300 }, { "epoch": 4.8810676999246585, "grad_norm": 0.6609480977058411, "learning_rate": 0.00030757676974463955, "loss": 3.4132, "step": 45350 }, { "epoch": 4.886449251964266, "grad_norm": 0.664958655834198, "learning_rate": 0.0003072535287145781, "loss": 3.3773, "step": 45400 }, { "epoch": 4.891830804003875, "grad_norm": 0.715764582157135, "learning_rate": 0.00030693028768451674, "loss": 3.3921, "step": 45450 }, { "epoch": 4.897212356043483, "grad_norm": 0.6146078109741211, "learning_rate": 0.00030660704665445534, "loss": 3.3802, "step": 45500 }, { "epoch": 4.902593908083091, "grad_norm": 0.6503034830093384, "learning_rate": 0.0003062838056243939, "loss": 3.3987, "step": 45550 }, { "epoch": 4.9079754601226995, "grad_norm": 0.6508064270019531, "learning_rate": 0.00030596056459433247, "loss": 3.3721, "step": 45600 }, { "epoch": 4.913357012162308, "grad_norm": 0.6286031603813171, "learning_rate": 0.00030563732356427107, "loss": 3.3893, "step": 45650 }, { "epoch": 4.918738564201916, "grad_norm": 0.6623638868331909, "learning_rate": 0.00030531408253420966, "loss": 3.379, "step": 45700 }, { "epoch": 4.924120116241524, "grad_norm": 0.668628454208374, "learning_rate": 0.00030499084150414826, "loss": 3.3861, "step": 45750 }, { "epoch": 4.929501668281132, "grad_norm": 0.6662799119949341, "learning_rate": 0.00030466760047408685, "loss": 3.3934, "step": 45800 }, { "epoch": 4.9348832203207404, "grad_norm": 0.6624265313148499, "learning_rate": 0.0003043443594440254, "loss": 3.3864, "step": 45850 }, { "epoch": 4.940264772360349, "grad_norm": 0.6628495454788208, "learning_rate": 0.0003040275832345652, "loss": 3.404, "step": 45900 }, { "epoch": 4.945646324399957, "grad_norm": 0.6551867127418518, "learning_rate": 0.0003037043422045038, "loss": 3.3789, "step": 45950 }, { "epoch": 4.951027876439565, "grad_norm": 0.7384966015815735, "learning_rate": 0.0003033811011744424, "loss": 3.3842, "step": 46000 }, { "epoch": 4.951027876439565, "eval_accuracy": 0.3805596731981902, "eval_loss": 3.4086427688598633, "eval_runtime": 184.6207, "eval_samples_per_second": 97.557, "eval_steps_per_second": 6.099, "step": 46000 }, { "epoch": 4.956409428479174, "grad_norm": 0.680334746837616, "learning_rate": 0.00030305786014438093, "loss": 3.3907, "step": 46050 }, { "epoch": 4.961790980518781, "grad_norm": 0.6668956875801086, "learning_rate": 0.0003027346191143196, "loss": 3.3966, "step": 46100 }, { "epoch": 4.96717253255839, "grad_norm": 0.6901334524154663, "learning_rate": 0.00030241137808425817, "loss": 3.3896, "step": 46150 }, { "epoch": 4.9725540845979985, "grad_norm": 0.6751418709754944, "learning_rate": 0.0003020881370541967, "loss": 3.3745, "step": 46200 }, { "epoch": 4.977935636637606, "grad_norm": 0.7121784090995789, "learning_rate": 0.0003017648960241353, "loss": 3.385, "step": 46250 }, { "epoch": 4.983317188677215, "grad_norm": 0.6521340012550354, "learning_rate": 0.00030144165499407385, "loss": 3.4098, "step": 46300 }, { "epoch": 4.988698740716822, "grad_norm": 0.8021095395088196, "learning_rate": 0.00030111841396401244, "loss": 3.4035, "step": 46350 }, { "epoch": 4.994080292756431, "grad_norm": 0.6512081027030945, "learning_rate": 0.0003007951729339511, "loss": 3.3843, "step": 46400 }, { "epoch": 4.9994618447960395, "grad_norm": 0.7062845230102539, "learning_rate": 0.00030047193190388963, "loss": 3.396, "step": 46450 }, { "epoch": 5.004843396835647, "grad_norm": 0.6628386974334717, "learning_rate": 0.0003001486908738282, "loss": 3.3034, "step": 46500 }, { "epoch": 5.010224948875256, "grad_norm": 0.7442699670791626, "learning_rate": 0.0002998254498437668, "loss": 3.2796, "step": 46550 }, { "epoch": 5.015606500914864, "grad_norm": 0.7247018218040466, "learning_rate": 0.0002995022088137054, "loss": 3.2988, "step": 46600 }, { "epoch": 5.020988052954472, "grad_norm": 0.6431019902229309, "learning_rate": 0.00029917896778364396, "loss": 3.3149, "step": 46650 }, { "epoch": 5.0263696049940805, "grad_norm": 0.6791779398918152, "learning_rate": 0.00029885572675358255, "loss": 3.2962, "step": 46700 }, { "epoch": 5.031751157033688, "grad_norm": 0.6947548985481262, "learning_rate": 0.00029853248572352114, "loss": 3.3008, "step": 46750 }, { "epoch": 5.037132709073297, "grad_norm": 0.6451291441917419, "learning_rate": 0.00029820924469345974, "loss": 3.3082, "step": 46800 }, { "epoch": 5.042514261112905, "grad_norm": 0.698814332485199, "learning_rate": 0.0002978860036633983, "loss": 3.2975, "step": 46850 }, { "epoch": 5.047895813152513, "grad_norm": 0.702712893486023, "learning_rate": 0.00029756276263333693, "loss": 3.3188, "step": 46900 }, { "epoch": 5.0532773651921215, "grad_norm": 0.6753806471824646, "learning_rate": 0.00029723952160327547, "loss": 3.3139, "step": 46950 }, { "epoch": 5.05865891723173, "grad_norm": 0.7441586256027222, "learning_rate": 0.00029691628057321406, "loss": 3.3241, "step": 47000 }, { "epoch": 5.05865891723173, "eval_accuracy": 0.3806568088798963, "eval_loss": 3.4130375385284424, "eval_runtime": 184.7285, "eval_samples_per_second": 97.5, "eval_steps_per_second": 6.095, "step": 47000 }, { "epoch": 5.064040469271338, "grad_norm": 0.7186902761459351, "learning_rate": 0.00029659303954315266, "loss": 3.3039, "step": 47050 }, { "epoch": 5.069422021310946, "grad_norm": 0.6443047523498535, "learning_rate": 0.00029626979851309125, "loss": 3.3242, "step": 47100 }, { "epoch": 5.074803573350554, "grad_norm": 0.6580514907836914, "learning_rate": 0.00029594655748302985, "loss": 3.2998, "step": 47150 }, { "epoch": 5.080185125390162, "grad_norm": 0.6719970107078552, "learning_rate": 0.0002956233164529684, "loss": 3.3033, "step": 47200 }, { "epoch": 5.085566677429771, "grad_norm": 0.7296539545059204, "learning_rate": 0.000295300075422907, "loss": 3.299, "step": 47250 }, { "epoch": 5.090948229469379, "grad_norm": 0.7392880320549011, "learning_rate": 0.0002949768343928456, "loss": 3.3106, "step": 47300 }, { "epoch": 5.096329781508987, "grad_norm": 0.6548142433166504, "learning_rate": 0.00029465359336278417, "loss": 3.2949, "step": 47350 }, { "epoch": 5.101711333548596, "grad_norm": 0.6920472383499146, "learning_rate": 0.0002943303523327227, "loss": 3.302, "step": 47400 }, { "epoch": 5.107092885588203, "grad_norm": 0.6915966272354126, "learning_rate": 0.00029400711130266136, "loss": 3.315, "step": 47450 }, { "epoch": 5.112474437627812, "grad_norm": 0.7536485195159912, "learning_rate": 0.0002936838702725999, "loss": 3.3202, "step": 47500 }, { "epoch": 5.1178559896674205, "grad_norm": 0.6746701002120972, "learning_rate": 0.0002933606292425385, "loss": 3.3131, "step": 47550 }, { "epoch": 5.123237541707028, "grad_norm": 0.6896171569824219, "learning_rate": 0.0002930373882124771, "loss": 3.3146, "step": 47600 }, { "epoch": 5.128619093746637, "grad_norm": 0.6922616362571716, "learning_rate": 0.0002927141471824157, "loss": 3.3254, "step": 47650 }, { "epoch": 5.134000645786244, "grad_norm": 0.6527367234230042, "learning_rate": 0.0002923909061523542, "loss": 3.3171, "step": 47700 }, { "epoch": 5.139382197825853, "grad_norm": 0.7178485989570618, "learning_rate": 0.0002920676651222928, "loss": 3.322, "step": 47750 }, { "epoch": 5.1447637498654615, "grad_norm": 0.6550273895263672, "learning_rate": 0.0002917444240922314, "loss": 3.3252, "step": 47800 }, { "epoch": 5.150145301905069, "grad_norm": 0.6551401615142822, "learning_rate": 0.00029142118306216996, "loss": 3.3017, "step": 47850 }, { "epoch": 5.155526853944678, "grad_norm": 0.7181161046028137, "learning_rate": 0.0002910979420321086, "loss": 3.3123, "step": 47900 }, { "epoch": 5.160908405984286, "grad_norm": 0.7358765602111816, "learning_rate": 0.00029077470100204715, "loss": 3.3117, "step": 47950 }, { "epoch": 5.166289958023894, "grad_norm": 0.6698744297027588, "learning_rate": 0.00029045145997198574, "loss": 3.312, "step": 48000 }, { "epoch": 5.166289958023894, "eval_accuracy": 0.3811957272034561, "eval_loss": 3.4106051921844482, "eval_runtime": 184.3793, "eval_samples_per_second": 97.685, "eval_steps_per_second": 6.107, "step": 48000 }, { "epoch": 5.1716715100635025, "grad_norm": 0.7061935067176819, "learning_rate": 0.00029012821894192433, "loss": 3.3312, "step": 48050 }, { "epoch": 5.17705306210311, "grad_norm": 0.6918174028396606, "learning_rate": 0.00028980497791186293, "loss": 3.3123, "step": 48100 }, { "epoch": 5.182434614142719, "grad_norm": 0.6964541673660278, "learning_rate": 0.00028948820170240274, "loss": 3.3137, "step": 48150 }, { "epoch": 5.187816166182327, "grad_norm": 0.6781343221664429, "learning_rate": 0.00028916496067234133, "loss": 3.3102, "step": 48200 }, { "epoch": 5.193197718221935, "grad_norm": 0.6664396524429321, "learning_rate": 0.0002888417196422799, "loss": 3.3151, "step": 48250 }, { "epoch": 5.198579270261543, "grad_norm": 0.6614742875099182, "learning_rate": 0.00028851847861221847, "loss": 3.3069, "step": 48300 }, { "epoch": 5.203960822301152, "grad_norm": 0.6851457953453064, "learning_rate": 0.00028819523758215706, "loss": 3.3248, "step": 48350 }, { "epoch": 5.20934237434076, "grad_norm": 0.7182928323745728, "learning_rate": 0.00028787199655209566, "loss": 3.3229, "step": 48400 }, { "epoch": 5.214723926380368, "grad_norm": 0.7236764430999756, "learning_rate": 0.00028754875552203425, "loss": 3.337, "step": 48450 }, { "epoch": 5.220105478419977, "grad_norm": 0.6826680302619934, "learning_rate": 0.0002872255144919728, "loss": 3.3206, "step": 48500 }, { "epoch": 5.225487030459584, "grad_norm": 0.7440321445465088, "learning_rate": 0.00028690227346191144, "loss": 3.3193, "step": 48550 }, { "epoch": 5.230868582499193, "grad_norm": 0.698179304599762, "learning_rate": 0.00028657903243185, "loss": 3.3166, "step": 48600 }, { "epoch": 5.236250134538801, "grad_norm": 0.6869778037071228, "learning_rate": 0.0002862557914017886, "loss": 3.3437, "step": 48650 }, { "epoch": 5.241631686578409, "grad_norm": 0.6472514867782593, "learning_rate": 0.00028593255037172717, "loss": 3.3263, "step": 48700 }, { "epoch": 5.247013238618018, "grad_norm": 0.6622984409332275, "learning_rate": 0.00028560930934166576, "loss": 3.3033, "step": 48750 }, { "epoch": 5.252394790657625, "grad_norm": 0.687536358833313, "learning_rate": 0.00028528606831160436, "loss": 3.3054, "step": 48800 }, { "epoch": 5.257776342697234, "grad_norm": 0.6665416359901428, "learning_rate": 0.0002849628272815429, "loss": 3.3273, "step": 48850 }, { "epoch": 5.2631578947368425, "grad_norm": 0.7293050289154053, "learning_rate": 0.0002846395862514815, "loss": 3.3237, "step": 48900 }, { "epoch": 5.26853944677645, "grad_norm": 0.6864955425262451, "learning_rate": 0.0002843163452214201, "loss": 3.3194, "step": 48950 }, { "epoch": 5.273920998816059, "grad_norm": 0.6750340461730957, "learning_rate": 0.0002839931041913587, "loss": 3.3303, "step": 49000 }, { "epoch": 5.273920998816059, "eval_accuracy": 0.3815918756324277, "eval_loss": 3.406046152114868, "eval_runtime": 184.5273, "eval_samples_per_second": 97.606, "eval_steps_per_second": 6.102, "step": 49000 }, { "epoch": 5.279302550855666, "grad_norm": 0.6839402914047241, "learning_rate": 0.0002836698631612972, "loss": 3.3091, "step": 49050 }, { "epoch": 5.284684102895275, "grad_norm": 0.6985600590705872, "learning_rate": 0.00028334662213123587, "loss": 3.3188, "step": 49100 }, { "epoch": 5.2900656549348835, "grad_norm": 0.6883201599121094, "learning_rate": 0.0002830233811011744, "loss": 3.3188, "step": 49150 }, { "epoch": 5.295447206974491, "grad_norm": 0.7022517323493958, "learning_rate": 0.000282700140071113, "loss": 3.3139, "step": 49200 }, { "epoch": 5.3008287590141, "grad_norm": 0.7535263299942017, "learning_rate": 0.0002823768990410516, "loss": 3.3594, "step": 49250 }, { "epoch": 5.306210311053708, "grad_norm": 0.7504818439483643, "learning_rate": 0.00028205365801099014, "loss": 3.3368, "step": 49300 }, { "epoch": 5.311591863093316, "grad_norm": 0.6709638237953186, "learning_rate": 0.00028173041698092874, "loss": 3.3417, "step": 49350 }, { "epoch": 5.316973415132924, "grad_norm": 0.7753866910934448, "learning_rate": 0.00028140717595086733, "loss": 3.3401, "step": 49400 }, { "epoch": 5.322354967172533, "grad_norm": 0.753419041633606, "learning_rate": 0.0002810839349208059, "loss": 3.3209, "step": 49450 }, { "epoch": 5.327736519212141, "grad_norm": 0.7451731562614441, "learning_rate": 0.00028076069389074447, "loss": 3.3269, "step": 49500 }, { "epoch": 5.333118071251749, "grad_norm": 0.6875990033149719, "learning_rate": 0.0002804374528606831, "loss": 3.3183, "step": 49550 }, { "epoch": 5.338499623291357, "grad_norm": 0.7078640460968018, "learning_rate": 0.00028011421183062166, "loss": 3.3154, "step": 49600 }, { "epoch": 5.343881175330965, "grad_norm": 0.7015400528907776, "learning_rate": 0.00027979097080056025, "loss": 3.3199, "step": 49650 }, { "epoch": 5.349262727370574, "grad_norm": 0.6855900883674622, "learning_rate": 0.00027946772977049885, "loss": 3.3365, "step": 49700 }, { "epoch": 5.354644279410182, "grad_norm": 0.6965287923812866, "learning_rate": 0.00027914448874043744, "loss": 3.3461, "step": 49750 }, { "epoch": 5.36002583144979, "grad_norm": 0.706678569316864, "learning_rate": 0.00027882124771037603, "loss": 3.3384, "step": 49800 }, { "epoch": 5.365407383489399, "grad_norm": 0.7243395447731018, "learning_rate": 0.0002784980066803146, "loss": 3.3406, "step": 49850 }, { "epoch": 5.370788935529006, "grad_norm": 0.7101507782936096, "learning_rate": 0.00027817476565025317, "loss": 3.3327, "step": 49900 }, { "epoch": 5.376170487568615, "grad_norm": 0.7034843564033508, "learning_rate": 0.00027785152462019176, "loss": 3.3268, "step": 49950 }, { "epoch": 5.3815520396082235, "grad_norm": 0.6907040476799011, "learning_rate": 0.00027752828359013036, "loss": 3.3415, "step": 50000 }, { "epoch": 5.3815520396082235, "eval_accuracy": 0.38182634856433134, "eval_loss": 3.402235507965088, "eval_runtime": 184.8851, "eval_samples_per_second": 97.417, "eval_steps_per_second": 6.09, "step": 50000 }, { "epoch": 5.386933591647831, "grad_norm": 0.6705796122550964, "learning_rate": 0.0002772050425600689, "loss": 3.356, "step": 50050 }, { "epoch": 5.39231514368744, "grad_norm": 0.7291210889816284, "learning_rate": 0.00027688180153000755, "loss": 3.3134, "step": 50100 }, { "epoch": 5.397696695727047, "grad_norm": 0.7164068222045898, "learning_rate": 0.0002765585604999461, "loss": 3.3312, "step": 50150 }, { "epoch": 5.403078247766656, "grad_norm": 0.7845756411552429, "learning_rate": 0.0002762353194698847, "loss": 3.3374, "step": 50200 }, { "epoch": 5.4084597998062645, "grad_norm": 0.7239874601364136, "learning_rate": 0.0002759120784398233, "loss": 3.3221, "step": 50250 }, { "epoch": 5.413841351845872, "grad_norm": 0.6949788331985474, "learning_rate": 0.00027558883740976187, "loss": 3.341, "step": 50300 }, { "epoch": 5.419222903885481, "grad_norm": 0.680122971534729, "learning_rate": 0.0002752655963797004, "loss": 3.3322, "step": 50350 }, { "epoch": 5.424604455925088, "grad_norm": 0.679383397102356, "learning_rate": 0.000274942355349639, "loss": 3.3406, "step": 50400 }, { "epoch": 5.429986007964697, "grad_norm": 0.7332752346992493, "learning_rate": 0.0002746191143195776, "loss": 3.3444, "step": 50450 }, { "epoch": 5.435367560004305, "grad_norm": 0.7282567620277405, "learning_rate": 0.0002742958732895162, "loss": 3.3267, "step": 50500 }, { "epoch": 5.440749112043913, "grad_norm": 0.7126004695892334, "learning_rate": 0.000273979097080056, "loss": 3.3275, "step": 50550 }, { "epoch": 5.446130664083522, "grad_norm": 0.7799147963523865, "learning_rate": 0.0002736558560499946, "loss": 3.3277, "step": 50600 }, { "epoch": 5.45151221612313, "grad_norm": 0.6828324794769287, "learning_rate": 0.0002733390798405344, "loss": 3.3377, "step": 50650 }, { "epoch": 5.456893768162738, "grad_norm": 0.7099668383598328, "learning_rate": 0.000273015838810473, "loss": 3.325, "step": 50700 }, { "epoch": 5.462275320202346, "grad_norm": 0.7239611148834229, "learning_rate": 0.0002726925977804116, "loss": 3.3314, "step": 50750 }, { "epoch": 5.467656872241955, "grad_norm": 0.6668616533279419, "learning_rate": 0.00027236935675035013, "loss": 3.3309, "step": 50800 }, { "epoch": 5.473038424281563, "grad_norm": 0.6595761179924011, "learning_rate": 0.00027204611572028873, "loss": 3.3113, "step": 50850 }, { "epoch": 5.478419976321171, "grad_norm": 0.6890519261360168, "learning_rate": 0.0002717228746902273, "loss": 3.324, "step": 50900 }, { "epoch": 5.483801528360779, "grad_norm": 0.7110984325408936, "learning_rate": 0.0002713996336601659, "loss": 3.3335, "step": 50950 }, { "epoch": 5.489183080400387, "grad_norm": 0.6996448040008545, "learning_rate": 0.0002710763926301045, "loss": 3.3321, "step": 51000 }, { "epoch": 5.489183080400387, "eval_accuracy": 0.38222977673678654, "eval_loss": 3.397602081298828, "eval_runtime": 184.4703, "eval_samples_per_second": 97.636, "eval_steps_per_second": 6.104, "step": 51000 }, { "epoch": 5.494564632439996, "grad_norm": 0.6930586099624634, "learning_rate": 0.00027075315160004305, "loss": 3.3259, "step": 51050 }, { "epoch": 5.499946184479604, "grad_norm": 0.6693201661109924, "learning_rate": 0.00027042991056998165, "loss": 3.3238, "step": 51100 }, { "epoch": 5.505327736519212, "grad_norm": 0.7076327204704285, "learning_rate": 0.00027010666953992024, "loss": 3.3132, "step": 51150 }, { "epoch": 5.510709288558821, "grad_norm": 0.7098067998886108, "learning_rate": 0.00026978342850985884, "loss": 3.3202, "step": 51200 }, { "epoch": 5.516090840598428, "grad_norm": 0.7085769176483154, "learning_rate": 0.0002694601874797974, "loss": 3.3231, "step": 51250 }, { "epoch": 5.521472392638037, "grad_norm": 0.7210639715194702, "learning_rate": 0.000269136946449736, "loss": 3.3344, "step": 51300 }, { "epoch": 5.5268539446776455, "grad_norm": 0.7563997507095337, "learning_rate": 0.00026881370541967457, "loss": 3.3448, "step": 51350 }, { "epoch": 5.532235496717253, "grad_norm": 0.6973581314086914, "learning_rate": 0.00026849046438961316, "loss": 3.3427, "step": 51400 }, { "epoch": 5.537617048756862, "grad_norm": 0.6693292856216431, "learning_rate": 0.00026816722335955176, "loss": 3.3359, "step": 51450 }, { "epoch": 5.542998600796469, "grad_norm": 0.7080926299095154, "learning_rate": 0.00026784398232949035, "loss": 3.3283, "step": 51500 }, { "epoch": 5.548380152836078, "grad_norm": 0.6850876808166504, "learning_rate": 0.00026752074129942895, "loss": 3.3366, "step": 51550 }, { "epoch": 5.553761704875686, "grad_norm": 0.7193228602409363, "learning_rate": 0.0002671975002693675, "loss": 3.3334, "step": 51600 }, { "epoch": 5.559143256915294, "grad_norm": 0.7010207176208496, "learning_rate": 0.0002668742592393061, "loss": 3.3395, "step": 51650 }, { "epoch": 5.564524808954903, "grad_norm": 0.6764205694198608, "learning_rate": 0.0002665510182092447, "loss": 3.3341, "step": 51700 }, { "epoch": 5.569906360994511, "grad_norm": 0.6543567180633545, "learning_rate": 0.00026622777717918327, "loss": 3.3362, "step": 51750 }, { "epoch": 5.575287913034119, "grad_norm": 0.7548144459724426, "learning_rate": 0.0002659045361491218, "loss": 3.3307, "step": 51800 }, { "epoch": 5.580669465073727, "grad_norm": 0.7587510943412781, "learning_rate": 0.00026558129511906046, "loss": 3.3517, "step": 51850 }, { "epoch": 5.586051017113336, "grad_norm": 0.6862114667892456, "learning_rate": 0.000265258054088999, "loss": 3.3552, "step": 51900 }, { "epoch": 5.591432569152944, "grad_norm": 0.8210598230361938, "learning_rate": 0.0002649348130589376, "loss": 3.3333, "step": 51950 }, { "epoch": 5.596814121192552, "grad_norm": 0.7292066216468811, "learning_rate": 0.0002646115720288762, "loss": 3.345, "step": 52000 }, { "epoch": 5.596814121192552, "eval_accuracy": 0.3826878573118124, "eval_loss": 3.3923404216766357, "eval_runtime": 184.6516, "eval_samples_per_second": 97.54, "eval_steps_per_second": 6.098, "step": 52000 }, { "epoch": 5.60219567323216, "grad_norm": 0.7065765857696533, "learning_rate": 0.00026428833099881473, "loss": 3.3593, "step": 52050 }, { "epoch": 5.607577225271768, "grad_norm": 0.7100358009338379, "learning_rate": 0.0002639650899687533, "loss": 3.3294, "step": 52100 }, { "epoch": 5.612958777311377, "grad_norm": 0.7037196755409241, "learning_rate": 0.0002636418489386919, "loss": 3.3131, "step": 52150 }, { "epoch": 5.618340329350985, "grad_norm": 0.7100152373313904, "learning_rate": 0.0002633186079086305, "loss": 3.3275, "step": 52200 }, { "epoch": 5.623721881390593, "grad_norm": 0.6936700344085693, "learning_rate": 0.00026299536687856905, "loss": 3.3193, "step": 52250 }, { "epoch": 5.629103433430201, "grad_norm": 0.7310315370559692, "learning_rate": 0.0002626721258485077, "loss": 3.3352, "step": 52300 }, { "epoch": 5.634484985469809, "grad_norm": 0.7135825753211975, "learning_rate": 0.00026234888481844624, "loss": 3.33, "step": 52350 }, { "epoch": 5.639866537509418, "grad_norm": 0.6975454688072205, "learning_rate": 0.00026202564378838484, "loss": 3.3285, "step": 52400 }, { "epoch": 5.645248089549026, "grad_norm": 0.6941152811050415, "learning_rate": 0.00026170240275832343, "loss": 3.3472, "step": 52450 }, { "epoch": 5.650629641588634, "grad_norm": 0.7469999194145203, "learning_rate": 0.00026137916172826203, "loss": 3.3204, "step": 52500 }, { "epoch": 5.656011193628243, "grad_norm": 0.7071282863616943, "learning_rate": 0.0002610559206982006, "loss": 3.3354, "step": 52550 }, { "epoch": 5.66139274566785, "grad_norm": 0.705010175704956, "learning_rate": 0.00026073267966813916, "loss": 3.3401, "step": 52600 }, { "epoch": 5.666774297707459, "grad_norm": 0.769747257232666, "learning_rate": 0.00026040943863807776, "loss": 3.3283, "step": 52650 }, { "epoch": 5.672155849747067, "grad_norm": 0.7160783410072327, "learning_rate": 0.00026008619760801635, "loss": 3.3407, "step": 52700 }, { "epoch": 5.677537401786675, "grad_norm": 0.7041793465614319, "learning_rate": 0.00025976295657795495, "loss": 3.3489, "step": 52750 }, { "epoch": 5.682918953826284, "grad_norm": 0.7076753377914429, "learning_rate": 0.0002594397155478935, "loss": 3.343, "step": 52800 }, { "epoch": 5.688300505865891, "grad_norm": 0.7296324968338013, "learning_rate": 0.00025911647451783214, "loss": 3.3402, "step": 52850 }, { "epoch": 5.6936820579055, "grad_norm": 0.70832759141922, "learning_rate": 0.0002587932334877707, "loss": 3.3191, "step": 52900 }, { "epoch": 5.699063609945108, "grad_norm": 0.6894454956054688, "learning_rate": 0.00025846999245770927, "loss": 3.3372, "step": 52950 }, { "epoch": 5.704445161984716, "grad_norm": 0.7215243577957153, "learning_rate": 0.00025814675142764787, "loss": 3.3433, "step": 53000 }, { "epoch": 5.704445161984716, "eval_accuracy": 0.38319276728148627, "eval_loss": 3.3882508277893066, "eval_runtime": 186.9278, "eval_samples_per_second": 96.353, "eval_steps_per_second": 6.024, "step": 53000 }, { "epoch": 5.709826714024325, "grad_norm": 0.7630006074905396, "learning_rate": 0.00025782351039758646, "loss": 3.3234, "step": 53050 }, { "epoch": 5.715208266063933, "grad_norm": 0.6765692234039307, "learning_rate": 0.000257500269367525, "loss": 3.3324, "step": 53100 }, { "epoch": 5.720589818103541, "grad_norm": 0.7011041641235352, "learning_rate": 0.0002571770283374636, "loss": 3.3426, "step": 53150 }, { "epoch": 5.725971370143149, "grad_norm": 0.6970390677452087, "learning_rate": 0.0002568537873074022, "loss": 3.3182, "step": 53200 }, { "epoch": 5.731352922182758, "grad_norm": 0.7535760402679443, "learning_rate": 0.0002565305462773408, "loss": 3.3382, "step": 53250 }, { "epoch": 5.736734474222366, "grad_norm": 0.7167165279388428, "learning_rate": 0.0002562073052472794, "loss": 3.3495, "step": 53300 }, { "epoch": 5.742116026261974, "grad_norm": 0.7161158323287964, "learning_rate": 0.0002558840642172179, "loss": 3.3477, "step": 53350 }, { "epoch": 5.747497578301582, "grad_norm": 0.7228608727455139, "learning_rate": 0.00025556082318715657, "loss": 3.3316, "step": 53400 }, { "epoch": 5.75287913034119, "grad_norm": 0.7354365587234497, "learning_rate": 0.0002552375821570951, "loss": 3.35, "step": 53450 }, { "epoch": 5.758260682380799, "grad_norm": 0.7172362208366394, "learning_rate": 0.0002549143411270337, "loss": 3.3494, "step": 53500 }, { "epoch": 5.763642234420407, "grad_norm": 0.7296724319458008, "learning_rate": 0.0002545911000969723, "loss": 3.3451, "step": 53550 }, { "epoch": 5.769023786460015, "grad_norm": 0.7641502022743225, "learning_rate": 0.0002542678590669109, "loss": 3.3496, "step": 53600 }, { "epoch": 5.774405338499624, "grad_norm": 0.6677383184432983, "learning_rate": 0.00025394461803684943, "loss": 3.3406, "step": 53650 }, { "epoch": 5.779786890539231, "grad_norm": 0.7079816460609436, "learning_rate": 0.00025362137700678803, "loss": 3.3472, "step": 53700 }, { "epoch": 5.78516844257884, "grad_norm": 0.8173105120658875, "learning_rate": 0.0002532981359767266, "loss": 3.3269, "step": 53750 }, { "epoch": 5.790549994618448, "grad_norm": 0.740807294845581, "learning_rate": 0.0002529748949466652, "loss": 3.3255, "step": 53800 }, { "epoch": 5.795931546658056, "grad_norm": 0.7093555927276611, "learning_rate": 0.0002526516539166038, "loss": 3.3337, "step": 53850 }, { "epoch": 5.801313098697665, "grad_norm": 0.7256129384040833, "learning_rate": 0.00025232841288654235, "loss": 3.3258, "step": 53900 }, { "epoch": 5.806694650737272, "grad_norm": 0.6999818682670593, "learning_rate": 0.00025200517185648095, "loss": 3.3464, "step": 53950 }, { "epoch": 5.812076202776881, "grad_norm": 0.7375802397727966, "learning_rate": 0.00025168193082641954, "loss": 3.3351, "step": 54000 }, { "epoch": 5.812076202776881, "eval_accuracy": 0.3836312903367055, "eval_loss": 3.3833701610565186, "eval_runtime": 185.1377, "eval_samples_per_second": 97.284, "eval_steps_per_second": 6.082, "step": 54000 }, { "epoch": 5.817457754816489, "grad_norm": 0.714192807674408, "learning_rate": 0.00025135868979635814, "loss": 3.3304, "step": 54050 }, { "epoch": 5.822839306856097, "grad_norm": 0.7494600415229797, "learning_rate": 0.0002510354487662967, "loss": 3.3435, "step": 54100 }, { "epoch": 5.828220858895706, "grad_norm": 0.730343759059906, "learning_rate": 0.0002507122077362353, "loss": 3.3399, "step": 54150 }, { "epoch": 5.833602410935313, "grad_norm": 0.7611250877380371, "learning_rate": 0.00025038896670617387, "loss": 3.349, "step": 54200 }, { "epoch": 5.838983962974922, "grad_norm": 0.7368343472480774, "learning_rate": 0.00025006572567611246, "loss": 3.3408, "step": 54250 }, { "epoch": 5.84436551501453, "grad_norm": 0.711065948009491, "learning_rate": 0.00024974248464605106, "loss": 3.3482, "step": 54300 }, { "epoch": 5.849747067054138, "grad_norm": 0.7511163353919983, "learning_rate": 0.00024941924361598965, "loss": 3.3501, "step": 54350 }, { "epoch": 5.855128619093747, "grad_norm": 0.682986319065094, "learning_rate": 0.00024909600258592825, "loss": 3.3234, "step": 54400 }, { "epoch": 5.860510171133355, "grad_norm": 0.6947693824768066, "learning_rate": 0.0002487727615558668, "loss": 3.3509, "step": 54450 }, { "epoch": 5.865891723172963, "grad_norm": 0.6829459071159363, "learning_rate": 0.0002484495205258054, "loss": 3.3442, "step": 54500 }, { "epoch": 5.871273275212571, "grad_norm": 0.7875406742095947, "learning_rate": 0.000248126279495744, "loss": 3.3377, "step": 54550 }, { "epoch": 5.87665482725218, "grad_norm": 0.7647419571876526, "learning_rate": 0.00024780303846568257, "loss": 3.3455, "step": 54600 }, { "epoch": 5.882036379291788, "grad_norm": 0.687256395816803, "learning_rate": 0.0002474797974356211, "loss": 3.3324, "step": 54650 }, { "epoch": 5.887417931331396, "grad_norm": 0.7644538283348083, "learning_rate": 0.00024715655640555976, "loss": 3.3397, "step": 54700 }, { "epoch": 5.892799483371004, "grad_norm": 0.7015940546989441, "learning_rate": 0.0002468333153754983, "loss": 3.334, "step": 54750 }, { "epoch": 5.898181035410612, "grad_norm": 0.701823353767395, "learning_rate": 0.0002465100743454369, "loss": 3.3449, "step": 54800 }, { "epoch": 5.903562587450221, "grad_norm": 0.684441328048706, "learning_rate": 0.0002461868333153755, "loss": 3.3369, "step": 54850 }, { "epoch": 5.9089441394898286, "grad_norm": 0.7343729734420776, "learning_rate": 0.0002458635922853141, "loss": 3.3461, "step": 54900 }, { "epoch": 5.914325691529437, "grad_norm": 0.7203894257545471, "learning_rate": 0.0002455403512552526, "loss": 3.3287, "step": 54950 }, { "epoch": 5.919707243569046, "grad_norm": 0.7090471982955933, "learning_rate": 0.0002452171102251912, "loss": 3.3374, "step": 55000 }, { "epoch": 5.919707243569046, "eval_accuracy": 0.38405818753292864, "eval_loss": 3.379855155944824, "eval_runtime": 184.4932, "eval_samples_per_second": 97.624, "eval_steps_per_second": 6.103, "step": 55000 }, { "epoch": 5.925088795608653, "grad_norm": 0.6990830898284912, "learning_rate": 0.0002448938691951298, "loss": 3.3292, "step": 55050 }, { "epoch": 5.930470347648262, "grad_norm": 0.7348403334617615, "learning_rate": 0.0002445706281650684, "loss": 3.3253, "step": 55100 }, { "epoch": 5.93585189968787, "grad_norm": 0.7289194464683533, "learning_rate": 0.000244247387135007, "loss": 3.3327, "step": 55150 }, { "epoch": 5.941233451727478, "grad_norm": 0.74270099401474, "learning_rate": 0.00024392414610494557, "loss": 3.3126, "step": 55200 }, { "epoch": 5.946615003767087, "grad_norm": 0.6974494457244873, "learning_rate": 0.00024360090507488414, "loss": 3.3297, "step": 55250 }, { "epoch": 5.951996555806694, "grad_norm": 0.7457931637763977, "learning_rate": 0.00024327766404482273, "loss": 3.3188, "step": 55300 }, { "epoch": 5.957378107846303, "grad_norm": 0.7141943573951721, "learning_rate": 0.0002429544230147613, "loss": 3.3607, "step": 55350 }, { "epoch": 5.962759659885911, "grad_norm": 0.7391859889030457, "learning_rate": 0.00024263118198469992, "loss": 3.3317, "step": 55400 }, { "epoch": 5.968141211925519, "grad_norm": 0.7539886832237244, "learning_rate": 0.0002423079409546385, "loss": 3.3524, "step": 55450 }, { "epoch": 5.973522763965128, "grad_norm": 0.764616072177887, "learning_rate": 0.00024198469992457706, "loss": 3.3379, "step": 55500 }, { "epoch": 5.978904316004736, "grad_norm": 0.7487912178039551, "learning_rate": 0.00024166145889451568, "loss": 3.3334, "step": 55550 }, { "epoch": 5.984285868044344, "grad_norm": 0.7643945217132568, "learning_rate": 0.00024133821786445425, "loss": 3.3554, "step": 55600 }, { "epoch": 5.989667420083952, "grad_norm": 0.726444661617279, "learning_rate": 0.0002410149768343928, "loss": 3.3291, "step": 55650 }, { "epoch": 5.995048972123561, "grad_norm": 0.7337467670440674, "learning_rate": 0.0002406917358043314, "loss": 3.3395, "step": 55700 }, { "epoch": 6.000430524163169, "grad_norm": 0.7354254722595215, "learning_rate": 0.00024036849477427, "loss": 3.3185, "step": 55750 }, { "epoch": 6.005812076202777, "grad_norm": 0.7437455058097839, "learning_rate": 0.00024004525374420857, "loss": 3.251, "step": 55800 }, { "epoch": 6.011193628242385, "grad_norm": 0.7114704251289368, "learning_rate": 0.00023972201271414716, "loss": 3.2531, "step": 55850 }, { "epoch": 6.016575180281993, "grad_norm": 0.7447876930236816, "learning_rate": 0.00023939877168408573, "loss": 3.2347, "step": 55900 }, { "epoch": 6.021956732321602, "grad_norm": 0.7433951497077942, "learning_rate": 0.00023907553065402433, "loss": 3.2466, "step": 55950 }, { "epoch": 6.0273382843612096, "grad_norm": 0.7271364331245422, "learning_rate": 0.00023875875444456413, "loss": 3.2475, "step": 56000 }, { "epoch": 6.0273382843612096, "eval_accuracy": 0.3842132352038399, "eval_loss": 3.383183002471924, "eval_runtime": 184.6903, "eval_samples_per_second": 97.52, "eval_steps_per_second": 6.097, "step": 56000 }, { "epoch": 6.032719836400818, "grad_norm": 0.672038733959198, "learning_rate": 0.0002384355134145027, "loss": 3.2535, "step": 56050 }, { "epoch": 6.038101388440427, "grad_norm": 0.7392109036445618, "learning_rate": 0.00023811227238444132, "loss": 3.23, "step": 56100 }, { "epoch": 6.043482940480034, "grad_norm": 0.7268337607383728, "learning_rate": 0.0002377890313543799, "loss": 3.2515, "step": 56150 }, { "epoch": 6.048864492519643, "grad_norm": 0.7550994753837585, "learning_rate": 0.00023746579032431849, "loss": 3.2548, "step": 56200 }, { "epoch": 6.0542460445592505, "grad_norm": 0.7194846868515015, "learning_rate": 0.00023714254929425708, "loss": 3.2536, "step": 56250 }, { "epoch": 6.059627596598859, "grad_norm": 0.7446109652519226, "learning_rate": 0.00023681930826419565, "loss": 3.2548, "step": 56300 }, { "epoch": 6.065009148638468, "grad_norm": 0.7766132354736328, "learning_rate": 0.00023649606723413424, "loss": 3.2696, "step": 56350 }, { "epoch": 6.070390700678075, "grad_norm": 0.7171474695205688, "learning_rate": 0.0002361728262040728, "loss": 3.2452, "step": 56400 }, { "epoch": 6.075772252717684, "grad_norm": 0.7210790514945984, "learning_rate": 0.00023584958517401138, "loss": 3.2505, "step": 56450 }, { "epoch": 6.081153804757292, "grad_norm": 0.743676483631134, "learning_rate": 0.00023552634414395, "loss": 3.2716, "step": 56500 }, { "epoch": 6.0865353567969, "grad_norm": 0.7467906475067139, "learning_rate": 0.00023520310311388857, "loss": 3.2724, "step": 56550 }, { "epoch": 6.091916908836509, "grad_norm": 0.8049376010894775, "learning_rate": 0.00023487986208382713, "loss": 3.2701, "step": 56600 }, { "epoch": 6.097298460876116, "grad_norm": 0.7321614623069763, "learning_rate": 0.00023455662105376576, "loss": 3.2599, "step": 56650 }, { "epoch": 6.102680012915725, "grad_norm": 0.779611349105835, "learning_rate": 0.00023423338002370432, "loss": 3.2633, "step": 56700 }, { "epoch": 6.108061564955333, "grad_norm": 0.7684844136238098, "learning_rate": 0.0002339101389936429, "loss": 3.2466, "step": 56750 }, { "epoch": 6.113443116994941, "grad_norm": 0.7690160870552063, "learning_rate": 0.00023358689796358149, "loss": 3.2413, "step": 56800 }, { "epoch": 6.11882466903455, "grad_norm": 0.7911422252655029, "learning_rate": 0.00023326365693352008, "loss": 3.2716, "step": 56850 }, { "epoch": 6.124206221074158, "grad_norm": 0.7434123754501343, "learning_rate": 0.00023294041590345865, "loss": 3.2745, "step": 56900 }, { "epoch": 6.129587773113766, "grad_norm": 0.7318158149719238, "learning_rate": 0.00023261717487339724, "loss": 3.2717, "step": 56950 }, { "epoch": 6.134969325153374, "grad_norm": 0.748771071434021, "learning_rate": 0.0002322939338433358, "loss": 3.2754, "step": 57000 }, { "epoch": 6.134969325153374, "eval_accuracy": 0.3840938256801318, "eval_loss": 3.383234739303589, "eval_runtime": 184.5035, "eval_samples_per_second": 97.619, "eval_steps_per_second": 6.103, "step": 57000 }, { "epoch": 6.140350877192983, "grad_norm": 0.7642286419868469, "learning_rate": 0.0002319706928132744, "loss": 3.2569, "step": 57050 }, { "epoch": 6.1457324292325906, "grad_norm": 0.7967156767845154, "learning_rate": 0.000231647451783213, "loss": 3.272, "step": 57100 }, { "epoch": 6.151113981272199, "grad_norm": 0.7421084046363831, "learning_rate": 0.00023132421075315157, "loss": 3.2694, "step": 57150 }, { "epoch": 6.156495533311807, "grad_norm": 0.7525152564048767, "learning_rate": 0.0002310009697230902, "loss": 3.2596, "step": 57200 }, { "epoch": 6.161877085351415, "grad_norm": 0.6975091695785522, "learning_rate": 0.00023067772869302876, "loss": 3.2706, "step": 57250 }, { "epoch": 6.167258637391024, "grad_norm": 0.7624300122261047, "learning_rate": 0.00023035448766296732, "loss": 3.2747, "step": 57300 }, { "epoch": 6.1726401894306315, "grad_norm": 0.7489650845527649, "learning_rate": 0.00023003124663290592, "loss": 3.2598, "step": 57350 }, { "epoch": 6.17802174147024, "grad_norm": 0.747357189655304, "learning_rate": 0.0002297080056028445, "loss": 3.2593, "step": 57400 }, { "epoch": 6.183403293509849, "grad_norm": 0.7370163202285767, "learning_rate": 0.00022938476457278308, "loss": 3.2656, "step": 57450 }, { "epoch": 6.188784845549456, "grad_norm": 0.7547042369842529, "learning_rate": 0.00022906152354272168, "loss": 3.2699, "step": 57500 }, { "epoch": 6.194166397589065, "grad_norm": 0.7671395540237427, "learning_rate": 0.00022873828251266024, "loss": 3.2792, "step": 57550 }, { "epoch": 6.1995479496286725, "grad_norm": 0.7778907418251038, "learning_rate": 0.00022841504148259884, "loss": 3.2836, "step": 57600 }, { "epoch": 6.204929501668281, "grad_norm": 0.7617507576942444, "learning_rate": 0.00022809180045253743, "loss": 3.2632, "step": 57650 }, { "epoch": 6.21031105370789, "grad_norm": 0.8062065839767456, "learning_rate": 0.000227768559422476, "loss": 3.2826, "step": 57700 }, { "epoch": 6.215692605747497, "grad_norm": 0.7325953841209412, "learning_rate": 0.00022744531839241457, "loss": 3.2799, "step": 57750 }, { "epoch": 6.221074157787106, "grad_norm": 0.7609289288520813, "learning_rate": 0.0002271220773623532, "loss": 3.2695, "step": 57800 }, { "epoch": 6.226455709826714, "grad_norm": 0.8026849031448364, "learning_rate": 0.00022679883633229176, "loss": 3.27, "step": 57850 }, { "epoch": 6.231837261866322, "grad_norm": 0.7824159860610962, "learning_rate": 0.00022647559530223032, "loss": 3.2942, "step": 57900 }, { "epoch": 6.237218813905931, "grad_norm": 0.7842664122581482, "learning_rate": 0.00022615235427216895, "loss": 3.2698, "step": 57950 }, { "epoch": 6.242600365945538, "grad_norm": 0.8548306226730347, "learning_rate": 0.0002258291132421075, "loss": 3.2745, "step": 58000 }, { "epoch": 6.242600365945538, "eval_accuracy": 0.384586240567707, "eval_loss": 3.378992795944214, "eval_runtime": 184.88, "eval_samples_per_second": 97.42, "eval_steps_per_second": 6.09, "step": 58000 }, { "epoch": 6.247981917985147, "grad_norm": 0.744491696357727, "learning_rate": 0.0002255058722120461, "loss": 3.2725, "step": 58050 }, { "epoch": 6.253363470024755, "grad_norm": 0.7470669150352478, "learning_rate": 0.00022518263118198468, "loss": 3.2718, "step": 58100 }, { "epoch": 6.258745022064363, "grad_norm": 0.7122950553894043, "learning_rate": 0.00022485939015192327, "loss": 3.2667, "step": 58150 }, { "epoch": 6.264126574103972, "grad_norm": 0.7258158922195435, "learning_rate": 0.00022453614912186186, "loss": 3.265, "step": 58200 }, { "epoch": 6.26950812614358, "grad_norm": 0.7712364196777344, "learning_rate": 0.00022421290809180043, "loss": 3.2725, "step": 58250 }, { "epoch": 6.274889678183188, "grad_norm": 0.7405200600624084, "learning_rate": 0.000223889667061739, "loss": 3.2729, "step": 58300 }, { "epoch": 6.280271230222796, "grad_norm": 0.7869605422019958, "learning_rate": 0.00022356642603167762, "loss": 3.2867, "step": 58350 }, { "epoch": 6.285652782262405, "grad_norm": 0.7676852345466614, "learning_rate": 0.0002232431850016162, "loss": 3.2751, "step": 58400 }, { "epoch": 6.2910343343020125, "grad_norm": 0.8062360286712646, "learning_rate": 0.00022291994397155476, "loss": 3.2947, "step": 58450 }, { "epoch": 6.296415886341621, "grad_norm": 0.7730180621147156, "learning_rate": 0.00022259670294149338, "loss": 3.2726, "step": 58500 }, { "epoch": 6.301797438381229, "grad_norm": 0.7443013787269592, "learning_rate": 0.00022227346191143195, "loss": 3.2728, "step": 58550 }, { "epoch": 6.307178990420837, "grad_norm": 0.7263332605361938, "learning_rate": 0.00022195022088137051, "loss": 3.2891, "step": 58600 }, { "epoch": 6.312560542460446, "grad_norm": 0.730728268623352, "learning_rate": 0.0002216269798513091, "loss": 3.2826, "step": 58650 }, { "epoch": 6.3179420945000535, "grad_norm": 0.7482548952102661, "learning_rate": 0.00022130373882124768, "loss": 3.276, "step": 58700 }, { "epoch": 6.323323646539662, "grad_norm": 0.7771552205085754, "learning_rate": 0.00022098049779118627, "loss": 3.2764, "step": 58750 }, { "epoch": 6.328705198579271, "grad_norm": 0.7530174851417542, "learning_rate": 0.00022065725676112487, "loss": 3.2929, "step": 58800 }, { "epoch": 6.334086750618878, "grad_norm": 0.7332282066345215, "learning_rate": 0.00022033401573106343, "loss": 3.2768, "step": 58850 }, { "epoch": 6.339468302658487, "grad_norm": 0.8075442314147949, "learning_rate": 0.000220010774701002, "loss": 3.2745, "step": 58900 }, { "epoch": 6.344849854698095, "grad_norm": 0.7218873500823975, "learning_rate": 0.00021968753367094062, "loss": 3.2761, "step": 58950 }, { "epoch": 6.350231406737703, "grad_norm": 0.733670711517334, "learning_rate": 0.0002193642926408792, "loss": 3.2611, "step": 59000 }, { "epoch": 6.350231406737703, "eval_accuracy": 0.3853270359568261, "eval_loss": 3.375654935836792, "eval_runtime": 184.5756, "eval_samples_per_second": 97.581, "eval_steps_per_second": 6.1, "step": 59000 }, { "epoch": 6.355612958777312, "grad_norm": 0.7387909293174744, "learning_rate": 0.00021904105161081778, "loss": 3.258, "step": 59050 }, { "epoch": 6.360994510816919, "grad_norm": 0.869426965713501, "learning_rate": 0.00021871781058075638, "loss": 3.2898, "step": 59100 }, { "epoch": 6.366376062856528, "grad_norm": 0.7325130701065063, "learning_rate": 0.00021839456955069495, "loss": 3.2768, "step": 59150 }, { "epoch": 6.371757614896136, "grad_norm": 0.7691362500190735, "learning_rate": 0.00021807132852063354, "loss": 3.2778, "step": 59200 }, { "epoch": 6.377139166935744, "grad_norm": 0.7573683857917786, "learning_rate": 0.0002177480874905721, "loss": 3.2984, "step": 59250 }, { "epoch": 6.382520718975353, "grad_norm": 0.7838876843452454, "learning_rate": 0.0002174248464605107, "loss": 3.2733, "step": 59300 }, { "epoch": 6.387902271014961, "grad_norm": 0.793933093547821, "learning_rate": 0.0002171016054304493, "loss": 3.2801, "step": 59350 }, { "epoch": 6.393283823054569, "grad_norm": 0.7980042099952698, "learning_rate": 0.00021677836440038787, "loss": 3.2682, "step": 59400 }, { "epoch": 6.398665375094177, "grad_norm": 0.7875555753707886, "learning_rate": 0.0002164615881909277, "loss": 3.2946, "step": 59450 }, { "epoch": 6.404046927133785, "grad_norm": 0.76567542552948, "learning_rate": 0.00021613834716086627, "loss": 3.2778, "step": 59500 }, { "epoch": 6.4094284791733935, "grad_norm": 0.7699328660964966, "learning_rate": 0.00021581510613080483, "loss": 3.3058, "step": 59550 }, { "epoch": 6.414810031213002, "grad_norm": 0.7370243668556213, "learning_rate": 0.00021549186510074346, "loss": 3.275, "step": 59600 }, { "epoch": 6.42019158325261, "grad_norm": 0.7352474927902222, "learning_rate": 0.00021516862407068202, "loss": 3.2637, "step": 59650 }, { "epoch": 6.425573135292218, "grad_norm": 0.7487669587135315, "learning_rate": 0.0002148453830406206, "loss": 3.263, "step": 59700 }, { "epoch": 6.430954687331827, "grad_norm": 0.8222890496253967, "learning_rate": 0.00021452214201055919, "loss": 3.2685, "step": 59750 }, { "epoch": 6.4363362393714345, "grad_norm": 0.7670249342918396, "learning_rate": 0.00021419890098049778, "loss": 3.2656, "step": 59800 }, { "epoch": 6.441717791411043, "grad_norm": 0.7992569208145142, "learning_rate": 0.00021387565995043638, "loss": 3.2902, "step": 59850 }, { "epoch": 6.447099343450651, "grad_norm": 0.7975773215293884, "learning_rate": 0.00021355241892037494, "loss": 3.2832, "step": 59900 }, { "epoch": 6.452480895490259, "grad_norm": 0.7684232592582703, "learning_rate": 0.0002132291778903135, "loss": 3.2945, "step": 59950 }, { "epoch": 6.457862447529868, "grad_norm": 0.7654765844345093, "learning_rate": 0.00021290593686025213, "loss": 3.2576, "step": 60000 }, { "epoch": 6.457862447529868, "eval_accuracy": 0.3856219198943546, "eval_loss": 3.3727223873138428, "eval_runtime": 184.4472, "eval_samples_per_second": 97.649, "eval_steps_per_second": 6.105, "step": 60000 }, { "epoch": 6.4632439995694755, "grad_norm": 0.7334644794464111, "learning_rate": 0.0002125826958301907, "loss": 3.2746, "step": 60050 }, { "epoch": 6.468625551609084, "grad_norm": 0.7792539000511169, "learning_rate": 0.00021225945480012927, "loss": 3.2693, "step": 60100 }, { "epoch": 6.474007103648693, "grad_norm": 0.7655577659606934, "learning_rate": 0.00021193621377006786, "loss": 3.2839, "step": 60150 }, { "epoch": 6.4793886556883, "grad_norm": 0.7592105865478516, "learning_rate": 0.00021161297274000646, "loss": 3.2704, "step": 60200 }, { "epoch": 6.484770207727909, "grad_norm": 0.815912127494812, "learning_rate": 0.00021128973170994502, "loss": 3.2894, "step": 60250 }, { "epoch": 6.490151759767517, "grad_norm": 0.742038905620575, "learning_rate": 0.00021096649067988362, "loss": 3.2778, "step": 60300 }, { "epoch": 6.495533311807125, "grad_norm": 0.7956986427307129, "learning_rate": 0.00021064324964982219, "loss": 3.2668, "step": 60350 }, { "epoch": 6.500914863846734, "grad_norm": 0.7016414403915405, "learning_rate": 0.00021032000861976078, "loss": 3.2708, "step": 60400 }, { "epoch": 6.506296415886341, "grad_norm": 0.7408983707427979, "learning_rate": 0.00020999676758969938, "loss": 3.2812, "step": 60450 }, { "epoch": 6.51167796792595, "grad_norm": 0.7210450768470764, "learning_rate": 0.00020967352655963794, "loss": 3.2736, "step": 60500 }, { "epoch": 6.517059519965558, "grad_norm": 0.7317191958427429, "learning_rate": 0.0002093502855295765, "loss": 3.2999, "step": 60550 }, { "epoch": 6.522441072005166, "grad_norm": 0.7658736705780029, "learning_rate": 0.00020902704449951513, "loss": 3.3033, "step": 60600 }, { "epoch": 6.5278226240447745, "grad_norm": 0.7588192820549011, "learning_rate": 0.0002087038034694537, "loss": 3.2845, "step": 60650 }, { "epoch": 6.533204176084383, "grad_norm": 0.7971400022506714, "learning_rate": 0.00020838056243939227, "loss": 3.284, "step": 60700 }, { "epoch": 6.538585728123991, "grad_norm": 0.7562872171401978, "learning_rate": 0.0002080573214093309, "loss": 3.2741, "step": 60750 }, { "epoch": 6.543967280163599, "grad_norm": 0.7392501831054688, "learning_rate": 0.00020773408037926946, "loss": 3.2865, "step": 60800 }, { "epoch": 6.549348832203208, "grad_norm": 0.7754622101783752, "learning_rate": 0.00020741083934920805, "loss": 3.2653, "step": 60850 }, { "epoch": 6.5547303842428155, "grad_norm": 0.7140413522720337, "learning_rate": 0.00020708759831914662, "loss": 3.2833, "step": 60900 }, { "epoch": 6.560111936282424, "grad_norm": 0.7713199257850647, "learning_rate": 0.00020676435728908521, "loss": 3.2903, "step": 60950 }, { "epoch": 6.565493488322032, "grad_norm": 0.7456794381141663, "learning_rate": 0.0002064411162590238, "loss": 3.2939, "step": 61000 }, { "epoch": 6.565493488322032, "eval_accuracy": 0.38606848326327214, "eval_loss": 3.368579626083374, "eval_runtime": 184.7639, "eval_samples_per_second": 97.481, "eval_steps_per_second": 6.094, "step": 61000 }, { "epoch": 6.57087504036164, "grad_norm": 0.7683389186859131, "learning_rate": 0.00020611787522896238, "loss": 3.2849, "step": 61050 }, { "epoch": 6.576256592401249, "grad_norm": 0.8046767711639404, "learning_rate": 0.00020579463419890094, "loss": 3.3024, "step": 61100 }, { "epoch": 6.5816381444408565, "grad_norm": 0.7584671378135681, "learning_rate": 0.00020547139316883957, "loss": 3.2972, "step": 61150 }, { "epoch": 6.587019696480465, "grad_norm": 0.7123998999595642, "learning_rate": 0.00020514815213877813, "loss": 3.2849, "step": 61200 }, { "epoch": 6.592401248520073, "grad_norm": 0.7630634307861328, "learning_rate": 0.0002048249111087167, "loss": 3.2829, "step": 61250 }, { "epoch": 6.597782800559681, "grad_norm": 0.7825208902359009, "learning_rate": 0.00020450167007865532, "loss": 3.2779, "step": 61300 }, { "epoch": 6.60316435259929, "grad_norm": 0.822171688079834, "learning_rate": 0.0002041784290485939, "loss": 3.2678, "step": 61350 }, { "epoch": 6.608545904638898, "grad_norm": 0.745606005191803, "learning_rate": 0.00020385518801853246, "loss": 3.2759, "step": 61400 }, { "epoch": 6.613927456678506, "grad_norm": 0.7718546986579895, "learning_rate": 0.00020353194698847105, "loss": 3.2836, "step": 61450 }, { "epoch": 6.619309008718115, "grad_norm": 0.7141976952552795, "learning_rate": 0.00020320870595840965, "loss": 3.2647, "step": 61500 }, { "epoch": 6.624690560757722, "grad_norm": 0.7380606532096863, "learning_rate": 0.00020288546492834821, "loss": 3.2891, "step": 61550 }, { "epoch": 6.630072112797331, "grad_norm": 0.7463494539260864, "learning_rate": 0.0002025622238982868, "loss": 3.2688, "step": 61600 }, { "epoch": 6.635453664836939, "grad_norm": 0.76692795753479, "learning_rate": 0.00020223898286822538, "loss": 3.2883, "step": 61650 }, { "epoch": 6.640835216876547, "grad_norm": 0.8029922246932983, "learning_rate": 0.00020191574183816397, "loss": 3.2836, "step": 61700 }, { "epoch": 6.6462167689161555, "grad_norm": 0.728897213935852, "learning_rate": 0.00020159250080810257, "loss": 3.2701, "step": 61750 }, { "epoch": 6.651598320955763, "grad_norm": 0.7733128070831299, "learning_rate": 0.00020126925977804113, "loss": 3.2767, "step": 61800 }, { "epoch": 6.656979872995372, "grad_norm": 0.7983481884002686, "learning_rate": 0.00020095248356858097, "loss": 3.2963, "step": 61850 }, { "epoch": 6.66236142503498, "grad_norm": 0.8184331059455872, "learning_rate": 0.00020062924253851953, "loss": 3.2907, "step": 61900 }, { "epoch": 6.667742977074588, "grad_norm": 0.7854340076446533, "learning_rate": 0.00020030600150845813, "loss": 3.2837, "step": 61950 }, { "epoch": 6.6731245291141965, "grad_norm": 0.843238353729248, "learning_rate": 0.0001999827604783967, "loss": 3.2781, "step": 62000 }, { "epoch": 6.6731245291141965, "eval_accuracy": 0.3863260992602803, "eval_loss": 3.364689350128174, "eval_runtime": 184.8366, "eval_samples_per_second": 97.443, "eval_steps_per_second": 6.092, "step": 62000 }, { "epoch": 6.678506081153805, "grad_norm": 0.7781000733375549, "learning_rate": 0.0001996595194483353, "loss": 3.2809, "step": 62050 }, { "epoch": 6.683887633193413, "grad_norm": 0.8087568879127502, "learning_rate": 0.00019933627841827389, "loss": 3.2962, "step": 62100 }, { "epoch": 6.689269185233021, "grad_norm": 0.7818818092346191, "learning_rate": 0.00019901303738821245, "loss": 3.3061, "step": 62150 }, { "epoch": 6.69465073727263, "grad_norm": 0.7550804615020752, "learning_rate": 0.00019868979635815102, "loss": 3.3017, "step": 62200 }, { "epoch": 6.7000322893122375, "grad_norm": 0.736332356929779, "learning_rate": 0.00019836655532808964, "loss": 3.2831, "step": 62250 }, { "epoch": 6.705413841351846, "grad_norm": 0.7578887343406677, "learning_rate": 0.0001980433142980282, "loss": 3.2868, "step": 62300 }, { "epoch": 6.710795393391454, "grad_norm": 0.751354992389679, "learning_rate": 0.00019772007326796678, "loss": 3.2909, "step": 62350 }, { "epoch": 6.716176945431062, "grad_norm": 0.7875543236732483, "learning_rate": 0.0001973968322379054, "loss": 3.2875, "step": 62400 }, { "epoch": 6.721558497470671, "grad_norm": 0.7486498355865479, "learning_rate": 0.00019707359120784397, "loss": 3.2804, "step": 62450 }, { "epoch": 6.7269400495102785, "grad_norm": 0.790752112865448, "learning_rate": 0.00019675035017778253, "loss": 3.3072, "step": 62500 }, { "epoch": 6.732321601549887, "grad_norm": 0.7708467841148376, "learning_rate": 0.00019643357396832237, "loss": 3.2902, "step": 62550 }, { "epoch": 6.737703153589496, "grad_norm": 0.7900062799453735, "learning_rate": 0.00019611033293826096, "loss": 3.281, "step": 62600 }, { "epoch": 6.743084705629103, "grad_norm": 0.7324670553207397, "learning_rate": 0.00019578709190819953, "loss": 3.2743, "step": 62650 }, { "epoch": 6.748466257668712, "grad_norm": 0.7415837645530701, "learning_rate": 0.0001954638508781381, "loss": 3.3112, "step": 62700 }, { "epoch": 6.75384780970832, "grad_norm": 0.7605777382850647, "learning_rate": 0.00019514060984807672, "loss": 3.2669, "step": 62750 }, { "epoch": 6.759229361747928, "grad_norm": 0.7561804056167603, "learning_rate": 0.0001948173688180153, "loss": 3.2779, "step": 62800 }, { "epoch": 6.7646109137875365, "grad_norm": 0.799517810344696, "learning_rate": 0.00019449412778795386, "loss": 3.2907, "step": 62850 }, { "epoch": 6.769992465827144, "grad_norm": 0.9140427112579346, "learning_rate": 0.00019417088675789245, "loss": 3.2925, "step": 62900 }, { "epoch": 6.775374017866753, "grad_norm": 0.7958894968032837, "learning_rate": 0.00019384764572783104, "loss": 3.2867, "step": 62950 }, { "epoch": 6.780755569906361, "grad_norm": 0.7784082293510437, "learning_rate": 0.0001935244046977696, "loss": 3.2786, "step": 63000 }, { "epoch": 6.780755569906361, "eval_accuracy": 0.38668899990558064, "eval_loss": 3.360412836074829, "eval_runtime": 185.3645, "eval_samples_per_second": 97.165, "eval_steps_per_second": 6.075, "step": 63000 }, { "epoch": 6.786137121945969, "grad_norm": 0.7796688675880432, "learning_rate": 0.0001932011636677082, "loss": 3.3014, "step": 63050 }, { "epoch": 6.7915186739855775, "grad_norm": 0.8077431917190552, "learning_rate": 0.00019287792263764677, "loss": 3.2886, "step": 63100 }, { "epoch": 6.796900226025185, "grad_norm": 0.7771782875061035, "learning_rate": 0.00019255468160758537, "loss": 3.2901, "step": 63150 }, { "epoch": 6.802281778064794, "grad_norm": 0.8028762340545654, "learning_rate": 0.00019223144057752396, "loss": 3.2833, "step": 63200 }, { "epoch": 6.807663330104402, "grad_norm": 0.7487192749977112, "learning_rate": 0.00019190819954746253, "loss": 3.2754, "step": 63250 }, { "epoch": 6.813044882144011, "grad_norm": 0.7907421588897705, "learning_rate": 0.0001915849585174011, "loss": 3.2896, "step": 63300 }, { "epoch": 6.8184264341836185, "grad_norm": 0.7865395545959473, "learning_rate": 0.00019126171748733972, "loss": 3.2825, "step": 63350 }, { "epoch": 6.823807986223227, "grad_norm": 0.8095396757125854, "learning_rate": 0.0001909384764572783, "loss": 3.2916, "step": 63400 }, { "epoch": 6.829189538262835, "grad_norm": 0.812186598777771, "learning_rate": 0.00019061523542721688, "loss": 3.2914, "step": 63450 }, { "epoch": 6.834571090302443, "grad_norm": 0.767548680305481, "learning_rate": 0.00019029199439715548, "loss": 3.2989, "step": 63500 }, { "epoch": 6.839952642342052, "grad_norm": 0.9850395917892456, "learning_rate": 0.00018996875336709404, "loss": 3.2761, "step": 63550 }, { "epoch": 6.8453341943816595, "grad_norm": 0.8007294535636902, "learning_rate": 0.00018964551233703264, "loss": 3.2923, "step": 63600 }, { "epoch": 6.850715746421268, "grad_norm": 0.7697404026985168, "learning_rate": 0.0001893222713069712, "loss": 3.2658, "step": 63650 }, { "epoch": 6.856097298460876, "grad_norm": 0.775759220123291, "learning_rate": 0.0001889990302769098, "loss": 3.2943, "step": 63700 }, { "epoch": 6.861478850500484, "grad_norm": 0.7596869468688965, "learning_rate": 0.0001886757892468484, "loss": 3.2866, "step": 63750 }, { "epoch": 6.866860402540093, "grad_norm": 0.7999332547187805, "learning_rate": 0.00018835254821678696, "loss": 3.2685, "step": 63800 }, { "epoch": 6.8722419545797, "grad_norm": 0.8248339891433716, "learning_rate": 0.00018802930718672553, "loss": 3.298, "step": 63850 }, { "epoch": 6.877623506619309, "grad_norm": 0.7721416354179382, "learning_rate": 0.00018770606615666415, "loss": 3.2776, "step": 63900 }, { "epoch": 6.8830050586589175, "grad_norm": 0.8377927541732788, "learning_rate": 0.00018738282512660272, "loss": 3.2831, "step": 63950 }, { "epoch": 6.888386610698525, "grad_norm": 0.8438067436218262, "learning_rate": 0.0001870595840965413, "loss": 3.2927, "step": 64000 }, { "epoch": 6.888386610698525, "eval_accuracy": 0.3870464679064903, "eval_loss": 3.357008457183838, "eval_runtime": 185.2485, "eval_samples_per_second": 97.226, "eval_steps_per_second": 6.078, "step": 64000 }, { "epoch": 6.893768162738134, "grad_norm": 0.7458718419075012, "learning_rate": 0.0001867363430664799, "loss": 3.289, "step": 64050 }, { "epoch": 6.899149714777742, "grad_norm": 0.8297790288925171, "learning_rate": 0.00018641310203641848, "loss": 3.2955, "step": 64100 }, { "epoch": 6.90453126681735, "grad_norm": 0.7854870557785034, "learning_rate": 0.00018608986100635705, "loss": 3.2603, "step": 64150 }, { "epoch": 6.9099128188569585, "grad_norm": 0.7834721207618713, "learning_rate": 0.00018576661997629564, "loss": 3.2736, "step": 64200 }, { "epoch": 6.915294370896566, "grad_norm": 0.7471182346343994, "learning_rate": 0.00018544337894623423, "loss": 3.2952, "step": 64250 }, { "epoch": 6.920675922936175, "grad_norm": 0.7817671895027161, "learning_rate": 0.0001851201379161728, "loss": 3.2912, "step": 64300 }, { "epoch": 6.926057474975783, "grad_norm": 0.7854493856430054, "learning_rate": 0.0001847968968861114, "loss": 3.2829, "step": 64350 }, { "epoch": 6.931439027015391, "grad_norm": 0.7644644379615784, "learning_rate": 0.00018447365585604996, "loss": 3.283, "step": 64400 }, { "epoch": 6.9368205790549995, "grad_norm": 0.79556804895401, "learning_rate": 0.00018415041482598859, "loss": 3.2803, "step": 64450 }, { "epoch": 6.942202131094608, "grad_norm": 0.8365612030029297, "learning_rate": 0.00018382717379592715, "loss": 3.279, "step": 64500 }, { "epoch": 6.947583683134216, "grad_norm": 0.8554062843322754, "learning_rate": 0.00018350393276586572, "loss": 3.2721, "step": 64550 }, { "epoch": 6.952965235173824, "grad_norm": 0.8001241683959961, "learning_rate": 0.00018318069173580434, "loss": 3.2768, "step": 64600 }, { "epoch": 6.958346787213433, "grad_norm": 0.7490769624710083, "learning_rate": 0.0001828574507057429, "loss": 3.2979, "step": 64650 }, { "epoch": 6.9637283392530405, "grad_norm": 0.809732973575592, "learning_rate": 0.00018253420967568148, "loss": 3.2905, "step": 64700 }, { "epoch": 6.969109891292649, "grad_norm": 0.7997918725013733, "learning_rate": 0.00018221096864562007, "loss": 3.288, "step": 64750 }, { "epoch": 6.974491443332257, "grad_norm": 0.7825332880020142, "learning_rate": 0.00018188772761555867, "loss": 3.2976, "step": 64800 }, { "epoch": 6.979872995371865, "grad_norm": 0.7849071621894836, "learning_rate": 0.00018156448658549723, "loss": 3.2743, "step": 64850 }, { "epoch": 6.985254547411474, "grad_norm": 0.7942416667938232, "learning_rate": 0.00018124124555543583, "loss": 3.279, "step": 64900 }, { "epoch": 6.990636099451081, "grad_norm": 0.8016338348388672, "learning_rate": 0.0001809180045253744, "loss": 3.3058, "step": 64950 }, { "epoch": 6.99601765149069, "grad_norm": 1.0467689037322998, "learning_rate": 0.00018059476349531296, "loss": 3.2897, "step": 65000 }, { "epoch": 6.99601765149069, "eval_accuracy": 0.3873104944238795, "eval_loss": 3.3523385524749756, "eval_runtime": 185.1152, "eval_samples_per_second": 97.296, "eval_steps_per_second": 6.083, "step": 65000 }, { "epoch": 7.0013992035302985, "grad_norm": 0.8019529581069946, "learning_rate": 0.00018027152246525159, "loss": 3.2621, "step": 65050 }, { "epoch": 7.006780755569906, "grad_norm": 0.7568375468254089, "learning_rate": 0.00017994828143519015, "loss": 3.1794, "step": 65100 }, { "epoch": 7.012162307609515, "grad_norm": 0.7430813908576965, "learning_rate": 0.00017962504040512872, "loss": 3.1624, "step": 65150 }, { "epoch": 7.017543859649122, "grad_norm": 0.772469699382782, "learning_rate": 0.00017930179937506734, "loss": 3.2228, "step": 65200 }, { "epoch": 7.022925411688731, "grad_norm": 0.8518636226654053, "learning_rate": 0.0001789785583450059, "loss": 3.1921, "step": 65250 }, { "epoch": 7.0283069637283395, "grad_norm": 0.791405200958252, "learning_rate": 0.00017865531731494448, "loss": 3.1906, "step": 65300 }, { "epoch": 7.033688515767947, "grad_norm": 0.8231589198112488, "learning_rate": 0.00017833207628488307, "loss": 3.1997, "step": 65350 }, { "epoch": 7.039070067807556, "grad_norm": 0.8060066103935242, "learning_rate": 0.00017800883525482167, "loss": 3.2046, "step": 65400 }, { "epoch": 7.044451619847164, "grad_norm": 0.7832460403442383, "learning_rate": 0.00017768559422476026, "loss": 3.2033, "step": 65450 }, { "epoch": 7.049833171886772, "grad_norm": 0.8058651089668274, "learning_rate": 0.00017736235319469883, "loss": 3.2072, "step": 65500 }, { "epoch": 7.0552147239263805, "grad_norm": 0.7742359638214111, "learning_rate": 0.0001770391121646374, "loss": 3.1953, "step": 65550 }, { "epoch": 7.060596275965988, "grad_norm": 0.803683340549469, "learning_rate": 0.00017671587113457602, "loss": 3.2056, "step": 65600 }, { "epoch": 7.065977828005597, "grad_norm": 0.7680373787879944, "learning_rate": 0.0001763926301045146, "loss": 3.1984, "step": 65650 }, { "epoch": 7.071359380045205, "grad_norm": 0.7999513149261475, "learning_rate": 0.00017606938907445315, "loss": 3.2236, "step": 65700 }, { "epoch": 7.076740932084813, "grad_norm": 0.7916874885559082, "learning_rate": 0.00017574614804439178, "loss": 3.1951, "step": 65750 }, { "epoch": 7.0821224841244215, "grad_norm": 0.7580865025520325, "learning_rate": 0.00017542290701433034, "loss": 3.2117, "step": 65800 }, { "epoch": 7.08750403616403, "grad_norm": 0.8287043571472168, "learning_rate": 0.0001750996659842689, "loss": 3.2259, "step": 65850 }, { "epoch": 7.092885588203638, "grad_norm": 0.8034733533859253, "learning_rate": 0.0001747764249542075, "loss": 3.2206, "step": 65900 }, { "epoch": 7.098267140243246, "grad_norm": 0.7903028726577759, "learning_rate": 0.0001744531839241461, "loss": 3.2228, "step": 65950 }, { "epoch": 7.103648692282855, "grad_norm": 0.8349509835243225, "learning_rate": 0.00017412994289408467, "loss": 3.2322, "step": 66000 }, { "epoch": 7.103648692282855, "eval_accuracy": 0.3872474757489471, "eval_loss": 3.3593993186950684, "eval_runtime": 184.6002, "eval_samples_per_second": 97.568, "eval_steps_per_second": 6.1, "step": 66000 }, { "epoch": 7.109030244322462, "grad_norm": 0.8231300115585327, "learning_rate": 0.00017380670186402326, "loss": 3.2055, "step": 66050 }, { "epoch": 7.114411796362071, "grad_norm": 0.8903416395187378, "learning_rate": 0.00017348346083396183, "loss": 3.2181, "step": 66100 }, { "epoch": 7.119793348401679, "grad_norm": 0.8198266625404358, "learning_rate": 0.00017316021980390042, "loss": 3.2099, "step": 66150 }, { "epoch": 7.125174900441287, "grad_norm": 0.783141553401947, "learning_rate": 0.00017283697877383902, "loss": 3.2097, "step": 66200 }, { "epoch": 7.130556452480896, "grad_norm": 0.7667449116706848, "learning_rate": 0.0001725137377437776, "loss": 3.1992, "step": 66250 }, { "epoch": 7.135938004520503, "grad_norm": 0.8115949630737305, "learning_rate": 0.00017219049671371615, "loss": 3.2247, "step": 66300 }, { "epoch": 7.141319556560112, "grad_norm": 0.7727253437042236, "learning_rate": 0.00017186725568365478, "loss": 3.2203, "step": 66350 }, { "epoch": 7.1467011085997205, "grad_norm": 0.7740350961685181, "learning_rate": 0.00017154401465359334, "loss": 3.2199, "step": 66400 }, { "epoch": 7.152082660639328, "grad_norm": 0.7830681204795837, "learning_rate": 0.00017122077362353194, "loss": 3.2105, "step": 66450 }, { "epoch": 7.157464212678937, "grad_norm": 0.810469388961792, "learning_rate": 0.00017089753259347053, "loss": 3.2302, "step": 66500 }, { "epoch": 7.162845764718545, "grad_norm": 0.8127453327178955, "learning_rate": 0.0001705742915634091, "loss": 3.2225, "step": 66550 }, { "epoch": 7.168227316758153, "grad_norm": 0.8133081197738647, "learning_rate": 0.0001702510505333477, "loss": 3.2229, "step": 66600 }, { "epoch": 7.1736088687977615, "grad_norm": 0.8117032647132874, "learning_rate": 0.00016992780950328626, "loss": 3.2174, "step": 66650 }, { "epoch": 7.178990420837369, "grad_norm": 0.7892330288887024, "learning_rate": 0.00016960456847322486, "loss": 3.2199, "step": 66700 }, { "epoch": 7.184371972876978, "grad_norm": 0.7563703656196594, "learning_rate": 0.00016928132744316345, "loss": 3.2345, "step": 66750 }, { "epoch": 7.189753524916586, "grad_norm": 0.7536913752555847, "learning_rate": 0.00016895808641310202, "loss": 3.2084, "step": 66800 }, { "epoch": 7.195135076956194, "grad_norm": 0.8196943402290344, "learning_rate": 0.0001686348453830406, "loss": 3.2209, "step": 66850 }, { "epoch": 7.2005166289958025, "grad_norm": 0.7916746139526367, "learning_rate": 0.0001683116043529792, "loss": 3.2216, "step": 66900 }, { "epoch": 7.205898181035411, "grad_norm": 0.7951803803443909, "learning_rate": 0.00016798836332291778, "loss": 3.2184, "step": 66950 }, { "epoch": 7.211279733075019, "grad_norm": 0.8097406029701233, "learning_rate": 0.00016766512229285634, "loss": 3.2149, "step": 67000 }, { "epoch": 7.211279733075019, "eval_accuracy": 0.38753573186031887, "eval_loss": 3.3569376468658447, "eval_runtime": 184.6912, "eval_samples_per_second": 97.52, "eval_steps_per_second": 6.097, "step": 67000 }, { "epoch": 7.216661285114627, "grad_norm": 0.8015643954277039, "learning_rate": 0.00016734188126279494, "loss": 3.2318, "step": 67050 }, { "epoch": 7.222042837154235, "grad_norm": 0.7905345559120178, "learning_rate": 0.00016701864023273353, "loss": 3.2268, "step": 67100 }, { "epoch": 7.2274243891938434, "grad_norm": 0.8032287359237671, "learning_rate": 0.00016670186402327334, "loss": 3.2127, "step": 67150 }, { "epoch": 7.232805941233452, "grad_norm": 0.8226504325866699, "learning_rate": 0.0001663786229932119, "loss": 3.2253, "step": 67200 }, { "epoch": 7.23818749327306, "grad_norm": 0.8013386130332947, "learning_rate": 0.00016605538196315053, "loss": 3.2136, "step": 67250 }, { "epoch": 7.243569045312668, "grad_norm": 0.7741095423698425, "learning_rate": 0.0001657321409330891, "loss": 3.2263, "step": 67300 }, { "epoch": 7.248950597352277, "grad_norm": 0.7725183963775635, "learning_rate": 0.00016540889990302766, "loss": 3.2461, "step": 67350 }, { "epoch": 7.254332149391884, "grad_norm": 0.845971941947937, "learning_rate": 0.00016508565887296629, "loss": 3.2246, "step": 67400 }, { "epoch": 7.259713701431493, "grad_norm": 0.8192383646965027, "learning_rate": 0.00016476241784290485, "loss": 3.2089, "step": 67450 }, { "epoch": 7.265095253471101, "grad_norm": 0.7881582975387573, "learning_rate": 0.00016443917681284342, "loss": 3.2185, "step": 67500 }, { "epoch": 7.270476805510709, "grad_norm": 0.7972434163093567, "learning_rate": 0.00016411593578278202, "loss": 3.2279, "step": 67550 }, { "epoch": 7.275858357550318, "grad_norm": 0.8054504990577698, "learning_rate": 0.0001637926947527206, "loss": 3.2168, "step": 67600 }, { "epoch": 7.281239909589925, "grad_norm": 0.9870213270187378, "learning_rate": 0.00016346945372265918, "loss": 3.2263, "step": 67650 }, { "epoch": 7.286621461629534, "grad_norm": 0.7929873466491699, "learning_rate": 0.00016314621269259777, "loss": 3.2217, "step": 67700 }, { "epoch": 7.2920030136691425, "grad_norm": 0.8502267003059387, "learning_rate": 0.00016282297166253634, "loss": 3.2171, "step": 67750 }, { "epoch": 7.29738456570875, "grad_norm": 0.7992681264877319, "learning_rate": 0.00016249973063247494, "loss": 3.233, "step": 67800 }, { "epoch": 7.302766117748359, "grad_norm": 0.7842960953712463, "learning_rate": 0.00016217648960241353, "loss": 3.2266, "step": 67850 }, { "epoch": 7.308147669787967, "grad_norm": 0.7908195853233337, "learning_rate": 0.0001618532485723521, "loss": 3.2199, "step": 67900 }, { "epoch": 7.313529221827575, "grad_norm": 0.9430187940597534, "learning_rate": 0.00016153000754229067, "loss": 3.2218, "step": 67950 }, { "epoch": 7.3189107738671835, "grad_norm": 0.7958492040634155, "learning_rate": 0.0001612067665122293, "loss": 3.2237, "step": 68000 }, { "epoch": 7.3189107738671835, "eval_accuracy": 0.38775401551193817, "eval_loss": 3.354123830795288, "eval_runtime": 184.4291, "eval_samples_per_second": 97.658, "eval_steps_per_second": 6.105, "step": 68000 }, { "epoch": 7.324292325906791, "grad_norm": 0.784818708896637, "learning_rate": 0.00016088352548216785, "loss": 3.2084, "step": 68050 }, { "epoch": 7.3296738779464, "grad_norm": 0.7515696287155151, "learning_rate": 0.00016056028445210642, "loss": 3.214, "step": 68100 }, { "epoch": 7.335055429986008, "grad_norm": 0.8103103637695312, "learning_rate": 0.00016023704342204504, "loss": 3.2215, "step": 68150 }, { "epoch": 7.340436982025616, "grad_norm": 0.7764951586723328, "learning_rate": 0.0001599138023919836, "loss": 3.2311, "step": 68200 }, { "epoch": 7.3458185340652244, "grad_norm": 0.8081309795379639, "learning_rate": 0.0001595905613619222, "loss": 3.226, "step": 68250 }, { "epoch": 7.351200086104833, "grad_norm": 0.791309118270874, "learning_rate": 0.00015926732033186077, "loss": 3.2143, "step": 68300 }, { "epoch": 7.356581638144441, "grad_norm": 0.8047209978103638, "learning_rate": 0.00015894407930179934, "loss": 3.2546, "step": 68350 }, { "epoch": 7.361963190184049, "grad_norm": 0.8465035557746887, "learning_rate": 0.00015862083827173796, "loss": 3.2317, "step": 68400 }, { "epoch": 7.367344742223658, "grad_norm": 0.8064917922019958, "learning_rate": 0.00015829759724167653, "loss": 3.2442, "step": 68450 }, { "epoch": 7.372726294263265, "grad_norm": 0.8181912302970886, "learning_rate": 0.0001579743562116151, "loss": 3.2323, "step": 68500 }, { "epoch": 7.378107846302874, "grad_norm": 0.7603694796562195, "learning_rate": 0.00015765111518155372, "loss": 3.2263, "step": 68550 }, { "epoch": 7.383489398342482, "grad_norm": 0.7845638990402222, "learning_rate": 0.0001573278741514923, "loss": 3.2383, "step": 68600 }, { "epoch": 7.38887095038209, "grad_norm": 0.8256173729896545, "learning_rate": 0.00015700463312143085, "loss": 3.2371, "step": 68650 }, { "epoch": 7.394252502421699, "grad_norm": 0.7949895262718201, "learning_rate": 0.00015668139209136945, "loss": 3.2313, "step": 68700 }, { "epoch": 7.399634054461306, "grad_norm": 0.8290249705314636, "learning_rate": 0.00015635815106130804, "loss": 3.2535, "step": 68750 }, { "epoch": 7.405015606500915, "grad_norm": 0.8266456723213196, "learning_rate": 0.0001560349100312466, "loss": 3.2336, "step": 68800 }, { "epoch": 7.4103971585405235, "grad_norm": 0.7951944470405579, "learning_rate": 0.0001557116690011852, "loss": 3.2226, "step": 68850 }, { "epoch": 7.415778710580131, "grad_norm": 0.8083871603012085, "learning_rate": 0.00015538842797112377, "loss": 3.2418, "step": 68900 }, { "epoch": 7.42116026261974, "grad_norm": 0.789167046546936, "learning_rate": 0.00015506518694106237, "loss": 3.2354, "step": 68950 }, { "epoch": 7.426541814659347, "grad_norm": 0.8217098712921143, "learning_rate": 0.00015474194591100096, "loss": 3.2238, "step": 69000 }, { "epoch": 7.426541814659347, "eval_accuracy": 0.38835714769219637, "eval_loss": 3.349827289581299, "eval_runtime": 184.7119, "eval_samples_per_second": 97.509, "eval_steps_per_second": 6.096, "step": 69000 }, { "epoch": 7.431923366698956, "grad_norm": 0.8126811981201172, "learning_rate": 0.00015441870488093953, "loss": 3.2442, "step": 69050 }, { "epoch": 7.4373049187385645, "grad_norm": 0.9607537984848022, "learning_rate": 0.0001540954638508781, "loss": 3.216, "step": 69100 }, { "epoch": 7.442686470778172, "grad_norm": 0.8086814880371094, "learning_rate": 0.00015377222282081672, "loss": 3.2337, "step": 69150 }, { "epoch": 7.448068022817781, "grad_norm": 0.8346220850944519, "learning_rate": 0.0001534489817907553, "loss": 3.2318, "step": 69200 }, { "epoch": 7.453449574857389, "grad_norm": 0.8226746916770935, "learning_rate": 0.00015312574076069388, "loss": 3.2374, "step": 69250 }, { "epoch": 7.458831126896997, "grad_norm": 0.8161358833312988, "learning_rate": 0.00015280249973063248, "loss": 3.2381, "step": 69300 }, { "epoch": 7.4642126789366054, "grad_norm": 0.8100681304931641, "learning_rate": 0.00015248572352117228, "loss": 3.2397, "step": 69350 }, { "epoch": 7.469594230976213, "grad_norm": 0.8125149011611938, "learning_rate": 0.00015216248249111085, "loss": 3.2401, "step": 69400 }, { "epoch": 7.474975783015822, "grad_norm": 0.8136345744132996, "learning_rate": 0.00015183924146104945, "loss": 3.2329, "step": 69450 }, { "epoch": 7.48035733505543, "grad_norm": 0.8273459076881409, "learning_rate": 0.00015151600043098804, "loss": 3.2415, "step": 69500 }, { "epoch": 7.485738887095038, "grad_norm": 0.823276698589325, "learning_rate": 0.0001511927594009266, "loss": 3.2372, "step": 69550 }, { "epoch": 7.491120439134646, "grad_norm": 0.8145060539245605, "learning_rate": 0.00015086951837086518, "loss": 3.2337, "step": 69600 }, { "epoch": 7.496501991174255, "grad_norm": 0.8343734741210938, "learning_rate": 0.0001505462773408038, "loss": 3.2174, "step": 69650 }, { "epoch": 7.501883543213863, "grad_norm": 0.8493902683258057, "learning_rate": 0.00015022303631074236, "loss": 3.2238, "step": 69700 }, { "epoch": 7.507265095253471, "grad_norm": 0.8339319825172424, "learning_rate": 0.00014989979528068096, "loss": 3.2439, "step": 69750 }, { "epoch": 7.51264664729308, "grad_norm": 0.7970526218414307, "learning_rate": 0.00014957655425061953, "loss": 3.2329, "step": 69800 }, { "epoch": 7.518028199332687, "grad_norm": 0.8021500706672668, "learning_rate": 0.00014925331322055812, "loss": 3.2463, "step": 69850 }, { "epoch": 7.523409751372296, "grad_norm": 0.8772468566894531, "learning_rate": 0.0001489300721904967, "loss": 3.2297, "step": 69900 }, { "epoch": 7.528791303411904, "grad_norm": 0.8051742911338806, "learning_rate": 0.00014860683116043528, "loss": 3.2178, "step": 69950 }, { "epoch": 7.534172855451512, "grad_norm": 0.7842574119567871, "learning_rate": 0.00014828359013037385, "loss": 3.2394, "step": 70000 }, { "epoch": 7.534172855451512, "eval_accuracy": 0.3887887169625955, "eval_loss": 3.3451998233795166, "eval_runtime": 184.5808, "eval_samples_per_second": 97.578, "eval_steps_per_second": 6.1, "step": 70000 }, { "epoch": 7.539554407491121, "grad_norm": 0.8179084062576294, "learning_rate": 0.00014796034910031245, "loss": 3.2395, "step": 70050 }, { "epoch": 7.544935959530728, "grad_norm": 0.872739851474762, "learning_rate": 0.00014763710807025104, "loss": 3.214, "step": 70100 }, { "epoch": 7.550317511570337, "grad_norm": 0.8286665678024292, "learning_rate": 0.0001473138670401896, "loss": 3.2258, "step": 70150 }, { "epoch": 7.5556990636099455, "grad_norm": 0.8176371455192566, "learning_rate": 0.0001469906260101282, "loss": 3.2416, "step": 70200 }, { "epoch": 7.561080615649553, "grad_norm": 0.8359342813491821, "learning_rate": 0.0001466673849800668, "loss": 3.2232, "step": 70250 }, { "epoch": 7.566462167689162, "grad_norm": 0.8256863355636597, "learning_rate": 0.0001463441439500054, "loss": 3.2405, "step": 70300 }, { "epoch": 7.57184371972877, "grad_norm": 0.832430899143219, "learning_rate": 0.00014602090291994396, "loss": 3.2448, "step": 70350 }, { "epoch": 7.577225271768378, "grad_norm": 0.812171995639801, "learning_rate": 0.00014569766188988255, "loss": 3.2402, "step": 70400 }, { "epoch": 7.5826068238079865, "grad_norm": 0.849791944026947, "learning_rate": 0.00014537442085982112, "loss": 3.2329, "step": 70450 }, { "epoch": 7.587988375847594, "grad_norm": 0.8218454122543335, "learning_rate": 0.00014505117982975972, "loss": 3.2569, "step": 70500 }, { "epoch": 7.593369927887203, "grad_norm": 0.8892902135848999, "learning_rate": 0.00014472793879969828, "loss": 3.2399, "step": 70550 }, { "epoch": 7.598751479926811, "grad_norm": 0.8445575833320618, "learning_rate": 0.00014440469776963688, "loss": 3.2096, "step": 70600 }, { "epoch": 7.604133031966419, "grad_norm": 0.8453192114830017, "learning_rate": 0.00014408145673957545, "loss": 3.2261, "step": 70650 }, { "epoch": 7.609514584006027, "grad_norm": 0.8189914226531982, "learning_rate": 0.00014375821570951404, "loss": 3.236, "step": 70700 }, { "epoch": 7.614896136045635, "grad_norm": 0.8180625438690186, "learning_rate": 0.00014343497467945264, "loss": 3.2269, "step": 70750 }, { "epoch": 7.620277688085244, "grad_norm": 0.8000085353851318, "learning_rate": 0.00014311173364939123, "loss": 3.2445, "step": 70800 }, { "epoch": 7.625659240124852, "grad_norm": 0.8189267516136169, "learning_rate": 0.0001427884926193298, "loss": 3.221, "step": 70850 }, { "epoch": 7.63104079216446, "grad_norm": 0.8770104646682739, "learning_rate": 0.0001424652515892684, "loss": 3.2435, "step": 70900 }, { "epoch": 7.636422344204068, "grad_norm": 0.8498367667198181, "learning_rate": 0.000142142010559207, "loss": 3.2299, "step": 70950 }, { "epoch": 7.641803896243677, "grad_norm": 0.8652117848396301, "learning_rate": 0.00014181876952914555, "loss": 3.2487, "step": 71000 }, { "epoch": 7.641803896243677, "eval_accuracy": 0.3894372660499601, "eval_loss": 3.341649293899536, "eval_runtime": 184.9462, "eval_samples_per_second": 97.385, "eval_steps_per_second": 6.088, "step": 71000 }, { "epoch": 7.647185448283285, "grad_norm": 0.8142329454421997, "learning_rate": 0.00014149552849908415, "loss": 3.2602, "step": 71050 }, { "epoch": 7.652567000322893, "grad_norm": 0.869361400604248, "learning_rate": 0.00014117228746902272, "loss": 3.2542, "step": 71100 }, { "epoch": 7.657948552362502, "grad_norm": 0.792498767375946, "learning_rate": 0.0001408490464389613, "loss": 3.2124, "step": 71150 }, { "epoch": 7.663330104402109, "grad_norm": 0.7901838421821594, "learning_rate": 0.00014052580540889988, "loss": 3.2253, "step": 71200 }, { "epoch": 7.668711656441718, "grad_norm": 0.8150041103363037, "learning_rate": 0.00014020256437883847, "loss": 3.2392, "step": 71250 }, { "epoch": 7.674093208481326, "grad_norm": 0.8320419788360596, "learning_rate": 0.00013987932334877707, "loss": 3.2272, "step": 71300 }, { "epoch": 7.679474760520934, "grad_norm": 0.8377978801727295, "learning_rate": 0.00013956254713931687, "loss": 3.2441, "step": 71350 }, { "epoch": 7.684856312560543, "grad_norm": 0.7959548234939575, "learning_rate": 0.00013923930610925547, "loss": 3.2421, "step": 71400 }, { "epoch": 7.69023786460015, "grad_norm": 0.8453052043914795, "learning_rate": 0.00013891606507919404, "loss": 3.2083, "step": 71450 }, { "epoch": 7.695619416639759, "grad_norm": 0.8451975584030151, "learning_rate": 0.00013859282404913263, "loss": 3.2368, "step": 71500 }, { "epoch": 7.7010009686793675, "grad_norm": 0.8180558681488037, "learning_rate": 0.0001382695830190712, "loss": 3.2399, "step": 71550 }, { "epoch": 7.706382520718975, "grad_norm": 0.8149081468582153, "learning_rate": 0.0001379463419890098, "loss": 3.2282, "step": 71600 }, { "epoch": 7.711764072758584, "grad_norm": 0.8298981785774231, "learning_rate": 0.00013762310095894836, "loss": 3.2366, "step": 71650 }, { "epoch": 7.717145624798192, "grad_norm": 0.8108612298965454, "learning_rate": 0.00013729985992888696, "loss": 3.2308, "step": 71700 }, { "epoch": 7.7225271768378, "grad_norm": 0.8104018568992615, "learning_rate": 0.00013697661889882555, "loss": 3.2337, "step": 71750 }, { "epoch": 7.727908728877408, "grad_norm": 0.8229024410247803, "learning_rate": 0.00013665337786876412, "loss": 3.2546, "step": 71800 }, { "epoch": 7.733290280917016, "grad_norm": 0.8073616027832031, "learning_rate": 0.0001363301368387027, "loss": 3.2341, "step": 71850 }, { "epoch": 7.738671832956625, "grad_norm": 0.8207933306694031, "learning_rate": 0.0001360068958086413, "loss": 3.2404, "step": 71900 }, { "epoch": 7.744053384996233, "grad_norm": 0.7952953577041626, "learning_rate": 0.00013568365477857988, "loss": 3.2419, "step": 71950 }, { "epoch": 7.749434937035841, "grad_norm": 0.8869734406471252, "learning_rate": 0.00013536041374851847, "loss": 3.2442, "step": 72000 }, { "epoch": 7.749434937035841, "eval_accuracy": 0.3894115153155481, "eval_loss": 3.33847975730896, "eval_runtime": 184.598, "eval_samples_per_second": 97.569, "eval_steps_per_second": 6.1, "step": 72000 }, { "epoch": 7.754816489075449, "grad_norm": 0.880556583404541, "learning_rate": 0.00013503717271845706, "loss": 3.2394, "step": 72050 }, { "epoch": 7.760198041115058, "grad_norm": 0.7544320821762085, "learning_rate": 0.00013471393168839563, "loss": 3.2569, "step": 72100 }, { "epoch": 7.765579593154666, "grad_norm": 0.8141136169433594, "learning_rate": 0.00013439069065833423, "loss": 3.2304, "step": 72150 }, { "epoch": 7.770961145194274, "grad_norm": 0.8603343963623047, "learning_rate": 0.0001340674496282728, "loss": 3.2456, "step": 72200 }, { "epoch": 7.776342697233883, "grad_norm": 0.8688738942146301, "learning_rate": 0.0001337442085982114, "loss": 3.2221, "step": 72250 }, { "epoch": 7.78172424927349, "grad_norm": 0.8236707448959351, "learning_rate": 0.00013342096756814996, "loss": 3.247, "step": 72300 }, { "epoch": 7.787105801313099, "grad_norm": 0.8244578838348389, "learning_rate": 0.00013309772653808855, "loss": 3.2482, "step": 72350 }, { "epoch": 7.792487353352707, "grad_norm": 0.867263913154602, "learning_rate": 0.00013277448550802715, "loss": 3.2227, "step": 72400 }, { "epoch": 7.797868905392315, "grad_norm": 0.8356115818023682, "learning_rate": 0.0001324512444779657, "loss": 3.2356, "step": 72450 }, { "epoch": 7.803250457431924, "grad_norm": 0.8518567681312561, "learning_rate": 0.0001321280034479043, "loss": 3.2419, "step": 72500 }, { "epoch": 7.808632009471531, "grad_norm": 0.8349553942680359, "learning_rate": 0.0001318047624178429, "loss": 3.2326, "step": 72550 }, { "epoch": 7.81401356151114, "grad_norm": 0.8264884352684021, "learning_rate": 0.0001314815213877815, "loss": 3.239, "step": 72600 }, { "epoch": 7.819395113550748, "grad_norm": 0.8145789504051208, "learning_rate": 0.00013115828035772007, "loss": 3.2395, "step": 72650 }, { "epoch": 7.824776665590356, "grad_norm": 0.8533801436424255, "learning_rate": 0.00013083503932765866, "loss": 3.2414, "step": 72700 }, { "epoch": 7.830158217629965, "grad_norm": 0.853441596031189, "learning_rate": 0.00013051179829759723, "loss": 3.2316, "step": 72750 }, { "epoch": 7.835539769669572, "grad_norm": 0.8426158428192139, "learning_rate": 0.00013018855726753582, "loss": 3.2577, "step": 72800 }, { "epoch": 7.840921321709181, "grad_norm": 0.8060492873191833, "learning_rate": 0.0001298653162374744, "loss": 3.2267, "step": 72850 }, { "epoch": 7.846302873748789, "grad_norm": 0.8100488185882568, "learning_rate": 0.00012954207520741298, "loss": 3.2407, "step": 72900 }, { "epoch": 7.851684425788397, "grad_norm": 0.8481305837631226, "learning_rate": 0.00012921883417735155, "loss": 3.2192, "step": 72950 }, { "epoch": 7.857065977828006, "grad_norm": 0.8777086138725281, "learning_rate": 0.00012889559314729015, "loss": 3.2569, "step": 73000 }, { "epoch": 7.857065977828006, "eval_accuracy": 0.3896842340639624, "eval_loss": 3.3339617252349854, "eval_runtime": 184.6586, "eval_samples_per_second": 97.537, "eval_steps_per_second": 6.098, "step": 73000 }, { "epoch": 7.862447529867614, "grad_norm": 0.8324296474456787, "learning_rate": 0.00012857235211722874, "loss": 3.2141, "step": 73050 }, { "epoch": 7.867829081907222, "grad_norm": 0.8081611394882202, "learning_rate": 0.00012824911108716734, "loss": 3.2168, "step": 73100 }, { "epoch": 7.87321063394683, "grad_norm": 0.8032556176185608, "learning_rate": 0.0001279258700571059, "loss": 3.2353, "step": 73150 }, { "epoch": 7.878592185986438, "grad_norm": 0.8371847867965698, "learning_rate": 0.0001276026290270445, "loss": 3.2337, "step": 73200 }, { "epoch": 7.883973738026047, "grad_norm": 0.8007276654243469, "learning_rate": 0.00012727938799698307, "loss": 3.2279, "step": 73250 }, { "epoch": 7.889355290065655, "grad_norm": 0.8758023381233215, "learning_rate": 0.00012695614696692166, "loss": 3.2199, "step": 73300 }, { "epoch": 7.894736842105263, "grad_norm": 0.9168129563331604, "learning_rate": 0.00012663290593686023, "loss": 3.2366, "step": 73350 }, { "epoch": 7.900118394144871, "grad_norm": 0.8541741967201233, "learning_rate": 0.00012630966490679882, "loss": 3.2348, "step": 73400 }, { "epoch": 7.90549994618448, "grad_norm": 0.8171018362045288, "learning_rate": 0.0001259864238767374, "loss": 3.2464, "step": 73450 }, { "epoch": 7.910881498224088, "grad_norm": 0.8532667756080627, "learning_rate": 0.00012566318284667598, "loss": 3.2119, "step": 73500 }, { "epoch": 7.916263050263696, "grad_norm": 0.8246223330497742, "learning_rate": 0.00012533994181661458, "loss": 3.2272, "step": 73550 }, { "epoch": 7.921644602303305, "grad_norm": 0.8351603746414185, "learning_rate": 0.00012501670078655317, "loss": 3.2407, "step": 73600 }, { "epoch": 7.927026154342912, "grad_norm": 0.8015763163566589, "learning_rate": 0.00012469345975649174, "loss": 3.2179, "step": 73650 }, { "epoch": 7.932407706382521, "grad_norm": 0.8650892376899719, "learning_rate": 0.00012437021872643034, "loss": 3.2296, "step": 73700 }, { "epoch": 7.937789258422129, "grad_norm": 0.8532193303108215, "learning_rate": 0.00012405344251697014, "loss": 3.2463, "step": 73750 }, { "epoch": 7.943170810461737, "grad_norm": 0.810485303401947, "learning_rate": 0.00012373020148690874, "loss": 3.2286, "step": 73800 }, { "epoch": 7.948552362501346, "grad_norm": 0.7970120906829834, "learning_rate": 0.0001234069604568473, "loss": 3.2275, "step": 73850 }, { "epoch": 7.953933914540953, "grad_norm": 0.8421259522438049, "learning_rate": 0.0001230837194267859, "loss": 3.246, "step": 73900 }, { "epoch": 7.959315466580562, "grad_norm": 0.8532693982124329, "learning_rate": 0.00012276047839672447, "loss": 3.2488, "step": 73950 }, { "epoch": 7.96469701862017, "grad_norm": 0.8387293219566345, "learning_rate": 0.00012243723736666306, "loss": 3.2501, "step": 74000 }, { "epoch": 7.96469701862017, "eval_accuracy": 0.3901934901491489, "eval_loss": 3.3306832313537598, "eval_runtime": 184.7101, "eval_samples_per_second": 97.51, "eval_steps_per_second": 6.096, "step": 74000 }, { "epoch": 7.970078570659778, "grad_norm": 0.8431631326675415, "learning_rate": 0.00012211399633660166, "loss": 3.2543, "step": 74050 }, { "epoch": 7.975460122699387, "grad_norm": 0.8307387232780457, "learning_rate": 0.00012179075530654022, "loss": 3.2306, "step": 74100 }, { "epoch": 7.980841674738995, "grad_norm": 0.8382935523986816, "learning_rate": 0.00012146751427647882, "loss": 3.2373, "step": 74150 }, { "epoch": 7.986223226778603, "grad_norm": 0.8308893442153931, "learning_rate": 0.0001211442732464174, "loss": 3.2195, "step": 74200 }, { "epoch": 7.991604778818211, "grad_norm": 0.8867136240005493, "learning_rate": 0.00012082103221635598, "loss": 3.2518, "step": 74250 }, { "epoch": 7.996986330857819, "grad_norm": 0.8336354494094849, "learning_rate": 0.00012049779118629456, "loss": 3.2431, "step": 74300 }, { "epoch": 8.002367882897428, "grad_norm": 0.8303877711296082, "learning_rate": 0.00012017455015623316, "loss": 3.1969, "step": 74350 }, { "epoch": 8.007749434937036, "grad_norm": 0.836917519569397, "learning_rate": 0.00011985130912617175, "loss": 3.1684, "step": 74400 }, { "epoch": 8.013130986976645, "grad_norm": 0.8200033903121948, "learning_rate": 0.00011952806809611032, "loss": 3.1589, "step": 74450 }, { "epoch": 8.018512539016251, "grad_norm": 0.831483006477356, "learning_rate": 0.00011920482706604891, "loss": 3.1496, "step": 74500 }, { "epoch": 8.02389409105586, "grad_norm": 0.8704330921173096, "learning_rate": 0.0001188815860359875, "loss": 3.1754, "step": 74550 }, { "epoch": 8.029275643095469, "grad_norm": 0.8458114266395569, "learning_rate": 0.00011855834500592608, "loss": 3.1645, "step": 74600 }, { "epoch": 8.034657195135077, "grad_norm": 0.8807541728019714, "learning_rate": 0.00011823510397586466, "loss": 3.1698, "step": 74650 }, { "epoch": 8.040038747174686, "grad_norm": 0.8740618824958801, "learning_rate": 0.00011791186294580325, "loss": 3.1632, "step": 74700 }, { "epoch": 8.045420299214294, "grad_norm": 0.8318957686424255, "learning_rate": 0.00011758862191574182, "loss": 3.1803, "step": 74750 }, { "epoch": 8.050801851253901, "grad_norm": 0.9062291979789734, "learning_rate": 0.00011726538088568041, "loss": 3.1795, "step": 74800 }, { "epoch": 8.05618340329351, "grad_norm": 0.8924227952957153, "learning_rate": 0.000116942139855619, "loss": 3.155, "step": 74850 }, { "epoch": 8.061564955333118, "grad_norm": 0.8725975155830383, "learning_rate": 0.00011661889882555759, "loss": 3.1632, "step": 74900 }, { "epoch": 8.066946507372727, "grad_norm": 0.8697208762168884, "learning_rate": 0.00011629565779549616, "loss": 3.1696, "step": 74950 }, { "epoch": 8.072328059412335, "grad_norm": 0.8555173873901367, "learning_rate": 0.00011597241676543475, "loss": 3.1614, "step": 75000 }, { "epoch": 8.072328059412335, "eval_accuracy": 0.39014285790342734, "eval_loss": 3.3369150161743164, "eval_runtime": 184.9651, "eval_samples_per_second": 97.375, "eval_steps_per_second": 6.088, "step": 75000 }, { "epoch": 8.077709611451942, "grad_norm": 0.8447503447532654, "learning_rate": 0.00011564917573537335, "loss": 3.1697, "step": 75050 }, { "epoch": 8.08309116349155, "grad_norm": 0.8746142983436584, "learning_rate": 0.00011532593470531191, "loss": 3.1678, "step": 75100 }, { "epoch": 8.088472715531159, "grad_norm": 0.8564710021018982, "learning_rate": 0.0001150026936752505, "loss": 3.1848, "step": 75150 }, { "epoch": 8.093854267570768, "grad_norm": 0.87080979347229, "learning_rate": 0.00011467945264518909, "loss": 3.1768, "step": 75200 }, { "epoch": 8.099235819610376, "grad_norm": 0.8317235708236694, "learning_rate": 0.00011435621161512766, "loss": 3.1678, "step": 75250 }, { "epoch": 8.104617371649983, "grad_norm": 0.8491494059562683, "learning_rate": 0.00011403297058506625, "loss": 3.1711, "step": 75300 }, { "epoch": 8.109998923689592, "grad_norm": 0.8416190147399902, "learning_rate": 0.00011370972955500485, "loss": 3.1771, "step": 75350 }, { "epoch": 8.1153804757292, "grad_norm": 0.846297562122345, "learning_rate": 0.00011338648852494343, "loss": 3.1605, "step": 75400 }, { "epoch": 8.120762027768809, "grad_norm": 0.8915745615959167, "learning_rate": 0.00011306324749488201, "loss": 3.1726, "step": 75450 }, { "epoch": 8.126143579808417, "grad_norm": 0.8499758243560791, "learning_rate": 0.00011274000646482059, "loss": 3.1699, "step": 75500 }, { "epoch": 8.131525131848026, "grad_norm": 0.8181933164596558, "learning_rate": 0.00011241676543475918, "loss": 3.1845, "step": 75550 }, { "epoch": 8.136906683887632, "grad_norm": 0.8639881014823914, "learning_rate": 0.00011209352440469775, "loss": 3.168, "step": 75600 }, { "epoch": 8.142288235927241, "grad_norm": 0.8862914443016052, "learning_rate": 0.00011177028337463635, "loss": 3.1765, "step": 75650 }, { "epoch": 8.14766978796685, "grad_norm": 0.8618167042732239, "learning_rate": 0.00011144704234457493, "loss": 3.1776, "step": 75700 }, { "epoch": 8.153051340006458, "grad_norm": 0.8700433969497681, "learning_rate": 0.00011112380131451351, "loss": 3.1728, "step": 75750 }, { "epoch": 8.158432892046067, "grad_norm": 0.8872320652008057, "learning_rate": 0.00011080056028445209, "loss": 3.1696, "step": 75800 }, { "epoch": 8.163814444085673, "grad_norm": 0.8248878121376038, "learning_rate": 0.00011047731925439068, "loss": 3.1714, "step": 75850 }, { "epoch": 8.169195996125282, "grad_norm": 0.8656007051467896, "learning_rate": 0.00011015407822432928, "loss": 3.1678, "step": 75900 }, { "epoch": 8.17457754816489, "grad_norm": 0.886645495891571, "learning_rate": 0.00010983083719426785, "loss": 3.1808, "step": 75950 }, { "epoch": 8.1799591002045, "grad_norm": 0.917984127998352, "learning_rate": 0.00010950759616420644, "loss": 3.1899, "step": 76000 }, { "epoch": 8.1799591002045, "eval_accuracy": 0.39039047783475656, "eval_loss": 3.333770513534546, "eval_runtime": 184.4579, "eval_samples_per_second": 97.643, "eval_steps_per_second": 6.104, "step": 76000 }, { "epoch": 8.185340652244108, "grad_norm": 0.8363960385322571, "learning_rate": 0.00010918435513414502, "loss": 3.1928, "step": 76050 }, { "epoch": 8.190722204283716, "grad_norm": 0.8816800713539124, "learning_rate": 0.00010886111410408359, "loss": 3.1695, "step": 76100 }, { "epoch": 8.196103756323323, "grad_norm": 0.8266114592552185, "learning_rate": 0.00010853787307402218, "loss": 3.1846, "step": 76150 }, { "epoch": 8.201485308362932, "grad_norm": 0.8366254568099976, "learning_rate": 0.00010821463204396078, "loss": 3.1775, "step": 76200 }, { "epoch": 8.20686686040254, "grad_norm": 0.8426965475082397, "learning_rate": 0.00010789139101389935, "loss": 3.1647, "step": 76250 }, { "epoch": 8.212248412442149, "grad_norm": 0.799103319644928, "learning_rate": 0.00010756814998383794, "loss": 3.1751, "step": 76300 }, { "epoch": 8.217629964481757, "grad_norm": 0.8718476295471191, "learning_rate": 0.00010724490895377652, "loss": 3.1747, "step": 76350 }, { "epoch": 8.223011516521364, "grad_norm": 0.8915057182312012, "learning_rate": 0.00010692166792371512, "loss": 3.1874, "step": 76400 }, { "epoch": 8.228393068560973, "grad_norm": 0.8778514266014099, "learning_rate": 0.00010659842689365368, "loss": 3.1709, "step": 76450 }, { "epoch": 8.233774620600581, "grad_norm": 0.8106761574745178, "learning_rate": 0.00010627518586359228, "loss": 3.1766, "step": 76500 }, { "epoch": 8.23915617264019, "grad_norm": 0.8877562284469604, "learning_rate": 0.00010595194483353086, "loss": 3.178, "step": 76550 }, { "epoch": 8.244537724679798, "grad_norm": 0.874200165271759, "learning_rate": 0.00010562870380346944, "loss": 3.1837, "step": 76600 }, { "epoch": 8.249919276719407, "grad_norm": 0.8712376952171326, "learning_rate": 0.00010530546277340802, "loss": 3.1791, "step": 76650 }, { "epoch": 8.255300828759013, "grad_norm": 0.842028021812439, "learning_rate": 0.00010498222174334662, "loss": 3.188, "step": 76700 }, { "epoch": 8.260682380798622, "grad_norm": 0.8732202053070068, "learning_rate": 0.00010465898071328519, "loss": 3.1822, "step": 76750 }, { "epoch": 8.26606393283823, "grad_norm": 0.8106330037117004, "learning_rate": 0.00010433573968322378, "loss": 3.1808, "step": 76800 }, { "epoch": 8.27144548487784, "grad_norm": 0.8642439246177673, "learning_rate": 0.00010401249865316237, "loss": 3.1608, "step": 76850 }, { "epoch": 8.276827036917448, "grad_norm": 0.8715514540672302, "learning_rate": 0.00010368925762310096, "loss": 3.1802, "step": 76900 }, { "epoch": 8.282208588957054, "grad_norm": 0.8595199584960938, "learning_rate": 0.00010336601659303954, "loss": 3.1766, "step": 76950 }, { "epoch": 8.287590140996663, "grad_norm": 0.8247595429420471, "learning_rate": 0.00010304277556297812, "loss": 3.1874, "step": 77000 }, { "epoch": 8.287590140996663, "eval_accuracy": 0.3907075269613992, "eval_loss": 3.330806016921997, "eval_runtime": 184.729, "eval_samples_per_second": 97.5, "eval_steps_per_second": 6.095, "step": 77000 }, { "epoch": 8.292971693036272, "grad_norm": 0.9381096959114075, "learning_rate": 0.00010271953453291671, "loss": 3.166, "step": 77050 }, { "epoch": 8.29835324507588, "grad_norm": 0.8634301424026489, "learning_rate": 0.00010239629350285528, "loss": 3.1607, "step": 77100 }, { "epoch": 8.303734797115489, "grad_norm": 0.8498184680938721, "learning_rate": 0.00010207305247279387, "loss": 3.176, "step": 77150 }, { "epoch": 8.309116349155097, "grad_norm": 0.8834637999534607, "learning_rate": 0.00010174981144273246, "loss": 3.1865, "step": 77200 }, { "epoch": 8.314497901194704, "grad_norm": 0.877652108669281, "learning_rate": 0.00010142657041267104, "loss": 3.1713, "step": 77250 }, { "epoch": 8.319879453234313, "grad_norm": 0.8718097805976868, "learning_rate": 0.00010110332938260962, "loss": 3.17, "step": 77300 }, { "epoch": 8.325261005273921, "grad_norm": 0.8588361740112305, "learning_rate": 0.00010078655317314944, "loss": 3.1928, "step": 77350 }, { "epoch": 8.33064255731353, "grad_norm": 0.8447306752204895, "learning_rate": 0.00010046331214308802, "loss": 3.1892, "step": 77400 }, { "epoch": 8.336024109353138, "grad_norm": 0.8749763369560242, "learning_rate": 0.0001001400711130266, "loss": 3.1865, "step": 77450 }, { "epoch": 8.341405661392745, "grad_norm": 0.8686197400093079, "learning_rate": 9.98168300829652e-05, "loss": 3.1846, "step": 77500 }, { "epoch": 8.346787213432354, "grad_norm": 0.8535328507423401, "learning_rate": 9.949358905290376e-05, "loss": 3.1957, "step": 77550 }, { "epoch": 8.352168765471962, "grad_norm": 0.8441202044487, "learning_rate": 9.917034802284236e-05, "loss": 3.1839, "step": 77600 }, { "epoch": 8.35755031751157, "grad_norm": 0.8363760113716125, "learning_rate": 9.884710699278094e-05, "loss": 3.193, "step": 77650 }, { "epoch": 8.36293186955118, "grad_norm": 0.8740014433860779, "learning_rate": 9.852386596271953e-05, "loss": 3.1935, "step": 77700 }, { "epoch": 8.368313421590786, "grad_norm": 0.8385241627693176, "learning_rate": 9.82006249326581e-05, "loss": 3.1776, "step": 77750 }, { "epoch": 8.373694973630395, "grad_norm": 0.8790156841278076, "learning_rate": 9.78773839025967e-05, "loss": 3.1687, "step": 77800 }, { "epoch": 8.379076525670003, "grad_norm": 0.9463781714439392, "learning_rate": 9.755414287253529e-05, "loss": 3.1798, "step": 77850 }, { "epoch": 8.384458077709612, "grad_norm": 0.8797891736030579, "learning_rate": 9.723090184247386e-05, "loss": 3.1966, "step": 77900 }, { "epoch": 8.38983962974922, "grad_norm": 0.8536663055419922, "learning_rate": 9.690766081241245e-05, "loss": 3.1673, "step": 77950 }, { "epoch": 8.395221181788829, "grad_norm": 0.880811870098114, "learning_rate": 9.658441978235103e-05, "loss": 3.1641, "step": 78000 }, { "epoch": 8.395221181788829, "eval_accuracy": 0.3911067176712296, "eval_loss": 3.3282108306884766, "eval_runtime": 185.041, "eval_samples_per_second": 97.335, "eval_steps_per_second": 6.085, "step": 78000 }, { "epoch": 8.400602733828435, "grad_norm": 0.8960733413696289, "learning_rate": 9.626117875228961e-05, "loss": 3.1936, "step": 78050 }, { "epoch": 8.405984285868044, "grad_norm": 0.893669605255127, "learning_rate": 9.59379377222282e-05, "loss": 3.1858, "step": 78100 }, { "epoch": 8.411365837907653, "grad_norm": 0.8383551836013794, "learning_rate": 9.561469669216679e-05, "loss": 3.1571, "step": 78150 }, { "epoch": 8.416747389947261, "grad_norm": 0.8875780701637268, "learning_rate": 9.529145566210537e-05, "loss": 3.1831, "step": 78200 }, { "epoch": 8.42212894198687, "grad_norm": 0.9209667444229126, "learning_rate": 9.496821463204395e-05, "loss": 3.1833, "step": 78250 }, { "epoch": 8.427510494026476, "grad_norm": 0.8284728527069092, "learning_rate": 9.464497360198253e-05, "loss": 3.1816, "step": 78300 }, { "epoch": 8.432892046066085, "grad_norm": 0.8703120946884155, "learning_rate": 9.432173257192113e-05, "loss": 3.1751, "step": 78350 }, { "epoch": 8.438273598105694, "grad_norm": 0.8745947480201721, "learning_rate": 9.39984915418597e-05, "loss": 3.1674, "step": 78400 }, { "epoch": 8.443655150145302, "grad_norm": 0.8946985602378845, "learning_rate": 9.367525051179829e-05, "loss": 3.1602, "step": 78450 }, { "epoch": 8.44903670218491, "grad_norm": 0.867800772190094, "learning_rate": 9.335200948173688e-05, "loss": 3.1897, "step": 78500 }, { "epoch": 8.45441825422452, "grad_norm": 0.8727012872695923, "learning_rate": 9.302876845167545e-05, "loss": 3.1946, "step": 78550 }, { "epoch": 8.459799806264126, "grad_norm": 0.8764132261276245, "learning_rate": 9.270552742161403e-05, "loss": 3.1935, "step": 78600 }, { "epoch": 8.465181358303735, "grad_norm": 0.8671554923057556, "learning_rate": 9.238228639155263e-05, "loss": 3.1577, "step": 78650 }, { "epoch": 8.470562910343343, "grad_norm": 0.8665030002593994, "learning_rate": 9.205904536149122e-05, "loss": 3.1851, "step": 78700 }, { "epoch": 8.475944462382952, "grad_norm": 0.8668508529663086, "learning_rate": 9.173580433142979e-05, "loss": 3.2038, "step": 78750 }, { "epoch": 8.48132601442256, "grad_norm": 0.8960902094841003, "learning_rate": 9.141256330136838e-05, "loss": 3.1813, "step": 78800 }, { "epoch": 8.486707566462167, "grad_norm": 0.8812357783317566, "learning_rate": 9.108932227130697e-05, "loss": 3.1816, "step": 78850 }, { "epoch": 8.492089118501776, "grad_norm": 0.9052111506462097, "learning_rate": 9.076608124124555e-05, "loss": 3.1729, "step": 78900 }, { "epoch": 8.497470670541384, "grad_norm": 0.9190753102302551, "learning_rate": 9.044284021118413e-05, "loss": 3.1959, "step": 78950 }, { "epoch": 8.502852222580993, "grad_norm": 0.8276004195213318, "learning_rate": 9.011959918112272e-05, "loss": 3.185, "step": 79000 }, { "epoch": 8.502852222580993, "eval_accuracy": 0.3911831006513632, "eval_loss": 3.32541561126709, "eval_runtime": 184.5689, "eval_samples_per_second": 97.584, "eval_steps_per_second": 6.101, "step": 79000 }, { "epoch": 8.508233774620601, "grad_norm": 0.9947301745414734, "learning_rate": 8.979635815106129e-05, "loss": 3.1801, "step": 79050 }, { "epoch": 8.513615326660208, "grad_norm": 0.8838504552841187, "learning_rate": 8.947311712099989e-05, "loss": 3.196, "step": 79100 }, { "epoch": 8.518996878699816, "grad_norm": 0.8844854235649109, "learning_rate": 8.914987609093847e-05, "loss": 3.1692, "step": 79150 }, { "epoch": 8.524378430739425, "grad_norm": 0.9206638932228088, "learning_rate": 8.882663506087706e-05, "loss": 3.1824, "step": 79200 }, { "epoch": 8.529759982779034, "grad_norm": 0.8852978944778442, "learning_rate": 8.850339403081563e-05, "loss": 3.2002, "step": 79250 }, { "epoch": 8.535141534818642, "grad_norm": 0.9327110052108765, "learning_rate": 8.818015300075422e-05, "loss": 3.2021, "step": 79300 }, { "epoch": 8.54052308685825, "grad_norm": 0.8577570915222168, "learning_rate": 8.785691197069282e-05, "loss": 3.1762, "step": 79350 }, { "epoch": 8.545904638897857, "grad_norm": 0.8478599190711975, "learning_rate": 8.753367094063139e-05, "loss": 3.1831, "step": 79400 }, { "epoch": 8.551286190937466, "grad_norm": 0.909991443157196, "learning_rate": 8.721042991056998e-05, "loss": 3.1794, "step": 79450 }, { "epoch": 8.556667742977075, "grad_norm": 0.9163396954536438, "learning_rate": 8.688718888050856e-05, "loss": 3.1727, "step": 79500 }, { "epoch": 8.562049295016683, "grad_norm": 0.862952470779419, "learning_rate": 8.656394785044713e-05, "loss": 3.1933, "step": 79550 }, { "epoch": 8.567430847056292, "grad_norm": 0.8733549118041992, "learning_rate": 8.624070682038572e-05, "loss": 3.1773, "step": 79600 }, { "epoch": 8.572812399095898, "grad_norm": 0.8446792364120483, "learning_rate": 8.591746579032432e-05, "loss": 3.1822, "step": 79650 }, { "epoch": 8.578193951135507, "grad_norm": 0.8886731863021851, "learning_rate": 8.55942247602629e-05, "loss": 3.1945, "step": 79700 }, { "epoch": 8.583575503175116, "grad_norm": 0.8565948009490967, "learning_rate": 8.52774485508027e-05, "loss": 3.1875, "step": 79750 }, { "epoch": 8.588957055214724, "grad_norm": 0.8837953209877014, "learning_rate": 8.49542075207413e-05, "loss": 3.1851, "step": 79800 }, { "epoch": 8.594338607254333, "grad_norm": 0.9210212826728821, "learning_rate": 8.463096649067987e-05, "loss": 3.1748, "step": 79850 }, { "epoch": 8.599720159293941, "grad_norm": 0.9159842729568481, "learning_rate": 8.430772546061846e-05, "loss": 3.188, "step": 79900 }, { "epoch": 8.605101711333548, "grad_norm": 0.8564115166664124, "learning_rate": 8.398448443055704e-05, "loss": 3.1862, "step": 79950 }, { "epoch": 8.610483263373157, "grad_norm": 0.8882423639297485, "learning_rate": 8.366124340049564e-05, "loss": 3.1817, "step": 80000 }, { "epoch": 8.610483263373157, "eval_accuracy": 0.3914083380878026, "eval_loss": 3.3225831985473633, "eval_runtime": 184.6455, "eval_samples_per_second": 97.544, "eval_steps_per_second": 6.098, "step": 80000 }, { "epoch": 8.615864815412765, "grad_norm": 0.9362099766731262, "learning_rate": 8.33380023704342e-05, "loss": 3.186, "step": 80050 }, { "epoch": 8.621246367452374, "grad_norm": 0.8790286779403687, "learning_rate": 8.30147613403728e-05, "loss": 3.1654, "step": 80100 }, { "epoch": 8.626627919491982, "grad_norm": 0.8834596872329712, "learning_rate": 8.269152031031138e-05, "loss": 3.1895, "step": 80150 }, { "epoch": 8.632009471531589, "grad_norm": 0.8374781012535095, "learning_rate": 8.236827928024996e-05, "loss": 3.181, "step": 80200 }, { "epoch": 8.637391023571197, "grad_norm": 0.9129075407981873, "learning_rate": 8.204503825018854e-05, "loss": 3.1877, "step": 80250 }, { "epoch": 8.642772575610806, "grad_norm": 0.9645001292228699, "learning_rate": 8.172179722012714e-05, "loss": 3.1876, "step": 80300 }, { "epoch": 8.648154127650415, "grad_norm": 0.8795619010925293, "learning_rate": 8.13985561900657e-05, "loss": 3.1889, "step": 80350 }, { "epoch": 8.653535679690023, "grad_norm": 0.8914580941200256, "learning_rate": 8.10753151600043e-05, "loss": 3.1922, "step": 80400 }, { "epoch": 8.658917231729632, "grad_norm": 0.8828052878379822, "learning_rate": 8.07520741299429e-05, "loss": 3.178, "step": 80450 }, { "epoch": 8.664298783769238, "grad_norm": 0.8768082857131958, "learning_rate": 8.042883309988148e-05, "loss": 3.1827, "step": 80500 }, { "epoch": 8.669680335808847, "grad_norm": 0.8295359015464783, "learning_rate": 8.010559206982006e-05, "loss": 3.183, "step": 80550 }, { "epoch": 8.675061887848456, "grad_norm": 0.8298890590667725, "learning_rate": 7.978235103975864e-05, "loss": 3.1791, "step": 80600 }, { "epoch": 8.680443439888064, "grad_norm": 0.8362737894058228, "learning_rate": 7.945911000969723e-05, "loss": 3.1816, "step": 80650 }, { "epoch": 8.685824991927673, "grad_norm": 0.8717547059059143, "learning_rate": 7.91358689796358e-05, "loss": 3.1963, "step": 80700 }, { "epoch": 8.69120654396728, "grad_norm": 0.8629084825515747, "learning_rate": 7.88126279495744e-05, "loss": 3.2005, "step": 80750 }, { "epoch": 8.696588096006888, "grad_norm": 0.891313374042511, "learning_rate": 7.848938691951298e-05, "loss": 3.1822, "step": 80800 }, { "epoch": 8.701969648046497, "grad_norm": 0.9416958093643188, "learning_rate": 7.816614588945156e-05, "loss": 3.1739, "step": 80850 }, { "epoch": 8.707351200086105, "grad_norm": 0.9029965400695801, "learning_rate": 7.784290485939014e-05, "loss": 3.1881, "step": 80900 }, { "epoch": 8.712732752125714, "grad_norm": 0.8759965300559998, "learning_rate": 7.751966382932873e-05, "loss": 3.1905, "step": 80950 }, { "epoch": 8.718114304165322, "grad_norm": 0.8635278344154358, "learning_rate": 7.719642279926731e-05, "loss": 3.1891, "step": 81000 }, { "epoch": 8.718114304165322, "eval_accuracy": 0.3921397893285697, "eval_loss": 3.3176803588867188, "eval_runtime": 184.7567, "eval_samples_per_second": 97.485, "eval_steps_per_second": 6.095, "step": 81000 }, { "epoch": 8.723495856204929, "grad_norm": 0.9018222093582153, "learning_rate": 7.68731817692059e-05, "loss": 3.1802, "step": 81050 }, { "epoch": 8.728877408244538, "grad_norm": 0.8515042066574097, "learning_rate": 7.654994073914448e-05, "loss": 3.1897, "step": 81100 }, { "epoch": 8.734258960284146, "grad_norm": 0.8561316132545471, "learning_rate": 7.622669970908307e-05, "loss": 3.1992, "step": 81150 }, { "epoch": 8.739640512323755, "grad_norm": 0.8872911930084229, "learning_rate": 7.590345867902164e-05, "loss": 3.1834, "step": 81200 }, { "epoch": 8.745022064363363, "grad_norm": 0.8783745765686035, "learning_rate": 7.558021764896023e-05, "loss": 3.1589, "step": 81250 }, { "epoch": 8.75040361640297, "grad_norm": 0.8908545970916748, "learning_rate": 7.525697661889883e-05, "loss": 3.1957, "step": 81300 }, { "epoch": 8.755785168442578, "grad_norm": 0.8940176963806152, "learning_rate": 7.493373558883741e-05, "loss": 3.1899, "step": 81350 }, { "epoch": 8.761166720482187, "grad_norm": 0.8686371445655823, "learning_rate": 7.461049455877599e-05, "loss": 3.2028, "step": 81400 }, { "epoch": 8.766548272521796, "grad_norm": 0.8972199559211731, "learning_rate": 7.428725352871457e-05, "loss": 3.1772, "step": 81450 }, { "epoch": 8.771929824561404, "grad_norm": 0.9102570414543152, "learning_rate": 7.396401249865315e-05, "loss": 3.181, "step": 81500 }, { "epoch": 8.777311376601011, "grad_norm": 0.8299551606178284, "learning_rate": 7.364077146859173e-05, "loss": 3.1861, "step": 81550 }, { "epoch": 8.78269292864062, "grad_norm": 0.852982223033905, "learning_rate": 7.331753043853033e-05, "loss": 3.1815, "step": 81600 }, { "epoch": 8.788074480680228, "grad_norm": 0.8657204508781433, "learning_rate": 7.299428940846891e-05, "loss": 3.1924, "step": 81650 }, { "epoch": 8.793456032719837, "grad_norm": 0.8692888021469116, "learning_rate": 7.267104837840749e-05, "loss": 3.1805, "step": 81700 }, { "epoch": 8.798837584759445, "grad_norm": 0.8677384853363037, "learning_rate": 7.234780734834607e-05, "loss": 3.2016, "step": 81750 }, { "epoch": 8.804219136799054, "grad_norm": 0.9234066605567932, "learning_rate": 7.202456631828465e-05, "loss": 3.1823, "step": 81800 }, { "epoch": 8.80960068883866, "grad_norm": 0.8415629267692566, "learning_rate": 7.170132528822325e-05, "loss": 3.16, "step": 81850 }, { "epoch": 8.814982240878269, "grad_norm": 0.8653513193130493, "learning_rate": 7.137808425816183e-05, "loss": 3.1854, "step": 81900 }, { "epoch": 8.820363792917878, "grad_norm": 0.9204857349395752, "learning_rate": 7.105484322810041e-05, "loss": 3.1788, "step": 81950 }, { "epoch": 8.825745344957486, "grad_norm": 0.9475948810577393, "learning_rate": 7.073160219803899e-05, "loss": 3.1655, "step": 82000 }, { "epoch": 8.825745344957486, "eval_accuracy": 0.3923214569569955, "eval_loss": 3.316098928451538, "eval_runtime": 184.8571, "eval_samples_per_second": 97.432, "eval_steps_per_second": 6.091, "step": 82000 }, { "epoch": 8.831126896997095, "grad_norm": 0.9237799048423767, "learning_rate": 7.040836116797757e-05, "loss": 3.1975, "step": 82050 }, { "epoch": 8.836508449036701, "grad_norm": 0.8860271573066711, "learning_rate": 7.008512013791617e-05, "loss": 3.1737, "step": 82100 }, { "epoch": 8.84189000107631, "grad_norm": 0.8764588832855225, "learning_rate": 6.976187910785475e-05, "loss": 3.1814, "step": 82150 }, { "epoch": 8.847271553115919, "grad_norm": 0.8662134408950806, "learning_rate": 6.943863807779334e-05, "loss": 3.1879, "step": 82200 }, { "epoch": 8.852653105155527, "grad_norm": 0.8557435274124146, "learning_rate": 6.911539704773192e-05, "loss": 3.1715, "step": 82250 }, { "epoch": 8.858034657195136, "grad_norm": 0.8877943158149719, "learning_rate": 6.87921560176705e-05, "loss": 3.1928, "step": 82300 }, { "epoch": 8.863416209234742, "grad_norm": 0.9014151692390442, "learning_rate": 6.846891498760909e-05, "loss": 3.1795, "step": 82350 }, { "epoch": 8.868797761274351, "grad_norm": 0.9253289699554443, "learning_rate": 6.814567395754767e-05, "loss": 3.1607, "step": 82400 }, { "epoch": 8.87417931331396, "grad_norm": 0.8912492394447327, "learning_rate": 6.782243292748626e-05, "loss": 3.1784, "step": 82450 }, { "epoch": 8.879560865353568, "grad_norm": 0.9154777526855469, "learning_rate": 6.750565671802607e-05, "loss": 3.192, "step": 82500 }, { "epoch": 8.884942417393177, "grad_norm": 0.9307251572608948, "learning_rate": 6.718241568796465e-05, "loss": 3.1775, "step": 82550 }, { "epoch": 8.890323969432785, "grad_norm": 0.8818643689155579, "learning_rate": 6.685917465790323e-05, "loss": 3.166, "step": 82600 }, { "epoch": 8.895705521472392, "grad_norm": 0.8821066617965698, "learning_rate": 6.653593362784182e-05, "loss": 3.1702, "step": 82650 }, { "epoch": 8.901087073512, "grad_norm": 0.9354448318481445, "learning_rate": 6.62126925977804e-05, "loss": 3.171, "step": 82700 }, { "epoch": 8.906468625551609, "grad_norm": 0.8969210982322693, "learning_rate": 6.588945156771899e-05, "loss": 3.1851, "step": 82750 }, { "epoch": 8.911850177591218, "grad_norm": 0.8801003098487854, "learning_rate": 6.556621053765757e-05, "loss": 3.1559, "step": 82800 }, { "epoch": 8.917231729630826, "grad_norm": 0.914297878742218, "learning_rate": 6.524296950759615e-05, "loss": 3.1676, "step": 82850 }, { "epoch": 8.922613281670433, "grad_norm": 0.8998667001724243, "learning_rate": 6.491972847753474e-05, "loss": 3.1729, "step": 82900 }, { "epoch": 8.927994833710041, "grad_norm": 0.911956250667572, "learning_rate": 6.459648744747333e-05, "loss": 3.1796, "step": 82950 }, { "epoch": 8.93337638574965, "grad_norm": 0.9175410270690918, "learning_rate": 6.427324641741192e-05, "loss": 3.1707, "step": 83000 }, { "epoch": 8.93337638574965, "eval_accuracy": 0.3926996776594777, "eval_loss": 3.3129827976226807, "eval_runtime": 184.4576, "eval_samples_per_second": 97.643, "eval_steps_per_second": 6.104, "step": 83000 }, { "epoch": 8.938757937789259, "grad_norm": 0.8617079257965088, "learning_rate": 6.39500053873505e-05, "loss": 3.1874, "step": 83050 }, { "epoch": 8.944139489828867, "grad_norm": 0.8827839493751526, "learning_rate": 6.362676435728908e-05, "loss": 3.174, "step": 83100 }, { "epoch": 8.949521041868476, "grad_norm": 0.8599488139152527, "learning_rate": 6.330352332722766e-05, "loss": 3.2049, "step": 83150 }, { "epoch": 8.954902593908082, "grad_norm": 0.905812680721283, "learning_rate": 6.298028229716624e-05, "loss": 3.1845, "step": 83200 }, { "epoch": 8.960284145947691, "grad_norm": 0.8685833215713501, "learning_rate": 6.265704126710484e-05, "loss": 3.1687, "step": 83250 }, { "epoch": 8.9656656979873, "grad_norm": 0.90411376953125, "learning_rate": 6.233380023704342e-05, "loss": 3.1906, "step": 83300 }, { "epoch": 8.971047250026908, "grad_norm": 0.8720734119415283, "learning_rate": 6.2010559206982e-05, "loss": 3.1915, "step": 83350 }, { "epoch": 8.976428802066517, "grad_norm": 0.8964160680770874, "learning_rate": 6.168731817692058e-05, "loss": 3.1848, "step": 83400 }, { "epoch": 8.981810354106123, "grad_norm": 0.9113012552261353, "learning_rate": 6.136407714685916e-05, "loss": 3.1753, "step": 83450 }, { "epoch": 8.987191906145732, "grad_norm": 0.9036979079246521, "learning_rate": 6.104083611679776e-05, "loss": 3.1912, "step": 83500 }, { "epoch": 8.99257345818534, "grad_norm": 0.9076685905456543, "learning_rate": 6.071759508673634e-05, "loss": 3.1701, "step": 83550 }, { "epoch": 8.997955010224949, "grad_norm": 0.9387393593788147, "learning_rate": 6.039435405667492e-05, "loss": 3.1792, "step": 83600 }, { "epoch": 9.003336562264558, "grad_norm": 0.8677722215652466, "learning_rate": 6.007111302661351e-05, "loss": 3.1325, "step": 83650 }, { "epoch": 9.008718114304166, "grad_norm": 0.898324191570282, "learning_rate": 5.974787199655209e-05, "loss": 3.1276, "step": 83700 }, { "epoch": 9.014099666343773, "grad_norm": 0.8768854737281799, "learning_rate": 5.942463096649068e-05, "loss": 3.1129, "step": 83750 }, { "epoch": 9.019481218383381, "grad_norm": 0.8867700099945068, "learning_rate": 5.910138993642926e-05, "loss": 3.1221, "step": 83800 }, { "epoch": 9.02486277042299, "grad_norm": 0.8973957300186157, "learning_rate": 5.877814890636784e-05, "loss": 3.1166, "step": 83850 }, { "epoch": 9.030244322462599, "grad_norm": 0.8828250765800476, "learning_rate": 5.845490787630643e-05, "loss": 3.1444, "step": 83900 }, { "epoch": 9.035625874502207, "grad_norm": 0.8597937226295471, "learning_rate": 5.813166684624501e-05, "loss": 3.1197, "step": 83950 }, { "epoch": 9.041007426541814, "grad_norm": 0.8845257759094238, "learning_rate": 5.7808425816183596e-05, "loss": 3.1265, "step": 84000 }, { "epoch": 9.041007426541814, "eval_accuracy": 0.3927993123576036, "eval_loss": 3.3138818740844727, "eval_runtime": 184.539, "eval_samples_per_second": 97.6, "eval_steps_per_second": 6.102, "step": 84000 }, { "epoch": 9.046388978581422, "grad_norm": 0.8949583172798157, "learning_rate": 5.748518478612218e-05, "loss": 3.1287, "step": 84050 }, { "epoch": 9.051770530621031, "grad_norm": 0.8685891628265381, "learning_rate": 5.716194375606076e-05, "loss": 3.1128, "step": 84100 }, { "epoch": 9.05715208266064, "grad_norm": 0.863949179649353, "learning_rate": 5.6838702725999346e-05, "loss": 3.1296, "step": 84150 }, { "epoch": 9.062533634700248, "grad_norm": 0.9483818411827087, "learning_rate": 5.651546169593793e-05, "loss": 3.1382, "step": 84200 }, { "epoch": 9.067915186739857, "grad_norm": 0.898429811000824, "learning_rate": 5.619222066587652e-05, "loss": 3.1151, "step": 84250 }, { "epoch": 9.073296738779463, "grad_norm": 0.9140533208847046, "learning_rate": 5.58689796358151e-05, "loss": 3.1487, "step": 84300 }, { "epoch": 9.078678290819072, "grad_norm": 0.9005162119865417, "learning_rate": 5.554573860575369e-05, "loss": 3.1276, "step": 84350 }, { "epoch": 9.08405984285868, "grad_norm": 0.9215816855430603, "learning_rate": 5.522249757569227e-05, "loss": 3.1322, "step": 84400 }, { "epoch": 9.089441394898289, "grad_norm": 0.9173760414123535, "learning_rate": 5.489925654563085e-05, "loss": 3.1239, "step": 84450 }, { "epoch": 9.094822946937898, "grad_norm": 0.8493995070457458, "learning_rate": 5.457601551556944e-05, "loss": 3.1223, "step": 84500 }, { "epoch": 9.100204498977504, "grad_norm": 0.8942146897315979, "learning_rate": 5.425277448550802e-05, "loss": 3.144, "step": 84550 }, { "epoch": 9.105586051017113, "grad_norm": 0.9167282581329346, "learning_rate": 5.392953345544661e-05, "loss": 3.1309, "step": 84600 }, { "epoch": 9.110967603056721, "grad_norm": 0.9377874732017517, "learning_rate": 5.360629242538519e-05, "loss": 3.1163, "step": 84650 }, { "epoch": 9.11634915509633, "grad_norm": 0.8942631483078003, "learning_rate": 5.328305139532377e-05, "loss": 3.112, "step": 84700 }, { "epoch": 9.121730707135939, "grad_norm": 0.8881064653396606, "learning_rate": 5.295981036526236e-05, "loss": 3.121, "step": 84750 }, { "epoch": 9.127112259175545, "grad_norm": 0.9246423244476318, "learning_rate": 5.263656933520094e-05, "loss": 3.1307, "step": 84800 }, { "epoch": 9.132493811215154, "grad_norm": 0.8789636492729187, "learning_rate": 5.231332830513953e-05, "loss": 3.1251, "step": 84850 }, { "epoch": 9.137875363254762, "grad_norm": 0.9263179302215576, "learning_rate": 5.199008727507811e-05, "loss": 3.1385, "step": 84900 }, { "epoch": 9.143256915294371, "grad_norm": 0.9350448846817017, "learning_rate": 5.166684624501669e-05, "loss": 3.1264, "step": 84950 }, { "epoch": 9.14863846733398, "grad_norm": 0.9025295376777649, "learning_rate": 5.1343605214955286e-05, "loss": 3.1258, "step": 85000 }, { "epoch": 9.14863846733398, "eval_accuracy": 0.39295207831787077, "eval_loss": 3.3131446838378906, "eval_runtime": 184.5599, "eval_samples_per_second": 97.589, "eval_steps_per_second": 6.101, "step": 85000 }, { "epoch": 9.154020019373588, "grad_norm": 0.9414235949516296, "learning_rate": 5.102036418489387e-05, "loss": 3.1233, "step": 85050 }, { "epoch": 9.159401571413195, "grad_norm": 0.8915159106254578, "learning_rate": 5.0697123154832455e-05, "loss": 3.1198, "step": 85100 }, { "epoch": 9.164783123452803, "grad_norm": 0.9033054709434509, "learning_rate": 5.0373882124771036e-05, "loss": 3.1274, "step": 85150 }, { "epoch": 9.170164675492412, "grad_norm": 0.8779189586639404, "learning_rate": 5.005064109470962e-05, "loss": 3.1229, "step": 85200 }, { "epoch": 9.17554622753202, "grad_norm": 0.8858757019042969, "learning_rate": 4.9727400064648205e-05, "loss": 3.1166, "step": 85250 }, { "epoch": 9.180927779571629, "grad_norm": 0.8739831447601318, "learning_rate": 4.9404159034586786e-05, "loss": 3.1356, "step": 85300 }, { "epoch": 9.186309331611236, "grad_norm": 0.9114834666252136, "learning_rate": 4.9080918004525374e-05, "loss": 3.1189, "step": 85350 }, { "epoch": 9.191690883650844, "grad_norm": 0.9239445924758911, "learning_rate": 4.876414179506518e-05, "loss": 3.145, "step": 85400 }, { "epoch": 9.197072435690453, "grad_norm": 0.9356056451797485, "learning_rate": 4.844090076500377e-05, "loss": 3.1225, "step": 85450 }, { "epoch": 9.202453987730062, "grad_norm": 0.9145978689193726, "learning_rate": 4.811765973494235e-05, "loss": 3.1418, "step": 85500 }, { "epoch": 9.20783553976967, "grad_norm": 0.9663805961608887, "learning_rate": 4.779441870488094e-05, "loss": 3.1302, "step": 85550 }, { "epoch": 9.213217091809279, "grad_norm": 0.8993372321128845, "learning_rate": 4.747117767481952e-05, "loss": 3.1292, "step": 85600 }, { "epoch": 9.218598643848885, "grad_norm": 0.8745630979537964, "learning_rate": 4.71479366447581e-05, "loss": 3.1227, "step": 85650 }, { "epoch": 9.223980195888494, "grad_norm": 0.86039137840271, "learning_rate": 4.682469561469669e-05, "loss": 3.1389, "step": 85700 }, { "epoch": 9.229361747928102, "grad_norm": 0.9199390411376953, "learning_rate": 4.650145458463527e-05, "loss": 3.1364, "step": 85750 }, { "epoch": 9.234743299967711, "grad_norm": 0.9464226365089417, "learning_rate": 4.6178213554573856e-05, "loss": 3.1197, "step": 85800 }, { "epoch": 9.24012485200732, "grad_norm": 0.90545254945755, "learning_rate": 4.585497252451244e-05, "loss": 3.1347, "step": 85850 }, { "epoch": 9.245506404046926, "grad_norm": 0.943743109703064, "learning_rate": 4.553173149445102e-05, "loss": 3.1428, "step": 85900 }, { "epoch": 9.250887956086535, "grad_norm": 0.9059910774230957, "learning_rate": 4.520849046438961e-05, "loss": 3.1222, "step": 85950 }, { "epoch": 9.256269508126143, "grad_norm": 0.9065704941749573, "learning_rate": 4.488524943432819e-05, "loss": 3.1525, "step": 86000 }, { "epoch": 9.256269508126143, "eval_accuracy": 0.39329553009625234, "eval_loss": 3.3110220432281494, "eval_runtime": 184.9835, "eval_samples_per_second": 97.365, "eval_steps_per_second": 6.087, "step": 86000 }, { "epoch": 9.261651060165752, "grad_norm": 0.9051395654678345, "learning_rate": 4.456200840426678e-05, "loss": 3.1265, "step": 86050 }, { "epoch": 9.26703261220536, "grad_norm": 0.8704345226287842, "learning_rate": 4.4238767374205363e-05, "loss": 3.1381, "step": 86100 }, { "epoch": 9.272414164244967, "grad_norm": 0.9406769871711731, "learning_rate": 4.3915526344143945e-05, "loss": 3.1278, "step": 86150 }, { "epoch": 9.277795716284576, "grad_norm": 0.858677864074707, "learning_rate": 4.359228531408253e-05, "loss": 3.1111, "step": 86200 }, { "epoch": 9.283177268324184, "grad_norm": 0.9546279311180115, "learning_rate": 4.3269044284021114e-05, "loss": 3.1253, "step": 86250 }, { "epoch": 9.288558820363793, "grad_norm": 0.8848311901092529, "learning_rate": 4.29458032539597e-05, "loss": 3.1237, "step": 86300 }, { "epoch": 9.293940372403402, "grad_norm": 0.9586207270622253, "learning_rate": 4.262256222389828e-05, "loss": 3.1308, "step": 86350 }, { "epoch": 9.29932192444301, "grad_norm": 0.876517653465271, "learning_rate": 4.229932119383687e-05, "loss": 3.1307, "step": 86400 }, { "epoch": 9.304703476482617, "grad_norm": 0.8859730958938599, "learning_rate": 4.197608016377545e-05, "loss": 3.1464, "step": 86450 }, { "epoch": 9.310085028522225, "grad_norm": 0.8714919090270996, "learning_rate": 4.165283913371403e-05, "loss": 3.1382, "step": 86500 }, { "epoch": 9.315466580561834, "grad_norm": 0.936630368232727, "learning_rate": 4.132959810365262e-05, "loss": 3.1208, "step": 86550 }, { "epoch": 9.320848132601443, "grad_norm": 0.8881075382232666, "learning_rate": 4.10063570735912e-05, "loss": 3.1199, "step": 86600 }, { "epoch": 9.326229684641051, "grad_norm": 0.9490821361541748, "learning_rate": 4.068311604352979e-05, "loss": 3.1226, "step": 86650 }, { "epoch": 9.331611236680658, "grad_norm": 0.9327830672264099, "learning_rate": 4.035987501346837e-05, "loss": 3.1466, "step": 86700 }, { "epoch": 9.336992788720266, "grad_norm": 0.9075290560722351, "learning_rate": 4.003663398340695e-05, "loss": 3.1365, "step": 86750 }, { "epoch": 9.342374340759875, "grad_norm": 0.8921899199485779, "learning_rate": 3.9713392953345546e-05, "loss": 3.1491, "step": 86800 }, { "epoch": 9.347755892799483, "grad_norm": 0.9279853105545044, "learning_rate": 3.939015192328413e-05, "loss": 3.1321, "step": 86850 }, { "epoch": 9.353137444839092, "grad_norm": 0.9148728847503662, "learning_rate": 3.9066910893222715e-05, "loss": 3.1337, "step": 86900 }, { "epoch": 9.3585189968787, "grad_norm": 0.907360315322876, "learning_rate": 3.8743669863161296e-05, "loss": 3.1314, "step": 86950 }, { "epoch": 9.363900548918307, "grad_norm": 0.9280900359153748, "learning_rate": 3.842042883309988e-05, "loss": 3.1264, "step": 87000 }, { "epoch": 9.363900548918307, "eval_accuracy": 0.39339614267036854, "eval_loss": 3.3104796409606934, "eval_runtime": 184.6531, "eval_samples_per_second": 97.54, "eval_steps_per_second": 6.098, "step": 87000 }, { "epoch": 9.369282100957916, "grad_norm": 0.8657009601593018, "learning_rate": 3.8097187803038465e-05, "loss": 3.1423, "step": 87050 }, { "epoch": 9.374663652997524, "grad_norm": 0.8895671367645264, "learning_rate": 3.7773946772977047e-05, "loss": 3.11, "step": 87100 }, { "epoch": 9.380045205037133, "grad_norm": 0.8989640474319458, "learning_rate": 3.745070574291563e-05, "loss": 3.1225, "step": 87150 }, { "epoch": 9.385426757076742, "grad_norm": 0.9100672006607056, "learning_rate": 3.7127464712854216e-05, "loss": 3.1423, "step": 87200 }, { "epoch": 9.390808309116348, "grad_norm": 0.883725106716156, "learning_rate": 3.6804223682792803e-05, "loss": 3.1252, "step": 87250 }, { "epoch": 9.396189861155957, "grad_norm": 0.9332802891731262, "learning_rate": 3.6480982652731385e-05, "loss": 3.1588, "step": 87300 }, { "epoch": 9.401571413195565, "grad_norm": 0.878340482711792, "learning_rate": 3.6157741622669966e-05, "loss": 3.1199, "step": 87350 }, { "epoch": 9.406952965235174, "grad_norm": 0.8933216333389282, "learning_rate": 3.5834500592608554e-05, "loss": 3.1312, "step": 87400 }, { "epoch": 9.412334517274783, "grad_norm": 0.9225236177444458, "learning_rate": 3.5511259562547135e-05, "loss": 3.1305, "step": 87450 }, { "epoch": 9.417716069314391, "grad_norm": 0.8944328427314758, "learning_rate": 3.519448335308695e-05, "loss": 3.1197, "step": 87500 }, { "epoch": 9.423097621353998, "grad_norm": 0.8815526366233826, "learning_rate": 3.4871242323025536e-05, "loss": 3.1282, "step": 87550 }, { "epoch": 9.428479173393606, "grad_norm": 0.947081446647644, "learning_rate": 3.454800129296412e-05, "loss": 3.1075, "step": 87600 }, { "epoch": 9.433860725433215, "grad_norm": 0.9011362791061401, "learning_rate": 3.42247602629027e-05, "loss": 3.1441, "step": 87650 }, { "epoch": 9.439242277472824, "grad_norm": 0.8931843042373657, "learning_rate": 3.3901519232841286e-05, "loss": 3.1562, "step": 87700 }, { "epoch": 9.444623829512432, "grad_norm": 0.8832892179489136, "learning_rate": 3.3578278202779874e-05, "loss": 3.1324, "step": 87750 }, { "epoch": 9.450005381552039, "grad_norm": 0.9089193344116211, "learning_rate": 3.3255037172718455e-05, "loss": 3.1315, "step": 87800 }, { "epoch": 9.455386933591647, "grad_norm": 1.5505813360214233, "learning_rate": 3.2931796142657036e-05, "loss": 3.1245, "step": 87850 }, { "epoch": 9.460768485631256, "grad_norm": 0.8742733001708984, "learning_rate": 3.2608555112595624e-05, "loss": 3.1338, "step": 87900 }, { "epoch": 9.466150037670864, "grad_norm": 0.9426006078720093, "learning_rate": 3.2285314082534205e-05, "loss": 3.1198, "step": 87950 }, { "epoch": 9.471531589710473, "grad_norm": 0.9033601880073547, "learning_rate": 3.196207305247279e-05, "loss": 3.1522, "step": 88000 }, { "epoch": 9.471531589710473, "eval_accuracy": 0.39373003299462245, "eval_loss": 3.30660080909729, "eval_runtime": 184.6201, "eval_samples_per_second": 97.557, "eval_steps_per_second": 6.099, "step": 88000 }, { "epoch": 9.476913141750082, "grad_norm": 0.8803892731666565, "learning_rate": 3.1638832022411374e-05, "loss": 3.1347, "step": 88050 }, { "epoch": 9.482294693789688, "grad_norm": 0.9048923254013062, "learning_rate": 3.1315590992349955e-05, "loss": 3.1199, "step": 88100 }, { "epoch": 9.487676245829297, "grad_norm": 0.9061964154243469, "learning_rate": 3.099234996228854e-05, "loss": 3.1378, "step": 88150 }, { "epoch": 9.493057797868905, "grad_norm": 0.9181330800056458, "learning_rate": 3.066910893222713e-05, "loss": 3.131, "step": 88200 }, { "epoch": 9.498439349908514, "grad_norm": 0.9095847010612488, "learning_rate": 3.0345867902165712e-05, "loss": 3.1291, "step": 88250 }, { "epoch": 9.503820901948123, "grad_norm": 0.9039878249168396, "learning_rate": 3.00226268721043e-05, "loss": 3.1375, "step": 88300 }, { "epoch": 9.50920245398773, "grad_norm": 0.9264708757400513, "learning_rate": 2.9699385842042878e-05, "loss": 3.1186, "step": 88350 }, { "epoch": 9.514584006027338, "grad_norm": 0.9030917882919312, "learning_rate": 2.9376144811981465e-05, "loss": 3.1325, "step": 88400 }, { "epoch": 9.519965558066946, "grad_norm": 0.9015549421310425, "learning_rate": 2.9059368602521275e-05, "loss": 3.1507, "step": 88450 }, { "epoch": 9.525347110106555, "grad_norm": 0.9642078280448914, "learning_rate": 2.8736127572459863e-05, "loss": 3.1393, "step": 88500 }, { "epoch": 9.530728662146164, "grad_norm": 0.8946633338928223, "learning_rate": 2.8412886542398448e-05, "loss": 3.1269, "step": 88550 }, { "epoch": 9.536110214185772, "grad_norm": 0.9017453789710999, "learning_rate": 2.8089645512337032e-05, "loss": 3.1265, "step": 88600 }, { "epoch": 9.541491766225379, "grad_norm": 0.8839432001113892, "learning_rate": 2.7766404482275613e-05, "loss": 3.1254, "step": 88650 }, { "epoch": 9.546873318264987, "grad_norm": 0.9194327592849731, "learning_rate": 2.7443163452214198e-05, "loss": 3.1311, "step": 88700 }, { "epoch": 9.552254870304596, "grad_norm": 0.9012552499771118, "learning_rate": 2.7119922422152782e-05, "loss": 3.1302, "step": 88750 }, { "epoch": 9.557636422344205, "grad_norm": 0.894688606262207, "learning_rate": 2.6796681392091367e-05, "loss": 3.1381, "step": 88800 }, { "epoch": 9.563017974383813, "grad_norm": 0.8723885416984558, "learning_rate": 2.6473440362029955e-05, "loss": 3.1406, "step": 88850 }, { "epoch": 9.56839952642342, "grad_norm": 0.914779782295227, "learning_rate": 2.6150199331968536e-05, "loss": 3.1384, "step": 88900 }, { "epoch": 9.573781078463028, "grad_norm": 0.9142804145812988, "learning_rate": 2.582695830190712e-05, "loss": 3.1213, "step": 88950 }, { "epoch": 9.579162630502637, "grad_norm": 0.9292569756507874, "learning_rate": 2.5503717271845705e-05, "loss": 3.1493, "step": 89000 }, { "epoch": 9.579162630502637, "eval_accuracy": 0.39403013227076605, "eval_loss": 3.303948163986206, "eval_runtime": 184.7384, "eval_samples_per_second": 97.495, "eval_steps_per_second": 6.095, "step": 89000 }, { "epoch": 9.584544182542245, "grad_norm": 0.8625149130821228, "learning_rate": 2.518047624178429e-05, "loss": 3.1315, "step": 89050 }, { "epoch": 9.589925734581854, "grad_norm": 0.9564598798751831, "learning_rate": 2.4857235211722874e-05, "loss": 3.1362, "step": 89100 }, { "epoch": 9.59530728662146, "grad_norm": 0.9190493226051331, "learning_rate": 2.4533994181661455e-05, "loss": 3.12, "step": 89150 }, { "epoch": 9.60068883866107, "grad_norm": 0.8848850131034851, "learning_rate": 2.421075315160004e-05, "loss": 3.1241, "step": 89200 }, { "epoch": 9.606070390700678, "grad_norm": 1.0095571279525757, "learning_rate": 2.3887512121538624e-05, "loss": 3.145, "step": 89250 }, { "epoch": 9.611451942740286, "grad_norm": 0.8862415552139282, "learning_rate": 2.3564271091477212e-05, "loss": 3.1304, "step": 89300 }, { "epoch": 9.616833494779895, "grad_norm": 0.8857293725013733, "learning_rate": 2.3241030061415796e-05, "loss": 3.14, "step": 89350 }, { "epoch": 9.622215046819504, "grad_norm": 0.887195885181427, "learning_rate": 2.2917789031354377e-05, "loss": 3.1439, "step": 89400 }, { "epoch": 9.62759659885911, "grad_norm": 0.8865780234336853, "learning_rate": 2.2594548001292962e-05, "loss": 3.1397, "step": 89450 }, { "epoch": 9.632978150898719, "grad_norm": 0.9336400628089905, "learning_rate": 2.2271306971231546e-05, "loss": 3.1413, "step": 89500 }, { "epoch": 9.638359702938327, "grad_norm": 0.9491809606552124, "learning_rate": 2.194806594117013e-05, "loss": 3.1541, "step": 89550 }, { "epoch": 9.643741254977936, "grad_norm": 0.9103695154190063, "learning_rate": 2.1624824911108715e-05, "loss": 3.1467, "step": 89600 }, { "epoch": 9.649122807017545, "grad_norm": 0.9001594185829163, "learning_rate": 2.1301583881047296e-05, "loss": 3.1282, "step": 89650 }, { "epoch": 9.654504359057151, "grad_norm": 0.9728744029998779, "learning_rate": 2.097834285098588e-05, "loss": 3.1277, "step": 89700 }, { "epoch": 9.65988591109676, "grad_norm": 0.9372629523277283, "learning_rate": 2.065510182092447e-05, "loss": 3.1457, "step": 89750 }, { "epoch": 9.665267463136368, "grad_norm": 0.9052270650863647, "learning_rate": 2.0331860790863053e-05, "loss": 3.1316, "step": 89800 }, { "epoch": 9.670649015175977, "grad_norm": 0.9259636402130127, "learning_rate": 2.0008619760801638e-05, "loss": 3.1415, "step": 89850 }, { "epoch": 9.676030567215586, "grad_norm": 0.886457622051239, "learning_rate": 1.968537873074022e-05, "loss": 3.1369, "step": 89900 }, { "epoch": 9.681412119255192, "grad_norm": 0.9304108023643494, "learning_rate": 1.9362137700678803e-05, "loss": 3.1268, "step": 89950 }, { "epoch": 9.6867936712948, "grad_norm": 1.5604965686798096, "learning_rate": 1.9038896670617388e-05, "loss": 3.1222, "step": 90000 }, { "epoch": 9.6867936712948, "eval_accuracy": 0.3941440004971956, "eval_loss": 3.3028476238250732, "eval_runtime": 184.8637, "eval_samples_per_second": 97.429, "eval_steps_per_second": 6.091, "step": 90000 }, { "epoch": 9.69217522333441, "grad_norm": 0.9006509184837341, "learning_rate": 1.8715655640555972e-05, "loss": 3.1474, "step": 90050 }, { "epoch": 9.697556775374018, "grad_norm": 0.9110347032546997, "learning_rate": 1.8392414610494557e-05, "loss": 3.1244, "step": 90100 }, { "epoch": 9.702938327413626, "grad_norm": 0.9055619239807129, "learning_rate": 1.806917358043314e-05, "loss": 3.1425, "step": 90150 }, { "epoch": 9.708319879453235, "grad_norm": 0.864754319190979, "learning_rate": 1.7745932550371726e-05, "loss": 3.1258, "step": 90200 }, { "epoch": 9.713701431492842, "grad_norm": 0.9538236260414124, "learning_rate": 1.742269152031031e-05, "loss": 3.1427, "step": 90250 }, { "epoch": 9.71908298353245, "grad_norm": 0.8747223615646362, "learning_rate": 1.7099450490248895e-05, "loss": 3.1304, "step": 90300 }, { "epoch": 9.724464535572059, "grad_norm": 0.9114173650741577, "learning_rate": 1.677620946018748e-05, "loss": 3.1236, "step": 90350 }, { "epoch": 9.729846087611667, "grad_norm": 0.8518925905227661, "learning_rate": 1.6452968430126064e-05, "loss": 3.1251, "step": 90400 }, { "epoch": 9.735227639651276, "grad_norm": 0.9126296043395996, "learning_rate": 1.6129727400064645e-05, "loss": 3.1233, "step": 90450 }, { "epoch": 9.740609191690883, "grad_norm": 0.8861443996429443, "learning_rate": 1.580648637000323e-05, "loss": 3.1365, "step": 90500 }, { "epoch": 9.745990743730491, "grad_norm": 0.8633650541305542, "learning_rate": 1.5483245339941817e-05, "loss": 3.1337, "step": 90550 }, { "epoch": 9.7513722957701, "grad_norm": 0.951790452003479, "learning_rate": 1.5160004309880398e-05, "loss": 3.1405, "step": 90600 }, { "epoch": 9.756753847809708, "grad_norm": 0.8869327306747437, "learning_rate": 1.4836763279818985e-05, "loss": 3.1319, "step": 90650 }, { "epoch": 9.762135399849317, "grad_norm": 0.8825336694717407, "learning_rate": 1.4513522249757567e-05, "loss": 3.13, "step": 90700 }, { "epoch": 9.767516951888926, "grad_norm": 0.906427264213562, "learning_rate": 1.4190281219696152e-05, "loss": 3.1501, "step": 90750 }, { "epoch": 9.772898503928532, "grad_norm": 0.9699810147285461, "learning_rate": 1.3867040189634736e-05, "loss": 3.134, "step": 90800 }, { "epoch": 9.77828005596814, "grad_norm": 0.8849841952323914, "learning_rate": 1.3543799159573321e-05, "loss": 3.1346, "step": 90850 }, { "epoch": 9.78366160800775, "grad_norm": 0.9202008843421936, "learning_rate": 1.3220558129511905e-05, "loss": 3.1392, "step": 90900 }, { "epoch": 9.789043160047358, "grad_norm": 0.9104524254798889, "learning_rate": 1.2897317099450488e-05, "loss": 3.1384, "step": 90950 }, { "epoch": 9.794424712086967, "grad_norm": 0.9032725095748901, "learning_rate": 1.2574076069389073e-05, "loss": 3.1375, "step": 91000 }, { "epoch": 9.794424712086967, "eval_accuracy": 0.39456611696635485, "eval_loss": 3.3005428314208984, "eval_runtime": 184.4783, "eval_samples_per_second": 97.632, "eval_steps_per_second": 6.104, "step": 91000 }, { "epoch": 9.799806264126573, "grad_norm": 0.9103838205337524, "learning_rate": 1.2250835039327659e-05, "loss": 3.1205, "step": 91050 }, { "epoch": 9.805187816166182, "grad_norm": 0.8900352120399475, "learning_rate": 1.1927594009266242e-05, "loss": 3.1186, "step": 91100 }, { "epoch": 9.81056936820579, "grad_norm": 0.9095427393913269, "learning_rate": 1.1604352979204826e-05, "loss": 3.1354, "step": 91150 }, { "epoch": 9.815950920245399, "grad_norm": 0.910943329334259, "learning_rate": 1.1281111949143409e-05, "loss": 3.1302, "step": 91200 }, { "epoch": 9.821332472285007, "grad_norm": 0.899513304233551, "learning_rate": 1.0957870919081995e-05, "loss": 3.1446, "step": 91250 }, { "epoch": 9.826714024324616, "grad_norm": 0.9401593804359436, "learning_rate": 1.063462988902058e-05, "loss": 3.1492, "step": 91300 }, { "epoch": 9.832095576364223, "grad_norm": 0.9215407967567444, "learning_rate": 1.0311388858959162e-05, "loss": 3.1322, "step": 91350 }, { "epoch": 9.837477128403831, "grad_norm": 0.8816438317298889, "learning_rate": 9.988147828897747e-06, "loss": 3.1333, "step": 91400 }, { "epoch": 9.84285868044344, "grad_norm": 0.9174292683601379, "learning_rate": 9.66490679883633e-06, "loss": 3.1206, "step": 91450 }, { "epoch": 9.848240232483048, "grad_norm": 0.8972243666648865, "learning_rate": 9.341665768774916e-06, "loss": 3.1213, "step": 91500 }, { "epoch": 9.853621784522657, "grad_norm": 0.8909099102020264, "learning_rate": 9.018424738713499e-06, "loss": 3.1341, "step": 91550 }, { "epoch": 9.859003336562264, "grad_norm": 0.8665739893913269, "learning_rate": 8.695183708652085e-06, "loss": 3.1364, "step": 91600 }, { "epoch": 9.864384888601872, "grad_norm": 0.8592131733894348, "learning_rate": 8.37194267859067e-06, "loss": 3.1238, "step": 91650 }, { "epoch": 9.869766440641481, "grad_norm": 0.9557643532752991, "learning_rate": 8.048701648529252e-06, "loss": 3.1298, "step": 91700 }, { "epoch": 9.87514799268109, "grad_norm": 0.8807770609855652, "learning_rate": 7.725460618467837e-06, "loss": 3.1426, "step": 91750 }, { "epoch": 9.880529544720698, "grad_norm": 0.9055199027061462, "learning_rate": 7.40221958840642e-06, "loss": 3.1585, "step": 91800 }, { "epoch": 9.885911096760307, "grad_norm": 0.9519686102867126, "learning_rate": 7.078978558345006e-06, "loss": 3.1366, "step": 91850 }, { "epoch": 9.891292648799913, "grad_norm": 0.8609046936035156, "learning_rate": 6.75573752828359e-06, "loss": 3.1341, "step": 91900 }, { "epoch": 9.896674200839522, "grad_norm": 0.8886732459068298, "learning_rate": 6.432496498222174e-06, "loss": 3.1366, "step": 91950 }, { "epoch": 9.90205575287913, "grad_norm": 0.8871758580207825, "learning_rate": 6.1092554681607575e-06, "loss": 3.1266, "step": 92000 }, { "epoch": 9.90205575287913, "eval_accuracy": 0.3946833534323067, "eval_loss": 3.2989983558654785, "eval_runtime": 184.7106, "eval_samples_per_second": 97.509, "eval_steps_per_second": 6.096, "step": 92000 }, { "epoch": 9.907437304918739, "grad_norm": 0.9213749766349792, "learning_rate": 5.786014438099342e-06, "loss": 3.1351, "step": 92050 }, { "epoch": 9.912818856958348, "grad_norm": 0.8700889945030212, "learning_rate": 5.4627734080379264e-06, "loss": 3.1267, "step": 92100 }, { "epoch": 9.918200408997954, "grad_norm": 0.901222825050354, "learning_rate": 5.139532377976511e-06, "loss": 3.1486, "step": 92150 }, { "epoch": 9.923581961037563, "grad_norm": 0.8922194838523865, "learning_rate": 4.816291347915095e-06, "loss": 3.1234, "step": 92200 }, { "epoch": 9.928963513077171, "grad_norm": 0.876013994216919, "learning_rate": 4.493050317853679e-06, "loss": 3.1366, "step": 92250 }, { "epoch": 9.93434506511678, "grad_norm": 0.8912842869758606, "learning_rate": 4.169809287792264e-06, "loss": 3.1165, "step": 92300 }, { "epoch": 9.939726617156388, "grad_norm": 0.967231035232544, "learning_rate": 3.846568257730847e-06, "loss": 3.1164, "step": 92350 }, { "epoch": 9.945108169195997, "grad_norm": 0.9233632683753967, "learning_rate": 3.523327227669432e-06, "loss": 3.1274, "step": 92400 }, { "epoch": 9.950489721235604, "grad_norm": 0.8899291157722473, "learning_rate": 3.2000861976080162e-06, "loss": 3.1119, "step": 92450 }, { "epoch": 9.955871273275212, "grad_norm": 0.8986747860908508, "learning_rate": 2.8768451675466007e-06, "loss": 3.1101, "step": 92500 }, { "epoch": 9.961252825314821, "grad_norm": 0.8942649364471436, "learning_rate": 2.5536041374851848e-06, "loss": 3.1362, "step": 92550 }, { "epoch": 9.96663437735443, "grad_norm": 0.8900682926177979, "learning_rate": 2.230363107423769e-06, "loss": 3.1236, "step": 92600 }, { "epoch": 9.972015929394038, "grad_norm": 0.9322834014892578, "learning_rate": 1.9071220773623531e-06, "loss": 3.1256, "step": 92650 }, { "epoch": 9.977397481433645, "grad_norm": 0.9146674275398254, "learning_rate": 1.5838810473009372e-06, "loss": 3.1287, "step": 92700 }, { "epoch": 9.982779033473253, "grad_norm": 0.9451167583465576, "learning_rate": 1.2606400172395215e-06, "loss": 3.1258, "step": 92750 }, { "epoch": 9.988160585512862, "grad_norm": 0.8404658436775208, "learning_rate": 9.373989871781058e-07, "loss": 3.1275, "step": 92800 }, { "epoch": 9.99354213755247, "grad_norm": 0.9016643166542053, "learning_rate": 6.1415795711669e-07, "loss": 3.1313, "step": 92850 }, { "epoch": 9.998923689592079, "grad_norm": 0.8875418901443481, "learning_rate": 2.909169270552742e-07, "loss": 3.1211, "step": 92900 }, { "epoch": 10.0, "step": 92910, "total_flos": 7.7682904989696e+17, "train_loss": 3.4555358953643265, "train_runtime": 79991.7543, "train_samples_per_second": 37.167, "train_steps_per_second": 1.161 } ], "logging_steps": 50, "max_steps": 92910, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.7682904989696e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }