{ "best_metric": 3.3024492263793945, "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_8397/checkpoint-90000", "epoch": 10.0, "eval_steps": 1000, "global_step": 92910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005381552039608223, "grad_norm": 1.2619380950927734, "learning_rate": 0.00028799999999999995, "loss": 8.8577, "step": 50 }, { "epoch": 0.010763104079216447, "grad_norm": 1.4157741069793701, "learning_rate": 0.000588, "loss": 6.8987, "step": 100 }, { "epoch": 0.01614465611882467, "grad_norm": 1.7850571870803833, "learning_rate": 0.000599689688611141, "loss": 6.4484, "step": 150 }, { "epoch": 0.021526208158432893, "grad_norm": 2.0074565410614014, "learning_rate": 0.0005993664475810796, "loss": 6.2377, "step": 200 }, { "epoch": 0.026907760198041114, "grad_norm": 1.4940956830978394, "learning_rate": 0.0005990432065510182, "loss": 6.0848, "step": 250 }, { "epoch": 0.03228931223764934, "grad_norm": 1.8901022672653198, "learning_rate": 0.0005987199655209567, "loss": 5.9665, "step": 300 }, { "epoch": 0.03767086427725756, "grad_norm": 1.3828638792037964, "learning_rate": 0.0005983967244908953, "loss": 5.9129, "step": 350 }, { "epoch": 0.04305241631686579, "grad_norm": 2.4580698013305664, "learning_rate": 0.0005980734834608338, "loss": 5.8175, "step": 400 }, { "epoch": 0.048433968356474004, "grad_norm": 1.11954665184021, "learning_rate": 0.0005977502424307725, "loss": 5.7469, "step": 450 }, { "epoch": 0.05381552039608223, "grad_norm": 2.018273115158081, "learning_rate": 0.0005974270014007111, "loss": 5.6686, "step": 500 }, { "epoch": 0.05919707243569045, "grad_norm": 1.5541627407073975, "learning_rate": 0.0005971037603706497, "loss": 5.588, "step": 550 }, { "epoch": 0.06457862447529868, "grad_norm": 1.323858380317688, "learning_rate": 0.0005967805193405882, "loss": 5.5044, "step": 600 }, { "epoch": 0.0699601765149069, "grad_norm": 1.0239312648773193, "learning_rate": 0.0005964572783105269, "loss": 5.4518, "step": 650 }, { "epoch": 0.07534172855451512, "grad_norm": 1.7311418056488037, "learning_rate": 0.0005961340372804654, "loss": 5.4031, "step": 700 }, { "epoch": 0.08072328059412334, "grad_norm": 1.5343774557113647, "learning_rate": 0.000595810796250404, "loss": 5.3123, "step": 750 }, { "epoch": 0.08610483263373157, "grad_norm": 0.9552584886550903, "learning_rate": 0.0005954875552203426, "loss": 5.2662, "step": 800 }, { "epoch": 0.09148638467333979, "grad_norm": 1.1131244897842407, "learning_rate": 0.0005951643141902811, "loss": 5.2001, "step": 850 }, { "epoch": 0.09686793671294801, "grad_norm": 0.9765788912773132, "learning_rate": 0.0005948410731602198, "loss": 5.1525, "step": 900 }, { "epoch": 0.10224948875255624, "grad_norm": 1.7889015674591064, "learning_rate": 0.0005945178321301583, "loss": 5.1081, "step": 950 }, { "epoch": 0.10763104079216446, "grad_norm": 1.146470308303833, "learning_rate": 0.000594194591100097, "loss": 5.0805, "step": 1000 }, { "epoch": 0.10763104079216446, "eval_accuracy": 0.22736985801566578, "eval_loss": 5.018322944641113, "eval_runtime": 185.6294, "eval_samples_per_second": 97.027, "eval_steps_per_second": 6.066, "step": 1000 }, { "epoch": 0.11301259283177269, "grad_norm": 1.156062364578247, "learning_rate": 0.0005938713500700355, "loss": 5.0496, "step": 1050 }, { "epoch": 0.1183941448713809, "grad_norm": 1.568895936012268, "learning_rate": 0.000593548109039974, "loss": 4.9916, "step": 1100 }, { "epoch": 0.12377569691098914, "grad_norm": 1.2984389066696167, "learning_rate": 0.0005932248680099127, "loss": 4.9957, "step": 1150 }, { "epoch": 0.12915724895059735, "grad_norm": 1.0946718454360962, "learning_rate": 0.0005929016269798512, "loss": 4.9453, "step": 1200 }, { "epoch": 0.13453880099020557, "grad_norm": 1.2031631469726562, "learning_rate": 0.0005925783859497898, "loss": 4.9155, "step": 1250 }, { "epoch": 0.1399203530298138, "grad_norm": 1.1124569177627563, "learning_rate": 0.0005922551449197284, "loss": 4.8997, "step": 1300 }, { "epoch": 0.14530190506942203, "grad_norm": 1.1879574060440063, "learning_rate": 0.0005919319038896671, "loss": 4.8547, "step": 1350 }, { "epoch": 0.15068345710903025, "grad_norm": 1.0509369373321533, "learning_rate": 0.0005916086628596056, "loss": 4.8307, "step": 1400 }, { "epoch": 0.15606500914863847, "grad_norm": 0.7919548153877258, "learning_rate": 0.0005912854218295442, "loss": 4.8545, "step": 1450 }, { "epoch": 0.16144656118824668, "grad_norm": 1.0027759075164795, "learning_rate": 0.0005909621807994827, "loss": 4.7745, "step": 1500 }, { "epoch": 0.1668281132278549, "grad_norm": 0.8752911686897278, "learning_rate": 0.0005906389397694213, "loss": 4.7652, "step": 1550 }, { "epoch": 0.17220966526746315, "grad_norm": 1.3122471570968628, "learning_rate": 0.00059031569873936, "loss": 4.7308, "step": 1600 }, { "epoch": 0.17759121730707136, "grad_norm": 1.111213207244873, "learning_rate": 0.0005899924577092985, "loss": 4.7252, "step": 1650 }, { "epoch": 0.18297276934667958, "grad_norm": 1.1256006956100464, "learning_rate": 0.0005896692166792371, "loss": 4.7047, "step": 1700 }, { "epoch": 0.1883543213862878, "grad_norm": 0.8078550100326538, "learning_rate": 0.0005893459756491757, "loss": 4.6808, "step": 1750 }, { "epoch": 0.19373587342589602, "grad_norm": 0.9641079902648926, "learning_rate": 0.0005890227346191143, "loss": 4.6419, "step": 1800 }, { "epoch": 0.19911742546550426, "grad_norm": 0.814166247844696, "learning_rate": 0.0005886994935890529, "loss": 4.6347, "step": 1850 }, { "epoch": 0.20449897750511248, "grad_norm": 0.7179664969444275, "learning_rate": 0.0005883762525589915, "loss": 4.6288, "step": 1900 }, { "epoch": 0.2098805295447207, "grad_norm": 0.7809556126594543, "learning_rate": 0.00058805301152893, "loss": 4.6272, "step": 1950 }, { "epoch": 0.2152620815843289, "grad_norm": 0.8820845484733582, "learning_rate": 0.0005877297704988686, "loss": 4.5851, "step": 2000 }, { "epoch": 0.2152620815843289, "eval_accuracy": 0.27214277945604237, "eval_loss": 4.502532958984375, "eval_runtime": 184.1309, "eval_samples_per_second": 97.816, "eval_steps_per_second": 6.115, "step": 2000 }, { "epoch": 0.22064363362393713, "grad_norm": 0.9190576672554016, "learning_rate": 0.0005874065294688072, "loss": 4.5677, "step": 2050 }, { "epoch": 0.22602518566354537, "grad_norm": 0.8735911846160889, "learning_rate": 0.0005870832884387457, "loss": 4.5348, "step": 2100 }, { "epoch": 0.2314067377031536, "grad_norm": 1.0905894041061401, "learning_rate": 0.0005867600474086844, "loss": 4.5185, "step": 2150 }, { "epoch": 0.2367882897427618, "grad_norm": 0.9560607075691223, "learning_rate": 0.0005864368063786229, "loss": 4.4963, "step": 2200 }, { "epoch": 0.24216984178237003, "grad_norm": 0.9163029789924622, "learning_rate": 0.0005861135653485616, "loss": 4.4676, "step": 2250 }, { "epoch": 0.24755139382197827, "grad_norm": 1.1281293630599976, "learning_rate": 0.0005857903243185001, "loss": 4.4414, "step": 2300 }, { "epoch": 0.2529329458615865, "grad_norm": 1.101348638534546, "learning_rate": 0.0005854670832884386, "loss": 4.4778, "step": 2350 }, { "epoch": 0.2583144979011947, "grad_norm": 0.9377833008766174, "learning_rate": 0.0005851438422583773, "loss": 4.4328, "step": 2400 }, { "epoch": 0.2636960499408029, "grad_norm": 0.8902820944786072, "learning_rate": 0.0005848206012283159, "loss": 4.4258, "step": 2450 }, { "epoch": 0.26907760198041114, "grad_norm": 1.4821397066116333, "learning_rate": 0.0005844973601982545, "loss": 4.4088, "step": 2500 }, { "epoch": 0.27445915402001936, "grad_norm": 0.9053783416748047, "learning_rate": 0.000584174119168193, "loss": 4.4095, "step": 2550 }, { "epoch": 0.2798407060596276, "grad_norm": 0.8805415630340576, "learning_rate": 0.0005838508781381316, "loss": 4.3952, "step": 2600 }, { "epoch": 0.2852222580992358, "grad_norm": 0.7359943389892578, "learning_rate": 0.0005835276371080702, "loss": 4.3912, "step": 2650 }, { "epoch": 0.29060381013884407, "grad_norm": 0.9388933777809143, "learning_rate": 0.0005832043960780088, "loss": 4.3577, "step": 2700 }, { "epoch": 0.2959853621784523, "grad_norm": 0.8783614635467529, "learning_rate": 0.0005828811550479474, "loss": 4.3579, "step": 2750 }, { "epoch": 0.3013669142180605, "grad_norm": 1.0729166269302368, "learning_rate": 0.0005825579140178859, "loss": 4.338, "step": 2800 }, { "epoch": 0.3067484662576687, "grad_norm": 0.8868817090988159, "learning_rate": 0.0005822346729878246, "loss": 4.3532, "step": 2850 }, { "epoch": 0.31213001829727693, "grad_norm": 0.6756743788719177, "learning_rate": 0.0005819114319577631, "loss": 4.3165, "step": 2900 }, { "epoch": 0.31751157033688515, "grad_norm": 1.115333080291748, "learning_rate": 0.0005815881909277017, "loss": 4.3235, "step": 2950 }, { "epoch": 0.32289312237649337, "grad_norm": 0.6743454933166504, "learning_rate": 0.0005812649498976403, "loss": 4.2902, "step": 3000 }, { "epoch": 0.32289312237649337, "eval_accuracy": 0.29958567394289704, "eval_loss": 4.224883079528809, "eval_runtime": 184.1606, "eval_samples_per_second": 97.8, "eval_steps_per_second": 6.114, "step": 3000 }, { "epoch": 0.3282746744161016, "grad_norm": 0.7674033641815186, "learning_rate": 0.0005809417088675789, "loss": 4.277, "step": 3050 }, { "epoch": 0.3336562264557098, "grad_norm": 0.6853671669960022, "learning_rate": 0.0005806184678375175, "loss": 4.2884, "step": 3100 }, { "epoch": 0.3390377784953181, "grad_norm": 1.0062206983566284, "learning_rate": 0.000580295226807456, "loss": 4.2747, "step": 3150 }, { "epoch": 0.3444193305349263, "grad_norm": 0.6991369724273682, "learning_rate": 0.0005799719857773946, "loss": 4.2573, "step": 3200 }, { "epoch": 0.3498008825745345, "grad_norm": 0.8243315815925598, "learning_rate": 0.0005796487447473331, "loss": 4.2707, "step": 3250 }, { "epoch": 0.35518243461414273, "grad_norm": 0.7548694610595703, "learning_rate": 0.0005793255037172718, "loss": 4.2478, "step": 3300 }, { "epoch": 0.36056398665375095, "grad_norm": 0.8714583516120911, "learning_rate": 0.0005790022626872104, "loss": 4.2274, "step": 3350 }, { "epoch": 0.36594553869335916, "grad_norm": 0.5796970725059509, "learning_rate": 0.000578679021657149, "loss": 4.2347, "step": 3400 }, { "epoch": 0.3713270907329674, "grad_norm": 0.770889937877655, "learning_rate": 0.0005783557806270875, "loss": 4.236, "step": 3450 }, { "epoch": 0.3767086427725756, "grad_norm": 0.6033868789672852, "learning_rate": 0.0005780325395970262, "loss": 4.2122, "step": 3500 }, { "epoch": 0.3820901948121838, "grad_norm": 0.6957647800445557, "learning_rate": 0.0005777092985669647, "loss": 4.2252, "step": 3550 }, { "epoch": 0.38747174685179203, "grad_norm": 0.6956198215484619, "learning_rate": 0.0005773860575369033, "loss": 4.1964, "step": 3600 }, { "epoch": 0.3928532988914003, "grad_norm": 0.7516166567802429, "learning_rate": 0.0005770628165068419, "loss": 4.1847, "step": 3650 }, { "epoch": 0.3982348509310085, "grad_norm": 0.6326409578323364, "learning_rate": 0.0005767395754767804, "loss": 4.1859, "step": 3700 }, { "epoch": 0.40361640297061674, "grad_norm": 0.7365351915359497, "learning_rate": 0.0005764163344467191, "loss": 4.1742, "step": 3750 }, { "epoch": 0.40899795501022496, "grad_norm": 0.6335952281951904, "learning_rate": 0.0005760930934166576, "loss": 4.1615, "step": 3800 }, { "epoch": 0.4143795070498332, "grad_norm": 0.7398914098739624, "learning_rate": 0.0005757698523865963, "loss": 4.1702, "step": 3850 }, { "epoch": 0.4197610590894414, "grad_norm": 0.7694666385650635, "learning_rate": 0.0005754466113565348, "loss": 4.1677, "step": 3900 }, { "epoch": 0.4251426111290496, "grad_norm": 0.690592885017395, "learning_rate": 0.0005751233703264733, "loss": 4.1512, "step": 3950 }, { "epoch": 0.4305241631686578, "grad_norm": 0.8871626257896423, "learning_rate": 0.000574800129296412, "loss": 4.1649, "step": 4000 }, { "epoch": 0.4305241631686578, "eval_accuracy": 0.31256534792121493, "eval_loss": 4.0934929847717285, "eval_runtime": 183.9147, "eval_samples_per_second": 97.931, "eval_steps_per_second": 6.122, "step": 4000 }, { "epoch": 0.43590571520826604, "grad_norm": 0.8400447368621826, "learning_rate": 0.0005744768882663505, "loss": 4.1542, "step": 4050 }, { "epoch": 0.44128726724787426, "grad_norm": 0.7789562344551086, "learning_rate": 0.0005741536472362891, "loss": 4.1515, "step": 4100 }, { "epoch": 0.44666881928748253, "grad_norm": 0.6236934065818787, "learning_rate": 0.0005738304062062277, "loss": 4.1483, "step": 4150 }, { "epoch": 0.45205037132709075, "grad_norm": 0.7111061811447144, "learning_rate": 0.0005735071651761664, "loss": 4.1187, "step": 4200 }, { "epoch": 0.45743192336669897, "grad_norm": 0.6331958770751953, "learning_rate": 0.0005731839241461049, "loss": 4.1465, "step": 4250 }, { "epoch": 0.4628134754063072, "grad_norm": 0.6426513195037842, "learning_rate": 0.0005728606831160435, "loss": 4.1179, "step": 4300 }, { "epoch": 0.4681950274459154, "grad_norm": 0.7988542318344116, "learning_rate": 0.000572537442085982, "loss": 4.0764, "step": 4350 }, { "epoch": 0.4735765794855236, "grad_norm": 0.5747326016426086, "learning_rate": 0.0005722142010559206, "loss": 4.1143, "step": 4400 }, { "epoch": 0.47895813152513184, "grad_norm": 0.5958849191665649, "learning_rate": 0.0005718909600258593, "loss": 4.1233, "step": 4450 }, { "epoch": 0.48433968356474005, "grad_norm": 0.6663408279418945, "learning_rate": 0.0005715677189957978, "loss": 4.0998, "step": 4500 }, { "epoch": 0.48972123560434827, "grad_norm": 0.704066812992096, "learning_rate": 0.0005712444779657364, "loss": 4.0918, "step": 4550 }, { "epoch": 0.49510278764395654, "grad_norm": 0.6369755268096924, "learning_rate": 0.000570921236935675, "loss": 4.0871, "step": 4600 }, { "epoch": 0.5004843396835648, "grad_norm": 0.6915847063064575, "learning_rate": 0.0005705979959056136, "loss": 4.0924, "step": 4650 }, { "epoch": 0.505865891723173, "grad_norm": 0.5787283182144165, "learning_rate": 0.0005702747548755522, "loss": 4.0775, "step": 4700 }, { "epoch": 0.5112474437627812, "grad_norm": 0.6264421343803406, "learning_rate": 0.0005699515138454908, "loss": 4.0891, "step": 4750 }, { "epoch": 0.5166289958023894, "grad_norm": 0.7452828884124756, "learning_rate": 0.0005696282728154293, "loss": 4.0623, "step": 4800 }, { "epoch": 0.5220105478419976, "grad_norm": 0.8062637448310852, "learning_rate": 0.0005693050317853679, "loss": 4.0989, "step": 4850 }, { "epoch": 0.5273920998816058, "grad_norm": 0.6160845756530762, "learning_rate": 0.0005689817907553065, "loss": 4.0604, "step": 4900 }, { "epoch": 0.5327736519212141, "grad_norm": 0.6188620328903198, "learning_rate": 0.000568658549725245, "loss": 4.0477, "step": 4950 }, { "epoch": 0.5381552039608223, "grad_norm": 0.7134661078453064, "learning_rate": 0.0005683353086951837, "loss": 4.0503, "step": 5000 }, { "epoch": 0.5381552039608223, "eval_accuracy": 0.3212809393172535, "eval_loss": 3.995776891708374, "eval_runtime": 183.9862, "eval_samples_per_second": 97.893, "eval_steps_per_second": 6.12, "step": 5000 }, { "epoch": 0.5435367560004305, "grad_norm": 0.7106888294219971, "learning_rate": 0.0005680120676651222, "loss": 4.0714, "step": 5050 }, { "epoch": 0.5489183080400387, "grad_norm": 0.5911557078361511, "learning_rate": 0.0005676888266350609, "loss": 4.0311, "step": 5100 }, { "epoch": 0.5542998600796469, "grad_norm": 0.5957518219947815, "learning_rate": 0.0005673655856049994, "loss": 4.0459, "step": 5150 }, { "epoch": 0.5596814121192552, "grad_norm": 0.6637642979621887, "learning_rate": 0.0005670423445749379, "loss": 4.05, "step": 5200 }, { "epoch": 0.5650629641588634, "grad_norm": 0.6797468662261963, "learning_rate": 0.0005667191035448766, "loss": 4.029, "step": 5250 }, { "epoch": 0.5704445161984716, "grad_norm": 0.6260005831718445, "learning_rate": 0.0005663958625148152, "loss": 4.0311, "step": 5300 }, { "epoch": 0.5758260682380799, "grad_norm": 0.6095857620239258, "learning_rate": 0.0005660726214847538, "loss": 4.0187, "step": 5350 }, { "epoch": 0.5812076202776881, "grad_norm": 0.6056799292564392, "learning_rate": 0.0005657493804546923, "loss": 4.0068, "step": 5400 }, { "epoch": 0.5865891723172963, "grad_norm": 0.6308525800704956, "learning_rate": 0.0005654261394246309, "loss": 4.014, "step": 5450 }, { "epoch": 0.5919707243569046, "grad_norm": 0.7065827250480652, "learning_rate": 0.0005651028983945695, "loss": 4.0161, "step": 5500 }, { "epoch": 0.5973522763965128, "grad_norm": 0.5757762789726257, "learning_rate": 0.000564779657364508, "loss": 4.0222, "step": 5550 }, { "epoch": 0.602733828436121, "grad_norm": 0.6408382654190063, "learning_rate": 0.0005644564163344467, "loss": 4.0136, "step": 5600 }, { "epoch": 0.6081153804757292, "grad_norm": 0.725348949432373, "learning_rate": 0.0005641331753043852, "loss": 3.9966, "step": 5650 }, { "epoch": 0.6134969325153374, "grad_norm": 0.6008983254432678, "learning_rate": 0.0005638099342743239, "loss": 4.0078, "step": 5700 }, { "epoch": 0.6188784845549457, "grad_norm": 0.596548318862915, "learning_rate": 0.0005634866932442624, "loss": 4.005, "step": 5750 }, { "epoch": 0.6242600365945539, "grad_norm": 0.6248924732208252, "learning_rate": 0.000563163452214201, "loss": 4.0162, "step": 5800 }, { "epoch": 0.6296415886341621, "grad_norm": 0.7190423011779785, "learning_rate": 0.0005628402111841396, "loss": 3.9889, "step": 5850 }, { "epoch": 0.6350231406737703, "grad_norm": 0.6302054524421692, "learning_rate": 0.0005625169701540782, "loss": 3.9835, "step": 5900 }, { "epoch": 0.6404046927133785, "grad_norm": 0.6800968647003174, "learning_rate": 0.0005621937291240168, "loss": 3.9814, "step": 5950 }, { "epoch": 0.6457862447529867, "grad_norm": 0.56999272108078, "learning_rate": 0.0005618704880939553, "loss": 3.957, "step": 6000 }, { "epoch": 0.6457862447529867, "eval_accuracy": 0.32865770982964204, "eval_loss": 3.9150233268737793, "eval_runtime": 184.1189, "eval_samples_per_second": 97.823, "eval_steps_per_second": 6.116, "step": 6000 }, { "epoch": 0.651167796792595, "grad_norm": 0.5242891311645508, "learning_rate": 0.0005615472470638939, "loss": 3.975, "step": 6050 }, { "epoch": 0.6565493488322032, "grad_norm": 0.5570003390312195, "learning_rate": 0.0005612240060338325, "loss": 3.9737, "step": 6100 }, { "epoch": 0.6619309008718114, "grad_norm": 0.5470379590988159, "learning_rate": 0.0005609007650037711, "loss": 3.983, "step": 6150 }, { "epoch": 0.6673124529114196, "grad_norm": 0.5839495658874512, "learning_rate": 0.0005605775239737097, "loss": 3.9725, "step": 6200 }, { "epoch": 0.6726940049510278, "grad_norm": 0.6675605177879333, "learning_rate": 0.0005602542829436483, "loss": 3.9563, "step": 6250 }, { "epoch": 0.6780755569906362, "grad_norm": 0.6430953741073608, "learning_rate": 0.0005599310419135868, "loss": 3.9557, "step": 6300 }, { "epoch": 0.6834571090302444, "grad_norm": 0.6517301797866821, "learning_rate": 0.0005596078008835255, "loss": 3.9683, "step": 6350 }, { "epoch": 0.6888386610698526, "grad_norm": 0.6999329328536987, "learning_rate": 0.000559284559853464, "loss": 3.9558, "step": 6400 }, { "epoch": 0.6942202131094608, "grad_norm": 0.6338062882423401, "learning_rate": 0.0005589613188234026, "loss": 3.9595, "step": 6450 }, { "epoch": 0.699601765149069, "grad_norm": 0.6228857636451721, "learning_rate": 0.0005586380777933412, "loss": 3.9603, "step": 6500 }, { "epoch": 0.7049833171886772, "grad_norm": 0.5923941731452942, "learning_rate": 0.0005583148367632797, "loss": 3.9509, "step": 6550 }, { "epoch": 0.7103648692282855, "grad_norm": 0.5909141898155212, "learning_rate": 0.0005579915957332184, "loss": 3.9544, "step": 6600 }, { "epoch": 0.7157464212678937, "grad_norm": 0.5389482378959656, "learning_rate": 0.0005576683547031569, "loss": 3.9467, "step": 6650 }, { "epoch": 0.7211279733075019, "grad_norm": 0.6230718493461609, "learning_rate": 0.0005573451136730956, "loss": 3.9298, "step": 6700 }, { "epoch": 0.7265095253471101, "grad_norm": 0.6455636024475098, "learning_rate": 0.0005570218726430341, "loss": 3.9429, "step": 6750 }, { "epoch": 0.7318910773867183, "grad_norm": 0.515100359916687, "learning_rate": 0.0005566986316129728, "loss": 3.9419, "step": 6800 }, { "epoch": 0.7372726294263265, "grad_norm": 0.6500300765037537, "learning_rate": 0.0005563753905829113, "loss": 3.9327, "step": 6850 }, { "epoch": 0.7426541814659348, "grad_norm": 0.5976806282997131, "learning_rate": 0.0005560521495528498, "loss": 3.9347, "step": 6900 }, { "epoch": 0.748035733505543, "grad_norm": 0.5752057433128357, "learning_rate": 0.0005557289085227884, "loss": 3.9364, "step": 6950 }, { "epoch": 0.7534172855451512, "grad_norm": 0.6120689511299133, "learning_rate": 0.000555405667492727, "loss": 3.926, "step": 7000 }, { "epoch": 0.7534172855451512, "eval_accuracy": 0.33354991475637685, "eval_loss": 3.8600199222564697, "eval_runtime": 184.4532, "eval_samples_per_second": 97.645, "eval_steps_per_second": 6.105, "step": 7000 }, { "epoch": 0.7587988375847594, "grad_norm": 0.6163683533668518, "learning_rate": 0.0005550824264626657, "loss": 3.9252, "step": 7050 }, { "epoch": 0.7641803896243676, "grad_norm": 0.5958462953567505, "learning_rate": 0.0005547591854326042, "loss": 3.9274, "step": 7100 }, { "epoch": 0.7695619416639758, "grad_norm": 0.6324918270111084, "learning_rate": 0.0005544359444025428, "loss": 3.9211, "step": 7150 }, { "epoch": 0.7749434937035841, "grad_norm": 0.6233291625976562, "learning_rate": 0.0005541127033724813, "loss": 3.9226, "step": 7200 }, { "epoch": 0.7803250457431924, "grad_norm": 0.6019555330276489, "learning_rate": 0.0005537894623424199, "loss": 3.9044, "step": 7250 }, { "epoch": 0.7857065977828006, "grad_norm": 0.6118972897529602, "learning_rate": 0.0005534662213123586, "loss": 3.9272, "step": 7300 }, { "epoch": 0.7910881498224088, "grad_norm": 0.5551448464393616, "learning_rate": 0.0005531429802822971, "loss": 3.9129, "step": 7350 }, { "epoch": 0.796469701862017, "grad_norm": 0.5576432347297668, "learning_rate": 0.0005528197392522357, "loss": 3.8883, "step": 7400 }, { "epoch": 0.8018512539016253, "grad_norm": 0.536040186882019, "learning_rate": 0.0005524964982221743, "loss": 3.9148, "step": 7450 }, { "epoch": 0.8072328059412335, "grad_norm": 0.5515653491020203, "learning_rate": 0.0005521732571921129, "loss": 3.8873, "step": 7500 }, { "epoch": 0.8126143579808417, "grad_norm": 0.6468381285667419, "learning_rate": 0.0005518500161620514, "loss": 3.9219, "step": 7550 }, { "epoch": 0.8179959100204499, "grad_norm": 0.6421065330505371, "learning_rate": 0.00055152677513199, "loss": 3.9011, "step": 7600 }, { "epoch": 0.8233774620600581, "grad_norm": 0.7124976515769958, "learning_rate": 0.0005512035341019286, "loss": 3.9073, "step": 7650 }, { "epoch": 0.8287590140996663, "grad_norm": 0.5300739407539368, "learning_rate": 0.0005508802930718672, "loss": 3.897, "step": 7700 }, { "epoch": 0.8341405661392746, "grad_norm": 0.5574651956558228, "learning_rate": 0.0005505570520418058, "loss": 3.878, "step": 7750 }, { "epoch": 0.8395221181788828, "grad_norm": 0.5600754618644714, "learning_rate": 0.0005502338110117443, "loss": 3.8917, "step": 7800 }, { "epoch": 0.844903670218491, "grad_norm": 0.5358843207359314, "learning_rate": 0.000549910569981683, "loss": 3.8804, "step": 7850 }, { "epoch": 0.8502852222580992, "grad_norm": 0.658395528793335, "learning_rate": 0.0005495873289516215, "loss": 3.8709, "step": 7900 }, { "epoch": 0.8556667742977074, "grad_norm": 0.5161238312721252, "learning_rate": 0.0005492640879215602, "loss": 3.8827, "step": 7950 }, { "epoch": 0.8610483263373157, "grad_norm": 0.5757775902748108, "learning_rate": 0.0005489408468914987, "loss": 3.8792, "step": 8000 }, { "epoch": 0.8610483263373157, "eval_accuracy": 0.33785246046094247, "eval_loss": 3.8122713565826416, "eval_runtime": 184.1424, "eval_samples_per_second": 97.81, "eval_steps_per_second": 6.115, "step": 8000 }, { "epoch": 0.8664298783769239, "grad_norm": 0.5138362646102905, "learning_rate": 0.0005486176058614372, "loss": 3.8879, "step": 8050 }, { "epoch": 0.8718114304165321, "grad_norm": 0.5533286929130554, "learning_rate": 0.0005482943648313759, "loss": 3.8783, "step": 8100 }, { "epoch": 0.8771929824561403, "grad_norm": 0.5804781317710876, "learning_rate": 0.0005479711238013145, "loss": 3.8718, "step": 8150 }, { "epoch": 0.8825745344957485, "grad_norm": 0.6782721281051636, "learning_rate": 0.0005476478827712531, "loss": 3.8798, "step": 8200 }, { "epoch": 0.8879560865353568, "grad_norm": 0.6069771647453308, "learning_rate": 0.0005473246417411916, "loss": 3.8703, "step": 8250 }, { "epoch": 0.8933376385749651, "grad_norm": 0.5682134032249451, "learning_rate": 0.0005470014007111302, "loss": 3.8639, "step": 8300 }, { "epoch": 0.8987191906145733, "grad_norm": 0.5916244387626648, "learning_rate": 0.0005466781596810688, "loss": 3.868, "step": 8350 }, { "epoch": 0.9041007426541815, "grad_norm": 0.5569131970405579, "learning_rate": 0.0005463549186510073, "loss": 3.8574, "step": 8400 }, { "epoch": 0.9094822946937897, "grad_norm": 0.5994654893875122, "learning_rate": 0.000546031677620946, "loss": 3.8383, "step": 8450 }, { "epoch": 0.9148638467333979, "grad_norm": 0.595244288444519, "learning_rate": 0.0005457084365908845, "loss": 3.8793, "step": 8500 }, { "epoch": 0.9202453987730062, "grad_norm": 0.6106542944908142, "learning_rate": 0.0005453851955608232, "loss": 3.8596, "step": 8550 }, { "epoch": 0.9256269508126144, "grad_norm": 0.5412901639938354, "learning_rate": 0.0005450619545307617, "loss": 3.8455, "step": 8600 }, { "epoch": 0.9310085028522226, "grad_norm": 0.5948059558868408, "learning_rate": 0.0005447387135007003, "loss": 3.8417, "step": 8650 }, { "epoch": 0.9363900548918308, "grad_norm": 0.5632950067520142, "learning_rate": 0.0005444154724706389, "loss": 3.841, "step": 8700 }, { "epoch": 0.941771606931439, "grad_norm": 0.5710453391075134, "learning_rate": 0.0005440922314405775, "loss": 3.8535, "step": 8750 }, { "epoch": 0.9471531589710472, "grad_norm": 0.5678730607032776, "learning_rate": 0.0005437689904105161, "loss": 3.8466, "step": 8800 }, { "epoch": 0.9525347110106555, "grad_norm": 0.5131510496139526, "learning_rate": 0.0005434457493804546, "loss": 3.8489, "step": 8850 }, { "epoch": 0.9579162630502637, "grad_norm": 0.6341388821601868, "learning_rate": 0.0005431225083503932, "loss": 3.8502, "step": 8900 }, { "epoch": 0.9632978150898719, "grad_norm": 0.5771986246109009, "learning_rate": 0.0005427992673203318, "loss": 3.8328, "step": 8950 }, { "epoch": 0.9686793671294801, "grad_norm": 0.6046352982521057, "learning_rate": 0.0005424760262902704, "loss": 3.8393, "step": 9000 }, { "epoch": 0.9686793671294801, "eval_accuracy": 0.34175309913348234, "eval_loss": 3.7765414714813232, "eval_runtime": 184.8929, "eval_samples_per_second": 97.413, "eval_steps_per_second": 6.09, "step": 9000 }, { "epoch": 0.9740609191690883, "grad_norm": 0.5805352330207825, "learning_rate": 0.000542152785260209, "loss": 3.8323, "step": 9050 }, { "epoch": 0.9794424712086965, "grad_norm": 0.6047805547714233, "learning_rate": 0.0005418295442301476, "loss": 3.8319, "step": 9100 }, { "epoch": 0.9848240232483048, "grad_norm": 0.5381801724433899, "learning_rate": 0.0005415063032000861, "loss": 3.8376, "step": 9150 }, { "epoch": 0.9902055752879131, "grad_norm": 0.553428590297699, "learning_rate": 0.0005411830621700248, "loss": 3.845, "step": 9200 }, { "epoch": 0.9955871273275213, "grad_norm": 0.5774129629135132, "learning_rate": 0.0005408598211399633, "loss": 3.8296, "step": 9250 }, { "epoch": 1.0009686793671295, "grad_norm": 0.571371853351593, "learning_rate": 0.0005405365801099019, "loss": 3.8086, "step": 9300 }, { "epoch": 1.0063502314067376, "grad_norm": 0.5810806155204773, "learning_rate": 0.0005402133390798405, "loss": 3.7664, "step": 9350 }, { "epoch": 1.011731783446346, "grad_norm": 0.5649880766868591, "learning_rate": 0.000539890098049779, "loss": 3.7867, "step": 9400 }, { "epoch": 1.017113335485954, "grad_norm": 0.5414764285087585, "learning_rate": 0.0005395668570197177, "loss": 3.7579, "step": 9450 }, { "epoch": 1.0224948875255624, "grad_norm": 0.5362746715545654, "learning_rate": 0.0005392436159896562, "loss": 3.7796, "step": 9500 }, { "epoch": 1.0278764395651705, "grad_norm": 0.6010889410972595, "learning_rate": 0.0005389203749595948, "loss": 3.7563, "step": 9550 }, { "epoch": 1.0332579916047788, "grad_norm": 0.6140136122703552, "learning_rate": 0.0005385971339295334, "loss": 3.7871, "step": 9600 }, { "epoch": 1.0386395436443872, "grad_norm": 0.5645223259925842, "learning_rate": 0.000538273892899472, "loss": 3.7527, "step": 9650 }, { "epoch": 1.0440210956839953, "grad_norm": 0.6200801730155945, "learning_rate": 0.0005379506518694106, "loss": 3.7533, "step": 9700 }, { "epoch": 1.0494026477236036, "grad_norm": 0.5485979318618774, "learning_rate": 0.0005376274108393491, "loss": 3.7422, "step": 9750 }, { "epoch": 1.0547841997632117, "grad_norm": 0.6224055290222168, "learning_rate": 0.0005373041698092877, "loss": 3.771, "step": 9800 }, { "epoch": 1.06016575180282, "grad_norm": 0.5823978781700134, "learning_rate": 0.0005369809287792263, "loss": 3.7602, "step": 9850 }, { "epoch": 1.0655473038424281, "grad_norm": 0.5620067715644836, "learning_rate": 0.000536657687749165, "loss": 3.7748, "step": 9900 }, { "epoch": 1.0709288558820365, "grad_norm": 0.6253240704536438, "learning_rate": 0.0005363344467191035, "loss": 3.748, "step": 9950 }, { "epoch": 1.0763104079216446, "grad_norm": 0.5606370568275452, "learning_rate": 0.000536011205689042, "loss": 3.7796, "step": 10000 }, { "epoch": 1.0763104079216446, "eval_accuracy": 0.34503865380810445, "eval_loss": 3.7481353282928467, "eval_runtime": 185.0934, "eval_samples_per_second": 97.308, "eval_steps_per_second": 6.083, "step": 10000 }, { "epoch": 1.081691959961253, "grad_norm": 0.5910335183143616, "learning_rate": 0.0005356879646589807, "loss": 3.761, "step": 10050 }, { "epoch": 1.087073512000861, "grad_norm": 0.5092912316322327, "learning_rate": 0.0005353647236289192, "loss": 3.7691, "step": 10100 }, { "epoch": 1.0924550640404693, "grad_norm": 0.5085718035697937, "learning_rate": 0.0005350414825988579, "loss": 3.7689, "step": 10150 }, { "epoch": 1.0978366160800774, "grad_norm": 0.5637722015380859, "learning_rate": 0.0005347182415687964, "loss": 3.7535, "step": 10200 }, { "epoch": 1.1032181681196858, "grad_norm": 0.5314459204673767, "learning_rate": 0.000534395000538735, "loss": 3.7466, "step": 10250 }, { "epoch": 1.1085997201592939, "grad_norm": 0.5898045301437378, "learning_rate": 0.0005340717595086736, "loss": 3.7796, "step": 10300 }, { "epoch": 1.1139812721989022, "grad_norm": 0.5477808117866516, "learning_rate": 0.0005337549832992134, "loss": 3.7733, "step": 10350 }, { "epoch": 1.1193628242385103, "grad_norm": 0.620238184928894, "learning_rate": 0.000533431742269152, "loss": 3.7601, "step": 10400 }, { "epoch": 1.1247443762781186, "grad_norm": 0.5703392028808594, "learning_rate": 0.0005331085012390905, "loss": 3.7633, "step": 10450 }, { "epoch": 1.1301259283177267, "grad_norm": 0.6331435441970825, "learning_rate": 0.0005327852602090292, "loss": 3.7571, "step": 10500 }, { "epoch": 1.135507480357335, "grad_norm": 0.6327672004699707, "learning_rate": 0.0005324620191789678, "loss": 3.7705, "step": 10550 }, { "epoch": 1.1408890323969434, "grad_norm": 0.5602995753288269, "learning_rate": 0.0005321387781489063, "loss": 3.7459, "step": 10600 }, { "epoch": 1.1462705844365515, "grad_norm": 0.6223279237747192, "learning_rate": 0.0005318155371188449, "loss": 3.7514, "step": 10650 }, { "epoch": 1.1516521364761596, "grad_norm": 0.5494676828384399, "learning_rate": 0.0005314922960887834, "loss": 3.7579, "step": 10700 }, { "epoch": 1.157033688515768, "grad_norm": 0.5711165070533752, "learning_rate": 0.0005311690550587221, "loss": 3.7455, "step": 10750 }, { "epoch": 1.1624152405553763, "grad_norm": 0.5691930651664734, "learning_rate": 0.0005308458140286607, "loss": 3.7467, "step": 10800 }, { "epoch": 1.1677967925949844, "grad_norm": 0.6407700181007385, "learning_rate": 0.0005305225729985993, "loss": 3.7363, "step": 10850 }, { "epoch": 1.1731783446345927, "grad_norm": 0.5777462124824524, "learning_rate": 0.0005301993319685378, "loss": 3.7576, "step": 10900 }, { "epoch": 1.1785598966742008, "grad_norm": 0.59559166431427, "learning_rate": 0.0005298760909384765, "loss": 3.7362, "step": 10950 }, { "epoch": 1.1839414487138091, "grad_norm": 0.5723061561584473, "learning_rate": 0.000529552849908415, "loss": 3.7514, "step": 11000 }, { "epoch": 1.1839414487138091, "eval_accuracy": 0.3472639736034328, "eval_loss": 3.719111442565918, "eval_runtime": 184.8414, "eval_samples_per_second": 97.44, "eval_steps_per_second": 6.092, "step": 11000 }, { "epoch": 1.1893230007534172, "grad_norm": 0.5246575474739075, "learning_rate": 0.0005292296088783535, "loss": 3.7532, "step": 11050 }, { "epoch": 1.1947045527930256, "grad_norm": 0.5730573534965515, "learning_rate": 0.0005289063678482922, "loss": 3.7489, "step": 11100 }, { "epoch": 1.2000861048326337, "grad_norm": 0.5547863245010376, "learning_rate": 0.0005285831268182307, "loss": 3.7359, "step": 11150 }, { "epoch": 1.205467656872242, "grad_norm": 0.5668656229972839, "learning_rate": 0.0005282598857881694, "loss": 3.7543, "step": 11200 }, { "epoch": 1.21084920891185, "grad_norm": 0.6278468370437622, "learning_rate": 0.0005279366447581079, "loss": 3.7559, "step": 11250 }, { "epoch": 1.2162307609514584, "grad_norm": 0.5997869968414307, "learning_rate": 0.0005276134037280465, "loss": 3.7455, "step": 11300 }, { "epoch": 1.2216123129910665, "grad_norm": 0.6103711128234863, "learning_rate": 0.0005272901626979851, "loss": 3.7338, "step": 11350 }, { "epoch": 1.2269938650306749, "grad_norm": 0.6048080921173096, "learning_rate": 0.0005269669216679236, "loss": 3.7437, "step": 11400 }, { "epoch": 1.232375417070283, "grad_norm": 0.6004089117050171, "learning_rate": 0.0005266436806378623, "loss": 3.7444, "step": 11450 }, { "epoch": 1.2377569691098913, "grad_norm": 0.529883861541748, "learning_rate": 0.0005263204396078008, "loss": 3.7269, "step": 11500 }, { "epoch": 1.2431385211494996, "grad_norm": 0.5505789518356323, "learning_rate": 0.0005259971985777394, "loss": 3.7437, "step": 11550 }, { "epoch": 1.2485200731891077, "grad_norm": 0.5899487733840942, "learning_rate": 0.000525673957547678, "loss": 3.724, "step": 11600 }, { "epoch": 1.2539016252287158, "grad_norm": 0.6136053800582886, "learning_rate": 0.0005253507165176167, "loss": 3.7511, "step": 11650 }, { "epoch": 1.2592831772683242, "grad_norm": 0.5636747479438782, "learning_rate": 0.0005250274754875552, "loss": 3.7729, "step": 11700 }, { "epoch": 1.2646647293079325, "grad_norm": 0.6134611368179321, "learning_rate": 0.0005247042344574938, "loss": 3.7339, "step": 11750 }, { "epoch": 1.2700462813475406, "grad_norm": 0.5715221762657166, "learning_rate": 0.0005243809934274323, "loss": 3.7425, "step": 11800 }, { "epoch": 1.275427833387149, "grad_norm": 0.5998755097389221, "learning_rate": 0.0005240577523973709, "loss": 3.7202, "step": 11850 }, { "epoch": 1.280809385426757, "grad_norm": 0.5555028319358826, "learning_rate": 0.0005237345113673095, "loss": 3.7324, "step": 11900 }, { "epoch": 1.2861909374663654, "grad_norm": 0.5886709690093994, "learning_rate": 0.0005234112703372481, "loss": 3.7376, "step": 11950 }, { "epoch": 1.2915724895059735, "grad_norm": 0.5459818840026855, "learning_rate": 0.0005230880293071867, "loss": 3.701, "step": 12000 }, { "epoch": 1.2915724895059735, "eval_accuracy": 0.3495896886671018, "eval_loss": 3.695312023162842, "eval_runtime": 184.9767, "eval_samples_per_second": 97.369, "eval_steps_per_second": 6.087, "step": 12000 }, { "epoch": 1.2969540415455818, "grad_norm": 0.6085287928581238, "learning_rate": 0.0005227647882771253, "loss": 3.726, "step": 12050 }, { "epoch": 1.30233559358519, "grad_norm": 0.613265335559845, "learning_rate": 0.0005224415472470639, "loss": 3.7448, "step": 12100 }, { "epoch": 1.3077171456247982, "grad_norm": 0.613146185874939, "learning_rate": 0.0005221183062170024, "loss": 3.7305, "step": 12150 }, { "epoch": 1.3130986976644063, "grad_norm": 0.569382905960083, "learning_rate": 0.0005217950651869409, "loss": 3.7302, "step": 12200 }, { "epoch": 1.3184802497040147, "grad_norm": 0.6018803715705872, "learning_rate": 0.0005214718241568796, "loss": 3.7289, "step": 12250 }, { "epoch": 1.3238618017436228, "grad_norm": 0.5862231850624084, "learning_rate": 0.0005211485831268182, "loss": 3.7318, "step": 12300 }, { "epoch": 1.329243353783231, "grad_norm": 0.5232207775115967, "learning_rate": 0.0005208253420967568, "loss": 3.7048, "step": 12350 }, { "epoch": 1.3346249058228392, "grad_norm": 0.6371702551841736, "learning_rate": 0.0005205021010666953, "loss": 3.7313, "step": 12400 }, { "epoch": 1.3400064578624475, "grad_norm": 0.5899184346199036, "learning_rate": 0.0005201788600366339, "loss": 3.714, "step": 12450 }, { "epoch": 1.3453880099020559, "grad_norm": 0.5880089998245239, "learning_rate": 0.0005198620838271738, "loss": 3.7194, "step": 12500 }, { "epoch": 1.350769561941664, "grad_norm": 0.6216872930526733, "learning_rate": 0.0005195388427971123, "loss": 3.7019, "step": 12550 }, { "epoch": 1.356151113981272, "grad_norm": 0.5582758188247681, "learning_rate": 0.000519215601767051, "loss": 3.7347, "step": 12600 }, { "epoch": 1.3615326660208804, "grad_norm": 0.531882107257843, "learning_rate": 0.0005188923607369895, "loss": 3.7195, "step": 12650 }, { "epoch": 1.3669142180604887, "grad_norm": 0.5658419728279114, "learning_rate": 0.000518569119706928, "loss": 3.7136, "step": 12700 }, { "epoch": 1.3722957701000968, "grad_norm": 0.580401599407196, "learning_rate": 0.0005182458786768667, "loss": 3.6957, "step": 12750 }, { "epoch": 1.3776773221397052, "grad_norm": 0.5358129143714905, "learning_rate": 0.0005179226376468052, "loss": 3.7211, "step": 12800 }, { "epoch": 1.3830588741793133, "grad_norm": 0.5377843379974365, "learning_rate": 0.0005175993966167438, "loss": 3.7195, "step": 12850 }, { "epoch": 1.3884404262189216, "grad_norm": 0.6239182949066162, "learning_rate": 0.0005172761555866824, "loss": 3.7085, "step": 12900 }, { "epoch": 1.3938219782585297, "grad_norm": 0.5507147312164307, "learning_rate": 0.0005169529145566211, "loss": 3.7299, "step": 12950 }, { "epoch": 1.399203530298138, "grad_norm": 0.5771177411079407, "learning_rate": 0.0005166296735265596, "loss": 3.7202, "step": 13000 }, { "epoch": 1.399203530298138, "eval_accuracy": 0.3518178334375133, "eval_loss": 3.671773910522461, "eval_runtime": 185.0122, "eval_samples_per_second": 97.35, "eval_steps_per_second": 6.086, "step": 13000 }, { "epoch": 1.4045850823377461, "grad_norm": 0.6277278065681458, "learning_rate": 0.0005163064324964982, "loss": 3.7068, "step": 13050 }, { "epoch": 1.4099666343773545, "grad_norm": 0.5851669311523438, "learning_rate": 0.0005159831914664367, "loss": 3.7063, "step": 13100 }, { "epoch": 1.4153481864169626, "grad_norm": 0.5492701530456543, "learning_rate": 0.0005156599504363753, "loss": 3.7113, "step": 13150 }, { "epoch": 1.420729738456571, "grad_norm": 0.6104116439819336, "learning_rate": 0.000515336709406314, "loss": 3.7048, "step": 13200 }, { "epoch": 1.426111290496179, "grad_norm": 0.5530329346656799, "learning_rate": 0.0005150134683762525, "loss": 3.6968, "step": 13250 }, { "epoch": 1.4314928425357873, "grad_norm": 0.6156694889068604, "learning_rate": 0.0005146902273461911, "loss": 3.7053, "step": 13300 }, { "epoch": 1.4368743945753955, "grad_norm": 0.5567038059234619, "learning_rate": 0.0005143669863161297, "loss": 3.7126, "step": 13350 }, { "epoch": 1.4422559466150038, "grad_norm": 0.6526008248329163, "learning_rate": 0.0005140437452860683, "loss": 3.7054, "step": 13400 }, { "epoch": 1.447637498654612, "grad_norm": 0.5717737674713135, "learning_rate": 0.0005137205042560069, "loss": 3.7223, "step": 13450 }, { "epoch": 1.4530190506942202, "grad_norm": 0.6178818941116333, "learning_rate": 0.0005133972632259455, "loss": 3.6973, "step": 13500 }, { "epoch": 1.4584006027338283, "grad_norm": 0.5545845627784729, "learning_rate": 0.000513074022195884, "loss": 3.7004, "step": 13550 }, { "epoch": 1.4637821547734367, "grad_norm": 0.548315167427063, "learning_rate": 0.0005127507811658226, "loss": 3.6831, "step": 13600 }, { "epoch": 1.469163706813045, "grad_norm": 0.5236667394638062, "learning_rate": 0.0005124275401357612, "loss": 3.697, "step": 13650 }, { "epoch": 1.474545258852653, "grad_norm": 0.5224073529243469, "learning_rate": 0.0005121042991056997, "loss": 3.694, "step": 13700 }, { "epoch": 1.4799268108922612, "grad_norm": 0.5565716624259949, "learning_rate": 0.0005117810580756384, "loss": 3.713, "step": 13750 }, { "epoch": 1.4853083629318695, "grad_norm": 0.6119269132614136, "learning_rate": 0.0005114578170455769, "loss": 3.6882, "step": 13800 }, { "epoch": 1.4906899149714778, "grad_norm": 0.5315612554550171, "learning_rate": 0.0005111345760155156, "loss": 3.7042, "step": 13850 }, { "epoch": 1.496071467011086, "grad_norm": 0.5465083122253418, "learning_rate": 0.0005108113349854541, "loss": 3.6865, "step": 13900 }, { "epoch": 1.501453019050694, "grad_norm": 0.5459629893302917, "learning_rate": 0.0005104880939553926, "loss": 3.6912, "step": 13950 }, { "epoch": 1.5068345710903024, "grad_norm": 0.6055029630661011, "learning_rate": 0.0005101648529253313, "loss": 3.6877, "step": 14000 }, { "epoch": 1.5068345710903024, "eval_accuracy": 0.3537886881695802, "eval_loss": 3.6538360118865967, "eval_runtime": 185.1086, "eval_samples_per_second": 97.3, "eval_steps_per_second": 6.083, "step": 14000 }, { "epoch": 1.5122161231299107, "grad_norm": 0.5669093728065491, "learning_rate": 0.0005098416118952699, "loss": 3.6786, "step": 14050 }, { "epoch": 1.5175976751695188, "grad_norm": 0.5493358373641968, "learning_rate": 0.0005095183708652085, "loss": 3.6787, "step": 14100 }, { "epoch": 1.5229792272091272, "grad_norm": 0.5622873902320862, "learning_rate": 0.000509195129835147, "loss": 3.699, "step": 14150 }, { "epoch": 1.5283607792487355, "grad_norm": 0.5534229278564453, "learning_rate": 0.0005088718888050856, "loss": 3.6991, "step": 14200 }, { "epoch": 1.5337423312883436, "grad_norm": 0.5834208726882935, "learning_rate": 0.0005085486477750242, "loss": 3.6889, "step": 14250 }, { "epoch": 1.5391238833279517, "grad_norm": 0.5694041848182678, "learning_rate": 0.0005082254067449629, "loss": 3.6882, "step": 14300 }, { "epoch": 1.54450543536756, "grad_norm": 0.5525422096252441, "learning_rate": 0.0005079021657149014, "loss": 3.6688, "step": 14350 }, { "epoch": 1.5498869874071683, "grad_norm": 0.5979780554771423, "learning_rate": 0.0005075789246848399, "loss": 3.6797, "step": 14400 }, { "epoch": 1.5552685394467765, "grad_norm": 0.5613248944282532, "learning_rate": 0.0005072556836547785, "loss": 3.6824, "step": 14450 }, { "epoch": 1.5606500914863846, "grad_norm": 0.6141908764839172, "learning_rate": 0.0005069389074453184, "loss": 3.6966, "step": 14500 }, { "epoch": 1.566031643525993, "grad_norm": 0.5723303556442261, "learning_rate": 0.0005066156664152569, "loss": 3.6921, "step": 14550 }, { "epoch": 1.5714131955656012, "grad_norm": 0.6092125773429871, "learning_rate": 0.0005062924253851955, "loss": 3.684, "step": 14600 }, { "epoch": 1.5767947476052093, "grad_norm": 0.5606322288513184, "learning_rate": 0.0005059691843551341, "loss": 3.6769, "step": 14650 }, { "epoch": 1.5821762996448174, "grad_norm": 0.5645138621330261, "learning_rate": 0.0005056459433250727, "loss": 3.6798, "step": 14700 }, { "epoch": 1.5875578516844258, "grad_norm": 0.5697076320648193, "learning_rate": 0.0005053227022950113, "loss": 3.6808, "step": 14750 }, { "epoch": 1.592939403724034, "grad_norm": 0.538927435874939, "learning_rate": 0.0005049994612649499, "loss": 3.6844, "step": 14800 }, { "epoch": 1.5983209557636422, "grad_norm": 0.5874403119087219, "learning_rate": 0.0005046762202348884, "loss": 3.6926, "step": 14850 }, { "epoch": 1.6037025078032503, "grad_norm": 0.6064614057540894, "learning_rate": 0.000504352979204827, "loss": 3.6815, "step": 14900 }, { "epoch": 1.6090840598428586, "grad_norm": 0.6142170429229736, "learning_rate": 0.0005040297381747656, "loss": 3.6766, "step": 14950 }, { "epoch": 1.614465611882467, "grad_norm": 0.6365212798118591, "learning_rate": 0.0005037064971447042, "loss": 3.6723, "step": 15000 }, { "epoch": 1.614465611882467, "eval_accuracy": 0.3555365871758518, "eval_loss": 3.6359355449676514, "eval_runtime": 185.0324, "eval_samples_per_second": 97.34, "eval_steps_per_second": 6.085, "step": 15000 }, { "epoch": 1.619847163922075, "grad_norm": 0.5601328015327454, "learning_rate": 0.0005033832561146428, "loss": 3.6874, "step": 15050 }, { "epoch": 1.6252287159616834, "grad_norm": 0.581095278263092, "learning_rate": 0.0005030600150845813, "loss": 3.656, "step": 15100 }, { "epoch": 1.6306102680012917, "grad_norm": 0.5768686532974243, "learning_rate": 0.00050273677405452, "loss": 3.6487, "step": 15150 }, { "epoch": 1.6359918200408998, "grad_norm": 0.5869755744934082, "learning_rate": 0.0005024135330244585, "loss": 3.6763, "step": 15200 }, { "epoch": 1.641373372080508, "grad_norm": 0.5455231666564941, "learning_rate": 0.0005020902919943972, "loss": 3.6665, "step": 15250 }, { "epoch": 1.6467549241201163, "grad_norm": 0.5418322086334229, "learning_rate": 0.0005017670509643357, "loss": 3.6758, "step": 15300 }, { "epoch": 1.6521364761597246, "grad_norm": 0.5567494630813599, "learning_rate": 0.0005014438099342743, "loss": 3.6804, "step": 15350 }, { "epoch": 1.6575180281993327, "grad_norm": 0.5733649134635925, "learning_rate": 0.0005011205689042129, "loss": 3.674, "step": 15400 }, { "epoch": 1.6628995802389408, "grad_norm": 0.5491770505905151, "learning_rate": 0.0005007973278741514, "loss": 3.6672, "step": 15450 }, { "epoch": 1.6682811322785491, "grad_norm": 0.6351288557052612, "learning_rate": 0.00050047408684409, "loss": 3.6549, "step": 15500 }, { "epoch": 1.6736626843181575, "grad_norm": 0.5563836097717285, "learning_rate": 0.0005001508458140286, "loss": 3.6801, "step": 15550 }, { "epoch": 1.6790442363577656, "grad_norm": 0.566293478012085, "learning_rate": 0.0004998276047839673, "loss": 3.6839, "step": 15600 }, { "epoch": 1.6844257883973737, "grad_norm": 0.5516136884689331, "learning_rate": 0.0004995043637539058, "loss": 3.6481, "step": 15650 }, { "epoch": 1.689807340436982, "grad_norm": 0.6708171963691711, "learning_rate": 0.0004991811227238443, "loss": 3.6718, "step": 15700 }, { "epoch": 1.6951888924765903, "grad_norm": 0.532315731048584, "learning_rate": 0.0004988578816937829, "loss": 3.6653, "step": 15750 }, { "epoch": 1.7005704445161984, "grad_norm": 0.5652549862861633, "learning_rate": 0.0004985346406637215, "loss": 3.6745, "step": 15800 }, { "epoch": 1.7059519965558065, "grad_norm": 0.537426233291626, "learning_rate": 0.0004982113996336602, "loss": 3.6653, "step": 15850 }, { "epoch": 1.7113335485954149, "grad_norm": 0.603627622127533, "learning_rate": 0.0004978881586035987, "loss": 3.6505, "step": 15900 }, { "epoch": 1.7167151006350232, "grad_norm": 0.5712310671806335, "learning_rate": 0.0004975649175735373, "loss": 3.6711, "step": 15950 }, { "epoch": 1.7220966526746313, "grad_norm": 0.5726265907287598, "learning_rate": 0.0004972416765434759, "loss": 3.6806, "step": 16000 }, { "epoch": 1.7220966526746313, "eval_accuracy": 0.35729241784293375, "eval_loss": 3.619314193725586, "eval_runtime": 185.1324, "eval_samples_per_second": 97.287, "eval_steps_per_second": 6.082, "step": 16000 }, { "epoch": 1.7274782047142396, "grad_norm": 0.5703783631324768, "learning_rate": 0.0004969184355134145, "loss": 3.6516, "step": 16050 }, { "epoch": 1.732859756753848, "grad_norm": 0.6545760035514832, "learning_rate": 0.0004965951944833531, "loss": 3.6533, "step": 16100 }, { "epoch": 1.738241308793456, "grad_norm": 0.6040812730789185, "learning_rate": 0.0004962719534532916, "loss": 3.6701, "step": 16150 }, { "epoch": 1.7436228608330642, "grad_norm": 0.6016229391098022, "learning_rate": 0.0004959487124232302, "loss": 3.6568, "step": 16200 }, { "epoch": 1.7490044128726725, "grad_norm": 0.5777092576026917, "learning_rate": 0.0004956254713931688, "loss": 3.6482, "step": 16250 }, { "epoch": 1.7543859649122808, "grad_norm": 0.5289350152015686, "learning_rate": 0.0004953022303631074, "loss": 3.6536, "step": 16300 }, { "epoch": 1.759767516951889, "grad_norm": 0.5491663217544556, "learning_rate": 0.0004949789893330459, "loss": 3.6432, "step": 16350 }, { "epoch": 1.765149068991497, "grad_norm": 0.5549047589302063, "learning_rate": 0.0004946557483029846, "loss": 3.6613, "step": 16400 }, { "epoch": 1.7705306210311054, "grad_norm": 0.537971019744873, "learning_rate": 0.0004943325072729231, "loss": 3.647, "step": 16450 }, { "epoch": 1.7759121730707137, "grad_norm": 0.6316960453987122, "learning_rate": 0.0004940092662428617, "loss": 3.6628, "step": 16500 }, { "epoch": 1.7812937251103218, "grad_norm": 0.5657507181167603, "learning_rate": 0.0004936860252128003, "loss": 3.6527, "step": 16550 }, { "epoch": 1.78667527714993, "grad_norm": 0.555227518081665, "learning_rate": 0.0004933627841827388, "loss": 3.6453, "step": 16600 }, { "epoch": 1.7920568291895382, "grad_norm": 0.5412525534629822, "learning_rate": 0.0004930395431526775, "loss": 3.6569, "step": 16650 }, { "epoch": 1.7974383812291466, "grad_norm": 0.578276515007019, "learning_rate": 0.0004927227669432173, "loss": 3.6502, "step": 16700 }, { "epoch": 1.8028199332687547, "grad_norm": 0.5755462646484375, "learning_rate": 0.0004923995259131558, "loss": 3.6558, "step": 16750 }, { "epoch": 1.8082014853083628, "grad_norm": 0.5374065637588501, "learning_rate": 0.0004920762848830945, "loss": 3.6661, "step": 16800 }, { "epoch": 1.813583037347971, "grad_norm": 0.5966348648071289, "learning_rate": 0.000491753043853033, "loss": 3.6616, "step": 16850 }, { "epoch": 1.8189645893875794, "grad_norm": 0.6156536340713501, "learning_rate": 0.0004914298028229717, "loss": 3.6491, "step": 16900 }, { "epoch": 1.8243461414271875, "grad_norm": 0.5442768931388855, "learning_rate": 0.0004911065617929102, "loss": 3.6634, "step": 16950 }, { "epoch": 1.8297276934667959, "grad_norm": 0.5258566737174988, "learning_rate": 0.0004907833207628487, "loss": 3.6554, "step": 17000 }, { "epoch": 1.8297276934667959, "eval_accuracy": 0.3586716576008508, "eval_loss": 3.6036384105682373, "eval_runtime": 185.2755, "eval_samples_per_second": 97.212, "eval_steps_per_second": 6.077, "step": 17000 }, { "epoch": 1.8351092455064042, "grad_norm": 0.5487478375434875, "learning_rate": 0.0004904600797327874, "loss": 3.6496, "step": 17050 }, { "epoch": 1.8404907975460123, "grad_norm": 0.5759326219558716, "learning_rate": 0.0004901368387027259, "loss": 3.638, "step": 17100 }, { "epoch": 1.8458723495856204, "grad_norm": 0.5645399689674377, "learning_rate": 0.0004898135976726646, "loss": 3.6561, "step": 17150 }, { "epoch": 1.8512539016252287, "grad_norm": 0.5687316060066223, "learning_rate": 0.0004894903566426031, "loss": 3.6386, "step": 17200 }, { "epoch": 1.856635453664837, "grad_norm": 0.5824282765388489, "learning_rate": 0.0004891671156125417, "loss": 3.6366, "step": 17250 }, { "epoch": 1.8620170057044452, "grad_norm": 0.6073246002197266, "learning_rate": 0.0004888438745824803, "loss": 3.6548, "step": 17300 }, { "epoch": 1.8673985577440533, "grad_norm": 0.5398355722427368, "learning_rate": 0.0004885206335524189, "loss": 3.6477, "step": 17350 }, { "epoch": 1.8727801097836616, "grad_norm": 0.5895220041275024, "learning_rate": 0.0004881973925223575, "loss": 3.6426, "step": 17400 }, { "epoch": 1.87816166182327, "grad_norm": 0.5928804278373718, "learning_rate": 0.00048787415149229604, "loss": 3.6433, "step": 17450 }, { "epoch": 1.883543213862878, "grad_norm": 0.5468927025794983, "learning_rate": 0.00048755091046223464, "loss": 3.6417, "step": 17500 }, { "epoch": 1.8889247659024861, "grad_norm": 0.6279405951499939, "learning_rate": 0.0004872276694321732, "loss": 3.6331, "step": 17550 }, { "epoch": 1.8943063179420945, "grad_norm": 0.5338152050971985, "learning_rate": 0.00048690442840211177, "loss": 3.6416, "step": 17600 }, { "epoch": 1.8996878699817028, "grad_norm": 0.5857802033424377, "learning_rate": 0.0004865811873720504, "loss": 3.6332, "step": 17650 }, { "epoch": 1.905069422021311, "grad_norm": 0.5994651913642883, "learning_rate": 0.00048625794634198896, "loss": 3.6365, "step": 17700 }, { "epoch": 1.910450974060919, "grad_norm": 0.5650983452796936, "learning_rate": 0.00048593470531192756, "loss": 3.6394, "step": 17750 }, { "epoch": 1.9158325261005273, "grad_norm": 0.6136479377746582, "learning_rate": 0.00048561146428186615, "loss": 3.6474, "step": 17800 }, { "epoch": 1.9212140781401357, "grad_norm": 0.5767033696174622, "learning_rate": 0.0004852882232518047, "loss": 3.642, "step": 17850 }, { "epoch": 1.9265956301797438, "grad_norm": 0.6233161687850952, "learning_rate": 0.00048496498222174334, "loss": 3.6456, "step": 17900 }, { "epoch": 1.931977182219352, "grad_norm": 0.5650816559791565, "learning_rate": 0.00048464174119168193, "loss": 3.6477, "step": 17950 }, { "epoch": 1.9373587342589604, "grad_norm": 0.5975984930992126, "learning_rate": 0.0004843185001616205, "loss": 3.6462, "step": 18000 }, { "epoch": 1.9373587342589604, "eval_accuracy": 0.36028145878671014, "eval_loss": 3.5907020568847656, "eval_runtime": 185.2587, "eval_samples_per_second": 97.221, "eval_steps_per_second": 6.078, "step": 18000 }, { "epoch": 1.9427402862985685, "grad_norm": 0.593018651008606, "learning_rate": 0.00048399525913155907, "loss": 3.63, "step": 18050 }, { "epoch": 1.9481218383381766, "grad_norm": 0.5793690085411072, "learning_rate": 0.0004836720181014976, "loss": 3.6503, "step": 18100 }, { "epoch": 1.953503390377785, "grad_norm": 0.5724852681159973, "learning_rate": 0.0004833487770714362, "loss": 3.6317, "step": 18150 }, { "epoch": 1.9588849424173933, "grad_norm": 0.5394271612167358, "learning_rate": 0.00048302553604137485, "loss": 3.6127, "step": 18200 }, { "epoch": 1.9642664944570014, "grad_norm": 0.5289519429206848, "learning_rate": 0.0004827022950113134, "loss": 3.6391, "step": 18250 }, { "epoch": 1.9696480464966095, "grad_norm": 0.5568416714668274, "learning_rate": 0.000482379053981252, "loss": 3.6495, "step": 18300 }, { "epoch": 1.9750295985362178, "grad_norm": 0.5665032863616943, "learning_rate": 0.0004820558129511906, "loss": 3.6344, "step": 18350 }, { "epoch": 1.9804111505758262, "grad_norm": 0.633760392665863, "learning_rate": 0.0004817325719211291, "loss": 3.6234, "step": 18400 }, { "epoch": 1.9857927026154343, "grad_norm": 0.5358874797821045, "learning_rate": 0.0004814093308910677, "loss": 3.6596, "step": 18450 }, { "epoch": 1.9911742546550424, "grad_norm": 0.5380644202232361, "learning_rate": 0.00048108608986100637, "loss": 3.6425, "step": 18500 }, { "epoch": 1.9965558066946507, "grad_norm": 0.6148858666419983, "learning_rate": 0.0004807628488309449, "loss": 3.6437, "step": 18550 }, { "epoch": 2.001937358734259, "grad_norm": 0.587160587310791, "learning_rate": 0.0004804396078008835, "loss": 3.6022, "step": 18600 }, { "epoch": 2.007318910773867, "grad_norm": 0.6171560287475586, "learning_rate": 0.00048011636677082204, "loss": 3.5436, "step": 18650 }, { "epoch": 2.0127004628134753, "grad_norm": 0.5365356802940369, "learning_rate": 0.00047979312574076064, "loss": 3.5454, "step": 18700 }, { "epoch": 2.018082014853084, "grad_norm": 0.5993242859840393, "learning_rate": 0.00047947634953130044, "loss": 3.5406, "step": 18750 }, { "epoch": 2.023463566892692, "grad_norm": 0.5617915391921997, "learning_rate": 0.00047915310850123904, "loss": 3.5468, "step": 18800 }, { "epoch": 2.0288451189323, "grad_norm": 0.6009356379508972, "learning_rate": 0.0004788298674711777, "loss": 3.5483, "step": 18850 }, { "epoch": 2.034226670971908, "grad_norm": 0.55379718542099, "learning_rate": 0.00047850662644111623, "loss": 3.5472, "step": 18900 }, { "epoch": 2.0396082230115167, "grad_norm": 0.614422082901001, "learning_rate": 0.0004781833854110548, "loss": 3.5519, "step": 18950 }, { "epoch": 2.044989775051125, "grad_norm": 0.6232186555862427, "learning_rate": 0.00047786014438099336, "loss": 3.5624, "step": 19000 }, { "epoch": 2.044989775051125, "eval_accuracy": 0.3610764719668487, "eval_loss": 3.5807666778564453, "eval_runtime": 184.9144, "eval_samples_per_second": 97.402, "eval_steps_per_second": 6.089, "step": 19000 }, { "epoch": 2.050371327090733, "grad_norm": 0.6150940656661987, "learning_rate": 0.00047753690335093196, "loss": 3.5413, "step": 19050 }, { "epoch": 2.055752879130341, "grad_norm": 0.5728451609611511, "learning_rate": 0.00047721366232087055, "loss": 3.5385, "step": 19100 }, { "epoch": 2.0611344311699495, "grad_norm": 0.5399880409240723, "learning_rate": 0.00047689042129080915, "loss": 3.5665, "step": 19150 }, { "epoch": 2.0665159832095576, "grad_norm": 0.5551028251647949, "learning_rate": 0.00047656718026074774, "loss": 3.5385, "step": 19200 }, { "epoch": 2.0718975352491658, "grad_norm": 0.5518538355827332, "learning_rate": 0.00047624393923068634, "loss": 3.5415, "step": 19250 }, { "epoch": 2.0772790872887743, "grad_norm": 0.5685696601867676, "learning_rate": 0.0004759206982006249, "loss": 3.5454, "step": 19300 }, { "epoch": 2.0826606393283824, "grad_norm": 0.6709660291671753, "learning_rate": 0.00047559745717056347, "loss": 3.5536, "step": 19350 }, { "epoch": 2.0880421913679905, "grad_norm": 0.5589509606361389, "learning_rate": 0.000475274216140502, "loss": 3.5396, "step": 19400 }, { "epoch": 2.0934237434075986, "grad_norm": 0.5621861219406128, "learning_rate": 0.00047495097511044066, "loss": 3.5561, "step": 19450 }, { "epoch": 2.098805295447207, "grad_norm": 0.6232321858406067, "learning_rate": 0.00047462773408037925, "loss": 3.5637, "step": 19500 }, { "epoch": 2.1041868474868153, "grad_norm": 0.6202095150947571, "learning_rate": 0.0004743044930503178, "loss": 3.5567, "step": 19550 }, { "epoch": 2.1095683995264234, "grad_norm": 0.6590968370437622, "learning_rate": 0.0004739812520202564, "loss": 3.5512, "step": 19600 }, { "epoch": 2.1149499515660315, "grad_norm": 0.5272144675254822, "learning_rate": 0.000473658010990195, "loss": 3.5622, "step": 19650 }, { "epoch": 2.12033150360564, "grad_norm": 0.5836525559425354, "learning_rate": 0.0004733347699601336, "loss": 3.5603, "step": 19700 }, { "epoch": 2.125713055645248, "grad_norm": 0.5840169787406921, "learning_rate": 0.0004730115289300722, "loss": 3.5542, "step": 19750 }, { "epoch": 2.1310946076848563, "grad_norm": 0.5527374744415283, "learning_rate": 0.00047268828790001077, "loss": 3.5596, "step": 19800 }, { "epoch": 2.1364761597244644, "grad_norm": 0.5612182021141052, "learning_rate": 0.0004723650468699493, "loss": 3.5595, "step": 19850 }, { "epoch": 2.141857711764073, "grad_norm": 0.6052794456481934, "learning_rate": 0.0004720418058398879, "loss": 3.558, "step": 19900 }, { "epoch": 2.147239263803681, "grad_norm": 0.5627298355102539, "learning_rate": 0.00047171856480982644, "loss": 3.5394, "step": 19950 }, { "epoch": 2.152620815843289, "grad_norm": 0.6209867000579834, "learning_rate": 0.0004713953237797651, "loss": 3.5722, "step": 20000 }, { "epoch": 2.152620815843289, "eval_accuracy": 0.3629283517867587, "eval_loss": 3.571505069732666, "eval_runtime": 184.3187, "eval_samples_per_second": 97.717, "eval_steps_per_second": 6.109, "step": 20000 }, { "epoch": 2.1580023678828972, "grad_norm": 0.579675018787384, "learning_rate": 0.0004710720827497037, "loss": 3.5576, "step": 20050 }, { "epoch": 2.163383919922506, "grad_norm": 0.5315395593643188, "learning_rate": 0.00047074884171964223, "loss": 3.5562, "step": 20100 }, { "epoch": 2.168765471962114, "grad_norm": 0.5574841499328613, "learning_rate": 0.0004704256006895808, "loss": 3.549, "step": 20150 }, { "epoch": 2.174147024001722, "grad_norm": 0.5768731236457825, "learning_rate": 0.00047010235965951936, "loss": 3.5636, "step": 20200 }, { "epoch": 2.1795285760413305, "grad_norm": 0.5584173798561096, "learning_rate": 0.00046977911862945796, "loss": 3.5582, "step": 20250 }, { "epoch": 2.1849101280809387, "grad_norm": 0.607611358165741, "learning_rate": 0.0004694558775993966, "loss": 3.5709, "step": 20300 }, { "epoch": 2.1902916801205468, "grad_norm": 0.5810279846191406, "learning_rate": 0.00046913263656933515, "loss": 3.5754, "step": 20350 }, { "epoch": 2.195673232160155, "grad_norm": 0.611072301864624, "learning_rate": 0.00046880939553927374, "loss": 3.5447, "step": 20400 }, { "epoch": 2.2010547841997634, "grad_norm": 0.5755279064178467, "learning_rate": 0.00046848615450921234, "loss": 3.5384, "step": 20450 }, { "epoch": 2.2064363362393715, "grad_norm": 0.5763221383094788, "learning_rate": 0.0004681629134791509, "loss": 3.5582, "step": 20500 }, { "epoch": 2.2118178882789796, "grad_norm": 0.5722882747650146, "learning_rate": 0.00046783967244908947, "loss": 3.5384, "step": 20550 }, { "epoch": 2.2171994403185877, "grad_norm": 0.6407657265663147, "learning_rate": 0.0004675164314190281, "loss": 3.571, "step": 20600 }, { "epoch": 2.2225809923581963, "grad_norm": 0.5894743204116821, "learning_rate": 0.00046719319038896666, "loss": 3.5524, "step": 20650 }, { "epoch": 2.2279625443978044, "grad_norm": 0.5992270112037659, "learning_rate": 0.00046686994935890526, "loss": 3.557, "step": 20700 }, { "epoch": 2.2333440964374125, "grad_norm": 0.5950222611427307, "learning_rate": 0.0004665467083288438, "loss": 3.5539, "step": 20750 }, { "epoch": 2.2387256484770206, "grad_norm": 0.5811877846717834, "learning_rate": 0.0004662234672987824, "loss": 3.5729, "step": 20800 }, { "epoch": 2.244107200516629, "grad_norm": 0.612930178642273, "learning_rate": 0.00046590022626872104, "loss": 3.5385, "step": 20850 }, { "epoch": 2.2494887525562373, "grad_norm": 0.5784337520599365, "learning_rate": 0.0004655769852386596, "loss": 3.5695, "step": 20900 }, { "epoch": 2.2548703045958454, "grad_norm": 0.5417907238006592, "learning_rate": 0.0004652537442085982, "loss": 3.5567, "step": 20950 }, { "epoch": 2.2602518566354535, "grad_norm": 0.5444612503051758, "learning_rate": 0.00046493050317853677, "loss": 3.5608, "step": 21000 }, { "epoch": 2.2602518566354535, "eval_accuracy": 0.36368631433215254, "eval_loss": 3.5603373050689697, "eval_runtime": 183.971, "eval_samples_per_second": 97.901, "eval_steps_per_second": 6.121, "step": 21000 }, { "epoch": 2.265633408675062, "grad_norm": 0.5688188672065735, "learning_rate": 0.0004646072621484753, "loss": 3.5453, "step": 21050 }, { "epoch": 2.27101496071467, "grad_norm": 0.5461804270744324, "learning_rate": 0.00046429048593901517, "loss": 3.5595, "step": 21100 }, { "epoch": 2.2763965127542782, "grad_norm": 0.5699180364608765, "learning_rate": 0.0004639672449089537, "loss": 3.5668, "step": 21150 }, { "epoch": 2.281778064793887, "grad_norm": 0.5944415330886841, "learning_rate": 0.0004636440038788923, "loss": 3.5442, "step": 21200 }, { "epoch": 2.287159616833495, "grad_norm": 0.5693159103393555, "learning_rate": 0.00046332076284883095, "loss": 3.5489, "step": 21250 }, { "epoch": 2.292541168873103, "grad_norm": 0.6215958595275879, "learning_rate": 0.0004629975218187695, "loss": 3.5608, "step": 21300 }, { "epoch": 2.297922720912711, "grad_norm": 0.5844936370849609, "learning_rate": 0.0004626742807887081, "loss": 3.5487, "step": 21350 }, { "epoch": 2.303304272952319, "grad_norm": 0.5955291986465454, "learning_rate": 0.00046235103975864663, "loss": 3.5503, "step": 21400 }, { "epoch": 2.3086858249919278, "grad_norm": 0.586269736289978, "learning_rate": 0.0004620277987285852, "loss": 3.5586, "step": 21450 }, { "epoch": 2.314067377031536, "grad_norm": 0.6128333210945129, "learning_rate": 0.0004617045576985239, "loss": 3.5605, "step": 21500 }, { "epoch": 2.319448929071144, "grad_norm": 0.6058995723724365, "learning_rate": 0.0004613813166684624, "loss": 3.5478, "step": 21550 }, { "epoch": 2.3248304811107525, "grad_norm": 0.5962044596672058, "learning_rate": 0.000461058075638401, "loss": 3.5652, "step": 21600 }, { "epoch": 2.3302120331503606, "grad_norm": 0.5575462579727173, "learning_rate": 0.00046073483460833955, "loss": 3.5416, "step": 21650 }, { "epoch": 2.3355935851899687, "grad_norm": 0.6554104089736938, "learning_rate": 0.00046041159357827814, "loss": 3.5761, "step": 21700 }, { "epoch": 2.340975137229577, "grad_norm": 0.6101064682006836, "learning_rate": 0.00046008835254821674, "loss": 3.5716, "step": 21750 }, { "epoch": 2.3463566892691854, "grad_norm": 0.5602294206619263, "learning_rate": 0.0004597651115181554, "loss": 3.5689, "step": 21800 }, { "epoch": 2.3517382413087935, "grad_norm": 0.5995686650276184, "learning_rate": 0.00045944187048809393, "loss": 3.5485, "step": 21850 }, { "epoch": 2.3571197933484016, "grad_norm": 0.5884931683540344, "learning_rate": 0.0004591186294580325, "loss": 3.57, "step": 21900 }, { "epoch": 2.3625013453880097, "grad_norm": 0.5541901588439941, "learning_rate": 0.00045879538842797106, "loss": 3.5498, "step": 21950 }, { "epoch": 2.3678828974276183, "grad_norm": 0.6184925436973572, "learning_rate": 0.00045847214739790966, "loss": 3.5486, "step": 22000 }, { "epoch": 2.3678828974276183, "eval_accuracy": 0.36485900495033424, "eval_loss": 3.5495412349700928, "eval_runtime": 184.8468, "eval_samples_per_second": 97.437, "eval_steps_per_second": 6.092, "step": 22000 }, { "epoch": 2.3732644494672264, "grad_norm": 0.6405237913131714, "learning_rate": 0.0004581489063678482, "loss": 3.5473, "step": 22050 }, { "epoch": 2.3786460015068345, "grad_norm": 0.6073254942893982, "learning_rate": 0.00045782566533778685, "loss": 3.5617, "step": 22100 }, { "epoch": 2.384027553546443, "grad_norm": 0.5916250944137573, "learning_rate": 0.00045750242430772544, "loss": 3.5663, "step": 22150 }, { "epoch": 2.389409105586051, "grad_norm": 0.6367147564888, "learning_rate": 0.000457179183277664, "loss": 3.5534, "step": 22200 }, { "epoch": 2.3947906576256592, "grad_norm": 0.584669291973114, "learning_rate": 0.0004568559422476026, "loss": 3.5471, "step": 22250 }, { "epoch": 2.4001722096652673, "grad_norm": 0.6060321927070618, "learning_rate": 0.00045653270121754117, "loss": 3.5543, "step": 22300 }, { "epoch": 2.4055537617048754, "grad_norm": 0.5905278325080872, "learning_rate": 0.0004562094601874797, "loss": 3.5571, "step": 22350 }, { "epoch": 2.410935313744484, "grad_norm": 0.6051254272460938, "learning_rate": 0.00045588621915741836, "loss": 3.5419, "step": 22400 }, { "epoch": 2.416316865784092, "grad_norm": 0.5944015383720398, "learning_rate": 0.00045556297812735696, "loss": 3.5482, "step": 22450 }, { "epoch": 2.4216984178237, "grad_norm": 0.6080989241600037, "learning_rate": 0.0004552397370972955, "loss": 3.5586, "step": 22500 }, { "epoch": 2.4270799698633088, "grad_norm": 0.606602668762207, "learning_rate": 0.0004549164960672341, "loss": 3.5539, "step": 22550 }, { "epoch": 2.432461521902917, "grad_norm": 0.5684707760810852, "learning_rate": 0.00045459325503717263, "loss": 3.5459, "step": 22600 }, { "epoch": 2.437843073942525, "grad_norm": 0.5968481302261353, "learning_rate": 0.0004542700140071113, "loss": 3.5392, "step": 22650 }, { "epoch": 2.443224625982133, "grad_norm": 0.5279818177223206, "learning_rate": 0.0004539467729770499, "loss": 3.5542, "step": 22700 }, { "epoch": 2.4486061780217416, "grad_norm": 0.5603899359703064, "learning_rate": 0.0004536235319469884, "loss": 3.5546, "step": 22750 }, { "epoch": 2.4539877300613497, "grad_norm": 0.5788828134536743, "learning_rate": 0.000453300290916927, "loss": 3.5395, "step": 22800 }, { "epoch": 2.459369282100958, "grad_norm": 0.614360511302948, "learning_rate": 0.0004529770498868656, "loss": 3.5336, "step": 22850 }, { "epoch": 2.464750834140566, "grad_norm": 0.7457820177078247, "learning_rate": 0.00045265380885680414, "loss": 3.5619, "step": 22900 }, { "epoch": 2.4701323861801745, "grad_norm": 0.5839369893074036, "learning_rate": 0.0004523305678267428, "loss": 3.5636, "step": 22950 }, { "epoch": 2.4755139382197826, "grad_norm": 0.6320124268531799, "learning_rate": 0.0004520073267966814, "loss": 3.5484, "step": 23000 }, { "epoch": 2.4755139382197826, "eval_accuracy": 0.36569813120292544, "eval_loss": 3.5397799015045166, "eval_runtime": 185.5668, "eval_samples_per_second": 97.059, "eval_steps_per_second": 6.068, "step": 23000 }, { "epoch": 2.4808954902593907, "grad_norm": 0.6412538886070251, "learning_rate": 0.00045168408576661993, "loss": 3.5485, "step": 23050 }, { "epoch": 2.4862770422989993, "grad_norm": 0.5754796266555786, "learning_rate": 0.0004513608447365585, "loss": 3.5538, "step": 23100 }, { "epoch": 2.4916585943386074, "grad_norm": 0.5703058242797852, "learning_rate": 0.00045103760370649706, "loss": 3.5478, "step": 23150 }, { "epoch": 2.4970401463782155, "grad_norm": 0.6098968386650085, "learning_rate": 0.0004507208274970369, "loss": 3.5542, "step": 23200 }, { "epoch": 2.5024216984178236, "grad_norm": 0.5805345177650452, "learning_rate": 0.00045039758646697546, "loss": 3.5412, "step": 23250 }, { "epoch": 2.5078032504574317, "grad_norm": 0.6296859979629517, "learning_rate": 0.0004500743454369141, "loss": 3.5606, "step": 23300 }, { "epoch": 2.5131848024970402, "grad_norm": 0.5676090121269226, "learning_rate": 0.0004497511044068527, "loss": 3.5395, "step": 23350 }, { "epoch": 2.5185663545366483, "grad_norm": 0.5905699729919434, "learning_rate": 0.00044942786337679125, "loss": 3.5604, "step": 23400 }, { "epoch": 2.5239479065762565, "grad_norm": 0.5741718411445618, "learning_rate": 0.00044910462234672984, "loss": 3.5438, "step": 23450 }, { "epoch": 2.529329458615865, "grad_norm": 0.5376855134963989, "learning_rate": 0.0004487813813166684, "loss": 3.5618, "step": 23500 }, { "epoch": 2.534711010655473, "grad_norm": 0.6151233315467834, "learning_rate": 0.000448458140286607, "loss": 3.5373, "step": 23550 }, { "epoch": 2.540092562695081, "grad_norm": 0.5695292949676514, "learning_rate": 0.00044813489925654563, "loss": 3.5465, "step": 23600 }, { "epoch": 2.5454741147346893, "grad_norm": 0.5898992419242859, "learning_rate": 0.00044781165822648417, "loss": 3.5536, "step": 23650 }, { "epoch": 2.550855666774298, "grad_norm": 0.5703639984130859, "learning_rate": 0.00044748841719642276, "loss": 3.5251, "step": 23700 }, { "epoch": 2.556237218813906, "grad_norm": 0.5784706473350525, "learning_rate": 0.00044716517616636136, "loss": 3.5486, "step": 23750 }, { "epoch": 2.561618770853514, "grad_norm": 0.589297890663147, "learning_rate": 0.0004468419351362999, "loss": 3.5452, "step": 23800 }, { "epoch": 2.567000322893122, "grad_norm": 0.6087639331817627, "learning_rate": 0.0004465186941062385, "loss": 3.5397, "step": 23850 }, { "epoch": 2.5723818749327307, "grad_norm": 0.5542170405387878, "learning_rate": 0.00044619545307617714, "loss": 3.5413, "step": 23900 }, { "epoch": 2.577763426972339, "grad_norm": 0.6168636679649353, "learning_rate": 0.0004458722120461157, "loss": 3.5368, "step": 23950 }, { "epoch": 2.583144979011947, "grad_norm": 0.5792571306228638, "learning_rate": 0.0004455489710160543, "loss": 3.5476, "step": 24000 }, { "epoch": 2.583144979011947, "eval_accuracy": 0.3665450804634393, "eval_loss": 3.5285420417785645, "eval_runtime": 184.6414, "eval_samples_per_second": 97.546, "eval_steps_per_second": 6.098, "step": 24000 }, { "epoch": 2.5885265310515555, "grad_norm": 0.5523707866668701, "learning_rate": 0.0004452257299859928, "loss": 3.5526, "step": 24050 }, { "epoch": 2.5939080830911636, "grad_norm": 0.5859895944595337, "learning_rate": 0.0004449024889559314, "loss": 3.5482, "step": 24100 }, { "epoch": 2.5992896351307717, "grad_norm": 0.59371417760849, "learning_rate": 0.00044457924792587, "loss": 3.5577, "step": 24150 }, { "epoch": 2.60467118717038, "grad_norm": 0.6154286861419678, "learning_rate": 0.0004442560068958086, "loss": 3.5255, "step": 24200 }, { "epoch": 2.610052739209988, "grad_norm": 0.5675551891326904, "learning_rate": 0.0004439327658657472, "loss": 3.5354, "step": 24250 }, { "epoch": 2.6154342912495965, "grad_norm": 0.6574042439460754, "learning_rate": 0.0004436095248356858, "loss": 3.5302, "step": 24300 }, { "epoch": 2.6208158432892046, "grad_norm": 0.562594473361969, "learning_rate": 0.00044328628380562433, "loss": 3.5307, "step": 24350 }, { "epoch": 2.6261973953288127, "grad_norm": 0.576907217502594, "learning_rate": 0.0004429630427755629, "loss": 3.5308, "step": 24400 }, { "epoch": 2.6315789473684212, "grad_norm": 0.6003679037094116, "learning_rate": 0.0004426398017455016, "loss": 3.5402, "step": 24450 }, { "epoch": 2.6369604994080293, "grad_norm": 0.5755712389945984, "learning_rate": 0.0004423165607154401, "loss": 3.552, "step": 24500 }, { "epoch": 2.6423420514476375, "grad_norm": 0.6702556610107422, "learning_rate": 0.0004419933196853787, "loss": 3.52, "step": 24550 }, { "epoch": 2.6477236034872456, "grad_norm": 0.6071345806121826, "learning_rate": 0.00044167007865531725, "loss": 3.5465, "step": 24600 }, { "epoch": 2.653105155526854, "grad_norm": 0.5764359831809998, "learning_rate": 0.00044134683762525584, "loss": 3.5266, "step": 24650 }, { "epoch": 2.658486707566462, "grad_norm": 0.6211927533149719, "learning_rate": 0.00044102359659519444, "loss": 3.5415, "step": 24700 }, { "epoch": 2.6638682596060703, "grad_norm": 0.6043727993965149, "learning_rate": 0.00044070035556513303, "loss": 3.5399, "step": 24750 }, { "epoch": 2.6692498116456784, "grad_norm": 0.5703555941581726, "learning_rate": 0.00044037711453507163, "loss": 3.54, "step": 24800 }, { "epoch": 2.674631363685287, "grad_norm": 0.6492805480957031, "learning_rate": 0.0004400538735050102, "loss": 3.56, "step": 24850 }, { "epoch": 2.680012915724895, "grad_norm": 0.6173940300941467, "learning_rate": 0.00043973063247494876, "loss": 3.5071, "step": 24900 }, { "epoch": 2.685394467764503, "grad_norm": 0.5987802147865295, "learning_rate": 0.00043940739144488736, "loss": 3.5465, "step": 24950 }, { "epoch": 2.6907760198041117, "grad_norm": 0.5491653084754944, "learning_rate": 0.0004390841504148259, "loss": 3.5566, "step": 25000 }, { "epoch": 2.6907760198041117, "eval_accuracy": 0.3676237863336615, "eval_loss": 3.521599769592285, "eval_runtime": 184.2851, "eval_samples_per_second": 97.734, "eval_steps_per_second": 6.11, "step": 25000 }, { "epoch": 2.69615757184372, "grad_norm": 0.5777750015258789, "learning_rate": 0.00043876090938476455, "loss": 3.5436, "step": 25050 }, { "epoch": 2.701539123883328, "grad_norm": 0.5859283208847046, "learning_rate": 0.00043843766835470314, "loss": 3.5255, "step": 25100 }, { "epoch": 2.706920675922936, "grad_norm": 0.5968005061149597, "learning_rate": 0.0004381144273246417, "loss": 3.55, "step": 25150 }, { "epoch": 2.712302227962544, "grad_norm": 0.591786801815033, "learning_rate": 0.0004377911862945803, "loss": 3.553, "step": 25200 }, { "epoch": 2.7176837800021527, "grad_norm": 0.6314839124679565, "learning_rate": 0.0004374744100851201, "loss": 3.551, "step": 25250 }, { "epoch": 2.723065332041761, "grad_norm": 0.6087203621864319, "learning_rate": 0.0004371511690550587, "loss": 3.52, "step": 25300 }, { "epoch": 2.728446884081369, "grad_norm": 0.6130897402763367, "learning_rate": 0.0004368279280249972, "loss": 3.5275, "step": 25350 }, { "epoch": 2.7338284361209775, "grad_norm": 0.5768114328384399, "learning_rate": 0.00043650468699493587, "loss": 3.5311, "step": 25400 }, { "epoch": 2.7392099881605856, "grad_norm": 0.5944812297821045, "learning_rate": 0.00043618144596487446, "loss": 3.513, "step": 25450 }, { "epoch": 2.7445915402001937, "grad_norm": 0.5946869254112244, "learning_rate": 0.000435858204934813, "loss": 3.5386, "step": 25500 }, { "epoch": 2.749973092239802, "grad_norm": 0.5887439846992493, "learning_rate": 0.0004355349639047516, "loss": 3.5194, "step": 25550 }, { "epoch": 2.7553546442794103, "grad_norm": 0.5720977187156677, "learning_rate": 0.0004352117228746902, "loss": 3.5394, "step": 25600 }, { "epoch": 2.7607361963190185, "grad_norm": 0.6243444681167603, "learning_rate": 0.00043488848184462873, "loss": 3.5382, "step": 25650 }, { "epoch": 2.7661177483586266, "grad_norm": 0.6166155338287354, "learning_rate": 0.0004345652408145674, "loss": 3.5201, "step": 25700 }, { "epoch": 2.7714993003982347, "grad_norm": 0.6198005080223083, "learning_rate": 0.000434241999784506, "loss": 3.5349, "step": 25750 }, { "epoch": 2.776880852437843, "grad_norm": 0.6009842157363892, "learning_rate": 0.0004339187587544445, "loss": 3.5257, "step": 25800 }, { "epoch": 2.7822624044774513, "grad_norm": 0.6034589409828186, "learning_rate": 0.0004335955177243831, "loss": 3.5273, "step": 25850 }, { "epoch": 2.7876439565170594, "grad_norm": 0.6453015804290771, "learning_rate": 0.00043327227669432165, "loss": 3.5241, "step": 25900 }, { "epoch": 2.793025508556668, "grad_norm": 0.6064215302467346, "learning_rate": 0.00043294903566426025, "loss": 3.5401, "step": 25950 }, { "epoch": 2.798407060596276, "grad_norm": 0.5937812328338623, "learning_rate": 0.0004326257946341989, "loss": 3.5297, "step": 26000 }, { "epoch": 2.798407060596276, "eval_accuracy": 0.3689817301255669, "eval_loss": 3.5106637477874756, "eval_runtime": 184.6375, "eval_samples_per_second": 97.548, "eval_steps_per_second": 6.098, "step": 26000 }, { "epoch": 2.803788612635884, "grad_norm": 0.5772119164466858, "learning_rate": 0.00043230255360413744, "loss": 3.5437, "step": 26050 }, { "epoch": 2.8091701646754923, "grad_norm": 0.6289253234863281, "learning_rate": 0.00043197931257407603, "loss": 3.527, "step": 26100 }, { "epoch": 2.8145517167151004, "grad_norm": 0.5981800556182861, "learning_rate": 0.0004316560715440146, "loss": 3.5416, "step": 26150 }, { "epoch": 2.819933268754709, "grad_norm": 0.5946755409240723, "learning_rate": 0.00043133283051395317, "loss": 3.5365, "step": 26200 }, { "epoch": 2.825314820794317, "grad_norm": 0.5931487679481506, "learning_rate": 0.0004310095894838918, "loss": 3.5297, "step": 26250 }, { "epoch": 2.830696372833925, "grad_norm": 0.6046642065048218, "learning_rate": 0.0004306863484538304, "loss": 3.5359, "step": 26300 }, { "epoch": 2.8360779248735337, "grad_norm": 0.587397575378418, "learning_rate": 0.00043036310742376895, "loss": 3.5265, "step": 26350 }, { "epoch": 2.841459476913142, "grad_norm": 0.595369279384613, "learning_rate": 0.00043003986639370754, "loss": 3.5306, "step": 26400 }, { "epoch": 2.84684102895275, "grad_norm": 0.5928206443786621, "learning_rate": 0.0004297166253636461, "loss": 3.5162, "step": 26450 }, { "epoch": 2.852222580992358, "grad_norm": 0.5930352210998535, "learning_rate": 0.0004293933843335847, "loss": 3.5489, "step": 26500 }, { "epoch": 2.857604133031966, "grad_norm": 0.5869969129562378, "learning_rate": 0.00042907014330352333, "loss": 3.5237, "step": 26550 }, { "epoch": 2.8629856850715747, "grad_norm": 0.6180474758148193, "learning_rate": 0.00042874690227346187, "loss": 3.5492, "step": 26600 }, { "epoch": 2.868367237111183, "grad_norm": 0.5644546151161194, "learning_rate": 0.00042842366124340046, "loss": 3.5143, "step": 26650 }, { "epoch": 2.873748789150791, "grad_norm": 0.5767242908477783, "learning_rate": 0.00042810042021333906, "loss": 3.5248, "step": 26700 }, { "epoch": 2.8791303411903995, "grad_norm": 0.5595628023147583, "learning_rate": 0.0004277771791832776, "loss": 3.537, "step": 26750 }, { "epoch": 2.8845118932300076, "grad_norm": 0.5932275056838989, "learning_rate": 0.0004274539381532162, "loss": 3.5399, "step": 26800 }, { "epoch": 2.8898934452696157, "grad_norm": 0.6271063685417175, "learning_rate": 0.00042713069712315484, "loss": 3.5408, "step": 26850 }, { "epoch": 2.895274997309224, "grad_norm": 0.5648878216743469, "learning_rate": 0.0004268074560930934, "loss": 3.528, "step": 26900 }, { "epoch": 2.9006565493488323, "grad_norm": 0.6094657182693481, "learning_rate": 0.000426484215063032, "loss": 3.5405, "step": 26950 }, { "epoch": 2.9060381013884404, "grad_norm": 0.5559983253479004, "learning_rate": 0.0004261609740329705, "loss": 3.5455, "step": 27000 }, { "epoch": 2.9060381013884404, "eval_accuracy": 0.3693569085471903, "eval_loss": 3.502919912338257, "eval_runtime": 184.9331, "eval_samples_per_second": 97.392, "eval_steps_per_second": 6.089, "step": 27000 }, { "epoch": 2.9114196534280485, "grad_norm": 0.6220670342445374, "learning_rate": 0.0004258377330029091, "loss": 3.5158, "step": 27050 }, { "epoch": 2.9168012054676566, "grad_norm": 0.5957185626029968, "learning_rate": 0.00042551449197284776, "loss": 3.5278, "step": 27100 }, { "epoch": 2.922182757507265, "grad_norm": 0.5763528943061829, "learning_rate": 0.0004251912509427863, "loss": 3.5373, "step": 27150 }, { "epoch": 2.9275643095468733, "grad_norm": 0.546169102191925, "learning_rate": 0.0004248680099127249, "loss": 3.5336, "step": 27200 }, { "epoch": 2.9329458615864814, "grad_norm": 0.5983578562736511, "learning_rate": 0.0004245512337032647, "loss": 3.5214, "step": 27250 }, { "epoch": 2.93832741362609, "grad_norm": 0.5858373641967773, "learning_rate": 0.0004242279926732033, "loss": 3.5204, "step": 27300 }, { "epoch": 2.943708965665698, "grad_norm": 0.6104702353477478, "learning_rate": 0.00042390475164314184, "loss": 3.5297, "step": 27350 }, { "epoch": 2.949090517705306, "grad_norm": 0.5596311092376709, "learning_rate": 0.00042358151061308043, "loss": 3.514, "step": 27400 }, { "epoch": 2.9544720697449143, "grad_norm": 0.6430082321166992, "learning_rate": 0.000423258269583019, "loss": 3.5117, "step": 27450 }, { "epoch": 2.9598536217845224, "grad_norm": 0.5712005496025085, "learning_rate": 0.0004229350285529576, "loss": 3.5119, "step": 27500 }, { "epoch": 2.965235173824131, "grad_norm": 0.6474849581718445, "learning_rate": 0.0004226117875228962, "loss": 3.5216, "step": 27550 }, { "epoch": 2.970616725863739, "grad_norm": 0.5952101945877075, "learning_rate": 0.0004222885464928348, "loss": 3.5213, "step": 27600 }, { "epoch": 2.975998277903347, "grad_norm": 0.5649101138114929, "learning_rate": 0.00042196530546277335, "loss": 3.4964, "step": 27650 }, { "epoch": 2.9813798299429557, "grad_norm": 0.6717711687088013, "learning_rate": 0.00042164206443271195, "loss": 3.5389, "step": 27700 }, { "epoch": 2.986761381982564, "grad_norm": 0.5842223167419434, "learning_rate": 0.0004213188234026505, "loss": 3.542, "step": 27750 }, { "epoch": 2.992142934022172, "grad_norm": 0.5732238292694092, "learning_rate": 0.00042099558237258914, "loss": 3.5153, "step": 27800 }, { "epoch": 2.9975244860617805, "grad_norm": 0.5700245499610901, "learning_rate": 0.00042067234134252773, "loss": 3.5282, "step": 27850 }, { "epoch": 3.0029060381013886, "grad_norm": 0.6182711124420166, "learning_rate": 0.00042034910031246627, "loss": 3.4692, "step": 27900 }, { "epoch": 3.0082875901409967, "grad_norm": 0.5772573351860046, "learning_rate": 0.00042002585928240486, "loss": 3.4259, "step": 27950 }, { "epoch": 3.0136691421806048, "grad_norm": 0.6104460954666138, "learning_rate": 0.00041970261825234346, "loss": 3.4261, "step": 28000 }, { "epoch": 3.0136691421806048, "eval_accuracy": 0.3703434767685458, "eval_loss": 3.4980907440185547, "eval_runtime": 184.5366, "eval_samples_per_second": 97.601, "eval_steps_per_second": 6.102, "step": 28000 }, { "epoch": 3.0190506942202133, "grad_norm": 0.6463789343833923, "learning_rate": 0.00041937937722228205, "loss": 3.4415, "step": 28050 }, { "epoch": 3.0244322462598214, "grad_norm": 0.6210352778434753, "learning_rate": 0.00041905613619222065, "loss": 3.444, "step": 28100 }, { "epoch": 3.0298137982994295, "grad_norm": 0.6435058116912842, "learning_rate": 0.00041873289516215924, "loss": 3.4282, "step": 28150 }, { "epoch": 3.0351953503390376, "grad_norm": 0.611407995223999, "learning_rate": 0.0004184096541320978, "loss": 3.4177, "step": 28200 }, { "epoch": 3.040576902378646, "grad_norm": 0.5989460945129395, "learning_rate": 0.0004180864131020364, "loss": 3.4313, "step": 28250 }, { "epoch": 3.0459584544182543, "grad_norm": 0.6122378706932068, "learning_rate": 0.0004177631720719749, "loss": 3.4188, "step": 28300 }, { "epoch": 3.0513400064578624, "grad_norm": 0.6049924492835999, "learning_rate": 0.00041743993104191357, "loss": 3.4338, "step": 28350 }, { "epoch": 3.0567215584974705, "grad_norm": 0.6787021160125732, "learning_rate": 0.00041711669001185216, "loss": 3.4402, "step": 28400 }, { "epoch": 3.062103110537079, "grad_norm": 0.5759785771369934, "learning_rate": 0.0004167934489817907, "loss": 3.4375, "step": 28450 }, { "epoch": 3.067484662576687, "grad_norm": 0.6150079369544983, "learning_rate": 0.0004164702079517293, "loss": 3.4477, "step": 28500 }, { "epoch": 3.0728662146162953, "grad_norm": 0.6279264092445374, "learning_rate": 0.0004161469669216679, "loss": 3.4317, "step": 28550 }, { "epoch": 3.0782477666559034, "grad_norm": 0.6215957403182983, "learning_rate": 0.00041582372589160643, "loss": 3.4224, "step": 28600 }, { "epoch": 3.083629318695512, "grad_norm": 0.5486896634101868, "learning_rate": 0.0004155004848615451, "loss": 3.4304, "step": 28650 }, { "epoch": 3.08901087073512, "grad_norm": 0.6455966830253601, "learning_rate": 0.0004151772438314837, "loss": 3.4457, "step": 28700 }, { "epoch": 3.094392422774728, "grad_norm": 0.6167451739311218, "learning_rate": 0.0004148540028014222, "loss": 3.4454, "step": 28750 }, { "epoch": 3.0997739748143363, "grad_norm": 0.6451519131660461, "learning_rate": 0.0004145307617713608, "loss": 3.4316, "step": 28800 }, { "epoch": 3.105155526853945, "grad_norm": 0.6412652134895325, "learning_rate": 0.00041420752074129935, "loss": 3.4598, "step": 28850 }, { "epoch": 3.110537078893553, "grad_norm": 0.6315445899963379, "learning_rate": 0.0004138907445318392, "loss": 3.4571, "step": 28900 }, { "epoch": 3.115918630933161, "grad_norm": 0.6011728048324585, "learning_rate": 0.00041356750350177775, "loss": 3.46, "step": 28950 }, { "epoch": 3.121300182972769, "grad_norm": 0.6177191734313965, "learning_rate": 0.0004132442624717164, "loss": 3.4499, "step": 29000 }, { "epoch": 3.121300182972769, "eval_accuracy": 0.37099246046746165, "eval_loss": 3.4960525035858154, "eval_runtime": 184.1237, "eval_samples_per_second": 97.82, "eval_steps_per_second": 6.115, "step": 29000 }, { "epoch": 3.1266817350123777, "grad_norm": 0.5993657112121582, "learning_rate": 0.000412921021441655, "loss": 3.4354, "step": 29050 }, { "epoch": 3.132063287051986, "grad_norm": 0.6064768433570862, "learning_rate": 0.00041259778041159354, "loss": 3.4492, "step": 29100 }, { "epoch": 3.137444839091594, "grad_norm": 0.6318159103393555, "learning_rate": 0.00041227453938153213, "loss": 3.4588, "step": 29150 }, { "epoch": 3.1428263911312024, "grad_norm": 0.6009914875030518, "learning_rate": 0.00041195129835147067, "loss": 3.4479, "step": 29200 }, { "epoch": 3.1482079431708105, "grad_norm": 0.6485038995742798, "learning_rate": 0.00041162805732140927, "loss": 3.4391, "step": 29250 }, { "epoch": 3.1535894952104186, "grad_norm": 0.6196446418762207, "learning_rate": 0.0004113048162913479, "loss": 3.4599, "step": 29300 }, { "epoch": 3.1589710472500268, "grad_norm": 0.6178466081619263, "learning_rate": 0.00041098157526128646, "loss": 3.4616, "step": 29350 }, { "epoch": 3.1643525992896353, "grad_norm": 0.6390627026557922, "learning_rate": 0.00041065833423122505, "loss": 3.4476, "step": 29400 }, { "epoch": 3.1697341513292434, "grad_norm": 0.5989649891853333, "learning_rate": 0.00041033509320116365, "loss": 3.4473, "step": 29450 }, { "epoch": 3.1751157033688515, "grad_norm": 0.6324307918548584, "learning_rate": 0.0004100118521711022, "loss": 3.4502, "step": 29500 }, { "epoch": 3.1804972554084596, "grad_norm": 0.6045710444450378, "learning_rate": 0.0004096886111410408, "loss": 3.4651, "step": 29550 }, { "epoch": 3.185878807448068, "grad_norm": 0.6226058602333069, "learning_rate": 0.00040936537011097943, "loss": 3.464, "step": 29600 }, { "epoch": 3.1912603594876763, "grad_norm": 0.6163890361785889, "learning_rate": 0.00040904212908091797, "loss": 3.4579, "step": 29650 }, { "epoch": 3.1966419115272844, "grad_norm": 0.5751364231109619, "learning_rate": 0.00040871888805085656, "loss": 3.4731, "step": 29700 }, { "epoch": 3.2020234635668925, "grad_norm": 0.5690685510635376, "learning_rate": 0.0004083956470207951, "loss": 3.4605, "step": 29750 }, { "epoch": 3.207405015606501, "grad_norm": 0.6342244744300842, "learning_rate": 0.0004080724059907337, "loss": 3.4488, "step": 29800 }, { "epoch": 3.212786567646109, "grad_norm": 0.6221125721931458, "learning_rate": 0.00040774916496067235, "loss": 3.4581, "step": 29850 }, { "epoch": 3.2181681196857173, "grad_norm": 0.6162823438644409, "learning_rate": 0.0004074259239306109, "loss": 3.4603, "step": 29900 }, { "epoch": 3.2235496717253254, "grad_norm": 0.5885461568832397, "learning_rate": 0.0004071026829005495, "loss": 3.48, "step": 29950 }, { "epoch": 3.228931223764934, "grad_norm": 0.6322233080863953, "learning_rate": 0.0004067794418704881, "loss": 3.4656, "step": 30000 }, { "epoch": 3.228931223764934, "eval_accuracy": 0.3715372460469635, "eval_loss": 3.4913604259490967, "eval_runtime": 184.4667, "eval_samples_per_second": 97.638, "eval_steps_per_second": 6.104, "step": 30000 }, { "epoch": 3.234312775804542, "grad_norm": 0.6058706045150757, "learning_rate": 0.0004064562008404266, "loss": 3.4556, "step": 30050 }, { "epoch": 3.23969432784415, "grad_norm": 0.6229328513145447, "learning_rate": 0.0004061329598103652, "loss": 3.4518, "step": 30100 }, { "epoch": 3.2450758798837587, "grad_norm": 0.6209508180618286, "learning_rate": 0.00040580971878030386, "loss": 3.4723, "step": 30150 }, { "epoch": 3.250457431923367, "grad_norm": 0.5959570407867432, "learning_rate": 0.0004054864777502424, "loss": 3.4559, "step": 30200 }, { "epoch": 3.255838983962975, "grad_norm": 0.5910437107086182, "learning_rate": 0.000405163236720181, "loss": 3.4819, "step": 30250 }, { "epoch": 3.261220536002583, "grad_norm": 0.6679221987724304, "learning_rate": 0.00040483999569011954, "loss": 3.436, "step": 30300 }, { "epoch": 3.2666020880421915, "grad_norm": 0.6360255479812622, "learning_rate": 0.00040451675466005813, "loss": 3.4475, "step": 30350 }, { "epoch": 3.2719836400817996, "grad_norm": 0.626301646232605, "learning_rate": 0.0004041935136299967, "loss": 3.4569, "step": 30400 }, { "epoch": 3.2773651921214078, "grad_norm": 0.6203222274780273, "learning_rate": 0.0004038702725999353, "loss": 3.4598, "step": 30450 }, { "epoch": 3.282746744161016, "grad_norm": 0.621329665184021, "learning_rate": 0.0004035470315698739, "loss": 3.4508, "step": 30500 }, { "epoch": 3.2881282962006244, "grad_norm": 0.6210439801216125, "learning_rate": 0.0004032237905398125, "loss": 3.437, "step": 30550 }, { "epoch": 3.2935098482402325, "grad_norm": 0.6080983281135559, "learning_rate": 0.00040290054950975105, "loss": 3.4409, "step": 30600 }, { "epoch": 3.2988914002798406, "grad_norm": 0.5964152812957764, "learning_rate": 0.00040257730847968965, "loss": 3.4796, "step": 30650 }, { "epoch": 3.304272952319449, "grad_norm": 0.5952696204185486, "learning_rate": 0.0004022540674496283, "loss": 3.4681, "step": 30700 }, { "epoch": 3.3096545043590573, "grad_norm": 0.6171097755432129, "learning_rate": 0.00040193082641956684, "loss": 3.4438, "step": 30750 }, { "epoch": 3.3150360563986654, "grad_norm": 0.5834512114524841, "learning_rate": 0.00040160758538950543, "loss": 3.4634, "step": 30800 }, { "epoch": 3.3204176084382735, "grad_norm": 0.6528227925300598, "learning_rate": 0.00040128434435944397, "loss": 3.4639, "step": 30850 }, { "epoch": 3.3257991604778816, "grad_norm": 0.6397239565849304, "learning_rate": 0.00040096110332938257, "loss": 3.4721, "step": 30900 }, { "epoch": 3.33118071251749, "grad_norm": 0.6081625819206238, "learning_rate": 0.00040063786229932116, "loss": 3.4685, "step": 30950 }, { "epoch": 3.3365622645570983, "grad_norm": 0.5660403966903687, "learning_rate": 0.00040031462126925975, "loss": 3.439, "step": 31000 }, { "epoch": 3.3365622645570983, "eval_accuracy": 0.3725310940118025, "eval_loss": 3.482257604598999, "eval_runtime": 184.4002, "eval_samples_per_second": 97.673, "eval_steps_per_second": 6.106, "step": 31000 }, { "epoch": 3.3419438165967064, "grad_norm": 0.6404114365577698, "learning_rate": 0.00039999138023919835, "loss": 3.4664, "step": 31050 }, { "epoch": 3.347325368636315, "grad_norm": 0.5815680623054504, "learning_rate": 0.00039966813920913694, "loss": 3.4588, "step": 31100 }, { "epoch": 3.352706920675923, "grad_norm": 0.6458703875541687, "learning_rate": 0.0003993448981790755, "loss": 3.4425, "step": 31150 }, { "epoch": 3.358088472715531, "grad_norm": 0.6019287705421448, "learning_rate": 0.0003990216571490141, "loss": 3.4646, "step": 31200 }, { "epoch": 3.3634700247551392, "grad_norm": 0.5842714309692383, "learning_rate": 0.0003986984161189526, "loss": 3.4607, "step": 31250 }, { "epoch": 3.368851576794748, "grad_norm": 0.6253634095191956, "learning_rate": 0.00039837517508889127, "loss": 3.4699, "step": 31300 }, { "epoch": 3.374233128834356, "grad_norm": 0.6147831678390503, "learning_rate": 0.00039805193405882986, "loss": 3.4686, "step": 31350 }, { "epoch": 3.379614680873964, "grad_norm": 0.6719804406166077, "learning_rate": 0.0003977286930287684, "loss": 3.4555, "step": 31400 }, { "epoch": 3.384996232913572, "grad_norm": 0.6484226584434509, "learning_rate": 0.000397405451998707, "loss": 3.467, "step": 31450 }, { "epoch": 3.3903777849531807, "grad_norm": 0.5903288722038269, "learning_rate": 0.00039708221096864554, "loss": 3.4543, "step": 31500 }, { "epoch": 3.3957593369927888, "grad_norm": 0.6492658853530884, "learning_rate": 0.00039675896993858413, "loss": 3.4523, "step": 31550 }, { "epoch": 3.401140889032397, "grad_norm": 0.6091883182525635, "learning_rate": 0.0003964357289085228, "loss": 3.4493, "step": 31600 }, { "epoch": 3.4065224410720054, "grad_norm": 0.6458459496498108, "learning_rate": 0.0003961124878784613, "loss": 3.4534, "step": 31650 }, { "epoch": 3.4119039931116135, "grad_norm": 0.6110221743583679, "learning_rate": 0.0003957892468483999, "loss": 3.4401, "step": 31700 }, { "epoch": 3.4172855451512216, "grad_norm": 0.621113121509552, "learning_rate": 0.0003954660058183385, "loss": 3.4804, "step": 31750 }, { "epoch": 3.4226670971908297, "grad_norm": 0.5934743285179138, "learning_rate": 0.00039514276478827705, "loss": 3.4456, "step": 31800 }, { "epoch": 3.428048649230438, "grad_norm": 0.5857704877853394, "learning_rate": 0.0003948195237582157, "loss": 3.462, "step": 31850 }, { "epoch": 3.4334302012700464, "grad_norm": 0.6002525091171265, "learning_rate": 0.0003944962827281543, "loss": 3.4515, "step": 31900 }, { "epoch": 3.4388117533096545, "grad_norm": 0.6284528970718384, "learning_rate": 0.00039417304169809284, "loss": 3.4546, "step": 31950 }, { "epoch": 3.4441933053492626, "grad_norm": 0.6180424094200134, "learning_rate": 0.00039384980066803143, "loss": 3.4641, "step": 32000 }, { "epoch": 3.4441933053492626, "eval_accuracy": 0.3728800870874626, "eval_loss": 3.478074073791504, "eval_runtime": 184.2216, "eval_samples_per_second": 97.768, "eval_steps_per_second": 6.112, "step": 32000 }, { "epoch": 3.449574857388871, "grad_norm": 0.603389322757721, "learning_rate": 0.00039352655963796997, "loss": 3.463, "step": 32050 }, { "epoch": 3.4549564094284793, "grad_norm": 0.7080710530281067, "learning_rate": 0.00039320331860790857, "loss": 3.4712, "step": 32100 }, { "epoch": 3.4603379614680874, "grad_norm": 0.6117133498191833, "learning_rate": 0.0003928800775778472, "loss": 3.4771, "step": 32150 }, { "epoch": 3.4657195135076955, "grad_norm": 0.615259051322937, "learning_rate": 0.00039255683654778576, "loss": 3.4592, "step": 32200 }, { "epoch": 3.471101065547304, "grad_norm": 0.5946685671806335, "learning_rate": 0.00039223359551772435, "loss": 3.4508, "step": 32250 }, { "epoch": 3.476482617586912, "grad_norm": 0.6408491134643555, "learning_rate": 0.00039191035448766294, "loss": 3.4737, "step": 32300 }, { "epoch": 3.4818641696265202, "grad_norm": 0.6141444444656372, "learning_rate": 0.0003915871134576015, "loss": 3.4653, "step": 32350 }, { "epoch": 3.4872457216661283, "grad_norm": 0.6397346258163452, "learning_rate": 0.0003912638724275401, "loss": 3.4523, "step": 32400 }, { "epoch": 3.492627273705737, "grad_norm": 0.6699585914611816, "learning_rate": 0.00039094063139747873, "loss": 3.4735, "step": 32450 }, { "epoch": 3.498008825745345, "grad_norm": 0.5959972143173218, "learning_rate": 0.00039061739036741727, "loss": 3.4612, "step": 32500 }, { "epoch": 3.503390377784953, "grad_norm": 0.7005971074104309, "learning_rate": 0.00039029414933735586, "loss": 3.4549, "step": 32550 }, { "epoch": 3.5087719298245617, "grad_norm": 0.6877140402793884, "learning_rate": 0.0003899709083072944, "loss": 3.4651, "step": 32600 }, { "epoch": 3.5141534818641698, "grad_norm": 0.5952162146568298, "learning_rate": 0.000389647667277233, "loss": 3.4546, "step": 32650 }, { "epoch": 3.519535033903778, "grad_norm": 0.5997613668441772, "learning_rate": 0.00038932442624717165, "loss": 3.453, "step": 32700 }, { "epoch": 3.524916585943386, "grad_norm": 0.5949071645736694, "learning_rate": 0.0003890011852171102, "loss": 3.4683, "step": 32750 }, { "epoch": 3.530298137982994, "grad_norm": 0.6204602122306824, "learning_rate": 0.00038868440900765005, "loss": 3.4499, "step": 32800 }, { "epoch": 3.5356796900226026, "grad_norm": 0.6080209612846375, "learning_rate": 0.0003883611679775886, "loss": 3.4595, "step": 32850 }, { "epoch": 3.5410612420622107, "grad_norm": 0.5958392024040222, "learning_rate": 0.0003880379269475272, "loss": 3.4431, "step": 32900 }, { "epoch": 3.546442794101819, "grad_norm": 0.607470691204071, "learning_rate": 0.0003877146859174657, "loss": 3.4496, "step": 32950 }, { "epoch": 3.5518243461414274, "grad_norm": 0.6062714457511902, "learning_rate": 0.0003873914448874043, "loss": 3.4477, "step": 33000 }, { "epoch": 3.5518243461414274, "eval_accuracy": 0.3737313824634891, "eval_loss": 3.470710277557373, "eval_runtime": 184.4455, "eval_samples_per_second": 97.649, "eval_steps_per_second": 6.105, "step": 33000 }, { "epoch": 3.5572058981810355, "grad_norm": 0.5941749811172485, "learning_rate": 0.0003870682038573429, "loss": 3.4643, "step": 33050 }, { "epoch": 3.5625874502206436, "grad_norm": 0.6141754984855652, "learning_rate": 0.0003867449628272815, "loss": 3.4329, "step": 33100 }, { "epoch": 3.5679690022602517, "grad_norm": 0.636290967464447, "learning_rate": 0.0003864217217972201, "loss": 3.4552, "step": 33150 }, { "epoch": 3.57335055429986, "grad_norm": 0.6304817795753479, "learning_rate": 0.0003860984807671587, "loss": 3.4362, "step": 33200 }, { "epoch": 3.5787321063394684, "grad_norm": 0.6164857745170593, "learning_rate": 0.00038577523973709724, "loss": 3.464, "step": 33250 }, { "epoch": 3.5841136583790765, "grad_norm": 0.6616718173027039, "learning_rate": 0.00038545199870703583, "loss": 3.448, "step": 33300 }, { "epoch": 3.5894952104186846, "grad_norm": 0.6571164131164551, "learning_rate": 0.0003851287576769744, "loss": 3.4409, "step": 33350 }, { "epoch": 3.594876762458293, "grad_norm": 0.5911239981651306, "learning_rate": 0.000384805516646913, "loss": 3.4646, "step": 33400 }, { "epoch": 3.6002583144979012, "grad_norm": 0.6162347197532654, "learning_rate": 0.0003844822756168516, "loss": 3.4659, "step": 33450 }, { "epoch": 3.6056398665375093, "grad_norm": 0.6484450101852417, "learning_rate": 0.00038415903458679016, "loss": 3.4609, "step": 33500 }, { "epoch": 3.611021418577118, "grad_norm": 0.6605974435806274, "learning_rate": 0.00038383579355672875, "loss": 3.4658, "step": 33550 }, { "epoch": 3.616402970616726, "grad_norm": 0.6180010437965393, "learning_rate": 0.00038351255252666735, "loss": 3.4558, "step": 33600 }, { "epoch": 3.621784522656334, "grad_norm": 0.6921665072441101, "learning_rate": 0.00038318931149660594, "loss": 3.4691, "step": 33650 }, { "epoch": 3.627166074695942, "grad_norm": 0.679875373840332, "learning_rate": 0.00038286607046654454, "loss": 3.4446, "step": 33700 }, { "epoch": 3.6325476267355503, "grad_norm": 0.6239168643951416, "learning_rate": 0.00038254282943648313, "loss": 3.4549, "step": 33750 }, { "epoch": 3.637929178775159, "grad_norm": 0.6452910304069519, "learning_rate": 0.00038221958840642167, "loss": 3.4668, "step": 33800 }, { "epoch": 3.643310730814767, "grad_norm": 0.5900523662567139, "learning_rate": 0.00038189634737636027, "loss": 3.4546, "step": 33850 }, { "epoch": 3.648692282854375, "grad_norm": 0.601752519607544, "learning_rate": 0.0003815731063462988, "loss": 3.4508, "step": 33900 }, { "epoch": 3.6540738348939836, "grad_norm": 0.6239110231399536, "learning_rate": 0.00038124986531623745, "loss": 3.4629, "step": 33950 }, { "epoch": 3.6594553869335917, "grad_norm": 0.5961187481880188, "learning_rate": 0.00038092662428617605, "loss": 3.4634, "step": 34000 }, { "epoch": 3.6594553869335917, "eval_accuracy": 0.3742123887978438, "eval_loss": 3.4635941982269287, "eval_runtime": 184.555, "eval_samples_per_second": 97.592, "eval_steps_per_second": 6.101, "step": 34000 }, { "epoch": 3.6648369389732, "grad_norm": 0.6280834078788757, "learning_rate": 0.0003806033832561146, "loss": 3.4486, "step": 34050 }, { "epoch": 3.670218491012808, "grad_norm": 0.6199246048927307, "learning_rate": 0.0003802801422260532, "loss": 3.4554, "step": 34100 }, { "epoch": 3.675600043052416, "grad_norm": 0.6499938368797302, "learning_rate": 0.0003799569011959918, "loss": 3.4459, "step": 34150 }, { "epoch": 3.6809815950920246, "grad_norm": 0.6102513670921326, "learning_rate": 0.0003796336601659303, "loss": 3.4383, "step": 34200 }, { "epoch": 3.6863631471316327, "grad_norm": 0.7319421172142029, "learning_rate": 0.00037931041913586897, "loss": 3.46, "step": 34250 }, { "epoch": 3.691744699171241, "grad_norm": 0.6436998248100281, "learning_rate": 0.00037898717810580756, "loss": 3.4635, "step": 34300 }, { "epoch": 3.6971262512108494, "grad_norm": 0.631674587726593, "learning_rate": 0.0003786639370757461, "loss": 3.4544, "step": 34350 }, { "epoch": 3.7025078032504575, "grad_norm": 0.6169427037239075, "learning_rate": 0.0003783406960456847, "loss": 3.4477, "step": 34400 }, { "epoch": 3.7078893552900656, "grad_norm": 0.6570582985877991, "learning_rate": 0.00037801745501562324, "loss": 3.4533, "step": 34450 }, { "epoch": 3.713270907329674, "grad_norm": 0.6398195624351501, "learning_rate": 0.0003776942139855619, "loss": 3.4486, "step": 34500 }, { "epoch": 3.7186524593692822, "grad_norm": 0.6023803949356079, "learning_rate": 0.0003773709729555005, "loss": 3.4388, "step": 34550 }, { "epoch": 3.7240340114088903, "grad_norm": 0.6208220720291138, "learning_rate": 0.000377047731925439, "loss": 3.4474, "step": 34600 }, { "epoch": 3.7294155634484984, "grad_norm": 0.6167011857032776, "learning_rate": 0.0003767244908953776, "loss": 3.4569, "step": 34650 }, { "epoch": 3.7347971154881066, "grad_norm": 0.6336820721626282, "learning_rate": 0.0003764012498653162, "loss": 3.4324, "step": 34700 }, { "epoch": 3.740178667527715, "grad_norm": 0.6216785907745361, "learning_rate": 0.00037607800883525475, "loss": 3.4332, "step": 34750 }, { "epoch": 3.745560219567323, "grad_norm": 0.6031525135040283, "learning_rate": 0.0003757547678051934, "loss": 3.443, "step": 34800 }, { "epoch": 3.7509417716069313, "grad_norm": 0.6389471888542175, "learning_rate": 0.000375431526775132, "loss": 3.4647, "step": 34850 }, { "epoch": 3.75632332364654, "grad_norm": 0.6141842007637024, "learning_rate": 0.00037510828574507054, "loss": 3.4614, "step": 34900 }, { "epoch": 3.761704875686148, "grad_norm": 0.6175715327262878, "learning_rate": 0.00037478504471500913, "loss": 3.468, "step": 34950 }, { "epoch": 3.767086427725756, "grad_norm": 0.6579195261001587, "learning_rate": 0.00037446180368494767, "loss": 3.4601, "step": 35000 }, { "epoch": 3.767086427725756, "eval_accuracy": 0.3748731070086437, "eval_loss": 3.4568800926208496, "eval_runtime": 184.5768, "eval_samples_per_second": 97.58, "eval_steps_per_second": 6.1, "step": 35000 }, { "epoch": 3.772467979765364, "grad_norm": 0.679194986820221, "learning_rate": 0.00037413856265488627, "loss": 3.4666, "step": 35050 }, { "epoch": 3.7778495318049723, "grad_norm": 0.6251862049102783, "learning_rate": 0.0003738153216248249, "loss": 3.4462, "step": 35100 }, { "epoch": 3.783231083844581, "grad_norm": 0.6154874563217163, "learning_rate": 0.00037349208059476346, "loss": 3.4634, "step": 35150 }, { "epoch": 3.788612635884189, "grad_norm": 0.6381683349609375, "learning_rate": 0.00037316883956470205, "loss": 3.4257, "step": 35200 }, { "epoch": 3.793994187923797, "grad_norm": 0.6327068209648132, "learning_rate": 0.00037284559853464064, "loss": 3.4648, "step": 35250 }, { "epoch": 3.7993757399634056, "grad_norm": 0.6380037069320679, "learning_rate": 0.0003725223575045792, "loss": 3.4387, "step": 35300 }, { "epoch": 3.8047572920030137, "grad_norm": 0.6446463465690613, "learning_rate": 0.0003721991164745178, "loss": 3.4613, "step": 35350 }, { "epoch": 3.810138844042622, "grad_norm": 0.5962299704551697, "learning_rate": 0.00037187587544445643, "loss": 3.4573, "step": 35400 }, { "epoch": 3.8155203960822304, "grad_norm": 0.6741969585418701, "learning_rate": 0.00037155263441439497, "loss": 3.4624, "step": 35450 }, { "epoch": 3.8209019481218385, "grad_norm": 0.6184021830558777, "learning_rate": 0.00037122939338433356, "loss": 3.4521, "step": 35500 }, { "epoch": 3.8262835001614466, "grad_norm": 0.6197969913482666, "learning_rate": 0.0003709061523542721, "loss": 3.439, "step": 35550 }, { "epoch": 3.8316650522010547, "grad_norm": 0.6020095348358154, "learning_rate": 0.0003705829113242107, "loss": 3.4464, "step": 35600 }, { "epoch": 3.837046604240663, "grad_norm": 0.6370236873626709, "learning_rate": 0.00037025967029414935, "loss": 3.4541, "step": 35650 }, { "epoch": 3.8424281562802713, "grad_norm": 0.6173757314682007, "learning_rate": 0.0003699364292640879, "loss": 3.4623, "step": 35700 }, { "epoch": 3.8478097083198795, "grad_norm": 0.6351857781410217, "learning_rate": 0.0003696131882340265, "loss": 3.4341, "step": 35750 }, { "epoch": 3.8531912603594876, "grad_norm": 0.6064653396606445, "learning_rate": 0.0003692899472039651, "loss": 3.4478, "step": 35800 }, { "epoch": 3.858572812399096, "grad_norm": 0.6070151925086975, "learning_rate": 0.0003689667061739036, "loss": 3.4469, "step": 35850 }, { "epoch": 3.863954364438704, "grad_norm": 0.5651928782463074, "learning_rate": 0.0003686434651438422, "loss": 3.4358, "step": 35900 }, { "epoch": 3.8693359164783123, "grad_norm": 0.656566858291626, "learning_rate": 0.00036832022411378086, "loss": 3.4482, "step": 35950 }, { "epoch": 3.8747174685179204, "grad_norm": 0.6477782130241394, "learning_rate": 0.0003679969830837194, "loss": 3.4465, "step": 36000 }, { "epoch": 3.8747174685179204, "eval_accuracy": 0.37578970277024665, "eval_loss": 3.450225830078125, "eval_runtime": 184.5189, "eval_samples_per_second": 97.611, "eval_steps_per_second": 6.102, "step": 36000 }, { "epoch": 3.8800990205575285, "grad_norm": 0.6996077299118042, "learning_rate": 0.000367673742053658, "loss": 3.457, "step": 36050 }, { "epoch": 3.885480572597137, "grad_norm": 0.6297636032104492, "learning_rate": 0.00036735050102359654, "loss": 3.4306, "step": 36100 }, { "epoch": 3.890862124636745, "grad_norm": 0.6252796053886414, "learning_rate": 0.00036702725999353513, "loss": 3.4513, "step": 36150 }, { "epoch": 3.8962436766763533, "grad_norm": 0.6050940155982971, "learning_rate": 0.0003667040189634737, "loss": 3.4483, "step": 36200 }, { "epoch": 3.901625228715962, "grad_norm": 0.7060495615005493, "learning_rate": 0.0003663807779334123, "loss": 3.4399, "step": 36250 }, { "epoch": 3.90700678075557, "grad_norm": 0.6431135535240173, "learning_rate": 0.0003660575369033509, "loss": 3.4603, "step": 36300 }, { "epoch": 3.912388332795178, "grad_norm": 0.6965773701667786, "learning_rate": 0.0003657342958732895, "loss": 3.4541, "step": 36350 }, { "epoch": 3.9177698848347866, "grad_norm": 0.6067307591438293, "learning_rate": 0.00036541105484322805, "loss": 3.4427, "step": 36400 }, { "epoch": 3.9231514368743947, "grad_norm": 0.6552959084510803, "learning_rate": 0.00036508781381316665, "loss": 3.4517, "step": 36450 }, { "epoch": 3.928532988914003, "grad_norm": 0.6734021306037903, "learning_rate": 0.0003647645727831053, "loss": 3.4371, "step": 36500 }, { "epoch": 3.933914540953611, "grad_norm": 0.6328858137130737, "learning_rate": 0.00036444133175304384, "loss": 3.4594, "step": 36550 }, { "epoch": 3.939296092993219, "grad_norm": 0.6011826395988464, "learning_rate": 0.00036411809072298243, "loss": 3.4521, "step": 36600 }, { "epoch": 3.9446776450328276, "grad_norm": 0.5960487723350525, "learning_rate": 0.00036379484969292097, "loss": 3.4394, "step": 36650 }, { "epoch": 3.9500591970724357, "grad_norm": 0.6299543976783752, "learning_rate": 0.00036347160866285956, "loss": 3.4479, "step": 36700 }, { "epoch": 3.955440749112044, "grad_norm": 0.6453956365585327, "learning_rate": 0.0003631483676327981, "loss": 3.4389, "step": 36750 }, { "epoch": 3.9608223011516523, "grad_norm": 0.6371225118637085, "learning_rate": 0.00036282512660273675, "loss": 3.4605, "step": 36800 }, { "epoch": 3.9662038531912605, "grad_norm": 0.6646671891212463, "learning_rate": 0.0003625083503932765, "loss": 3.4381, "step": 36850 }, { "epoch": 3.9715854052308686, "grad_norm": 0.6253241896629333, "learning_rate": 0.00036218510936321516, "loss": 3.4385, "step": 36900 }, { "epoch": 3.9769669572704767, "grad_norm": 0.6180611252784729, "learning_rate": 0.00036186186833315375, "loss": 3.4442, "step": 36950 }, { "epoch": 3.9823485093100848, "grad_norm": 0.6421630382537842, "learning_rate": 0.0003615386273030923, "loss": 3.4361, "step": 37000 }, { "epoch": 3.9823485093100848, "eval_accuracy": 0.37607926271627223, "eval_loss": 3.4442827701568604, "eval_runtime": 184.2143, "eval_samples_per_second": 97.772, "eval_steps_per_second": 6.112, "step": 37000 }, { "epoch": 3.9877300613496933, "grad_norm": 0.6145128607749939, "learning_rate": 0.0003612153862730309, "loss": 3.419, "step": 37050 }, { "epoch": 3.9931116133893014, "grad_norm": 0.7820795178413391, "learning_rate": 0.0003608921452429695, "loss": 3.4654, "step": 37100 }, { "epoch": 3.9984931654289095, "grad_norm": 0.6979103684425354, "learning_rate": 0.000360568904212908, "loss": 3.4498, "step": 37150 }, { "epoch": 4.003874717468518, "grad_norm": 0.6150457262992859, "learning_rate": 0.0003602521280034478, "loss": 3.3612, "step": 37200 }, { "epoch": 4.009256269508126, "grad_norm": 0.6420441269874573, "learning_rate": 0.0003599288869733865, "loss": 3.3599, "step": 37250 }, { "epoch": 4.014637821547734, "grad_norm": 0.6551081538200378, "learning_rate": 0.00035960564594332507, "loss": 3.3618, "step": 37300 }, { "epoch": 4.020019373587343, "grad_norm": 0.675521969795227, "learning_rate": 0.0003592824049132636, "loss": 3.3518, "step": 37350 }, { "epoch": 4.0254009256269505, "grad_norm": 0.6582967638969421, "learning_rate": 0.0003589591638832022, "loss": 3.3649, "step": 37400 }, { "epoch": 4.030782477666559, "grad_norm": 0.7078763246536255, "learning_rate": 0.0003586359228531408, "loss": 3.365, "step": 37450 }, { "epoch": 4.036164029706168, "grad_norm": 0.6814618110656738, "learning_rate": 0.00035831268182307934, "loss": 3.3522, "step": 37500 }, { "epoch": 4.041545581745775, "grad_norm": 0.6573576331138611, "learning_rate": 0.000357989440793018, "loss": 3.3654, "step": 37550 }, { "epoch": 4.046927133785384, "grad_norm": 0.6965717077255249, "learning_rate": 0.0003576661997629566, "loss": 3.3548, "step": 37600 }, { "epoch": 4.0523086858249915, "grad_norm": 0.7456988096237183, "learning_rate": 0.0003573429587328951, "loss": 3.3753, "step": 37650 }, { "epoch": 4.0576902378646, "grad_norm": 0.6657203435897827, "learning_rate": 0.0003570197177028337, "loss": 3.365, "step": 37700 }, { "epoch": 4.063071789904209, "grad_norm": 0.6479413509368896, "learning_rate": 0.00035669647667277226, "loss": 3.3552, "step": 37750 }, { "epoch": 4.068453341943816, "grad_norm": 0.6535371541976929, "learning_rate": 0.00035637323564271085, "loss": 3.3671, "step": 37800 }, { "epoch": 4.073834893983425, "grad_norm": 0.6314700841903687, "learning_rate": 0.0003560499946126495, "loss": 3.38, "step": 37850 }, { "epoch": 4.079216446023033, "grad_norm": 0.6591801047325134, "learning_rate": 0.00035572675358258804, "loss": 3.383, "step": 37900 }, { "epoch": 4.084597998062641, "grad_norm": 0.6518027782440186, "learning_rate": 0.00035540351255252664, "loss": 3.3631, "step": 37950 }, { "epoch": 4.08997955010225, "grad_norm": 0.6759366393089294, "learning_rate": 0.00035508027152246523, "loss": 3.3675, "step": 38000 }, { "epoch": 4.08997955010225, "eval_accuracy": 0.376277336930758, "eval_loss": 3.448333978652954, "eval_runtime": 184.1804, "eval_samples_per_second": 97.79, "eval_steps_per_second": 6.114, "step": 38000 }, { "epoch": 4.095361102141858, "grad_norm": 0.6117898225784302, "learning_rate": 0.0003547570304924038, "loss": 3.3685, "step": 38050 }, { "epoch": 4.100742654181466, "grad_norm": 0.6886461973190308, "learning_rate": 0.0003544337894623424, "loss": 3.3651, "step": 38100 }, { "epoch": 4.106124206221074, "grad_norm": 0.638882040977478, "learning_rate": 0.000354110548432281, "loss": 3.3923, "step": 38150 }, { "epoch": 4.111505758260682, "grad_norm": 0.6541633605957031, "learning_rate": 0.00035378730740221956, "loss": 3.3628, "step": 38200 }, { "epoch": 4.1168873103002905, "grad_norm": 0.6548678278923035, "learning_rate": 0.00035346406637215815, "loss": 3.3591, "step": 38250 }, { "epoch": 4.122268862339899, "grad_norm": 0.7060560584068298, "learning_rate": 0.0003531408253420967, "loss": 3.3706, "step": 38300 }, { "epoch": 4.127650414379507, "grad_norm": 0.6243436336517334, "learning_rate": 0.0003528175843120353, "loss": 3.3658, "step": 38350 }, { "epoch": 4.133031966419115, "grad_norm": 0.6286110281944275, "learning_rate": 0.00035249434328197394, "loss": 3.374, "step": 38400 }, { "epoch": 4.138413518458724, "grad_norm": 0.7103957533836365, "learning_rate": 0.0003521711022519125, "loss": 3.3789, "step": 38450 }, { "epoch": 4.1437950704983315, "grad_norm": 0.6807283163070679, "learning_rate": 0.00035184786122185107, "loss": 3.3842, "step": 38500 }, { "epoch": 4.14917662253794, "grad_norm": 0.6512000560760498, "learning_rate": 0.00035152462019178967, "loss": 3.3571, "step": 38550 }, { "epoch": 4.154558174577549, "grad_norm": 0.7092454433441162, "learning_rate": 0.0003512013791617282, "loss": 3.3936, "step": 38600 }, { "epoch": 4.159939726617156, "grad_norm": 0.6051034927368164, "learning_rate": 0.0003508781381316668, "loss": 3.3721, "step": 38650 }, { "epoch": 4.165321278656765, "grad_norm": 0.6658530235290527, "learning_rate": 0.00035055489710160545, "loss": 3.3817, "step": 38700 }, { "epoch": 4.1707028306963725, "grad_norm": 0.6855853199958801, "learning_rate": 0.000350231656071544, "loss": 3.3761, "step": 38750 }, { "epoch": 4.176084382735981, "grad_norm": 0.6582157611846924, "learning_rate": 0.0003499084150414826, "loss": 3.3862, "step": 38800 }, { "epoch": 4.18146593477559, "grad_norm": 0.6760985851287842, "learning_rate": 0.0003495851740114211, "loss": 3.3724, "step": 38850 }, { "epoch": 4.186847486815197, "grad_norm": 0.6384249329566956, "learning_rate": 0.0003492619329813597, "loss": 3.377, "step": 38900 }, { "epoch": 4.192229038854806, "grad_norm": 0.6790486574172974, "learning_rate": 0.0003489386919512983, "loss": 3.3953, "step": 38950 }, { "epoch": 4.197610590894414, "grad_norm": 0.6869431734085083, "learning_rate": 0.0003486154509212369, "loss": 3.3805, "step": 39000 }, { "epoch": 4.197610590894414, "eval_accuracy": 0.3771245034970475, "eval_loss": 3.4426193237304688, "eval_runtime": 184.1684, "eval_samples_per_second": 97.796, "eval_steps_per_second": 6.114, "step": 39000 }, { "epoch": 4.202992142934022, "grad_norm": 0.6304237842559814, "learning_rate": 0.0003482922098911755, "loss": 3.3827, "step": 39050 }, { "epoch": 4.208373694973631, "grad_norm": 0.6254282593727112, "learning_rate": 0.0003479689688611141, "loss": 3.3818, "step": 39100 }, { "epoch": 4.213755247013238, "grad_norm": 0.6747856140136719, "learning_rate": 0.00034764572783105264, "loss": 3.3738, "step": 39150 }, { "epoch": 4.219136799052847, "grad_norm": 0.7349036931991577, "learning_rate": 0.00034732248680099123, "loss": 3.399, "step": 39200 }, { "epoch": 4.224518351092455, "grad_norm": 0.7066739201545715, "learning_rate": 0.0003469992457709299, "loss": 3.3911, "step": 39250 }, { "epoch": 4.229899903132063, "grad_norm": 0.6758204102516174, "learning_rate": 0.00034668246956146963, "loss": 3.3648, "step": 39300 }, { "epoch": 4.2352814551716715, "grad_norm": 0.6167644262313843, "learning_rate": 0.00034635922853140823, "loss": 3.3959, "step": 39350 }, { "epoch": 4.24066300721128, "grad_norm": 0.674502968788147, "learning_rate": 0.0003460359875013468, "loss": 3.3892, "step": 39400 }, { "epoch": 4.246044559250888, "grad_norm": 0.6334875226020813, "learning_rate": 0.0003457127464712854, "loss": 3.3715, "step": 39450 }, { "epoch": 4.251426111290496, "grad_norm": 0.6755365133285522, "learning_rate": 0.00034538950544122396, "loss": 3.3765, "step": 39500 }, { "epoch": 4.256807663330104, "grad_norm": 0.9597848057746887, "learning_rate": 0.00034506626441116255, "loss": 3.3569, "step": 39550 }, { "epoch": 4.2621892153697125, "grad_norm": 0.7055816650390625, "learning_rate": 0.0003447430233811011, "loss": 3.3675, "step": 39600 }, { "epoch": 4.267570767409321, "grad_norm": 0.6341124773025513, "learning_rate": 0.00034441978235103974, "loss": 3.3746, "step": 39650 }, { "epoch": 4.272952319448929, "grad_norm": 0.6686251759529114, "learning_rate": 0.00034409654132097834, "loss": 3.3784, "step": 39700 }, { "epoch": 4.278333871488537, "grad_norm": 0.6585057973861694, "learning_rate": 0.0003437733002909169, "loss": 3.3783, "step": 39750 }, { "epoch": 4.283715423528146, "grad_norm": 0.6461507678031921, "learning_rate": 0.00034345005926085547, "loss": 3.3977, "step": 39800 }, { "epoch": 4.2890969755677535, "grad_norm": 0.6735521554946899, "learning_rate": 0.00034312681823079407, "loss": 3.3969, "step": 39850 }, { "epoch": 4.294478527607362, "grad_norm": 0.63730788230896, "learning_rate": 0.00034280357720073266, "loss": 3.3707, "step": 39900 }, { "epoch": 4.299860079646971, "grad_norm": 0.6699662804603577, "learning_rate": 0.00034248033617067126, "loss": 3.3988, "step": 39950 }, { "epoch": 4.305241631686578, "grad_norm": 0.7002863883972168, "learning_rate": 0.00034215709514060985, "loss": 3.3918, "step": 40000 }, { "epoch": 4.305241631686578, "eval_accuracy": 0.37718447989112114, "eval_loss": 3.4400510787963867, "eval_runtime": 184.1552, "eval_samples_per_second": 97.803, "eval_steps_per_second": 6.114, "step": 40000 }, { "epoch": 4.310623183726187, "grad_norm": 0.6464648246765137, "learning_rate": 0.0003418338541105484, "loss": 3.3784, "step": 40050 }, { "epoch": 4.3160047357657945, "grad_norm": 0.6864054799079895, "learning_rate": 0.000341510613080487, "loss": 3.3877, "step": 40100 }, { "epoch": 4.321386287805403, "grad_norm": 0.6681678891181946, "learning_rate": 0.0003411873720504255, "loss": 3.3802, "step": 40150 }, { "epoch": 4.326767839845012, "grad_norm": 0.6899495720863342, "learning_rate": 0.0003408641310203642, "loss": 3.3858, "step": 40200 }, { "epoch": 4.332149391884619, "grad_norm": 0.6967451572418213, "learning_rate": 0.00034054088999030277, "loss": 3.3822, "step": 40250 }, { "epoch": 4.337530943924228, "grad_norm": 0.6746821403503418, "learning_rate": 0.0003402176489602413, "loss": 3.3931, "step": 40300 }, { "epoch": 4.342912495963836, "grad_norm": 0.6436692476272583, "learning_rate": 0.0003398944079301799, "loss": 3.3982, "step": 40350 }, { "epoch": 4.348294048003444, "grad_norm": 0.6327050924301147, "learning_rate": 0.0003395711669001185, "loss": 3.3927, "step": 40400 }, { "epoch": 4.3536756000430525, "grad_norm": 0.6891005039215088, "learning_rate": 0.00033924792587005704, "loss": 3.3809, "step": 40450 }, { "epoch": 4.359057152082661, "grad_norm": 0.6955647468566895, "learning_rate": 0.0003389246848399957, "loss": 3.4006, "step": 40500 }, { "epoch": 4.364438704122269, "grad_norm": 0.7198289036750793, "learning_rate": 0.0003386014438099343, "loss": 3.3773, "step": 40550 }, { "epoch": 4.369820256161877, "grad_norm": 0.6177657246589661, "learning_rate": 0.0003382782027798728, "loss": 3.3723, "step": 40600 }, { "epoch": 4.375201808201485, "grad_norm": 0.6719635725021362, "learning_rate": 0.0003379549617498114, "loss": 3.3783, "step": 40650 }, { "epoch": 4.3805833602410935, "grad_norm": 0.63397616147995, "learning_rate": 0.00033763172071974996, "loss": 3.4042, "step": 40700 }, { "epoch": 4.385964912280702, "grad_norm": 0.6645906567573547, "learning_rate": 0.00033730847968968855, "loss": 3.3853, "step": 40750 }, { "epoch": 4.39134646432031, "grad_norm": 0.6917778253555298, "learning_rate": 0.0003369852386596272, "loss": 3.3921, "step": 40800 }, { "epoch": 4.396728016359918, "grad_norm": 0.6736299395561218, "learning_rate": 0.00033666199762956574, "loss": 3.387, "step": 40850 }, { "epoch": 4.402109568399527, "grad_norm": 0.7093892097473145, "learning_rate": 0.00033633875659950434, "loss": 3.3989, "step": 40900 }, { "epoch": 4.4074911204391345, "grad_norm": 0.6806743741035461, "learning_rate": 0.0003360155155694429, "loss": 3.4023, "step": 40950 }, { "epoch": 4.412872672478743, "grad_norm": 0.6926308870315552, "learning_rate": 0.0003356922745393815, "loss": 3.3932, "step": 41000 }, { "epoch": 4.412872672478743, "eval_accuracy": 0.37758030236142925, "eval_loss": 3.4347708225250244, "eval_runtime": 184.1303, "eval_samples_per_second": 97.817, "eval_steps_per_second": 6.115, "step": 41000 }, { "epoch": 4.418254224518351, "grad_norm": 0.6390978693962097, "learning_rate": 0.0003353690335093201, "loss": 3.382, "step": 41050 }, { "epoch": 4.423635776557959, "grad_norm": 0.6638002395629883, "learning_rate": 0.0003350457924792587, "loss": 3.3888, "step": 41100 }, { "epoch": 4.429017328597568, "grad_norm": 0.7220461964607239, "learning_rate": 0.00033472255144919726, "loss": 3.3918, "step": 41150 }, { "epoch": 4.4343988806371755, "grad_norm": 0.6852858066558838, "learning_rate": 0.00033439931041913585, "loss": 3.3836, "step": 41200 }, { "epoch": 4.439780432676784, "grad_norm": 0.6737593412399292, "learning_rate": 0.0003340760693890744, "loss": 3.3965, "step": 41250 }, { "epoch": 4.445161984716393, "grad_norm": 0.6659989356994629, "learning_rate": 0.000333752828359013, "loss": 3.3693, "step": 41300 }, { "epoch": 4.450543536756, "grad_norm": 0.6684435606002808, "learning_rate": 0.00033342958732895164, "loss": 3.3867, "step": 41350 }, { "epoch": 4.455925088795609, "grad_norm": 0.6846572756767273, "learning_rate": 0.0003331063462988902, "loss": 3.3901, "step": 41400 }, { "epoch": 4.461306640835216, "grad_norm": 0.6321297883987427, "learning_rate": 0.00033278310526882877, "loss": 3.3878, "step": 41450 }, { "epoch": 4.466688192874825, "grad_norm": 0.6980935335159302, "learning_rate": 0.0003324598642387673, "loss": 3.3906, "step": 41500 }, { "epoch": 4.4720697449144335, "grad_norm": 0.6567344665527344, "learning_rate": 0.0003321366232087059, "loss": 3.3924, "step": 41550 }, { "epoch": 4.477451296954041, "grad_norm": 0.6956687569618225, "learning_rate": 0.0003318133821786445, "loss": 3.3914, "step": 41600 }, { "epoch": 4.48283284899365, "grad_norm": 0.6631964445114136, "learning_rate": 0.0003314901411485831, "loss": 3.3817, "step": 41650 }, { "epoch": 4.488214401033258, "grad_norm": 0.7194749116897583, "learning_rate": 0.0003311669001185217, "loss": 3.3983, "step": 41700 }, { "epoch": 4.493595953072866, "grad_norm": 0.6941500306129456, "learning_rate": 0.0003308501239090615, "loss": 3.3616, "step": 41750 }, { "epoch": 4.4989775051124745, "grad_norm": 0.6804183125495911, "learning_rate": 0.0003305268828790001, "loss": 3.3995, "step": 41800 }, { "epoch": 4.504359057152083, "grad_norm": 0.6385798454284668, "learning_rate": 0.0003302036418489387, "loss": 3.3957, "step": 41850 }, { "epoch": 4.509740609191691, "grad_norm": 0.6700449585914612, "learning_rate": 0.0003298804008188772, "loss": 3.3992, "step": 41900 }, { "epoch": 4.515122161231299, "grad_norm": 0.6467819809913635, "learning_rate": 0.0003295571597888158, "loss": 3.3942, "step": 41950 }, { "epoch": 4.520503713270907, "grad_norm": 0.7048128843307495, "learning_rate": 0.00032923391875875447, "loss": 3.3812, "step": 42000 }, { "epoch": 4.520503713270907, "eval_accuracy": 0.37840237011063366, "eval_loss": 3.4284253120422363, "eval_runtime": 183.9766, "eval_samples_per_second": 97.898, "eval_steps_per_second": 6.12, "step": 42000 }, { "epoch": 4.5258852653105155, "grad_norm": 0.655368983745575, "learning_rate": 0.000328910677728693, "loss": 3.3768, "step": 42050 }, { "epoch": 4.531266817350124, "grad_norm": 0.6865506768226624, "learning_rate": 0.0003285874366986316, "loss": 3.3763, "step": 42100 }, { "epoch": 4.536648369389732, "grad_norm": 0.6689867377281189, "learning_rate": 0.00032826419566857015, "loss": 3.3731, "step": 42150 }, { "epoch": 4.54202992142934, "grad_norm": 0.8594128489494324, "learning_rate": 0.00032794095463850874, "loss": 3.4079, "step": 42200 }, { "epoch": 4.547411473468949, "grad_norm": 0.6911894083023071, "learning_rate": 0.0003276177136084473, "loss": 3.3957, "step": 42250 }, { "epoch": 4.5527930255085565, "grad_norm": 0.6454538702964783, "learning_rate": 0.00032729447257838593, "loss": 3.3836, "step": 42300 }, { "epoch": 4.558174577548165, "grad_norm": 0.7313205599784851, "learning_rate": 0.0003269712315483245, "loss": 3.3608, "step": 42350 }, { "epoch": 4.563556129587774, "grad_norm": 0.663030207157135, "learning_rate": 0.00032664799051826306, "loss": 3.3747, "step": 42400 }, { "epoch": 4.568937681627381, "grad_norm": 0.6612809896469116, "learning_rate": 0.00032632474948820166, "loss": 3.3845, "step": 42450 }, { "epoch": 4.57431923366699, "grad_norm": 0.714770495891571, "learning_rate": 0.00032600150845814025, "loss": 3.3781, "step": 42500 }, { "epoch": 4.579700785706597, "grad_norm": 0.671267032623291, "learning_rate": 0.0003256782674280788, "loss": 3.3893, "step": 42550 }, { "epoch": 4.585082337746206, "grad_norm": 0.6506121754646301, "learning_rate": 0.00032535502639801744, "loss": 3.3947, "step": 42600 }, { "epoch": 4.5904638897858145, "grad_norm": 0.6803109645843506, "learning_rate": 0.00032503178536795604, "loss": 3.4182, "step": 42650 }, { "epoch": 4.595845441825422, "grad_norm": 0.6810559034347534, "learning_rate": 0.0003247085443378946, "loss": 3.3943, "step": 42700 }, { "epoch": 4.601226993865031, "grad_norm": 0.6830928921699524, "learning_rate": 0.0003243853033078332, "loss": 3.3853, "step": 42750 }, { "epoch": 4.606608545904638, "grad_norm": 0.6540989279747009, "learning_rate": 0.0003240620622777717, "loss": 3.3672, "step": 42800 }, { "epoch": 4.611990097944247, "grad_norm": 0.6703516244888306, "learning_rate": 0.00032373882124771036, "loss": 3.4006, "step": 42850 }, { "epoch": 4.6173716499838555, "grad_norm": 0.7216318249702454, "learning_rate": 0.00032341558021764896, "loss": 3.3766, "step": 42900 }, { "epoch": 4.622753202023463, "grad_norm": 0.7245146036148071, "learning_rate": 0.0003230923391875875, "loss": 3.4101, "step": 42950 }, { "epoch": 4.628134754063072, "grad_norm": 0.750889778137207, "learning_rate": 0.0003227690981575261, "loss": 3.3834, "step": 43000 }, { "epoch": 4.628134754063072, "eval_accuracy": 0.3787263730220964, "eval_loss": 3.4243762493133545, "eval_runtime": 184.4247, "eval_samples_per_second": 97.66, "eval_steps_per_second": 6.105, "step": 43000 }, { "epoch": 4.63351630610268, "grad_norm": 0.6889617443084717, "learning_rate": 0.0003224458571274647, "loss": 3.3897, "step": 43050 }, { "epoch": 4.638897858142288, "grad_norm": 0.6792300343513489, "learning_rate": 0.00032212261609740323, "loss": 3.3808, "step": 43100 }, { "epoch": 4.6442794101818965, "grad_norm": 0.7432804107666016, "learning_rate": 0.0003217993750673419, "loss": 3.394, "step": 43150 }, { "epoch": 4.649660962221505, "grad_norm": 0.6786441206932068, "learning_rate": 0.00032147613403728047, "loss": 3.3971, "step": 43200 }, { "epoch": 4.655042514261113, "grad_norm": 0.6685068607330322, "learning_rate": 0.000321152893007219, "loss": 3.3921, "step": 43250 }, { "epoch": 4.660424066300721, "grad_norm": 0.661141037940979, "learning_rate": 0.0003208296519771576, "loss": 3.4016, "step": 43300 }, { "epoch": 4.665805618340329, "grad_norm": 0.6769970059394836, "learning_rate": 0.00032050641094709615, "loss": 3.3853, "step": 43350 }, { "epoch": 4.6711871703799375, "grad_norm": 0.6532602906227112, "learning_rate": 0.00032018316991703474, "loss": 3.4007, "step": 43400 }, { "epoch": 4.676568722419546, "grad_norm": 0.6924940347671509, "learning_rate": 0.0003198599288869734, "loss": 3.3916, "step": 43450 }, { "epoch": 4.681950274459154, "grad_norm": 0.6717170476913452, "learning_rate": 0.00031953668785691193, "loss": 3.3911, "step": 43500 }, { "epoch": 4.687331826498762, "grad_norm": 0.6337043642997742, "learning_rate": 0.0003192134468268505, "loss": 3.3745, "step": 43550 }, { "epoch": 4.692713378538371, "grad_norm": 0.7014371156692505, "learning_rate": 0.0003188902057967891, "loss": 3.3704, "step": 43600 }, { "epoch": 4.6980949305779784, "grad_norm": 0.6753278374671936, "learning_rate": 0.00031856696476672766, "loss": 3.3941, "step": 43650 }, { "epoch": 4.703476482617587, "grad_norm": 0.6636554002761841, "learning_rate": 0.0003182437237366663, "loss": 3.3875, "step": 43700 }, { "epoch": 4.7088580346571955, "grad_norm": 0.712126612663269, "learning_rate": 0.0003179204827066049, "loss": 3.3954, "step": 43750 }, { "epoch": 4.714239586696803, "grad_norm": 0.7265583276748657, "learning_rate": 0.00031759724167654344, "loss": 3.3831, "step": 43800 }, { "epoch": 4.719621138736412, "grad_norm": 0.664732038974762, "learning_rate": 0.0003172804654670833, "loss": 3.3932, "step": 43850 }, { "epoch": 4.725002690776019, "grad_norm": 0.7312369346618652, "learning_rate": 0.00031695722443702185, "loss": 3.3942, "step": 43900 }, { "epoch": 4.730384242815628, "grad_norm": 0.6675333976745605, "learning_rate": 0.00031663398340696044, "loss": 3.379, "step": 43950 }, { "epoch": 4.7357657948552365, "grad_norm": 0.6752867698669434, "learning_rate": 0.000316310742376899, "loss": 3.3815, "step": 44000 }, { "epoch": 4.7357657948552365, "eval_accuracy": 0.37960906908270126, "eval_loss": 3.4186949729919434, "eval_runtime": 184.2597, "eval_samples_per_second": 97.748, "eval_steps_per_second": 6.111, "step": 44000 }, { "epoch": 4.741147346894844, "grad_norm": 0.6951192021369934, "learning_rate": 0.0003159875013468376, "loss": 3.4031, "step": 44050 }, { "epoch": 4.746528898934453, "grad_norm": 0.6500643491744995, "learning_rate": 0.0003156642603167762, "loss": 3.3813, "step": 44100 }, { "epoch": 4.751910450974061, "grad_norm": 0.661619246006012, "learning_rate": 0.00031534101928671476, "loss": 3.3873, "step": 44150 }, { "epoch": 4.757292003013669, "grad_norm": 0.6603606939315796, "learning_rate": 0.00031501777825665336, "loss": 3.3996, "step": 44200 }, { "epoch": 4.7626735550532775, "grad_norm": 0.7476078867912292, "learning_rate": 0.0003146945372265919, "loss": 3.3959, "step": 44250 }, { "epoch": 4.768055107092886, "grad_norm": 0.6750323176383972, "learning_rate": 0.0003143712961965305, "loss": 3.3787, "step": 44300 }, { "epoch": 4.773436659132494, "grad_norm": 0.680514931678772, "learning_rate": 0.0003140480551664691, "loss": 3.396, "step": 44350 }, { "epoch": 4.778818211172102, "grad_norm": 0.6929029822349548, "learning_rate": 0.0003137248141364077, "loss": 3.4004, "step": 44400 }, { "epoch": 4.78419976321171, "grad_norm": 0.6420599222183228, "learning_rate": 0.0003134015731063463, "loss": 3.3965, "step": 44450 }, { "epoch": 4.7895813152513185, "grad_norm": 0.6655126214027405, "learning_rate": 0.0003130783320762849, "loss": 3.3778, "step": 44500 }, { "epoch": 4.794962867290927, "grad_norm": 0.7804667949676514, "learning_rate": 0.0003127550910462234, "loss": 3.4013, "step": 44550 }, { "epoch": 4.800344419330535, "grad_norm": 0.6613391041755676, "learning_rate": 0.000312431850016162, "loss": 3.3993, "step": 44600 }, { "epoch": 4.805725971370143, "grad_norm": 0.6758144497871399, "learning_rate": 0.00031210860898610066, "loss": 3.3911, "step": 44650 }, { "epoch": 4.811107523409751, "grad_norm": 0.7025032639503479, "learning_rate": 0.0003117853679560392, "loss": 3.4009, "step": 44700 }, { "epoch": 4.8164890754493594, "grad_norm": 0.6800017356872559, "learning_rate": 0.0003114621269259778, "loss": 3.406, "step": 44750 }, { "epoch": 4.821870627488968, "grad_norm": 0.6814872026443481, "learning_rate": 0.00031113888589591633, "loss": 3.3945, "step": 44800 }, { "epoch": 4.827252179528576, "grad_norm": 0.6770219802856445, "learning_rate": 0.00031081564486585493, "loss": 3.392, "step": 44850 }, { "epoch": 4.832633731568184, "grad_norm": 0.6795842051506042, "learning_rate": 0.0003104924038357935, "loss": 3.3917, "step": 44900 }, { "epoch": 4.838015283607793, "grad_norm": 0.6808043122291565, "learning_rate": 0.0003101691628057321, "loss": 3.3903, "step": 44950 }, { "epoch": 4.8433968356474, "grad_norm": 0.6572183966636658, "learning_rate": 0.0003098459217756707, "loss": 3.388, "step": 45000 }, { "epoch": 4.8433968356474, "eval_accuracy": 0.3798020366114598, "eval_loss": 3.412498712539673, "eval_runtime": 184.0046, "eval_samples_per_second": 97.883, "eval_steps_per_second": 6.119, "step": 45000 }, { "epoch": 4.848778387687009, "grad_norm": 0.7136537432670593, "learning_rate": 0.0003095226807456093, "loss": 3.397, "step": 45050 }, { "epoch": 4.8541599397266175, "grad_norm": 0.7045766711235046, "learning_rate": 0.00030919943971554785, "loss": 3.3891, "step": 45100 }, { "epoch": 4.859541491766225, "grad_norm": 0.6693941354751587, "learning_rate": 0.00030887619868548644, "loss": 3.3829, "step": 45150 }, { "epoch": 4.864923043805834, "grad_norm": 0.6665346622467041, "learning_rate": 0.000308552957655425, "loss": 3.3892, "step": 45200 }, { "epoch": 4.870304595845441, "grad_norm": 0.6944468021392822, "learning_rate": 0.00030822971662536363, "loss": 3.3953, "step": 45250 }, { "epoch": 4.87568614788505, "grad_norm": 0.6128963828086853, "learning_rate": 0.0003079064755953022, "loss": 3.3913, "step": 45300 }, { "epoch": 4.8810676999246585, "grad_norm": 0.6657466292381287, "learning_rate": 0.00030758323456524077, "loss": 3.387, "step": 45350 }, { "epoch": 4.886449251964266, "grad_norm": 0.6997761726379395, "learning_rate": 0.00030725999353517936, "loss": 3.3944, "step": 45400 }, { "epoch": 4.891830804003875, "grad_norm": 0.6384490132331848, "learning_rate": 0.00030693675250511795, "loss": 3.3733, "step": 45450 }, { "epoch": 4.897212356043483, "grad_norm": 0.6642307043075562, "learning_rate": 0.00030661351147505655, "loss": 3.3937, "step": 45500 }, { "epoch": 4.902593908083091, "grad_norm": 0.6693574786186218, "learning_rate": 0.00030629027044499514, "loss": 3.3965, "step": 45550 }, { "epoch": 4.9079754601226995, "grad_norm": 0.6383407711982727, "learning_rate": 0.00030596702941493374, "loss": 3.3776, "step": 45600 }, { "epoch": 4.913357012162308, "grad_norm": 0.6768180727958679, "learning_rate": 0.0003056437883848723, "loss": 3.4013, "step": 45650 }, { "epoch": 4.918738564201916, "grad_norm": 0.7400315403938293, "learning_rate": 0.0003053205473548109, "loss": 3.3948, "step": 45700 }, { "epoch": 4.924120116241524, "grad_norm": 0.6887032985687256, "learning_rate": 0.0003049973063247494, "loss": 3.3877, "step": 45750 }, { "epoch": 4.929501668281132, "grad_norm": 0.656160831451416, "learning_rate": 0.00030467406529468806, "loss": 3.3705, "step": 45800 }, { "epoch": 4.9348832203207404, "grad_norm": 0.6466988325119019, "learning_rate": 0.00030435082426462666, "loss": 3.3793, "step": 45850 }, { "epoch": 4.940264772360349, "grad_norm": 0.6337040662765503, "learning_rate": 0.0003040275832345652, "loss": 3.3951, "step": 45900 }, { "epoch": 4.945646324399957, "grad_norm": 0.6839388608932495, "learning_rate": 0.0003037043422045038, "loss": 3.3859, "step": 45950 }, { "epoch": 4.951027876439565, "grad_norm": 0.6461491584777832, "learning_rate": 0.0003033811011744424, "loss": 3.3858, "step": 46000 }, { "epoch": 4.951027876439565, "eval_accuracy": 0.38047948736698306, "eval_loss": 3.4082107543945312, "eval_runtime": 184.0969, "eval_samples_per_second": 97.834, "eval_steps_per_second": 6.116, "step": 46000 }, { "epoch": 4.956409428479174, "grad_norm": 0.6734457612037659, "learning_rate": 0.00030305786014438093, "loss": 3.3711, "step": 46050 }, { "epoch": 4.961790980518781, "grad_norm": 0.6695023775100708, "learning_rate": 0.0003027346191143196, "loss": 3.3819, "step": 46100 }, { "epoch": 4.96717253255839, "grad_norm": 0.6604933142662048, "learning_rate": 0.00030241137808425817, "loss": 3.3865, "step": 46150 }, { "epoch": 4.9725540845979985, "grad_norm": 0.6744388937950134, "learning_rate": 0.0003020881370541967, "loss": 3.3829, "step": 46200 }, { "epoch": 4.977935636637606, "grad_norm": 0.6832672953605652, "learning_rate": 0.0003017648960241353, "loss": 3.3886, "step": 46250 }, { "epoch": 4.983317188677215, "grad_norm": 0.6833645701408386, "learning_rate": 0.0003014481198146751, "loss": 3.3784, "step": 46300 }, { "epoch": 4.988698740716822, "grad_norm": 0.6692368984222412, "learning_rate": 0.0003011248787846137, "loss": 3.3824, "step": 46350 }, { "epoch": 4.994080292756431, "grad_norm": 0.6866097450256348, "learning_rate": 0.00030080163775455225, "loss": 3.3944, "step": 46400 }, { "epoch": 4.9994618447960395, "grad_norm": 0.6605845093727112, "learning_rate": 0.0003004783967244909, "loss": 3.3655, "step": 46450 }, { "epoch": 5.004843396835647, "grad_norm": 0.6645815968513489, "learning_rate": 0.0003001551556944295, "loss": 3.2941, "step": 46500 }, { "epoch": 5.010224948875256, "grad_norm": 0.6937685608863831, "learning_rate": 0.00029983191466436803, "loss": 3.2964, "step": 46550 }, { "epoch": 5.015606500914864, "grad_norm": 0.6456879377365112, "learning_rate": 0.0002995086736343066, "loss": 3.2849, "step": 46600 }, { "epoch": 5.020988052954472, "grad_norm": 0.7456214427947998, "learning_rate": 0.00029918543260424517, "loss": 3.3028, "step": 46650 }, { "epoch": 5.0263696049940805, "grad_norm": 0.6754710674285889, "learning_rate": 0.0002988621915741838, "loss": 3.2735, "step": 46700 }, { "epoch": 5.031751157033688, "grad_norm": 0.6434471011161804, "learning_rate": 0.00029853895054412236, "loss": 3.3039, "step": 46750 }, { "epoch": 5.037132709073297, "grad_norm": 0.6574273705482483, "learning_rate": 0.00029821570951406095, "loss": 3.3082, "step": 46800 }, { "epoch": 5.042514261112905, "grad_norm": 0.7584052085876465, "learning_rate": 0.00029789246848399955, "loss": 3.3018, "step": 46850 }, { "epoch": 5.047895813152513, "grad_norm": 0.6995671391487122, "learning_rate": 0.00029756922745393814, "loss": 3.3098, "step": 46900 }, { "epoch": 5.0532773651921215, "grad_norm": 0.7098338007926941, "learning_rate": 0.00029724598642387674, "loss": 3.2943, "step": 46950 }, { "epoch": 5.05865891723173, "grad_norm": 0.7222293019294739, "learning_rate": 0.0002969227453938153, "loss": 3.3062, "step": 47000 }, { "epoch": 5.05865891723173, "eval_accuracy": 0.38046797016087475, "eval_loss": 3.412203073501587, "eval_runtime": 184.1726, "eval_samples_per_second": 97.794, "eval_steps_per_second": 6.114, "step": 47000 }, { "epoch": 5.064040469271338, "grad_norm": 0.7070134282112122, "learning_rate": 0.00029659950436375387, "loss": 3.316, "step": 47050 }, { "epoch": 5.069422021310946, "grad_norm": 0.7262149453163147, "learning_rate": 0.00029627626333369246, "loss": 3.2856, "step": 47100 }, { "epoch": 5.074803573350554, "grad_norm": 0.7377148270606995, "learning_rate": 0.00029595302230363106, "loss": 3.3034, "step": 47150 }, { "epoch": 5.080185125390162, "grad_norm": 0.6759535074234009, "learning_rate": 0.0002956297812735696, "loss": 3.3016, "step": 47200 }, { "epoch": 5.085566677429771, "grad_norm": 0.6767816543579102, "learning_rate": 0.00029530654024350825, "loss": 3.316, "step": 47250 }, { "epoch": 5.090948229469379, "grad_norm": 0.6840730905532837, "learning_rate": 0.0002949832992134468, "loss": 3.3165, "step": 47300 }, { "epoch": 5.096329781508987, "grad_norm": 0.7208547592163086, "learning_rate": 0.0002946600581833854, "loss": 3.3314, "step": 47350 }, { "epoch": 5.101711333548596, "grad_norm": 0.766435444355011, "learning_rate": 0.000294336817153324, "loss": 3.3041, "step": 47400 }, { "epoch": 5.107092885588203, "grad_norm": 0.6821116209030151, "learning_rate": 0.0002940135761232626, "loss": 3.3046, "step": 47450 }, { "epoch": 5.112474437627812, "grad_norm": 0.6808845400810242, "learning_rate": 0.0002936903350932011, "loss": 3.317, "step": 47500 }, { "epoch": 5.1178559896674205, "grad_norm": 0.6992238163948059, "learning_rate": 0.0002933670940631397, "loss": 3.308, "step": 47550 }, { "epoch": 5.123237541707028, "grad_norm": 0.6857419610023499, "learning_rate": 0.0002930438530330783, "loss": 3.3146, "step": 47600 }, { "epoch": 5.128619093746637, "grad_norm": 0.7254018783569336, "learning_rate": 0.0002927206120030169, "loss": 3.3138, "step": 47650 }, { "epoch": 5.134000645786244, "grad_norm": 0.7838053107261658, "learning_rate": 0.0002923973709729555, "loss": 3.3054, "step": 47700 }, { "epoch": 5.139382197825853, "grad_norm": 0.721123456954956, "learning_rate": 0.00029207412994289403, "loss": 3.3308, "step": 47750 }, { "epoch": 5.1447637498654615, "grad_norm": 0.6714949011802673, "learning_rate": 0.0002917508889128327, "loss": 3.3227, "step": 47800 }, { "epoch": 5.150145301905069, "grad_norm": 0.7410764694213867, "learning_rate": 0.0002914276478827712, "loss": 3.3237, "step": 47850 }, { "epoch": 5.155526853944678, "grad_norm": 0.6743694543838501, "learning_rate": 0.0002911044068527098, "loss": 3.3191, "step": 47900 }, { "epoch": 5.160908405984286, "grad_norm": 0.7283504009246826, "learning_rate": 0.0002907811658226484, "loss": 3.3293, "step": 47950 }, { "epoch": 5.166289958023894, "grad_norm": 0.6825482845306396, "learning_rate": 0.000290457924792587, "loss": 3.3199, "step": 48000 }, { "epoch": 5.166289958023894, "eval_accuracy": 0.3809088835796259, "eval_loss": 3.4099676609039307, "eval_runtime": 184.191, "eval_samples_per_second": 97.784, "eval_steps_per_second": 6.113, "step": 48000 }, { "epoch": 5.1716715100635025, "grad_norm": 0.6646033525466919, "learning_rate": 0.00029013468376252555, "loss": 3.3106, "step": 48050 }, { "epoch": 5.17705306210311, "grad_norm": 0.726647138595581, "learning_rate": 0.00028981144273246414, "loss": 3.31, "step": 48100 }, { "epoch": 5.182434614142719, "grad_norm": 0.7260897159576416, "learning_rate": 0.00028948820170240274, "loss": 3.3137, "step": 48150 }, { "epoch": 5.187816166182327, "grad_norm": 0.7188605070114136, "learning_rate": 0.00028916496067234133, "loss": 3.3235, "step": 48200 }, { "epoch": 5.193197718221935, "grad_norm": 0.6788833141326904, "learning_rate": 0.0002888417196422799, "loss": 3.3213, "step": 48250 }, { "epoch": 5.198579270261543, "grad_norm": 0.663392961025238, "learning_rate": 0.00028851847861221847, "loss": 3.3061, "step": 48300 }, { "epoch": 5.203960822301152, "grad_norm": 0.6828113198280334, "learning_rate": 0.00028819523758215706, "loss": 3.3231, "step": 48350 }, { "epoch": 5.20934237434076, "grad_norm": 0.7312074303627014, "learning_rate": 0.00028787199655209566, "loss": 3.3161, "step": 48400 }, { "epoch": 5.214723926380368, "grad_norm": 0.7451295256614685, "learning_rate": 0.00028754875552203425, "loss": 3.3288, "step": 48450 }, { "epoch": 5.220105478419977, "grad_norm": 0.7013233304023743, "learning_rate": 0.0002872255144919728, "loss": 3.3205, "step": 48500 }, { "epoch": 5.225487030459584, "grad_norm": 0.6722540855407715, "learning_rate": 0.00028690227346191144, "loss": 3.3149, "step": 48550 }, { "epoch": 5.230868582499193, "grad_norm": 0.667367696762085, "learning_rate": 0.00028657903243185, "loss": 3.3376, "step": 48600 }, { "epoch": 5.236250134538801, "grad_norm": 0.697004497051239, "learning_rate": 0.0002862557914017886, "loss": 3.3144, "step": 48650 }, { "epoch": 5.241631686578409, "grad_norm": 0.6611058712005615, "learning_rate": 0.00028593255037172717, "loss": 3.3203, "step": 48700 }, { "epoch": 5.247013238618018, "grad_norm": 0.7373704314231873, "learning_rate": 0.00028560930934166576, "loss": 3.3383, "step": 48750 }, { "epoch": 5.252394790657625, "grad_norm": 0.7060421109199524, "learning_rate": 0.00028528606831160436, "loss": 3.3266, "step": 48800 }, { "epoch": 5.257776342697234, "grad_norm": 0.7901322841644287, "learning_rate": 0.0002849628272815429, "loss": 3.3193, "step": 48850 }, { "epoch": 5.2631578947368425, "grad_norm": 0.6814393401145935, "learning_rate": 0.0002846395862514815, "loss": 3.3232, "step": 48900 }, { "epoch": 5.26853944677645, "grad_norm": 0.7422091960906982, "learning_rate": 0.0002843163452214201, "loss": 3.3227, "step": 48950 }, { "epoch": 5.273920998816059, "grad_norm": 0.7197288870811462, "learning_rate": 0.0002839931041913587, "loss": 3.3185, "step": 49000 }, { "epoch": 5.273920998816059, "eval_accuracy": 0.38128982060430344, "eval_loss": 3.407198905944824, "eval_runtime": 184.1943, "eval_samples_per_second": 97.783, "eval_steps_per_second": 6.113, "step": 49000 }, { "epoch": 5.279302550855666, "grad_norm": 0.7363371253013611, "learning_rate": 0.0002836698631612972, "loss": 3.322, "step": 49050 }, { "epoch": 5.284684102895275, "grad_norm": 0.6939499974250793, "learning_rate": 0.0002833530869518371, "loss": 3.3228, "step": 49100 }, { "epoch": 5.2900656549348835, "grad_norm": 0.7077795267105103, "learning_rate": 0.0002830298459217756, "loss": 3.3191, "step": 49150 }, { "epoch": 5.295447206974491, "grad_norm": 0.7591833472251892, "learning_rate": 0.0002827066048917142, "loss": 3.3278, "step": 49200 }, { "epoch": 5.3008287590141, "grad_norm": 0.721301794052124, "learning_rate": 0.0002823833638616528, "loss": 3.3032, "step": 49250 }, { "epoch": 5.306210311053708, "grad_norm": 0.7010788321495056, "learning_rate": 0.0002820601228315914, "loss": 3.3365, "step": 49300 }, { "epoch": 5.311591863093316, "grad_norm": 0.6631658673286438, "learning_rate": 0.00028173688180153, "loss": 3.3141, "step": 49350 }, { "epoch": 5.316973415132924, "grad_norm": 0.72654789686203, "learning_rate": 0.00028141364077146854, "loss": 3.3346, "step": 49400 }, { "epoch": 5.322354967172533, "grad_norm": 0.7147997617721558, "learning_rate": 0.00028109039974140714, "loss": 3.3165, "step": 49450 }, { "epoch": 5.327736519212141, "grad_norm": 0.6770532131195068, "learning_rate": 0.00028076715871134573, "loss": 3.3283, "step": 49500 }, { "epoch": 5.333118071251749, "grad_norm": 0.6925684809684753, "learning_rate": 0.00028044391768128433, "loss": 3.3213, "step": 49550 }, { "epoch": 5.338499623291357, "grad_norm": 0.6806793808937073, "learning_rate": 0.0002801206766512229, "loss": 3.3244, "step": 49600 }, { "epoch": 5.343881175330965, "grad_norm": 0.6749091744422913, "learning_rate": 0.0002797974356211615, "loss": 3.3123, "step": 49650 }, { "epoch": 5.349262727370574, "grad_norm": 0.7087625861167908, "learning_rate": 0.00027947419459110006, "loss": 3.3246, "step": 49700 }, { "epoch": 5.354644279410182, "grad_norm": 0.6872217059135437, "learning_rate": 0.00027915095356103865, "loss": 3.32, "step": 49750 }, { "epoch": 5.36002583144979, "grad_norm": 0.7207382917404175, "learning_rate": 0.00027882771253097725, "loss": 3.3301, "step": 49800 }, { "epoch": 5.365407383489399, "grad_norm": 0.6697590947151184, "learning_rate": 0.00027850447150091584, "loss": 3.3386, "step": 49850 }, { "epoch": 5.370788935529006, "grad_norm": 0.6902472972869873, "learning_rate": 0.00027818123047085444, "loss": 3.3261, "step": 49900 }, { "epoch": 5.376170487568615, "grad_norm": 0.6929269433021545, "learning_rate": 0.000277857989440793, "loss": 3.3156, "step": 49950 }, { "epoch": 5.3815520396082235, "grad_norm": 0.7222278714179993, "learning_rate": 0.00027753474841073157, "loss": 3.3294, "step": 50000 }, { "epoch": 5.3815520396082235, "eval_accuracy": 0.3817663721702578, "eval_loss": 3.4018335342407227, "eval_runtime": 184.1232, "eval_samples_per_second": 97.82, "eval_steps_per_second": 6.115, "step": 50000 }, { "epoch": 5.386933591647831, "grad_norm": 0.7067680358886719, "learning_rate": 0.00027721150738067017, "loss": 3.3578, "step": 50050 }, { "epoch": 5.39231514368744, "grad_norm": 0.6830293536186218, "learning_rate": 0.00027688826635060876, "loss": 3.3091, "step": 50100 }, { "epoch": 5.397696695727047, "grad_norm": 0.7057771682739258, "learning_rate": 0.0002765650253205473, "loss": 3.3237, "step": 50150 }, { "epoch": 5.403078247766656, "grad_norm": 0.689148485660553, "learning_rate": 0.00027624178429048595, "loss": 3.3122, "step": 50200 }, { "epoch": 5.4084597998062645, "grad_norm": 0.6972809433937073, "learning_rate": 0.0002759185432604245, "loss": 3.3256, "step": 50250 }, { "epoch": 5.413841351845872, "grad_norm": 0.7275722026824951, "learning_rate": 0.0002755953022303631, "loss": 3.3268, "step": 50300 }, { "epoch": 5.419222903885481, "grad_norm": 0.7065505385398865, "learning_rate": 0.0002752720612003017, "loss": 3.3293, "step": 50350 }, { "epoch": 5.424604455925088, "grad_norm": 0.6810638308525085, "learning_rate": 0.0002749488201702403, "loss": 3.3356, "step": 50400 }, { "epoch": 5.429986007964697, "grad_norm": 0.7367533445358276, "learning_rate": 0.0002746255791401788, "loss": 3.3255, "step": 50450 }, { "epoch": 5.435367560004305, "grad_norm": 0.6835526823997498, "learning_rate": 0.0002743023381101174, "loss": 3.3326, "step": 50500 }, { "epoch": 5.440749112043913, "grad_norm": 0.684320867061615, "learning_rate": 0.000273979097080056, "loss": 3.3293, "step": 50550 }, { "epoch": 5.446130664083522, "grad_norm": 0.702653706073761, "learning_rate": 0.0002736558560499946, "loss": 3.3337, "step": 50600 }, { "epoch": 5.45151221612313, "grad_norm": 0.7121669054031372, "learning_rate": 0.0002733326150199332, "loss": 3.3284, "step": 50650 }, { "epoch": 5.456893768162738, "grad_norm": 0.739630937576294, "learning_rate": 0.00027300937398987173, "loss": 3.3416, "step": 50700 }, { "epoch": 5.462275320202346, "grad_norm": 0.7171410918235779, "learning_rate": 0.0002726861329598104, "loss": 3.3345, "step": 50750 }, { "epoch": 5.467656872241955, "grad_norm": 0.7838406562805176, "learning_rate": 0.0002723628919297489, "loss": 3.3318, "step": 50800 }, { "epoch": 5.473038424281563, "grad_norm": 0.7008166313171387, "learning_rate": 0.0002720396508996875, "loss": 3.3247, "step": 50850 }, { "epoch": 5.478419976321171, "grad_norm": 0.7487154603004456, "learning_rate": 0.0002717164098696261, "loss": 3.3302, "step": 50900 }, { "epoch": 5.483801528360779, "grad_norm": 0.6788049340248108, "learning_rate": 0.00027139316883956465, "loss": 3.3292, "step": 50950 }, { "epoch": 5.489183080400387, "grad_norm": 0.7225068807601929, "learning_rate": 0.00027106992780950325, "loss": 3.3338, "step": 51000 }, { "epoch": 5.489183080400387, "eval_accuracy": 0.38227714939587365, "eval_loss": 3.397834062576294, "eval_runtime": 184.3196, "eval_samples_per_second": 97.716, "eval_steps_per_second": 6.109, "step": 51000 }, { "epoch": 5.494564632439996, "grad_norm": 0.6784095764160156, "learning_rate": 0.00027074668677944184, "loss": 3.337, "step": 51050 }, { "epoch": 5.499946184479604, "grad_norm": 0.7230722904205322, "learning_rate": 0.00027042344574938044, "loss": 3.3321, "step": 51100 }, { "epoch": 5.505327736519212, "grad_norm": 0.7549768686294556, "learning_rate": 0.000270100204719319, "loss": 3.3321, "step": 51150 }, { "epoch": 5.510709288558821, "grad_norm": 0.7161559462547302, "learning_rate": 0.0002697769636892576, "loss": 3.3257, "step": 51200 }, { "epoch": 5.516090840598428, "grad_norm": 0.7540744543075562, "learning_rate": 0.00026945372265919617, "loss": 3.3214, "step": 51250 }, { "epoch": 5.521472392638037, "grad_norm": 0.6781821846961975, "learning_rate": 0.00026913048162913476, "loss": 3.3292, "step": 51300 }, { "epoch": 5.5268539446776455, "grad_norm": 0.688515305519104, "learning_rate": 0.00026880724059907336, "loss": 3.3326, "step": 51350 }, { "epoch": 5.532235496717253, "grad_norm": 0.7121948599815369, "learning_rate": 0.00026848399956901195, "loss": 3.3261, "step": 51400 }, { "epoch": 5.537617048756862, "grad_norm": 0.6862152814865112, "learning_rate": 0.0002681607585389505, "loss": 3.3231, "step": 51450 }, { "epoch": 5.542998600796469, "grad_norm": 0.7054077386856079, "learning_rate": 0.0002678375175088891, "loss": 3.323, "step": 51500 }, { "epoch": 5.548380152836078, "grad_norm": 0.7206218838691711, "learning_rate": 0.0002675142764788277, "loss": 3.3095, "step": 51550 }, { "epoch": 5.553761704875686, "grad_norm": 0.7103893160820007, "learning_rate": 0.0002671910354487663, "loss": 3.3479, "step": 51600 }, { "epoch": 5.559143256915294, "grad_norm": 0.70819091796875, "learning_rate": 0.00026686779441870487, "loss": 3.3347, "step": 51650 }, { "epoch": 5.564524808954903, "grad_norm": 0.7234737277030945, "learning_rate": 0.0002665445533886434, "loss": 3.3232, "step": 51700 }, { "epoch": 5.569906360994511, "grad_norm": 0.7277364730834961, "learning_rate": 0.00026622131235858206, "loss": 3.3231, "step": 51750 }, { "epoch": 5.575287913034119, "grad_norm": 0.7143622040748596, "learning_rate": 0.0002658980713285206, "loss": 3.3305, "step": 51800 }, { "epoch": 5.580669465073727, "grad_norm": 0.7191085815429688, "learning_rate": 0.0002655748302984592, "loss": 3.317, "step": 51850 }, { "epoch": 5.586051017113336, "grad_norm": 0.7273434400558472, "learning_rate": 0.0002652515892683978, "loss": 3.3296, "step": 51900 }, { "epoch": 5.591432569152944, "grad_norm": 0.6899665594100952, "learning_rate": 0.0002649283482383364, "loss": 3.348, "step": 51950 }, { "epoch": 5.596814121192552, "grad_norm": 0.6867828369140625, "learning_rate": 0.0002646051072082749, "loss": 3.3411, "step": 52000 }, { "epoch": 5.596814121192552, "eval_accuracy": 0.38285800773412987, "eval_loss": 3.3932530879974365, "eval_runtime": 184.3226, "eval_samples_per_second": 97.715, "eval_steps_per_second": 6.109, "step": 52000 }, { "epoch": 5.60219567323216, "grad_norm": 0.8301399350166321, "learning_rate": 0.0002642818661782135, "loss": 3.3444, "step": 52050 }, { "epoch": 5.607577225271768, "grad_norm": 0.7274202704429626, "learning_rate": 0.0002639586251481521, "loss": 3.3322, "step": 52100 }, { "epoch": 5.612958777311377, "grad_norm": 0.6852136254310608, "learning_rate": 0.0002636353841180907, "loss": 3.3477, "step": 52150 }, { "epoch": 5.618340329350985, "grad_norm": 0.6925633549690247, "learning_rate": 0.0002633121430880293, "loss": 3.3351, "step": 52200 }, { "epoch": 5.623721881390593, "grad_norm": 0.7001982927322388, "learning_rate": 0.00026298890205796784, "loss": 3.3395, "step": 52250 }, { "epoch": 5.629103433430201, "grad_norm": 0.7353464961051941, "learning_rate": 0.00026266566102790644, "loss": 3.3365, "step": 52300 }, { "epoch": 5.634484985469809, "grad_norm": 0.6678706407546997, "learning_rate": 0.00026234241999784503, "loss": 3.3379, "step": 52350 }, { "epoch": 5.639866537509418, "grad_norm": 0.7720281481742859, "learning_rate": 0.0002620191789677836, "loss": 3.3341, "step": 52400 }, { "epoch": 5.645248089549026, "grad_norm": 0.7036975026130676, "learning_rate": 0.00026169593793772217, "loss": 3.3266, "step": 52450 }, { "epoch": 5.650629641588634, "grad_norm": 0.73092120885849, "learning_rate": 0.0002613726969076608, "loss": 3.3349, "step": 52500 }, { "epoch": 5.656011193628243, "grad_norm": 0.7377499341964722, "learning_rate": 0.00026104945587759936, "loss": 3.3457, "step": 52550 }, { "epoch": 5.66139274566785, "grad_norm": 0.7286308407783508, "learning_rate": 0.00026072621484753795, "loss": 3.3448, "step": 52600 }, { "epoch": 5.666774297707459, "grad_norm": 0.711744487285614, "learning_rate": 0.00026040297381747655, "loss": 3.3343, "step": 52650 }, { "epoch": 5.672155849747067, "grad_norm": 0.7152302861213684, "learning_rate": 0.00026007973278741514, "loss": 3.3295, "step": 52700 }, { "epoch": 5.677537401786675, "grad_norm": 0.6984241604804993, "learning_rate": 0.00025975649175735373, "loss": 3.3254, "step": 52750 }, { "epoch": 5.682918953826284, "grad_norm": 0.7689533829689026, "learning_rate": 0.0002594332507272923, "loss": 3.3396, "step": 52800 }, { "epoch": 5.688300505865891, "grad_norm": 0.7001849412918091, "learning_rate": 0.00025911000969723087, "loss": 3.3241, "step": 52850 }, { "epoch": 5.6936820579055, "grad_norm": 0.7136799693107605, "learning_rate": 0.00025878676866716946, "loss": 3.3461, "step": 52900 }, { "epoch": 5.699063609945108, "grad_norm": 0.7490822076797485, "learning_rate": 0.00025846352763710806, "loss": 3.3466, "step": 52950 }, { "epoch": 5.704445161984716, "grad_norm": 0.6949664354324341, "learning_rate": 0.0002581402866070466, "loss": 3.3494, "step": 53000 }, { "epoch": 5.704445161984716, "eval_accuracy": 0.38309204605448227, "eval_loss": 3.388648271560669, "eval_runtime": 184.3909, "eval_samples_per_second": 97.678, "eval_steps_per_second": 6.107, "step": 53000 }, { "epoch": 5.709826714024325, "grad_norm": 0.7366794347763062, "learning_rate": 0.00025781704557698525, "loss": 3.3189, "step": 53050 }, { "epoch": 5.715208266063933, "grad_norm": 0.6596007943153381, "learning_rate": 0.000257500269367525, "loss": 3.3204, "step": 53100 }, { "epoch": 5.720589818103541, "grad_norm": 0.722526490688324, "learning_rate": 0.0002571770283374636, "loss": 3.3391, "step": 53150 }, { "epoch": 5.725971370143149, "grad_norm": 0.697116494178772, "learning_rate": 0.0002568537873074022, "loss": 3.344, "step": 53200 }, { "epoch": 5.731352922182758, "grad_norm": 0.7177129983901978, "learning_rate": 0.0002565305462773408, "loss": 3.326, "step": 53250 }, { "epoch": 5.736734474222366, "grad_norm": 0.7836544513702393, "learning_rate": 0.0002562073052472794, "loss": 3.3439, "step": 53300 }, { "epoch": 5.742116026261974, "grad_norm": 0.764375627040863, "learning_rate": 0.0002558840642172179, "loss": 3.3383, "step": 53350 }, { "epoch": 5.747497578301582, "grad_norm": 0.727080762386322, "learning_rate": 0.00025556082318715657, "loss": 3.3395, "step": 53400 }, { "epoch": 5.75287913034119, "grad_norm": 0.724855363368988, "learning_rate": 0.0002552375821570951, "loss": 3.3389, "step": 53450 }, { "epoch": 5.758260682380799, "grad_norm": 0.7034295201301575, "learning_rate": 0.0002549143411270337, "loss": 3.331, "step": 53500 }, { "epoch": 5.763642234420407, "grad_norm": 0.6917016506195068, "learning_rate": 0.0002545911000969723, "loss": 3.3282, "step": 53550 }, { "epoch": 5.769023786460015, "grad_norm": 0.7016806602478027, "learning_rate": 0.0002542678590669109, "loss": 3.3455, "step": 53600 }, { "epoch": 5.774405338499624, "grad_norm": 0.7391362190246582, "learning_rate": 0.00025394461803684943, "loss": 3.3435, "step": 53650 }, { "epoch": 5.779786890539231, "grad_norm": 0.7682531476020813, "learning_rate": 0.00025362137700678803, "loss": 3.3326, "step": 53700 }, { "epoch": 5.78516844257884, "grad_norm": 0.686830461025238, "learning_rate": 0.0002532981359767266, "loss": 3.3435, "step": 53750 }, { "epoch": 5.790549994618448, "grad_norm": 0.7108717560768127, "learning_rate": 0.0002529748949466652, "loss": 3.339, "step": 53800 }, { "epoch": 5.795931546658056, "grad_norm": 0.7073138356208801, "learning_rate": 0.0002526516539166038, "loss": 3.3263, "step": 53850 }, { "epoch": 5.801313098697665, "grad_norm": 0.7388830184936523, "learning_rate": 0.00025232841288654235, "loss": 3.3466, "step": 53900 }, { "epoch": 5.806694650737272, "grad_norm": 0.7408850789070129, "learning_rate": 0.00025200517185648095, "loss": 3.3326, "step": 53950 }, { "epoch": 5.812076202776881, "grad_norm": 0.7403854131698608, "learning_rate": 0.00025168193082641954, "loss": 3.339, "step": 54000 }, { "epoch": 5.812076202776881, "eval_accuracy": 0.3832379668828171, "eval_loss": 3.3853044509887695, "eval_runtime": 184.1659, "eval_samples_per_second": 97.798, "eval_steps_per_second": 6.114, "step": 54000 }, { "epoch": 5.817457754816489, "grad_norm": 0.7515339851379395, "learning_rate": 0.00025135868979635814, "loss": 3.3391, "step": 54050 }, { "epoch": 5.822839306856097, "grad_norm": 0.6873476505279541, "learning_rate": 0.0002510354487662967, "loss": 3.3314, "step": 54100 }, { "epoch": 5.828220858895706, "grad_norm": 0.7633679509162903, "learning_rate": 0.0002507122077362353, "loss": 3.3247, "step": 54150 }, { "epoch": 5.833602410935313, "grad_norm": 0.7475916147232056, "learning_rate": 0.00025038896670617387, "loss": 3.3237, "step": 54200 }, { "epoch": 5.838983962974922, "grad_norm": 0.7953991889953613, "learning_rate": 0.00025006572567611246, "loss": 3.3351, "step": 54250 }, { "epoch": 5.84436551501453, "grad_norm": 0.7283833026885986, "learning_rate": 0.00024974248464605106, "loss": 3.3289, "step": 54300 }, { "epoch": 5.849747067054138, "grad_norm": 0.7127808928489685, "learning_rate": 0.00024941924361598965, "loss": 3.3339, "step": 54350 }, { "epoch": 5.855128619093747, "grad_norm": 0.6898336410522461, "learning_rate": 0.00024909600258592825, "loss": 3.3268, "step": 54400 }, { "epoch": 5.860510171133355, "grad_norm": 0.7083118557929993, "learning_rate": 0.0002487727615558668, "loss": 3.3547, "step": 54450 }, { "epoch": 5.865891723172963, "grad_norm": 0.7142371535301208, "learning_rate": 0.0002484495205258054, "loss": 3.3276, "step": 54500 }, { "epoch": 5.871273275212571, "grad_norm": 0.7726343274116516, "learning_rate": 0.000248126279495744, "loss": 3.3332, "step": 54550 }, { "epoch": 5.87665482725218, "grad_norm": 0.719041645526886, "learning_rate": 0.00024780303846568257, "loss": 3.3257, "step": 54600 }, { "epoch": 5.882036379291788, "grad_norm": 0.6890484690666199, "learning_rate": 0.0002474797974356211, "loss": 3.34, "step": 54650 }, { "epoch": 5.887417931331396, "grad_norm": 0.7217288613319397, "learning_rate": 0.00024715655640555976, "loss": 3.3259, "step": 54700 }, { "epoch": 5.892799483371004, "grad_norm": 0.7373946309089661, "learning_rate": 0.0002468333153754983, "loss": 3.3372, "step": 54750 }, { "epoch": 5.898181035410612, "grad_norm": 0.7198396921157837, "learning_rate": 0.0002465100743454369, "loss": 3.3367, "step": 54800 }, { "epoch": 5.903562587450221, "grad_norm": 0.694810688495636, "learning_rate": 0.0002461868333153755, "loss": 3.3565, "step": 54850 }, { "epoch": 5.9089441394898286, "grad_norm": 0.7050492167472839, "learning_rate": 0.0002458635922853141, "loss": 3.3347, "step": 54900 }, { "epoch": 5.914325691529437, "grad_norm": 0.7072895169258118, "learning_rate": 0.0002455403512552526, "loss": 3.3233, "step": 54950 }, { "epoch": 5.919707243569046, "grad_norm": 0.7510350346565247, "learning_rate": 0.0002452171102251912, "loss": 3.3234, "step": 55000 }, { "epoch": 5.919707243569046, "eval_accuracy": 0.3839444279593869, "eval_loss": 3.380399227142334, "eval_runtime": 183.8737, "eval_samples_per_second": 97.953, "eval_steps_per_second": 6.124, "step": 55000 }, { "epoch": 5.925088795608653, "grad_norm": 0.7191430330276489, "learning_rate": 0.0002448938691951298, "loss": 3.3319, "step": 55050 }, { "epoch": 5.930470347648262, "grad_norm": 0.7841139435768127, "learning_rate": 0.0002445706281650684, "loss": 3.3525, "step": 55100 }, { "epoch": 5.93585189968787, "grad_norm": 0.7067576050758362, "learning_rate": 0.0002442538519556082, "loss": 3.3404, "step": 55150 }, { "epoch": 5.941233451727478, "grad_norm": 0.761639416217804, "learning_rate": 0.0002439306109255468, "loss": 3.3526, "step": 55200 }, { "epoch": 5.946615003767087, "grad_norm": 0.7069488763809204, "learning_rate": 0.00024360736989548538, "loss": 3.3316, "step": 55250 }, { "epoch": 5.951996555806694, "grad_norm": 0.7399417161941528, "learning_rate": 0.0002432905936860252, "loss": 3.3489, "step": 55300 }, { "epoch": 5.957378107846303, "grad_norm": 0.7451988458633423, "learning_rate": 0.00024296735265596378, "loss": 3.3275, "step": 55350 }, { "epoch": 5.962759659885911, "grad_norm": 0.7451727390289307, "learning_rate": 0.00024264411162590235, "loss": 3.3325, "step": 55400 }, { "epoch": 5.968141211925519, "grad_norm": 0.6925472021102905, "learning_rate": 0.00024232087059584097, "loss": 3.3169, "step": 55450 }, { "epoch": 5.973522763965128, "grad_norm": 0.7019506692886353, "learning_rate": 0.00024199762956577953, "loss": 3.3223, "step": 55500 }, { "epoch": 5.978904316004736, "grad_norm": 0.7174853682518005, "learning_rate": 0.0002416743885357181, "loss": 3.3329, "step": 55550 }, { "epoch": 5.984285868044344, "grad_norm": 0.7418709397315979, "learning_rate": 0.0002413511475056567, "loss": 3.3373, "step": 55600 }, { "epoch": 5.989667420083952, "grad_norm": 0.6945975422859192, "learning_rate": 0.0002410279064755953, "loss": 3.3393, "step": 55650 }, { "epoch": 5.995048972123561, "grad_norm": 0.747647762298584, "learning_rate": 0.00024070466544553386, "loss": 3.3233, "step": 55700 }, { "epoch": 6.000430524163169, "grad_norm": 0.7281306982040405, "learning_rate": 0.00024038142441547245, "loss": 3.3325, "step": 55750 }, { "epoch": 6.005812076202777, "grad_norm": 0.7471247911453247, "learning_rate": 0.00024005818338541102, "loss": 3.2446, "step": 55800 }, { "epoch": 6.011193628242385, "grad_norm": 0.7816767692565918, "learning_rate": 0.0002397349423553496, "loss": 3.2405, "step": 55850 }, { "epoch": 6.016575180281993, "grad_norm": 0.7186482548713684, "learning_rate": 0.0002394117013252882, "loss": 3.2389, "step": 55900 }, { "epoch": 6.021956732321602, "grad_norm": 0.7265479564666748, "learning_rate": 0.00023908846029522678, "loss": 3.2288, "step": 55950 }, { "epoch": 6.0273382843612096, "grad_norm": 0.7421016693115234, "learning_rate": 0.00023876521926516537, "loss": 3.2393, "step": 56000 }, { "epoch": 6.0273382843612096, "eval_accuracy": 0.3841643414043234, "eval_loss": 3.382951259613037, "eval_runtime": 184.2009, "eval_samples_per_second": 97.779, "eval_steps_per_second": 6.113, "step": 56000 }, { "epoch": 6.032719836400818, "grad_norm": 0.774722158908844, "learning_rate": 0.00023844197823510397, "loss": 3.237, "step": 56050 }, { "epoch": 6.038101388440427, "grad_norm": 0.7651278376579285, "learning_rate": 0.00023811873720504253, "loss": 3.2519, "step": 56100 }, { "epoch": 6.043482940480034, "grad_norm": 0.715975284576416, "learning_rate": 0.00023779549617498113, "loss": 3.2394, "step": 56150 }, { "epoch": 6.048864492519643, "grad_norm": 0.7559818625450134, "learning_rate": 0.0002374722551449197, "loss": 3.2529, "step": 56200 }, { "epoch": 6.0542460445592505, "grad_norm": 0.724737823009491, "learning_rate": 0.0002371490141148583, "loss": 3.2484, "step": 56250 }, { "epoch": 6.059627596598859, "grad_norm": 0.7519994974136353, "learning_rate": 0.00023682577308479689, "loss": 3.2548, "step": 56300 }, { "epoch": 6.065009148638468, "grad_norm": 0.6911830902099609, "learning_rate": 0.00023650253205473545, "loss": 3.2691, "step": 56350 }, { "epoch": 6.070390700678075, "grad_norm": 0.6975244283676147, "learning_rate": 0.00023617929102467402, "loss": 3.2576, "step": 56400 }, { "epoch": 6.075772252717684, "grad_norm": 0.7422133088111877, "learning_rate": 0.00023585604999461264, "loss": 3.2555, "step": 56450 }, { "epoch": 6.081153804757292, "grad_norm": 0.776143491268158, "learning_rate": 0.0002355328089645512, "loss": 3.2548, "step": 56500 }, { "epoch": 6.0865353567969, "grad_norm": 0.7200475931167603, "learning_rate": 0.00023520956793448978, "loss": 3.2679, "step": 56550 }, { "epoch": 6.091916908836509, "grad_norm": 0.7736318111419678, "learning_rate": 0.0002348863269044284, "loss": 3.2534, "step": 56600 }, { "epoch": 6.097298460876116, "grad_norm": 0.7112215757369995, "learning_rate": 0.00023456308587436697, "loss": 3.2462, "step": 56650 }, { "epoch": 6.102680012915725, "grad_norm": 0.7518984079360962, "learning_rate": 0.00023423984484430554, "loss": 3.2602, "step": 56700 }, { "epoch": 6.108061564955333, "grad_norm": 0.7721927762031555, "learning_rate": 0.00023391660381424413, "loss": 3.2626, "step": 56750 }, { "epoch": 6.113443116994941, "grad_norm": 0.7373853921890259, "learning_rate": 0.00023359336278418272, "loss": 3.2398, "step": 56800 }, { "epoch": 6.11882466903455, "grad_norm": 0.7509304881095886, "learning_rate": 0.0002332701217541213, "loss": 3.2475, "step": 56850 }, { "epoch": 6.124206221074158, "grad_norm": 0.7275040149688721, "learning_rate": 0.0002329468807240599, "loss": 3.26, "step": 56900 }, { "epoch": 6.129587773113766, "grad_norm": 0.7387740612030029, "learning_rate": 0.00023262363969399845, "loss": 3.2704, "step": 56950 }, { "epoch": 6.134969325153374, "grad_norm": 0.7612864375114441, "learning_rate": 0.00023230039866393708, "loss": 3.26, "step": 57000 }, { "epoch": 6.134969325153374, "eval_accuracy": 0.3841219667780757, "eval_loss": 3.3832507133483887, "eval_runtime": 184.352, "eval_samples_per_second": 97.699, "eval_steps_per_second": 6.108, "step": 57000 }, { "epoch": 6.140350877192983, "grad_norm": 0.735535204410553, "learning_rate": 0.00023197715763387564, "loss": 3.2553, "step": 57050 }, { "epoch": 6.1457324292325906, "grad_norm": 0.7251707911491394, "learning_rate": 0.0002316539166038142, "loss": 3.2835, "step": 57100 }, { "epoch": 6.151113981272199, "grad_norm": 0.7427584528923035, "learning_rate": 0.00023133067557375283, "loss": 3.2566, "step": 57150 }, { "epoch": 6.156495533311807, "grad_norm": 0.7262466549873352, "learning_rate": 0.0002310074345436914, "loss": 3.2601, "step": 57200 }, { "epoch": 6.161877085351415, "grad_norm": 0.7328972220420837, "learning_rate": 0.00023068419351362997, "loss": 3.2712, "step": 57250 }, { "epoch": 6.167258637391024, "grad_norm": 0.7297770380973816, "learning_rate": 0.00023036095248356856, "loss": 3.2629, "step": 57300 }, { "epoch": 6.1726401894306315, "grad_norm": 0.730864405632019, "learning_rate": 0.00023003771145350716, "loss": 3.2608, "step": 57350 }, { "epoch": 6.17802174147024, "grad_norm": 0.7603052258491516, "learning_rate": 0.00022971447042344572, "loss": 3.2845, "step": 57400 }, { "epoch": 6.183403293509849, "grad_norm": 0.7567489743232727, "learning_rate": 0.00022939122939338432, "loss": 3.2683, "step": 57450 }, { "epoch": 6.188784845549456, "grad_norm": 0.7320775985717773, "learning_rate": 0.0002290679883633229, "loss": 3.274, "step": 57500 }, { "epoch": 6.194166397589065, "grad_norm": 0.723310649394989, "learning_rate": 0.00022874474733326148, "loss": 3.2637, "step": 57550 }, { "epoch": 6.1995479496286725, "grad_norm": 0.7478488087654114, "learning_rate": 0.00022842150630320008, "loss": 3.2711, "step": 57600 }, { "epoch": 6.204929501668281, "grad_norm": 0.7612980604171753, "learning_rate": 0.00022809826527313864, "loss": 3.25, "step": 57650 }, { "epoch": 6.21031105370789, "grad_norm": 0.73157799243927, "learning_rate": 0.0002277750242430772, "loss": 3.272, "step": 57700 }, { "epoch": 6.215692605747497, "grad_norm": 0.7435078620910645, "learning_rate": 0.00022745178321301583, "loss": 3.2721, "step": 57750 }, { "epoch": 6.221074157787106, "grad_norm": 0.7415991425514221, "learning_rate": 0.0002271285421829544, "loss": 3.2617, "step": 57800 }, { "epoch": 6.226455709826714, "grad_norm": 0.7138713002204895, "learning_rate": 0.00022680530115289297, "loss": 3.2658, "step": 57850 }, { "epoch": 6.231837261866322, "grad_norm": 0.7327532768249512, "learning_rate": 0.0002264820601228316, "loss": 3.2558, "step": 57900 }, { "epoch": 6.237218813905931, "grad_norm": 0.717298686504364, "learning_rate": 0.00022615881909277016, "loss": 3.2714, "step": 57950 }, { "epoch": 6.242600365945538, "grad_norm": 0.7111124992370605, "learning_rate": 0.00022583557806270875, "loss": 3.2793, "step": 58000 }, { "epoch": 6.242600365945538, "eval_accuracy": 0.3847077144962836, "eval_loss": 3.378939390182495, "eval_runtime": 184.4989, "eval_samples_per_second": 97.621, "eval_steps_per_second": 6.103, "step": 58000 }, { "epoch": 6.247981917985147, "grad_norm": 0.7619994282722473, "learning_rate": 0.00022551233703264732, "loss": 3.2868, "step": 58050 }, { "epoch": 6.253363470024755, "grad_norm": 0.696587085723877, "learning_rate": 0.0002251890960025859, "loss": 3.2816, "step": 58100 }, { "epoch": 6.258745022064363, "grad_norm": 0.7583964467048645, "learning_rate": 0.0002248658549725245, "loss": 3.2699, "step": 58150 }, { "epoch": 6.264126574103972, "grad_norm": 0.73765629529953, "learning_rate": 0.00022454261394246308, "loss": 3.2802, "step": 58200 }, { "epoch": 6.26950812614358, "grad_norm": 0.6853163838386536, "learning_rate": 0.00022421937291240164, "loss": 3.2667, "step": 58250 }, { "epoch": 6.274889678183188, "grad_norm": 0.7149142622947693, "learning_rate": 0.00022389613188234027, "loss": 3.2588, "step": 58300 }, { "epoch": 6.280271230222796, "grad_norm": 0.7267086505889893, "learning_rate": 0.00022357289085227883, "loss": 3.2638, "step": 58350 }, { "epoch": 6.285652782262405, "grad_norm": 0.7292464971542358, "learning_rate": 0.0002232496498222174, "loss": 3.2593, "step": 58400 }, { "epoch": 6.2910343343020125, "grad_norm": 0.7045040726661682, "learning_rate": 0.000222926408792156, "loss": 3.2908, "step": 58450 }, { "epoch": 6.296415886341621, "grad_norm": 0.7823399901390076, "learning_rate": 0.0002226031677620946, "loss": 3.2798, "step": 58500 }, { "epoch": 6.301797438381229, "grad_norm": 0.7582830786705017, "learning_rate": 0.0002222863915526344, "loss": 3.2702, "step": 58550 }, { "epoch": 6.307178990420837, "grad_norm": 0.7241557836532593, "learning_rate": 0.00022196315052257296, "loss": 3.2674, "step": 58600 }, { "epoch": 6.312560542460446, "grad_norm": 0.9570308923721313, "learning_rate": 0.00022163990949251156, "loss": 3.2777, "step": 58650 }, { "epoch": 6.3179420945000535, "grad_norm": 0.7590973377227783, "learning_rate": 0.00022131666846245015, "loss": 3.2743, "step": 58700 }, { "epoch": 6.323323646539662, "grad_norm": 0.7064140439033508, "learning_rate": 0.00022099342743238872, "loss": 3.2777, "step": 58750 }, { "epoch": 6.328705198579271, "grad_norm": 0.7608388066291809, "learning_rate": 0.00022067018640232734, "loss": 3.302, "step": 58800 }, { "epoch": 6.334086750618878, "grad_norm": 0.7518062591552734, "learning_rate": 0.0002203469453722659, "loss": 3.2687, "step": 58850 }, { "epoch": 6.339468302658487, "grad_norm": 0.7372952699661255, "learning_rate": 0.00022002370434220448, "loss": 3.265, "step": 58900 }, { "epoch": 6.344849854698095, "grad_norm": 0.6782916188240051, "learning_rate": 0.00021970046331214307, "loss": 3.2752, "step": 58950 }, { "epoch": 6.350231406737703, "grad_norm": 0.7314630150794983, "learning_rate": 0.00021937722228208167, "loss": 3.2727, "step": 59000 }, { "epoch": 6.350231406737703, "eval_accuracy": 0.38491980493329747, "eval_loss": 3.3779428005218506, "eval_runtime": 183.9956, "eval_samples_per_second": 97.888, "eval_steps_per_second": 6.12, "step": 59000 }, { "epoch": 6.355612958777312, "grad_norm": 0.7423800230026245, "learning_rate": 0.00021905398125202024, "loss": 3.2673, "step": 59050 }, { "epoch": 6.360994510816919, "grad_norm": 0.7026423215866089, "learning_rate": 0.00021873074022195883, "loss": 3.2688, "step": 59100 }, { "epoch": 6.366376062856528, "grad_norm": 0.821156919002533, "learning_rate": 0.0002184074991918974, "loss": 3.2751, "step": 59150 }, { "epoch": 6.371757614896136, "grad_norm": 0.7709437012672424, "learning_rate": 0.00021808425816183597, "loss": 3.2669, "step": 59200 }, { "epoch": 6.377139166935744, "grad_norm": 0.7373313903808594, "learning_rate": 0.0002177610171317746, "loss": 3.2787, "step": 59250 }, { "epoch": 6.382520718975353, "grad_norm": 0.7437813878059387, "learning_rate": 0.00021743777610171315, "loss": 3.2697, "step": 59300 }, { "epoch": 6.387902271014961, "grad_norm": 0.7977573871612549, "learning_rate": 0.00021711453507165172, "loss": 3.2788, "step": 59350 }, { "epoch": 6.393283823054569, "grad_norm": 0.7082981467247009, "learning_rate": 0.00021679129404159034, "loss": 3.2801, "step": 59400 }, { "epoch": 6.398665375094177, "grad_norm": 0.7445041537284851, "learning_rate": 0.0002164680530115289, "loss": 3.3005, "step": 59450 }, { "epoch": 6.404046927133785, "grad_norm": 0.7911046743392944, "learning_rate": 0.00021614481198146748, "loss": 3.2766, "step": 59500 }, { "epoch": 6.4094284791733935, "grad_norm": 0.7671107053756714, "learning_rate": 0.00021582157095140607, "loss": 3.3081, "step": 59550 }, { "epoch": 6.414810031213002, "grad_norm": 0.7555608153343201, "learning_rate": 0.00021549832992134467, "loss": 3.28, "step": 59600 }, { "epoch": 6.42019158325261, "grad_norm": 0.737175464630127, "learning_rate": 0.00021517508889128324, "loss": 3.2808, "step": 59650 }, { "epoch": 6.425573135292218, "grad_norm": 0.7711895704269409, "learning_rate": 0.00021485184786122183, "loss": 3.2927, "step": 59700 }, { "epoch": 6.430954687331827, "grad_norm": 0.7621187567710876, "learning_rate": 0.0002145286068311604, "loss": 3.2822, "step": 59750 }, { "epoch": 6.4363362393714345, "grad_norm": 0.7647627592086792, "learning_rate": 0.00021420536580109902, "loss": 3.269, "step": 59800 }, { "epoch": 6.441717791411043, "grad_norm": 0.7704160809516907, "learning_rate": 0.0002138821247710376, "loss": 3.2597, "step": 59850 }, { "epoch": 6.447099343450651, "grad_norm": 0.7501802444458008, "learning_rate": 0.00021355888374097615, "loss": 3.2892, "step": 59900 }, { "epoch": 6.452480895490259, "grad_norm": 0.8180058598518372, "learning_rate": 0.00021323564271091478, "loss": 3.2942, "step": 59950 }, { "epoch": 6.457862447529868, "grad_norm": 0.7676056623458862, "learning_rate": 0.00021291240168085334, "loss": 3.2813, "step": 60000 }, { "epoch": 6.457862447529868, "eval_accuracy": 0.38506659498473483, "eval_loss": 3.373599052429199, "eval_runtime": 184.1576, "eval_samples_per_second": 97.802, "eval_steps_per_second": 6.114, "step": 60000 }, { "epoch": 6.4632439995694755, "grad_norm": 0.8020526766777039, "learning_rate": 0.0002125891606507919, "loss": 3.2657, "step": 60050 }, { "epoch": 6.468625551609084, "grad_norm": 0.7733620405197144, "learning_rate": 0.0002122659196207305, "loss": 3.2667, "step": 60100 }, { "epoch": 6.474007103648693, "grad_norm": 0.7357586026191711, "learning_rate": 0.0002119426785906691, "loss": 3.2889, "step": 60150 }, { "epoch": 6.4793886556883, "grad_norm": 0.7588229775428772, "learning_rate": 0.00021161943756060767, "loss": 3.2785, "step": 60200 }, { "epoch": 6.484770207727909, "grad_norm": 0.7582606077194214, "learning_rate": 0.00021129619653054626, "loss": 3.305, "step": 60250 }, { "epoch": 6.490151759767517, "grad_norm": 0.7341297268867493, "learning_rate": 0.00021097295550048483, "loss": 3.2763, "step": 60300 }, { "epoch": 6.495533311807125, "grad_norm": 0.768948495388031, "learning_rate": 0.00021064971447042343, "loss": 3.2899, "step": 60350 }, { "epoch": 6.500914863846734, "grad_norm": 0.7307316660881042, "learning_rate": 0.00021032647344036202, "loss": 3.2744, "step": 60400 }, { "epoch": 6.506296415886341, "grad_norm": 0.7900117635726929, "learning_rate": 0.0002100032324103006, "loss": 3.2796, "step": 60450 }, { "epoch": 6.51167796792595, "grad_norm": 0.7413355112075806, "learning_rate": 0.00020967999138023916, "loss": 3.283, "step": 60500 }, { "epoch": 6.517059519965558, "grad_norm": 0.8000146150588989, "learning_rate": 0.00020935675035017778, "loss": 3.2939, "step": 60550 }, { "epoch": 6.522441072005166, "grad_norm": 0.7362974286079407, "learning_rate": 0.00020903350932011634, "loss": 3.2799, "step": 60600 }, { "epoch": 6.5278226240447745, "grad_norm": 0.7781336307525635, "learning_rate": 0.0002087102682900549, "loss": 3.2738, "step": 60650 }, { "epoch": 6.533204176084383, "grad_norm": 0.7916768193244934, "learning_rate": 0.00020838702725999353, "loss": 3.268, "step": 60700 }, { "epoch": 6.538585728123991, "grad_norm": 0.7510265111923218, "learning_rate": 0.0002080637862299321, "loss": 3.2746, "step": 60750 }, { "epoch": 6.543967280163599, "grad_norm": 0.7636952996253967, "learning_rate": 0.0002077405451998707, "loss": 3.2605, "step": 60800 }, { "epoch": 6.549348832203208, "grad_norm": 0.7475779056549072, "learning_rate": 0.00020741730416980926, "loss": 3.2682, "step": 60850 }, { "epoch": 6.5547303842428155, "grad_norm": 0.7497417330741882, "learning_rate": 0.00020709406313974786, "loss": 3.2669, "step": 60900 }, { "epoch": 6.560111936282424, "grad_norm": 0.7327662110328674, "learning_rate": 0.00020677082210968645, "loss": 3.2707, "step": 60950 }, { "epoch": 6.565493488322032, "grad_norm": 0.7096711993217468, "learning_rate": 0.00020644758107962502, "loss": 3.277, "step": 61000 }, { "epoch": 6.565493488322032, "eval_accuracy": 0.3860929301630304, "eval_loss": 3.3662281036376953, "eval_runtime": 184.1226, "eval_samples_per_second": 97.821, "eval_steps_per_second": 6.115, "step": 61000 }, { "epoch": 6.57087504036164, "grad_norm": 0.7375599145889282, "learning_rate": 0.0002061243400495636, "loss": 3.2838, "step": 61050 }, { "epoch": 6.576256592401249, "grad_norm": 0.7727601528167725, "learning_rate": 0.0002058010990195022, "loss": 3.2831, "step": 61100 }, { "epoch": 6.5816381444408565, "grad_norm": 0.750218391418457, "learning_rate": 0.00020547785798944078, "loss": 3.2908, "step": 61150 }, { "epoch": 6.587019696480465, "grad_norm": 0.7926641702651978, "learning_rate": 0.00020515461695937934, "loss": 3.2667, "step": 61200 }, { "epoch": 6.592401248520073, "grad_norm": 0.7963508367538452, "learning_rate": 0.00020483137592931797, "loss": 3.2665, "step": 61250 }, { "epoch": 6.597782800559681, "grad_norm": 0.7930461168289185, "learning_rate": 0.00020450813489925653, "loss": 3.2883, "step": 61300 }, { "epoch": 6.60316435259929, "grad_norm": 0.7773116230964661, "learning_rate": 0.0002041848938691951, "loss": 3.2874, "step": 61350 }, { "epoch": 6.608545904638898, "grad_norm": 0.7539728283882141, "learning_rate": 0.0002038616528391337, "loss": 3.2594, "step": 61400 }, { "epoch": 6.613927456678506, "grad_norm": 0.7411528825759888, "learning_rate": 0.00020353841180907226, "loss": 3.2846, "step": 61450 }, { "epoch": 6.619309008718115, "grad_norm": 0.717140257358551, "learning_rate": 0.00020321517077901086, "loss": 3.2745, "step": 61500 }, { "epoch": 6.624690560757722, "grad_norm": 0.7917121052742004, "learning_rate": 0.00020289192974894945, "loss": 3.2675, "step": 61550 }, { "epoch": 6.630072112797331, "grad_norm": 0.7248356938362122, "learning_rate": 0.00020256868871888802, "loss": 3.3041, "step": 61600 }, { "epoch": 6.635453664836939, "grad_norm": 0.77708899974823, "learning_rate": 0.00020224544768882664, "loss": 3.278, "step": 61650 }, { "epoch": 6.640835216876547, "grad_norm": 0.7235255241394043, "learning_rate": 0.0002019222066587652, "loss": 3.2902, "step": 61700 }, { "epoch": 6.6462167689161555, "grad_norm": 0.7517228126525879, "learning_rate": 0.00020159896562870378, "loss": 3.2943, "step": 61750 }, { "epoch": 6.651598320955763, "grad_norm": 0.7579857707023621, "learning_rate": 0.00020127572459864237, "loss": 3.2747, "step": 61800 }, { "epoch": 6.656979872995372, "grad_norm": 0.7669872045516968, "learning_rate": 0.00020095248356858097, "loss": 3.2693, "step": 61850 }, { "epoch": 6.66236142503498, "grad_norm": 0.7341814041137695, "learning_rate": 0.00020062924253851953, "loss": 3.2715, "step": 61900 }, { "epoch": 6.667742977074588, "grad_norm": 0.7486647963523865, "learning_rate": 0.00020030600150845813, "loss": 3.2833, "step": 61950 }, { "epoch": 6.6731245291141965, "grad_norm": 0.7675405144691467, "learning_rate": 0.0001999827604783967, "loss": 3.2843, "step": 62000 }, { "epoch": 6.6731245291141965, "eval_accuracy": 0.3862772054607638, "eval_loss": 3.3661346435546875, "eval_runtime": 184.1953, "eval_samples_per_second": 97.782, "eval_steps_per_second": 6.113, "step": 62000 }, { "epoch": 6.678506081153805, "grad_norm": 0.7723380327224731, "learning_rate": 0.0001996595194483353, "loss": 3.3001, "step": 62050 }, { "epoch": 6.683887633193413, "grad_norm": 0.7448545098304749, "learning_rate": 0.00019933627841827389, "loss": 3.297, "step": 62100 }, { "epoch": 6.689269185233021, "grad_norm": 0.7352944016456604, "learning_rate": 0.00019901303738821245, "loss": 3.2681, "step": 62150 }, { "epoch": 6.69465073727263, "grad_norm": 0.7650615572929382, "learning_rate": 0.00019868979635815102, "loss": 3.2971, "step": 62200 }, { "epoch": 6.7000322893122375, "grad_norm": 0.7695125937461853, "learning_rate": 0.00019836655532808964, "loss": 3.2858, "step": 62250 }, { "epoch": 6.705413841351846, "grad_norm": 0.7242076396942139, "learning_rate": 0.0001980433142980282, "loss": 3.2761, "step": 62300 }, { "epoch": 6.710795393391454, "grad_norm": 0.7324261665344238, "learning_rate": 0.00019772007326796678, "loss": 3.2782, "step": 62350 }, { "epoch": 6.716176945431062, "grad_norm": 0.8060125708580017, "learning_rate": 0.0001973968322379054, "loss": 3.2976, "step": 62400 }, { "epoch": 6.721558497470671, "grad_norm": 0.7472890615463257, "learning_rate": 0.00019707359120784397, "loss": 3.2815, "step": 62450 }, { "epoch": 6.7269400495102785, "grad_norm": 0.7612674832344055, "learning_rate": 0.00019675035017778253, "loss": 3.2688, "step": 62500 }, { "epoch": 6.732321601549887, "grad_norm": 0.7790685892105103, "learning_rate": 0.00019642710914772113, "loss": 3.2854, "step": 62550 }, { "epoch": 6.737703153589496, "grad_norm": 0.8189994692802429, "learning_rate": 0.00019611033293826096, "loss": 3.2866, "step": 62600 }, { "epoch": 6.743084705629103, "grad_norm": 0.8347989916801453, "learning_rate": 0.00019578709190819953, "loss": 3.2727, "step": 62650 }, { "epoch": 6.748466257668712, "grad_norm": 0.7693740129470825, "learning_rate": 0.00019547031569873936, "loss": 3.2666, "step": 62700 }, { "epoch": 6.75384780970832, "grad_norm": 0.7447760701179504, "learning_rate": 0.00019514707466867793, "loss": 3.2922, "step": 62750 }, { "epoch": 6.759229361747928, "grad_norm": 0.7787308692932129, "learning_rate": 0.0001948238336386165, "loss": 3.2863, "step": 62800 }, { "epoch": 6.7646109137875365, "grad_norm": 0.8123381733894348, "learning_rate": 0.0001945005926085551, "loss": 3.283, "step": 62850 }, { "epoch": 6.769992465827144, "grad_norm": 0.7594086527824402, "learning_rate": 0.0001941773515784937, "loss": 3.2814, "step": 62900 }, { "epoch": 6.775374017866753, "grad_norm": 0.7873737812042236, "learning_rate": 0.00019385411054843226, "loss": 3.293, "step": 62950 }, { "epoch": 6.780755569906361, "grad_norm": 0.865595817565918, "learning_rate": 0.00019353086951837085, "loss": 3.2807, "step": 63000 }, { "epoch": 6.780755569906361, "eval_accuracy": 0.38660729293394414, "eval_loss": 3.3605573177337646, "eval_runtime": 184.2001, "eval_samples_per_second": 97.78, "eval_steps_per_second": 6.113, "step": 63000 }, { "epoch": 6.786137121945969, "grad_norm": 0.7730149626731873, "learning_rate": 0.00019320762848830942, "loss": 3.2922, "step": 63050 }, { "epoch": 6.7915186739855775, "grad_norm": 0.7273273468017578, "learning_rate": 0.000192884387458248, "loss": 3.2716, "step": 63100 }, { "epoch": 6.796900226025185, "grad_norm": 0.7354364395141602, "learning_rate": 0.0001925611464281866, "loss": 3.2889, "step": 63150 }, { "epoch": 6.802281778064794, "grad_norm": 0.7972354292869568, "learning_rate": 0.00019223790539812518, "loss": 3.296, "step": 63200 }, { "epoch": 6.807663330104402, "grad_norm": 0.7441288232803345, "learning_rate": 0.00019191466436806374, "loss": 3.293, "step": 63250 }, { "epoch": 6.813044882144011, "grad_norm": 0.807431161403656, "learning_rate": 0.00019159142333800236, "loss": 3.2904, "step": 63300 }, { "epoch": 6.8184264341836185, "grad_norm": 0.8397696018218994, "learning_rate": 0.00019126818230794093, "loss": 3.3126, "step": 63350 }, { "epoch": 6.823807986223227, "grad_norm": 0.7620358467102051, "learning_rate": 0.00019094494127787953, "loss": 3.293, "step": 63400 }, { "epoch": 6.829189538262835, "grad_norm": 0.7331714034080505, "learning_rate": 0.00019062170024781812, "loss": 3.2763, "step": 63450 }, { "epoch": 6.834571090302443, "grad_norm": 0.7353246808052063, "learning_rate": 0.0001902984592177567, "loss": 3.287, "step": 63500 }, { "epoch": 6.839952642342052, "grad_norm": 0.7742340564727783, "learning_rate": 0.00018997521818769528, "loss": 3.2861, "step": 63550 }, { "epoch": 6.8453341943816595, "grad_norm": 0.768855631351471, "learning_rate": 0.00018965197715763385, "loss": 3.2634, "step": 63600 }, { "epoch": 6.850715746421268, "grad_norm": 0.7666362524032593, "learning_rate": 0.00018932873612757245, "loss": 3.2793, "step": 63650 }, { "epoch": 6.856097298460876, "grad_norm": 0.7471009492874146, "learning_rate": 0.00018900549509751104, "loss": 3.2843, "step": 63700 }, { "epoch": 6.861478850500484, "grad_norm": 0.7938755750656128, "learning_rate": 0.0001886822540674496, "loss": 3.2844, "step": 63750 }, { "epoch": 6.866860402540093, "grad_norm": 0.8108975887298584, "learning_rate": 0.00018835901303738818, "loss": 3.2812, "step": 63800 }, { "epoch": 6.8722419545797, "grad_norm": 0.8145020604133606, "learning_rate": 0.0001880357720073268, "loss": 3.2823, "step": 63850 }, { "epoch": 6.877623506619309, "grad_norm": 0.7729702591896057, "learning_rate": 0.00018771253097726537, "loss": 3.2997, "step": 63900 }, { "epoch": 6.8830050586589175, "grad_norm": 0.7753148674964905, "learning_rate": 0.00018738928994720393, "loss": 3.2854, "step": 63950 }, { "epoch": 6.888386610698525, "grad_norm": 0.7631953358650208, "learning_rate": 0.00018706604891714255, "loss": 3.2976, "step": 64000 }, { "epoch": 6.888386610698525, "eval_accuracy": 0.3871290441012293, "eval_loss": 3.3551039695739746, "eval_runtime": 184.8944, "eval_samples_per_second": 97.412, "eval_steps_per_second": 6.09, "step": 64000 }, { "epoch": 6.893768162738134, "grad_norm": 0.7933676242828369, "learning_rate": 0.00018674280788708112, "loss": 3.291, "step": 64050 }, { "epoch": 6.899149714777742, "grad_norm": 0.7886412739753723, "learning_rate": 0.0001864195668570197, "loss": 3.2877, "step": 64100 }, { "epoch": 6.90453126681735, "grad_norm": 0.8481479287147522, "learning_rate": 0.00018609632582695828, "loss": 3.2884, "step": 64150 }, { "epoch": 6.9099128188569585, "grad_norm": 0.7675131559371948, "learning_rate": 0.00018577308479689685, "loss": 3.2864, "step": 64200 }, { "epoch": 6.915294370896566, "grad_norm": 0.8033954501152039, "learning_rate": 0.00018544984376683545, "loss": 3.2676, "step": 64250 }, { "epoch": 6.920675922936175, "grad_norm": 0.7999796867370605, "learning_rate": 0.00018512660273677404, "loss": 3.285, "step": 64300 }, { "epoch": 6.926057474975783, "grad_norm": 0.7918875217437744, "learning_rate": 0.0001848033617067126, "loss": 3.2704, "step": 64350 }, { "epoch": 6.931439027015391, "grad_norm": 0.7834922671318054, "learning_rate": 0.00018448012067665123, "loss": 3.2913, "step": 64400 }, { "epoch": 6.9368205790549995, "grad_norm": 0.8153764605522156, "learning_rate": 0.0001841568796465898, "loss": 3.2768, "step": 64450 }, { "epoch": 6.942202131094608, "grad_norm": 0.77406245470047, "learning_rate": 0.00018383363861652837, "loss": 3.281, "step": 64500 }, { "epoch": 6.947583683134216, "grad_norm": 0.798932671546936, "learning_rate": 0.00018351039758646696, "loss": 3.2851, "step": 64550 }, { "epoch": 6.952965235173824, "grad_norm": 0.7740693092346191, "learning_rate": 0.00018318715655640555, "loss": 3.2918, "step": 64600 }, { "epoch": 6.958346787213433, "grad_norm": 0.761839747428894, "learning_rate": 0.00018286391552634412, "loss": 3.2858, "step": 64650 }, { "epoch": 6.9637283392530405, "grad_norm": 0.783030092716217, "learning_rate": 0.00018254067449628272, "loss": 3.2941, "step": 64700 }, { "epoch": 6.969109891292649, "grad_norm": 0.7671773433685303, "learning_rate": 0.00018221743346622128, "loss": 3.2728, "step": 64750 }, { "epoch": 6.974491443332257, "grad_norm": 0.7592200040817261, "learning_rate": 0.00018189419243615988, "loss": 3.2744, "step": 64800 }, { "epoch": 6.979872995371865, "grad_norm": 0.8097104430198669, "learning_rate": 0.00018157095140609847, "loss": 3.28, "step": 64850 }, { "epoch": 6.985254547411474, "grad_norm": 0.7918432950973511, "learning_rate": 0.00018124771037603704, "loss": 3.2867, "step": 64900 }, { "epoch": 6.990636099451081, "grad_norm": 0.8203550577163696, "learning_rate": 0.0001809244693459756, "loss": 3.2911, "step": 64950 }, { "epoch": 6.99601765149069, "grad_norm": 0.792578399181366, "learning_rate": 0.00018060122831591423, "loss": 3.2965, "step": 65000 }, { "epoch": 6.99601765149069, "eval_accuracy": 0.38748759863101706, "eval_loss": 3.3496932983398438, "eval_runtime": 184.8606, "eval_samples_per_second": 97.43, "eval_steps_per_second": 6.091, "step": 65000 }, { "epoch": 7.0013992035302985, "grad_norm": 0.767149806022644, "learning_rate": 0.0001802779872858528, "loss": 3.2415, "step": 65050 }, { "epoch": 7.006780755569906, "grad_norm": 0.7580429315567017, "learning_rate": 0.00017995474625579137, "loss": 3.1974, "step": 65100 }, { "epoch": 7.012162307609515, "grad_norm": 0.7315022945404053, "learning_rate": 0.00017963150522573, "loss": 3.1866, "step": 65150 }, { "epoch": 7.017543859649122, "grad_norm": 0.8599745631217957, "learning_rate": 0.00017930826419566856, "loss": 3.2054, "step": 65200 }, { "epoch": 7.022925411688731, "grad_norm": 0.8150713443756104, "learning_rate": 0.00017898502316560712, "loss": 3.2072, "step": 65250 }, { "epoch": 7.0283069637283395, "grad_norm": 0.7963254451751709, "learning_rate": 0.00017866178213554572, "loss": 3.2006, "step": 65300 }, { "epoch": 7.033688515767947, "grad_norm": 0.7622230052947998, "learning_rate": 0.0001783385411054843, "loss": 3.2071, "step": 65350 }, { "epoch": 7.039070067807556, "grad_norm": 0.8268262147903442, "learning_rate": 0.0001780153000754229, "loss": 3.195, "step": 65400 }, { "epoch": 7.044451619847164, "grad_norm": 0.7498465180397034, "learning_rate": 0.00017769205904536147, "loss": 3.2051, "step": 65450 }, { "epoch": 7.049833171886772, "grad_norm": 0.7837993502616882, "learning_rate": 0.00017736881801530004, "loss": 3.2237, "step": 65500 }, { "epoch": 7.0552147239263805, "grad_norm": 0.8399375081062317, "learning_rate": 0.00017704557698523866, "loss": 3.2088, "step": 65550 }, { "epoch": 7.060596275965988, "grad_norm": 0.767216145992279, "learning_rate": 0.00017672233595517723, "loss": 3.2249, "step": 65600 }, { "epoch": 7.065977828005597, "grad_norm": 0.7908442616462708, "learning_rate": 0.0001763990949251158, "loss": 3.2, "step": 65650 }, { "epoch": 7.071359380045205, "grad_norm": 0.8151112794876099, "learning_rate": 0.00017607585389505442, "loss": 3.1897, "step": 65700 }, { "epoch": 7.076740932084813, "grad_norm": 0.8025968670845032, "learning_rate": 0.000175752612864993, "loss": 3.2118, "step": 65750 }, { "epoch": 7.0821224841244215, "grad_norm": 0.7988458275794983, "learning_rate": 0.00017542937183493156, "loss": 3.2117, "step": 65800 }, { "epoch": 7.08750403616403, "grad_norm": 0.7555399537086487, "learning_rate": 0.00017510613080487015, "loss": 3.2011, "step": 65850 }, { "epoch": 7.092885588203638, "grad_norm": 0.7525433897972107, "learning_rate": 0.00017478288977480874, "loss": 3.2135, "step": 65900 }, { "epoch": 7.098267140243246, "grad_norm": 0.7740173935890198, "learning_rate": 0.0001744596487447473, "loss": 3.2069, "step": 65950 }, { "epoch": 7.103648692282855, "grad_norm": 0.839492917060852, "learning_rate": 0.0001741364077146859, "loss": 3.2195, "step": 66000 }, { "epoch": 7.103648692282855, "eval_accuracy": 0.38749411780428594, "eval_loss": 3.35890793800354, "eval_runtime": 184.6574, "eval_samples_per_second": 97.537, "eval_steps_per_second": 6.098, "step": 66000 }, { "epoch": 7.109030244322462, "grad_norm": 0.8025331497192383, "learning_rate": 0.00017381316668462447, "loss": 3.2003, "step": 66050 }, { "epoch": 7.114411796362071, "grad_norm": 0.751524031162262, "learning_rate": 0.00017348992565456304, "loss": 3.206, "step": 66100 }, { "epoch": 7.119793348401679, "grad_norm": 0.8034775257110596, "learning_rate": 0.00017316668462450166, "loss": 3.2217, "step": 66150 }, { "epoch": 7.125174900441287, "grad_norm": 0.8346290588378906, "learning_rate": 0.00017284344359444023, "loss": 3.22, "step": 66200 }, { "epoch": 7.130556452480896, "grad_norm": 0.7539912462234497, "learning_rate": 0.00017252020256437885, "loss": 3.1888, "step": 66250 }, { "epoch": 7.135938004520503, "grad_norm": 0.7978689670562744, "learning_rate": 0.00017220342635491863, "loss": 3.1985, "step": 66300 }, { "epoch": 7.141319556560112, "grad_norm": 0.7891173958778381, "learning_rate": 0.00017188018532485723, "loss": 3.2051, "step": 66350 }, { "epoch": 7.1467011085997205, "grad_norm": 0.8780093789100647, "learning_rate": 0.0001715569442947958, "loss": 3.2111, "step": 66400 }, { "epoch": 7.152082660639328, "grad_norm": 0.8110262751579285, "learning_rate": 0.0001712337032647344, "loss": 3.2152, "step": 66450 }, { "epoch": 7.157464212678937, "grad_norm": 0.7836471796035767, "learning_rate": 0.00017091046223467298, "loss": 3.2271, "step": 66500 }, { "epoch": 7.162845764718545, "grad_norm": 0.7956977486610413, "learning_rate": 0.00017058722120461155, "loss": 3.2036, "step": 66550 }, { "epoch": 7.168227316758153, "grad_norm": 0.7651464939117432, "learning_rate": 0.00017026398017455012, "loss": 3.2196, "step": 66600 }, { "epoch": 7.1736088687977615, "grad_norm": 0.7748005986213684, "learning_rate": 0.00016994073914448874, "loss": 3.2184, "step": 66650 }, { "epoch": 7.178990420837369, "grad_norm": 0.7951914072036743, "learning_rate": 0.0001696174981144273, "loss": 3.2242, "step": 66700 }, { "epoch": 7.184371972876978, "grad_norm": 0.8075992465019226, "learning_rate": 0.00016929425708436588, "loss": 3.2272, "step": 66750 }, { "epoch": 7.189753524916586, "grad_norm": 0.7735618948936462, "learning_rate": 0.0001689710160543045, "loss": 3.2206, "step": 66800 }, { "epoch": 7.195135076956194, "grad_norm": 0.8450251817703247, "learning_rate": 0.00016864777502424307, "loss": 3.214, "step": 66850 }, { "epoch": 7.2005166289958025, "grad_norm": 0.8250156044960022, "learning_rate": 0.00016832453399418163, "loss": 3.2214, "step": 66900 }, { "epoch": 7.205898181035411, "grad_norm": 0.8176531791687012, "learning_rate": 0.00016800129296412023, "loss": 3.2233, "step": 66950 }, { "epoch": 7.211279733075019, "grad_norm": 0.8584686517715454, "learning_rate": 0.00016767805193405882, "loss": 3.2267, "step": 67000 }, { "epoch": 7.211279733075019, "eval_accuracy": 0.38754822694241753, "eval_loss": 3.354973554611206, "eval_runtime": 184.0611, "eval_samples_per_second": 97.853, "eval_steps_per_second": 6.118, "step": 67000 }, { "epoch": 7.216661285114627, "grad_norm": 0.8144950270652771, "learning_rate": 0.0001673548109039974, "loss": 3.2226, "step": 67050 }, { "epoch": 7.222042837154235, "grad_norm": 0.8484851121902466, "learning_rate": 0.00016703156987393598, "loss": 3.2033, "step": 67100 }, { "epoch": 7.2274243891938434, "grad_norm": 0.8151296377182007, "learning_rate": 0.00016670832884387455, "loss": 3.221, "step": 67150 }, { "epoch": 7.232805941233452, "grad_norm": 0.8212626576423645, "learning_rate": 0.00016638508781381317, "loss": 3.2155, "step": 67200 }, { "epoch": 7.23818749327306, "grad_norm": 0.7992985248565674, "learning_rate": 0.00016606184678375174, "loss": 3.2132, "step": 67250 }, { "epoch": 7.243569045312668, "grad_norm": 0.7793485522270203, "learning_rate": 0.0001657386057536903, "loss": 3.2284, "step": 67300 }, { "epoch": 7.248950597352277, "grad_norm": 0.839824914932251, "learning_rate": 0.00016541536472362893, "loss": 3.2131, "step": 67350 }, { "epoch": 7.254332149391884, "grad_norm": 0.7930625677108765, "learning_rate": 0.0001650921236935675, "loss": 3.2331, "step": 67400 }, { "epoch": 7.259713701431493, "grad_norm": 0.8035439848899841, "learning_rate": 0.00016476888266350607, "loss": 3.2162, "step": 67450 }, { "epoch": 7.265095253471101, "grad_norm": 0.8084062933921814, "learning_rate": 0.00016444564163344466, "loss": 3.2246, "step": 67500 }, { "epoch": 7.270476805510709, "grad_norm": 0.8094211220741272, "learning_rate": 0.00016412240060338326, "loss": 3.2292, "step": 67550 }, { "epoch": 7.275858357550318, "grad_norm": 0.835701048374176, "learning_rate": 0.00016379915957332182, "loss": 3.2277, "step": 67600 }, { "epoch": 7.281239909589925, "grad_norm": 0.8413548469543457, "learning_rate": 0.00016347591854326042, "loss": 3.2122, "step": 67650 }, { "epoch": 7.286621461629534, "grad_norm": 0.7890863418579102, "learning_rate": 0.00016315267751319898, "loss": 3.2247, "step": 67700 }, { "epoch": 7.2920030136691425, "grad_norm": 0.8312512040138245, "learning_rate": 0.00016282943648313755, "loss": 3.2228, "step": 67750 }, { "epoch": 7.29738456570875, "grad_norm": 0.7444777488708496, "learning_rate": 0.00016250619545307617, "loss": 3.2205, "step": 67800 }, { "epoch": 7.302766117748359, "grad_norm": 0.8243696689605713, "learning_rate": 0.00016218295442301474, "loss": 3.2077, "step": 67850 }, { "epoch": 7.308147669787967, "grad_norm": 0.779323935508728, "learning_rate": 0.0001618597133929533, "loss": 3.2311, "step": 67900 }, { "epoch": 7.313529221827575, "grad_norm": 0.8431495428085327, "learning_rate": 0.00016153647236289193, "loss": 3.2201, "step": 67950 }, { "epoch": 7.3189107738671835, "grad_norm": 0.8545737266540527, "learning_rate": 0.0001612132313328305, "loss": 3.2265, "step": 68000 }, { "epoch": 7.3189107738671835, "eval_accuracy": 0.3877443454049227, "eval_loss": 3.3524491786956787, "eval_runtime": 184.0714, "eval_samples_per_second": 97.848, "eval_steps_per_second": 6.117, "step": 68000 }, { "epoch": 7.324292325906791, "grad_norm": 0.8112559914588928, "learning_rate": 0.0001608899903027691, "loss": 3.2208, "step": 68050 }, { "epoch": 7.3296738779464, "grad_norm": 0.8382140398025513, "learning_rate": 0.00016056674927270766, "loss": 3.2382, "step": 68100 }, { "epoch": 7.335055429986008, "grad_norm": 0.7912194728851318, "learning_rate": 0.00016024350824264626, "loss": 3.2349, "step": 68150 }, { "epoch": 7.340436982025616, "grad_norm": 0.8154787421226501, "learning_rate": 0.00015992026721258485, "loss": 3.2259, "step": 68200 }, { "epoch": 7.3458185340652244, "grad_norm": 0.7978909015655518, "learning_rate": 0.00015959702618252342, "loss": 3.2271, "step": 68250 }, { "epoch": 7.351200086104833, "grad_norm": 0.8925394415855408, "learning_rate": 0.00015927378515246199, "loss": 3.2204, "step": 68300 }, { "epoch": 7.356581638144441, "grad_norm": 0.8202654719352722, "learning_rate": 0.0001589505441224006, "loss": 3.2393, "step": 68350 }, { "epoch": 7.361963190184049, "grad_norm": 0.799817681312561, "learning_rate": 0.00015862730309233917, "loss": 3.2265, "step": 68400 }, { "epoch": 7.367344742223658, "grad_norm": 2.2031750679016113, "learning_rate": 0.00015830406206227774, "loss": 3.2158, "step": 68450 }, { "epoch": 7.372726294263265, "grad_norm": 0.8100339770317078, "learning_rate": 0.00015798082103221636, "loss": 3.2297, "step": 68500 }, { "epoch": 7.378107846302874, "grad_norm": 0.8306113481521606, "learning_rate": 0.00015765758000215493, "loss": 3.221, "step": 68550 }, { "epoch": 7.383489398342482, "grad_norm": 0.8127340078353882, "learning_rate": 0.0001573343389720935, "loss": 3.2301, "step": 68600 }, { "epoch": 7.38887095038209, "grad_norm": 0.8154544234275818, "learning_rate": 0.0001570110979420321, "loss": 3.2302, "step": 68650 }, { "epoch": 7.394252502421699, "grad_norm": 0.8209565281867981, "learning_rate": 0.0001566878569119707, "loss": 3.2428, "step": 68700 }, { "epoch": 7.399634054461306, "grad_norm": 0.8079628944396973, "learning_rate": 0.00015636461588190926, "loss": 3.2217, "step": 68750 }, { "epoch": 7.405015606500915, "grad_norm": 0.869520366191864, "learning_rate": 0.00015604137485184785, "loss": 3.2361, "step": 68800 }, { "epoch": 7.4103971585405235, "grad_norm": 0.844996988773346, "learning_rate": 0.00015571813382178642, "loss": 3.2293, "step": 68850 }, { "epoch": 7.415778710580131, "grad_norm": 0.8296406269073486, "learning_rate": 0.000155394892791725, "loss": 3.2058, "step": 68900 }, { "epoch": 7.42116026261974, "grad_norm": 0.8450725674629211, "learning_rate": 0.0001550716517616636, "loss": 3.2244, "step": 68950 }, { "epoch": 7.426541814659347, "grad_norm": 0.8513227701187134, "learning_rate": 0.00015474841073160217, "loss": 3.2234, "step": 69000 }, { "epoch": 7.426541814659347, "eval_accuracy": 0.38826175045669525, "eval_loss": 3.3496501445770264, "eval_runtime": 184.2207, "eval_samples_per_second": 97.769, "eval_steps_per_second": 6.112, "step": 69000 }, { "epoch": 7.431923366698956, "grad_norm": 0.8405379056930542, "learning_rate": 0.0001544251697015408, "loss": 3.2202, "step": 69050 }, { "epoch": 7.4373049187385645, "grad_norm": 0.8393351435661316, "learning_rate": 0.00015410192867147936, "loss": 3.225, "step": 69100 }, { "epoch": 7.442686470778172, "grad_norm": 0.8616898655891418, "learning_rate": 0.00015377868764141793, "loss": 3.2377, "step": 69150 }, { "epoch": 7.448068022817781, "grad_norm": 0.8329727053642273, "learning_rate": 0.00015345544661135653, "loss": 3.2247, "step": 69200 }, { "epoch": 7.453449574857389, "grad_norm": 0.8001763820648193, "learning_rate": 0.00015313220558129512, "loss": 3.2371, "step": 69250 }, { "epoch": 7.458831126896997, "grad_norm": 0.8245202302932739, "learning_rate": 0.00015281542937183493, "loss": 3.2363, "step": 69300 }, { "epoch": 7.4642126789366054, "grad_norm": 0.8096289038658142, "learning_rate": 0.0001524921883417735, "loss": 3.2233, "step": 69350 }, { "epoch": 7.469594230976213, "grad_norm": 0.8227486610412598, "learning_rate": 0.00015216894731171206, "loss": 3.2437, "step": 69400 }, { "epoch": 7.474975783015822, "grad_norm": 0.8190933465957642, "learning_rate": 0.00015184570628165068, "loss": 3.2261, "step": 69450 }, { "epoch": 7.48035733505543, "grad_norm": 0.8173719644546509, "learning_rate": 0.00015152246525158925, "loss": 3.2172, "step": 69500 }, { "epoch": 7.485738887095038, "grad_norm": 0.8153382539749146, "learning_rate": 0.00015119922422152782, "loss": 3.2332, "step": 69550 }, { "epoch": 7.491120439134646, "grad_norm": 0.8580330610275269, "learning_rate": 0.00015087598319146644, "loss": 3.2464, "step": 69600 }, { "epoch": 7.496501991174255, "grad_norm": 0.8576569557189941, "learning_rate": 0.000150552742161405, "loss": 3.2279, "step": 69650 }, { "epoch": 7.501883543213863, "grad_norm": 0.8733546733856201, "learning_rate": 0.00015022950113134358, "loss": 3.2314, "step": 69700 }, { "epoch": 7.507265095253471, "grad_norm": 0.7958570718765259, "learning_rate": 0.00014990626010128217, "loss": 3.2444, "step": 69750 }, { "epoch": 7.51264664729308, "grad_norm": 0.8502094745635986, "learning_rate": 0.00014958301907122077, "loss": 3.2262, "step": 69800 }, { "epoch": 7.518028199332687, "grad_norm": 0.8462086915969849, "learning_rate": 0.00014925977804115933, "loss": 3.2331, "step": 69850 }, { "epoch": 7.523409751372296, "grad_norm": 0.8830896019935608, "learning_rate": 0.00014893653701109793, "loss": 3.2415, "step": 69900 }, { "epoch": 7.528791303411904, "grad_norm": 0.8277102112770081, "learning_rate": 0.0001486132959810365, "loss": 3.2387, "step": 69950 }, { "epoch": 7.534172855451512, "grad_norm": 0.8129670023918152, "learning_rate": 0.0001482900549509751, "loss": 3.2216, "step": 70000 }, { "epoch": 7.534172855451512, "eval_accuracy": 0.3888360896216826, "eval_loss": 3.3453478813171387, "eval_runtime": 184.2179, "eval_samples_per_second": 97.77, "eval_steps_per_second": 6.112, "step": 70000 }, { "epoch": 7.539554407491121, "grad_norm": 0.8361290097236633, "learning_rate": 0.00014796681392091368, "loss": 3.2383, "step": 70050 }, { "epoch": 7.544935959530728, "grad_norm": 0.8902501463890076, "learning_rate": 0.00014764357289085228, "loss": 3.2387, "step": 70100 }, { "epoch": 7.550317511570337, "grad_norm": 0.8061802387237549, "learning_rate": 0.00014732033186079085, "loss": 3.2448, "step": 70150 }, { "epoch": 7.5556990636099455, "grad_norm": 0.8156223297119141, "learning_rate": 0.00014699709083072944, "loss": 3.2182, "step": 70200 }, { "epoch": 7.561080615649553, "grad_norm": 0.8143748044967651, "learning_rate": 0.00014667384980066804, "loss": 3.2382, "step": 70250 }, { "epoch": 7.566462167689162, "grad_norm": 0.872662365436554, "learning_rate": 0.0001463506087706066, "loss": 3.221, "step": 70300 }, { "epoch": 7.57184371972877, "grad_norm": 0.8547393679618835, "learning_rate": 0.0001460273677405452, "loss": 3.2248, "step": 70350 }, { "epoch": 7.577225271768378, "grad_norm": 0.8916277885437012, "learning_rate": 0.00014570412671048377, "loss": 3.2326, "step": 70400 }, { "epoch": 7.5826068238079865, "grad_norm": 0.8322258591651917, "learning_rate": 0.00014538088568042236, "loss": 3.2349, "step": 70450 }, { "epoch": 7.587988375847594, "grad_norm": 0.8075584769248962, "learning_rate": 0.00014505764465036093, "loss": 3.2345, "step": 70500 }, { "epoch": 7.593369927887203, "grad_norm": 0.797018826007843, "learning_rate": 0.00014473440362029952, "loss": 3.2268, "step": 70550 }, { "epoch": 7.598751479926811, "grad_norm": 0.8739523887634277, "learning_rate": 0.00014441116259023812, "loss": 3.2349, "step": 70600 }, { "epoch": 7.604133031966419, "grad_norm": 0.8110100030899048, "learning_rate": 0.00014408792156017669, "loss": 3.2242, "step": 70650 }, { "epoch": 7.609514584006027, "grad_norm": 0.8647212386131287, "learning_rate": 0.00014376468053011528, "loss": 3.2395, "step": 70700 }, { "epoch": 7.614896136045635, "grad_norm": 0.8333895802497864, "learning_rate": 0.00014344143950005387, "loss": 3.2343, "step": 70750 }, { "epoch": 7.620277688085244, "grad_norm": 0.7953730821609497, "learning_rate": 0.00014311819846999244, "loss": 3.212, "step": 70800 }, { "epoch": 7.625659240124852, "grad_norm": 0.7988885641098022, "learning_rate": 0.00014279495743993104, "loss": 3.2082, "step": 70850 }, { "epoch": 7.63104079216446, "grad_norm": 0.8128767013549805, "learning_rate": 0.00014247818123047084, "loss": 3.2387, "step": 70900 }, { "epoch": 7.636422344204068, "grad_norm": 0.8233309388160706, "learning_rate": 0.0001421549402004094, "loss": 3.2134, "step": 70950 }, { "epoch": 7.641803896243677, "grad_norm": 0.8329945206642151, "learning_rate": 0.000141831699170348, "loss": 3.2545, "step": 71000 }, { "epoch": 7.641803896243677, "eval_accuracy": 0.3890867518338706, "eval_loss": 3.340794563293457, "eval_runtime": 184.3075, "eval_samples_per_second": 97.723, "eval_steps_per_second": 6.109, "step": 71000 }, { "epoch": 7.647185448283285, "grad_norm": 0.8543627858161926, "learning_rate": 0.0001415084581402866, "loss": 3.2362, "step": 71050 }, { "epoch": 7.652567000322893, "grad_norm": 0.850051760673523, "learning_rate": 0.00014118521711022517, "loss": 3.2434, "step": 71100 }, { "epoch": 7.657948552362502, "grad_norm": 0.878652811050415, "learning_rate": 0.00014086197608016376, "loss": 3.2306, "step": 71150 }, { "epoch": 7.663330104402109, "grad_norm": 0.8258744478225708, "learning_rate": 0.00014053873505010236, "loss": 3.2321, "step": 71200 }, { "epoch": 7.668711656441718, "grad_norm": 0.7892378568649292, "learning_rate": 0.00014021549402004092, "loss": 3.2283, "step": 71250 }, { "epoch": 7.674093208481326, "grad_norm": 0.8138274550437927, "learning_rate": 0.00013989225298997952, "loss": 3.2283, "step": 71300 }, { "epoch": 7.679474760520934, "grad_norm": 0.8015547394752502, "learning_rate": 0.00013956901195991811, "loss": 3.2251, "step": 71350 }, { "epoch": 7.684856312560543, "grad_norm": 0.8208349943161011, "learning_rate": 0.00013924577092985668, "loss": 3.2399, "step": 71400 }, { "epoch": 7.69023786460015, "grad_norm": 0.9041082262992859, "learning_rate": 0.00013892252989979528, "loss": 3.2339, "step": 71450 }, { "epoch": 7.695619416639759, "grad_norm": 0.8210029006004333, "learning_rate": 0.00013859928886973384, "loss": 3.2244, "step": 71500 }, { "epoch": 7.7010009686793675, "grad_norm": 0.8555104732513428, "learning_rate": 0.00013827604783967244, "loss": 3.2307, "step": 71550 }, { "epoch": 7.706382520718975, "grad_norm": 0.849856436252594, "learning_rate": 0.000137952806809611, "loss": 3.2357, "step": 71600 }, { "epoch": 7.711764072758584, "grad_norm": 0.8547579646110535, "learning_rate": 0.0001376295657795496, "loss": 3.236, "step": 71650 }, { "epoch": 7.717145624798192, "grad_norm": 0.8547543883323669, "learning_rate": 0.0001373063247494882, "loss": 3.2326, "step": 71700 }, { "epoch": 7.7225271768378, "grad_norm": 0.8579950332641602, "learning_rate": 0.00013698308371942676, "loss": 3.2309, "step": 71750 }, { "epoch": 7.727908728877408, "grad_norm": 0.833358883857727, "learning_rate": 0.00013665984268936536, "loss": 3.2228, "step": 71800 }, { "epoch": 7.733290280917016, "grad_norm": 0.8538786172866821, "learning_rate": 0.00013633660165930395, "loss": 3.219, "step": 71850 }, { "epoch": 7.738671832956625, "grad_norm": 0.8462111353874207, "learning_rate": 0.00013601336062924255, "loss": 3.2335, "step": 71900 }, { "epoch": 7.744053384996233, "grad_norm": 0.8430663347244263, "learning_rate": 0.00013569011959918111, "loss": 3.2374, "step": 71950 }, { "epoch": 7.749434937035841, "grad_norm": 0.8366883397102356, "learning_rate": 0.0001353668785691197, "loss": 3.2305, "step": 72000 }, { "epoch": 7.749434937035841, "eval_accuracy": 0.3892877596763274, "eval_loss": 3.3372321128845215, "eval_runtime": 184.2635, "eval_samples_per_second": 97.746, "eval_steps_per_second": 6.111, "step": 72000 }, { "epoch": 7.754816489075449, "grad_norm": 0.826667070388794, "learning_rate": 0.00013504363753905828, "loss": 3.2383, "step": 72050 }, { "epoch": 7.760198041115058, "grad_norm": 0.8618669509887695, "learning_rate": 0.00013472039650899687, "loss": 3.2343, "step": 72100 }, { "epoch": 7.765579593154666, "grad_norm": 0.8740438222885132, "learning_rate": 0.00013439715547893544, "loss": 3.2351, "step": 72150 }, { "epoch": 7.770961145194274, "grad_norm": 0.8314293026924133, "learning_rate": 0.00013407391444887403, "loss": 3.2297, "step": 72200 }, { "epoch": 7.776342697233883, "grad_norm": 0.8154666423797607, "learning_rate": 0.0001337506734188126, "loss": 3.2091, "step": 72250 }, { "epoch": 7.78172424927349, "grad_norm": 0.8608399629592896, "learning_rate": 0.0001334274323887512, "loss": 3.232, "step": 72300 }, { "epoch": 7.787105801313099, "grad_norm": 0.8048871159553528, "learning_rate": 0.0001331041913586898, "loss": 3.2303, "step": 72350 }, { "epoch": 7.792487353352707, "grad_norm": 0.839849591255188, "learning_rate": 0.00013278095032862838, "loss": 3.2228, "step": 72400 }, { "epoch": 7.797868905392315, "grad_norm": 0.8656668663024902, "learning_rate": 0.00013245770929856695, "loss": 3.241, "step": 72450 }, { "epoch": 7.803250457431924, "grad_norm": 0.8527569770812988, "learning_rate": 0.00013213446826850555, "loss": 3.2333, "step": 72500 }, { "epoch": 7.808632009471531, "grad_norm": 0.8511040210723877, "learning_rate": 0.00013181122723844411, "loss": 3.2265, "step": 72550 }, { "epoch": 7.81401356151114, "grad_norm": 0.8588733673095703, "learning_rate": 0.0001314879862083827, "loss": 3.2293, "step": 72600 }, { "epoch": 7.819395113550748, "grad_norm": 0.8625561594963074, "learning_rate": 0.00013116474517832128, "loss": 3.2291, "step": 72650 }, { "epoch": 7.824776665590356, "grad_norm": 0.8318623304367065, "learning_rate": 0.00013084150414825987, "loss": 3.2215, "step": 72700 }, { "epoch": 7.830158217629965, "grad_norm": 0.8144527673721313, "learning_rate": 0.00013051826311819844, "loss": 3.2311, "step": 72750 }, { "epoch": 7.835539769669572, "grad_norm": 0.8226759433746338, "learning_rate": 0.00013019502208813703, "loss": 3.2275, "step": 72800 }, { "epoch": 7.840921321709181, "grad_norm": 0.8554942607879639, "learning_rate": 0.00012987178105807563, "loss": 3.2445, "step": 72850 }, { "epoch": 7.846302873748789, "grad_norm": 0.8423759937286377, "learning_rate": 0.00012954854002801422, "loss": 3.2325, "step": 72900 }, { "epoch": 7.851684425788397, "grad_norm": 0.7883509993553162, "learning_rate": 0.0001292252989979528, "loss": 3.2471, "step": 72950 }, { "epoch": 7.857065977828006, "grad_norm": 0.834790825843811, "learning_rate": 0.00012890205796789139, "loss": 3.2315, "step": 73000 }, { "epoch": 7.857065977828006, "eval_accuracy": 0.3900051946945664, "eval_loss": 3.3319382667541504, "eval_runtime": 184.156, "eval_samples_per_second": 97.803, "eval_steps_per_second": 6.114, "step": 73000 }, { "epoch": 7.862447529867614, "grad_norm": 0.8149532079696655, "learning_rate": 0.00012857881693782998, "loss": 3.236, "step": 73050 }, { "epoch": 7.867829081907222, "grad_norm": 0.8755621314048767, "learning_rate": 0.00012825557590776855, "loss": 3.2441, "step": 73100 }, { "epoch": 7.87321063394683, "grad_norm": 0.8738328814506531, "learning_rate": 0.00012793233487770714, "loss": 3.2162, "step": 73150 }, { "epoch": 7.878592185986438, "grad_norm": 0.8532841801643372, "learning_rate": 0.0001276090938476457, "loss": 3.239, "step": 73200 }, { "epoch": 7.883973738026047, "grad_norm": 0.8384474515914917, "learning_rate": 0.0001272858528175843, "loss": 3.2406, "step": 73250 }, { "epoch": 7.889355290065655, "grad_norm": 0.8933846354484558, "learning_rate": 0.00012696261178752287, "loss": 3.2467, "step": 73300 }, { "epoch": 7.894736842105263, "grad_norm": 0.8702695965766907, "learning_rate": 0.00012663937075746147, "loss": 3.2448, "step": 73350 }, { "epoch": 7.900118394144871, "grad_norm": 0.8213550448417664, "learning_rate": 0.00012631612972740006, "loss": 3.2401, "step": 73400 }, { "epoch": 7.90549994618448, "grad_norm": 0.8284666538238525, "learning_rate": 0.00012599288869733863, "loss": 3.2283, "step": 73450 }, { "epoch": 7.910881498224088, "grad_norm": 0.8575464487075806, "learning_rate": 0.00012566964766727722, "loss": 3.2119, "step": 73500 }, { "epoch": 7.916263050263696, "grad_norm": 0.8304067850112915, "learning_rate": 0.00012534640663721582, "loss": 3.2233, "step": 73550 }, { "epoch": 7.921644602303305, "grad_norm": 0.8427515029907227, "learning_rate": 0.00012502316560715439, "loss": 3.2332, "step": 73600 }, { "epoch": 7.927026154342912, "grad_norm": 0.8646396398544312, "learning_rate": 0.00012469992457709298, "loss": 3.2182, "step": 73650 }, { "epoch": 7.932407706382521, "grad_norm": 0.8815178871154785, "learning_rate": 0.00012437668354703158, "loss": 3.2211, "step": 73700 }, { "epoch": 7.937789258422129, "grad_norm": 0.7838130593299866, "learning_rate": 0.00012405344251697014, "loss": 3.2266, "step": 73750 }, { "epoch": 7.943170810461737, "grad_norm": 0.8467072248458862, "learning_rate": 0.00012373020148690874, "loss": 3.2358, "step": 73800 }, { "epoch": 7.948552362501346, "grad_norm": 0.8720870018005371, "learning_rate": 0.0001234069604568473, "loss": 3.2307, "step": 73850 }, { "epoch": 7.953933914540953, "grad_norm": 0.8565042614936829, "learning_rate": 0.0001230837194267859, "loss": 3.239, "step": 73900 }, { "epoch": 7.959315466580562, "grad_norm": 0.8643916249275208, "learning_rate": 0.00012276047839672447, "loss": 3.2414, "step": 73950 }, { "epoch": 7.96469701862017, "grad_norm": 0.8559043407440186, "learning_rate": 0.00012243723736666306, "loss": 3.2508, "step": 74000 }, { "epoch": 7.96469701862017, "eval_accuracy": 0.39019609781845643, "eval_loss": 3.3301596641540527, "eval_runtime": 184.1878, "eval_samples_per_second": 97.786, "eval_steps_per_second": 6.113, "step": 74000 }, { "epoch": 7.970078570659778, "grad_norm": 0.898086667060852, "learning_rate": 0.00012211399633660166, "loss": 3.2323, "step": 74050 }, { "epoch": 7.975460122699387, "grad_norm": 0.8650676608085632, "learning_rate": 0.00012179075530654022, "loss": 3.2314, "step": 74100 }, { "epoch": 7.980841674738995, "grad_norm": 0.9009398221969604, "learning_rate": 0.00012146751427647882, "loss": 3.2237, "step": 74150 }, { "epoch": 7.986223226778603, "grad_norm": 0.8346262574195862, "learning_rate": 0.0001211442732464174, "loss": 3.2231, "step": 74200 }, { "epoch": 7.991604778818211, "grad_norm": 0.8448550701141357, "learning_rate": 0.00012082103221635598, "loss": 3.2183, "step": 74250 }, { "epoch": 7.996986330857819, "grad_norm": 0.8301780819892883, "learning_rate": 0.00012049779118629456, "loss": 3.2405, "step": 74300 }, { "epoch": 8.002367882897428, "grad_norm": 0.859961986541748, "learning_rate": 0.00012017455015623316, "loss": 3.2076, "step": 74350 }, { "epoch": 8.007749434937036, "grad_norm": 0.8312318921089172, "learning_rate": 0.00011985130912617175, "loss": 3.1726, "step": 74400 }, { "epoch": 8.013130986976645, "grad_norm": 0.8117290735244751, "learning_rate": 0.00011952806809611032, "loss": 3.1614, "step": 74450 }, { "epoch": 8.018512539016251, "grad_norm": 0.8326854109764099, "learning_rate": 0.00011920482706604891, "loss": 3.1556, "step": 74500 }, { "epoch": 8.02389409105586, "grad_norm": 0.8626469969749451, "learning_rate": 0.0001188815860359875, "loss": 3.1494, "step": 74550 }, { "epoch": 8.029275643095469, "grad_norm": 0.8175448775291443, "learning_rate": 0.00011855834500592608, "loss": 3.1476, "step": 74600 }, { "epoch": 8.034657195135077, "grad_norm": 0.8749653697013855, "learning_rate": 0.00011823510397586466, "loss": 3.1479, "step": 74650 }, { "epoch": 8.040038747174686, "grad_norm": 0.9233711361885071, "learning_rate": 0.00011791186294580325, "loss": 3.1514, "step": 74700 }, { "epoch": 8.045420299214294, "grad_norm": 0.8561426997184753, "learning_rate": 0.00011758862191574182, "loss": 3.1515, "step": 74750 }, { "epoch": 8.050801851253901, "grad_norm": 0.8589391112327576, "learning_rate": 0.00011726538088568041, "loss": 3.176, "step": 74800 }, { "epoch": 8.05618340329351, "grad_norm": 0.8228410482406616, "learning_rate": 0.000116942139855619, "loss": 3.1523, "step": 74850 }, { "epoch": 8.061564955333118, "grad_norm": 0.8332442045211792, "learning_rate": 0.00011661889882555759, "loss": 3.1445, "step": 74900 }, { "epoch": 8.066946507372727, "grad_norm": 0.8404805660247803, "learning_rate": 0.00011629565779549616, "loss": 3.1601, "step": 74950 }, { "epoch": 8.072328059412335, "grad_norm": 0.8581123352050781, "learning_rate": 0.00011597241676543475, "loss": 3.1804, "step": 75000 }, { "epoch": 8.072328059412335, "eval_accuracy": 0.3900238829912705, "eval_loss": 3.335019111633301, "eval_runtime": 184.1369, "eval_samples_per_second": 97.813, "eval_steps_per_second": 6.115, "step": 75000 }, { "epoch": 8.077709611451942, "grad_norm": 0.8404073715209961, "learning_rate": 0.00011564917573537335, "loss": 3.1665, "step": 75050 }, { "epoch": 8.08309116349155, "grad_norm": 0.9298220276832581, "learning_rate": 0.00011532593470531191, "loss": 3.1697, "step": 75100 }, { "epoch": 8.088472715531159, "grad_norm": 0.8481408953666687, "learning_rate": 0.0001150026936752505, "loss": 3.1655, "step": 75150 }, { "epoch": 8.093854267570768, "grad_norm": 0.8461535573005676, "learning_rate": 0.00011467945264518909, "loss": 3.1618, "step": 75200 }, { "epoch": 8.099235819610376, "grad_norm": 0.8676047921180725, "learning_rate": 0.00011435621161512766, "loss": 3.1669, "step": 75250 }, { "epoch": 8.104617371649983, "grad_norm": 0.8415380120277405, "learning_rate": 0.00011403297058506625, "loss": 3.1786, "step": 75300 }, { "epoch": 8.109998923689592, "grad_norm": 0.854771614074707, "learning_rate": 0.00011371619437560607, "loss": 3.168, "step": 75350 }, { "epoch": 8.1153804757292, "grad_norm": 0.8564488887786865, "learning_rate": 0.00011339295334554464, "loss": 3.1608, "step": 75400 }, { "epoch": 8.120762027768809, "grad_norm": 0.8662900924682617, "learning_rate": 0.00011306971231548323, "loss": 3.1465, "step": 75450 }, { "epoch": 8.126143579808417, "grad_norm": 0.841042160987854, "learning_rate": 0.00011274647128542183, "loss": 3.1626, "step": 75500 }, { "epoch": 8.131525131848026, "grad_norm": 0.8627299070358276, "learning_rate": 0.0001124232302553604, "loss": 3.1577, "step": 75550 }, { "epoch": 8.136906683887632, "grad_norm": 0.8255852460861206, "learning_rate": 0.00011209998922529899, "loss": 3.1891, "step": 75600 }, { "epoch": 8.142288235927241, "grad_norm": 0.8542093634605408, "learning_rate": 0.00011177674819523757, "loss": 3.167, "step": 75650 }, { "epoch": 8.14766978796685, "grad_norm": 0.8421674370765686, "learning_rate": 0.00011145350716517617, "loss": 3.1702, "step": 75700 }, { "epoch": 8.153051340006458, "grad_norm": 0.8243067264556885, "learning_rate": 0.00011113026613511473, "loss": 3.176, "step": 75750 }, { "epoch": 8.158432892046067, "grad_norm": 0.8624492287635803, "learning_rate": 0.00011080702510505333, "loss": 3.1643, "step": 75800 }, { "epoch": 8.163814444085673, "grad_norm": 0.8803246021270752, "learning_rate": 0.00011048378407499191, "loss": 3.1762, "step": 75850 }, { "epoch": 8.169195996125282, "grad_norm": 0.8619367480278015, "learning_rate": 0.00011016054304493049, "loss": 3.1714, "step": 75900 }, { "epoch": 8.17457754816489, "grad_norm": 0.874962329864502, "learning_rate": 0.00010983730201486907, "loss": 3.1623, "step": 75950 }, { "epoch": 8.1799591002045, "grad_norm": 0.9051128625869751, "learning_rate": 0.00010951406098480767, "loss": 3.1828, "step": 76000 }, { "epoch": 8.1799591002045, "eval_accuracy": 0.39044502158443944, "eval_loss": 3.3335366249084473, "eval_runtime": 184.1757, "eval_samples_per_second": 97.792, "eval_steps_per_second": 6.114, "step": 76000 }, { "epoch": 8.185340652244108, "grad_norm": 0.8197662234306335, "learning_rate": 0.00010919081995474623, "loss": 3.1625, "step": 76050 }, { "epoch": 8.190722204283716, "grad_norm": 0.8349660634994507, "learning_rate": 0.00010886757892468483, "loss": 3.1618, "step": 76100 }, { "epoch": 8.196103756323323, "grad_norm": 0.8758369088172913, "learning_rate": 0.00010854433789462342, "loss": 3.1693, "step": 76150 }, { "epoch": 8.201485308362932, "grad_norm": 0.8578165769577026, "learning_rate": 0.000108221096864562, "loss": 3.1835, "step": 76200 }, { "epoch": 8.20686686040254, "grad_norm": 0.8335930109024048, "learning_rate": 0.00010789785583450059, "loss": 3.1668, "step": 76250 }, { "epoch": 8.212248412442149, "grad_norm": 0.8943109512329102, "learning_rate": 0.00010757461480443917, "loss": 3.1783, "step": 76300 }, { "epoch": 8.217629964481757, "grad_norm": 0.8463231325149536, "learning_rate": 0.00010725137377437776, "loss": 3.166, "step": 76350 }, { "epoch": 8.223011516521364, "grad_norm": 0.8887134790420532, "learning_rate": 0.00010692813274431633, "loss": 3.169, "step": 76400 }, { "epoch": 8.228393068560973, "grad_norm": 0.8750781416893005, "learning_rate": 0.00010660489171425492, "loss": 3.1661, "step": 76450 }, { "epoch": 8.233774620600581, "grad_norm": 0.8328254818916321, "learning_rate": 0.0001062816506841935, "loss": 3.1636, "step": 76500 }, { "epoch": 8.23915617264019, "grad_norm": 0.8516666293144226, "learning_rate": 0.00010595840965413209, "loss": 3.1673, "step": 76550 }, { "epoch": 8.244537724679798, "grad_norm": 0.8627135753631592, "learning_rate": 0.00010563516862407067, "loss": 3.1656, "step": 76600 }, { "epoch": 8.249919276719407, "grad_norm": 0.8882033824920654, "learning_rate": 0.00010531192759400926, "loss": 3.1771, "step": 76650 }, { "epoch": 8.255300828759013, "grad_norm": 0.8409345149993896, "learning_rate": 0.00010498868656394784, "loss": 3.178, "step": 76700 }, { "epoch": 8.260682380798622, "grad_norm": 0.8309500813484192, "learning_rate": 0.00010466544553388642, "loss": 3.166, "step": 76750 }, { "epoch": 8.26606393283823, "grad_norm": 0.8517318964004517, "learning_rate": 0.000104342204503825, "loss": 3.1836, "step": 76800 }, { "epoch": 8.27144548487784, "grad_norm": 0.8607766032218933, "learning_rate": 0.0001040189634737636, "loss": 3.1787, "step": 76850 }, { "epoch": 8.276827036917448, "grad_norm": 0.8421453237533569, "learning_rate": 0.00010369572244370217, "loss": 3.1575, "step": 76900 }, { "epoch": 8.282208588957054, "grad_norm": 0.8441159725189209, "learning_rate": 0.00010337248141364076, "loss": 3.1726, "step": 76950 }, { "epoch": 8.287590140996663, "grad_norm": 0.9111336469650269, "learning_rate": 0.00010304924038357936, "loss": 3.1945, "step": 77000 }, { "epoch": 8.287590140996663, "eval_accuracy": 0.3903413667294644, "eval_loss": 3.331829309463501, "eval_runtime": 184.0804, "eval_samples_per_second": 97.843, "eval_steps_per_second": 6.117, "step": 77000 }, { "epoch": 8.292971693036272, "grad_norm": 0.8794386982917786, "learning_rate": 0.00010272599935351792, "loss": 3.1644, "step": 77050 }, { "epoch": 8.29835324507588, "grad_norm": 0.8792227506637573, "learning_rate": 0.00010240275832345652, "loss": 3.1806, "step": 77100 }, { "epoch": 8.303734797115489, "grad_norm": 0.8737832307815552, "learning_rate": 0.0001020795172933951, "loss": 3.1922, "step": 77150 }, { "epoch": 8.309116349155097, "grad_norm": 0.87740558385849, "learning_rate": 0.0001017562762633337, "loss": 3.1727, "step": 77200 }, { "epoch": 8.314497901194704, "grad_norm": 0.8958991765975952, "learning_rate": 0.00010143303523327226, "loss": 3.1757, "step": 77250 }, { "epoch": 8.319879453234313, "grad_norm": 0.8986038565635681, "learning_rate": 0.00010110979420321086, "loss": 3.1794, "step": 77300 }, { "epoch": 8.325261005273921, "grad_norm": 0.8379119038581848, "learning_rate": 0.00010078655317314944, "loss": 3.1847, "step": 77350 }, { "epoch": 8.33064255731353, "grad_norm": 0.8529695272445679, "learning_rate": 0.00010046331214308802, "loss": 3.1829, "step": 77400 }, { "epoch": 8.336024109353138, "grad_norm": 1.1032296419143677, "learning_rate": 0.0001001400711130266, "loss": 3.1575, "step": 77450 }, { "epoch": 8.341405661392745, "grad_norm": 0.8984729051589966, "learning_rate": 9.98168300829652e-05, "loss": 3.1752, "step": 77500 }, { "epoch": 8.346787213432354, "grad_norm": 0.8461748361587524, "learning_rate": 9.949358905290376e-05, "loss": 3.1868, "step": 77550 }, { "epoch": 8.352168765471962, "grad_norm": 0.8862440586090088, "learning_rate": 9.917034802284236e-05, "loss": 3.1927, "step": 77600 }, { "epoch": 8.35755031751157, "grad_norm": 0.8415391445159912, "learning_rate": 9.884710699278094e-05, "loss": 3.176, "step": 77650 }, { "epoch": 8.36293186955118, "grad_norm": 0.8411977291107178, "learning_rate": 9.852386596271953e-05, "loss": 3.1837, "step": 77700 }, { "epoch": 8.368313421590786, "grad_norm": 0.8862571120262146, "learning_rate": 9.82006249326581e-05, "loss": 3.1804, "step": 77750 }, { "epoch": 8.373694973630395, "grad_norm": 0.8638123273849487, "learning_rate": 9.78773839025967e-05, "loss": 3.1508, "step": 77800 }, { "epoch": 8.379076525670003, "grad_norm": 0.9505172371864319, "learning_rate": 9.755414287253529e-05, "loss": 3.1807, "step": 77850 }, { "epoch": 8.384458077709612, "grad_norm": 0.8431479930877686, "learning_rate": 9.723090184247386e-05, "loss": 3.1791, "step": 77900 }, { "epoch": 8.38983962974922, "grad_norm": 0.8901960849761963, "learning_rate": 9.690766081241245e-05, "loss": 3.1754, "step": 77950 }, { "epoch": 8.395221181788829, "grad_norm": 0.8637632131576538, "learning_rate": 9.658441978235103e-05, "loss": 3.1736, "step": 78000 }, { "epoch": 8.395221181788829, "eval_accuracy": 0.3908162885021015, "eval_loss": 3.328371047973633, "eval_runtime": 184.2614, "eval_samples_per_second": 97.747, "eval_steps_per_second": 6.111, "step": 78000 }, { "epoch": 8.400602733828435, "grad_norm": 0.8351421356201172, "learning_rate": 9.626117875228961e-05, "loss": 3.1715, "step": 78050 }, { "epoch": 8.405984285868044, "grad_norm": 0.8287306427955627, "learning_rate": 9.59379377222282e-05, "loss": 3.1876, "step": 78100 }, { "epoch": 8.411365837907653, "grad_norm": 0.8632174730300903, "learning_rate": 9.561469669216679e-05, "loss": 3.1692, "step": 78150 }, { "epoch": 8.416747389947261, "grad_norm": 0.8752260208129883, "learning_rate": 9.529145566210537e-05, "loss": 3.1784, "step": 78200 }, { "epoch": 8.42212894198687, "grad_norm": 0.878662645816803, "learning_rate": 9.496821463204395e-05, "loss": 3.1546, "step": 78250 }, { "epoch": 8.427510494026476, "grad_norm": 0.843353271484375, "learning_rate": 9.464497360198253e-05, "loss": 3.1921, "step": 78300 }, { "epoch": 8.432892046066085, "grad_norm": 0.8734889030456543, "learning_rate": 9.432173257192113e-05, "loss": 3.1705, "step": 78350 }, { "epoch": 8.438273598105694, "grad_norm": 0.846319317817688, "learning_rate": 9.39984915418597e-05, "loss": 3.1925, "step": 78400 }, { "epoch": 8.443655150145302, "grad_norm": 0.8590786457061768, "learning_rate": 9.367525051179829e-05, "loss": 3.1691, "step": 78450 }, { "epoch": 8.44903670218491, "grad_norm": 0.8895047307014465, "learning_rate": 9.335200948173688e-05, "loss": 3.1766, "step": 78500 }, { "epoch": 8.45441825422452, "grad_norm": 0.8600460886955261, "learning_rate": 9.302876845167545e-05, "loss": 3.1834, "step": 78550 }, { "epoch": 8.459799806264126, "grad_norm": 0.8451809883117676, "learning_rate": 9.270552742161403e-05, "loss": 3.1508, "step": 78600 }, { "epoch": 8.465181358303735, "grad_norm": 0.8594034314155579, "learning_rate": 9.238228639155263e-05, "loss": 3.1848, "step": 78650 }, { "epoch": 8.470562910343343, "grad_norm": 0.8766723275184631, "learning_rate": 9.205904536149122e-05, "loss": 3.1742, "step": 78700 }, { "epoch": 8.475944462382952, "grad_norm": 0.886196494102478, "learning_rate": 9.173580433142979e-05, "loss": 3.1845, "step": 78750 }, { "epoch": 8.48132601442256, "grad_norm": 0.846622884273529, "learning_rate": 9.141256330136838e-05, "loss": 3.1484, "step": 78800 }, { "epoch": 8.486707566462167, "grad_norm": 0.9133267402648926, "learning_rate": 9.108932227130697e-05, "loss": 3.1804, "step": 78850 }, { "epoch": 8.492089118501776, "grad_norm": 0.9626492261886597, "learning_rate": 9.076608124124555e-05, "loss": 3.1945, "step": 78900 }, { "epoch": 8.497470670541384, "grad_norm": 0.843144953250885, "learning_rate": 9.044930503178537e-05, "loss": 3.1742, "step": 78950 }, { "epoch": 8.502852222580993, "grad_norm": 0.8927271962165833, "learning_rate": 9.012606400172395e-05, "loss": 3.1824, "step": 79000 }, { "epoch": 8.502852222580993, "eval_accuracy": 0.3913783498907658, "eval_loss": 3.3255443572998047, "eval_runtime": 184.1563, "eval_samples_per_second": 97.803, "eval_steps_per_second": 6.114, "step": 79000 }, { "epoch": 8.508233774620601, "grad_norm": 0.8790886998176575, "learning_rate": 8.980282297166253e-05, "loss": 3.1835, "step": 79050 }, { "epoch": 8.513615326660208, "grad_norm": 0.881637454032898, "learning_rate": 8.947958194160111e-05, "loss": 3.1952, "step": 79100 }, { "epoch": 8.518996878699816, "grad_norm": 0.9322641491889954, "learning_rate": 8.91563409115397e-05, "loss": 3.1682, "step": 79150 }, { "epoch": 8.524378430739425, "grad_norm": 0.9204634428024292, "learning_rate": 8.883309988147827e-05, "loss": 3.1797, "step": 79200 }, { "epoch": 8.529759982779034, "grad_norm": 0.9618690013885498, "learning_rate": 8.850985885141687e-05, "loss": 3.1871, "step": 79250 }, { "epoch": 8.535141534818642, "grad_norm": 0.8813146352767944, "learning_rate": 8.818661782135545e-05, "loss": 3.2028, "step": 79300 }, { "epoch": 8.54052308685825, "grad_norm": 0.9076323509216309, "learning_rate": 8.786337679129403e-05, "loss": 3.1843, "step": 79350 }, { "epoch": 8.545904638897857, "grad_norm": 0.8698846101760864, "learning_rate": 8.754013576123261e-05, "loss": 3.1707, "step": 79400 }, { "epoch": 8.551286190937466, "grad_norm": 0.882914662361145, "learning_rate": 8.72168947311712e-05, "loss": 3.1889, "step": 79450 }, { "epoch": 8.556667742977075, "grad_norm": 0.8919097781181335, "learning_rate": 8.68936537011098e-05, "loss": 3.1831, "step": 79500 }, { "epoch": 8.562049295016683, "grad_norm": 0.8825957179069519, "learning_rate": 8.657041267104837e-05, "loss": 3.1739, "step": 79550 }, { "epoch": 8.567430847056292, "grad_norm": 0.8758592009544373, "learning_rate": 8.624717164098696e-05, "loss": 3.192, "step": 79600 }, { "epoch": 8.572812399095898, "grad_norm": 0.8645864725112915, "learning_rate": 8.592393061092554e-05, "loss": 3.1842, "step": 79650 }, { "epoch": 8.578193951135507, "grad_norm": 0.860946774482727, "learning_rate": 8.560068958086412e-05, "loss": 3.1601, "step": 79700 }, { "epoch": 8.583575503175116, "grad_norm": 0.8569620251655579, "learning_rate": 8.52774485508027e-05, "loss": 3.1922, "step": 79750 }, { "epoch": 8.588957055214724, "grad_norm": 0.8748207688331604, "learning_rate": 8.49542075207413e-05, "loss": 3.1766, "step": 79800 }, { "epoch": 8.594338607254333, "grad_norm": 0.8643341660499573, "learning_rate": 8.463096649067987e-05, "loss": 3.1729, "step": 79850 }, { "epoch": 8.599720159293941, "grad_norm": 0.880623459815979, "learning_rate": 8.430772546061846e-05, "loss": 3.1981, "step": 79900 }, { "epoch": 8.605101711333548, "grad_norm": 0.8855699896812439, "learning_rate": 8.398448443055704e-05, "loss": 3.1805, "step": 79950 }, { "epoch": 8.610483263373157, "grad_norm": 0.8724981546401978, "learning_rate": 8.366124340049564e-05, "loss": 3.192, "step": 80000 }, { "epoch": 8.610483263373157, "eval_accuracy": 0.39170050570313575, "eval_loss": 3.322553873062134, "eval_runtime": 184.2661, "eval_samples_per_second": 97.745, "eval_steps_per_second": 6.111, "step": 80000 }, { "epoch": 8.615864815412765, "grad_norm": 0.875298023223877, "learning_rate": 8.33380023704342e-05, "loss": 3.1883, "step": 80050 }, { "epoch": 8.621246367452374, "grad_norm": 0.8440449237823486, "learning_rate": 8.30147613403728e-05, "loss": 3.1755, "step": 80100 }, { "epoch": 8.626627919491982, "grad_norm": 0.910894513130188, "learning_rate": 8.269152031031138e-05, "loss": 3.1677, "step": 80150 }, { "epoch": 8.632009471531589, "grad_norm": 0.9109053015708923, "learning_rate": 8.236827928024996e-05, "loss": 3.1983, "step": 80200 }, { "epoch": 8.637391023571197, "grad_norm": 0.8878882527351379, "learning_rate": 8.204503825018854e-05, "loss": 3.1854, "step": 80250 }, { "epoch": 8.642772575610806, "grad_norm": 0.867188572883606, "learning_rate": 8.172179722012714e-05, "loss": 3.1818, "step": 80300 }, { "epoch": 8.648154127650415, "grad_norm": 0.8970143795013428, "learning_rate": 8.13985561900657e-05, "loss": 3.1704, "step": 80350 }, { "epoch": 8.653535679690023, "grad_norm": 0.9002646207809448, "learning_rate": 8.10753151600043e-05, "loss": 3.2036, "step": 80400 }, { "epoch": 8.658917231729632, "grad_norm": 0.8538011908531189, "learning_rate": 8.07520741299429e-05, "loss": 3.1805, "step": 80450 }, { "epoch": 8.664298783769238, "grad_norm": 0.8426478505134583, "learning_rate": 8.042883309988148e-05, "loss": 3.1701, "step": 80500 }, { "epoch": 8.669680335808847, "grad_norm": 0.890612781047821, "learning_rate": 8.010559206982006e-05, "loss": 3.1914, "step": 80550 }, { "epoch": 8.675061887848456, "grad_norm": 0.8478041291236877, "learning_rate": 7.978235103975864e-05, "loss": 3.18, "step": 80600 }, { "epoch": 8.680443439888064, "grad_norm": 0.8645349740982056, "learning_rate": 7.945911000969723e-05, "loss": 3.1701, "step": 80650 }, { "epoch": 8.685824991927673, "grad_norm": 0.8678893446922302, "learning_rate": 7.91358689796358e-05, "loss": 3.1734, "step": 80700 }, { "epoch": 8.69120654396728, "grad_norm": 0.9042659997940063, "learning_rate": 7.88126279495744e-05, "loss": 3.2059, "step": 80750 }, { "epoch": 8.696588096006888, "grad_norm": 0.8825768828392029, "learning_rate": 7.848938691951298e-05, "loss": 3.2001, "step": 80800 }, { "epoch": 8.701969648046497, "grad_norm": 0.8726889491081238, "learning_rate": 7.816614588945156e-05, "loss": 3.1913, "step": 80850 }, { "epoch": 8.707351200086105, "grad_norm": 0.9187114834785461, "learning_rate": 7.784290485939014e-05, "loss": 3.161, "step": 80900 }, { "epoch": 8.712732752125714, "grad_norm": 0.8871028423309326, "learning_rate": 7.751966382932873e-05, "loss": 3.1774, "step": 80950 }, { "epoch": 8.718114304165322, "grad_norm": 0.8644676804542542, "learning_rate": 7.719642279926731e-05, "loss": 3.1836, "step": 81000 }, { "epoch": 8.718114304165322, "eval_accuracy": 0.39196996486491564, "eval_loss": 3.3197431564331055, "eval_runtime": 184.1799, "eval_samples_per_second": 97.79, "eval_steps_per_second": 6.114, "step": 81000 }, { "epoch": 8.723495856204929, "grad_norm": 0.8747641444206238, "learning_rate": 7.68731817692059e-05, "loss": 3.1956, "step": 81050 }, { "epoch": 8.728877408244538, "grad_norm": 0.9052906632423401, "learning_rate": 7.654994073914448e-05, "loss": 3.1889, "step": 81100 }, { "epoch": 8.734258960284146, "grad_norm": 0.8427259922027588, "learning_rate": 7.622669970908307e-05, "loss": 3.1841, "step": 81150 }, { "epoch": 8.739640512323755, "grad_norm": 0.8795633912086487, "learning_rate": 7.590345867902164e-05, "loss": 3.1772, "step": 81200 }, { "epoch": 8.745022064363363, "grad_norm": 0.8907637596130371, "learning_rate": 7.558021764896023e-05, "loss": 3.1867, "step": 81250 }, { "epoch": 8.75040361640297, "grad_norm": 0.9481362700462341, "learning_rate": 7.525697661889883e-05, "loss": 3.1748, "step": 81300 }, { "epoch": 8.755785168442578, "grad_norm": 0.8920351266860962, "learning_rate": 7.493373558883741e-05, "loss": 3.1976, "step": 81350 }, { "epoch": 8.761166720482187, "grad_norm": 0.8413220643997192, "learning_rate": 7.461049455877599e-05, "loss": 3.1844, "step": 81400 }, { "epoch": 8.766548272521796, "grad_norm": 0.8947044610977173, "learning_rate": 7.428725352871457e-05, "loss": 3.1847, "step": 81450 }, { "epoch": 8.771929824561404, "grad_norm": 0.8887403607368469, "learning_rate": 7.396401249865315e-05, "loss": 3.1956, "step": 81500 }, { "epoch": 8.777311376601011, "grad_norm": 0.8827490210533142, "learning_rate": 7.364077146859173e-05, "loss": 3.1874, "step": 81550 }, { "epoch": 8.78269292864062, "grad_norm": 0.9076229929924011, "learning_rate": 7.331753043853033e-05, "loss": 3.18, "step": 81600 }, { "epoch": 8.788074480680228, "grad_norm": 1.0040441751480103, "learning_rate": 7.299428940846891e-05, "loss": 3.1731, "step": 81650 }, { "epoch": 8.793456032719837, "grad_norm": 0.8829438090324402, "learning_rate": 7.267104837840749e-05, "loss": 3.1869, "step": 81700 }, { "epoch": 8.798837584759445, "grad_norm": 0.8670448064804077, "learning_rate": 7.234780734834607e-05, "loss": 3.1835, "step": 81750 }, { "epoch": 8.804219136799054, "grad_norm": 0.8884845972061157, "learning_rate": 7.202456631828465e-05, "loss": 3.1692, "step": 81800 }, { "epoch": 8.80960068883866, "grad_norm": 0.8573910593986511, "learning_rate": 7.170132528822325e-05, "loss": 3.1944, "step": 81850 }, { "epoch": 8.814982240878269, "grad_norm": 0.8754822015762329, "learning_rate": 7.137808425816183e-05, "loss": 3.1814, "step": 81900 }, { "epoch": 8.820363792917878, "grad_norm": 0.8911744356155396, "learning_rate": 7.105484322810041e-05, "loss": 3.1788, "step": 81950 }, { "epoch": 8.825745344957486, "grad_norm": 0.878544807434082, "learning_rate": 7.073160219803899e-05, "loss": 3.1852, "step": 82000 }, { "epoch": 8.825745344957486, "eval_accuracy": 0.3925502799387328, "eval_loss": 3.3142056465148926, "eval_runtime": 184.2311, "eval_samples_per_second": 97.763, "eval_steps_per_second": 6.112, "step": 82000 }, { "epoch": 8.831126896997095, "grad_norm": 0.8705427646636963, "learning_rate": 7.040836116797757e-05, "loss": 3.1738, "step": 82050 }, { "epoch": 8.836508449036701, "grad_norm": 0.8893396854400635, "learning_rate": 7.008512013791617e-05, "loss": 3.189, "step": 82100 }, { "epoch": 8.84189000107631, "grad_norm": 0.8452768921852112, "learning_rate": 6.976187910785475e-05, "loss": 3.1855, "step": 82150 }, { "epoch": 8.847271553115919, "grad_norm": 0.8880962133407593, "learning_rate": 6.943863807779334e-05, "loss": 3.1845, "step": 82200 }, { "epoch": 8.852653105155527, "grad_norm": 0.884946346282959, "learning_rate": 6.911539704773192e-05, "loss": 3.1787, "step": 82250 }, { "epoch": 8.858034657195136, "grad_norm": 0.8754028081893921, "learning_rate": 6.87921560176705e-05, "loss": 3.1703, "step": 82300 }, { "epoch": 8.863416209234742, "grad_norm": 0.8525750041007996, "learning_rate": 6.846891498760909e-05, "loss": 3.1903, "step": 82350 }, { "epoch": 8.868797761274351, "grad_norm": 0.861952543258667, "learning_rate": 6.814567395754767e-05, "loss": 3.178, "step": 82400 }, { "epoch": 8.87417931331396, "grad_norm": 0.8923875689506531, "learning_rate": 6.782243292748626e-05, "loss": 3.1699, "step": 82450 }, { "epoch": 8.879560865353568, "grad_norm": 0.8709211349487305, "learning_rate": 6.749919189742484e-05, "loss": 3.1755, "step": 82500 }, { "epoch": 8.884942417393177, "grad_norm": 0.8978734612464905, "learning_rate": 6.717595086736342e-05, "loss": 3.179, "step": 82550 }, { "epoch": 8.890323969432785, "grad_norm": 0.8863310217857361, "learning_rate": 6.6852709837302e-05, "loss": 3.1661, "step": 82600 }, { "epoch": 8.895705521472392, "grad_norm": 0.8898776173591614, "learning_rate": 6.652946880724059e-05, "loss": 3.1924, "step": 82650 }, { "epoch": 8.901087073512, "grad_norm": 0.8487651348114014, "learning_rate": 6.620622777717918e-05, "loss": 3.1876, "step": 82700 }, { "epoch": 8.906468625551609, "grad_norm": 0.8702915906906128, "learning_rate": 6.588298674711776e-05, "loss": 3.1729, "step": 82750 }, { "epoch": 8.911850177591218, "grad_norm": 0.8562464118003845, "learning_rate": 6.555974571705636e-05, "loss": 3.1797, "step": 82800 }, { "epoch": 8.917231729630826, "grad_norm": 0.9602614641189575, "learning_rate": 6.523650468699494e-05, "loss": 3.1745, "step": 82850 }, { "epoch": 8.922613281670433, "grad_norm": 0.8433753252029419, "learning_rate": 6.491326365693352e-05, "loss": 3.1708, "step": 82900 }, { "epoch": 8.927994833710041, "grad_norm": 0.882200300693512, "learning_rate": 6.459648744747333e-05, "loss": 3.1967, "step": 82950 }, { "epoch": 8.93337638574965, "grad_norm": 0.8851763606071472, "learning_rate": 6.427324641741192e-05, "loss": 3.1693, "step": 83000 }, { "epoch": 8.93337638574965, "eval_accuracy": 0.39274422534348163, "eval_loss": 3.3120665550231934, "eval_runtime": 184.2339, "eval_samples_per_second": 97.762, "eval_steps_per_second": 6.112, "step": 83000 }, { "epoch": 8.938757937789259, "grad_norm": 0.8507657647132874, "learning_rate": 6.39500053873505e-05, "loss": 3.1728, "step": 83050 }, { "epoch": 8.944139489828867, "grad_norm": 0.9048356413841248, "learning_rate": 6.362676435728908e-05, "loss": 3.1782, "step": 83100 }, { "epoch": 8.949521041868476, "grad_norm": 0.9062862396240234, "learning_rate": 6.330352332722766e-05, "loss": 3.1815, "step": 83150 }, { "epoch": 8.954902593908082, "grad_norm": 0.9040356278419495, "learning_rate": 6.298028229716624e-05, "loss": 3.1693, "step": 83200 }, { "epoch": 8.960284145947691, "grad_norm": 0.8811610341072083, "learning_rate": 6.265704126710484e-05, "loss": 3.1772, "step": 83250 }, { "epoch": 8.9656656979873, "grad_norm": 0.8306777477264404, "learning_rate": 6.233380023704342e-05, "loss": 3.18, "step": 83300 }, { "epoch": 8.971047250026908, "grad_norm": 0.8694033622741699, "learning_rate": 6.2010559206982e-05, "loss": 3.1608, "step": 83350 }, { "epoch": 8.976428802066517, "grad_norm": 0.886613130569458, "learning_rate": 6.168731817692058e-05, "loss": 3.1841, "step": 83400 }, { "epoch": 8.981810354106123, "grad_norm": 0.8889451622962952, "learning_rate": 6.136407714685916e-05, "loss": 3.1751, "step": 83450 }, { "epoch": 8.987191906145732, "grad_norm": 0.8559045195579529, "learning_rate": 6.104083611679776e-05, "loss": 3.185, "step": 83500 }, { "epoch": 8.99257345818534, "grad_norm": 0.8820023536682129, "learning_rate": 6.071759508673634e-05, "loss": 3.1787, "step": 83550 }, { "epoch": 8.997955010224949, "grad_norm": 0.9109578728675842, "learning_rate": 6.039435405667492e-05, "loss": 3.1814, "step": 83600 }, { "epoch": 9.003336562264558, "grad_norm": 0.8902610540390015, "learning_rate": 6.007111302661351e-05, "loss": 3.1407, "step": 83650 }, { "epoch": 9.008718114304166, "grad_norm": 0.9014030694961548, "learning_rate": 5.974787199655209e-05, "loss": 3.1033, "step": 83700 }, { "epoch": 9.014099666343773, "grad_norm": 0.8839023113250732, "learning_rate": 5.942463096649068e-05, "loss": 3.1284, "step": 83750 }, { "epoch": 9.019481218383381, "grad_norm": 0.8602494597434998, "learning_rate": 5.910138993642926e-05, "loss": 3.1248, "step": 83800 }, { "epoch": 9.02486277042299, "grad_norm": 0.8714879751205444, "learning_rate": 5.877814890636784e-05, "loss": 3.1281, "step": 83850 }, { "epoch": 9.030244322462599, "grad_norm": 0.8769862651824951, "learning_rate": 5.845490787630643e-05, "loss": 3.1253, "step": 83900 }, { "epoch": 9.035625874502207, "grad_norm": 0.8885403275489807, "learning_rate": 5.813166684624501e-05, "loss": 3.1263, "step": 83950 }, { "epoch": 9.041007426541814, "grad_norm": 0.9083152413368225, "learning_rate": 5.7808425816183596e-05, "loss": 3.1239, "step": 84000 }, { "epoch": 9.041007426541814, "eval_accuracy": 0.39274042249240815, "eval_loss": 3.3148696422576904, "eval_runtime": 184.1491, "eval_samples_per_second": 97.807, "eval_steps_per_second": 6.115, "step": 84000 }, { "epoch": 9.046388978581422, "grad_norm": 0.9183508157730103, "learning_rate": 5.748518478612218e-05, "loss": 3.1139, "step": 84050 }, { "epoch": 9.051770530621031, "grad_norm": 0.9575625061988831, "learning_rate": 5.716840857666199e-05, "loss": 3.1065, "step": 84100 }, { "epoch": 9.05715208266064, "grad_norm": 0.8786783218383789, "learning_rate": 5.684516754660057e-05, "loss": 3.1214, "step": 84150 }, { "epoch": 9.062533634700248, "grad_norm": 0.9034135341644287, "learning_rate": 5.652192651653916e-05, "loss": 3.1147, "step": 84200 }, { "epoch": 9.067915186739857, "grad_norm": 1.0015065670013428, "learning_rate": 5.619868548647774e-05, "loss": 3.121, "step": 84250 }, { "epoch": 9.073296738779463, "grad_norm": 0.847421407699585, "learning_rate": 5.5875444456416335e-05, "loss": 3.1224, "step": 84300 }, { "epoch": 9.078678290819072, "grad_norm": 0.933115541934967, "learning_rate": 5.5552203426354916e-05, "loss": 3.1185, "step": 84350 }, { "epoch": 9.08405984285868, "grad_norm": 0.8964365720748901, "learning_rate": 5.52289623962935e-05, "loss": 3.129, "step": 84400 }, { "epoch": 9.089441394898289, "grad_norm": 0.87543785572052, "learning_rate": 5.4905721366232085e-05, "loss": 3.1214, "step": 84450 }, { "epoch": 9.094822946937898, "grad_norm": 0.8576852679252625, "learning_rate": 5.4582480336170666e-05, "loss": 3.1149, "step": 84500 }, { "epoch": 9.100204498977504, "grad_norm": 0.8693298697471619, "learning_rate": 5.4259239306109254e-05, "loss": 3.1293, "step": 84550 }, { "epoch": 9.105586051017113, "grad_norm": 0.8958057761192322, "learning_rate": 5.3935998276047835e-05, "loss": 3.1124, "step": 84600 }, { "epoch": 9.110967603056721, "grad_norm": 0.9171696901321411, "learning_rate": 5.3612757245986416e-05, "loss": 3.1309, "step": 84650 }, { "epoch": 9.11634915509633, "grad_norm": 0.8535910844802856, "learning_rate": 5.329598103652623e-05, "loss": 3.1134, "step": 84700 }, { "epoch": 9.121730707135939, "grad_norm": 0.8955597877502441, "learning_rate": 5.297274000646482e-05, "loss": 3.1267, "step": 84750 }, { "epoch": 9.127112259175545, "grad_norm": 0.8821160793304443, "learning_rate": 5.26494989764034e-05, "loss": 3.1187, "step": 84800 }, { "epoch": 9.132493811215154, "grad_norm": 0.8788377046585083, "learning_rate": 5.232625794634199e-05, "loss": 3.1049, "step": 84850 }, { "epoch": 9.137875363254762, "grad_norm": 0.8821089863777161, "learning_rate": 5.200301691628057e-05, "loss": 3.1429, "step": 84900 }, { "epoch": 9.143256915294371, "grad_norm": 0.9082825183868408, "learning_rate": 5.167977588621915e-05, "loss": 3.1429, "step": 84950 }, { "epoch": 9.14863846733398, "grad_norm": 0.8419699668884277, "learning_rate": 5.135653485615774e-05, "loss": 3.1428, "step": 85000 }, { "epoch": 9.14863846733398, "eval_accuracy": 0.39299162796903525, "eval_loss": 3.31325364112854, "eval_runtime": 184.0768, "eval_samples_per_second": 97.845, "eval_steps_per_second": 6.117, "step": 85000 }, { "epoch": 9.154020019373588, "grad_norm": 0.8873614072799683, "learning_rate": 5.103329382609632e-05, "loss": 3.1095, "step": 85050 }, { "epoch": 9.159401571413195, "grad_norm": 0.9346480965614319, "learning_rate": 5.0710052796034906e-05, "loss": 3.1219, "step": 85100 }, { "epoch": 9.164783123452803, "grad_norm": 0.88265460729599, "learning_rate": 5.038681176597349e-05, "loss": 3.1206, "step": 85150 }, { "epoch": 9.170164675492412, "grad_norm": 0.8832603693008423, "learning_rate": 5.006357073591207e-05, "loss": 3.1407, "step": 85200 }, { "epoch": 9.17554622753202, "grad_norm": 0.9677432775497437, "learning_rate": 4.974032970585066e-05, "loss": 3.1256, "step": 85250 }, { "epoch": 9.180927779571629, "grad_norm": 0.8652942776679993, "learning_rate": 4.9417088675789244e-05, "loss": 3.145, "step": 85300 }, { "epoch": 9.186309331611236, "grad_norm": 0.861595630645752, "learning_rate": 4.909384764572783e-05, "loss": 3.1331, "step": 85350 }, { "epoch": 9.191690883650844, "grad_norm": 0.879951000213623, "learning_rate": 4.877060661566641e-05, "loss": 3.1235, "step": 85400 }, { "epoch": 9.197072435690453, "grad_norm": 0.8860160112380981, "learning_rate": 4.8447365585604994e-05, "loss": 3.1253, "step": 85450 }, { "epoch": 9.202453987730062, "grad_norm": 0.8945945501327515, "learning_rate": 4.812412455554358e-05, "loss": 3.1369, "step": 85500 }, { "epoch": 9.20783553976967, "grad_norm": 0.9048729538917542, "learning_rate": 4.780088352548216e-05, "loss": 3.128, "step": 85550 }, { "epoch": 9.213217091809279, "grad_norm": 0.9097630977630615, "learning_rate": 4.747764249542075e-05, "loss": 3.1462, "step": 85600 }, { "epoch": 9.218598643848885, "grad_norm": 0.8978161215782166, "learning_rate": 4.715440146535933e-05, "loss": 3.1177, "step": 85650 }, { "epoch": 9.223980195888494, "grad_norm": 0.8683468699455261, "learning_rate": 4.683116043529791e-05, "loss": 3.1277, "step": 85700 }, { "epoch": 9.229361747928102, "grad_norm": 0.9079287648200989, "learning_rate": 4.65079194052365e-05, "loss": 3.1334, "step": 85750 }, { "epoch": 9.234743299967711, "grad_norm": 0.9061818718910217, "learning_rate": 4.618467837517508e-05, "loss": 3.1202, "step": 85800 }, { "epoch": 9.24012485200732, "grad_norm": 0.8817369937896729, "learning_rate": 4.586143734511367e-05, "loss": 3.1187, "step": 85850 }, { "epoch": 9.245506404046926, "grad_norm": 0.8657533526420593, "learning_rate": 4.553819631505225e-05, "loss": 3.1345, "step": 85900 }, { "epoch": 9.250887956086535, "grad_norm": 0.882895827293396, "learning_rate": 4.521495528499083e-05, "loss": 3.1268, "step": 85950 }, { "epoch": 9.256269508126143, "grad_norm": 0.8987122774124146, "learning_rate": 4.489171425492942e-05, "loss": 3.1204, "step": 86000 }, { "epoch": 9.256269508126143, "eval_accuracy": 0.39308094064281873, "eval_loss": 3.313157081604004, "eval_runtime": 184.0804, "eval_samples_per_second": 97.843, "eval_steps_per_second": 6.117, "step": 86000 }, { "epoch": 9.261651060165752, "grad_norm": 0.9081786274909973, "learning_rate": 4.4568473224868e-05, "loss": 3.1435, "step": 86050 }, { "epoch": 9.26703261220536, "grad_norm": 0.9416366815567017, "learning_rate": 4.4245232194806596e-05, "loss": 3.1312, "step": 86100 }, { "epoch": 9.272414164244967, "grad_norm": 0.8970310688018799, "learning_rate": 4.392199116474518e-05, "loss": 3.1364, "step": 86150 }, { "epoch": 9.277795716284576, "grad_norm": 0.8541820645332336, "learning_rate": 4.359875013468376e-05, "loss": 3.1246, "step": 86200 }, { "epoch": 9.283177268324184, "grad_norm": 0.9402540922164917, "learning_rate": 4.3275509104622346e-05, "loss": 3.0953, "step": 86250 }, { "epoch": 9.288558820363793, "grad_norm": 0.8408409357070923, "learning_rate": 4.295226807456093e-05, "loss": 3.1092, "step": 86300 }, { "epoch": 9.293940372403402, "grad_norm": 0.8786787390708923, "learning_rate": 4.2629027044499515e-05, "loss": 3.1178, "step": 86350 }, { "epoch": 9.29932192444301, "grad_norm": 0.8495444059371948, "learning_rate": 4.2305786014438096e-05, "loss": 3.1213, "step": 86400 }, { "epoch": 9.304703476482617, "grad_norm": 0.8830070495605469, "learning_rate": 4.198254498437668e-05, "loss": 3.1348, "step": 86450 }, { "epoch": 9.310085028522225, "grad_norm": 0.8976743221282959, "learning_rate": 4.1659303954315265e-05, "loss": 3.1316, "step": 86500 }, { "epoch": 9.315466580561834, "grad_norm": 0.8559629917144775, "learning_rate": 4.1336062924253846e-05, "loss": 3.1334, "step": 86550 }, { "epoch": 9.320848132601443, "grad_norm": 0.8923842310905457, "learning_rate": 4.1012821894192434e-05, "loss": 3.1209, "step": 86600 }, { "epoch": 9.326229684641051, "grad_norm": 0.8730953335762024, "learning_rate": 4.0689580864131015e-05, "loss": 3.1297, "step": 86650 }, { "epoch": 9.331611236680658, "grad_norm": 0.8883426785469055, "learning_rate": 4.0366339834069596e-05, "loss": 3.0943, "step": 86700 }, { "epoch": 9.336992788720266, "grad_norm": 0.8734377026557922, "learning_rate": 4.0043098804008184e-05, "loss": 3.1432, "step": 86750 }, { "epoch": 9.342374340759875, "grad_norm": 0.8974003195762634, "learning_rate": 3.9719857773946765e-05, "loss": 3.1423, "step": 86800 }, { "epoch": 9.347755892799483, "grad_norm": 0.9093900322914124, "learning_rate": 3.939661674388536e-05, "loss": 3.1176, "step": 86850 }, { "epoch": 9.353137444839092, "grad_norm": 0.913820743560791, "learning_rate": 3.9073375713823934e-05, "loss": 3.1279, "step": 86900 }, { "epoch": 9.3585189968787, "grad_norm": 0.8914437890052795, "learning_rate": 3.8750134683762515e-05, "loss": 3.1164, "step": 86950 }, { "epoch": 9.363900548918307, "grad_norm": 0.9102753400802612, "learning_rate": 3.842689365370111e-05, "loss": 3.1227, "step": 87000 }, { "epoch": 9.363900548918307, "eval_accuracy": 0.39337756302655225, "eval_loss": 3.3099188804626465, "eval_runtime": 184.3153, "eval_samples_per_second": 97.718, "eval_steps_per_second": 6.109, "step": 87000 }, { "epoch": 9.369282100957916, "grad_norm": 0.8917927145957947, "learning_rate": 3.810365262363969e-05, "loss": 3.1314, "step": 87050 }, { "epoch": 9.374663652997524, "grad_norm": 0.9128382205963135, "learning_rate": 3.778041159357828e-05, "loss": 3.1368, "step": 87100 }, { "epoch": 9.380045205037133, "grad_norm": 0.8954198956489563, "learning_rate": 3.745717056351686e-05, "loss": 3.1321, "step": 87150 }, { "epoch": 9.385426757076742, "grad_norm": 0.9754563570022583, "learning_rate": 3.713392953345544e-05, "loss": 3.1199, "step": 87200 }, { "epoch": 9.390808309116348, "grad_norm": 0.8979797959327698, "learning_rate": 3.681068850339403e-05, "loss": 3.1328, "step": 87250 }, { "epoch": 9.396189861155957, "grad_norm": 0.8693102598190308, "learning_rate": 3.648744747333262e-05, "loss": 3.1331, "step": 87300 }, { "epoch": 9.401571413195565, "grad_norm": 0.9184983372688293, "learning_rate": 3.61642064432712e-05, "loss": 3.1342, "step": 87350 }, { "epoch": 9.406952965235174, "grad_norm": 0.9083705544471741, "learning_rate": 3.584096541320978e-05, "loss": 3.1362, "step": 87400 }, { "epoch": 9.412334517274783, "grad_norm": 0.8478923439979553, "learning_rate": 3.551772438314837e-05, "loss": 3.1305, "step": 87450 }, { "epoch": 9.417716069314391, "grad_norm": 0.8750817179679871, "learning_rate": 3.519448335308695e-05, "loss": 3.1258, "step": 87500 }, { "epoch": 9.423097621353998, "grad_norm": 0.8788939118385315, "learning_rate": 3.4871242323025536e-05, "loss": 3.1174, "step": 87550 }, { "epoch": 9.428479173393606, "grad_norm": 0.8856424689292908, "learning_rate": 3.454800129296412e-05, "loss": 3.1396, "step": 87600 }, { "epoch": 9.433860725433215, "grad_norm": 0.8789232969284058, "learning_rate": 3.42247602629027e-05, "loss": 3.127, "step": 87650 }, { "epoch": 9.439242277472824, "grad_norm": 0.8911525011062622, "learning_rate": 3.3901519232841286e-05, "loss": 3.1275, "step": 87700 }, { "epoch": 9.444623829512432, "grad_norm": 0.8742188811302185, "learning_rate": 3.3578278202779874e-05, "loss": 3.1326, "step": 87750 }, { "epoch": 9.450005381552039, "grad_norm": 0.8650107979774475, "learning_rate": 3.3255037172718455e-05, "loss": 3.1415, "step": 87800 }, { "epoch": 9.455386933591647, "grad_norm": 0.9125131368637085, "learning_rate": 3.2931796142657036e-05, "loss": 3.1418, "step": 87850 }, { "epoch": 9.460768485631256, "grad_norm": 0.9166848063468933, "learning_rate": 3.2608555112595624e-05, "loss": 3.1295, "step": 87900 }, { "epoch": 9.466150037670864, "grad_norm": 0.8570738434791565, "learning_rate": 3.2285314082534205e-05, "loss": 3.1243, "step": 87950 }, { "epoch": 9.471531589710473, "grad_norm": 0.9025490880012512, "learning_rate": 3.196207305247279e-05, "loss": 3.1189, "step": 88000 }, { "epoch": 9.471531589710473, "eval_accuracy": 0.39366679701391444, "eval_loss": 3.3074960708618164, "eval_runtime": 184.3415, "eval_samples_per_second": 97.705, "eval_steps_per_second": 6.108, "step": 88000 }, { "epoch": 9.476913141750082, "grad_norm": 0.9051532745361328, "learning_rate": 3.1638832022411374e-05, "loss": 3.1272, "step": 88050 }, { "epoch": 9.482294693789688, "grad_norm": 0.9014541506767273, "learning_rate": 3.1315590992349955e-05, "loss": 3.1419, "step": 88100 }, { "epoch": 9.487676245829297, "grad_norm": 0.8619112372398376, "learning_rate": 3.099234996228854e-05, "loss": 3.1364, "step": 88150 }, { "epoch": 9.493057797868905, "grad_norm": 0.8996751308441162, "learning_rate": 3.066910893222713e-05, "loss": 3.1372, "step": 88200 }, { "epoch": 9.498439349908514, "grad_norm": 0.9325263500213623, "learning_rate": 3.0345867902165712e-05, "loss": 3.134, "step": 88250 }, { "epoch": 9.503820901948123, "grad_norm": 0.9033076167106628, "learning_rate": 3.00226268721043e-05, "loss": 3.1299, "step": 88300 }, { "epoch": 9.50920245398773, "grad_norm": 0.87554931640625, "learning_rate": 2.9699385842042878e-05, "loss": 3.1223, "step": 88350 }, { "epoch": 9.514584006027338, "grad_norm": 0.9007555842399597, "learning_rate": 2.9376144811981465e-05, "loss": 3.1333, "step": 88400 }, { "epoch": 9.519965558066946, "grad_norm": 0.9345928430557251, "learning_rate": 2.905290378192005e-05, "loss": 3.1345, "step": 88450 }, { "epoch": 9.525347110106555, "grad_norm": 0.8623220324516296, "learning_rate": 2.8729662751858634e-05, "loss": 3.147, "step": 88500 }, { "epoch": 9.530728662146164, "grad_norm": 0.9098676443099976, "learning_rate": 2.840642172179722e-05, "loss": 3.1281, "step": 88550 }, { "epoch": 9.536110214185772, "grad_norm": 0.9540818333625793, "learning_rate": 2.8089645512337032e-05, "loss": 3.1485, "step": 88600 }, { "epoch": 9.541491766225379, "grad_norm": 0.8820831775665283, "learning_rate": 2.7766404482275613e-05, "loss": 3.1324, "step": 88650 }, { "epoch": 9.546873318264987, "grad_norm": 0.8952277302742004, "learning_rate": 2.7443163452214198e-05, "loss": 3.1163, "step": 88700 }, { "epoch": 9.552254870304596, "grad_norm": 0.8552523255348206, "learning_rate": 2.7119922422152782e-05, "loss": 3.1395, "step": 88750 }, { "epoch": 9.557636422344205, "grad_norm": 0.8893306255340576, "learning_rate": 2.6796681392091367e-05, "loss": 3.1393, "step": 88800 }, { "epoch": 9.563017974383813, "grad_norm": 0.8926628828048706, "learning_rate": 2.6473440362029955e-05, "loss": 3.1356, "step": 88850 }, { "epoch": 9.56839952642342, "grad_norm": 0.8982229828834534, "learning_rate": 2.6150199331968536e-05, "loss": 3.1336, "step": 88900 }, { "epoch": 9.573781078463028, "grad_norm": 0.8643049001693726, "learning_rate": 2.582695830190712e-05, "loss": 3.1251, "step": 88950 }, { "epoch": 9.579162630502637, "grad_norm": 0.8758737444877625, "learning_rate": 2.5503717271845705e-05, "loss": 3.1408, "step": 89000 }, { "epoch": 9.579162630502637, "eval_accuracy": 0.3939050727968916, "eval_loss": 3.305797815322876, "eval_runtime": 184.2103, "eval_samples_per_second": 97.774, "eval_steps_per_second": 6.113, "step": 89000 }, { "epoch": 9.584544182542245, "grad_norm": 0.9471620321273804, "learning_rate": 2.518047624178429e-05, "loss": 3.1371, "step": 89050 }, { "epoch": 9.589925734581854, "grad_norm": 0.9016526341438293, "learning_rate": 2.4857235211722874e-05, "loss": 3.1133, "step": 89100 }, { "epoch": 9.59530728662146, "grad_norm": 0.9102988839149475, "learning_rate": 2.4533994181661455e-05, "loss": 3.1409, "step": 89150 }, { "epoch": 9.60068883866107, "grad_norm": 0.9097252488136292, "learning_rate": 2.421075315160004e-05, "loss": 3.1219, "step": 89200 }, { "epoch": 9.606070390700678, "grad_norm": 0.851654052734375, "learning_rate": 2.3887512121538624e-05, "loss": 3.1412, "step": 89250 }, { "epoch": 9.611451942740286, "grad_norm": 0.9233815670013428, "learning_rate": 2.3564271091477212e-05, "loss": 3.1346, "step": 89300 }, { "epoch": 9.616833494779895, "grad_norm": 0.8974182605743408, "learning_rate": 2.3241030061415796e-05, "loss": 3.1257, "step": 89350 }, { "epoch": 9.622215046819504, "grad_norm": 0.9321595430374146, "learning_rate": 2.2917789031354377e-05, "loss": 3.1293, "step": 89400 }, { "epoch": 9.62759659885911, "grad_norm": 0.9431509971618652, "learning_rate": 2.2594548001292962e-05, "loss": 3.1247, "step": 89450 }, { "epoch": 9.632978150898719, "grad_norm": 0.9153622388839722, "learning_rate": 2.2271306971231546e-05, "loss": 3.1157, "step": 89500 }, { "epoch": 9.638359702938327, "grad_norm": 0.8639370203018188, "learning_rate": 2.194806594117013e-05, "loss": 3.1321, "step": 89550 }, { "epoch": 9.643741254977936, "grad_norm": 0.8959195017814636, "learning_rate": 2.1624824911108715e-05, "loss": 3.1408, "step": 89600 }, { "epoch": 9.649122807017545, "grad_norm": 0.9533730745315552, "learning_rate": 2.1301583881047296e-05, "loss": 3.1172, "step": 89650 }, { "epoch": 9.654504359057151, "grad_norm": 0.9100210070610046, "learning_rate": 2.097834285098588e-05, "loss": 3.1337, "step": 89700 }, { "epoch": 9.65988591109676, "grad_norm": 1.005946159362793, "learning_rate": 2.065510182092447e-05, "loss": 3.1218, "step": 89750 }, { "epoch": 9.665267463136368, "grad_norm": 0.910432755947113, "learning_rate": 2.0331860790863053e-05, "loss": 3.1313, "step": 89800 }, { "epoch": 9.670649015175977, "grad_norm": 0.9411842226982117, "learning_rate": 2.0008619760801638e-05, "loss": 3.1271, "step": 89850 }, { "epoch": 9.676030567215586, "grad_norm": 0.9593133330345154, "learning_rate": 1.968537873074022e-05, "loss": 3.1221, "step": 89900 }, { "epoch": 9.681412119255192, "grad_norm": 0.8818777203559875, "learning_rate": 1.9362137700678803e-05, "loss": 3.1353, "step": 89950 }, { "epoch": 9.6867936712948, "grad_norm": 0.8657549023628235, "learning_rate": 1.9038896670617388e-05, "loss": 3.1365, "step": 90000 }, { "epoch": 9.6867936712948, "eval_accuracy": 0.39415008505891325, "eval_loss": 3.3024492263793945, "eval_runtime": 184.227, "eval_samples_per_second": 97.765, "eval_steps_per_second": 6.112, "step": 90000 }, { "epoch": 9.69217522333441, "grad_norm": 0.9198330640792847, "learning_rate": 1.8715655640555972e-05, "loss": 3.1364, "step": 90050 }, { "epoch": 9.697556775374018, "grad_norm": 0.8774855136871338, "learning_rate": 1.8392414610494557e-05, "loss": 3.1313, "step": 90100 }, { "epoch": 9.702938327413626, "grad_norm": 0.9081115126609802, "learning_rate": 1.806917358043314e-05, "loss": 3.1326, "step": 90150 }, { "epoch": 9.708319879453235, "grad_norm": 0.9057767987251282, "learning_rate": 1.7745932550371726e-05, "loss": 3.1302, "step": 90200 }, { "epoch": 9.713701431492842, "grad_norm": 0.9399815201759338, "learning_rate": 1.742269152031031e-05, "loss": 3.1324, "step": 90250 }, { "epoch": 9.71908298353245, "grad_norm": 0.9298363924026489, "learning_rate": 1.7099450490248895e-05, "loss": 3.1329, "step": 90300 }, { "epoch": 9.724464535572059, "grad_norm": 0.9103370308876038, "learning_rate": 1.677620946018748e-05, "loss": 3.1188, "step": 90350 }, { "epoch": 9.729846087611667, "grad_norm": 0.8799251914024353, "learning_rate": 1.6452968430126064e-05, "loss": 3.1057, "step": 90400 }, { "epoch": 9.735227639651276, "grad_norm": 0.8957668542861938, "learning_rate": 1.6129727400064645e-05, "loss": 3.1288, "step": 90450 }, { "epoch": 9.740609191690883, "grad_norm": 0.8894574642181396, "learning_rate": 1.580648637000323e-05, "loss": 3.1136, "step": 90500 }, { "epoch": 9.745990743730491, "grad_norm": 0.8673907518386841, "learning_rate": 1.5483245339941817e-05, "loss": 3.1208, "step": 90550 }, { "epoch": 9.7513722957701, "grad_norm": 0.9016950130462646, "learning_rate": 1.5160004309880398e-05, "loss": 3.1354, "step": 90600 }, { "epoch": 9.756753847809708, "grad_norm": 0.9157580733299255, "learning_rate": 1.4836763279818985e-05, "loss": 3.1238, "step": 90650 }, { "epoch": 9.762135399849317, "grad_norm": 0.886666476726532, "learning_rate": 1.4513522249757567e-05, "loss": 3.1383, "step": 90700 }, { "epoch": 9.767516951888926, "grad_norm": 0.913087010383606, "learning_rate": 1.4190281219696152e-05, "loss": 3.125, "step": 90750 }, { "epoch": 9.772898503928532, "grad_norm": 0.8724235892295837, "learning_rate": 1.3867040189634736e-05, "loss": 3.1012, "step": 90800 }, { "epoch": 9.77828005596814, "grad_norm": 0.846278727054596, "learning_rate": 1.3543799159573321e-05, "loss": 3.1216, "step": 90850 }, { "epoch": 9.78366160800775, "grad_norm": 0.8746052384376526, "learning_rate": 1.3220558129511905e-05, "loss": 3.1383, "step": 90900 }, { "epoch": 9.789043160047358, "grad_norm": 0.9024321436882019, "learning_rate": 1.2897317099450488e-05, "loss": 3.1287, "step": 90950 }, { "epoch": 9.794424712086967, "grad_norm": 0.8637295961380005, "learning_rate": 1.2574076069389073e-05, "loss": 3.1175, "step": 91000 }, { "epoch": 9.794424712086967, "eval_accuracy": 0.3944104173781167, "eval_loss": 3.3008501529693604, "eval_runtime": 184.4524, "eval_samples_per_second": 97.646, "eval_steps_per_second": 6.105, "step": 91000 }, { "epoch": 9.799806264126573, "grad_norm": 0.8993793725967407, "learning_rate": 1.2250835039327659e-05, "loss": 3.1117, "step": 91050 }, { "epoch": 9.805187816166182, "grad_norm": 0.9126404523849487, "learning_rate": 1.1927594009266242e-05, "loss": 3.1209, "step": 91100 }, { "epoch": 9.81056936820579, "grad_norm": 0.8682853579521179, "learning_rate": 1.1604352979204826e-05, "loss": 3.1171, "step": 91150 }, { "epoch": 9.815950920245399, "grad_norm": 0.8705700039863586, "learning_rate": 1.1281111949143409e-05, "loss": 3.137, "step": 91200 }, { "epoch": 9.821332472285007, "grad_norm": 0.8994699120521545, "learning_rate": 1.0957870919081995e-05, "loss": 3.1354, "step": 91250 }, { "epoch": 9.826714024324616, "grad_norm": 0.8948203921318054, "learning_rate": 1.063462988902058e-05, "loss": 3.1185, "step": 91300 }, { "epoch": 9.832095576364223, "grad_norm": 0.9019114971160889, "learning_rate": 1.0311388858959162e-05, "loss": 3.1287, "step": 91350 }, { "epoch": 9.837477128403831, "grad_norm": 0.9099586009979248, "learning_rate": 9.988147828897747e-06, "loss": 3.1224, "step": 91400 }, { "epoch": 9.84285868044344, "grad_norm": 0.9111315011978149, "learning_rate": 9.66490679883633e-06, "loss": 3.137, "step": 91450 }, { "epoch": 9.848240232483048, "grad_norm": 0.8715656399726868, "learning_rate": 9.341665768774916e-06, "loss": 3.1067, "step": 91500 }, { "epoch": 9.853621784522657, "grad_norm": 0.8602592349052429, "learning_rate": 9.018424738713499e-06, "loss": 3.1365, "step": 91550 }, { "epoch": 9.859003336562264, "grad_norm": 0.9012041091918945, "learning_rate": 8.695183708652085e-06, "loss": 3.1207, "step": 91600 }, { "epoch": 9.864384888601872, "grad_norm": 0.8642773628234863, "learning_rate": 8.37194267859067e-06, "loss": 3.1257, "step": 91650 }, { "epoch": 9.869766440641481, "grad_norm": 0.8773107528686523, "learning_rate": 8.048701648529252e-06, "loss": 3.1157, "step": 91700 }, { "epoch": 9.87514799268109, "grad_norm": 0.8600118160247803, "learning_rate": 7.725460618467837e-06, "loss": 3.1192, "step": 91750 }, { "epoch": 9.880529544720698, "grad_norm": 0.8994825482368469, "learning_rate": 7.40221958840642e-06, "loss": 3.1232, "step": 91800 }, { "epoch": 9.885911096760307, "grad_norm": 0.919068455696106, "learning_rate": 7.078978558345006e-06, "loss": 3.1329, "step": 91850 }, { "epoch": 9.891292648799913, "grad_norm": 0.8945634961128235, "learning_rate": 6.75573752828359e-06, "loss": 3.1352, "step": 91900 }, { "epoch": 9.896674200839522, "grad_norm": 0.876110851764679, "learning_rate": 6.432496498222174e-06, "loss": 3.1443, "step": 91950 }, { "epoch": 9.90205575287913, "grad_norm": 0.889359176158905, "learning_rate": 6.1092554681607575e-06, "loss": 3.1283, "step": 92000 }, { "epoch": 9.90205575287913, "eval_accuracy": 0.39456166219795447, "eval_loss": 3.299532890319824, "eval_runtime": 183.9856, "eval_samples_per_second": 97.894, "eval_steps_per_second": 6.12, "step": 92000 }, { "epoch": 9.907437304918739, "grad_norm": 0.882473349571228, "learning_rate": 5.786014438099342e-06, "loss": 3.1458, "step": 92050 }, { "epoch": 9.912818856958348, "grad_norm": 0.8881819248199463, "learning_rate": 5.4627734080379264e-06, "loss": 3.1383, "step": 92100 }, { "epoch": 9.918200408997954, "grad_norm": 0.8752103447914124, "learning_rate": 5.139532377976511e-06, "loss": 3.1248, "step": 92150 }, { "epoch": 9.923581961037563, "grad_norm": 0.8208050727844238, "learning_rate": 4.816291347915095e-06, "loss": 3.1235, "step": 92200 }, { "epoch": 9.928963513077171, "grad_norm": 0.8722397685050964, "learning_rate": 4.493050317853679e-06, "loss": 3.1277, "step": 92250 }, { "epoch": 9.93434506511678, "grad_norm": 0.8747850060462952, "learning_rate": 4.169809287792264e-06, "loss": 3.137, "step": 92300 }, { "epoch": 9.939726617156388, "grad_norm": 0.8743521571159363, "learning_rate": 3.846568257730847e-06, "loss": 3.1159, "step": 92350 }, { "epoch": 9.945108169195997, "grad_norm": 0.903657853603363, "learning_rate": 3.523327227669432e-06, "loss": 3.1135, "step": 92400 }, { "epoch": 9.950489721235604, "grad_norm": 0.9355119466781616, "learning_rate": 3.2000861976080162e-06, "loss": 3.1193, "step": 92450 }, { "epoch": 9.955871273275212, "grad_norm": 0.9146791100502014, "learning_rate": 2.8768451675466007e-06, "loss": 3.1347, "step": 92500 }, { "epoch": 9.961252825314821, "grad_norm": 0.8994093537330627, "learning_rate": 2.5536041374851848e-06, "loss": 3.1187, "step": 92550 }, { "epoch": 9.96663437735443, "grad_norm": 0.8931289911270142, "learning_rate": 2.230363107423769e-06, "loss": 3.1213, "step": 92600 }, { "epoch": 9.972015929394038, "grad_norm": 0.895991325378418, "learning_rate": 1.9071220773623531e-06, "loss": 3.1282, "step": 92650 }, { "epoch": 9.977397481433645, "grad_norm": 0.8852683305740356, "learning_rate": 1.5903458679021656e-06, "loss": 3.1207, "step": 92700 }, { "epoch": 9.982779033473253, "grad_norm": 0.8881168961524963, "learning_rate": 1.2671048378407496e-06, "loss": 3.1262, "step": 92750 }, { "epoch": 9.988160585512862, "grad_norm": 0.8898472189903259, "learning_rate": 9.438638077793341e-07, "loss": 3.1314, "step": 92800 }, { "epoch": 9.99354213755247, "grad_norm": 0.8932863473892212, "learning_rate": 6.206227777179182e-07, "loss": 3.1198, "step": 92850 }, { "epoch": 9.998923689592079, "grad_norm": 0.9075705409049988, "learning_rate": 2.9738174765650254e-07, "loss": 3.1304, "step": 92900 }, { "epoch": 10.0, "step": 92910, "total_flos": 7.7680292069376e+17, "train_loss": 3.453627050393827, "train_runtime": 80041.5362, "train_samples_per_second": 37.142, "train_steps_per_second": 1.161 } ], "logging_steps": 50, "max_steps": 92910, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.7680292069376e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }