{ "best_metric": 3.3019161224365234, "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_8397/checkpoint-90000", "epoch": 10.0, "eval_steps": 1000, "global_step": 92910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005381552039608223, "grad_norm": 1.3847299814224243, "learning_rate": 0.0003, "loss": 8.4655, "step": 50 }, { "epoch": 0.010763104079216447, "grad_norm": 1.0632425546646118, "learning_rate": 0.0006, "loss": 6.8249, "step": 100 }, { "epoch": 0.01614465611882467, "grad_norm": 2.088347911834717, "learning_rate": 0.0005996767589699385, "loss": 6.4221, "step": 150 }, { "epoch": 0.021526208158432893, "grad_norm": 1.0710954666137695, "learning_rate": 0.0005993535179398771, "loss": 6.1909, "step": 200 }, { "epoch": 0.026907760198041114, "grad_norm": 1.5236896276474, "learning_rate": 0.0005990302769098158, "loss": 6.0442, "step": 250 }, { "epoch": 0.03228931223764934, "grad_norm": 1.8247500658035278, "learning_rate": 0.0005987070358797543, "loss": 5.9513, "step": 300 }, { "epoch": 0.03767086427725756, "grad_norm": 1.2985302209854126, "learning_rate": 0.0005983837948496929, "loss": 5.8643, "step": 350 }, { "epoch": 0.04305241631686579, "grad_norm": 1.256401538848877, "learning_rate": 0.0005980605538196314, "loss": 5.8018, "step": 400 }, { "epoch": 0.048433968356474004, "grad_norm": 0.8569295406341553, "learning_rate": 0.0005977373127895701, "loss": 5.7053, "step": 450 }, { "epoch": 0.05381552039608223, "grad_norm": 0.860461413860321, "learning_rate": 0.0005974140717595086, "loss": 5.6413, "step": 500 }, { "epoch": 0.05919707243569045, "grad_norm": 1.6800360679626465, "learning_rate": 0.0005970908307294472, "loss": 5.5789, "step": 550 }, { "epoch": 0.06457862447529868, "grad_norm": 1.408766508102417, "learning_rate": 0.0005967675896993858, "loss": 5.4957, "step": 600 }, { "epoch": 0.0699601765149069, "grad_norm": 1.263355016708374, "learning_rate": 0.0005964443486693243, 
"loss": 5.421, "step": 650 }, { "epoch": 0.07534172855451512, "grad_norm": 1.617833137512207, "learning_rate": 0.000596121107639263, "loss": 5.334, "step": 700 }, { "epoch": 0.08072328059412334, "grad_norm": 1.2163816690444946, "learning_rate": 0.0005957978666092015, "loss": 5.2925, "step": 750 }, { "epoch": 0.08610483263373157, "grad_norm": 1.4054591655731201, "learning_rate": 0.0005954746255791401, "loss": 5.222, "step": 800 }, { "epoch": 0.09148638467333979, "grad_norm": 1.0674018859863281, "learning_rate": 0.0005951513845490787, "loss": 5.2071, "step": 850 }, { "epoch": 0.09686793671294801, "grad_norm": 1.4989951848983765, "learning_rate": 0.0005948281435190174, "loss": 5.1526, "step": 900 }, { "epoch": 0.10224948875255624, "grad_norm": 1.028799057006836, "learning_rate": 0.0005945049024889559, "loss": 5.1098, "step": 950 }, { "epoch": 0.10763104079216446, "grad_norm": 0.955634355545044, "learning_rate": 0.0005941816614588944, "loss": 5.0633, "step": 1000 }, { "epoch": 0.10763104079216446, "eval_accuracy": 0.22915491630956988, "eval_loss": 4.996466636657715, "eval_runtime": 202.5064, "eval_samples_per_second": 88.94, "eval_steps_per_second": 5.56, "step": 1000 }, { "epoch": 0.11301259283177269, "grad_norm": 1.2216582298278809, "learning_rate": 0.000593858420428833, "loss": 5.0219, "step": 1050 }, { "epoch": 0.1183941448713809, "grad_norm": 1.2919893264770508, "learning_rate": 0.0005935351793987716, "loss": 4.9962, "step": 1100 }, { "epoch": 0.12377569691098914, "grad_norm": 1.842529058456421, "learning_rate": 0.0005932119383687103, "loss": 4.9736, "step": 1150 }, { "epoch": 0.12915724895059735, "grad_norm": 0.9416630268096924, "learning_rate": 0.0005928886973386488, "loss": 4.9687, "step": 1200 }, { "epoch": 0.13453880099020557, "grad_norm": 1.340468406677246, "learning_rate": 0.0005925654563085874, "loss": 4.9251, "step": 1250 }, { "epoch": 0.1399203530298138, "grad_norm": 1.4057867527008057, "learning_rate": 0.000592242215278526, "loss": 4.8782, "step": 1300 
}, { "epoch": 0.14530190506942203, "grad_norm": 1.3462789058685303, "learning_rate": 0.0005919189742484645, "loss": 4.8431, "step": 1350 }, { "epoch": 0.15068345710903025, "grad_norm": 0.9291279911994934, "learning_rate": 0.0005915957332184032, "loss": 4.8316, "step": 1400 }, { "epoch": 0.15606500914863847, "grad_norm": 0.7789126634597778, "learning_rate": 0.0005912724921883417, "loss": 4.8028, "step": 1450 }, { "epoch": 0.16144656118824668, "grad_norm": 1.2111430168151855, "learning_rate": 0.0005909492511582803, "loss": 4.7837, "step": 1500 }, { "epoch": 0.1668281132278549, "grad_norm": 1.1703121662139893, "learning_rate": 0.0005906260101282189, "loss": 4.7456, "step": 1550 }, { "epoch": 0.17220966526746315, "grad_norm": 0.9755436182022095, "learning_rate": 0.0005903027690981575, "loss": 4.7303, "step": 1600 }, { "epoch": 0.17759121730707136, "grad_norm": 0.9121830463409424, "learning_rate": 0.000589979528068096, "loss": 4.7437, "step": 1650 }, { "epoch": 0.18297276934667958, "grad_norm": 1.2337779998779297, "learning_rate": 0.0005896562870380347, "loss": 4.6991, "step": 1700 }, { "epoch": 0.1883543213862878, "grad_norm": 0.7923302054405212, "learning_rate": 0.0005893330460079732, "loss": 4.6819, "step": 1750 }, { "epoch": 0.19373587342589602, "grad_norm": 0.9414947032928467, "learning_rate": 0.0005890098049779118, "loss": 4.6401, "step": 1800 }, { "epoch": 0.19911742546550426, "grad_norm": 0.8575401306152344, "learning_rate": 0.0005886865639478504, "loss": 4.6503, "step": 1850 }, { "epoch": 0.20449897750511248, "grad_norm": 0.874113917350769, "learning_rate": 0.0005883633229177889, "loss": 4.6262, "step": 1900 }, { "epoch": 0.2098805295447207, "grad_norm": 0.7579442262649536, "learning_rate": 0.0005880400818877276, "loss": 4.5946, "step": 1950 }, { "epoch": 0.2152620815843289, "grad_norm": 0.8062272071838379, "learning_rate": 0.0005877168408576662, "loss": 4.579, "step": 2000 }, { "epoch": 0.2152620815843289, "eval_accuracy": 0.2712283567521957, "eval_loss": 
4.50425910949707, "eval_runtime": 211.1194, "eval_samples_per_second": 85.312, "eval_steps_per_second": 5.333, "step": 2000 }, { "epoch": 0.22064363362393713, "grad_norm": 1.1572167873382568, "learning_rate": 0.0005873935998276048, "loss": 4.5564, "step": 2050 }, { "epoch": 0.22602518566354537, "grad_norm": 1.1494579315185547, "learning_rate": 0.0005870703587975433, "loss": 4.5425, "step": 2100 }, { "epoch": 0.2314067377031536, "grad_norm": 0.9105241298675537, "learning_rate": 0.0005867471177674818, "loss": 4.5083, "step": 2150 }, { "epoch": 0.2367882897427618, "grad_norm": 0.9526411890983582, "learning_rate": 0.0005864238767374205, "loss": 4.5128, "step": 2200 }, { "epoch": 0.24216984178237003, "grad_norm": 0.7726457715034485, "learning_rate": 0.0005861006357073591, "loss": 4.4899, "step": 2250 }, { "epoch": 0.24755139382197827, "grad_norm": 1.0442702770233154, "learning_rate": 0.0005857773946772977, "loss": 4.4716, "step": 2300 }, { "epoch": 0.2529329458615865, "grad_norm": 1.1448233127593994, "learning_rate": 0.0005854541536472362, "loss": 4.4607, "step": 2350 }, { "epoch": 0.2583144979011947, "grad_norm": 0.9515467286109924, "learning_rate": 0.0005851309126171749, "loss": 4.4544, "step": 2400 }, { "epoch": 0.2636960499408029, "grad_norm": 0.7929104566574097, "learning_rate": 0.0005848076715871134, "loss": 4.4392, "step": 2450 }, { "epoch": 0.26907760198041114, "grad_norm": 1.1213116645812988, "learning_rate": 0.000584484430557052, "loss": 4.4407, "step": 2500 }, { "epoch": 0.27445915402001936, "grad_norm": 1.006108045578003, "learning_rate": 0.0005841611895269906, "loss": 4.3997, "step": 2550 }, { "epoch": 0.2798407060596276, "grad_norm": 0.7756773829460144, "learning_rate": 0.0005838379484969291, "loss": 4.3982, "step": 2600 }, { "epoch": 0.2852222580992358, "grad_norm": 0.7442255020141602, "learning_rate": 0.0005835147074668678, "loss": 4.3751, "step": 2650 }, { "epoch": 0.29060381013884407, "grad_norm": 0.7645350694656372, "learning_rate": 
0.0005831914664368063, "loss": 4.3873, "step": 2700 }, { "epoch": 0.2959853621784523, "grad_norm": 0.7028666734695435, "learning_rate": 0.0005828682254067449, "loss": 4.3659, "step": 2750 }, { "epoch": 0.3013669142180605, "grad_norm": 0.8504071831703186, "learning_rate": 0.0005825449843766835, "loss": 4.3617, "step": 2800 }, { "epoch": 0.3067484662576687, "grad_norm": 0.880102276802063, "learning_rate": 0.0005822217433466221, "loss": 4.3115, "step": 2850 }, { "epoch": 0.31213001829727693, "grad_norm": 0.7290977835655212, "learning_rate": 0.0005818985023165607, "loss": 4.3215, "step": 2900 }, { "epoch": 0.31751157033688515, "grad_norm": 0.8019910454750061, "learning_rate": 0.0005815752612864992, "loss": 4.3149, "step": 2950 }, { "epoch": 0.32289312237649337, "grad_norm": 0.7739085555076599, "learning_rate": 0.0005812520202564378, "loss": 4.322, "step": 3000 }, { "epoch": 0.32289312237649337, "eval_accuracy": 0.2992396144952079, "eval_loss": 4.226711750030518, "eval_runtime": 199.2176, "eval_samples_per_second": 90.409, "eval_steps_per_second": 5.652, "step": 3000 }, { "epoch": 0.3282746744161016, "grad_norm": 1.0869840383529663, "learning_rate": 0.0005809287792263764, "loss": 4.3072, "step": 3050 }, { "epoch": 0.3336562264557098, "grad_norm": 0.7337630391120911, "learning_rate": 0.0005806055381963151, "loss": 4.2759, "step": 3100 }, { "epoch": 0.3390377784953181, "grad_norm": 0.7336916923522949, "learning_rate": 0.0005802822971662536, "loss": 4.2838, "step": 3150 }, { "epoch": 0.3444193305349263, "grad_norm": 0.7901497483253479, "learning_rate": 0.0005799590561361922, "loss": 4.2757, "step": 3200 }, { "epoch": 0.3498008825745345, "grad_norm": 0.9568474888801575, "learning_rate": 0.0005796358151061307, "loss": 4.2574, "step": 3250 }, { "epoch": 0.35518243461414273, "grad_norm": 0.9219122529029846, "learning_rate": 0.0005793125740760694, "loss": 4.2423, "step": 3300 }, { "epoch": 0.36056398665375095, "grad_norm": 0.6552030444145203, "learning_rate": 
0.0005789893330460079, "loss": 4.2339, "step": 3350 }, { "epoch": 0.36594553869335916, "grad_norm": 0.6318998336791992, "learning_rate": 0.0005786660920159465, "loss": 4.2253, "step": 3400 }, { "epoch": 0.3713270907329674, "grad_norm": 0.7610264420509338, "learning_rate": 0.0005783428509858851, "loss": 4.2369, "step": 3450 }, { "epoch": 0.3767086427725756, "grad_norm": 0.6418978571891785, "learning_rate": 0.0005780196099558237, "loss": 4.2239, "step": 3500 }, { "epoch": 0.3820901948121838, "grad_norm": 0.7791884541511536, "learning_rate": 0.0005776963689257623, "loss": 4.2253, "step": 3550 }, { "epoch": 0.38747174685179203, "grad_norm": 0.7014104127883911, "learning_rate": 0.0005773731278957008, "loss": 4.1888, "step": 3600 }, { "epoch": 0.3928532988914003, "grad_norm": 0.7691745758056641, "learning_rate": 0.0005770498868656394, "loss": 4.2055, "step": 3650 }, { "epoch": 0.3982348509310085, "grad_norm": 0.7330165505409241, "learning_rate": 0.000576726645835578, "loss": 4.1917, "step": 3700 }, { "epoch": 0.40361640297061674, "grad_norm": 0.6409156918525696, "learning_rate": 0.0005764034048055167, "loss": 4.1739, "step": 3750 }, { "epoch": 0.40899795501022496, "grad_norm": 0.751600444316864, "learning_rate": 0.0005760801637754552, "loss": 4.1815, "step": 3800 }, { "epoch": 0.4143795070498332, "grad_norm": 0.6341384649276733, "learning_rate": 0.0005757569227453937, "loss": 4.1752, "step": 3850 }, { "epoch": 0.4197610590894414, "grad_norm": 0.8716601729393005, "learning_rate": 0.0005754336817153324, "loss": 4.1772, "step": 3900 }, { "epoch": 0.4251426111290496, "grad_norm": 0.7268335223197937, "learning_rate": 0.0005751104406852709, "loss": 4.1579, "step": 3950 }, { "epoch": 0.4305241631686578, "grad_norm": 0.7490660548210144, "learning_rate": 0.0005747871996552096, "loss": 4.1439, "step": 4000 }, { "epoch": 0.4305241631686578, "eval_accuracy": 0.31218462820231296, "eval_loss": 4.092499256134033, "eval_runtime": 199.4467, "eval_samples_per_second": 90.305, 
"eval_steps_per_second": 5.646, "step": 4000 }, { "epoch": 0.43590571520826604, "grad_norm": 0.9424974918365479, "learning_rate": 0.0005744639586251481, "loss": 4.1592, "step": 4050 }, { "epoch": 0.44128726724787426, "grad_norm": 0.771391749382019, "learning_rate": 0.0005741407175950867, "loss": 4.1455, "step": 4100 }, { "epoch": 0.44666881928748253, "grad_norm": 0.7916195392608643, "learning_rate": 0.0005738174765650253, "loss": 4.1329, "step": 4150 }, { "epoch": 0.45205037132709075, "grad_norm": 0.7015873789787292, "learning_rate": 0.0005734942355349638, "loss": 4.1338, "step": 4200 }, { "epoch": 0.45743192336669897, "grad_norm": 0.62185138463974, "learning_rate": 0.0005731709945049025, "loss": 4.1517, "step": 4250 }, { "epoch": 0.4628134754063072, "grad_norm": 0.6499165296554565, "learning_rate": 0.000572847753474841, "loss": 4.1249, "step": 4300 }, { "epoch": 0.4681950274459154, "grad_norm": 0.8142028450965881, "learning_rate": 0.0005725245124447796, "loss": 4.1094, "step": 4350 }, { "epoch": 0.4735765794855236, "grad_norm": 0.686265230178833, "learning_rate": 0.0005722012714147182, "loss": 4.1156, "step": 4400 }, { "epoch": 0.47895813152513184, "grad_norm": 0.6861261129379272, "learning_rate": 0.0005718780303846568, "loss": 4.0911, "step": 4450 }, { "epoch": 0.48433968356474005, "grad_norm": 0.717899739742279, "learning_rate": 0.0005715547893545953, "loss": 4.0984, "step": 4500 }, { "epoch": 0.48972123560434827, "grad_norm": 0.635183572769165, "learning_rate": 0.000571231548324534, "loss": 4.0915, "step": 4550 }, { "epoch": 0.49510278764395654, "grad_norm": 0.6602552533149719, "learning_rate": 0.0005709083072944725, "loss": 4.0886, "step": 4600 }, { "epoch": 0.5004843396835648, "grad_norm": 0.7243141531944275, "learning_rate": 0.0005705850662644111, "loss": 4.0928, "step": 4650 }, { "epoch": 0.505865891723173, "grad_norm": 0.6310231685638428, "learning_rate": 0.0005702618252343497, "loss": 4.0986, "step": 4700 }, { "epoch": 0.5112474437627812, "grad_norm": 
0.5868064761161804, "learning_rate": 0.0005699385842042882, "loss": 4.0687, "step": 4750 }, { "epoch": 0.5166289958023894, "grad_norm": 0.6389384269714355, "learning_rate": 0.0005696153431742269, "loss": 4.0789, "step": 4800 }, { "epoch": 0.5220105478419976, "grad_norm": 0.7776073217391968, "learning_rate": 0.0005692921021441655, "loss": 4.0977, "step": 4850 }, { "epoch": 0.5273920998816058, "grad_norm": 0.6234896779060364, "learning_rate": 0.0005689688611141041, "loss": 4.0547, "step": 4900 }, { "epoch": 0.5327736519212141, "grad_norm": 0.8750380277633667, "learning_rate": 0.0005686456200840426, "loss": 4.0718, "step": 4950 }, { "epoch": 0.5381552039608223, "grad_norm": 0.7700529098510742, "learning_rate": 0.0005683223790539811, "loss": 4.043, "step": 5000 }, { "epoch": 0.5381552039608223, "eval_accuracy": 0.3214325100957547, "eval_loss": 3.9926369190216064, "eval_runtime": 225.5059, "eval_samples_per_second": 79.869, "eval_steps_per_second": 4.993, "step": 5000 }, { "epoch": 0.5435367560004305, "grad_norm": 0.7643857002258301, "learning_rate": 0.0005679991380239198, "loss": 4.0605, "step": 5050 }, { "epoch": 0.5489183080400387, "grad_norm": 0.554442286491394, "learning_rate": 0.0005676758969938584, "loss": 4.0446, "step": 5100 }, { "epoch": 0.5542998600796469, "grad_norm": 0.7216308116912842, "learning_rate": 0.000567352655963797, "loss": 4.0313, "step": 5150 }, { "epoch": 0.5596814121192552, "grad_norm": 0.6057455539703369, "learning_rate": 0.0005670294149337355, "loss": 4.0337, "step": 5200 }, { "epoch": 0.5650629641588634, "grad_norm": 0.535860002040863, "learning_rate": 0.0005667061739036742, "loss": 4.0392, "step": 5250 }, { "epoch": 0.5704445161984716, "grad_norm": 0.647204577922821, "learning_rate": 0.0005663829328736127, "loss": 4.0421, "step": 5300 }, { "epoch": 0.5758260682380799, "grad_norm": 0.5126392841339111, "learning_rate": 0.0005660596918435512, "loss": 4.0318, "step": 5350 }, { "epoch": 0.5812076202776881, "grad_norm": 0.6040687561035156, 
"learning_rate": 0.0005657364508134899, "loss": 4.0117, "step": 5400 }, { "epoch": 0.5865891723172963, "grad_norm": 0.6971921324729919, "learning_rate": 0.0005654132097834284, "loss": 4.025, "step": 5450 }, { "epoch": 0.5919707243569046, "grad_norm": 0.5343753099441528, "learning_rate": 0.0005650899687533671, "loss": 4.015, "step": 5500 }, { "epoch": 0.5973522763965128, "grad_norm": 0.5600801706314087, "learning_rate": 0.0005647667277233056, "loss": 4.0087, "step": 5550 }, { "epoch": 0.602733828436121, "grad_norm": 0.6542683243751526, "learning_rate": 0.0005644434866932442, "loss": 3.9957, "step": 5600 }, { "epoch": 0.6081153804757292, "grad_norm": 0.6249606609344482, "learning_rate": 0.0005641202456631828, "loss": 4.0136, "step": 5650 }, { "epoch": 0.6134969325153374, "grad_norm": 0.6572969555854797, "learning_rate": 0.0005637970046331214, "loss": 4.0046, "step": 5700 }, { "epoch": 0.6188784845549457, "grad_norm": 0.6926669478416443, "learning_rate": 0.00056347376360306, "loss": 3.9869, "step": 5750 }, { "epoch": 0.6242600365945539, "grad_norm": 0.6491366624832153, "learning_rate": 0.0005631505225729985, "loss": 4.0004, "step": 5800 }, { "epoch": 0.6296415886341621, "grad_norm": 0.6138956546783447, "learning_rate": 0.0005628272815429371, "loss": 3.9814, "step": 5850 }, { "epoch": 0.6350231406737703, "grad_norm": 0.6467788219451904, "learning_rate": 0.0005625040405128757, "loss": 3.9771, "step": 5900 }, { "epoch": 0.6404046927133785, "grad_norm": 0.6469590067863464, "learning_rate": 0.0005621807994828143, "loss": 3.9882, "step": 5950 }, { "epoch": 0.6457862447529867, "grad_norm": 0.7396846413612366, "learning_rate": 0.0005618575584527529, "loss": 3.9759, "step": 6000 }, { "epoch": 0.6457862447529867, "eval_accuracy": 0.32833664054615025, "eval_loss": 3.919649839401245, "eval_runtime": 200.1239, "eval_samples_per_second": 89.999, "eval_steps_per_second": 5.627, "step": 6000 }, { "epoch": 0.651167796792595, "grad_norm": 0.7368170619010925, "learning_rate": 
0.0005615343174226915, "loss": 3.9789, "step": 6050 }, { "epoch": 0.6565493488322032, "grad_norm": 0.593928873538971, "learning_rate": 0.00056121107639263, "loss": 3.962, "step": 6100 }, { "epoch": 0.6619309008718114, "grad_norm": 0.5571395754814148, "learning_rate": 0.0005608878353625687, "loss": 3.9732, "step": 6150 }, { "epoch": 0.6673124529114196, "grad_norm": 0.6184373497962952, "learning_rate": 0.0005605645943325072, "loss": 3.9618, "step": 6200 }, { "epoch": 0.6726940049510278, "grad_norm": 0.6731172800064087, "learning_rate": 0.0005602413533024458, "loss": 3.9629, "step": 6250 }, { "epoch": 0.6780755569906362, "grad_norm": null, "learning_rate": 0.0005599245770929855, "loss": 3.9661, "step": 6300 }, { "epoch": 0.6834571090302444, "grad_norm": 0.6674894690513611, "learning_rate": 0.0005596013360629242, "loss": 3.9598, "step": 6350 }, { "epoch": 0.6888386610698526, "grad_norm": 0.6270789504051208, "learning_rate": 0.0005592780950328628, "loss": 3.9443, "step": 6400 }, { "epoch": 0.6942202131094608, "grad_norm": 0.6163814067840576, "learning_rate": 0.0005589548540028014, "loss": 3.9506, "step": 6450 }, { "epoch": 0.699601765149069, "grad_norm": 0.6060782670974731, "learning_rate": 0.0005586316129727399, "loss": 3.9645, "step": 6500 }, { "epoch": 0.7049833171886772, "grad_norm": 0.6246522068977356, "learning_rate": 0.0005583083719426786, "loss": 3.9614, "step": 6550 }, { "epoch": 0.7103648692282855, "grad_norm": 0.603921115398407, "learning_rate": 0.0005579851309126171, "loss": 3.94, "step": 6600 }, { "epoch": 0.7157464212678937, "grad_norm": 0.5655505061149597, "learning_rate": 0.0005576618898825558, "loss": 3.9318, "step": 6650 }, { "epoch": 0.7211279733075019, "grad_norm": 0.604542076587677, "learning_rate": 0.0005573386488524943, "loss": 3.9307, "step": 6700 }, { "epoch": 0.7265095253471101, "grad_norm": 0.600004255771637, "learning_rate": 0.0005570154078224328, "loss": 3.9442, "step": 6750 }, { "epoch": 0.7318910773867183, "grad_norm": 
0.6960250735282898, "learning_rate": 0.0005566921667923715, "loss": 3.9425, "step": 6800 }, { "epoch": 0.7372726294263265, "grad_norm": 0.5619109869003296, "learning_rate": 0.00055636892576231, "loss": 3.9073, "step": 6850 }, { "epoch": 0.7426541814659348, "grad_norm": 0.6036113500595093, "learning_rate": 0.0005560456847322487, "loss": 3.9305, "step": 6900 }, { "epoch": 0.748035733505543, "grad_norm": 0.6234127283096313, "learning_rate": 0.0005557224437021872, "loss": 3.9285, "step": 6950 }, { "epoch": 0.7534172855451512, "grad_norm": 0.6131523251533508, "learning_rate": 0.0005553992026721258, "loss": 3.922, "step": 7000 }, { "epoch": 0.7534172855451512, "eval_accuracy": 0.3334209437785411, "eval_loss": 3.861107349395752, "eval_runtime": 205.1306, "eval_samples_per_second": 87.803, "eval_steps_per_second": 5.489, "step": 7000 }, { "epoch": 0.7587988375847594, "grad_norm": 0.6058487296104431, "learning_rate": 0.0005550759616420644, "loss": 3.9083, "step": 7050 }, { "epoch": 0.7641803896243676, "grad_norm": 0.624620258808136, "learning_rate": 0.000554752720612003, "loss": 3.9349, "step": 7100 }, { "epoch": 0.7695619416639758, "grad_norm": 0.5525732040405273, "learning_rate": 0.0005544294795819415, "loss": 3.9261, "step": 7150 }, { "epoch": 0.7749434937035841, "grad_norm": 0.5950748324394226, "learning_rate": 0.0005541062385518801, "loss": 3.9067, "step": 7200 }, { "epoch": 0.7803250457431924, "grad_norm": 0.5052813291549683, "learning_rate": 0.0005537829975218188, "loss": 3.9004, "step": 7250 }, { "epoch": 0.7857065977828006, "grad_norm": 0.5610913038253784, "learning_rate": 0.0005534662213123586, "loss": 3.8911, "step": 7300 }, { "epoch": 0.7910881498224088, "grad_norm": 0.5639758110046387, "learning_rate": 0.0005531429802822971, "loss": 3.931, "step": 7350 }, { "epoch": 0.796469701862017, "grad_norm": 0.6221727728843689, "learning_rate": 0.0005528197392522357, "loss": 3.9288, "step": 7400 }, { "epoch": 0.8018512539016253, "grad_norm": 0.5807225108146667, 
"learning_rate": 0.0005524964982221743, "loss": 3.9206, "step": 7450 }, { "epoch": 0.8072328059412335, "grad_norm": 0.6214258074760437, "learning_rate": 0.0005521732571921129, "loss": 3.9216, "step": 7500 }, { "epoch": 0.8126143579808417, "grad_norm": 0.6088549494743347, "learning_rate": 0.0005518500161620514, "loss": 3.9102, "step": 7550 }, { "epoch": 0.8179959100204499, "grad_norm": 0.615315318107605, "learning_rate": 0.00055152677513199, "loss": 3.8932, "step": 7600 }, { "epoch": 0.8233774620600581, "grad_norm": 0.5932325124740601, "learning_rate": 0.0005512035341019286, "loss": 3.9037, "step": 7650 }, { "epoch": 0.8287590140996663, "grad_norm": 0.632127583026886, "learning_rate": 0.0005508802930718672, "loss": 3.9074, "step": 7700 }, { "epoch": 0.8341405661392746, "grad_norm": 0.5906286239624023, "learning_rate": 0.0005505570520418058, "loss": 3.8969, "step": 7750 }, { "epoch": 0.8395221181788828, "grad_norm": 0.7419958114624023, "learning_rate": 0.0005502338110117443, "loss": 3.911, "step": 7800 }, { "epoch": 0.844903670218491, "grad_norm": 0.5959650874137878, "learning_rate": 0.000549910569981683, "loss": 3.885, "step": 7850 }, { "epoch": 0.8502852222580992, "grad_norm": 0.5370450615882874, "learning_rate": 0.0005495873289516215, "loss": 3.8883, "step": 7900 }, { "epoch": 0.8556667742977074, "grad_norm": 0.6142526865005493, "learning_rate": 0.0005492640879215602, "loss": 3.8891, "step": 7950 }, { "epoch": 0.8610483263373157, "grad_norm": 0.5250054001808167, "learning_rate": 0.0005489408468914987, "loss": 3.8731, "step": 8000 }, { "epoch": 0.8610483263373157, "eval_accuracy": 0.3380041398923315, "eval_loss": 3.814573049545288, "eval_runtime": 218.7029, "eval_samples_per_second": 82.354, "eval_steps_per_second": 5.149, "step": 8000 }, { "epoch": 0.8664298783769239, "grad_norm": 0.5592741370201111, "learning_rate": 0.0005486176058614372, "loss": 3.8661, "step": 8050 }, { "epoch": 0.8718114304165321, "grad_norm": 0.558358371257782, "learning_rate": 
0.0005482943648313759, "loss": 3.8665, "step": 8100 }, { "epoch": 0.8771929824561403, "grad_norm": 0.6065065860748291, "learning_rate": 0.0005479711238013145, "loss": 3.8815, "step": 8150 }, { "epoch": 0.8825745344957485, "grad_norm": 0.566906750202179, "learning_rate": 0.0005476478827712531, "loss": 3.8788, "step": 8200 }, { "epoch": 0.8879560865353568, "grad_norm": 0.5542231202125549, "learning_rate": 0.0005473246417411916, "loss": 3.8757, "step": 8250 }, { "epoch": 0.8933376385749651, "grad_norm": 0.5650736093521118, "learning_rate": 0.0005470014007111302, "loss": 3.867, "step": 8300 }, { "epoch": 0.8987191906145733, "grad_norm": 0.6359168887138367, "learning_rate": 0.0005466781596810688, "loss": 3.8704, "step": 8350 }, { "epoch": 0.9041007426541815, "grad_norm": 0.5609973669052124, "learning_rate": 0.0005463549186510073, "loss": 3.8704, "step": 8400 }, { "epoch": 0.9094822946937897, "grad_norm": 0.5966047048568726, "learning_rate": 0.000546031677620946, "loss": 3.8649, "step": 8450 }, { "epoch": 0.9148638467333979, "grad_norm": 0.555415689945221, "learning_rate": 0.0005457084365908845, "loss": 3.8775, "step": 8500 }, { "epoch": 0.9202453987730062, "grad_norm": 0.5719990134239197, "learning_rate": 0.0005453851955608232, "loss": 3.8601, "step": 8550 }, { "epoch": 0.9256269508126144, "grad_norm": 0.5464230179786682, "learning_rate": 0.0005450619545307617, "loss": 3.8599, "step": 8600 }, { "epoch": 0.9310085028522226, "grad_norm": 0.6069023013114929, "learning_rate": 0.0005447387135007003, "loss": 3.8584, "step": 8650 }, { "epoch": 0.9363900548918308, "grad_norm": 0.556610643863678, "learning_rate": 0.0005444154724706389, "loss": 3.8636, "step": 8700 }, { "epoch": 0.941771606931439, "grad_norm": 0.5988168120384216, "learning_rate": 0.0005440922314405775, "loss": 3.8366, "step": 8750 }, { "epoch": 0.9471531589710472, "grad_norm": 0.5634347200393677, "learning_rate": 0.0005437689904105161, "loss": 3.8604, "step": 8800 }, { "epoch": 0.9525347110106555, "grad_norm": 
0.5349180102348328, "learning_rate": 0.0005434457493804546, "loss": 3.8395, "step": 8850 }, { "epoch": 0.9579162630502637, "grad_norm": 0.5537461042404175, "learning_rate": 0.0005431225083503932, "loss": 3.8644, "step": 8900 }, { "epoch": 0.9632978150898719, "grad_norm": 0.5711461901664734, "learning_rate": 0.0005427992673203318, "loss": 3.8352, "step": 8950 }, { "epoch": 0.9686793671294801, "grad_norm": 0.6128414273262024, "learning_rate": 0.0005424760262902704, "loss": 3.8405, "step": 9000 }, { "epoch": 0.9686793671294801, "eval_accuracy": 0.3417073562677125, "eval_loss": 3.779106378555298, "eval_runtime": 210.7209, "eval_samples_per_second": 85.473, "eval_steps_per_second": 5.344, "step": 9000 }, { "epoch": 0.9740609191690883, "grad_norm": 0.6156434416770935, "learning_rate": 0.000542152785260209, "loss": 3.8333, "step": 9050 }, { "epoch": 0.9794424712086965, "grad_norm": 0.5652556419372559, "learning_rate": 0.0005418295442301476, "loss": 3.8327, "step": 9100 }, { "epoch": 0.9848240232483048, "grad_norm": 0.6443074941635132, "learning_rate": 0.0005415063032000861, "loss": 3.842, "step": 9150 }, { "epoch": 0.9902055752879131, "grad_norm": 0.632297158241272, "learning_rate": 0.0005411830621700248, "loss": 3.8312, "step": 9200 }, { "epoch": 0.9955871273275213, "grad_norm": 0.5863285064697266, "learning_rate": 0.0005408598211399633, "loss": 3.8298, "step": 9250 }, { "epoch": 1.0009686793671295, "grad_norm": 0.5393086671829224, "learning_rate": 0.0005405365801099019, "loss": 3.8069, "step": 9300 }, { "epoch": 1.0063502314067376, "grad_norm": 0.5285312533378601, "learning_rate": 0.0005402133390798405, "loss": 3.7619, "step": 9350 }, { "epoch": 1.011731783446346, "grad_norm": 0.5921708941459656, "learning_rate": 0.000539890098049779, "loss": 3.7701, "step": 9400 }, { "epoch": 1.017113335485954, "grad_norm": 0.5384525060653687, "learning_rate": 0.0005395668570197177, "loss": 3.7657, "step": 9450 }, { "epoch": 1.0224948875255624, "grad_norm": 0.5624929666519165, 
"learning_rate": 0.0005392436159896562, "loss": 3.7754, "step": 9500 }, { "epoch": 1.0278764395651705, "grad_norm": 0.5882208943367004, "learning_rate": 0.0005389203749595948, "loss": 3.7848, "step": 9550 }, { "epoch": 1.0332579916047788, "grad_norm": 0.5588924884796143, "learning_rate": 0.0005385971339295334, "loss": 3.7603, "step": 9600 }, { "epoch": 1.0386395436443872, "grad_norm": 0.5837873220443726, "learning_rate": 0.000538273892899472, "loss": 3.7785, "step": 9650 }, { "epoch": 1.0440210956839953, "grad_norm": 0.5880899429321289, "learning_rate": 0.0005379506518694106, "loss": 3.7506, "step": 9700 }, { "epoch": 1.0494026477236036, "grad_norm": 0.5579642653465271, "learning_rate": 0.0005376274108393491, "loss": 3.7714, "step": 9750 }, { "epoch": 1.0547841997632117, "grad_norm": 0.576766312122345, "learning_rate": 0.0005373041698092877, "loss": 3.7653, "step": 9800 }, { "epoch": 1.06016575180282, "grad_norm": 0.5471925735473633, "learning_rate": 0.0005369809287792263, "loss": 3.7686, "step": 9850 }, { "epoch": 1.0655473038424281, "grad_norm": 0.59092116355896, "learning_rate": 0.000536657687749165, "loss": 3.7667, "step": 9900 }, { "epoch": 1.0709288558820365, "grad_norm": 0.638380765914917, "learning_rate": 0.0005363344467191035, "loss": 3.7722, "step": 9950 }, { "epoch": 1.0763104079216446, "grad_norm": 0.5594536066055298, "learning_rate": 0.000536011205689042, "loss": 3.7655, "step": 10000 }, { "epoch": 1.0763104079216446, "eval_accuracy": 0.3451049320696713, "eval_loss": 3.7489354610443115, "eval_runtime": 202.8767, "eval_samples_per_second": 88.778, "eval_steps_per_second": 5.55, "step": 10000 }, { "epoch": 1.081691959961253, "grad_norm": 0.640766441822052, "learning_rate": 0.0005356879646589807, "loss": 3.7646, "step": 10050 }, { "epoch": 1.087073512000861, "grad_norm": 0.6311773657798767, "learning_rate": 0.0005353647236289192, "loss": 3.7643, "step": 10100 }, { "epoch": 1.0924550640404693, "grad_norm": 0.5654868483543396, "learning_rate": 
0.0005350414825988579, "loss": 3.7592, "step": 10150 }, { "epoch": 1.0978366160800774, "grad_norm": 0.5341681838035583, "learning_rate": 0.0005347182415687964, "loss": 3.7609, "step": 10200 }, { "epoch": 1.1032181681196858, "grad_norm": 0.8984493613243103, "learning_rate": 0.000534395000538735, "loss": 3.7588, "step": 10250 }, { "epoch": 1.1085997201592939, "grad_norm": 0.6130673885345459, "learning_rate": 0.0005340717595086736, "loss": 3.7725, "step": 10300 }, { "epoch": 1.1139812721989022, "grad_norm": 0.5171802043914795, "learning_rate": 0.0005337485184786122, "loss": 3.7686, "step": 10350 }, { "epoch": 1.1193628242385103, "grad_norm": 0.6430924534797668, "learning_rate": 0.0005334252774485507, "loss": 3.7493, "step": 10400 }, { "epoch": 1.1247443762781186, "grad_norm": 0.5376786589622498, "learning_rate": 0.0005331020364184894, "loss": 3.7686, "step": 10450 }, { "epoch": 1.1301259283177267, "grad_norm": 0.5798326730728149, "learning_rate": 0.0005327787953884279, "loss": 3.7654, "step": 10500 }, { "epoch": 1.135507480357335, "grad_norm": 0.5384038090705872, "learning_rate": 0.0005324555543583665, "loss": 3.7485, "step": 10550 }, { "epoch": 1.1408890323969434, "grad_norm": 0.5550758838653564, "learning_rate": 0.0005321323133283051, "loss": 3.748, "step": 10600 }, { "epoch": 1.1462705844365515, "grad_norm": 0.5656107664108276, "learning_rate": 0.0005318090722982436, "loss": 3.7485, "step": 10650 }, { "epoch": 1.1516521364761596, "grad_norm": 0.5888227224349976, "learning_rate": 0.0005314858312681823, "loss": 3.7438, "step": 10700 }, { "epoch": 1.157033688515768, "grad_norm": 0.5515899658203125, "learning_rate": 0.0005311625902381209, "loss": 3.7721, "step": 10750 }, { "epoch": 1.1624152405553763, "grad_norm": 0.6210424900054932, "learning_rate": 0.0005308393492080595, "loss": 3.7652, "step": 10800 }, { "epoch": 1.1677967925949844, "grad_norm": 0.5752713084220886, "learning_rate": 0.000530516108177998, "loss": 3.7486, "step": 10850 }, { "epoch": 1.1731783446345927, 
"grad_norm": 0.6572223901748657, "learning_rate": 0.0005301928671479365, "loss": 3.7272, "step": 10900 }, { "epoch": 1.1785598966742008, "grad_norm": 0.6310757994651794, "learning_rate": 0.0005298696261178752, "loss": 3.7563, "step": 10950 }, { "epoch": 1.1839414487138091, "grad_norm": 0.5674658417701721, "learning_rate": 0.0005295463850878138, "loss": 3.7441, "step": 11000 }, { "epoch": 1.1839414487138091, "eval_accuracy": 0.34708784727228553, "eval_loss": 3.7200334072113037, "eval_runtime": 208.4996, "eval_samples_per_second": 86.384, "eval_steps_per_second": 5.4, "step": 11000 }, { "epoch": 1.1893230007534172, "grad_norm": 0.5541161298751831, "learning_rate": 0.0005292231440577524, "loss": 3.7499, "step": 11050 }, { "epoch": 1.1947045527930256, "grad_norm": 0.5533831715583801, "learning_rate": 0.0005288999030276909, "loss": 3.7418, "step": 11100 }, { "epoch": 1.2000861048326337, "grad_norm": 0.5780303478240967, "learning_rate": 0.0005285766619976295, "loss": 3.7409, "step": 11150 }, { "epoch": 1.205467656872242, "grad_norm": 0.6292614340782166, "learning_rate": 0.0005282534209675681, "loss": 3.7435, "step": 11200 }, { "epoch": 1.21084920891185, "grad_norm": 0.5940732359886169, "learning_rate": 0.0005279301799375066, "loss": 3.7362, "step": 11250 }, { "epoch": 1.2162307609514584, "grad_norm": 0.5676620006561279, "learning_rate": 0.0005276134037280465, "loss": 3.7241, "step": 11300 }, { "epoch": 1.2216123129910665, "grad_norm": 0.5294714570045471, "learning_rate": 0.0005272901626979851, "loss": 3.7192, "step": 11350 }, { "epoch": 1.2269938650306749, "grad_norm": 0.5521119832992554, "learning_rate": 0.0005269669216679236, "loss": 3.734, "step": 11400 }, { "epoch": 1.232375417070283, "grad_norm": 0.7975606918334961, "learning_rate": 0.0005266436806378623, "loss": 3.7188, "step": 11450 }, { "epoch": 1.2377569691098913, "grad_norm": 0.5794736742973328, "learning_rate": 0.0005263204396078008, "loss": 3.7279, "step": 11500 }, { "epoch": 1.2431385211494996, "grad_norm": 
0.5361841917037964, "learning_rate": 0.0005259971985777394, "loss": 3.7432, "step": 11550 }, { "epoch": 1.2485200731891077, "grad_norm": 0.5248964428901672, "learning_rate": 0.000525673957547678, "loss": 3.7447, "step": 11600 }, { "epoch": 1.2539016252287158, "grad_norm": 0.5553768873214722, "learning_rate": 0.0005253507165176167, "loss": 3.7477, "step": 11650 }, { "epoch": 1.2592831772683242, "grad_norm": 0.5761224627494812, "learning_rate": 0.0005250274754875552, "loss": 3.7211, "step": 11700 }, { "epoch": 1.2646647293079325, "grad_norm": 0.607130229473114, "learning_rate": 0.0005247042344574938, "loss": 3.7359, "step": 11750 }, { "epoch": 1.2700462813475406, "grad_norm": 0.5540531873703003, "learning_rate": 0.0005243809934274323, "loss": 3.7217, "step": 11800 }, { "epoch": 1.275427833387149, "grad_norm": 0.6098884344100952, "learning_rate": 0.0005240577523973709, "loss": 3.7431, "step": 11850 }, { "epoch": 1.280809385426757, "grad_norm": 0.5816884636878967, "learning_rate": 0.0005237345113673095, "loss": 3.7404, "step": 11900 }, { "epoch": 1.2861909374663654, "grad_norm": 0.5769429802894592, "learning_rate": 0.0005234112703372481, "loss": 3.7404, "step": 11950 }, { "epoch": 1.2915724895059735, "grad_norm": 0.563298761844635, "learning_rate": 0.0005230880293071867, "loss": 3.7293, "step": 12000 }, { "epoch": 1.2915724895059735, "eval_accuracy": 0.34958697234490643, "eval_loss": 3.6987245082855225, "eval_runtime": 217.4267, "eval_samples_per_second": 82.837, "eval_steps_per_second": 5.179, "step": 12000 }, { "epoch": 1.2969540415455818, "grad_norm": 0.5788484811782837, "learning_rate": 0.0005227647882771253, "loss": 3.7336, "step": 12050 }, { "epoch": 1.30233559358519, "grad_norm": 0.6204023957252502, "learning_rate": 0.0005224415472470639, "loss": 3.7243, "step": 12100 }, { "epoch": 1.3077171456247982, "grad_norm": 0.5986481308937073, "learning_rate": 0.0005221183062170024, "loss": 3.7316, "step": 12150 }, { "epoch": 1.3130986976644063, "grad_norm": 
0.6356789469718933, "learning_rate": 0.0005217950651869409, "loss": 3.7352, "step": 12200 }, { "epoch": 1.3184802497040147, "grad_norm": 0.555164098739624, "learning_rate": 0.0005214718241568796, "loss": 3.7149, "step": 12250 }, { "epoch": 1.3238618017436228, "grad_norm": 0.6156308054924011, "learning_rate": 0.0005211485831268182, "loss": 3.6991, "step": 12300 }, { "epoch": 1.329243353783231, "grad_norm": 0.5790920853614807, "learning_rate": 0.0005208253420967568, "loss": 3.7206, "step": 12350 }, { "epoch": 1.3346249058228392, "grad_norm": 0.5524982810020447, "learning_rate": 0.0005205021010666953, "loss": 3.7344, "step": 12400 }, { "epoch": 1.3400064578624475, "grad_norm": 0.5789136290550232, "learning_rate": 0.0005201788600366339, "loss": 3.714, "step": 12450 }, { "epoch": 1.3453880099020559, "grad_norm": 0.5845145583152771, "learning_rate": 0.0005198556190065725, "loss": 3.7083, "step": 12500 }, { "epoch": 1.350769561941664, "grad_norm": 0.6391981840133667, "learning_rate": 0.0005195323779765112, "loss": 3.7103, "step": 12550 }, { "epoch": 1.356151113981272, "grad_norm": 0.543088972568512, "learning_rate": 0.0005192091369464497, "loss": 3.7256, "step": 12600 }, { "epoch": 1.3615326660208804, "grad_norm": 0.5750318169593811, "learning_rate": 0.0005188858959163882, "loss": 3.7289, "step": 12650 }, { "epoch": 1.3669142180604887, "grad_norm": 0.6135967969894409, "learning_rate": 0.0005185626548863269, "loss": 3.7014, "step": 12700 }, { "epoch": 1.3722957701000968, "grad_norm": 0.5833747982978821, "learning_rate": 0.0005182394138562654, "loss": 3.7292, "step": 12750 }, { "epoch": 1.3776773221397052, "grad_norm": 0.571729838848114, "learning_rate": 0.0005179161728262041, "loss": 3.7158, "step": 12800 }, { "epoch": 1.3830588741793133, "grad_norm": 0.6265289783477783, "learning_rate": 0.0005175929317961426, "loss": 3.7261, "step": 12850 }, { "epoch": 1.3884404262189216, "grad_norm": 0.6207230091094971, "learning_rate": 0.0005172696907660812, "loss": 3.699, "step": 12900 
}, { "epoch": 1.3938219782585297, "grad_norm": 0.5325709581375122, "learning_rate": 0.0005169464497360198, "loss": 3.7059, "step": 12950 }, { "epoch": 1.399203530298138, "grad_norm": 0.5781683325767517, "learning_rate": 0.0005166232087059583, "loss": 3.7029, "step": 13000 }, { "epoch": 1.399203530298138, "eval_accuracy": 0.3522667871699628, "eval_loss": 3.674454927444458, "eval_runtime": 217.915, "eval_samples_per_second": 82.651, "eval_steps_per_second": 5.167, "step": 13000 }, { "epoch": 1.4045850823377461, "grad_norm": 0.565986156463623, "learning_rate": 0.0005162999676758969, "loss": 3.7075, "step": 13050 }, { "epoch": 1.4099666343773545, "grad_norm": 0.5653162002563477, "learning_rate": 0.0005159767266458355, "loss": 3.6983, "step": 13100 }, { "epoch": 1.4153481864169626, "grad_norm": 0.5863538384437561, "learning_rate": 0.0005156534856157741, "loss": 3.7083, "step": 13150 }, { "epoch": 1.420729738456571, "grad_norm": 0.5486993789672852, "learning_rate": 0.0005153302445857127, "loss": 3.7251, "step": 13200 }, { "epoch": 1.426111290496179, "grad_norm": 0.6212597489356995, "learning_rate": 0.0005150070035556513, "loss": 3.7196, "step": 13250 }, { "epoch": 1.4314928425357873, "grad_norm": 0.6025354862213135, "learning_rate": 0.0005146837625255898, "loss": 3.7195, "step": 13300 }, { "epoch": 1.4368743945753955, "grad_norm": 0.5640348792076111, "learning_rate": 0.0005143669863161297, "loss": 3.7035, "step": 13350 }, { "epoch": 1.4422559466150038, "grad_norm": 0.5984680652618408, "learning_rate": 0.0005140437452860683, "loss": 3.7052, "step": 13400 }, { "epoch": 1.447637498654612, "grad_norm": 0.5823872685432434, "learning_rate": 0.0005137205042560069, "loss": 3.695, "step": 13450 }, { "epoch": 1.4530190506942202, "grad_norm": 0.5364094972610474, "learning_rate": 0.0005133972632259455, "loss": 3.7199, "step": 13500 }, { "epoch": 1.4584006027338283, "grad_norm": 0.5685153603553772, "learning_rate": 0.000513074022195884, "loss": 3.6992, "step": 13550 }, { "epoch": 
1.4637821547734367, "grad_norm": 0.5787658095359802, "learning_rate": 0.0005127507811658226, "loss": 3.6923, "step": 13600 }, { "epoch": 1.469163706813045, "grad_norm": 0.5486370325088501, "learning_rate": 0.0005124275401357612, "loss": 3.7024, "step": 13650 }, { "epoch": 1.474545258852653, "grad_norm": 0.5473746061325073, "learning_rate": 0.0005121042991056997, "loss": 3.6833, "step": 13700 }, { "epoch": 1.4799268108922612, "grad_norm": 0.5517615675926208, "learning_rate": 0.0005117810580756384, "loss": 3.6902, "step": 13750 }, { "epoch": 1.4853083629318695, "grad_norm": 0.5971811413764954, "learning_rate": 0.0005114578170455769, "loss": 3.6845, "step": 13800 }, { "epoch": 1.4906899149714778, "grad_norm": 0.5672309398651123, "learning_rate": 0.0005111345760155156, "loss": 3.6854, "step": 13850 }, { "epoch": 1.496071467011086, "grad_norm": 0.5523454546928406, "learning_rate": 0.0005108113349854541, "loss": 3.6961, "step": 13900 }, { "epoch": 1.501453019050694, "grad_norm": 0.5691514611244202, "learning_rate": 0.0005104880939553926, "loss": 3.6803, "step": 13950 }, { "epoch": 1.5068345710903024, "grad_norm": 0.5720673203468323, "learning_rate": 0.0005101648529253313, "loss": 3.6962, "step": 14000 }, { "epoch": 1.5068345710903024, "eval_accuracy": 0.35364048563060124, "eval_loss": 3.655740261077881, "eval_runtime": 204.9718, "eval_samples_per_second": 87.871, "eval_steps_per_second": 5.493, "step": 14000 }, { "epoch": 1.5122161231299107, "grad_norm": 0.5242102742195129, "learning_rate": 0.0005098480767158711, "loss": 3.697, "step": 14050 }, { "epoch": 1.5175976751695188, "grad_norm": 0.5905733108520508, "learning_rate": 0.0005095248356858097, "loss": 3.6869, "step": 14100 }, { "epoch": 1.5229792272091272, "grad_norm": 0.6295709609985352, "learning_rate": 0.0005092015946557483, "loss": 3.6839, "step": 14150 }, { "epoch": 1.5283607792487355, "grad_norm": 0.6163989901542664, "learning_rate": 0.0005088783536256868, "loss": 3.6979, "step": 14200 }, { "epoch": 
1.5337423312883436, "grad_norm": 0.5677395462989807, "learning_rate": 0.0005085551125956255, "loss": 3.698, "step": 14250 }, { "epoch": 1.5391238833279517, "grad_norm": 0.5294774174690247, "learning_rate": 0.000508231871565564, "loss": 3.6858, "step": 14300 }, { "epoch": 1.54450543536756, "grad_norm": 0.585757315158844, "learning_rate": 0.0005079086305355026, "loss": 3.7028, "step": 14350 }, { "epoch": 1.5498869874071683, "grad_norm": 0.5384769439697266, "learning_rate": 0.0005075853895054412, "loss": 3.686, "step": 14400 }, { "epoch": 1.5552685394467765, "grad_norm": 0.5806359052658081, "learning_rate": 0.0005072621484753797, "loss": 3.689, "step": 14450 }, { "epoch": 1.5606500914863846, "grad_norm": 0.5702711939811707, "learning_rate": 0.0005069389074453184, "loss": 3.675, "step": 14500 }, { "epoch": 1.566031643525993, "grad_norm": 0.5366608500480652, "learning_rate": 0.0005066156664152569, "loss": 3.6914, "step": 14550 }, { "epoch": 1.5714131955656012, "grad_norm": 0.60133296251297, "learning_rate": 0.0005062924253851955, "loss": 3.693, "step": 14600 }, { "epoch": 1.5767947476052093, "grad_norm": 0.5777215361595154, "learning_rate": 0.0005059691843551341, "loss": 3.6786, "step": 14650 }, { "epoch": 1.5821762996448174, "grad_norm": 0.5946151614189148, "learning_rate": 0.0005056459433250727, "loss": 3.6849, "step": 14700 }, { "epoch": 1.5875578516844258, "grad_norm": 0.601273775100708, "learning_rate": 0.0005053227022950113, "loss": 3.6833, "step": 14750 }, { "epoch": 1.592939403724034, "grad_norm": 0.5582096576690674, "learning_rate": 0.0005049994612649499, "loss": 3.681, "step": 14800 }, { "epoch": 1.5983209557636422, "grad_norm": 0.5676015615463257, "learning_rate": 0.0005046762202348884, "loss": 3.6892, "step": 14850 }, { "epoch": 1.6037025078032503, "grad_norm": 0.5280758738517761, "learning_rate": 0.000504352979204827, "loss": 3.6736, "step": 14900 }, { "epoch": 1.6090840598428586, "grad_norm": 0.6497607827186584, "learning_rate": 0.0005040297381747656, 
"loss": 3.6761, "step": 14950 }, { "epoch": 1.614465611882467, "grad_norm": 0.6543067693710327, "learning_rate": 0.0005037064971447042, "loss": 3.6601, "step": 15000 }, { "epoch": 1.614465611882467, "eval_accuracy": 0.3556327449815676, "eval_loss": 3.6367475986480713, "eval_runtime": 205.3772, "eval_samples_per_second": 87.697, "eval_steps_per_second": 5.483, "step": 15000 }, { "epoch": 1.619847163922075, "grad_norm": 0.5906841158866882, "learning_rate": 0.0005033832561146428, "loss": 3.6741, "step": 15050 }, { "epoch": 1.6252287159616834, "grad_norm": 0.5810515880584717, "learning_rate": 0.0005030600150845813, "loss": 3.6827, "step": 15100 }, { "epoch": 1.6306102680012917, "grad_norm": 0.6275368928909302, "learning_rate": 0.00050273677405452, "loss": 3.673, "step": 15150 }, { "epoch": 1.6359918200408998, "grad_norm": 0.5321255922317505, "learning_rate": 0.0005024135330244585, "loss": 3.6789, "step": 15200 }, { "epoch": 1.641373372080508, "grad_norm": 0.5455909371376038, "learning_rate": 0.0005020902919943972, "loss": 3.6691, "step": 15250 }, { "epoch": 1.6467549241201163, "grad_norm": 0.5684463977813721, "learning_rate": 0.0005017670509643357, "loss": 3.656, "step": 15300 }, { "epoch": 1.6521364761597246, "grad_norm": 0.5891856551170349, "learning_rate": 0.0005014438099342743, "loss": 3.666, "step": 15350 }, { "epoch": 1.6575180281993327, "grad_norm": 0.5768228769302368, "learning_rate": 0.0005011205689042129, "loss": 3.6802, "step": 15400 }, { "epoch": 1.6628995802389408, "grad_norm": 0.5384355187416077, "learning_rate": 0.0005007973278741514, "loss": 3.6677, "step": 15450 }, { "epoch": 1.6682811322785491, "grad_norm": 0.5776270031929016, "learning_rate": 0.00050047408684409, "loss": 3.6913, "step": 15500 }, { "epoch": 1.6736626843181575, "grad_norm": 0.5457106232643127, "learning_rate": 0.0005001508458140286, "loss": 3.6747, "step": 15550 }, { "epoch": 1.6790442363577656, "grad_norm": 0.5584999322891235, "learning_rate": 0.0004998276047839673, "loss": 3.6656, 
"step": 15600 }, { "epoch": 1.6844257883973737, "grad_norm": 0.5563948154449463, "learning_rate": 0.0004995043637539058, "loss": 3.6699, "step": 15650 }, { "epoch": 1.689807340436982, "grad_norm": 0.5881028175354004, "learning_rate": 0.0004991811227238443, "loss": 3.6704, "step": 15700 }, { "epoch": 1.6951888924765903, "grad_norm": 0.5460503101348877, "learning_rate": 0.0004988578816937829, "loss": 3.6895, "step": 15750 }, { "epoch": 1.7005704445161984, "grad_norm": 0.6377148032188416, "learning_rate": 0.0004985346406637215, "loss": 3.6985, "step": 15800 }, { "epoch": 1.7059519965558065, "grad_norm": 0.5566238760948181, "learning_rate": 0.0004982113996336602, "loss": 3.6537, "step": 15850 }, { "epoch": 1.7113335485954149, "grad_norm": 0.5709816813468933, "learning_rate": 0.0004978881586035987, "loss": 3.6434, "step": 15900 }, { "epoch": 1.7167151006350232, "grad_norm": 0.5734738111495972, "learning_rate": 0.0004975649175735373, "loss": 3.6449, "step": 15950 }, { "epoch": 1.7220966526746313, "grad_norm": 0.5572895407676697, "learning_rate": 0.0004972416765434759, "loss": 3.6579, "step": 16000 }, { "epoch": 1.7220966526746313, "eval_accuracy": 0.35718387360800713, "eval_loss": 3.6182830333709717, "eval_runtime": 214.6803, "eval_samples_per_second": 83.897, "eval_steps_per_second": 5.245, "step": 16000 }, { "epoch": 1.7274782047142396, "grad_norm": 0.5495673418045044, "learning_rate": 0.0004969184355134145, "loss": 3.6629, "step": 16050 }, { "epoch": 1.732859756753848, "grad_norm": 0.6197149753570557, "learning_rate": 0.0004965951944833531, "loss": 3.6439, "step": 16100 }, { "epoch": 1.738241308793456, "grad_norm": 0.5787531137466431, "learning_rate": 0.0004962719534532916, "loss": 3.6703, "step": 16150 }, { "epoch": 1.7436228608330642, "grad_norm": 0.5500142574310303, "learning_rate": 0.0004959487124232302, "loss": 3.656, "step": 16200 }, { "epoch": 1.7490044128726725, "grad_norm": 0.5271518230438232, "learning_rate": 0.0004956254713931688, "loss": 3.6594, "step": 
16250 }, { "epoch": 1.7543859649122808, "grad_norm": 0.5560300350189209, "learning_rate": 0.0004953022303631074, "loss": 3.6622, "step": 16300 }, { "epoch": 1.759767516951889, "grad_norm": 0.5545780062675476, "learning_rate": 0.0004949789893330459, "loss": 3.6657, "step": 16350 }, { "epoch": 1.765149068991497, "grad_norm": 0.5637168884277344, "learning_rate": 0.0004946557483029846, "loss": 3.6701, "step": 16400 }, { "epoch": 1.7705306210311054, "grad_norm": 0.6499923467636108, "learning_rate": 0.0004943325072729231, "loss": 3.6591, "step": 16450 }, { "epoch": 1.7759121730707137, "grad_norm": 0.5951055884361267, "learning_rate": 0.0004940092662428617, "loss": 3.6589, "step": 16500 }, { "epoch": 1.7812937251103218, "grad_norm": 0.5989054441452026, "learning_rate": 0.0004936860252128003, "loss": 3.6615, "step": 16550 }, { "epoch": 1.78667527714993, "grad_norm": 0.6071681976318359, "learning_rate": 0.0004933627841827388, "loss": 3.6413, "step": 16600 }, { "epoch": 1.7920568291895382, "grad_norm": 0.5401762127876282, "learning_rate": 0.0004930395431526775, "loss": 3.6646, "step": 16650 }, { "epoch": 1.7974383812291466, "grad_norm": 0.6720755100250244, "learning_rate": 0.0004927163021226161, "loss": 3.6535, "step": 16700 }, { "epoch": 1.8028199332687547, "grad_norm": 0.5372287034988403, "learning_rate": 0.0004923930610925547, "loss": 3.6387, "step": 16750 }, { "epoch": 1.8082014853083628, "grad_norm": 0.5767584443092346, "learning_rate": 0.0004920698200624932, "loss": 3.6509, "step": 16800 }, { "epoch": 1.813583037347971, "grad_norm": 0.619735598564148, "learning_rate": 0.0004917465790324317, "loss": 3.6593, "step": 16850 }, { "epoch": 1.8189645893875794, "grad_norm": 0.612182080745697, "learning_rate": 0.0004914233380023704, "loss": 3.669, "step": 16900 }, { "epoch": 1.8243461414271875, "grad_norm": 0.5973613262176514, "learning_rate": 0.0004911000969723089, "loss": 3.6581, "step": 16950 }, { "epoch": 1.8297276934667959, "grad_norm": 0.5377869606018066, "learning_rate": 
0.0004907768559422476, "loss": 3.6582, "step": 17000 }, { "epoch": 1.8297276934667959, "eval_accuracy": 0.3589950085949867, "eval_loss": 3.60500168800354, "eval_runtime": 203.9944, "eval_samples_per_second": 88.292, "eval_steps_per_second": 5.52, "step": 17000 }, { "epoch": 1.8351092455064042, "grad_norm": 0.5382079482078552, "learning_rate": 0.0004904536149121861, "loss": 3.6536, "step": 17050 }, { "epoch": 1.8404907975460123, "grad_norm": 0.5734342336654663, "learning_rate": 0.0004901303738821248, "loss": 3.6325, "step": 17100 }, { "epoch": 1.8458723495856204, "grad_norm": 0.6593245267868042, "learning_rate": 0.0004898071328520633, "loss": 3.6733, "step": 17150 }, { "epoch": 1.8512539016252287, "grad_norm": 0.5794579982757568, "learning_rate": 0.0004894838918220019, "loss": 3.6634, "step": 17200 }, { "epoch": 1.856635453664837, "grad_norm": 0.5953862071037292, "learning_rate": 0.0004891606507919405, "loss": 3.6544, "step": 17250 }, { "epoch": 1.8620170057044452, "grad_norm": 0.6200346946716309, "learning_rate": 0.000488837409761879, "loss": 3.6443, "step": 17300 }, { "epoch": 1.8673985577440533, "grad_norm": 0.585228681564331, "learning_rate": 0.0004885141687318177, "loss": 3.6478, "step": 17350 }, { "epoch": 1.8727801097836616, "grad_norm": 0.6394591927528381, "learning_rate": 0.00048819092770175623, "loss": 3.6416, "step": 17400 }, { "epoch": 1.87816166182327, "grad_norm": 0.5823236107826233, "learning_rate": 0.0004878676866716948, "loss": 3.6572, "step": 17450 }, { "epoch": 1.883543213862878, "grad_norm": 0.5914448499679565, "learning_rate": 0.00048754444564163337, "loss": 3.6479, "step": 17500 }, { "epoch": 1.8889247659024861, "grad_norm": 0.6287000775337219, "learning_rate": 0.000487221204611572, "loss": 3.6431, "step": 17550 }, { "epoch": 1.8943063179420945, "grad_norm": 0.5507499575614929, "learning_rate": 0.00048689796358151056, "loss": 3.6444, "step": 17600 }, { "epoch": 1.8996878699817028, "grad_norm": 0.5838987827301025, "learning_rate": 
0.00048657472255144915, "loss": 3.649, "step": 17650 }, { "epoch": 1.905069422021311, "grad_norm": 0.6239995360374451, "learning_rate": 0.00048625148152138775, "loss": 3.6385, "step": 17700 }, { "epoch": 1.910450974060919, "grad_norm": 0.5718323588371277, "learning_rate": 0.0004859282404913263, "loss": 3.6248, "step": 17750 }, { "epoch": 1.9158325261005273, "grad_norm": 0.5826128125190735, "learning_rate": 0.0004856049994612649, "loss": 3.6441, "step": 17800 }, { "epoch": 1.9212140781401357, "grad_norm": 0.6261082291603088, "learning_rate": 0.00048528175843120353, "loss": 3.6379, "step": 17850 }, { "epoch": 1.9265956301797438, "grad_norm": 0.5729628205299377, "learning_rate": 0.0004849585174011421, "loss": 3.6403, "step": 17900 }, { "epoch": 1.931977182219352, "grad_norm": 0.6479345560073853, "learning_rate": 0.00048464174119168193, "loss": 3.6513, "step": 17950 }, { "epoch": 1.9373587342589604, "grad_norm": 0.6019191145896912, "learning_rate": 0.0004843185001616205, "loss": 3.6427, "step": 18000 }, { "epoch": 1.9373587342589604, "eval_accuracy": 0.3606186086975985, "eval_loss": 3.589313268661499, "eval_runtime": 204.8701, "eval_samples_per_second": 87.914, "eval_steps_per_second": 5.496, "step": 18000 }, { "epoch": 1.9427402862985685, "grad_norm": 0.5419387817382812, "learning_rate": 0.00048399525913155907, "loss": 3.6361, "step": 18050 }, { "epoch": 1.9481218383381766, "grad_norm": 0.6220253109931946, "learning_rate": 0.0004836720181014976, "loss": 3.6207, "step": 18100 }, { "epoch": 1.953503390377785, "grad_norm": 0.5842125415802002, "learning_rate": 0.0004833487770714362, "loss": 3.6405, "step": 18150 }, { "epoch": 1.9588849424173933, "grad_norm": 0.5908029675483704, "learning_rate": 0.00048302553604137485, "loss": 3.6445, "step": 18200 }, { "epoch": 1.9642664944570014, "grad_norm": 0.5424453020095825, "learning_rate": 0.0004827022950113134, "loss": 3.6419, "step": 18250 }, { "epoch": 1.9696480464966095, "grad_norm": 0.6032727360725403, "learning_rate": 
0.000482379053981252, "loss": 3.6507, "step": 18300 }, { "epoch": 1.9750295985362178, "grad_norm": 0.6096407771110535, "learning_rate": 0.0004820558129511906, "loss": 3.6364, "step": 18350 }, { "epoch": 1.9804111505758262, "grad_norm": 0.6347204446792603, "learning_rate": 0.0004817325719211291, "loss": 3.6451, "step": 18400 }, { "epoch": 1.9857927026154343, "grad_norm": 0.5634959936141968, "learning_rate": 0.0004814093308910677, "loss": 3.641, "step": 18450 }, { "epoch": 1.9911742546550424, "grad_norm": 0.5860807299613953, "learning_rate": 0.00048108608986100637, "loss": 3.6336, "step": 18500 }, { "epoch": 1.9965558066946507, "grad_norm": 0.5338708162307739, "learning_rate": 0.0004807628488309449, "loss": 3.6211, "step": 18550 }, { "epoch": 2.001937358734259, "grad_norm": 0.5974145531654358, "learning_rate": 0.0004804396078008835, "loss": 3.5958, "step": 18600 }, { "epoch": 2.007318910773867, "grad_norm": 0.5562605261802673, "learning_rate": 0.00048011636677082204, "loss": 3.5582, "step": 18650 }, { "epoch": 2.0127004628134753, "grad_norm": 0.6376757025718689, "learning_rate": 0.00047979312574076064, "loss": 3.5377, "step": 18700 }, { "epoch": 2.018082014853084, "grad_norm": 0.5702477097511292, "learning_rate": 0.0004794698847106992, "loss": 3.5431, "step": 18750 }, { "epoch": 2.023463566892692, "grad_norm": 0.5655612945556641, "learning_rate": 0.0004791466436806378, "loss": 3.5661, "step": 18800 }, { "epoch": 2.0288451189323, "grad_norm": 0.5684214234352112, "learning_rate": 0.0004788234026505764, "loss": 3.5378, "step": 18850 }, { "epoch": 2.034226670971908, "grad_norm": 0.595730185508728, "learning_rate": 0.00047850016162051496, "loss": 3.5512, "step": 18900 }, { "epoch": 2.0396082230115167, "grad_norm": 0.602304220199585, "learning_rate": 0.00047817692059045356, "loss": 3.5471, "step": 18950 }, { "epoch": 2.044989775051125, "grad_norm": 0.6218283176422119, "learning_rate": 0.00047785367956039215, "loss": 3.56, "step": 19000 }, { "epoch": 2.044989775051125, 
"eval_accuracy": 0.3616626542966078, "eval_loss": 3.5785436630249023, "eval_runtime": 202.2767, "eval_samples_per_second": 89.041, "eval_steps_per_second": 5.567, "step": 19000 }, { "epoch": 2.050371327090733, "grad_norm": 0.5897380113601685, "learning_rate": 0.00047753043853033075, "loss": 3.5425, "step": 19050 }, { "epoch": 2.055752879130341, "grad_norm": 0.5932830572128296, "learning_rate": 0.00047720719750026934, "loss": 3.5699, "step": 19100 }, { "epoch": 2.0611344311699495, "grad_norm": 0.5921191573143005, "learning_rate": 0.00047688395647020793, "loss": 3.5431, "step": 19150 }, { "epoch": 2.0665159832095576, "grad_norm": 0.6052051186561584, "learning_rate": 0.0004765607154401465, "loss": 3.5734, "step": 19200 }, { "epoch": 2.0718975352491658, "grad_norm": 0.562523603439331, "learning_rate": 0.00047623747441008507, "loss": 3.543, "step": 19250 }, { "epoch": 2.0772790872887743, "grad_norm": 0.546420156955719, "learning_rate": 0.0004759142333800236, "loss": 3.5441, "step": 19300 }, { "epoch": 2.0826606393283824, "grad_norm": 0.590813159942627, "learning_rate": 0.00047559099234996226, "loss": 3.5624, "step": 19350 }, { "epoch": 2.0880421913679905, "grad_norm": 0.6105584502220154, "learning_rate": 0.00047526775131990085, "loss": 3.557, "step": 19400 }, { "epoch": 2.0934237434075986, "grad_norm": 0.6170618534088135, "learning_rate": 0.0004749445102898394, "loss": 3.5373, "step": 19450 }, { "epoch": 2.098805295447207, "grad_norm": 0.5814986228942871, "learning_rate": 0.000474621269259778, "loss": 3.5693, "step": 19500 }, { "epoch": 2.1041868474868153, "grad_norm": 0.6113174557685852, "learning_rate": 0.0004742980282297166, "loss": 3.5589, "step": 19550 }, { "epoch": 2.1095683995264234, "grad_norm": 0.6312413811683655, "learning_rate": 0.0004739747871996551, "loss": 3.5398, "step": 19600 }, { "epoch": 2.1149499515660315, "grad_norm": 0.6352087259292603, "learning_rate": 0.00047365154616959377, "loss": 3.5541, "step": 19650 }, { "epoch": 2.12033150360564, 
"grad_norm": 0.6081045269966125, "learning_rate": 0.00047332830513953237, "loss": 3.576, "step": 19700 }, { "epoch": 2.125713055645248, "grad_norm": 0.5704367160797119, "learning_rate": 0.0004730050641094709, "loss": 3.5563, "step": 19750 }, { "epoch": 2.1310946076848563, "grad_norm": 0.6367883682250977, "learning_rate": 0.0004726818230794095, "loss": 3.5358, "step": 19800 }, { "epoch": 2.1364761597244644, "grad_norm": 0.5543844699859619, "learning_rate": 0.00047235858204934804, "loss": 3.5561, "step": 19850 }, { "epoch": 2.141857711764073, "grad_norm": 0.6325891017913818, "learning_rate": 0.0004720353410192867, "loss": 3.5459, "step": 19900 }, { "epoch": 2.147239263803681, "grad_norm": 0.5876049995422363, "learning_rate": 0.0004717120999892253, "loss": 3.5568, "step": 19950 }, { "epoch": 2.152620815843289, "grad_norm": 0.5623601675033569, "learning_rate": 0.0004713888589591638, "loss": 3.5527, "step": 20000 }, { "epoch": 2.152620815843289, "eval_accuracy": 0.3627479879929867, "eval_loss": 3.5705673694610596, "eval_runtime": 206.2591, "eval_samples_per_second": 87.322, "eval_steps_per_second": 5.459, "step": 20000 }, { "epoch": 2.1580023678828972, "grad_norm": 0.5725038051605225, "learning_rate": 0.0004710656179291024, "loss": 3.5565, "step": 20050 }, { "epoch": 2.163383919922506, "grad_norm": 0.6287389993667603, "learning_rate": 0.000470742376899041, "loss": 3.5528, "step": 20100 }, { "epoch": 2.168765471962114, "grad_norm": 0.5884525179862976, "learning_rate": 0.00047041913586897956, "loss": 3.5597, "step": 20150 }, { "epoch": 2.174147024001722, "grad_norm": 0.5764286518096924, "learning_rate": 0.0004700958948389182, "loss": 3.5426, "step": 20200 }, { "epoch": 2.1795285760413305, "grad_norm": 0.6041747331619263, "learning_rate": 0.0004697726538088568, "loss": 3.5471, "step": 20250 }, { "epoch": 2.1849101280809387, "grad_norm": 0.5907628536224365, "learning_rate": 0.00046944941277879534, "loss": 3.5453, "step": 20300 }, { "epoch": 2.1902916801205468, "grad_norm": 
0.5632272958755493, "learning_rate": 0.00046912617174873394, "loss": 3.5558, "step": 20350 }, { "epoch": 2.195673232160155, "grad_norm": 0.6226711869239807, "learning_rate": 0.0004688029307186725, "loss": 3.5865, "step": 20400 }, { "epoch": 2.2010547841997634, "grad_norm": 0.5935404896736145, "learning_rate": 0.00046847968968861107, "loss": 3.5594, "step": 20450 }, { "epoch": 2.2064363362393715, "grad_norm": 0.5879222750663757, "learning_rate": 0.0004681564486585497, "loss": 3.5637, "step": 20500 }, { "epoch": 2.2118178882789796, "grad_norm": 0.5888842940330505, "learning_rate": 0.00046783320762848826, "loss": 3.5438, "step": 20550 }, { "epoch": 2.2171994403185877, "grad_norm": 0.5803065299987793, "learning_rate": 0.00046750996659842685, "loss": 3.5429, "step": 20600 }, { "epoch": 2.2225809923581963, "grad_norm": 0.5627387166023254, "learning_rate": 0.00046718672556836545, "loss": 3.5601, "step": 20650 }, { "epoch": 2.2279625443978044, "grad_norm": 0.5622759461402893, "learning_rate": 0.000466863484538304, "loss": 3.5676, "step": 20700 }, { "epoch": 2.2333440964374125, "grad_norm": 0.5685352087020874, "learning_rate": 0.0004665402435082426, "loss": 3.5572, "step": 20750 }, { "epoch": 2.2387256484770206, "grad_norm": 0.6102375388145447, "learning_rate": 0.00046621700247818123, "loss": 3.5546, "step": 20800 }, { "epoch": 2.244107200516629, "grad_norm": 0.5915598273277283, "learning_rate": 0.0004658937614481198, "loss": 3.5519, "step": 20850 }, { "epoch": 2.2494887525562373, "grad_norm": 0.5531293153762817, "learning_rate": 0.00046557052041805837, "loss": 3.5491, "step": 20900 }, { "epoch": 2.2548703045958454, "grad_norm": 0.5677109956741333, "learning_rate": 0.0004652472793879969, "loss": 3.5562, "step": 20950 }, { "epoch": 2.2602518566354535, "grad_norm": 0.5943840742111206, "learning_rate": 0.0004649240383579355, "loss": 3.5487, "step": 21000 }, { "epoch": 2.2602518566354535, "eval_accuracy": 0.36394262649450687, "eval_loss": 3.560176372528076, "eval_runtime": 
204.7535, "eval_samples_per_second": 87.964, "eval_steps_per_second": 5.499, "step": 21000 }, { "epoch": 2.265633408675062, "grad_norm": 0.5741783380508423, "learning_rate": 0.00046460079732787415, "loss": 3.5606, "step": 21050 }, { "epoch": 2.27101496071467, "grad_norm": 0.6168876886367798, "learning_rate": 0.0004642775562978127, "loss": 3.5662, "step": 21100 }, { "epoch": 2.2763965127542782, "grad_norm": 0.6492966413497925, "learning_rate": 0.0004639543152677513, "loss": 3.5546, "step": 21150 }, { "epoch": 2.281778064793887, "grad_norm": 0.634781539440155, "learning_rate": 0.0004636310742376899, "loss": 3.5674, "step": 21200 }, { "epoch": 2.287159616833495, "grad_norm": 0.5752384066581726, "learning_rate": 0.0004633078332076284, "loss": 3.5561, "step": 21250 }, { "epoch": 2.292541168873103, "grad_norm": 0.5751746296882629, "learning_rate": 0.000462984592177567, "loss": 3.5505, "step": 21300 }, { "epoch": 2.297922720912711, "grad_norm": 0.5733442902565002, "learning_rate": 0.00046266135114750567, "loss": 3.5489, "step": 21350 }, { "epoch": 2.303304272952319, "grad_norm": 0.6262395977973938, "learning_rate": 0.0004623381101174442, "loss": 3.552, "step": 21400 }, { "epoch": 2.3086858249919278, "grad_norm": 0.6084133982658386, "learning_rate": 0.0004620148690873828, "loss": 3.5515, "step": 21450 }, { "epoch": 2.314067377031536, "grad_norm": 0.558005690574646, "learning_rate": 0.00046169162805732134, "loss": 3.5501, "step": 21500 }, { "epoch": 2.319448929071144, "grad_norm": 0.656355619430542, "learning_rate": 0.00046136838702725994, "loss": 3.5677, "step": 21550 }, { "epoch": 2.3248304811107525, "grad_norm": 0.6125053763389587, "learning_rate": 0.00046104514599719853, "loss": 3.5496, "step": 21600 }, { "epoch": 2.3302120331503606, "grad_norm": 0.708257257938385, "learning_rate": 0.0004607219049671371, "loss": 3.594, "step": 21650 }, { "epoch": 2.3355935851899687, "grad_norm": 0.6216398477554321, "learning_rate": 0.0004603986639370757, "loss": 3.5749, "step": 21700 }, 
{ "epoch": 2.340975137229577, "grad_norm": 0.5760979056358337, "learning_rate": 0.0004600754229070143, "loss": 3.5546, "step": 21750 }, { "epoch": 2.3463566892691854, "grad_norm": 0.5885084867477417, "learning_rate": 0.00045975218187695286, "loss": 3.5654, "step": 21800 }, { "epoch": 2.3517382413087935, "grad_norm": 0.6174579858779907, "learning_rate": 0.00045942894084689145, "loss": 3.5543, "step": 21850 }, { "epoch": 2.3571197933484016, "grad_norm": 0.6659450531005859, "learning_rate": 0.0004591056998168301, "loss": 3.553, "step": 21900 }, { "epoch": 2.3625013453880097, "grad_norm": 0.5789748430252075, "learning_rate": 0.00045878245878676864, "loss": 3.5611, "step": 21950 }, { "epoch": 2.3678828974276183, "grad_norm": 0.5881466865539551, "learning_rate": 0.00045846568257730845, "loss": 3.5664, "step": 22000 }, { "epoch": 2.3678828974276183, "eval_accuracy": 0.36495016472321057, "eval_loss": 3.5474905967712402, "eval_runtime": 194.8012, "eval_samples_per_second": 92.458, "eval_steps_per_second": 5.78, "step": 22000 }, { "epoch": 2.3732644494672264, "grad_norm": 0.6417264342308044, "learning_rate": 0.00045814244154724704, "loss": 3.5509, "step": 22050 }, { "epoch": 2.3786460015068345, "grad_norm": 0.6134101748466492, "learning_rate": 0.00045781920051718563, "loss": 3.5482, "step": 22100 }, { "epoch": 2.384027553546443, "grad_norm": 0.8174729943275452, "learning_rate": 0.0004574959594871242, "loss": 3.5538, "step": 22150 }, { "epoch": 2.389409105586051, "grad_norm": 0.6091681122779846, "learning_rate": 0.00045717271845706277, "loss": 3.5516, "step": 22200 }, { "epoch": 2.3947906576256592, "grad_norm": 0.5839729905128479, "learning_rate": 0.0004568494774270013, "loss": 3.5545, "step": 22250 }, { "epoch": 2.4001722096652673, "grad_norm": 0.620452344417572, "learning_rate": 0.00045652623639693996, "loss": 3.5427, "step": 22300 }, { "epoch": 2.4055537617048754, "grad_norm": 0.6439001560211182, "learning_rate": 0.00045620299536687855, "loss": 3.554, "step": 22350 }, { 
"epoch": 2.410935313744484, "grad_norm": 0.6319786906242371, "learning_rate": 0.0004558797543368171, "loss": 3.5723, "step": 22400 }, { "epoch": 2.416316865784092, "grad_norm": 0.6055615544319153, "learning_rate": 0.0004555565133067557, "loss": 3.5568, "step": 22450 }, { "epoch": 2.4216984178237, "grad_norm": 0.6199345588684082, "learning_rate": 0.0004552332722766943, "loss": 3.549, "step": 22500 }, { "epoch": 2.4270799698633088, "grad_norm": 0.587843656539917, "learning_rate": 0.0004549100312466328, "loss": 3.5311, "step": 22550 }, { "epoch": 2.432461521902917, "grad_norm": 0.5868039727210999, "learning_rate": 0.0004545867902165715, "loss": 3.5624, "step": 22600 }, { "epoch": 2.437843073942525, "grad_norm": 0.5922572016716003, "learning_rate": 0.00045426354918651007, "loss": 3.5538, "step": 22650 }, { "epoch": 2.443224625982133, "grad_norm": 0.6361838579177856, "learning_rate": 0.0004539403081564486, "loss": 3.5364, "step": 22700 }, { "epoch": 2.4486061780217416, "grad_norm": 0.5849791765213013, "learning_rate": 0.0004536170671263872, "loss": 3.5606, "step": 22750 }, { "epoch": 2.4539877300613497, "grad_norm": 0.6513404250144958, "learning_rate": 0.00045329382609632574, "loss": 3.535, "step": 22800 }, { "epoch": 2.459369282100958, "grad_norm": 0.5861430168151855, "learning_rate": 0.0004529705850662644, "loss": 3.5432, "step": 22850 }, { "epoch": 2.464750834140566, "grad_norm": 0.6476746797561646, "learning_rate": 0.000452647344036203, "loss": 3.5454, "step": 22900 }, { "epoch": 2.4701323861801745, "grad_norm": 0.6115290522575378, "learning_rate": 0.00045232410300614153, "loss": 3.5439, "step": 22950 }, { "epoch": 2.4755139382197826, "grad_norm": 0.5903562903404236, "learning_rate": 0.0004520008619760801, "loss": 3.5464, "step": 23000 }, { "epoch": 2.4755139382197826, "eval_accuracy": 0.3655437354493411, "eval_loss": 3.5417819023132324, "eval_runtime": 206.588, "eval_samples_per_second": 87.183, "eval_steps_per_second": 5.45, "step": 23000 }, { "epoch": 
2.4808954902593907, "grad_norm": 0.5942795276641846, "learning_rate": 0.0004516776209460187, "loss": 3.5293, "step": 23050 }, { "epoch": 2.4862770422989993, "grad_norm": 0.5559934973716736, "learning_rate": 0.00045135437991595726, "loss": 3.5569, "step": 23100 }, { "epoch": 2.4916585943386074, "grad_norm": 0.6209527254104614, "learning_rate": 0.0004510311388858959, "loss": 3.5374, "step": 23150 }, { "epoch": 2.4970401463782155, "grad_norm": 0.5962895750999451, "learning_rate": 0.0004507078978558345, "loss": 3.5538, "step": 23200 }, { "epoch": 2.5024216984178236, "grad_norm": 0.6051533222198486, "learning_rate": 0.00045038465682577304, "loss": 3.552, "step": 23250 }, { "epoch": 2.5078032504574317, "grad_norm": 0.5640018582344055, "learning_rate": 0.00045006141579571164, "loss": 3.5596, "step": 23300 }, { "epoch": 2.5131848024970402, "grad_norm": 0.6006429195404053, "learning_rate": 0.0004497381747656502, "loss": 3.5692, "step": 23350 }, { "epoch": 2.5185663545366483, "grad_norm": 0.6432710886001587, "learning_rate": 0.00044941493373558877, "loss": 3.5302, "step": 23400 }, { "epoch": 2.5239479065762565, "grad_norm": 0.5585759282112122, "learning_rate": 0.0004490916927055274, "loss": 3.5559, "step": 23450 }, { "epoch": 2.529329458615865, "grad_norm": 0.6355800032615662, "learning_rate": 0.00044876845167546596, "loss": 3.5332, "step": 23500 }, { "epoch": 2.534711010655473, "grad_norm": 0.5865529179573059, "learning_rate": 0.00044844521064540455, "loss": 3.5558, "step": 23550 }, { "epoch": 2.540092562695081, "grad_norm": 0.6548492312431335, "learning_rate": 0.00044812196961534315, "loss": 3.5339, "step": 23600 }, { "epoch": 2.5454741147346893, "grad_norm": 0.6574897766113281, "learning_rate": 0.0004477987285852817, "loss": 3.5498, "step": 23650 }, { "epoch": 2.550855666774298, "grad_norm": 0.5814394950866699, "learning_rate": 0.00044747548755522034, "loss": 3.5552, "step": 23700 }, { "epoch": 2.556237218813906, "grad_norm": 0.5807026028633118, "learning_rate": 
0.00044715224652515893, "loss": 3.5408, "step": 23750 }, { "epoch": 2.561618770853514, "grad_norm": 0.6143361926078796, "learning_rate": 0.0004468290054950975, "loss": 3.5664, "step": 23800 }, { "epoch": 2.567000322893122, "grad_norm": 0.5572286248207092, "learning_rate": 0.00044650576446503607, "loss": 3.5545, "step": 23850 }, { "epoch": 2.5723818749327307, "grad_norm": 0.5634745955467224, "learning_rate": 0.0004461825234349746, "loss": 3.5302, "step": 23900 }, { "epoch": 2.577763426972339, "grad_norm": 0.5858882665634155, "learning_rate": 0.00044586574722551447, "loss": 3.5528, "step": 23950 }, { "epoch": 2.583144979011947, "grad_norm": 0.6945074200630188, "learning_rate": 0.000445542506195453, "loss": 3.5619, "step": 24000 }, { "epoch": 2.583144979011947, "eval_accuracy": 0.36655561979355733, "eval_loss": 3.5325984954833984, "eval_runtime": 195.3908, "eval_samples_per_second": 92.179, "eval_steps_per_second": 5.763, "step": 24000 }, { "epoch": 2.5885265310515555, "grad_norm": 0.5496438145637512, "learning_rate": 0.0004452192651653916, "loss": 3.5522, "step": 24050 }, { "epoch": 2.5939080830911636, "grad_norm": 0.5970087051391602, "learning_rate": 0.00044489602413533025, "loss": 3.5635, "step": 24100 }, { "epoch": 2.5992896351307717, "grad_norm": 0.6501139998435974, "learning_rate": 0.0004445727831052688, "loss": 3.533, "step": 24150 }, { "epoch": 2.60467118717038, "grad_norm": 0.6015759110450745, "learning_rate": 0.0004442495420752074, "loss": 3.5451, "step": 24200 }, { "epoch": 2.610052739209988, "grad_norm": 0.5871186852455139, "learning_rate": 0.00044392630104514593, "loss": 3.5508, "step": 24250 }, { "epoch": 2.6154342912495965, "grad_norm": 0.5801131725311279, "learning_rate": 0.0004436030600150845, "loss": 3.5321, "step": 24300 }, { "epoch": 2.6208158432892046, "grad_norm": 0.7078419327735901, "learning_rate": 0.0004432798189850231, "loss": 3.5526, "step": 24350 }, { "epoch": 2.6261973953288127, "grad_norm": 0.6336981654167175, "learning_rate": 
0.0004429565779549617, "loss": 3.553, "step": 24400 }, { "epoch": 2.6315789473684212, "grad_norm": 0.621759831905365, "learning_rate": 0.0004426333369249003, "loss": 3.5336, "step": 24450 }, { "epoch": 2.6369604994080293, "grad_norm": 0.5916746854782104, "learning_rate": 0.0004423100958948389, "loss": 3.5525, "step": 24500 }, { "epoch": 2.6423420514476375, "grad_norm": 0.6469952464103699, "learning_rate": 0.00044198685486477744, "loss": 3.5264, "step": 24550 }, { "epoch": 2.6477236034872456, "grad_norm": 0.6205400824546814, "learning_rate": 0.00044166361383471604, "loss": 3.5513, "step": 24600 }, { "epoch": 2.653105155526854, "grad_norm": 0.6007969975471497, "learning_rate": 0.0004413403728046547, "loss": 3.5303, "step": 24650 }, { "epoch": 2.658486707566462, "grad_norm": 0.6586825847625732, "learning_rate": 0.0004410171317745932, "loss": 3.5627, "step": 24700 }, { "epoch": 2.6638682596060703, "grad_norm": 0.5926357507705688, "learning_rate": 0.0004406938907445318, "loss": 3.5663, "step": 24750 }, { "epoch": 2.6692498116456784, "grad_norm": 0.5873600840568542, "learning_rate": 0.00044037064971447036, "loss": 3.5241, "step": 24800 }, { "epoch": 2.674631363685287, "grad_norm": 0.592735767364502, "learning_rate": 0.00044004740868440896, "loss": 3.5567, "step": 24850 }, { "epoch": 2.680012915724895, "grad_norm": 0.626936674118042, "learning_rate": 0.00043972416765434755, "loss": 3.5204, "step": 24900 }, { "epoch": 2.685394467764503, "grad_norm": 0.5691058039665222, "learning_rate": 0.00043940092662428615, "loss": 3.5214, "step": 24950 }, { "epoch": 2.6907760198041117, "grad_norm": 0.6450164914131165, "learning_rate": 0.00043907768559422474, "loss": 3.5487, "step": 25000 }, { "epoch": 2.6907760198041117, "eval_accuracy": 0.36812326365894465, "eval_loss": 3.521350145339966, "eval_runtime": 197.2244, "eval_samples_per_second": 91.322, "eval_steps_per_second": 5.709, "step": 25000 }, { "epoch": 2.69615757184372, "grad_norm": 0.6423863768577576, "learning_rate": 
0.00043875444456416334, "loss": 3.5373, "step": 25050 }, { "epoch": 2.701539123883328, "grad_norm": 0.6400243043899536, "learning_rate": 0.0004384312035341019, "loss": 3.5428, "step": 25100 }, { "epoch": 2.706920675922936, "grad_norm": 0.6252772212028503, "learning_rate": 0.00043810796250404047, "loss": 3.5469, "step": 25150 }, { "epoch": 2.712302227962544, "grad_norm": 0.5882490873336792, "learning_rate": 0.000437784721473979, "loss": 3.5376, "step": 25200 }, { "epoch": 2.7176837800021527, "grad_norm": 0.604751467704773, "learning_rate": 0.00043746148044391766, "loss": 3.5409, "step": 25250 }, { "epoch": 2.723065332041761, "grad_norm": 0.6292399168014526, "learning_rate": 0.00043713823941385625, "loss": 3.5313, "step": 25300 }, { "epoch": 2.728446884081369, "grad_norm": 0.6144165396690369, "learning_rate": 0.0004368149983837948, "loss": 3.5484, "step": 25350 }, { "epoch": 2.7338284361209775, "grad_norm": 0.6086465716362, "learning_rate": 0.0004364917573537334, "loss": 3.5381, "step": 25400 }, { "epoch": 2.7392099881605856, "grad_norm": 0.6256065964698792, "learning_rate": 0.00043616851632367193, "loss": 3.557, "step": 25450 }, { "epoch": 2.7445915402001937, "grad_norm": 0.6041662096977234, "learning_rate": 0.0004358452752936106, "loss": 3.5347, "step": 25500 }, { "epoch": 2.749973092239802, "grad_norm": 0.6548029780387878, "learning_rate": 0.0004355220342635492, "loss": 3.5374, "step": 25550 }, { "epoch": 2.7553546442794103, "grad_norm": 0.6124310493469238, "learning_rate": 0.00043519879323348777, "loss": 3.529, "step": 25600 }, { "epoch": 2.7607361963190185, "grad_norm": 0.596762478351593, "learning_rate": 0.0004348755522034263, "loss": 3.5273, "step": 25650 }, { "epoch": 2.7661177483586266, "grad_norm": 0.5779131650924683, "learning_rate": 0.0004345523111733649, "loss": 3.5448, "step": 25700 }, { "epoch": 2.7714993003982347, "grad_norm": 0.6550736427307129, "learning_rate": 0.00043422907014330344, "loss": 3.5405, "step": 25750 }, { "epoch": 2.776880852437843, 
"grad_norm": 0.7348501682281494, "learning_rate": 0.0004339058291132421, "loss": 3.5329, "step": 25800 }, { "epoch": 2.7822624044774513, "grad_norm": 0.6231963038444519, "learning_rate": 0.0004335825880831807, "loss": 3.5465, "step": 25850 }, { "epoch": 2.7876439565170594, "grad_norm": 0.5558022856712341, "learning_rate": 0.00043325934705311923, "loss": 3.5381, "step": 25900 }, { "epoch": 2.793025508556668, "grad_norm": 0.6114339232444763, "learning_rate": 0.0004329361060230578, "loss": 3.5492, "step": 25950 }, { "epoch": 2.798407060596276, "grad_norm": 0.621716320514679, "learning_rate": 0.00043261286499299636, "loss": 3.5277, "step": 26000 }, { "epoch": 2.798407060596276, "eval_accuracy": 0.36862752171129165, "eval_loss": 3.51253342628479, "eval_runtime": 212.7222, "eval_samples_per_second": 84.669, "eval_steps_per_second": 5.293, "step": 26000 }, { "epoch": 2.803788612635884, "grad_norm": 0.6198678016662598, "learning_rate": 0.00043228962396293496, "loss": 3.5241, "step": 26050 }, { "epoch": 2.8091701646754923, "grad_norm": 0.5953556299209595, "learning_rate": 0.0004319663829328736, "loss": 3.5257, "step": 26100 }, { "epoch": 2.8145517167151004, "grad_norm": 0.6188264489173889, "learning_rate": 0.00043164314190281215, "loss": 3.5268, "step": 26150 }, { "epoch": 2.819933268754709, "grad_norm": 0.5980226993560791, "learning_rate": 0.00043131990087275074, "loss": 3.5478, "step": 26200 }, { "epoch": 2.825314820794317, "grad_norm": 0.6212344169616699, "learning_rate": 0.00043099665984268934, "loss": 3.5297, "step": 26250 }, { "epoch": 2.830696372833925, "grad_norm": 0.5705240964889526, "learning_rate": 0.0004306734188126279, "loss": 3.5324, "step": 26300 }, { "epoch": 2.8360779248735337, "grad_norm": 0.6547979712486267, "learning_rate": 0.00043035017778256647, "loss": 3.533, "step": 26350 }, { "epoch": 2.841459476913142, "grad_norm": 0.5784933567047119, "learning_rate": 0.0004300269367525051, "loss": 3.5297, "step": 26400 }, { "epoch": 2.84684102895275, "grad_norm": 
0.7146517038345337, "learning_rate": 0.00042970369572244366, "loss": 3.5382, "step": 26450 }, { "epoch": 2.852222580992358, "grad_norm": 0.6623178124427795, "learning_rate": 0.00042938045469238226, "loss": 3.5374, "step": 26500 }, { "epoch": 2.857604133031966, "grad_norm": 0.5954160094261169, "learning_rate": 0.0004290572136623208, "loss": 3.5509, "step": 26550 }, { "epoch": 2.8629856850715747, "grad_norm": 0.6484636068344116, "learning_rate": 0.0004287339726322594, "loss": 3.5399, "step": 26600 }, { "epoch": 2.868367237111183, "grad_norm": 0.6219885349273682, "learning_rate": 0.00042841073160219804, "loss": 3.5334, "step": 26650 }, { "epoch": 2.873748789150791, "grad_norm": 0.6355128288269043, "learning_rate": 0.0004280874905721366, "loss": 3.5135, "step": 26700 }, { "epoch": 2.8791303411903995, "grad_norm": 0.5998795032501221, "learning_rate": 0.0004277642495420752, "loss": 3.5442, "step": 26750 }, { "epoch": 2.8845118932300076, "grad_norm": 0.5889739394187927, "learning_rate": 0.00042744100851201377, "loss": 3.5133, "step": 26800 }, { "epoch": 2.8898934452696157, "grad_norm": 0.6281293630599976, "learning_rate": 0.0004271177674819523, "loss": 3.5056, "step": 26850 }, { "epoch": 2.895274997309224, "grad_norm": 0.5909113883972168, "learning_rate": 0.0004267945264518909, "loss": 3.5157, "step": 26900 }, { "epoch": 2.9006565493488323, "grad_norm": 0.6297270059585571, "learning_rate": 0.00042647128542182955, "loss": 3.5354, "step": 26950 }, { "epoch": 2.9060381013884404, "grad_norm": 0.6224883794784546, "learning_rate": 0.0004261480443917681, "loss": 3.5346, "step": 27000 }, { "epoch": 2.9060381013884404, "eval_accuracy": 0.3694928333098462, "eval_loss": 3.5037596225738525, "eval_runtime": 206.2343, "eval_samples_per_second": 87.333, "eval_steps_per_second": 5.46, "step": 27000 }, { "epoch": 2.9114196534280485, "grad_norm": 0.6158651113510132, "learning_rate": 0.00042583126818230795, "loss": 3.5352, "step": 27050 }, { "epoch": 2.9168012054676566, "grad_norm": 
0.6259952187538147, "learning_rate": 0.0004255080271522465, "loss": 3.5156, "step": 27100 }, { "epoch": 2.922182757507265, "grad_norm": 0.5961571335792542, "learning_rate": 0.0004251847861221851, "loss": 3.5362, "step": 27150 }, { "epoch": 2.9275643095468733, "grad_norm": 0.62592613697052, "learning_rate": 0.00042486154509212363, "loss": 3.5094, "step": 27200 }, { "epoch": 2.9329458615864814, "grad_norm": 0.5974050760269165, "learning_rate": 0.0004245383040620622, "loss": 3.5258, "step": 27250 }, { "epoch": 2.93832741362609, "grad_norm": 0.5732911229133606, "learning_rate": 0.0004242150630320009, "loss": 3.5374, "step": 27300 }, { "epoch": 2.943708965665698, "grad_norm": 0.6453855633735657, "learning_rate": 0.0004238918220019394, "loss": 3.527, "step": 27350 }, { "epoch": 2.949090517705306, "grad_norm": 0.5842043161392212, "learning_rate": 0.000423568580971878, "loss": 3.5214, "step": 27400 }, { "epoch": 2.9544720697449143, "grad_norm": 0.6021690964698792, "learning_rate": 0.00042324533994181655, "loss": 3.5026, "step": 27450 }, { "epoch": 2.9598536217845224, "grad_norm": 0.6042524576187134, "learning_rate": 0.00042292209891175514, "loss": 3.5169, "step": 27500 }, { "epoch": 2.965235173824131, "grad_norm": 0.6471766829490662, "learning_rate": 0.00042259885788169374, "loss": 3.5289, "step": 27550 }, { "epoch": 2.970616725863739, "grad_norm": 0.6328997611999512, "learning_rate": 0.00042227561685163233, "loss": 3.521, "step": 27600 }, { "epoch": 2.975998277903347, "grad_norm": 0.6381612420082092, "learning_rate": 0.00042195237582157093, "loss": 3.5258, "step": 27650 }, { "epoch": 2.9813798299429557, "grad_norm": 0.6035014986991882, "learning_rate": 0.0004216291347915095, "loss": 3.5295, "step": 27700 }, { "epoch": 2.986761381982564, "grad_norm": 0.6743372678756714, "learning_rate": 0.00042130589376144806, "loss": 3.5261, "step": 27750 }, { "epoch": 2.992142934022172, "grad_norm": 0.6017929315567017, "learning_rate": 0.00042098265273138666, "loss": 3.5172, "step": 
27800 }, { "epoch": 2.9975244860617805, "grad_norm": 0.6554933786392212, "learning_rate": 0.0004206594117013252, "loss": 3.5245, "step": 27850 }, { "epoch": 3.0029060381013886, "grad_norm": 0.6437206864356995, "learning_rate": 0.00042033617067126385, "loss": 3.4742, "step": 27900 }, { "epoch": 3.0082875901409967, "grad_norm": 0.6490093469619751, "learning_rate": 0.00042001292964120244, "loss": 3.4372, "step": 27950 }, { "epoch": 3.0136691421806048, "grad_norm": 0.6507807374000549, "learning_rate": 0.000419689688611141, "loss": 3.4461, "step": 28000 }, { "epoch": 3.0136691421806048, "eval_accuracy": 0.3706709565724186, "eval_loss": 3.497730016708374, "eval_runtime": 215.1239, "eval_samples_per_second": 83.724, "eval_steps_per_second": 5.234, "step": 28000 }, { "epoch": 3.0190506942202133, "grad_norm": 0.6191187500953674, "learning_rate": 0.0004193664475810796, "loss": 3.4181, "step": 28050 }, { "epoch": 3.0244322462598214, "grad_norm": 0.62087082862854, "learning_rate": 0.00041904320655101817, "loss": 3.4181, "step": 28100 }, { "epoch": 3.0298137982994295, "grad_norm": 0.6341911554336548, "learning_rate": 0.0004187199655209567, "loss": 3.4286, "step": 28150 }, { "epoch": 3.0351953503390376, "grad_norm": 0.6108666062355042, "learning_rate": 0.00041839672449089536, "loss": 3.4278, "step": 28200 }, { "epoch": 3.040576902378646, "grad_norm": 0.5951593518257141, "learning_rate": 0.00041807348346083395, "loss": 3.441, "step": 28250 }, { "epoch": 3.0459584544182543, "grad_norm": 0.6405830383300781, "learning_rate": 0.0004177502424307725, "loss": 3.4542, "step": 28300 }, { "epoch": 3.0513400064578624, "grad_norm": 0.6463958024978638, "learning_rate": 0.0004174270014007111, "loss": 3.4252, "step": 28350 }, { "epoch": 3.0567215584974705, "grad_norm": 0.5980671048164368, "learning_rate": 0.00041710376037064963, "loss": 3.4458, "step": 28400 }, { "epoch": 3.062103110537079, "grad_norm": 0.6921752691268921, "learning_rate": 0.0004167805193405883, "loss": 3.4406, "step": 28450 }, 
{ "epoch": 3.067484662576687, "grad_norm": 0.6020042896270752, "learning_rate": 0.0004164572783105269, "loss": 3.4446, "step": 28500 }, { "epoch": 3.0728662146162953, "grad_norm": 0.6640591025352478, "learning_rate": 0.0004161340372804654, "loss": 3.4635, "step": 28550 }, { "epoch": 3.0782477666559034, "grad_norm": 0.6354085206985474, "learning_rate": 0.000415810796250404, "loss": 3.4555, "step": 28600 }, { "epoch": 3.083629318695512, "grad_norm": 0.636901319026947, "learning_rate": 0.0004154875552203426, "loss": 3.4515, "step": 28650 }, { "epoch": 3.08901087073512, "grad_norm": 0.6186822056770325, "learning_rate": 0.00041516431419028114, "loss": 3.4345, "step": 28700 }, { "epoch": 3.094392422774728, "grad_norm": 0.6416160464286804, "learning_rate": 0.0004148410731602198, "loss": 3.45, "step": 28750 }, { "epoch": 3.0997739748143363, "grad_norm": 0.6019704341888428, "learning_rate": 0.0004145178321301584, "loss": 3.4452, "step": 28800 }, { "epoch": 3.105155526853945, "grad_norm": 0.6140844225883484, "learning_rate": 0.00041419459110009693, "loss": 3.4474, "step": 28850 }, { "epoch": 3.110537078893553, "grad_norm": 0.6584010720252991, "learning_rate": 0.0004138713500700355, "loss": 3.442, "step": 28900 }, { "epoch": 3.115918630933161, "grad_norm": 0.6763238906860352, "learning_rate": 0.00041354810903997406, "loss": 3.4475, "step": 28950 }, { "epoch": 3.121300182972769, "grad_norm": 0.6160973906517029, "learning_rate": 0.00041322486800991266, "loss": 3.4491, "step": 29000 }, { "epoch": 3.121300182972769, "eval_accuracy": 0.3712210661434233, "eval_loss": 3.494593858718872, "eval_runtime": 209.8562, "eval_samples_per_second": 85.825, "eval_steps_per_second": 5.366, "step": 29000 }, { "epoch": 3.1266817350123777, "grad_norm": 0.6309645771980286, "learning_rate": 0.0004129016269798513, "loss": 3.452, "step": 29050 }, { "epoch": 3.132063287051986, "grad_norm": 0.616537868976593, "learning_rate": 0.00041257838594978985, "loss": 3.4708, "step": 29100 }, { "epoch": 
3.137444839091594, "grad_norm": 0.6185223460197449, "learning_rate": 0.00041225514491972844, "loss": 3.455, "step": 29150 }, { "epoch": 3.1428263911312024, "grad_norm": 0.6138353943824768, "learning_rate": 0.00041193190388966704, "loss": 3.4406, "step": 29200 }, { "epoch": 3.1482079431708105, "grad_norm": 0.608535647392273, "learning_rate": 0.0004116086628596056, "loss": 3.4461, "step": 29250 }, { "epoch": 3.1535894952104186, "grad_norm": 0.6112627387046814, "learning_rate": 0.0004112854218295442, "loss": 3.4395, "step": 29300 }, { "epoch": 3.1589710472500268, "grad_norm": 0.5768312811851501, "learning_rate": 0.0004109621807994828, "loss": 3.4302, "step": 29350 }, { "epoch": 3.1643525992896353, "grad_norm": 0.6325687170028687, "learning_rate": 0.00041063893976942136, "loss": 3.4465, "step": 29400 }, { "epoch": 3.1697341513292434, "grad_norm": 0.6342477202415466, "learning_rate": 0.00041031569873935996, "loss": 3.442, "step": 29450 }, { "epoch": 3.1751157033688515, "grad_norm": 0.6336598992347717, "learning_rate": 0.0004099924577092985, "loss": 3.4462, "step": 29500 }, { "epoch": 3.1804972554084596, "grad_norm": 0.607629656791687, "learning_rate": 0.0004096692166792371, "loss": 3.4299, "step": 29550 }, { "epoch": 3.185878807448068, "grad_norm": 0.6477324366569519, "learning_rate": 0.00040934597564917574, "loss": 3.448, "step": 29600 }, { "epoch": 3.1912603594876763, "grad_norm": 0.6741542220115662, "learning_rate": 0.0004090227346191143, "loss": 3.4596, "step": 29650 }, { "epoch": 3.1966419115272844, "grad_norm": 0.6519025564193726, "learning_rate": 0.0004086994935890529, "loss": 3.4715, "step": 29700 }, { "epoch": 3.2020234635668925, "grad_norm": 0.6172560453414917, "learning_rate": 0.00040837625255899147, "loss": 3.4494, "step": 29750 }, { "epoch": 3.207405015606501, "grad_norm": 0.6829916834831238, "learning_rate": 0.00040805301152893, "loss": 3.4398, "step": 29800 }, { "epoch": 3.212786567646109, "grad_norm": 0.6081319451332092, "learning_rate": 
0.0004077297704988686, "loss": 3.446, "step": 29850 }, { "epoch": 3.2181681196857173, "grad_norm": 0.6863629817962646, "learning_rate": 0.00040740652946880725, "loss": 3.4471, "step": 29900 }, { "epoch": 3.2235496717253254, "grad_norm": 0.6253100633621216, "learning_rate": 0.0004070832884387458, "loss": 3.4593, "step": 29950 }, { "epoch": 3.228931223764934, "grad_norm": 0.61070317029953, "learning_rate": 0.0004067600474086844, "loss": 3.4684, "step": 30000 }, { "epoch": 3.228931223764934, "eval_accuracy": 0.3713381939564873, "eval_loss": 3.4922268390655518, "eval_runtime": 208.2583, "eval_samples_per_second": 86.484, "eval_steps_per_second": 5.407, "step": 30000 }, { "epoch": 3.234312775804542, "grad_norm": 0.6275219321250916, "learning_rate": 0.00040643680637862293, "loss": 3.4539, "step": 30050 }, { "epoch": 3.23969432784415, "grad_norm": 0.6307212710380554, "learning_rate": 0.0004061135653485615, "loss": 3.483, "step": 30100 }, { "epoch": 3.2450758798837587, "grad_norm": 0.6696796417236328, "learning_rate": 0.0004057903243185001, "loss": 3.4521, "step": 30150 }, { "epoch": 3.250457431923367, "grad_norm": 0.6193971037864685, "learning_rate": 0.0004054670832884387, "loss": 3.4574, "step": 30200 }, { "epoch": 3.255838983962975, "grad_norm": 0.6391464471817017, "learning_rate": 0.0004051438422583773, "loss": 3.4539, "step": 30250 }, { "epoch": 3.261220536002583, "grad_norm": 0.641261100769043, "learning_rate": 0.0004048206012283159, "loss": 3.4772, "step": 30300 }, { "epoch": 3.2666020880421915, "grad_norm": 0.6269187331199646, "learning_rate": 0.00040449736019825444, "loss": 3.4534, "step": 30350 }, { "epoch": 3.2719836400817996, "grad_norm": 0.6142138242721558, "learning_rate": 0.00040417411916819304, "loss": 3.4688, "step": 30400 }, { "epoch": 3.2773651921214078, "grad_norm": 0.6109180450439453, "learning_rate": 0.0004038508781381317, "loss": 3.4442, "step": 30450 }, { "epoch": 3.282746744161016, "grad_norm": 0.6190860867500305, "learning_rate": 
0.0004035276371080702, "loss": 3.4456, "step": 30500 }, { "epoch": 3.2881282962006244, "grad_norm": 0.6348389983177185, "learning_rate": 0.0004032043960780088, "loss": 3.4503, "step": 30550 }, { "epoch": 3.2935098482402325, "grad_norm": 0.669296145439148, "learning_rate": 0.00040288115504794736, "loss": 3.4602, "step": 30600 }, { "epoch": 3.2988914002798406, "grad_norm": 0.6249747276306152, "learning_rate": 0.00040255791401788596, "loss": 3.4655, "step": 30650 }, { "epoch": 3.304272952319449, "grad_norm": 0.577464759349823, "learning_rate": 0.00040223467298782455, "loss": 3.4727, "step": 30700 }, { "epoch": 3.3096545043590573, "grad_norm": 0.6019587516784668, "learning_rate": 0.00040191143195776315, "loss": 3.4585, "step": 30750 }, { "epoch": 3.3150360563986654, "grad_norm": 0.601098895072937, "learning_rate": 0.00040158819092770174, "loss": 3.4467, "step": 30800 }, { "epoch": 3.3204176084382735, "grad_norm": 0.6603884696960449, "learning_rate": 0.00040127141471824155, "loss": 3.4478, "step": 30850 }, { "epoch": 3.3257991604778816, "grad_norm": 0.6579861044883728, "learning_rate": 0.00040094817368818014, "loss": 3.4784, "step": 30900 }, { "epoch": 3.33118071251749, "grad_norm": 0.622153103351593, "learning_rate": 0.0004006249326581187, "loss": 3.469, "step": 30950 }, { "epoch": 3.3365622645570983, "grad_norm": 0.6163874864578247, "learning_rate": 0.0004003016916280573, "loss": 3.4662, "step": 31000 }, { "epoch": 3.3365622645570983, "eval_accuracy": 0.37212973024421586, "eval_loss": 3.4844980239868164, "eval_runtime": 207.7673, "eval_samples_per_second": 86.688, "eval_steps_per_second": 5.42, "step": 31000 }, { "epoch": 3.3419438165967064, "grad_norm": 0.6363682746887207, "learning_rate": 0.00039997845059799587, "loss": 3.4479, "step": 31050 }, { "epoch": 3.347325368636315, "grad_norm": 0.6347747445106506, "learning_rate": 0.00039965520956793447, "loss": 3.4736, "step": 31100 }, { "epoch": 3.352706920675923, "grad_norm": 0.6099669933319092, "learning_rate": 
0.00039933196853787306, "loss": 3.4788, "step": 31150 }, { "epoch": 3.358088472715531, "grad_norm": 0.6515194177627563, "learning_rate": 0.00039900872750781166, "loss": 3.4613, "step": 31200 }, { "epoch": 3.3634700247551392, "grad_norm": 0.6574184894561768, "learning_rate": 0.0003986854864777502, "loss": 3.4714, "step": 31250 }, { "epoch": 3.368851576794748, "grad_norm": 0.6157483458518982, "learning_rate": 0.0003983622454476888, "loss": 3.4752, "step": 31300 }, { "epoch": 3.374233128834356, "grad_norm": 0.632178544998169, "learning_rate": 0.00039803900441762733, "loss": 3.4579, "step": 31350 }, { "epoch": 3.379614680873964, "grad_norm": 0.6561319231987, "learning_rate": 0.000397715763387566, "loss": 3.4659, "step": 31400 }, { "epoch": 3.384996232913572, "grad_norm": 0.6884437203407288, "learning_rate": 0.0003973925223575046, "loss": 3.4761, "step": 31450 }, { "epoch": 3.3903777849531807, "grad_norm": 0.6156170964241028, "learning_rate": 0.0003970692813274431, "loss": 3.4604, "step": 31500 }, { "epoch": 3.3957593369927888, "grad_norm": 0.7083854675292969, "learning_rate": 0.0003967460402973817, "loss": 3.465, "step": 31550 }, { "epoch": 3.401140889032397, "grad_norm": 0.6180551648139954, "learning_rate": 0.0003964227992673203, "loss": 3.4453, "step": 31600 }, { "epoch": 3.4065224410720054, "grad_norm": 0.6462782621383667, "learning_rate": 0.00039609955823725884, "loss": 3.4589, "step": 31650 }, { "epoch": 3.4119039931116135, "grad_norm": 0.6157835721969604, "learning_rate": 0.0003957763172071975, "loss": 3.462, "step": 31700 }, { "epoch": 3.4172855451512216, "grad_norm": 0.6632956862449646, "learning_rate": 0.0003954530761771361, "loss": 3.4574, "step": 31750 }, { "epoch": 3.4226670971908297, "grad_norm": 0.5914563536643982, "learning_rate": 0.00039512983514707463, "loss": 3.4395, "step": 31800 }, { "epoch": 3.428048649230438, "grad_norm": 0.6110662817955017, "learning_rate": 0.0003948065941170132, "loss": 3.4704, "step": 31850 }, { "epoch": 3.4334302012700464, 
"grad_norm": 0.6194736361503601, "learning_rate": 0.00039448335308695176, "loss": 3.4669, "step": 31900 }, { "epoch": 3.4388117533096545, "grad_norm": 0.5982261896133423, "learning_rate": 0.00039416011205689036, "loss": 3.4541, "step": 31950 }, { "epoch": 3.4441933053492626, "grad_norm": 0.5901939868927002, "learning_rate": 0.000393836871026829, "loss": 3.4454, "step": 32000 }, { "epoch": 3.4441933053492626, "eval_accuracy": 0.3730818555001341, "eval_loss": 3.4764976501464844, "eval_runtime": 194.7932, "eval_samples_per_second": 92.462, "eval_steps_per_second": 5.78, "step": 32000 }, { "epoch": 3.449574857388871, "grad_norm": 0.633826732635498, "learning_rate": 0.00039351362999676755, "loss": 3.4546, "step": 32050 }, { "epoch": 3.4549564094284793, "grad_norm": 0.6311871409416199, "learning_rate": 0.00039319038896670614, "loss": 3.4664, "step": 32100 }, { "epoch": 3.4603379614680874, "grad_norm": 0.6177763342857361, "learning_rate": 0.00039286714793664474, "loss": 3.4497, "step": 32150 }, { "epoch": 3.4657195135076955, "grad_norm": 0.6959267258644104, "learning_rate": 0.0003925439069065833, "loss": 3.461, "step": 32200 }, { "epoch": 3.471101065547304, "grad_norm": 0.6201044321060181, "learning_rate": 0.0003922206658765219, "loss": 3.4577, "step": 32250 }, { "epoch": 3.476482617586912, "grad_norm": 0.6961297988891602, "learning_rate": 0.0003918974248464605, "loss": 3.4482, "step": 32300 }, { "epoch": 3.4818641696265202, "grad_norm": 0.6520218253135681, "learning_rate": 0.00039157418381639906, "loss": 3.4618, "step": 32350 }, { "epoch": 3.4872457216661283, "grad_norm": 0.6764187216758728, "learning_rate": 0.00039125094278633766, "loss": 3.4571, "step": 32400 }, { "epoch": 3.492627273705737, "grad_norm": 0.6045281291007996, "learning_rate": 0.0003909277017562762, "loss": 3.464, "step": 32450 }, { "epoch": 3.498008825745345, "grad_norm": 0.6529583930969238, "learning_rate": 0.0003906044607262148, "loss": 3.4465, "step": 32500 }, { "epoch": 3.503390377784953, 
"grad_norm": 0.6828312277793884, "learning_rate": 0.00039028121969615344, "loss": 3.4478, "step": 32550 }, { "epoch": 3.5087719298245617, "grad_norm": 0.6279124021530151, "learning_rate": 0.000389957978666092, "loss": 3.4532, "step": 32600 }, { "epoch": 3.5141534818641698, "grad_norm": 0.6408252120018005, "learning_rate": 0.0003896347376360306, "loss": 3.461, "step": 32650 }, { "epoch": 3.519535033903778, "grad_norm": 0.6262297034263611, "learning_rate": 0.0003893114966059691, "loss": 3.4685, "step": 32700 }, { "epoch": 3.524916585943386, "grad_norm": 0.6279938220977783, "learning_rate": 0.0003889882555759077, "loss": 3.4681, "step": 32750 }, { "epoch": 3.530298137982994, "grad_norm": 0.6286149621009827, "learning_rate": 0.0003886650145458463, "loss": 3.4685, "step": 32800 }, { "epoch": 3.5356796900226026, "grad_norm": 0.6136440634727478, "learning_rate": 0.0003883417735157849, "loss": 3.4395, "step": 32850 }, { "epoch": 3.5410612420622107, "grad_norm": 0.6475735902786255, "learning_rate": 0.0003880185324857235, "loss": 3.4601, "step": 32900 }, { "epoch": 3.546442794101819, "grad_norm": 0.6448686718940735, "learning_rate": 0.0003877017562762633, "loss": 3.4557, "step": 32950 }, { "epoch": 3.5518243461414274, "grad_norm": 0.6235514283180237, "learning_rate": 0.0003873785152462019, "loss": 3.4605, "step": 33000 }, { "epoch": 3.5518243461414274, "eval_accuracy": 0.3741357885119346, "eval_loss": 3.4696691036224365, "eval_runtime": 204.336, "eval_samples_per_second": 88.144, "eval_steps_per_second": 5.511, "step": 33000 }, { "epoch": 3.5572058981810355, "grad_norm": 0.6031110882759094, "learning_rate": 0.0003870552742161405, "loss": 3.4576, "step": 33050 }, { "epoch": 3.5625874502206436, "grad_norm": 0.6580427289009094, "learning_rate": 0.00038673203318607903, "loss": 3.4585, "step": 33100 }, { "epoch": 3.5679690022602517, "grad_norm": 0.6172491908073425, "learning_rate": 0.0003864087921560176, "loss": 3.445, "step": 33150 }, { "epoch": 3.57335055429986, "grad_norm": 
0.6818593144416809, "learning_rate": 0.0003860855511259563, "loss": 3.4474, "step": 33200 }, { "epoch": 3.5787321063394684, "grad_norm": 0.6435357332229614, "learning_rate": 0.0003857623100958948, "loss": 3.4501, "step": 33250 }, { "epoch": 3.5841136583790765, "grad_norm": 0.639147937297821, "learning_rate": 0.0003854390690658334, "loss": 3.481, "step": 33300 }, { "epoch": 3.5894952104186846, "grad_norm": 0.6366181373596191, "learning_rate": 0.00038511582803577195, "loss": 3.4775, "step": 33350 }, { "epoch": 3.594876762458293, "grad_norm": 0.6355386972427368, "learning_rate": 0.00038479258700571054, "loss": 3.4591, "step": 33400 }, { "epoch": 3.6002583144979012, "grad_norm": 0.6315577030181885, "learning_rate": 0.00038446934597564914, "loss": 3.4535, "step": 33450 }, { "epoch": 3.6056398665375093, "grad_norm": 0.6854292750358582, "learning_rate": 0.00038414610494558773, "loss": 3.4665, "step": 33500 }, { "epoch": 3.611021418577118, "grad_norm": 0.680067777633667, "learning_rate": 0.00038382286391552633, "loss": 3.4476, "step": 33550 }, { "epoch": 3.616402970616726, "grad_norm": 0.6212747693061829, "learning_rate": 0.0003834996228854649, "loss": 3.4603, "step": 33600 }, { "epoch": 3.621784522656334, "grad_norm": 0.6352766156196594, "learning_rate": 0.00038317638185540346, "loss": 3.452, "step": 33650 }, { "epoch": 3.627166074695942, "grad_norm": 0.6111792922019958, "learning_rate": 0.00038285314082534206, "loss": 3.4636, "step": 33700 }, { "epoch": 3.6325476267355503, "grad_norm": 0.6384958028793335, "learning_rate": 0.0003825298997952806, "loss": 3.4458, "step": 33750 }, { "epoch": 3.637929178775159, "grad_norm": 0.6627931594848633, "learning_rate": 0.00038220665876521925, "loss": 3.4675, "step": 33800 }, { "epoch": 3.643310730814767, "grad_norm": 0.7328792810440063, "learning_rate": 0.00038188341773515784, "loss": 3.434, "step": 33850 }, { "epoch": 3.648692282854375, "grad_norm": 0.7406517863273621, "learning_rate": 0.0003815601767050964, "loss": 3.4399, "step": 
33900 }, { "epoch": 3.6540738348939836, "grad_norm": 0.6318268775939941, "learning_rate": 0.000381236935675035, "loss": 3.4448, "step": 33950 }, { "epoch": 3.6594553869335917, "grad_norm": 0.6655885577201843, "learning_rate": 0.0003809136946449735, "loss": 3.4428, "step": 34000 }, { "epoch": 3.6594553869335917, "eval_accuracy": 0.3745497560145078, "eval_loss": 3.463956594467163, "eval_runtime": 207.206, "eval_samples_per_second": 86.923, "eval_steps_per_second": 5.434, "step": 34000 }, { "epoch": 3.6648369389732, "grad_norm": 0.6424694061279297, "learning_rate": 0.00038059045361491217, "loss": 3.4602, "step": 34050 }, { "epoch": 3.670218491012808, "grad_norm": 0.6229196190834045, "learning_rate": 0.00038026721258485076, "loss": 3.463, "step": 34100 }, { "epoch": 3.675600043052416, "grad_norm": 0.6844025254249573, "learning_rate": 0.0003799439715547893, "loss": 3.4696, "step": 34150 }, { "epoch": 3.6809815950920246, "grad_norm": 0.707100510597229, "learning_rate": 0.0003796207305247279, "loss": 3.465, "step": 34200 }, { "epoch": 3.6863631471316327, "grad_norm": 0.6834603548049927, "learning_rate": 0.0003792974894946665, "loss": 3.4657, "step": 34250 }, { "epoch": 3.691744699171241, "grad_norm": 0.6267171502113342, "learning_rate": 0.00037897424846460503, "loss": 3.4584, "step": 34300 }, { "epoch": 3.6971262512108494, "grad_norm": 0.6426784992218018, "learning_rate": 0.0003786510074345437, "loss": 3.4654, "step": 34350 }, { "epoch": 3.7025078032504575, "grad_norm": 0.697043776512146, "learning_rate": 0.0003783277664044823, "loss": 3.451, "step": 34400 }, { "epoch": 3.7078893552900656, "grad_norm": 0.6688522100448608, "learning_rate": 0.0003780045253744208, "loss": 3.4714, "step": 34450 }, { "epoch": 3.713270907329674, "grad_norm": 0.6643778085708618, "learning_rate": 0.0003776812843443594, "loss": 3.4316, "step": 34500 }, { "epoch": 3.7186524593692822, "grad_norm": 0.6475136280059814, "learning_rate": 0.00037735804331429795, "loss": 3.4301, "step": 34550 }, { 
"epoch": 3.7240340114088903, "grad_norm": 0.6495978236198425, "learning_rate": 0.00037703480228423654, "loss": 3.4566, "step": 34600 }, { "epoch": 3.7294155634484984, "grad_norm": 0.6580095887184143, "learning_rate": 0.0003767115612541752, "loss": 3.4636, "step": 34650 }, { "epoch": 3.7347971154881066, "grad_norm": 0.6825853586196899, "learning_rate": 0.00037638832022411373, "loss": 3.459, "step": 34700 }, { "epoch": 3.740178667527715, "grad_norm": 0.6098463535308838, "learning_rate": 0.00037606507919405233, "loss": 3.4392, "step": 34750 }, { "epoch": 3.745560219567323, "grad_norm": 0.7137957811355591, "learning_rate": 0.0003757418381639909, "loss": 3.4407, "step": 34800 }, { "epoch": 3.7509417716069313, "grad_norm": 0.6484520435333252, "learning_rate": 0.00037541859713392946, "loss": 3.4611, "step": 34850 }, { "epoch": 3.75632332364654, "grad_norm": 0.7095451354980469, "learning_rate": 0.0003750953561038681, "loss": 3.4435, "step": 34900 }, { "epoch": 3.761704875686148, "grad_norm": 0.625594973564148, "learning_rate": 0.0003747721150738067, "loss": 3.4527, "step": 34950 }, { "epoch": 3.767086427725756, "grad_norm": 0.6271294355392456, "learning_rate": 0.00037444887404374525, "loss": 3.458, "step": 35000 }, { "epoch": 3.767086427725756, "eval_accuracy": 0.37491146147804216, "eval_loss": 3.45908522605896, "eval_runtime": 199.5792, "eval_samples_per_second": 90.245, "eval_steps_per_second": 5.642, "step": 35000 }, { "epoch": 3.772467979765364, "grad_norm": 0.6571464538574219, "learning_rate": 0.00037412563301368384, "loss": 3.4627, "step": 35050 }, { "epoch": 3.7778495318049723, "grad_norm": 0.6304094791412354, "learning_rate": 0.0003738023919836224, "loss": 3.4734, "step": 35100 }, { "epoch": 3.783231083844581, "grad_norm": 0.7219865322113037, "learning_rate": 0.000373479150953561, "loss": 3.4536, "step": 35150 }, { "epoch": 3.788612635884189, "grad_norm": 0.6388824582099915, "learning_rate": 0.0003731559099234996, "loss": 3.4584, "step": 35200 }, { "epoch": 
3.793994187923797, "grad_norm": 0.6524561047554016, "learning_rate": 0.00037283266889343817, "loss": 3.4621, "step": 35250 }, { "epoch": 3.7993757399634056, "grad_norm": 0.6496708989143372, "learning_rate": 0.00037250942786337676, "loss": 3.4551, "step": 35300 }, { "epoch": 3.8047572920030137, "grad_norm": 0.668373167514801, "learning_rate": 0.00037218618683331536, "loss": 3.4609, "step": 35350 }, { "epoch": 3.810138844042622, "grad_norm": 0.6569227576255798, "learning_rate": 0.0003718629458032539, "loss": 3.4436, "step": 35400 }, { "epoch": 3.8155203960822304, "grad_norm": 0.6292812824249268, "learning_rate": 0.0003715397047731925, "loss": 3.4514, "step": 35450 }, { "epoch": 3.8209019481218385, "grad_norm": 0.6967467665672302, "learning_rate": 0.00037121646374313114, "loss": 3.4759, "step": 35500 }, { "epoch": 3.8262835001614466, "grad_norm": 0.6586191058158875, "learning_rate": 0.0003708932227130697, "loss": 3.46, "step": 35550 }, { "epoch": 3.8316650522010547, "grad_norm": 0.6478423476219177, "learning_rate": 0.0003705699816830083, "loss": 3.4529, "step": 35600 }, { "epoch": 3.837046604240663, "grad_norm": 0.6029471755027771, "learning_rate": 0.0003702467406529468, "loss": 3.4454, "step": 35650 }, { "epoch": 3.8424281562802713, "grad_norm": 0.6288083791732788, "learning_rate": 0.0003699234996228854, "loss": 3.4531, "step": 35700 }, { "epoch": 3.8478097083198795, "grad_norm": 0.6120226383209229, "learning_rate": 0.000369600258592824, "loss": 3.4688, "step": 35750 }, { "epoch": 3.8531912603594876, "grad_norm": 0.6600449085235596, "learning_rate": 0.0003692770175627626, "loss": 3.4735, "step": 35800 }, { "epoch": 3.858572812399096, "grad_norm": 0.6423937678337097, "learning_rate": 0.0003689537765327012, "loss": 3.4451, "step": 35850 }, { "epoch": 3.863954364438704, "grad_norm": 0.644229531288147, "learning_rate": 0.000368637000323241, "loss": 3.468, "step": 35900 }, { "epoch": 3.8693359164783123, "grad_norm": 0.6580808162689209, "learning_rate": 
0.0003683137592931796, "loss": 3.4742, "step": 35950 }, { "epoch": 3.8747174685179204, "grad_norm": 0.6204085350036621, "learning_rate": 0.00036799051826311814, "loss": 3.4501, "step": 36000 }, { "epoch": 3.8747174685179204, "eval_accuracy": 0.37576525587048837, "eval_loss": 3.4518706798553467, "eval_runtime": 205.4042, "eval_samples_per_second": 87.686, "eval_steps_per_second": 5.482, "step": 36000 }, { "epoch": 3.8800990205575285, "grad_norm": 0.6474447250366211, "learning_rate": 0.00036766727723305673, "loss": 3.4434, "step": 36050 }, { "epoch": 3.885480572597137, "grad_norm": 0.6868647336959839, "learning_rate": 0.0003673440362029953, "loss": 3.4569, "step": 36100 }, { "epoch": 3.890862124636745, "grad_norm": 0.6220848560333252, "learning_rate": 0.0003670207951729339, "loss": 3.4423, "step": 36150 }, { "epoch": 3.8962436766763533, "grad_norm": 0.7336555123329163, "learning_rate": 0.0003666975541428725, "loss": 3.4505, "step": 36200 }, { "epoch": 3.901625228715962, "grad_norm": 0.6814396381378174, "learning_rate": 0.0003663743131128111, "loss": 3.453, "step": 36250 }, { "epoch": 3.90700678075557, "grad_norm": 0.6469142436981201, "learning_rate": 0.00036605107208274965, "loss": 3.4619, "step": 36300 }, { "epoch": 3.912388332795178, "grad_norm": 1.2596371173858643, "learning_rate": 0.00036572783105268824, "loss": 3.4511, "step": 36350 }, { "epoch": 3.9177698848347866, "grad_norm": 0.6139094233512878, "learning_rate": 0.0003654045900226268, "loss": 3.4416, "step": 36400 }, { "epoch": 3.9231514368743947, "grad_norm": 0.6947947144508362, "learning_rate": 0.00036508134899256543, "loss": 3.4434, "step": 36450 }, { "epoch": 3.928532988914003, "grad_norm": 0.6977193355560303, "learning_rate": 0.00036475810796250403, "loss": 3.4436, "step": 36500 }, { "epoch": 3.933914540953611, "grad_norm": 0.6614829301834106, "learning_rate": 0.00036443486693244257, "loss": 3.4628, "step": 36550 }, { "epoch": 3.939296092993219, "grad_norm": 0.629019558429718, "learning_rate": 
0.00036411162590238116, "loss": 3.446, "step": 36600 }, { "epoch": 3.9446776450328276, "grad_norm": 0.6478345394134521, "learning_rate": 0.00036378838487231976, "loss": 3.46, "step": 36650 }, { "epoch": 3.9500591970724357, "grad_norm": 0.7331146597862244, "learning_rate": 0.00036346514384225835, "loss": 3.4329, "step": 36700 }, { "epoch": 3.955440749112044, "grad_norm": 0.6980277299880981, "learning_rate": 0.00036314190281219695, "loss": 3.4525, "step": 36750 }, { "epoch": 3.9608223011516523, "grad_norm": 0.6497023105621338, "learning_rate": 0.00036281866178213554, "loss": 3.4378, "step": 36800 }, { "epoch": 3.9662038531912605, "grad_norm": 0.6118669509887695, "learning_rate": 0.0003624954207520741, "loss": 3.4387, "step": 36850 }, { "epoch": 3.9715854052308686, "grad_norm": 0.6508960127830505, "learning_rate": 0.0003621721797220127, "loss": 3.4462, "step": 36900 }, { "epoch": 3.9769669572704767, "grad_norm": 0.6341844201087952, "learning_rate": 0.0003618489386919512, "loss": 3.4397, "step": 36950 }, { "epoch": 3.9823485093100848, "grad_norm": 0.6382229328155518, "learning_rate": 0.00036152569766188987, "loss": 3.4582, "step": 37000 }, { "epoch": 3.9823485093100848, "eval_accuracy": 0.37578046727478237, "eval_loss": 3.4466753005981445, "eval_runtime": 203.9996, "eval_samples_per_second": 88.289, "eval_steps_per_second": 5.52, "step": 37000 }, { "epoch": 3.9877300613496933, "grad_norm": 0.6708056926727295, "learning_rate": 0.00036120245663182846, "loss": 3.4533, "step": 37050 }, { "epoch": 3.9931116133893014, "grad_norm": 0.7044811248779297, "learning_rate": 0.000360879215601767, "loss": 3.4423, "step": 37100 }, { "epoch": 3.9984931654289095, "grad_norm": 0.6734232306480408, "learning_rate": 0.0003605559745717056, "loss": 3.4398, "step": 37150 }, { "epoch": 4.003874717468518, "grad_norm": 0.624794602394104, "learning_rate": 0.0003602327335416442, "loss": 3.3979, "step": 37200 }, { "epoch": 4.009256269508126, "grad_norm": 0.6523617506027222, "learning_rate": 
0.00035990949251158273, "loss": 3.3628, "step": 37250 }, { "epoch": 4.014637821547734, "grad_norm": 0.6801915764808655, "learning_rate": 0.0003595862514815214, "loss": 3.3562, "step": 37300 }, { "epoch": 4.020019373587343, "grad_norm": 0.6472498178482056, "learning_rate": 0.00035926301045146, "loss": 3.3433, "step": 37350 }, { "epoch": 4.0254009256269505, "grad_norm": 0.6406301259994507, "learning_rate": 0.0003589397694213985, "loss": 3.3586, "step": 37400 }, { "epoch": 4.030782477666559, "grad_norm": 0.635485827922821, "learning_rate": 0.0003586165283913371, "loss": 3.3601, "step": 37450 }, { "epoch": 4.036164029706168, "grad_norm": 0.6421555876731873, "learning_rate": 0.00035829328736127565, "loss": 3.3533, "step": 37500 }, { "epoch": 4.041545581745775, "grad_norm": 0.6644110679626465, "learning_rate": 0.00035797004633121425, "loss": 3.3551, "step": 37550 }, { "epoch": 4.046927133785384, "grad_norm": 0.6391462683677673, "learning_rate": 0.0003576468053011529, "loss": 3.3478, "step": 37600 }, { "epoch": 4.0523086858249915, "grad_norm": 0.7233789563179016, "learning_rate": 0.00035732356427109143, "loss": 3.3716, "step": 37650 }, { "epoch": 4.0576902378646, "grad_norm": 0.6202532052993774, "learning_rate": 0.00035700032324103003, "loss": 3.375, "step": 37700 }, { "epoch": 4.063071789904209, "grad_norm": 0.6700003743171692, "learning_rate": 0.0003566770822109686, "loss": 3.3763, "step": 37750 }, { "epoch": 4.068453341943816, "grad_norm": 0.6778683066368103, "learning_rate": 0.00035635384118090716, "loss": 3.3668, "step": 37800 }, { "epoch": 4.073834893983425, "grad_norm": 0.6656447649002075, "learning_rate": 0.0003560306001508458, "loss": 3.3624, "step": 37850 }, { "epoch": 4.079216446023033, "grad_norm": 0.6423748135566711, "learning_rate": 0.0003557073591207844, "loss": 3.3872, "step": 37900 }, { "epoch": 4.084597998062641, "grad_norm": 0.6550871729850769, "learning_rate": 0.00035538411809072295, "loss": 3.3689, "step": 37950 }, { "epoch": 4.08997955010225, 
"grad_norm": 0.7397666573524475, "learning_rate": 0.00035506087706066154, "loss": 3.3691, "step": 38000 }, { "epoch": 4.08997955010225, "eval_accuracy": 0.37662122332069087, "eval_loss": 3.448406934738159, "eval_runtime": 209.7019, "eval_samples_per_second": 85.889, "eval_steps_per_second": 5.37, "step": 38000 }, { "epoch": 4.095361102141858, "grad_norm": 0.6562075614929199, "learning_rate": 0.00035474410085120135, "loss": 3.3729, "step": 38050 }, { "epoch": 4.100742654181466, "grad_norm": 0.663729190826416, "learning_rate": 0.00035442085982113994, "loss": 3.3756, "step": 38100 }, { "epoch": 4.106124206221074, "grad_norm": 0.6242133378982544, "learning_rate": 0.0003540976187910785, "loss": 3.3834, "step": 38150 }, { "epoch": 4.111505758260682, "grad_norm": 0.7042984962463379, "learning_rate": 0.0003537743777610171, "loss": 3.3753, "step": 38200 }, { "epoch": 4.1168873103002905, "grad_norm": 0.6397444605827332, "learning_rate": 0.00035345113673095573, "loss": 3.3607, "step": 38250 }, { "epoch": 4.122268862339899, "grad_norm": 0.6845241189002991, "learning_rate": 0.00035312789570089427, "loss": 3.3678, "step": 38300 }, { "epoch": 4.127650414379507, "grad_norm": 0.6499135494232178, "learning_rate": 0.00035280465467083286, "loss": 3.3847, "step": 38350 }, { "epoch": 4.133031966419115, "grad_norm": 0.6635656356811523, "learning_rate": 0.0003524814136407714, "loss": 3.3735, "step": 38400 }, { "epoch": 4.138413518458724, "grad_norm": 0.6924629211425781, "learning_rate": 0.00035215817261071, "loss": 3.3783, "step": 38450 }, { "epoch": 4.1437950704983315, "grad_norm": 0.6163055300712585, "learning_rate": 0.00035183493158064865, "loss": 3.3716, "step": 38500 }, { "epoch": 4.14917662253794, "grad_norm": 0.6056356430053711, "learning_rate": 0.0003515116905505872, "loss": 3.3781, "step": 38550 }, { "epoch": 4.154558174577549, "grad_norm": 0.6951189041137695, "learning_rate": 0.0003511884495205258, "loss": 3.3743, "step": 38600 }, { "epoch": 4.159939726617156, "grad_norm": 
0.6552825570106506, "learning_rate": 0.0003508652084904644, "loss": 3.3885, "step": 38650 }, { "epoch": 4.165321278656765, "grad_norm": 0.6377124786376953, "learning_rate": 0.0003505419674604029, "loss": 3.3997, "step": 38700 }, { "epoch": 4.1707028306963725, "grad_norm": 0.6340206861495972, "learning_rate": 0.0003502187264303415, "loss": 3.3831, "step": 38750 }, { "epoch": 4.176084382735981, "grad_norm": 0.6724329590797424, "learning_rate": 0.00034989548540028016, "loss": 3.4015, "step": 38800 }, { "epoch": 4.18146593477559, "grad_norm": 0.6731334924697876, "learning_rate": 0.0003495722443702187, "loss": 3.3766, "step": 38850 }, { "epoch": 4.186847486815197, "grad_norm": 0.6341095566749573, "learning_rate": 0.0003492490033401573, "loss": 3.3741, "step": 38900 }, { "epoch": 4.192229038854806, "grad_norm": 0.6485639214515686, "learning_rate": 0.00034892576231009584, "loss": 3.3827, "step": 38950 }, { "epoch": 4.197610590894414, "grad_norm": 0.6944514513015747, "learning_rate": 0.00034860252128003443, "loss": 3.3847, "step": 39000 }, { "epoch": 4.197610590894414, "eval_accuracy": 0.3769026343001303, "eval_loss": 3.4444234371185303, "eval_runtime": 194.3118, "eval_samples_per_second": 92.691, "eval_steps_per_second": 5.795, "step": 39000 }, { "epoch": 4.202992142934022, "grad_norm": 0.6771103143692017, "learning_rate": 0.000348279280249973, "loss": 3.3822, "step": 39050 }, { "epoch": 4.208373694973631, "grad_norm": 0.6469762325286865, "learning_rate": 0.0003479560392199116, "loss": 3.3847, "step": 39100 }, { "epoch": 4.213755247013238, "grad_norm": 0.6654626727104187, "learning_rate": 0.0003476327981898502, "loss": 3.3962, "step": 39150 }, { "epoch": 4.219136799052847, "grad_norm": 0.6649177074432373, "learning_rate": 0.0003473095571597888, "loss": 3.3866, "step": 39200 }, { "epoch": 4.224518351092455, "grad_norm": 0.639963686466217, "learning_rate": 0.00034698631612972735, "loss": 3.3654, "step": 39250 }, { "epoch": 4.229899903132063, "grad_norm": 0.7196988463401794, 
"learning_rate": 0.00034666307509966594, "loss": 3.3669, "step": 39300 }, { "epoch": 4.2352814551716715, "grad_norm": 0.6603704690933228, "learning_rate": 0.0003463398340696045, "loss": 3.3879, "step": 39350 }, { "epoch": 4.24066300721128, "grad_norm": 0.7158796191215515, "learning_rate": 0.00034601659303954313, "loss": 3.3925, "step": 39400 }, { "epoch": 4.246044559250888, "grad_norm": 0.6354226469993591, "learning_rate": 0.00034569335200948173, "loss": 3.3954, "step": 39450 }, { "epoch": 4.251426111290496, "grad_norm": 0.6703064441680908, "learning_rate": 0.00034537011097942027, "loss": 3.4026, "step": 39500 }, { "epoch": 4.256807663330104, "grad_norm": 0.6066316962242126, "learning_rate": 0.00034504686994935886, "loss": 3.4001, "step": 39550 }, { "epoch": 4.2621892153697125, "grad_norm": 0.6286178827285767, "learning_rate": 0.00034472362891929746, "loss": 3.3713, "step": 39600 }, { "epoch": 4.267570767409321, "grad_norm": 0.6771307587623596, "learning_rate": 0.00034440038788923605, "loss": 3.3985, "step": 39650 }, { "epoch": 4.272952319448929, "grad_norm": 0.6318666338920593, "learning_rate": 0.00034407714685917465, "loss": 3.381, "step": 39700 }, { "epoch": 4.278333871488537, "grad_norm": 0.6529156565666199, "learning_rate": 0.00034375390582911324, "loss": 3.3848, "step": 39750 }, { "epoch": 4.283715423528146, "grad_norm": 0.6864298582077026, "learning_rate": 0.0003434306647990518, "loss": 3.378, "step": 39800 }, { "epoch": 4.2890969755677535, "grad_norm": 0.7262616753578186, "learning_rate": 0.0003431074237689904, "loss": 3.3899, "step": 39850 }, { "epoch": 4.294478527607362, "grad_norm": 0.7095841765403748, "learning_rate": 0.0003427841827389289, "loss": 3.3793, "step": 39900 }, { "epoch": 4.299860079646971, "grad_norm": 0.6717966794967651, "learning_rate": 0.00034246094170886757, "loss": 3.3844, "step": 39950 }, { "epoch": 4.305241631686578, "grad_norm": 0.6544067859649658, "learning_rate": 0.00034213770067880616, "loss": 3.3714, "step": 40000 }, { "epoch": 
4.305241631686578, "eval_accuracy": 0.3771877394777555, "eval_loss": 3.439746141433716, "eval_runtime": 210.3742, "eval_samples_per_second": 85.614, "eval_steps_per_second": 5.352, "step": 40000 }, { "epoch": 4.310623183726187, "grad_norm": 0.7182822227478027, "learning_rate": 0.0003418144596487447, "loss": 3.3831, "step": 40050 }, { "epoch": 4.3160047357657945, "grad_norm": 0.6663074493408203, "learning_rate": 0.0003414912186186833, "loss": 3.3781, "step": 40100 }, { "epoch": 4.321386287805403, "grad_norm": 0.7040979862213135, "learning_rate": 0.0003411679775886219, "loss": 3.3946, "step": 40150 }, { "epoch": 4.326767839845012, "grad_norm": 0.6793686151504517, "learning_rate": 0.00034084473655856043, "loss": 3.3925, "step": 40200 }, { "epoch": 4.332149391884619, "grad_norm": 0.6753449440002441, "learning_rate": 0.0003405214955284991, "loss": 3.3899, "step": 40250 }, { "epoch": 4.337530943924228, "grad_norm": 0.6695329546928406, "learning_rate": 0.0003401982544984377, "loss": 3.3848, "step": 40300 }, { "epoch": 4.342912495963836, "grad_norm": 1.0501422882080078, "learning_rate": 0.0003398750134683762, "loss": 3.3769, "step": 40350 }, { "epoch": 4.348294048003444, "grad_norm": 0.6779940128326416, "learning_rate": 0.0003395517724383148, "loss": 3.3852, "step": 40400 }, { "epoch": 4.3536756000430525, "grad_norm": 0.6622795462608337, "learning_rate": 0.0003392349962288546, "loss": 3.3954, "step": 40450 }, { "epoch": 4.359057152082661, "grad_norm": 0.6417733430862427, "learning_rate": 0.0003389117551987932, "loss": 3.3999, "step": 40500 }, { "epoch": 4.364438704122269, "grad_norm": 0.6586073040962219, "learning_rate": 0.00033858851416873175, "loss": 3.3942, "step": 40550 }, { "epoch": 4.369820256161877, "grad_norm": 0.6641150116920471, "learning_rate": 0.0003382652731386704, "loss": 3.3892, "step": 40600 }, { "epoch": 4.375201808201485, "grad_norm": 0.6859793066978455, "learning_rate": 0.000337942032108609, "loss": 3.3936, "step": 40650 }, { "epoch": 4.3805833602410935, 
"grad_norm": 0.6642147302627563, "learning_rate": 0.00033761879107854754, "loss": 3.3869, "step": 40700 }, { "epoch": 4.385964912280702, "grad_norm": 0.6813688278198242, "learning_rate": 0.00033729555004848613, "loss": 3.3792, "step": 40750 }, { "epoch": 4.39134646432031, "grad_norm": 0.6705623865127563, "learning_rate": 0.00033697230901842467, "loss": 3.3931, "step": 40800 }, { "epoch": 4.396728016359918, "grad_norm": 0.6627140045166016, "learning_rate": 0.00033664906798836327, "loss": 3.3971, "step": 40850 }, { "epoch": 4.402109568399527, "grad_norm": 0.6509461998939514, "learning_rate": 0.0003363258269583019, "loss": 3.3875, "step": 40900 }, { "epoch": 4.4074911204391345, "grad_norm": 0.7194676399230957, "learning_rate": 0.00033600258592824046, "loss": 3.3982, "step": 40950 }, { "epoch": 4.412872672478743, "grad_norm": 0.6531248092651367, "learning_rate": 0.00033567934489817905, "loss": 3.3768, "step": 41000 }, { "epoch": 4.412872672478743, "eval_accuracy": 0.3779571105763699, "eval_loss": 3.437781810760498, "eval_runtime": 205.598, "eval_samples_per_second": 87.603, "eval_steps_per_second": 5.477, "step": 41000 }, { "epoch": 4.418254224518351, "grad_norm": 0.6822459101676941, "learning_rate": 0.00033535610386811764, "loss": 3.4082, "step": 41050 }, { "epoch": 4.423635776557959, "grad_norm": 0.7048671841621399, "learning_rate": 0.0003350328628380562, "loss": 3.4037, "step": 41100 }, { "epoch": 4.429017328597568, "grad_norm": 0.6404821276664734, "learning_rate": 0.0003347096218079948, "loss": 3.377, "step": 41150 }, { "epoch": 4.4343988806371755, "grad_norm": 0.7114750742912292, "learning_rate": 0.00033438638077793343, "loss": 3.3984, "step": 41200 }, { "epoch": 4.439780432676784, "grad_norm": 0.6833235025405884, "learning_rate": 0.00033406313974787197, "loss": 3.3948, "step": 41250 }, { "epoch": 4.445161984716393, "grad_norm": 0.671991765499115, "learning_rate": 0.00033373989871781056, "loss": 3.3905, "step": 41300 }, { "epoch": 4.450543536756, "grad_norm": 
0.6879648566246033, "learning_rate": 0.0003334166576877491, "loss": 3.4125, "step": 41350 }, { "epoch": 4.455925088795609, "grad_norm": 0.6425567865371704, "learning_rate": 0.0003330934166576877, "loss": 3.3877, "step": 41400 }, { "epoch": 4.461306640835216, "grad_norm": 0.6497949361801147, "learning_rate": 0.00033277017562762635, "loss": 3.3787, "step": 41450 }, { "epoch": 4.466688192874825, "grad_norm": 0.6711752414703369, "learning_rate": 0.0003324469345975649, "loss": 3.4, "step": 41500 }, { "epoch": 4.4720697449144335, "grad_norm": 0.6858512163162231, "learning_rate": 0.0003321236935675035, "loss": 3.3806, "step": 41550 }, { "epoch": 4.477451296954041, "grad_norm": 0.6588053107261658, "learning_rate": 0.0003318004525374421, "loss": 3.3769, "step": 41600 }, { "epoch": 4.48283284899365, "grad_norm": 0.6885084509849548, "learning_rate": 0.0003314772115073806, "loss": 3.3833, "step": 41650 }, { "epoch": 4.488214401033258, "grad_norm": 0.6567678451538086, "learning_rate": 0.0003311539704773192, "loss": 3.3795, "step": 41700 }, { "epoch": 4.493595953072866, "grad_norm": 0.644980251789093, "learning_rate": 0.00033083072944725786, "loss": 3.3881, "step": 41750 }, { "epoch": 4.4989775051124745, "grad_norm": 0.6543886065483093, "learning_rate": 0.0003305074884171964, "loss": 3.3881, "step": 41800 }, { "epoch": 4.504359057152083, "grad_norm": 0.6759121417999268, "learning_rate": 0.000330184247387135, "loss": 3.3842, "step": 41850 }, { "epoch": 4.509740609191691, "grad_norm": 0.68775475025177, "learning_rate": 0.00032986100635707354, "loss": 3.3953, "step": 41900 }, { "epoch": 4.515122161231299, "grad_norm": 0.6535020470619202, "learning_rate": 0.00032953776532701213, "loss": 3.3988, "step": 41950 }, { "epoch": 4.520503713270907, "grad_norm": 0.720953643321991, "learning_rate": 0.00032921452429695067, "loss": 3.393, "step": 42000 }, { "epoch": 4.520503713270907, "eval_accuracy": 0.3784747329339181, "eval_loss": 3.4299232959747314, "eval_runtime": 204.2689, 
"eval_samples_per_second": 88.173, "eval_steps_per_second": 5.512, "step": 42000 }, { "epoch": 4.5258852653105155, "grad_norm": 0.6605381965637207, "learning_rate": 0.0003288912832668893, "loss": 3.4042, "step": 42050 }, { "epoch": 4.531266817350124, "grad_norm": 0.6638337969779968, "learning_rate": 0.0003285680422368279, "loss": 3.4088, "step": 42100 }, { "epoch": 4.536648369389732, "grad_norm": 0.6967762112617493, "learning_rate": 0.0003282448012067665, "loss": 3.3833, "step": 42150 }, { "epoch": 4.54202992142934, "grad_norm": 0.6514131426811218, "learning_rate": 0.00032792156017670505, "loss": 3.3887, "step": 42200 }, { "epoch": 4.547411473468949, "grad_norm": 0.6850041747093201, "learning_rate": 0.00032759831914664365, "loss": 3.3844, "step": 42250 }, { "epoch": 4.5527930255085565, "grad_norm": 0.7080932855606079, "learning_rate": 0.0003272750781165823, "loss": 3.3936, "step": 42300 }, { "epoch": 4.558174577548165, "grad_norm": 0.6961115598678589, "learning_rate": 0.00032695183708652083, "loss": 3.3951, "step": 42350 }, { "epoch": 4.563556129587774, "grad_norm": 0.6541742086410522, "learning_rate": 0.00032662859605645943, "loss": 3.3929, "step": 42400 }, { "epoch": 4.568937681627381, "grad_norm": 0.6856427788734436, "learning_rate": 0.00032630535502639797, "loss": 3.3886, "step": 42450 }, { "epoch": 4.57431923366699, "grad_norm": 0.684547483921051, "learning_rate": 0.00032598211399633656, "loss": 3.3862, "step": 42500 }, { "epoch": 4.579700785706597, "grad_norm": 0.6282208561897278, "learning_rate": 0.0003256588729662751, "loss": 3.394, "step": 42550 }, { "epoch": 4.585082337746206, "grad_norm": 0.6854305863380432, "learning_rate": 0.00032533563193621375, "loss": 3.3815, "step": 42600 }, { "epoch": 4.5904638897858145, "grad_norm": 0.693110466003418, "learning_rate": 0.00032501239090615235, "loss": 3.3888, "step": 42650 }, { "epoch": 4.595845441825422, "grad_norm": 0.7092149257659912, "learning_rate": 0.0003246891498760909, "loss": 3.3926, "step": 42700 }, { 
"epoch": 4.601226993865031, "grad_norm": 0.6959360241889954, "learning_rate": 0.0003243659088460295, "loss": 3.394, "step": 42750 }, { "epoch": 4.606608545904638, "grad_norm": 0.6965814828872681, "learning_rate": 0.0003240426678159681, "loss": 3.3794, "step": 42800 }, { "epoch": 4.611990097944247, "grad_norm": 0.700532078742981, "learning_rate": 0.0003237194267859066, "loss": 3.401, "step": 42850 }, { "epoch": 4.6173716499838555, "grad_norm": 0.6489181518554688, "learning_rate": 0.00032339618575584527, "loss": 3.4006, "step": 42900 }, { "epoch": 4.622753202023463, "grad_norm": 0.6220369935035706, "learning_rate": 0.000323079409546385, "loss": 3.3956, "step": 42950 }, { "epoch": 4.628134754063072, "grad_norm": 0.7331081032752991, "learning_rate": 0.00032275616851632367, "loss": 3.3915, "step": 43000 }, { "epoch": 4.628134754063072, "eval_accuracy": 0.3789644314992979, "eval_loss": 3.4236953258514404, "eval_runtime": 203.2493, "eval_samples_per_second": 88.615, "eval_steps_per_second": 5.54, "step": 43000 }, { "epoch": 4.63351630610268, "grad_norm": 0.8542162179946899, "learning_rate": 0.00032243292748626226, "loss": 3.3898, "step": 43050 }, { "epoch": 4.638897858142288, "grad_norm": 0.6471918821334839, "learning_rate": 0.0003221096864562008, "loss": 3.4054, "step": 43100 }, { "epoch": 4.6442794101818965, "grad_norm": 0.654157280921936, "learning_rate": 0.0003217864454261394, "loss": 3.396, "step": 43150 }, { "epoch": 4.649660962221505, "grad_norm": 0.6689890623092651, "learning_rate": 0.00032146320439607794, "loss": 3.3881, "step": 43200 }, { "epoch": 4.655042514261113, "grad_norm": 0.6910762786865234, "learning_rate": 0.0003211399633660166, "loss": 3.3807, "step": 43250 }, { "epoch": 4.660424066300721, "grad_norm": 0.6583350300788879, "learning_rate": 0.0003208167223359552, "loss": 3.3996, "step": 43300 }, { "epoch": 4.665805618340329, "grad_norm": 0.7289586663246155, "learning_rate": 0.0003204934813058937, "loss": 3.398, "step": 43350 }, { "epoch": 
4.6711871703799375, "grad_norm": 0.7323175668716431, "learning_rate": 0.0003201702402758323, "loss": 3.3893, "step": 43400 }, { "epoch": 4.676568722419546, "grad_norm": 0.7023487687110901, "learning_rate": 0.0003198469992457709, "loss": 3.381, "step": 43450 }, { "epoch": 4.681950274459154, "grad_norm": 0.6924834847450256, "learning_rate": 0.00031952375821570945, "loss": 3.3818, "step": 43500 }, { "epoch": 4.687331826498762, "grad_norm": 0.6525893211364746, "learning_rate": 0.0003192005171856481, "loss": 3.3878, "step": 43550 }, { "epoch": 4.692713378538371, "grad_norm": 0.6708674430847168, "learning_rate": 0.0003188772761555867, "loss": 3.3879, "step": 43600 }, { "epoch": 4.6980949305779784, "grad_norm": 0.6731662750244141, "learning_rate": 0.00031855403512552524, "loss": 3.3815, "step": 43650 }, { "epoch": 4.703476482617587, "grad_norm": 0.7286627888679504, "learning_rate": 0.00031823079409546383, "loss": 3.396, "step": 43700 }, { "epoch": 4.7088580346571955, "grad_norm": 0.6887483596801758, "learning_rate": 0.00031790755306540237, "loss": 3.3951, "step": 43750 }, { "epoch": 4.714239586696803, "grad_norm": 0.6878728270530701, "learning_rate": 0.00031758431203534097, "loss": 3.382, "step": 43800 }, { "epoch": 4.719621138736412, "grad_norm": 0.7092850208282471, "learning_rate": 0.0003172610710052796, "loss": 3.3839, "step": 43850 }, { "epoch": 4.725002690776019, "grad_norm": 0.6616544723510742, "learning_rate": 0.00031693782997521816, "loss": 3.3777, "step": 43900 }, { "epoch": 4.730384242815628, "grad_norm": 0.6741201281547546, "learning_rate": 0.000316621053765758, "loss": 3.3813, "step": 43950 }, { "epoch": 4.7357657948552365, "grad_norm": 0.6615583896636963, "learning_rate": 0.00031629781273569656, "loss": 3.3875, "step": 44000 }, { "epoch": 4.7357657948552365, "eval_accuracy": 0.3794617357668248, "eval_loss": 3.419157028198242, "eval_runtime": 196.8203, "eval_samples_per_second": 91.51, "eval_steps_per_second": 5.721, "step": 44000 }, { "epoch": 
4.741147346894844, "grad_norm": 0.6903731226921082, "learning_rate": 0.00031597457170563515, "loss": 3.391, "step": 44050 }, { "epoch": 4.746528898934453, "grad_norm": 0.6820841431617737, "learning_rate": 0.0003156513306755737, "loss": 3.3847, "step": 44100 }, { "epoch": 4.751910450974061, "grad_norm": 0.6453799605369568, "learning_rate": 0.0003153280896455123, "loss": 3.3879, "step": 44150 }, { "epoch": 4.757292003013669, "grad_norm": 0.7102324962615967, "learning_rate": 0.00031500484861545094, "loss": 3.4133, "step": 44200 }, { "epoch": 4.7626735550532775, "grad_norm": 0.6926384568214417, "learning_rate": 0.0003146816075853895, "loss": 3.3997, "step": 44250 }, { "epoch": 4.768055107092886, "grad_norm": 0.6980499029159546, "learning_rate": 0.00031435836655532807, "loss": 3.3938, "step": 44300 }, { "epoch": 4.773436659132494, "grad_norm": 0.6628013253211975, "learning_rate": 0.00031403512552526667, "loss": 3.3952, "step": 44350 }, { "epoch": 4.778818211172102, "grad_norm": 0.7179418206214905, "learning_rate": 0.0003137118844952052, "loss": 3.3886, "step": 44400 }, { "epoch": 4.78419976321171, "grad_norm": 0.6799003481864929, "learning_rate": 0.0003133886434651438, "loss": 3.385, "step": 44450 }, { "epoch": 4.7895813152513185, "grad_norm": 0.6856629848480225, "learning_rate": 0.00031306540243508245, "loss": 3.3767, "step": 44500 }, { "epoch": 4.794962867290927, "grad_norm": 0.689735472202301, "learning_rate": 0.000312742161405021, "loss": 3.4087, "step": 44550 }, { "epoch": 4.800344419330535, "grad_norm": 0.685651957988739, "learning_rate": 0.0003124189203749596, "loss": 3.3955, "step": 44600 }, { "epoch": 4.805725971370143, "grad_norm": 0.6903262138366699, "learning_rate": 0.0003120956793448981, "loss": 3.3996, "step": 44650 }, { "epoch": 4.811107523409751, "grad_norm": 0.7058689594268799, "learning_rate": 0.0003117724383148367, "loss": 3.4022, "step": 44700 }, { "epoch": 4.8164890754493594, "grad_norm": 0.6723160147666931, "learning_rate": 0.00031144919728477526, 
"loss": 3.3906, "step": 44750 }, { "epoch": 4.821870627488968, "grad_norm": 0.6740667223930359, "learning_rate": 0.0003111259562547139, "loss": 3.3985, "step": 44800 }, { "epoch": 4.827252179528576, "grad_norm": 0.6879459023475647, "learning_rate": 0.0003108027152246525, "loss": 3.379, "step": 44850 }, { "epoch": 4.832633731568184, "grad_norm": 0.672219455242157, "learning_rate": 0.0003104794741945911, "loss": 3.3935, "step": 44900 }, { "epoch": 4.838015283607793, "grad_norm": 0.680568277835846, "learning_rate": 0.00031015623316452964, "loss": 3.3993, "step": 44950 }, { "epoch": 4.8433968356474, "grad_norm": 0.6724383234977722, "learning_rate": 0.00030983299213446823, "loss": 3.4045, "step": 45000 }, { "epoch": 4.8433968356474, "eval_accuracy": 0.3801395124810116, "eval_loss": 3.414053440093994, "eval_runtime": 206.488, "eval_samples_per_second": 87.225, "eval_steps_per_second": 5.453, "step": 45000 }, { "epoch": 4.848778387687009, "grad_norm": 0.7093207240104675, "learning_rate": 0.0003095097511044069, "loss": 3.3656, "step": 45050 }, { "epoch": 4.8541599397266175, "grad_norm": 0.6884026527404785, "learning_rate": 0.0003091865100743454, "loss": 3.3815, "step": 45100 }, { "epoch": 4.859541491766225, "grad_norm": 0.6394287943840027, "learning_rate": 0.000308863269044284, "loss": 3.4021, "step": 45150 }, { "epoch": 4.864923043805834, "grad_norm": 0.656760573387146, "learning_rate": 0.00030854002801422256, "loss": 3.3942, "step": 45200 }, { "epoch": 4.870304595845441, "grad_norm": 0.7228603363037109, "learning_rate": 0.00030821678698416115, "loss": 3.3831, "step": 45250 }, { "epoch": 4.87568614788505, "grad_norm": 0.6911720633506775, "learning_rate": 0.0003078935459540997, "loss": 3.3941, "step": 45300 }, { "epoch": 4.8810676999246585, "grad_norm": 0.6649526357650757, "learning_rate": 0.00030757030492403834, "loss": 3.3868, "step": 45350 }, { "epoch": 4.886449251964266, "grad_norm": 0.6843016743659973, "learning_rate": 0.00030724706389397694, "loss": 3.3774, "step": 
45400 }, { "epoch": 4.891830804003875, "grad_norm": 0.6798348426818848, "learning_rate": 0.0003069238228639155, "loss": 3.3856, "step": 45450 }, { "epoch": 4.897212356043483, "grad_norm": 0.6509679555892944, "learning_rate": 0.00030660058183385407, "loss": 3.3992, "step": 45500 }, { "epoch": 4.902593908083091, "grad_norm": 0.6958284378051758, "learning_rate": 0.00030627734080379267, "loss": 3.3722, "step": 45550 }, { "epoch": 4.9079754601226995, "grad_norm": 0.6654974222183228, "learning_rate": 0.0003059540997737312, "loss": 3.3861, "step": 45600 }, { "epoch": 4.913357012162308, "grad_norm": 0.7664541602134705, "learning_rate": 0.00030563085874366986, "loss": 3.4079, "step": 45650 }, { "epoch": 4.918738564201916, "grad_norm": 0.7275959849357605, "learning_rate": 0.00030530761771360845, "loss": 3.405, "step": 45700 }, { "epoch": 4.924120116241524, "grad_norm": 0.6668584942817688, "learning_rate": 0.000304984376683547, "loss": 3.4006, "step": 45750 }, { "epoch": 4.929501668281132, "grad_norm": 0.6965872645378113, "learning_rate": 0.0003046611356534856, "loss": 3.3963, "step": 45800 }, { "epoch": 4.9348832203207404, "grad_norm": 0.7191141247749329, "learning_rate": 0.0003043378946234241, "loss": 3.3976, "step": 45850 }, { "epoch": 4.940264772360349, "grad_norm": 0.6690873503684998, "learning_rate": 0.0003040146535933628, "loss": 3.4008, "step": 45900 }, { "epoch": 4.945646324399957, "grad_norm": 0.6937520503997803, "learning_rate": 0.00030369141256330137, "loss": 3.3907, "step": 45950 }, { "epoch": 4.951027876439565, "grad_norm": 0.6963992118835449, "learning_rate": 0.0003033681715332399, "loss": 3.3761, "step": 46000 }, { "epoch": 4.951027876439565, "eval_accuracy": 0.3804870930691301, "eval_loss": 3.4095349311828613, "eval_runtime": 205.4933, "eval_samples_per_second": 87.648, "eval_steps_per_second": 5.479, "step": 46000 }, { "epoch": 4.956409428479174, "grad_norm": 0.7414904832839966, "learning_rate": 0.0003030449305031785, "loss": 3.3747, "step": 46050 }, { 
"epoch": 4.961790980518781, "grad_norm": 0.7259211540222168, "learning_rate": 0.0003027216894731171, "loss": 3.3819, "step": 46100 }, { "epoch": 4.96717253255839, "grad_norm": 0.7090394496917725, "learning_rate": 0.00030239844844305564, "loss": 3.3889, "step": 46150 }, { "epoch": 4.9725540845979985, "grad_norm": 0.7188470363616943, "learning_rate": 0.0003020752074129943, "loss": 3.3881, "step": 46200 }, { "epoch": 4.977935636637606, "grad_norm": 0.6995404362678528, "learning_rate": 0.0003017519663829329, "loss": 3.3543, "step": 46250 }, { "epoch": 4.983317188677215, "grad_norm": 0.6940346956253052, "learning_rate": 0.0003014287253528714, "loss": 3.378, "step": 46300 }, { "epoch": 4.988698740716822, "grad_norm": 0.717013955116272, "learning_rate": 0.00030110548432281, "loss": 3.3916, "step": 46350 }, { "epoch": 4.994080292756431, "grad_norm": 0.7562733888626099, "learning_rate": 0.00030078224329274856, "loss": 3.3738, "step": 46400 }, { "epoch": 4.9994618447960395, "grad_norm": 0.6929606199264526, "learning_rate": 0.00030045900226268715, "loss": 3.3991, "step": 46450 }, { "epoch": 5.004843396835647, "grad_norm": 0.684966504573822, "learning_rate": 0.0003001357612326258, "loss": 3.3138, "step": 46500 }, { "epoch": 5.010224948875256, "grad_norm": 0.6948528289794922, "learning_rate": 0.00029981252020256434, "loss": 3.2973, "step": 46550 }, { "epoch": 5.015606500914864, "grad_norm": 0.7069754004478455, "learning_rate": 0.00029948927917250294, "loss": 3.2938, "step": 46600 }, { "epoch": 5.020988052954472, "grad_norm": 0.6681404113769531, "learning_rate": 0.00029916603814244153, "loss": 3.3031, "step": 46650 }, { "epoch": 5.0263696049940805, "grad_norm": 0.7222424745559692, "learning_rate": 0.0002988427971123801, "loss": 3.2958, "step": 46700 }, { "epoch": 5.031751157033688, "grad_norm": 0.6528456211090088, "learning_rate": 0.00029851955608231867, "loss": 3.2972, "step": 46750 }, { "epoch": 5.037132709073297, "grad_norm": 0.7115670442581177, "learning_rate": 
0.00029819631505225726, "loss": 3.3177, "step": 46800 }, { "epoch": 5.042514261112905, "grad_norm": 0.7243658304214478, "learning_rate": 0.00029787307402219586, "loss": 3.3053, "step": 46850 }, { "epoch": 5.047895813152513, "grad_norm": 0.6976185441017151, "learning_rate": 0.00029754983299213445, "loss": 3.3041, "step": 46900 }, { "epoch": 5.0532773651921215, "grad_norm": 0.6580340266227722, "learning_rate": 0.000297226591962073, "loss": 3.2969, "step": 46950 }, { "epoch": 5.05865891723173, "grad_norm": 0.6749905943870544, "learning_rate": 0.00029690335093201164, "loss": 3.3004, "step": 47000 }, { "epoch": 5.05865891723173, "eval_accuracy": 0.380867812788032, "eval_loss": 3.412269353866577, "eval_runtime": 196.4805, "eval_samples_per_second": 91.668, "eval_steps_per_second": 5.731, "step": 47000 }, { "epoch": 5.064040469271338, "grad_norm": 0.7184104919433594, "learning_rate": 0.0002965801099019502, "loss": 3.3063, "step": 47050 }, { "epoch": 5.069422021310946, "grad_norm": 0.6803567409515381, "learning_rate": 0.0002962568688718888, "loss": 3.2995, "step": 47100 }, { "epoch": 5.074803573350554, "grad_norm": 0.6682900190353394, "learning_rate": 0.00029593362784182737, "loss": 3.291, "step": 47150 }, { "epoch": 5.080185125390162, "grad_norm": 0.6613481640815735, "learning_rate": 0.00029561038681176596, "loss": 3.2996, "step": 47200 }, { "epoch": 5.085566677429771, "grad_norm": 0.6656726598739624, "learning_rate": 0.00029528714578170456, "loss": 3.3157, "step": 47250 }, { "epoch": 5.090948229469379, "grad_norm": 0.6987922191619873, "learning_rate": 0.0002949639047516431, "loss": 3.3214, "step": 47300 }, { "epoch": 5.096329781508987, "grad_norm": 0.6765496134757996, "learning_rate": 0.0002946406637215817, "loss": 3.3238, "step": 47350 }, { "epoch": 5.101711333548596, "grad_norm": 0.722225546836853, "learning_rate": 0.0002943174226915203, "loss": 3.3189, "step": 47400 }, { "epoch": 5.107092885588203, "grad_norm": 0.6817000508308411, "learning_rate": 
0.0002939941816614589, "loss": 3.3209, "step": 47450 }, { "epoch": 5.112474437627812, "grad_norm": 0.7062390446662903, "learning_rate": 0.0002936709406313974, "loss": 3.315, "step": 47500 }, { "epoch": 5.1178559896674205, "grad_norm": 0.6629499197006226, "learning_rate": 0.0002933476996013361, "loss": 3.319, "step": 47550 }, { "epoch": 5.123237541707028, "grad_norm": 0.6865987777709961, "learning_rate": 0.0002930244585712746, "loss": 3.3216, "step": 47600 }, { "epoch": 5.128619093746637, "grad_norm": 0.7235592007637024, "learning_rate": 0.0002927012175412132, "loss": 3.3115, "step": 47650 }, { "epoch": 5.134000645786244, "grad_norm": 0.7191505432128906, "learning_rate": 0.0002923779765111518, "loss": 3.3012, "step": 47700 }, { "epoch": 5.139382197825853, "grad_norm": 0.706909716129303, "learning_rate": 0.0002920547354810904, "loss": 3.3227, "step": 47750 }, { "epoch": 5.1447637498654615, "grad_norm": 0.7347841262817383, "learning_rate": 0.00029173149445102894, "loss": 3.3219, "step": 47800 }, { "epoch": 5.150145301905069, "grad_norm": 0.7402267456054688, "learning_rate": 0.00029140825342096753, "loss": 3.3008, "step": 47850 }, { "epoch": 5.155526853944678, "grad_norm": 0.678464949131012, "learning_rate": 0.0002910850123909061, "loss": 3.3218, "step": 47900 }, { "epoch": 5.160908405984286, "grad_norm": 0.7275342345237732, "learning_rate": 0.0002907617713608447, "loss": 3.3034, "step": 47950 }, { "epoch": 5.166289958023894, "grad_norm": 0.706177294254303, "learning_rate": 0.0002904385303307833, "loss": 3.3289, "step": 48000 }, { "epoch": 5.166289958023894, "eval_accuracy": 0.3814682286460948, "eval_loss": 3.4115307331085205, "eval_runtime": 209.8199, "eval_samples_per_second": 85.84, "eval_steps_per_second": 5.367, "step": 48000 }, { "epoch": 5.1716715100635025, "grad_norm": 0.6725974678993225, "learning_rate": 0.00029011528930072186, "loss": 3.3346, "step": 48050 }, { "epoch": 5.17705306210311, "grad_norm": 0.726219117641449, "learning_rate": 0.00028979204827066045, 
"loss": 3.3088, "step": 48100 }, { "epoch": 5.182434614142719, "grad_norm": 0.6821277141571045, "learning_rate": 0.00028946880724059905, "loss": 3.3257, "step": 48150 }, { "epoch": 5.187816166182327, "grad_norm": 0.695120096206665, "learning_rate": 0.00028914556621053764, "loss": 3.3328, "step": 48200 }, { "epoch": 5.193197718221935, "grad_norm": 0.694706380367279, "learning_rate": 0.00028882232518047624, "loss": 3.3343, "step": 48250 }, { "epoch": 5.198579270261543, "grad_norm": 0.7311663031578064, "learning_rate": 0.00028849908415041483, "loss": 3.3231, "step": 48300 }, { "epoch": 5.203960822301152, "grad_norm": 0.7439178228378296, "learning_rate": 0.00028817584312035337, "loss": 3.314, "step": 48350 }, { "epoch": 5.20934237434076, "grad_norm": 0.7027797698974609, "learning_rate": 0.00028785260209029197, "loss": 3.3172, "step": 48400 }, { "epoch": 5.214723926380368, "grad_norm": 0.7429949045181274, "learning_rate": 0.00028752936106023056, "loss": 3.3185, "step": 48450 }, { "epoch": 5.220105478419977, "grad_norm": 0.6877169013023376, "learning_rate": 0.00028720612003016915, "loss": 3.3372, "step": 48500 }, { "epoch": 5.225487030459584, "grad_norm": 0.6922637224197388, "learning_rate": 0.00028688287900010775, "loss": 3.3232, "step": 48550 }, { "epoch": 5.230868582499193, "grad_norm": 0.7518928647041321, "learning_rate": 0.0002865596379700463, "loss": 3.3261, "step": 48600 }, { "epoch": 5.236250134538801, "grad_norm": 0.7136549353599548, "learning_rate": 0.0002862363969399849, "loss": 3.3301, "step": 48650 }, { "epoch": 5.241631686578409, "grad_norm": 0.7302454710006714, "learning_rate": 0.0002859131559099235, "loss": 3.3319, "step": 48700 }, { "epoch": 5.247013238618018, "grad_norm": 0.6988076567649841, "learning_rate": 0.0002855899148798621, "loss": 3.3271, "step": 48750 }, { "epoch": 5.252394790657625, "grad_norm": 0.6556980609893799, "learning_rate": 0.0002852666738498006, "loss": 3.3376, "step": 48800 }, { "epoch": 5.257776342697234, "grad_norm": 
0.7035303115844727, "learning_rate": 0.00028494343281973926, "loss": 3.3302, "step": 48850 }, { "epoch": 5.2631578947368425, "grad_norm": 0.7211655974388123, "learning_rate": 0.0002846201917896778, "loss": 3.3299, "step": 48900 }, { "epoch": 5.26853944677645, "grad_norm": 0.7089269161224365, "learning_rate": 0.0002842969507596164, "loss": 3.3217, "step": 48950 }, { "epoch": 5.273920998816059, "grad_norm": 0.6631143689155579, "learning_rate": 0.000283973709729555, "loss": 3.3151, "step": 49000 }, { "epoch": 5.273920998816059, "eval_accuracy": 0.3817304080643912, "eval_loss": 3.4057440757751465, "eval_runtime": 211.9424, "eval_samples_per_second": 84.981, "eval_steps_per_second": 5.313, "step": 49000 }, { "epoch": 5.279302550855666, "grad_norm": 0.682040810585022, "learning_rate": 0.0002836504686994936, "loss": 3.3137, "step": 49050 }, { "epoch": 5.284684102895275, "grad_norm": 0.6834739446640015, "learning_rate": 0.00028332722766943213, "loss": 3.3403, "step": 49100 }, { "epoch": 5.2900656549348835, "grad_norm": 0.7186492681503296, "learning_rate": 0.0002830039866393707, "loss": 3.3335, "step": 49150 }, { "epoch": 5.295447206974491, "grad_norm": 0.6996015310287476, "learning_rate": 0.0002826807456093093, "loss": 3.3068, "step": 49200 }, { "epoch": 5.3008287590141, "grad_norm": 0.7867818474769592, "learning_rate": 0.0002823575045792479, "loss": 3.3175, "step": 49250 }, { "epoch": 5.306210311053708, "grad_norm": 0.6740609407424927, "learning_rate": 0.0002820342635491865, "loss": 3.307, "step": 49300 }, { "epoch": 5.311591863093316, "grad_norm": 0.6935590505599976, "learning_rate": 0.00028171102251912505, "loss": 3.3232, "step": 49350 }, { "epoch": 5.316973415132924, "grad_norm": 0.690018892288208, "learning_rate": 0.00028138778148906364, "loss": 3.3367, "step": 49400 }, { "epoch": 5.322354967172533, "grad_norm": 0.7360737919807434, "learning_rate": 0.00028106454045900224, "loss": 3.3176, "step": 49450 }, { "epoch": 5.327736519212141, "grad_norm": 0.6747078895568848, 
"learning_rate": 0.00028074129942894083, "loss": 3.3376, "step": 49500 }, { "epoch": 5.333118071251749, "grad_norm": 0.7133691310882568, "learning_rate": 0.0002804180583988794, "loss": 3.3254, "step": 49550 }, { "epoch": 5.338499623291357, "grad_norm": 0.8014851212501526, "learning_rate": 0.00028009481736881797, "loss": 3.321, "step": 49600 }, { "epoch": 5.343881175330965, "grad_norm": 0.7052977085113525, "learning_rate": 0.00027977157633875656, "loss": 3.3394, "step": 49650 }, { "epoch": 5.349262727370574, "grad_norm": 0.7522420883178711, "learning_rate": 0.00027944833530869516, "loss": 3.3236, "step": 49700 }, { "epoch": 5.354644279410182, "grad_norm": 0.7162737846374512, "learning_rate": 0.00027913155909923496, "loss": 3.3151, "step": 49750 }, { "epoch": 5.36002583144979, "grad_norm": 0.7829045653343201, "learning_rate": 0.00027880831806917356, "loss": 3.3139, "step": 49800 }, { "epoch": 5.365407383489399, "grad_norm": 0.6889187693595886, "learning_rate": 0.00027848507703911215, "loss": 3.3473, "step": 49850 }, { "epoch": 5.370788935529006, "grad_norm": 0.7904999256134033, "learning_rate": 0.0002781618360090507, "loss": 3.3207, "step": 49900 }, { "epoch": 5.376170487568615, "grad_norm": 0.6826810836791992, "learning_rate": 0.00027783859497898934, "loss": 3.338, "step": 49950 }, { "epoch": 5.3815520396082235, "grad_norm": 0.7005066871643066, "learning_rate": 0.0002775153539489279, "loss": 3.3528, "step": 50000 }, { "epoch": 5.3815520396082235, "eval_accuracy": 0.3818176563333062, "eval_loss": 3.402635097503662, "eval_runtime": 195.8066, "eval_samples_per_second": 91.984, "eval_steps_per_second": 5.751, "step": 50000 }, { "epoch": 5.386933591647831, "grad_norm": 0.7351107001304626, "learning_rate": 0.0002771921129188665, "loss": 3.3345, "step": 50050 }, { "epoch": 5.39231514368744, "grad_norm": 0.6939401626586914, "learning_rate": 0.00027686887188880507, "loss": 3.3426, "step": 50100 }, { "epoch": 5.397696695727047, "grad_norm": 0.7426904439926147, 
"learning_rate": 0.00027654563085874366, "loss": 3.3367, "step": 50150 }, { "epoch": 5.403078247766656, "grad_norm": 0.7072582840919495, "learning_rate": 0.00027622238982868226, "loss": 3.3355, "step": 50200 }, { "epoch": 5.4084597998062645, "grad_norm": 0.829706609249115, "learning_rate": 0.0002758991487986208, "loss": 3.3383, "step": 50250 }, { "epoch": 5.413841351845872, "grad_norm": 0.8014844655990601, "learning_rate": 0.0002755759077685594, "loss": 3.3264, "step": 50300 }, { "epoch": 5.419222903885481, "grad_norm": 0.7258954644203186, "learning_rate": 0.000275252666738498, "loss": 3.3297, "step": 50350 }, { "epoch": 5.424604455925088, "grad_norm": 0.706342339515686, "learning_rate": 0.0002749294257084366, "loss": 3.332, "step": 50400 }, { "epoch": 5.429986007964697, "grad_norm": 0.7490754723548889, "learning_rate": 0.0002746061846783751, "loss": 3.3274, "step": 50450 }, { "epoch": 5.435367560004305, "grad_norm": 0.6892290115356445, "learning_rate": 0.0002742829436483138, "loss": 3.3248, "step": 50500 }, { "epoch": 5.440749112043913, "grad_norm": 0.7274524569511414, "learning_rate": 0.0002739597026182523, "loss": 3.3216, "step": 50550 }, { "epoch": 5.446130664083522, "grad_norm": 0.7700362801551819, "learning_rate": 0.0002736364615881909, "loss": 3.3392, "step": 50600 }, { "epoch": 5.45151221612313, "grad_norm": 0.6922512650489807, "learning_rate": 0.0002733132205581295, "loss": 3.3374, "step": 50650 }, { "epoch": 5.456893768162738, "grad_norm": 0.7054064273834229, "learning_rate": 0.00027298997952806804, "loss": 3.3416, "step": 50700 }, { "epoch": 5.462275320202346, "grad_norm": 0.7377570867538452, "learning_rate": 0.00027266673849800664, "loss": 3.3335, "step": 50750 }, { "epoch": 5.467656872241955, "grad_norm": 0.7138562202453613, "learning_rate": 0.00027234349746794523, "loss": 3.3428, "step": 50800 }, { "epoch": 5.473038424281563, "grad_norm": 0.7018982172012329, "learning_rate": 0.00027202025643788383, "loss": 3.3298, "step": 50850 }, { "epoch": 
5.478419976321171, "grad_norm": 0.7368566393852234, "learning_rate": 0.00027169701540782237, "loss": 3.3449, "step": 50900 }, { "epoch": 5.483801528360779, "grad_norm": 0.6727787852287292, "learning_rate": 0.000271373774377761, "loss": 3.3402, "step": 50950 }, { "epoch": 5.489183080400387, "grad_norm": 0.7162392139434814, "learning_rate": 0.00027105053334769956, "loss": 3.3271, "step": 51000 }, { "epoch": 5.489183080400387, "eval_accuracy": 0.38230561645248107, "eval_loss": 3.398566722869873, "eval_runtime": 200.4209, "eval_samples_per_second": 89.866, "eval_steps_per_second": 5.618, "step": 51000 }, { "epoch": 5.494564632439996, "grad_norm": 0.7033275365829468, "learning_rate": 0.00027072729231763815, "loss": 3.3318, "step": 51050 }, { "epoch": 5.499946184479604, "grad_norm": 0.6969659328460693, "learning_rate": 0.00027040405128757675, "loss": 3.34, "step": 51100 }, { "epoch": 5.505327736519212, "grad_norm": 0.827999472618103, "learning_rate": 0.00027008081025751534, "loss": 3.3444, "step": 51150 }, { "epoch": 5.510709288558821, "grad_norm": 0.7825914025306702, "learning_rate": 0.00026975756922745394, "loss": 3.3355, "step": 51200 }, { "epoch": 5.516090840598428, "grad_norm": 0.7196942567825317, "learning_rate": 0.0002694343281973925, "loss": 3.3247, "step": 51250 }, { "epoch": 5.521472392638037, "grad_norm": 0.6952111721038818, "learning_rate": 0.00026911108716733107, "loss": 3.3484, "step": 51300 }, { "epoch": 5.5268539446776455, "grad_norm": 0.7286564111709595, "learning_rate": 0.00026878784613726967, "loss": 3.3394, "step": 51350 }, { "epoch": 5.532235496717253, "grad_norm": 0.7400366067886353, "learning_rate": 0.00026846460510720826, "loss": 3.3425, "step": 51400 }, { "epoch": 5.537617048756862, "grad_norm": 0.6877460479736328, "learning_rate": 0.0002681413640771468, "loss": 3.3296, "step": 51450 }, { "epoch": 5.542998600796469, "grad_norm": 0.7166876196861267, "learning_rate": 0.00026781812304708545, "loss": 3.3404, "step": 51500 }, { "epoch": 
5.548380152836078, "grad_norm": 0.7324938178062439, "learning_rate": 0.000267494882017024, "loss": 3.3422, "step": 51550 }, { "epoch": 5.553761704875686, "grad_norm": 0.7295821309089661, "learning_rate": 0.0002671716409869626, "loss": 3.3304, "step": 51600 }, { "epoch": 5.559143256915294, "grad_norm": 0.7150875926017761, "learning_rate": 0.0002668483999569012, "loss": 3.3561, "step": 51650 }, { "epoch": 5.564524808954903, "grad_norm": 0.7677608728408813, "learning_rate": 0.0002665251589268398, "loss": 3.3307, "step": 51700 }, { "epoch": 5.569906360994511, "grad_norm": 0.6857333183288574, "learning_rate": 0.0002662019178967783, "loss": 3.3412, "step": 51750 }, { "epoch": 5.575287913034119, "grad_norm": 0.6856406927108765, "learning_rate": 0.0002658786768667169, "loss": 3.3386, "step": 51800 }, { "epoch": 5.580669465073727, "grad_norm": 0.7286893129348755, "learning_rate": 0.0002655554358366555, "loss": 3.3352, "step": 51850 }, { "epoch": 5.586051017113336, "grad_norm": 0.7269234657287598, "learning_rate": 0.0002652321948065941, "loss": 3.3422, "step": 51900 }, { "epoch": 5.591432569152944, "grad_norm": 0.7077545523643494, "learning_rate": 0.0002649089537765327, "loss": 3.3353, "step": 51950 }, { "epoch": 5.596814121192552, "grad_norm": 0.7062633633613586, "learning_rate": 0.00026458571274647123, "loss": 3.3457, "step": 52000 }, { "epoch": 5.596814121192552, "eval_accuracy": 0.38252248761655877, "eval_loss": 3.3943653106689453, "eval_runtime": 202.3361, "eval_samples_per_second": 89.015, "eval_steps_per_second": 5.565, "step": 52000 }, { "epoch": 5.60219567323216, "grad_norm": 0.7586387991905212, "learning_rate": 0.0002642624717164099, "loss": 3.3286, "step": 52050 }, { "epoch": 5.607577225271768, "grad_norm": 0.6984559297561646, "learning_rate": 0.0002639392306863484, "loss": 3.3364, "step": 52100 }, { "epoch": 5.612958777311377, "grad_norm": 0.7536754608154297, "learning_rate": 0.000263615989656287, "loss": 3.3353, "step": 52150 }, { "epoch": 5.618340329350985, 
"grad_norm": 0.7495378851890564, "learning_rate": 0.0002632927486262256, "loss": 3.3361, "step": 52200 }, { "epoch": 5.623721881390593, "grad_norm": 0.684282124042511, "learning_rate": 0.0002629695075961642, "loss": 3.3353, "step": 52250 }, { "epoch": 5.629103433430201, "grad_norm": 0.7329372763633728, "learning_rate": 0.00026264626656610275, "loss": 3.3291, "step": 52300 }, { "epoch": 5.634484985469809, "grad_norm": 0.8108019232749939, "learning_rate": 0.00026232302553604134, "loss": 3.3548, "step": 52350 }, { "epoch": 5.639866537509418, "grad_norm": 0.7560662627220154, "learning_rate": 0.00026199978450597994, "loss": 3.325, "step": 52400 }, { "epoch": 5.645248089549026, "grad_norm": 0.7306268215179443, "learning_rate": 0.00026167654347591853, "loss": 3.334, "step": 52450 }, { "epoch": 5.650629641588634, "grad_norm": 0.7203580737113953, "learning_rate": 0.0002613533024458571, "loss": 3.3221, "step": 52500 }, { "epoch": 5.656011193628243, "grad_norm": 0.6746958494186401, "learning_rate": 0.00026103006141579567, "loss": 3.3364, "step": 52550 }, { "epoch": 5.66139274566785, "grad_norm": 0.7283848524093628, "learning_rate": 0.00026070682038573426, "loss": 3.3309, "step": 52600 }, { "epoch": 5.666774297707459, "grad_norm": 0.7701716423034668, "learning_rate": 0.00026038357935567286, "loss": 3.3383, "step": 52650 }, { "epoch": 5.672155849747067, "grad_norm": 0.6934614777565002, "learning_rate": 0.00026006033832561145, "loss": 3.3301, "step": 52700 }, { "epoch": 5.677537401786675, "grad_norm": 0.7119128108024597, "learning_rate": 0.00025973709729555, "loss": 3.3367, "step": 52750 }, { "epoch": 5.682918953826284, "grad_norm": 0.7460734248161316, "learning_rate": 0.00025941385626548864, "loss": 3.3561, "step": 52800 }, { "epoch": 5.688300505865891, "grad_norm": 0.7418410181999207, "learning_rate": 0.0002590906152354272, "loss": 3.324, "step": 52850 }, { "epoch": 5.6936820579055, "grad_norm": 0.7116801142692566, "learning_rate": 0.0002587673742053658, "loss": 3.3337, 
"step": 52900 }, { "epoch": 5.699063609945108, "grad_norm": 0.7011289000511169, "learning_rate": 0.00025844413317530437, "loss": 3.3404, "step": 52950 }, { "epoch": 5.704445161984716, "grad_norm": 0.7183201909065247, "learning_rate": 0.00025812089214524296, "loss": 3.3489, "step": 53000 }, { "epoch": 5.704445161984716, "eval_accuracy": 0.3831644088777667, "eval_loss": 3.3893256187438965, "eval_runtime": 213.8188, "eval_samples_per_second": 84.235, "eval_steps_per_second": 5.266, "step": 53000 }, { "epoch": 5.709826714024325, "grad_norm": 0.7784079313278198, "learning_rate": 0.00025779765111518156, "loss": 3.3336, "step": 53050 }, { "epoch": 5.715208266063933, "grad_norm": 0.7234706282615662, "learning_rate": 0.0002574744100851201, "loss": 3.3246, "step": 53100 }, { "epoch": 5.720589818103541, "grad_norm": 0.7097893953323364, "learning_rate": 0.0002571511690550587, "loss": 3.3239, "step": 53150 }, { "epoch": 5.725971370143149, "grad_norm": 0.6980921030044556, "learning_rate": 0.0002568279280249973, "loss": 3.3451, "step": 53200 }, { "epoch": 5.731352922182758, "grad_norm": 0.7374033331871033, "learning_rate": 0.0002565046869949359, "loss": 3.345, "step": 53250 }, { "epoch": 5.736734474222366, "grad_norm": 0.7064562439918518, "learning_rate": 0.0002561814459648744, "loss": 3.3339, "step": 53300 }, { "epoch": 5.742116026261974, "grad_norm": 0.7073032259941101, "learning_rate": 0.00025585820493481307, "loss": 3.3426, "step": 53350 }, { "epoch": 5.747497578301582, "grad_norm": 0.6804144382476807, "learning_rate": 0.0002555349639047516, "loss": 3.3255, "step": 53400 }, { "epoch": 5.75287913034119, "grad_norm": 0.6917299032211304, "learning_rate": 0.0002552117228746902, "loss": 3.3281, "step": 53450 }, { "epoch": 5.758260682380799, "grad_norm": 0.7830952405929565, "learning_rate": 0.0002548884818446288, "loss": 3.3367, "step": 53500 }, { "epoch": 5.763642234420407, "grad_norm": 0.6828339099884033, "learning_rate": 0.0002545652408145674, "loss": 3.3337, "step": 53550 }, { 
"epoch": 5.769023786460015, "grad_norm": 0.7331600785255432, "learning_rate": 0.00025424199978450594, "loss": 3.336, "step": 53600 }, { "epoch": 5.774405338499624, "grad_norm": 0.6974768042564392, "learning_rate": 0.00025391875875444453, "loss": 3.3284, "step": 53650 }, { "epoch": 5.779786890539231, "grad_norm": 0.7154784798622131, "learning_rate": 0.0002535955177243831, "loss": 3.3236, "step": 53700 }, { "epoch": 5.78516844257884, "grad_norm": 0.7374145984649658, "learning_rate": 0.0002532722766943217, "loss": 3.3258, "step": 53750 }, { "epoch": 5.790549994618448, "grad_norm": 0.6925607323646545, "learning_rate": 0.00025295550048486153, "loss": 3.3431, "step": 53800 }, { "epoch": 5.795931546658056, "grad_norm": 0.6958518028259277, "learning_rate": 0.0002526322594548001, "loss": 3.3338, "step": 53850 }, { "epoch": 5.801313098697665, "grad_norm": 0.6767904758453369, "learning_rate": 0.0002523090184247387, "loss": 3.3531, "step": 53900 }, { "epoch": 5.806694650737272, "grad_norm": 0.7042453289031982, "learning_rate": 0.00025198577739467726, "loss": 3.3333, "step": 53950 }, { "epoch": 5.812076202776881, "grad_norm": 0.7105914354324341, "learning_rate": 0.00025166253636461585, "loss": 3.3353, "step": 54000 }, { "epoch": 5.812076202776881, "eval_accuracy": 0.38365334687293184, "eval_loss": 3.386536121368408, "eval_runtime": 195.2212, "eval_samples_per_second": 92.259, "eval_steps_per_second": 5.768, "step": 54000 }, { "epoch": 5.817457754816489, "grad_norm": 0.7248342037200928, "learning_rate": 0.00025133929533455445, "loss": 3.3423, "step": 54050 }, { "epoch": 5.822839306856097, "grad_norm": 0.7134623527526855, "learning_rate": 0.00025101605430449304, "loss": 3.3469, "step": 54100 }, { "epoch": 5.828220858895706, "grad_norm": 0.7342662215232849, "learning_rate": 0.00025069281327443164, "loss": 3.3406, "step": 54150 }, { "epoch": 5.833602410935313, "grad_norm": 0.6971819996833801, "learning_rate": 0.0002503695722443702, "loss": 3.3296, "step": 54200 }, { "epoch": 
5.838983962974922, "grad_norm": 0.6914299130439758, "learning_rate": 0.00025004633121430877, "loss": 3.335, "step": 54250 }, { "epoch": 5.84436551501453, "grad_norm": 0.6908444762229919, "learning_rate": 0.00024972309018424737, "loss": 3.3218, "step": 54300 }, { "epoch": 5.849747067054138, "grad_norm": 0.7167321443557739, "learning_rate": 0.00024939984915418596, "loss": 3.3309, "step": 54350 }, { "epoch": 5.855128619093747, "grad_norm": 0.7270901799201965, "learning_rate": 0.0002490766081241245, "loss": 3.3447, "step": 54400 }, { "epoch": 5.860510171133355, "grad_norm": 0.7137880325317383, "learning_rate": 0.00024875336709406315, "loss": 3.3156, "step": 54450 }, { "epoch": 5.865891723172963, "grad_norm": 0.6899557709693909, "learning_rate": 0.0002484301260640017, "loss": 3.3324, "step": 54500 }, { "epoch": 5.871273275212571, "grad_norm": 0.7073144316673279, "learning_rate": 0.0002481068850339403, "loss": 3.3377, "step": 54550 }, { "epoch": 5.87665482725218, "grad_norm": 0.7121581435203552, "learning_rate": 0.0002477836440038789, "loss": 3.3358, "step": 54600 }, { "epoch": 5.882036379291788, "grad_norm": 0.7033504843711853, "learning_rate": 0.0002474604029738175, "loss": 3.3309, "step": 54650 }, { "epoch": 5.887417931331396, "grad_norm": 0.7366927862167358, "learning_rate": 0.000247137161943756, "loss": 3.3426, "step": 54700 }, { "epoch": 5.892799483371004, "grad_norm": 0.6953083276748657, "learning_rate": 0.0002468139209136946, "loss": 3.3304, "step": 54750 }, { "epoch": 5.898181035410612, "grad_norm": 0.707262396812439, "learning_rate": 0.0002464906798836332, "loss": 3.3298, "step": 54800 }, { "epoch": 5.903562587450221, "grad_norm": 0.6863567233085632, "learning_rate": 0.0002461674388535718, "loss": 3.3294, "step": 54850 }, { "epoch": 5.9089441394898286, "grad_norm": 0.7244144678115845, "learning_rate": 0.0002458441978235104, "loss": 3.3334, "step": 54900 }, { "epoch": 5.914325691529437, "grad_norm": 0.7477735877037048, "learning_rate": 0.00024552095679344893, 
"loss": 3.3464, "step": 54950 }, { "epoch": 5.919707243569046, "grad_norm": 0.7203809022903442, "learning_rate": 0.0002451977157633876, "loss": 3.3306, "step": 55000 }, { "epoch": 5.919707243569046, "eval_accuracy": 0.3839801747594778, "eval_loss": 3.3797736167907715, "eval_runtime": 195.5869, "eval_samples_per_second": 92.087, "eval_steps_per_second": 5.757, "step": 55000 }, { "epoch": 5.925088795608653, "grad_norm": 0.7410472631454468, "learning_rate": 0.0002448744747333261, "loss": 3.3449, "step": 55050 }, { "epoch": 5.930470347648262, "grad_norm": 0.6759241223335266, "learning_rate": 0.0002445512337032647, "loss": 3.3361, "step": 55100 }, { "epoch": 5.93585189968787, "grad_norm": 0.6957202553749084, "learning_rate": 0.0002442279926732033, "loss": 3.3363, "step": 55150 }, { "epoch": 5.941233451727478, "grad_norm": 0.7306824922561646, "learning_rate": 0.00024390475164314188, "loss": 3.3315, "step": 55200 }, { "epoch": 5.946615003767087, "grad_norm": 0.760296106338501, "learning_rate": 0.00024358151061308045, "loss": 3.3511, "step": 55250 }, { "epoch": 5.951996555806694, "grad_norm": 0.7307605743408203, "learning_rate": 0.00024325826958301907, "loss": 3.3263, "step": 55300 }, { "epoch": 5.957378107846303, "grad_norm": 0.6909796595573425, "learning_rate": 0.00024293502855295764, "loss": 3.3515, "step": 55350 }, { "epoch": 5.962759659885911, "grad_norm": 0.7553929686546326, "learning_rate": 0.0002426117875228962, "loss": 3.3455, "step": 55400 }, { "epoch": 5.968141211925519, "grad_norm": 0.7335132360458374, "learning_rate": 0.0002422885464928348, "loss": 3.3374, "step": 55450 }, { "epoch": 5.973522763965128, "grad_norm": 0.7116664052009583, "learning_rate": 0.0002419653054627734, "loss": 3.3307, "step": 55500 }, { "epoch": 5.978904316004736, "grad_norm": 0.7170029282569885, "learning_rate": 0.00024164206443271196, "loss": 3.355, "step": 55550 }, { "epoch": 5.984285868044344, "grad_norm": 0.6848576068878174, "learning_rate": 0.00024131882340265056, "loss": 3.3344, 
"step": 55600 }, { "epoch": 5.989667420083952, "grad_norm": 0.7043144702911377, "learning_rate": 0.00024099558237258912, "loss": 3.3398, "step": 55650 }, { "epoch": 5.995048972123561, "grad_norm": 0.7126272320747375, "learning_rate": 0.00024067234134252772, "loss": 3.3339, "step": 55700 }, { "epoch": 6.000430524163169, "grad_norm": 0.7398678064346313, "learning_rate": 0.00024035556513306752, "loss": 3.3327, "step": 55750 }, { "epoch": 6.005812076202777, "grad_norm": 0.745004415512085, "learning_rate": 0.00024003232410300615, "loss": 3.2401, "step": 55800 }, { "epoch": 6.011193628242385, "grad_norm": 0.7168341875076294, "learning_rate": 0.00023970908307294471, "loss": 3.254, "step": 55850 }, { "epoch": 6.016575180281993, "grad_norm": 0.7345452904701233, "learning_rate": 0.00023938584204288328, "loss": 3.2373, "step": 55900 }, { "epoch": 6.021956732321602, "grad_norm": 0.7234600186347961, "learning_rate": 0.00023906260101282188, "loss": 3.2584, "step": 55950 }, { "epoch": 6.0273382843612096, "grad_norm": 0.7159119248390198, "learning_rate": 0.00023873935998276047, "loss": 3.2266, "step": 56000 }, { "epoch": 6.0273382843612096, "eval_accuracy": 0.3844155468809504, "eval_loss": 3.382514715194702, "eval_runtime": 205.1809, "eval_samples_per_second": 87.781, "eval_steps_per_second": 5.488, "step": 56000 }, { "epoch": 6.032719836400818, "grad_norm": 0.6958070397377014, "learning_rate": 0.00023841611895269904, "loss": 3.2395, "step": 56050 }, { "epoch": 6.038101388440427, "grad_norm": 0.7117956280708313, "learning_rate": 0.00023809287792263763, "loss": 3.2614, "step": 56100 }, { "epoch": 6.043482940480034, "grad_norm": 0.7864623069763184, "learning_rate": 0.0002377696368925762, "loss": 3.2521, "step": 56150 }, { "epoch": 6.048864492519643, "grad_norm": 0.715238630771637, "learning_rate": 0.0002374463958625148, "loss": 3.2547, "step": 56200 }, { "epoch": 6.0542460445592505, "grad_norm": 0.6912676692008972, "learning_rate": 0.0002371231548324534, "loss": 3.253, "step": 56250 
}, { "epoch": 6.059627596598859, "grad_norm": 0.7019177675247192, "learning_rate": 0.00023679991380239196, "loss": 3.2513, "step": 56300 }, { "epoch": 6.065009148638468, "grad_norm": 0.8073034286499023, "learning_rate": 0.00023647667277233053, "loss": 3.2551, "step": 56350 }, { "epoch": 6.070390700678075, "grad_norm": 0.716173529624939, "learning_rate": 0.00023615343174226915, "loss": 3.2406, "step": 56400 }, { "epoch": 6.075772252717684, "grad_norm": 0.7097351551055908, "learning_rate": 0.00023583019071220771, "loss": 3.2585, "step": 56450 }, { "epoch": 6.081153804757292, "grad_norm": 0.7224815487861633, "learning_rate": 0.00023550694968214628, "loss": 3.2563, "step": 56500 }, { "epoch": 6.0865353567969, "grad_norm": 0.7451142072677612, "learning_rate": 0.00023518370865208488, "loss": 3.2523, "step": 56550 }, { "epoch": 6.091916908836509, "grad_norm": 0.7142341732978821, "learning_rate": 0.00023486046762202347, "loss": 3.2581, "step": 56600 }, { "epoch": 6.097298460876116, "grad_norm": 0.699760913848877, "learning_rate": 0.00023453722659196207, "loss": 3.2556, "step": 56650 }, { "epoch": 6.102680012915725, "grad_norm": 0.7422381639480591, "learning_rate": 0.00023421398556190063, "loss": 3.2764, "step": 56700 }, { "epoch": 6.108061564955333, "grad_norm": 0.8956770896911621, "learning_rate": 0.0002338907445318392, "loss": 3.2671, "step": 56750 }, { "epoch": 6.113443116994941, "grad_norm": 0.7111889123916626, "learning_rate": 0.00023356750350177782, "loss": 3.2572, "step": 56800 }, { "epoch": 6.11882466903455, "grad_norm": 0.9345306158065796, "learning_rate": 0.0002332442624717164, "loss": 3.2521, "step": 56850 }, { "epoch": 6.124206221074158, "grad_norm": 0.7105541825294495, "learning_rate": 0.00023292102144165496, "loss": 3.2668, "step": 56900 }, { "epoch": 6.129587773113766, "grad_norm": 0.7327850461006165, "learning_rate": 0.00023259778041159358, "loss": 3.2601, "step": 56950 }, { "epoch": 6.134969325153374, "grad_norm": 0.7219511866569519, "learning_rate": 
0.00023227453938153215, "loss": 3.2555, "step": 57000 }, { "epoch": 6.134969325153374, "eval_accuracy": 0.38421410442694237, "eval_loss": 3.3847293853759766, "eval_runtime": 196.0722, "eval_samples_per_second": 91.859, "eval_steps_per_second": 5.743, "step": 57000 }, { "epoch": 6.140350877192983, "grad_norm": 0.7666511535644531, "learning_rate": 0.00023195129835147071, "loss": 3.2573, "step": 57050 }, { "epoch": 6.1457324292325906, "grad_norm": 0.8009618520736694, "learning_rate": 0.0002316280573214093, "loss": 3.2679, "step": 57100 }, { "epoch": 6.151113981272199, "grad_norm": 0.6945069432258606, "learning_rate": 0.0002313048162913479, "loss": 3.2667, "step": 57150 }, { "epoch": 6.156495533311807, "grad_norm": 0.750709593296051, "learning_rate": 0.00023098157526128647, "loss": 3.2475, "step": 57200 }, { "epoch": 6.161877085351415, "grad_norm": 0.7243725657463074, "learning_rate": 0.00023065833423122507, "loss": 3.2824, "step": 57250 }, { "epoch": 6.167258637391024, "grad_norm": 0.7340604066848755, "learning_rate": 0.00023033509320116363, "loss": 3.2589, "step": 57300 }, { "epoch": 6.1726401894306315, "grad_norm": 0.7163415551185608, "learning_rate": 0.00023001185217110223, "loss": 3.2712, "step": 57350 }, { "epoch": 6.17802174147024, "grad_norm": 0.7414380311965942, "learning_rate": 0.00022968861114104082, "loss": 3.2653, "step": 57400 }, { "epoch": 6.183403293509849, "grad_norm": 0.7300434112548828, "learning_rate": 0.0002293653701109794, "loss": 3.274, "step": 57450 }, { "epoch": 6.188784845549456, "grad_norm": 0.7358796000480652, "learning_rate": 0.00022904212908091796, "loss": 3.2541, "step": 57500 }, { "epoch": 6.194166397589065, "grad_norm": 0.7499557137489319, "learning_rate": 0.00022871888805085658, "loss": 3.2607, "step": 57550 }, { "epoch": 6.1995479496286725, "grad_norm": 0.77440345287323, "learning_rate": 0.00022839564702079515, "loss": 3.2597, "step": 57600 }, { "epoch": 6.204929501668281, "grad_norm": 0.759096622467041, "learning_rate": 
0.00022807240599073374, "loss": 3.2771, "step": 57650 }, { "epoch": 6.21031105370789, "grad_norm": 0.7167288064956665, "learning_rate": 0.00022774916496067234, "loss": 3.2498, "step": 57700 }, { "epoch": 6.215692605747497, "grad_norm": 0.7278882265090942, "learning_rate": 0.0002274259239306109, "loss": 3.2825, "step": 57750 }, { "epoch": 6.221074157787106, "grad_norm": 0.7202818989753723, "learning_rate": 0.0002271026829005495, "loss": 3.2604, "step": 57800 }, { "epoch": 6.226455709826714, "grad_norm": 0.7370897531509399, "learning_rate": 0.00022677944187048807, "loss": 3.2528, "step": 57850 }, { "epoch": 6.231837261866322, "grad_norm": 0.7153465151786804, "learning_rate": 0.00022645620084042666, "loss": 3.2778, "step": 57900 }, { "epoch": 6.237218813905931, "grad_norm": 0.7415947914123535, "learning_rate": 0.00022613295981036526, "loss": 3.2753, "step": 57950 }, { "epoch": 6.242600365945538, "grad_norm": 0.734861433506012, "learning_rate": 0.00022580971878030382, "loss": 3.2849, "step": 58000 }, { "epoch": 6.242600365945538, "eval_accuracy": 0.3847145596282159, "eval_loss": 3.380995988845825, "eval_runtime": 200.7754, "eval_samples_per_second": 89.707, "eval_steps_per_second": 5.608, "step": 58000 }, { "epoch": 6.247981917985147, "grad_norm": 0.7297642827033997, "learning_rate": 0.00022549294257084366, "loss": 3.2695, "step": 58050 }, { "epoch": 6.253363470024755, "grad_norm": 0.7514773011207581, "learning_rate": 0.00022516970154078222, "loss": 3.2576, "step": 58100 }, { "epoch": 6.258745022064363, "grad_norm": 0.7224261164665222, "learning_rate": 0.0002248464605107208, "loss": 3.2726, "step": 58150 }, { "epoch": 6.264126574103972, "grad_norm": 0.7479972839355469, "learning_rate": 0.0002245232194806594, "loss": 3.2817, "step": 58200 }, { "epoch": 6.26950812614358, "grad_norm": 0.7621592879295349, "learning_rate": 0.00022419997845059798, "loss": 3.2763, "step": 58250 }, { "epoch": 6.274889678183188, "grad_norm": 0.7210646271705627, "learning_rate": 
0.00022387673742053655, "loss": 3.277, "step": 58300 }, { "epoch": 6.280271230222796, "grad_norm": 0.7708672881126404, "learning_rate": 0.00022355349639047514, "loss": 3.2824, "step": 58350 }, { "epoch": 6.285652782262405, "grad_norm": 0.7061500549316406, "learning_rate": 0.0002232302553604137, "loss": 3.2647, "step": 58400 }, { "epoch": 6.2910343343020125, "grad_norm": 0.7927210927009583, "learning_rate": 0.00022290701433035233, "loss": 3.2727, "step": 58450 }, { "epoch": 6.296415886341621, "grad_norm": 0.7816336154937744, "learning_rate": 0.0002225837733002909, "loss": 3.2825, "step": 58500 }, { "epoch": 6.301797438381229, "grad_norm": 0.7397032380104065, "learning_rate": 0.00022226053227022947, "loss": 3.2754, "step": 58550 }, { "epoch": 6.307178990420837, "grad_norm": 0.7220003008842468, "learning_rate": 0.0002219372912401681, "loss": 3.2776, "step": 58600 }, { "epoch": 6.312560542460446, "grad_norm": 0.7121575474739075, "learning_rate": 0.00022161405021010666, "loss": 3.2601, "step": 58650 }, { "epoch": 6.3179420945000535, "grad_norm": 0.7781762480735779, "learning_rate": 0.00022129080918004523, "loss": 3.2796, "step": 58700 }, { "epoch": 6.323323646539662, "grad_norm": 0.7582905888557434, "learning_rate": 0.00022096756814998382, "loss": 3.2895, "step": 58750 }, { "epoch": 6.328705198579271, "grad_norm": 0.7163707613945007, "learning_rate": 0.00022064432711992241, "loss": 3.2841, "step": 58800 }, { "epoch": 6.334086750618878, "grad_norm": 0.7681860327720642, "learning_rate": 0.00022032108608986098, "loss": 3.2909, "step": 58850 }, { "epoch": 6.339468302658487, "grad_norm": 0.7766373157501221, "learning_rate": 0.00021999784505979958, "loss": 3.2916, "step": 58900 }, { "epoch": 6.344849854698095, "grad_norm": 0.739279568195343, "learning_rate": 0.00021967460402973814, "loss": 3.2823, "step": 58950 }, { "epoch": 6.350231406737703, "grad_norm": 0.7410812377929688, "learning_rate": 0.00021935136299967674, "loss": 3.2801, "step": 59000 }, { "epoch": 
6.350231406737703, "eval_accuracy": 0.38521621001125533, "eval_loss": 3.3765883445739746, "eval_runtime": 194.9686, "eval_samples_per_second": 92.379, "eval_steps_per_second": 5.775, "step": 59000 }, { "epoch": 6.355612958777312, "grad_norm": 0.7452576756477356, "learning_rate": 0.00021902812196961533, "loss": 3.2732, "step": 59050 }, { "epoch": 6.360994510816919, "grad_norm": 0.7552573680877686, "learning_rate": 0.0002187048809395539, "loss": 3.266, "step": 59100 }, { "epoch": 6.366376062856528, "grad_norm": 0.7315678596496582, "learning_rate": 0.00021838163990949247, "loss": 3.2772, "step": 59150 }, { "epoch": 6.371757614896136, "grad_norm": 0.7016833424568176, "learning_rate": 0.0002180583988794311, "loss": 3.2874, "step": 59200 }, { "epoch": 6.377139166935744, "grad_norm": 0.7350359559059143, "learning_rate": 0.00021773515784936966, "loss": 3.2959, "step": 59250 }, { "epoch": 6.382520718975353, "grad_norm": 0.7227166891098022, "learning_rate": 0.00021741191681930823, "loss": 3.281, "step": 59300 }, { "epoch": 6.387902271014961, "grad_norm": 0.7294051051139832, "learning_rate": 0.00021708867578924685, "loss": 3.298, "step": 59350 }, { "epoch": 6.393283823054569, "grad_norm": 0.718024730682373, "learning_rate": 0.00021676543475918541, "loss": 3.2599, "step": 59400 }, { "epoch": 6.398665375094177, "grad_norm": 0.7561959624290466, "learning_rate": 0.000216442193729124, "loss": 3.2947, "step": 59450 }, { "epoch": 6.404046927133785, "grad_norm": 0.7592124938964844, "learning_rate": 0.00021611895269906258, "loss": 3.2771, "step": 59500 }, { "epoch": 6.4094284791733935, "grad_norm": 0.750074565410614, "learning_rate": 0.00021579571166900117, "loss": 3.26, "step": 59550 }, { "epoch": 6.414810031213002, "grad_norm": 0.7350614666938782, "learning_rate": 0.00021547247063893977, "loss": 3.2863, "step": 59600 }, { "epoch": 6.42019158325261, "grad_norm": 0.7412868738174438, "learning_rate": 0.00021514922960887833, "loss": 3.269, "step": 59650 }, { "epoch": 6.425573135292218, 
"grad_norm": 0.7642093300819397, "learning_rate": 0.0002148259885788169, "loss": 3.2749, "step": 59700 }, { "epoch": 6.430954687331827, "grad_norm": 0.7282441258430481, "learning_rate": 0.00021450274754875552, "loss": 3.269, "step": 59750 }, { "epoch": 6.4363362393714345, "grad_norm": 0.7450237274169922, "learning_rate": 0.0002141795065186941, "loss": 3.2695, "step": 59800 }, { "epoch": 6.441717791411043, "grad_norm": 0.7366235852241516, "learning_rate": 0.00021385626548863266, "loss": 3.2876, "step": 59850 }, { "epoch": 6.447099343450651, "grad_norm": 0.7377769947052002, "learning_rate": 0.00021353302445857128, "loss": 3.2793, "step": 59900 }, { "epoch": 6.452480895490259, "grad_norm": 0.771925151348114, "learning_rate": 0.00021320978342850985, "loss": 3.2807, "step": 59950 }, { "epoch": 6.457862447529868, "grad_norm": 0.7846429944038391, "learning_rate": 0.00021288654239844842, "loss": 3.2773, "step": 60000 }, { "epoch": 6.457862447529868, "eval_accuracy": 0.385375712450567, "eval_loss": 3.3726203441619873, "eval_runtime": 198.7032, "eval_samples_per_second": 90.643, "eval_steps_per_second": 5.667, "step": 60000 }, { "epoch": 6.4632439995694755, "grad_norm": 0.7637952566146851, "learning_rate": 0.000212563301368387, "loss": 3.2915, "step": 60050 }, { "epoch": 6.468625551609084, "grad_norm": 0.7368430495262146, "learning_rate": 0.00021224006033832558, "loss": 3.2963, "step": 60100 }, { "epoch": 6.474007103648693, "grad_norm": 0.7520938515663147, "learning_rate": 0.0002119232841288654, "loss": 3.2928, "step": 60150 }, { "epoch": 6.4793886556883, "grad_norm": 0.739131510257721, "learning_rate": 0.00021160004309880398, "loss": 3.2874, "step": 60200 }, { "epoch": 6.484770207727909, "grad_norm": 0.7567487359046936, "learning_rate": 0.0002112768020687426, "loss": 3.2851, "step": 60250 }, { "epoch": 6.490151759767517, "grad_norm": 0.7027845978736877, "learning_rate": 0.00021095356103868117, "loss": 3.2906, "step": 60300 }, { "epoch": 6.495533311807125, "grad_norm": 
0.7968448400497437, "learning_rate": 0.00021063032000861974, "loss": 3.2912, "step": 60350 }, { "epoch": 6.500914863846734, "grad_norm": 0.7443140745162964, "learning_rate": 0.00021030707897855833, "loss": 3.2914, "step": 60400 }, { "epoch": 6.506296415886341, "grad_norm": 0.7305548787117004, "learning_rate": 0.00020998383794849692, "loss": 3.2959, "step": 60450 }, { "epoch": 6.51167796792595, "grad_norm": 0.7129182815551758, "learning_rate": 0.0002096605969184355, "loss": 3.275, "step": 60500 }, { "epoch": 6.517059519965558, "grad_norm": 0.697638213634491, "learning_rate": 0.0002093373558883741, "loss": 3.3068, "step": 60550 }, { "epoch": 6.522441072005166, "grad_norm": 0.7496232986450195, "learning_rate": 0.00020901411485831265, "loss": 3.2661, "step": 60600 }, { "epoch": 6.5278226240447745, "grad_norm": 0.800989031791687, "learning_rate": 0.00020869087382825125, "loss": 3.2937, "step": 60650 }, { "epoch": 6.533204176084383, "grad_norm": 0.7812984585762024, "learning_rate": 0.00020836763279818984, "loss": 3.2756, "step": 60700 }, { "epoch": 6.538585728123991, "grad_norm": 0.7700937986373901, "learning_rate": 0.0002080443917681284, "loss": 3.2949, "step": 60750 }, { "epoch": 6.543967280163599, "grad_norm": 0.7460906505584717, "learning_rate": 0.00020772115073806698, "loss": 3.2751, "step": 60800 }, { "epoch": 6.549348832203208, "grad_norm": 0.7481933832168579, "learning_rate": 0.0002073979097080056, "loss": 3.2899, "step": 60850 }, { "epoch": 6.5547303842428155, "grad_norm": 0.7331646680831909, "learning_rate": 0.00020707466867794417, "loss": 3.306, "step": 60900 }, { "epoch": 6.560111936282424, "grad_norm": 0.7299278974533081, "learning_rate": 0.00020675142764788274, "loss": 3.2903, "step": 60950 }, { "epoch": 6.565493488322032, "grad_norm": 0.7748175859451294, "learning_rate": 0.00020642818661782136, "loss": 3.3014, "step": 61000 }, { "epoch": 6.565493488322032, "eval_accuracy": 0.38595026892133, "eval_loss": 3.3667492866516113, "eval_runtime": 196.5736, 
"eval_samples_per_second": 91.625, "eval_steps_per_second": 5.728, "step": 61000 }, { "epoch": 6.57087504036164, "grad_norm": 0.7791305184364319, "learning_rate": 0.00020610494558775993, "loss": 3.2782, "step": 61050 }, { "epoch": 6.576256592401249, "grad_norm": 0.7687615752220154, "learning_rate": 0.0002057817045576985, "loss": 3.292, "step": 61100 }, { "epoch": 6.5816381444408565, "grad_norm": 0.7428541779518127, "learning_rate": 0.0002054584635276371, "loss": 3.2634, "step": 61150 }, { "epoch": 6.587019696480465, "grad_norm": 0.7501689195632935, "learning_rate": 0.00020513522249757568, "loss": 3.2993, "step": 61200 }, { "epoch": 6.592401248520073, "grad_norm": 0.7482814192771912, "learning_rate": 0.00020481198146751428, "loss": 3.2895, "step": 61250 }, { "epoch": 6.597782800559681, "grad_norm": 0.7531591653823853, "learning_rate": 0.00020448874043745284, "loss": 3.2832, "step": 61300 }, { "epoch": 6.60316435259929, "grad_norm": 0.8052058815956116, "learning_rate": 0.0002041654994073914, "loss": 3.3006, "step": 61350 }, { "epoch": 6.608545904638898, "grad_norm": 0.7585821151733398, "learning_rate": 0.00020384225837733003, "loss": 3.2867, "step": 61400 }, { "epoch": 6.613927456678506, "grad_norm": 0.7599231004714966, "learning_rate": 0.0002035190173472686, "loss": 3.2948, "step": 61450 }, { "epoch": 6.619309008718115, "grad_norm": 0.7827469110488892, "learning_rate": 0.00020319577631720717, "loss": 3.2863, "step": 61500 }, { "epoch": 6.624690560757722, "grad_norm": 0.7314283847808838, "learning_rate": 0.00020287253528714576, "loss": 3.2803, "step": 61550 }, { "epoch": 6.630072112797331, "grad_norm": 0.7610256671905518, "learning_rate": 0.00020254929425708436, "loss": 3.2964, "step": 61600 }, { "epoch": 6.635453664836939, "grad_norm": 0.7744382619857788, "learning_rate": 0.00020222605322702293, "loss": 3.3023, "step": 61650 }, { "epoch": 6.640835216876547, "grad_norm": 0.7310662269592285, "learning_rate": 0.00020190281219696152, "loss": 3.2774, "step": 61700 }, { 
"epoch": 6.6462167689161555, "grad_norm": 0.7460449934005737, "learning_rate": 0.0002015795711669001, "loss": 3.2858, "step": 61750 }, { "epoch": 6.651598320955763, "grad_norm": 0.790534496307373, "learning_rate": 0.00020125633013683868, "loss": 3.2875, "step": 61800 }, { "epoch": 6.656979872995372, "grad_norm": 0.7475619316101074, "learning_rate": 0.00020093308910677728, "loss": 3.2868, "step": 61850 }, { "epoch": 6.66236142503498, "grad_norm": 0.7804775834083557, "learning_rate": 0.00020060984807671584, "loss": 3.2854, "step": 61900 }, { "epoch": 6.667742977074588, "grad_norm": 0.7852612733840942, "learning_rate": 0.0002002866070466544, "loss": 3.2824, "step": 61950 }, { "epoch": 6.6731245291141965, "grad_norm": 0.7747927904129028, "learning_rate": 0.00019996336601659303, "loss": 3.2774, "step": 62000 }, { "epoch": 6.6731245291141965, "eval_accuracy": 0.3861736592586766, "eval_loss": 3.363661050796509, "eval_runtime": 194.5926, "eval_samples_per_second": 92.557, "eval_steps_per_second": 5.786, "step": 62000 }, { "epoch": 6.678506081153805, "grad_norm": 0.7766702175140381, "learning_rate": 0.0001996401249865316, "loss": 3.2727, "step": 62050 }, { "epoch": 6.683887633193413, "grad_norm": 0.759154200553894, "learning_rate": 0.00019931688395647017, "loss": 3.2908, "step": 62100 }, { "epoch": 6.689269185233021, "grad_norm": 0.7269113063812256, "learning_rate": 0.0001989936429264088, "loss": 3.2694, "step": 62150 }, { "epoch": 6.69465073727263, "grad_norm": 0.7533604502677917, "learning_rate": 0.00019867040189634736, "loss": 3.2954, "step": 62200 }, { "epoch": 6.7000322893122375, "grad_norm": 0.7951676845550537, "learning_rate": 0.00019834716086628595, "loss": 3.291, "step": 62250 }, { "epoch": 6.705413841351846, "grad_norm": 0.7553754448890686, "learning_rate": 0.00019802391983622452, "loss": 3.2993, "step": 62300 }, { "epoch": 6.710795393391454, "grad_norm": 0.7384345531463623, "learning_rate": 0.00019770714362676435, "loss": 3.3098, "step": 62350 }, { "epoch": 
6.716176945431062, "grad_norm": 0.7837821841239929, "learning_rate": 0.00019738390259670292, "loss": 3.2873, "step": 62400 }, { "epoch": 6.721558497470671, "grad_norm": 0.8101162910461426, "learning_rate": 0.0001970606615666415, "loss": 3.2793, "step": 62450 }, { "epoch": 6.7269400495102785, "grad_norm": 0.7428802847862244, "learning_rate": 0.0001967374205365801, "loss": 3.2889, "step": 62500 }, { "epoch": 6.732321601549887, "grad_norm": 0.7610371708869934, "learning_rate": 0.00019641417950651868, "loss": 3.2868, "step": 62550 }, { "epoch": 6.737703153589496, "grad_norm": 0.7788252830505371, "learning_rate": 0.00019609093847645725, "loss": 3.2811, "step": 62600 }, { "epoch": 6.743084705629103, "grad_norm": 0.7891542911529541, "learning_rate": 0.00019576769744639587, "loss": 3.2871, "step": 62650 }, { "epoch": 6.748466257668712, "grad_norm": 0.7551553845405579, "learning_rate": 0.00019544445641633444, "loss": 3.2824, "step": 62700 }, { "epoch": 6.75384780970832, "grad_norm": 0.7407161593437195, "learning_rate": 0.000195121215386273, "loss": 3.2785, "step": 62750 }, { "epoch": 6.759229361747928, "grad_norm": 0.7767391204833984, "learning_rate": 0.0001947979743562116, "loss": 3.2957, "step": 62800 }, { "epoch": 6.7646109137875365, "grad_norm": 0.7716450691223145, "learning_rate": 0.00019447473332615017, "loss": 3.2908, "step": 62850 }, { "epoch": 6.769992465827144, "grad_norm": 0.7589283585548401, "learning_rate": 0.00019415149229608876, "loss": 3.2708, "step": 62900 }, { "epoch": 6.775374017866753, "grad_norm": 0.7275950312614441, "learning_rate": 0.00019382825126602735, "loss": 3.283, "step": 62950 }, { "epoch": 6.780755569906361, "grad_norm": 0.7468757629394531, "learning_rate": 0.00019350501023596592, "loss": 3.2826, "step": 63000 }, { "epoch": 6.780755569906361, "eval_accuracy": 0.38676788190213396, "eval_loss": 3.3597800731658936, "eval_runtime": 200.2482, "eval_samples_per_second": 89.943, "eval_steps_per_second": 5.623, "step": 63000 }, { "epoch": 
6.786137121945969, "grad_norm": 0.745414137840271, "learning_rate": 0.00019318176920590454, "loss": 3.273, "step": 63050 }, { "epoch": 6.7915186739855775, "grad_norm": 0.756697952747345, "learning_rate": 0.0001928585281758431, "loss": 3.2861, "step": 63100 }, { "epoch": 6.796900226025185, "grad_norm": 0.7485399842262268, "learning_rate": 0.00019253528714578168, "loss": 3.2847, "step": 63150 }, { "epoch": 6.802281778064794, "grad_norm": 0.7797415256500244, "learning_rate": 0.00019221204611572027, "loss": 3.2908, "step": 63200 }, { "epoch": 6.807663330104402, "grad_norm": 0.818111002445221, "learning_rate": 0.00019188880508565887, "loss": 3.2909, "step": 63250 }, { "epoch": 6.813044882144011, "grad_norm": 0.7770239114761353, "learning_rate": 0.00019156556405559744, "loss": 3.2904, "step": 63300 }, { "epoch": 6.8184264341836185, "grad_norm": 0.7970494627952576, "learning_rate": 0.00019124232302553603, "loss": 3.2932, "step": 63350 }, { "epoch": 6.823807986223227, "grad_norm": 0.7903392910957336, "learning_rate": 0.0001909190819954746, "loss": 3.2974, "step": 63400 }, { "epoch": 6.829189538262835, "grad_norm": 0.8986805081367493, "learning_rate": 0.0001905958409654132, "loss": 3.2837, "step": 63450 }, { "epoch": 6.834571090302443, "grad_norm": 0.7686243057250977, "learning_rate": 0.0001902725999353518, "loss": 3.2957, "step": 63500 }, { "epoch": 6.839952642342052, "grad_norm": 0.7945030927658081, "learning_rate": 0.00018994935890529036, "loss": 3.275, "step": 63550 }, { "epoch": 6.8453341943816595, "grad_norm": 0.8145069479942322, "learning_rate": 0.00018962611787522892, "loss": 3.287, "step": 63600 }, { "epoch": 6.850715746421268, "grad_norm": 0.7765173316001892, "learning_rate": 0.00018930287684516754, "loss": 3.2964, "step": 63650 }, { "epoch": 6.856097298460876, "grad_norm": 0.8863644599914551, "learning_rate": 0.0001889796358151061, "loss": 3.2674, "step": 63700 }, { "epoch": 6.861478850500484, "grad_norm": 0.7589664459228516, "learning_rate": 
0.00018865639478504468, "loss": 3.2718, "step": 63750 }, { "epoch": 6.866860402540093, "grad_norm": 0.7652623057365417, "learning_rate": 0.0001883331537549833, "loss": 3.2736, "step": 63800 }, { "epoch": 6.8722419545797, "grad_norm": 0.7880659103393555, "learning_rate": 0.00018800991272492187, "loss": 3.2994, "step": 63850 }, { "epoch": 6.877623506619309, "grad_norm": 0.7556933760643005, "learning_rate": 0.00018768667169486044, "loss": 3.2908, "step": 63900 }, { "epoch": 6.8830050586589175, "grad_norm": 0.778171956539154, "learning_rate": 0.00018736343066479903, "loss": 3.2839, "step": 63950 }, { "epoch": 6.888386610698525, "grad_norm": 0.7967923879623413, "learning_rate": 0.00018704018963473763, "loss": 3.2734, "step": 64000 }, { "epoch": 6.888386610698525, "eval_accuracy": 0.38728550425968217, "eval_loss": 3.3551573753356934, "eval_runtime": 203.474, "eval_samples_per_second": 88.517, "eval_steps_per_second": 5.534, "step": 64000 }, { "epoch": 6.893768162738134, "grad_norm": 0.761313796043396, "learning_rate": 0.00018671694860467622, "loss": 3.2713, "step": 64050 }, { "epoch": 6.899149714777742, "grad_norm": 0.7254140377044678, "learning_rate": 0.0001863937075746148, "loss": 3.2736, "step": 64100 }, { "epoch": 6.90453126681735, "grad_norm": 0.8009861707687378, "learning_rate": 0.00018607046654455336, "loss": 3.2954, "step": 64150 }, { "epoch": 6.9099128188569585, "grad_norm": 0.7890373468399048, "learning_rate": 0.00018574722551449198, "loss": 3.2827, "step": 64200 }, { "epoch": 6.915294370896566, "grad_norm": 0.7626961469650269, "learning_rate": 0.00018542398448443054, "loss": 3.2909, "step": 64250 }, { "epoch": 6.920675922936175, "grad_norm": 0.8804692029953003, "learning_rate": 0.0001851007434543691, "loss": 3.289, "step": 64300 }, { "epoch": 6.926057474975783, "grad_norm": 0.7699129581451416, "learning_rate": 0.00018477750242430773, "loss": 3.2924, "step": 64350 }, { "epoch": 6.931439027015391, "grad_norm": 0.773622989654541, "learning_rate": 
0.0001844542613942463, "loss": 3.2788, "step": 64400 }, { "epoch": 6.9368205790549995, "grad_norm": 0.7274935245513916, "learning_rate": 0.00018413102036418487, "loss": 3.2926, "step": 64450 }, { "epoch": 6.942202131094608, "grad_norm": 0.7527948021888733, "learning_rate": 0.00018380777933412346, "loss": 3.2984, "step": 64500 }, { "epoch": 6.947583683134216, "grad_norm": 0.8110983371734619, "learning_rate": 0.00018348453830406206, "loss": 3.2862, "step": 64550 }, { "epoch": 6.952965235173824, "grad_norm": 0.7983884215354919, "learning_rate": 0.00018316129727400063, "loss": 3.286, "step": 64600 }, { "epoch": 6.958346787213433, "grad_norm": 0.772709310054779, "learning_rate": 0.00018283805624393922, "loss": 3.3001, "step": 64650 }, { "epoch": 6.9637283392530405, "grad_norm": 0.7958887815475464, "learning_rate": 0.0001825148152138778, "loss": 3.2976, "step": 64700 }, { "epoch": 6.969109891292649, "grad_norm": 0.7898955941200256, "learning_rate": 0.00018219157418381636, "loss": 3.2906, "step": 64750 }, { "epoch": 6.974491443332257, "grad_norm": 0.7751755118370056, "learning_rate": 0.00018186833315375498, "loss": 3.2859, "step": 64800 }, { "epoch": 6.979872995371865, "grad_norm": 0.7762618660926819, "learning_rate": 0.00018154509212369355, "loss": 3.2973, "step": 64850 }, { "epoch": 6.985254547411474, "grad_norm": 0.7949537038803101, "learning_rate": 0.0001812218510936321, "loss": 3.2853, "step": 64900 }, { "epoch": 6.990636099451081, "grad_norm": 0.7892162799835205, "learning_rate": 0.00018089861006357073, "loss": 3.2872, "step": 64950 }, { "epoch": 6.99601765149069, "grad_norm": 0.7522981762886047, "learning_rate": 0.0001805753690335093, "loss": 3.2676, "step": 65000 }, { "epoch": 6.99601765149069, "eval_accuracy": 0.3876287387322881, "eval_loss": 3.352322578430176, "eval_runtime": 215.3455, "eval_samples_per_second": 83.638, "eval_steps_per_second": 5.229, "step": 65000 }, { "epoch": 7.0013992035302985, "grad_norm": 0.7797788381576538, "learning_rate": 
0.0001802521280034479, "loss": 3.2643, "step": 65050 }, { "epoch": 7.006780755569906, "grad_norm": 0.7758247256278992, "learning_rate": 0.00017992888697338646, "loss": 3.2046, "step": 65100 }, { "epoch": 7.012162307609515, "grad_norm": 0.8069040775299072, "learning_rate": 0.00017960564594332506, "loss": 3.2005, "step": 65150 }, { "epoch": 7.017543859649122, "grad_norm": 0.7784696817398071, "learning_rate": 0.00017928240491326365, "loss": 3.2059, "step": 65200 }, { "epoch": 7.022925411688731, "grad_norm": 0.7886298298835754, "learning_rate": 0.00017895916388320222, "loss": 3.2059, "step": 65250 }, { "epoch": 7.0283069637283395, "grad_norm": 0.810814380645752, "learning_rate": 0.0001786359228531408, "loss": 3.2004, "step": 65300 }, { "epoch": 7.033688515767947, "grad_norm": 0.8277598023414612, "learning_rate": 0.0001783126818230794, "loss": 3.1962, "step": 65350 }, { "epoch": 7.039070067807556, "grad_norm": 0.7609949707984924, "learning_rate": 0.00017798944079301798, "loss": 3.2008, "step": 65400 }, { "epoch": 7.044451619847164, "grad_norm": 0.7364971041679382, "learning_rate": 0.00017766619976295655, "loss": 3.2008, "step": 65450 }, { "epoch": 7.049833171886772, "grad_norm": 0.7349782586097717, "learning_rate": 0.00017734295873289517, "loss": 3.1923, "step": 65500 }, { "epoch": 7.0552147239263805, "grad_norm": 0.8025264739990234, "learning_rate": 0.00017702618252343495, "loss": 3.2097, "step": 65550 }, { "epoch": 7.060596275965988, "grad_norm": 0.7878516316413879, "learning_rate": 0.00017670294149337354, "loss": 3.1973, "step": 65600 }, { "epoch": 7.065977828005597, "grad_norm": 0.7897598147392273, "learning_rate": 0.00017637970046331214, "loss": 3.196, "step": 65650 }, { "epoch": 7.071359380045205, "grad_norm": 0.7731457352638245, "learning_rate": 0.0001760564594332507, "loss": 3.2343, "step": 65700 }, { "epoch": 7.076740932084813, "grad_norm": 1.2706592082977295, "learning_rate": 0.0001757332184031893, "loss": 3.2107, "step": 65750 }, { "epoch": 
7.0821224841244215, "grad_norm": 0.8142618536949158, "learning_rate": 0.00017540997737312787, "loss": 3.2197, "step": 65800 }, { "epoch": 7.08750403616403, "grad_norm": 0.7961957454681396, "learning_rate": 0.0001750867363430665, "loss": 3.2175, "step": 65850 }, { "epoch": 7.092885588203638, "grad_norm": 0.8617919683456421, "learning_rate": 0.00017476349531300506, "loss": 3.2236, "step": 65900 }, { "epoch": 7.098267140243246, "grad_norm": 0.7776252031326294, "learning_rate": 0.00017444025428294362, "loss": 3.2133, "step": 65950 }, { "epoch": 7.103648692282855, "grad_norm": 0.7629320621490479, "learning_rate": 0.00017411701325288224, "loss": 3.2141, "step": 66000 }, { "epoch": 7.103648692282855, "eval_accuracy": 0.3874447893932182, "eval_loss": 3.357611894607544, "eval_runtime": 211.7423, "eval_samples_per_second": 85.061, "eval_steps_per_second": 5.318, "step": 66000 }, { "epoch": 7.109030244322462, "grad_norm": 0.8089470863342285, "learning_rate": 0.0001737937722228208, "loss": 3.2077, "step": 66050 }, { "epoch": 7.114411796362071, "grad_norm": 0.7836511135101318, "learning_rate": 0.00017347053119275938, "loss": 3.2124, "step": 66100 }, { "epoch": 7.119793348401679, "grad_norm": 0.7699388861656189, "learning_rate": 0.00017314729016269797, "loss": 3.2208, "step": 66150 }, { "epoch": 7.125174900441287, "grad_norm": 0.7971923351287842, "learning_rate": 0.00017282404913263654, "loss": 3.2245, "step": 66200 }, { "epoch": 7.130556452480896, "grad_norm": 0.8028796911239624, "learning_rate": 0.00017250080810257514, "loss": 3.2306, "step": 66250 }, { "epoch": 7.135938004520503, "grad_norm": 0.7812018394470215, "learning_rate": 0.00017217756707251373, "loss": 3.1989, "step": 66300 }, { "epoch": 7.141319556560112, "grad_norm": 0.7548637986183167, "learning_rate": 0.0001718543260424523, "loss": 3.211, "step": 66350 }, { "epoch": 7.1467011085997205, "grad_norm": 0.8434266448020935, "learning_rate": 0.00017153108501239087, "loss": 3.2338, "step": 66400 }, { "epoch": 
7.152082660639328, "grad_norm": 0.8046761155128479, "learning_rate": 0.0001712078439823295, "loss": 3.2238, "step": 66450 }, { "epoch": 7.157464212678937, "grad_norm": 0.7850152850151062, "learning_rate": 0.00017088460295226806, "loss": 3.2273, "step": 66500 }, { "epoch": 7.162845764718545, "grad_norm": 0.8294812440872192, "learning_rate": 0.00017056136192220662, "loss": 3.2112, "step": 66550 }, { "epoch": 7.168227316758153, "grad_norm": 0.744547426700592, "learning_rate": 0.00017023812089214524, "loss": 3.2101, "step": 66600 }, { "epoch": 7.1736088687977615, "grad_norm": 0.8168179988861084, "learning_rate": 0.0001699148798620838, "loss": 3.2147, "step": 66650 }, { "epoch": 7.178990420837369, "grad_norm": 0.8229835033416748, "learning_rate": 0.00016959163883202238, "loss": 3.2302, "step": 66700 }, { "epoch": 7.184371972876978, "grad_norm": 0.8025431632995605, "learning_rate": 0.00016926839780196097, "loss": 3.2207, "step": 66750 }, { "epoch": 7.189753524916586, "grad_norm": 0.782611608505249, "learning_rate": 0.00016894515677189957, "loss": 3.241, "step": 66800 }, { "epoch": 7.195135076956194, "grad_norm": 0.8258253335952759, "learning_rate": 0.00016862191574183816, "loss": 3.2092, "step": 66850 }, { "epoch": 7.2005166289958025, "grad_norm": 0.8076246976852417, "learning_rate": 0.00016829867471177673, "loss": 3.2358, "step": 66900 }, { "epoch": 7.205898181035411, "grad_norm": 0.8116209506988525, "learning_rate": 0.0001679754336817153, "loss": 3.2076, "step": 66950 }, { "epoch": 7.211279733075019, "grad_norm": 0.8048698306083679, "learning_rate": 0.00016765219265165392, "loss": 3.2343, "step": 67000 }, { "epoch": 7.211279733075019, "eval_accuracy": 0.3876256964514293, "eval_loss": 3.3560845851898193, "eval_runtime": 211.2944, "eval_samples_per_second": 85.241, "eval_steps_per_second": 5.329, "step": 67000 }, { "epoch": 7.216661285114627, "grad_norm": 0.8519846796989441, "learning_rate": 0.0001673289516215925, "loss": 3.2344, "step": 67050 }, { "epoch": 
7.222042837154235, "grad_norm": 0.7423728108406067, "learning_rate": 0.00016700571059153106, "loss": 3.2322, "step": 67100 }, { "epoch": 7.2274243891938434, "grad_norm": 0.7732436060905457, "learning_rate": 0.00016668246956146968, "loss": 3.2076, "step": 67150 }, { "epoch": 7.232805941233452, "grad_norm": 0.7401405572891235, "learning_rate": 0.00016635922853140825, "loss": 3.2221, "step": 67200 }, { "epoch": 7.23818749327306, "grad_norm": 0.7472262382507324, "learning_rate": 0.0001660359875013468, "loss": 3.2246, "step": 67250 }, { "epoch": 7.243569045312668, "grad_norm": 0.7432873845100403, "learning_rate": 0.0001657127464712854, "loss": 3.209, "step": 67300 }, { "epoch": 7.248950597352277, "grad_norm": 0.7662221789360046, "learning_rate": 0.000165389505441224, "loss": 3.2232, "step": 67350 }, { "epoch": 7.254332149391884, "grad_norm": 0.804426372051239, "learning_rate": 0.00016506626441116257, "loss": 3.2239, "step": 67400 }, { "epoch": 7.259713701431493, "grad_norm": 0.7635779976844788, "learning_rate": 0.00016474302338110116, "loss": 3.2356, "step": 67450 }, { "epoch": 7.265095253471101, "grad_norm": 0.7849301695823669, "learning_rate": 0.00016441978235103973, "loss": 3.2353, "step": 67500 }, { "epoch": 7.270476805510709, "grad_norm": 0.7454454302787781, "learning_rate": 0.00016409654132097833, "loss": 3.2238, "step": 67550 }, { "epoch": 7.275858357550318, "grad_norm": 0.8164418339729309, "learning_rate": 0.00016377330029091692, "loss": 3.2453, "step": 67600 }, { "epoch": 7.281239909589925, "grad_norm": 0.7791739702224731, "learning_rate": 0.0001634500592608555, "loss": 3.2339, "step": 67650 }, { "epoch": 7.286621461629534, "grad_norm": 0.7832018733024597, "learning_rate": 0.00016312681823079406, "loss": 3.2339, "step": 67700 }, { "epoch": 7.2920030136691425, "grad_norm": 0.8332245945930481, "learning_rate": 0.00016280357720073268, "loss": 3.2326, "step": 67750 }, { "epoch": 7.29738456570875, "grad_norm": 0.7885503172874451, "learning_rate": 
0.00016248033617067125, "loss": 3.2424, "step": 67800 }, { "epoch": 7.302766117748359, "grad_norm": 0.7824217677116394, "learning_rate": 0.00016215709514060984, "loss": 3.218, "step": 67850 }, { "epoch": 7.308147669787967, "grad_norm": 0.8223897814750671, "learning_rate": 0.00016183385411054843, "loss": 3.2276, "step": 67900 }, { "epoch": 7.313529221827575, "grad_norm": 0.7712852358818054, "learning_rate": 0.000161510613080487, "loss": 3.2069, "step": 67950 }, { "epoch": 7.3189107738671835, "grad_norm": 0.7743715643882751, "learning_rate": 0.0001611873720504256, "loss": 3.2277, "step": 68000 }, { "epoch": 7.3189107738671835, "eval_accuracy": 0.3880603080026872, "eval_loss": 3.3522820472717285, "eval_runtime": 210.2908, "eval_samples_per_second": 85.648, "eval_steps_per_second": 5.354, "step": 68000 }, { "epoch": 7.324292325906791, "grad_norm": 0.7747835516929626, "learning_rate": 0.00016086413102036416, "loss": 3.2336, "step": 68050 }, { "epoch": 7.3296738779464, "grad_norm": 0.8009321093559265, "learning_rate": 0.00016054088999030273, "loss": 3.2293, "step": 68100 }, { "epoch": 7.335055429986008, "grad_norm": 0.8049393892288208, "learning_rate": 0.00016021764896024135, "loss": 3.2359, "step": 68150 }, { "epoch": 7.340436982025616, "grad_norm": 0.8045018315315247, "learning_rate": 0.00015990087275078113, "loss": 3.2419, "step": 68200 }, { "epoch": 7.3458185340652244, "grad_norm": 0.8061736226081848, "learning_rate": 0.00015957763172071976, "loss": 3.217, "step": 68250 }, { "epoch": 7.351200086104833, "grad_norm": 0.8088467717170715, "learning_rate": 0.00015925439069065832, "loss": 3.2275, "step": 68300 }, { "epoch": 7.356581638144441, "grad_norm": 0.8118233680725098, "learning_rate": 0.0001589311496605969, "loss": 3.2308, "step": 68350 }, { "epoch": 7.361963190184049, "grad_norm": 0.7720018625259399, "learning_rate": 0.00015860790863053548, "loss": 3.231, "step": 68400 }, { "epoch": 7.367344742223658, "grad_norm": 0.7653403878211975, "learning_rate": 
0.00015828466760047408, "loss": 3.2414, "step": 68450 }, { "epoch": 7.372726294263265, "grad_norm": 0.7920792102813721, "learning_rate": 0.00015796142657041265, "loss": 3.2353, "step": 68500 }, { "epoch": 7.378107846302874, "grad_norm": 0.8059729337692261, "learning_rate": 0.00015763818554035124, "loss": 3.2339, "step": 68550 }, { "epoch": 7.383489398342482, "grad_norm": 0.7684776186943054, "learning_rate": 0.0001573149445102898, "loss": 3.2355, "step": 68600 }, { "epoch": 7.38887095038209, "grad_norm": 0.8400054574012756, "learning_rate": 0.00015699170348022843, "loss": 3.2392, "step": 68650 }, { "epoch": 7.394252502421699, "grad_norm": 0.7758475542068481, "learning_rate": 0.000156668462450167, "loss": 3.2384, "step": 68700 }, { "epoch": 7.399634054461306, "grad_norm": 0.8178544640541077, "learning_rate": 0.00015634522142010557, "loss": 3.2373, "step": 68750 }, { "epoch": 7.405015606500915, "grad_norm": 0.7898211479187012, "learning_rate": 0.0001560219803900442, "loss": 3.2331, "step": 68800 }, { "epoch": 7.4103971585405235, "grad_norm": 0.7738881707191467, "learning_rate": 0.00015569873935998276, "loss": 3.2337, "step": 68850 }, { "epoch": 7.415778710580131, "grad_norm": 0.7856907844543457, "learning_rate": 0.00015537549832992132, "loss": 3.2221, "step": 68900 }, { "epoch": 7.42116026261974, "grad_norm": 0.7582386136054993, "learning_rate": 0.00015505225729985992, "loss": 3.2397, "step": 68950 }, { "epoch": 7.426541814659347, "grad_norm": 0.8215800523757935, "learning_rate": 0.0001547290162697985, "loss": 3.236, "step": 69000 }, { "epoch": 7.426541814659347, "eval_accuracy": 0.3885883610374656, "eval_loss": 3.349855661392212, "eval_runtime": 210.9678, "eval_samples_per_second": 85.373, "eval_steps_per_second": 5.337, "step": 69000 }, { "epoch": 7.431923366698956, "grad_norm": 0.8055285811424255, "learning_rate": 0.00015440577523973708, "loss": 3.2277, "step": 69050 }, { "epoch": 7.4373049187385645, "grad_norm": 0.7805681824684143, "learning_rate": 
0.00015408253420967567, "loss": 3.2534, "step": 69100 }, { "epoch": 7.442686470778172, "grad_norm": 0.8025108575820923, "learning_rate": 0.00015375929317961424, "loss": 3.2343, "step": 69150 }, { "epoch": 7.448068022817781, "grad_norm": 0.8178415894508362, "learning_rate": 0.00015343605214955284, "loss": 3.2282, "step": 69200 }, { "epoch": 7.453449574857389, "grad_norm": 0.7723139524459839, "learning_rate": 0.00015311281111949143, "loss": 3.2373, "step": 69250 }, { "epoch": 7.458831126896997, "grad_norm": 0.8103027939796448, "learning_rate": 0.00015278957008943, "loss": 3.2253, "step": 69300 }, { "epoch": 7.4642126789366054, "grad_norm": 0.7889986038208008, "learning_rate": 0.00015246632905936857, "loss": 3.2271, "step": 69350 }, { "epoch": 7.469594230976213, "grad_norm": 0.7937176823616028, "learning_rate": 0.0001521430880293072, "loss": 3.2375, "step": 69400 }, { "epoch": 7.474975783015822, "grad_norm": 0.8158497214317322, "learning_rate": 0.00015181984699924576, "loss": 3.2368, "step": 69450 }, { "epoch": 7.48035733505543, "grad_norm": 0.8363879323005676, "learning_rate": 0.00015149660596918432, "loss": 3.2239, "step": 69500 }, { "epoch": 7.485738887095038, "grad_norm": 0.7895720601081848, "learning_rate": 0.00015117336493912295, "loss": 3.2266, "step": 69550 }, { "epoch": 7.491120439134646, "grad_norm": 0.7915844917297363, "learning_rate": 0.0001508501239090615, "loss": 3.2448, "step": 69600 }, { "epoch": 7.496501991174255, "grad_norm": 0.7852685451507568, "learning_rate": 0.0001505268828790001, "loss": 3.2139, "step": 69650 }, { "epoch": 7.501883543213863, "grad_norm": 0.7899736166000366, "learning_rate": 0.00015020364184893867, "loss": 3.2317, "step": 69700 }, { "epoch": 7.507265095253471, "grad_norm": 0.7891963124275208, "learning_rate": 0.00014988040081887724, "loss": 3.2278, "step": 69750 }, { "epoch": 7.51264664729308, "grad_norm": 0.7811875343322754, "learning_rate": 0.00014955715978881584, "loss": 3.2406, "step": 69800 }, { "epoch": 7.518028199332687, 
"grad_norm": 0.7988965511322021, "learning_rate": 0.00014923391875875443, "loss": 3.2298, "step": 69850 }, { "epoch": 7.523409751372296, "grad_norm": 0.8040280342102051, "learning_rate": 0.00014891067772869303, "loss": 3.2465, "step": 69900 }, { "epoch": 7.528791303411904, "grad_norm": 0.8975268602371216, "learning_rate": 0.0001485874366986316, "loss": 3.2452, "step": 69950 }, { "epoch": 7.534172855451512, "grad_norm": 0.8084391951560974, "learning_rate": 0.0001482641956685702, "loss": 3.2201, "step": 70000 }, { "epoch": 7.534172855451512, "eval_accuracy": 0.38874221352661087, "eval_loss": 3.3477556705474854, "eval_runtime": 211.1497, "eval_samples_per_second": 85.3, "eval_steps_per_second": 5.333, "step": 70000 }, { "epoch": 7.539554407491121, "grad_norm": 0.8021398186683655, "learning_rate": 0.00014794095463850878, "loss": 3.2476, "step": 70050 }, { "epoch": 7.544935959530728, "grad_norm": 0.796826183795929, "learning_rate": 0.00014761771360844735, "loss": 3.2431, "step": 70100 }, { "epoch": 7.550317511570337, "grad_norm": 0.8226954340934753, "learning_rate": 0.00014729447257838595, "loss": 3.2146, "step": 70150 }, { "epoch": 7.5556990636099455, "grad_norm": 0.7751065492630005, "learning_rate": 0.0001469712315483245, "loss": 3.2377, "step": 70200 }, { "epoch": 7.561080615649553, "grad_norm": 0.8223302960395813, "learning_rate": 0.0001466479905182631, "loss": 3.2459, "step": 70250 }, { "epoch": 7.566462167689162, "grad_norm": 0.7794532179832458, "learning_rate": 0.00014632474948820168, "loss": 3.2225, "step": 70300 }, { "epoch": 7.57184371972877, "grad_norm": 0.8701120615005493, "learning_rate": 0.00014600150845814027, "loss": 3.2331, "step": 70350 }, { "epoch": 7.577225271768378, "grad_norm": 0.8439012169837952, "learning_rate": 0.00014567826742807886, "loss": 3.2266, "step": 70400 }, { "epoch": 7.5826068238079865, "grad_norm": 0.82621169090271, "learning_rate": 0.00014535502639801743, "loss": 3.2267, "step": 70450 }, { "epoch": 7.587988375847594, "grad_norm": 
0.8370195627212524, "learning_rate": 0.00014503178536795603, "loss": 3.2329, "step": 70500 }, { "epoch": 7.593369927887203, "grad_norm": 0.8157381415367126, "learning_rate": 0.00014470854433789462, "loss": 3.2428, "step": 70550 }, { "epoch": 7.598751479926811, "grad_norm": 0.8180111050605774, "learning_rate": 0.0001443853033078332, "loss": 3.2468, "step": 70600 }, { "epoch": 7.604133031966419, "grad_norm": 0.8646537065505981, "learning_rate": 0.00014406206227777178, "loss": 3.22, "step": 70650 }, { "epoch": 7.609514584006027, "grad_norm": 0.8048348426818848, "learning_rate": 0.0001437452860683116, "loss": 3.2472, "step": 70700 }, { "epoch": 7.614896136045635, "grad_norm": 0.7684217691421509, "learning_rate": 0.00014342204503825018, "loss": 3.2327, "step": 70750 }, { "epoch": 7.620277688085244, "grad_norm": 0.8237805962562561, "learning_rate": 0.00014309880400818875, "loss": 3.2486, "step": 70800 }, { "epoch": 7.625659240124852, "grad_norm": 0.8122904896736145, "learning_rate": 0.00014277556297812735, "loss": 3.2419, "step": 70850 }, { "epoch": 7.63104079216446, "grad_norm": 0.8078117966651917, "learning_rate": 0.00014245232194806591, "loss": 3.2286, "step": 70900 }, { "epoch": 7.636422344204068, "grad_norm": 0.8151229619979858, "learning_rate": 0.0001421290809180045, "loss": 3.2208, "step": 70950 }, { "epoch": 7.641803896243677, "grad_norm": 0.8114475011825562, "learning_rate": 0.0001418058398879431, "loss": 3.2337, "step": 71000 }, { "epoch": 7.641803896243677, "eval_accuracy": 0.38909826903997896, "eval_loss": 3.3436226844787598, "eval_runtime": 207.9913, "eval_samples_per_second": 86.595, "eval_steps_per_second": 5.414, "step": 71000 }, { "epoch": 7.647185448283285, "grad_norm": 0.8878672122955322, "learning_rate": 0.00014148259885788167, "loss": 3.2354, "step": 71050 }, { "epoch": 7.652567000322893, "grad_norm": 0.8463232517242432, "learning_rate": 0.00014115935782782027, "loss": 3.2492, "step": 71100 }, { "epoch": 7.657948552362502, "grad_norm": 
0.8358443975448608, "learning_rate": 0.00014083611679775886, "loss": 3.2434, "step": 71150 }, { "epoch": 7.663330104402109, "grad_norm": 0.8259372115135193, "learning_rate": 0.00014051287576769743, "loss": 3.2452, "step": 71200 }, { "epoch": 7.668711656441718, "grad_norm": 0.8233510851860046, "learning_rate": 0.00014018963473763602, "loss": 3.2524, "step": 71250 }, { "epoch": 7.674093208481326, "grad_norm": 0.7944367527961731, "learning_rate": 0.0001398663937075746, "loss": 3.2322, "step": 71300 }, { "epoch": 7.679474760520934, "grad_norm": 0.818215012550354, "learning_rate": 0.00013954315267751319, "loss": 3.242, "step": 71350 }, { "epoch": 7.684856312560543, "grad_norm": 0.8027128577232361, "learning_rate": 0.00013921991164745175, "loss": 3.2194, "step": 71400 }, { "epoch": 7.69023786460015, "grad_norm": 0.7849347591400146, "learning_rate": 0.00013889667061739035, "loss": 3.2435, "step": 71450 }, { "epoch": 7.695619416639759, "grad_norm": 0.8004842400550842, "learning_rate": 0.00013857342958732894, "loss": 3.2417, "step": 71500 }, { "epoch": 7.7010009686793675, "grad_norm": 0.7999834418296814, "learning_rate": 0.0001382501885572675, "loss": 3.2345, "step": 71550 }, { "epoch": 7.706382520718975, "grad_norm": 0.813932478427887, "learning_rate": 0.0001379269475272061, "loss": 3.2388, "step": 71600 }, { "epoch": 7.711764072758584, "grad_norm": 0.8163520693778992, "learning_rate": 0.0001376037064971447, "loss": 3.2463, "step": 71650 }, { "epoch": 7.717145624798192, "grad_norm": 0.8218525648117065, "learning_rate": 0.0001372804654670833, "loss": 3.2376, "step": 71700 }, { "epoch": 7.7225271768378, "grad_norm": 0.8164900541305542, "learning_rate": 0.00013695722443702186, "loss": 3.2415, "step": 71750 }, { "epoch": 7.727908728877408, "grad_norm": 0.8270570039749146, "learning_rate": 0.00013663398340696046, "loss": 3.2334, "step": 71800 }, { "epoch": 7.733290280917016, "grad_norm": 0.8093637824058533, "learning_rate": 0.00013631074237689902, "loss": 3.2408, "step": 71850 
}, { "epoch": 7.738671832956625, "grad_norm": 0.8747385740280151, "learning_rate": 0.00013598750134683762, "loss": 3.2449, "step": 71900 }, { "epoch": 7.744053384996233, "grad_norm": 0.7810999751091003, "learning_rate": 0.00013566426031677619, "loss": 3.2159, "step": 71950 }, { "epoch": 7.749434937035841, "grad_norm": 0.8121122717857361, "learning_rate": 0.00013534748410731602, "loss": 3.2174, "step": 72000 }, { "epoch": 7.749434937035841, "eval_accuracy": 0.3896095895300339, "eval_loss": 3.3376104831695557, "eval_runtime": 189.9062, "eval_samples_per_second": 94.842, "eval_steps_per_second": 5.929, "step": 72000 }, { "epoch": 7.754816489075449, "grad_norm": 0.8262868523597717, "learning_rate": 0.0001350242430772546, "loss": 3.2463, "step": 72050 }, { "epoch": 7.760198041115058, "grad_norm": 0.8187814354896545, "learning_rate": 0.00013470100204719318, "loss": 3.2571, "step": 72100 }, { "epoch": 7.765579593154666, "grad_norm": 0.8290274739265442, "learning_rate": 0.00013437776101713178, "loss": 3.2556, "step": 72150 }, { "epoch": 7.770961145194274, "grad_norm": 0.8006277680397034, "learning_rate": 0.00013405451998707034, "loss": 3.2402, "step": 72200 }, { "epoch": 7.776342697233883, "grad_norm": 0.8297838568687439, "learning_rate": 0.00013373127895700894, "loss": 3.2314, "step": 72250 }, { "epoch": 7.78172424927349, "grad_norm": 0.8442043662071228, "learning_rate": 0.00013340803792694753, "loss": 3.2356, "step": 72300 }, { "epoch": 7.787105801313099, "grad_norm": 0.8034444451332092, "learning_rate": 0.0001330847968968861, "loss": 3.2229, "step": 72350 }, { "epoch": 7.792487353352707, "grad_norm": 0.8173365592956543, "learning_rate": 0.00013276155586682467, "loss": 3.2286, "step": 72400 }, { "epoch": 7.797868905392315, "grad_norm": 0.777065098285675, "learning_rate": 0.00013243831483676326, "loss": 3.2472, "step": 72450 }, { "epoch": 7.803250457431924, "grad_norm": 0.8149591088294983, "learning_rate": 0.00013211507380670186, "loss": 3.2358, "step": 72500 }, { 
"epoch": 7.808632009471531, "grad_norm": 0.8245164752006531, "learning_rate": 0.00013179183277664042, "loss": 3.2379, "step": 72550 }, { "epoch": 7.81401356151114, "grad_norm": 0.858371376991272, "learning_rate": 0.00013146859174657902, "loss": 3.2399, "step": 72600 }, { "epoch": 7.819395113550748, "grad_norm": 0.8818589448928833, "learning_rate": 0.00013114535071651761, "loss": 3.2313, "step": 72650 }, { "epoch": 7.824776665590356, "grad_norm": 0.7951632738113403, "learning_rate": 0.00013082210968645618, "loss": 3.2385, "step": 72700 }, { "epoch": 7.830158217629965, "grad_norm": 0.8036244511604309, "learning_rate": 0.00013049886865639478, "loss": 3.2455, "step": 72750 }, { "epoch": 7.835539769669572, "grad_norm": 0.7847592830657959, "learning_rate": 0.00013017562762633337, "loss": 3.2634, "step": 72800 }, { "epoch": 7.840921321709181, "grad_norm": 0.8040270805358887, "learning_rate": 0.00012985238659627194, "loss": 3.2263, "step": 72850 }, { "epoch": 7.846302873748789, "grad_norm": 0.8288953900337219, "learning_rate": 0.00012952914556621053, "loss": 3.252, "step": 72900 }, { "epoch": 7.851684425788397, "grad_norm": 0.8158117532730103, "learning_rate": 0.0001292059045361491, "loss": 3.2296, "step": 72950 }, { "epoch": 7.857065977828006, "grad_norm": 0.8312733173370361, "learning_rate": 0.0001288826635060877, "loss": 3.2182, "step": 73000 }, { "epoch": 7.857065977828006, "eval_accuracy": 0.38989512931921033, "eval_loss": 3.334813117980957, "eval_runtime": 209.8317, "eval_samples_per_second": 85.835, "eval_steps_per_second": 5.366, "step": 73000 }, { "epoch": 7.862447529867614, "grad_norm": 0.8196859359741211, "learning_rate": 0.00012855942247602626, "loss": 3.2287, "step": 73050 }, { "epoch": 7.867829081907222, "grad_norm": 0.810441792011261, "learning_rate": 0.00012823618144596486, "loss": 3.2479, "step": 73100 }, { "epoch": 7.87321063394683, "grad_norm": 0.7915072441101074, "learning_rate": 0.00012791294041590345, "loss": 3.2385, "step": 73150 }, { "epoch": 
7.878592185986438, "grad_norm": 0.8137247562408447, "learning_rate": 0.00012758969938584202, "loss": 3.2249, "step": 73200 }, { "epoch": 7.883973738026047, "grad_norm": 0.7836380004882812, "learning_rate": 0.00012726645835578061, "loss": 3.2342, "step": 73250 }, { "epoch": 7.889355290065655, "grad_norm": 0.8190814852714539, "learning_rate": 0.0001269432173257192, "loss": 3.223, "step": 73300 }, { "epoch": 7.894736842105263, "grad_norm": 0.8309457302093506, "learning_rate": 0.00012661997629565778, "loss": 3.2194, "step": 73350 }, { "epoch": 7.900118394144871, "grad_norm": 0.7745057344436646, "learning_rate": 0.00012629673526559637, "loss": 3.2503, "step": 73400 }, { "epoch": 7.90549994618448, "grad_norm": 0.8173632621765137, "learning_rate": 0.00012597349423553497, "loss": 3.2438, "step": 73450 }, { "epoch": 7.910881498224088, "grad_norm": 0.8117119669914246, "learning_rate": 0.00012565025320547353, "loss": 3.2514, "step": 73500 }, { "epoch": 7.916263050263696, "grad_norm": 0.7812601923942566, "learning_rate": 0.00012532701217541213, "loss": 3.2333, "step": 73550 }, { "epoch": 7.921644602303305, "grad_norm": 0.882896363735199, "learning_rate": 0.0001250037711453507, "loss": 3.223, "step": 73600 }, { "epoch": 7.927026154342912, "grad_norm": 0.7868736982345581, "learning_rate": 0.0001246805301152893, "loss": 3.2356, "step": 73650 }, { "epoch": 7.932407706382521, "grad_norm": 0.8104776740074158, "learning_rate": 0.00012435728908522786, "loss": 3.2451, "step": 73700 }, { "epoch": 7.937789258422129, "grad_norm": 0.7862167954444885, "learning_rate": 0.00012403404805516645, "loss": 3.2351, "step": 73750 }, { "epoch": 7.943170810461737, "grad_norm": 0.832856297492981, "learning_rate": 0.00012371080702510505, "loss": 3.2329, "step": 73800 }, { "epoch": 7.948552362501346, "grad_norm": 0.8277978301048279, "learning_rate": 0.00012338756599504362, "loss": 3.2403, "step": 73850 }, { "epoch": 7.953933914540953, "grad_norm": 0.8304550051689148, "learning_rate": 
0.0001230643249649822, "loss": 3.2312, "step": 73900 }, { "epoch": 7.959315466580562, "grad_norm": 0.844760000705719, "learning_rate": 0.0001227410839349208, "loss": 3.2351, "step": 73950 }, { "epoch": 7.96469701862017, "grad_norm": 0.8154044151306152, "learning_rate": 0.0001224178429048594, "loss": 3.2393, "step": 74000 }, { "epoch": 7.96469701862017, "eval_accuracy": 0.3899715122993439, "eval_loss": 3.331357002258301, "eval_runtime": 211.3692, "eval_samples_per_second": 85.211, "eval_steps_per_second": 5.327, "step": 74000 }, { "epoch": 7.970078570659778, "grad_norm": 0.8356589078903198, "learning_rate": 0.00012209460187479797, "loss": 3.2136, "step": 74050 }, { "epoch": 7.975460122699387, "grad_norm": 0.8147774338722229, "learning_rate": 0.00012177136084473655, "loss": 3.2281, "step": 74100 }, { "epoch": 7.980841674738995, "grad_norm": 0.8949999809265137, "learning_rate": 0.00012144811981467514, "loss": 3.2365, "step": 74150 }, { "epoch": 7.986223226778603, "grad_norm": 0.7900296449661255, "learning_rate": 0.00012112487878461371, "loss": 3.2091, "step": 74200 }, { "epoch": 7.991604778818211, "grad_norm": 0.844578742980957, "learning_rate": 0.0001208016377545523, "loss": 3.247, "step": 74250 }, { "epoch": 7.996986330857819, "grad_norm": 0.7930771708488464, "learning_rate": 0.00012047839672449089, "loss": 3.2137, "step": 74300 }, { "epoch": 8.002367882897428, "grad_norm": 0.8258639574050903, "learning_rate": 0.00012015515569442947, "loss": 3.1959, "step": 74350 }, { "epoch": 8.007749434937036, "grad_norm": 0.8070922493934631, "learning_rate": 0.00011983191466436805, "loss": 3.1728, "step": 74400 }, { "epoch": 8.013130986976645, "grad_norm": 0.8373987078666687, "learning_rate": 0.00011950867363430664, "loss": 3.1395, "step": 74450 }, { "epoch": 8.018512539016251, "grad_norm": 0.8283763527870178, "learning_rate": 0.00011918543260424522, "loss": 3.1552, "step": 74500 }, { "epoch": 8.02389409105586, "grad_norm": 0.8418523669242859, "learning_rate": 
0.0001188621915741838, "loss": 3.1531, "step": 74550 }, { "epoch": 8.029275643095469, "grad_norm": 0.7724992632865906, "learning_rate": 0.00011853895054412239, "loss": 3.1875, "step": 74600 }, { "epoch": 8.034657195135077, "grad_norm": 0.8510676622390747, "learning_rate": 0.00011821570951406098, "loss": 3.1649, "step": 74650 }, { "epoch": 8.040038747174686, "grad_norm": 0.8318743109703064, "learning_rate": 0.00011789246848399955, "loss": 3.164, "step": 74700 }, { "epoch": 8.045420299214294, "grad_norm": 0.8723147511482239, "learning_rate": 0.00011756922745393814, "loss": 3.1604, "step": 74750 }, { "epoch": 8.050801851253901, "grad_norm": 0.8099079132080078, "learning_rate": 0.00011724598642387674, "loss": 3.1651, "step": 74800 }, { "epoch": 8.05618340329351, "grad_norm": 0.8209378123283386, "learning_rate": 0.0001169227453938153, "loss": 3.1699, "step": 74850 }, { "epoch": 8.061564955333118, "grad_norm": 0.8424363136291504, "learning_rate": 0.0001165995043637539, "loss": 3.1525, "step": 74900 }, { "epoch": 8.066946507372727, "grad_norm": 0.822147786617279, "learning_rate": 0.00011627626333369248, "loss": 3.1864, "step": 74950 }, { "epoch": 8.072328059412335, "grad_norm": 0.8202683329582214, "learning_rate": 0.00011595302230363108, "loss": 3.1517, "step": 75000 }, { "epoch": 8.072328059412335, "eval_accuracy": 0.3900216012806264, "eval_loss": 3.336542844772339, "eval_runtime": 191.5515, "eval_samples_per_second": 94.027, "eval_steps_per_second": 5.878, "step": 75000 }, { "epoch": 8.077709611451942, "grad_norm": 0.8051620721817017, "learning_rate": 0.00011562978127356964, "loss": 3.1685, "step": 75050 }, { "epoch": 8.08309116349155, "grad_norm": 0.9138336777687073, "learning_rate": 0.00011530654024350824, "loss": 3.1629, "step": 75100 }, { "epoch": 8.088472715531159, "grad_norm": 0.8304498195648193, "learning_rate": 0.00011498329921344682, "loss": 3.1683, "step": 75150 }, { "epoch": 8.093854267570768, "grad_norm": 0.8430973887443542, "learning_rate": 
0.0001146600581833854, "loss": 3.1618, "step": 75200 }, { "epoch": 8.099235819610376, "grad_norm": 0.8061507344245911, "learning_rate": 0.00011433681715332398, "loss": 3.1781, "step": 75250 }, { "epoch": 8.104617371649983, "grad_norm": 0.8056333065032959, "learning_rate": 0.00011401357612326258, "loss": 3.1757, "step": 75300 }, { "epoch": 8.109998923689592, "grad_norm": 0.8301107883453369, "learning_rate": 0.00011369033509320114, "loss": 3.1759, "step": 75350 }, { "epoch": 8.1153804757292, "grad_norm": 0.8385125398635864, "learning_rate": 0.00011336709406313974, "loss": 3.163, "step": 75400 }, { "epoch": 8.120762027768809, "grad_norm": 0.8474127054214478, "learning_rate": 0.00011304385303307832, "loss": 3.1968, "step": 75450 }, { "epoch": 8.126143579808417, "grad_norm": 0.8145176768302917, "learning_rate": 0.00011272061200301691, "loss": 3.1802, "step": 75500 }, { "epoch": 8.131525131848026, "grad_norm": 0.823329508304596, "learning_rate": 0.00011239737097295548, "loss": 3.1642, "step": 75550 }, { "epoch": 8.136906683887632, "grad_norm": 0.8719600439071655, "learning_rate": 0.00011207412994289408, "loss": 3.1745, "step": 75600 }, { "epoch": 8.142288235927241, "grad_norm": 0.8388107419013977, "learning_rate": 0.00011175088891283267, "loss": 3.1866, "step": 75650 }, { "epoch": 8.14766978796685, "grad_norm": 0.8055160641670227, "learning_rate": 0.00011142764788277124, "loss": 3.1704, "step": 75700 }, { "epoch": 8.153051340006458, "grad_norm": 0.8534181714057922, "learning_rate": 0.00011110440685270983, "loss": 3.1663, "step": 75750 }, { "epoch": 8.158432892046067, "grad_norm": 0.8355813026428223, "learning_rate": 0.00011078116582264841, "loss": 3.1759, "step": 75800 }, { "epoch": 8.163814444085673, "grad_norm": 0.8218345046043396, "learning_rate": 0.000110457924792587, "loss": 3.1865, "step": 75850 }, { "epoch": 8.169195996125282, "grad_norm": 0.8139203190803528, "learning_rate": 0.00011013468376252558, "loss": 3.1649, "step": 75900 }, { "epoch": 8.17457754816489, 
"grad_norm": 0.8463836908340454, "learning_rate": 0.00010981144273246417, "loss": 3.1948, "step": 75950 }, { "epoch": 8.1799591002045, "grad_norm": 0.7959445714950562, "learning_rate": 0.00010948820170240275, "loss": 3.1824, "step": 76000 }, { "epoch": 8.1799591002045, "eval_accuracy": 0.3903759183477894, "eval_loss": 3.334186553955078, "eval_runtime": 221.1785, "eval_samples_per_second": 81.432, "eval_steps_per_second": 5.091, "step": 76000 }, { "epoch": 8.185340652244108, "grad_norm": 0.8416066765785217, "learning_rate": 0.00010916496067234133, "loss": 3.1709, "step": 76050 }, { "epoch": 8.190722204283716, "grad_norm": 0.8329446315765381, "learning_rate": 0.00010884171964227991, "loss": 3.1742, "step": 76100 }, { "epoch": 8.196103756323323, "grad_norm": 0.8333694934844971, "learning_rate": 0.00010851847861221851, "loss": 3.183, "step": 76150 }, { "epoch": 8.201485308362932, "grad_norm": 0.8373638987541199, "learning_rate": 0.00010819523758215708, "loss": 3.1723, "step": 76200 }, { "epoch": 8.20686686040254, "grad_norm": 0.8664990067481995, "learning_rate": 0.00010787199655209567, "loss": 3.182, "step": 76250 }, { "epoch": 8.212248412442149, "grad_norm": 0.8091091513633728, "learning_rate": 0.00010754875552203425, "loss": 3.1728, "step": 76300 }, { "epoch": 8.217629964481757, "grad_norm": 0.8922072052955627, "learning_rate": 0.00010723197931257406, "loss": 3.1758, "step": 76350 }, { "epoch": 8.223011516521364, "grad_norm": 0.8110160231590271, "learning_rate": 0.00010690873828251265, "loss": 3.1921, "step": 76400 }, { "epoch": 8.228393068560973, "grad_norm": 0.8317387104034424, "learning_rate": 0.00010658549725245123, "loss": 3.1855, "step": 76450 }, { "epoch": 8.233774620600581, "grad_norm": 0.8507430553436279, "learning_rate": 0.00010626225622238982, "loss": 3.1745, "step": 76500 }, { "epoch": 8.23915617264019, "grad_norm": 0.8063206672668457, "learning_rate": 0.0001059390151923284, "loss": 3.1763, "step": 76550 }, { "epoch": 8.244537724679798, "grad_norm": 
0.8336623907089233, "learning_rate": 0.00010561577416226699, "loss": 3.1586, "step": 76600 }, { "epoch": 8.249919276719407, "grad_norm": 0.844421923160553, "learning_rate": 0.00010529253313220556, "loss": 3.1821, "step": 76650 }, { "epoch": 8.255300828759013, "grad_norm": 0.8457732200622559, "learning_rate": 0.00010496929210214415, "loss": 3.1778, "step": 76700 }, { "epoch": 8.260682380798622, "grad_norm": 0.8985422849655151, "learning_rate": 0.00010464605107208275, "loss": 3.1718, "step": 76750 }, { "epoch": 8.26606393283823, "grad_norm": 0.8263887166976929, "learning_rate": 0.00010432281004202133, "loss": 3.1938, "step": 76800 }, { "epoch": 8.27144548487784, "grad_norm": 0.8684414625167847, "learning_rate": 0.00010399956901195991, "loss": 3.1709, "step": 76850 }, { "epoch": 8.276827036917448, "grad_norm": 0.8270609378814697, "learning_rate": 0.00010367632798189849, "loss": 3.1768, "step": 76900 }, { "epoch": 8.282208588957054, "grad_norm": 0.8352575302124023, "learning_rate": 0.00010335308695183709, "loss": 3.1952, "step": 76950 }, { "epoch": 8.287590140996663, "grad_norm": 0.804658055305481, "learning_rate": 0.00010302984592177565, "loss": 3.1814, "step": 77000 }, { "epoch": 8.287590140996663, "eval_accuracy": 0.3905953971811747, "eval_loss": 3.3317322731018066, "eval_runtime": 190.2074, "eval_samples_per_second": 94.691, "eval_steps_per_second": 5.92, "step": 77000 }, { "epoch": 8.292971693036272, "grad_norm": 0.7943740487098694, "learning_rate": 0.00010270660489171425, "loss": 3.1838, "step": 77050 }, { "epoch": 8.29835324507588, "grad_norm": 0.8035009503364563, "learning_rate": 0.00010238336386165283, "loss": 3.1595, "step": 77100 }, { "epoch": 8.303734797115489, "grad_norm": 0.8640129566192627, "learning_rate": 0.00010206012283159141, "loss": 3.1757, "step": 77150 }, { "epoch": 8.309116349155097, "grad_norm": 0.858672559261322, "learning_rate": 0.00010173688180152999, "loss": 3.1889, "step": 77200 }, { "epoch": 8.314497901194704, "grad_norm": 
0.8206643462181091, "learning_rate": 0.00010141364077146859, "loss": 3.1814, "step": 77250 }, { "epoch": 8.319879453234313, "grad_norm": 0.8095281720161438, "learning_rate": 0.00010109039974140718, "loss": 3.1806, "step": 77300 }, { "epoch": 8.325261005273921, "grad_norm": 0.8287177085876465, "learning_rate": 0.00010076715871134575, "loss": 3.1703, "step": 77350 }, { "epoch": 8.33064255731353, "grad_norm": 0.8322443962097168, "learning_rate": 0.00010044391768128433, "loss": 3.187, "step": 77400 }, { "epoch": 8.336024109353138, "grad_norm": 0.8651958107948303, "learning_rate": 0.00010012067665122292, "loss": 3.1705, "step": 77450 }, { "epoch": 8.341405661392745, "grad_norm": 0.842995822429657, "learning_rate": 9.979743562116149e-05, "loss": 3.1877, "step": 77500 }, { "epoch": 8.346787213432354, "grad_norm": 0.8423287868499756, "learning_rate": 9.947419459110009e-05, "loss": 3.2024, "step": 77550 }, { "epoch": 8.352168765471962, "grad_norm": 0.830844521522522, "learning_rate": 9.915095356103868e-05, "loss": 3.1762, "step": 77600 }, { "epoch": 8.35755031751157, "grad_norm": 0.8280004262924194, "learning_rate": 9.882771253097725e-05, "loss": 3.1796, "step": 77650 }, { "epoch": 8.36293186955118, "grad_norm": 0.8298020362854004, "learning_rate": 9.850447150091584e-05, "loss": 3.1803, "step": 77700 }, { "epoch": 8.368313421590786, "grad_norm": 0.883962869644165, "learning_rate": 9.818123047085442e-05, "loss": 3.1974, "step": 77750 }, { "epoch": 8.373694973630395, "grad_norm": 0.7966394424438477, "learning_rate": 9.785798944079302e-05, "loss": 3.1821, "step": 77800 }, { "epoch": 8.379076525670003, "grad_norm": 0.8396945595741272, "learning_rate": 9.753474841073159e-05, "loss": 3.1992, "step": 77850 }, { "epoch": 8.384458077709612, "grad_norm": 0.8430613279342651, "learning_rate": 9.721150738067018e-05, "loss": 3.186, "step": 77900 }, { "epoch": 8.38983962974922, "grad_norm": 0.7840559482574463, "learning_rate": 9.688826635060876e-05, "loss": 3.1821, "step": 77950 }, { 
"epoch": 8.395221181788829, "grad_norm": 0.8649613261222839, "learning_rate": 9.656502532054734e-05, "loss": 3.1945, "step": 78000 }, { "epoch": 8.395221181788829, "eval_accuracy": 0.39121287154262435, "eval_loss": 3.3277697563171387, "eval_runtime": 189.8853, "eval_samples_per_second": 94.852, "eval_steps_per_second": 5.93, "step": 78000 }, { "epoch": 8.400602733828435, "grad_norm": 0.8213173151016235, "learning_rate": 9.624178429048592e-05, "loss": 3.1755, "step": 78050 }, { "epoch": 8.405984285868044, "grad_norm": 0.8209165930747986, "learning_rate": 9.591854326042452e-05, "loss": 3.18, "step": 78100 }, { "epoch": 8.411365837907653, "grad_norm": 0.8557418584823608, "learning_rate": 9.559530223036309e-05, "loss": 3.169, "step": 78150 }, { "epoch": 8.416747389947261, "grad_norm": 0.8438737988471985, "learning_rate": 9.527206120030168e-05, "loss": 3.2073, "step": 78200 }, { "epoch": 8.42212894198687, "grad_norm": 0.8334610462188721, "learning_rate": 9.494882017024028e-05, "loss": 3.1781, "step": 78250 }, { "epoch": 8.427510494026476, "grad_norm": 0.8696321845054626, "learning_rate": 9.462557914017886e-05, "loss": 3.1714, "step": 78300 }, { "epoch": 8.432892046066085, "grad_norm": 0.9134912490844727, "learning_rate": 9.430233811011742e-05, "loss": 3.177, "step": 78350 }, { "epoch": 8.438273598105694, "grad_norm": 0.8383008241653442, "learning_rate": 9.397909708005602e-05, "loss": 3.173, "step": 78400 }, { "epoch": 8.443655150145302, "grad_norm": 0.86134934425354, "learning_rate": 9.365585604999461e-05, "loss": 3.1812, "step": 78450 }, { "epoch": 8.44903670218491, "grad_norm": 0.8300356268882751, "learning_rate": 9.333261501993318e-05, "loss": 3.1625, "step": 78500 }, { "epoch": 8.45441825422452, "grad_norm": 0.832892119884491, "learning_rate": 9.300937398987178e-05, "loss": 3.1952, "step": 78550 }, { "epoch": 8.459799806264126, "grad_norm": 0.9564111828804016, "learning_rate": 9.268613295981036e-05, "loss": 3.1878, "step": 78600 }, { "epoch": 8.465181358303735, 
"grad_norm": 0.8386510014533997, "learning_rate": 9.236289192974894e-05, "loss": 3.202, "step": 78650 }, { "epoch": 8.470562910343343, "grad_norm": 0.8402494192123413, "learning_rate": 9.203965089968752e-05, "loss": 3.1691, "step": 78700 }, { "epoch": 8.475944462382952, "grad_norm": 0.8221197128295898, "learning_rate": 9.171640986962611e-05, "loss": 3.181, "step": 78750 }, { "epoch": 8.48132601442256, "grad_norm": 0.8528578877449036, "learning_rate": 9.13931688395647e-05, "loss": 3.1747, "step": 78800 }, { "epoch": 8.486707566462167, "grad_norm": 0.846588134765625, "learning_rate": 9.106992780950328e-05, "loss": 3.1856, "step": 78850 }, { "epoch": 8.492089118501776, "grad_norm": 0.8404663801193237, "learning_rate": 9.074668677944186e-05, "loss": 3.1811, "step": 78900 }, { "epoch": 8.497470670541384, "grad_norm": 0.8380141854286194, "learning_rate": 9.042344574938045e-05, "loss": 3.1771, "step": 78950 }, { "epoch": 8.502852222580993, "grad_norm": 0.8275344371795654, "learning_rate": 9.010020471931902e-05, "loss": 3.1847, "step": 79000 }, { "epoch": 8.502852222580993, "eval_accuracy": 0.3914904796709904, "eval_loss": 3.3259856700897217, "eval_runtime": 189.3897, "eval_samples_per_second": 95.1, "eval_steps_per_second": 5.945, "step": 79000 }, { "epoch": 8.508233774620601, "grad_norm": 0.8297053575515747, "learning_rate": 8.977696368925761e-05, "loss": 3.1785, "step": 79050 }, { "epoch": 8.513615326660208, "grad_norm": 0.8653829097747803, "learning_rate": 8.945372265919621e-05, "loss": 3.2013, "step": 79100 }, { "epoch": 8.518996878699816, "grad_norm": 0.8570589423179626, "learning_rate": 8.913048162913478e-05, "loss": 3.1734, "step": 79150 }, { "epoch": 8.524378430739425, "grad_norm": 0.8229767680168152, "learning_rate": 8.880724059907337e-05, "loss": 3.1873, "step": 79200 }, { "epoch": 8.529759982779034, "grad_norm": 0.8804620504379272, "learning_rate": 8.848399956901195e-05, "loss": 3.1884, "step": 79250 }, { "epoch": 8.535141534818642, "grad_norm": 
0.8399264216423035, "learning_rate": 8.816075853895055e-05, "loss": 3.1759, "step": 79300 }, { "epoch": 8.54052308685825, "grad_norm": 0.8268064260482788, "learning_rate": 8.783751750888911e-05, "loss": 3.1752, "step": 79350 }, { "epoch": 8.545904638897857, "grad_norm": 0.8142411112785339, "learning_rate": 8.751427647882771e-05, "loss": 3.1912, "step": 79400 }, { "epoch": 8.551286190937466, "grad_norm": 0.8386333584785461, "learning_rate": 8.719103544876629e-05, "loss": 3.1949, "step": 79450 }, { "epoch": 8.556667742977075, "grad_norm": 0.8724285364151001, "learning_rate": 8.686779441870487e-05, "loss": 3.1779, "step": 79500 }, { "epoch": 8.562049295016683, "grad_norm": 0.8269312977790833, "learning_rate": 8.654455338864345e-05, "loss": 3.1914, "step": 79550 }, { "epoch": 8.567430847056292, "grad_norm": 0.8587058186531067, "learning_rate": 8.622131235858205e-05, "loss": 3.1748, "step": 79600 }, { "epoch": 8.572812399095898, "grad_norm": 0.8489319086074829, "learning_rate": 8.589807132852061e-05, "loss": 3.1859, "step": 79650 }, { "epoch": 8.578193951135507, "grad_norm": 0.8191342949867249, "learning_rate": 8.557483029845921e-05, "loss": 3.1851, "step": 79700 }, { "epoch": 8.583575503175116, "grad_norm": 0.859576404094696, "learning_rate": 8.525158926839779e-05, "loss": 3.1702, "step": 79750 }, { "epoch": 8.588957055214724, "grad_norm": 0.8672488927841187, "learning_rate": 8.492834823833639e-05, "loss": 3.196, "step": 79800 }, { "epoch": 8.594338607254333, "grad_norm": 0.8281680941581726, "learning_rate": 8.460510720827495e-05, "loss": 3.2033, "step": 79850 }, { "epoch": 8.599720159293941, "grad_norm": 0.8545637726783752, "learning_rate": 8.428186617821355e-05, "loss": 3.1826, "step": 79900 }, { "epoch": 8.605101711333548, "grad_norm": 0.8453432321548462, "learning_rate": 8.395862514815214e-05, "loss": 3.1924, "step": 79950 }, { "epoch": 8.610483263373157, "grad_norm": 0.8370970487594604, "learning_rate": 8.363538411809071e-05, "loss": 3.1648, "step": 80000 }, { 
"epoch": 8.610483263373157, "eval_accuracy": 0.39175287639506234, "eval_loss": 3.322192668914795, "eval_runtime": 189.7221, "eval_samples_per_second": 94.934, "eval_steps_per_second": 5.935, "step": 80000 }, { "epoch": 8.615864815412765, "grad_norm": 0.8636527061462402, "learning_rate": 8.33121430880293e-05, "loss": 3.1804, "step": 80050 }, { "epoch": 8.621246367452374, "grad_norm": 0.8623721599578857, "learning_rate": 8.298890205796789e-05, "loss": 3.1682, "step": 80100 }, { "epoch": 8.626627919491982, "grad_norm": 0.8467229008674622, "learning_rate": 8.266566102790647e-05, "loss": 3.1673, "step": 80150 }, { "epoch": 8.632009471531589, "grad_norm": 0.8296520113945007, "learning_rate": 8.234241999784505e-05, "loss": 3.1888, "step": 80200 }, { "epoch": 8.637391023571197, "grad_norm": 0.8253055214881897, "learning_rate": 8.201917896778364e-05, "loss": 3.1978, "step": 80250 }, { "epoch": 8.642772575610806, "grad_norm": 0.8557524085044861, "learning_rate": 8.169593793772222e-05, "loss": 3.1835, "step": 80300 }, { "epoch": 8.648154127650415, "grad_norm": 0.8345833420753479, "learning_rate": 8.137916172826203e-05, "loss": 3.171, "step": 80350 }, { "epoch": 8.653535679690023, "grad_norm": 0.8740684986114502, "learning_rate": 8.105592069820062e-05, "loss": 3.1772, "step": 80400 }, { "epoch": 8.658917231729632, "grad_norm": 0.8588552474975586, "learning_rate": 8.073267966813919e-05, "loss": 3.1802, "step": 80450 }, { "epoch": 8.664298783769238, "grad_norm": 0.8434314727783203, "learning_rate": 8.040943863807779e-05, "loss": 3.1869, "step": 80500 }, { "epoch": 8.669680335808847, "grad_norm": 0.8790923953056335, "learning_rate": 8.008619760801637e-05, "loss": 3.1811, "step": 80550 }, { "epoch": 8.675061887848456, "grad_norm": 0.8365694880485535, "learning_rate": 7.976295657795496e-05, "loss": 3.1876, "step": 80600 }, { "epoch": 8.680443439888064, "grad_norm": 0.8803290724754333, "learning_rate": 7.943971554789353e-05, "loss": 3.1946, "step": 80650 }, { "epoch": 
8.685824991927673, "grad_norm": 0.8246529698371887, "learning_rate": 7.911647451783212e-05, "loss": 3.1868, "step": 80700 }, { "epoch": 8.69120654396728, "grad_norm": 0.8908007144927979, "learning_rate": 7.879323348777072e-05, "loss": 3.1883, "step": 80750 }, { "epoch": 8.696588096006888, "grad_norm": 0.8227264285087585, "learning_rate": 7.846999245770929e-05, "loss": 3.1765, "step": 80800 }, { "epoch": 8.701969648046497, "grad_norm": 0.8562191724777222, "learning_rate": 7.814675142764787e-05, "loss": 3.2047, "step": 80850 }, { "epoch": 8.707351200086105, "grad_norm": 0.840075671672821, "learning_rate": 7.782351039758646e-05, "loss": 3.1723, "step": 80900 }, { "epoch": 8.712732752125714, "grad_norm": 0.8150270581245422, "learning_rate": 7.750026936752503e-05, "loss": 3.1816, "step": 80950 }, { "epoch": 8.718114304165322, "grad_norm": 0.835625410079956, "learning_rate": 7.717702833746362e-05, "loss": 3.179, "step": 81000 }, { "epoch": 8.718114304165322, "eval_accuracy": 0.3919582303530317, "eval_loss": 3.3187644481658936, "eval_runtime": 189.4444, "eval_samples_per_second": 95.073, "eval_steps_per_second": 5.944, "step": 81000 }, { "epoch": 8.723495856204929, "grad_norm": 0.8230716586112976, "learning_rate": 7.685378730740222e-05, "loss": 3.1832, "step": 81050 }, { "epoch": 8.728877408244538, "grad_norm": 0.8545243740081787, "learning_rate": 7.65305462773408e-05, "loss": 3.1767, "step": 81100 }, { "epoch": 8.734258960284146, "grad_norm": 0.8454192280769348, "learning_rate": 7.620730524727938e-05, "loss": 3.1891, "step": 81150 }, { "epoch": 8.739640512323755, "grad_norm": 0.8239614367485046, "learning_rate": 7.588406421721796e-05, "loss": 3.1903, "step": 81200 }, { "epoch": 8.745022064363363, "grad_norm": 0.8292789459228516, "learning_rate": 7.556082318715656e-05, "loss": 3.1815, "step": 81250 }, { "epoch": 8.75040361640297, "grad_norm": 0.897819995880127, "learning_rate": 7.523758215709513e-05, "loss": 3.2009, "step": 81300 }, { "epoch": 8.755785168442578, 
"grad_norm": 0.8491601943969727, "learning_rate": 7.491434112703372e-05, "loss": 3.1827, "step": 81350 }, { "epoch": 8.761166720482187, "grad_norm": 0.8508619666099548, "learning_rate": 7.45911000969723e-05, "loss": 3.1863, "step": 81400 }, { "epoch": 8.766548272521796, "grad_norm": 0.835036039352417, "learning_rate": 7.426785906691088e-05, "loss": 3.1758, "step": 81450 }, { "epoch": 8.771929824561404, "grad_norm": 0.8110843300819397, "learning_rate": 7.394461803684946e-05, "loss": 3.2087, "step": 81500 }, { "epoch": 8.777311376601011, "grad_norm": 0.9202770590782166, "learning_rate": 7.362137700678806e-05, "loss": 3.171, "step": 81550 }, { "epoch": 8.78269292864062, "grad_norm": 0.8067617416381836, "learning_rate": 7.329813597672664e-05, "loss": 3.2149, "step": 81600 }, { "epoch": 8.788074480680228, "grad_norm": 0.8563730120658875, "learning_rate": 7.297489494666523e-05, "loss": 3.1909, "step": 81650 }, { "epoch": 8.793456032719837, "grad_norm": 0.8543161153793335, "learning_rate": 7.265165391660381e-05, "loss": 3.1819, "step": 81700 }, { "epoch": 8.798837584759445, "grad_norm": 0.8220990300178528, "learning_rate": 7.23284128865424e-05, "loss": 3.2023, "step": 81750 }, { "epoch": 8.804219136799054, "grad_norm": 0.8801039457321167, "learning_rate": 7.200517185648098e-05, "loss": 3.187, "step": 81800 }, { "epoch": 8.80960068883866, "grad_norm": 0.8284077644348145, "learning_rate": 7.168193082641956e-05, "loss": 3.1896, "step": 81850 }, { "epoch": 8.814982240878269, "grad_norm": 0.8685952425003052, "learning_rate": 7.135868979635815e-05, "loss": 3.179, "step": 81900 }, { "epoch": 8.820363792917878, "grad_norm": 0.8335699439048767, "learning_rate": 7.103544876629673e-05, "loss": 3.1853, "step": 81950 }, { "epoch": 8.825745344957486, "grad_norm": 0.8425479531288147, "learning_rate": 7.071220773623531e-05, "loss": 3.1809, "step": 82000 }, { "epoch": 8.825745344957486, "eval_accuracy": 0.39258222388775027, "eval_loss": 3.3150174617767334, "eval_runtime": 190.1288, 
"eval_samples_per_second": 94.731, "eval_steps_per_second": 5.922, "step": 82000 }, { "epoch": 8.831126896997095, "grad_norm": 0.88204026222229, "learning_rate": 7.03889667061739e-05, "loss": 3.1956, "step": 82050 }, { "epoch": 8.836508449036701, "grad_norm": 0.8744454979896545, "learning_rate": 7.006572567611248e-05, "loss": 3.1756, "step": 82100 }, { "epoch": 8.84189000107631, "grad_norm": 0.8653980493545532, "learning_rate": 6.974248464605107e-05, "loss": 3.1819, "step": 82150 }, { "epoch": 8.847271553115919, "grad_norm": 0.8841899037361145, "learning_rate": 6.941924361598965e-05, "loss": 3.2043, "step": 82200 }, { "epoch": 8.852653105155527, "grad_norm": 0.8339793086051941, "learning_rate": 6.909600258592823e-05, "loss": 3.1922, "step": 82250 }, { "epoch": 8.858034657195136, "grad_norm": 0.8752954006195068, "learning_rate": 6.877276155586681e-05, "loss": 3.1984, "step": 82300 }, { "epoch": 8.863416209234742, "grad_norm": 0.839018702507019, "learning_rate": 6.84495205258054e-05, "loss": 3.1782, "step": 82350 }, { "epoch": 8.868797761274351, "grad_norm": 0.8543075323104858, "learning_rate": 6.813274431634522e-05, "loss": 3.1868, "step": 82400 }, { "epoch": 8.87417931331396, "grad_norm": 0.8411060571670532, "learning_rate": 6.78095032862838e-05, "loss": 3.1812, "step": 82450 }, { "epoch": 8.879560865353568, "grad_norm": 0.8576669692993164, "learning_rate": 6.748626225622238e-05, "loss": 3.1995, "step": 82500 }, { "epoch": 8.884942417393177, "grad_norm": 0.8171575665473938, "learning_rate": 6.716302122616096e-05, "loss": 3.1939, "step": 82550 }, { "epoch": 8.890323969432785, "grad_norm": 0.8460285067558289, "learning_rate": 6.683978019609955e-05, "loss": 3.1678, "step": 82600 }, { "epoch": 8.895705521472392, "grad_norm": 0.8574362993240356, "learning_rate": 6.651653916603814e-05, "loss": 3.1833, "step": 82650 }, { "epoch": 8.901087073512, "grad_norm": 0.8909504413604736, "learning_rate": 6.619329813597673e-05, "loss": 3.1862, "step": 82700 }, { "epoch": 
8.906468625551609, "grad_norm": 0.8539124131202698, "learning_rate": 6.587005710591531e-05, "loss": 3.1692, "step": 82750 }, { "epoch": 8.911850177591218, "grad_norm": 0.8660421967506409, "learning_rate": 6.554681607585389e-05, "loss": 3.1867, "step": 82800 }, { "epoch": 8.917231729630826, "grad_norm": 0.8633134365081787, "learning_rate": 6.522357504579247e-05, "loss": 3.1872, "step": 82850 }, { "epoch": 8.922613281670433, "grad_norm": 0.841986894607544, "learning_rate": 6.490033401573105e-05, "loss": 3.1805, "step": 82900 }, { "epoch": 8.927994833710041, "grad_norm": 0.87624591588974, "learning_rate": 6.457709298566965e-05, "loss": 3.1753, "step": 82950 }, { "epoch": 8.93337638574965, "grad_norm": 0.8404093384742737, "learning_rate": 6.425385195560823e-05, "loss": 3.1939, "step": 83000 }, { "epoch": 8.93337638574965, "eval_accuracy": 0.3927638915161761, "eval_loss": 3.311628580093384, "eval_runtime": 189.6303, "eval_samples_per_second": 94.98, "eval_steps_per_second": 5.938, "step": 83000 }, { "epoch": 8.938757937789259, "grad_norm": 0.8670982122421265, "learning_rate": 6.393061092554681e-05, "loss": 3.1851, "step": 83050 }, { "epoch": 8.944139489828867, "grad_norm": 0.8687721490859985, "learning_rate": 6.360736989548539e-05, "loss": 3.1744, "step": 83100 }, { "epoch": 8.949521041868476, "grad_norm": 0.8958396911621094, "learning_rate": 6.328412886542397e-05, "loss": 3.2024, "step": 83150 }, { "epoch": 8.954902593908082, "grad_norm": 0.8612350225448608, "learning_rate": 6.296088783536257e-05, "loss": 3.184, "step": 83200 }, { "epoch": 8.960284145947691, "grad_norm": 0.8626738786697388, "learning_rate": 6.263764680530115e-05, "loss": 3.172, "step": 83250 }, { "epoch": 8.9656656979873, "grad_norm": 0.8333890438079834, "learning_rate": 6.231440577523973e-05, "loss": 3.1774, "step": 83300 }, { "epoch": 8.971047250026908, "grad_norm": 0.8436484932899475, "learning_rate": 6.199116474517831e-05, "loss": 3.1899, "step": 83350 }, { "epoch": 8.976428802066517, "grad_norm": 
0.863152265548706, "learning_rate": 6.166792371511689e-05, "loss": 3.1785, "step": 83400 }, { "epoch": 8.981810354106123, "grad_norm": 0.8480373620986938, "learning_rate": 6.134468268505549e-05, "loss": 3.1859, "step": 83450 }, { "epoch": 8.987191906145732, "grad_norm": 0.8849746584892273, "learning_rate": 6.10279064755953e-05, "loss": 3.1647, "step": 83500 }, { "epoch": 8.99257345818534, "grad_norm": 0.8656249642372131, "learning_rate": 6.070466544553388e-05, "loss": 3.1873, "step": 83550 }, { "epoch": 8.997955010224949, "grad_norm": 0.8052066564559937, "learning_rate": 6.038142441547246e-05, "loss": 3.189, "step": 83600 }, { "epoch": 9.003336562264558, "grad_norm": 0.8620032668113708, "learning_rate": 6.005818338541105e-05, "loss": 3.147, "step": 83650 }, { "epoch": 9.008718114304166, "grad_norm": 0.8975006937980652, "learning_rate": 5.973494235534963e-05, "loss": 3.119, "step": 83700 }, { "epoch": 9.014099666343773, "grad_norm": 0.8371387720108032, "learning_rate": 5.941170132528822e-05, "loss": 3.1233, "step": 83750 }, { "epoch": 9.019481218383381, "grad_norm": 0.8606297373771667, "learning_rate": 5.90884602952268e-05, "loss": 3.1383, "step": 83800 }, { "epoch": 9.02486277042299, "grad_norm": 0.8248050212860107, "learning_rate": 5.876521926516538e-05, "loss": 3.1193, "step": 83850 }, { "epoch": 9.030244322462599, "grad_norm": 0.9047676920890808, "learning_rate": 5.844197823510397e-05, "loss": 3.1173, "step": 83900 }, { "epoch": 9.035625874502207, "grad_norm": 0.8255630731582642, "learning_rate": 5.811873720504255e-05, "loss": 3.1424, "step": 83950 }, { "epoch": 9.041007426541814, "grad_norm": 0.8557283878326416, "learning_rate": 5.7795496174981145e-05, "loss": 3.1185, "step": 84000 }, { "epoch": 9.041007426541814, "eval_accuracy": 0.39279083743235405, "eval_loss": 3.3150100708007812, "eval_runtime": 189.3544, "eval_samples_per_second": 95.118, "eval_steps_per_second": 5.947, "step": 84000 }, { "epoch": 9.046388978581422, "grad_norm": 0.8470101952552795, 
"learning_rate": 5.7472255144919726e-05, "loss": 3.1401, "step": 84050 }, { "epoch": 9.051770530621031, "grad_norm": 0.8201737999916077, "learning_rate": 5.714901411485831e-05, "loss": 3.1163, "step": 84100 }, { "epoch": 9.05715208266064, "grad_norm": 0.8869134187698364, "learning_rate": 5.6825773084796895e-05, "loss": 3.1184, "step": 84150 }, { "epoch": 9.062533634700248, "grad_norm": 0.8410740494728088, "learning_rate": 5.6502532054735476e-05, "loss": 3.1237, "step": 84200 }, { "epoch": 9.067915186739857, "grad_norm": 0.8496425747871399, "learning_rate": 5.6179291024674064e-05, "loss": 3.1181, "step": 84250 }, { "epoch": 9.073296738779463, "grad_norm": 0.8405110239982605, "learning_rate": 5.5856049994612645e-05, "loss": 3.1268, "step": 84300 }, { "epoch": 9.078678290819072, "grad_norm": 0.8484364748001099, "learning_rate": 5.5532808964551227e-05, "loss": 3.1385, "step": 84350 }, { "epoch": 9.08405984285868, "grad_norm": 0.8305612206459045, "learning_rate": 5.5209567934489814e-05, "loss": 3.1149, "step": 84400 }, { "epoch": 9.089441394898289, "grad_norm": 0.8964749574661255, "learning_rate": 5.4886326904428396e-05, "loss": 3.1238, "step": 84450 }, { "epoch": 9.094822946937898, "grad_norm": 0.8592599630355835, "learning_rate": 5.4563085874366983e-05, "loss": 3.1313, "step": 84500 }, { "epoch": 9.100204498977504, "grad_norm": 0.8475759625434875, "learning_rate": 5.4239844844305565e-05, "loss": 3.1228, "step": 84550 }, { "epoch": 9.105586051017113, "grad_norm": 0.8360592126846313, "learning_rate": 5.3916603814244146e-05, "loss": 3.1551, "step": 84600 }, { "epoch": 9.110967603056721, "grad_norm": 0.8481616973876953, "learning_rate": 5.3593362784182734e-05, "loss": 3.1232, "step": 84650 }, { "epoch": 9.11634915509633, "grad_norm": 0.8523578643798828, "learning_rate": 5.3270121754121315e-05, "loss": 3.1473, "step": 84700 }, { "epoch": 9.121730707135939, "grad_norm": 0.828809916973114, "learning_rate": 5.294688072405991e-05, "loss": 3.1185, "step": 84750 }, { "epoch": 
9.127112259175545, "grad_norm": 0.8841274380683899, "learning_rate": 5.262363969399849e-05, "loss": 3.1403, "step": 84800 }, { "epoch": 9.132493811215154, "grad_norm": 0.8152642250061035, "learning_rate": 5.230039866393707e-05, "loss": 3.1395, "step": 84850 }, { "epoch": 9.137875363254762, "grad_norm": 0.8742380738258362, "learning_rate": 5.197715763387566e-05, "loss": 3.1114, "step": 84900 }, { "epoch": 9.143256915294371, "grad_norm": 0.8858259916305542, "learning_rate": 5.165391660381424e-05, "loss": 3.1252, "step": 84950 }, { "epoch": 9.14863846733398, "grad_norm": 0.8560479879379272, "learning_rate": 5.133067557375283e-05, "loss": 3.1242, "step": 85000 }, { "epoch": 9.14863846733398, "eval_accuracy": 0.39295196966498297, "eval_loss": 3.3140878677368164, "eval_runtime": 189.6918, "eval_samples_per_second": 94.949, "eval_steps_per_second": 5.936, "step": 85000 }, { "epoch": 9.154020019373588, "grad_norm": 0.9070020318031311, "learning_rate": 5.100743454369141e-05, "loss": 3.1242, "step": 85050 }, { "epoch": 9.159401571413195, "grad_norm": 0.8739373087882996, "learning_rate": 5.068419351362999e-05, "loss": 3.1321, "step": 85100 }, { "epoch": 9.164783123452803, "grad_norm": 0.8586959838867188, "learning_rate": 5.036095248356858e-05, "loss": 3.1283, "step": 85150 }, { "epoch": 9.170164675492412, "grad_norm": 0.8272525668144226, "learning_rate": 5.003771145350716e-05, "loss": 3.1298, "step": 85200 }, { "epoch": 9.17554622753202, "grad_norm": 0.8490020036697388, "learning_rate": 4.971447042344575e-05, "loss": 3.1514, "step": 85250 }, { "epoch": 9.180927779571629, "grad_norm": 0.8110401630401611, "learning_rate": 4.939122939338433e-05, "loss": 3.1387, "step": 85300 }, { "epoch": 9.186309331611236, "grad_norm": 0.8785157203674316, "learning_rate": 4.906798836332291e-05, "loss": 3.1274, "step": 85350 }, { "epoch": 9.191690883650844, "grad_norm": 0.8156988024711609, "learning_rate": 4.87447473332615e-05, "loss": 3.1198, "step": 85400 }, { "epoch": 9.197072435690453, 
"grad_norm": 0.8705768585205078, "learning_rate": 4.842150630320008e-05, "loss": 3.1515, "step": 85450 }, { "epoch": 9.202453987730062, "grad_norm": 0.8462039232254028, "learning_rate": 4.8098265273138667e-05, "loss": 3.1292, "step": 85500 }, { "epoch": 9.20783553976967, "grad_norm": 0.859347939491272, "learning_rate": 4.777502424307725e-05, "loss": 3.1349, "step": 85550 }, { "epoch": 9.213217091809279, "grad_norm": 0.8954098224639893, "learning_rate": 4.745178321301583e-05, "loss": 3.1338, "step": 85600 }, { "epoch": 9.218598643848885, "grad_norm": 0.8277496099472046, "learning_rate": 4.7128542182954423e-05, "loss": 3.1399, "step": 85650 }, { "epoch": 9.223980195888494, "grad_norm": 0.8506129384040833, "learning_rate": 4.6805301152893005e-05, "loss": 3.1263, "step": 85700 }, { "epoch": 9.229361747928102, "grad_norm": 0.8558870553970337, "learning_rate": 4.648206012283159e-05, "loss": 3.115, "step": 85750 }, { "epoch": 9.234743299967711, "grad_norm": 0.8409327268600464, "learning_rate": 4.6158819092770174e-05, "loss": 3.1123, "step": 85800 }, { "epoch": 9.24012485200732, "grad_norm": 0.8239386677742004, "learning_rate": 4.5835578062708755e-05, "loss": 3.1228, "step": 85850 }, { "epoch": 9.245506404046926, "grad_norm": 0.863751232624054, "learning_rate": 4.551233703264734e-05, "loss": 3.1394, "step": 85900 }, { "epoch": 9.250887956086535, "grad_norm": 0.8437860012054443, "learning_rate": 4.5189096002585924e-05, "loss": 3.1205, "step": 85950 }, { "epoch": 9.256269508126143, "grad_norm": 0.8293090462684631, "learning_rate": 4.486585497252451e-05, "loss": 3.1324, "step": 86000 }, { "epoch": 9.256269508126143, "eval_accuracy": 0.3932856426834612, "eval_loss": 3.3111629486083984, "eval_runtime": 189.3869, "eval_samples_per_second": 95.102, "eval_steps_per_second": 5.945, "step": 86000 }, { "epoch": 9.261651060165752, "grad_norm": 0.8659238815307617, "learning_rate": 4.4549078763064325e-05, "loss": 3.1371, "step": 86050 }, { "epoch": 9.26703261220536, "grad_norm": 
0.8517236113548279, "learning_rate": 4.4225837733002906e-05, "loss": 3.1217, "step": 86100 }, { "epoch": 9.272414164244967, "grad_norm": 0.8780714273452759, "learning_rate": 4.390259670294149e-05, "loss": 3.1419, "step": 86150 }, { "epoch": 9.277795716284576, "grad_norm": 0.8498876690864563, "learning_rate": 4.3579355672880075e-05, "loss": 3.128, "step": 86200 }, { "epoch": 9.283177268324184, "grad_norm": 0.8308305740356445, "learning_rate": 4.3256114642818656e-05, "loss": 3.1307, "step": 86250 }, { "epoch": 9.288558820363793, "grad_norm": 0.8509485125541687, "learning_rate": 4.2932873612757244e-05, "loss": 3.1287, "step": 86300 }, { "epoch": 9.293940372403402, "grad_norm": 0.8401972055435181, "learning_rate": 4.2609632582695825e-05, "loss": 3.1373, "step": 86350 }, { "epoch": 9.29932192444301, "grad_norm": 0.8404197096824646, "learning_rate": 4.2286391552634406e-05, "loss": 3.1289, "step": 86400 }, { "epoch": 9.304703476482617, "grad_norm": 0.8347073197364807, "learning_rate": 4.1963150522572994e-05, "loss": 3.1325, "step": 86450 }, { "epoch": 9.310085028522225, "grad_norm": 0.8774033188819885, "learning_rate": 4.1639909492511575e-05, "loss": 3.1243, "step": 86500 }, { "epoch": 9.315466580561834, "grad_norm": 0.9243816137313843, "learning_rate": 4.131666846245017e-05, "loss": 3.1181, "step": 86550 }, { "epoch": 9.320848132601443, "grad_norm": 0.8504797220230103, "learning_rate": 4.099342743238875e-05, "loss": 3.1397, "step": 86600 }, { "epoch": 9.326229684641051, "grad_norm": 0.8603664636611938, "learning_rate": 4.067018640232733e-05, "loss": 3.1403, "step": 86650 }, { "epoch": 9.331611236680658, "grad_norm": 0.8422338366508484, "learning_rate": 4.034694537226592e-05, "loss": 3.1305, "step": 86700 }, { "epoch": 9.336992788720266, "grad_norm": 0.8984904885292053, "learning_rate": 4.00237043422045e-05, "loss": 3.1268, "step": 86750 }, { "epoch": 9.342374340759875, "grad_norm": 0.8661045432090759, "learning_rate": 3.970046331214309e-05, "loss": 3.1122, "step": 86800 
}, { "epoch": 9.347755892799483, "grad_norm": 0.8606688380241394, "learning_rate": 3.937722228208167e-05, "loss": 3.1295, "step": 86850 }, { "epoch": 9.353137444839092, "grad_norm": 0.8438649773597717, "learning_rate": 3.905398125202025e-05, "loss": 3.1299, "step": 86900 }, { "epoch": 9.3585189968787, "grad_norm": 0.8938318490982056, "learning_rate": 3.873074022195884e-05, "loss": 3.1416, "step": 86950 }, { "epoch": 9.363900548918307, "grad_norm": 0.8666595816612244, "learning_rate": 3.840749919189742e-05, "loss": 3.1224, "step": 87000 }, { "epoch": 9.363900548918307, "eval_accuracy": 0.3935082724505931, "eval_loss": 3.309908151626587, "eval_runtime": 189.3766, "eval_samples_per_second": 95.107, "eval_steps_per_second": 5.946, "step": 87000 }, { "epoch": 9.369282100957916, "grad_norm": 0.8648133873939514, "learning_rate": 3.808425816183601e-05, "loss": 3.1365, "step": 87050 }, { "epoch": 9.374663652997524, "grad_norm": 0.8668041825294495, "learning_rate": 3.776101713177459e-05, "loss": 3.1471, "step": 87100 }, { "epoch": 9.380045205037133, "grad_norm": 0.8217251300811768, "learning_rate": 3.743777610171318e-05, "loss": 3.131, "step": 87150 }, { "epoch": 9.385426757076742, "grad_norm": 0.8287473320960999, "learning_rate": 3.711453507165176e-05, "loss": 3.1532, "step": 87200 }, { "epoch": 9.390808309116348, "grad_norm": 0.8670193552970886, "learning_rate": 3.679129404159034e-05, "loss": 3.1399, "step": 87250 }, { "epoch": 9.396189861155957, "grad_norm": 0.8458430171012878, "learning_rate": 3.646805301152893e-05, "loss": 3.1549, "step": 87300 }, { "epoch": 9.401571413195565, "grad_norm": 0.8716211318969727, "learning_rate": 3.614481198146751e-05, "loss": 3.1314, "step": 87350 }, { "epoch": 9.406952965235174, "grad_norm": 0.8429900407791138, "learning_rate": 3.5821570951406096e-05, "loss": 3.128, "step": 87400 }, { "epoch": 9.412334517274783, "grad_norm": 0.8358698487281799, "learning_rate": 3.5498329921344684e-05, "loss": 3.1314, "step": 87450 }, { "epoch": 
9.417716069314391, "grad_norm": 0.8356906175613403, "learning_rate": 3.5175088891283265e-05, "loss": 3.1225, "step": 87500 }, { "epoch": 9.423097621353998, "grad_norm": 0.8311634063720703, "learning_rate": 3.4851847861221846e-05, "loss": 3.1125, "step": 87550 }, { "epoch": 9.428479173393606, "grad_norm": 0.8509854674339294, "learning_rate": 3.4528606831160434e-05, "loss": 3.1331, "step": 87600 }, { "epoch": 9.433860725433215, "grad_norm": 0.890250563621521, "learning_rate": 3.4205365801099015e-05, "loss": 3.1257, "step": 87650 }, { "epoch": 9.439242277472824, "grad_norm": 0.8621748089790344, "learning_rate": 3.38821247710376e-05, "loss": 3.1427, "step": 87700 }, { "epoch": 9.444623829512432, "grad_norm": 0.8697271347045898, "learning_rate": 3.3558883740976184e-05, "loss": 3.1607, "step": 87750 }, { "epoch": 9.450005381552039, "grad_norm": 0.8986278772354126, "learning_rate": 3.323564271091477e-05, "loss": 3.1247, "step": 87800 }, { "epoch": 9.455386933591647, "grad_norm": 0.8373309373855591, "learning_rate": 3.291240168085335e-05, "loss": 3.1418, "step": 87850 }, { "epoch": 9.460768485631256, "grad_norm": 0.8242829442024231, "learning_rate": 3.258916065079194e-05, "loss": 3.1379, "step": 87900 }, { "epoch": 9.466150037670864, "grad_norm": 0.848978579044342, "learning_rate": 3.226591962073052e-05, "loss": 3.1385, "step": 87950 }, { "epoch": 9.471531589710473, "grad_norm": 0.8514211773872375, "learning_rate": 3.19426785906691e-05, "loss": 3.1341, "step": 88000 }, { "epoch": 9.471531589710473, "eval_accuracy": 0.39374882994421434, "eval_loss": 3.307783603668213, "eval_runtime": 189.7402, "eval_samples_per_second": 94.925, "eval_steps_per_second": 5.934, "step": 88000 }, { "epoch": 9.476913141750082, "grad_norm": 0.8711098432540894, "learning_rate": 3.161943756060769e-05, "loss": 3.126, "step": 88050 }, { "epoch": 9.482294693789688, "grad_norm": 0.8469831943511963, "learning_rate": 3.129619653054627e-05, "loss": 3.1443, "step": 88100 }, { "epoch": 9.487676245829297, 
"grad_norm": 0.8410546183586121, "learning_rate": 3.097295550048486e-05, "loss": 3.1266, "step": 88150 }, { "epoch": 9.493057797868905, "grad_norm": 0.8234712481498718, "learning_rate": 3.064971447042345e-05, "loss": 3.1295, "step": 88200 }, { "epoch": 9.498439349908514, "grad_norm": 0.8497011661529541, "learning_rate": 3.0326473440362026e-05, "loss": 3.131, "step": 88250 }, { "epoch": 9.503820901948123, "grad_norm": 0.8871855735778809, "learning_rate": 3.000323241030061e-05, "loss": 3.1225, "step": 88300 }, { "epoch": 9.50920245398773, "grad_norm": 0.8837377429008484, "learning_rate": 2.9679991380239195e-05, "loss": 3.1358, "step": 88350 }, { "epoch": 9.514584006027338, "grad_norm": 0.8673506379127502, "learning_rate": 2.9356750350177782e-05, "loss": 3.1208, "step": 88400 }, { "epoch": 9.519965558066946, "grad_norm": 0.8431509733200073, "learning_rate": 2.9033509320116367e-05, "loss": 3.1308, "step": 88450 }, { "epoch": 9.525347110106555, "grad_norm": 0.8733773827552795, "learning_rate": 2.8710268290054948e-05, "loss": 3.1542, "step": 88500 }, { "epoch": 9.530728662146164, "grad_norm": 0.9029594659805298, "learning_rate": 2.8387027259993533e-05, "loss": 3.133, "step": 88550 }, { "epoch": 9.536110214185772, "grad_norm": 0.8783562183380127, "learning_rate": 2.8063786229932117e-05, "loss": 3.1165, "step": 88600 }, { "epoch": 9.541491766225379, "grad_norm": 0.8718871474266052, "learning_rate": 2.77405451998707e-05, "loss": 3.1176, "step": 88650 }, { "epoch": 9.546873318264987, "grad_norm": 0.8462399244308472, "learning_rate": 2.7417304169809286e-05, "loss": 3.1366, "step": 88700 }, { "epoch": 9.552254870304596, "grad_norm": 0.8613501191139221, "learning_rate": 2.7094063139747867e-05, "loss": 3.1189, "step": 88750 }, { "epoch": 9.557636422344205, "grad_norm": 0.8645268678665161, "learning_rate": 2.677082210968645e-05, "loss": 3.1303, "step": 88800 }, { "epoch": 9.563017974383813, "grad_norm": 0.861893892288208, "learning_rate": 2.644758107962504e-05, "loss": 3.1502, 
"step": 88850 }, { "epoch": 9.56839952642342, "grad_norm": 0.828042209148407, "learning_rate": 2.6124340049563624e-05, "loss": 3.1388, "step": 88900 }, { "epoch": 9.573781078463028, "grad_norm": 0.8182950019836426, "learning_rate": 2.580109901950221e-05, "loss": 3.1395, "step": 88950 }, { "epoch": 9.579162630502637, "grad_norm": 0.8487094640731812, "learning_rate": 2.547785798944079e-05, "loss": 3.1354, "step": 89000 }, { "epoch": 9.579162630502637, "eval_accuracy": 0.3941069498624509, "eval_loss": 3.3047800064086914, "eval_runtime": 189.7111, "eval_samples_per_second": 94.939, "eval_steps_per_second": 5.935, "step": 89000 }, { "epoch": 9.584544182542245, "grad_norm": 0.8931549787521362, "learning_rate": 2.5154616959379374e-05, "loss": 3.138, "step": 89050 }, { "epoch": 9.589925734581854, "grad_norm": 0.8639717102050781, "learning_rate": 2.483137592931796e-05, "loss": 3.15, "step": 89100 }, { "epoch": 9.59530728662146, "grad_norm": 0.8543230295181274, "learning_rate": 2.4508134899256543e-05, "loss": 3.1383, "step": 89150 }, { "epoch": 9.60068883866107, "grad_norm": 0.9166204929351807, "learning_rate": 2.418489386919513e-05, "loss": 3.1253, "step": 89200 }, { "epoch": 9.606070390700678, "grad_norm": 0.8401595950126648, "learning_rate": 2.386165283913371e-05, "loss": 3.1514, "step": 89250 }, { "epoch": 9.611451942740286, "grad_norm": 0.8688633441925049, "learning_rate": 2.3544876629673522e-05, "loss": 3.1363, "step": 89300 }, { "epoch": 9.616833494779895, "grad_norm": 0.8411018252372742, "learning_rate": 2.3221635599612106e-05, "loss": 3.1256, "step": 89350 }, { "epoch": 9.622215046819504, "grad_norm": 0.8895683288574219, "learning_rate": 2.2898394569550694e-05, "loss": 3.1329, "step": 89400 }, { "epoch": 9.62759659885911, "grad_norm": 0.8607434034347534, "learning_rate": 2.257515353948928e-05, "loss": 3.1494, "step": 89450 }, { "epoch": 9.632978150898719, "grad_norm": 0.8818168640136719, "learning_rate": 2.2251912509427863e-05, "loss": 3.1479, "step": 89500 }, { 
"epoch": 9.638359702938327, "grad_norm": 0.8703548312187195, "learning_rate": 2.1928671479366444e-05, "loss": 3.1207, "step": 89550 }, { "epoch": 9.643741254977936, "grad_norm": 0.8854098916053772, "learning_rate": 2.160543044930503e-05, "loss": 3.1324, "step": 89600 }, { "epoch": 9.649122807017545, "grad_norm": 0.8342804908752441, "learning_rate": 2.1282189419243613e-05, "loss": 3.1383, "step": 89650 }, { "epoch": 9.654504359057151, "grad_norm": 0.8617409467697144, "learning_rate": 2.0958948389182198e-05, "loss": 3.1366, "step": 89700 }, { "epoch": 9.65988591109676, "grad_norm": 0.8699505925178528, "learning_rate": 2.0635707359120786e-05, "loss": 3.1416, "step": 89750 }, { "epoch": 9.665267463136368, "grad_norm": 0.8576476573944092, "learning_rate": 2.0312466329059367e-05, "loss": 3.1251, "step": 89800 }, { "epoch": 9.670649015175977, "grad_norm": 0.848074197769165, "learning_rate": 1.998922529899795e-05, "loss": 3.1331, "step": 89850 }, { "epoch": 9.676030567215586, "grad_norm": 0.8670492768287659, "learning_rate": 1.9665984268936536e-05, "loss": 3.1474, "step": 89900 }, { "epoch": 9.681412119255192, "grad_norm": 0.8532721996307373, "learning_rate": 1.934274323887512e-05, "loss": 3.1403, "step": 89950 }, { "epoch": 9.6867936712948, "grad_norm": 0.8756169080734253, "learning_rate": 1.9019502208813705e-05, "loss": 3.1262, "step": 90000 }, { "epoch": 9.6867936712948, "eval_accuracy": 0.3943953146267105, "eval_loss": 3.3019161224365234, "eval_runtime": 189.5494, "eval_samples_per_second": 95.02, "eval_steps_per_second": 5.94, "step": 90000 }, { "epoch": 9.69217522333441, "grad_norm": 0.8606821894645691, "learning_rate": 1.869626117875229e-05, "loss": 3.1353, "step": 90050 }, { "epoch": 9.697556775374018, "grad_norm": 0.8910887241363525, "learning_rate": 1.837302014869087e-05, "loss": 3.1403, "step": 90100 }, { "epoch": 9.702938327413626, "grad_norm": 0.9061274528503418, "learning_rate": 1.8049779118629455e-05, "loss": 3.14, "step": 90150 }, { "epoch": 
9.708319879453235, "grad_norm": 0.8429542183876038, "learning_rate": 1.7726538088568043e-05, "loss": 3.1161, "step": 90200 }, { "epoch": 9.713701431492842, "grad_norm": 0.8305377960205078, "learning_rate": 1.7403297058506624e-05, "loss": 3.1368, "step": 90250 }, { "epoch": 9.71908298353245, "grad_norm": 0.837834894657135, "learning_rate": 1.708005602844521e-05, "loss": 3.1374, "step": 90300 }, { "epoch": 9.724464535572059, "grad_norm": 0.852417528629303, "learning_rate": 1.6756814998383793e-05, "loss": 3.1363, "step": 90350 }, { "epoch": 9.729846087611667, "grad_norm": 0.87115079164505, "learning_rate": 1.6433573968322377e-05, "loss": 3.1364, "step": 90400 }, { "epoch": 9.735227639651276, "grad_norm": 0.8634170293807983, "learning_rate": 1.6110332938260962e-05, "loss": 3.1237, "step": 90450 }, { "epoch": 9.740609191690883, "grad_norm": 0.871769905090332, "learning_rate": 1.5787091908199546e-05, "loss": 3.156, "step": 90500 }, { "epoch": 9.745990743730491, "grad_norm": 0.8181995749473572, "learning_rate": 1.546385087813813e-05, "loss": 3.1309, "step": 90550 }, { "epoch": 9.7513722957701, "grad_norm": 0.8241446614265442, "learning_rate": 1.5140609848076714e-05, "loss": 3.1164, "step": 90600 }, { "epoch": 9.756753847809708, "grad_norm": 0.8035761117935181, "learning_rate": 1.4817368818015298e-05, "loss": 3.1295, "step": 90650 }, { "epoch": 9.762135399849317, "grad_norm": 0.8078320622444153, "learning_rate": 1.4494127787953884e-05, "loss": 3.1354, "step": 90700 }, { "epoch": 9.767516951888926, "grad_norm": 0.9000764489173889, "learning_rate": 1.4170886757892467e-05, "loss": 3.1242, "step": 90750 }, { "epoch": 9.772898503928532, "grad_norm": 0.8568733334541321, "learning_rate": 1.3847645727831052e-05, "loss": 3.117, "step": 90800 }, { "epoch": 9.77828005596814, "grad_norm": 0.8243940472602844, "learning_rate": 1.3524404697769635e-05, "loss": 3.1242, "step": 90850 }, { "epoch": 9.78366160800775, "grad_norm": 0.8713697195053101, "learning_rate": 1.320116366770822e-05, 
"loss": 3.13, "step": 90900 }, { "epoch": 9.789043160047358, "grad_norm": 0.837845504283905, "learning_rate": 1.2877922637646805e-05, "loss": 3.1421, "step": 90950 }, { "epoch": 9.794424712086967, "grad_norm": 0.8562050461769104, "learning_rate": 1.2554681607585388e-05, "loss": 3.129, "step": 91000 }, { "epoch": 9.794424712086967, "eval_accuracy": 0.3945871956265909, "eval_loss": 3.300642490386963, "eval_runtime": 190.2591, "eval_samples_per_second": 94.666, "eval_steps_per_second": 5.918, "step": 91000 }, { "epoch": 9.799806264126573, "grad_norm": 0.8212922811508179, "learning_rate": 1.2231440577523972e-05, "loss": 3.1433, "step": 91050 }, { "epoch": 9.805187816166182, "grad_norm": 0.9242744445800781, "learning_rate": 1.1908199547462555e-05, "loss": 3.1234, "step": 91100 }, { "epoch": 9.81056936820579, "grad_norm": 0.8359904289245605, "learning_rate": 1.1584958517401141e-05, "loss": 3.1384, "step": 91150 }, { "epoch": 9.815950920245399, "grad_norm": 0.8217136263847351, "learning_rate": 1.1261717487339726e-05, "loss": 3.1144, "step": 91200 }, { "epoch": 9.821332472285007, "grad_norm": 0.8538930416107178, "learning_rate": 1.0938476457278309e-05, "loss": 3.1489, "step": 91250 }, { "epoch": 9.826714024324616, "grad_norm": 0.8195740580558777, "learning_rate": 1.0615235427216895e-05, "loss": 3.1369, "step": 91300 }, { "epoch": 9.832095576364223, "grad_norm": 0.8804702162742615, "learning_rate": 1.0291994397155478e-05, "loss": 3.1469, "step": 91350 }, { "epoch": 9.837477128403831, "grad_norm": 0.8412031531333923, "learning_rate": 9.968753367094062e-06, "loss": 3.1415, "step": 91400 }, { "epoch": 9.84285868044344, "grad_norm": 0.8647484183311462, "learning_rate": 9.645512337032647e-06, "loss": 3.1224, "step": 91450 }, { "epoch": 9.848240232483048, "grad_norm": 0.8878872990608215, "learning_rate": 9.32227130697123e-06, "loss": 3.1219, "step": 91500 }, { "epoch": 9.853621784522657, "grad_norm": 0.839019775390625, "learning_rate": 8.999030276909816e-06, "loss": 3.1497, 
"step": 91550 }, { "epoch": 9.859003336562264, "grad_norm": 0.8272830247879028, "learning_rate": 8.675789246848399e-06, "loss": 3.1342, "step": 91600 }, { "epoch": 9.864384888601872, "grad_norm": 0.8204966187477112, "learning_rate": 8.352548216786983e-06, "loss": 3.1181, "step": 91650 }, { "epoch": 9.869766440641481, "grad_norm": 0.8158440589904785, "learning_rate": 8.029307186725568e-06, "loss": 3.1251, "step": 91700 }, { "epoch": 9.87514799268109, "grad_norm": 0.9214332103729248, "learning_rate": 7.706066156664152e-06, "loss": 3.1234, "step": 91750 }, { "epoch": 9.880529544720698, "grad_norm": 0.8618704676628113, "learning_rate": 7.3828251266027365e-06, "loss": 3.1344, "step": 91800 }, { "epoch": 9.885911096760307, "grad_norm": 0.8866255879402161, "learning_rate": 7.05958409654132e-06, "loss": 3.1481, "step": 91850 }, { "epoch": 9.891292648799913, "grad_norm": 0.8363102674484253, "learning_rate": 6.736343066479905e-06, "loss": 3.1315, "step": 91900 }, { "epoch": 9.896674200839522, "grad_norm": 0.8203515410423279, "learning_rate": 6.413102036418488e-06, "loss": 3.1338, "step": 91950 }, { "epoch": 9.90205575287913, "grad_norm": 0.8286318778991699, "learning_rate": 6.089861006357073e-06, "loss": 3.1392, "step": 92000 }, { "epoch": 9.90205575287913, "eval_accuracy": 0.3947092128196065, "eval_loss": 3.29917311668396, "eval_runtime": 189.5412, "eval_samples_per_second": 95.024, "eval_steps_per_second": 5.941, "step": 92000 }, { "epoch": 9.907437304918739, "grad_norm": 0.8603388071060181, "learning_rate": 5.766619976295657e-06, "loss": 3.1372, "step": 92050 }, { "epoch": 9.912818856958348, "grad_norm": 0.9106926321983337, "learning_rate": 5.443378946234242e-06, "loss": 3.1309, "step": 92100 }, { "epoch": 9.918200408997954, "grad_norm": 0.8053128123283386, "learning_rate": 5.1201379161728254e-06, "loss": 3.1273, "step": 92150 }, { "epoch": 9.923581961037563, "grad_norm": 0.8366634249687195, "learning_rate": 4.79689688611141e-06, "loss": 3.1282, "step": 92200 }, { 
"epoch": 9.928963513077171, "grad_norm": 0.8284056782722473, "learning_rate": 4.4736558560499944e-06, "loss": 3.141, "step": 92250 }, { "epoch": 9.93434506511678, "grad_norm": 0.8485735058784485, "learning_rate": 4.150414825988578e-06, "loss": 3.1262, "step": 92300 }, { "epoch": 9.939726617156388, "grad_norm": 0.859318196773529, "learning_rate": 3.8271737959271626e-06, "loss": 3.1151, "step": 92350 }, { "epoch": 9.945108169195997, "grad_norm": 0.8240682482719421, "learning_rate": 3.5039327658657466e-06, "loss": 3.1192, "step": 92400 }, { "epoch": 9.950489721235604, "grad_norm": 0.850426971912384, "learning_rate": 3.180691735804331e-06, "loss": 3.1372, "step": 92450 }, { "epoch": 9.955871273275212, "grad_norm": 0.8800275921821594, "learning_rate": 2.857450705742915e-06, "loss": 3.1292, "step": 92500 }, { "epoch": 9.961252825314821, "grad_norm": 0.8482245802879333, "learning_rate": 2.5342096756814993e-06, "loss": 3.141, "step": 92550 }, { "epoch": 9.96663437735443, "grad_norm": 0.8627044558525085, "learning_rate": 2.2109686456200838e-06, "loss": 3.1262, "step": 92600 }, { "epoch": 9.972015929394038, "grad_norm": 0.9186998009681702, "learning_rate": 1.8877276155586683e-06, "loss": 3.1342, "step": 92650 }, { "epoch": 9.977397481433645, "grad_norm": 0.9178165197372437, "learning_rate": 1.5644865854972521e-06, "loss": 3.1263, "step": 92700 }, { "epoch": 9.982779033473253, "grad_norm": 0.8731764554977417, "learning_rate": 1.2412455554358364e-06, "loss": 3.1389, "step": 92750 }, { "epoch": 9.988160585512862, "grad_norm": 0.8474687337875366, "learning_rate": 9.180045253744208e-07, "loss": 3.1368, "step": 92800 }, { "epoch": 9.99354213755247, "grad_norm": 0.8867104649543762, "learning_rate": 5.947634953130051e-07, "loss": 3.153, "step": 92850 }, { "epoch": 9.998923689592079, "grad_norm": 0.8560895919799805, "learning_rate": 2.7152246525158925e-07, "loss": 3.1305, "step": 92900 }, { "epoch": 10.0, "step": 92910, "total_flos": 7.76821211136e+17, "train_loss": 
3.4556877505752324, "train_runtime": 82552.4271, "train_samples_per_second": 36.013, "train_steps_per_second": 1.125 } ], "logging_steps": 50, "max_steps": 92910, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.76821211136e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }