diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13876 @@ +{ + "best_metric": 3.3019161224365234, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_8397/checkpoint-90000", + "epoch": 10.0, + "eval_steps": 1000, + "global_step": 92910, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005381552039608223, + "grad_norm": 1.3847299814224243, + "learning_rate": 0.0003, + "loss": 8.4655, + "step": 50 + }, + { + "epoch": 0.010763104079216447, + "grad_norm": 1.0632425546646118, + "learning_rate": 0.0006, + "loss": 6.8249, + "step": 100 + }, + { + "epoch": 0.01614465611882467, + "grad_norm": 2.088347911834717, + "learning_rate": 0.0005996767589699385, + "loss": 6.4221, + "step": 150 + }, + { + "epoch": 0.021526208158432893, + "grad_norm": 1.0710954666137695, + "learning_rate": 0.0005993535179398771, + "loss": 6.1909, + "step": 200 + }, + { + "epoch": 0.026907760198041114, + "grad_norm": 1.5236896276474, + "learning_rate": 0.0005990302769098158, + "loss": 6.0442, + "step": 250 + }, + { + "epoch": 0.03228931223764934, + "grad_norm": 1.8247500658035278, + "learning_rate": 0.0005987070358797543, + "loss": 5.9513, + "step": 300 + }, + { + "epoch": 0.03767086427725756, + "grad_norm": 1.2985302209854126, + "learning_rate": 0.0005983837948496929, + "loss": 5.8643, + "step": 350 + }, + { + "epoch": 0.04305241631686579, + "grad_norm": 1.256401538848877, + "learning_rate": 0.0005980605538196314, + "loss": 5.8018, + "step": 400 + }, + { + "epoch": 0.048433968356474004, + "grad_norm": 0.8569295406341553, + "learning_rate": 0.0005977373127895701, + "loss": 5.7053, + "step": 450 + }, + { + "epoch": 0.05381552039608223, + "grad_norm": 0.860461413860321, + "learning_rate": 0.0005974140717595086, + "loss": 5.6413, + "step": 500 + }, + { + "epoch": 0.05919707243569045, + "grad_norm": 1.6800360679626465, + "learning_rate": 0.0005970908307294472, + "loss": 5.5789, + "step": 550 + }, + { + "epoch": 0.06457862447529868, + "grad_norm": 1.408766508102417, + "learning_rate": 0.0005967675896993858, + "loss": 5.4957, + "step": 600 + }, + { + "epoch": 0.0699601765149069, + "grad_norm": 1.263355016708374, + "learning_rate": 0.0005964443486693243, + "loss": 5.421, + "step": 650 + }, + { + "epoch": 0.07534172855451512, + "grad_norm": 1.617833137512207, + "learning_rate": 0.000596121107639263, + "loss": 5.334, + "step": 700 + }, + { + "epoch": 0.08072328059412334, + "grad_norm": 1.2163816690444946, + "learning_rate": 0.0005957978666092015, + "loss": 5.2925, + "step": 750 + }, + { + "epoch": 0.08610483263373157, + "grad_norm": 1.4054591655731201, + "learning_rate": 0.0005954746255791401, + "loss": 5.222, + "step": 800 + }, + { + "epoch": 0.09148638467333979, + "grad_norm": 1.0674018859863281, + "learning_rate": 0.0005951513845490787, + "loss": 5.2071, + "step": 850 + }, + { + "epoch": 0.09686793671294801, + "grad_norm": 1.4989951848983765, + "learning_rate": 0.0005948281435190174, + "loss": 5.1526, + "step": 900 + }, + { + "epoch": 0.10224948875255624, + "grad_norm": 1.028799057006836, + "learning_rate": 0.0005945049024889559, + "loss": 5.1098, + "step": 950 + }, + { + "epoch": 0.10763104079216446, + "grad_norm": 0.955634355545044, + "learning_rate": 0.0005941816614588944, + "loss": 5.0633, + "step": 1000 + }, + { + "epoch": 0.10763104079216446, + "eval_accuracy": 0.22915491630956988, + "eval_loss": 4.996466636657715, + "eval_runtime": 202.5064, + "eval_samples_per_second": 88.94, + "eval_steps_per_second": 5.56, + "step": 1000 + }, + { + "epoch": 0.11301259283177269, + "grad_norm": 1.2216582298278809, + "learning_rate": 0.000593858420428833, + "loss": 5.0219, + "step": 1050 + }, + { + "epoch": 0.1183941448713809, + "grad_norm": 1.2919893264770508, + "learning_rate": 0.0005935351793987716, + "loss": 4.9962, + "step": 1100 + }, + { + "epoch": 0.12377569691098914, + "grad_norm": 1.842529058456421, + "learning_rate": 0.0005932119383687103, + "loss": 4.9736, + "step": 1150 + }, + { + "epoch": 0.12915724895059735, + "grad_norm": 0.9416630268096924, + "learning_rate": 0.0005928886973386488, + "loss": 4.9687, + "step": 1200 + }, + { + "epoch": 0.13453880099020557, + "grad_norm": 1.340468406677246, + "learning_rate": 0.0005925654563085874, + "loss": 4.9251, + "step": 1250 + }, + { + "epoch": 0.1399203530298138, + "grad_norm": 1.4057867527008057, + "learning_rate": 0.000592242215278526, + "loss": 4.8782, + "step": 1300 + }, + { + "epoch": 0.14530190506942203, + "grad_norm": 1.3462789058685303, + "learning_rate": 0.0005919189742484645, + "loss": 4.8431, + "step": 1350 + }, + { + "epoch": 0.15068345710903025, + "grad_norm": 0.9291279911994934, + "learning_rate": 0.0005915957332184032, + "loss": 4.8316, + "step": 1400 + }, + { + "epoch": 0.15606500914863847, + "grad_norm": 0.7789126634597778, + "learning_rate": 0.0005912724921883417, + "loss": 4.8028, + "step": 1450 + }, + { + "epoch": 0.16144656118824668, + "grad_norm": 1.2111430168151855, + "learning_rate": 0.0005909492511582803, + "loss": 4.7837, + "step": 1500 + }, + { + "epoch": 0.1668281132278549, + "grad_norm": 1.1703121662139893, + "learning_rate": 0.0005906260101282189, + "loss": 4.7456, + "step": 1550 + }, + { + "epoch": 0.17220966526746315, + "grad_norm": 0.9755436182022095, + "learning_rate": 0.0005903027690981575, + "loss": 4.7303, + "step": 1600 + }, + { + "epoch": 0.17759121730707136, + "grad_norm": 0.9121830463409424, + "learning_rate": 0.000589979528068096, + "loss": 4.7437, + "step": 1650 + }, + { + "epoch": 0.18297276934667958, + "grad_norm": 1.2337779998779297, + "learning_rate": 0.0005896562870380347, + "loss": 4.6991, + "step": 1700 + }, + { + "epoch": 0.1883543213862878, + "grad_norm": 0.7923302054405212, + "learning_rate": 0.0005893330460079732, + "loss": 4.6819, + "step": 1750 + }, + { + "epoch": 0.19373587342589602, + "grad_norm": 0.9414947032928467, + "learning_rate": 0.0005890098049779118, + "loss": 4.6401, + "step": 1800 + }, + { + "epoch": 0.19911742546550426, + "grad_norm": 0.8575401306152344, + "learning_rate": 0.0005886865639478504, + "loss": 4.6503, + "step": 1850 + }, + { + "epoch": 0.20449897750511248, + "grad_norm": 0.874113917350769, + "learning_rate": 0.0005883633229177889, + "loss": 4.6262, + "step": 1900 + }, + { + "epoch": 0.2098805295447207, + "grad_norm": 0.7579442262649536, + "learning_rate": 0.0005880400818877276, + "loss": 4.5946, + "step": 1950 + }, + { + "epoch": 0.2152620815843289, + "grad_norm": 0.8062272071838379, + "learning_rate": 0.0005877168408576662, + "loss": 4.579, + "step": 2000 + }, + { + "epoch": 0.2152620815843289, + "eval_accuracy": 0.2712283567521957, + "eval_loss": 4.50425910949707, + "eval_runtime": 211.1194, + "eval_samples_per_second": 85.312, + "eval_steps_per_second": 5.333, + "step": 2000 + }, + { + "epoch": 0.22064363362393713, + "grad_norm": 1.1572167873382568, + "learning_rate": 0.0005873935998276048, + "loss": 4.5564, + "step": 2050 + }, + { + "epoch": 0.22602518566354537, + "grad_norm": 1.1494579315185547, + "learning_rate": 0.0005870703587975433, + "loss": 4.5425, + "step": 2100 + }, + { + "epoch": 0.2314067377031536, + "grad_norm": 0.9105241298675537, + "learning_rate": 0.0005867471177674818, + "loss": 4.5083, + "step": 2150 + }, + { + "epoch": 0.2367882897427618, + "grad_norm": 0.9526411890983582, + "learning_rate": 0.0005864238767374205, + "loss": 4.5128, + "step": 2200 + }, + { + "epoch": 0.24216984178237003, + "grad_norm": 0.7726457715034485, + "learning_rate": 0.0005861006357073591, + "loss": 4.4899, + "step": 2250 + }, + { + "epoch": 0.24755139382197827, + "grad_norm": 1.0442702770233154, + "learning_rate": 0.0005857773946772977, + "loss": 4.4716, + "step": 2300 + }, + { + "epoch": 0.2529329458615865, + "grad_norm": 1.1448233127593994, + "learning_rate": 0.0005854541536472362, + "loss": 4.4607, + "step": 2350 + }, + { + "epoch": 0.2583144979011947, + "grad_norm": 0.9515467286109924, + "learning_rate": 0.0005851309126171749, + "loss": 4.4544, + "step": 2400 + }, + { + "epoch": 0.2636960499408029, + "grad_norm": 0.7929104566574097, + "learning_rate": 0.0005848076715871134, + "loss": 4.4392, + "step": 2450 + }, + { + "epoch": 0.26907760198041114, + "grad_norm": 1.1213116645812988, + "learning_rate": 0.000584484430557052, + "loss": 4.4407, + "step": 2500 + }, + { + "epoch": 0.27445915402001936, + "grad_norm": 1.006108045578003, + "learning_rate": 0.0005841611895269906, + "loss": 4.3997, + "step": 2550 + }, + { + "epoch": 0.2798407060596276, + "grad_norm": 0.7756773829460144, + "learning_rate": 0.0005838379484969291, + "loss": 4.3982, + "step": 2600 + }, + { + "epoch": 0.2852222580992358, + "grad_norm": 0.7442255020141602, + "learning_rate": 0.0005835147074668678, + "loss": 4.3751, + "step": 2650 + }, + { + "epoch": 0.29060381013884407, + "grad_norm": 0.7645350694656372, + "learning_rate": 0.0005831914664368063, + "loss": 4.3873, + "step": 2700 + }, + { + "epoch": 0.2959853621784523, + "grad_norm": 0.7028666734695435, + "learning_rate": 0.0005828682254067449, + "loss": 4.3659, + "step": 2750 + }, + { + "epoch": 0.3013669142180605, + "grad_norm": 0.8504071831703186, + "learning_rate": 0.0005825449843766835, + "loss": 4.3617, + "step": 2800 + }, + { + "epoch": 0.3067484662576687, + "grad_norm": 0.880102276802063, + "learning_rate": 0.0005822217433466221, + "loss": 4.3115, + "step": 2850 + }, + { + "epoch": 0.31213001829727693, + "grad_norm": 0.7290977835655212, + "learning_rate": 0.0005818985023165607, + "loss": 4.3215, + "step": 2900 + }, + { + "epoch": 0.31751157033688515, + "grad_norm": 0.8019910454750061, + "learning_rate": 0.0005815752612864992, + "loss": 4.3149, + "step": 2950 + }, + { + "epoch": 0.32289312237649337, + "grad_norm": 0.7739085555076599, + "learning_rate": 0.0005812520202564378, + "loss": 4.322, + "step": 3000 + }, + { + "epoch": 0.32289312237649337, + "eval_accuracy": 0.2992396144952079, + "eval_loss": 4.226711750030518, + "eval_runtime": 199.2176, + "eval_samples_per_second": 90.409, + "eval_steps_per_second": 5.652, + "step": 3000 + }, + { + "epoch": 0.3282746744161016, + "grad_norm": 1.0869840383529663, + "learning_rate": 0.0005809287792263764, + "loss": 4.3072, + "step": 3050 + }, + { + "epoch": 0.3336562264557098, + "grad_norm": 0.7337630391120911, + "learning_rate": 0.0005806055381963151, + "loss": 4.2759, + "step": 3100 + }, + { + "epoch": 0.3390377784953181, + "grad_norm": 0.7336916923522949, + "learning_rate": 0.0005802822971662536, + "loss": 4.2838, + "step": 3150 + }, + { + "epoch": 0.3444193305349263, + "grad_norm": 0.7901497483253479, + "learning_rate": 0.0005799590561361922, + "loss": 4.2757, + "step": 3200 + }, + { + "epoch": 0.3498008825745345, + "grad_norm": 0.9568474888801575, + "learning_rate": 0.0005796358151061307, + "loss": 4.2574, + "step": 3250 + }, + { + "epoch": 0.35518243461414273, + "grad_norm": 0.9219122529029846, + "learning_rate": 0.0005793125740760694, + "loss": 4.2423, + "step": 3300 + }, + { + "epoch": 0.36056398665375095, + "grad_norm": 0.6552030444145203, + "learning_rate": 0.0005789893330460079, + "loss": 4.2339, + "step": 3350 + }, + { + "epoch": 0.36594553869335916, + "grad_norm": 0.6318998336791992, + "learning_rate": 0.0005786660920159465, + "loss": 4.2253, + "step": 3400 + }, + { + "epoch": 0.3713270907329674, + "grad_norm": 0.7610264420509338, + "learning_rate": 0.0005783428509858851, + "loss": 4.2369, + "step": 3450 + }, + { + "epoch": 0.3767086427725756, + "grad_norm": 0.6418978571891785, + "learning_rate": 0.0005780196099558237, + "loss": 4.2239, + "step": 3500 + }, + { + "epoch": 0.3820901948121838, + "grad_norm": 0.7791884541511536, + "learning_rate": 0.0005776963689257623, + "loss": 4.2253, + "step": 3550 + }, + { + "epoch": 0.38747174685179203, + "grad_norm": 0.7014104127883911, + "learning_rate": 0.0005773731278957008, + "loss": 4.1888, + "step": 3600 + }, + { + "epoch": 0.3928532988914003, + "grad_norm": 0.7691745758056641, + "learning_rate": 0.0005770498868656394, + "loss": 4.2055, + "step": 3650 + }, + { + "epoch": 0.3982348509310085, + "grad_norm": 0.7330165505409241, + "learning_rate": 0.000576726645835578, + "loss": 4.1917, + "step": 3700 + }, + { + "epoch": 0.40361640297061674, + "grad_norm": 0.6409156918525696, + "learning_rate": 0.0005764034048055167, + "loss": 4.1739, + "step": 3750 + }, + { + "epoch": 0.40899795501022496, + "grad_norm": 0.751600444316864, + "learning_rate": 0.0005760801637754552, + "loss": 4.1815, + "step": 3800 + }, + { + "epoch": 0.4143795070498332, + "grad_norm": 0.6341384649276733, + "learning_rate": 0.0005757569227453937, + "loss": 4.1752, + "step": 3850 + }, + { + "epoch": 0.4197610590894414, + "grad_norm": 0.8716601729393005, + "learning_rate": 0.0005754336817153324, + "loss": 4.1772, + "step": 3900 + }, + { + "epoch": 0.4251426111290496, + "grad_norm": 0.7268335223197937, + "learning_rate": 0.0005751104406852709, + "loss": 4.1579, + "step": 3950 + }, + { + "epoch": 0.4305241631686578, + "grad_norm": 0.7490660548210144, + "learning_rate": 0.0005747871996552096, + "loss": 4.1439, + "step": 4000 + }, + { + "epoch": 0.4305241631686578, + "eval_accuracy": 0.31218462820231296, + "eval_loss": 4.092499256134033, + "eval_runtime": 199.4467, + "eval_samples_per_second": 90.305, + "eval_steps_per_second": 5.646, + "step": 4000 + }, + { + "epoch": 0.43590571520826604, + "grad_norm": 0.9424974918365479, + "learning_rate": 0.0005744639586251481, + "loss": 4.1592, + "step": 4050 + }, + { + "epoch": 0.44128726724787426, + "grad_norm": 0.771391749382019, + "learning_rate": 0.0005741407175950867, + "loss": 4.1455, + "step": 4100 + }, + { + "epoch": 0.44666881928748253, + "grad_norm": 0.7916195392608643, + "learning_rate": 0.0005738174765650253, + "loss": 4.1329, + "step": 4150 + }, + { + "epoch": 0.45205037132709075, + "grad_norm": 0.7015873789787292, + "learning_rate": 0.0005734942355349638, + "loss": 4.1338, + "step": 4200 + }, + { + "epoch": 0.45743192336669897, + "grad_norm": 0.62185138463974, + "learning_rate": 0.0005731709945049025, + "loss": 4.1517, + "step": 4250 + }, + { + "epoch": 0.4628134754063072, + "grad_norm": 0.6499165296554565, + "learning_rate": 0.000572847753474841, + "loss": 4.1249, + "step": 4300 + }, + { + "epoch": 0.4681950274459154, + "grad_norm": 0.8142028450965881, + "learning_rate": 0.0005725245124447796, + "loss": 4.1094, + "step": 4350 + }, + { + "epoch": 0.4735765794855236, + "grad_norm": 0.686265230178833, + "learning_rate": 0.0005722012714147182, + "loss": 4.1156, + "step": 4400 + }, + { + "epoch": 0.47895813152513184, + "grad_norm": 0.6861261129379272, + "learning_rate": 0.0005718780303846568, + "loss": 4.0911, + "step": 4450 + }, + { + "epoch": 0.48433968356474005, + "grad_norm": 0.717899739742279, + "learning_rate": 0.0005715547893545953, + "loss": 4.0984, + "step": 4500 + }, + { + "epoch": 0.48972123560434827, + "grad_norm": 0.635183572769165, + "learning_rate": 0.000571231548324534, + "loss": 4.0915, + "step": 4550 + }, + { + "epoch": 0.49510278764395654, + "grad_norm": 0.6602552533149719, + "learning_rate": 0.0005709083072944725, + "loss": 4.0886, + "step": 4600 + }, + { + "epoch": 0.5004843396835648, + "grad_norm": 0.7243141531944275, + "learning_rate": 0.0005705850662644111, + "loss": 4.0928, + "step": 4650 + }, + { + "epoch": 0.505865891723173, + "grad_norm": 0.6310231685638428, + "learning_rate": 0.0005702618252343497, + "loss": 4.0986, + "step": 4700 + }, + { + "epoch": 0.5112474437627812, + "grad_norm": 0.5868064761161804, + "learning_rate": 0.0005699385842042882, + "loss": 4.0687, + "step": 4750 + }, + { + "epoch": 0.5166289958023894, + "grad_norm": 0.6389384269714355, + "learning_rate": 0.0005696153431742269, + "loss": 4.0789, + "step": 4800 + }, + { + "epoch": 0.5220105478419976, + "grad_norm": 0.7776073217391968, + "learning_rate": 0.0005692921021441655, + "loss": 4.0977, + "step": 4850 + }, + { + "epoch": 0.5273920998816058, + "grad_norm": 0.6234896779060364, + "learning_rate": 0.0005689688611141041, + "loss": 4.0547, + "step": 4900 + }, + { + "epoch": 0.5327736519212141, + "grad_norm": 0.8750380277633667, + "learning_rate": 0.0005686456200840426, + "loss": 4.0718, + "step": 4950 + }, + { + "epoch": 0.5381552039608223, + "grad_norm": 0.7700529098510742, + "learning_rate": 0.0005683223790539811, + "loss": 4.043, + "step": 5000 + }, + { + "epoch": 0.5381552039608223, + "eval_accuracy": 0.3214325100957547, + "eval_loss": 3.9926369190216064, + "eval_runtime": 225.5059, + "eval_samples_per_second": 79.869, + "eval_steps_per_second": 4.993, + "step": 5000 + }, + { + "epoch": 0.5435367560004305, + "grad_norm": 0.7643857002258301, + "learning_rate": 0.0005679991380239198, + "loss": 4.0605, + "step": 5050 + }, + { + "epoch": 0.5489183080400387, + "grad_norm": 0.554442286491394, + "learning_rate": 0.0005676758969938584, + "loss": 4.0446, + "step": 5100 + }, + { + "epoch": 0.5542998600796469, + "grad_norm": 0.7216308116912842, + "learning_rate": 0.000567352655963797, + "loss": 4.0313, + "step": 5150 + }, + { + "epoch": 0.5596814121192552, + "grad_norm": 0.6057455539703369, + "learning_rate": 0.0005670294149337355, + "loss": 4.0337, + "step": 5200 + }, + { + "epoch": 0.5650629641588634, + "grad_norm": 0.535860002040863, + "learning_rate": 0.0005667061739036742, + "loss": 4.0392, + "step": 5250 + }, + { + "epoch": 0.5704445161984716, + "grad_norm": 0.647204577922821, + "learning_rate": 0.0005663829328736127, + "loss": 4.0421, + "step": 5300 + }, + { + "epoch": 0.5758260682380799, + "grad_norm": 0.5126392841339111, + "learning_rate": 0.0005660596918435512, + "loss": 4.0318, + "step": 5350 + }, + { + "epoch": 0.5812076202776881, + "grad_norm": 0.6040687561035156, + "learning_rate": 0.0005657364508134899, + "loss": 4.0117, + "step": 5400 + }, + { + "epoch": 0.5865891723172963, + "grad_norm": 0.6971921324729919, + "learning_rate": 0.0005654132097834284, + "loss": 4.025, + "step": 5450 + }, + { + "epoch": 0.5919707243569046, + "grad_norm": 0.5343753099441528, + "learning_rate": 0.0005650899687533671, + "loss": 4.015, + "step": 5500 + }, + { + "epoch": 0.5973522763965128, + "grad_norm": 0.5600801706314087, + "learning_rate": 0.0005647667277233056, + "loss": 4.0087, + "step": 5550 + }, + { + "epoch": 0.602733828436121, + "grad_norm": 0.6542683243751526, + "learning_rate": 0.0005644434866932442, + "loss": 3.9957, + "step": 5600 + }, + { + "epoch": 0.6081153804757292, + "grad_norm": 0.6249606609344482, + "learning_rate": 0.0005641202456631828, + "loss": 4.0136, + "step": 5650 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 0.6572969555854797, + "learning_rate": 0.0005637970046331214, + "loss": 4.0046, + "step": 5700 + }, + { + "epoch": 0.6188784845549457, + "grad_norm": 0.6926669478416443, + "learning_rate": 0.00056347376360306, + "loss": 3.9869, + "step": 5750 + }, + { + "epoch": 0.6242600365945539, + "grad_norm": 0.6491366624832153, + "learning_rate": 0.0005631505225729985, + "loss": 4.0004, + "step": 5800 + }, + { + "epoch": 0.6296415886341621, + "grad_norm": 0.6138956546783447, + "learning_rate": 0.0005628272815429371, + "loss": 3.9814, + "step": 5850 + }, + { + "epoch": 0.6350231406737703, + "grad_norm": 0.6467788219451904, + "learning_rate": 0.0005625040405128757, + "loss": 3.9771, + "step": 5900 + }, + { + "epoch": 0.6404046927133785, + "grad_norm": 0.6469590067863464, + "learning_rate": 0.0005621807994828143, + "loss": 3.9882, + "step": 5950 + }, + { + "epoch": 0.6457862447529867, + "grad_norm": 0.7396846413612366, + "learning_rate": 0.0005618575584527529, + "loss": 3.9759, + "step": 6000 + }, + { + "epoch": 0.6457862447529867, + "eval_accuracy": 0.32833664054615025, + "eval_loss": 3.919649839401245, + "eval_runtime": 200.1239, + "eval_samples_per_second": 89.999, + "eval_steps_per_second": 5.627, + "step": 6000 + }, + { + "epoch": 0.651167796792595, + "grad_norm": 0.7368170619010925, + "learning_rate": 0.0005615343174226915, + "loss": 3.9789, + "step": 6050 + }, + { + "epoch": 0.6565493488322032, + "grad_norm": 0.593928873538971, + "learning_rate": 0.00056121107639263, + "loss": 3.962, + "step": 6100 + }, + { + "epoch": 0.6619309008718114, + "grad_norm": 0.5571395754814148, + "learning_rate": 0.0005608878353625687, + "loss": 3.9732, + "step": 6150 + }, + { + "epoch": 0.6673124529114196, + "grad_norm": 0.6184373497962952, + "learning_rate": 0.0005605645943325072, + "loss": 3.9618, + "step": 6200 + }, + { + "epoch": 0.6726940049510278, + "grad_norm": 0.6731172800064087, + "learning_rate": 0.0005602413533024458, + "loss": 3.9629, + "step": 6250 + }, + { + "epoch": 0.6780755569906362, + "grad_norm": Infinity, + "learning_rate": 0.0005599245770929855, + "loss": 3.9661, + "step": 6300 + }, + { + "epoch": 0.6834571090302444, + "grad_norm": 0.6674894690513611, + "learning_rate": 0.0005596013360629242, + "loss": 3.9598, + "step": 6350 + }, + { + "epoch": 0.6888386610698526, + "grad_norm": 0.6270789504051208, + "learning_rate": 0.0005592780950328628, + "loss": 3.9443, + "step": 6400 + }, + { + "epoch": 0.6942202131094608, + "grad_norm": 0.6163814067840576, + "learning_rate": 0.0005589548540028014, + "loss": 3.9506, + "step": 6450 + }, + { + "epoch": 0.699601765149069, + "grad_norm": 0.6060782670974731, + "learning_rate": 0.0005586316129727399, + "loss": 3.9645, + "step": 6500 + }, + { + "epoch": 0.7049833171886772, + "grad_norm": 0.6246522068977356, + "learning_rate": 0.0005583083719426786, + "loss": 3.9614, + "step": 6550 + }, + { + "epoch": 0.7103648692282855, + "grad_norm": 0.603921115398407, + "learning_rate": 0.0005579851309126171, + "loss": 3.94, + "step": 6600 + }, + { + "epoch": 0.7157464212678937, + "grad_norm": 0.5655505061149597, + "learning_rate": 0.0005576618898825558, + "loss": 3.9318, + "step": 6650 + }, + { + "epoch": 0.7211279733075019, + "grad_norm": 0.604542076587677, + "learning_rate": 0.0005573386488524943, + "loss": 3.9307, + "step": 6700 + }, + { + "epoch": 0.7265095253471101, + "grad_norm": 0.600004255771637, + "learning_rate": 0.0005570154078224328, + "loss": 3.9442, + "step": 6750 + }, + { + "epoch": 0.7318910773867183, + "grad_norm": 0.6960250735282898, + "learning_rate": 0.0005566921667923715, + "loss": 3.9425, + "step": 6800 + }, + { + "epoch": 0.7372726294263265, + "grad_norm": 0.5619109869003296, + "learning_rate": 0.00055636892576231, + "loss": 3.9073, + "step": 6850 + }, + { + "epoch": 0.7426541814659348, + "grad_norm": 0.6036113500595093, + "learning_rate": 0.0005560456847322487, + "loss": 3.9305, + "step": 6900 + }, + { + "epoch": 0.748035733505543, + "grad_norm": 0.6234127283096313, + "learning_rate": 0.0005557224437021872, + "loss": 3.9285, + "step": 6950 + }, + { + "epoch": 0.7534172855451512, + "grad_norm": 0.6131523251533508, + "learning_rate": 0.0005553992026721258, + "loss": 3.922, + "step": 7000 + }, + { + "epoch": 0.7534172855451512, + "eval_accuracy": 0.3334209437785411, + "eval_loss": 3.861107349395752, + "eval_runtime": 205.1306, + "eval_samples_per_second": 87.803, + "eval_steps_per_second": 5.489, + "step": 7000 + }, + { + "epoch": 0.7587988375847594, + "grad_norm": 0.6058487296104431, + "learning_rate": 0.0005550759616420644, + "loss": 3.9083, + "step": 7050 + }, + { + "epoch": 0.7641803896243676, + "grad_norm": 0.624620258808136, + "learning_rate": 0.000554752720612003, + "loss": 3.9349, + "step": 7100 + }, + { + "epoch": 0.7695619416639758, + "grad_norm": 0.5525732040405273, + "learning_rate": 0.0005544294795819415, + "loss": 3.9261, + "step": 7150 + }, + { + "epoch": 0.7749434937035841, + "grad_norm": 0.5950748324394226, + "learning_rate": 0.0005541062385518801, + "loss": 3.9067, + "step": 7200 + }, + { + "epoch": 0.7803250457431924, + "grad_norm": 0.5052813291549683, + "learning_rate": 0.0005537829975218188, + "loss": 3.9004, + "step": 7250 + }, + { + "epoch": 0.7857065977828006, + "grad_norm": 0.5610913038253784, + "learning_rate": 0.0005534662213123586, + "loss": 3.8911, + "step": 7300 + }, + { + "epoch": 0.7910881498224088, + "grad_norm": 0.5639758110046387, + "learning_rate": 0.0005531429802822971, + "loss": 3.931, + "step": 7350 + }, + { + "epoch": 0.796469701862017, + "grad_norm": 0.6221727728843689, + "learning_rate": 0.0005528197392522357, + "loss": 3.9288, + "step": 7400 + }, + { + "epoch": 0.8018512539016253, + "grad_norm": 0.5807225108146667, + "learning_rate": 0.0005524964982221743, + "loss": 3.9206, + "step": 7450 + }, + { + "epoch": 0.8072328059412335, + "grad_norm": 0.6214258074760437, + "learning_rate": 0.0005521732571921129, + "loss": 3.9216, + "step": 7500 + }, + { + "epoch": 0.8126143579808417, + "grad_norm": 0.6088549494743347, + "learning_rate": 0.0005518500161620514, + "loss": 3.9102, + "step": 7550 + }, + { + "epoch": 0.8179959100204499, + "grad_norm": 0.615315318107605, + "learning_rate": 0.00055152677513199, + "loss": 3.8932, + "step": 7600 + }, + { + "epoch": 0.8233774620600581, + "grad_norm": 0.5932325124740601, + "learning_rate": 0.0005512035341019286, + "loss": 3.9037, + "step": 7650 + }, + { + "epoch": 0.8287590140996663, + "grad_norm": 0.632127583026886, + "learning_rate": 0.0005508802930718672, + "loss": 3.9074, + "step": 7700 + }, + { + "epoch": 0.8341405661392746, + "grad_norm": 0.5906286239624023, + "learning_rate": 0.0005505570520418058, + "loss": 3.8969, + "step": 7750 + }, + { + "epoch": 0.8395221181788828, + "grad_norm": 0.7419958114624023, + "learning_rate": 0.0005502338110117443, + "loss": 3.911, + "step": 7800 + }, + { + "epoch": 0.844903670218491, + "grad_norm": 0.5959650874137878, + "learning_rate": 0.000549910569981683, + "loss": 3.885, + "step": 7850 + }, + { + "epoch": 0.8502852222580992, + "grad_norm": 0.5370450615882874, + "learning_rate": 0.0005495873289516215, + "loss": 3.8883, + "step": 7900 + }, + { + "epoch": 0.8556667742977074, + "grad_norm": 0.6142526865005493, + "learning_rate": 0.0005492640879215602, + "loss": 3.8891, + "step": 7950 + }, + { + "epoch": 0.8610483263373157, + "grad_norm": 0.5250054001808167, + "learning_rate": 0.0005489408468914987, + "loss": 3.8731, + "step": 8000 + }, + { + "epoch": 0.8610483263373157, + "eval_accuracy": 0.3380041398923315, + "eval_loss": 3.814573049545288, + "eval_runtime": 218.7029, + "eval_samples_per_second": 82.354, + "eval_steps_per_second": 5.149, + "step": 8000 + }, + { + "epoch": 0.8664298783769239, + "grad_norm": 0.5592741370201111, + "learning_rate": 0.0005486176058614372, + "loss": 3.8661, + "step": 8050 + }, + { + "epoch": 0.8718114304165321, + "grad_norm": 0.558358371257782, + "learning_rate": 0.0005482943648313759, + "loss": 3.8665, + "step": 8100 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 0.6065065860748291, + "learning_rate": 0.0005479711238013145, + "loss": 3.8815, + "step": 8150 + }, + { + "epoch": 0.8825745344957485, + "grad_norm": 0.566906750202179, + "learning_rate": 0.0005476478827712531, + "loss": 3.8788, + "step": 8200 + }, + { + "epoch": 0.8879560865353568, + "grad_norm": 0.5542231202125549, + "learning_rate": 0.0005473246417411916, + "loss": 3.8757, + "step": 8250 + }, + { + "epoch": 0.8933376385749651, + "grad_norm": 0.5650736093521118, + "learning_rate": 0.0005470014007111302, + "loss": 3.867, + "step": 8300 + }, + { + "epoch": 0.8987191906145733, + "grad_norm": 0.6359168887138367, + "learning_rate": 0.0005466781596810688, + "loss": 3.8704, + "step": 8350 + }, + { + "epoch": 0.9041007426541815, + "grad_norm": 0.5609973669052124, + "learning_rate": 0.0005463549186510073, + "loss": 3.8704, + "step": 8400 + }, + { + "epoch": 0.9094822946937897, + "grad_norm": 0.5966047048568726, + "learning_rate": 0.000546031677620946, + "loss": 3.8649, + "step": 8450 + }, + { + "epoch": 0.9148638467333979, + "grad_norm": 0.555415689945221, + "learning_rate": 0.0005457084365908845, + "loss": 3.8775, + "step": 8500 + }, + { + "epoch": 0.9202453987730062, + "grad_norm": 0.5719990134239197, + "learning_rate": 0.0005453851955608232, + "loss": 3.8601, + "step": 8550 + }, + { + "epoch": 0.9256269508126144, + "grad_norm": 0.5464230179786682, + "learning_rate": 0.0005450619545307617, + "loss": 3.8599, + "step": 8600 + }, + { + "epoch": 0.9310085028522226, + "grad_norm": 0.6069023013114929, + "learning_rate": 0.0005447387135007003, + "loss": 3.8584, + "step": 8650 + }, + { + "epoch": 0.9363900548918308, + "grad_norm": 0.556610643863678, + "learning_rate": 0.0005444154724706389, + "loss": 3.8636, + "step": 8700 + }, + { + "epoch": 0.941771606931439, + "grad_norm": 0.5988168120384216, + "learning_rate": 0.0005440922314405775, + "loss": 3.8366, + "step": 8750 + }, + { + "epoch": 0.9471531589710472, + "grad_norm": 0.5634347200393677, + "learning_rate": 0.0005437689904105161, + "loss": 3.8604, + "step": 8800 + }, + { + "epoch": 0.9525347110106555, + "grad_norm": 0.5349180102348328, + "learning_rate": 0.0005434457493804546, + "loss": 3.8395, + "step": 8850 + }, + { + "epoch": 0.9579162630502637, + "grad_norm": 0.5537461042404175, + "learning_rate": 0.0005431225083503932, + "loss": 3.8644, + "step": 8900 + }, + { + "epoch": 0.9632978150898719, + "grad_norm": 0.5711461901664734, + "learning_rate": 0.0005427992673203318, + "loss": 3.8352, + "step": 8950 + }, + { + "epoch": 0.9686793671294801, + "grad_norm": 0.6128414273262024, + "learning_rate": 0.0005424760262902704, + "loss": 3.8405, + "step": 9000 + }, + { + "epoch": 0.9686793671294801, + "eval_accuracy": 0.3417073562677125, + "eval_loss": 3.779106378555298, + "eval_runtime": 210.7209, + "eval_samples_per_second": 85.473, + "eval_steps_per_second": 5.344, + "step": 9000 + }, + { + "epoch": 0.9740609191690883, + "grad_norm": 0.6156434416770935, + "learning_rate": 0.000542152785260209, + "loss": 3.8333, + "step": 9050 + }, + { + "epoch": 0.9794424712086965, + "grad_norm": 0.5652556419372559, + "learning_rate": 0.0005418295442301476, + "loss": 3.8327, + "step": 9100 + }, + { + "epoch": 0.9848240232483048, + "grad_norm": 0.6443074941635132, + "learning_rate": 0.0005415063032000861, + "loss": 3.842, + "step": 9150 + }, + { + "epoch": 0.9902055752879131, + "grad_norm": 0.632297158241272, + "learning_rate": 0.0005411830621700248, + "loss": 3.8312, + "step": 9200 + }, + { + "epoch": 0.9955871273275213, + "grad_norm": 0.5863285064697266, + "learning_rate": 0.0005408598211399633, + "loss": 3.8298, + "step": 9250 + }, + { + "epoch": 1.0009686793671295, + "grad_norm": 0.5393086671829224, + "learning_rate": 0.0005405365801099019, + "loss": 3.8069, + "step": 9300 + }, + { + "epoch": 1.0063502314067376, + "grad_norm": 0.5285312533378601, + "learning_rate": 0.0005402133390798405, + "loss": 3.7619, + "step": 9350 + }, + { + "epoch": 1.011731783446346, + "grad_norm": 0.5921708941459656, + "learning_rate": 0.000539890098049779, + "loss": 3.7701, + "step": 9400 + }, + { + "epoch": 1.017113335485954, + "grad_norm": 0.5384525060653687, + "learning_rate": 0.0005395668570197177, + "loss": 3.7657, + "step": 9450 + }, + { + "epoch": 1.0224948875255624, + "grad_norm": 0.5624929666519165, + "learning_rate": 0.0005392436159896562, + "loss": 3.7754, + "step": 9500 + }, + { + "epoch": 1.0278764395651705, + "grad_norm": 0.5882208943367004, + "learning_rate": 0.0005389203749595948, + "loss": 3.7848, + "step": 9550 + }, + { + "epoch": 1.0332579916047788, + "grad_norm": 0.5588924884796143, + "learning_rate": 0.0005385971339295334, + "loss": 3.7603, + "step": 9600 + }, + { + "epoch": 1.0386395436443872, + "grad_norm": 0.5837873220443726, + "learning_rate": 0.000538273892899472, + "loss": 3.7785, + "step": 9650 + }, + { + "epoch": 1.0440210956839953, + "grad_norm": 0.5880899429321289, + "learning_rate": 0.0005379506518694106, + "loss": 3.7506, + "step": 9700 + }, + { + "epoch": 1.0494026477236036, + "grad_norm": 0.5579642653465271, + "learning_rate": 0.0005376274108393491, + "loss": 3.7714, + "step": 9750 + }, + { + "epoch": 1.0547841997632117, + "grad_norm": 0.576766312122345, + "learning_rate": 0.0005373041698092877, + "loss": 3.7653, + "step": 9800 + }, + { + "epoch": 1.06016575180282, + "grad_norm": 0.5471925735473633, + "learning_rate": 0.0005369809287792263, + "loss": 3.7686, + "step": 9850 + }, + { + "epoch": 1.0655473038424281, + "grad_norm": 0.59092116355896, + "learning_rate": 0.000536657687749165, + "loss": 3.7667, + "step": 9900 + }, + { + "epoch": 1.0709288558820365, + "grad_norm": 0.638380765914917, + "learning_rate": 0.0005363344467191035, + "loss": 3.7722, + "step": 9950 + }, + { + "epoch": 1.0763104079216446, + "grad_norm": 0.5594536066055298, + "learning_rate": 0.000536011205689042, + "loss": 3.7655, + "step": 10000 + }, + { + "epoch": 1.0763104079216446, + "eval_accuracy": 0.3451049320696713, + "eval_loss": 3.7489354610443115, + "eval_runtime": 202.8767, + "eval_samples_per_second": 88.778, + "eval_steps_per_second": 5.55, + "step": 10000 + }, + { + "epoch": 1.081691959961253, + "grad_norm": 0.640766441822052, + "learning_rate": 0.0005356879646589807, + "loss": 3.7646, + "step": 10050 + }, + { + "epoch": 1.087073512000861, + "grad_norm": 0.6311773657798767, + "learning_rate": 0.0005353647236289192, + "loss": 3.7643, + "step": 10100 + }, + { + "epoch": 1.0924550640404693, + "grad_norm": 0.5654868483543396, + "learning_rate": 0.0005350414825988579, + "loss": 3.7592, + "step": 10150 + }, + { + "epoch": 1.0978366160800774, + "grad_norm": 0.5341681838035583, + "learning_rate": 0.0005347182415687964, + "loss": 3.7609, + "step": 10200 + }, + { + "epoch": 1.1032181681196858, + "grad_norm": 0.8984493613243103, + "learning_rate": 0.000534395000538735, + "loss": 3.7588, + "step": 10250 + }, + { + "epoch": 1.1085997201592939, + "grad_norm": 0.6130673885345459, + "learning_rate": 0.0005340717595086736, + "loss": 3.7725, + "step": 10300 + }, + { + "epoch": 1.1139812721989022, + "grad_norm": 0.5171802043914795, + "learning_rate": 0.0005337485184786122, + "loss": 3.7686, + "step": 10350 + }, + { + "epoch": 1.1193628242385103, + "grad_norm": 0.6430924534797668, + "learning_rate": 0.0005334252774485507, + "loss": 3.7493, + "step": 10400 + }, + { + "epoch": 1.1247443762781186, + "grad_norm": 0.5376786589622498, + "learning_rate": 0.0005331020364184894, + "loss": 3.7686, + "step": 10450 + }, + { + "epoch": 1.1301259283177267, + "grad_norm": 0.5798326730728149, + "learning_rate": 0.0005327787953884279, + "loss": 3.7654, + "step": 10500 + }, + { + "epoch": 1.135507480357335, + "grad_norm": 0.5384038090705872, + "learning_rate": 0.0005324555543583665, + "loss": 3.7485, + "step": 10550 + }, + { + "epoch": 1.1408890323969434, + "grad_norm": 0.5550758838653564, + "learning_rate": 0.0005321323133283051, + "loss": 3.748, + "step": 10600 + }, + { + "epoch": 1.1462705844365515, + "grad_norm": 0.5656107664108276, + "learning_rate": 0.0005318090722982436, + "loss": 3.7485, + "step": 10650 + }, + { + "epoch": 1.1516521364761596, + "grad_norm": 0.5888227224349976, + "learning_rate": 0.0005314858312681823, + "loss": 3.7438, + "step": 10700 + }, + { + "epoch": 1.157033688515768, + "grad_norm": 0.5515899658203125, + "learning_rate": 0.0005311625902381209, + "loss": 3.7721, + "step": 10750 + }, + { + "epoch": 1.1624152405553763, + "grad_norm": 0.6210424900054932, + "learning_rate": 0.0005308393492080595, + "loss": 3.7652, + "step": 10800 + }, + { + "epoch": 1.1677967925949844, + "grad_norm": 0.5752713084220886, + "learning_rate": 0.000530516108177998, + "loss": 3.7486, + "step": 10850 + }, + { + "epoch": 1.1731783446345927, + "grad_norm": 0.6572223901748657, + "learning_rate": 0.0005301928671479365, + "loss": 3.7272, + "step": 10900 + }, + { + "epoch": 1.1785598966742008, + "grad_norm": 0.6310757994651794, + "learning_rate": 0.0005298696261178752, + "loss": 3.7563, + "step": 10950 + }, + { + "epoch": 1.1839414487138091, + "grad_norm": 0.5674658417701721, + "learning_rate": 0.0005295463850878138, + "loss": 3.7441, + "step": 11000 + }, + { + "epoch": 1.1839414487138091, + "eval_accuracy": 0.34708784727228553, + "eval_loss": 3.7200334072113037, + "eval_runtime": 208.4996, + "eval_samples_per_second": 86.384, + "eval_steps_per_second": 5.4, + "step": 11000 + }, + { + "epoch": 1.1893230007534172, + "grad_norm": 0.5541161298751831, + "learning_rate": 0.0005292231440577524, + "loss": 3.7499, + "step": 11050 + }, + { + "epoch": 1.1947045527930256, + "grad_norm": 0.5533831715583801, + "learning_rate": 0.0005288999030276909, + "loss": 3.7418, + "step": 11100 + }, + { + "epoch": 1.2000861048326337, + "grad_norm": 0.5780303478240967, + "learning_rate": 0.0005285766619976295, + "loss": 3.7409, + "step": 11150 + }, + { + "epoch": 1.205467656872242, + "grad_norm": 0.6292614340782166, + "learning_rate": 0.0005282534209675681, + "loss": 3.7435, + "step": 11200 + }, + { + "epoch": 1.21084920891185, + "grad_norm": 0.5940732359886169, + "learning_rate": 0.0005279301799375066, + "loss": 3.7362, + "step": 11250 + }, + { + "epoch": 1.2162307609514584, + "grad_norm": 0.5676620006561279, + "learning_rate": 0.0005276134037280465, + "loss": 3.7241, + "step": 11300 + }, + { + "epoch": 1.2216123129910665, + "grad_norm": 0.5294714570045471, + "learning_rate": 0.0005272901626979851, + "loss": 3.7192, + "step": 11350 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 0.5521119832992554, + "learning_rate": 0.0005269669216679236, + "loss": 3.734, + "step": 11400 + }, + { + "epoch": 1.232375417070283, + "grad_norm": 0.7975606918334961, + "learning_rate": 0.0005266436806378623, + "loss": 3.7188, + "step": 11450 + }, + { + "epoch": 1.2377569691098913, + "grad_norm": 0.5794736742973328, + "learning_rate": 0.0005263204396078008, + "loss": 3.7279, + "step": 11500 + }, + { + "epoch": 1.2431385211494996, + "grad_norm": 0.5361841917037964, + "learning_rate": 0.0005259971985777394, + "loss": 3.7432, + "step": 11550 + }, + { + "epoch": 1.2485200731891077, + "grad_norm": 0.5248964428901672, + "learning_rate": 0.000525673957547678, + "loss": 3.7447, + "step": 11600 + }, + { + "epoch": 1.2539016252287158, + "grad_norm": 0.5553768873214722, + "learning_rate": 0.0005253507165176167, + "loss": 3.7477, + "step": 11650 + }, + { + "epoch": 1.2592831772683242, + "grad_norm": 0.5761224627494812, + "learning_rate": 0.0005250274754875552, + "loss": 3.7211, + "step": 11700 + }, + { + "epoch": 1.2646647293079325, + "grad_norm": 0.607130229473114, + "learning_rate": 0.0005247042344574938, + "loss": 3.7359, + "step": 11750 + }, + { + "epoch": 1.2700462813475406, + "grad_norm": 0.5540531873703003, + "learning_rate": 0.0005243809934274323, + "loss": 3.7217, + "step": 11800 + }, + { + "epoch": 1.275427833387149, + "grad_norm": 0.6098884344100952, + "learning_rate": 0.0005240577523973709, + "loss": 3.7431, + "step": 11850 + }, + { + "epoch": 1.280809385426757, + "grad_norm": 0.5816884636878967, + "learning_rate": 0.0005237345113673095, + "loss": 3.7404, + "step": 11900 + }, + { + "epoch": 1.2861909374663654, + "grad_norm": 0.5769429802894592, + "learning_rate": 0.0005234112703372481, + "loss": 3.7404, + "step": 11950 + }, + { + "epoch": 1.2915724895059735, + "grad_norm": 0.563298761844635, + "learning_rate": 0.0005230880293071867, + "loss": 3.7293, + "step": 12000 + }, + { + "epoch": 1.2915724895059735, + "eval_accuracy": 0.34958697234490643, + "eval_loss": 3.6987245082855225, + "eval_runtime": 217.4267, + "eval_samples_per_second": 82.837, + "eval_steps_per_second": 5.179, + "step": 12000 + }, + { + "epoch": 1.2969540415455818, + "grad_norm": 0.5788484811782837, + "learning_rate": 0.0005227647882771253, + "loss": 3.7336, + "step": 12050 + }, + { + "epoch": 1.30233559358519, + "grad_norm": 0.6204023957252502, + "learning_rate": 0.0005224415472470639, + "loss": 3.7243, + "step": 12100 + }, + { + "epoch": 1.3077171456247982, + "grad_norm": 0.5986481308937073, + "learning_rate": 0.0005221183062170024, + "loss": 3.7316, + "step": 12150 + }, + { + "epoch": 1.3130986976644063, + "grad_norm": 0.6356789469718933, + "learning_rate": 0.0005217950651869409, + "loss": 3.7352, + "step": 12200 + }, + { + "epoch": 1.3184802497040147, + "grad_norm": 0.555164098739624, + "learning_rate": 0.0005214718241568796, + "loss": 3.7149, + "step": 12250 + }, + { + "epoch": 1.3238618017436228, + "grad_norm": 0.6156308054924011, + "learning_rate": 0.0005211485831268182, + "loss": 3.6991, + "step": 12300 + }, + { + "epoch": 1.329243353783231, + "grad_norm": 0.5790920853614807, + "learning_rate": 0.0005208253420967568, + "loss": 3.7206, + "step": 12350 + }, + { + "epoch": 1.3346249058228392, + "grad_norm": 0.5524982810020447, + "learning_rate": 0.0005205021010666953, + "loss": 3.7344, + "step": 12400 + }, + { + "epoch": 1.3400064578624475, + "grad_norm": 0.5789136290550232, + "learning_rate": 0.0005201788600366339, + "loss": 3.714, + "step": 12450 + }, + { + "epoch": 1.3453880099020559, + "grad_norm": 0.5845145583152771, + "learning_rate": 0.0005198556190065725, + "loss": 3.7083, + "step": 12500 + }, + { + "epoch": 1.350769561941664, + "grad_norm": 0.6391981840133667, + "learning_rate": 0.0005195323779765112, + "loss": 3.7103, + "step": 12550 + }, + { + "epoch": 1.356151113981272, + "grad_norm": 0.543088972568512, + "learning_rate": 0.0005192091369464497, + "loss": 3.7256, + "step": 12600 + }, + { + "epoch": 1.3615326660208804, + "grad_norm": 0.5750318169593811, + "learning_rate": 0.0005188858959163882, + "loss": 3.7289, + "step": 12650 + }, + { + "epoch": 1.3669142180604887, + "grad_norm": 0.6135967969894409, + "learning_rate": 0.0005185626548863269, + "loss": 3.7014, + "step": 12700 + }, + { + "epoch": 1.3722957701000968, + "grad_norm": 0.5833747982978821, + "learning_rate": 0.0005182394138562654, + "loss": 3.7292, + "step": 12750 + }, + { + "epoch": 1.3776773221397052, + "grad_norm": 0.571729838848114, + "learning_rate": 0.0005179161728262041, + "loss": 3.7158, + "step": 12800 + }, + { + "epoch": 1.3830588741793133, + "grad_norm": 0.6265289783477783, + "learning_rate": 0.0005175929317961426, + "loss": 3.7261, + "step": 12850 + }, + { + "epoch": 1.3884404262189216, + "grad_norm": 0.6207230091094971, + "learning_rate": 0.0005172696907660812, + "loss": 3.699, + "step": 12900 + }, + { + "epoch": 1.3938219782585297, + "grad_norm": 0.5325709581375122, + "learning_rate": 0.0005169464497360198, + "loss": 3.7059, + "step": 12950 + }, + { + "epoch": 1.399203530298138, + "grad_norm": 0.5781683325767517, + "learning_rate": 0.0005166232087059583, + "loss": 3.7029, + "step": 13000 + }, + { + "epoch": 1.399203530298138, + "eval_accuracy": 0.3522667871699628, + "eval_loss": 3.674454927444458, + "eval_runtime": 217.915, + "eval_samples_per_second": 82.651, + "eval_steps_per_second": 5.167, + "step": 13000 + }, + { + "epoch": 1.4045850823377461, + "grad_norm": 0.565986156463623, + "learning_rate": 0.0005162999676758969, + "loss": 3.7075, + "step": 13050 + }, + { + "epoch": 1.4099666343773545, + "grad_norm": 0.5653162002563477, + "learning_rate": 0.0005159767266458355, + "loss": 3.6983, + "step": 13100 + }, + { + "epoch": 1.4153481864169626, + "grad_norm": 0.5863538384437561, + "learning_rate": 0.0005156534856157741, + "loss": 3.7083, + "step": 13150 + }, + { + "epoch": 1.420729738456571, + "grad_norm": 0.5486993789672852, + "learning_rate": 0.0005153302445857127, + "loss": 3.7251, + "step": 13200 + }, + { + "epoch": 1.426111290496179, + "grad_norm": 0.6212597489356995, + "learning_rate": 0.0005150070035556513, + "loss": 3.7196, + "step": 13250 + }, + { + "epoch": 1.4314928425357873, + "grad_norm": 0.6025354862213135, + "learning_rate": 0.0005146837625255898, + "loss": 3.7195, + "step": 13300 + }, + { + "epoch": 1.4368743945753955, + "grad_norm": 0.5640348792076111, + "learning_rate": 0.0005143669863161297, + "loss": 3.7035, + "step": 13350 + }, + { + "epoch": 1.4422559466150038, + "grad_norm": 0.5984680652618408, + "learning_rate": 0.0005140437452860683, + "loss": 3.7052, + "step": 13400 + }, + { + "epoch": 1.447637498654612, + "grad_norm": 0.5823872685432434, + "learning_rate": 0.0005137205042560069, + "loss": 3.695, + "step": 13450 + }, + { + "epoch": 1.4530190506942202, + "grad_norm": 0.5364094972610474, + "learning_rate": 0.0005133972632259455, + "loss": 3.7199, + "step": 13500 + }, + { + "epoch": 1.4584006027338283, + "grad_norm": 0.5685153603553772, + "learning_rate": 0.000513074022195884, + "loss": 3.6992, + "step": 13550 + }, + { + "epoch": 1.4637821547734367, + "grad_norm": 0.5787658095359802, + "learning_rate": 0.0005127507811658226, + "loss": 3.6923, + "step": 13600 + }, + { + "epoch": 1.469163706813045, + "grad_norm": 0.5486370325088501, + "learning_rate": 0.0005124275401357612, + "loss": 3.7024, + "step": 13650 + }, + { + "epoch": 1.474545258852653, + "grad_norm": 0.5473746061325073, + "learning_rate": 0.0005121042991056997, + "loss": 3.6833, + "step": 13700 + }, + { + "epoch": 1.4799268108922612, + "grad_norm": 0.5517615675926208, + "learning_rate": 0.0005117810580756384, + "loss": 3.6902, + "step": 13750 + }, + { + "epoch": 1.4853083629318695, + "grad_norm": 0.5971811413764954, + "learning_rate": 0.0005114578170455769, + "loss": 3.6845, + "step": 13800 + }, + { + "epoch": 1.4906899149714778, + "grad_norm": 0.5672309398651123, + "learning_rate": 0.0005111345760155156, + "loss": 3.6854, + "step": 13850 + }, + { + "epoch": 1.496071467011086, + "grad_norm": 0.5523454546928406, + "learning_rate": 0.0005108113349854541, + "loss": 3.6961, + "step": 13900 + }, + { + "epoch": 1.501453019050694, + "grad_norm": 0.5691514611244202, + "learning_rate": 0.0005104880939553926, + "loss": 3.6803, + "step": 13950 + }, + { + "epoch": 1.5068345710903024, + "grad_norm": 0.5720673203468323, + "learning_rate": 0.0005101648529253313, + "loss": 3.6962, + "step": 14000 + }, + { + "epoch": 1.5068345710903024, + "eval_accuracy": 0.35364048563060124, + "eval_loss": 3.655740261077881, + "eval_runtime": 204.9718, + "eval_samples_per_second": 87.871, + "eval_steps_per_second": 5.493, + "step": 14000 + }, + { + "epoch": 1.5122161231299107, + "grad_norm": 0.5242102742195129, + "learning_rate": 0.0005098480767158711, + "loss": 3.697, + "step": 14050 + }, + { + "epoch": 1.5175976751695188, + "grad_norm": 0.5905733108520508, + "learning_rate": 0.0005095248356858097, + "loss": 3.6869, + "step": 14100 + }, + { + "epoch": 1.5229792272091272, + "grad_norm": 0.6295709609985352, + "learning_rate": 0.0005092015946557483, + "loss": 3.6839, + "step": 14150 + }, + { + "epoch": 1.5283607792487355, + "grad_norm": 0.6163989901542664, + "learning_rate": 0.0005088783536256868, + "loss": 3.6979, + "step": 14200 + }, + { + "epoch": 1.5337423312883436, + "grad_norm": 0.5677395462989807, + "learning_rate": 0.0005085551125956255, + "loss": 3.698, + "step": 14250 + }, + { + "epoch": 1.5391238833279517, + "grad_norm": 0.5294774174690247, + "learning_rate": 0.000508231871565564, + "loss": 3.6858, + "step": 14300 + }, + { + "epoch": 1.54450543536756, + "grad_norm": 0.585757315158844, + "learning_rate": 0.0005079086305355026, + "loss": 3.7028, + "step": 14350 + }, + { + "epoch": 1.5498869874071683, + "grad_norm": 0.5384769439697266, + "learning_rate": 0.0005075853895054412, + "loss": 3.686, + "step": 14400 + }, + { + "epoch": 1.5552685394467765, + "grad_norm": 0.5806359052658081, + "learning_rate": 0.0005072621484753797, + "loss": 3.689, + "step": 14450 + }, + { + "epoch": 1.5606500914863846, + "grad_norm": 0.5702711939811707, + "learning_rate": 0.0005069389074453184, + "loss": 3.675, + "step": 14500 + }, + { + "epoch": 1.566031643525993, + "grad_norm": 0.5366608500480652, + "learning_rate": 0.0005066156664152569, + "loss": 3.6914, + "step": 14550 + }, + { + "epoch": 1.5714131955656012, + "grad_norm": 0.60133296251297, + "learning_rate": 0.0005062924253851955, + "loss": 3.693, + "step": 14600 + }, + { + "epoch": 1.5767947476052093, + "grad_norm": 0.5777215361595154, + "learning_rate": 0.0005059691843551341, + "loss": 3.6786, + "step": 14650 + }, + { + "epoch": 1.5821762996448174, + "grad_norm": 0.5946151614189148, + "learning_rate": 0.0005056459433250727, + "loss": 3.6849, + "step": 14700 + }, + { + "epoch": 1.5875578516844258, + "grad_norm": 0.601273775100708, + "learning_rate": 0.0005053227022950113, + "loss": 3.6833, + "step": 14750 + }, + { + "epoch": 1.592939403724034, + "grad_norm": 0.5582096576690674, + "learning_rate": 0.0005049994612649499, + "loss": 3.681, + "step": 14800 + }, + { + "epoch": 1.5983209557636422, + "grad_norm": 0.5676015615463257, + "learning_rate": 0.0005046762202348884, + "loss": 3.6892, + "step": 14850 + }, + { + "epoch": 1.6037025078032503, + "grad_norm": 0.5280758738517761, + "learning_rate": 0.000504352979204827, + "loss": 3.6736, + "step": 14900 + }, + { + "epoch": 1.6090840598428586, + "grad_norm": 0.6497607827186584, + "learning_rate": 0.0005040297381747656, + "loss": 3.6761, + "step": 14950 + }, + { + "epoch": 1.614465611882467, + "grad_norm": 0.6543067693710327, + "learning_rate": 0.0005037064971447042, + "loss": 3.6601, + "step": 15000 + }, + { + "epoch": 1.614465611882467, + "eval_accuracy": 0.3556327449815676, + "eval_loss": 3.6367475986480713, + "eval_runtime": 205.3772, + "eval_samples_per_second": 87.697, + "eval_steps_per_second": 5.483, + "step": 15000 + }, + { + "epoch": 1.619847163922075, + "grad_norm": 0.5906841158866882, + "learning_rate": 0.0005033832561146428, + "loss": 3.6741, + "step": 15050 + }, + { + "epoch": 1.6252287159616834, + "grad_norm": 0.5810515880584717, + "learning_rate": 0.0005030600150845813, + "loss": 3.6827, + "step": 15100 + }, + { + "epoch": 1.6306102680012917, + "grad_norm": 0.6275368928909302, + "learning_rate": 0.00050273677405452, + "loss": 3.673, + "step": 15150 + }, + { + "epoch": 1.6359918200408998, + "grad_norm": 0.5321255922317505, + "learning_rate": 0.0005024135330244585, + "loss": 3.6789, + "step": 15200 + }, + { + "epoch": 1.641373372080508, + "grad_norm": 0.5455909371376038, + "learning_rate": 0.0005020902919943972, + "loss": 3.6691, + "step": 15250 + }, + { + "epoch": 1.6467549241201163, + "grad_norm": 0.5684463977813721, + "learning_rate": 0.0005017670509643357, + "loss": 3.656, + "step": 15300 + }, + { + "epoch": 1.6521364761597246, + "grad_norm": 0.5891856551170349, + "learning_rate": 0.0005014438099342743, + "loss": 3.666, + "step": 15350 + }, + { + "epoch": 1.6575180281993327, + "grad_norm": 0.5768228769302368, + "learning_rate": 0.0005011205689042129, + "loss": 3.6802, + "step": 15400 + }, + { + "epoch": 1.6628995802389408, + "grad_norm": 0.5384355187416077, + "learning_rate": 0.0005007973278741514, + "loss": 3.6677, + "step": 15450 + }, + { + "epoch": 1.6682811322785491, + "grad_norm": 0.5776270031929016, + "learning_rate": 0.00050047408684409, + "loss": 3.6913, + "step": 15500 + }, + { + "epoch": 1.6736626843181575, + "grad_norm": 0.5457106232643127, + "learning_rate": 0.0005001508458140286, + "loss": 3.6747, + "step": 15550 + }, + { + "epoch": 1.6790442363577656, + "grad_norm": 0.5584999322891235, + "learning_rate": 0.0004998276047839673, + "loss": 3.6656, + "step": 15600 + }, + { + "epoch": 1.6844257883973737, + "grad_norm": 0.5563948154449463, + "learning_rate": 0.0004995043637539058, + "loss": 3.6699, + "step": 15650 + }, + { + "epoch": 1.689807340436982, + "grad_norm": 0.5881028175354004, + "learning_rate": 0.0004991811227238443, + "loss": 3.6704, + "step": 15700 + }, + { + "epoch": 1.6951888924765903, + "grad_norm": 0.5460503101348877, + "learning_rate": 0.0004988578816937829, + "loss": 3.6895, + "step": 15750 + }, + { + "epoch": 1.7005704445161984, + "grad_norm": 0.6377148032188416, + "learning_rate": 0.0004985346406637215, + "loss": 3.6985, + "step": 15800 + }, + { + "epoch": 1.7059519965558065, + "grad_norm": 0.5566238760948181, + "learning_rate": 0.0004982113996336602, + "loss": 3.6537, + "step": 15850 + }, + { + "epoch": 1.7113335485954149, + "grad_norm": 0.5709816813468933, + "learning_rate": 0.0004978881586035987, + "loss": 3.6434, + "step": 15900 + }, + { + "epoch": 1.7167151006350232, + "grad_norm": 0.5734738111495972, + "learning_rate": 0.0004975649175735373, + "loss": 3.6449, + "step": 15950 + }, + { + "epoch": 1.7220966526746313, + "grad_norm": 0.5572895407676697, + "learning_rate": 0.0004972416765434759, + "loss": 3.6579, + "step": 16000 + }, + { + "epoch": 1.7220966526746313, + "eval_accuracy": 0.35718387360800713, + "eval_loss": 3.6182830333709717, + "eval_runtime": 214.6803, + "eval_samples_per_second": 83.897, + "eval_steps_per_second": 5.245, + "step": 16000 + }, + { + "epoch": 1.7274782047142396, + "grad_norm": 0.5495673418045044, + "learning_rate": 0.0004969184355134145, + "loss": 3.6629, + "step": 16050 + }, + { + "epoch": 1.732859756753848, + "grad_norm": 0.6197149753570557, + "learning_rate": 0.0004965951944833531, + "loss": 3.6439, + "step": 16100 + }, + { + "epoch": 1.738241308793456, + "grad_norm": 0.5787531137466431, + "learning_rate": 0.0004962719534532916, + "loss": 3.6703, + "step": 16150 + }, + { + "epoch": 1.7436228608330642, + "grad_norm": 0.5500142574310303, + "learning_rate": 0.0004959487124232302, + "loss": 3.656, + "step": 16200 + }, + { + "epoch": 1.7490044128726725, + "grad_norm": 0.5271518230438232, + "learning_rate": 0.0004956254713931688, + "loss": 3.6594, + "step": 16250 + }, + { + "epoch": 1.7543859649122808, + "grad_norm": 0.5560300350189209, + "learning_rate": 0.0004953022303631074, + "loss": 3.6622, + "step": 16300 + }, + { + "epoch": 1.759767516951889, + "grad_norm": 0.5545780062675476, + "learning_rate": 0.0004949789893330459, + "loss": 3.6657, + "step": 16350 + }, + { + "epoch": 1.765149068991497, + "grad_norm": 0.5637168884277344, + "learning_rate": 0.0004946557483029846, + "loss": 3.6701, + "step": 16400 + }, + { + "epoch": 1.7705306210311054, + "grad_norm": 0.6499923467636108, + "learning_rate": 0.0004943325072729231, + "loss": 3.6591, + "step": 16450 + }, + { + "epoch": 1.7759121730707137, + "grad_norm": 0.5951055884361267, + "learning_rate": 0.0004940092662428617, + "loss": 3.6589, + "step": 16500 + }, + { + "epoch": 1.7812937251103218, + "grad_norm": 0.5989054441452026, + "learning_rate": 0.0004936860252128003, + "loss": 3.6615, + "step": 16550 + }, + { + "epoch": 1.78667527714993, + "grad_norm": 0.6071681976318359, + "learning_rate": 0.0004933627841827388, + "loss": 3.6413, + "step": 16600 + }, + { + "epoch": 1.7920568291895382, + "grad_norm": 0.5401762127876282, + "learning_rate": 0.0004930395431526775, + "loss": 3.6646, + "step": 16650 + }, + { + "epoch": 1.7974383812291466, + "grad_norm": 0.6720755100250244, + "learning_rate": 0.0004927163021226161, + "loss": 3.6535, + "step": 16700 + }, + { + "epoch": 1.8028199332687547, + "grad_norm": 0.5372287034988403, + "learning_rate": 0.0004923930610925547, + "loss": 3.6387, + "step": 16750 + }, + { + "epoch": 1.8082014853083628, + "grad_norm": 0.5767584443092346, + "learning_rate": 0.0004920698200624932, + "loss": 3.6509, + "step": 16800 + }, + { + "epoch": 1.813583037347971, + "grad_norm": 0.619735598564148, + "learning_rate": 0.0004917465790324317, + "loss": 3.6593, + "step": 16850 + }, + { + "epoch": 1.8189645893875794, + "grad_norm": 0.612182080745697, + "learning_rate": 0.0004914233380023704, + "loss": 3.669, + "step": 16900 + }, + { + "epoch": 1.8243461414271875, + "grad_norm": 0.5973613262176514, + "learning_rate": 0.0004911000969723089, + "loss": 3.6581, + "step": 16950 + }, + { + "epoch": 1.8297276934667959, + "grad_norm": 0.5377869606018066, + "learning_rate": 0.0004907768559422476, + "loss": 3.6582, + "step": 17000 + }, + { + "epoch": 1.8297276934667959, + "eval_accuracy": 0.3589950085949867, + "eval_loss": 3.60500168800354, + "eval_runtime": 203.9944, + "eval_samples_per_second": 88.292, + "eval_steps_per_second": 5.52, + "step": 17000 + }, + { + "epoch": 1.8351092455064042, + "grad_norm": 0.5382079482078552, + "learning_rate": 0.0004904536149121861, + "loss": 3.6536, + "step": 17050 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 0.5734342336654663, + "learning_rate": 0.0004901303738821248, + "loss": 3.6325, + "step": 17100 + }, + { + "epoch": 1.8458723495856204, + "grad_norm": 0.6593245267868042, + "learning_rate": 0.0004898071328520633, + "loss": 3.6733, + "step": 17150 + }, + { + "epoch": 1.8512539016252287, + "grad_norm": 0.5794579982757568, + "learning_rate": 0.0004894838918220019, + "loss": 3.6634, + "step": 17200 + }, + { + "epoch": 1.856635453664837, + "grad_norm": 0.5953862071037292, + "learning_rate": 0.0004891606507919405, + "loss": 3.6544, + "step": 17250 + }, + { + "epoch": 1.8620170057044452, + "grad_norm": 0.6200346946716309, + "learning_rate": 0.000488837409761879, + "loss": 3.6443, + "step": 17300 + }, + { + "epoch": 1.8673985577440533, + "grad_norm": 0.585228681564331, + "learning_rate": 0.0004885141687318177, + "loss": 3.6478, + "step": 17350 + }, + { + "epoch": 1.8727801097836616, + "grad_norm": 0.6394591927528381, + "learning_rate": 0.00048819092770175623, + "loss": 3.6416, + "step": 17400 + }, + { + "epoch": 1.87816166182327, + "grad_norm": 0.5823236107826233, + "learning_rate": 0.0004878676866716948, + "loss": 3.6572, + "step": 17450 + }, + { + "epoch": 1.883543213862878, + "grad_norm": 0.5914448499679565, + "learning_rate": 0.00048754444564163337, + "loss": 3.6479, + "step": 17500 + }, + { + "epoch": 1.8889247659024861, + "grad_norm": 0.6287000775337219, + "learning_rate": 0.000487221204611572, + "loss": 3.6431, + "step": 17550 + }, + { + "epoch": 1.8943063179420945, + "grad_norm": 0.5507499575614929, + "learning_rate": 0.00048689796358151056, + "loss": 3.6444, + "step": 17600 + }, + { + "epoch": 1.8996878699817028, + "grad_norm": 0.5838987827301025, + "learning_rate": 0.00048657472255144915, + "loss": 3.649, + "step": 17650 + }, + { + "epoch": 1.905069422021311, + "grad_norm": 0.6239995360374451, + "learning_rate": 0.00048625148152138775, + "loss": 3.6385, + "step": 17700 + }, + { + "epoch": 1.910450974060919, + "grad_norm": 0.5718323588371277, + "learning_rate": 0.0004859282404913263, + "loss": 3.6248, + "step": 17750 + }, + { + "epoch": 1.9158325261005273, + "grad_norm": 0.5826128125190735, + "learning_rate": 0.0004856049994612649, + "loss": 3.6441, + "step": 17800 + }, + { + "epoch": 1.9212140781401357, + "grad_norm": 0.6261082291603088, + "learning_rate": 0.00048528175843120353, + "loss": 3.6379, + "step": 17850 + }, + { + "epoch": 1.9265956301797438, + "grad_norm": 0.5729628205299377, + "learning_rate": 0.0004849585174011421, + "loss": 3.6403, + "step": 17900 + }, + { + "epoch": 1.931977182219352, + "grad_norm": 0.6479345560073853, + "learning_rate": 0.00048464174119168193, + "loss": 3.6513, + "step": 17950 + }, + { + "epoch": 1.9373587342589604, + "grad_norm": 0.6019191145896912, + "learning_rate": 0.0004843185001616205, + "loss": 3.6427, + "step": 18000 + }, + { + "epoch": 1.9373587342589604, + "eval_accuracy": 0.3606186086975985, + "eval_loss": 3.589313268661499, + "eval_runtime": 204.8701, + "eval_samples_per_second": 87.914, + "eval_steps_per_second": 5.496, + "step": 18000 + }, + { + "epoch": 1.9427402862985685, + "grad_norm": 0.5419387817382812, + "learning_rate": 0.00048399525913155907, + "loss": 3.6361, + "step": 18050 + }, + { + "epoch": 1.9481218383381766, + "grad_norm": 0.6220253109931946, + "learning_rate": 0.0004836720181014976, + "loss": 3.6207, + "step": 18100 + }, + { + "epoch": 1.953503390377785, + "grad_norm": 0.5842125415802002, + "learning_rate": 0.0004833487770714362, + "loss": 3.6405, + "step": 18150 + }, + { + "epoch": 1.9588849424173933, + "grad_norm": 0.5908029675483704, + "learning_rate": 0.00048302553604137485, + "loss": 3.6445, + "step": 18200 + }, + { + "epoch": 1.9642664944570014, + "grad_norm": 0.5424453020095825, + "learning_rate": 0.0004827022950113134, + "loss": 3.6419, + "step": 18250 + }, + { + "epoch": 1.9696480464966095, + "grad_norm": 0.6032727360725403, + "learning_rate": 0.000482379053981252, + "loss": 3.6507, + "step": 18300 + }, + { + "epoch": 1.9750295985362178, + "grad_norm": 0.6096407771110535, + "learning_rate": 0.0004820558129511906, + "loss": 3.6364, + "step": 18350 + }, + { + "epoch": 1.9804111505758262, + "grad_norm": 0.6347204446792603, + "learning_rate": 0.0004817325719211291, + "loss": 3.6451, + "step": 18400 + }, + { + "epoch": 1.9857927026154343, + "grad_norm": 0.5634959936141968, + "learning_rate": 0.0004814093308910677, + "loss": 3.641, + "step": 18450 + }, + { + "epoch": 1.9911742546550424, + "grad_norm": 0.5860807299613953, + "learning_rate": 0.00048108608986100637, + "loss": 3.6336, + "step": 18500 + }, + { + "epoch": 1.9965558066946507, + "grad_norm": 0.5338708162307739, + "learning_rate": 0.0004807628488309449, + "loss": 3.6211, + "step": 18550 + }, + { + "epoch": 2.001937358734259, + "grad_norm": 0.5974145531654358, + "learning_rate": 0.0004804396078008835, + "loss": 3.5958, + "step": 18600 + }, + { + "epoch": 2.007318910773867, + "grad_norm": 0.5562605261802673, + "learning_rate": 0.00048011636677082204, + "loss": 3.5582, + "step": 18650 + }, + { + "epoch": 2.0127004628134753, + "grad_norm": 0.6376757025718689, + "learning_rate": 0.00047979312574076064, + "loss": 3.5377, + "step": 18700 + }, + { + "epoch": 2.018082014853084, + "grad_norm": 0.5702477097511292, + "learning_rate": 0.0004794698847106992, + "loss": 3.5431, + "step": 18750 + }, + { + "epoch": 2.023463566892692, + "grad_norm": 0.5655612945556641, + "learning_rate": 0.0004791466436806378, + "loss": 3.5661, + "step": 18800 + }, + { + "epoch": 2.0288451189323, + "grad_norm": 0.5684214234352112, + "learning_rate": 0.0004788234026505764, + "loss": 3.5378, + "step": 18850 + }, + { + "epoch": 2.034226670971908, + "grad_norm": 0.595730185508728, + "learning_rate": 0.00047850016162051496, + "loss": 3.5512, + "step": 18900 + }, + { + "epoch": 2.0396082230115167, + "grad_norm": 0.602304220199585, + "learning_rate": 0.00047817692059045356, + "loss": 3.5471, + "step": 18950 + }, + { + "epoch": 2.044989775051125, + "grad_norm": 0.6218283176422119, + "learning_rate": 0.00047785367956039215, + "loss": 3.56, + "step": 19000 + }, + { + "epoch": 2.044989775051125, + "eval_accuracy": 0.3616626542966078, + "eval_loss": 3.5785436630249023, + "eval_runtime": 202.2767, + "eval_samples_per_second": 89.041, + "eval_steps_per_second": 5.567, + "step": 19000 + }, + { + "epoch": 2.050371327090733, + "grad_norm": 0.5897380113601685, + "learning_rate": 0.00047753043853033075, + "loss": 3.5425, + "step": 19050 + }, + { + "epoch": 2.055752879130341, + "grad_norm": 0.5932830572128296, + "learning_rate": 0.00047720719750026934, + "loss": 3.5699, + "step": 19100 + }, + { + "epoch": 2.0611344311699495, + "grad_norm": 0.5921191573143005, + "learning_rate": 0.00047688395647020793, + "loss": 3.5431, + "step": 19150 + }, + { + "epoch": 2.0665159832095576, + "grad_norm": 0.6052051186561584, + "learning_rate": 0.0004765607154401465, + "loss": 3.5734, + "step": 19200 + }, + { + "epoch": 2.0718975352491658, + "grad_norm": 0.562523603439331, + "learning_rate": 0.00047623747441008507, + "loss": 3.543, + "step": 19250 + }, + { + "epoch": 2.0772790872887743, + "grad_norm": 0.546420156955719, + "learning_rate": 0.0004759142333800236, + "loss": 3.5441, + "step": 19300 + }, + { + "epoch": 2.0826606393283824, + "grad_norm": 0.590813159942627, + "learning_rate": 0.00047559099234996226, + "loss": 3.5624, + "step": 19350 + }, + { + "epoch": 2.0880421913679905, + "grad_norm": 0.6105584502220154, + "learning_rate": 0.00047526775131990085, + "loss": 3.557, + "step": 19400 + }, + { + "epoch": 2.0934237434075986, + "grad_norm": 0.6170618534088135, + "learning_rate": 0.0004749445102898394, + "loss": 3.5373, + "step": 19450 + }, + { + "epoch": 2.098805295447207, + "grad_norm": 0.5814986228942871, + "learning_rate": 0.000474621269259778, + "loss": 3.5693, + "step": 19500 + }, + { + "epoch": 2.1041868474868153, + "grad_norm": 0.6113174557685852, + "learning_rate": 0.0004742980282297166, + "loss": 3.5589, + "step": 19550 + }, + { + "epoch": 2.1095683995264234, + "grad_norm": 0.6312413811683655, + "learning_rate": 0.0004739747871996551, + "loss": 3.5398, + "step": 19600 + }, + { + "epoch": 2.1149499515660315, + "grad_norm": 0.6352087259292603, + "learning_rate": 0.00047365154616959377, + "loss": 3.5541, + "step": 19650 + }, + { + "epoch": 2.12033150360564, + "grad_norm": 0.6081045269966125, + "learning_rate": 0.00047332830513953237, + "loss": 3.576, + "step": 19700 + }, + { + "epoch": 2.125713055645248, + "grad_norm": 0.5704367160797119, + "learning_rate": 0.0004730050641094709, + "loss": 3.5563, + "step": 19750 + }, + { + "epoch": 2.1310946076848563, + "grad_norm": 0.6367883682250977, + "learning_rate": 0.0004726818230794095, + "loss": 3.5358, + "step": 19800 + }, + { + "epoch": 2.1364761597244644, + "grad_norm": 0.5543844699859619, + "learning_rate": 0.00047235858204934804, + "loss": 3.5561, + "step": 19850 + }, + { + "epoch": 2.141857711764073, + "grad_norm": 0.6325891017913818, + "learning_rate": 0.0004720353410192867, + "loss": 3.5459, + "step": 19900 + }, + { + "epoch": 2.147239263803681, + "grad_norm": 0.5876049995422363, + "learning_rate": 0.0004717120999892253, + "loss": 3.5568, + "step": 19950 + }, + { + "epoch": 2.152620815843289, + "grad_norm": 0.5623601675033569, + "learning_rate": 0.0004713888589591638, + "loss": 3.5527, + "step": 20000 + }, + { + "epoch": 2.152620815843289, + "eval_accuracy": 0.3627479879929867, + "eval_loss": 3.5705673694610596, + "eval_runtime": 206.2591, + "eval_samples_per_second": 87.322, + "eval_steps_per_second": 5.459, + "step": 20000 + }, + { + "epoch": 2.1580023678828972, + "grad_norm": 0.5725038051605225, + "learning_rate": 0.0004710656179291024, + "loss": 3.5565, + "step": 20050 + }, + { + "epoch": 2.163383919922506, + "grad_norm": 0.6287389993667603, + "learning_rate": 0.000470742376899041, + "loss": 3.5528, + "step": 20100 + }, + { + "epoch": 2.168765471962114, + "grad_norm": 0.5884525179862976, + "learning_rate": 0.00047041913586897956, + "loss": 3.5597, + "step": 20150 + }, + { + "epoch": 2.174147024001722, + "grad_norm": 0.5764286518096924, + "learning_rate": 0.0004700958948389182, + "loss": 3.5426, + "step": 20200 + }, + { + "epoch": 2.1795285760413305, + "grad_norm": 0.6041747331619263, + "learning_rate": 0.0004697726538088568, + "loss": 3.5471, + "step": 20250 + }, + { + "epoch": 2.1849101280809387, + "grad_norm": 0.5907628536224365, + "learning_rate": 0.00046944941277879534, + "loss": 3.5453, + "step": 20300 + }, + { + "epoch": 2.1902916801205468, + "grad_norm": 0.5632272958755493, + "learning_rate": 0.00046912617174873394, + "loss": 3.5558, + "step": 20350 + }, + { + "epoch": 2.195673232160155, + "grad_norm": 0.6226711869239807, + "learning_rate": 0.0004688029307186725, + "loss": 3.5865, + "step": 20400 + }, + { + "epoch": 2.2010547841997634, + "grad_norm": 0.5935404896736145, + "learning_rate": 0.00046847968968861107, + "loss": 3.5594, + "step": 20450 + }, + { + "epoch": 2.2064363362393715, + "grad_norm": 0.5879222750663757, + "learning_rate": 0.0004681564486585497, + "loss": 3.5637, + "step": 20500 + }, + { + "epoch": 2.2118178882789796, + "grad_norm": 0.5888842940330505, + "learning_rate": 0.00046783320762848826, + "loss": 3.5438, + "step": 20550 + }, + { + "epoch": 2.2171994403185877, + "grad_norm": 0.5803065299987793, + "learning_rate": 0.00046750996659842685, + "loss": 3.5429, + "step": 20600 + }, + { + "epoch": 2.2225809923581963, + "grad_norm": 0.5627387166023254, + "learning_rate": 0.00046718672556836545, + "loss": 3.5601, + "step": 20650 + }, + { + "epoch": 2.2279625443978044, + "grad_norm": 0.5622759461402893, + "learning_rate": 0.000466863484538304, + "loss": 3.5676, + "step": 20700 + }, + { + "epoch": 2.2333440964374125, + "grad_norm": 0.5685352087020874, + "learning_rate": 0.0004665402435082426, + "loss": 3.5572, + "step": 20750 + }, + { + "epoch": 2.2387256484770206, + "grad_norm": 0.6102375388145447, + "learning_rate": 0.00046621700247818123, + "loss": 3.5546, + "step": 20800 + }, + { + "epoch": 2.244107200516629, + "grad_norm": 0.5915598273277283, + "learning_rate": 0.0004658937614481198, + "loss": 3.5519, + "step": 20850 + }, + { + "epoch": 2.2494887525562373, + "grad_norm": 0.5531293153762817, + "learning_rate": 0.00046557052041805837, + "loss": 3.5491, + "step": 20900 + }, + { + "epoch": 2.2548703045958454, + "grad_norm": 0.5677109956741333, + "learning_rate": 0.0004652472793879969, + "loss": 3.5562, + "step": 20950 + }, + { + "epoch": 2.2602518566354535, + "grad_norm": 0.5943840742111206, + "learning_rate": 0.0004649240383579355, + "loss": 3.5487, + "step": 21000 + }, + { + "epoch": 2.2602518566354535, + "eval_accuracy": 0.36394262649450687, + "eval_loss": 3.560176372528076, + "eval_runtime": 204.7535, + "eval_samples_per_second": 87.964, + "eval_steps_per_second": 5.499, + "step": 21000 + }, + { + "epoch": 2.265633408675062, + "grad_norm": 0.5741783380508423, + "learning_rate": 0.00046460079732787415, + "loss": 3.5606, + "step": 21050 + }, + { + "epoch": 2.27101496071467, + "grad_norm": 0.6168876886367798, + "learning_rate": 0.0004642775562978127, + "loss": 3.5662, + "step": 21100 + }, + { + "epoch": 2.2763965127542782, + "grad_norm": 0.6492966413497925, + "learning_rate": 0.0004639543152677513, + "loss": 3.5546, + "step": 21150 + }, + { + "epoch": 2.281778064793887, + "grad_norm": 0.634781539440155, + "learning_rate": 0.0004636310742376899, + "loss": 3.5674, + "step": 21200 + }, + { + "epoch": 2.287159616833495, + "grad_norm": 0.5752384066581726, + "learning_rate": 0.0004633078332076284, + "loss": 3.5561, + "step": 21250 + }, + { + "epoch": 2.292541168873103, + "grad_norm": 0.5751746296882629, + "learning_rate": 0.000462984592177567, + "loss": 3.5505, + "step": 21300 + }, + { + "epoch": 2.297922720912711, + "grad_norm": 0.5733442902565002, + "learning_rate": 0.00046266135114750567, + "loss": 3.5489, + "step": 21350 + }, + { + "epoch": 2.303304272952319, + "grad_norm": 0.6262395977973938, + "learning_rate": 0.0004623381101174442, + "loss": 3.552, + "step": 21400 + }, + { + "epoch": 2.3086858249919278, + "grad_norm": 0.6084133982658386, + "learning_rate": 0.0004620148690873828, + "loss": 3.5515, + "step": 21450 + }, + { + "epoch": 2.314067377031536, + "grad_norm": 0.558005690574646, + "learning_rate": 0.00046169162805732134, + "loss": 3.5501, + "step": 21500 + }, + { + "epoch": 2.319448929071144, + "grad_norm": 0.656355619430542, + "learning_rate": 0.00046136838702725994, + "loss": 3.5677, + "step": 21550 + }, + { + "epoch": 2.3248304811107525, + "grad_norm": 0.6125053763389587, + "learning_rate": 0.00046104514599719853, + "loss": 3.5496, + "step": 21600 + }, + { + "epoch": 2.3302120331503606, + "grad_norm": 0.708257257938385, + "learning_rate": 0.0004607219049671371, + "loss": 3.594, + "step": 21650 + }, + { + "epoch": 2.3355935851899687, + "grad_norm": 0.6216398477554321, + "learning_rate": 0.0004603986639370757, + "loss": 3.5749, + "step": 21700 + }, + { + "epoch": 2.340975137229577, + "grad_norm": 0.5760979056358337, + "learning_rate": 0.0004600754229070143, + "loss": 3.5546, + "step": 21750 + }, + { + "epoch": 2.3463566892691854, + "grad_norm": 0.5885084867477417, + "learning_rate": 0.00045975218187695286, + "loss": 3.5654, + "step": 21800 + }, + { + "epoch": 2.3517382413087935, + "grad_norm": 0.6174579858779907, + "learning_rate": 0.00045942894084689145, + "loss": 3.5543, + "step": 21850 + }, + { + "epoch": 2.3571197933484016, + "grad_norm": 0.6659450531005859, + "learning_rate": 0.0004591056998168301, + "loss": 3.553, + "step": 21900 + }, + { + "epoch": 2.3625013453880097, + "grad_norm": 0.5789748430252075, + "learning_rate": 0.00045878245878676864, + "loss": 3.5611, + "step": 21950 + }, + { + "epoch": 2.3678828974276183, + "grad_norm": 0.5881466865539551, + "learning_rate": 0.00045846568257730845, + "loss": 3.5664, + "step": 22000 + }, + { + "epoch": 2.3678828974276183, + "eval_accuracy": 0.36495016472321057, + "eval_loss": 3.5474905967712402, + "eval_runtime": 194.8012, + "eval_samples_per_second": 92.458, + "eval_steps_per_second": 5.78, + "step": 22000 + }, + { + "epoch": 2.3732644494672264, + "grad_norm": 0.6417264342308044, + "learning_rate": 0.00045814244154724704, + "loss": 3.5509, + "step": 22050 + }, + { + "epoch": 2.3786460015068345, + "grad_norm": 0.6134101748466492, + "learning_rate": 0.00045781920051718563, + "loss": 3.5482, + "step": 22100 + }, + { + "epoch": 2.384027553546443, + "grad_norm": 0.8174729943275452, + "learning_rate": 0.0004574959594871242, + "loss": 3.5538, + "step": 22150 + }, + { + "epoch": 2.389409105586051, + "grad_norm": 0.6091681122779846, + "learning_rate": 0.00045717271845706277, + "loss": 3.5516, + "step": 22200 + }, + { + "epoch": 2.3947906576256592, + "grad_norm": 0.5839729905128479, + "learning_rate": 0.0004568494774270013, + "loss": 3.5545, + "step": 22250 + }, + { + "epoch": 2.4001722096652673, + "grad_norm": 0.620452344417572, + "learning_rate": 0.00045652623639693996, + "loss": 3.5427, + "step": 22300 + }, + { + "epoch": 2.4055537617048754, + "grad_norm": 0.6439001560211182, + "learning_rate": 0.00045620299536687855, + "loss": 3.554, + "step": 22350 + }, + { + "epoch": 2.410935313744484, + "grad_norm": 0.6319786906242371, + "learning_rate": 0.0004558797543368171, + "loss": 3.5723, + "step": 22400 + }, + { + "epoch": 2.416316865784092, + "grad_norm": 0.6055615544319153, + "learning_rate": 0.0004555565133067557, + "loss": 3.5568, + "step": 22450 + }, + { + "epoch": 2.4216984178237, + "grad_norm": 0.6199345588684082, + "learning_rate": 0.0004552332722766943, + "loss": 3.549, + "step": 22500 + }, + { + "epoch": 2.4270799698633088, + "grad_norm": 0.587843656539917, + "learning_rate": 0.0004549100312466328, + "loss": 3.5311, + "step": 22550 + }, + { + "epoch": 2.432461521902917, + "grad_norm": 0.5868039727210999, + "learning_rate": 0.0004545867902165715, + "loss": 3.5624, + "step": 22600 + }, + { + "epoch": 2.437843073942525, + "grad_norm": 0.5922572016716003, + "learning_rate": 0.00045426354918651007, + "loss": 3.5538, + "step": 22650 + }, + { + "epoch": 2.443224625982133, + "grad_norm": 0.6361838579177856, + "learning_rate": 0.0004539403081564486, + "loss": 3.5364, + "step": 22700 + }, + { + "epoch": 2.4486061780217416, + "grad_norm": 0.5849791765213013, + "learning_rate": 0.0004536170671263872, + "loss": 3.5606, + "step": 22750 + }, + { + "epoch": 2.4539877300613497, + "grad_norm": 0.6513404250144958, + "learning_rate": 0.00045329382609632574, + "loss": 3.535, + "step": 22800 + }, + { + "epoch": 2.459369282100958, + "grad_norm": 0.5861430168151855, + "learning_rate": 0.0004529705850662644, + "loss": 3.5432, + "step": 22850 + }, + { + "epoch": 2.464750834140566, + "grad_norm": 0.6476746797561646, + "learning_rate": 0.000452647344036203, + "loss": 3.5454, + "step": 22900 + }, + { + "epoch": 2.4701323861801745, + "grad_norm": 0.6115290522575378, + "learning_rate": 0.00045232410300614153, + "loss": 3.5439, + "step": 22950 + }, + { + "epoch": 2.4755139382197826, + "grad_norm": 0.5903562903404236, + "learning_rate": 0.0004520008619760801, + "loss": 3.5464, + "step": 23000 + }, + { + "epoch": 2.4755139382197826, + "eval_accuracy": 0.3655437354493411, + "eval_loss": 3.5417819023132324, + "eval_runtime": 206.588, + "eval_samples_per_second": 87.183, + "eval_steps_per_second": 5.45, + "step": 23000 + }, + { + "epoch": 2.4808954902593907, + "grad_norm": 0.5942795276641846, + "learning_rate": 0.0004516776209460187, + "loss": 3.5293, + "step": 23050 + }, + { + "epoch": 2.4862770422989993, + "grad_norm": 0.5559934973716736, + "learning_rate": 0.00045135437991595726, + "loss": 3.5569, + "step": 23100 + }, + { + "epoch": 2.4916585943386074, + "grad_norm": 0.6209527254104614, + "learning_rate": 0.0004510311388858959, + "loss": 3.5374, + "step": 23150 + }, + { + "epoch": 2.4970401463782155, + "grad_norm": 0.5962895750999451, + "learning_rate": 0.0004507078978558345, + "loss": 3.5538, + "step": 23200 + }, + { + "epoch": 2.5024216984178236, + "grad_norm": 0.6051533222198486, + "learning_rate": 0.00045038465682577304, + "loss": 3.552, + "step": 23250 + }, + { + "epoch": 2.5078032504574317, + "grad_norm": 0.5640018582344055, + "learning_rate": 0.00045006141579571164, + "loss": 3.5596, + "step": 23300 + }, + { + "epoch": 2.5131848024970402, + "grad_norm": 0.6006429195404053, + "learning_rate": 0.0004497381747656502, + "loss": 3.5692, + "step": 23350 + }, + { + "epoch": 2.5185663545366483, + "grad_norm": 0.6432710886001587, + "learning_rate": 0.00044941493373558877, + "loss": 3.5302, + "step": 23400 + }, + { + "epoch": 2.5239479065762565, + "grad_norm": 0.5585759282112122, + "learning_rate": 0.0004490916927055274, + "loss": 3.5559, + "step": 23450 + }, + { + "epoch": 2.529329458615865, + "grad_norm": 0.6355800032615662, + "learning_rate": 0.00044876845167546596, + "loss": 3.5332, + "step": 23500 + }, + { + "epoch": 2.534711010655473, + "grad_norm": 0.5865529179573059, + "learning_rate": 0.00044844521064540455, + "loss": 3.5558, + "step": 23550 + }, + { + "epoch": 2.540092562695081, + "grad_norm": 0.6548492312431335, + "learning_rate": 0.00044812196961534315, + "loss": 3.5339, + "step": 23600 + }, + { + "epoch": 2.5454741147346893, + "grad_norm": 0.6574897766113281, + "learning_rate": 0.0004477987285852817, + "loss": 3.5498, + "step": 23650 + }, + { + "epoch": 2.550855666774298, + "grad_norm": 0.5814394950866699, + "learning_rate": 0.00044747548755522034, + "loss": 3.5552, + "step": 23700 + }, + { + "epoch": 2.556237218813906, + "grad_norm": 0.5807026028633118, + "learning_rate": 0.00044715224652515893, + "loss": 3.5408, + "step": 23750 + }, + { + "epoch": 2.561618770853514, + "grad_norm": 0.6143361926078796, + "learning_rate": 0.0004468290054950975, + "loss": 3.5664, + "step": 23800 + }, + { + "epoch": 2.567000322893122, + "grad_norm": 0.5572286248207092, + "learning_rate": 0.00044650576446503607, + "loss": 3.5545, + "step": 23850 + }, + { + "epoch": 2.5723818749327307, + "grad_norm": 0.5634745955467224, + "learning_rate": 0.0004461825234349746, + "loss": 3.5302, + "step": 23900 + }, + { + "epoch": 2.577763426972339, + "grad_norm": 0.5858882665634155, + "learning_rate": 0.00044586574722551447, + "loss": 3.5528, + "step": 23950 + }, + { + "epoch": 2.583144979011947, + "grad_norm": 0.6945074200630188, + "learning_rate": 0.000445542506195453, + "loss": 3.5619, + "step": 24000 + }, + { + "epoch": 2.583144979011947, + "eval_accuracy": 0.36655561979355733, + "eval_loss": 3.5325984954833984, + "eval_runtime": 195.3908, + "eval_samples_per_second": 92.179, + "eval_steps_per_second": 5.763, + "step": 24000 + }, + { + "epoch": 2.5885265310515555, + "grad_norm": 0.5496438145637512, + "learning_rate": 0.0004452192651653916, + "loss": 3.5522, + "step": 24050 + }, + { + "epoch": 2.5939080830911636, + "grad_norm": 0.5970087051391602, + "learning_rate": 0.00044489602413533025, + "loss": 3.5635, + "step": 24100 + }, + { + "epoch": 2.5992896351307717, + "grad_norm": 0.6501139998435974, + "learning_rate": 0.0004445727831052688, + "loss": 3.533, + "step": 24150 + }, + { + "epoch": 2.60467118717038, + "grad_norm": 0.6015759110450745, + "learning_rate": 0.0004442495420752074, + "loss": 3.5451, + "step": 24200 + }, + { + "epoch": 2.610052739209988, + "grad_norm": 0.5871186852455139, + "learning_rate": 0.00044392630104514593, + "loss": 3.5508, + "step": 24250 + }, + { + "epoch": 2.6154342912495965, + "grad_norm": 0.5801131725311279, + "learning_rate": 0.0004436030600150845, + "loss": 3.5321, + "step": 24300 + }, + { + "epoch": 2.6208158432892046, + "grad_norm": 0.7078419327735901, + "learning_rate": 0.0004432798189850231, + "loss": 3.5526, + "step": 24350 + }, + { + "epoch": 2.6261973953288127, + "grad_norm": 0.6336981654167175, + "learning_rate": 0.0004429565779549617, + "loss": 3.553, + "step": 24400 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.621759831905365, + "learning_rate": 0.0004426333369249003, + "loss": 3.5336, + "step": 24450 + }, + { + "epoch": 2.6369604994080293, + "grad_norm": 0.5916746854782104, + "learning_rate": 0.0004423100958948389, + "loss": 3.5525, + "step": 24500 + }, + { + "epoch": 2.6423420514476375, + "grad_norm": 0.6469952464103699, + "learning_rate": 0.00044198685486477744, + "loss": 3.5264, + "step": 24550 + }, + { + "epoch": 2.6477236034872456, + "grad_norm": 0.6205400824546814, + "learning_rate": 0.00044166361383471604, + "loss": 3.5513, + "step": 24600 + }, + { + "epoch": 2.653105155526854, + "grad_norm": 0.6007969975471497, + "learning_rate": 0.0004413403728046547, + "loss": 3.5303, + "step": 24650 + }, + { + "epoch": 2.658486707566462, + "grad_norm": 0.6586825847625732, + "learning_rate": 0.0004410171317745932, + "loss": 3.5627, + "step": 24700 + }, + { + "epoch": 2.6638682596060703, + "grad_norm": 0.5926357507705688, + "learning_rate": 0.0004406938907445318, + "loss": 3.5663, + "step": 24750 + }, + { + "epoch": 2.6692498116456784, + "grad_norm": 0.5873600840568542, + "learning_rate": 0.00044037064971447036, + "loss": 3.5241, + "step": 24800 + }, + { + "epoch": 2.674631363685287, + "grad_norm": 0.592735767364502, + "learning_rate": 0.00044004740868440896, + "loss": 3.5567, + "step": 24850 + }, + { + "epoch": 2.680012915724895, + "grad_norm": 0.626936674118042, + "learning_rate": 0.00043972416765434755, + "loss": 3.5204, + "step": 24900 + }, + { + "epoch": 2.685394467764503, + "grad_norm": 0.5691058039665222, + "learning_rate": 0.00043940092662428615, + "loss": 3.5214, + "step": 24950 + }, + { + "epoch": 2.6907760198041117, + "grad_norm": 0.6450164914131165, + "learning_rate": 0.00043907768559422474, + "loss": 3.5487, + "step": 25000 + }, + { + "epoch": 2.6907760198041117, + "eval_accuracy": 0.36812326365894465, + "eval_loss": 3.521350145339966, + "eval_runtime": 197.2244, + "eval_samples_per_second": 91.322, + "eval_steps_per_second": 5.709, + "step": 25000 + }, + { + "epoch": 2.69615757184372, + "grad_norm": 0.6423863768577576, + "learning_rate": 0.00043875444456416334, + "loss": 3.5373, + "step": 25050 + }, + { + "epoch": 2.701539123883328, + "grad_norm": 0.6400243043899536, + "learning_rate": 0.0004384312035341019, + "loss": 3.5428, + "step": 25100 + }, + { + "epoch": 2.706920675922936, + "grad_norm": 0.6252772212028503, + "learning_rate": 0.00043810796250404047, + "loss": 3.5469, + "step": 25150 + }, + { + "epoch": 2.712302227962544, + "grad_norm": 0.5882490873336792, + "learning_rate": 0.000437784721473979, + "loss": 3.5376, + "step": 25200 + }, + { + "epoch": 2.7176837800021527, + "grad_norm": 0.604751467704773, + "learning_rate": 0.00043746148044391766, + "loss": 3.5409, + "step": 25250 + }, + { + "epoch": 2.723065332041761, + "grad_norm": 0.6292399168014526, + "learning_rate": 0.00043713823941385625, + "loss": 3.5313, + "step": 25300 + }, + { + "epoch": 2.728446884081369, + "grad_norm": 0.6144165396690369, + "learning_rate": 0.0004368149983837948, + "loss": 3.5484, + "step": 25350 + }, + { + "epoch": 2.7338284361209775, + "grad_norm": 0.6086465716362, + "learning_rate": 0.0004364917573537334, + "loss": 3.5381, + "step": 25400 + }, + { + "epoch": 2.7392099881605856, + "grad_norm": 0.6256065964698792, + "learning_rate": 0.00043616851632367193, + "loss": 3.557, + "step": 25450 + }, + { + "epoch": 2.7445915402001937, + "grad_norm": 0.6041662096977234, + "learning_rate": 0.0004358452752936106, + "loss": 3.5347, + "step": 25500 + }, + { + "epoch": 2.749973092239802, + "grad_norm": 0.6548029780387878, + "learning_rate": 0.0004355220342635492, + "loss": 3.5374, + "step": 25550 + }, + { + "epoch": 2.7553546442794103, + "grad_norm": 0.6124310493469238, + "learning_rate": 0.00043519879323348777, + "loss": 3.529, + "step": 25600 + }, + { + "epoch": 2.7607361963190185, + "grad_norm": 0.596762478351593, + "learning_rate": 0.0004348755522034263, + "loss": 3.5273, + "step": 25650 + }, + { + "epoch": 2.7661177483586266, + "grad_norm": 0.5779131650924683, + "learning_rate": 0.0004345523111733649, + "loss": 3.5448, + "step": 25700 + }, + { + "epoch": 2.7714993003982347, + "grad_norm": 0.6550736427307129, + "learning_rate": 0.00043422907014330344, + "loss": 3.5405, + "step": 25750 + }, + { + "epoch": 2.776880852437843, + "grad_norm": 0.7348501682281494, + "learning_rate": 0.0004339058291132421, + "loss": 3.5329, + "step": 25800 + }, + { + "epoch": 2.7822624044774513, + "grad_norm": 0.6231963038444519, + "learning_rate": 0.0004335825880831807, + "loss": 3.5465, + "step": 25850 + }, + { + "epoch": 2.7876439565170594, + "grad_norm": 0.5558022856712341, + "learning_rate": 0.00043325934705311923, + "loss": 3.5381, + "step": 25900 + }, + { + "epoch": 2.793025508556668, + "grad_norm": 0.6114339232444763, + "learning_rate": 0.0004329361060230578, + "loss": 3.5492, + "step": 25950 + }, + { + "epoch": 2.798407060596276, + "grad_norm": 0.621716320514679, + "learning_rate": 0.00043261286499299636, + "loss": 3.5277, + "step": 26000 + }, + { + "epoch": 2.798407060596276, + "eval_accuracy": 0.36862752171129165, + "eval_loss": 3.51253342628479, + "eval_runtime": 212.7222, + "eval_samples_per_second": 84.669, + "eval_steps_per_second": 5.293, + "step": 26000 + }, + { + "epoch": 2.803788612635884, + "grad_norm": 0.6198678016662598, + "learning_rate": 0.00043228962396293496, + "loss": 3.5241, + "step": 26050 + }, + { + "epoch": 2.8091701646754923, + "grad_norm": 0.5953556299209595, + "learning_rate": 0.0004319663829328736, + "loss": 3.5257, + "step": 26100 + }, + { + "epoch": 2.8145517167151004, + "grad_norm": 0.6188264489173889, + "learning_rate": 0.00043164314190281215, + "loss": 3.5268, + "step": 26150 + }, + { + "epoch": 2.819933268754709, + "grad_norm": 0.5980226993560791, + "learning_rate": 0.00043131990087275074, + "loss": 3.5478, + "step": 26200 + }, + { + "epoch": 2.825314820794317, + "grad_norm": 0.6212344169616699, + "learning_rate": 0.00043099665984268934, + "loss": 3.5297, + "step": 26250 + }, + { + "epoch": 2.830696372833925, + "grad_norm": 0.5705240964889526, + "learning_rate": 0.0004306734188126279, + "loss": 3.5324, + "step": 26300 + }, + { + "epoch": 2.8360779248735337, + "grad_norm": 0.6547979712486267, + "learning_rate": 0.00043035017778256647, + "loss": 3.533, + "step": 26350 + }, + { + "epoch": 2.841459476913142, + "grad_norm": 0.5784933567047119, + "learning_rate": 0.0004300269367525051, + "loss": 3.5297, + "step": 26400 + }, + { + "epoch": 2.84684102895275, + "grad_norm": 0.7146517038345337, + "learning_rate": 0.00042970369572244366, + "loss": 3.5382, + "step": 26450 + }, + { + "epoch": 2.852222580992358, + "grad_norm": 0.6623178124427795, + "learning_rate": 0.00042938045469238226, + "loss": 3.5374, + "step": 26500 + }, + { + "epoch": 2.857604133031966, + "grad_norm": 0.5954160094261169, + "learning_rate": 0.0004290572136623208, + "loss": 3.5509, + "step": 26550 + }, + { + "epoch": 2.8629856850715747, + "grad_norm": 0.6484636068344116, + "learning_rate": 0.0004287339726322594, + "loss": 3.5399, + "step": 26600 + }, + { + "epoch": 2.868367237111183, + "grad_norm": 0.6219885349273682, + "learning_rate": 0.00042841073160219804, + "loss": 3.5334, + "step": 26650 + }, + { + "epoch": 2.873748789150791, + "grad_norm": 0.6355128288269043, + "learning_rate": 0.0004280874905721366, + "loss": 3.5135, + "step": 26700 + }, + { + "epoch": 2.8791303411903995, + "grad_norm": 0.5998795032501221, + "learning_rate": 0.0004277642495420752, + "loss": 3.5442, + "step": 26750 + }, + { + "epoch": 2.8845118932300076, + "grad_norm": 0.5889739394187927, + "learning_rate": 0.00042744100851201377, + "loss": 3.5133, + "step": 26800 + }, + { + "epoch": 2.8898934452696157, + "grad_norm": 0.6281293630599976, + "learning_rate": 0.0004271177674819523, + "loss": 3.5056, + "step": 26850 + }, + { + "epoch": 2.895274997309224, + "grad_norm": 0.5909113883972168, + "learning_rate": 0.0004267945264518909, + "loss": 3.5157, + "step": 26900 + }, + { + "epoch": 2.9006565493488323, + "grad_norm": 0.6297270059585571, + "learning_rate": 0.00042647128542182955, + "loss": 3.5354, + "step": 26950 + }, + { + "epoch": 2.9060381013884404, + "grad_norm": 0.6224883794784546, + "learning_rate": 0.0004261480443917681, + "loss": 3.5346, + "step": 27000 + }, + { + "epoch": 2.9060381013884404, + "eval_accuracy": 0.3694928333098462, + "eval_loss": 3.5037596225738525, + "eval_runtime": 206.2343, + "eval_samples_per_second": 87.333, + "eval_steps_per_second": 5.46, + "step": 27000 + }, + { + "epoch": 2.9114196534280485, + "grad_norm": 0.6158651113510132, + "learning_rate": 0.00042583126818230795, + "loss": 3.5352, + "step": 27050 + }, + { + "epoch": 2.9168012054676566, + "grad_norm": 0.6259952187538147, + "learning_rate": 0.0004255080271522465, + "loss": 3.5156, + "step": 27100 + }, + { + "epoch": 2.922182757507265, + "grad_norm": 0.5961571335792542, + "learning_rate": 0.0004251847861221851, + "loss": 3.5362, + "step": 27150 + }, + { + "epoch": 2.9275643095468733, + "grad_norm": 0.62592613697052, + "learning_rate": 0.00042486154509212363, + "loss": 3.5094, + "step": 27200 + }, + { + "epoch": 2.9329458615864814, + "grad_norm": 0.5974050760269165, + "learning_rate": 0.0004245383040620622, + "loss": 3.5258, + "step": 27250 + }, + { + "epoch": 2.93832741362609, + "grad_norm": 0.5732911229133606, + "learning_rate": 0.0004242150630320009, + "loss": 3.5374, + "step": 27300 + }, + { + "epoch": 2.943708965665698, + "grad_norm": 0.6453855633735657, + "learning_rate": 0.0004238918220019394, + "loss": 3.527, + "step": 27350 + }, + { + "epoch": 2.949090517705306, + "grad_norm": 0.5842043161392212, + "learning_rate": 0.000423568580971878, + "loss": 3.5214, + "step": 27400 + }, + { + "epoch": 2.9544720697449143, + "grad_norm": 0.6021690964698792, + "learning_rate": 0.00042324533994181655, + "loss": 3.5026, + "step": 27450 + }, + { + "epoch": 2.9598536217845224, + "grad_norm": 0.6042524576187134, + "learning_rate": 0.00042292209891175514, + "loss": 3.5169, + "step": 27500 + }, + { + "epoch": 2.965235173824131, + "grad_norm": 0.6471766829490662, + "learning_rate": 0.00042259885788169374, + "loss": 3.5289, + "step": 27550 + }, + { + "epoch": 2.970616725863739, + "grad_norm": 0.6328997611999512, + "learning_rate": 0.00042227561685163233, + "loss": 3.521, + "step": 27600 + }, + { + "epoch": 2.975998277903347, + "grad_norm": 0.6381612420082092, + "learning_rate": 0.00042195237582157093, + "loss": 3.5258, + "step": 27650 + }, + { + "epoch": 2.9813798299429557, + "grad_norm": 0.6035014986991882, + "learning_rate": 0.0004216291347915095, + "loss": 3.5295, + "step": 27700 + }, + { + "epoch": 2.986761381982564, + "grad_norm": 0.6743372678756714, + "learning_rate": 0.00042130589376144806, + "loss": 3.5261, + "step": 27750 + }, + { + "epoch": 2.992142934022172, + "grad_norm": 0.6017929315567017, + "learning_rate": 0.00042098265273138666, + "loss": 3.5172, + "step": 27800 + }, + { + "epoch": 2.9975244860617805, + "grad_norm": 0.6554933786392212, + "learning_rate": 0.0004206594117013252, + "loss": 3.5245, + "step": 27850 + }, + { + "epoch": 3.0029060381013886, + "grad_norm": 0.6437206864356995, + "learning_rate": 0.00042033617067126385, + "loss": 3.4742, + "step": 27900 + }, + { + "epoch": 3.0082875901409967, + "grad_norm": 0.6490093469619751, + "learning_rate": 0.00042001292964120244, + "loss": 3.4372, + "step": 27950 + }, + { + "epoch": 3.0136691421806048, + "grad_norm": 0.6507807374000549, + "learning_rate": 0.000419689688611141, + "loss": 3.4461, + "step": 28000 + }, + { + "epoch": 3.0136691421806048, + "eval_accuracy": 0.3706709565724186, + "eval_loss": 3.497730016708374, + "eval_runtime": 215.1239, + "eval_samples_per_second": 83.724, + "eval_steps_per_second": 5.234, + "step": 28000 + }, + { + "epoch": 3.0190506942202133, + "grad_norm": 0.6191187500953674, + "learning_rate": 0.0004193664475810796, + "loss": 3.4181, + "step": 28050 + }, + { + "epoch": 3.0244322462598214, + "grad_norm": 0.62087082862854, + "learning_rate": 0.00041904320655101817, + "loss": 3.4181, + "step": 28100 + }, + { + "epoch": 3.0298137982994295, + "grad_norm": 0.6341911554336548, + "learning_rate": 0.0004187199655209567, + "loss": 3.4286, + "step": 28150 + }, + { + "epoch": 3.0351953503390376, + "grad_norm": 0.6108666062355042, + "learning_rate": 0.00041839672449089536, + "loss": 3.4278, + "step": 28200 + }, + { + "epoch": 3.040576902378646, + "grad_norm": 0.5951593518257141, + "learning_rate": 0.00041807348346083395, + "loss": 3.441, + "step": 28250 + }, + { + "epoch": 3.0459584544182543, + "grad_norm": 0.6405830383300781, + "learning_rate": 0.0004177502424307725, + "loss": 3.4542, + "step": 28300 + }, + { + "epoch": 3.0513400064578624, + "grad_norm": 0.6463958024978638, + "learning_rate": 0.0004174270014007111, + "loss": 3.4252, + "step": 28350 + }, + { + "epoch": 3.0567215584974705, + "grad_norm": 0.5980671048164368, + "learning_rate": 0.00041710376037064963, + "loss": 3.4458, + "step": 28400 + }, + { + "epoch": 3.062103110537079, + "grad_norm": 0.6921752691268921, + "learning_rate": 0.0004167805193405883, + "loss": 3.4406, + "step": 28450 + }, + { + "epoch": 3.067484662576687, + "grad_norm": 0.6020042896270752, + "learning_rate": 0.0004164572783105269, + "loss": 3.4446, + "step": 28500 + }, + { + "epoch": 3.0728662146162953, + "grad_norm": 0.6640591025352478, + "learning_rate": 0.0004161340372804654, + "loss": 3.4635, + "step": 28550 + }, + { + "epoch": 3.0782477666559034, + "grad_norm": 0.6354085206985474, + "learning_rate": 0.000415810796250404, + "loss": 3.4555, + "step": 28600 + }, + { + "epoch": 3.083629318695512, + "grad_norm": 0.636901319026947, + "learning_rate": 0.0004154875552203426, + "loss": 3.4515, + "step": 28650 + }, + { + "epoch": 3.08901087073512, + "grad_norm": 0.6186822056770325, + "learning_rate": 0.00041516431419028114, + "loss": 3.4345, + "step": 28700 + }, + { + "epoch": 3.094392422774728, + "grad_norm": 0.6416160464286804, + "learning_rate": 0.0004148410731602198, + "loss": 3.45, + "step": 28750 + }, + { + "epoch": 3.0997739748143363, + "grad_norm": 0.6019704341888428, + "learning_rate": 0.0004145178321301584, + "loss": 3.4452, + "step": 28800 + }, + { + "epoch": 3.105155526853945, + "grad_norm": 0.6140844225883484, + "learning_rate": 0.00041419459110009693, + "loss": 3.4474, + "step": 28850 + }, + { + "epoch": 3.110537078893553, + "grad_norm": 0.6584010720252991, + "learning_rate": 0.0004138713500700355, + "loss": 3.442, + "step": 28900 + }, + { + "epoch": 3.115918630933161, + "grad_norm": 0.6763238906860352, + "learning_rate": 0.00041354810903997406, + "loss": 3.4475, + "step": 28950 + }, + { + "epoch": 3.121300182972769, + "grad_norm": 0.6160973906517029, + "learning_rate": 0.00041322486800991266, + "loss": 3.4491, + "step": 29000 + }, + { + "epoch": 3.121300182972769, + "eval_accuracy": 0.3712210661434233, + "eval_loss": 3.494593858718872, + "eval_runtime": 209.8562, + "eval_samples_per_second": 85.825, + "eval_steps_per_second": 5.366, + "step": 29000 + }, + { + "epoch": 3.1266817350123777, + "grad_norm": 0.6309645771980286, + "learning_rate": 0.0004129016269798513, + "loss": 3.452, + "step": 29050 + }, + { + "epoch": 3.132063287051986, + "grad_norm": 0.616537868976593, + "learning_rate": 0.00041257838594978985, + "loss": 3.4708, + "step": 29100 + }, + { + "epoch": 3.137444839091594, + "grad_norm": 0.6185223460197449, + "learning_rate": 0.00041225514491972844, + "loss": 3.455, + "step": 29150 + }, + { + "epoch": 3.1428263911312024, + "grad_norm": 0.6138353943824768, + "learning_rate": 0.00041193190388966704, + "loss": 3.4406, + "step": 29200 + }, + { + "epoch": 3.1482079431708105, + "grad_norm": 0.608535647392273, + "learning_rate": 0.0004116086628596056, + "loss": 3.4461, + "step": 29250 + }, + { + "epoch": 3.1535894952104186, + "grad_norm": 0.6112627387046814, + "learning_rate": 0.0004112854218295442, + "loss": 3.4395, + "step": 29300 + }, + { + "epoch": 3.1589710472500268, + "grad_norm": 0.5768312811851501, + "learning_rate": 0.0004109621807994828, + "loss": 3.4302, + "step": 29350 + }, + { + "epoch": 3.1643525992896353, + "grad_norm": 0.6325687170028687, + "learning_rate": 0.00041063893976942136, + "loss": 3.4465, + "step": 29400 + }, + { + "epoch": 3.1697341513292434, + "grad_norm": 0.6342477202415466, + "learning_rate": 0.00041031569873935996, + "loss": 3.442, + "step": 29450 + }, + { + "epoch": 3.1751157033688515, + "grad_norm": 0.6336598992347717, + "learning_rate": 0.0004099924577092985, + "loss": 3.4462, + "step": 29500 + }, + { + "epoch": 3.1804972554084596, + "grad_norm": 0.607629656791687, + "learning_rate": 0.0004096692166792371, + "loss": 3.4299, + "step": 29550 + }, + { + "epoch": 3.185878807448068, + "grad_norm": 0.6477324366569519, + "learning_rate": 0.00040934597564917574, + "loss": 3.448, + "step": 29600 + }, + { + "epoch": 3.1912603594876763, + "grad_norm": 0.6741542220115662, + "learning_rate": 0.0004090227346191143, + "loss": 3.4596, + "step": 29650 + }, + { + "epoch": 3.1966419115272844, + "grad_norm": 0.6519025564193726, + "learning_rate": 0.0004086994935890529, + "loss": 3.4715, + "step": 29700 + }, + { + "epoch": 3.2020234635668925, + "grad_norm": 0.6172560453414917, + "learning_rate": 0.00040837625255899147, + "loss": 3.4494, + "step": 29750 + }, + { + "epoch": 3.207405015606501, + "grad_norm": 0.6829916834831238, + "learning_rate": 0.00040805301152893, + "loss": 3.4398, + "step": 29800 + }, + { + "epoch": 3.212786567646109, + "grad_norm": 0.6081319451332092, + "learning_rate": 0.0004077297704988686, + "loss": 3.446, + "step": 29850 + }, + { + "epoch": 3.2181681196857173, + "grad_norm": 0.6863629817962646, + "learning_rate": 0.00040740652946880725, + "loss": 3.4471, + "step": 29900 + }, + { + "epoch": 3.2235496717253254, + "grad_norm": 0.6253100633621216, + "learning_rate": 0.0004070832884387458, + "loss": 3.4593, + "step": 29950 + }, + { + "epoch": 3.228931223764934, + "grad_norm": 0.61070317029953, + "learning_rate": 0.0004067600474086844, + "loss": 3.4684, + "step": 30000 + }, + { + "epoch": 3.228931223764934, + "eval_accuracy": 0.3713381939564873, + "eval_loss": 3.4922268390655518, + "eval_runtime": 208.2583, + "eval_samples_per_second": 86.484, + "eval_steps_per_second": 5.407, + "step": 30000 + }, + { + "epoch": 3.234312775804542, + "grad_norm": 0.6275219321250916, + "learning_rate": 0.00040643680637862293, + "loss": 3.4539, + "step": 30050 + }, + { + "epoch": 3.23969432784415, + "grad_norm": 0.6307212710380554, + "learning_rate": 0.0004061135653485615, + "loss": 3.483, + "step": 30100 + }, + { + "epoch": 3.2450758798837587, + "grad_norm": 0.6696796417236328, + "learning_rate": 0.0004057903243185001, + "loss": 3.4521, + "step": 30150 + }, + { + "epoch": 3.250457431923367, + "grad_norm": 0.6193971037864685, + "learning_rate": 0.0004054670832884387, + "loss": 3.4574, + "step": 30200 + }, + { + "epoch": 3.255838983962975, + "grad_norm": 0.6391464471817017, + "learning_rate": 0.0004051438422583773, + "loss": 3.4539, + "step": 30250 + }, + { + "epoch": 3.261220536002583, + "grad_norm": 0.641261100769043, + "learning_rate": 0.0004048206012283159, + "loss": 3.4772, + "step": 30300 + }, + { + "epoch": 3.2666020880421915, + "grad_norm": 0.6269187331199646, + "learning_rate": 0.00040449736019825444, + "loss": 3.4534, + "step": 30350 + }, + { + "epoch": 3.2719836400817996, + "grad_norm": 0.6142138242721558, + "learning_rate": 0.00040417411916819304, + "loss": 3.4688, + "step": 30400 + }, + { + "epoch": 3.2773651921214078, + "grad_norm": 0.6109180450439453, + "learning_rate": 0.0004038508781381317, + "loss": 3.4442, + "step": 30450 + }, + { + "epoch": 3.282746744161016, + "grad_norm": 0.6190860867500305, + "learning_rate": 0.0004035276371080702, + "loss": 3.4456, + "step": 30500 + }, + { + "epoch": 3.2881282962006244, + "grad_norm": 0.6348389983177185, + "learning_rate": 0.0004032043960780088, + "loss": 3.4503, + "step": 30550 + }, + { + "epoch": 3.2935098482402325, + "grad_norm": 0.669296145439148, + "learning_rate": 0.00040288115504794736, + "loss": 3.4602, + "step": 30600 + }, + { + "epoch": 3.2988914002798406, + "grad_norm": 0.6249747276306152, + "learning_rate": 0.00040255791401788596, + "loss": 3.4655, + "step": 30650 + }, + { + "epoch": 3.304272952319449, + "grad_norm": 0.577464759349823, + "learning_rate": 0.00040223467298782455, + "loss": 3.4727, + "step": 30700 + }, + { + "epoch": 3.3096545043590573, + "grad_norm": 0.6019587516784668, + "learning_rate": 0.00040191143195776315, + "loss": 3.4585, + "step": 30750 + }, + { + "epoch": 3.3150360563986654, + "grad_norm": 0.601098895072937, + "learning_rate": 0.00040158819092770174, + "loss": 3.4467, + "step": 30800 + }, + { + "epoch": 3.3204176084382735, + "grad_norm": 0.6603884696960449, + "learning_rate": 0.00040127141471824155, + "loss": 3.4478, + "step": 30850 + }, + { + "epoch": 3.3257991604778816, + "grad_norm": 0.6579861044883728, + "learning_rate": 0.00040094817368818014, + "loss": 3.4784, + "step": 30900 + }, + { + "epoch": 3.33118071251749, + "grad_norm": 0.622153103351593, + "learning_rate": 0.0004006249326581187, + "loss": 3.469, + "step": 30950 + }, + { + "epoch": 3.3365622645570983, + "grad_norm": 0.6163874864578247, + "learning_rate": 0.0004003016916280573, + "loss": 3.4662, + "step": 31000 + }, + { + "epoch": 3.3365622645570983, + "eval_accuracy": 0.37212973024421586, + "eval_loss": 3.4844980239868164, + "eval_runtime": 207.7673, + "eval_samples_per_second": 86.688, + "eval_steps_per_second": 5.42, + "step": 31000 + }, + { + "epoch": 3.3419438165967064, + "grad_norm": 0.6363682746887207, + "learning_rate": 0.00039997845059799587, + "loss": 3.4479, + "step": 31050 + }, + { + "epoch": 3.347325368636315, + "grad_norm": 0.6347747445106506, + "learning_rate": 0.00039965520956793447, + "loss": 3.4736, + "step": 31100 + }, + { + "epoch": 3.352706920675923, + "grad_norm": 0.6099669933319092, + "learning_rate": 0.00039933196853787306, + "loss": 3.4788, + "step": 31150 + }, + { + "epoch": 3.358088472715531, + "grad_norm": 0.6515194177627563, + "learning_rate": 0.00039900872750781166, + "loss": 3.4613, + "step": 31200 + }, + { + "epoch": 3.3634700247551392, + "grad_norm": 0.6574184894561768, + "learning_rate": 0.0003986854864777502, + "loss": 3.4714, + "step": 31250 + }, + { + "epoch": 3.368851576794748, + "grad_norm": 0.6157483458518982, + "learning_rate": 0.0003983622454476888, + "loss": 3.4752, + "step": 31300 + }, + { + "epoch": 3.374233128834356, + "grad_norm": 0.632178544998169, + "learning_rate": 0.00039803900441762733, + "loss": 3.4579, + "step": 31350 + }, + { + "epoch": 3.379614680873964, + "grad_norm": 0.6561319231987, + "learning_rate": 0.000397715763387566, + "loss": 3.4659, + "step": 31400 + }, + { + "epoch": 3.384996232913572, + "grad_norm": 0.6884437203407288, + "learning_rate": 0.0003973925223575046, + "loss": 3.4761, + "step": 31450 + }, + { + "epoch": 3.3903777849531807, + "grad_norm": 0.6156170964241028, + "learning_rate": 0.0003970692813274431, + "loss": 3.4604, + "step": 31500 + }, + { + "epoch": 3.3957593369927888, + "grad_norm": 0.7083854675292969, + "learning_rate": 0.0003967460402973817, + "loss": 3.465, + "step": 31550 + }, + { + "epoch": 3.401140889032397, + "grad_norm": 0.6180551648139954, + "learning_rate": 0.0003964227992673203, + "loss": 3.4453, + "step": 31600 + }, + { + "epoch": 3.4065224410720054, + "grad_norm": 0.6462782621383667, + "learning_rate": 0.00039609955823725884, + "loss": 3.4589, + "step": 31650 + }, + { + "epoch": 3.4119039931116135, + "grad_norm": 0.6157835721969604, + "learning_rate": 0.0003957763172071975, + "loss": 3.462, + "step": 31700 + }, + { + "epoch": 3.4172855451512216, + "grad_norm": 0.6632956862449646, + "learning_rate": 0.0003954530761771361, + "loss": 3.4574, + "step": 31750 + }, + { + "epoch": 3.4226670971908297, + "grad_norm": 0.5914563536643982, + "learning_rate": 0.00039512983514707463, + "loss": 3.4395, + "step": 31800 + }, + { + "epoch": 3.428048649230438, + "grad_norm": 0.6110662817955017, + "learning_rate": 0.0003948065941170132, + "loss": 3.4704, + "step": 31850 + }, + { + "epoch": 3.4334302012700464, + "grad_norm": 0.6194736361503601, + "learning_rate": 0.00039448335308695176, + "loss": 3.4669, + "step": 31900 + }, + { + "epoch": 3.4388117533096545, + "grad_norm": 0.5982261896133423, + "learning_rate": 0.00039416011205689036, + "loss": 3.4541, + "step": 31950 + }, + { + "epoch": 3.4441933053492626, + "grad_norm": 0.5901939868927002, + "learning_rate": 0.000393836871026829, + "loss": 3.4454, + "step": 32000 + }, + { + "epoch": 3.4441933053492626, + "eval_accuracy": 0.3730818555001341, + "eval_loss": 3.4764976501464844, + "eval_runtime": 194.7932, + "eval_samples_per_second": 92.462, + "eval_steps_per_second": 5.78, + "step": 32000 + }, + { + "epoch": 3.449574857388871, + "grad_norm": 0.633826732635498, + "learning_rate": 0.00039351362999676755, + "loss": 3.4546, + "step": 32050 + }, + { + "epoch": 3.4549564094284793, + "grad_norm": 0.6311871409416199, + "learning_rate": 0.00039319038896670614, + "loss": 3.4664, + "step": 32100 + }, + { + "epoch": 3.4603379614680874, + "grad_norm": 0.6177763342857361, + "learning_rate": 0.00039286714793664474, + "loss": 3.4497, + "step": 32150 + }, + { + "epoch": 3.4657195135076955, + "grad_norm": 0.6959267258644104, + "learning_rate": 0.0003925439069065833, + "loss": 3.461, + "step": 32200 + }, + { + "epoch": 3.471101065547304, + "grad_norm": 0.6201044321060181, + "learning_rate": 0.0003922206658765219, + "loss": 3.4577, + "step": 32250 + }, + { + "epoch": 3.476482617586912, + "grad_norm": 0.6961297988891602, + "learning_rate": 0.0003918974248464605, + "loss": 3.4482, + "step": 32300 + }, + { + "epoch": 3.4818641696265202, + "grad_norm": 0.6520218253135681, + "learning_rate": 0.00039157418381639906, + "loss": 3.4618, + "step": 32350 + }, + { + "epoch": 3.4872457216661283, + "grad_norm": 0.6764187216758728, + "learning_rate": 0.00039125094278633766, + "loss": 3.4571, + "step": 32400 + }, + { + "epoch": 3.492627273705737, + "grad_norm": 0.6045281291007996, + "learning_rate": 0.0003909277017562762, + "loss": 3.464, + "step": 32450 + }, + { + "epoch": 3.498008825745345, + "grad_norm": 0.6529583930969238, + "learning_rate": 0.0003906044607262148, + "loss": 3.4465, + "step": 32500 + }, + { + "epoch": 3.503390377784953, + "grad_norm": 0.6828312277793884, + "learning_rate": 0.00039028121969615344, + "loss": 3.4478, + "step": 32550 + }, + { + "epoch": 3.5087719298245617, + "grad_norm": 0.6279124021530151, + "learning_rate": 0.000389957978666092, + "loss": 3.4532, + "step": 32600 + }, + { + "epoch": 3.5141534818641698, + "grad_norm": 0.6408252120018005, + "learning_rate": 0.0003896347376360306, + "loss": 3.461, + "step": 32650 + }, + { + "epoch": 3.519535033903778, + "grad_norm": 0.6262297034263611, + "learning_rate": 0.0003893114966059691, + "loss": 3.4685, + "step": 32700 + }, + { + "epoch": 3.524916585943386, + "grad_norm": 0.6279938220977783, + "learning_rate": 0.0003889882555759077, + "loss": 3.4681, + "step": 32750 + }, + { + "epoch": 3.530298137982994, + "grad_norm": 0.6286149621009827, + "learning_rate": 0.0003886650145458463, + "loss": 3.4685, + "step": 32800 + }, + { + "epoch": 3.5356796900226026, + "grad_norm": 0.6136440634727478, + "learning_rate": 0.0003883417735157849, + "loss": 3.4395, + "step": 32850 + }, + { + "epoch": 3.5410612420622107, + "grad_norm": 0.6475735902786255, + "learning_rate": 0.0003880185324857235, + "loss": 3.4601, + "step": 32900 + }, + { + "epoch": 3.546442794101819, + "grad_norm": 0.6448686718940735, + "learning_rate": 0.0003877017562762633, + "loss": 3.4557, + "step": 32950 + }, + { + "epoch": 3.5518243461414274, + "grad_norm": 0.6235514283180237, + "learning_rate": 0.0003873785152462019, + "loss": 3.4605, + "step": 33000 + }, + { + "epoch": 3.5518243461414274, + "eval_accuracy": 0.3741357885119346, + "eval_loss": 3.4696691036224365, + "eval_runtime": 204.336, + "eval_samples_per_second": 88.144, + "eval_steps_per_second": 5.511, + "step": 33000 + }, + { + "epoch": 3.5572058981810355, + "grad_norm": 0.6031110882759094, + "learning_rate": 0.0003870552742161405, + "loss": 3.4576, + "step": 33050 + }, + { + "epoch": 3.5625874502206436, + "grad_norm": 0.6580427289009094, + "learning_rate": 0.00038673203318607903, + "loss": 3.4585, + "step": 33100 + }, + { + "epoch": 3.5679690022602517, + "grad_norm": 0.6172491908073425, + "learning_rate": 0.0003864087921560176, + "loss": 3.445, + "step": 33150 + }, + { + "epoch": 3.57335055429986, + "grad_norm": 0.6818593144416809, + "learning_rate": 0.0003860855511259563, + "loss": 3.4474, + "step": 33200 + }, + { + "epoch": 3.5787321063394684, + "grad_norm": 0.6435357332229614, + "learning_rate": 0.0003857623100958948, + "loss": 3.4501, + "step": 33250 + }, + { + "epoch": 3.5841136583790765, + "grad_norm": 0.639147937297821, + "learning_rate": 0.0003854390690658334, + "loss": 3.481, + "step": 33300 + }, + { + "epoch": 3.5894952104186846, + "grad_norm": 0.6366181373596191, + "learning_rate": 0.00038511582803577195, + "loss": 3.4775, + "step": 33350 + }, + { + "epoch": 3.594876762458293, + "grad_norm": 0.6355386972427368, + "learning_rate": 0.00038479258700571054, + "loss": 3.4591, + "step": 33400 + }, + { + "epoch": 3.6002583144979012, + "grad_norm": 0.6315577030181885, + "learning_rate": 0.00038446934597564914, + "loss": 3.4535, + "step": 33450 + }, + { + "epoch": 3.6056398665375093, + "grad_norm": 0.6854292750358582, + "learning_rate": 0.00038414610494558773, + "loss": 3.4665, + "step": 33500 + }, + { + "epoch": 3.611021418577118, + "grad_norm": 0.680067777633667, + "learning_rate": 0.00038382286391552633, + "loss": 3.4476, + "step": 33550 + }, + { + "epoch": 3.616402970616726, + "grad_norm": 0.6212747693061829, + "learning_rate": 0.0003834996228854649, + "loss": 3.4603, + "step": 33600 + }, + { + "epoch": 3.621784522656334, + "grad_norm": 0.6352766156196594, + "learning_rate": 0.00038317638185540346, + "loss": 3.452, + "step": 33650 + }, + { + "epoch": 3.627166074695942, + "grad_norm": 0.6111792922019958, + "learning_rate": 0.00038285314082534206, + "loss": 3.4636, + "step": 33700 + }, + { + "epoch": 3.6325476267355503, + "grad_norm": 0.6384958028793335, + "learning_rate": 0.0003825298997952806, + "loss": 3.4458, + "step": 33750 + }, + { + "epoch": 3.637929178775159, + "grad_norm": 0.6627931594848633, + "learning_rate": 0.00038220665876521925, + "loss": 3.4675, + "step": 33800 + }, + { + "epoch": 3.643310730814767, + "grad_norm": 0.7328792810440063, + "learning_rate": 0.00038188341773515784, + "loss": 3.434, + "step": 33850 + }, + { + "epoch": 3.648692282854375, + "grad_norm": 0.7406517863273621, + "learning_rate": 0.0003815601767050964, + "loss": 3.4399, + "step": 33900 + }, + { + "epoch": 3.6540738348939836, + "grad_norm": 0.6318268775939941, + "learning_rate": 0.000381236935675035, + "loss": 3.4448, + "step": 33950 + }, + { + "epoch": 3.6594553869335917, + "grad_norm": 0.6655885577201843, + "learning_rate": 0.0003809136946449735, + "loss": 3.4428, + "step": 34000 + }, + { + "epoch": 3.6594553869335917, + "eval_accuracy": 0.3745497560145078, + "eval_loss": 3.463956594467163, + "eval_runtime": 207.206, + "eval_samples_per_second": 86.923, + "eval_steps_per_second": 5.434, + "step": 34000 + }, + { + "epoch": 3.6648369389732, + "grad_norm": 0.6424694061279297, + "learning_rate": 0.00038059045361491217, + "loss": 3.4602, + "step": 34050 + }, + { + "epoch": 3.670218491012808, + "grad_norm": 0.6229196190834045, + "learning_rate": 0.00038026721258485076, + "loss": 3.463, + "step": 34100 + }, + { + "epoch": 3.675600043052416, + "grad_norm": 0.6844025254249573, + "learning_rate": 0.0003799439715547893, + "loss": 3.4696, + "step": 34150 + }, + { + "epoch": 3.6809815950920246, + "grad_norm": 0.707100510597229, + "learning_rate": 0.0003796207305247279, + "loss": 3.465, + "step": 34200 + }, + { + "epoch": 3.6863631471316327, + "grad_norm": 0.6834603548049927, + "learning_rate": 0.0003792974894946665, + "loss": 3.4657, + "step": 34250 + }, + { + "epoch": 3.691744699171241, + "grad_norm": 0.6267171502113342, + "learning_rate": 0.00037897424846460503, + "loss": 3.4584, + "step": 34300 + }, + { + "epoch": 3.6971262512108494, + "grad_norm": 0.6426784992218018, + "learning_rate": 0.0003786510074345437, + "loss": 3.4654, + "step": 34350 + }, + { + "epoch": 3.7025078032504575, + "grad_norm": 0.697043776512146, + "learning_rate": 0.0003783277664044823, + "loss": 3.451, + "step": 34400 + }, + { + "epoch": 3.7078893552900656, + "grad_norm": 0.6688522100448608, + "learning_rate": 0.0003780045253744208, + "loss": 3.4714, + "step": 34450 + }, + { + "epoch": 3.713270907329674, + "grad_norm": 0.6643778085708618, + "learning_rate": 0.0003776812843443594, + "loss": 3.4316, + "step": 34500 + }, + { + "epoch": 3.7186524593692822, + "grad_norm": 0.6475136280059814, + "learning_rate": 0.00037735804331429795, + "loss": 3.4301, + "step": 34550 + }, + { + "epoch": 3.7240340114088903, + "grad_norm": 0.6495978236198425, + "learning_rate": 0.00037703480228423654, + "loss": 3.4566, + "step": 34600 + }, + { + "epoch": 3.7294155634484984, + "grad_norm": 0.6580095887184143, + "learning_rate": 0.0003767115612541752, + "loss": 3.4636, + "step": 34650 + }, + { + "epoch": 3.7347971154881066, + "grad_norm": 0.6825853586196899, + "learning_rate": 0.00037638832022411373, + "loss": 3.459, + "step": 34700 + }, + { + "epoch": 3.740178667527715, + "grad_norm": 0.6098463535308838, + "learning_rate": 0.00037606507919405233, + "loss": 3.4392, + "step": 34750 + }, + { + "epoch": 3.745560219567323, + "grad_norm": 0.7137957811355591, + "learning_rate": 0.0003757418381639909, + "loss": 3.4407, + "step": 34800 + }, + { + "epoch": 3.7509417716069313, + "grad_norm": 0.6484520435333252, + "learning_rate": 0.00037541859713392946, + "loss": 3.4611, + "step": 34850 + }, + { + "epoch": 3.75632332364654, + "grad_norm": 0.7095451354980469, + "learning_rate": 0.0003750953561038681, + "loss": 3.4435, + "step": 34900 + }, + { + "epoch": 3.761704875686148, + "grad_norm": 0.625594973564148, + "learning_rate": 0.0003747721150738067, + "loss": 3.4527, + "step": 34950 + }, + { + "epoch": 3.767086427725756, + "grad_norm": 0.6271294355392456, + "learning_rate": 0.00037444887404374525, + "loss": 3.458, + "step": 35000 + }, + { + "epoch": 3.767086427725756, + "eval_accuracy": 0.37491146147804216, + "eval_loss": 3.45908522605896, + "eval_runtime": 199.5792, + "eval_samples_per_second": 90.245, + "eval_steps_per_second": 5.642, + "step": 35000 + }, + { + "epoch": 3.772467979765364, + "grad_norm": 0.6571464538574219, + "learning_rate": 0.00037412563301368384, + "loss": 3.4627, + "step": 35050 + }, + { + "epoch": 3.7778495318049723, + "grad_norm": 0.6304094791412354, + "learning_rate": 0.0003738023919836224, + "loss": 3.4734, + "step": 35100 + }, + { + "epoch": 3.783231083844581, + "grad_norm": 0.7219865322113037, + "learning_rate": 0.000373479150953561, + "loss": 3.4536, + "step": 35150 + }, + { + "epoch": 3.788612635884189, + "grad_norm": 0.6388824582099915, + "learning_rate": 0.0003731559099234996, + "loss": 3.4584, + "step": 35200 + }, + { + "epoch": 3.793994187923797, + "grad_norm": 0.6524561047554016, + "learning_rate": 0.00037283266889343817, + "loss": 3.4621, + "step": 35250 + }, + { + "epoch": 3.7993757399634056, + "grad_norm": 0.6496708989143372, + "learning_rate": 0.00037250942786337676, + "loss": 3.4551, + "step": 35300 + }, + { + "epoch": 3.8047572920030137, + "grad_norm": 0.668373167514801, + "learning_rate": 0.00037218618683331536, + "loss": 3.4609, + "step": 35350 + }, + { + "epoch": 3.810138844042622, + "grad_norm": 0.6569227576255798, + "learning_rate": 0.0003718629458032539, + "loss": 3.4436, + "step": 35400 + }, + { + "epoch": 3.8155203960822304, + "grad_norm": 0.6292812824249268, + "learning_rate": 0.0003715397047731925, + "loss": 3.4514, + "step": 35450 + }, + { + "epoch": 3.8209019481218385, + "grad_norm": 0.6967467665672302, + "learning_rate": 0.00037121646374313114, + "loss": 3.4759, + "step": 35500 + }, + { + "epoch": 3.8262835001614466, + "grad_norm": 0.6586191058158875, + "learning_rate": 0.0003708932227130697, + "loss": 3.46, + "step": 35550 + }, + { + "epoch": 3.8316650522010547, + "grad_norm": 0.6478423476219177, + "learning_rate": 0.0003705699816830083, + "loss": 3.4529, + "step": 35600 + }, + { + "epoch": 3.837046604240663, + "grad_norm": 0.6029471755027771, + "learning_rate": 0.0003702467406529468, + "loss": 3.4454, + "step": 35650 + }, + { + "epoch": 3.8424281562802713, + "grad_norm": 0.6288083791732788, + "learning_rate": 0.0003699234996228854, + "loss": 3.4531, + "step": 35700 + }, + { + "epoch": 3.8478097083198795, + "grad_norm": 0.6120226383209229, + "learning_rate": 0.000369600258592824, + "loss": 3.4688, + "step": 35750 + }, + { + "epoch": 3.8531912603594876, + "grad_norm": 0.6600449085235596, + "learning_rate": 0.0003692770175627626, + "loss": 3.4735, + "step": 35800 + }, + { + "epoch": 3.858572812399096, + "grad_norm": 0.6423937678337097, + "learning_rate": 0.0003689537765327012, + "loss": 3.4451, + "step": 35850 + }, + { + "epoch": 3.863954364438704, + "grad_norm": 0.644229531288147, + "learning_rate": 0.000368637000323241, + "loss": 3.468, + "step": 35900 + }, + { + "epoch": 3.8693359164783123, + "grad_norm": 0.6580808162689209, + "learning_rate": 0.0003683137592931796, + "loss": 3.4742, + "step": 35950 + }, + { + "epoch": 3.8747174685179204, + "grad_norm": 0.6204085350036621, + "learning_rate": 0.00036799051826311814, + "loss": 3.4501, + "step": 36000 + }, + { + "epoch": 3.8747174685179204, + "eval_accuracy": 0.37576525587048837, + "eval_loss": 3.4518706798553467, + "eval_runtime": 205.4042, + "eval_samples_per_second": 87.686, + "eval_steps_per_second": 5.482, + "step": 36000 + }, + { + "epoch": 3.8800990205575285, + "grad_norm": 0.6474447250366211, + "learning_rate": 0.00036766727723305673, + "loss": 3.4434, + "step": 36050 + }, + { + "epoch": 3.885480572597137, + "grad_norm": 0.6868647336959839, + "learning_rate": 0.0003673440362029953, + "loss": 3.4569, + "step": 36100 + }, + { + "epoch": 3.890862124636745, + "grad_norm": 0.6220848560333252, + "learning_rate": 0.0003670207951729339, + "loss": 3.4423, + "step": 36150 + }, + { + "epoch": 3.8962436766763533, + "grad_norm": 0.7336555123329163, + "learning_rate": 0.0003666975541428725, + "loss": 3.4505, + "step": 36200 + }, + { + "epoch": 3.901625228715962, + "grad_norm": 0.6814396381378174, + "learning_rate": 0.0003663743131128111, + "loss": 3.453, + "step": 36250 + }, + { + "epoch": 3.90700678075557, + "grad_norm": 0.6469142436981201, + "learning_rate": 0.00036605107208274965, + "loss": 3.4619, + "step": 36300 + }, + { + "epoch": 3.912388332795178, + "grad_norm": 1.2596371173858643, + "learning_rate": 0.00036572783105268824, + "loss": 3.4511, + "step": 36350 + }, + { + "epoch": 3.9177698848347866, + "grad_norm": 0.6139094233512878, + "learning_rate": 0.0003654045900226268, + "loss": 3.4416, + "step": 36400 + }, + { + "epoch": 3.9231514368743947, + "grad_norm": 0.6947947144508362, + "learning_rate": 0.00036508134899256543, + "loss": 3.4434, + "step": 36450 + }, + { + "epoch": 3.928532988914003, + "grad_norm": 0.6977193355560303, + "learning_rate": 0.00036475810796250403, + "loss": 3.4436, + "step": 36500 + }, + { + "epoch": 3.933914540953611, + "grad_norm": 0.6614829301834106, + "learning_rate": 0.00036443486693244257, + "loss": 3.4628, + "step": 36550 + }, + { + "epoch": 3.939296092993219, + "grad_norm": 0.629019558429718, + "learning_rate": 0.00036411162590238116, + "loss": 3.446, + "step": 36600 + }, + { + "epoch": 3.9446776450328276, + "grad_norm": 0.6478345394134521, + "learning_rate": 0.00036378838487231976, + "loss": 3.46, + "step": 36650 + }, + { + "epoch": 3.9500591970724357, + "grad_norm": 0.7331146597862244, + "learning_rate": 0.00036346514384225835, + "loss": 3.4329, + "step": 36700 + }, + { + "epoch": 3.955440749112044, + "grad_norm": 0.6980277299880981, + "learning_rate": 0.00036314190281219695, + "loss": 3.4525, + "step": 36750 + }, + { + "epoch": 3.9608223011516523, + "grad_norm": 0.6497023105621338, + "learning_rate": 0.00036281866178213554, + "loss": 3.4378, + "step": 36800 + }, + { + "epoch": 3.9662038531912605, + "grad_norm": 0.6118669509887695, + "learning_rate": 0.0003624954207520741, + "loss": 3.4387, + "step": 36850 + }, + { + "epoch": 3.9715854052308686, + "grad_norm": 0.6508960127830505, + "learning_rate": 0.0003621721797220127, + "loss": 3.4462, + "step": 36900 + }, + { + "epoch": 3.9769669572704767, + "grad_norm": 0.6341844201087952, + "learning_rate": 0.0003618489386919512, + "loss": 3.4397, + "step": 36950 + }, + { + "epoch": 3.9823485093100848, + "grad_norm": 0.6382229328155518, + "learning_rate": 0.00036152569766188987, + "loss": 3.4582, + "step": 37000 + }, + { + "epoch": 3.9823485093100848, + "eval_accuracy": 0.37578046727478237, + "eval_loss": 3.4466753005981445, + "eval_runtime": 203.9996, + "eval_samples_per_second": 88.289, + "eval_steps_per_second": 5.52, + "step": 37000 + }, + { + "epoch": 3.9877300613496933, + "grad_norm": 0.6708056926727295, + "learning_rate": 0.00036120245663182846, + "loss": 3.4533, + "step": 37050 + }, + { + "epoch": 3.9931116133893014, + "grad_norm": 0.7044811248779297, + "learning_rate": 0.000360879215601767, + "loss": 3.4423, + "step": 37100 + }, + { + "epoch": 3.9984931654289095, + "grad_norm": 0.6734232306480408, + "learning_rate": 0.0003605559745717056, + "loss": 3.4398, + "step": 37150 + }, + { + "epoch": 4.003874717468518, + "grad_norm": 0.624794602394104, + "learning_rate": 0.0003602327335416442, + "loss": 3.3979, + "step": 37200 + }, + { + "epoch": 4.009256269508126, + "grad_norm": 0.6523617506027222, + "learning_rate": 0.00035990949251158273, + "loss": 3.3628, + "step": 37250 + }, + { + "epoch": 4.014637821547734, + "grad_norm": 0.6801915764808655, + "learning_rate": 0.0003595862514815214, + "loss": 3.3562, + "step": 37300 + }, + { + "epoch": 4.020019373587343, + "grad_norm": 0.6472498178482056, + "learning_rate": 0.00035926301045146, + "loss": 3.3433, + "step": 37350 + }, + { + "epoch": 4.0254009256269505, + "grad_norm": 0.6406301259994507, + "learning_rate": 0.0003589397694213985, + "loss": 3.3586, + "step": 37400 + }, + { + "epoch": 4.030782477666559, + "grad_norm": 0.635485827922821, + "learning_rate": 0.0003586165283913371, + "loss": 3.3601, + "step": 37450 + }, + { + "epoch": 4.036164029706168, + "grad_norm": 0.6421555876731873, + "learning_rate": 0.00035829328736127565, + "loss": 3.3533, + "step": 37500 + }, + { + "epoch": 4.041545581745775, + "grad_norm": 0.6644110679626465, + "learning_rate": 0.00035797004633121425, + "loss": 3.3551, + "step": 37550 + }, + { + "epoch": 4.046927133785384, + "grad_norm": 0.6391462683677673, + "learning_rate": 0.0003576468053011529, + "loss": 3.3478, + "step": 37600 + }, + { + "epoch": 4.0523086858249915, + "grad_norm": 0.7233789563179016, + "learning_rate": 0.00035732356427109143, + "loss": 3.3716, + "step": 37650 + }, + { + "epoch": 4.0576902378646, + "grad_norm": 0.6202532052993774, + "learning_rate": 0.00035700032324103003, + "loss": 3.375, + "step": 37700 + }, + { + "epoch": 4.063071789904209, + "grad_norm": 0.6700003743171692, + "learning_rate": 0.0003566770822109686, + "loss": 3.3763, + "step": 37750 + }, + { + "epoch": 4.068453341943816, + "grad_norm": 0.6778683066368103, + "learning_rate": 0.00035635384118090716, + "loss": 3.3668, + "step": 37800 + }, + { + "epoch": 4.073834893983425, + "grad_norm": 0.6656447649002075, + "learning_rate": 0.0003560306001508458, + "loss": 3.3624, + "step": 37850 + }, + { + "epoch": 4.079216446023033, + "grad_norm": 0.6423748135566711, + "learning_rate": 0.0003557073591207844, + "loss": 3.3872, + "step": 37900 + }, + { + "epoch": 4.084597998062641, + "grad_norm": 0.6550871729850769, + "learning_rate": 0.00035538411809072295, + "loss": 3.3689, + "step": 37950 + }, + { + "epoch": 4.08997955010225, + "grad_norm": 0.7397666573524475, + "learning_rate": 0.00035506087706066154, + "loss": 3.3691, + "step": 38000 + }, + { + "epoch": 4.08997955010225, + "eval_accuracy": 0.37662122332069087, + "eval_loss": 3.448406934738159, + "eval_runtime": 209.7019, + "eval_samples_per_second": 85.889, + "eval_steps_per_second": 5.37, + "step": 38000 + }, + { + "epoch": 4.095361102141858, + "grad_norm": 0.6562075614929199, + "learning_rate": 0.00035474410085120135, + "loss": 3.3729, + "step": 38050 + }, + { + "epoch": 4.100742654181466, + "grad_norm": 0.663729190826416, + "learning_rate": 0.00035442085982113994, + "loss": 3.3756, + "step": 38100 + }, + { + "epoch": 4.106124206221074, + "grad_norm": 0.6242133378982544, + "learning_rate": 0.0003540976187910785, + "loss": 3.3834, + "step": 38150 + }, + { + "epoch": 4.111505758260682, + "grad_norm": 0.7042984962463379, + "learning_rate": 0.0003537743777610171, + "loss": 3.3753, + "step": 38200 + }, + { + "epoch": 4.1168873103002905, + "grad_norm": 0.6397444605827332, + "learning_rate": 0.00035345113673095573, + "loss": 3.3607, + "step": 38250 + }, + { + "epoch": 4.122268862339899, + "grad_norm": 0.6845241189002991, + "learning_rate": 0.00035312789570089427, + "loss": 3.3678, + "step": 38300 + }, + { + "epoch": 4.127650414379507, + "grad_norm": 0.6499135494232178, + "learning_rate": 0.00035280465467083286, + "loss": 3.3847, + "step": 38350 + }, + { + "epoch": 4.133031966419115, + "grad_norm": 0.6635656356811523, + "learning_rate": 0.0003524814136407714, + "loss": 3.3735, + "step": 38400 + }, + { + "epoch": 4.138413518458724, + "grad_norm": 0.6924629211425781, + "learning_rate": 0.00035215817261071, + "loss": 3.3783, + "step": 38450 + }, + { + "epoch": 4.1437950704983315, + "grad_norm": 0.6163055300712585, + "learning_rate": 0.00035183493158064865, + "loss": 3.3716, + "step": 38500 + }, + { + "epoch": 4.14917662253794, + "grad_norm": 0.6056356430053711, + "learning_rate": 0.0003515116905505872, + "loss": 3.3781, + "step": 38550 + }, + { + "epoch": 4.154558174577549, + "grad_norm": 0.6951189041137695, + "learning_rate": 0.0003511884495205258, + "loss": 3.3743, + "step": 38600 + }, + { + "epoch": 4.159939726617156, + "grad_norm": 0.6552825570106506, + "learning_rate": 0.0003508652084904644, + "loss": 3.3885, + "step": 38650 + }, + { + "epoch": 4.165321278656765, + "grad_norm": 0.6377124786376953, + "learning_rate": 0.0003505419674604029, + "loss": 3.3997, + "step": 38700 + }, + { + "epoch": 4.1707028306963725, + "grad_norm": 0.6340206861495972, + "learning_rate": 0.0003502187264303415, + "loss": 3.3831, + "step": 38750 + }, + { + "epoch": 4.176084382735981, + "grad_norm": 0.6724329590797424, + "learning_rate": 0.00034989548540028016, + "loss": 3.4015, + "step": 38800 + }, + { + "epoch": 4.18146593477559, + "grad_norm": 0.6731334924697876, + "learning_rate": 0.0003495722443702187, + "loss": 3.3766, + "step": 38850 + }, + { + "epoch": 4.186847486815197, + "grad_norm": 0.6341095566749573, + "learning_rate": 0.0003492490033401573, + "loss": 3.3741, + "step": 38900 + }, + { + "epoch": 4.192229038854806, + "grad_norm": 0.6485639214515686, + "learning_rate": 0.00034892576231009584, + "loss": 3.3827, + "step": 38950 + }, + { + "epoch": 4.197610590894414, + "grad_norm": 0.6944514513015747, + "learning_rate": 0.00034860252128003443, + "loss": 3.3847, + "step": 39000 + }, + { + "epoch": 4.197610590894414, + "eval_accuracy": 0.3769026343001303, + "eval_loss": 3.4444234371185303, + "eval_runtime": 194.3118, + "eval_samples_per_second": 92.691, + "eval_steps_per_second": 5.795, + "step": 39000 + }, + { + "epoch": 4.202992142934022, + "grad_norm": 0.6771103143692017, + "learning_rate": 0.000348279280249973, + "loss": 3.3822, + "step": 39050 + }, + { + "epoch": 4.208373694973631, + "grad_norm": 0.6469762325286865, + "learning_rate": 0.0003479560392199116, + "loss": 3.3847, + "step": 39100 + }, + { + "epoch": 4.213755247013238, + "grad_norm": 0.6654626727104187, + "learning_rate": 0.0003476327981898502, + "loss": 3.3962, + "step": 39150 + }, + { + "epoch": 4.219136799052847, + "grad_norm": 0.6649177074432373, + "learning_rate": 0.0003473095571597888, + "loss": 3.3866, + "step": 39200 + }, + { + "epoch": 4.224518351092455, + "grad_norm": 0.639963686466217, + "learning_rate": 0.00034698631612972735, + "loss": 3.3654, + "step": 39250 + }, + { + "epoch": 4.229899903132063, + "grad_norm": 0.7196988463401794, + "learning_rate": 0.00034666307509966594, + "loss": 3.3669, + "step": 39300 + }, + { + "epoch": 4.2352814551716715, + "grad_norm": 0.6603704690933228, + "learning_rate": 0.0003463398340696045, + "loss": 3.3879, + "step": 39350 + }, + { + "epoch": 4.24066300721128, + "grad_norm": 0.7158796191215515, + "learning_rate": 0.00034601659303954313, + "loss": 3.3925, + "step": 39400 + }, + { + "epoch": 4.246044559250888, + "grad_norm": 0.6354226469993591, + "learning_rate": 0.00034569335200948173, + "loss": 3.3954, + "step": 39450 + }, + { + "epoch": 4.251426111290496, + "grad_norm": 0.6703064441680908, + "learning_rate": 0.00034537011097942027, + "loss": 3.4026, + "step": 39500 + }, + { + "epoch": 4.256807663330104, + "grad_norm": 0.6066316962242126, + "learning_rate": 0.00034504686994935886, + "loss": 3.4001, + "step": 39550 + }, + { + "epoch": 4.2621892153697125, + "grad_norm": 0.6286178827285767, + "learning_rate": 0.00034472362891929746, + "loss": 3.3713, + "step": 39600 + }, + { + "epoch": 4.267570767409321, + "grad_norm": 0.6771307587623596, + "learning_rate": 0.00034440038788923605, + "loss": 3.3985, + "step": 39650 + }, + { + "epoch": 4.272952319448929, + "grad_norm": 0.6318666338920593, + "learning_rate": 0.00034407714685917465, + "loss": 3.381, + "step": 39700 + }, + { + "epoch": 4.278333871488537, + "grad_norm": 0.6529156565666199, + "learning_rate": 0.00034375390582911324, + "loss": 3.3848, + "step": 39750 + }, + { + "epoch": 4.283715423528146, + "grad_norm": 0.6864298582077026, + "learning_rate": 0.0003434306647990518, + "loss": 3.378, + "step": 39800 + }, + { + "epoch": 4.2890969755677535, + "grad_norm": 0.7262616753578186, + "learning_rate": 0.0003431074237689904, + "loss": 3.3899, + "step": 39850 + }, + { + "epoch": 4.294478527607362, + "grad_norm": 0.7095841765403748, + "learning_rate": 0.0003427841827389289, + "loss": 3.3793, + "step": 39900 + }, + { + "epoch": 4.299860079646971, + "grad_norm": 0.6717966794967651, + "learning_rate": 0.00034246094170886757, + "loss": 3.3844, + "step": 39950 + }, + { + "epoch": 4.305241631686578, + "grad_norm": 0.6544067859649658, + "learning_rate": 0.00034213770067880616, + "loss": 3.3714, + "step": 40000 + }, + { + "epoch": 4.305241631686578, + "eval_accuracy": 0.3771877394777555, + "eval_loss": 3.439746141433716, + "eval_runtime": 210.3742, + "eval_samples_per_second": 85.614, + "eval_steps_per_second": 5.352, + "step": 40000 + }, + { + "epoch": 4.310623183726187, + "grad_norm": 0.7182822227478027, + "learning_rate": 0.0003418144596487447, + "loss": 3.3831, + "step": 40050 + }, + { + "epoch": 4.3160047357657945, + "grad_norm": 0.6663074493408203, + "learning_rate": 0.0003414912186186833, + "loss": 3.3781, + "step": 40100 + }, + { + "epoch": 4.321386287805403, + "grad_norm": 0.7040979862213135, + "learning_rate": 0.0003411679775886219, + "loss": 3.3946, + "step": 40150 + }, + { + "epoch": 4.326767839845012, + "grad_norm": 0.6793686151504517, + "learning_rate": 0.00034084473655856043, + "loss": 3.3925, + "step": 40200 + }, + { + "epoch": 4.332149391884619, + "grad_norm": 0.6753449440002441, + "learning_rate": 0.0003405214955284991, + "loss": 3.3899, + "step": 40250 + }, + { + "epoch": 4.337530943924228, + "grad_norm": 0.6695329546928406, + "learning_rate": 0.0003401982544984377, + "loss": 3.3848, + "step": 40300 + }, + { + "epoch": 4.342912495963836, + "grad_norm": 1.0501422882080078, + "learning_rate": 0.0003398750134683762, + "loss": 3.3769, + "step": 40350 + }, + { + "epoch": 4.348294048003444, + "grad_norm": 0.6779940128326416, + "learning_rate": 0.0003395517724383148, + "loss": 3.3852, + "step": 40400 + }, + { + "epoch": 4.3536756000430525, + "grad_norm": 0.6622795462608337, + "learning_rate": 0.0003392349962288546, + "loss": 3.3954, + "step": 40450 + }, + { + "epoch": 4.359057152082661, + "grad_norm": 0.6417733430862427, + "learning_rate": 0.0003389117551987932, + "loss": 3.3999, + "step": 40500 + }, + { + "epoch": 4.364438704122269, + "grad_norm": 0.6586073040962219, + "learning_rate": 0.00033858851416873175, + "loss": 3.3942, + "step": 40550 + }, + { + "epoch": 4.369820256161877, + "grad_norm": 0.6641150116920471, + "learning_rate": 0.0003382652731386704, + "loss": 3.3892, + "step": 40600 + }, + { + "epoch": 4.375201808201485, + "grad_norm": 0.6859793066978455, + "learning_rate": 0.000337942032108609, + "loss": 3.3936, + "step": 40650 + }, + { + "epoch": 4.3805833602410935, + "grad_norm": 0.6642147302627563, + "learning_rate": 0.00033761879107854754, + "loss": 3.3869, + "step": 40700 + }, + { + "epoch": 4.385964912280702, + "grad_norm": 0.6813688278198242, + "learning_rate": 0.00033729555004848613, + "loss": 3.3792, + "step": 40750 + }, + { + "epoch": 4.39134646432031, + "grad_norm": 0.6705623865127563, + "learning_rate": 0.00033697230901842467, + "loss": 3.3931, + "step": 40800 + }, + { + "epoch": 4.396728016359918, + "grad_norm": 0.6627140045166016, + "learning_rate": 0.00033664906798836327, + "loss": 3.3971, + "step": 40850 + }, + { + "epoch": 4.402109568399527, + "grad_norm": 0.6509461998939514, + "learning_rate": 0.0003363258269583019, + "loss": 3.3875, + "step": 40900 + }, + { + "epoch": 4.4074911204391345, + "grad_norm": 0.7194676399230957, + "learning_rate": 0.00033600258592824046, + "loss": 3.3982, + "step": 40950 + }, + { + "epoch": 4.412872672478743, + "grad_norm": 0.6531248092651367, + "learning_rate": 0.00033567934489817905, + "loss": 3.3768, + "step": 41000 + }, + { + "epoch": 4.412872672478743, + "eval_accuracy": 0.3779571105763699, + "eval_loss": 3.437781810760498, + "eval_runtime": 205.598, + "eval_samples_per_second": 87.603, + "eval_steps_per_second": 5.477, + "step": 41000 + }, + { + "epoch": 4.418254224518351, + "grad_norm": 0.6822459101676941, + "learning_rate": 0.00033535610386811764, + "loss": 3.4082, + "step": 41050 + }, + { + "epoch": 4.423635776557959, + "grad_norm": 0.7048671841621399, + "learning_rate": 0.0003350328628380562, + "loss": 3.4037, + "step": 41100 + }, + { + "epoch": 4.429017328597568, + "grad_norm": 0.6404821276664734, + "learning_rate": 0.0003347096218079948, + "loss": 3.377, + "step": 41150 + }, + { + "epoch": 4.4343988806371755, + "grad_norm": 0.7114750742912292, + "learning_rate": 0.00033438638077793343, + "loss": 3.3984, + "step": 41200 + }, + { + "epoch": 4.439780432676784, + "grad_norm": 0.6833235025405884, + "learning_rate": 0.00033406313974787197, + "loss": 3.3948, + "step": 41250 + }, + { + "epoch": 4.445161984716393, + "grad_norm": 0.671991765499115, + "learning_rate": 0.00033373989871781056, + "loss": 3.3905, + "step": 41300 + }, + { + "epoch": 4.450543536756, + "grad_norm": 0.6879648566246033, + "learning_rate": 0.0003334166576877491, + "loss": 3.4125, + "step": 41350 + }, + { + "epoch": 4.455925088795609, + "grad_norm": 0.6425567865371704, + "learning_rate": 0.0003330934166576877, + "loss": 3.3877, + "step": 41400 + }, + { + "epoch": 4.461306640835216, + "grad_norm": 0.6497949361801147, + "learning_rate": 0.00033277017562762635, + "loss": 3.3787, + "step": 41450 + }, + { + "epoch": 4.466688192874825, + "grad_norm": 0.6711752414703369, + "learning_rate": 0.0003324469345975649, + "loss": 3.4, + "step": 41500 + }, + { + "epoch": 4.4720697449144335, + "grad_norm": 0.6858512163162231, + "learning_rate": 0.0003321236935675035, + "loss": 3.3806, + "step": 41550 + }, + { + "epoch": 4.477451296954041, + "grad_norm": 0.6588053107261658, + "learning_rate": 0.0003318004525374421, + "loss": 3.3769, + "step": 41600 + }, + { + "epoch": 4.48283284899365, + "grad_norm": 0.6885084509849548, + "learning_rate": 0.0003314772115073806, + "loss": 3.3833, + "step": 41650 + }, + { + "epoch": 4.488214401033258, + "grad_norm": 0.6567678451538086, + "learning_rate": 0.0003311539704773192, + "loss": 3.3795, + "step": 41700 + }, + { + "epoch": 4.493595953072866, + "grad_norm": 0.644980251789093, + "learning_rate": 0.00033083072944725786, + "loss": 3.3881, + "step": 41750 + }, + { + "epoch": 4.4989775051124745, + "grad_norm": 0.6543886065483093, + "learning_rate": 0.0003305074884171964, + "loss": 3.3881, + "step": 41800 + }, + { + "epoch": 4.504359057152083, + "grad_norm": 0.6759121417999268, + "learning_rate": 0.000330184247387135, + "loss": 3.3842, + "step": 41850 + }, + { + "epoch": 4.509740609191691, + "grad_norm": 0.68775475025177, + "learning_rate": 0.00032986100635707354, + "loss": 3.3953, + "step": 41900 + }, + { + "epoch": 4.515122161231299, + "grad_norm": 0.6535020470619202, + "learning_rate": 0.00032953776532701213, + "loss": 3.3988, + "step": 41950 + }, + { + "epoch": 4.520503713270907, + "grad_norm": 0.720953643321991, + "learning_rate": 0.00032921452429695067, + "loss": 3.393, + "step": 42000 + }, + { + "epoch": 4.520503713270907, + "eval_accuracy": 0.3784747329339181, + "eval_loss": 3.4299232959747314, + "eval_runtime": 204.2689, + "eval_samples_per_second": 88.173, + "eval_steps_per_second": 5.512, + "step": 42000 + }, + { + "epoch": 4.5258852653105155, + "grad_norm": 0.6605381965637207, + "learning_rate": 0.0003288912832668893, + "loss": 3.4042, + "step": 42050 + }, + { + "epoch": 4.531266817350124, + "grad_norm": 0.6638337969779968, + "learning_rate": 0.0003285680422368279, + "loss": 3.4088, + "step": 42100 + }, + { + "epoch": 4.536648369389732, + "grad_norm": 0.6967762112617493, + "learning_rate": 0.0003282448012067665, + "loss": 3.3833, + "step": 42150 + }, + { + "epoch": 4.54202992142934, + "grad_norm": 0.6514131426811218, + "learning_rate": 0.00032792156017670505, + "loss": 3.3887, + "step": 42200 + }, + { + "epoch": 4.547411473468949, + "grad_norm": 0.6850041747093201, + "learning_rate": 0.00032759831914664365, + "loss": 3.3844, + "step": 42250 + }, + { + "epoch": 4.5527930255085565, + "grad_norm": 0.7080932855606079, + "learning_rate": 0.0003272750781165823, + "loss": 3.3936, + "step": 42300 + }, + { + "epoch": 4.558174577548165, + "grad_norm": 0.6961115598678589, + "learning_rate": 0.00032695183708652083, + "loss": 3.3951, + "step": 42350 + }, + { + "epoch": 4.563556129587774, + "grad_norm": 0.6541742086410522, + "learning_rate": 0.00032662859605645943, + "loss": 3.3929, + "step": 42400 + }, + { + "epoch": 4.568937681627381, + "grad_norm": 0.6856427788734436, + "learning_rate": 0.00032630535502639797, + "loss": 3.3886, + "step": 42450 + }, + { + "epoch": 4.57431923366699, + "grad_norm": 0.684547483921051, + "learning_rate": 0.00032598211399633656, + "loss": 3.3862, + "step": 42500 + }, + { + "epoch": 4.579700785706597, + "grad_norm": 0.6282208561897278, + "learning_rate": 0.0003256588729662751, + "loss": 3.394, + "step": 42550 + }, + { + "epoch": 4.585082337746206, + "grad_norm": 0.6854305863380432, + "learning_rate": 0.00032533563193621375, + "loss": 3.3815, + "step": 42600 + }, + { + "epoch": 4.5904638897858145, + "grad_norm": 0.693110466003418, + "learning_rate": 0.00032501239090615235, + "loss": 3.3888, + "step": 42650 + }, + { + "epoch": 4.595845441825422, + "grad_norm": 0.7092149257659912, + "learning_rate": 0.0003246891498760909, + "loss": 3.3926, + "step": 42700 + }, + { + "epoch": 4.601226993865031, + "grad_norm": 0.6959360241889954, + "learning_rate": 0.0003243659088460295, + "loss": 3.394, + "step": 42750 + }, + { + "epoch": 4.606608545904638, + "grad_norm": 0.6965814828872681, + "learning_rate": 0.0003240426678159681, + "loss": 3.3794, + "step": 42800 + }, + { + "epoch": 4.611990097944247, + "grad_norm": 0.700532078742981, + "learning_rate": 0.0003237194267859066, + "loss": 3.401, + "step": 42850 + }, + { + "epoch": 4.6173716499838555, + "grad_norm": 0.6489181518554688, + "learning_rate": 0.00032339618575584527, + "loss": 3.4006, + "step": 42900 + }, + { + "epoch": 4.622753202023463, + "grad_norm": 0.6220369935035706, + "learning_rate": 0.000323079409546385, + "loss": 3.3956, + "step": 42950 + }, + { + "epoch": 4.628134754063072, + "grad_norm": 0.7331081032752991, + "learning_rate": 0.00032275616851632367, + "loss": 3.3915, + "step": 43000 + }, + { + "epoch": 4.628134754063072, + "eval_accuracy": 0.3789644314992979, + "eval_loss": 3.4236953258514404, + "eval_runtime": 203.2493, + "eval_samples_per_second": 88.615, + "eval_steps_per_second": 5.54, + "step": 43000 + }, + { + "epoch": 4.63351630610268, + "grad_norm": 0.8542162179946899, + "learning_rate": 0.00032243292748626226, + "loss": 3.3898, + "step": 43050 + }, + { + "epoch": 4.638897858142288, + "grad_norm": 0.6471918821334839, + "learning_rate": 0.0003221096864562008, + "loss": 3.4054, + "step": 43100 + }, + { + "epoch": 4.6442794101818965, + "grad_norm": 0.654157280921936, + "learning_rate": 0.0003217864454261394, + "loss": 3.396, + "step": 43150 + }, + { + "epoch": 4.649660962221505, + "grad_norm": 0.6689890623092651, + "learning_rate": 0.00032146320439607794, + "loss": 3.3881, + "step": 43200 + }, + { + "epoch": 4.655042514261113, + "grad_norm": 0.6910762786865234, + "learning_rate": 0.0003211399633660166, + "loss": 3.3807, + "step": 43250 + }, + { + "epoch": 4.660424066300721, + "grad_norm": 0.6583350300788879, + "learning_rate": 0.0003208167223359552, + "loss": 3.3996, + "step": 43300 + }, + { + "epoch": 4.665805618340329, + "grad_norm": 0.7289586663246155, + "learning_rate": 0.0003204934813058937, + "loss": 3.398, + "step": 43350 + }, + { + "epoch": 4.6711871703799375, + "grad_norm": 0.7323175668716431, + "learning_rate": 0.0003201702402758323, + "loss": 3.3893, + "step": 43400 + }, + { + "epoch": 4.676568722419546, + "grad_norm": 0.7023487687110901, + "learning_rate": 0.0003198469992457709, + "loss": 3.381, + "step": 43450 + }, + { + "epoch": 4.681950274459154, + "grad_norm": 0.6924834847450256, + "learning_rate": 0.00031952375821570945, + "loss": 3.3818, + "step": 43500 + }, + { + "epoch": 4.687331826498762, + "grad_norm": 0.6525893211364746, + "learning_rate": 0.0003192005171856481, + "loss": 3.3878, + "step": 43550 + }, + { + "epoch": 4.692713378538371, + "grad_norm": 0.6708674430847168, + "learning_rate": 0.0003188772761555867, + "loss": 3.3879, + "step": 43600 + }, + { + "epoch": 4.6980949305779784, + "grad_norm": 0.6731662750244141, + "learning_rate": 0.00031855403512552524, + "loss": 3.3815, + "step": 43650 + }, + { + "epoch": 4.703476482617587, + "grad_norm": 0.7286627888679504, + "learning_rate": 0.00031823079409546383, + "loss": 3.396, + "step": 43700 + }, + { + "epoch": 4.7088580346571955, + "grad_norm": 0.6887483596801758, + "learning_rate": 0.00031790755306540237, + "loss": 3.3951, + "step": 43750 + }, + { + "epoch": 4.714239586696803, + "grad_norm": 0.6878728270530701, + "learning_rate": 0.00031758431203534097, + "loss": 3.382, + "step": 43800 + }, + { + "epoch": 4.719621138736412, + "grad_norm": 0.7092850208282471, + "learning_rate": 0.0003172610710052796, + "loss": 3.3839, + "step": 43850 + }, + { + "epoch": 4.725002690776019, + "grad_norm": 0.6616544723510742, + "learning_rate": 0.00031693782997521816, + "loss": 3.3777, + "step": 43900 + }, + { + "epoch": 4.730384242815628, + "grad_norm": 0.6741201281547546, + "learning_rate": 0.000316621053765758, + "loss": 3.3813, + "step": 43950 + }, + { + "epoch": 4.7357657948552365, + "grad_norm": 0.6615583896636963, + "learning_rate": 0.00031629781273569656, + "loss": 3.3875, + "step": 44000 + }, + { + "epoch": 4.7357657948552365, + "eval_accuracy": 0.3794617357668248, + "eval_loss": 3.419157028198242, + "eval_runtime": 196.8203, + "eval_samples_per_second": 91.51, + "eval_steps_per_second": 5.721, + "step": 44000 + }, + { + "epoch": 4.741147346894844, + "grad_norm": 0.6903731226921082, + "learning_rate": 0.00031597457170563515, + "loss": 3.391, + "step": 44050 + }, + { + "epoch": 4.746528898934453, + "grad_norm": 0.6820841431617737, + "learning_rate": 0.0003156513306755737, + "loss": 3.3847, + "step": 44100 + }, + { + "epoch": 4.751910450974061, + "grad_norm": 0.6453799605369568, + "learning_rate": 0.0003153280896455123, + "loss": 3.3879, + "step": 44150 + }, + { + "epoch": 4.757292003013669, + "grad_norm": 0.7102324962615967, + "learning_rate": 0.00031500484861545094, + "loss": 3.4133, + "step": 44200 + }, + { + "epoch": 4.7626735550532775, + "grad_norm": 0.6926384568214417, + "learning_rate": 0.0003146816075853895, + "loss": 3.3997, + "step": 44250 + }, + { + "epoch": 4.768055107092886, + "grad_norm": 0.6980499029159546, + "learning_rate": 0.00031435836655532807, + "loss": 3.3938, + "step": 44300 + }, + { + "epoch": 4.773436659132494, + "grad_norm": 0.6628013253211975, + "learning_rate": 0.00031403512552526667, + "loss": 3.3952, + "step": 44350 + }, + { + "epoch": 4.778818211172102, + "grad_norm": 0.7179418206214905, + "learning_rate": 0.0003137118844952052, + "loss": 3.3886, + "step": 44400 + }, + { + "epoch": 4.78419976321171, + "grad_norm": 0.6799003481864929, + "learning_rate": 0.0003133886434651438, + "loss": 3.385, + "step": 44450 + }, + { + "epoch": 4.7895813152513185, + "grad_norm": 0.6856629848480225, + "learning_rate": 0.00031306540243508245, + "loss": 3.3767, + "step": 44500 + }, + { + "epoch": 4.794962867290927, + "grad_norm": 0.689735472202301, + "learning_rate": 0.000312742161405021, + "loss": 3.4087, + "step": 44550 + }, + { + "epoch": 4.800344419330535, + "grad_norm": 0.685651957988739, + "learning_rate": 0.0003124189203749596, + "loss": 3.3955, + "step": 44600 + }, + { + "epoch": 4.805725971370143, + "grad_norm": 0.6903262138366699, + "learning_rate": 0.0003120956793448981, + "loss": 3.3996, + "step": 44650 + }, + { + "epoch": 4.811107523409751, + "grad_norm": 0.7058689594268799, + "learning_rate": 0.0003117724383148367, + "loss": 3.4022, + "step": 44700 + }, + { + "epoch": 4.8164890754493594, + "grad_norm": 0.6723160147666931, + "learning_rate": 0.00031144919728477526, + "loss": 3.3906, + "step": 44750 + }, + { + "epoch": 4.821870627488968, + "grad_norm": 0.6740667223930359, + "learning_rate": 0.0003111259562547139, + "loss": 3.3985, + "step": 44800 + }, + { + "epoch": 4.827252179528576, + "grad_norm": 0.6879459023475647, + "learning_rate": 0.0003108027152246525, + "loss": 3.379, + "step": 44850 + }, + { + "epoch": 4.832633731568184, + "grad_norm": 0.672219455242157, + "learning_rate": 0.0003104794741945911, + "loss": 3.3935, + "step": 44900 + }, + { + "epoch": 4.838015283607793, + "grad_norm": 0.680568277835846, + "learning_rate": 0.00031015623316452964, + "loss": 3.3993, + "step": 44950 + }, + { + "epoch": 4.8433968356474, + "grad_norm": 0.6724383234977722, + "learning_rate": 0.00030983299213446823, + "loss": 3.4045, + "step": 45000 + }, + { + "epoch": 4.8433968356474, + "eval_accuracy": 0.3801395124810116, + "eval_loss": 3.414053440093994, + "eval_runtime": 206.488, + "eval_samples_per_second": 87.225, + "eval_steps_per_second": 5.453, + "step": 45000 + }, + { + "epoch": 4.848778387687009, + "grad_norm": 0.7093207240104675, + "learning_rate": 0.0003095097511044069, + "loss": 3.3656, + "step": 45050 + }, + { + "epoch": 4.8541599397266175, + "grad_norm": 0.6884026527404785, + "learning_rate": 0.0003091865100743454, + "loss": 3.3815, + "step": 45100 + }, + { + "epoch": 4.859541491766225, + "grad_norm": 0.6394287943840027, + "learning_rate": 0.000308863269044284, + "loss": 3.4021, + "step": 45150 + }, + { + "epoch": 4.864923043805834, + "grad_norm": 0.656760573387146, + "learning_rate": 0.00030854002801422256, + "loss": 3.3942, + "step": 45200 + }, + { + "epoch": 4.870304595845441, + "grad_norm": 0.7228603363037109, + "learning_rate": 0.00030821678698416115, + "loss": 3.3831, + "step": 45250 + }, + { + "epoch": 4.87568614788505, + "grad_norm": 0.6911720633506775, + "learning_rate": 0.0003078935459540997, + "loss": 3.3941, + "step": 45300 + }, + { + "epoch": 4.8810676999246585, + "grad_norm": 0.6649526357650757, + "learning_rate": 0.00030757030492403834, + "loss": 3.3868, + "step": 45350 + }, + { + "epoch": 4.886449251964266, + "grad_norm": 0.6843016743659973, + "learning_rate": 0.00030724706389397694, + "loss": 3.3774, + "step": 45400 + }, + { + "epoch": 4.891830804003875, + "grad_norm": 0.6798348426818848, + "learning_rate": 0.0003069238228639155, + "loss": 3.3856, + "step": 45450 + }, + { + "epoch": 4.897212356043483, + "grad_norm": 0.6509679555892944, + "learning_rate": 0.00030660058183385407, + "loss": 3.3992, + "step": 45500 + }, + { + "epoch": 4.902593908083091, + "grad_norm": 0.6958284378051758, + "learning_rate": 0.00030627734080379267, + "loss": 3.3722, + "step": 45550 + }, + { + "epoch": 4.9079754601226995, + "grad_norm": 0.6654974222183228, + "learning_rate": 0.0003059540997737312, + "loss": 3.3861, + "step": 45600 + }, + { + "epoch": 4.913357012162308, + "grad_norm": 0.7664541602134705, + "learning_rate": 0.00030563085874366986, + "loss": 3.4079, + "step": 45650 + }, + { + "epoch": 4.918738564201916, + "grad_norm": 0.7275959849357605, + "learning_rate": 0.00030530761771360845, + "loss": 3.405, + "step": 45700 + }, + { + "epoch": 4.924120116241524, + "grad_norm": 0.6668584942817688, + "learning_rate": 0.000304984376683547, + "loss": 3.4006, + "step": 45750 + }, + { + "epoch": 4.929501668281132, + "grad_norm": 0.6965872645378113, + "learning_rate": 0.0003046611356534856, + "loss": 3.3963, + "step": 45800 + }, + { + "epoch": 4.9348832203207404, + "grad_norm": 0.7191141247749329, + "learning_rate": 0.0003043378946234241, + "loss": 3.3976, + "step": 45850 + }, + { + "epoch": 4.940264772360349, + "grad_norm": 0.6690873503684998, + "learning_rate": 0.0003040146535933628, + "loss": 3.4008, + "step": 45900 + }, + { + "epoch": 4.945646324399957, + "grad_norm": 0.6937520503997803, + "learning_rate": 0.00030369141256330137, + "loss": 3.3907, + "step": 45950 + }, + { + "epoch": 4.951027876439565, + "grad_norm": 0.6963992118835449, + "learning_rate": 0.0003033681715332399, + "loss": 3.3761, + "step": 46000 + }, + { + "epoch": 4.951027876439565, + "eval_accuracy": 0.3804870930691301, + "eval_loss": 3.4095349311828613, + "eval_runtime": 205.4933, + "eval_samples_per_second": 87.648, + "eval_steps_per_second": 5.479, + "step": 46000 + }, + { + "epoch": 4.956409428479174, + "grad_norm": 0.7414904832839966, + "learning_rate": 0.0003030449305031785, + "loss": 3.3747, + "step": 46050 + }, + { + "epoch": 4.961790980518781, + "grad_norm": 0.7259211540222168, + "learning_rate": 0.0003027216894731171, + "loss": 3.3819, + "step": 46100 + }, + { + "epoch": 4.96717253255839, + "grad_norm": 0.7090394496917725, + "learning_rate": 0.00030239844844305564, + "loss": 3.3889, + "step": 46150 + }, + { + "epoch": 4.9725540845979985, + "grad_norm": 0.7188470363616943, + "learning_rate": 0.0003020752074129943, + "loss": 3.3881, + "step": 46200 + }, + { + "epoch": 4.977935636637606, + "grad_norm": 0.6995404362678528, + "learning_rate": 0.0003017519663829329, + "loss": 3.3543, + "step": 46250 + }, + { + "epoch": 4.983317188677215, + "grad_norm": 0.6940346956253052, + "learning_rate": 0.0003014287253528714, + "loss": 3.378, + "step": 46300 + }, + { + "epoch": 4.988698740716822, + "grad_norm": 0.717013955116272, + "learning_rate": 0.00030110548432281, + "loss": 3.3916, + "step": 46350 + }, + { + "epoch": 4.994080292756431, + "grad_norm": 0.7562733888626099, + "learning_rate": 0.00030078224329274856, + "loss": 3.3738, + "step": 46400 + }, + { + "epoch": 4.9994618447960395, + "grad_norm": 0.6929606199264526, + "learning_rate": 0.00030045900226268715, + "loss": 3.3991, + "step": 46450 + }, + { + "epoch": 5.004843396835647, + "grad_norm": 0.684966504573822, + "learning_rate": 0.0003001357612326258, + "loss": 3.3138, + "step": 46500 + }, + { + "epoch": 5.010224948875256, + "grad_norm": 0.6948528289794922, + "learning_rate": 0.00029981252020256434, + "loss": 3.2973, + "step": 46550 + }, + { + "epoch": 5.015606500914864, + "grad_norm": 0.7069754004478455, + "learning_rate": 0.00029948927917250294, + "loss": 3.2938, + "step": 46600 + }, + { + "epoch": 5.020988052954472, + "grad_norm": 0.6681404113769531, + "learning_rate": 0.00029916603814244153, + "loss": 3.3031, + "step": 46650 + }, + { + "epoch": 5.0263696049940805, + "grad_norm": 0.7222424745559692, + "learning_rate": 0.0002988427971123801, + "loss": 3.2958, + "step": 46700 + }, + { + "epoch": 5.031751157033688, + "grad_norm": 0.6528456211090088, + "learning_rate": 0.00029851955608231867, + "loss": 3.2972, + "step": 46750 + }, + { + "epoch": 5.037132709073297, + "grad_norm": 0.7115670442581177, + "learning_rate": 0.00029819631505225726, + "loss": 3.3177, + "step": 46800 + }, + { + "epoch": 5.042514261112905, + "grad_norm": 0.7243658304214478, + "learning_rate": 0.00029787307402219586, + "loss": 3.3053, + "step": 46850 + }, + { + "epoch": 5.047895813152513, + "grad_norm": 0.6976185441017151, + "learning_rate": 0.00029754983299213445, + "loss": 3.3041, + "step": 46900 + }, + { + "epoch": 5.0532773651921215, + "grad_norm": 0.6580340266227722, + "learning_rate": 0.000297226591962073, + "loss": 3.2969, + "step": 46950 + }, + { + "epoch": 5.05865891723173, + "grad_norm": 0.6749905943870544, + "learning_rate": 0.00029690335093201164, + "loss": 3.3004, + "step": 47000 + }, + { + "epoch": 5.05865891723173, + "eval_accuracy": 0.380867812788032, + "eval_loss": 3.412269353866577, + "eval_runtime": 196.4805, + "eval_samples_per_second": 91.668, + "eval_steps_per_second": 5.731, + "step": 47000 + }, + { + "epoch": 5.064040469271338, + "grad_norm": 0.7184104919433594, + "learning_rate": 0.0002965801099019502, + "loss": 3.3063, + "step": 47050 + }, + { + "epoch": 5.069422021310946, + "grad_norm": 0.6803567409515381, + "learning_rate": 0.0002962568688718888, + "loss": 3.2995, + "step": 47100 + }, + { + "epoch": 5.074803573350554, + "grad_norm": 0.6682900190353394, + "learning_rate": 0.00029593362784182737, + "loss": 3.291, + "step": 47150 + }, + { + "epoch": 5.080185125390162, + "grad_norm": 0.6613481640815735, + "learning_rate": 0.00029561038681176596, + "loss": 3.2996, + "step": 47200 + }, + { + "epoch": 5.085566677429771, + "grad_norm": 0.6656726598739624, + "learning_rate": 0.00029528714578170456, + "loss": 3.3157, + "step": 47250 + }, + { + "epoch": 5.090948229469379, + "grad_norm": 0.6987922191619873, + "learning_rate": 0.0002949639047516431, + "loss": 3.3214, + "step": 47300 + }, + { + "epoch": 5.096329781508987, + "grad_norm": 0.6765496134757996, + "learning_rate": 0.0002946406637215817, + "loss": 3.3238, + "step": 47350 + }, + { + "epoch": 5.101711333548596, + "grad_norm": 0.722225546836853, + "learning_rate": 0.0002943174226915203, + "loss": 3.3189, + "step": 47400 + }, + { + "epoch": 5.107092885588203, + "grad_norm": 0.6817000508308411, + "learning_rate": 0.0002939941816614589, + "loss": 3.3209, + "step": 47450 + }, + { + "epoch": 5.112474437627812, + "grad_norm": 0.7062390446662903, + "learning_rate": 0.0002936709406313974, + "loss": 3.315, + "step": 47500 + }, + { + "epoch": 5.1178559896674205, + "grad_norm": 0.6629499197006226, + "learning_rate": 0.0002933476996013361, + "loss": 3.319, + "step": 47550 + }, + { + "epoch": 5.123237541707028, + "grad_norm": 0.6865987777709961, + "learning_rate": 0.0002930244585712746, + "loss": 3.3216, + "step": 47600 + }, + { + "epoch": 5.128619093746637, + "grad_norm": 0.7235592007637024, + "learning_rate": 0.0002927012175412132, + "loss": 3.3115, + "step": 47650 + }, + { + "epoch": 5.134000645786244, + "grad_norm": 0.7191505432128906, + "learning_rate": 0.0002923779765111518, + "loss": 3.3012, + "step": 47700 + }, + { + "epoch": 5.139382197825853, + "grad_norm": 0.706909716129303, + "learning_rate": 0.0002920547354810904, + "loss": 3.3227, + "step": 47750 + }, + { + "epoch": 5.1447637498654615, + "grad_norm": 0.7347841262817383, + "learning_rate": 0.00029173149445102894, + "loss": 3.3219, + "step": 47800 + }, + { + "epoch": 5.150145301905069, + "grad_norm": 0.7402267456054688, + "learning_rate": 0.00029140825342096753, + "loss": 3.3008, + "step": 47850 + }, + { + "epoch": 5.155526853944678, + "grad_norm": 0.678464949131012, + "learning_rate": 0.0002910850123909061, + "loss": 3.3218, + "step": 47900 + }, + { + "epoch": 5.160908405984286, + "grad_norm": 0.7275342345237732, + "learning_rate": 0.0002907617713608447, + "loss": 3.3034, + "step": 47950 + }, + { + "epoch": 5.166289958023894, + "grad_norm": 0.706177294254303, + "learning_rate": 0.0002904385303307833, + "loss": 3.3289, + "step": 48000 + }, + { + "epoch": 5.166289958023894, + "eval_accuracy": 0.3814682286460948, + "eval_loss": 3.4115307331085205, + "eval_runtime": 209.8199, + "eval_samples_per_second": 85.84, + "eval_steps_per_second": 5.367, + "step": 48000 + }, + { + "epoch": 5.1716715100635025, + "grad_norm": 0.6725974678993225, + "learning_rate": 0.00029011528930072186, + "loss": 3.3346, + "step": 48050 + }, + { + "epoch": 5.17705306210311, + "grad_norm": 0.726219117641449, + "learning_rate": 0.00028979204827066045, + "loss": 3.3088, + "step": 48100 + }, + { + "epoch": 5.182434614142719, + "grad_norm": 0.6821277141571045, + "learning_rate": 0.00028946880724059905, + "loss": 3.3257, + "step": 48150 + }, + { + "epoch": 5.187816166182327, + "grad_norm": 0.695120096206665, + "learning_rate": 0.00028914556621053764, + "loss": 3.3328, + "step": 48200 + }, + { + "epoch": 5.193197718221935, + "grad_norm": 0.694706380367279, + "learning_rate": 0.00028882232518047624, + "loss": 3.3343, + "step": 48250 + }, + { + "epoch": 5.198579270261543, + "grad_norm": 0.7311663031578064, + "learning_rate": 0.00028849908415041483, + "loss": 3.3231, + "step": 48300 + }, + { + "epoch": 5.203960822301152, + "grad_norm": 0.7439178228378296, + "learning_rate": 0.00028817584312035337, + "loss": 3.314, + "step": 48350 + }, + { + "epoch": 5.20934237434076, + "grad_norm": 0.7027797698974609, + "learning_rate": 0.00028785260209029197, + "loss": 3.3172, + "step": 48400 + }, + { + "epoch": 5.214723926380368, + "grad_norm": 0.7429949045181274, + "learning_rate": 0.00028752936106023056, + "loss": 3.3185, + "step": 48450 + }, + { + "epoch": 5.220105478419977, + "grad_norm": 0.6877169013023376, + "learning_rate": 0.00028720612003016915, + "loss": 3.3372, + "step": 48500 + }, + { + "epoch": 5.225487030459584, + "grad_norm": 0.6922637224197388, + "learning_rate": 0.00028688287900010775, + "loss": 3.3232, + "step": 48550 + }, + { + "epoch": 5.230868582499193, + "grad_norm": 0.7518928647041321, + "learning_rate": 0.0002865596379700463, + "loss": 3.3261, + "step": 48600 + }, + { + "epoch": 5.236250134538801, + "grad_norm": 0.7136549353599548, + "learning_rate": 0.0002862363969399849, + "loss": 3.3301, + "step": 48650 + }, + { + "epoch": 5.241631686578409, + "grad_norm": 0.7302454710006714, + "learning_rate": 0.0002859131559099235, + "loss": 3.3319, + "step": 48700 + }, + { + "epoch": 5.247013238618018, + "grad_norm": 0.6988076567649841, + "learning_rate": 0.0002855899148798621, + "loss": 3.3271, + "step": 48750 + }, + { + "epoch": 5.252394790657625, + "grad_norm": 0.6556980609893799, + "learning_rate": 0.0002852666738498006, + "loss": 3.3376, + "step": 48800 + }, + { + "epoch": 5.257776342697234, + "grad_norm": 0.7035303115844727, + "learning_rate": 0.00028494343281973926, + "loss": 3.3302, + "step": 48850 + }, + { + "epoch": 5.2631578947368425, + "grad_norm": 0.7211655974388123, + "learning_rate": 0.0002846201917896778, + "loss": 3.3299, + "step": 48900 + }, + { + "epoch": 5.26853944677645, + "grad_norm": 0.7089269161224365, + "learning_rate": 0.0002842969507596164, + "loss": 3.3217, + "step": 48950 + }, + { + "epoch": 5.273920998816059, + "grad_norm": 0.6631143689155579, + "learning_rate": 0.000283973709729555, + "loss": 3.3151, + "step": 49000 + }, + { + "epoch": 5.273920998816059, + "eval_accuracy": 0.3817304080643912, + "eval_loss": 3.4057440757751465, + "eval_runtime": 211.9424, + "eval_samples_per_second": 84.981, + "eval_steps_per_second": 5.313, + "step": 49000 + }, + { + "epoch": 5.279302550855666, + "grad_norm": 0.682040810585022, + "learning_rate": 0.0002836504686994936, + "loss": 3.3137, + "step": 49050 + }, + { + "epoch": 5.284684102895275, + "grad_norm": 0.6834739446640015, + "learning_rate": 0.00028332722766943213, + "loss": 3.3403, + "step": 49100 + }, + { + "epoch": 5.2900656549348835, + "grad_norm": 0.7186492681503296, + "learning_rate": 0.0002830039866393707, + "loss": 3.3335, + "step": 49150 + }, + { + "epoch": 5.295447206974491, + "grad_norm": 0.6996015310287476, + "learning_rate": 0.0002826807456093093, + "loss": 3.3068, + "step": 49200 + }, + { + "epoch": 5.3008287590141, + "grad_norm": 0.7867818474769592, + "learning_rate": 0.0002823575045792479, + "loss": 3.3175, + "step": 49250 + }, + { + "epoch": 5.306210311053708, + "grad_norm": 0.6740609407424927, + "learning_rate": 0.0002820342635491865, + "loss": 3.307, + "step": 49300 + }, + { + "epoch": 5.311591863093316, + "grad_norm": 0.6935590505599976, + "learning_rate": 0.00028171102251912505, + "loss": 3.3232, + "step": 49350 + }, + { + "epoch": 5.316973415132924, + "grad_norm": 0.690018892288208, + "learning_rate": 0.00028138778148906364, + "loss": 3.3367, + "step": 49400 + }, + { + "epoch": 5.322354967172533, + "grad_norm": 0.7360737919807434, + "learning_rate": 0.00028106454045900224, + "loss": 3.3176, + "step": 49450 + }, + { + "epoch": 5.327736519212141, + "grad_norm": 0.6747078895568848, + "learning_rate": 0.00028074129942894083, + "loss": 3.3376, + "step": 49500 + }, + { + "epoch": 5.333118071251749, + "grad_norm": 0.7133691310882568, + "learning_rate": 0.0002804180583988794, + "loss": 3.3254, + "step": 49550 + }, + { + "epoch": 5.338499623291357, + "grad_norm": 0.8014851212501526, + "learning_rate": 0.00028009481736881797, + "loss": 3.321, + "step": 49600 + }, + { + "epoch": 5.343881175330965, + "grad_norm": 0.7052977085113525, + "learning_rate": 0.00027977157633875656, + "loss": 3.3394, + "step": 49650 + }, + { + "epoch": 5.349262727370574, + "grad_norm": 0.7522420883178711, + "learning_rate": 0.00027944833530869516, + "loss": 3.3236, + "step": 49700 + }, + { + "epoch": 5.354644279410182, + "grad_norm": 0.7162737846374512, + "learning_rate": 0.00027913155909923496, + "loss": 3.3151, + "step": 49750 + }, + { + "epoch": 5.36002583144979, + "grad_norm": 0.7829045653343201, + "learning_rate": 0.00027880831806917356, + "loss": 3.3139, + "step": 49800 + }, + { + "epoch": 5.365407383489399, + "grad_norm": 0.6889187693595886, + "learning_rate": 0.00027848507703911215, + "loss": 3.3473, + "step": 49850 + }, + { + "epoch": 5.370788935529006, + "grad_norm": 0.7904999256134033, + "learning_rate": 0.0002781618360090507, + "loss": 3.3207, + "step": 49900 + }, + { + "epoch": 5.376170487568615, + "grad_norm": 0.6826810836791992, + "learning_rate": 0.00027783859497898934, + "loss": 3.338, + "step": 49950 + }, + { + "epoch": 5.3815520396082235, + "grad_norm": 0.7005066871643066, + "learning_rate": 0.0002775153539489279, + "loss": 3.3528, + "step": 50000 + }, + { + "epoch": 5.3815520396082235, + "eval_accuracy": 0.3818176563333062, + "eval_loss": 3.402635097503662, + "eval_runtime": 195.8066, + "eval_samples_per_second": 91.984, + "eval_steps_per_second": 5.751, + "step": 50000 + }, + { + "epoch": 5.386933591647831, + "grad_norm": 0.7351107001304626, + "learning_rate": 0.0002771921129188665, + "loss": 3.3345, + "step": 50050 + }, + { + "epoch": 5.39231514368744, + "grad_norm": 0.6939401626586914, + "learning_rate": 0.00027686887188880507, + "loss": 3.3426, + "step": 50100 + }, + { + "epoch": 5.397696695727047, + "grad_norm": 0.7426904439926147, + "learning_rate": 0.00027654563085874366, + "loss": 3.3367, + "step": 50150 + }, + { + "epoch": 5.403078247766656, + "grad_norm": 0.7072582840919495, + "learning_rate": 0.00027622238982868226, + "loss": 3.3355, + "step": 50200 + }, + { + "epoch": 5.4084597998062645, + "grad_norm": 0.829706609249115, + "learning_rate": 0.0002758991487986208, + "loss": 3.3383, + "step": 50250 + }, + { + "epoch": 5.413841351845872, + "grad_norm": 0.8014844655990601, + "learning_rate": 0.0002755759077685594, + "loss": 3.3264, + "step": 50300 + }, + { + "epoch": 5.419222903885481, + "grad_norm": 0.7258954644203186, + "learning_rate": 0.000275252666738498, + "loss": 3.3297, + "step": 50350 + }, + { + "epoch": 5.424604455925088, + "grad_norm": 0.706342339515686, + "learning_rate": 0.0002749294257084366, + "loss": 3.332, + "step": 50400 + }, + { + "epoch": 5.429986007964697, + "grad_norm": 0.7490754723548889, + "learning_rate": 0.0002746061846783751, + "loss": 3.3274, + "step": 50450 + }, + { + "epoch": 5.435367560004305, + "grad_norm": 0.6892290115356445, + "learning_rate": 0.0002742829436483138, + "loss": 3.3248, + "step": 50500 + }, + { + "epoch": 5.440749112043913, + "grad_norm": 0.7274524569511414, + "learning_rate": 0.0002739597026182523, + "loss": 3.3216, + "step": 50550 + }, + { + "epoch": 5.446130664083522, + "grad_norm": 0.7700362801551819, + "learning_rate": 0.0002736364615881909, + "loss": 3.3392, + "step": 50600 + }, + { + "epoch": 5.45151221612313, + "grad_norm": 0.6922512650489807, + "learning_rate": 0.0002733132205581295, + "loss": 3.3374, + "step": 50650 + }, + { + "epoch": 5.456893768162738, + "grad_norm": 0.7054064273834229, + "learning_rate": 0.00027298997952806804, + "loss": 3.3416, + "step": 50700 + }, + { + "epoch": 5.462275320202346, + "grad_norm": 0.7377570867538452, + "learning_rate": 0.00027266673849800664, + "loss": 3.3335, + "step": 50750 + }, + { + "epoch": 5.467656872241955, + "grad_norm": 0.7138562202453613, + "learning_rate": 0.00027234349746794523, + "loss": 3.3428, + "step": 50800 + }, + { + "epoch": 5.473038424281563, + "grad_norm": 0.7018982172012329, + "learning_rate": 0.00027202025643788383, + "loss": 3.3298, + "step": 50850 + }, + { + "epoch": 5.478419976321171, + "grad_norm": 0.7368566393852234, + "learning_rate": 0.00027169701540782237, + "loss": 3.3449, + "step": 50900 + }, + { + "epoch": 5.483801528360779, + "grad_norm": 0.6727787852287292, + "learning_rate": 0.000271373774377761, + "loss": 3.3402, + "step": 50950 + }, + { + "epoch": 5.489183080400387, + "grad_norm": 0.7162392139434814, + "learning_rate": 0.00027105053334769956, + "loss": 3.3271, + "step": 51000 + }, + { + "epoch": 5.489183080400387, + "eval_accuracy": 0.38230561645248107, + "eval_loss": 3.398566722869873, + "eval_runtime": 200.4209, + "eval_samples_per_second": 89.866, + "eval_steps_per_second": 5.618, + "step": 51000 + }, + { + "epoch": 5.494564632439996, + "grad_norm": 0.7033275365829468, + "learning_rate": 0.00027072729231763815, + "loss": 3.3318, + "step": 51050 + }, + { + "epoch": 5.499946184479604, + "grad_norm": 0.6969659328460693, + "learning_rate": 0.00027040405128757675, + "loss": 3.34, + "step": 51100 + }, + { + "epoch": 5.505327736519212, + "grad_norm": 0.827999472618103, + "learning_rate": 0.00027008081025751534, + "loss": 3.3444, + "step": 51150 + }, + { + "epoch": 5.510709288558821, + "grad_norm": 0.7825914025306702, + "learning_rate": 0.00026975756922745394, + "loss": 3.3355, + "step": 51200 + }, + { + "epoch": 5.516090840598428, + "grad_norm": 0.7196942567825317, + "learning_rate": 0.0002694343281973925, + "loss": 3.3247, + "step": 51250 + }, + { + "epoch": 5.521472392638037, + "grad_norm": 0.6952111721038818, + "learning_rate": 0.00026911108716733107, + "loss": 3.3484, + "step": 51300 + }, + { + "epoch": 5.5268539446776455, + "grad_norm": 0.7286564111709595, + "learning_rate": 0.00026878784613726967, + "loss": 3.3394, + "step": 51350 + }, + { + "epoch": 5.532235496717253, + "grad_norm": 0.7400366067886353, + "learning_rate": 0.00026846460510720826, + "loss": 3.3425, + "step": 51400 + }, + { + "epoch": 5.537617048756862, + "grad_norm": 0.6877460479736328, + "learning_rate": 0.0002681413640771468, + "loss": 3.3296, + "step": 51450 + }, + { + "epoch": 5.542998600796469, + "grad_norm": 0.7166876196861267, + "learning_rate": 0.00026781812304708545, + "loss": 3.3404, + "step": 51500 + }, + { + "epoch": 5.548380152836078, + "grad_norm": 0.7324938178062439, + "learning_rate": 0.000267494882017024, + "loss": 3.3422, + "step": 51550 + }, + { + "epoch": 5.553761704875686, + "grad_norm": 0.7295821309089661, + "learning_rate": 0.0002671716409869626, + "loss": 3.3304, + "step": 51600 + }, + { + "epoch": 5.559143256915294, + "grad_norm": 0.7150875926017761, + "learning_rate": 0.0002668483999569012, + "loss": 3.3561, + "step": 51650 + }, + { + "epoch": 5.564524808954903, + "grad_norm": 0.7677608728408813, + "learning_rate": 0.0002665251589268398, + "loss": 3.3307, + "step": 51700 + }, + { + "epoch": 5.569906360994511, + "grad_norm": 0.6857333183288574, + "learning_rate": 0.0002662019178967783, + "loss": 3.3412, + "step": 51750 + }, + { + "epoch": 5.575287913034119, + "grad_norm": 0.6856406927108765, + "learning_rate": 0.0002658786768667169, + "loss": 3.3386, + "step": 51800 + }, + { + "epoch": 5.580669465073727, + "grad_norm": 0.7286893129348755, + "learning_rate": 0.0002655554358366555, + "loss": 3.3352, + "step": 51850 + }, + { + "epoch": 5.586051017113336, + "grad_norm": 0.7269234657287598, + "learning_rate": 0.0002652321948065941, + "loss": 3.3422, + "step": 51900 + }, + { + "epoch": 5.591432569152944, + "grad_norm": 0.7077545523643494, + "learning_rate": 0.0002649089537765327, + "loss": 3.3353, + "step": 51950 + }, + { + "epoch": 5.596814121192552, + "grad_norm": 0.7062633633613586, + "learning_rate": 0.00026458571274647123, + "loss": 3.3457, + "step": 52000 + }, + { + "epoch": 5.596814121192552, + "eval_accuracy": 0.38252248761655877, + "eval_loss": 3.3943653106689453, + "eval_runtime": 202.3361, + "eval_samples_per_second": 89.015, + "eval_steps_per_second": 5.565, + "step": 52000 + }, + { + "epoch": 5.60219567323216, + "grad_norm": 0.7586387991905212, + "learning_rate": 0.0002642624717164099, + "loss": 3.3286, + "step": 52050 + }, + { + "epoch": 5.607577225271768, + "grad_norm": 0.6984559297561646, + "learning_rate": 0.0002639392306863484, + "loss": 3.3364, + "step": 52100 + }, + { + "epoch": 5.612958777311377, + "grad_norm": 0.7536754608154297, + "learning_rate": 0.000263615989656287, + "loss": 3.3353, + "step": 52150 + }, + { + "epoch": 5.618340329350985, + "grad_norm": 0.7495378851890564, + "learning_rate": 0.0002632927486262256, + "loss": 3.3361, + "step": 52200 + }, + { + "epoch": 5.623721881390593, + "grad_norm": 0.684282124042511, + "learning_rate": 0.0002629695075961642, + "loss": 3.3353, + "step": 52250 + }, + { + "epoch": 5.629103433430201, + "grad_norm": 0.7329372763633728, + "learning_rate": 0.00026264626656610275, + "loss": 3.3291, + "step": 52300 + }, + { + "epoch": 5.634484985469809, + "grad_norm": 0.8108019232749939, + "learning_rate": 0.00026232302553604134, + "loss": 3.3548, + "step": 52350 + }, + { + "epoch": 5.639866537509418, + "grad_norm": 0.7560662627220154, + "learning_rate": 0.00026199978450597994, + "loss": 3.325, + "step": 52400 + }, + { + "epoch": 5.645248089549026, + "grad_norm": 0.7306268215179443, + "learning_rate": 0.00026167654347591853, + "loss": 3.334, + "step": 52450 + }, + { + "epoch": 5.650629641588634, + "grad_norm": 0.7203580737113953, + "learning_rate": 0.0002613533024458571, + "loss": 3.3221, + "step": 52500 + }, + { + "epoch": 5.656011193628243, + "grad_norm": 0.6746958494186401, + "learning_rate": 0.00026103006141579567, + "loss": 3.3364, + "step": 52550 + }, + { + "epoch": 5.66139274566785, + "grad_norm": 0.7283848524093628, + "learning_rate": 0.00026070682038573426, + "loss": 3.3309, + "step": 52600 + }, + { + "epoch": 5.666774297707459, + "grad_norm": 0.7701716423034668, + "learning_rate": 0.00026038357935567286, + "loss": 3.3383, + "step": 52650 + }, + { + "epoch": 5.672155849747067, + "grad_norm": 0.6934614777565002, + "learning_rate": 0.00026006033832561145, + "loss": 3.3301, + "step": 52700 + }, + { + "epoch": 5.677537401786675, + "grad_norm": 0.7119128108024597, + "learning_rate": 0.00025973709729555, + "loss": 3.3367, + "step": 52750 + }, + { + "epoch": 5.682918953826284, + "grad_norm": 0.7460734248161316, + "learning_rate": 0.00025941385626548864, + "loss": 3.3561, + "step": 52800 + }, + { + "epoch": 5.688300505865891, + "grad_norm": 0.7418410181999207, + "learning_rate": 0.0002590906152354272, + "loss": 3.324, + "step": 52850 + }, + { + "epoch": 5.6936820579055, + "grad_norm": 0.7116801142692566, + "learning_rate": 0.0002587673742053658, + "loss": 3.3337, + "step": 52900 + }, + { + "epoch": 5.699063609945108, + "grad_norm": 0.7011289000511169, + "learning_rate": 0.00025844413317530437, + "loss": 3.3404, + "step": 52950 + }, + { + "epoch": 5.704445161984716, + "grad_norm": 0.7183201909065247, + "learning_rate": 0.00025812089214524296, + "loss": 3.3489, + "step": 53000 + }, + { + "epoch": 5.704445161984716, + "eval_accuracy": 0.3831644088777667, + "eval_loss": 3.3893256187438965, + "eval_runtime": 213.8188, + "eval_samples_per_second": 84.235, + "eval_steps_per_second": 5.266, + "step": 53000 + }, + { + "epoch": 5.709826714024325, + "grad_norm": 0.7784079313278198, + "learning_rate": 0.00025779765111518156, + "loss": 3.3336, + "step": 53050 + }, + { + "epoch": 5.715208266063933, + "grad_norm": 0.7234706282615662, + "learning_rate": 0.0002574744100851201, + "loss": 3.3246, + "step": 53100 + }, + { + "epoch": 5.720589818103541, + "grad_norm": 0.7097893953323364, + "learning_rate": 0.0002571511690550587, + "loss": 3.3239, + "step": 53150 + }, + { + "epoch": 5.725971370143149, + "grad_norm": 0.6980921030044556, + "learning_rate": 0.0002568279280249973, + "loss": 3.3451, + "step": 53200 + }, + { + "epoch": 5.731352922182758, + "grad_norm": 0.7374033331871033, + "learning_rate": 0.0002565046869949359, + "loss": 3.345, + "step": 53250 + }, + { + "epoch": 5.736734474222366, + "grad_norm": 0.7064562439918518, + "learning_rate": 0.0002561814459648744, + "loss": 3.3339, + "step": 53300 + }, + { + "epoch": 5.742116026261974, + "grad_norm": 0.7073032259941101, + "learning_rate": 0.00025585820493481307, + "loss": 3.3426, + "step": 53350 + }, + { + "epoch": 5.747497578301582, + "grad_norm": 0.6804144382476807, + "learning_rate": 0.0002555349639047516, + "loss": 3.3255, + "step": 53400 + }, + { + "epoch": 5.75287913034119, + "grad_norm": 0.6917299032211304, + "learning_rate": 0.0002552117228746902, + "loss": 3.3281, + "step": 53450 + }, + { + "epoch": 5.758260682380799, + "grad_norm": 0.7830952405929565, + "learning_rate": 0.0002548884818446288, + "loss": 3.3367, + "step": 53500 + }, + { + "epoch": 5.763642234420407, + "grad_norm": 0.6828339099884033, + "learning_rate": 0.0002545652408145674, + "loss": 3.3337, + "step": 53550 + }, + { + "epoch": 5.769023786460015, + "grad_norm": 0.7331600785255432, + "learning_rate": 0.00025424199978450594, + "loss": 3.336, + "step": 53600 + }, + { + "epoch": 5.774405338499624, + "grad_norm": 0.6974768042564392, + "learning_rate": 0.00025391875875444453, + "loss": 3.3284, + "step": 53650 + }, + { + "epoch": 5.779786890539231, + "grad_norm": 0.7154784798622131, + "learning_rate": 0.0002535955177243831, + "loss": 3.3236, + "step": 53700 + }, + { + "epoch": 5.78516844257884, + "grad_norm": 0.7374145984649658, + "learning_rate": 0.0002532722766943217, + "loss": 3.3258, + "step": 53750 + }, + { + "epoch": 5.790549994618448, + "grad_norm": 0.6925607323646545, + "learning_rate": 0.00025295550048486153, + "loss": 3.3431, + "step": 53800 + }, + { + "epoch": 5.795931546658056, + "grad_norm": 0.6958518028259277, + "learning_rate": 0.0002526322594548001, + "loss": 3.3338, + "step": 53850 + }, + { + "epoch": 5.801313098697665, + "grad_norm": 0.6767904758453369, + "learning_rate": 0.0002523090184247387, + "loss": 3.3531, + "step": 53900 + }, + { + "epoch": 5.806694650737272, + "grad_norm": 0.7042453289031982, + "learning_rate": 0.00025198577739467726, + "loss": 3.3333, + "step": 53950 + }, + { + "epoch": 5.812076202776881, + "grad_norm": 0.7105914354324341, + "learning_rate": 0.00025166253636461585, + "loss": 3.3353, + "step": 54000 + }, + { + "epoch": 5.812076202776881, + "eval_accuracy": 0.38365334687293184, + "eval_loss": 3.386536121368408, + "eval_runtime": 195.2212, + "eval_samples_per_second": 92.259, + "eval_steps_per_second": 5.768, + "step": 54000 + }, + { + "epoch": 5.817457754816489, + "grad_norm": 0.7248342037200928, + "learning_rate": 0.00025133929533455445, + "loss": 3.3423, + "step": 54050 + }, + { + "epoch": 5.822839306856097, + "grad_norm": 0.7134623527526855, + "learning_rate": 0.00025101605430449304, + "loss": 3.3469, + "step": 54100 + }, + { + "epoch": 5.828220858895706, + "grad_norm": 0.7342662215232849, + "learning_rate": 0.00025069281327443164, + "loss": 3.3406, + "step": 54150 + }, + { + "epoch": 5.833602410935313, + "grad_norm": 0.6971819996833801, + "learning_rate": 0.0002503695722443702, + "loss": 3.3296, + "step": 54200 + }, + { + "epoch": 5.838983962974922, + "grad_norm": 0.6914299130439758, + "learning_rate": 0.00025004633121430877, + "loss": 3.335, + "step": 54250 + }, + { + "epoch": 5.84436551501453, + "grad_norm": 0.6908444762229919, + "learning_rate": 0.00024972309018424737, + "loss": 3.3218, + "step": 54300 + }, + { + "epoch": 5.849747067054138, + "grad_norm": 0.7167321443557739, + "learning_rate": 0.00024939984915418596, + "loss": 3.3309, + "step": 54350 + }, + { + "epoch": 5.855128619093747, + "grad_norm": 0.7270901799201965, + "learning_rate": 0.0002490766081241245, + "loss": 3.3447, + "step": 54400 + }, + { + "epoch": 5.860510171133355, + "grad_norm": 0.7137880325317383, + "learning_rate": 0.00024875336709406315, + "loss": 3.3156, + "step": 54450 + }, + { + "epoch": 5.865891723172963, + "grad_norm": 0.6899557709693909, + "learning_rate": 0.0002484301260640017, + "loss": 3.3324, + "step": 54500 + }, + { + "epoch": 5.871273275212571, + "grad_norm": 0.7073144316673279, + "learning_rate": 0.0002481068850339403, + "loss": 3.3377, + "step": 54550 + }, + { + "epoch": 5.87665482725218, + "grad_norm": 0.7121581435203552, + "learning_rate": 0.0002477836440038789, + "loss": 3.3358, + "step": 54600 + }, + { + "epoch": 5.882036379291788, + "grad_norm": 0.7033504843711853, + "learning_rate": 0.0002474604029738175, + "loss": 3.3309, + "step": 54650 + }, + { + "epoch": 5.887417931331396, + "grad_norm": 0.7366927862167358, + "learning_rate": 0.000247137161943756, + "loss": 3.3426, + "step": 54700 + }, + { + "epoch": 5.892799483371004, + "grad_norm": 0.6953083276748657, + "learning_rate": 0.0002468139209136946, + "loss": 3.3304, + "step": 54750 + }, + { + "epoch": 5.898181035410612, + "grad_norm": 0.707262396812439, + "learning_rate": 0.0002464906798836332, + "loss": 3.3298, + "step": 54800 + }, + { + "epoch": 5.903562587450221, + "grad_norm": 0.6863567233085632, + "learning_rate": 0.0002461674388535718, + "loss": 3.3294, + "step": 54850 + }, + { + "epoch": 5.9089441394898286, + "grad_norm": 0.7244144678115845, + "learning_rate": 0.0002458441978235104, + "loss": 3.3334, + "step": 54900 + }, + { + "epoch": 5.914325691529437, + "grad_norm": 0.7477735877037048, + "learning_rate": 0.00024552095679344893, + "loss": 3.3464, + "step": 54950 + }, + { + "epoch": 5.919707243569046, + "grad_norm": 0.7203809022903442, + "learning_rate": 0.0002451977157633876, + "loss": 3.3306, + "step": 55000 + }, + { + "epoch": 5.919707243569046, + "eval_accuracy": 0.3839801747594778, + "eval_loss": 3.3797736167907715, + "eval_runtime": 195.5869, + "eval_samples_per_second": 92.087, + "eval_steps_per_second": 5.757, + "step": 55000 + }, + { + "epoch": 5.925088795608653, + "grad_norm": 0.7410472631454468, + "learning_rate": 0.0002448744747333261, + "loss": 3.3449, + "step": 55050 + }, + { + "epoch": 5.930470347648262, + "grad_norm": 0.6759241223335266, + "learning_rate": 0.0002445512337032647, + "loss": 3.3361, + "step": 55100 + }, + { + "epoch": 5.93585189968787, + "grad_norm": 0.6957202553749084, + "learning_rate": 0.0002442279926732033, + "loss": 3.3363, + "step": 55150 + }, + { + "epoch": 5.941233451727478, + "grad_norm": 0.7306824922561646, + "learning_rate": 0.00024390475164314188, + "loss": 3.3315, + "step": 55200 + }, + { + "epoch": 5.946615003767087, + "grad_norm": 0.760296106338501, + "learning_rate": 0.00024358151061308045, + "loss": 3.3511, + "step": 55250 + }, + { + "epoch": 5.951996555806694, + "grad_norm": 0.7307605743408203, + "learning_rate": 0.00024325826958301907, + "loss": 3.3263, + "step": 55300 + }, + { + "epoch": 5.957378107846303, + "grad_norm": 0.6909796595573425, + "learning_rate": 0.00024293502855295764, + "loss": 3.3515, + "step": 55350 + }, + { + "epoch": 5.962759659885911, + "grad_norm": 0.7553929686546326, + "learning_rate": 0.0002426117875228962, + "loss": 3.3455, + "step": 55400 + }, + { + "epoch": 5.968141211925519, + "grad_norm": 0.7335132360458374, + "learning_rate": 0.0002422885464928348, + "loss": 3.3374, + "step": 55450 + }, + { + "epoch": 5.973522763965128, + "grad_norm": 0.7116664052009583, + "learning_rate": 0.0002419653054627734, + "loss": 3.3307, + "step": 55500 + }, + { + "epoch": 5.978904316004736, + "grad_norm": 0.7170029282569885, + "learning_rate": 0.00024164206443271196, + "loss": 3.355, + "step": 55550 + }, + { + "epoch": 5.984285868044344, + "grad_norm": 0.6848576068878174, + "learning_rate": 0.00024131882340265056, + "loss": 3.3344, + "step": 55600 + }, + { + "epoch": 5.989667420083952, + "grad_norm": 0.7043144702911377, + "learning_rate": 0.00024099558237258912, + "loss": 3.3398, + "step": 55650 + }, + { + "epoch": 5.995048972123561, + "grad_norm": 0.7126272320747375, + "learning_rate": 0.00024067234134252772, + "loss": 3.3339, + "step": 55700 + }, + { + "epoch": 6.000430524163169, + "grad_norm": 0.7398678064346313, + "learning_rate": 0.00024035556513306752, + "loss": 3.3327, + "step": 55750 + }, + { + "epoch": 6.005812076202777, + "grad_norm": 0.745004415512085, + "learning_rate": 0.00024003232410300615, + "loss": 3.2401, + "step": 55800 + }, + { + "epoch": 6.011193628242385, + "grad_norm": 0.7168341875076294, + "learning_rate": 0.00023970908307294471, + "loss": 3.254, + "step": 55850 + }, + { + "epoch": 6.016575180281993, + "grad_norm": 0.7345452904701233, + "learning_rate": 0.00023938584204288328, + "loss": 3.2373, + "step": 55900 + }, + { + "epoch": 6.021956732321602, + "grad_norm": 0.7234600186347961, + "learning_rate": 0.00023906260101282188, + "loss": 3.2584, + "step": 55950 + }, + { + "epoch": 6.0273382843612096, + "grad_norm": 0.7159119248390198, + "learning_rate": 0.00023873935998276047, + "loss": 3.2266, + "step": 56000 + }, + { + "epoch": 6.0273382843612096, + "eval_accuracy": 0.3844155468809504, + "eval_loss": 3.382514715194702, + "eval_runtime": 205.1809, + "eval_samples_per_second": 87.781, + "eval_steps_per_second": 5.488, + "step": 56000 + }, + { + "epoch": 6.032719836400818, + "grad_norm": 0.6958070397377014, + "learning_rate": 0.00023841611895269904, + "loss": 3.2395, + "step": 56050 + }, + { + "epoch": 6.038101388440427, + "grad_norm": 0.7117956280708313, + "learning_rate": 0.00023809287792263763, + "loss": 3.2614, + "step": 56100 + }, + { + "epoch": 6.043482940480034, + "grad_norm": 0.7864623069763184, + "learning_rate": 0.0002377696368925762, + "loss": 3.2521, + "step": 56150 + }, + { + "epoch": 6.048864492519643, + "grad_norm": 0.715238630771637, + "learning_rate": 0.0002374463958625148, + "loss": 3.2547, + "step": 56200 + }, + { + "epoch": 6.0542460445592505, + "grad_norm": 0.6912676692008972, + "learning_rate": 0.0002371231548324534, + "loss": 3.253, + "step": 56250 + }, + { + "epoch": 6.059627596598859, + "grad_norm": 0.7019177675247192, + "learning_rate": 0.00023679991380239196, + "loss": 3.2513, + "step": 56300 + }, + { + "epoch": 6.065009148638468, + "grad_norm": 0.8073034286499023, + "learning_rate": 0.00023647667277233053, + "loss": 3.2551, + "step": 56350 + }, + { + "epoch": 6.070390700678075, + "grad_norm": 0.716173529624939, + "learning_rate": 0.00023615343174226915, + "loss": 3.2406, + "step": 56400 + }, + { + "epoch": 6.075772252717684, + "grad_norm": 0.7097351551055908, + "learning_rate": 0.00023583019071220771, + "loss": 3.2585, + "step": 56450 + }, + { + "epoch": 6.081153804757292, + "grad_norm": 0.7224815487861633, + "learning_rate": 0.00023550694968214628, + "loss": 3.2563, + "step": 56500 + }, + { + "epoch": 6.0865353567969, + "grad_norm": 0.7451142072677612, + "learning_rate": 0.00023518370865208488, + "loss": 3.2523, + "step": 56550 + }, + { + "epoch": 6.091916908836509, + "grad_norm": 0.7142341732978821, + "learning_rate": 0.00023486046762202347, + "loss": 3.2581, + "step": 56600 + }, + { + "epoch": 6.097298460876116, + "grad_norm": 0.699760913848877, + "learning_rate": 0.00023453722659196207, + "loss": 3.2556, + "step": 56650 + }, + { + "epoch": 6.102680012915725, + "grad_norm": 0.7422381639480591, + "learning_rate": 0.00023421398556190063, + "loss": 3.2764, + "step": 56700 + }, + { + "epoch": 6.108061564955333, + "grad_norm": 0.8956770896911621, + "learning_rate": 0.0002338907445318392, + "loss": 3.2671, + "step": 56750 + }, + { + "epoch": 6.113443116994941, + "grad_norm": 0.7111889123916626, + "learning_rate": 0.00023356750350177782, + "loss": 3.2572, + "step": 56800 + }, + { + "epoch": 6.11882466903455, + "grad_norm": 0.9345306158065796, + "learning_rate": 0.0002332442624717164, + "loss": 3.2521, + "step": 56850 + }, + { + "epoch": 6.124206221074158, + "grad_norm": 0.7105541825294495, + "learning_rate": 0.00023292102144165496, + "loss": 3.2668, + "step": 56900 + }, + { + "epoch": 6.129587773113766, + "grad_norm": 0.7327850461006165, + "learning_rate": 0.00023259778041159358, + "loss": 3.2601, + "step": 56950 + }, + { + "epoch": 6.134969325153374, + "grad_norm": 0.7219511866569519, + "learning_rate": 0.00023227453938153215, + "loss": 3.2555, + "step": 57000 + }, + { + "epoch": 6.134969325153374, + "eval_accuracy": 0.38421410442694237, + "eval_loss": 3.3847293853759766, + "eval_runtime": 196.0722, + "eval_samples_per_second": 91.859, + "eval_steps_per_second": 5.743, + "step": 57000 + }, + { + "epoch": 6.140350877192983, + "grad_norm": 0.7666511535644531, + "learning_rate": 0.00023195129835147071, + "loss": 3.2573, + "step": 57050 + }, + { + "epoch": 6.1457324292325906, + "grad_norm": 0.8009618520736694, + "learning_rate": 0.0002316280573214093, + "loss": 3.2679, + "step": 57100 + }, + { + "epoch": 6.151113981272199, + "grad_norm": 0.6945069432258606, + "learning_rate": 0.0002313048162913479, + "loss": 3.2667, + "step": 57150 + }, + { + "epoch": 6.156495533311807, + "grad_norm": 0.750709593296051, + "learning_rate": 0.00023098157526128647, + "loss": 3.2475, + "step": 57200 + }, + { + "epoch": 6.161877085351415, + "grad_norm": 0.7243725657463074, + "learning_rate": 0.00023065833423122507, + "loss": 3.2824, + "step": 57250 + }, + { + "epoch": 6.167258637391024, + "grad_norm": 0.7340604066848755, + "learning_rate": 0.00023033509320116363, + "loss": 3.2589, + "step": 57300 + }, + { + "epoch": 6.1726401894306315, + "grad_norm": 0.7163415551185608, + "learning_rate": 0.00023001185217110223, + "loss": 3.2712, + "step": 57350 + }, + { + "epoch": 6.17802174147024, + "grad_norm": 0.7414380311965942, + "learning_rate": 0.00022968861114104082, + "loss": 3.2653, + "step": 57400 + }, + { + "epoch": 6.183403293509849, + "grad_norm": 0.7300434112548828, + "learning_rate": 0.0002293653701109794, + "loss": 3.274, + "step": 57450 + }, + { + "epoch": 6.188784845549456, + "grad_norm": 0.7358796000480652, + "learning_rate": 0.00022904212908091796, + "loss": 3.2541, + "step": 57500 + }, + { + "epoch": 6.194166397589065, + "grad_norm": 0.7499557137489319, + "learning_rate": 0.00022871888805085658, + "loss": 3.2607, + "step": 57550 + }, + { + "epoch": 6.1995479496286725, + "grad_norm": 0.77440345287323, + "learning_rate": 0.00022839564702079515, + "loss": 3.2597, + "step": 57600 + }, + { + "epoch": 6.204929501668281, + "grad_norm": 0.759096622467041, + "learning_rate": 0.00022807240599073374, + "loss": 3.2771, + "step": 57650 + }, + { + "epoch": 6.21031105370789, + "grad_norm": 0.7167288064956665, + "learning_rate": 0.00022774916496067234, + "loss": 3.2498, + "step": 57700 + }, + { + "epoch": 6.215692605747497, + "grad_norm": 0.7278882265090942, + "learning_rate": 0.0002274259239306109, + "loss": 3.2825, + "step": 57750 + }, + { + "epoch": 6.221074157787106, + "grad_norm": 0.7202818989753723, + "learning_rate": 0.0002271026829005495, + "loss": 3.2604, + "step": 57800 + }, + { + "epoch": 6.226455709826714, + "grad_norm": 0.7370897531509399, + "learning_rate": 0.00022677944187048807, + "loss": 3.2528, + "step": 57850 + }, + { + "epoch": 6.231837261866322, + "grad_norm": 0.7153465151786804, + "learning_rate": 0.00022645620084042666, + "loss": 3.2778, + "step": 57900 + }, + { + "epoch": 6.237218813905931, + "grad_norm": 0.7415947914123535, + "learning_rate": 0.00022613295981036526, + "loss": 3.2753, + "step": 57950 + }, + { + "epoch": 6.242600365945538, + "grad_norm": 0.734861433506012, + "learning_rate": 0.00022580971878030382, + "loss": 3.2849, + "step": 58000 + }, + { + "epoch": 6.242600365945538, + "eval_accuracy": 0.3847145596282159, + "eval_loss": 3.380995988845825, + "eval_runtime": 200.7754, + "eval_samples_per_second": 89.707, + "eval_steps_per_second": 5.608, + "step": 58000 + }, + { + "epoch": 6.247981917985147, + "grad_norm": 0.7297642827033997, + "learning_rate": 0.00022549294257084366, + "loss": 3.2695, + "step": 58050 + }, + { + "epoch": 6.253363470024755, + "grad_norm": 0.7514773011207581, + "learning_rate": 0.00022516970154078222, + "loss": 3.2576, + "step": 58100 + }, + { + "epoch": 6.258745022064363, + "grad_norm": 0.7224261164665222, + "learning_rate": 0.0002248464605107208, + "loss": 3.2726, + "step": 58150 + }, + { + "epoch": 6.264126574103972, + "grad_norm": 0.7479972839355469, + "learning_rate": 0.0002245232194806594, + "loss": 3.2817, + "step": 58200 + }, + { + "epoch": 6.26950812614358, + "grad_norm": 0.7621592879295349, + "learning_rate": 0.00022419997845059798, + "loss": 3.2763, + "step": 58250 + }, + { + "epoch": 6.274889678183188, + "grad_norm": 0.7210646271705627, + "learning_rate": 0.00022387673742053655, + "loss": 3.277, + "step": 58300 + }, + { + "epoch": 6.280271230222796, + "grad_norm": 0.7708672881126404, + "learning_rate": 0.00022355349639047514, + "loss": 3.2824, + "step": 58350 + }, + { + "epoch": 6.285652782262405, + "grad_norm": 0.7061500549316406, + "learning_rate": 0.0002232302553604137, + "loss": 3.2647, + "step": 58400 + }, + { + "epoch": 6.2910343343020125, + "grad_norm": 0.7927210927009583, + "learning_rate": 0.00022290701433035233, + "loss": 3.2727, + "step": 58450 + }, + { + "epoch": 6.296415886341621, + "grad_norm": 0.7816336154937744, + "learning_rate": 0.0002225837733002909, + "loss": 3.2825, + "step": 58500 + }, + { + "epoch": 6.301797438381229, + "grad_norm": 0.7397032380104065, + "learning_rate": 0.00022226053227022947, + "loss": 3.2754, + "step": 58550 + }, + { + "epoch": 6.307178990420837, + "grad_norm": 0.7220003008842468, + "learning_rate": 0.0002219372912401681, + "loss": 3.2776, + "step": 58600 + }, + { + "epoch": 6.312560542460446, + "grad_norm": 0.7121575474739075, + "learning_rate": 0.00022161405021010666, + "loss": 3.2601, + "step": 58650 + }, + { + "epoch": 6.3179420945000535, + "grad_norm": 0.7781762480735779, + "learning_rate": 0.00022129080918004523, + "loss": 3.2796, + "step": 58700 + }, + { + "epoch": 6.323323646539662, + "grad_norm": 0.7582905888557434, + "learning_rate": 0.00022096756814998382, + "loss": 3.2895, + "step": 58750 + }, + { + "epoch": 6.328705198579271, + "grad_norm": 0.7163707613945007, + "learning_rate": 0.00022064432711992241, + "loss": 3.2841, + "step": 58800 + }, + { + "epoch": 6.334086750618878, + "grad_norm": 0.7681860327720642, + "learning_rate": 0.00022032108608986098, + "loss": 3.2909, + "step": 58850 + }, + { + "epoch": 6.339468302658487, + "grad_norm": 0.7766373157501221, + "learning_rate": 0.00021999784505979958, + "loss": 3.2916, + "step": 58900 + }, + { + "epoch": 6.344849854698095, + "grad_norm": 0.739279568195343, + "learning_rate": 0.00021967460402973814, + "loss": 3.2823, + "step": 58950 + }, + { + "epoch": 6.350231406737703, + "grad_norm": 0.7410812377929688, + "learning_rate": 0.00021935136299967674, + "loss": 3.2801, + "step": 59000 + }, + { + "epoch": 6.350231406737703, + "eval_accuracy": 0.38521621001125533, + "eval_loss": 3.3765883445739746, + "eval_runtime": 194.9686, + "eval_samples_per_second": 92.379, + "eval_steps_per_second": 5.775, + "step": 59000 + }, + { + "epoch": 6.355612958777312, + "grad_norm": 0.7452576756477356, + "learning_rate": 0.00021902812196961533, + "loss": 3.2732, + "step": 59050 + }, + { + "epoch": 6.360994510816919, + "grad_norm": 0.7552573680877686, + "learning_rate": 0.0002187048809395539, + "loss": 3.266, + "step": 59100 + }, + { + "epoch": 6.366376062856528, + "grad_norm": 0.7315678596496582, + "learning_rate": 0.00021838163990949247, + "loss": 3.2772, + "step": 59150 + }, + { + "epoch": 6.371757614896136, + "grad_norm": 0.7016833424568176, + "learning_rate": 0.0002180583988794311, + "loss": 3.2874, + "step": 59200 + }, + { + "epoch": 6.377139166935744, + "grad_norm": 0.7350359559059143, + "learning_rate": 0.00021773515784936966, + "loss": 3.2959, + "step": 59250 + }, + { + "epoch": 6.382520718975353, + "grad_norm": 0.7227166891098022, + "learning_rate": 0.00021741191681930823, + "loss": 3.281, + "step": 59300 + }, + { + "epoch": 6.387902271014961, + "grad_norm": 0.7294051051139832, + "learning_rate": 0.00021708867578924685, + "loss": 3.298, + "step": 59350 + }, + { + "epoch": 6.393283823054569, + "grad_norm": 0.718024730682373, + "learning_rate": 0.00021676543475918541, + "loss": 3.2599, + "step": 59400 + }, + { + "epoch": 6.398665375094177, + "grad_norm": 0.7561959624290466, + "learning_rate": 0.000216442193729124, + "loss": 3.2947, + "step": 59450 + }, + { + "epoch": 6.404046927133785, + "grad_norm": 0.7592124938964844, + "learning_rate": 0.00021611895269906258, + "loss": 3.2771, + "step": 59500 + }, + { + "epoch": 6.4094284791733935, + "grad_norm": 0.750074565410614, + "learning_rate": 0.00021579571166900117, + "loss": 3.26, + "step": 59550 + }, + { + "epoch": 6.414810031213002, + "grad_norm": 0.7350614666938782, + "learning_rate": 0.00021547247063893977, + "loss": 3.2863, + "step": 59600 + }, + { + "epoch": 6.42019158325261, + "grad_norm": 0.7412868738174438, + "learning_rate": 0.00021514922960887833, + "loss": 3.269, + "step": 59650 + }, + { + "epoch": 6.425573135292218, + "grad_norm": 0.7642093300819397, + "learning_rate": 0.0002148259885788169, + "loss": 3.2749, + "step": 59700 + }, + { + "epoch": 6.430954687331827, + "grad_norm": 0.7282441258430481, + "learning_rate": 0.00021450274754875552, + "loss": 3.269, + "step": 59750 + }, + { + "epoch": 6.4363362393714345, + "grad_norm": 0.7450237274169922, + "learning_rate": 0.0002141795065186941, + "loss": 3.2695, + "step": 59800 + }, + { + "epoch": 6.441717791411043, + "grad_norm": 0.7366235852241516, + "learning_rate": 0.00021385626548863266, + "loss": 3.2876, + "step": 59850 + }, + { + "epoch": 6.447099343450651, + "grad_norm": 0.7377769947052002, + "learning_rate": 0.00021353302445857128, + "loss": 3.2793, + "step": 59900 + }, + { + "epoch": 6.452480895490259, + "grad_norm": 0.771925151348114, + "learning_rate": 0.00021320978342850985, + "loss": 3.2807, + "step": 59950 + }, + { + "epoch": 6.457862447529868, + "grad_norm": 0.7846429944038391, + "learning_rate": 0.00021288654239844842, + "loss": 3.2773, + "step": 60000 + }, + { + "epoch": 6.457862447529868, + "eval_accuracy": 0.385375712450567, + "eval_loss": 3.3726203441619873, + "eval_runtime": 198.7032, + "eval_samples_per_second": 90.643, + "eval_steps_per_second": 5.667, + "step": 60000 + }, + { + "epoch": 6.4632439995694755, + "grad_norm": 0.7637952566146851, + "learning_rate": 0.000212563301368387, + "loss": 3.2915, + "step": 60050 + }, + { + "epoch": 6.468625551609084, + "grad_norm": 0.7368430495262146, + "learning_rate": 0.00021224006033832558, + "loss": 3.2963, + "step": 60100 + }, + { + "epoch": 6.474007103648693, + "grad_norm": 0.7520938515663147, + "learning_rate": 0.0002119232841288654, + "loss": 3.2928, + "step": 60150 + }, + { + "epoch": 6.4793886556883, + "grad_norm": 0.739131510257721, + "learning_rate": 0.00021160004309880398, + "loss": 3.2874, + "step": 60200 + }, + { + "epoch": 6.484770207727909, + "grad_norm": 0.7567487359046936, + "learning_rate": 0.0002112768020687426, + "loss": 3.2851, + "step": 60250 + }, + { + "epoch": 6.490151759767517, + "grad_norm": 0.7027845978736877, + "learning_rate": 0.00021095356103868117, + "loss": 3.2906, + "step": 60300 + }, + { + "epoch": 6.495533311807125, + "grad_norm": 0.7968448400497437, + "learning_rate": 0.00021063032000861974, + "loss": 3.2912, + "step": 60350 + }, + { + "epoch": 6.500914863846734, + "grad_norm": 0.7443140745162964, + "learning_rate": 0.00021030707897855833, + "loss": 3.2914, + "step": 60400 + }, + { + "epoch": 6.506296415886341, + "grad_norm": 0.7305548787117004, + "learning_rate": 0.00020998383794849692, + "loss": 3.2959, + "step": 60450 + }, + { + "epoch": 6.51167796792595, + "grad_norm": 0.7129182815551758, + "learning_rate": 0.0002096605969184355, + "loss": 3.275, + "step": 60500 + }, + { + "epoch": 6.517059519965558, + "grad_norm": 0.697638213634491, + "learning_rate": 0.0002093373558883741, + "loss": 3.3068, + "step": 60550 + }, + { + "epoch": 6.522441072005166, + "grad_norm": 0.7496232986450195, + "learning_rate": 0.00020901411485831265, + "loss": 3.2661, + "step": 60600 + }, + { + "epoch": 6.5278226240447745, + "grad_norm": 0.800989031791687, + "learning_rate": 0.00020869087382825125, + "loss": 3.2937, + "step": 60650 + }, + { + "epoch": 6.533204176084383, + "grad_norm": 0.7812984585762024, + "learning_rate": 0.00020836763279818984, + "loss": 3.2756, + "step": 60700 + }, + { + "epoch": 6.538585728123991, + "grad_norm": 0.7700937986373901, + "learning_rate": 0.0002080443917681284, + "loss": 3.2949, + "step": 60750 + }, + { + "epoch": 6.543967280163599, + "grad_norm": 0.7460906505584717, + "learning_rate": 0.00020772115073806698, + "loss": 3.2751, + "step": 60800 + }, + { + "epoch": 6.549348832203208, + "grad_norm": 0.7481933832168579, + "learning_rate": 0.0002073979097080056, + "loss": 3.2899, + "step": 60850 + }, + { + "epoch": 6.5547303842428155, + "grad_norm": 0.7331646680831909, + "learning_rate": 0.00020707466867794417, + "loss": 3.306, + "step": 60900 + }, + { + "epoch": 6.560111936282424, + "grad_norm": 0.7299278974533081, + "learning_rate": 0.00020675142764788274, + "loss": 3.2903, + "step": 60950 + }, + { + "epoch": 6.565493488322032, + "grad_norm": 0.7748175859451294, + "learning_rate": 0.00020642818661782136, + "loss": 3.3014, + "step": 61000 + }, + { + "epoch": 6.565493488322032, + "eval_accuracy": 0.38595026892133, + "eval_loss": 3.3667492866516113, + "eval_runtime": 196.5736, + "eval_samples_per_second": 91.625, + "eval_steps_per_second": 5.728, + "step": 61000 + }, + { + "epoch": 6.57087504036164, + "grad_norm": 0.7791305184364319, + "learning_rate": 0.00020610494558775993, + "loss": 3.2782, + "step": 61050 + }, + { + "epoch": 6.576256592401249, + "grad_norm": 0.7687615752220154, + "learning_rate": 0.0002057817045576985, + "loss": 3.292, + "step": 61100 + }, + { + "epoch": 6.5816381444408565, + "grad_norm": 0.7428541779518127, + "learning_rate": 0.0002054584635276371, + "loss": 3.2634, + "step": 61150 + }, + { + "epoch": 6.587019696480465, + "grad_norm": 0.7501689195632935, + "learning_rate": 0.00020513522249757568, + "loss": 3.2993, + "step": 61200 + }, + { + "epoch": 6.592401248520073, + "grad_norm": 0.7482814192771912, + "learning_rate": 0.00020481198146751428, + "loss": 3.2895, + "step": 61250 + }, + { + "epoch": 6.597782800559681, + "grad_norm": 0.7531591653823853, + "learning_rate": 0.00020448874043745284, + "loss": 3.2832, + "step": 61300 + }, + { + "epoch": 6.60316435259929, + "grad_norm": 0.8052058815956116, + "learning_rate": 0.0002041654994073914, + "loss": 3.3006, + "step": 61350 + }, + { + "epoch": 6.608545904638898, + "grad_norm": 0.7585821151733398, + "learning_rate": 0.00020384225837733003, + "loss": 3.2867, + "step": 61400 + }, + { + "epoch": 6.613927456678506, + "grad_norm": 0.7599231004714966, + "learning_rate": 0.0002035190173472686, + "loss": 3.2948, + "step": 61450 + }, + { + "epoch": 6.619309008718115, + "grad_norm": 0.7827469110488892, + "learning_rate": 0.00020319577631720717, + "loss": 3.2863, + "step": 61500 + }, + { + "epoch": 6.624690560757722, + "grad_norm": 0.7314283847808838, + "learning_rate": 0.00020287253528714576, + "loss": 3.2803, + "step": 61550 + }, + { + "epoch": 6.630072112797331, + "grad_norm": 0.7610256671905518, + "learning_rate": 0.00020254929425708436, + "loss": 3.2964, + "step": 61600 + }, + { + "epoch": 6.635453664836939, + "grad_norm": 0.7744382619857788, + "learning_rate": 0.00020222605322702293, + "loss": 3.3023, + "step": 61650 + }, + { + "epoch": 6.640835216876547, + "grad_norm": 0.7310662269592285, + "learning_rate": 0.00020190281219696152, + "loss": 3.2774, + "step": 61700 + }, + { + "epoch": 6.6462167689161555, + "grad_norm": 0.7460449934005737, + "learning_rate": 0.0002015795711669001, + "loss": 3.2858, + "step": 61750 + }, + { + "epoch": 6.651598320955763, + "grad_norm": 0.790534496307373, + "learning_rate": 0.00020125633013683868, + "loss": 3.2875, + "step": 61800 + }, + { + "epoch": 6.656979872995372, + "grad_norm": 0.7475619316101074, + "learning_rate": 0.00020093308910677728, + "loss": 3.2868, + "step": 61850 + }, + { + "epoch": 6.66236142503498, + "grad_norm": 0.7804775834083557, + "learning_rate": 0.00020060984807671584, + "loss": 3.2854, + "step": 61900 + }, + { + "epoch": 6.667742977074588, + "grad_norm": 0.7852612733840942, + "learning_rate": 0.0002002866070466544, + "loss": 3.2824, + "step": 61950 + }, + { + "epoch": 6.6731245291141965, + "grad_norm": 0.7747927904129028, + "learning_rate": 0.00019996336601659303, + "loss": 3.2774, + "step": 62000 + }, + { + "epoch": 6.6731245291141965, + "eval_accuracy": 0.3861736592586766, + "eval_loss": 3.363661050796509, + "eval_runtime": 194.5926, + "eval_samples_per_second": 92.557, + "eval_steps_per_second": 5.786, + "step": 62000 + }, + { + "epoch": 6.678506081153805, + "grad_norm": 0.7766702175140381, + "learning_rate": 0.0001996401249865316, + "loss": 3.2727, + "step": 62050 + }, + { + "epoch": 6.683887633193413, + "grad_norm": 0.759154200553894, + "learning_rate": 0.00019931688395647017, + "loss": 3.2908, + "step": 62100 + }, + { + "epoch": 6.689269185233021, + "grad_norm": 0.7269113063812256, + "learning_rate": 0.0001989936429264088, + "loss": 3.2694, + "step": 62150 + }, + { + "epoch": 6.69465073727263, + "grad_norm": 0.7533604502677917, + "learning_rate": 0.00019867040189634736, + "loss": 3.2954, + "step": 62200 + }, + { + "epoch": 6.7000322893122375, + "grad_norm": 0.7951676845550537, + "learning_rate": 0.00019834716086628595, + "loss": 3.291, + "step": 62250 + }, + { + "epoch": 6.705413841351846, + "grad_norm": 0.7553754448890686, + "learning_rate": 0.00019802391983622452, + "loss": 3.2993, + "step": 62300 + }, + { + "epoch": 6.710795393391454, + "grad_norm": 0.7384345531463623, + "learning_rate": 0.00019770714362676435, + "loss": 3.3098, + "step": 62350 + }, + { + "epoch": 6.716176945431062, + "grad_norm": 0.7837821841239929, + "learning_rate": 0.00019738390259670292, + "loss": 3.2873, + "step": 62400 + }, + { + "epoch": 6.721558497470671, + "grad_norm": 0.8101162910461426, + "learning_rate": 0.0001970606615666415, + "loss": 3.2793, + "step": 62450 + }, + { + "epoch": 6.7269400495102785, + "grad_norm": 0.7428802847862244, + "learning_rate": 0.0001967374205365801, + "loss": 3.2889, + "step": 62500 + }, + { + "epoch": 6.732321601549887, + "grad_norm": 0.7610371708869934, + "learning_rate": 0.00019641417950651868, + "loss": 3.2868, + "step": 62550 + }, + { + "epoch": 6.737703153589496, + "grad_norm": 0.7788252830505371, + "learning_rate": 0.00019609093847645725, + "loss": 3.2811, + "step": 62600 + }, + { + "epoch": 6.743084705629103, + "grad_norm": 0.7891542911529541, + "learning_rate": 0.00019576769744639587, + "loss": 3.2871, + "step": 62650 + }, + { + "epoch": 6.748466257668712, + "grad_norm": 0.7551553845405579, + "learning_rate": 0.00019544445641633444, + "loss": 3.2824, + "step": 62700 + }, + { + "epoch": 6.75384780970832, + "grad_norm": 0.7407161593437195, + "learning_rate": 0.000195121215386273, + "loss": 3.2785, + "step": 62750 + }, + { + "epoch": 6.759229361747928, + "grad_norm": 0.7767391204833984, + "learning_rate": 0.0001947979743562116, + "loss": 3.2957, + "step": 62800 + }, + { + "epoch": 6.7646109137875365, + "grad_norm": 0.7716450691223145, + "learning_rate": 0.00019447473332615017, + "loss": 3.2908, + "step": 62850 + }, + { + "epoch": 6.769992465827144, + "grad_norm": 0.7589283585548401, + "learning_rate": 0.00019415149229608876, + "loss": 3.2708, + "step": 62900 + }, + { + "epoch": 6.775374017866753, + "grad_norm": 0.7275950312614441, + "learning_rate": 0.00019382825126602735, + "loss": 3.283, + "step": 62950 + }, + { + "epoch": 6.780755569906361, + "grad_norm": 0.7468757629394531, + "learning_rate": 0.00019350501023596592, + "loss": 3.2826, + "step": 63000 + }, + { + "epoch": 6.780755569906361, + "eval_accuracy": 0.38676788190213396, + "eval_loss": 3.3597800731658936, + "eval_runtime": 200.2482, + "eval_samples_per_second": 89.943, + "eval_steps_per_second": 5.623, + "step": 63000 + }, + { + "epoch": 6.786137121945969, + "grad_norm": 0.745414137840271, + "learning_rate": 0.00019318176920590454, + "loss": 3.273, + "step": 63050 + }, + { + "epoch": 6.7915186739855775, + "grad_norm": 0.756697952747345, + "learning_rate": 0.0001928585281758431, + "loss": 3.2861, + "step": 63100 + }, + { + "epoch": 6.796900226025185, + "grad_norm": 0.7485399842262268, + "learning_rate": 0.00019253528714578168, + "loss": 3.2847, + "step": 63150 + }, + { + "epoch": 6.802281778064794, + "grad_norm": 0.7797415256500244, + "learning_rate": 0.00019221204611572027, + "loss": 3.2908, + "step": 63200 + }, + { + "epoch": 6.807663330104402, + "grad_norm": 0.818111002445221, + "learning_rate": 0.00019188880508565887, + "loss": 3.2909, + "step": 63250 + }, + { + "epoch": 6.813044882144011, + "grad_norm": 0.7770239114761353, + "learning_rate": 0.00019156556405559744, + "loss": 3.2904, + "step": 63300 + }, + { + "epoch": 6.8184264341836185, + "grad_norm": 0.7970494627952576, + "learning_rate": 0.00019124232302553603, + "loss": 3.2932, + "step": 63350 + }, + { + "epoch": 6.823807986223227, + "grad_norm": 0.7903392910957336, + "learning_rate": 0.0001909190819954746, + "loss": 3.2974, + "step": 63400 + }, + { + "epoch": 6.829189538262835, + "grad_norm": 0.8986805081367493, + "learning_rate": 0.0001905958409654132, + "loss": 3.2837, + "step": 63450 + }, + { + "epoch": 6.834571090302443, + "grad_norm": 0.7686243057250977, + "learning_rate": 0.0001902725999353518, + "loss": 3.2957, + "step": 63500 + }, + { + "epoch": 6.839952642342052, + "grad_norm": 0.7945030927658081, + "learning_rate": 0.00018994935890529036, + "loss": 3.275, + "step": 63550 + }, + { + "epoch": 6.8453341943816595, + "grad_norm": 0.8145069479942322, + "learning_rate": 0.00018962611787522892, + "loss": 3.287, + "step": 63600 + }, + { + "epoch": 6.850715746421268, + "grad_norm": 0.7765173316001892, + "learning_rate": 0.00018930287684516754, + "loss": 3.2964, + "step": 63650 + }, + { + "epoch": 6.856097298460876, + "grad_norm": 0.8863644599914551, + "learning_rate": 0.0001889796358151061, + "loss": 3.2674, + "step": 63700 + }, + { + "epoch": 6.861478850500484, + "grad_norm": 0.7589664459228516, + "learning_rate": 0.00018865639478504468, + "loss": 3.2718, + "step": 63750 + }, + { + "epoch": 6.866860402540093, + "grad_norm": 0.7652623057365417, + "learning_rate": 0.0001883331537549833, + "loss": 3.2736, + "step": 63800 + }, + { + "epoch": 6.8722419545797, + "grad_norm": 0.7880659103393555, + "learning_rate": 0.00018800991272492187, + "loss": 3.2994, + "step": 63850 + }, + { + "epoch": 6.877623506619309, + "grad_norm": 0.7556933760643005, + "learning_rate": 0.00018768667169486044, + "loss": 3.2908, + "step": 63900 + }, + { + "epoch": 6.8830050586589175, + "grad_norm": 0.778171956539154, + "learning_rate": 0.00018736343066479903, + "loss": 3.2839, + "step": 63950 + }, + { + "epoch": 6.888386610698525, + "grad_norm": 0.7967923879623413, + "learning_rate": 0.00018704018963473763, + "loss": 3.2734, + "step": 64000 + }, + { + "epoch": 6.888386610698525, + "eval_accuracy": 0.38728550425968217, + "eval_loss": 3.3551573753356934, + "eval_runtime": 203.474, + "eval_samples_per_second": 88.517, + "eval_steps_per_second": 5.534, + "step": 64000 + }, + { + "epoch": 6.893768162738134, + "grad_norm": 0.761313796043396, + "learning_rate": 0.00018671694860467622, + "loss": 3.2713, + "step": 64050 + }, + { + "epoch": 6.899149714777742, + "grad_norm": 0.7254140377044678, + "learning_rate": 0.0001863937075746148, + "loss": 3.2736, + "step": 64100 + }, + { + "epoch": 6.90453126681735, + "grad_norm": 0.8009861707687378, + "learning_rate": 0.00018607046654455336, + "loss": 3.2954, + "step": 64150 + }, + { + "epoch": 6.9099128188569585, + "grad_norm": 0.7890373468399048, + "learning_rate": 0.00018574722551449198, + "loss": 3.2827, + "step": 64200 + }, + { + "epoch": 6.915294370896566, + "grad_norm": 0.7626961469650269, + "learning_rate": 0.00018542398448443054, + "loss": 3.2909, + "step": 64250 + }, + { + "epoch": 6.920675922936175, + "grad_norm": 0.8804692029953003, + "learning_rate": 0.0001851007434543691, + "loss": 3.289, + "step": 64300 + }, + { + "epoch": 6.926057474975783, + "grad_norm": 0.7699129581451416, + "learning_rate": 0.00018477750242430773, + "loss": 3.2924, + "step": 64350 + }, + { + "epoch": 6.931439027015391, + "grad_norm": 0.773622989654541, + "learning_rate": 0.0001844542613942463, + "loss": 3.2788, + "step": 64400 + }, + { + "epoch": 6.9368205790549995, + "grad_norm": 0.7274935245513916, + "learning_rate": 0.00018413102036418487, + "loss": 3.2926, + "step": 64450 + }, + { + "epoch": 6.942202131094608, + "grad_norm": 0.7527948021888733, + "learning_rate": 0.00018380777933412346, + "loss": 3.2984, + "step": 64500 + }, + { + "epoch": 6.947583683134216, + "grad_norm": 0.8110983371734619, + "learning_rate": 0.00018348453830406206, + "loss": 3.2862, + "step": 64550 + }, + { + "epoch": 6.952965235173824, + "grad_norm": 0.7983884215354919, + "learning_rate": 0.00018316129727400063, + "loss": 3.286, + "step": 64600 + }, + { + "epoch": 6.958346787213433, + "grad_norm": 0.772709310054779, + "learning_rate": 0.00018283805624393922, + "loss": 3.3001, + "step": 64650 + }, + { + "epoch": 6.9637283392530405, + "grad_norm": 0.7958887815475464, + "learning_rate": 0.0001825148152138778, + "loss": 3.2976, + "step": 64700 + }, + { + "epoch": 6.969109891292649, + "grad_norm": 0.7898955941200256, + "learning_rate": 0.00018219157418381636, + "loss": 3.2906, + "step": 64750 + }, + { + "epoch": 6.974491443332257, + "grad_norm": 0.7751755118370056, + "learning_rate": 0.00018186833315375498, + "loss": 3.2859, + "step": 64800 + }, + { + "epoch": 6.979872995371865, + "grad_norm": 0.7762618660926819, + "learning_rate": 0.00018154509212369355, + "loss": 3.2973, + "step": 64850 + }, + { + "epoch": 6.985254547411474, + "grad_norm": 0.7949537038803101, + "learning_rate": 0.0001812218510936321, + "loss": 3.2853, + "step": 64900 + }, + { + "epoch": 6.990636099451081, + "grad_norm": 0.7892162799835205, + "learning_rate": 0.00018089861006357073, + "loss": 3.2872, + "step": 64950 + }, + { + "epoch": 6.99601765149069, + "grad_norm": 0.7522981762886047, + "learning_rate": 0.0001805753690335093, + "loss": 3.2676, + "step": 65000 + }, + { + "epoch": 6.99601765149069, + "eval_accuracy": 0.3876287387322881, + "eval_loss": 3.352322578430176, + "eval_runtime": 215.3455, + "eval_samples_per_second": 83.638, + "eval_steps_per_second": 5.229, + "step": 65000 + }, + { + "epoch": 7.0013992035302985, + "grad_norm": 0.7797788381576538, + "learning_rate": 0.0001802521280034479, + "loss": 3.2643, + "step": 65050 + }, + { + "epoch": 7.006780755569906, + "grad_norm": 0.7758247256278992, + "learning_rate": 0.00017992888697338646, + "loss": 3.2046, + "step": 65100 + }, + { + "epoch": 7.012162307609515, + "grad_norm": 0.8069040775299072, + "learning_rate": 0.00017960564594332506, + "loss": 3.2005, + "step": 65150 + }, + { + "epoch": 7.017543859649122, + "grad_norm": 0.7784696817398071, + "learning_rate": 0.00017928240491326365, + "loss": 3.2059, + "step": 65200 + }, + { + "epoch": 7.022925411688731, + "grad_norm": 0.7886298298835754, + "learning_rate": 0.00017895916388320222, + "loss": 3.2059, + "step": 65250 + }, + { + "epoch": 7.0283069637283395, + "grad_norm": 0.810814380645752, + "learning_rate": 0.0001786359228531408, + "loss": 3.2004, + "step": 65300 + }, + { + "epoch": 7.033688515767947, + "grad_norm": 0.8277598023414612, + "learning_rate": 0.0001783126818230794, + "loss": 3.1962, + "step": 65350 + }, + { + "epoch": 7.039070067807556, + "grad_norm": 0.7609949707984924, + "learning_rate": 0.00017798944079301798, + "loss": 3.2008, + "step": 65400 + }, + { + "epoch": 7.044451619847164, + "grad_norm": 0.7364971041679382, + "learning_rate": 0.00017766619976295655, + "loss": 3.2008, + "step": 65450 + }, + { + "epoch": 7.049833171886772, + "grad_norm": 0.7349782586097717, + "learning_rate": 0.00017734295873289517, + "loss": 3.1923, + "step": 65500 + }, + { + "epoch": 7.0552147239263805, + "grad_norm": 0.8025264739990234, + "learning_rate": 0.00017702618252343495, + "loss": 3.2097, + "step": 65550 + }, + { + "epoch": 7.060596275965988, + "grad_norm": 0.7878516316413879, + "learning_rate": 0.00017670294149337354, + "loss": 3.1973, + "step": 65600 + }, + { + "epoch": 7.065977828005597, + "grad_norm": 0.7897598147392273, + "learning_rate": 0.00017637970046331214, + "loss": 3.196, + "step": 65650 + }, + { + "epoch": 7.071359380045205, + "grad_norm": 0.7731457352638245, + "learning_rate": 0.0001760564594332507, + "loss": 3.2343, + "step": 65700 + }, + { + "epoch": 7.076740932084813, + "grad_norm": 1.2706592082977295, + "learning_rate": 0.0001757332184031893, + "loss": 3.2107, + "step": 65750 + }, + { + "epoch": 7.0821224841244215, + "grad_norm": 0.8142618536949158, + "learning_rate": 0.00017540997737312787, + "loss": 3.2197, + "step": 65800 + }, + { + "epoch": 7.08750403616403, + "grad_norm": 0.7961957454681396, + "learning_rate": 0.0001750867363430665, + "loss": 3.2175, + "step": 65850 + }, + { + "epoch": 7.092885588203638, + "grad_norm": 0.8617919683456421, + "learning_rate": 0.00017476349531300506, + "loss": 3.2236, + "step": 65900 + }, + { + "epoch": 7.098267140243246, + "grad_norm": 0.7776252031326294, + "learning_rate": 0.00017444025428294362, + "loss": 3.2133, + "step": 65950 + }, + { + "epoch": 7.103648692282855, + "grad_norm": 0.7629320621490479, + "learning_rate": 0.00017411701325288224, + "loss": 3.2141, + "step": 66000 + }, + { + "epoch": 7.103648692282855, + "eval_accuracy": 0.3874447893932182, + "eval_loss": 3.357611894607544, + "eval_runtime": 211.7423, + "eval_samples_per_second": 85.061, + "eval_steps_per_second": 5.318, + "step": 66000 + }, + { + "epoch": 7.109030244322462, + "grad_norm": 0.8089470863342285, + "learning_rate": 0.0001737937722228208, + "loss": 3.2077, + "step": 66050 + }, + { + "epoch": 7.114411796362071, + "grad_norm": 0.7836511135101318, + "learning_rate": 0.00017347053119275938, + "loss": 3.2124, + "step": 66100 + }, + { + "epoch": 7.119793348401679, + "grad_norm": 0.7699388861656189, + "learning_rate": 0.00017314729016269797, + "loss": 3.2208, + "step": 66150 + }, + { + "epoch": 7.125174900441287, + "grad_norm": 0.7971923351287842, + "learning_rate": 0.00017282404913263654, + "loss": 3.2245, + "step": 66200 + }, + { + "epoch": 7.130556452480896, + "grad_norm": 0.8028796911239624, + "learning_rate": 0.00017250080810257514, + "loss": 3.2306, + "step": 66250 + }, + { + "epoch": 7.135938004520503, + "grad_norm": 0.7812018394470215, + "learning_rate": 0.00017217756707251373, + "loss": 3.1989, + "step": 66300 + }, + { + "epoch": 7.141319556560112, + "grad_norm": 0.7548637986183167, + "learning_rate": 0.0001718543260424523, + "loss": 3.211, + "step": 66350 + }, + { + "epoch": 7.1467011085997205, + "grad_norm": 0.8434266448020935, + "learning_rate": 0.00017153108501239087, + "loss": 3.2338, + "step": 66400 + }, + { + "epoch": 7.152082660639328, + "grad_norm": 0.8046761155128479, + "learning_rate": 0.0001712078439823295, + "loss": 3.2238, + "step": 66450 + }, + { + "epoch": 7.157464212678937, + "grad_norm": 0.7850152850151062, + "learning_rate": 0.00017088460295226806, + "loss": 3.2273, + "step": 66500 + }, + { + "epoch": 7.162845764718545, + "grad_norm": 0.8294812440872192, + "learning_rate": 0.00017056136192220662, + "loss": 3.2112, + "step": 66550 + }, + { + "epoch": 7.168227316758153, + "grad_norm": 0.744547426700592, + "learning_rate": 0.00017023812089214524, + "loss": 3.2101, + "step": 66600 + }, + { + "epoch": 7.1736088687977615, + "grad_norm": 0.8168179988861084, + "learning_rate": 0.0001699148798620838, + "loss": 3.2147, + "step": 66650 + }, + { + "epoch": 7.178990420837369, + "grad_norm": 0.8229835033416748, + "learning_rate": 0.00016959163883202238, + "loss": 3.2302, + "step": 66700 + }, + { + "epoch": 7.184371972876978, + "grad_norm": 0.8025431632995605, + "learning_rate": 0.00016926839780196097, + "loss": 3.2207, + "step": 66750 + }, + { + "epoch": 7.189753524916586, + "grad_norm": 0.782611608505249, + "learning_rate": 0.00016894515677189957, + "loss": 3.241, + "step": 66800 + }, + { + "epoch": 7.195135076956194, + "grad_norm": 0.8258253335952759, + "learning_rate": 0.00016862191574183816, + "loss": 3.2092, + "step": 66850 + }, + { + "epoch": 7.2005166289958025, + "grad_norm": 0.8076246976852417, + "learning_rate": 0.00016829867471177673, + "loss": 3.2358, + "step": 66900 + }, + { + "epoch": 7.205898181035411, + "grad_norm": 0.8116209506988525, + "learning_rate": 0.0001679754336817153, + "loss": 3.2076, + "step": 66950 + }, + { + "epoch": 7.211279733075019, + "grad_norm": 0.8048698306083679, + "learning_rate": 0.00016765219265165392, + "loss": 3.2343, + "step": 67000 + }, + { + "epoch": 7.211279733075019, + "eval_accuracy": 0.3876256964514293, + "eval_loss": 3.3560845851898193, + "eval_runtime": 211.2944, + "eval_samples_per_second": 85.241, + "eval_steps_per_second": 5.329, + "step": 67000 + }, + { + "epoch": 7.216661285114627, + "grad_norm": 0.8519846796989441, + "learning_rate": 0.0001673289516215925, + "loss": 3.2344, + "step": 67050 + }, + { + "epoch": 7.222042837154235, + "grad_norm": 0.7423728108406067, + "learning_rate": 0.00016700571059153106, + "loss": 3.2322, + "step": 67100 + }, + { + "epoch": 7.2274243891938434, + "grad_norm": 0.7732436060905457, + "learning_rate": 0.00016668246956146968, + "loss": 3.2076, + "step": 67150 + }, + { + "epoch": 7.232805941233452, + "grad_norm": 0.7401405572891235, + "learning_rate": 0.00016635922853140825, + "loss": 3.2221, + "step": 67200 + }, + { + "epoch": 7.23818749327306, + "grad_norm": 0.7472262382507324, + "learning_rate": 0.0001660359875013468, + "loss": 3.2246, + "step": 67250 + }, + { + "epoch": 7.243569045312668, + "grad_norm": 0.7432873845100403, + "learning_rate": 0.0001657127464712854, + "loss": 3.209, + "step": 67300 + }, + { + "epoch": 7.248950597352277, + "grad_norm": 0.7662221789360046, + "learning_rate": 0.000165389505441224, + "loss": 3.2232, + "step": 67350 + }, + { + "epoch": 7.254332149391884, + "grad_norm": 0.804426372051239, + "learning_rate": 0.00016506626441116257, + "loss": 3.2239, + "step": 67400 + }, + { + "epoch": 7.259713701431493, + "grad_norm": 0.7635779976844788, + "learning_rate": 0.00016474302338110116, + "loss": 3.2356, + "step": 67450 + }, + { + "epoch": 7.265095253471101, + "grad_norm": 0.7849301695823669, + "learning_rate": 0.00016441978235103973, + "loss": 3.2353, + "step": 67500 + }, + { + "epoch": 7.270476805510709, + "grad_norm": 0.7454454302787781, + "learning_rate": 0.00016409654132097833, + "loss": 3.2238, + "step": 67550 + }, + { + "epoch": 7.275858357550318, + "grad_norm": 0.8164418339729309, + "learning_rate": 0.00016377330029091692, + "loss": 3.2453, + "step": 67600 + }, + { + "epoch": 7.281239909589925, + "grad_norm": 0.7791739702224731, + "learning_rate": 0.0001634500592608555, + "loss": 3.2339, + "step": 67650 + }, + { + "epoch": 7.286621461629534, + "grad_norm": 0.7832018733024597, + "learning_rate": 0.00016312681823079406, + "loss": 3.2339, + "step": 67700 + }, + { + "epoch": 7.2920030136691425, + "grad_norm": 0.8332245945930481, + "learning_rate": 0.00016280357720073268, + "loss": 3.2326, + "step": 67750 + }, + { + "epoch": 7.29738456570875, + "grad_norm": 0.7885503172874451, + "learning_rate": 0.00016248033617067125, + "loss": 3.2424, + "step": 67800 + }, + { + "epoch": 7.302766117748359, + "grad_norm": 0.7824217677116394, + "learning_rate": 0.00016215709514060984, + "loss": 3.218, + "step": 67850 + }, + { + "epoch": 7.308147669787967, + "grad_norm": 0.8223897814750671, + "learning_rate": 0.00016183385411054843, + "loss": 3.2276, + "step": 67900 + }, + { + "epoch": 7.313529221827575, + "grad_norm": 0.7712852358818054, + "learning_rate": 0.000161510613080487, + "loss": 3.2069, + "step": 67950 + }, + { + "epoch": 7.3189107738671835, + "grad_norm": 0.7743715643882751, + "learning_rate": 0.0001611873720504256, + "loss": 3.2277, + "step": 68000 + }, + { + "epoch": 7.3189107738671835, + "eval_accuracy": 0.3880603080026872, + "eval_loss": 3.3522820472717285, + "eval_runtime": 210.2908, + "eval_samples_per_second": 85.648, + "eval_steps_per_second": 5.354, + "step": 68000 + }, + { + "epoch": 7.324292325906791, + "grad_norm": 0.7747835516929626, + "learning_rate": 0.00016086413102036416, + "loss": 3.2336, + "step": 68050 + }, + { + "epoch": 7.3296738779464, + "grad_norm": 0.8009321093559265, + "learning_rate": 0.00016054088999030273, + "loss": 3.2293, + "step": 68100 + }, + { + "epoch": 7.335055429986008, + "grad_norm": 0.8049393892288208, + "learning_rate": 0.00016021764896024135, + "loss": 3.2359, + "step": 68150 + }, + { + "epoch": 7.340436982025616, + "grad_norm": 0.8045018315315247, + "learning_rate": 0.00015990087275078113, + "loss": 3.2419, + "step": 68200 + }, + { + "epoch": 7.3458185340652244, + "grad_norm": 0.8061736226081848, + "learning_rate": 0.00015957763172071976, + "loss": 3.217, + "step": 68250 + }, + { + "epoch": 7.351200086104833, + "grad_norm": 0.8088467717170715, + "learning_rate": 0.00015925439069065832, + "loss": 3.2275, + "step": 68300 + }, + { + "epoch": 7.356581638144441, + "grad_norm": 0.8118233680725098, + "learning_rate": 0.0001589311496605969, + "loss": 3.2308, + "step": 68350 + }, + { + "epoch": 7.361963190184049, + "grad_norm": 0.7720018625259399, + "learning_rate": 0.00015860790863053548, + "loss": 3.231, + "step": 68400 + }, + { + "epoch": 7.367344742223658, + "grad_norm": 0.7653403878211975, + "learning_rate": 0.00015828466760047408, + "loss": 3.2414, + "step": 68450 + }, + { + "epoch": 7.372726294263265, + "grad_norm": 0.7920792102813721, + "learning_rate": 0.00015796142657041265, + "loss": 3.2353, + "step": 68500 + }, + { + "epoch": 7.378107846302874, + "grad_norm": 0.8059729337692261, + "learning_rate": 0.00015763818554035124, + "loss": 3.2339, + "step": 68550 + }, + { + "epoch": 7.383489398342482, + "grad_norm": 0.7684776186943054, + "learning_rate": 0.0001573149445102898, + "loss": 3.2355, + "step": 68600 + }, + { + "epoch": 7.38887095038209, + "grad_norm": 0.8400054574012756, + "learning_rate": 0.00015699170348022843, + "loss": 3.2392, + "step": 68650 + }, + { + "epoch": 7.394252502421699, + "grad_norm": 0.7758475542068481, + "learning_rate": 0.000156668462450167, + "loss": 3.2384, + "step": 68700 + }, + { + "epoch": 7.399634054461306, + "grad_norm": 0.8178544640541077, + "learning_rate": 0.00015634522142010557, + "loss": 3.2373, + "step": 68750 + }, + { + "epoch": 7.405015606500915, + "grad_norm": 0.7898211479187012, + "learning_rate": 0.0001560219803900442, + "loss": 3.2331, + "step": 68800 + }, + { + "epoch": 7.4103971585405235, + "grad_norm": 0.7738881707191467, + "learning_rate": 0.00015569873935998276, + "loss": 3.2337, + "step": 68850 + }, + { + "epoch": 7.415778710580131, + "grad_norm": 0.7856907844543457, + "learning_rate": 0.00015537549832992132, + "loss": 3.2221, + "step": 68900 + }, + { + "epoch": 7.42116026261974, + "grad_norm": 0.7582386136054993, + "learning_rate": 0.00015505225729985992, + "loss": 3.2397, + "step": 68950 + }, + { + "epoch": 7.426541814659347, + "grad_norm": 0.8215800523757935, + "learning_rate": 0.0001547290162697985, + "loss": 3.236, + "step": 69000 + }, + { + "epoch": 7.426541814659347, + "eval_accuracy": 0.3885883610374656, + "eval_loss": 3.349855661392212, + "eval_runtime": 210.9678, + "eval_samples_per_second": 85.373, + "eval_steps_per_second": 5.337, + "step": 69000 + }, + { + "epoch": 7.431923366698956, + "grad_norm": 0.8055285811424255, + "learning_rate": 0.00015440577523973708, + "loss": 3.2277, + "step": 69050 + }, + { + "epoch": 7.4373049187385645, + "grad_norm": 0.7805681824684143, + "learning_rate": 0.00015408253420967567, + "loss": 3.2534, + "step": 69100 + }, + { + "epoch": 7.442686470778172, + "grad_norm": 0.8025108575820923, + "learning_rate": 0.00015375929317961424, + "loss": 3.2343, + "step": 69150 + }, + { + "epoch": 7.448068022817781, + "grad_norm": 0.8178415894508362, + "learning_rate": 0.00015343605214955284, + "loss": 3.2282, + "step": 69200 + }, + { + "epoch": 7.453449574857389, + "grad_norm": 0.7723139524459839, + "learning_rate": 0.00015311281111949143, + "loss": 3.2373, + "step": 69250 + }, + { + "epoch": 7.458831126896997, + "grad_norm": 0.8103027939796448, + "learning_rate": 0.00015278957008943, + "loss": 3.2253, + "step": 69300 + }, + { + "epoch": 7.4642126789366054, + "grad_norm": 0.7889986038208008, + "learning_rate": 0.00015246632905936857, + "loss": 3.2271, + "step": 69350 + }, + { + "epoch": 7.469594230976213, + "grad_norm": 0.7937176823616028, + "learning_rate": 0.0001521430880293072, + "loss": 3.2375, + "step": 69400 + }, + { + "epoch": 7.474975783015822, + "grad_norm": 0.8158497214317322, + "learning_rate": 0.00015181984699924576, + "loss": 3.2368, + "step": 69450 + }, + { + "epoch": 7.48035733505543, + "grad_norm": 0.8363879323005676, + "learning_rate": 0.00015149660596918432, + "loss": 3.2239, + "step": 69500 + }, + { + "epoch": 7.485738887095038, + "grad_norm": 0.7895720601081848, + "learning_rate": 0.00015117336493912295, + "loss": 3.2266, + "step": 69550 + }, + { + "epoch": 7.491120439134646, + "grad_norm": 0.7915844917297363, + "learning_rate": 0.0001508501239090615, + "loss": 3.2448, + "step": 69600 + }, + { + "epoch": 7.496501991174255, + "grad_norm": 0.7852685451507568, + "learning_rate": 0.0001505268828790001, + "loss": 3.2139, + "step": 69650 + }, + { + "epoch": 7.501883543213863, + "grad_norm": 0.7899736166000366, + "learning_rate": 0.00015020364184893867, + "loss": 3.2317, + "step": 69700 + }, + { + "epoch": 7.507265095253471, + "grad_norm": 0.7891963124275208, + "learning_rate": 0.00014988040081887724, + "loss": 3.2278, + "step": 69750 + }, + { + "epoch": 7.51264664729308, + "grad_norm": 0.7811875343322754, + "learning_rate": 0.00014955715978881584, + "loss": 3.2406, + "step": 69800 + }, + { + "epoch": 7.518028199332687, + "grad_norm": 0.7988965511322021, + "learning_rate": 0.00014923391875875443, + "loss": 3.2298, + "step": 69850 + }, + { + "epoch": 7.523409751372296, + "grad_norm": 0.8040280342102051, + "learning_rate": 0.00014891067772869303, + "loss": 3.2465, + "step": 69900 + }, + { + "epoch": 7.528791303411904, + "grad_norm": 0.8975268602371216, + "learning_rate": 0.0001485874366986316, + "loss": 3.2452, + "step": 69950 + }, + { + "epoch": 7.534172855451512, + "grad_norm": 0.8084391951560974, + "learning_rate": 0.0001482641956685702, + "loss": 3.2201, + "step": 70000 + }, + { + "epoch": 7.534172855451512, + "eval_accuracy": 0.38874221352661087, + "eval_loss": 3.3477556705474854, + "eval_runtime": 211.1497, + "eval_samples_per_second": 85.3, + "eval_steps_per_second": 5.333, + "step": 70000 + }, + { + "epoch": 7.539554407491121, + "grad_norm": 0.8021398186683655, + "learning_rate": 0.00014794095463850878, + "loss": 3.2476, + "step": 70050 + }, + { + "epoch": 7.544935959530728, + "grad_norm": 0.796826183795929, + "learning_rate": 0.00014761771360844735, + "loss": 3.2431, + "step": 70100 + }, + { + "epoch": 7.550317511570337, + "grad_norm": 0.8226954340934753, + "learning_rate": 0.00014729447257838595, + "loss": 3.2146, + "step": 70150 + }, + { + "epoch": 7.5556990636099455, + "grad_norm": 0.7751065492630005, + "learning_rate": 0.0001469712315483245, + "loss": 3.2377, + "step": 70200 + }, + { + "epoch": 7.561080615649553, + "grad_norm": 0.8223302960395813, + "learning_rate": 0.0001466479905182631, + "loss": 3.2459, + "step": 70250 + }, + { + "epoch": 7.566462167689162, + "grad_norm": 0.7794532179832458, + "learning_rate": 0.00014632474948820168, + "loss": 3.2225, + "step": 70300 + }, + { + "epoch": 7.57184371972877, + "grad_norm": 0.8701120615005493, + "learning_rate": 0.00014600150845814027, + "loss": 3.2331, + "step": 70350 + }, + { + "epoch": 7.577225271768378, + "grad_norm": 0.8439012169837952, + "learning_rate": 0.00014567826742807886, + "loss": 3.2266, + "step": 70400 + }, + { + "epoch": 7.5826068238079865, + "grad_norm": 0.82621169090271, + "learning_rate": 0.00014535502639801743, + "loss": 3.2267, + "step": 70450 + }, + { + "epoch": 7.587988375847594, + "grad_norm": 0.8370195627212524, + "learning_rate": 0.00014503178536795603, + "loss": 3.2329, + "step": 70500 + }, + { + "epoch": 7.593369927887203, + "grad_norm": 0.8157381415367126, + "learning_rate": 0.00014470854433789462, + "loss": 3.2428, + "step": 70550 + }, + { + "epoch": 7.598751479926811, + "grad_norm": 0.8180111050605774, + "learning_rate": 0.0001443853033078332, + "loss": 3.2468, + "step": 70600 + }, + { + "epoch": 7.604133031966419, + "grad_norm": 0.8646537065505981, + "learning_rate": 0.00014406206227777178, + "loss": 3.22, + "step": 70650 + }, + { + "epoch": 7.609514584006027, + "grad_norm": 0.8048348426818848, + "learning_rate": 0.0001437452860683116, + "loss": 3.2472, + "step": 70700 + }, + { + "epoch": 7.614896136045635, + "grad_norm": 0.7684217691421509, + "learning_rate": 0.00014342204503825018, + "loss": 3.2327, + "step": 70750 + }, + { + "epoch": 7.620277688085244, + "grad_norm": 0.8237805962562561, + "learning_rate": 0.00014309880400818875, + "loss": 3.2486, + "step": 70800 + }, + { + "epoch": 7.625659240124852, + "grad_norm": 0.8122904896736145, + "learning_rate": 0.00014277556297812735, + "loss": 3.2419, + "step": 70850 + }, + { + "epoch": 7.63104079216446, + "grad_norm": 0.8078117966651917, + "learning_rate": 0.00014245232194806591, + "loss": 3.2286, + "step": 70900 + }, + { + "epoch": 7.636422344204068, + "grad_norm": 0.8151229619979858, + "learning_rate": 0.0001421290809180045, + "loss": 3.2208, + "step": 70950 + }, + { + "epoch": 7.641803896243677, + "grad_norm": 0.8114475011825562, + "learning_rate": 0.0001418058398879431, + "loss": 3.2337, + "step": 71000 + }, + { + "epoch": 7.641803896243677, + "eval_accuracy": 0.38909826903997896, + "eval_loss": 3.3436226844787598, + "eval_runtime": 207.9913, + "eval_samples_per_second": 86.595, + "eval_steps_per_second": 5.414, + "step": 71000 + }, + { + "epoch": 7.647185448283285, + "grad_norm": 0.8878672122955322, + "learning_rate": 0.00014148259885788167, + "loss": 3.2354, + "step": 71050 + }, + { + "epoch": 7.652567000322893, + "grad_norm": 0.8463232517242432, + "learning_rate": 0.00014115935782782027, + "loss": 3.2492, + "step": 71100 + }, + { + "epoch": 7.657948552362502, + "grad_norm": 0.8358443975448608, + "learning_rate": 0.00014083611679775886, + "loss": 3.2434, + "step": 71150 + }, + { + "epoch": 7.663330104402109, + "grad_norm": 0.8259372115135193, + "learning_rate": 0.00014051287576769743, + "loss": 3.2452, + "step": 71200 + }, + { + "epoch": 7.668711656441718, + "grad_norm": 0.8233510851860046, + "learning_rate": 0.00014018963473763602, + "loss": 3.2524, + "step": 71250 + }, + { + "epoch": 7.674093208481326, + "grad_norm": 0.7944367527961731, + "learning_rate": 0.0001398663937075746, + "loss": 3.2322, + "step": 71300 + }, + { + "epoch": 7.679474760520934, + "grad_norm": 0.818215012550354, + "learning_rate": 0.00013954315267751319, + "loss": 3.242, + "step": 71350 + }, + { + "epoch": 7.684856312560543, + "grad_norm": 0.8027128577232361, + "learning_rate": 0.00013921991164745175, + "loss": 3.2194, + "step": 71400 + }, + { + "epoch": 7.69023786460015, + "grad_norm": 0.7849347591400146, + "learning_rate": 0.00013889667061739035, + "loss": 3.2435, + "step": 71450 + }, + { + "epoch": 7.695619416639759, + "grad_norm": 0.8004842400550842, + "learning_rate": 0.00013857342958732894, + "loss": 3.2417, + "step": 71500 + }, + { + "epoch": 7.7010009686793675, + "grad_norm": 0.7999834418296814, + "learning_rate": 0.0001382501885572675, + "loss": 3.2345, + "step": 71550 + }, + { + "epoch": 7.706382520718975, + "grad_norm": 0.813932478427887, + "learning_rate": 0.0001379269475272061, + "loss": 3.2388, + "step": 71600 + }, + { + "epoch": 7.711764072758584, + "grad_norm": 0.8163520693778992, + "learning_rate": 0.0001376037064971447, + "loss": 3.2463, + "step": 71650 + }, + { + "epoch": 7.717145624798192, + "grad_norm": 0.8218525648117065, + "learning_rate": 0.0001372804654670833, + "loss": 3.2376, + "step": 71700 + }, + { + "epoch": 7.7225271768378, + "grad_norm": 0.8164900541305542, + "learning_rate": 0.00013695722443702186, + "loss": 3.2415, + "step": 71750 + }, + { + "epoch": 7.727908728877408, + "grad_norm": 0.8270570039749146, + "learning_rate": 0.00013663398340696046, + "loss": 3.2334, + "step": 71800 + }, + { + "epoch": 7.733290280917016, + "grad_norm": 0.8093637824058533, + "learning_rate": 0.00013631074237689902, + "loss": 3.2408, + "step": 71850 + }, + { + "epoch": 7.738671832956625, + "grad_norm": 0.8747385740280151, + "learning_rate": 0.00013598750134683762, + "loss": 3.2449, + "step": 71900 + }, + { + "epoch": 7.744053384996233, + "grad_norm": 0.7810999751091003, + "learning_rate": 0.00013566426031677619, + "loss": 3.2159, + "step": 71950 + }, + { + "epoch": 7.749434937035841, + "grad_norm": 0.8121122717857361, + "learning_rate": 0.00013534748410731602, + "loss": 3.2174, + "step": 72000 + }, + { + "epoch": 7.749434937035841, + "eval_accuracy": 0.3896095895300339, + "eval_loss": 3.3376104831695557, + "eval_runtime": 189.9062, + "eval_samples_per_second": 94.842, + "eval_steps_per_second": 5.929, + "step": 72000 + }, + { + "epoch": 7.754816489075449, + "grad_norm": 0.8262868523597717, + "learning_rate": 0.0001350242430772546, + "loss": 3.2463, + "step": 72050 + }, + { + "epoch": 7.760198041115058, + "grad_norm": 0.8187814354896545, + "learning_rate": 0.00013470100204719318, + "loss": 3.2571, + "step": 72100 + }, + { + "epoch": 7.765579593154666, + "grad_norm": 0.8290274739265442, + "learning_rate": 0.00013437776101713178, + "loss": 3.2556, + "step": 72150 + }, + { + "epoch": 7.770961145194274, + "grad_norm": 0.8006277680397034, + "learning_rate": 0.00013405451998707034, + "loss": 3.2402, + "step": 72200 + }, + { + "epoch": 7.776342697233883, + "grad_norm": 0.8297838568687439, + "learning_rate": 0.00013373127895700894, + "loss": 3.2314, + "step": 72250 + }, + { + "epoch": 7.78172424927349, + "grad_norm": 0.8442043662071228, + "learning_rate": 0.00013340803792694753, + "loss": 3.2356, + "step": 72300 + }, + { + "epoch": 7.787105801313099, + "grad_norm": 0.8034444451332092, + "learning_rate": 0.0001330847968968861, + "loss": 3.2229, + "step": 72350 + }, + { + "epoch": 7.792487353352707, + "grad_norm": 0.8173365592956543, + "learning_rate": 0.00013276155586682467, + "loss": 3.2286, + "step": 72400 + }, + { + "epoch": 7.797868905392315, + "grad_norm": 0.777065098285675, + "learning_rate": 0.00013243831483676326, + "loss": 3.2472, + "step": 72450 + }, + { + "epoch": 7.803250457431924, + "grad_norm": 0.8149591088294983, + "learning_rate": 0.00013211507380670186, + "loss": 3.2358, + "step": 72500 + }, + { + "epoch": 7.808632009471531, + "grad_norm": 0.8245164752006531, + "learning_rate": 0.00013179183277664042, + "loss": 3.2379, + "step": 72550 + }, + { + "epoch": 7.81401356151114, + "grad_norm": 0.858371376991272, + "learning_rate": 0.00013146859174657902, + "loss": 3.2399, + "step": 72600 + }, + { + "epoch": 7.819395113550748, + "grad_norm": 0.8818589448928833, + "learning_rate": 0.00013114535071651761, + "loss": 3.2313, + "step": 72650 + }, + { + "epoch": 7.824776665590356, + "grad_norm": 0.7951632738113403, + "learning_rate": 0.00013082210968645618, + "loss": 3.2385, + "step": 72700 + }, + { + "epoch": 7.830158217629965, + "grad_norm": 0.8036244511604309, + "learning_rate": 0.00013049886865639478, + "loss": 3.2455, + "step": 72750 + }, + { + "epoch": 7.835539769669572, + "grad_norm": 0.7847592830657959, + "learning_rate": 0.00013017562762633337, + "loss": 3.2634, + "step": 72800 + }, + { + "epoch": 7.840921321709181, + "grad_norm": 0.8040270805358887, + "learning_rate": 0.00012985238659627194, + "loss": 3.2263, + "step": 72850 + }, + { + "epoch": 7.846302873748789, + "grad_norm": 0.8288953900337219, + "learning_rate": 0.00012952914556621053, + "loss": 3.252, + "step": 72900 + }, + { + "epoch": 7.851684425788397, + "grad_norm": 0.8158117532730103, + "learning_rate": 0.0001292059045361491, + "loss": 3.2296, + "step": 72950 + }, + { + "epoch": 7.857065977828006, + "grad_norm": 0.8312733173370361, + "learning_rate": 0.0001288826635060877, + "loss": 3.2182, + "step": 73000 + }, + { + "epoch": 7.857065977828006, + "eval_accuracy": 0.38989512931921033, + "eval_loss": 3.334813117980957, + "eval_runtime": 209.8317, + "eval_samples_per_second": 85.835, + "eval_steps_per_second": 5.366, + "step": 73000 + }, + { + "epoch": 7.862447529867614, + "grad_norm": 0.8196859359741211, + "learning_rate": 0.00012855942247602626, + "loss": 3.2287, + "step": 73050 + }, + { + "epoch": 7.867829081907222, + "grad_norm": 0.810441792011261, + "learning_rate": 0.00012823618144596486, + "loss": 3.2479, + "step": 73100 + }, + { + "epoch": 7.87321063394683, + "grad_norm": 0.7915072441101074, + "learning_rate": 0.00012791294041590345, + "loss": 3.2385, + "step": 73150 + }, + { + "epoch": 7.878592185986438, + "grad_norm": 0.8137247562408447, + "learning_rate": 0.00012758969938584202, + "loss": 3.2249, + "step": 73200 + }, + { + "epoch": 7.883973738026047, + "grad_norm": 0.7836380004882812, + "learning_rate": 0.00012726645835578061, + "loss": 3.2342, + "step": 73250 + }, + { + "epoch": 7.889355290065655, + "grad_norm": 0.8190814852714539, + "learning_rate": 0.0001269432173257192, + "loss": 3.223, + "step": 73300 + }, + { + "epoch": 7.894736842105263, + "grad_norm": 0.8309457302093506, + "learning_rate": 0.00012661997629565778, + "loss": 3.2194, + "step": 73350 + }, + { + "epoch": 7.900118394144871, + "grad_norm": 0.7745057344436646, + "learning_rate": 0.00012629673526559637, + "loss": 3.2503, + "step": 73400 + }, + { + "epoch": 7.90549994618448, + "grad_norm": 0.8173632621765137, + "learning_rate": 0.00012597349423553497, + "loss": 3.2438, + "step": 73450 + }, + { + "epoch": 7.910881498224088, + "grad_norm": 0.8117119669914246, + "learning_rate": 0.00012565025320547353, + "loss": 3.2514, + "step": 73500 + }, + { + "epoch": 7.916263050263696, + "grad_norm": 0.7812601923942566, + "learning_rate": 0.00012532701217541213, + "loss": 3.2333, + "step": 73550 + }, + { + "epoch": 7.921644602303305, + "grad_norm": 0.882896363735199, + "learning_rate": 0.0001250037711453507, + "loss": 3.223, + "step": 73600 + }, + { + "epoch": 7.927026154342912, + "grad_norm": 0.7868736982345581, + "learning_rate": 0.0001246805301152893, + "loss": 3.2356, + "step": 73650 + }, + { + "epoch": 7.932407706382521, + "grad_norm": 0.8104776740074158, + "learning_rate": 0.00012435728908522786, + "loss": 3.2451, + "step": 73700 + }, + { + "epoch": 7.937789258422129, + "grad_norm": 0.7862167954444885, + "learning_rate": 0.00012403404805516645, + "loss": 3.2351, + "step": 73750 + }, + { + "epoch": 7.943170810461737, + "grad_norm": 0.832856297492981, + "learning_rate": 0.00012371080702510505, + "loss": 3.2329, + "step": 73800 + }, + { + "epoch": 7.948552362501346, + "grad_norm": 0.8277978301048279, + "learning_rate": 0.00012338756599504362, + "loss": 3.2403, + "step": 73850 + }, + { + "epoch": 7.953933914540953, + "grad_norm": 0.8304550051689148, + "learning_rate": 0.0001230643249649822, + "loss": 3.2312, + "step": 73900 + }, + { + "epoch": 7.959315466580562, + "grad_norm": 0.844760000705719, + "learning_rate": 0.0001227410839349208, + "loss": 3.2351, + "step": 73950 + }, + { + "epoch": 7.96469701862017, + "grad_norm": 0.8154044151306152, + "learning_rate": 0.0001224178429048594, + "loss": 3.2393, + "step": 74000 + }, + { + "epoch": 7.96469701862017, + "eval_accuracy": 0.3899715122993439, + "eval_loss": 3.331357002258301, + "eval_runtime": 211.3692, + "eval_samples_per_second": 85.211, + "eval_steps_per_second": 5.327, + "step": 74000 + }, + { + "epoch": 7.970078570659778, + "grad_norm": 0.8356589078903198, + "learning_rate": 0.00012209460187479797, + "loss": 3.2136, + "step": 74050 + }, + { + "epoch": 7.975460122699387, + "grad_norm": 0.8147774338722229, + "learning_rate": 0.00012177136084473655, + "loss": 3.2281, + "step": 74100 + }, + { + "epoch": 7.980841674738995, + "grad_norm": 0.8949999809265137, + "learning_rate": 0.00012144811981467514, + "loss": 3.2365, + "step": 74150 + }, + { + "epoch": 7.986223226778603, + "grad_norm": 0.7900296449661255, + "learning_rate": 0.00012112487878461371, + "loss": 3.2091, + "step": 74200 + }, + { + "epoch": 7.991604778818211, + "grad_norm": 0.844578742980957, + "learning_rate": 0.0001208016377545523, + "loss": 3.247, + "step": 74250 + }, + { + "epoch": 7.996986330857819, + "grad_norm": 0.7930771708488464, + "learning_rate": 0.00012047839672449089, + "loss": 3.2137, + "step": 74300 + }, + { + "epoch": 8.002367882897428, + "grad_norm": 0.8258639574050903, + "learning_rate": 0.00012015515569442947, + "loss": 3.1959, + "step": 74350 + }, + { + "epoch": 8.007749434937036, + "grad_norm": 0.8070922493934631, + "learning_rate": 0.00011983191466436805, + "loss": 3.1728, + "step": 74400 + }, + { + "epoch": 8.013130986976645, + "grad_norm": 0.8373987078666687, + "learning_rate": 0.00011950867363430664, + "loss": 3.1395, + "step": 74450 + }, + { + "epoch": 8.018512539016251, + "grad_norm": 0.8283763527870178, + "learning_rate": 0.00011918543260424522, + "loss": 3.1552, + "step": 74500 + }, + { + "epoch": 8.02389409105586, + "grad_norm": 0.8418523669242859, + "learning_rate": 0.0001188621915741838, + "loss": 3.1531, + "step": 74550 + }, + { + "epoch": 8.029275643095469, + "grad_norm": 0.7724992632865906, + "learning_rate": 0.00011853895054412239, + "loss": 3.1875, + "step": 74600 + }, + { + "epoch": 8.034657195135077, + "grad_norm": 0.8510676622390747, + "learning_rate": 0.00011821570951406098, + "loss": 3.1649, + "step": 74650 + }, + { + "epoch": 8.040038747174686, + "grad_norm": 0.8318743109703064, + "learning_rate": 0.00011789246848399955, + "loss": 3.164, + "step": 74700 + }, + { + "epoch": 8.045420299214294, + "grad_norm": 0.8723147511482239, + "learning_rate": 0.00011756922745393814, + "loss": 3.1604, + "step": 74750 + }, + { + "epoch": 8.050801851253901, + "grad_norm": 0.8099079132080078, + "learning_rate": 0.00011724598642387674, + "loss": 3.1651, + "step": 74800 + }, + { + "epoch": 8.05618340329351, + "grad_norm": 0.8209378123283386, + "learning_rate": 0.0001169227453938153, + "loss": 3.1699, + "step": 74850 + }, + { + "epoch": 8.061564955333118, + "grad_norm": 0.8424363136291504, + "learning_rate": 0.0001165995043637539, + "loss": 3.1525, + "step": 74900 + }, + { + "epoch": 8.066946507372727, + "grad_norm": 0.822147786617279, + "learning_rate": 0.00011627626333369248, + "loss": 3.1864, + "step": 74950 + }, + { + "epoch": 8.072328059412335, + "grad_norm": 0.8202683329582214, + "learning_rate": 0.00011595302230363108, + "loss": 3.1517, + "step": 75000 + }, + { + "epoch": 8.072328059412335, + "eval_accuracy": 0.3900216012806264, + "eval_loss": 3.336542844772339, + "eval_runtime": 191.5515, + "eval_samples_per_second": 94.027, + "eval_steps_per_second": 5.878, + "step": 75000 + }, + { + "epoch": 8.077709611451942, + "grad_norm": 0.8051620721817017, + "learning_rate": 0.00011562978127356964, + "loss": 3.1685, + "step": 75050 + }, + { + "epoch": 8.08309116349155, + "grad_norm": 0.9138336777687073, + "learning_rate": 0.00011530654024350824, + "loss": 3.1629, + "step": 75100 + }, + { + "epoch": 8.088472715531159, + "grad_norm": 0.8304498195648193, + "learning_rate": 0.00011498329921344682, + "loss": 3.1683, + "step": 75150 + }, + { + "epoch": 8.093854267570768, + "grad_norm": 0.8430973887443542, + "learning_rate": 0.0001146600581833854, + "loss": 3.1618, + "step": 75200 + }, + { + "epoch": 8.099235819610376, + "grad_norm": 0.8061507344245911, + "learning_rate": 0.00011433681715332398, + "loss": 3.1781, + "step": 75250 + }, + { + "epoch": 8.104617371649983, + "grad_norm": 0.8056333065032959, + "learning_rate": 0.00011401357612326258, + "loss": 3.1757, + "step": 75300 + }, + { + "epoch": 8.109998923689592, + "grad_norm": 0.8301107883453369, + "learning_rate": 0.00011369033509320114, + "loss": 3.1759, + "step": 75350 + }, + { + "epoch": 8.1153804757292, + "grad_norm": 0.8385125398635864, + "learning_rate": 0.00011336709406313974, + "loss": 3.163, + "step": 75400 + }, + { + "epoch": 8.120762027768809, + "grad_norm": 0.8474127054214478, + "learning_rate": 0.00011304385303307832, + "loss": 3.1968, + "step": 75450 + }, + { + "epoch": 8.126143579808417, + "grad_norm": 0.8145176768302917, + "learning_rate": 0.00011272061200301691, + "loss": 3.1802, + "step": 75500 + }, + { + "epoch": 8.131525131848026, + "grad_norm": 0.823329508304596, + "learning_rate": 0.00011239737097295548, + "loss": 3.1642, + "step": 75550 + }, + { + "epoch": 8.136906683887632, + "grad_norm": 0.8719600439071655, + "learning_rate": 0.00011207412994289408, + "loss": 3.1745, + "step": 75600 + }, + { + "epoch": 8.142288235927241, + "grad_norm": 0.8388107419013977, + "learning_rate": 0.00011175088891283267, + "loss": 3.1866, + "step": 75650 + }, + { + "epoch": 8.14766978796685, + "grad_norm": 0.8055160641670227, + "learning_rate": 0.00011142764788277124, + "loss": 3.1704, + "step": 75700 + }, + { + "epoch": 8.153051340006458, + "grad_norm": 0.8534181714057922, + "learning_rate": 0.00011110440685270983, + "loss": 3.1663, + "step": 75750 + }, + { + "epoch": 8.158432892046067, + "grad_norm": 0.8355813026428223, + "learning_rate": 0.00011078116582264841, + "loss": 3.1759, + "step": 75800 + }, + { + "epoch": 8.163814444085673, + "grad_norm": 0.8218345046043396, + "learning_rate": 0.000110457924792587, + "loss": 3.1865, + "step": 75850 + }, + { + "epoch": 8.169195996125282, + "grad_norm": 0.8139203190803528, + "learning_rate": 0.00011013468376252558, + "loss": 3.1649, + "step": 75900 + }, + { + "epoch": 8.17457754816489, + "grad_norm": 0.8463836908340454, + "learning_rate": 0.00010981144273246417, + "loss": 3.1948, + "step": 75950 + }, + { + "epoch": 8.1799591002045, + "grad_norm": 0.7959445714950562, + "learning_rate": 0.00010948820170240275, + "loss": 3.1824, + "step": 76000 + }, + { + "epoch": 8.1799591002045, + "eval_accuracy": 0.3903759183477894, + "eval_loss": 3.334186553955078, + "eval_runtime": 221.1785, + "eval_samples_per_second": 81.432, + "eval_steps_per_second": 5.091, + "step": 76000 + }, + { + "epoch": 8.185340652244108, + "grad_norm": 0.8416066765785217, + "learning_rate": 0.00010916496067234133, + "loss": 3.1709, + "step": 76050 + }, + { + "epoch": 8.190722204283716, + "grad_norm": 0.8329446315765381, + "learning_rate": 0.00010884171964227991, + "loss": 3.1742, + "step": 76100 + }, + { + "epoch": 8.196103756323323, + "grad_norm": 0.8333694934844971, + "learning_rate": 0.00010851847861221851, + "loss": 3.183, + "step": 76150 + }, + { + "epoch": 8.201485308362932, + "grad_norm": 0.8373638987541199, + "learning_rate": 0.00010819523758215708, + "loss": 3.1723, + "step": 76200 + }, + { + "epoch": 8.20686686040254, + "grad_norm": 0.8664990067481995, + "learning_rate": 0.00010787199655209567, + "loss": 3.182, + "step": 76250 + }, + { + "epoch": 8.212248412442149, + "grad_norm": 0.8091091513633728, + "learning_rate": 0.00010754875552203425, + "loss": 3.1728, + "step": 76300 + }, + { + "epoch": 8.217629964481757, + "grad_norm": 0.8922072052955627, + "learning_rate": 0.00010723197931257406, + "loss": 3.1758, + "step": 76350 + }, + { + "epoch": 8.223011516521364, + "grad_norm": 0.8110160231590271, + "learning_rate": 0.00010690873828251265, + "loss": 3.1921, + "step": 76400 + }, + { + "epoch": 8.228393068560973, + "grad_norm": 0.8317387104034424, + "learning_rate": 0.00010658549725245123, + "loss": 3.1855, + "step": 76450 + }, + { + "epoch": 8.233774620600581, + "grad_norm": 0.8507430553436279, + "learning_rate": 0.00010626225622238982, + "loss": 3.1745, + "step": 76500 + }, + { + "epoch": 8.23915617264019, + "grad_norm": 0.8063206672668457, + "learning_rate": 0.0001059390151923284, + "loss": 3.1763, + "step": 76550 + }, + { + "epoch": 8.244537724679798, + "grad_norm": 0.8336623907089233, + "learning_rate": 0.00010561577416226699, + "loss": 3.1586, + "step": 76600 + }, + { + "epoch": 8.249919276719407, + "grad_norm": 0.844421923160553, + "learning_rate": 0.00010529253313220556, + "loss": 3.1821, + "step": 76650 + }, + { + "epoch": 8.255300828759013, + "grad_norm": 0.8457732200622559, + "learning_rate": 0.00010496929210214415, + "loss": 3.1778, + "step": 76700 + }, + { + "epoch": 8.260682380798622, + "grad_norm": 0.8985422849655151, + "learning_rate": 0.00010464605107208275, + "loss": 3.1718, + "step": 76750 + }, + { + "epoch": 8.26606393283823, + "grad_norm": 0.8263887166976929, + "learning_rate": 0.00010432281004202133, + "loss": 3.1938, + "step": 76800 + }, + { + "epoch": 8.27144548487784, + "grad_norm": 0.8684414625167847, + "learning_rate": 0.00010399956901195991, + "loss": 3.1709, + "step": 76850 + }, + { + "epoch": 8.276827036917448, + "grad_norm": 0.8270609378814697, + "learning_rate": 0.00010367632798189849, + "loss": 3.1768, + "step": 76900 + }, + { + "epoch": 8.282208588957054, + "grad_norm": 0.8352575302124023, + "learning_rate": 0.00010335308695183709, + "loss": 3.1952, + "step": 76950 + }, + { + "epoch": 8.287590140996663, + "grad_norm": 0.804658055305481, + "learning_rate": 0.00010302984592177565, + "loss": 3.1814, + "step": 77000 + }, + { + "epoch": 8.287590140996663, + "eval_accuracy": 0.3905953971811747, + "eval_loss": 3.3317322731018066, + "eval_runtime": 190.2074, + "eval_samples_per_second": 94.691, + "eval_steps_per_second": 5.92, + "step": 77000 + }, + { + "epoch": 8.292971693036272, + "grad_norm": 0.7943740487098694, + "learning_rate": 0.00010270660489171425, + "loss": 3.1838, + "step": 77050 + }, + { + "epoch": 8.29835324507588, + "grad_norm": 0.8035009503364563, + "learning_rate": 0.00010238336386165283, + "loss": 3.1595, + "step": 77100 + }, + { + "epoch": 8.303734797115489, + "grad_norm": 0.8640129566192627, + "learning_rate": 0.00010206012283159141, + "loss": 3.1757, + "step": 77150 + }, + { + "epoch": 8.309116349155097, + "grad_norm": 0.858672559261322, + "learning_rate": 0.00010173688180152999, + "loss": 3.1889, + "step": 77200 + }, + { + "epoch": 8.314497901194704, + "grad_norm": 0.8206643462181091, + "learning_rate": 0.00010141364077146859, + "loss": 3.1814, + "step": 77250 + }, + { + "epoch": 8.319879453234313, + "grad_norm": 0.8095281720161438, + "learning_rate": 0.00010109039974140718, + "loss": 3.1806, + "step": 77300 + }, + { + "epoch": 8.325261005273921, + "grad_norm": 0.8287177085876465, + "learning_rate": 0.00010076715871134575, + "loss": 3.1703, + "step": 77350 + }, + { + "epoch": 8.33064255731353, + "grad_norm": 0.8322443962097168, + "learning_rate": 0.00010044391768128433, + "loss": 3.187, + "step": 77400 + }, + { + "epoch": 8.336024109353138, + "grad_norm": 0.8651958107948303, + "learning_rate": 0.00010012067665122292, + "loss": 3.1705, + "step": 77450 + }, + { + "epoch": 8.341405661392745, + "grad_norm": 0.842995822429657, + "learning_rate": 9.979743562116149e-05, + "loss": 3.1877, + "step": 77500 + }, + { + "epoch": 8.346787213432354, + "grad_norm": 0.8423287868499756, + "learning_rate": 9.947419459110009e-05, + "loss": 3.2024, + "step": 77550 + }, + { + "epoch": 8.352168765471962, + "grad_norm": 0.830844521522522, + "learning_rate": 9.915095356103868e-05, + "loss": 3.1762, + "step": 77600 + }, + { + "epoch": 8.35755031751157, + "grad_norm": 0.8280004262924194, + "learning_rate": 9.882771253097725e-05, + "loss": 3.1796, + "step": 77650 + }, + { + "epoch": 8.36293186955118, + "grad_norm": 0.8298020362854004, + "learning_rate": 9.850447150091584e-05, + "loss": 3.1803, + "step": 77700 + }, + { + "epoch": 8.368313421590786, + "grad_norm": 0.883962869644165, + "learning_rate": 9.818123047085442e-05, + "loss": 3.1974, + "step": 77750 + }, + { + "epoch": 8.373694973630395, + "grad_norm": 0.7966394424438477, + "learning_rate": 9.785798944079302e-05, + "loss": 3.1821, + "step": 77800 + }, + { + "epoch": 8.379076525670003, + "grad_norm": 0.8396945595741272, + "learning_rate": 9.753474841073159e-05, + "loss": 3.1992, + "step": 77850 + }, + { + "epoch": 8.384458077709612, + "grad_norm": 0.8430613279342651, + "learning_rate": 9.721150738067018e-05, + "loss": 3.186, + "step": 77900 + }, + { + "epoch": 8.38983962974922, + "grad_norm": 0.7840559482574463, + "learning_rate": 9.688826635060876e-05, + "loss": 3.1821, + "step": 77950 + }, + { + "epoch": 8.395221181788829, + "grad_norm": 0.8649613261222839, + "learning_rate": 9.656502532054734e-05, + "loss": 3.1945, + "step": 78000 + }, + { + "epoch": 8.395221181788829, + "eval_accuracy": 0.39121287154262435, + "eval_loss": 3.3277697563171387, + "eval_runtime": 189.8853, + "eval_samples_per_second": 94.852, + "eval_steps_per_second": 5.93, + "step": 78000 + }, + { + "epoch": 8.400602733828435, + "grad_norm": 0.8213173151016235, + "learning_rate": 9.624178429048592e-05, + "loss": 3.1755, + "step": 78050 + }, + { + "epoch": 8.405984285868044, + "grad_norm": 0.8209165930747986, + "learning_rate": 9.591854326042452e-05, + "loss": 3.18, + "step": 78100 + }, + { + "epoch": 8.411365837907653, + "grad_norm": 0.8557418584823608, + "learning_rate": 9.559530223036309e-05, + "loss": 3.169, + "step": 78150 + }, + { + "epoch": 8.416747389947261, + "grad_norm": 0.8438737988471985, + "learning_rate": 9.527206120030168e-05, + "loss": 3.2073, + "step": 78200 + }, + { + "epoch": 8.42212894198687, + "grad_norm": 0.8334610462188721, + "learning_rate": 9.494882017024028e-05, + "loss": 3.1781, + "step": 78250 + }, + { + "epoch": 8.427510494026476, + "grad_norm": 0.8696321845054626, + "learning_rate": 9.462557914017886e-05, + "loss": 3.1714, + "step": 78300 + }, + { + "epoch": 8.432892046066085, + "grad_norm": 0.9134912490844727, + "learning_rate": 9.430233811011742e-05, + "loss": 3.177, + "step": 78350 + }, + { + "epoch": 8.438273598105694, + "grad_norm": 0.8383008241653442, + "learning_rate": 9.397909708005602e-05, + "loss": 3.173, + "step": 78400 + }, + { + "epoch": 8.443655150145302, + "grad_norm": 0.86134934425354, + "learning_rate": 9.365585604999461e-05, + "loss": 3.1812, + "step": 78450 + }, + { + "epoch": 8.44903670218491, + "grad_norm": 0.8300356268882751, + "learning_rate": 9.333261501993318e-05, + "loss": 3.1625, + "step": 78500 + }, + { + "epoch": 8.45441825422452, + "grad_norm": 0.832892119884491, + "learning_rate": 9.300937398987178e-05, + "loss": 3.1952, + "step": 78550 + }, + { + "epoch": 8.459799806264126, + "grad_norm": 0.9564111828804016, + "learning_rate": 9.268613295981036e-05, + "loss": 3.1878, + "step": 78600 + }, + { + "epoch": 8.465181358303735, + "grad_norm": 0.8386510014533997, + "learning_rate": 9.236289192974894e-05, + "loss": 3.202, + "step": 78650 + }, + { + "epoch": 8.470562910343343, + "grad_norm": 0.8402494192123413, + "learning_rate": 9.203965089968752e-05, + "loss": 3.1691, + "step": 78700 + }, + { + "epoch": 8.475944462382952, + "grad_norm": 0.8221197128295898, + "learning_rate": 9.171640986962611e-05, + "loss": 3.181, + "step": 78750 + }, + { + "epoch": 8.48132601442256, + "grad_norm": 0.8528578877449036, + "learning_rate": 9.13931688395647e-05, + "loss": 3.1747, + "step": 78800 + }, + { + "epoch": 8.486707566462167, + "grad_norm": 0.846588134765625, + "learning_rate": 9.106992780950328e-05, + "loss": 3.1856, + "step": 78850 + }, + { + "epoch": 8.492089118501776, + "grad_norm": 0.8404663801193237, + "learning_rate": 9.074668677944186e-05, + "loss": 3.1811, + "step": 78900 + }, + { + "epoch": 8.497470670541384, + "grad_norm": 0.8380141854286194, + "learning_rate": 9.042344574938045e-05, + "loss": 3.1771, + "step": 78950 + }, + { + "epoch": 8.502852222580993, + "grad_norm": 0.8275344371795654, + "learning_rate": 9.010020471931902e-05, + "loss": 3.1847, + "step": 79000 + }, + { + "epoch": 8.502852222580993, + "eval_accuracy": 0.3914904796709904, + "eval_loss": 3.3259856700897217, + "eval_runtime": 189.3897, + "eval_samples_per_second": 95.1, + "eval_steps_per_second": 5.945, + "step": 79000 + }, + { + "epoch": 8.508233774620601, + "grad_norm": 0.8297053575515747, + "learning_rate": 8.977696368925761e-05, + "loss": 3.1785, + "step": 79050 + }, + { + "epoch": 8.513615326660208, + "grad_norm": 0.8653829097747803, + "learning_rate": 8.945372265919621e-05, + "loss": 3.2013, + "step": 79100 + }, + { + "epoch": 8.518996878699816, + "grad_norm": 0.8570589423179626, + "learning_rate": 8.913048162913478e-05, + "loss": 3.1734, + "step": 79150 + }, + { + "epoch": 8.524378430739425, + "grad_norm": 0.8229767680168152, + "learning_rate": 8.880724059907337e-05, + "loss": 3.1873, + "step": 79200 + }, + { + "epoch": 8.529759982779034, + "grad_norm": 0.8804620504379272, + "learning_rate": 8.848399956901195e-05, + "loss": 3.1884, + "step": 79250 + }, + { + "epoch": 8.535141534818642, + "grad_norm": 0.8399264216423035, + "learning_rate": 8.816075853895055e-05, + "loss": 3.1759, + "step": 79300 + }, + { + "epoch": 8.54052308685825, + "grad_norm": 0.8268064260482788, + "learning_rate": 8.783751750888911e-05, + "loss": 3.1752, + "step": 79350 + }, + { + "epoch": 8.545904638897857, + "grad_norm": 0.8142411112785339, + "learning_rate": 8.751427647882771e-05, + "loss": 3.1912, + "step": 79400 + }, + { + "epoch": 8.551286190937466, + "grad_norm": 0.8386333584785461, + "learning_rate": 8.719103544876629e-05, + "loss": 3.1949, + "step": 79450 + }, + { + "epoch": 8.556667742977075, + "grad_norm": 0.8724285364151001, + "learning_rate": 8.686779441870487e-05, + "loss": 3.1779, + "step": 79500 + }, + { + "epoch": 8.562049295016683, + "grad_norm": 0.8269312977790833, + "learning_rate": 8.654455338864345e-05, + "loss": 3.1914, + "step": 79550 + }, + { + "epoch": 8.567430847056292, + "grad_norm": 0.8587058186531067, + "learning_rate": 8.622131235858205e-05, + "loss": 3.1748, + "step": 79600 + }, + { + "epoch": 8.572812399095898, + "grad_norm": 0.8489319086074829, + "learning_rate": 8.589807132852061e-05, + "loss": 3.1859, + "step": 79650 + }, + { + "epoch": 8.578193951135507, + "grad_norm": 0.8191342949867249, + "learning_rate": 8.557483029845921e-05, + "loss": 3.1851, + "step": 79700 + }, + { + "epoch": 8.583575503175116, + "grad_norm": 0.859576404094696, + "learning_rate": 8.525158926839779e-05, + "loss": 3.1702, + "step": 79750 + }, + { + "epoch": 8.588957055214724, + "grad_norm": 0.8672488927841187, + "learning_rate": 8.492834823833639e-05, + "loss": 3.196, + "step": 79800 + }, + { + "epoch": 8.594338607254333, + "grad_norm": 0.8281680941581726, + "learning_rate": 8.460510720827495e-05, + "loss": 3.2033, + "step": 79850 + }, + { + "epoch": 8.599720159293941, + "grad_norm": 0.8545637726783752, + "learning_rate": 8.428186617821355e-05, + "loss": 3.1826, + "step": 79900 + }, + { + "epoch": 8.605101711333548, + "grad_norm": 0.8453432321548462, + "learning_rate": 8.395862514815214e-05, + "loss": 3.1924, + "step": 79950 + }, + { + "epoch": 8.610483263373157, + "grad_norm": 0.8370970487594604, + "learning_rate": 8.363538411809071e-05, + "loss": 3.1648, + "step": 80000 + }, + { + "epoch": 8.610483263373157, + "eval_accuracy": 0.39175287639506234, + "eval_loss": 3.322192668914795, + "eval_runtime": 189.7221, + "eval_samples_per_second": 94.934, + "eval_steps_per_second": 5.935, + "step": 80000 + }, + { + "epoch": 8.615864815412765, + "grad_norm": 0.8636527061462402, + "learning_rate": 8.33121430880293e-05, + "loss": 3.1804, + "step": 80050 + }, + { + "epoch": 8.621246367452374, + "grad_norm": 0.8623721599578857, + "learning_rate": 8.298890205796789e-05, + "loss": 3.1682, + "step": 80100 + }, + { + "epoch": 8.626627919491982, + "grad_norm": 0.8467229008674622, + "learning_rate": 8.266566102790647e-05, + "loss": 3.1673, + "step": 80150 + }, + { + "epoch": 8.632009471531589, + "grad_norm": 0.8296520113945007, + "learning_rate": 8.234241999784505e-05, + "loss": 3.1888, + "step": 80200 + }, + { + "epoch": 8.637391023571197, + "grad_norm": 0.8253055214881897, + "learning_rate": 8.201917896778364e-05, + "loss": 3.1978, + "step": 80250 + }, + { + "epoch": 8.642772575610806, + "grad_norm": 0.8557524085044861, + "learning_rate": 8.169593793772222e-05, + "loss": 3.1835, + "step": 80300 + }, + { + "epoch": 8.648154127650415, + "grad_norm": 0.8345833420753479, + "learning_rate": 8.137916172826203e-05, + "loss": 3.171, + "step": 80350 + }, + { + "epoch": 8.653535679690023, + "grad_norm": 0.8740684986114502, + "learning_rate": 8.105592069820062e-05, + "loss": 3.1772, + "step": 80400 + }, + { + "epoch": 8.658917231729632, + "grad_norm": 0.8588552474975586, + "learning_rate": 8.073267966813919e-05, + "loss": 3.1802, + "step": 80450 + }, + { + "epoch": 8.664298783769238, + "grad_norm": 0.8434314727783203, + "learning_rate": 8.040943863807779e-05, + "loss": 3.1869, + "step": 80500 + }, + { + "epoch": 8.669680335808847, + "grad_norm": 0.8790923953056335, + "learning_rate": 8.008619760801637e-05, + "loss": 3.1811, + "step": 80550 + }, + { + "epoch": 8.675061887848456, + "grad_norm": 0.8365694880485535, + "learning_rate": 7.976295657795496e-05, + "loss": 3.1876, + "step": 80600 + }, + { + "epoch": 8.680443439888064, + "grad_norm": 0.8803290724754333, + "learning_rate": 7.943971554789353e-05, + "loss": 3.1946, + "step": 80650 + }, + { + "epoch": 8.685824991927673, + "grad_norm": 0.8246529698371887, + "learning_rate": 7.911647451783212e-05, + "loss": 3.1868, + "step": 80700 + }, + { + "epoch": 8.69120654396728, + "grad_norm": 0.8908007144927979, + "learning_rate": 7.879323348777072e-05, + "loss": 3.1883, + "step": 80750 + }, + { + "epoch": 8.696588096006888, + "grad_norm": 0.8227264285087585, + "learning_rate": 7.846999245770929e-05, + "loss": 3.1765, + "step": 80800 + }, + { + "epoch": 8.701969648046497, + "grad_norm": 0.8562191724777222, + "learning_rate": 7.814675142764787e-05, + "loss": 3.2047, + "step": 80850 + }, + { + "epoch": 8.707351200086105, + "grad_norm": 0.840075671672821, + "learning_rate": 7.782351039758646e-05, + "loss": 3.1723, + "step": 80900 + }, + { + "epoch": 8.712732752125714, + "grad_norm": 0.8150270581245422, + "learning_rate": 7.750026936752503e-05, + "loss": 3.1816, + "step": 80950 + }, + { + "epoch": 8.718114304165322, + "grad_norm": 0.835625410079956, + "learning_rate": 7.717702833746362e-05, + "loss": 3.179, + "step": 81000 + }, + { + "epoch": 8.718114304165322, + "eval_accuracy": 0.3919582303530317, + "eval_loss": 3.3187644481658936, + "eval_runtime": 189.4444, + "eval_samples_per_second": 95.073, + "eval_steps_per_second": 5.944, + "step": 81000 + }, + { + "epoch": 8.723495856204929, + "grad_norm": 0.8230716586112976, + "learning_rate": 7.685378730740222e-05, + "loss": 3.1832, + "step": 81050 + }, + { + "epoch": 8.728877408244538, + "grad_norm": 0.8545243740081787, + "learning_rate": 7.65305462773408e-05, + "loss": 3.1767, + "step": 81100 + }, + { + "epoch": 8.734258960284146, + "grad_norm": 0.8454192280769348, + "learning_rate": 7.620730524727938e-05, + "loss": 3.1891, + "step": 81150 + }, + { + "epoch": 8.739640512323755, + "grad_norm": 0.8239614367485046, + "learning_rate": 7.588406421721796e-05, + "loss": 3.1903, + "step": 81200 + }, + { + "epoch": 8.745022064363363, + "grad_norm": 0.8292789459228516, + "learning_rate": 7.556082318715656e-05, + "loss": 3.1815, + "step": 81250 + }, + { + "epoch": 8.75040361640297, + "grad_norm": 0.897819995880127, + "learning_rate": 7.523758215709513e-05, + "loss": 3.2009, + "step": 81300 + }, + { + "epoch": 8.755785168442578, + "grad_norm": 0.8491601943969727, + "learning_rate": 7.491434112703372e-05, + "loss": 3.1827, + "step": 81350 + }, + { + "epoch": 8.761166720482187, + "grad_norm": 0.8508619666099548, + "learning_rate": 7.45911000969723e-05, + "loss": 3.1863, + "step": 81400 + }, + { + "epoch": 8.766548272521796, + "grad_norm": 0.835036039352417, + "learning_rate": 7.426785906691088e-05, + "loss": 3.1758, + "step": 81450 + }, + { + "epoch": 8.771929824561404, + "grad_norm": 0.8110843300819397, + "learning_rate": 7.394461803684946e-05, + "loss": 3.2087, + "step": 81500 + }, + { + "epoch": 8.777311376601011, + "grad_norm": 0.9202770590782166, + "learning_rate": 7.362137700678806e-05, + "loss": 3.171, + "step": 81550 + }, + { + "epoch": 8.78269292864062, + "grad_norm": 0.8067617416381836, + "learning_rate": 7.329813597672664e-05, + "loss": 3.2149, + "step": 81600 + }, + { + "epoch": 8.788074480680228, + "grad_norm": 0.8563730120658875, + "learning_rate": 7.297489494666523e-05, + "loss": 3.1909, + "step": 81650 + }, + { + "epoch": 8.793456032719837, + "grad_norm": 0.8543161153793335, + "learning_rate": 7.265165391660381e-05, + "loss": 3.1819, + "step": 81700 + }, + { + "epoch": 8.798837584759445, + "grad_norm": 0.8220990300178528, + "learning_rate": 7.23284128865424e-05, + "loss": 3.2023, + "step": 81750 + }, + { + "epoch": 8.804219136799054, + "grad_norm": 0.8801039457321167, + "learning_rate": 7.200517185648098e-05, + "loss": 3.187, + "step": 81800 + }, + { + "epoch": 8.80960068883866, + "grad_norm": 0.8284077644348145, + "learning_rate": 7.168193082641956e-05, + "loss": 3.1896, + "step": 81850 + }, + { + "epoch": 8.814982240878269, + "grad_norm": 0.8685952425003052, + "learning_rate": 7.135868979635815e-05, + "loss": 3.179, + "step": 81900 + }, + { + "epoch": 8.820363792917878, + "grad_norm": 0.8335699439048767, + "learning_rate": 7.103544876629673e-05, + "loss": 3.1853, + "step": 81950 + }, + { + "epoch": 8.825745344957486, + "grad_norm": 0.8425479531288147, + "learning_rate": 7.071220773623531e-05, + "loss": 3.1809, + "step": 82000 + }, + { + "epoch": 8.825745344957486, + "eval_accuracy": 0.39258222388775027, + "eval_loss": 3.3150174617767334, + "eval_runtime": 190.1288, + "eval_samples_per_second": 94.731, + "eval_steps_per_second": 5.922, + "step": 82000 + }, + { + "epoch": 8.831126896997095, + "grad_norm": 0.88204026222229, + "learning_rate": 7.03889667061739e-05, + "loss": 3.1956, + "step": 82050 + }, + { + "epoch": 8.836508449036701, + "grad_norm": 0.8744454979896545, + "learning_rate": 7.006572567611248e-05, + "loss": 3.1756, + "step": 82100 + }, + { + "epoch": 8.84189000107631, + "grad_norm": 0.8653980493545532, + "learning_rate": 6.974248464605107e-05, + "loss": 3.1819, + "step": 82150 + }, + { + "epoch": 8.847271553115919, + "grad_norm": 0.8841899037361145, + "learning_rate": 6.941924361598965e-05, + "loss": 3.2043, + "step": 82200 + }, + { + "epoch": 8.852653105155527, + "grad_norm": 0.8339793086051941, + "learning_rate": 6.909600258592823e-05, + "loss": 3.1922, + "step": 82250 + }, + { + "epoch": 8.858034657195136, + "grad_norm": 0.8752954006195068, + "learning_rate": 6.877276155586681e-05, + "loss": 3.1984, + "step": 82300 + }, + { + "epoch": 8.863416209234742, + "grad_norm": 0.839018702507019, + "learning_rate": 6.84495205258054e-05, + "loss": 3.1782, + "step": 82350 + }, + { + "epoch": 8.868797761274351, + "grad_norm": 0.8543075323104858, + "learning_rate": 6.813274431634522e-05, + "loss": 3.1868, + "step": 82400 + }, + { + "epoch": 8.87417931331396, + "grad_norm": 0.8411060571670532, + "learning_rate": 6.78095032862838e-05, + "loss": 3.1812, + "step": 82450 + }, + { + "epoch": 8.879560865353568, + "grad_norm": 0.8576669692993164, + "learning_rate": 6.748626225622238e-05, + "loss": 3.1995, + "step": 82500 + }, + { + "epoch": 8.884942417393177, + "grad_norm": 0.8171575665473938, + "learning_rate": 6.716302122616096e-05, + "loss": 3.1939, + "step": 82550 + }, + { + "epoch": 8.890323969432785, + "grad_norm": 0.8460285067558289, + "learning_rate": 6.683978019609955e-05, + "loss": 3.1678, + "step": 82600 + }, + { + "epoch": 8.895705521472392, + "grad_norm": 0.8574362993240356, + "learning_rate": 6.651653916603814e-05, + "loss": 3.1833, + "step": 82650 + }, + { + "epoch": 8.901087073512, + "grad_norm": 0.8909504413604736, + "learning_rate": 6.619329813597673e-05, + "loss": 3.1862, + "step": 82700 + }, + { + "epoch": 8.906468625551609, + "grad_norm": 0.8539124131202698, + "learning_rate": 6.587005710591531e-05, + "loss": 3.1692, + "step": 82750 + }, + { + "epoch": 8.911850177591218, + "grad_norm": 0.8660421967506409, + "learning_rate": 6.554681607585389e-05, + "loss": 3.1867, + "step": 82800 + }, + { + "epoch": 8.917231729630826, + "grad_norm": 0.8633134365081787, + "learning_rate": 6.522357504579247e-05, + "loss": 3.1872, + "step": 82850 + }, + { + "epoch": 8.922613281670433, + "grad_norm": 0.841986894607544, + "learning_rate": 6.490033401573105e-05, + "loss": 3.1805, + "step": 82900 + }, + { + "epoch": 8.927994833710041, + "grad_norm": 0.87624591588974, + "learning_rate": 6.457709298566965e-05, + "loss": 3.1753, + "step": 82950 + }, + { + "epoch": 8.93337638574965, + "grad_norm": 0.8404093384742737, + "learning_rate": 6.425385195560823e-05, + "loss": 3.1939, + "step": 83000 + }, + { + "epoch": 8.93337638574965, + "eval_accuracy": 0.3927638915161761, + "eval_loss": 3.311628580093384, + "eval_runtime": 189.6303, + "eval_samples_per_second": 94.98, + "eval_steps_per_second": 5.938, + "step": 83000 + }, + { + "epoch": 8.938757937789259, + "grad_norm": 0.8670982122421265, + "learning_rate": 6.393061092554681e-05, + "loss": 3.1851, + "step": 83050 + }, + { + "epoch": 8.944139489828867, + "grad_norm": 0.8687721490859985, + "learning_rate": 6.360736989548539e-05, + "loss": 3.1744, + "step": 83100 + }, + { + "epoch": 8.949521041868476, + "grad_norm": 0.8958396911621094, + "learning_rate": 6.328412886542397e-05, + "loss": 3.2024, + "step": 83150 + }, + { + "epoch": 8.954902593908082, + "grad_norm": 0.8612350225448608, + "learning_rate": 6.296088783536257e-05, + "loss": 3.184, + "step": 83200 + }, + { + "epoch": 8.960284145947691, + "grad_norm": 0.8626738786697388, + "learning_rate": 6.263764680530115e-05, + "loss": 3.172, + "step": 83250 + }, + { + "epoch": 8.9656656979873, + "grad_norm": 0.8333890438079834, + "learning_rate": 6.231440577523973e-05, + "loss": 3.1774, + "step": 83300 + }, + { + "epoch": 8.971047250026908, + "grad_norm": 0.8436484932899475, + "learning_rate": 6.199116474517831e-05, + "loss": 3.1899, + "step": 83350 + }, + { + "epoch": 8.976428802066517, + "grad_norm": 0.863152265548706, + "learning_rate": 6.166792371511689e-05, + "loss": 3.1785, + "step": 83400 + }, + { + "epoch": 8.981810354106123, + "grad_norm": 0.8480373620986938, + "learning_rate": 6.134468268505549e-05, + "loss": 3.1859, + "step": 83450 + }, + { + "epoch": 8.987191906145732, + "grad_norm": 0.8849746584892273, + "learning_rate": 6.10279064755953e-05, + "loss": 3.1647, + "step": 83500 + }, + { + "epoch": 8.99257345818534, + "grad_norm": 0.8656249642372131, + "learning_rate": 6.070466544553388e-05, + "loss": 3.1873, + "step": 83550 + }, + { + "epoch": 8.997955010224949, + "grad_norm": 0.8052066564559937, + "learning_rate": 6.038142441547246e-05, + "loss": 3.189, + "step": 83600 + }, + { + "epoch": 9.003336562264558, + "grad_norm": 0.8620032668113708, + "learning_rate": 6.005818338541105e-05, + "loss": 3.147, + "step": 83650 + }, + { + "epoch": 9.008718114304166, + "grad_norm": 0.8975006937980652, + "learning_rate": 5.973494235534963e-05, + "loss": 3.119, + "step": 83700 + }, + { + "epoch": 9.014099666343773, + "grad_norm": 0.8371387720108032, + "learning_rate": 5.941170132528822e-05, + "loss": 3.1233, + "step": 83750 + }, + { + "epoch": 9.019481218383381, + "grad_norm": 0.8606297373771667, + "learning_rate": 5.90884602952268e-05, + "loss": 3.1383, + "step": 83800 + }, + { + "epoch": 9.02486277042299, + "grad_norm": 0.8248050212860107, + "learning_rate": 5.876521926516538e-05, + "loss": 3.1193, + "step": 83850 + }, + { + "epoch": 9.030244322462599, + "grad_norm": 0.9047676920890808, + "learning_rate": 5.844197823510397e-05, + "loss": 3.1173, + "step": 83900 + }, + { + "epoch": 9.035625874502207, + "grad_norm": 0.8255630731582642, + "learning_rate": 5.811873720504255e-05, + "loss": 3.1424, + "step": 83950 + }, + { + "epoch": 9.041007426541814, + "grad_norm": 0.8557283878326416, + "learning_rate": 5.7795496174981145e-05, + "loss": 3.1185, + "step": 84000 + }, + { + "epoch": 9.041007426541814, + "eval_accuracy": 0.39279083743235405, + "eval_loss": 3.3150100708007812, + "eval_runtime": 189.3544, + "eval_samples_per_second": 95.118, + "eval_steps_per_second": 5.947, + "step": 84000 + }, + { + "epoch": 9.046388978581422, + "grad_norm": 0.8470101952552795, + "learning_rate": 5.7472255144919726e-05, + "loss": 3.1401, + "step": 84050 + }, + { + "epoch": 9.051770530621031, + "grad_norm": 0.8201737999916077, + "learning_rate": 5.714901411485831e-05, + "loss": 3.1163, + "step": 84100 + }, + { + "epoch": 9.05715208266064, + "grad_norm": 0.8869134187698364, + "learning_rate": 5.6825773084796895e-05, + "loss": 3.1184, + "step": 84150 + }, + { + "epoch": 9.062533634700248, + "grad_norm": 0.8410740494728088, + "learning_rate": 5.6502532054735476e-05, + "loss": 3.1237, + "step": 84200 + }, + { + "epoch": 9.067915186739857, + "grad_norm": 0.8496425747871399, + "learning_rate": 5.6179291024674064e-05, + "loss": 3.1181, + "step": 84250 + }, + { + "epoch": 9.073296738779463, + "grad_norm": 0.8405110239982605, + "learning_rate": 5.5856049994612645e-05, + "loss": 3.1268, + "step": 84300 + }, + { + "epoch": 9.078678290819072, + "grad_norm": 0.8484364748001099, + "learning_rate": 5.5532808964551227e-05, + "loss": 3.1385, + "step": 84350 + }, + { + "epoch": 9.08405984285868, + "grad_norm": 0.8305612206459045, + "learning_rate": 5.5209567934489814e-05, + "loss": 3.1149, + "step": 84400 + }, + { + "epoch": 9.089441394898289, + "grad_norm": 0.8964749574661255, + "learning_rate": 5.4886326904428396e-05, + "loss": 3.1238, + "step": 84450 + }, + { + "epoch": 9.094822946937898, + "grad_norm": 0.8592599630355835, + "learning_rate": 5.4563085874366983e-05, + "loss": 3.1313, + "step": 84500 + }, + { + "epoch": 9.100204498977504, + "grad_norm": 0.8475759625434875, + "learning_rate": 5.4239844844305565e-05, + "loss": 3.1228, + "step": 84550 + }, + { + "epoch": 9.105586051017113, + "grad_norm": 0.8360592126846313, + "learning_rate": 5.3916603814244146e-05, + "loss": 3.1551, + "step": 84600 + }, + { + "epoch": 9.110967603056721, + "grad_norm": 0.8481616973876953, + "learning_rate": 5.3593362784182734e-05, + "loss": 3.1232, + "step": 84650 + }, + { + "epoch": 9.11634915509633, + "grad_norm": 0.8523578643798828, + "learning_rate": 5.3270121754121315e-05, + "loss": 3.1473, + "step": 84700 + }, + { + "epoch": 9.121730707135939, + "grad_norm": 0.828809916973114, + "learning_rate": 5.294688072405991e-05, + "loss": 3.1185, + "step": 84750 + }, + { + "epoch": 9.127112259175545, + "grad_norm": 0.8841274380683899, + "learning_rate": 5.262363969399849e-05, + "loss": 3.1403, + "step": 84800 + }, + { + "epoch": 9.132493811215154, + "grad_norm": 0.8152642250061035, + "learning_rate": 5.230039866393707e-05, + "loss": 3.1395, + "step": 84850 + }, + { + "epoch": 9.137875363254762, + "grad_norm": 0.8742380738258362, + "learning_rate": 5.197715763387566e-05, + "loss": 3.1114, + "step": 84900 + }, + { + "epoch": 9.143256915294371, + "grad_norm": 0.8858259916305542, + "learning_rate": 5.165391660381424e-05, + "loss": 3.1252, + "step": 84950 + }, + { + "epoch": 9.14863846733398, + "grad_norm": 0.8560479879379272, + "learning_rate": 5.133067557375283e-05, + "loss": 3.1242, + "step": 85000 + }, + { + "epoch": 9.14863846733398, + "eval_accuracy": 0.39295196966498297, + "eval_loss": 3.3140878677368164, + "eval_runtime": 189.6918, + "eval_samples_per_second": 94.949, + "eval_steps_per_second": 5.936, + "step": 85000 + }, + { + "epoch": 9.154020019373588, + "grad_norm": 0.9070020318031311, + "learning_rate": 5.100743454369141e-05, + "loss": 3.1242, + "step": 85050 + }, + { + "epoch": 9.159401571413195, + "grad_norm": 0.8739373087882996, + "learning_rate": 5.068419351362999e-05, + "loss": 3.1321, + "step": 85100 + }, + { + "epoch": 9.164783123452803, + "grad_norm": 0.8586959838867188, + "learning_rate": 5.036095248356858e-05, + "loss": 3.1283, + "step": 85150 + }, + { + "epoch": 9.170164675492412, + "grad_norm": 0.8272525668144226, + "learning_rate": 5.003771145350716e-05, + "loss": 3.1298, + "step": 85200 + }, + { + "epoch": 9.17554622753202, + "grad_norm": 0.8490020036697388, + "learning_rate": 4.971447042344575e-05, + "loss": 3.1514, + "step": 85250 + }, + { + "epoch": 9.180927779571629, + "grad_norm": 0.8110401630401611, + "learning_rate": 4.939122939338433e-05, + "loss": 3.1387, + "step": 85300 + }, + { + "epoch": 9.186309331611236, + "grad_norm": 0.8785157203674316, + "learning_rate": 4.906798836332291e-05, + "loss": 3.1274, + "step": 85350 + }, + { + "epoch": 9.191690883650844, + "grad_norm": 0.8156988024711609, + "learning_rate": 4.87447473332615e-05, + "loss": 3.1198, + "step": 85400 + }, + { + "epoch": 9.197072435690453, + "grad_norm": 0.8705768585205078, + "learning_rate": 4.842150630320008e-05, + "loss": 3.1515, + "step": 85450 + }, + { + "epoch": 9.202453987730062, + "grad_norm": 0.8462039232254028, + "learning_rate": 4.8098265273138667e-05, + "loss": 3.1292, + "step": 85500 + }, + { + "epoch": 9.20783553976967, + "grad_norm": 0.859347939491272, + "learning_rate": 4.777502424307725e-05, + "loss": 3.1349, + "step": 85550 + }, + { + "epoch": 9.213217091809279, + "grad_norm": 0.8954098224639893, + "learning_rate": 4.745178321301583e-05, + "loss": 3.1338, + "step": 85600 + }, + { + "epoch": 9.218598643848885, + "grad_norm": 0.8277496099472046, + "learning_rate": 4.7128542182954423e-05, + "loss": 3.1399, + "step": 85650 + }, + { + "epoch": 9.223980195888494, + "grad_norm": 0.8506129384040833, + "learning_rate": 4.6805301152893005e-05, + "loss": 3.1263, + "step": 85700 + }, + { + "epoch": 9.229361747928102, + "grad_norm": 0.8558870553970337, + "learning_rate": 4.648206012283159e-05, + "loss": 3.115, + "step": 85750 + }, + { + "epoch": 9.234743299967711, + "grad_norm": 0.8409327268600464, + "learning_rate": 4.6158819092770174e-05, + "loss": 3.1123, + "step": 85800 + }, + { + "epoch": 9.24012485200732, + "grad_norm": 0.8239386677742004, + "learning_rate": 4.5835578062708755e-05, + "loss": 3.1228, + "step": 85850 + }, + { + "epoch": 9.245506404046926, + "grad_norm": 0.863751232624054, + "learning_rate": 4.551233703264734e-05, + "loss": 3.1394, + "step": 85900 + }, + { + "epoch": 9.250887956086535, + "grad_norm": 0.8437860012054443, + "learning_rate": 4.5189096002585924e-05, + "loss": 3.1205, + "step": 85950 + }, + { + "epoch": 9.256269508126143, + "grad_norm": 0.8293090462684631, + "learning_rate": 4.486585497252451e-05, + "loss": 3.1324, + "step": 86000 + }, + { + "epoch": 9.256269508126143, + "eval_accuracy": 0.3932856426834612, + "eval_loss": 3.3111629486083984, + "eval_runtime": 189.3869, + "eval_samples_per_second": 95.102, + "eval_steps_per_second": 5.945, + "step": 86000 + }, + { + "epoch": 9.261651060165752, + "grad_norm": 0.8659238815307617, + "learning_rate": 4.4549078763064325e-05, + "loss": 3.1371, + "step": 86050 + }, + { + "epoch": 9.26703261220536, + "grad_norm": 0.8517236113548279, + "learning_rate": 4.4225837733002906e-05, + "loss": 3.1217, + "step": 86100 + }, + { + "epoch": 9.272414164244967, + "grad_norm": 0.8780714273452759, + "learning_rate": 4.390259670294149e-05, + "loss": 3.1419, + "step": 86150 + }, + { + "epoch": 9.277795716284576, + "grad_norm": 0.8498876690864563, + "learning_rate": 4.3579355672880075e-05, + "loss": 3.128, + "step": 86200 + }, + { + "epoch": 9.283177268324184, + "grad_norm": 0.8308305740356445, + "learning_rate": 4.3256114642818656e-05, + "loss": 3.1307, + "step": 86250 + }, + { + "epoch": 9.288558820363793, + "grad_norm": 0.8509485125541687, + "learning_rate": 4.2932873612757244e-05, + "loss": 3.1287, + "step": 86300 + }, + { + "epoch": 9.293940372403402, + "grad_norm": 0.8401972055435181, + "learning_rate": 4.2609632582695825e-05, + "loss": 3.1373, + "step": 86350 + }, + { + "epoch": 9.29932192444301, + "grad_norm": 0.8404197096824646, + "learning_rate": 4.2286391552634406e-05, + "loss": 3.1289, + "step": 86400 + }, + { + "epoch": 9.304703476482617, + "grad_norm": 0.8347073197364807, + "learning_rate": 4.1963150522572994e-05, + "loss": 3.1325, + "step": 86450 + }, + { + "epoch": 9.310085028522225, + "grad_norm": 0.8774033188819885, + "learning_rate": 4.1639909492511575e-05, + "loss": 3.1243, + "step": 86500 + }, + { + "epoch": 9.315466580561834, + "grad_norm": 0.9243816137313843, + "learning_rate": 4.131666846245017e-05, + "loss": 3.1181, + "step": 86550 + }, + { + "epoch": 9.320848132601443, + "grad_norm": 0.8504797220230103, + "learning_rate": 4.099342743238875e-05, + "loss": 3.1397, + "step": 86600 + }, + { + "epoch": 9.326229684641051, + "grad_norm": 0.8603664636611938, + "learning_rate": 4.067018640232733e-05, + "loss": 3.1403, + "step": 86650 + }, + { + "epoch": 9.331611236680658, + "grad_norm": 0.8422338366508484, + "learning_rate": 4.034694537226592e-05, + "loss": 3.1305, + "step": 86700 + }, + { + "epoch": 9.336992788720266, + "grad_norm": 0.8984904885292053, + "learning_rate": 4.00237043422045e-05, + "loss": 3.1268, + "step": 86750 + }, + { + "epoch": 9.342374340759875, + "grad_norm": 0.8661045432090759, + "learning_rate": 3.970046331214309e-05, + "loss": 3.1122, + "step": 86800 + }, + { + "epoch": 9.347755892799483, + "grad_norm": 0.8606688380241394, + "learning_rate": 3.937722228208167e-05, + "loss": 3.1295, + "step": 86850 + }, + { + "epoch": 9.353137444839092, + "grad_norm": 0.8438649773597717, + "learning_rate": 3.905398125202025e-05, + "loss": 3.1299, + "step": 86900 + }, + { + "epoch": 9.3585189968787, + "grad_norm": 0.8938318490982056, + "learning_rate": 3.873074022195884e-05, + "loss": 3.1416, + "step": 86950 + }, + { + "epoch": 9.363900548918307, + "grad_norm": 0.8666595816612244, + "learning_rate": 3.840749919189742e-05, + "loss": 3.1224, + "step": 87000 + }, + { + "epoch": 9.363900548918307, + "eval_accuracy": 0.3935082724505931, + "eval_loss": 3.309908151626587, + "eval_runtime": 189.3766, + "eval_samples_per_second": 95.107, + "eval_steps_per_second": 5.946, + "step": 87000 + }, + { + "epoch": 9.369282100957916, + "grad_norm": 0.8648133873939514, + "learning_rate": 3.808425816183601e-05, + "loss": 3.1365, + "step": 87050 + }, + { + "epoch": 9.374663652997524, + "grad_norm": 0.8668041825294495, + "learning_rate": 3.776101713177459e-05, + "loss": 3.1471, + "step": 87100 + }, + { + "epoch": 9.380045205037133, + "grad_norm": 0.8217251300811768, + "learning_rate": 3.743777610171318e-05, + "loss": 3.131, + "step": 87150 + }, + { + "epoch": 9.385426757076742, + "grad_norm": 0.8287473320960999, + "learning_rate": 3.711453507165176e-05, + "loss": 3.1532, + "step": 87200 + }, + { + "epoch": 9.390808309116348, + "grad_norm": 0.8670193552970886, + "learning_rate": 3.679129404159034e-05, + "loss": 3.1399, + "step": 87250 + }, + { + "epoch": 9.396189861155957, + "grad_norm": 0.8458430171012878, + "learning_rate": 3.646805301152893e-05, + "loss": 3.1549, + "step": 87300 + }, + { + "epoch": 9.401571413195565, + "grad_norm": 0.8716211318969727, + "learning_rate": 3.614481198146751e-05, + "loss": 3.1314, + "step": 87350 + }, + { + "epoch": 9.406952965235174, + "grad_norm": 0.8429900407791138, + "learning_rate": 3.5821570951406096e-05, + "loss": 3.128, + "step": 87400 + }, + { + "epoch": 9.412334517274783, + "grad_norm": 0.8358698487281799, + "learning_rate": 3.5498329921344684e-05, + "loss": 3.1314, + "step": 87450 + }, + { + "epoch": 9.417716069314391, + "grad_norm": 0.8356906175613403, + "learning_rate": 3.5175088891283265e-05, + "loss": 3.1225, + "step": 87500 + }, + { + "epoch": 9.423097621353998, + "grad_norm": 0.8311634063720703, + "learning_rate": 3.4851847861221846e-05, + "loss": 3.1125, + "step": 87550 + }, + { + "epoch": 9.428479173393606, + "grad_norm": 0.8509854674339294, + "learning_rate": 3.4528606831160434e-05, + "loss": 3.1331, + "step": 87600 + }, + { + "epoch": 9.433860725433215, + "grad_norm": 0.890250563621521, + "learning_rate": 3.4205365801099015e-05, + "loss": 3.1257, + "step": 87650 + }, + { + "epoch": 9.439242277472824, + "grad_norm": 0.8621748089790344, + "learning_rate": 3.38821247710376e-05, + "loss": 3.1427, + "step": 87700 + }, + { + "epoch": 9.444623829512432, + "grad_norm": 0.8697271347045898, + "learning_rate": 3.3558883740976184e-05, + "loss": 3.1607, + "step": 87750 + }, + { + "epoch": 9.450005381552039, + "grad_norm": 0.8986278772354126, + "learning_rate": 3.323564271091477e-05, + "loss": 3.1247, + "step": 87800 + }, + { + "epoch": 9.455386933591647, + "grad_norm": 0.8373309373855591, + "learning_rate": 3.291240168085335e-05, + "loss": 3.1418, + "step": 87850 + }, + { + "epoch": 9.460768485631256, + "grad_norm": 0.8242829442024231, + "learning_rate": 3.258916065079194e-05, + "loss": 3.1379, + "step": 87900 + }, + { + "epoch": 9.466150037670864, + "grad_norm": 0.848978579044342, + "learning_rate": 3.226591962073052e-05, + "loss": 3.1385, + "step": 87950 + }, + { + "epoch": 9.471531589710473, + "grad_norm": 0.8514211773872375, + "learning_rate": 3.19426785906691e-05, + "loss": 3.1341, + "step": 88000 + }, + { + "epoch": 9.471531589710473, + "eval_accuracy": 0.39374882994421434, + "eval_loss": 3.307783603668213, + "eval_runtime": 189.7402, + "eval_samples_per_second": 94.925, + "eval_steps_per_second": 5.934, + "step": 88000 + }, + { + "epoch": 9.476913141750082, + "grad_norm": 0.8711098432540894, + "learning_rate": 3.161943756060769e-05, + "loss": 3.126, + "step": 88050 + }, + { + "epoch": 9.482294693789688, + "grad_norm": 0.8469831943511963, + "learning_rate": 3.129619653054627e-05, + "loss": 3.1443, + "step": 88100 + }, + { + "epoch": 9.487676245829297, + "grad_norm": 0.8410546183586121, + "learning_rate": 3.097295550048486e-05, + "loss": 3.1266, + "step": 88150 + }, + { + "epoch": 9.493057797868905, + "grad_norm": 0.8234712481498718, + "learning_rate": 3.064971447042345e-05, + "loss": 3.1295, + "step": 88200 + }, + { + "epoch": 9.498439349908514, + "grad_norm": 0.8497011661529541, + "learning_rate": 3.0326473440362026e-05, + "loss": 3.131, + "step": 88250 + }, + { + "epoch": 9.503820901948123, + "grad_norm": 0.8871855735778809, + "learning_rate": 3.000323241030061e-05, + "loss": 3.1225, + "step": 88300 + }, + { + "epoch": 9.50920245398773, + "grad_norm": 0.8837377429008484, + "learning_rate": 2.9679991380239195e-05, + "loss": 3.1358, + "step": 88350 + }, + { + "epoch": 9.514584006027338, + "grad_norm": 0.8673506379127502, + "learning_rate": 2.9356750350177782e-05, + "loss": 3.1208, + "step": 88400 + }, + { + "epoch": 9.519965558066946, + "grad_norm": 0.8431509733200073, + "learning_rate": 2.9033509320116367e-05, + "loss": 3.1308, + "step": 88450 + }, + { + "epoch": 9.525347110106555, + "grad_norm": 0.8733773827552795, + "learning_rate": 2.8710268290054948e-05, + "loss": 3.1542, + "step": 88500 + }, + { + "epoch": 9.530728662146164, + "grad_norm": 0.9029594659805298, + "learning_rate": 2.8387027259993533e-05, + "loss": 3.133, + "step": 88550 + }, + { + "epoch": 9.536110214185772, + "grad_norm": 0.8783562183380127, + "learning_rate": 2.8063786229932117e-05, + "loss": 3.1165, + "step": 88600 + }, + { + "epoch": 9.541491766225379, + "grad_norm": 0.8718871474266052, + "learning_rate": 2.77405451998707e-05, + "loss": 3.1176, + "step": 88650 + }, + { + "epoch": 9.546873318264987, + "grad_norm": 0.8462399244308472, + "learning_rate": 2.7417304169809286e-05, + "loss": 3.1366, + "step": 88700 + }, + { + "epoch": 9.552254870304596, + "grad_norm": 0.8613501191139221, + "learning_rate": 2.7094063139747867e-05, + "loss": 3.1189, + "step": 88750 + }, + { + "epoch": 9.557636422344205, + "grad_norm": 0.8645268678665161, + "learning_rate": 2.677082210968645e-05, + "loss": 3.1303, + "step": 88800 + }, + { + "epoch": 9.563017974383813, + "grad_norm": 0.861893892288208, + "learning_rate": 2.644758107962504e-05, + "loss": 3.1502, + "step": 88850 + }, + { + "epoch": 9.56839952642342, + "grad_norm": 0.828042209148407, + "learning_rate": 2.6124340049563624e-05, + "loss": 3.1388, + "step": 88900 + }, + { + "epoch": 9.573781078463028, + "grad_norm": 0.8182950019836426, + "learning_rate": 2.580109901950221e-05, + "loss": 3.1395, + "step": 88950 + }, + { + "epoch": 9.579162630502637, + "grad_norm": 0.8487094640731812, + "learning_rate": 2.547785798944079e-05, + "loss": 3.1354, + "step": 89000 + }, + { + "epoch": 9.579162630502637, + "eval_accuracy": 0.3941069498624509, + "eval_loss": 3.3047800064086914, + "eval_runtime": 189.7111, + "eval_samples_per_second": 94.939, + "eval_steps_per_second": 5.935, + "step": 89000 + }, + { + "epoch": 9.584544182542245, + "grad_norm": 0.8931549787521362, + "learning_rate": 2.5154616959379374e-05, + "loss": 3.138, + "step": 89050 + }, + { + "epoch": 9.589925734581854, + "grad_norm": 0.8639717102050781, + "learning_rate": 2.483137592931796e-05, + "loss": 3.15, + "step": 89100 + }, + { + "epoch": 9.59530728662146, + "grad_norm": 0.8543230295181274, + "learning_rate": 2.4508134899256543e-05, + "loss": 3.1383, + "step": 89150 + }, + { + "epoch": 9.60068883866107, + "grad_norm": 0.9166204929351807, + "learning_rate": 2.418489386919513e-05, + "loss": 3.1253, + "step": 89200 + }, + { + "epoch": 9.606070390700678, + "grad_norm": 0.8401595950126648, + "learning_rate": 2.386165283913371e-05, + "loss": 3.1514, + "step": 89250 + }, + { + "epoch": 9.611451942740286, + "grad_norm": 0.8688633441925049, + "learning_rate": 2.3544876629673522e-05, + "loss": 3.1363, + "step": 89300 + }, + { + "epoch": 9.616833494779895, + "grad_norm": 0.8411018252372742, + "learning_rate": 2.3221635599612106e-05, + "loss": 3.1256, + "step": 89350 + }, + { + "epoch": 9.622215046819504, + "grad_norm": 0.8895683288574219, + "learning_rate": 2.2898394569550694e-05, + "loss": 3.1329, + "step": 89400 + }, + { + "epoch": 9.62759659885911, + "grad_norm": 0.8607434034347534, + "learning_rate": 2.257515353948928e-05, + "loss": 3.1494, + "step": 89450 + }, + { + "epoch": 9.632978150898719, + "grad_norm": 0.8818168640136719, + "learning_rate": 2.2251912509427863e-05, + "loss": 3.1479, + "step": 89500 + }, + { + "epoch": 9.638359702938327, + "grad_norm": 0.8703548312187195, + "learning_rate": 2.1928671479366444e-05, + "loss": 3.1207, + "step": 89550 + }, + { + "epoch": 9.643741254977936, + "grad_norm": 0.8854098916053772, + "learning_rate": 2.160543044930503e-05, + "loss": 3.1324, + "step": 89600 + }, + { + "epoch": 9.649122807017545, + "grad_norm": 0.8342804908752441, + "learning_rate": 2.1282189419243613e-05, + "loss": 3.1383, + "step": 89650 + }, + { + "epoch": 9.654504359057151, + "grad_norm": 0.8617409467697144, + "learning_rate": 2.0958948389182198e-05, + "loss": 3.1366, + "step": 89700 + }, + { + "epoch": 9.65988591109676, + "grad_norm": 0.8699505925178528, + "learning_rate": 2.0635707359120786e-05, + "loss": 3.1416, + "step": 89750 + }, + { + "epoch": 9.665267463136368, + "grad_norm": 0.8576476573944092, + "learning_rate": 2.0312466329059367e-05, + "loss": 3.1251, + "step": 89800 + }, + { + "epoch": 9.670649015175977, + "grad_norm": 0.848074197769165, + "learning_rate": 1.998922529899795e-05, + "loss": 3.1331, + "step": 89850 + }, + { + "epoch": 9.676030567215586, + "grad_norm": 0.8670492768287659, + "learning_rate": 1.9665984268936536e-05, + "loss": 3.1474, + "step": 89900 + }, + { + "epoch": 9.681412119255192, + "grad_norm": 0.8532721996307373, + "learning_rate": 1.934274323887512e-05, + "loss": 3.1403, + "step": 89950 + }, + { + "epoch": 9.6867936712948, + "grad_norm": 0.8756169080734253, + "learning_rate": 1.9019502208813705e-05, + "loss": 3.1262, + "step": 90000 + }, + { + "epoch": 9.6867936712948, + "eval_accuracy": 0.3943953146267105, + "eval_loss": 3.3019161224365234, + "eval_runtime": 189.5494, + "eval_samples_per_second": 95.02, + "eval_steps_per_second": 5.94, + "step": 90000 + }, + { + "epoch": 9.69217522333441, + "grad_norm": 0.8606821894645691, + "learning_rate": 1.869626117875229e-05, + "loss": 3.1353, + "step": 90050 + }, + { + "epoch": 9.697556775374018, + "grad_norm": 0.8910887241363525, + "learning_rate": 1.837302014869087e-05, + "loss": 3.1403, + "step": 90100 + }, + { + "epoch": 9.702938327413626, + "grad_norm": 0.9061274528503418, + "learning_rate": 1.8049779118629455e-05, + "loss": 3.14, + "step": 90150 + }, + { + "epoch": 9.708319879453235, + "grad_norm": 0.8429542183876038, + "learning_rate": 1.7726538088568043e-05, + "loss": 3.1161, + "step": 90200 + }, + { + "epoch": 9.713701431492842, + "grad_norm": 0.8305377960205078, + "learning_rate": 1.7403297058506624e-05, + "loss": 3.1368, + "step": 90250 + }, + { + "epoch": 9.71908298353245, + "grad_norm": 0.837834894657135, + "learning_rate": 1.708005602844521e-05, + "loss": 3.1374, + "step": 90300 + }, + { + "epoch": 9.724464535572059, + "grad_norm": 0.852417528629303, + "learning_rate": 1.6756814998383793e-05, + "loss": 3.1363, + "step": 90350 + }, + { + "epoch": 9.729846087611667, + "grad_norm": 0.87115079164505, + "learning_rate": 1.6433573968322377e-05, + "loss": 3.1364, + "step": 90400 + }, + { + "epoch": 9.735227639651276, + "grad_norm": 0.8634170293807983, + "learning_rate": 1.6110332938260962e-05, + "loss": 3.1237, + "step": 90450 + }, + { + "epoch": 9.740609191690883, + "grad_norm": 0.871769905090332, + "learning_rate": 1.5787091908199546e-05, + "loss": 3.156, + "step": 90500 + }, + { + "epoch": 9.745990743730491, + "grad_norm": 0.8181995749473572, + "learning_rate": 1.546385087813813e-05, + "loss": 3.1309, + "step": 90550 + }, + { + "epoch": 9.7513722957701, + "grad_norm": 0.8241446614265442, + "learning_rate": 1.5140609848076714e-05, + "loss": 3.1164, + "step": 90600 + }, + { + "epoch": 9.756753847809708, + "grad_norm": 0.8035761117935181, + "learning_rate": 1.4817368818015298e-05, + "loss": 3.1295, + "step": 90650 + }, + { + "epoch": 9.762135399849317, + "grad_norm": 0.8078320622444153, + "learning_rate": 1.4494127787953884e-05, + "loss": 3.1354, + "step": 90700 + }, + { + "epoch": 9.767516951888926, + "grad_norm": 0.9000764489173889, + "learning_rate": 1.4170886757892467e-05, + "loss": 3.1242, + "step": 90750 + }, + { + "epoch": 9.772898503928532, + "grad_norm": 0.8568733334541321, + "learning_rate": 1.3847645727831052e-05, + "loss": 3.117, + "step": 90800 + }, + { + "epoch": 9.77828005596814, + "grad_norm": 0.8243940472602844, + "learning_rate": 1.3524404697769635e-05, + "loss": 3.1242, + "step": 90850 + }, + { + "epoch": 9.78366160800775, + "grad_norm": 0.8713697195053101, + "learning_rate": 1.320116366770822e-05, + "loss": 3.13, + "step": 90900 + }, + { + "epoch": 9.789043160047358, + "grad_norm": 0.837845504283905, + "learning_rate": 1.2877922637646805e-05, + "loss": 3.1421, + "step": 90950 + }, + { + "epoch": 9.794424712086967, + "grad_norm": 0.8562050461769104, + "learning_rate": 1.2554681607585388e-05, + "loss": 3.129, + "step": 91000 + }, + { + "epoch": 9.794424712086967, + "eval_accuracy": 0.3945871956265909, + "eval_loss": 3.300642490386963, + "eval_runtime": 190.2591, + "eval_samples_per_second": 94.666, + "eval_steps_per_second": 5.918, + "step": 91000 + }, + { + "epoch": 9.799806264126573, + "grad_norm": 0.8212922811508179, + "learning_rate": 1.2231440577523972e-05, + "loss": 3.1433, + "step": 91050 + }, + { + "epoch": 9.805187816166182, + "grad_norm": 0.9242744445800781, + "learning_rate": 1.1908199547462555e-05, + "loss": 3.1234, + "step": 91100 + }, + { + "epoch": 9.81056936820579, + "grad_norm": 0.8359904289245605, + "learning_rate": 1.1584958517401141e-05, + "loss": 3.1384, + "step": 91150 + }, + { + "epoch": 9.815950920245399, + "grad_norm": 0.8217136263847351, + "learning_rate": 1.1261717487339726e-05, + "loss": 3.1144, + "step": 91200 + }, + { + "epoch": 9.821332472285007, + "grad_norm": 0.8538930416107178, + "learning_rate": 1.0938476457278309e-05, + "loss": 3.1489, + "step": 91250 + }, + { + "epoch": 9.826714024324616, + "grad_norm": 0.8195740580558777, + "learning_rate": 1.0615235427216895e-05, + "loss": 3.1369, + "step": 91300 + }, + { + "epoch": 9.832095576364223, + "grad_norm": 0.8804702162742615, + "learning_rate": 1.0291994397155478e-05, + "loss": 3.1469, + "step": 91350 + }, + { + "epoch": 9.837477128403831, + "grad_norm": 0.8412031531333923, + "learning_rate": 9.968753367094062e-06, + "loss": 3.1415, + "step": 91400 + }, + { + "epoch": 9.84285868044344, + "grad_norm": 0.8647484183311462, + "learning_rate": 9.645512337032647e-06, + "loss": 3.1224, + "step": 91450 + }, + { + "epoch": 9.848240232483048, + "grad_norm": 0.8878872990608215, + "learning_rate": 9.32227130697123e-06, + "loss": 3.1219, + "step": 91500 + }, + { + "epoch": 9.853621784522657, + "grad_norm": 0.839019775390625, + "learning_rate": 8.999030276909816e-06, + "loss": 3.1497, + "step": 91550 + }, + { + "epoch": 9.859003336562264, + "grad_norm": 0.8272830247879028, + "learning_rate": 8.675789246848399e-06, + "loss": 3.1342, + "step": 91600 + }, + { + "epoch": 9.864384888601872, + "grad_norm": 0.8204966187477112, + "learning_rate": 8.352548216786983e-06, + "loss": 3.1181, + "step": 91650 + }, + { + "epoch": 9.869766440641481, + "grad_norm": 0.8158440589904785, + "learning_rate": 8.029307186725568e-06, + "loss": 3.1251, + "step": 91700 + }, + { + "epoch": 9.87514799268109, + "grad_norm": 0.9214332103729248, + "learning_rate": 7.706066156664152e-06, + "loss": 3.1234, + "step": 91750 + }, + { + "epoch": 9.880529544720698, + "grad_norm": 0.8618704676628113, + "learning_rate": 7.3828251266027365e-06, + "loss": 3.1344, + "step": 91800 + }, + { + "epoch": 9.885911096760307, + "grad_norm": 0.8866255879402161, + "learning_rate": 7.05958409654132e-06, + "loss": 3.1481, + "step": 91850 + }, + { + "epoch": 9.891292648799913, + "grad_norm": 0.8363102674484253, + "learning_rate": 6.736343066479905e-06, + "loss": 3.1315, + "step": 91900 + }, + { + "epoch": 9.896674200839522, + "grad_norm": 0.8203515410423279, + "learning_rate": 6.413102036418488e-06, + "loss": 3.1338, + "step": 91950 + }, + { + "epoch": 9.90205575287913, + "grad_norm": 0.8286318778991699, + "learning_rate": 6.089861006357073e-06, + "loss": 3.1392, + "step": 92000 + }, + { + "epoch": 9.90205575287913, + "eval_accuracy": 0.3947092128196065, + "eval_loss": 3.29917311668396, + "eval_runtime": 189.5412, + "eval_samples_per_second": 95.024, + "eval_steps_per_second": 5.941, + "step": 92000 + }, + { + "epoch": 9.907437304918739, + "grad_norm": 0.8603388071060181, + "learning_rate": 5.766619976295657e-06, + "loss": 3.1372, + "step": 92050 + }, + { + "epoch": 9.912818856958348, + "grad_norm": 0.9106926321983337, + "learning_rate": 5.443378946234242e-06, + "loss": 3.1309, + "step": 92100 + }, + { + "epoch": 9.918200408997954, + "grad_norm": 0.8053128123283386, + "learning_rate": 5.1201379161728254e-06, + "loss": 3.1273, + "step": 92150 + }, + { + "epoch": 9.923581961037563, + "grad_norm": 0.8366634249687195, + "learning_rate": 4.79689688611141e-06, + "loss": 3.1282, + "step": 92200 + }, + { + "epoch": 9.928963513077171, + "grad_norm": 0.8284056782722473, + "learning_rate": 4.4736558560499944e-06, + "loss": 3.141, + "step": 92250 + }, + { + "epoch": 9.93434506511678, + "grad_norm": 0.8485735058784485, + "learning_rate": 4.150414825988578e-06, + "loss": 3.1262, + "step": 92300 + }, + { + "epoch": 9.939726617156388, + "grad_norm": 0.859318196773529, + "learning_rate": 3.8271737959271626e-06, + "loss": 3.1151, + "step": 92350 + }, + { + "epoch": 9.945108169195997, + "grad_norm": 0.8240682482719421, + "learning_rate": 3.5039327658657466e-06, + "loss": 3.1192, + "step": 92400 + }, + { + "epoch": 9.950489721235604, + "grad_norm": 0.850426971912384, + "learning_rate": 3.180691735804331e-06, + "loss": 3.1372, + "step": 92450 + }, + { + "epoch": 9.955871273275212, + "grad_norm": 0.8800275921821594, + "learning_rate": 2.857450705742915e-06, + "loss": 3.1292, + "step": 92500 + }, + { + "epoch": 9.961252825314821, + "grad_norm": 0.8482245802879333, + "learning_rate": 2.5342096756814993e-06, + "loss": 3.141, + "step": 92550 + }, + { + "epoch": 9.96663437735443, + "grad_norm": 0.8627044558525085, + "learning_rate": 2.2109686456200838e-06, + "loss": 3.1262, + "step": 92600 + }, + { + "epoch": 9.972015929394038, + "grad_norm": 0.9186998009681702, + "learning_rate": 1.8877276155586683e-06, + "loss": 3.1342, + "step": 92650 + }, + { + "epoch": 9.977397481433645, + "grad_norm": 0.9178165197372437, + "learning_rate": 1.5644865854972521e-06, + "loss": 3.1263, + "step": 92700 + }, + { + "epoch": 9.982779033473253, + "grad_norm": 0.8731764554977417, + "learning_rate": 1.2412455554358364e-06, + "loss": 3.1389, + "step": 92750 + }, + { + "epoch": 9.988160585512862, + "grad_norm": 0.8474687337875366, + "learning_rate": 9.180045253744208e-07, + "loss": 3.1368, + "step": 92800 + }, + { + "epoch": 9.99354213755247, + "grad_norm": 0.8867104649543762, + "learning_rate": 5.947634953130051e-07, + "loss": 3.153, + "step": 92850 + }, + { + "epoch": 9.998923689592079, + "grad_norm": 0.8560895919799805, + "learning_rate": 2.7152246525158925e-07, + "loss": 3.1305, + "step": 92900 + }, + { + "epoch": 10.0, + "step": 92910, + "total_flos": 7.76821211136e+17, + "train_loss": 3.4556877505752324, + "train_runtime": 82552.4271, + "train_samples_per_second": 36.013, + "train_steps_per_second": 1.125 + } + ], + "logging_steps": 50, + "max_steps": 92910, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.76821211136e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}