diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13876 @@ +{ + "best_metric": 3.3076210021972656, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_8397/checkpoint-90000", + "epoch": 10.0, + "eval_steps": 1000, + "global_step": 92910, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005381552039608223, + "grad_norm": 1.1363136768341064, + "learning_rate": 0.0003, + "loss": 8.4632, + "step": 50 + }, + { + "epoch": 0.010763104079216447, + "grad_norm": 3.25538969039917, + "learning_rate": 0.0006, + "loss": 6.8415, + "step": 100 + }, + { + "epoch": 0.01614465611882467, + "grad_norm": 3.1040453910827637, + "learning_rate": 0.0005996767589699385, + "loss": 6.4387, + "step": 150 + }, + { + "epoch": 0.021526208158432893, + "grad_norm": 3.3955957889556885, + "learning_rate": 0.0005993535179398771, + "loss": 6.1922, + "step": 200 + }, + { + "epoch": 0.026907760198041114, + "grad_norm": 2.7429745197296143, + "learning_rate": 0.0005990302769098158, + "loss": 6.0811, + "step": 250 + }, + { + "epoch": 0.03228931223764934, + "grad_norm": 1.5651514530181885, + "learning_rate": 0.0005987070358797543, + "loss": 5.9862, + "step": 300 + }, + { + "epoch": 0.03767086427725756, + "grad_norm": 1.5071296691894531, + "learning_rate": 0.0005983837948496929, + "loss": 5.8565, + "step": 350 + }, + { + "epoch": 0.04305241631686579, + "grad_norm": 1.3275337219238281, + "learning_rate": 0.0005980605538196314, + "loss": 5.8096, + "step": 400 + }, + { + "epoch": 0.048433968356474004, + "grad_norm": 1.523363709449768, + "learning_rate": 0.0005977373127895701, + "loss": 5.7397, + "step": 450 + }, + { + "epoch": 0.05381552039608223, + "grad_norm": 1.786197304725647, + "learning_rate": 0.0005974140717595086, + "loss": 5.6499, + "step": 500 + }, + { + "epoch": 0.05919707243569045, + "grad_norm": 1.4374873638153076, + "learning_rate": 0.0005970908307294472, + "loss": 5.5871, + "step": 550 + }, + { + "epoch": 0.06457862447529868, + "grad_norm": 1.401210069656372, + "learning_rate": 0.0005967675896993858, + "loss": 5.5076, + "step": 600 + }, + { + "epoch": 0.0699601765149069, + "grad_norm": 1.2165864706039429, + "learning_rate": 0.0005964443486693243, + "loss": 5.4396, + "step": 650 + }, + { + "epoch": 0.07534172855451512, + "grad_norm": 1.5090405941009521, + "learning_rate": 0.000596121107639263, + "loss": 5.3853, + "step": 700 + }, + { + "epoch": 0.08072328059412334, + "grad_norm": 1.6507272720336914, + "learning_rate": 0.0005957978666092015, + "loss": 5.3282, + "step": 750 + }, + { + "epoch": 0.08610483263373157, + "grad_norm": 1.5006401538848877, + "learning_rate": 0.0005954746255791401, + "loss": 5.2854, + "step": 800 + }, + { + "epoch": 0.09148638467333979, + "grad_norm": 1.1712360382080078, + "learning_rate": 0.0005951513845490787, + "loss": 5.265, + "step": 850 + }, + { + "epoch": 0.09686793671294801, + "grad_norm": 1.2654850482940674, + "learning_rate": 0.0005948281435190174, + "loss": 5.1858, + "step": 900 + }, + { + "epoch": 0.10224948875255624, + "grad_norm": 1.3258107900619507, + "learning_rate": 0.0005945049024889559, + "loss": 5.1631, + "step": 950 + }, + { + "epoch": 0.10763104079216446, + "grad_norm": 1.1906688213348389, + "learning_rate": 0.0005941816614588944, + "loss": 5.1052, + "step": 1000 + }, + { + "epoch": 0.10763104079216446, + "eval_accuracy": 0.22547332185886404, + "eval_loss": 5.034008502960205, + "eval_runtime": 183.4102, + "eval_samples_per_second": 98.201, + "eval_steps_per_second": 6.139, + "step": 1000 + }, + { + "epoch": 0.11301259283177269, + "grad_norm": 0.88250333070755, + "learning_rate": 0.000593858420428833, + "loss": 5.0516, + "step": 1050 + }, + { + "epoch": 0.1183941448713809, + "grad_norm": 0.965934693813324, + "learning_rate": 0.0005935351793987716, + "loss": 5.0322, + "step": 1100 + }, + { + "epoch": 0.12377569691098914, + "grad_norm": 1.1457676887512207, + "learning_rate": 0.0005932119383687103, + "loss": 5.0043, + "step": 1150 + }, + { + "epoch": 0.12915724895059735, + "grad_norm": 1.3504283428192139, + "learning_rate": 0.0005928886973386488, + "loss": 4.9543, + "step": 1200 + }, + { + "epoch": 0.13453880099020557, + "grad_norm": 1.02091383934021, + "learning_rate": 0.0005925654563085874, + "loss": 4.9343, + "step": 1250 + }, + { + "epoch": 0.1399203530298138, + "grad_norm": 0.9588813781738281, + "learning_rate": 0.000592242215278526, + "loss": 4.8984, + "step": 1300 + }, + { + "epoch": 0.14530190506942203, + "grad_norm": 0.8982344269752502, + "learning_rate": 0.0005919189742484645, + "loss": 4.8657, + "step": 1350 + }, + { + "epoch": 0.15068345710903025, + "grad_norm": 1.053356647491455, + "learning_rate": 0.0005915957332184032, + "loss": 4.8735, + "step": 1400 + }, + { + "epoch": 0.15606500914863847, + "grad_norm": 1.0207793712615967, + "learning_rate": 0.0005912724921883417, + "loss": 4.8455, + "step": 1450 + }, + { + "epoch": 0.16144656118824668, + "grad_norm": 0.9741514325141907, + "learning_rate": 0.0005909492511582803, + "loss": 4.8106, + "step": 1500 + }, + { + "epoch": 0.1668281132278549, + "grad_norm": 1.1351131200790405, + "learning_rate": 0.0005906260101282189, + "loss": 4.7854, + "step": 1550 + }, + { + "epoch": 0.17220966526746315, + "grad_norm": 0.8748419880867004, + "learning_rate": 0.0005903027690981575, + "loss": 4.7504, + "step": 1600 + }, + { + "epoch": 0.17759121730707136, + "grad_norm": 0.7746927738189697, + "learning_rate": 0.000589979528068096, + "loss": 4.7232, + "step": 1650 + }, + { + "epoch": 0.18297276934667958, + "grad_norm": 1.0927244424819946, + "learning_rate": 0.0005896562870380347, + "loss": 4.7024, + "step": 1700 + }, + { + "epoch": 0.1883543213862878, + "grad_norm": 1.1120930910110474, + "learning_rate": 0.0005893330460079732, + "loss": 4.7158, + "step": 1750 + }, + { + "epoch": 0.19373587342589602, + "grad_norm": 0.7173694372177124, + "learning_rate": 0.0005890098049779118, + "loss": 4.684, + "step": 1800 + }, + { + "epoch": 0.19911742546550426, + "grad_norm": 0.964622974395752, + "learning_rate": 0.0005886865639478504, + "loss": 4.668, + "step": 1850 + }, + { + "epoch": 0.20449897750511248, + "grad_norm": 0.8760665059089661, + "learning_rate": 0.0005883633229177889, + "loss": 4.64, + "step": 1900 + }, + { + "epoch": 0.2098805295447207, + "grad_norm": 1.0075942277908325, + "learning_rate": 0.0005880400818877276, + "loss": 4.6187, + "step": 1950 + }, + { + "epoch": 0.2152620815843289, + "grad_norm": 1.170619249343872, + "learning_rate": 0.0005877168408576662, + "loss": 4.6124, + "step": 2000 + }, + { + "epoch": 0.2152620815843289, + "eval_accuracy": 0.2689746785531477, + "eval_loss": 4.52186918258667, + "eval_runtime": 182.9507, + "eval_samples_per_second": 98.447, + "eval_steps_per_second": 6.155, + "step": 2000 + }, + { + "epoch": 0.22064363362393713, + "grad_norm": 1.1905301809310913, + "learning_rate": 0.0005873935998276048, + "loss": 4.5808, + "step": 2050 + }, + { + "epoch": 0.22602518566354537, + "grad_norm": 0.7720122933387756, + "learning_rate": 0.0005870703587975433, + "loss": 4.5765, + "step": 2100 + }, + { + "epoch": 0.2314067377031536, + "grad_norm": 0.9038923978805542, + "learning_rate": 0.0005867471177674818, + "loss": 4.5495, + "step": 2150 + }, + { + "epoch": 0.2367882897427618, + "grad_norm": 0.954218327999115, + "learning_rate": 0.0005864238767374205, + "loss": 4.5188, + "step": 2200 + }, + { + "epoch": 0.24216984178237003, + "grad_norm": 0.890593409538269, + "learning_rate": 0.0005861006357073591, + "loss": 4.5113, + "step": 2250 + }, + { + "epoch": 0.24755139382197827, + "grad_norm": 1.0826386213302612, + "learning_rate": 0.0005857773946772977, + "loss": 4.5139, + "step": 2300 + }, + { + "epoch": 0.2529329458615865, + "grad_norm": 1.0157872438430786, + "learning_rate": 0.0005854541536472362, + "loss": 4.4663, + "step": 2350 + }, + { + "epoch": 0.2583144979011947, + "grad_norm": 1.0314841270446777, + "learning_rate": 0.0005851309126171749, + "loss": 4.4576, + "step": 2400 + }, + { + "epoch": 0.2636960499408029, + "grad_norm": 1.0260419845581055, + "learning_rate": 0.0005848076715871134, + "loss": 4.4447, + "step": 2450 + }, + { + "epoch": 0.26907760198041114, + "grad_norm": 0.8484777212142944, + "learning_rate": 0.000584484430557052, + "loss": 4.4135, + "step": 2500 + }, + { + "epoch": 0.27445915402001936, + "grad_norm": 0.9077085256576538, + "learning_rate": 0.0005841611895269906, + "loss": 4.4272, + "step": 2550 + }, + { + "epoch": 0.2798407060596276, + "grad_norm": 0.7669521570205688, + "learning_rate": 0.0005838379484969291, + "loss": 4.402, + "step": 2600 + }, + { + "epoch": 0.2852222580992358, + "grad_norm": 0.7865301966667175, + "learning_rate": 0.0005835147074668678, + "loss": 4.406, + "step": 2650 + }, + { + "epoch": 0.29060381013884407, + "grad_norm": 0.9062381386756897, + "learning_rate": 0.0005831914664368063, + "loss": 4.3869, + "step": 2700 + }, + { + "epoch": 0.2959853621784523, + "grad_norm": 0.8867931962013245, + "learning_rate": 0.0005828682254067449, + "loss": 4.367, + "step": 2750 + }, + { + "epoch": 0.3013669142180605, + "grad_norm": 0.7799649834632874, + "learning_rate": 0.0005825449843766835, + "loss": 4.3764, + "step": 2800 + }, + { + "epoch": 0.3067484662576687, + "grad_norm": 0.998032808303833, + "learning_rate": 0.0005822217433466221, + "loss": 4.3453, + "step": 2850 + }, + { + "epoch": 0.31213001829727693, + "grad_norm": 0.7040951251983643, + "learning_rate": 0.0005818985023165607, + "loss": 4.3335, + "step": 2900 + }, + { + "epoch": 0.31751157033688515, + "grad_norm": 0.8604708313941956, + "learning_rate": 0.0005815752612864992, + "loss": 4.3286, + "step": 2950 + }, + { + "epoch": 0.32289312237649337, + "grad_norm": 0.6292029619216919, + "learning_rate": 0.0005812520202564378, + "loss": 4.3286, + "step": 3000 + }, + { + "epoch": 0.32289312237649337, + "eval_accuracy": 0.2978722179020627, + "eval_loss": 4.24153995513916, + "eval_runtime": 183.0117, + "eval_samples_per_second": 98.414, + "eval_steps_per_second": 6.153, + "step": 3000 + }, + { + "epoch": 0.3282746744161016, + "grad_norm": 0.9364116787910461, + "learning_rate": 0.0005809287792263764, + "loss": 4.3113, + "step": 3050 + }, + { + "epoch": 0.3336562264557098, + "grad_norm": 0.6825273633003235, + "learning_rate": 0.0005806055381963151, + "loss": 4.3019, + "step": 3100 + }, + { + "epoch": 0.3390377784953181, + "grad_norm": 0.6769887208938599, + "learning_rate": 0.0005802822971662536, + "loss": 4.287, + "step": 3150 + }, + { + "epoch": 0.3444193305349263, + "grad_norm": 0.7461885809898376, + "learning_rate": 0.0005799590561361922, + "loss": 4.2488, + "step": 3200 + }, + { + "epoch": 0.3498008825745345, + "grad_norm": 0.8458229899406433, + "learning_rate": 0.0005796358151061307, + "loss": 4.2704, + "step": 3250 + }, + { + "epoch": 0.35518243461414273, + "grad_norm": 0.7842735052108765, + "learning_rate": 0.0005793125740760694, + "loss": 4.2467, + "step": 3300 + }, + { + "epoch": 0.36056398665375095, + "grad_norm": 0.6246998310089111, + "learning_rate": 0.0005789893330460079, + "loss": 4.2382, + "step": 3350 + }, + { + "epoch": 0.36594553869335916, + "grad_norm": 0.7485172748565674, + "learning_rate": 0.0005786660920159465, + "loss": 4.2417, + "step": 3400 + }, + { + "epoch": 0.3713270907329674, + "grad_norm": 0.8092179894447327, + "learning_rate": 0.0005783428509858851, + "loss": 4.2616, + "step": 3450 + }, + { + "epoch": 0.3767086427725756, + "grad_norm": 1.0175601243972778, + "learning_rate": 0.0005780196099558237, + "loss": 4.2304, + "step": 3500 + }, + { + "epoch": 0.3820901948121838, + "grad_norm": 0.6826550960540771, + "learning_rate": 0.0005776963689257623, + "loss": 4.2206, + "step": 3550 + }, + { + "epoch": 0.38747174685179203, + "grad_norm": 0.7783133387565613, + "learning_rate": 0.0005773731278957008, + "loss": 4.2282, + "step": 3600 + }, + { + "epoch": 0.3928532988914003, + "grad_norm": 0.6485751867294312, + "learning_rate": 0.0005770498868656394, + "loss": 4.2179, + "step": 3650 + }, + { + "epoch": 0.3982348509310085, + "grad_norm": 0.778927206993103, + "learning_rate": 0.000576726645835578, + "loss": 4.1895, + "step": 3700 + }, + { + "epoch": 0.40361640297061674, + "grad_norm": 0.8899291753768921, + "learning_rate": 0.0005764034048055167, + "loss": 4.2006, + "step": 3750 + }, + { + "epoch": 0.40899795501022496, + "grad_norm": 0.6624945998191833, + "learning_rate": 0.0005760801637754552, + "loss": 4.2022, + "step": 3800 + }, + { + "epoch": 0.4143795070498332, + "grad_norm": 0.7508805394172668, + "learning_rate": 0.0005757569227453937, + "loss": 4.1884, + "step": 3850 + }, + { + "epoch": 0.4197610590894414, + "grad_norm": 0.6273350119590759, + "learning_rate": 0.0005754336817153324, + "loss": 4.1653, + "step": 3900 + }, + { + "epoch": 0.4251426111290496, + "grad_norm": 0.7216036319732666, + "learning_rate": 0.0005751104406852709, + "loss": 4.168, + "step": 3950 + }, + { + "epoch": 0.4305241631686578, + "grad_norm": 0.6389856934547424, + "learning_rate": 0.0005747871996552096, + "loss": 4.1493, + "step": 4000 + }, + { + "epoch": 0.4305241631686578, + "eval_accuracy": 0.31214225357606534, + "eval_loss": 4.097915172576904, + "eval_runtime": 182.793, + "eval_samples_per_second": 98.532, + "eval_steps_per_second": 6.16, + "step": 4000 + }, + { + "epoch": 0.43590571520826604, + "grad_norm": 0.8230230212211609, + "learning_rate": 0.0005744639586251481, + "loss": 4.1597, + "step": 4050 + }, + { + "epoch": 0.44128726724787426, + "grad_norm": 0.7011985182762146, + "learning_rate": 0.0005741407175950867, + "loss": 4.1713, + "step": 4100 + }, + { + "epoch": 0.44666881928748253, + "grad_norm": 0.6618315577507019, + "learning_rate": 0.0005738174765650253, + "loss": 4.156, + "step": 4150 + }, + { + "epoch": 0.45205037132709075, + "grad_norm": 0.652692973613739, + "learning_rate": 0.0005734942355349638, + "loss": 4.1499, + "step": 4200 + }, + { + "epoch": 0.45743192336669897, + "grad_norm": 0.6924708485603333, + "learning_rate": 0.0005731709945049025, + "loss": 4.1527, + "step": 4250 + }, + { + "epoch": 0.4628134754063072, + "grad_norm": 0.7289921641349792, + "learning_rate": 0.000572847753474841, + "loss": 4.1386, + "step": 4300 + }, + { + "epoch": 0.4681950274459154, + "grad_norm": 0.6634368896484375, + "learning_rate": 0.0005725245124447796, + "loss": 4.1338, + "step": 4350 + }, + { + "epoch": 0.4735765794855236, + "grad_norm": 0.6518203616142273, + "learning_rate": 0.0005722012714147182, + "loss": 4.1474, + "step": 4400 + }, + { + "epoch": 0.47895813152513184, + "grad_norm": 0.5596990585327148, + "learning_rate": 0.0005718780303846568, + "loss": 4.115, + "step": 4450 + }, + { + "epoch": 0.48433968356474005, + "grad_norm": 0.5837390422821045, + "learning_rate": 0.0005715547893545953, + "loss": 4.0993, + "step": 4500 + }, + { + "epoch": 0.48972123560434827, + "grad_norm": 0.5811730027198792, + "learning_rate": 0.000571231548324534, + "loss": 4.099, + "step": 4550 + }, + { + "epoch": 0.49510278764395654, + "grad_norm": 0.5709436535835266, + "learning_rate": 0.0005709083072944725, + "loss": 4.1202, + "step": 4600 + }, + { + "epoch": 0.5004843396835648, + "grad_norm": 0.6790245175361633, + "learning_rate": 0.0005705850662644111, + "loss": 4.0862, + "step": 4650 + }, + { + "epoch": 0.505865891723173, + "grad_norm": 0.629574179649353, + "learning_rate": 0.0005702618252343497, + "loss": 4.1059, + "step": 4700 + }, + { + "epoch": 0.5112474437627812, + "grad_norm": 0.5848677754402161, + "learning_rate": 0.0005699385842042882, + "loss": 4.0903, + "step": 4750 + }, + { + "epoch": 0.5166289958023894, + "grad_norm": 0.6429191827774048, + "learning_rate": 0.0005696153431742269, + "loss": 4.0856, + "step": 4800 + }, + { + "epoch": 0.5220105478419976, + "grad_norm": 0.6784705519676208, + "learning_rate": 0.0005692921021441655, + "loss": 4.0779, + "step": 4850 + }, + { + "epoch": 0.5273920998816058, + "grad_norm": 0.6737756133079529, + "learning_rate": 0.0005689688611141041, + "loss": 4.0737, + "step": 4900 + }, + { + "epoch": 0.5327736519212141, + "grad_norm": 0.614524781703949, + "learning_rate": 0.0005686456200840426, + "loss": 4.055, + "step": 4950 + }, + { + "epoch": 0.5381552039608223, + "grad_norm": 0.5657824277877808, + "learning_rate": 0.0005683223790539811, + "loss": 4.0522, + "step": 5000 + }, + { + "epoch": 0.5381552039608223, + "eval_accuracy": 0.3199428790038182, + "eval_loss": 4.0021162033081055, + "eval_runtime": 183.0646, + "eval_samples_per_second": 98.386, + "eval_steps_per_second": 6.151, + "step": 5000 + }, + { + "epoch": 0.5435367560004305, + "grad_norm": 0.7021715044975281, + "learning_rate": 0.0005679991380239198, + "loss": 4.0655, + "step": 5050 + }, + { + "epoch": 0.5489183080400387, + "grad_norm": 0.796134352684021, + "learning_rate": 0.0005676758969938584, + "loss": 4.0578, + "step": 5100 + }, + { + "epoch": 0.5542998600796469, + "grad_norm": 0.6416186094284058, + "learning_rate": 0.000567352655963797, + "loss": 4.0523, + "step": 5150 + }, + { + "epoch": 0.5596814121192552, + "grad_norm": 0.6666920185089111, + "learning_rate": 0.0005670294149337355, + "loss": 4.0536, + "step": 5200 + }, + { + "epoch": 0.5650629641588634, + "grad_norm": 0.6438349485397339, + "learning_rate": 0.0005667061739036742, + "loss": 4.0504, + "step": 5250 + }, + { + "epoch": 0.5704445161984716, + "grad_norm": 0.5683118104934692, + "learning_rate": 0.0005663829328736127, + "loss": 4.0581, + "step": 5300 + }, + { + "epoch": 0.5758260682380799, + "grad_norm": 0.6448424458503723, + "learning_rate": 0.0005660596918435512, + "loss": 4.0476, + "step": 5350 + }, + { + "epoch": 0.5812076202776881, + "grad_norm": 0.6892325282096863, + "learning_rate": 0.0005657364508134899, + "loss": 4.0357, + "step": 5400 + }, + { + "epoch": 0.5865891723172963, + "grad_norm": 0.6151754260063171, + "learning_rate": 0.0005654132097834284, + "loss": 4.0408, + "step": 5450 + }, + { + "epoch": 0.5919707243569046, + "grad_norm": 0.5521050095558167, + "learning_rate": 0.0005650899687533671, + "loss": 4.0438, + "step": 5500 + }, + { + "epoch": 0.5973522763965128, + "grad_norm": 0.5622237324714661, + "learning_rate": 0.0005647667277233056, + "loss": 4.0191, + "step": 5550 + }, + { + "epoch": 0.602733828436121, + "grad_norm": 0.6457247734069824, + "learning_rate": 0.0005644434866932442, + "loss": 4.0255, + "step": 5600 + }, + { + "epoch": 0.6081153804757292, + "grad_norm": 0.6344576478004456, + "learning_rate": 0.0005641202456631828, + "loss": 4.0171, + "step": 5650 + }, + { + "epoch": 0.6134969325153374, + "grad_norm": 0.8287795186042786, + "learning_rate": 0.0005637970046331214, + "loss": 3.9951, + "step": 5700 + }, + { + "epoch": 0.6188784845549457, + "grad_norm": 0.6326580047607422, + "learning_rate": 0.00056347376360306, + "loss": 4.0027, + "step": 5750 + }, + { + "epoch": 0.6242600365945539, + "grad_norm": 0.7404974102973938, + "learning_rate": 0.0005631505225729985, + "loss": 4.0087, + "step": 5800 + }, + { + "epoch": 0.6296415886341621, + "grad_norm": 0.6510661244392395, + "learning_rate": 0.0005628272815429371, + "loss": 3.9942, + "step": 5850 + }, + { + "epoch": 0.6350231406737703, + "grad_norm": 0.6300317049026489, + "learning_rate": 0.0005625040405128757, + "loss": 4.0002, + "step": 5900 + }, + { + "epoch": 0.6404046927133785, + "grad_norm": 0.5819694399833679, + "learning_rate": 0.0005621807994828143, + "loss": 3.9987, + "step": 5950 + }, + { + "epoch": 0.6457862447529867, + "grad_norm": 0.7095755338668823, + "learning_rate": 0.0005618575584527529, + "loss": 3.9857, + "step": 6000 + }, + { + "epoch": 0.6457862447529867, + "eval_accuracy": 0.32717677096873066, + "eval_loss": 3.925938606262207, + "eval_runtime": 182.691, + "eval_samples_per_second": 98.587, + "eval_steps_per_second": 6.163, + "step": 6000 + }, + { + "epoch": 0.651167796792595, + "grad_norm": 0.6090443134307861, + "learning_rate": 0.0005615343174226915, + "loss": 3.9941, + "step": 6050 + }, + { + "epoch": 0.6565493488322032, + "grad_norm": 0.5509337186813354, + "learning_rate": 0.00056121107639263, + "loss": 3.988, + "step": 6100 + }, + { + "epoch": 0.6619309008718114, + "grad_norm": 0.681161642074585, + "learning_rate": 0.0005608943001831699, + "loss": 3.989, + "step": 6150 + }, + { + "epoch": 0.6673124529114196, + "grad_norm": 0.7362217903137207, + "learning_rate": 0.0005605710591531085, + "loss": 3.9711, + "step": 6200 + }, + { + "epoch": 0.6726940049510278, + "grad_norm": 0.718980610370636, + "learning_rate": 0.000560247818123047, + "loss": 3.9902, + "step": 6250 + }, + { + "epoch": 0.6780755569906362, + "grad_norm": 0.6289951205253601, + "learning_rate": 0.0005599245770929855, + "loss": 3.9774, + "step": 6300 + }, + { + "epoch": 0.6834571090302444, + "grad_norm": 0.6267488598823547, + "learning_rate": 0.0005596013360629242, + "loss": 3.964, + "step": 6350 + }, + { + "epoch": 0.6888386610698526, + "grad_norm": 0.681925892829895, + "learning_rate": 0.0005592780950328628, + "loss": 3.9841, + "step": 6400 + }, + { + "epoch": 0.6942202131094608, + "grad_norm": 0.6765555143356323, + "learning_rate": 0.0005589548540028014, + "loss": 3.9582, + "step": 6450 + }, + { + "epoch": 0.699601765149069, + "grad_norm": 0.6029344201087952, + "learning_rate": 0.0005586316129727399, + "loss": 3.9729, + "step": 6500 + }, + { + "epoch": 0.7049833171886772, + "grad_norm": 0.5697076320648193, + "learning_rate": 0.0005583083719426786, + "loss": 3.9529, + "step": 6550 + }, + { + "epoch": 0.7103648692282855, + "grad_norm": 0.6241387724876404, + "learning_rate": 0.0005579851309126171, + "loss": 3.941, + "step": 6600 + }, + { + "epoch": 0.7157464212678937, + "grad_norm": 0.5610880851745605, + "learning_rate": 0.0005576618898825558, + "loss": 3.948, + "step": 6650 + }, + { + "epoch": 0.7211279733075019, + "grad_norm": 0.5819376111030579, + "learning_rate": 0.0005573386488524943, + "loss": 3.9528, + "step": 6700 + }, + { + "epoch": 0.7265095253471101, + "grad_norm": 0.5621766448020935, + "learning_rate": 0.0005570154078224328, + "loss": 3.9367, + "step": 6750 + }, + { + "epoch": 0.7318910773867183, + "grad_norm": 0.6376521587371826, + "learning_rate": 0.0005566921667923715, + "loss": 3.9523, + "step": 6800 + }, + { + "epoch": 0.7372726294263265, + "grad_norm": 0.5738341212272644, + "learning_rate": 0.00055636892576231, + "loss": 3.9213, + "step": 6850 + }, + { + "epoch": 0.7426541814659348, + "grad_norm": 0.5444178581237793, + "learning_rate": 0.0005560456847322487, + "loss": 3.9256, + "step": 6900 + }, + { + "epoch": 0.748035733505543, + "grad_norm": 0.6856753826141357, + "learning_rate": 0.0005557224437021872, + "loss": 3.9396, + "step": 6950 + }, + { + "epoch": 0.7534172855451512, + "grad_norm": 0.6148762106895447, + "learning_rate": 0.0005553992026721258, + "loss": 3.932, + "step": 7000 + }, + { + "epoch": 0.7534172855451512, + "eval_accuracy": 0.33273132389958254, + "eval_loss": 3.8695037364959717, + "eval_runtime": 183.1477, + "eval_samples_per_second": 98.341, + "eval_steps_per_second": 6.148, + "step": 7000 + }, + { + "epoch": 0.7587988375847594, + "grad_norm": 0.6166629791259766, + "learning_rate": 0.0005550759616420644, + "loss": 3.9215, + "step": 7050 + }, + { + "epoch": 0.7641803896243676, + "grad_norm": 0.5288543701171875, + "learning_rate": 0.000554752720612003, + "loss": 3.9322, + "step": 7100 + }, + { + "epoch": 0.7695619416639758, + "grad_norm": 0.650581955909729, + "learning_rate": 0.0005544294795819415, + "loss": 3.9267, + "step": 7150 + }, + { + "epoch": 0.7749434937035841, + "grad_norm": 0.5915279984474182, + "learning_rate": 0.0005541062385518801, + "loss": 3.9138, + "step": 7200 + }, + { + "epoch": 0.7803250457431924, + "grad_norm": 0.645673930644989, + "learning_rate": 0.0005537829975218188, + "loss": 3.9376, + "step": 7250 + }, + { + "epoch": 0.7857065977828006, + "grad_norm": 0.710521936416626, + "learning_rate": 0.0005534597564917573, + "loss": 3.9161, + "step": 7300 + }, + { + "epoch": 0.7910881498224088, + "grad_norm": 0.6567710041999817, + "learning_rate": 0.0005531365154616959, + "loss": 3.9225, + "step": 7350 + }, + { + "epoch": 0.796469701862017, + "grad_norm": 0.6403442621231079, + "learning_rate": 0.0005528132744316344, + "loss": 3.9261, + "step": 7400 + }, + { + "epoch": 0.8018512539016253, + "grad_norm": 0.5535980463027954, + "learning_rate": 0.0005524900334015731, + "loss": 3.9177, + "step": 7450 + }, + { + "epoch": 0.8072328059412335, + "grad_norm": 0.5908585786819458, + "learning_rate": 0.0005521667923715117, + "loss": 3.9195, + "step": 7500 + }, + { + "epoch": 0.8126143579808417, + "grad_norm": 0.5708103775978088, + "learning_rate": 0.0005518435513414502, + "loss": 3.9055, + "step": 7550 + }, + { + "epoch": 0.8179959100204499, + "grad_norm": 0.604663074016571, + "learning_rate": 0.0005515203103113888, + "loss": 3.9147, + "step": 7600 + }, + { + "epoch": 0.8233774620600581, + "grad_norm": 0.5916433930397034, + "learning_rate": 0.0005511970692813274, + "loss": 3.9041, + "step": 7650 + }, + { + "epoch": 0.8287590140996663, + "grad_norm": 0.74225914478302, + "learning_rate": 0.000550873828251266, + "loss": 3.9067, + "step": 7700 + }, + { + "epoch": 0.8341405661392746, + "grad_norm": 0.5728737115859985, + "learning_rate": 0.0005505505872212045, + "loss": 3.9031, + "step": 7750 + }, + { + "epoch": 0.8395221181788828, + "grad_norm": 0.6314652562141418, + "learning_rate": 0.0005502273461911432, + "loss": 3.9059, + "step": 7800 + }, + { + "epoch": 0.844903670218491, + "grad_norm": 0.6741981506347656, + "learning_rate": 0.0005499041051610817, + "loss": 3.8996, + "step": 7850 + }, + { + "epoch": 0.8502852222580992, + "grad_norm": 0.5826407074928284, + "learning_rate": 0.0005495808641310204, + "loss": 3.8754, + "step": 7900 + }, + { + "epoch": 0.8556667742977074, + "grad_norm": 0.5577662587165833, + "learning_rate": 0.0005492576231009589, + "loss": 3.8799, + "step": 7950 + }, + { + "epoch": 0.8610483263373157, + "grad_norm": 0.6278002858161926, + "learning_rate": 0.0005489343820708974, + "loss": 3.8959, + "step": 8000 + }, + { + "epoch": 0.8610483263373157, + "eval_accuracy": 0.33684915969486356, + "eval_loss": 3.826174020767212, + "eval_runtime": 183.0754, + "eval_samples_per_second": 98.38, + "eval_steps_per_second": 6.15, + "step": 8000 + }, + { + "epoch": 0.8664298783769239, + "grad_norm": 0.6210545301437378, + "learning_rate": 0.0005486111410408361, + "loss": 3.8848, + "step": 8050 + }, + { + "epoch": 0.8718114304165321, + "grad_norm": 0.622451901435852, + "learning_rate": 0.0005482879000107746, + "loss": 3.8772, + "step": 8100 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 0.604086697101593, + "learning_rate": 0.0005479646589807133, + "loss": 3.8865, + "step": 8150 + }, + { + "epoch": 0.8825745344957485, + "grad_norm": 0.5613585710525513, + "learning_rate": 0.0005476414179506518, + "loss": 3.8691, + "step": 8200 + }, + { + "epoch": 0.8879560865353568, + "grad_norm": 0.6614847779273987, + "learning_rate": 0.0005473181769205904, + "loss": 3.861, + "step": 8250 + }, + { + "epoch": 0.8933376385749651, + "grad_norm": 0.551531970500946, + "learning_rate": 0.000546994935890529, + "loss": 3.8908, + "step": 8300 + }, + { + "epoch": 0.8987191906145733, + "grad_norm": 0.7161784768104553, + "learning_rate": 0.0005466716948604677, + "loss": 3.8724, + "step": 8350 + }, + { + "epoch": 0.9041007426541815, + "grad_norm": 0.6812533140182495, + "learning_rate": 0.0005463484538304062, + "loss": 3.8675, + "step": 8400 + }, + { + "epoch": 0.9094822946937897, + "grad_norm": 0.5462357401847839, + "learning_rate": 0.000546031677620946, + "loss": 3.8776, + "step": 8450 + }, + { + "epoch": 0.9148638467333979, + "grad_norm": 0.5445178747177124, + "learning_rate": 0.0005457084365908845, + "loss": 3.8794, + "step": 8500 + }, + { + "epoch": 0.9202453987730062, + "grad_norm": 0.5628532767295837, + "learning_rate": 0.0005453851955608232, + "loss": 3.865, + "step": 8550 + }, + { + "epoch": 0.9256269508126144, + "grad_norm": 0.590034544467926, + "learning_rate": 0.0005450619545307617, + "loss": 3.8665, + "step": 8600 + }, + { + "epoch": 0.9310085028522226, + "grad_norm": 0.5842359662055969, + "learning_rate": 0.0005447387135007003, + "loss": 3.8516, + "step": 8650 + }, + { + "epoch": 0.9363900548918308, + "grad_norm": 0.5638464093208313, + "learning_rate": 0.0005444154724706389, + "loss": 3.8689, + "step": 8700 + }, + { + "epoch": 0.941771606931439, + "grad_norm": 0.5643623471260071, + "learning_rate": 0.0005440922314405775, + "loss": 3.8532, + "step": 8750 + }, + { + "epoch": 0.9471531589710472, + "grad_norm": 0.5182409882545471, + "learning_rate": 0.0005437689904105161, + "loss": 3.8537, + "step": 8800 + }, + { + "epoch": 0.9525347110106555, + "grad_norm": 0.6119173765182495, + "learning_rate": 0.0005434457493804546, + "loss": 3.8483, + "step": 8850 + }, + { + "epoch": 0.9579162630502637, + "grad_norm": 0.609887957572937, + "learning_rate": 0.0005431225083503932, + "loss": 3.8477, + "step": 8900 + }, + { + "epoch": 0.9632978150898719, + "grad_norm": 0.6084715127944946, + "learning_rate": 0.0005427992673203318, + "loss": 3.8383, + "step": 8950 + }, + { + "epoch": 0.9686793671294801, + "grad_norm": 0.6143397092819214, + "learning_rate": 0.0005424760262902704, + "loss": 3.8478, + "step": 9000 + }, + { + "epoch": 0.9686793671294801, + "eval_accuracy": 0.3404499163970355, + "eval_loss": 3.785356044769287, + "eval_runtime": 182.7059, + "eval_samples_per_second": 98.579, + "eval_steps_per_second": 6.163, + "step": 9000 + }, + { + "epoch": 0.9740609191690883, + "grad_norm": 0.5996887683868408, + "learning_rate": 0.000542152785260209, + "loss": 3.8303, + "step": 9050 + }, + { + "epoch": 0.9794424712086965, + "grad_norm": 0.5821929574012756, + "learning_rate": 0.0005418295442301476, + "loss": 3.8485, + "step": 9100 + }, + { + "epoch": 0.9848240232483048, + "grad_norm": 0.5654467940330505, + "learning_rate": 0.0005415063032000861, + "loss": 3.8499, + "step": 9150 + }, + { + "epoch": 0.9902055752879131, + "grad_norm": 0.6263448596000671, + "learning_rate": 0.0005411830621700248, + "loss": 3.8372, + "step": 9200 + }, + { + "epoch": 0.9955871273275213, + "grad_norm": 0.5956529974937439, + "learning_rate": 0.0005408598211399633, + "loss": 3.8455, + "step": 9250 + }, + { + "epoch": 1.0009686793671295, + "grad_norm": 0.5786877274513245, + "learning_rate": 0.0005405365801099019, + "loss": 3.8393, + "step": 9300 + }, + { + "epoch": 1.0063502314067376, + "grad_norm": 0.6130419373512268, + "learning_rate": 0.0005402133390798405, + "loss": 3.7682, + "step": 9350 + }, + { + "epoch": 1.011731783446346, + "grad_norm": 0.6110179424285889, + "learning_rate": 0.000539890098049779, + "loss": 3.7743, + "step": 9400 + }, + { + "epoch": 1.017113335485954, + "grad_norm": 0.5443481206893921, + "learning_rate": 0.0005395668570197177, + "loss": 3.7773, + "step": 9450 + }, + { + "epoch": 1.0224948875255624, + "grad_norm": 0.5167891383171082, + "learning_rate": 0.0005392436159896562, + "loss": 3.7604, + "step": 9500 + }, + { + "epoch": 1.0278764395651705, + "grad_norm": 0.647746741771698, + "learning_rate": 0.0005389203749595948, + "loss": 3.7733, + "step": 9550 + }, + { + "epoch": 1.0332579916047788, + "grad_norm": 0.5472586750984192, + "learning_rate": 0.0005385971339295334, + "loss": 3.7681, + "step": 9600 + }, + { + "epoch": 1.0386395436443872, + "grad_norm": 0.5735807418823242, + "learning_rate": 0.000538273892899472, + "loss": 3.7828, + "step": 9650 + }, + { + "epoch": 1.0440210956839953, + "grad_norm": 0.6225168704986572, + "learning_rate": 0.0005379506518694106, + "loss": 3.7848, + "step": 9700 + }, + { + "epoch": 1.0494026477236036, + "grad_norm": 0.5470787286758423, + "learning_rate": 0.0005376274108393491, + "loss": 3.7819, + "step": 9750 + }, + { + "epoch": 1.0547841997632117, + "grad_norm": 0.5520187616348267, + "learning_rate": 0.0005373041698092877, + "loss": 3.7574, + "step": 9800 + }, + { + "epoch": 1.06016575180282, + "grad_norm": 0.5762578845024109, + "learning_rate": 0.0005369809287792263, + "loss": 3.7761, + "step": 9850 + }, + { + "epoch": 1.0655473038424281, + "grad_norm": 0.5505574345588684, + "learning_rate": 0.000536657687749165, + "loss": 3.7765, + "step": 9900 + }, + { + "epoch": 1.0709288558820365, + "grad_norm": 0.6215469837188721, + "learning_rate": 0.0005363344467191035, + "loss": 3.7648, + "step": 9950 + }, + { + "epoch": 1.0763104079216446, + "grad_norm": 0.5857725739479065, + "learning_rate": 0.000536011205689042, + "loss": 3.7833, + "step": 10000 + }, + { + "epoch": 1.0763104079216446, + "eval_accuracy": 0.3440022139112421, + "eval_loss": 3.7528064250946045, + "eval_runtime": 182.9389, + "eval_samples_per_second": 98.454, + "eval_steps_per_second": 6.155, + "step": 10000 + }, + { + "epoch": 1.081691959961253, + "grad_norm": 0.5563129782676697, + "learning_rate": 0.0005356879646589807, + "loss": 3.7952, + "step": 10050 + }, + { + "epoch": 1.087073512000861, + "grad_norm": 0.5878000259399414, + "learning_rate": 0.0005353647236289192, + "loss": 3.7564, + "step": 10100 + }, + { + "epoch": 1.0924550640404693, + "grad_norm": 0.5479933619499207, + "learning_rate": 0.0005350414825988579, + "loss": 3.7828, + "step": 10150 + }, + { + "epoch": 1.0978366160800774, + "grad_norm": 0.6343529224395752, + "learning_rate": 0.0005347182415687964, + "loss": 3.7892, + "step": 10200 + }, + { + "epoch": 1.1032181681196858, + "grad_norm": 0.6030763387680054, + "learning_rate": 0.000534395000538735, + "loss": 3.779, + "step": 10250 + }, + { + "epoch": 1.1085997201592939, + "grad_norm": 0.6497897505760193, + "learning_rate": 0.0005340717595086736, + "loss": 3.7651, + "step": 10300 + }, + { + "epoch": 1.1139812721989022, + "grad_norm": 0.7304083704948425, + "learning_rate": 0.0005337549832992134, + "loss": 3.7863, + "step": 10350 + }, + { + "epoch": 1.1193628242385103, + "grad_norm": 0.5949697494506836, + "learning_rate": 0.000533431742269152, + "loss": 3.7632, + "step": 10400 + }, + { + "epoch": 1.1247443762781186, + "grad_norm": 0.6299553513526917, + "learning_rate": 0.0005331085012390905, + "loss": 3.7438, + "step": 10450 + }, + { + "epoch": 1.1301259283177267, + "grad_norm": 0.6024655103683472, + "learning_rate": 0.0005327852602090292, + "loss": 3.7659, + "step": 10500 + }, + { + "epoch": 1.135507480357335, + "grad_norm": 0.6806287169456482, + "learning_rate": 0.0005324620191789678, + "loss": 3.7551, + "step": 10550 + }, + { + "epoch": 1.1408890323969434, + "grad_norm": 0.5712878108024597, + "learning_rate": 0.0005321387781489063, + "loss": 3.7723, + "step": 10600 + }, + { + "epoch": 1.1462705844365515, + "grad_norm": 0.5776671171188354, + "learning_rate": 0.0005318155371188449, + "loss": 3.7644, + "step": 10650 + }, + { + "epoch": 1.1516521364761596, + "grad_norm": 0.5746056437492371, + "learning_rate": 0.0005314922960887834, + "loss": 3.7811, + "step": 10700 + }, + { + "epoch": 1.157033688515768, + "grad_norm": 0.5841644406318665, + "learning_rate": 0.0005311690550587221, + "loss": 3.767, + "step": 10750 + }, + { + "epoch": 1.1624152405553763, + "grad_norm": 0.6027348041534424, + "learning_rate": 0.0005308458140286607, + "loss": 3.7558, + "step": 10800 + }, + { + "epoch": 1.1677967925949844, + "grad_norm": 0.6047888994216919, + "learning_rate": 0.0005305225729985993, + "loss": 3.772, + "step": 10850 + }, + { + "epoch": 1.1731783446345927, + "grad_norm": 0.5457755327224731, + "learning_rate": 0.0005301993319685378, + "loss": 3.7738, + "step": 10900 + }, + { + "epoch": 1.1785598966742008, + "grad_norm": 0.6240300536155701, + "learning_rate": 0.0005298760909384765, + "loss": 3.7453, + "step": 10950 + }, + { + "epoch": 1.1839414487138091, + "grad_norm": 0.5848184823989868, + "learning_rate": 0.000529552849908415, + "loss": 3.7495, + "step": 11000 + }, + { + "epoch": 1.1839414487138091, + "eval_accuracy": 0.3466185754498148, + "eval_loss": 3.728104591369629, + "eval_runtime": 182.5387, + "eval_samples_per_second": 98.67, + "eval_steps_per_second": 6.169, + "step": 11000 + }, + { + "epoch": 1.1893230007534172, + "grad_norm": 0.6290469765663147, + "learning_rate": 0.0005292296088783535, + "loss": 3.7552, + "step": 11050 + }, + { + "epoch": 1.1947045527930256, + "grad_norm": 0.5949827432632446, + "learning_rate": 0.0005289063678482922, + "loss": 3.7403, + "step": 11100 + }, + { + "epoch": 1.2000861048326337, + "grad_norm": 0.5734910368919373, + "learning_rate": 0.0005285831268182307, + "loss": 3.7353, + "step": 11150 + }, + { + "epoch": 1.205467656872242, + "grad_norm": 0.5530065894126892, + "learning_rate": 0.0005282598857881694, + "loss": 3.7436, + "step": 11200 + }, + { + "epoch": 1.21084920891185, + "grad_norm": 0.6502180695533752, + "learning_rate": 0.0005279366447581079, + "loss": 3.7462, + "step": 11250 + }, + { + "epoch": 1.2162307609514584, + "grad_norm": 0.546893298625946, + "learning_rate": 0.0005276134037280465, + "loss": 3.7476, + "step": 11300 + }, + { + "epoch": 1.2216123129910665, + "grad_norm": 0.5351948738098145, + "learning_rate": 0.0005272901626979851, + "loss": 3.768, + "step": 11350 + }, + { + "epoch": 1.2269938650306749, + "grad_norm": 0.6159573793411255, + "learning_rate": 0.0005269669216679236, + "loss": 3.7397, + "step": 11400 + }, + { + "epoch": 1.232375417070283, + "grad_norm": 0.5443019270896912, + "learning_rate": 0.0005266436806378623, + "loss": 3.7555, + "step": 11450 + }, + { + "epoch": 1.2377569691098913, + "grad_norm": 0.547874927520752, + "learning_rate": 0.0005263204396078008, + "loss": 3.7532, + "step": 11500 + }, + { + "epoch": 1.2431385211494996, + "grad_norm": 0.672290563583374, + "learning_rate": 0.0005259971985777394, + "loss": 3.7461, + "step": 11550 + }, + { + "epoch": 1.2485200731891077, + "grad_norm": 0.6494690775871277, + "learning_rate": 0.000525673957547678, + "loss": 3.7317, + "step": 11600 + }, + { + "epoch": 1.2539016252287158, + "grad_norm": 0.5554096698760986, + "learning_rate": 0.0005253507165176167, + "loss": 3.7485, + "step": 11650 + }, + { + "epoch": 1.2592831772683242, + "grad_norm": 0.5914068222045898, + "learning_rate": 0.0005250274754875552, + "loss": 3.744, + "step": 11700 + }, + { + "epoch": 1.2646647293079325, + "grad_norm": 0.5564110279083252, + "learning_rate": 0.0005247042344574938, + "loss": 3.734, + "step": 11750 + }, + { + "epoch": 1.2700462813475406, + "grad_norm": 0.5499497056007385, + "learning_rate": 0.0005243809934274323, + "loss": 3.7457, + "step": 11800 + }, + { + "epoch": 1.275427833387149, + "grad_norm": 0.5675379633903503, + "learning_rate": 0.0005240577523973709, + "loss": 3.7338, + "step": 11850 + }, + { + "epoch": 1.280809385426757, + "grad_norm": 0.5705437660217285, + "learning_rate": 0.0005237345113673095, + "loss": 3.759, + "step": 11900 + }, + { + "epoch": 1.2861909374663654, + "grad_norm": 0.5672619342803955, + "learning_rate": 0.0005234112703372481, + "loss": 3.7245, + "step": 11950 + }, + { + "epoch": 1.2915724895059735, + "grad_norm": 0.553011417388916, + "learning_rate": 0.0005230880293071867, + "loss": 3.7503, + "step": 12000 + }, + { + "epoch": 1.2915724895059735, + "eval_accuracy": 0.34867276694683536, + "eval_loss": 3.701185703277588, + "eval_runtime": 183.0501, + "eval_samples_per_second": 98.394, + "eval_steps_per_second": 6.151, + "step": 12000 + }, + { + "epoch": 1.2969540415455818, + "grad_norm": 0.5194526314735413, + "learning_rate": 0.0005227647882771253, + "loss": 3.7416, + "step": 12050 + }, + { + "epoch": 1.30233559358519, + "grad_norm": 0.5847457051277161, + "learning_rate": 0.0005224415472470639, + "loss": 3.7315, + "step": 12100 + }, + { + "epoch": 1.3077171456247982, + "grad_norm": 0.6009708046913147, + "learning_rate": 0.0005221183062170024, + "loss": 3.7389, + "step": 12150 + }, + { + "epoch": 1.3130986976644063, + "grad_norm": 0.5650305151939392, + "learning_rate": 0.0005217950651869409, + "loss": 3.7334, + "step": 12200 + }, + { + "epoch": 1.3184802497040147, + "grad_norm": 0.4992525279521942, + "learning_rate": 0.0005214718241568796, + "loss": 3.7386, + "step": 12250 + }, + { + "epoch": 1.3238618017436228, + "grad_norm": 0.5877363681793213, + "learning_rate": 0.0005211485831268182, + "loss": 3.7243, + "step": 12300 + }, + { + "epoch": 1.329243353783231, + "grad_norm": 0.6050459146499634, + "learning_rate": 0.0005208253420967568, + "loss": 3.7248, + "step": 12350 + }, + { + "epoch": 1.3346249058228392, + "grad_norm": 0.5322745442390442, + "learning_rate": 0.0005205021010666953, + "loss": 3.7197, + "step": 12400 + }, + { + "epoch": 1.3400064578624475, + "grad_norm": 0.5668430328369141, + "learning_rate": 0.0005201788600366339, + "loss": 3.7243, + "step": 12450 + }, + { + "epoch": 1.3453880099020559, + "grad_norm": 0.6442492604255676, + "learning_rate": 0.0005198556190065725, + "loss": 3.7402, + "step": 12500 + }, + { + "epoch": 1.350769561941664, + "grad_norm": 0.593773365020752, + "learning_rate": 0.0005195323779765112, + "loss": 3.7295, + "step": 12550 + }, + { + "epoch": 1.356151113981272, + "grad_norm": 0.6152236461639404, + "learning_rate": 0.0005192091369464497, + "loss": 3.7121, + "step": 12600 + }, + { + "epoch": 1.3615326660208804, + "grad_norm": 0.579685628414154, + "learning_rate": 0.0005188858959163882, + "loss": 3.7207, + "step": 12650 + }, + { + "epoch": 1.3669142180604887, + "grad_norm": 0.5556007623672485, + "learning_rate": 0.0005185626548863269, + "loss": 3.7207, + "step": 12700 + }, + { + "epoch": 1.3722957701000968, + "grad_norm": 0.6135968565940857, + "learning_rate": 0.0005182394138562654, + "loss": 3.7287, + "step": 12750 + }, + { + "epoch": 1.3776773221397052, + "grad_norm": 0.5700423121452332, + "learning_rate": 0.0005179161728262041, + "loss": 3.7336, + "step": 12800 + }, + { + "epoch": 1.3830588741793133, + "grad_norm": 0.5285822749137878, + "learning_rate": 0.0005175929317961426, + "loss": 3.7093, + "step": 12850 + }, + { + "epoch": 1.3884404262189216, + "grad_norm": 0.5742802023887634, + "learning_rate": 0.0005172696907660812, + "loss": 3.7164, + "step": 12900 + }, + { + "epoch": 1.3938219782585297, + "grad_norm": 0.6038579940795898, + "learning_rate": 0.0005169464497360198, + "loss": 3.7429, + "step": 12950 + }, + { + "epoch": 1.399203530298138, + "grad_norm": 0.5283321738243103, + "learning_rate": 0.0005166232087059583, + "loss": 3.7205, + "step": 13000 + }, + { + "epoch": 1.399203530298138, + "eval_accuracy": 0.3511695016559243, + "eval_loss": 3.6827125549316406, + "eval_runtime": 183.2259, + "eval_samples_per_second": 98.299, + "eval_steps_per_second": 6.145, + "step": 13000 + }, + { + "epoch": 1.4045850823377461, + "grad_norm": 0.5694897770881653, + "learning_rate": 0.0005162999676758969, + "loss": 3.7338, + "step": 13050 + }, + { + "epoch": 1.4099666343773545, + "grad_norm": 0.5415697693824768, + "learning_rate": 0.0005159767266458355, + "loss": 3.7092, + "step": 13100 + }, + { + "epoch": 1.4153481864169626, + "grad_norm": 0.5408352017402649, + "learning_rate": 0.0005156534856157741, + "loss": 3.7065, + "step": 13150 + }, + { + "epoch": 1.420729738456571, + "grad_norm": 0.5147499442100525, + "learning_rate": 0.0005153302445857127, + "loss": 3.7059, + "step": 13200 + }, + { + "epoch": 1.426111290496179, + "grad_norm": 0.553383469581604, + "learning_rate": 0.0005150070035556513, + "loss": 3.7238, + "step": 13250 + }, + { + "epoch": 1.4314928425357873, + "grad_norm": 0.6054697632789612, + "learning_rate": 0.0005146837625255898, + "loss": 3.7054, + "step": 13300 + }, + { + "epoch": 1.4368743945753955, + "grad_norm": 0.5648375749588013, + "learning_rate": 0.0005143605214955285, + "loss": 3.7171, + "step": 13350 + }, + { + "epoch": 1.4422559466150038, + "grad_norm": 0.6254073977470398, + "learning_rate": 0.0005140372804654671, + "loss": 3.705, + "step": 13400 + }, + { + "epoch": 1.447637498654612, + "grad_norm": 0.6315209865570068, + "learning_rate": 0.0005137140394354056, + "loss": 3.7054, + "step": 13450 + }, + { + "epoch": 1.4530190506942202, + "grad_norm": 0.5474207997322083, + "learning_rate": 0.0005133907984053442, + "loss": 3.7168, + "step": 13500 + }, + { + "epoch": 1.4584006027338283, + "grad_norm": 0.5616752505302429, + "learning_rate": 0.0005130675573752827, + "loss": 3.7246, + "step": 13550 + }, + { + "epoch": 1.4637821547734367, + "grad_norm": 0.5460011959075928, + "learning_rate": 0.0005127443163452214, + "loss": 3.6915, + "step": 13600 + }, + { + "epoch": 1.469163706813045, + "grad_norm": 0.5980861186981201, + "learning_rate": 0.00051242107531516, + "loss": 3.7136, + "step": 13650 + }, + { + "epoch": 1.474545258852653, + "grad_norm": 0.6376727223396301, + "learning_rate": 0.0005120978342850986, + "loss": 3.6905, + "step": 13700 + }, + { + "epoch": 1.4799268108922612, + "grad_norm": 0.5472317337989807, + "learning_rate": 0.0005117745932550371, + "loss": 3.683, + "step": 13750 + }, + { + "epoch": 1.4853083629318695, + "grad_norm": 0.5763005614280701, + "learning_rate": 0.0005114513522249758, + "loss": 3.6773, + "step": 13800 + }, + { + "epoch": 1.4906899149714778, + "grad_norm": 0.5707553625106812, + "learning_rate": 0.0005111281111949143, + "loss": 3.6924, + "step": 13850 + }, + { + "epoch": 1.496071467011086, + "grad_norm": 0.5680550932884216, + "learning_rate": 0.0005108048701648528, + "loss": 3.7102, + "step": 13900 + }, + { + "epoch": 1.501453019050694, + "grad_norm": 0.6450196504592896, + "learning_rate": 0.0005104816291347915, + "loss": 3.6982, + "step": 13950 + }, + { + "epoch": 1.5068345710903024, + "grad_norm": 0.5898568630218506, + "learning_rate": 0.00051015838810473, + "loss": 3.6956, + "step": 14000 + }, + { + "epoch": 1.5068345710903024, + "eval_accuracy": 0.3528064660637373, + "eval_loss": 3.6613011360168457, + "eval_runtime": 182.9869, + "eval_samples_per_second": 98.428, + "eval_steps_per_second": 6.153, + "step": 14000 + }, + { + "epoch": 1.5122161231299107, + "grad_norm": 0.61032634973526, + "learning_rate": 0.0005098351470746687, + "loss": 3.7066, + "step": 14050 + }, + { + "epoch": 1.5175976751695188, + "grad_norm": 0.5454217791557312, + "learning_rate": 0.0005095119060446072, + "loss": 3.7021, + "step": 14100 + }, + { + "epoch": 1.5229792272091272, + "grad_norm": 0.5414226055145264, + "learning_rate": 0.0005091886650145458, + "loss": 3.7104, + "step": 14150 + }, + { + "epoch": 1.5283607792487355, + "grad_norm": 0.5376770496368408, + "learning_rate": 0.0005088654239844844, + "loss": 3.6958, + "step": 14200 + }, + { + "epoch": 1.5337423312883436, + "grad_norm": 0.5370742082595825, + "learning_rate": 0.0005085421829544229, + "loss": 3.6932, + "step": 14250 + }, + { + "epoch": 1.5391238833279517, + "grad_norm": 0.6070656776428223, + "learning_rate": 0.0005082189419243616, + "loss": 3.6957, + "step": 14300 + }, + { + "epoch": 1.54450543536756, + "grad_norm": 0.5474653244018555, + "learning_rate": 0.0005079021657149014, + "loss": 3.6808, + "step": 14350 + }, + { + "epoch": 1.5498869874071683, + "grad_norm": 0.5363737344741821, + "learning_rate": 0.0005075789246848399, + "loss": 3.6977, + "step": 14400 + }, + { + "epoch": 1.5552685394467765, + "grad_norm": 0.5836102366447449, + "learning_rate": 0.0005072556836547785, + "loss": 3.693, + "step": 14450 + }, + { + "epoch": 1.5606500914863846, + "grad_norm": 0.5627076625823975, + "learning_rate": 0.0005069324426247171, + "loss": 3.6993, + "step": 14500 + }, + { + "epoch": 1.566031643525993, + "grad_norm": 0.5487858057022095, + "learning_rate": 0.0005066092015946557, + "loss": 3.6824, + "step": 14550 + }, + { + "epoch": 1.5714131955656012, + "grad_norm": 0.5511041879653931, + "learning_rate": 0.0005062859605645943, + "loss": 3.6905, + "step": 14600 + }, + { + "epoch": 1.5767947476052093, + "grad_norm": 0.5165522694587708, + "learning_rate": 0.0005059627195345329, + "loss": 3.6881, + "step": 14650 + }, + { + "epoch": 1.5821762996448174, + "grad_norm": 0.5816965103149414, + "learning_rate": 0.0005056394785044715, + "loss": 3.697, + "step": 14700 + }, + { + "epoch": 1.5875578516844258, + "grad_norm": 0.55551677942276, + "learning_rate": 0.00050531623747441, + "loss": 3.6841, + "step": 14750 + }, + { + "epoch": 1.592939403724034, + "grad_norm": 0.5633440017700195, + "learning_rate": 0.0005049929964443486, + "loss": 3.7111, + "step": 14800 + }, + { + "epoch": 1.5983209557636422, + "grad_norm": 0.534885048866272, + "learning_rate": 0.0005046697554142871, + "loss": 3.6722, + "step": 14850 + }, + { + "epoch": 1.6037025078032503, + "grad_norm": 0.5363733768463135, + "learning_rate": 0.0005043465143842258, + "loss": 3.6866, + "step": 14900 + }, + { + "epoch": 1.6090840598428586, + "grad_norm": 0.7027775049209595, + "learning_rate": 0.0005040232733541644, + "loss": 3.6653, + "step": 14950 + }, + { + "epoch": 1.614465611882467, + "grad_norm": 0.5722295045852661, + "learning_rate": 0.000503700032324103, + "loss": 3.6897, + "step": 15000 + }, + { + "epoch": 1.614465611882467, + "eval_accuracy": 0.3547539604249241, + "eval_loss": 3.641711950302124, + "eval_runtime": 182.9321, + "eval_samples_per_second": 98.457, + "eval_steps_per_second": 6.155, + "step": 15000 + }, + { + "epoch": 1.619847163922075, + "grad_norm": 0.5616506934165955, + "learning_rate": 0.0005033767912940415, + "loss": 3.6878, + "step": 15050 + }, + { + "epoch": 1.6252287159616834, + "grad_norm": 0.5472654700279236, + "learning_rate": 0.0005030535502639802, + "loss": 3.7087, + "step": 15100 + }, + { + "epoch": 1.6306102680012917, + "grad_norm": 0.6255720853805542, + "learning_rate": 0.0005027303092339187, + "loss": 3.6824, + "step": 15150 + }, + { + "epoch": 1.6359918200408998, + "grad_norm": 0.49615591764450073, + "learning_rate": 0.0005024070682038573, + "loss": 3.6515, + "step": 15200 + }, + { + "epoch": 1.641373372080508, + "grad_norm": 0.559258759021759, + "learning_rate": 0.0005020838271737959, + "loss": 3.6834, + "step": 15250 + }, + { + "epoch": 1.6467549241201163, + "grad_norm": 0.572930097579956, + "learning_rate": 0.0005017605861437344, + "loss": 3.689, + "step": 15300 + }, + { + "epoch": 1.6521364761597246, + "grad_norm": 0.5224685668945312, + "learning_rate": 0.0005014373451136731, + "loss": 3.6909, + "step": 15350 + }, + { + "epoch": 1.6575180281993327, + "grad_norm": 0.591471791267395, + "learning_rate": 0.0005011141040836116, + "loss": 3.6753, + "step": 15400 + }, + { + "epoch": 1.6628995802389408, + "grad_norm": 0.5919022560119629, + "learning_rate": 0.0005007908630535503, + "loss": 3.687, + "step": 15450 + }, + { + "epoch": 1.6682811322785491, + "grad_norm": 0.5808023810386658, + "learning_rate": 0.0005004676220234888, + "loss": 3.6756, + "step": 15500 + }, + { + "epoch": 1.6736626843181575, + "grad_norm": 0.5336684584617615, + "learning_rate": 0.0005001443809934273, + "loss": 3.6766, + "step": 15550 + }, + { + "epoch": 1.6790442363577656, + "grad_norm": 0.622586190700531, + "learning_rate": 0.000499821139963366, + "loss": 3.689, + "step": 15600 + }, + { + "epoch": 1.6844257883973737, + "grad_norm": 0.5987013578414917, + "learning_rate": 0.0004994978989333045, + "loss": 3.6845, + "step": 15650 + }, + { + "epoch": 1.689807340436982, + "grad_norm": 0.6158627271652222, + "learning_rate": 0.0004991746579032431, + "loss": 3.6704, + "step": 15700 + }, + { + "epoch": 1.6951888924765903, + "grad_norm": 0.6078811287879944, + "learning_rate": 0.0004988514168731817, + "loss": 3.6819, + "step": 15750 + }, + { + "epoch": 1.7005704445161984, + "grad_norm": 0.5467185974121094, + "learning_rate": 0.0004985281758431204, + "loss": 3.6727, + "step": 15800 + }, + { + "epoch": 1.7059519965558065, + "grad_norm": 0.585032045841217, + "learning_rate": 0.0004982049348130589, + "loss": 3.6765, + "step": 15850 + }, + { + "epoch": 1.7113335485954149, + "grad_norm": 0.6013224720954895, + "learning_rate": 0.0004978816937829975, + "loss": 3.6628, + "step": 15900 + }, + { + "epoch": 1.7167151006350232, + "grad_norm": 0.5670937299728394, + "learning_rate": 0.000497558452752936, + "loss": 3.6655, + "step": 15950 + }, + { + "epoch": 1.7220966526746313, + "grad_norm": 0.5367864966392517, + "learning_rate": 0.0004972352117228746, + "loss": 3.6712, + "step": 16000 + }, + { + "epoch": 1.7220966526746313, + "eval_accuracy": 0.3568852954722929, + "eval_loss": 3.62510085105896, + "eval_runtime": 182.7792, + "eval_samples_per_second": 98.54, + "eval_steps_per_second": 6.16, + "step": 16000 + }, + { + "epoch": 1.7274782047142396, + "grad_norm": 0.5834771394729614, + "learning_rate": 0.0004969119706928133, + "loss": 3.6616, + "step": 16050 + }, + { + "epoch": 1.732859756753848, + "grad_norm": 0.552348256111145, + "learning_rate": 0.0004965887296627518, + "loss": 3.6731, + "step": 16100 + }, + { + "epoch": 1.738241308793456, + "grad_norm": 0.5560649037361145, + "learning_rate": 0.0004962654886326904, + "loss": 3.6797, + "step": 16150 + }, + { + "epoch": 1.7436228608330642, + "grad_norm": 0.5562814474105835, + "learning_rate": 0.000495942247602629, + "loss": 3.6658, + "step": 16200 + }, + { + "epoch": 1.7490044128726725, + "grad_norm": 0.5309078097343445, + "learning_rate": 0.0004956190065725676, + "loss": 3.6545, + "step": 16250 + }, + { + "epoch": 1.7543859649122808, + "grad_norm": 0.5403803586959839, + "learning_rate": 0.0004952957655425062, + "loss": 3.661, + "step": 16300 + }, + { + "epoch": 1.759767516951889, + "grad_norm": 0.5780831575393677, + "learning_rate": 0.0004949789893330459, + "loss": 3.6783, + "step": 16350 + }, + { + "epoch": 1.765149068991497, + "grad_norm": 0.5269186496734619, + "learning_rate": 0.0004946557483029846, + "loss": 3.6483, + "step": 16400 + }, + { + "epoch": 1.7705306210311054, + "grad_norm": 0.598962128162384, + "learning_rate": 0.0004943325072729231, + "loss": 3.6521, + "step": 16450 + }, + { + "epoch": 1.7759121730707137, + "grad_norm": 0.5480334758758545, + "learning_rate": 0.0004940092662428617, + "loss": 3.6613, + "step": 16500 + }, + { + "epoch": 1.7812937251103218, + "grad_norm": 0.5782729983329773, + "learning_rate": 0.0004936860252128003, + "loss": 3.6506, + "step": 16550 + }, + { + "epoch": 1.78667527714993, + "grad_norm": 0.5823274850845337, + "learning_rate": 0.0004933627841827388, + "loss": 3.6526, + "step": 16600 + }, + { + "epoch": 1.7920568291895382, + "grad_norm": 0.5701773762702942, + "learning_rate": 0.0004930395431526775, + "loss": 3.6625, + "step": 16650 + }, + { + "epoch": 1.7974383812291466, + "grad_norm": 0.5889811515808105, + "learning_rate": 0.0004927163021226161, + "loss": 3.6565, + "step": 16700 + }, + { + "epoch": 1.8028199332687547, + "grad_norm": 0.5805728435516357, + "learning_rate": 0.0004923930610925547, + "loss": 3.6663, + "step": 16750 + }, + { + "epoch": 1.8082014853083628, + "grad_norm": 0.5075141191482544, + "learning_rate": 0.0004920698200624932, + "loss": 3.6442, + "step": 16800 + }, + { + "epoch": 1.813583037347971, + "grad_norm": 0.5800615549087524, + "learning_rate": 0.0004917465790324317, + "loss": 3.6608, + "step": 16850 + }, + { + "epoch": 1.8189645893875794, + "grad_norm": 0.5941094160079956, + "learning_rate": 0.0004914233380023704, + "loss": 3.66, + "step": 16900 + }, + { + "epoch": 1.8243461414271875, + "grad_norm": 0.5344933867454529, + "learning_rate": 0.0004911000969723089, + "loss": 3.6616, + "step": 16950 + }, + { + "epoch": 1.8297276934667959, + "grad_norm": 0.6212116479873657, + "learning_rate": 0.0004907768559422476, + "loss": 3.6428, + "step": 17000 + }, + { + "epoch": 1.8297276934667959, + "eval_accuracy": 0.3578398110917431, + "eval_loss": 3.610550880432129, + "eval_runtime": 182.6501, + "eval_samples_per_second": 98.609, + "eval_steps_per_second": 6.165, + "step": 17000 + }, + { + "epoch": 1.8351092455064042, + "grad_norm": 0.6466174125671387, + "learning_rate": 0.0004904536149121861, + "loss": 3.6452, + "step": 17050 + }, + { + "epoch": 1.8404907975460123, + "grad_norm": 0.6004204750061035, + "learning_rate": 0.0004901303738821248, + "loss": 3.6527, + "step": 17100 + }, + { + "epoch": 1.8458723495856204, + "grad_norm": 0.5209426283836365, + "learning_rate": 0.0004898071328520633, + "loss": 3.6282, + "step": 17150 + }, + { + "epoch": 1.8512539016252287, + "grad_norm": 0.5604954361915588, + "learning_rate": 0.0004894838918220019, + "loss": 3.6619, + "step": 17200 + }, + { + "epoch": 1.856635453664837, + "grad_norm": 0.5392171144485474, + "learning_rate": 0.0004891606507919405, + "loss": 3.6596, + "step": 17250 + }, + { + "epoch": 1.8620170057044452, + "grad_norm": 0.5408145785331726, + "learning_rate": 0.000488837409761879, + "loss": 3.6552, + "step": 17300 + }, + { + "epoch": 1.8673985577440533, + "grad_norm": 0.5370055437088013, + "learning_rate": 0.0004885141687318177, + "loss": 3.6433, + "step": 17350 + }, + { + "epoch": 1.8727801097836616, + "grad_norm": 0.5374568700790405, + "learning_rate": 0.00048819092770175623, + "loss": 3.6328, + "step": 17400 + }, + { + "epoch": 1.87816166182327, + "grad_norm": 0.5178585648536682, + "learning_rate": 0.0004878676866716948, + "loss": 3.6565, + "step": 17450 + }, + { + "epoch": 1.883543213862878, + "grad_norm": 0.5513762831687927, + "learning_rate": 0.00048754444564163337, + "loss": 3.675, + "step": 17500 + }, + { + "epoch": 1.8889247659024861, + "grad_norm": 0.5339059829711914, + "learning_rate": 0.000487221204611572, + "loss": 3.6502, + "step": 17550 + }, + { + "epoch": 1.8943063179420945, + "grad_norm": 0.6063997745513916, + "learning_rate": 0.00048689796358151056, + "loss": 3.6437, + "step": 17600 + }, + { + "epoch": 1.8996878699817028, + "grad_norm": 0.5742690563201904, + "learning_rate": 0.00048657472255144915, + "loss": 3.646, + "step": 17650 + }, + { + "epoch": 1.905069422021311, + "grad_norm": 0.5425062775611877, + "learning_rate": 0.00048625148152138775, + "loss": 3.6322, + "step": 17700 + }, + { + "epoch": 1.910450974060919, + "grad_norm": 0.5563521981239319, + "learning_rate": 0.0004859282404913263, + "loss": 3.6437, + "step": 17750 + }, + { + "epoch": 1.9158325261005273, + "grad_norm": 0.5626557469367981, + "learning_rate": 0.0004856049994612649, + "loss": 3.6403, + "step": 17800 + }, + { + "epoch": 1.9212140781401357, + "grad_norm": 0.5019347071647644, + "learning_rate": 0.00048528175843120353, + "loss": 3.6476, + "step": 17850 + }, + { + "epoch": 1.9265956301797438, + "grad_norm": 0.6100844144821167, + "learning_rate": 0.0004849585174011421, + "loss": 3.6504, + "step": 17900 + }, + { + "epoch": 1.931977182219352, + "grad_norm": 0.5230750441551208, + "learning_rate": 0.00048463527637108067, + "loss": 3.6293, + "step": 17950 + }, + { + "epoch": 1.9373587342589604, + "grad_norm": 0.5869380831718445, + "learning_rate": 0.0004843120353410192, + "loss": 3.6477, + "step": 18000 + }, + { + "epoch": 1.9373587342589604, + "eval_accuracy": 0.35974362699202844, + "eval_loss": 3.5920450687408447, + "eval_runtime": 182.6734, + "eval_samples_per_second": 98.597, + "eval_steps_per_second": 6.164, + "step": 18000 + }, + { + "epoch": 1.9427402862985685, + "grad_norm": 0.5855605006217957, + "learning_rate": 0.0004839887943109578, + "loss": 3.6606, + "step": 18050 + }, + { + "epoch": 1.9481218383381766, + "grad_norm": 0.6179500222206116, + "learning_rate": 0.00048366555328089645, + "loss": 3.6605, + "step": 18100 + }, + { + "epoch": 1.953503390377785, + "grad_norm": 0.5851153135299683, + "learning_rate": 0.000483342312250835, + "loss": 3.6513, + "step": 18150 + }, + { + "epoch": 1.9588849424173933, + "grad_norm": 0.610054612159729, + "learning_rate": 0.0004830190712207736, + "loss": 3.6214, + "step": 18200 + }, + { + "epoch": 1.9642664944570014, + "grad_norm": 0.6112696528434753, + "learning_rate": 0.0004826958301907122, + "loss": 3.6286, + "step": 18250 + }, + { + "epoch": 1.9696480464966095, + "grad_norm": 0.5437807440757751, + "learning_rate": 0.0004823725891606507, + "loss": 3.6459, + "step": 18300 + }, + { + "epoch": 1.9750295985362178, + "grad_norm": 0.5649225115776062, + "learning_rate": 0.0004820493481305893, + "loss": 3.6386, + "step": 18350 + }, + { + "epoch": 1.9804111505758262, + "grad_norm": 0.5825486183166504, + "learning_rate": 0.00048172610710052797, + "loss": 3.6294, + "step": 18400 + }, + { + "epoch": 1.9857927026154343, + "grad_norm": 0.5508699417114258, + "learning_rate": 0.0004814093308910677, + "loss": 3.6338, + "step": 18450 + }, + { + "epoch": 1.9911742546550424, + "grad_norm": 0.5655311942100525, + "learning_rate": 0.00048108608986100637, + "loss": 3.6588, + "step": 18500 + }, + { + "epoch": 1.9965558066946507, + "grad_norm": 0.5678587555885315, + "learning_rate": 0.0004807628488309449, + "loss": 3.6348, + "step": 18550 + }, + { + "epoch": 2.001937358734259, + "grad_norm": 0.5703258514404297, + "learning_rate": 0.0004804396078008835, + "loss": 3.6138, + "step": 18600 + }, + { + "epoch": 2.007318910773867, + "grad_norm": 0.5701274871826172, + "learning_rate": 0.00048011636677082204, + "loss": 3.5586, + "step": 18650 + }, + { + "epoch": 2.0127004628134753, + "grad_norm": 0.567638635635376, + "learning_rate": 0.00047979312574076064, + "loss": 3.5602, + "step": 18700 + }, + { + "epoch": 2.018082014853084, + "grad_norm": 0.5551114678382874, + "learning_rate": 0.0004794698847106992, + "loss": 3.5546, + "step": 18750 + }, + { + "epoch": 2.023463566892692, + "grad_norm": 0.6258234977722168, + "learning_rate": 0.0004791466436806378, + "loss": 3.5536, + "step": 18800 + }, + { + "epoch": 2.0288451189323, + "grad_norm": 0.7323607206344604, + "learning_rate": 0.0004788234026505764, + "loss": 3.5522, + "step": 18850 + }, + { + "epoch": 2.034226670971908, + "grad_norm": 0.5760945677757263, + "learning_rate": 0.00047850016162051496, + "loss": 3.5633, + "step": 18900 + }, + { + "epoch": 2.0396082230115167, + "grad_norm": 0.561445951461792, + "learning_rate": 0.00047817692059045356, + "loss": 3.5486, + "step": 18950 + }, + { + "epoch": 2.044989775051125, + "grad_norm": 0.5980292558670044, + "learning_rate": 0.00047786014438099336, + "loss": 3.5527, + "step": 19000 + }, + { + "epoch": 2.044989775051125, + "eval_accuracy": 0.36137385492079693, + "eval_loss": 3.583522081375122, + "eval_runtime": 182.8509, + "eval_samples_per_second": 98.501, + "eval_steps_per_second": 6.158, + "step": 19000 + }, + { + "epoch": 2.050371327090733, + "grad_norm": 0.5768479108810425, + "learning_rate": 0.00047753690335093196, + "loss": 3.5578, + "step": 19050 + }, + { + "epoch": 2.055752879130341, + "grad_norm": 0.5865322947502136, + "learning_rate": 0.00047721366232087055, + "loss": 3.5501, + "step": 19100 + }, + { + "epoch": 2.0611344311699495, + "grad_norm": 0.5691093802452087, + "learning_rate": 0.00047689042129080915, + "loss": 3.544, + "step": 19150 + }, + { + "epoch": 2.0665159832095576, + "grad_norm": 0.6482433080673218, + "learning_rate": 0.00047656718026074774, + "loss": 3.5627, + "step": 19200 + }, + { + "epoch": 2.0718975352491658, + "grad_norm": 0.5731157064437866, + "learning_rate": 0.00047624393923068634, + "loss": 3.548, + "step": 19250 + }, + { + "epoch": 2.0772790872887743, + "grad_norm": 0.5512408018112183, + "learning_rate": 0.0004759206982006249, + "loss": 3.5753, + "step": 19300 + }, + { + "epoch": 2.0826606393283824, + "grad_norm": 0.5742067098617554, + "learning_rate": 0.00047559745717056347, + "loss": 3.5479, + "step": 19350 + }, + { + "epoch": 2.0880421913679905, + "grad_norm": 0.5789666175842285, + "learning_rate": 0.000475274216140502, + "loss": 3.5566, + "step": 19400 + }, + { + "epoch": 2.0934237434075986, + "grad_norm": 0.631552517414093, + "learning_rate": 0.00047495097511044066, + "loss": 3.5637, + "step": 19450 + }, + { + "epoch": 2.098805295447207, + "grad_norm": 0.5673854351043701, + "learning_rate": 0.00047462773408037925, + "loss": 3.5659, + "step": 19500 + }, + { + "epoch": 2.1041868474868153, + "grad_norm": 0.6008737683296204, + "learning_rate": 0.0004743044930503178, + "loss": 3.5755, + "step": 19550 + }, + { + "epoch": 2.1095683995264234, + "grad_norm": 0.5127817392349243, + "learning_rate": 0.0004739812520202564, + "loss": 3.576, + "step": 19600 + }, + { + "epoch": 2.1149499515660315, + "grad_norm": 0.5636597871780396, + "learning_rate": 0.000473658010990195, + "loss": 3.5646, + "step": 19650 + }, + { + "epoch": 2.12033150360564, + "grad_norm": 0.566287636756897, + "learning_rate": 0.0004733347699601336, + "loss": 3.5496, + "step": 19700 + }, + { + "epoch": 2.125713055645248, + "grad_norm": 0.5994743704795837, + "learning_rate": 0.0004730115289300722, + "loss": 3.558, + "step": 19750 + }, + { + "epoch": 2.1310946076848563, + "grad_norm": 0.5499207377433777, + "learning_rate": 0.00047268828790001077, + "loss": 3.5618, + "step": 19800 + }, + { + "epoch": 2.1364761597244644, + "grad_norm": 0.5498175621032715, + "learning_rate": 0.0004723650468699493, + "loss": 3.5618, + "step": 19850 + }, + { + "epoch": 2.141857711764073, + "grad_norm": 0.5704463720321655, + "learning_rate": 0.0004720418058398879, + "loss": 3.5423, + "step": 19900 + }, + { + "epoch": 2.147239263803681, + "grad_norm": 0.5655784606933594, + "learning_rate": 0.00047171856480982644, + "loss": 3.5586, + "step": 19950 + }, + { + "epoch": 2.152620815843289, + "grad_norm": 0.5610454678535461, + "learning_rate": 0.0004713953237797651, + "loss": 3.5691, + "step": 20000 + }, + { + "epoch": 2.152620815843289, + "eval_accuracy": 0.36227817290607683, + "eval_loss": 3.5729849338531494, + "eval_runtime": 182.9497, + "eval_samples_per_second": 98.448, + "eval_steps_per_second": 6.155, + "step": 20000 + }, + { + "epoch": 2.1580023678828972, + "grad_norm": 0.5196320414543152, + "learning_rate": 0.0004710720827497037, + "loss": 3.5614, + "step": 20050 + }, + { + "epoch": 2.163383919922506, + "grad_norm": 0.5847726464271545, + "learning_rate": 0.00047074884171964223, + "loss": 3.5468, + "step": 20100 + }, + { + "epoch": 2.168765471962114, + "grad_norm": 0.5680404901504517, + "learning_rate": 0.0004704256006895808, + "loss": 3.5578, + "step": 20150 + }, + { + "epoch": 2.174147024001722, + "grad_norm": 0.5938388705253601, + "learning_rate": 0.00047010235965951936, + "loss": 3.5858, + "step": 20200 + }, + { + "epoch": 2.1795285760413305, + "grad_norm": 0.6427321434020996, + "learning_rate": 0.00046977911862945796, + "loss": 3.5649, + "step": 20250 + }, + { + "epoch": 2.1849101280809387, + "grad_norm": 0.578403651714325, + "learning_rate": 0.0004694558775993966, + "loss": 3.5625, + "step": 20300 + }, + { + "epoch": 2.1902916801205468, + "grad_norm": 0.5661942362785339, + "learning_rate": 0.00046913263656933515, + "loss": 3.5531, + "step": 20350 + }, + { + "epoch": 2.195673232160155, + "grad_norm": 0.603402853012085, + "learning_rate": 0.00046880939553927374, + "loss": 3.5608, + "step": 20400 + }, + { + "epoch": 2.2010547841997634, + "grad_norm": 0.5676315426826477, + "learning_rate": 0.00046848615450921234, + "loss": 3.5555, + "step": 20450 + }, + { + "epoch": 2.2064363362393715, + "grad_norm": 0.592079222202301, + "learning_rate": 0.0004681629134791509, + "loss": 3.5641, + "step": 20500 + }, + { + "epoch": 2.2118178882789796, + "grad_norm": 0.5975874662399292, + "learning_rate": 0.00046783967244908947, + "loss": 3.5477, + "step": 20550 + }, + { + "epoch": 2.2171994403185877, + "grad_norm": 0.6059330701828003, + "learning_rate": 0.0004675164314190281, + "loss": 3.5673, + "step": 20600 + }, + { + "epoch": 2.2225809923581963, + "grad_norm": 0.5840981602668762, + "learning_rate": 0.00046719319038896666, + "loss": 3.5702, + "step": 20650 + }, + { + "epoch": 2.2279625443978044, + "grad_norm": 0.5514042377471924, + "learning_rate": 0.00046686994935890526, + "loss": 3.5504, + "step": 20700 + }, + { + "epoch": 2.2333440964374125, + "grad_norm": 0.5639162659645081, + "learning_rate": 0.0004665467083288438, + "loss": 3.5866, + "step": 20750 + }, + { + "epoch": 2.2387256484770206, + "grad_norm": 0.5674893260002136, + "learning_rate": 0.0004662234672987824, + "loss": 3.5668, + "step": 20800 + }, + { + "epoch": 2.244107200516629, + "grad_norm": 0.6036685109138489, + "learning_rate": 0.00046590022626872104, + "loss": 3.5557, + "step": 20850 + }, + { + "epoch": 2.2494887525562373, + "grad_norm": 0.5608086585998535, + "learning_rate": 0.0004655769852386596, + "loss": 3.5617, + "step": 20900 + }, + { + "epoch": 2.2548703045958454, + "grad_norm": 0.6411144137382507, + "learning_rate": 0.0004652537442085982, + "loss": 3.5523, + "step": 20950 + }, + { + "epoch": 2.2602518566354535, + "grad_norm": 0.5787631273269653, + "learning_rate": 0.00046493050317853677, + "loss": 3.5718, + "step": 21000 + }, + { + "epoch": 2.2602518566354535, + "eval_accuracy": 0.3634697691267383, + "eval_loss": 3.564570665359497, + "eval_runtime": 183.029, + "eval_samples_per_second": 98.405, + "eval_steps_per_second": 6.152, + "step": 21000 + }, + { + "epoch": 2.265633408675062, + "grad_norm": 0.6253136992454529, + "learning_rate": 0.0004646072621484753, + "loss": 3.5701, + "step": 21050 + }, + { + "epoch": 2.27101496071467, + "grad_norm": 0.6177113652229309, + "learning_rate": 0.0004642840211184139, + "loss": 3.575, + "step": 21100 + }, + { + "epoch": 2.2763965127542782, + "grad_norm": 0.6227148771286011, + "learning_rate": 0.00046396078008835255, + "loss": 3.5739, + "step": 21150 + }, + { + "epoch": 2.281778064793887, + "grad_norm": 0.6199923157691956, + "learning_rate": 0.0004636375390582911, + "loss": 3.5579, + "step": 21200 + }, + { + "epoch": 2.287159616833495, + "grad_norm": 0.5631394982337952, + "learning_rate": 0.0004633142980282297, + "loss": 3.5603, + "step": 21250 + }, + { + "epoch": 2.292541168873103, + "grad_norm": 0.5839282870292664, + "learning_rate": 0.00046299105699816823, + "loss": 3.5685, + "step": 21300 + }, + { + "epoch": 2.297922720912711, + "grad_norm": 0.5705982446670532, + "learning_rate": 0.0004626678159681068, + "loss": 3.5749, + "step": 21350 + }, + { + "epoch": 2.303304272952319, + "grad_norm": 0.5807977914810181, + "learning_rate": 0.0004623445749380454, + "loss": 3.5513, + "step": 21400 + }, + { + "epoch": 2.3086858249919278, + "grad_norm": 0.5916268229484558, + "learning_rate": 0.000462021333907984, + "loss": 3.5706, + "step": 21450 + }, + { + "epoch": 2.314067377031536, + "grad_norm": 0.5938501358032227, + "learning_rate": 0.0004616980928779226, + "loss": 3.5709, + "step": 21500 + }, + { + "epoch": 2.319448929071144, + "grad_norm": 0.5628405809402466, + "learning_rate": 0.0004613748518478612, + "loss": 3.5559, + "step": 21550 + }, + { + "epoch": 2.3248304811107525, + "grad_norm": 0.5705186128616333, + "learning_rate": 0.00046105161081779974, + "loss": 3.5768, + "step": 21600 + }, + { + "epoch": 2.3302120331503606, + "grad_norm": 0.5919448137283325, + "learning_rate": 0.00046072836978773834, + "loss": 3.5586, + "step": 21650 + }, + { + "epoch": 2.3355935851899687, + "grad_norm": 0.5722348690032959, + "learning_rate": 0.000460405128757677, + "loss": 3.5473, + "step": 21700 + }, + { + "epoch": 2.340975137229577, + "grad_norm": 0.5616813898086548, + "learning_rate": 0.0004600818877276155, + "loss": 3.5644, + "step": 21750 + }, + { + "epoch": 2.3463566892691854, + "grad_norm": 0.5710437297821045, + "learning_rate": 0.0004597586466975541, + "loss": 3.5366, + "step": 21800 + }, + { + "epoch": 2.3517382413087935, + "grad_norm": 0.5971595644950867, + "learning_rate": 0.00045943540566749266, + "loss": 3.5574, + "step": 21850 + }, + { + "epoch": 2.3571197933484016, + "grad_norm": 0.6181142330169678, + "learning_rate": 0.00045911216463743126, + "loss": 3.5667, + "step": 21900 + }, + { + "epoch": 2.3625013453880097, + "grad_norm": 0.6226775646209717, + "learning_rate": 0.00045878892360736985, + "loss": 3.5504, + "step": 21950 + }, + { + "epoch": 2.3678828974276183, + "grad_norm": 0.6260030269622803, + "learning_rate": 0.00045846568257730845, + "loss": 3.5632, + "step": 22000 + }, + { + "epoch": 2.3678828974276183, + "eval_accuracy": 0.3642475064977143, + "eval_loss": 3.552438497543335, + "eval_runtime": 183.1631, + "eval_samples_per_second": 98.333, + "eval_steps_per_second": 6.148, + "step": 22000 + }, + { + "epoch": 2.3732644494672264, + "grad_norm": 0.5650054216384888, + "learning_rate": 0.00045814244154724704, + "loss": 3.5589, + "step": 22050 + }, + { + "epoch": 2.3786460015068345, + "grad_norm": 0.5674408078193665, + "learning_rate": 0.00045781920051718563, + "loss": 3.5569, + "step": 22100 + }, + { + "epoch": 2.384027553546443, + "grad_norm": 0.6000884175300598, + "learning_rate": 0.0004574959594871242, + "loss": 3.5644, + "step": 22150 + }, + { + "epoch": 2.389409105586051, + "grad_norm": 0.6201997399330139, + "learning_rate": 0.00045717271845706277, + "loss": 3.5686, + "step": 22200 + }, + { + "epoch": 2.3947906576256592, + "grad_norm": 0.5663855075836182, + "learning_rate": 0.0004568494774270013, + "loss": 3.5715, + "step": 22250 + }, + { + "epoch": 2.4001722096652673, + "grad_norm": 0.6110810041427612, + "learning_rate": 0.00045652623639693996, + "loss": 3.5599, + "step": 22300 + }, + { + "epoch": 2.4055537617048754, + "grad_norm": 0.5673940181732178, + "learning_rate": 0.00045620299536687855, + "loss": 3.5529, + "step": 22350 + }, + { + "epoch": 2.410935313744484, + "grad_norm": 0.5478566288948059, + "learning_rate": 0.0004558797543368171, + "loss": 3.5546, + "step": 22400 + }, + { + "epoch": 2.416316865784092, + "grad_norm": 0.5602691769599915, + "learning_rate": 0.0004555565133067557, + "loss": 3.5553, + "step": 22450 + }, + { + "epoch": 2.4216984178237, + "grad_norm": 0.6060113906860352, + "learning_rate": 0.0004552332722766943, + "loss": 3.5723, + "step": 22500 + }, + { + "epoch": 2.4270799698633088, + "grad_norm": 0.607200562953949, + "learning_rate": 0.0004549100312466328, + "loss": 3.5494, + "step": 22550 + }, + { + "epoch": 2.432461521902917, + "grad_norm": 0.5753543376922607, + "learning_rate": 0.0004545867902165715, + "loss": 3.5594, + "step": 22600 + }, + { + "epoch": 2.437843073942525, + "grad_norm": 1.2295790910720825, + "learning_rate": 0.00045426354918651007, + "loss": 3.5365, + "step": 22650 + }, + { + "epoch": 2.443224625982133, + "grad_norm": 0.6025099754333496, + "learning_rate": 0.0004539403081564486, + "loss": 3.5705, + "step": 22700 + }, + { + "epoch": 2.4486061780217416, + "grad_norm": 0.7182654142379761, + "learning_rate": 0.0004536170671263872, + "loss": 3.5582, + "step": 22750 + }, + { + "epoch": 2.4539877300613497, + "grad_norm": 0.5721381306648254, + "learning_rate": 0.00045329382609632574, + "loss": 3.5563, + "step": 22800 + }, + { + "epoch": 2.459369282100958, + "grad_norm": 0.5419154763221741, + "learning_rate": 0.0004529705850662644, + "loss": 3.562, + "step": 22850 + }, + { + "epoch": 2.464750834140566, + "grad_norm": 0.5342304706573486, + "learning_rate": 0.000452647344036203, + "loss": 3.5658, + "step": 22900 + }, + { + "epoch": 2.4701323861801745, + "grad_norm": 0.6328591108322144, + "learning_rate": 0.00045232410300614153, + "loss": 3.5421, + "step": 22950 + }, + { + "epoch": 2.4755139382197826, + "grad_norm": 0.556786060333252, + "learning_rate": 0.0004520008619760801, + "loss": 3.5554, + "step": 23000 + }, + { + "epoch": 2.4755139382197826, + "eval_accuracy": 0.3653008962450757, + "eval_loss": 3.5433034896850586, + "eval_runtime": 182.589, + "eval_samples_per_second": 98.642, + "eval_steps_per_second": 6.167, + "step": 23000 + }, + { + "epoch": 2.4808954902593907, + "grad_norm": 0.5863376259803772, + "learning_rate": 0.00045168408576661993, + "loss": 3.5371, + "step": 23050 + }, + { + "epoch": 2.4862770422989993, + "grad_norm": 0.5524567365646362, + "learning_rate": 0.0004513608447365585, + "loss": 3.5489, + "step": 23100 + }, + { + "epoch": 2.4916585943386074, + "grad_norm": 0.5922284722328186, + "learning_rate": 0.00045103760370649706, + "loss": 3.5695, + "step": 23150 + }, + { + "epoch": 2.4970401463782155, + "grad_norm": 0.5904964804649353, + "learning_rate": 0.00045071436267643566, + "loss": 3.551, + "step": 23200 + }, + { + "epoch": 2.5024216984178236, + "grad_norm": 0.5709808468818665, + "learning_rate": 0.0004503911216463743, + "loss": 3.5497, + "step": 23250 + }, + { + "epoch": 2.5078032504574317, + "grad_norm": 0.6329324841499329, + "learning_rate": 0.00045006788061631285, + "loss": 3.546, + "step": 23300 + }, + { + "epoch": 2.5131848024970402, + "grad_norm": 0.5848966836929321, + "learning_rate": 0.00044974463958625144, + "loss": 3.5475, + "step": 23350 + }, + { + "epoch": 2.5185663545366483, + "grad_norm": 0.6118052005767822, + "learning_rate": 0.00044942139855619004, + "loss": 3.5342, + "step": 23400 + }, + { + "epoch": 2.5239479065762565, + "grad_norm": 0.5881749391555786, + "learning_rate": 0.0004490981575261286, + "loss": 3.5452, + "step": 23450 + }, + { + "epoch": 2.529329458615865, + "grad_norm": 0.5847213864326477, + "learning_rate": 0.0004487749164960672, + "loss": 3.5521, + "step": 23500 + }, + { + "epoch": 2.534711010655473, + "grad_norm": 0.6213960647583008, + "learning_rate": 0.0004484516754660058, + "loss": 3.5339, + "step": 23550 + }, + { + "epoch": 2.540092562695081, + "grad_norm": 0.6048945784568787, + "learning_rate": 0.00044812843443594436, + "loss": 3.5714, + "step": 23600 + }, + { + "epoch": 2.5454741147346893, + "grad_norm": 0.6373373866081238, + "learning_rate": 0.00044780519340588296, + "loss": 3.5608, + "step": 23650 + }, + { + "epoch": 2.550855666774298, + "grad_norm": 0.5849354863166809, + "learning_rate": 0.0004474819523758215, + "loss": 3.5378, + "step": 23700 + }, + { + "epoch": 2.556237218813906, + "grad_norm": 0.5594618916511536, + "learning_rate": 0.0004471587113457601, + "loss": 3.5356, + "step": 23750 + }, + { + "epoch": 2.561618770853514, + "grad_norm": 0.6139757633209229, + "learning_rate": 0.00044683547031569874, + "loss": 3.5414, + "step": 23800 + }, + { + "epoch": 2.567000322893122, + "grad_norm": 0.5510112643241882, + "learning_rate": 0.0004465122292856373, + "loss": 3.5638, + "step": 23850 + }, + { + "epoch": 2.5723818749327307, + "grad_norm": 0.6046518087387085, + "learning_rate": 0.0004461889882555759, + "loss": 3.5382, + "step": 23900 + }, + { + "epoch": 2.577763426972339, + "grad_norm": 0.5786885619163513, + "learning_rate": 0.00044586574722551447, + "loss": 3.571, + "step": 23950 + }, + { + "epoch": 2.583144979011947, + "grad_norm": 0.5536140203475952, + "learning_rate": 0.000445542506195453, + "loss": 3.5367, + "step": 24000 + }, + { + "epoch": 2.583144979011947, + "eval_accuracy": 0.36611677077967464, + "eval_loss": 3.534451484680176, + "eval_runtime": 183.1729, + "eval_samples_per_second": 98.328, + "eval_steps_per_second": 6.147, + "step": 24000 + }, + { + "epoch": 2.5885265310515555, + "grad_norm": 0.5413175225257874, + "learning_rate": 0.0004452192651653916, + "loss": 3.5572, + "step": 24050 + }, + { + "epoch": 2.5939080830911636, + "grad_norm": 0.614355206489563, + "learning_rate": 0.00044489602413533025, + "loss": 3.551, + "step": 24100 + }, + { + "epoch": 2.5992896351307717, + "grad_norm": 0.5778051018714905, + "learning_rate": 0.0004445727831052688, + "loss": 3.551, + "step": 24150 + }, + { + "epoch": 2.60467118717038, + "grad_norm": 0.5734228491783142, + "learning_rate": 0.0004442495420752074, + "loss": 3.5781, + "step": 24200 + }, + { + "epoch": 2.610052739209988, + "grad_norm": 0.6527189016342163, + "learning_rate": 0.00044392630104514593, + "loss": 3.5606, + "step": 24250 + }, + { + "epoch": 2.6154342912495965, + "grad_norm": 0.5808097720146179, + "learning_rate": 0.0004436030600150845, + "loss": 3.539, + "step": 24300 + }, + { + "epoch": 2.6208158432892046, + "grad_norm": 0.578321099281311, + "learning_rate": 0.0004432798189850231, + "loss": 3.5517, + "step": 24350 + }, + { + "epoch": 2.6261973953288127, + "grad_norm": 0.5915061831474304, + "learning_rate": 0.0004429565779549617, + "loss": 3.5437, + "step": 24400 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.6535822749137878, + "learning_rate": 0.0004426333369249003, + "loss": 3.5527, + "step": 24450 + }, + { + "epoch": 2.6369604994080293, + "grad_norm": 0.5836544036865234, + "learning_rate": 0.0004423100958948389, + "loss": 3.5435, + "step": 24500 + }, + { + "epoch": 2.6423420514476375, + "grad_norm": 0.6255151629447937, + "learning_rate": 0.00044198685486477744, + "loss": 3.5353, + "step": 24550 + }, + { + "epoch": 2.6477236034872456, + "grad_norm": 0.5904508829116821, + "learning_rate": 0.00044166361383471604, + "loss": 3.5546, + "step": 24600 + }, + { + "epoch": 2.653105155526854, + "grad_norm": 0.5963026285171509, + "learning_rate": 0.0004413403728046547, + "loss": 3.5417, + "step": 24650 + }, + { + "epoch": 2.658486707566462, + "grad_norm": 0.5733145475387573, + "learning_rate": 0.0004410171317745932, + "loss": 3.5371, + "step": 24700 + }, + { + "epoch": 2.6638682596060703, + "grad_norm": 0.574026346206665, + "learning_rate": 0.0004406938907445318, + "loss": 3.5503, + "step": 24750 + }, + { + "epoch": 2.6692498116456784, + "grad_norm": 0.5410118103027344, + "learning_rate": 0.00044037064971447036, + "loss": 3.5505, + "step": 24800 + }, + { + "epoch": 2.674631363685287, + "grad_norm": 0.5905002951622009, + "learning_rate": 0.00044004740868440896, + "loss": 3.5496, + "step": 24850 + }, + { + "epoch": 2.680012915724895, + "grad_norm": 0.5615861415863037, + "learning_rate": 0.00043972416765434755, + "loss": 3.5361, + "step": 24900 + }, + { + "epoch": 2.685394467764503, + "grad_norm": 0.5930156111717224, + "learning_rate": 0.00043940092662428615, + "loss": 3.5658, + "step": 24950 + }, + { + "epoch": 2.6907760198041117, + "grad_norm": 0.636413037776947, + "learning_rate": 0.00043907768559422474, + "loss": 3.543, + "step": 25000 + }, + { + "epoch": 2.6907760198041117, + "eval_accuracy": 0.36713582621448665, + "eval_loss": 3.526466131210327, + "eval_runtime": 182.6832, + "eval_samples_per_second": 98.591, + "eval_steps_per_second": 6.164, + "step": 25000 + }, + { + "epoch": 2.69615757184372, + "grad_norm": 0.5980111360549927, + "learning_rate": 0.00043875444456416334, + "loss": 3.5362, + "step": 25050 + }, + { + "epoch": 2.701539123883328, + "grad_norm": 0.5899256467819214, + "learning_rate": 0.0004384312035341019, + "loss": 3.5305, + "step": 25100 + }, + { + "epoch": 2.706920675922936, + "grad_norm": 0.6567667722702026, + "learning_rate": 0.00043810796250404047, + "loss": 3.5296, + "step": 25150 + }, + { + "epoch": 2.712302227962544, + "grad_norm": 0.6192686557769775, + "learning_rate": 0.0004377911862945803, + "loss": 3.5487, + "step": 25200 + }, + { + "epoch": 2.7176837800021527, + "grad_norm": 0.5786051154136658, + "learning_rate": 0.00043746794526451887, + "loss": 3.5591, + "step": 25250 + }, + { + "epoch": 2.723065332041761, + "grad_norm": 0.6026502847671509, + "learning_rate": 0.00043714470423445747, + "loss": 3.5304, + "step": 25300 + }, + { + "epoch": 2.728446884081369, + "grad_norm": 0.6267446279525757, + "learning_rate": 0.00043682146320439606, + "loss": 3.5329, + "step": 25350 + }, + { + "epoch": 2.7338284361209775, + "grad_norm": 0.617989182472229, + "learning_rate": 0.00043649822217433466, + "loss": 3.5411, + "step": 25400 + }, + { + "epoch": 2.7392099881605856, + "grad_norm": 0.5602372288703918, + "learning_rate": 0.0004361749811442732, + "loss": 3.5651, + "step": 25450 + }, + { + "epoch": 2.7445915402001937, + "grad_norm": 0.5438847541809082, + "learning_rate": 0.0004358517401142118, + "loss": 3.5549, + "step": 25500 + }, + { + "epoch": 2.749973092239802, + "grad_norm": 0.5827850103378296, + "learning_rate": 0.00043552849908415033, + "loss": 3.5294, + "step": 25550 + }, + { + "epoch": 2.7553546442794103, + "grad_norm": 0.5704156160354614, + "learning_rate": 0.000435205258054089, + "loss": 3.5352, + "step": 25600 + }, + { + "epoch": 2.7607361963190185, + "grad_norm": 0.5868292450904846, + "learning_rate": 0.0004348820170240276, + "loss": 3.5492, + "step": 25650 + }, + { + "epoch": 2.7661177483586266, + "grad_norm": 0.5794882774353027, + "learning_rate": 0.0004345587759939661, + "loss": 3.5491, + "step": 25700 + }, + { + "epoch": 2.7714993003982347, + "grad_norm": 0.6541237235069275, + "learning_rate": 0.0004342355349639047, + "loss": 3.5236, + "step": 25750 + }, + { + "epoch": 2.776880852437843, + "grad_norm": 0.6571041345596313, + "learning_rate": 0.0004339122939338433, + "loss": 3.5395, + "step": 25800 + }, + { + "epoch": 2.7822624044774513, + "grad_norm": 0.5382871627807617, + "learning_rate": 0.00043358905290378184, + "loss": 3.5449, + "step": 25850 + }, + { + "epoch": 2.7876439565170594, + "grad_norm": 0.5958513021469116, + "learning_rate": 0.0004332658118737205, + "loss": 3.5299, + "step": 25900 + }, + { + "epoch": 2.793025508556668, + "grad_norm": 0.5741668939590454, + "learning_rate": 0.0004329425708436591, + "loss": 3.5432, + "step": 25950 + }, + { + "epoch": 2.798407060596276, + "grad_norm": 0.6117798686027527, + "learning_rate": 0.00043261932981359763, + "loss": 3.542, + "step": 26000 + }, + { + "epoch": 2.798407060596276, + "eval_accuracy": 0.3677167932056307, + "eval_loss": 3.520785093307495, + "eval_runtime": 182.753, + "eval_samples_per_second": 98.554, + "eval_steps_per_second": 6.161, + "step": 26000 + }, + { + "epoch": 2.803788612635884, + "grad_norm": 0.5519882440567017, + "learning_rate": 0.0004322960887835362, + "loss": 3.5522, + "step": 26050 + }, + { + "epoch": 2.8091701646754923, + "grad_norm": 0.5597295761108398, + "learning_rate": 0.00043197284775347476, + "loss": 3.5286, + "step": 26100 + }, + { + "epoch": 2.8145517167151004, + "grad_norm": 0.6364133358001709, + "learning_rate": 0.00043164960672341336, + "loss": 3.5472, + "step": 26150 + }, + { + "epoch": 2.819933268754709, + "grad_norm": 0.6041819453239441, + "learning_rate": 0.000431326365693352, + "loss": 3.5536, + "step": 26200 + }, + { + "epoch": 2.825314820794317, + "grad_norm": 0.5981535911560059, + "learning_rate": 0.00043100312466329055, + "loss": 3.5325, + "step": 26250 + }, + { + "epoch": 2.830696372833925, + "grad_norm": 0.6039505004882812, + "learning_rate": 0.00043067988363322914, + "loss": 3.5477, + "step": 26300 + }, + { + "epoch": 2.8360779248735337, + "grad_norm": 0.5583112835884094, + "learning_rate": 0.00043035664260316774, + "loss": 3.5385, + "step": 26350 + }, + { + "epoch": 2.841459476913142, + "grad_norm": 0.6340571045875549, + "learning_rate": 0.0004300334015731063, + "loss": 3.5228, + "step": 26400 + }, + { + "epoch": 2.84684102895275, + "grad_norm": 0.5811586380004883, + "learning_rate": 0.0004297101605430449, + "loss": 3.5302, + "step": 26450 + }, + { + "epoch": 2.852222580992358, + "grad_norm": 0.5881547331809998, + "learning_rate": 0.0004293869195129835, + "loss": 3.5351, + "step": 26500 + }, + { + "epoch": 2.857604133031966, + "grad_norm": 0.6151056289672852, + "learning_rate": 0.00042906367848292206, + "loss": 3.5292, + "step": 26550 + }, + { + "epoch": 2.8629856850715747, + "grad_norm": 0.606568455696106, + "learning_rate": 0.00042874043745286066, + "loss": 3.5441, + "step": 26600 + }, + { + "epoch": 2.868367237111183, + "grad_norm": 0.535579264163971, + "learning_rate": 0.0004284171964227992, + "loss": 3.5116, + "step": 26650 + }, + { + "epoch": 2.873748789150791, + "grad_norm": 0.5709394216537476, + "learning_rate": 0.0004280939553927378, + "loss": 3.5527, + "step": 26700 + }, + { + "epoch": 2.8791303411903995, + "grad_norm": 0.6123267412185669, + "learning_rate": 0.00042777071436267644, + "loss": 3.5198, + "step": 26750 + }, + { + "epoch": 2.8845118932300076, + "grad_norm": 0.6574706435203552, + "learning_rate": 0.000427447473332615, + "loss": 3.5197, + "step": 26800 + }, + { + "epoch": 2.8898934452696157, + "grad_norm": 0.5446913838386536, + "learning_rate": 0.0004271242323025536, + "loss": 3.5375, + "step": 26850 + }, + { + "epoch": 2.895274997309224, + "grad_norm": 0.5856238007545471, + "learning_rate": 0.00042680099127249217, + "loss": 3.5379, + "step": 26900 + }, + { + "epoch": 2.9006565493488323, + "grad_norm": 0.5693891048431396, + "learning_rate": 0.0004264777502424307, + "loss": 3.5373, + "step": 26950 + }, + { + "epoch": 2.9060381013884404, + "grad_norm": 0.5685888528823853, + "learning_rate": 0.0004261545092123693, + "loss": 3.5247, + "step": 27000 + }, + { + "epoch": 2.9060381013884404, + "eval_accuracy": 0.3690562660066076, + "eval_loss": 3.5081608295440674, + "eval_runtime": 182.7742, + "eval_samples_per_second": 98.542, + "eval_steps_per_second": 6.161, + "step": 27000 + }, + { + "epoch": 2.9114196534280485, + "grad_norm": 0.5613411664962769, + "learning_rate": 0.00042583126818230795, + "loss": 3.5421, + "step": 27050 + }, + { + "epoch": 2.9168012054676566, + "grad_norm": 0.5848773717880249, + "learning_rate": 0.0004255080271522465, + "loss": 3.5342, + "step": 27100 + }, + { + "epoch": 2.922182757507265, + "grad_norm": 0.6449905037879944, + "learning_rate": 0.0004251847861221851, + "loss": 3.5196, + "step": 27150 + }, + { + "epoch": 2.9275643095468733, + "grad_norm": 0.6094990372657776, + "learning_rate": 0.00042486154509212363, + "loss": 3.5496, + "step": 27200 + }, + { + "epoch": 2.9329458615864814, + "grad_norm": 0.5689534544944763, + "learning_rate": 0.0004245447688826635, + "loss": 3.5291, + "step": 27250 + }, + { + "epoch": 2.93832741362609, + "grad_norm": 0.6069585084915161, + "learning_rate": 0.00042422152785260203, + "loss": 3.5095, + "step": 27300 + }, + { + "epoch": 2.943708965665698, + "grad_norm": 0.6799548268318176, + "learning_rate": 0.0004238982868225406, + "loss": 3.5521, + "step": 27350 + }, + { + "epoch": 2.949090517705306, + "grad_norm": 0.5622816681861877, + "learning_rate": 0.0004235750457924793, + "loss": 3.5134, + "step": 27400 + }, + { + "epoch": 2.9544720697449143, + "grad_norm": 0.5812619924545288, + "learning_rate": 0.0004232518047624178, + "loss": 3.534, + "step": 27450 + }, + { + "epoch": 2.9598536217845224, + "grad_norm": 0.5781611800193787, + "learning_rate": 0.0004229285637323564, + "loss": 3.5307, + "step": 27500 + }, + { + "epoch": 2.965235173824131, + "grad_norm": 0.6182583570480347, + "learning_rate": 0.00042260532270229495, + "loss": 3.5355, + "step": 27550 + }, + { + "epoch": 2.970616725863739, + "grad_norm": 0.6099269390106201, + "learning_rate": 0.00042228208167223354, + "loss": 3.5141, + "step": 27600 + }, + { + "epoch": 2.975998277903347, + "grad_norm": 0.6089410185813904, + "learning_rate": 0.00042195884064217214, + "loss": 3.5328, + "step": 27650 + }, + { + "epoch": 2.9813798299429557, + "grad_norm": 0.577688992023468, + "learning_rate": 0.00042163559961211073, + "loss": 3.5638, + "step": 27700 + }, + { + "epoch": 2.986761381982564, + "grad_norm": 0.6033281683921814, + "learning_rate": 0.00042131235858204933, + "loss": 3.524, + "step": 27750 + }, + { + "epoch": 2.992142934022172, + "grad_norm": 0.5998152494430542, + "learning_rate": 0.0004209891175519879, + "loss": 3.5123, + "step": 27800 + }, + { + "epoch": 2.9975244860617805, + "grad_norm": 0.676117479801178, + "learning_rate": 0.00042066587652192646, + "loss": 3.5311, + "step": 27850 + }, + { + "epoch": 3.0029060381013886, + "grad_norm": 0.5953896641731262, + "learning_rate": 0.00042034263549186506, + "loss": 3.494, + "step": 27900 + }, + { + "epoch": 3.0082875901409967, + "grad_norm": 0.5766949653625488, + "learning_rate": 0.0004200193944618036, + "loss": 3.419, + "step": 27950 + }, + { + "epoch": 3.0136691421806048, + "grad_norm": 0.5596526265144348, + "learning_rate": 0.00041969615343174225, + "loss": 3.4332, + "step": 28000 + }, + { + "epoch": 3.0136691421806048, + "eval_accuracy": 0.3699133199856882, + "eval_loss": 3.5029404163360596, + "eval_runtime": 182.639, + "eval_samples_per_second": 98.615, + "eval_steps_per_second": 6.165, + "step": 28000 + }, + { + "epoch": 3.0190506942202133, + "grad_norm": 0.581177830696106, + "learning_rate": 0.00041937291240168084, + "loss": 3.4454, + "step": 28050 + }, + { + "epoch": 3.0244322462598214, + "grad_norm": 0.5963121652603149, + "learning_rate": 0.0004190496713716194, + "loss": 3.4537, + "step": 28100 + }, + { + "epoch": 3.0298137982994295, + "grad_norm": 0.5734453201293945, + "learning_rate": 0.000418726430341558, + "loss": 3.4422, + "step": 28150 + }, + { + "epoch": 3.0351953503390376, + "grad_norm": 0.6005439758300781, + "learning_rate": 0.00041840318931149657, + "loss": 3.4446, + "step": 28200 + }, + { + "epoch": 3.040576902378646, + "grad_norm": 0.6174030303955078, + "learning_rate": 0.00041807994828143517, + "loss": 3.4522, + "step": 28250 + }, + { + "epoch": 3.0459584544182543, + "grad_norm": 0.6408053636550903, + "learning_rate": 0.00041775670725137376, + "loss": 3.4324, + "step": 28300 + }, + { + "epoch": 3.0513400064578624, + "grad_norm": 0.6011624336242676, + "learning_rate": 0.00041743346622131236, + "loss": 3.4556, + "step": 28350 + }, + { + "epoch": 3.0567215584974705, + "grad_norm": 0.6361274719238281, + "learning_rate": 0.0004171102251912509, + "loss": 3.4448, + "step": 28400 + }, + { + "epoch": 3.062103110537079, + "grad_norm": 0.602817952632904, + "learning_rate": 0.0004167869841611895, + "loss": 3.4486, + "step": 28450 + }, + { + "epoch": 3.067484662576687, + "grad_norm": 0.5594189167022705, + "learning_rate": 0.00041646374313112803, + "loss": 3.4733, + "step": 28500 + }, + { + "epoch": 3.0728662146162953, + "grad_norm": 0.7374558448791504, + "learning_rate": 0.0004161405021010667, + "loss": 3.4482, + "step": 28550 + }, + { + "epoch": 3.0782477666559034, + "grad_norm": 0.6085382699966431, + "learning_rate": 0.0004158172610710053, + "loss": 3.4641, + "step": 28600 + }, + { + "epoch": 3.083629318695512, + "grad_norm": 0.5731576681137085, + "learning_rate": 0.0004154940200409438, + "loss": 3.4374, + "step": 28650 + }, + { + "epoch": 3.08901087073512, + "grad_norm": 0.5917696356773376, + "learning_rate": 0.0004151707790108824, + "loss": 3.4511, + "step": 28700 + }, + { + "epoch": 3.094392422774728, + "grad_norm": 0.5805858373641968, + "learning_rate": 0.00041484753798082095, + "loss": 3.4436, + "step": 28750 + }, + { + "epoch": 3.0997739748143363, + "grad_norm": 0.575162947177887, + "learning_rate": 0.00041452429695075955, + "loss": 3.4435, + "step": 28800 + }, + { + "epoch": 3.105155526853945, + "grad_norm": 0.5958883166313171, + "learning_rate": 0.0004142010559206982, + "loss": 3.4675, + "step": 28850 + }, + { + "epoch": 3.110537078893553, + "grad_norm": 0.6094611287117004, + "learning_rate": 0.00041387781489063673, + "loss": 3.4567, + "step": 28900 + }, + { + "epoch": 3.115918630933161, + "grad_norm": 0.5872287750244141, + "learning_rate": 0.00041355457386057533, + "loss": 3.439, + "step": 28950 + }, + { + "epoch": 3.121300182972769, + "grad_norm": 0.6196814775466919, + "learning_rate": 0.0004132313328305139, + "loss": 3.444, + "step": 29000 + }, + { + "epoch": 3.121300182972769, + "eval_accuracy": 0.3706220627729021, + "eval_loss": 3.4992904663085938, + "eval_runtime": 182.5954, + "eval_samples_per_second": 98.639, + "eval_steps_per_second": 6.167, + "step": 29000 + }, + { + "epoch": 3.1266817350123777, + "grad_norm": 0.619157612323761, + "learning_rate": 0.00041290809180045246, + "loss": 3.4508, + "step": 29050 + }, + { + "epoch": 3.132063287051986, + "grad_norm": 0.5508050918579102, + "learning_rate": 0.0004125848507703911, + "loss": 3.4648, + "step": 29100 + }, + { + "epoch": 3.137444839091594, + "grad_norm": 0.5980096459388733, + "learning_rate": 0.0004122616097403297, + "loss": 3.4646, + "step": 29150 + }, + { + "epoch": 3.1428263911312024, + "grad_norm": 0.6267610192298889, + "learning_rate": 0.00041193836871026825, + "loss": 3.453, + "step": 29200 + }, + { + "epoch": 3.1482079431708105, + "grad_norm": 0.6516902446746826, + "learning_rate": 0.00041161512768020684, + "loss": 3.441, + "step": 29250 + }, + { + "epoch": 3.1535894952104186, + "grad_norm": 0.5599831938743591, + "learning_rate": 0.0004112918866501454, + "loss": 3.4536, + "step": 29300 + }, + { + "epoch": 3.1589710472500268, + "grad_norm": 0.5650069713592529, + "learning_rate": 0.00041097511044068524, + "loss": 3.462, + "step": 29350 + }, + { + "epoch": 3.1643525992896353, + "grad_norm": 0.597890317440033, + "learning_rate": 0.0004106518694106238, + "loss": 3.4729, + "step": 29400 + }, + { + "epoch": 3.1697341513292434, + "grad_norm": 0.6053642630577087, + "learning_rate": 0.0004103286283805624, + "loss": 3.447, + "step": 29450 + }, + { + "epoch": 3.1751157033688515, + "grad_norm": 0.6315788626670837, + "learning_rate": 0.00041000538735050103, + "loss": 3.4505, + "step": 29500 + }, + { + "epoch": 3.1804972554084596, + "grad_norm": 0.5334057211875916, + "learning_rate": 0.00040968214632043957, + "loss": 3.4588, + "step": 29550 + }, + { + "epoch": 3.185878807448068, + "grad_norm": 0.5461787581443787, + "learning_rate": 0.00040935890529037816, + "loss": 3.4605, + "step": 29600 + }, + { + "epoch": 3.1912603594876763, + "grad_norm": 0.6308553814888, + "learning_rate": 0.00040903566426031676, + "loss": 3.4494, + "step": 29650 + }, + { + "epoch": 3.1966419115272844, + "grad_norm": 0.5765513777732849, + "learning_rate": 0.0004087124232302553, + "loss": 3.4575, + "step": 29700 + }, + { + "epoch": 3.2020234635668925, + "grad_norm": 0.5815839171409607, + "learning_rate": 0.0004083891822001939, + "loss": 3.4578, + "step": 29750 + }, + { + "epoch": 3.207405015606501, + "grad_norm": 0.5885655879974365, + "learning_rate": 0.00040806594117013254, + "loss": 3.4475, + "step": 29800 + }, + { + "epoch": 3.212786567646109, + "grad_norm": 0.6800254583358765, + "learning_rate": 0.0004077427001400711, + "loss": 3.4626, + "step": 29850 + }, + { + "epoch": 3.2181681196857173, + "grad_norm": 0.5856930613517761, + "learning_rate": 0.0004074194591100097, + "loss": 3.454, + "step": 29900 + }, + { + "epoch": 3.2235496717253254, + "grad_norm": 0.6474602222442627, + "learning_rate": 0.0004070962180799482, + "loss": 3.4509, + "step": 29950 + }, + { + "epoch": 3.228931223764934, + "grad_norm": 0.5948309302330017, + "learning_rate": 0.0004067729770498868, + "loss": 3.4617, + "step": 30000 + }, + { + "epoch": 3.228931223764934, + "eval_accuracy": 0.3714459776211993, + "eval_loss": 3.4921934604644775, + "eval_runtime": 182.5164, + "eval_samples_per_second": 98.682, + "eval_steps_per_second": 6.169, + "step": 30000 + }, + { + "epoch": 3.234312775804542, + "grad_norm": 0.5894820094108582, + "learning_rate": 0.00040644973601982546, + "loss": 3.4608, + "step": 30050 + }, + { + "epoch": 3.23969432784415, + "grad_norm": 0.5914416313171387, + "learning_rate": 0.000406126494989764, + "loss": 3.4591, + "step": 30100 + }, + { + "epoch": 3.2450758798837587, + "grad_norm": 0.5802457928657532, + "learning_rate": 0.0004058032539597026, + "loss": 3.4654, + "step": 30150 + }, + { + "epoch": 3.250457431923367, + "grad_norm": 0.584543764591217, + "learning_rate": 0.00040548001292964114, + "loss": 3.4611, + "step": 30200 + }, + { + "epoch": 3.255838983962975, + "grad_norm": 0.5898339748382568, + "learning_rate": 0.00040515677189957973, + "loss": 3.4736, + "step": 30250 + }, + { + "epoch": 3.261220536002583, + "grad_norm": 0.5681132078170776, + "learning_rate": 0.0004048335308695183, + "loss": 3.4583, + "step": 30300 + }, + { + "epoch": 3.2666020880421915, + "grad_norm": 0.6399067640304565, + "learning_rate": 0.0004045102898394569, + "loss": 3.4683, + "step": 30350 + }, + { + "epoch": 3.2719836400817996, + "grad_norm": 0.6160270571708679, + "learning_rate": 0.0004041870488093955, + "loss": 3.4713, + "step": 30400 + }, + { + "epoch": 3.2773651921214078, + "grad_norm": 0.6204001307487488, + "learning_rate": 0.0004038638077793341, + "loss": 3.4577, + "step": 30450 + }, + { + "epoch": 3.282746744161016, + "grad_norm": 0.5580645203590393, + "learning_rate": 0.00040354056674927265, + "loss": 3.4456, + "step": 30500 + }, + { + "epoch": 3.2881282962006244, + "grad_norm": 0.5606558322906494, + "learning_rate": 0.00040321732571921124, + "loss": 3.449, + "step": 30550 + }, + { + "epoch": 3.2935098482402325, + "grad_norm": 0.6012607216835022, + "learning_rate": 0.0004028940846891498, + "loss": 3.4587, + "step": 30600 + }, + { + "epoch": 3.2988914002798406, + "grad_norm": 0.5980702638626099, + "learning_rate": 0.00040257084365908843, + "loss": 3.4565, + "step": 30650 + }, + { + "epoch": 3.304272952319449, + "grad_norm": 0.5933027267456055, + "learning_rate": 0.00040224760262902703, + "loss": 3.4612, + "step": 30700 + }, + { + "epoch": 3.3096545043590573, + "grad_norm": 0.6065646409988403, + "learning_rate": 0.00040192436159896557, + "loss": 3.4632, + "step": 30750 + }, + { + "epoch": 3.3150360563986654, + "grad_norm": 0.6052440404891968, + "learning_rate": 0.00040160112056890416, + "loss": 3.4554, + "step": 30800 + }, + { + "epoch": 3.3204176084382735, + "grad_norm": 0.5899329781532288, + "learning_rate": 0.00040127787953884276, + "loss": 3.4619, + "step": 30850 + }, + { + "epoch": 3.3257991604778816, + "grad_norm": 0.6436607837677002, + "learning_rate": 0.00040095463850878135, + "loss": 3.4441, + "step": 30900 + }, + { + "epoch": 3.33118071251749, + "grad_norm": 0.5700132250785828, + "learning_rate": 0.00040063139747871995, + "loss": 3.4669, + "step": 30950 + }, + { + "epoch": 3.3365622645570983, + "grad_norm": 0.5744343996047974, + "learning_rate": 0.00040030815644865854, + "loss": 3.4675, + "step": 31000 + }, + { + "epoch": 3.3365622645570983, + "eval_accuracy": 0.37178160639165825, + "eval_loss": 3.485490083694458, + "eval_runtime": 182.4931, + "eval_samples_per_second": 98.694, + "eval_steps_per_second": 6.17, + "step": 31000 + }, + { + "epoch": 3.3419438165967064, + "grad_norm": 0.5764909982681274, + "learning_rate": 0.0003999849154185971, + "loss": 3.4516, + "step": 31050 + }, + { + "epoch": 3.347325368636315, + "grad_norm": 0.6199538111686707, + "learning_rate": 0.0003996616743885357, + "loss": 3.4754, + "step": 31100 + }, + { + "epoch": 3.352706920675923, + "grad_norm": 0.6050508618354797, + "learning_rate": 0.0003993384333584742, + "loss": 3.4551, + "step": 31150 + }, + { + "epoch": 3.358088472715531, + "grad_norm": 0.591788113117218, + "learning_rate": 0.00039901519232841287, + "loss": 3.4839, + "step": 31200 + }, + { + "epoch": 3.3634700247551392, + "grad_norm": 0.6040419340133667, + "learning_rate": 0.00039869195129835146, + "loss": 3.4702, + "step": 31250 + }, + { + "epoch": 3.368851576794748, + "grad_norm": 0.680150032043457, + "learning_rate": 0.00039836871026829, + "loss": 3.4417, + "step": 31300 + }, + { + "epoch": 3.374233128834356, + "grad_norm": 0.5792616605758667, + "learning_rate": 0.0003980454692382286, + "loss": 3.4799, + "step": 31350 + }, + { + "epoch": 3.379614680873964, + "grad_norm": 0.5896565914154053, + "learning_rate": 0.0003977286930287684, + "loss": 3.4703, + "step": 31400 + }, + { + "epoch": 3.384996232913572, + "grad_norm": 0.5844624042510986, + "learning_rate": 0.000397405451998707, + "loss": 3.4721, + "step": 31450 + }, + { + "epoch": 3.3903777849531807, + "grad_norm": 0.5530960559844971, + "learning_rate": 0.00039708221096864554, + "loss": 3.4579, + "step": 31500 + }, + { + "epoch": 3.3957593369927888, + "grad_norm": 0.5991605520248413, + "learning_rate": 0.00039675896993858413, + "loss": 3.477, + "step": 31550 + }, + { + "epoch": 3.401140889032397, + "grad_norm": 0.5415410399436951, + "learning_rate": 0.0003964357289085228, + "loss": 3.4624, + "step": 31600 + }, + { + "epoch": 3.4065224410720054, + "grad_norm": 0.6166115403175354, + "learning_rate": 0.0003961124878784613, + "loss": 3.4492, + "step": 31650 + }, + { + "epoch": 3.4119039931116135, + "grad_norm": 0.5882399082183838, + "learning_rate": 0.0003957892468483999, + "loss": 3.4693, + "step": 31700 + }, + { + "epoch": 3.4172855451512216, + "grad_norm": 0.6354446411132812, + "learning_rate": 0.0003954660058183385, + "loss": 3.4692, + "step": 31750 + }, + { + "epoch": 3.4226670971908297, + "grad_norm": 0.5936651229858398, + "learning_rate": 0.00039514276478827705, + "loss": 3.4842, + "step": 31800 + }, + { + "epoch": 3.428048649230438, + "grad_norm": 0.7023192644119263, + "learning_rate": 0.0003948195237582157, + "loss": 3.4645, + "step": 31850 + }, + { + "epoch": 3.4334302012700464, + "grad_norm": 0.6037172675132751, + "learning_rate": 0.0003944962827281543, + "loss": 3.4535, + "step": 31900 + }, + { + "epoch": 3.4388117533096545, + "grad_norm": 0.6036386489868164, + "learning_rate": 0.00039417304169809284, + "loss": 3.48, + "step": 31950 + }, + { + "epoch": 3.4441933053492626, + "grad_norm": 0.6221677660942078, + "learning_rate": 0.00039384980066803143, + "loss": 3.4746, + "step": 32000 + }, + { + "epoch": 3.4441933053492626, + "eval_accuracy": 0.37269016183956294, + "eval_loss": 3.4805102348327637, + "eval_runtime": 182.9051, + "eval_samples_per_second": 98.472, + "eval_steps_per_second": 6.156, + "step": 32000 + }, + { + "epoch": 3.449574857388871, + "grad_norm": 0.5549188852310181, + "learning_rate": 0.00039352655963796997, + "loss": 3.4655, + "step": 32050 + }, + { + "epoch": 3.4549564094284793, + "grad_norm": 0.6282884478569031, + "learning_rate": 0.00039320331860790857, + "loss": 3.473, + "step": 32100 + }, + { + "epoch": 3.4603379614680874, + "grad_norm": 0.5992251634597778, + "learning_rate": 0.0003928800775778472, + "loss": 3.4684, + "step": 32150 + }, + { + "epoch": 3.4657195135076955, + "grad_norm": 0.6112256050109863, + "learning_rate": 0.00039255683654778576, + "loss": 3.4635, + "step": 32200 + }, + { + "epoch": 3.471101065547304, + "grad_norm": 0.6498141288757324, + "learning_rate": 0.00039223359551772435, + "loss": 3.4595, + "step": 32250 + }, + { + "epoch": 3.476482617586912, + "grad_norm": 0.8926645517349243, + "learning_rate": 0.00039191035448766294, + "loss": 3.4693, + "step": 32300 + }, + { + "epoch": 3.4818641696265202, + "grad_norm": 0.5939327478408813, + "learning_rate": 0.0003915871134576015, + "loss": 3.4674, + "step": 32350 + }, + { + "epoch": 3.4872457216661283, + "grad_norm": 0.5974373817443848, + "learning_rate": 0.0003912638724275401, + "loss": 3.4627, + "step": 32400 + }, + { + "epoch": 3.492627273705737, + "grad_norm": 0.6161805987358093, + "learning_rate": 0.00039094063139747873, + "loss": 3.4778, + "step": 32450 + }, + { + "epoch": 3.498008825745345, + "grad_norm": 0.626426637172699, + "learning_rate": 0.00039061739036741727, + "loss": 3.4582, + "step": 32500 + }, + { + "epoch": 3.503390377784953, + "grad_norm": 0.6194849014282227, + "learning_rate": 0.00039029414933735586, + "loss": 3.4596, + "step": 32550 + }, + { + "epoch": 3.5087719298245617, + "grad_norm": 0.5703500509262085, + "learning_rate": 0.0003899709083072944, + "loss": 3.4511, + "step": 32600 + }, + { + "epoch": 3.5141534818641698, + "grad_norm": 0.5987148880958557, + "learning_rate": 0.000389647667277233, + "loss": 3.4513, + "step": 32650 + }, + { + "epoch": 3.519535033903778, + "grad_norm": 0.6004769802093506, + "learning_rate": 0.00038932442624717165, + "loss": 3.4605, + "step": 32700 + }, + { + "epoch": 3.524916585943386, + "grad_norm": 0.6045653820037842, + "learning_rate": 0.0003890011852171102, + "loss": 3.4566, + "step": 32750 + }, + { + "epoch": 3.530298137982994, + "grad_norm": 0.6004464030265808, + "learning_rate": 0.0003886779441870488, + "loss": 3.4689, + "step": 32800 + }, + { + "epoch": 3.5356796900226026, + "grad_norm": 0.6028228998184204, + "learning_rate": 0.0003883547031569874, + "loss": 3.4587, + "step": 32850 + }, + { + "epoch": 3.5410612420622107, + "grad_norm": 0.6320363879203796, + "learning_rate": 0.0003880314621269259, + "loss": 3.4762, + "step": 32900 + }, + { + "epoch": 3.546442794101819, + "grad_norm": 0.6261252164840698, + "learning_rate": 0.0003877082210968645, + "loss": 3.4774, + "step": 32950 + }, + { + "epoch": 3.5518243461414274, + "grad_norm": 0.5780692100524902, + "learning_rate": 0.00038738498006680316, + "loss": 3.468, + "step": 33000 + }, + { + "epoch": 3.5518243461414274, + "eval_accuracy": 0.3728782399883698, + "eval_loss": 3.4731545448303223, + "eval_runtime": 182.9321, + "eval_samples_per_second": 98.457, + "eval_steps_per_second": 6.155, + "step": 33000 + }, + { + "epoch": 3.5572058981810355, + "grad_norm": 0.5741873979568481, + "learning_rate": 0.0003870617390367417, + "loss": 3.4358, + "step": 33050 + }, + { + "epoch": 3.5625874502206436, + "grad_norm": 0.587976336479187, + "learning_rate": 0.0003867384980066803, + "loss": 3.4524, + "step": 33100 + }, + { + "epoch": 3.5679690022602517, + "grad_norm": 0.5886296033859253, + "learning_rate": 0.00038641525697661884, + "loss": 3.4588, + "step": 33150 + }, + { + "epoch": 3.57335055429986, + "grad_norm": 0.6252454519271851, + "learning_rate": 0.00038609201594655743, + "loss": 3.4569, + "step": 33200 + }, + { + "epoch": 3.5787321063394684, + "grad_norm": 0.6233946681022644, + "learning_rate": 0.000385768774916496, + "loss": 3.4602, + "step": 33250 + }, + { + "epoch": 3.5841136583790765, + "grad_norm": 0.6176490783691406, + "learning_rate": 0.0003854455338864346, + "loss": 3.4731, + "step": 33300 + }, + { + "epoch": 3.5894952104186846, + "grad_norm": 0.585235059261322, + "learning_rate": 0.0003851222928563732, + "loss": 3.4542, + "step": 33350 + }, + { + "epoch": 3.594876762458293, + "grad_norm": 0.6597333550453186, + "learning_rate": 0.0003847990518263118, + "loss": 3.4511, + "step": 33400 + }, + { + "epoch": 3.6002583144979012, + "grad_norm": 0.5512192249298096, + "learning_rate": 0.0003844822756168516, + "loss": 3.471, + "step": 33450 + }, + { + "epoch": 3.6056398665375093, + "grad_norm": 0.5989030003547668, + "learning_rate": 0.00038415903458679016, + "loss": 3.4706, + "step": 33500 + }, + { + "epoch": 3.611021418577118, + "grad_norm": 0.643202543258667, + "learning_rate": 0.00038383579355672875, + "loss": 3.4446, + "step": 33550 + }, + { + "epoch": 3.616402970616726, + "grad_norm": 0.6083823442459106, + "learning_rate": 0.00038351255252666735, + "loss": 3.4702, + "step": 33600 + }, + { + "epoch": 3.621784522656334, + "grad_norm": 0.6128966212272644, + "learning_rate": 0.00038318931149660594, + "loss": 3.4584, + "step": 33650 + }, + { + "epoch": 3.627166074695942, + "grad_norm": 0.6018299460411072, + "learning_rate": 0.00038286607046654454, + "loss": 3.4597, + "step": 33700 + }, + { + "epoch": 3.6325476267355503, + "grad_norm": 0.6082707643508911, + "learning_rate": 0.00038254282943648313, + "loss": 3.4459, + "step": 33750 + }, + { + "epoch": 3.637929178775159, + "grad_norm": 0.5940393805503845, + "learning_rate": 0.00038221958840642167, + "loss": 3.4514, + "step": 33800 + }, + { + "epoch": 3.643310730814767, + "grad_norm": 0.6136729121208191, + "learning_rate": 0.00038189634737636027, + "loss": 3.4703, + "step": 33850 + }, + { + "epoch": 3.648692282854375, + "grad_norm": 0.571419358253479, + "learning_rate": 0.0003815731063462988, + "loss": 3.4434, + "step": 33900 + }, + { + "epoch": 3.6540738348939836, + "grad_norm": 0.5937557220458984, + "learning_rate": 0.00038124986531623745, + "loss": 3.4589, + "step": 33950 + }, + { + "epoch": 3.6594553869335917, + "grad_norm": 0.6121259331703186, + "learning_rate": 0.00038092662428617605, + "loss": 3.4691, + "step": 34000 + }, + { + "epoch": 3.6594553869335917, + "eval_accuracy": 0.3735422177858041, + "eval_loss": 3.4664924144744873, + "eval_runtime": 183.1266, + "eval_samples_per_second": 98.353, + "eval_steps_per_second": 6.149, + "step": 34000 + }, + { + "epoch": 3.6648369389732, + "grad_norm": 0.6286929249763489, + "learning_rate": 0.0003806033832561146, + "loss": 3.4675, + "step": 34050 + }, + { + "epoch": 3.670218491012808, + "grad_norm": 0.6216891407966614, + "learning_rate": 0.0003802801422260532, + "loss": 3.4353, + "step": 34100 + }, + { + "epoch": 3.675600043052416, + "grad_norm": 0.5747387409210205, + "learning_rate": 0.0003799569011959918, + "loss": 3.4438, + "step": 34150 + }, + { + "epoch": 3.6809815950920246, + "grad_norm": 0.6043647527694702, + "learning_rate": 0.0003796336601659303, + "loss": 3.4652, + "step": 34200 + }, + { + "epoch": 3.6863631471316327, + "grad_norm": 0.6015361547470093, + "learning_rate": 0.00037931041913586897, + "loss": 3.4511, + "step": 34250 + }, + { + "epoch": 3.691744699171241, + "grad_norm": 0.5804011225700378, + "learning_rate": 0.00037898717810580756, + "loss": 3.4713, + "step": 34300 + }, + { + "epoch": 3.6971262512108494, + "grad_norm": 0.5761457681655884, + "learning_rate": 0.0003786639370757461, + "loss": 3.4658, + "step": 34350 + }, + { + "epoch": 3.7025078032504575, + "grad_norm": 0.5895703434944153, + "learning_rate": 0.0003783406960456847, + "loss": 3.4654, + "step": 34400 + }, + { + "epoch": 3.7078893552900656, + "grad_norm": 0.6082561016082764, + "learning_rate": 0.00037801745501562324, + "loss": 3.4621, + "step": 34450 + }, + { + "epoch": 3.713270907329674, + "grad_norm": 0.6229493021965027, + "learning_rate": 0.0003776942139855619, + "loss": 3.4451, + "step": 34500 + }, + { + "epoch": 3.7186524593692822, + "grad_norm": 0.5866677165031433, + "learning_rate": 0.0003773709729555005, + "loss": 3.4595, + "step": 34550 + }, + { + "epoch": 3.7240340114088903, + "grad_norm": 0.6240717172622681, + "learning_rate": 0.000377047731925439, + "loss": 3.4571, + "step": 34600 + }, + { + "epoch": 3.7294155634484984, + "grad_norm": 0.6172893047332764, + "learning_rate": 0.0003767244908953776, + "loss": 3.4676, + "step": 34650 + }, + { + "epoch": 3.7347971154881066, + "grad_norm": 0.6289220452308655, + "learning_rate": 0.0003764012498653162, + "loss": 3.4464, + "step": 34700 + }, + { + "epoch": 3.740178667527715, + "grad_norm": 0.6364248991012573, + "learning_rate": 0.00037607800883525475, + "loss": 3.465, + "step": 34750 + }, + { + "epoch": 3.745560219567323, + "grad_norm": 0.6611288189888, + "learning_rate": 0.0003757547678051934, + "loss": 3.4465, + "step": 34800 + }, + { + "epoch": 3.7509417716069313, + "grad_norm": 0.5711609125137329, + "learning_rate": 0.000375431526775132, + "loss": 3.4531, + "step": 34850 + }, + { + "epoch": 3.75632332364654, + "grad_norm": 0.6151444315910339, + "learning_rate": 0.00037510828574507054, + "loss": 3.4685, + "step": 34900 + }, + { + "epoch": 3.761704875686148, + "grad_norm": 0.6230692267417908, + "learning_rate": 0.00037478504471500913, + "loss": 3.446, + "step": 34950 + }, + { + "epoch": 3.767086427725756, + "grad_norm": 0.6537262797355652, + "learning_rate": 0.00037446180368494767, + "loss": 3.4681, + "step": 35000 + }, + { + "epoch": 3.767086427725756, + "eval_accuracy": 0.37434853086627534, + "eval_loss": 3.460052728652954, + "eval_runtime": 182.9222, + "eval_samples_per_second": 98.463, + "eval_steps_per_second": 6.156, + "step": 35000 + }, + { + "epoch": 3.772467979765364, + "grad_norm": 0.6269912123680115, + "learning_rate": 0.00037413856265488627, + "loss": 3.4745, + "step": 35050 + }, + { + "epoch": 3.7778495318049723, + "grad_norm": 0.6322194337844849, + "learning_rate": 0.0003738153216248249, + "loss": 3.4693, + "step": 35100 + }, + { + "epoch": 3.783231083844581, + "grad_norm": 0.632526695728302, + "learning_rate": 0.00037349208059476346, + "loss": 3.4557, + "step": 35150 + }, + { + "epoch": 3.788612635884189, + "grad_norm": 0.5771685242652893, + "learning_rate": 0.00037316883956470205, + "loss": 3.4585, + "step": 35200 + }, + { + "epoch": 3.793994187923797, + "grad_norm": 0.5918084979057312, + "learning_rate": 0.00037284559853464064, + "loss": 3.4503, + "step": 35250 + }, + { + "epoch": 3.7993757399634056, + "grad_norm": 0.6038704514503479, + "learning_rate": 0.0003725223575045792, + "loss": 3.4631, + "step": 35300 + }, + { + "epoch": 3.8047572920030137, + "grad_norm": 0.5990985631942749, + "learning_rate": 0.0003721991164745178, + "loss": 3.4527, + "step": 35350 + }, + { + "epoch": 3.810138844042622, + "grad_norm": 0.6696856021881104, + "learning_rate": 0.00037187587544445643, + "loss": 3.4572, + "step": 35400 + }, + { + "epoch": 3.8155203960822304, + "grad_norm": 0.5810059309005737, + "learning_rate": 0.00037155909923499624, + "loss": 3.4439, + "step": 35450 + }, + { + "epoch": 3.8209019481218385, + "grad_norm": 0.6126786470413208, + "learning_rate": 0.0003712358582049348, + "loss": 3.4521, + "step": 35500 + }, + { + "epoch": 3.8262835001614466, + "grad_norm": 0.5759022235870361, + "learning_rate": 0.00037091261717487337, + "loss": 3.439, + "step": 35550 + }, + { + "epoch": 3.8316650522010547, + "grad_norm": 0.6269648671150208, + "learning_rate": 0.00037058937614481197, + "loss": 3.4444, + "step": 35600 + }, + { + "epoch": 3.837046604240663, + "grad_norm": 0.6104834079742432, + "learning_rate": 0.0003702661351147505, + "loss": 3.4494, + "step": 35650 + }, + { + "epoch": 3.8424281562802713, + "grad_norm": 0.623383641242981, + "learning_rate": 0.0003699428940846891, + "loss": 3.4771, + "step": 35700 + }, + { + "epoch": 3.8478097083198795, + "grad_norm": 0.6111641526222229, + "learning_rate": 0.00036961965305462775, + "loss": 3.4553, + "step": 35750 + }, + { + "epoch": 3.8531912603594876, + "grad_norm": 0.6193336844444275, + "learning_rate": 0.0003692964120245663, + "loss": 3.4714, + "step": 35800 + }, + { + "epoch": 3.858572812399096, + "grad_norm": 0.6333847641944885, + "learning_rate": 0.0003689731709945049, + "loss": 3.4516, + "step": 35850 + }, + { + "epoch": 3.863954364438704, + "grad_norm": 0.6365866661071777, + "learning_rate": 0.0003686499299644434, + "loss": 3.4439, + "step": 35900 + }, + { + "epoch": 3.8693359164783123, + "grad_norm": 0.6254480481147766, + "learning_rate": 0.000368326688934382, + "loss": 3.4289, + "step": 35950 + }, + { + "epoch": 3.8747174685179204, + "grad_norm": 0.6164137125015259, + "learning_rate": 0.0003680034479043206, + "loss": 3.4485, + "step": 36000 + }, + { + "epoch": 3.8747174685179204, + "eval_accuracy": 0.37518298504469055, + "eval_loss": 3.4550580978393555, + "eval_runtime": 182.8899, + "eval_samples_per_second": 98.48, + "eval_steps_per_second": 6.157, + "step": 36000 + }, + { + "epoch": 3.8800990205575285, + "grad_norm": 0.599606990814209, + "learning_rate": 0.0003676802068742592, + "loss": 3.4553, + "step": 36050 + }, + { + "epoch": 3.885480572597137, + "grad_norm": 0.5952869653701782, + "learning_rate": 0.0003673569658441978, + "loss": 3.4557, + "step": 36100 + }, + { + "epoch": 3.890862124636745, + "grad_norm": 0.6147683262825012, + "learning_rate": 0.0003670337248141364, + "loss": 3.4397, + "step": 36150 + }, + { + "epoch": 3.8962436766763533, + "grad_norm": 0.6182876229286194, + "learning_rate": 0.00036671048378407494, + "loss": 3.4621, + "step": 36200 + }, + { + "epoch": 3.901625228715962, + "grad_norm": 0.6036136746406555, + "learning_rate": 0.00036638724275401353, + "loss": 3.459, + "step": 36250 + }, + { + "epoch": 3.90700678075557, + "grad_norm": 0.6490514278411865, + "learning_rate": 0.0003660640017239522, + "loss": 3.4717, + "step": 36300 + }, + { + "epoch": 3.912388332795178, + "grad_norm": 0.5808064341545105, + "learning_rate": 0.0003657407606938907, + "loss": 3.4619, + "step": 36350 + }, + { + "epoch": 3.9177698848347866, + "grad_norm": 0.6013696193695068, + "learning_rate": 0.0003654175196638293, + "loss": 3.4525, + "step": 36400 + }, + { + "epoch": 3.9231514368743947, + "grad_norm": 0.609664797782898, + "learning_rate": 0.00036509427863376786, + "loss": 3.4375, + "step": 36450 + }, + { + "epoch": 3.928532988914003, + "grad_norm": 0.6366161704063416, + "learning_rate": 0.00036477103760370645, + "loss": 3.4491, + "step": 36500 + }, + { + "epoch": 3.933914540953611, + "grad_norm": 0.5997065305709839, + "learning_rate": 0.00036444779657364505, + "loss": 3.4635, + "step": 36550 + }, + { + "epoch": 3.939296092993219, + "grad_norm": 0.6210324764251709, + "learning_rate": 0.00036412455554358364, + "loss": 3.4626, + "step": 36600 + }, + { + "epoch": 3.9446776450328276, + "grad_norm": 0.6512171030044556, + "learning_rate": 0.00036380131451352224, + "loss": 3.4469, + "step": 36650 + }, + { + "epoch": 3.9500591970724357, + "grad_norm": 0.6340328454971313, + "learning_rate": 0.00036347807348346083, + "loss": 3.4664, + "step": 36700 + }, + { + "epoch": 3.955440749112044, + "grad_norm": 0.5851263403892517, + "learning_rate": 0.00036315483245339937, + "loss": 3.4502, + "step": 36750 + }, + { + "epoch": 3.9608223011516523, + "grad_norm": 0.6642547845840454, + "learning_rate": 0.00036283159142333797, + "loss": 3.4394, + "step": 36800 + }, + { + "epoch": 3.9662038531912605, + "grad_norm": 0.6904765963554382, + "learning_rate": 0.0003625083503932765, + "loss": 3.4662, + "step": 36850 + }, + { + "epoch": 3.9715854052308686, + "grad_norm": 0.6355055570602417, + "learning_rate": 0.00036218510936321516, + "loss": 3.4568, + "step": 36900 + }, + { + "epoch": 3.9769669572704767, + "grad_norm": 0.585128664970398, + "learning_rate": 0.00036186186833315375, + "loss": 3.4772, + "step": 36950 + }, + { + "epoch": 3.9823485093100848, + "grad_norm": 0.5850505232810974, + "learning_rate": 0.0003615386273030923, + "loss": 3.4428, + "step": 37000 + }, + { + "epoch": 3.9823485093100848, + "eval_accuracy": 0.37551470231118816, + "eval_loss": 3.449396848678589, + "eval_runtime": 182.7917, + "eval_samples_per_second": 98.533, + "eval_steps_per_second": 6.16, + "step": 37000 + }, + { + "epoch": 3.9877300613496933, + "grad_norm": 0.6555930376052856, + "learning_rate": 0.0003612153862730309, + "loss": 3.4316, + "step": 37050 + }, + { + "epoch": 3.9931116133893014, + "grad_norm": 0.6331802010536194, + "learning_rate": 0.0003608921452429695, + "loss": 3.4507, + "step": 37100 + }, + { + "epoch": 3.9984931654289095, + "grad_norm": 0.5921234488487244, + "learning_rate": 0.000360568904212908, + "loss": 3.4469, + "step": 37150 + }, + { + "epoch": 4.003874717468518, + "grad_norm": 0.627491295337677, + "learning_rate": 0.00036024566318284667, + "loss": 3.384, + "step": 37200 + }, + { + "epoch": 4.009256269508126, + "grad_norm": 0.6413177847862244, + "learning_rate": 0.00035992242215278526, + "loss": 3.367, + "step": 37250 + }, + { + "epoch": 4.014637821547734, + "grad_norm": 0.6309844255447388, + "learning_rate": 0.0003595991811227238, + "loss": 3.363, + "step": 37300 + }, + { + "epoch": 4.020019373587343, + "grad_norm": 0.6667311191558838, + "learning_rate": 0.0003592759400926624, + "loss": 3.3572, + "step": 37350 + }, + { + "epoch": 4.0254009256269505, + "grad_norm": 0.6064625382423401, + "learning_rate": 0.00035895269906260094, + "loss": 3.3597, + "step": 37400 + }, + { + "epoch": 4.030782477666559, + "grad_norm": 0.6433827877044678, + "learning_rate": 0.0003586359228531408, + "loss": 3.3492, + "step": 37450 + }, + { + "epoch": 4.036164029706168, + "grad_norm": 0.66212397813797, + "learning_rate": 0.00035831268182307934, + "loss": 3.3711, + "step": 37500 + }, + { + "epoch": 4.041545581745775, + "grad_norm": 0.6347793936729431, + "learning_rate": 0.000357989440793018, + "loss": 3.3516, + "step": 37550 + }, + { + "epoch": 4.046927133785384, + "grad_norm": 0.6682549715042114, + "learning_rate": 0.0003576661997629566, + "loss": 3.3482, + "step": 37600 + }, + { + "epoch": 4.0523086858249915, + "grad_norm": 1.5289701223373413, + "learning_rate": 0.0003573429587328951, + "loss": 3.3792, + "step": 37650 + }, + { + "epoch": 4.0576902378646, + "grad_norm": 0.6569393873214722, + "learning_rate": 0.0003570197177028337, + "loss": 3.3742, + "step": 37700 + }, + { + "epoch": 4.063071789904209, + "grad_norm": 0.630212128162384, + "learning_rate": 0.00035669647667277226, + "loss": 3.368, + "step": 37750 + }, + { + "epoch": 4.068453341943816, + "grad_norm": 0.6595259308815002, + "learning_rate": 0.00035637323564271085, + "loss": 3.3954, + "step": 37800 + }, + { + "epoch": 4.073834893983425, + "grad_norm": 0.6565824151039124, + "learning_rate": 0.0003560499946126495, + "loss": 3.3667, + "step": 37850 + }, + { + "epoch": 4.079216446023033, + "grad_norm": 0.6169015765190125, + "learning_rate": 0.00035572675358258804, + "loss": 3.38, + "step": 37900 + }, + { + "epoch": 4.084597998062641, + "grad_norm": 0.6400800347328186, + "learning_rate": 0.00035540351255252664, + "loss": 3.3704, + "step": 37950 + }, + { + "epoch": 4.08997955010225, + "grad_norm": 0.6978241801261902, + "learning_rate": 0.00035508027152246523, + "loss": 3.3682, + "step": 38000 + }, + { + "epoch": 4.08997955010225, + "eval_accuracy": 0.3757263581366508, + "eval_loss": 3.452324151992798, + "eval_runtime": 182.3279, + "eval_samples_per_second": 98.784, + "eval_steps_per_second": 6.176, + "step": 38000 + }, + { + "epoch": 4.095361102141858, + "grad_norm": 0.6283411383628845, + "learning_rate": 0.0003547570304924038, + "loss": 3.3862, + "step": 38050 + }, + { + "epoch": 4.100742654181466, + "grad_norm": 0.6307471394538879, + "learning_rate": 0.0003544337894623424, + "loss": 3.3747, + "step": 38100 + }, + { + "epoch": 4.106124206221074, + "grad_norm": 0.597294270992279, + "learning_rate": 0.000354110548432281, + "loss": 3.3646, + "step": 38150 + }, + { + "epoch": 4.111505758260682, + "grad_norm": 0.5953186750411987, + "learning_rate": 0.00035378730740221956, + "loss": 3.3824, + "step": 38200 + }, + { + "epoch": 4.1168873103002905, + "grad_norm": 0.624042272567749, + "learning_rate": 0.00035346406637215815, + "loss": 3.3978, + "step": 38250 + }, + { + "epoch": 4.122268862339899, + "grad_norm": 0.6373803615570068, + "learning_rate": 0.0003531408253420967, + "loss": 3.3895, + "step": 38300 + }, + { + "epoch": 4.127650414379507, + "grad_norm": 0.6241115927696228, + "learning_rate": 0.0003528175843120353, + "loss": 3.382, + "step": 38350 + }, + { + "epoch": 4.133031966419115, + "grad_norm": 0.6354626417160034, + "learning_rate": 0.00035249434328197394, + "loss": 3.3788, + "step": 38400 + }, + { + "epoch": 4.138413518458724, + "grad_norm": 0.6405816674232483, + "learning_rate": 0.0003521711022519125, + "loss": 3.4024, + "step": 38450 + }, + { + "epoch": 4.1437950704983315, + "grad_norm": 0.6165269613265991, + "learning_rate": 0.00035184786122185107, + "loss": 3.3723, + "step": 38500 + }, + { + "epoch": 4.14917662253794, + "grad_norm": 0.6947121024131775, + "learning_rate": 0.00035152462019178967, + "loss": 3.3761, + "step": 38550 + }, + { + "epoch": 4.154558174577549, + "grad_norm": 0.6098374724388123, + "learning_rate": 0.0003512013791617282, + "loss": 3.37, + "step": 38600 + }, + { + "epoch": 4.159939726617156, + "grad_norm": 0.6297115087509155, + "learning_rate": 0.0003508781381316668, + "loss": 3.3762, + "step": 38650 + }, + { + "epoch": 4.165321278656765, + "grad_norm": 0.6331884860992432, + "learning_rate": 0.00035055489710160545, + "loss": 3.371, + "step": 38700 + }, + { + "epoch": 4.1707028306963725, + "grad_norm": 0.693463146686554, + "learning_rate": 0.000350231656071544, + "loss": 3.3778, + "step": 38750 + }, + { + "epoch": 4.176084382735981, + "grad_norm": 0.6266399025917053, + "learning_rate": 0.0003499084150414826, + "loss": 3.3779, + "step": 38800 + }, + { + "epoch": 4.18146593477559, + "grad_norm": 0.6141369342803955, + "learning_rate": 0.0003495851740114211, + "loss": 3.3825, + "step": 38850 + }, + { + "epoch": 4.186847486815197, + "grad_norm": 0.632418155670166, + "learning_rate": 0.0003492619329813597, + "loss": 3.3927, + "step": 38900 + }, + { + "epoch": 4.192229038854806, + "grad_norm": 0.6271191239356995, + "learning_rate": 0.0003489386919512983, + "loss": 3.3714, + "step": 38950 + }, + { + "epoch": 4.197610590894414, + "grad_norm": 0.7061901688575745, + "learning_rate": 0.0003486154509212369, + "loss": 3.3809, + "step": 39000 + }, + { + "epoch": 4.197610590894414, + "eval_accuracy": 0.37637001784406376, + "eval_loss": 3.448011875152588, + "eval_runtime": 182.5317, + "eval_samples_per_second": 98.673, + "eval_steps_per_second": 6.169, + "step": 39000 + }, + { + "epoch": 4.202992142934022, + "grad_norm": 0.6355361342430115, + "learning_rate": 0.0003482922098911755, + "loss": 3.3757, + "step": 39050 + }, + { + "epoch": 4.208373694973631, + "grad_norm": 0.6252569556236267, + "learning_rate": 0.0003479689688611141, + "loss": 3.3928, + "step": 39100 + }, + { + "epoch": 4.213755247013238, + "grad_norm": 0.6373631358146667, + "learning_rate": 0.00034764572783105264, + "loss": 3.3829, + "step": 39150 + }, + { + "epoch": 4.219136799052847, + "grad_norm": 0.6141077280044556, + "learning_rate": 0.00034732248680099123, + "loss": 3.3808, + "step": 39200 + }, + { + "epoch": 4.224518351092455, + "grad_norm": 0.6714136004447937, + "learning_rate": 0.00034700571059153104, + "loss": 3.3783, + "step": 39250 + }, + { + "epoch": 4.229899903132063, + "grad_norm": 0.6982507705688477, + "learning_rate": 0.00034668246956146963, + "loss": 3.3836, + "step": 39300 + }, + { + "epoch": 4.2352814551716715, + "grad_norm": 0.6456509828567505, + "learning_rate": 0.00034635922853140823, + "loss": 3.382, + "step": 39350 + }, + { + "epoch": 4.24066300721128, + "grad_norm": 0.6480652689933777, + "learning_rate": 0.0003460359875013468, + "loss": 3.3935, + "step": 39400 + }, + { + "epoch": 4.246044559250888, + "grad_norm": 0.6829937696456909, + "learning_rate": 0.0003457127464712854, + "loss": 3.3913, + "step": 39450 + }, + { + "epoch": 4.251426111290496, + "grad_norm": 0.6466607451438904, + "learning_rate": 0.00034538950544122396, + "loss": 3.3826, + "step": 39500 + }, + { + "epoch": 4.256807663330104, + "grad_norm": 0.665008544921875, + "learning_rate": 0.00034506626441116255, + "loss": 3.3962, + "step": 39550 + }, + { + "epoch": 4.2621892153697125, + "grad_norm": 0.633686363697052, + "learning_rate": 0.0003447430233811011, + "loss": 3.3794, + "step": 39600 + }, + { + "epoch": 4.267570767409321, + "grad_norm": 0.706251859664917, + "learning_rate": 0.00034441978235103974, + "loss": 3.3995, + "step": 39650 + }, + { + "epoch": 4.272952319448929, + "grad_norm": 0.6631447076797485, + "learning_rate": 0.00034409654132097834, + "loss": 3.3963, + "step": 39700 + }, + { + "epoch": 4.278333871488537, + "grad_norm": 0.6600978374481201, + "learning_rate": 0.0003437733002909169, + "loss": 3.3984, + "step": 39750 + }, + { + "epoch": 4.283715423528146, + "grad_norm": 0.624860942363739, + "learning_rate": 0.00034345005926085547, + "loss": 3.3892, + "step": 39800 + }, + { + "epoch": 4.2890969755677535, + "grad_norm": 0.6655292510986328, + "learning_rate": 0.00034312681823079407, + "loss": 3.3873, + "step": 39850 + }, + { + "epoch": 4.294478527607362, + "grad_norm": 0.6067180037498474, + "learning_rate": 0.00034280357720073266, + "loss": 3.4034, + "step": 39900 + }, + { + "epoch": 4.299860079646971, + "grad_norm": 0.6818814873695374, + "learning_rate": 0.00034248033617067126, + "loss": 3.3754, + "step": 39950 + }, + { + "epoch": 4.305241631686578, + "grad_norm": 0.6350970268249512, + "learning_rate": 0.00034215709514060985, + "loss": 3.3874, + "step": 40000 + }, + { + "epoch": 4.305241631686578, + "eval_accuracy": 0.3769179543573122, + "eval_loss": 3.442443609237671, + "eval_runtime": 182.8947, + "eval_samples_per_second": 98.477, + "eval_steps_per_second": 6.157, + "step": 40000 + }, + { + "epoch": 4.310623183726187, + "grad_norm": 0.6428865790367126, + "learning_rate": 0.0003418338541105484, + "loss": 3.3958, + "step": 40050 + }, + { + "epoch": 4.3160047357657945, + "grad_norm": 0.6549261212348938, + "learning_rate": 0.000341510613080487, + "loss": 3.3934, + "step": 40100 + }, + { + "epoch": 4.321386287805403, + "grad_norm": 0.6633482575416565, + "learning_rate": 0.0003411873720504255, + "loss": 3.4035, + "step": 40150 + }, + { + "epoch": 4.326767839845012, + "grad_norm": 0.6681509613990784, + "learning_rate": 0.0003408641310203642, + "loss": 3.4035, + "step": 40200 + }, + { + "epoch": 4.332149391884619, + "grad_norm": 0.6671531200408936, + "learning_rate": 0.00034054088999030277, + "loss": 3.3972, + "step": 40250 + }, + { + "epoch": 4.337530943924228, + "grad_norm": 0.6722056865692139, + "learning_rate": 0.0003402176489602413, + "loss": 3.3925, + "step": 40300 + }, + { + "epoch": 4.342912495963836, + "grad_norm": 0.634778618812561, + "learning_rate": 0.0003398944079301799, + "loss": 3.3951, + "step": 40350 + }, + { + "epoch": 4.348294048003444, + "grad_norm": 0.6151086091995239, + "learning_rate": 0.0003395711669001185, + "loss": 3.4087, + "step": 40400 + }, + { + "epoch": 4.3536756000430525, + "grad_norm": 0.6367116570472717, + "learning_rate": 0.00033924792587005704, + "loss": 3.3993, + "step": 40450 + }, + { + "epoch": 4.359057152082661, + "grad_norm": 0.6156135201454163, + "learning_rate": 0.0003389246848399957, + "loss": 3.3765, + "step": 40500 + }, + { + "epoch": 4.364438704122269, + "grad_norm": 0.6550938487052917, + "learning_rate": 0.0003386014438099343, + "loss": 3.4011, + "step": 40550 + }, + { + "epoch": 4.369820256161877, + "grad_norm": 0.6099169254302979, + "learning_rate": 0.0003382782027798728, + "loss": 3.3948, + "step": 40600 + }, + { + "epoch": 4.375201808201485, + "grad_norm": 0.6787609457969666, + "learning_rate": 0.0003379549617498114, + "loss": 3.4118, + "step": 40650 + }, + { + "epoch": 4.3805833602410935, + "grad_norm": 0.6305999755859375, + "learning_rate": 0.00033763172071974996, + "loss": 3.4038, + "step": 40700 + }, + { + "epoch": 4.385964912280702, + "grad_norm": 0.6593831181526184, + "learning_rate": 0.00033730847968968855, + "loss": 3.4024, + "step": 40750 + }, + { + "epoch": 4.39134646432031, + "grad_norm": 0.6971632838249207, + "learning_rate": 0.0003369852386596272, + "loss": 3.3823, + "step": 40800 + }, + { + "epoch": 4.396728016359918, + "grad_norm": 0.6232396960258484, + "learning_rate": 0.00033666199762956574, + "loss": 3.4003, + "step": 40850 + }, + { + "epoch": 4.402109568399527, + "grad_norm": 0.6262663006782532, + "learning_rate": 0.00033633875659950434, + "loss": 3.391, + "step": 40900 + }, + { + "epoch": 4.4074911204391345, + "grad_norm": 0.652194082736969, + "learning_rate": 0.0003360155155694429, + "loss": 3.373, + "step": 40950 + }, + { + "epoch": 4.412872672478743, + "grad_norm": 0.645897388458252, + "learning_rate": 0.0003356922745393815, + "loss": 3.4029, + "step": 41000 + }, + { + "epoch": 4.412872672478743, + "eval_accuracy": 0.37729009049807677, + "eval_loss": 3.4402434825897217, + "eval_runtime": 182.7308, + "eval_samples_per_second": 98.566, + "eval_steps_per_second": 6.162, + "step": 41000 + }, + { + "epoch": 4.418254224518351, + "grad_norm": 0.6175161004066467, + "learning_rate": 0.0003353690335093201, + "loss": 3.3898, + "step": 41050 + }, + { + "epoch": 4.423635776557959, + "grad_norm": 0.6909878849983215, + "learning_rate": 0.0003350457924792587, + "loss": 3.3979, + "step": 41100 + }, + { + "epoch": 4.429017328597568, + "grad_norm": 0.6703479290008545, + "learning_rate": 0.00033472255144919726, + "loss": 3.403, + "step": 41150 + }, + { + "epoch": 4.4343988806371755, + "grad_norm": 0.635631799697876, + "learning_rate": 0.00033439931041913585, + "loss": 3.3764, + "step": 41200 + }, + { + "epoch": 4.439780432676784, + "grad_norm": 0.6716395616531372, + "learning_rate": 0.0003340760693890744, + "loss": 3.3872, + "step": 41250 + }, + { + "epoch": 4.445161984716393, + "grad_norm": 0.6503021717071533, + "learning_rate": 0.000333752828359013, + "loss": 3.3977, + "step": 41300 + }, + { + "epoch": 4.450543536756, + "grad_norm": 0.6269751191139221, + "learning_rate": 0.00033342958732895164, + "loss": 3.3794, + "step": 41350 + }, + { + "epoch": 4.455925088795609, + "grad_norm": 0.6848242282867432, + "learning_rate": 0.0003331063462988902, + "loss": 3.3979, + "step": 41400 + }, + { + "epoch": 4.461306640835216, + "grad_norm": 0.6538987755775452, + "learning_rate": 0.00033278310526882877, + "loss": 3.3977, + "step": 41450 + }, + { + "epoch": 4.466688192874825, + "grad_norm": 0.687504768371582, + "learning_rate": 0.0003324598642387673, + "loss": 3.388, + "step": 41500 + }, + { + "epoch": 4.4720697449144335, + "grad_norm": 0.6829115748405457, + "learning_rate": 0.0003321366232087059, + "loss": 3.3855, + "step": 41550 + }, + { + "epoch": 4.477451296954041, + "grad_norm": 0.6400046944618225, + "learning_rate": 0.0003318133821786445, + "loss": 3.3888, + "step": 41600 + }, + { + "epoch": 4.48283284899365, + "grad_norm": 0.7195608615875244, + "learning_rate": 0.0003314901411485831, + "loss": 3.386, + "step": 41650 + }, + { + "epoch": 4.488214401033258, + "grad_norm": 0.6601772904396057, + "learning_rate": 0.0003311669001185217, + "loss": 3.4025, + "step": 41700 + }, + { + "epoch": 4.493595953072866, + "grad_norm": 0.6627285480499268, + "learning_rate": 0.0003308436590884603, + "loss": 3.3845, + "step": 41750 + }, + { + "epoch": 4.4989775051124745, + "grad_norm": 0.6553217172622681, + "learning_rate": 0.0003305204180583988, + "loss": 3.3975, + "step": 41800 + }, + { + "epoch": 4.504359057152083, + "grad_norm": 0.6015192270278931, + "learning_rate": 0.0003302036418489387, + "loss": 3.4143, + "step": 41850 + }, + { + "epoch": 4.509740609191691, + "grad_norm": 0.709814727306366, + "learning_rate": 0.0003298804008188772, + "loss": 3.3908, + "step": 41900 + }, + { + "epoch": 4.515122161231299, + "grad_norm": 0.6477102637290955, + "learning_rate": 0.0003295571597888158, + "loss": 3.3887, + "step": 41950 + }, + { + "epoch": 4.520503713270907, + "grad_norm": 0.6681662201881409, + "learning_rate": 0.00032923391875875447, + "loss": 3.4236, + "step": 42000 + }, + { + "epoch": 4.520503713270907, + "eval_accuracy": 0.3778097772604935, + "eval_loss": 3.4332406520843506, + "eval_runtime": 182.6918, + "eval_samples_per_second": 98.587, + "eval_steps_per_second": 6.163, + "step": 42000 + }, + { + "epoch": 4.5258852653105155, + "grad_norm": 0.6888872385025024, + "learning_rate": 0.000328910677728693, + "loss": 3.399, + "step": 42050 + }, + { + "epoch": 4.531266817350124, + "grad_norm": 0.6245088577270508, + "learning_rate": 0.0003285874366986316, + "loss": 3.3807, + "step": 42100 + }, + { + "epoch": 4.536648369389732, + "grad_norm": 0.6789868474006653, + "learning_rate": 0.00032826419566857015, + "loss": 3.385, + "step": 42150 + }, + { + "epoch": 4.54202992142934, + "grad_norm": 0.678841769695282, + "learning_rate": 0.00032794095463850874, + "loss": 3.4002, + "step": 42200 + }, + { + "epoch": 4.547411473468949, + "grad_norm": 0.6410040855407715, + "learning_rate": 0.0003276177136084473, + "loss": 3.3948, + "step": 42250 + }, + { + "epoch": 4.5527930255085565, + "grad_norm": 0.6948608756065369, + "learning_rate": 0.00032729447257838593, + "loss": 3.4055, + "step": 42300 + }, + { + "epoch": 4.558174577548165, + "grad_norm": 0.6666476726531982, + "learning_rate": 0.0003269712315483245, + "loss": 3.4067, + "step": 42350 + }, + { + "epoch": 4.563556129587774, + "grad_norm": 0.6703059673309326, + "learning_rate": 0.00032664799051826306, + "loss": 3.3935, + "step": 42400 + }, + { + "epoch": 4.568937681627381, + "grad_norm": 0.6622447371482849, + "learning_rate": 0.00032632474948820166, + "loss": 3.3907, + "step": 42450 + }, + { + "epoch": 4.57431923366699, + "grad_norm": 0.6864114999771118, + "learning_rate": 0.00032600150845814025, + "loss": 3.3855, + "step": 42500 + }, + { + "epoch": 4.579700785706597, + "grad_norm": 0.6508845686912537, + "learning_rate": 0.0003256782674280788, + "loss": 3.3981, + "step": 42550 + }, + { + "epoch": 4.585082337746206, + "grad_norm": 0.6998566389083862, + "learning_rate": 0.00032535502639801744, + "loss": 3.3939, + "step": 42600 + }, + { + "epoch": 4.5904638897858145, + "grad_norm": 0.6499090194702148, + "learning_rate": 0.00032503178536795604, + "loss": 3.396, + "step": 42650 + }, + { + "epoch": 4.595845441825422, + "grad_norm": 0.6362233757972717, + "learning_rate": 0.0003247085443378946, + "loss": 3.394, + "step": 42700 + }, + { + "epoch": 4.601226993865031, + "grad_norm": 0.6605574488639832, + "learning_rate": 0.0003243853033078332, + "loss": 3.3787, + "step": 42750 + }, + { + "epoch": 4.606608545904638, + "grad_norm": 0.6438785195350647, + "learning_rate": 0.0003240620622777717, + "loss": 3.4107, + "step": 42800 + }, + { + "epoch": 4.611990097944247, + "grad_norm": 0.6508049368858337, + "learning_rate": 0.00032373882124771036, + "loss": 3.3849, + "step": 42850 + }, + { + "epoch": 4.6173716499838555, + "grad_norm": 0.651975691318512, + "learning_rate": 0.00032341558021764896, + "loss": 3.3751, + "step": 42900 + }, + { + "epoch": 4.622753202023463, + "grad_norm": 0.6845095157623291, + "learning_rate": 0.0003230923391875875, + "loss": 3.384, + "step": 42950 + }, + { + "epoch": 4.628134754063072, + "grad_norm": 0.6449900269508362, + "learning_rate": 0.0003227690981575261, + "loss": 3.3834, + "step": 43000 + }, + { + "epoch": 4.628134754063072, + "eval_accuracy": 0.37862836811728773, + "eval_loss": 3.428248643875122, + "eval_runtime": 182.9864, + "eval_samples_per_second": 98.428, + "eval_steps_per_second": 6.153, + "step": 43000 + }, + { + "epoch": 4.63351630610268, + "grad_norm": 0.7843633890151978, + "learning_rate": 0.0003224458571274647, + "loss": 3.394, + "step": 43050 + }, + { + "epoch": 4.638897858142288, + "grad_norm": 0.6628377437591553, + "learning_rate": 0.00032212261609740323, + "loss": 3.4261, + "step": 43100 + }, + { + "epoch": 4.6442794101818965, + "grad_norm": 0.6455616354942322, + "learning_rate": 0.0003217993750673419, + "loss": 3.3975, + "step": 43150 + }, + { + "epoch": 4.649660962221505, + "grad_norm": 0.6171681880950928, + "learning_rate": 0.00032147613403728047, + "loss": 3.3786, + "step": 43200 + }, + { + "epoch": 4.655042514261113, + "grad_norm": 0.6919983625411987, + "learning_rate": 0.000321152893007219, + "loss": 3.3941, + "step": 43250 + }, + { + "epoch": 4.660424066300721, + "grad_norm": 0.6645509004592896, + "learning_rate": 0.0003208296519771576, + "loss": 3.3736, + "step": 43300 + }, + { + "epoch": 4.665805618340329, + "grad_norm": 0.6968538165092468, + "learning_rate": 0.00032050641094709615, + "loss": 3.3966, + "step": 43350 + }, + { + "epoch": 4.6711871703799375, + "grad_norm": 0.6048688888549805, + "learning_rate": 0.00032018316991703474, + "loss": 3.3867, + "step": 43400 + }, + { + "epoch": 4.676568722419546, + "grad_norm": 0.638982892036438, + "learning_rate": 0.0003198599288869734, + "loss": 3.4118, + "step": 43450 + }, + { + "epoch": 4.681950274459154, + "grad_norm": 0.7195716500282288, + "learning_rate": 0.00031953668785691193, + "loss": 3.4058, + "step": 43500 + }, + { + "epoch": 4.687331826498762, + "grad_norm": 0.6764750480651855, + "learning_rate": 0.0003192134468268505, + "loss": 3.4037, + "step": 43550 + }, + { + "epoch": 4.692713378538371, + "grad_norm": 0.6833808422088623, + "learning_rate": 0.0003188902057967891, + "loss": 3.3974, + "step": 43600 + }, + { + "epoch": 4.6980949305779784, + "grad_norm": 0.6663907766342163, + "learning_rate": 0.00031856696476672766, + "loss": 3.3963, + "step": 43650 + }, + { + "epoch": 4.703476482617587, + "grad_norm": 0.661689281463623, + "learning_rate": 0.0003182437237366663, + "loss": 3.3907, + "step": 43700 + }, + { + "epoch": 4.7088580346571955, + "grad_norm": 0.6300943493843079, + "learning_rate": 0.0003179204827066049, + "loss": 3.3981, + "step": 43750 + }, + { + "epoch": 4.714239586696803, + "grad_norm": 0.6751649975776672, + "learning_rate": 0.00031759724167654344, + "loss": 3.4063, + "step": 43800 + }, + { + "epoch": 4.719621138736412, + "grad_norm": 0.6561940908432007, + "learning_rate": 0.00031727400064648204, + "loss": 3.4089, + "step": 43850 + }, + { + "epoch": 4.725002690776019, + "grad_norm": 0.6460406184196472, + "learning_rate": 0.0003169507596164206, + "loss": 3.4121, + "step": 43900 + }, + { + "epoch": 4.730384242815628, + "grad_norm": 0.7084411382675171, + "learning_rate": 0.0003166275185863592, + "loss": 3.397, + "step": 43950 + }, + { + "epoch": 4.7357657948552365, + "grad_norm": 0.6859269142150879, + "learning_rate": 0.0003163042775562978, + "loss": 3.3957, + "step": 44000 + }, + { + "epoch": 4.7357657948552365, + "eval_accuracy": 0.3787118135351293, + "eval_loss": 3.4233696460723877, + "eval_runtime": 182.724, + "eval_samples_per_second": 98.569, + "eval_steps_per_second": 6.162, + "step": 44000 + }, + { + "epoch": 4.741147346894844, + "grad_norm": 0.6808167695999146, + "learning_rate": 0.00031598103652623636, + "loss": 3.3983, + "step": 44050 + }, + { + "epoch": 4.746528898934453, + "grad_norm": 0.6858932375907898, + "learning_rate": 0.00031565779549617496, + "loss": 3.3932, + "step": 44100 + }, + { + "epoch": 4.751910450974061, + "grad_norm": 0.6018813252449036, + "learning_rate": 0.00031533455446611355, + "loss": 3.3963, + "step": 44150 + }, + { + "epoch": 4.757292003013669, + "grad_norm": 0.6454479098320007, + "learning_rate": 0.0003150113134360521, + "loss": 3.4044, + "step": 44200 + }, + { + "epoch": 4.7626735550532775, + "grad_norm": 0.68412846326828, + "learning_rate": 0.0003146880724059907, + "loss": 3.4075, + "step": 44250 + }, + { + "epoch": 4.768055107092886, + "grad_norm": 0.6248152852058411, + "learning_rate": 0.00031436483137592934, + "loss": 3.4068, + "step": 44300 + }, + { + "epoch": 4.773436659132494, + "grad_norm": 0.7138214111328125, + "learning_rate": 0.0003140415903458679, + "loss": 3.4007, + "step": 44350 + }, + { + "epoch": 4.778818211172102, + "grad_norm": 0.6977150440216064, + "learning_rate": 0.00031371834931580647, + "loss": 3.3887, + "step": 44400 + }, + { + "epoch": 4.78419976321171, + "grad_norm": 0.6793955564498901, + "learning_rate": 0.000313395108285745, + "loss": 3.3965, + "step": 44450 + }, + { + "epoch": 4.7895813152513185, + "grad_norm": 0.7022804617881775, + "learning_rate": 0.0003130718672556836, + "loss": 3.3973, + "step": 44500 + }, + { + "epoch": 4.794962867290927, + "grad_norm": 0.6722605228424072, + "learning_rate": 0.0003127486262256222, + "loss": 3.3857, + "step": 44550 + }, + { + "epoch": 4.800344419330535, + "grad_norm": 0.6196901798248291, + "learning_rate": 0.0003124253851955608, + "loss": 3.3995, + "step": 44600 + }, + { + "epoch": 4.805725971370143, + "grad_norm": 0.6370094418525696, + "learning_rate": 0.0003121021441654994, + "loss": 3.3821, + "step": 44650 + }, + { + "epoch": 4.811107523409751, + "grad_norm": 0.6328933238983154, + "learning_rate": 0.0003117853679560392, + "loss": 3.4183, + "step": 44700 + }, + { + "epoch": 4.8164890754493594, + "grad_norm": 0.7133774757385254, + "learning_rate": 0.0003114621269259778, + "loss": 3.376, + "step": 44750 + }, + { + "epoch": 4.821870627488968, + "grad_norm": 0.6690739393234253, + "learning_rate": 0.00031113888589591633, + "loss": 3.3853, + "step": 44800 + }, + { + "epoch": 4.827252179528576, + "grad_norm": 0.6346887350082397, + "learning_rate": 0.00031081564486585493, + "loss": 3.406, + "step": 44850 + }, + { + "epoch": 4.832633731568184, + "grad_norm": 0.7371925115585327, + "learning_rate": 0.0003104924038357935, + "loss": 3.3955, + "step": 44900 + }, + { + "epoch": 4.838015283607793, + "grad_norm": 0.651321530342102, + "learning_rate": 0.0003101691628057321, + "loss": 3.3972, + "step": 44950 + }, + { + "epoch": 4.8433968356474, + "grad_norm": 0.6792730689048767, + "learning_rate": 0.0003098459217756707, + "loss": 3.3904, + "step": 45000 + }, + { + "epoch": 4.8433968356474, + "eval_accuracy": 0.37944913203183833, + "eval_loss": 3.41748309135437, + "eval_runtime": 182.8169, + "eval_samples_per_second": 98.519, + "eval_steps_per_second": 6.159, + "step": 45000 + }, + { + "epoch": 4.848778387687009, + "grad_norm": 0.66081702709198, + "learning_rate": 0.0003095226807456093, + "loss": 3.3904, + "step": 45050 + }, + { + "epoch": 4.8541599397266175, + "grad_norm": 0.7229673266410828, + "learning_rate": 0.00030919943971554785, + "loss": 3.4034, + "step": 45100 + }, + { + "epoch": 4.859541491766225, + "grad_norm": 0.6608504056930542, + "learning_rate": 0.00030887619868548644, + "loss": 3.3975, + "step": 45150 + }, + { + "epoch": 4.864923043805834, + "grad_norm": 0.6587163805961609, + "learning_rate": 0.000308552957655425, + "loss": 3.3832, + "step": 45200 + }, + { + "epoch": 4.870304595845441, + "grad_norm": 0.6220030784606934, + "learning_rate": 0.00030822971662536363, + "loss": 3.3956, + "step": 45250 + }, + { + "epoch": 4.87568614788505, + "grad_norm": 0.6646186113357544, + "learning_rate": 0.0003079064755953022, + "loss": 3.4069, + "step": 45300 + }, + { + "epoch": 4.8810676999246585, + "grad_norm": 0.6473865509033203, + "learning_rate": 0.00030758323456524077, + "loss": 3.3964, + "step": 45350 + }, + { + "epoch": 4.886449251964266, + "grad_norm": 0.6625189781188965, + "learning_rate": 0.00030725999353517936, + "loss": 3.3962, + "step": 45400 + }, + { + "epoch": 4.891830804003875, + "grad_norm": 0.656329333782196, + "learning_rate": 0.00030693675250511795, + "loss": 3.3861, + "step": 45450 + }, + { + "epoch": 4.897212356043483, + "grad_norm": 0.6948346495628357, + "learning_rate": 0.00030661351147505655, + "loss": 3.392, + "step": 45500 + }, + { + "epoch": 4.902593908083091, + "grad_norm": 0.6399689316749573, + "learning_rate": 0.00030629027044499514, + "loss": 3.3961, + "step": 45550 + }, + { + "epoch": 4.9079754601226995, + "grad_norm": 0.6699696779251099, + "learning_rate": 0.00030596702941493374, + "loss": 3.389, + "step": 45600 + }, + { + "epoch": 4.913357012162308, + "grad_norm": 0.6273096799850464, + "learning_rate": 0.0003056437883848723, + "loss": 3.3964, + "step": 45650 + }, + { + "epoch": 4.918738564201916, + "grad_norm": 0.6409399509429932, + "learning_rate": 0.0003053205473548109, + "loss": 3.3961, + "step": 45700 + }, + { + "epoch": 4.924120116241524, + "grad_norm": 0.6525577306747437, + "learning_rate": 0.0003049973063247494, + "loss": 3.3739, + "step": 45750 + }, + { + "epoch": 4.929501668281132, + "grad_norm": 0.6267456412315369, + "learning_rate": 0.00030467406529468806, + "loss": 3.3891, + "step": 45800 + }, + { + "epoch": 4.9348832203207404, + "grad_norm": 0.610181987285614, + "learning_rate": 0.00030435082426462666, + "loss": 3.3808, + "step": 45850 + }, + { + "epoch": 4.940264772360349, + "grad_norm": 0.6637799739837646, + "learning_rate": 0.0003040275832345652, + "loss": 3.392, + "step": 45900 + }, + { + "epoch": 4.945646324399957, + "grad_norm": 0.7256836891174316, + "learning_rate": 0.0003037043422045038, + "loss": 3.3918, + "step": 45950 + }, + { + "epoch": 4.951027876439565, + "grad_norm": 0.6999911069869995, + "learning_rate": 0.0003033811011744424, + "loss": 3.3921, + "step": 46000 + }, + { + "epoch": 4.951027876439565, + "eval_accuracy": 0.37981605283398784, + "eval_loss": 3.412885904312134, + "eval_runtime": 183.1736, + "eval_samples_per_second": 98.327, + "eval_steps_per_second": 6.147, + "step": 46000 + }, + { + "epoch": 4.956409428479174, + "grad_norm": 0.6925266981124878, + "learning_rate": 0.00030305786014438093, + "loss": 3.4043, + "step": 46050 + }, + { + "epoch": 4.961790980518781, + "grad_norm": 0.6724205613136292, + "learning_rate": 0.0003027346191143196, + "loss": 3.3755, + "step": 46100 + }, + { + "epoch": 4.96717253255839, + "grad_norm": 0.6727187633514404, + "learning_rate": 0.00030241137808425817, + "loss": 3.3878, + "step": 46150 + }, + { + "epoch": 4.9725540845979985, + "grad_norm": 0.6427590250968933, + "learning_rate": 0.0003020881370541967, + "loss": 3.4044, + "step": 46200 + }, + { + "epoch": 4.977935636637606, + "grad_norm": 0.6517530083656311, + "learning_rate": 0.0003017648960241353, + "loss": 3.3773, + "step": 46250 + }, + { + "epoch": 4.983317188677215, + "grad_norm": 0.6581140160560608, + "learning_rate": 0.00030144165499407385, + "loss": 3.3772, + "step": 46300 + }, + { + "epoch": 4.988698740716822, + "grad_norm": 0.6741767525672913, + "learning_rate": 0.00030111841396401244, + "loss": 3.3952, + "step": 46350 + }, + { + "epoch": 4.994080292756431, + "grad_norm": 0.6367133259773254, + "learning_rate": 0.0003007951729339511, + "loss": 3.3955, + "step": 46400 + }, + { + "epoch": 4.9994618447960395, + "grad_norm": 0.6892203092575073, + "learning_rate": 0.00030047193190388963, + "loss": 3.3779, + "step": 46450 + }, + { + "epoch": 5.004843396835647, + "grad_norm": 0.6889623403549194, + "learning_rate": 0.0003001486908738282, + "loss": 3.3198, + "step": 46500 + }, + { + "epoch": 5.010224948875256, + "grad_norm": 0.6872815489768982, + "learning_rate": 0.0002998254498437668, + "loss": 3.3037, + "step": 46550 + }, + { + "epoch": 5.015606500914864, + "grad_norm": 0.6558205485343933, + "learning_rate": 0.0002995022088137054, + "loss": 3.2964, + "step": 46600 + }, + { + "epoch": 5.020988052954472, + "grad_norm": 0.6722169518470764, + "learning_rate": 0.00029917896778364396, + "loss": 3.2927, + "step": 46650 + }, + { + "epoch": 5.0263696049940805, + "grad_norm": 0.6730004549026489, + "learning_rate": 0.00029885572675358255, + "loss": 3.3064, + "step": 46700 + }, + { + "epoch": 5.031751157033688, + "grad_norm": 0.6149821281433105, + "learning_rate": 0.00029853248572352114, + "loss": 3.2977, + "step": 46750 + }, + { + "epoch": 5.037132709073297, + "grad_norm": 0.6445924639701843, + "learning_rate": 0.00029820924469345974, + "loss": 3.3056, + "step": 46800 + }, + { + "epoch": 5.042514261112905, + "grad_norm": 0.6724662780761719, + "learning_rate": 0.0002978860036633983, + "loss": 3.3142, + "step": 46850 + }, + { + "epoch": 5.047895813152513, + "grad_norm": 0.6918281316757202, + "learning_rate": 0.00029756276263333693, + "loss": 3.3051, + "step": 46900 + }, + { + "epoch": 5.0532773651921215, + "grad_norm": 0.6730775833129883, + "learning_rate": 0.00029723952160327547, + "loss": 3.3048, + "step": 46950 + }, + { + "epoch": 5.05865891723173, + "grad_norm": 0.6861401200294495, + "learning_rate": 0.00029691628057321406, + "loss": 3.3313, + "step": 47000 + }, + { + "epoch": 5.05865891723173, + "eval_accuracy": 0.3799867465207444, + "eval_loss": 3.4170210361480713, + "eval_runtime": 182.9491, + "eval_samples_per_second": 98.448, + "eval_steps_per_second": 6.155, + "step": 47000 + }, + { + "epoch": 5.064040469271338, + "grad_norm": 0.6703794002532959, + "learning_rate": 0.00029659303954315266, + "loss": 3.3135, + "step": 47050 + }, + { + "epoch": 5.069422021310946, + "grad_norm": 0.6781489849090576, + "learning_rate": 0.00029626979851309125, + "loss": 3.3085, + "step": 47100 + }, + { + "epoch": 5.074803573350554, + "grad_norm": 0.6689127683639526, + "learning_rate": 0.00029594655748302985, + "loss": 3.3014, + "step": 47150 + }, + { + "epoch": 5.080185125390162, + "grad_norm": 0.6758719682693481, + "learning_rate": 0.0002956233164529684, + "loss": 3.3099, + "step": 47200 + }, + { + "epoch": 5.085566677429771, + "grad_norm": 0.6829871535301208, + "learning_rate": 0.000295300075422907, + "loss": 3.3137, + "step": 47250 + }, + { + "epoch": 5.090948229469379, + "grad_norm": 0.7148377299308777, + "learning_rate": 0.0002949768343928456, + "loss": 3.3026, + "step": 47300 + }, + { + "epoch": 5.096329781508987, + "grad_norm": 0.706376314163208, + "learning_rate": 0.00029465359336278417, + "loss": 3.3088, + "step": 47350 + }, + { + "epoch": 5.101711333548596, + "grad_norm": 0.658332109451294, + "learning_rate": 0.0002943303523327227, + "loss": 3.3188, + "step": 47400 + }, + { + "epoch": 5.107092885588203, + "grad_norm": 0.6532283425331116, + "learning_rate": 0.00029400711130266136, + "loss": 3.3087, + "step": 47450 + }, + { + "epoch": 5.112474437627812, + "grad_norm": 0.670990526676178, + "learning_rate": 0.0002936838702725999, + "loss": 3.319, + "step": 47500 + }, + { + "epoch": 5.1178559896674205, + "grad_norm": 0.6792743802070618, + "learning_rate": 0.0002933606292425385, + "loss": 3.3171, + "step": 47550 + }, + { + "epoch": 5.123237541707028, + "grad_norm": 0.6803917288780212, + "learning_rate": 0.0002930373882124771, + "loss": 3.3264, + "step": 47600 + }, + { + "epoch": 5.128619093746637, + "grad_norm": 0.6759734749794006, + "learning_rate": 0.0002927141471824157, + "loss": 3.3128, + "step": 47650 + }, + { + "epoch": 5.134000645786244, + "grad_norm": 0.6762935519218445, + "learning_rate": 0.0002923909061523542, + "loss": 3.3171, + "step": 47700 + }, + { + "epoch": 5.139382197825853, + "grad_norm": 0.7197624444961548, + "learning_rate": 0.0002920676651222928, + "loss": 3.3111, + "step": 47750 + }, + { + "epoch": 5.1447637498654615, + "grad_norm": 0.6673755049705505, + "learning_rate": 0.0002917444240922314, + "loss": 3.3256, + "step": 47800 + }, + { + "epoch": 5.150145301905069, + "grad_norm": 0.6834902167320251, + "learning_rate": 0.00029142118306216996, + "loss": 3.3293, + "step": 47850 + }, + { + "epoch": 5.155526853944678, + "grad_norm": 0.6759807467460632, + "learning_rate": 0.0002910979420321086, + "loss": 3.3295, + "step": 47900 + }, + { + "epoch": 5.160908405984286, + "grad_norm": 0.7257971167564392, + "learning_rate": 0.00029077470100204715, + "loss": 3.3301, + "step": 47950 + }, + { + "epoch": 5.166289958023894, + "grad_norm": 0.7746021747589111, + "learning_rate": 0.00029045145997198574, + "loss": 3.333, + "step": 48000 + }, + { + "epoch": 5.166289958023894, + "eval_accuracy": 0.3801953600653482, + "eval_loss": 3.416738986968994, + "eval_runtime": 183.2917, + "eval_samples_per_second": 98.264, + "eval_steps_per_second": 6.143, + "step": 48000 + }, + { + "epoch": 5.1716715100635025, + "grad_norm": 0.6825886964797974, + "learning_rate": 0.00029012821894192433, + "loss": 3.3222, + "step": 48050 + }, + { + "epoch": 5.17705306210311, + "grad_norm": 0.6440089344978333, + "learning_rate": 0.00028980497791186293, + "loss": 3.3423, + "step": 48100 + }, + { + "epoch": 5.182434614142719, + "grad_norm": 0.6887645125389099, + "learning_rate": 0.0002894817368818015, + "loss": 3.3232, + "step": 48150 + }, + { + "epoch": 5.187816166182327, + "grad_norm": 0.646507978439331, + "learning_rate": 0.00028915849585174006, + "loss": 3.3021, + "step": 48200 + }, + { + "epoch": 5.193197718221935, + "grad_norm": 0.726085364818573, + "learning_rate": 0.00028883525482167866, + "loss": 3.3173, + "step": 48250 + }, + { + "epoch": 5.198579270261543, + "grad_norm": 0.6602137684822083, + "learning_rate": 0.00028851201379161725, + "loss": 3.3115, + "step": 48300 + }, + { + "epoch": 5.203960822301152, + "grad_norm": 0.65272456407547, + "learning_rate": 0.00028818877276155585, + "loss": 3.34, + "step": 48350 + }, + { + "epoch": 5.20934237434076, + "grad_norm": 0.6690257787704468, + "learning_rate": 0.0002878655317314944, + "loss": 3.3203, + "step": 48400 + }, + { + "epoch": 5.214723926380368, + "grad_norm": 0.6628714203834534, + "learning_rate": 0.00028754229070143304, + "loss": 3.3076, + "step": 48450 + }, + { + "epoch": 5.220105478419977, + "grad_norm": 0.6842119097709656, + "learning_rate": 0.0002872190496713716, + "loss": 3.331, + "step": 48500 + }, + { + "epoch": 5.225487030459584, + "grad_norm": 0.696015477180481, + "learning_rate": 0.0002868958086413102, + "loss": 3.3258, + "step": 48550 + }, + { + "epoch": 5.230868582499193, + "grad_norm": 0.6456036567687988, + "learning_rate": 0.00028657256761124877, + "loss": 3.3165, + "step": 48600 + }, + { + "epoch": 5.236250134538801, + "grad_norm": 0.6485376954078674, + "learning_rate": 0.00028624932658118736, + "loss": 3.3451, + "step": 48650 + }, + { + "epoch": 5.241631686578409, + "grad_norm": 0.659721314907074, + "learning_rate": 0.0002859260855511259, + "loss": 3.3305, + "step": 48700 + }, + { + "epoch": 5.247013238618018, + "grad_norm": 0.6913601160049438, + "learning_rate": 0.00028560930934166576, + "loss": 3.3227, + "step": 48750 + }, + { + "epoch": 5.252394790657625, + "grad_norm": 0.6780300736427307, + "learning_rate": 0.00028528606831160436, + "loss": 3.331, + "step": 48800 + }, + { + "epoch": 5.257776342697234, + "grad_norm": 0.6792374849319458, + "learning_rate": 0.0002849628272815429, + "loss": 3.3381, + "step": 48850 + }, + { + "epoch": 5.2631578947368425, + "grad_norm": 0.6867091655731201, + "learning_rate": 0.0002846395862514815, + "loss": 3.3138, + "step": 48900 + }, + { + "epoch": 5.26853944677645, + "grad_norm": 0.6632397770881653, + "learning_rate": 0.0002843163452214201, + "loss": 3.3316, + "step": 48950 + }, + { + "epoch": 5.273920998816059, + "grad_norm": 0.6680681705474854, + "learning_rate": 0.0002839931041913587, + "loss": 3.3374, + "step": 49000 + }, + { + "epoch": 5.273920998816059, + "eval_accuracy": 0.380897692332181, + "eval_loss": 3.412996768951416, + "eval_runtime": 182.5799, + "eval_samples_per_second": 98.647, + "eval_steps_per_second": 6.167, + "step": 49000 + }, + { + "epoch": 5.279302550855666, + "grad_norm": 0.7043352127075195, + "learning_rate": 0.0002836698631612972, + "loss": 3.3241, + "step": 49050 + }, + { + "epoch": 5.284684102895275, + "grad_norm": 0.6589981913566589, + "learning_rate": 0.00028334662213123587, + "loss": 3.3239, + "step": 49100 + }, + { + "epoch": 5.2900656549348835, + "grad_norm": 0.6708270907402039, + "learning_rate": 0.0002830233811011744, + "loss": 3.3458, + "step": 49150 + }, + { + "epoch": 5.295447206974491, + "grad_norm": 0.7209701538085938, + "learning_rate": 0.000282700140071113, + "loss": 3.3305, + "step": 49200 + }, + { + "epoch": 5.3008287590141, + "grad_norm": 0.6668233871459961, + "learning_rate": 0.0002823768990410516, + "loss": 3.3244, + "step": 49250 + }, + { + "epoch": 5.306210311053708, + "grad_norm": 0.7002584934234619, + "learning_rate": 0.00028205365801099014, + "loss": 3.3261, + "step": 49300 + }, + { + "epoch": 5.311591863093316, + "grad_norm": 0.6543057560920715, + "learning_rate": 0.00028173041698092874, + "loss": 3.3427, + "step": 49350 + }, + { + "epoch": 5.316973415132924, + "grad_norm": 0.6252505779266357, + "learning_rate": 0.00028140717595086733, + "loss": 3.3378, + "step": 49400 + }, + { + "epoch": 5.322354967172533, + "grad_norm": 0.7060573101043701, + "learning_rate": 0.0002810839349208059, + "loss": 3.3324, + "step": 49450 + }, + { + "epoch": 5.327736519212141, + "grad_norm": 0.7204796075820923, + "learning_rate": 0.00028076069389074447, + "loss": 3.339, + "step": 49500 + }, + { + "epoch": 5.333118071251749, + "grad_norm": 0.7042475342750549, + "learning_rate": 0.0002804374528606831, + "loss": 3.3392, + "step": 49550 + }, + { + "epoch": 5.338499623291357, + "grad_norm": 0.6284716129302979, + "learning_rate": 0.00028011421183062166, + "loss": 3.3334, + "step": 49600 + }, + { + "epoch": 5.343881175330965, + "grad_norm": 0.7170406579971313, + "learning_rate": 0.00027979097080056025, + "loss": 3.3344, + "step": 49650 + }, + { + "epoch": 5.349262727370574, + "grad_norm": 0.6857984066009521, + "learning_rate": 0.00027946772977049885, + "loss": 3.3497, + "step": 49700 + }, + { + "epoch": 5.354644279410182, + "grad_norm": 0.6786603331565857, + "learning_rate": 0.00027914448874043744, + "loss": 3.3371, + "step": 49750 + }, + { + "epoch": 5.36002583144979, + "grad_norm": 0.6698256134986877, + "learning_rate": 0.00027882124771037603, + "loss": 3.3305, + "step": 49800 + }, + { + "epoch": 5.365407383489399, + "grad_norm": 0.6916489601135254, + "learning_rate": 0.0002784980066803146, + "loss": 3.3315, + "step": 49850 + }, + { + "epoch": 5.370788935529006, + "grad_norm": 0.6628431081771851, + "learning_rate": 0.00027817476565025317, + "loss": 3.3218, + "step": 49900 + }, + { + "epoch": 5.376170487568615, + "grad_norm": 0.6686885356903076, + "learning_rate": 0.00027785152462019176, + "loss": 3.344, + "step": 49950 + }, + { + "epoch": 5.3815520396082235, + "grad_norm": 0.6686023473739624, + "learning_rate": 0.00027752828359013036, + "loss": 3.3217, + "step": 50000 + }, + { + "epoch": 5.3815520396082235, + "eval_accuracy": 0.3814086868635725, + "eval_loss": 3.4041335582733154, + "eval_runtime": 182.7349, + "eval_samples_per_second": 98.564, + "eval_steps_per_second": 6.162, + "step": 50000 + }, + { + "epoch": 5.386933591647831, + "grad_norm": 0.7179530262947083, + "learning_rate": 0.0002772050425600689, + "loss": 3.3474, + "step": 50050 + }, + { + "epoch": 5.39231514368744, + "grad_norm": 0.7137858867645264, + "learning_rate": 0.00027688180153000755, + "loss": 3.342, + "step": 50100 + }, + { + "epoch": 5.397696695727047, + "grad_norm": 0.6810168027877808, + "learning_rate": 0.0002765585604999461, + "loss": 3.3531, + "step": 50150 + }, + { + "epoch": 5.403078247766656, + "grad_norm": 0.6813143491744995, + "learning_rate": 0.0002762353194698847, + "loss": 3.3448, + "step": 50200 + }, + { + "epoch": 5.4084597998062645, + "grad_norm": 0.7286673188209534, + "learning_rate": 0.0002759120784398233, + "loss": 3.351, + "step": 50250 + }, + { + "epoch": 5.413841351845872, + "grad_norm": 0.670843243598938, + "learning_rate": 0.00027558883740976187, + "loss": 3.3436, + "step": 50300 + }, + { + "epoch": 5.419222903885481, + "grad_norm": 0.7313069105148315, + "learning_rate": 0.0002752655963797004, + "loss": 3.3302, + "step": 50350 + }, + { + "epoch": 5.424604455925088, + "grad_norm": 0.7491310834884644, + "learning_rate": 0.000274942355349639, + "loss": 3.3428, + "step": 50400 + }, + { + "epoch": 5.429986007964697, + "grad_norm": 0.6367394924163818, + "learning_rate": 0.0002746191143195776, + "loss": 3.3479, + "step": 50450 + }, + { + "epoch": 5.435367560004305, + "grad_norm": 0.6688310503959656, + "learning_rate": 0.0002742958732895162, + "loss": 3.3284, + "step": 50500 + }, + { + "epoch": 5.440749112043913, + "grad_norm": 0.7176879644393921, + "learning_rate": 0.0002739726322594548, + "loss": 3.3527, + "step": 50550 + }, + { + "epoch": 5.446130664083522, + "grad_norm": 0.6491169333457947, + "learning_rate": 0.00027364939122939333, + "loss": 3.3275, + "step": 50600 + }, + { + "epoch": 5.45151221612313, + "grad_norm": 0.75172358751297, + "learning_rate": 0.0002733261501993319, + "loss": 3.3364, + "step": 50650 + }, + { + "epoch": 5.456893768162738, + "grad_norm": 0.6788306832313538, + "learning_rate": 0.0002730029091692705, + "loss": 3.3293, + "step": 50700 + }, + { + "epoch": 5.462275320202346, + "grad_norm": 0.6975774765014648, + "learning_rate": 0.0002726861329598104, + "loss": 3.3368, + "step": 50750 + }, + { + "epoch": 5.467656872241955, + "grad_norm": 0.6561357378959656, + "learning_rate": 0.0002723628919297489, + "loss": 3.3413, + "step": 50800 + }, + { + "epoch": 5.473038424281563, + "grad_norm": 0.6256112456321716, + "learning_rate": 0.0002720396508996875, + "loss": 3.3281, + "step": 50850 + }, + { + "epoch": 5.478419976321171, + "grad_norm": 0.721541166305542, + "learning_rate": 0.0002717164098696261, + "loss": 3.3276, + "step": 50900 + }, + { + "epoch": 5.483801528360779, + "grad_norm": 0.6738684773445129, + "learning_rate": 0.00027139316883956465, + "loss": 3.35, + "step": 50950 + }, + { + "epoch": 5.489183080400387, + "grad_norm": 0.6639395952224731, + "learning_rate": 0.00027106992780950325, + "loss": 3.3285, + "step": 51000 + }, + { + "epoch": 5.489183080400387, + "eval_accuracy": 0.381844710902372, + "eval_loss": 3.4026851654052734, + "eval_runtime": 182.4881, + "eval_samples_per_second": 98.697, + "eval_steps_per_second": 6.17, + "step": 51000 + }, + { + "epoch": 5.494564632439996, + "grad_norm": 0.6820451617240906, + "learning_rate": 0.00027074668677944184, + "loss": 3.3397, + "step": 51050 + }, + { + "epoch": 5.499946184479604, + "grad_norm": 0.6895923018455505, + "learning_rate": 0.00027042344574938044, + "loss": 3.3435, + "step": 51100 + }, + { + "epoch": 5.505327736519212, + "grad_norm": 0.6487290859222412, + "learning_rate": 0.000270100204719319, + "loss": 3.3394, + "step": 51150 + }, + { + "epoch": 5.510709288558821, + "grad_norm": 0.682824969291687, + "learning_rate": 0.0002697769636892576, + "loss": 3.3384, + "step": 51200 + }, + { + "epoch": 5.516090840598428, + "grad_norm": 0.6406248807907104, + "learning_rate": 0.00026945372265919617, + "loss": 3.3383, + "step": 51250 + }, + { + "epoch": 5.521472392638037, + "grad_norm": 0.6504181027412415, + "learning_rate": 0.00026913048162913476, + "loss": 3.3614, + "step": 51300 + }, + { + "epoch": 5.5268539446776455, + "grad_norm": 0.7287770509719849, + "learning_rate": 0.00026880724059907336, + "loss": 3.3374, + "step": 51350 + }, + { + "epoch": 5.532235496717253, + "grad_norm": 0.7049496173858643, + "learning_rate": 0.00026848399956901195, + "loss": 3.3374, + "step": 51400 + }, + { + "epoch": 5.537617048756862, + "grad_norm": 0.659416913986206, + "learning_rate": 0.0002681607585389505, + "loss": 3.3432, + "step": 51450 + }, + { + "epoch": 5.542998600796469, + "grad_norm": 0.6534185409545898, + "learning_rate": 0.0002678375175088891, + "loss": 3.3324, + "step": 51500 + }, + { + "epoch": 5.548380152836078, + "grad_norm": 0.6897971630096436, + "learning_rate": 0.0002675142764788277, + "loss": 3.3326, + "step": 51550 + }, + { + "epoch": 5.553761704875686, + "grad_norm": 0.6762552261352539, + "learning_rate": 0.0002671910354487663, + "loss": 3.3378, + "step": 51600 + }, + { + "epoch": 5.559143256915294, + "grad_norm": 0.6849751472473145, + "learning_rate": 0.00026686779441870487, + "loss": 3.3355, + "step": 51650 + }, + { + "epoch": 5.564524808954903, + "grad_norm": 0.6968287229537964, + "learning_rate": 0.0002665445533886434, + "loss": 3.344, + "step": 51700 + }, + { + "epoch": 5.569906360994511, + "grad_norm": 0.6949653029441833, + "learning_rate": 0.00026622131235858206, + "loss": 3.3416, + "step": 51750 + }, + { + "epoch": 5.575287913034119, + "grad_norm": 0.677852213382721, + "learning_rate": 0.0002658980713285206, + "loss": 3.3566, + "step": 51800 + }, + { + "epoch": 5.580669465073727, + "grad_norm": 0.6766265034675598, + "learning_rate": 0.0002655748302984592, + "loss": 3.3509, + "step": 51850 + }, + { + "epoch": 5.586051017113336, + "grad_norm": 0.6820150017738342, + "learning_rate": 0.0002652515892683978, + "loss": 3.3529, + "step": 51900 + }, + { + "epoch": 5.591432569152944, + "grad_norm": 0.7299242615699768, + "learning_rate": 0.0002649283482383364, + "loss": 3.3359, + "step": 51950 + }, + { + "epoch": 5.596814121192552, + "grad_norm": 0.67701256275177, + "learning_rate": 0.0002646051072082749, + "loss": 3.3538, + "step": 52000 + }, + { + "epoch": 5.596814121192552, + "eval_accuracy": 0.38225780918184266, + "eval_loss": 3.3965156078338623, + "eval_runtime": 182.6924, + "eval_samples_per_second": 98.586, + "eval_steps_per_second": 6.163, + "step": 52000 + }, + { + "epoch": 5.60219567323216, + "grad_norm": 0.6956893801689148, + "learning_rate": 0.0002642818661782135, + "loss": 3.3411, + "step": 52050 + }, + { + "epoch": 5.607577225271768, + "grad_norm": 0.6988450288772583, + "learning_rate": 0.0002639586251481521, + "loss": 3.3266, + "step": 52100 + }, + { + "epoch": 5.612958777311377, + "grad_norm": 0.6658537983894348, + "learning_rate": 0.0002636353841180907, + "loss": 3.3534, + "step": 52150 + }, + { + "epoch": 5.618340329350985, + "grad_norm": 0.7193296551704407, + "learning_rate": 0.0002633121430880293, + "loss": 3.3355, + "step": 52200 + }, + { + "epoch": 5.623721881390593, + "grad_norm": 0.76201993227005, + "learning_rate": 0.00026298890205796784, + "loss": 3.3333, + "step": 52250 + }, + { + "epoch": 5.629103433430201, + "grad_norm": 0.6642473340034485, + "learning_rate": 0.00026266566102790644, + "loss": 3.3183, + "step": 52300 + }, + { + "epoch": 5.634484985469809, + "grad_norm": 0.7340889573097229, + "learning_rate": 0.00026234241999784503, + "loss": 3.3426, + "step": 52350 + }, + { + "epoch": 5.639866537509418, + "grad_norm": 0.6511938571929932, + "learning_rate": 0.0002620191789677836, + "loss": 3.3428, + "step": 52400 + }, + { + "epoch": 5.645248089549026, + "grad_norm": 0.7516026496887207, + "learning_rate": 0.00026169593793772217, + "loss": 3.3299, + "step": 52450 + }, + { + "epoch": 5.650629641588634, + "grad_norm": 0.6919920444488525, + "learning_rate": 0.0002613726969076608, + "loss": 3.3476, + "step": 52500 + }, + { + "epoch": 5.656011193628243, + "grad_norm": 0.6615656018257141, + "learning_rate": 0.00026104945587759936, + "loss": 3.3536, + "step": 52550 + }, + { + "epoch": 5.66139274566785, + "grad_norm": 0.6720345616340637, + "learning_rate": 0.00026072621484753795, + "loss": 3.3274, + "step": 52600 + }, + { + "epoch": 5.666774297707459, + "grad_norm": 0.7008869647979736, + "learning_rate": 0.00026040297381747655, + "loss": 3.3585, + "step": 52650 + }, + { + "epoch": 5.672155849747067, + "grad_norm": 0.6563118696212769, + "learning_rate": 0.00026007973278741514, + "loss": 3.3435, + "step": 52700 + }, + { + "epoch": 5.677537401786675, + "grad_norm": 0.7016202211380005, + "learning_rate": 0.00025975649175735373, + "loss": 3.3271, + "step": 52750 + }, + { + "epoch": 5.682918953826284, + "grad_norm": 0.6965526938438416, + "learning_rate": 0.0002594397155478935, + "loss": 3.3241, + "step": 52800 + }, + { + "epoch": 5.688300505865891, + "grad_norm": 0.6786898374557495, + "learning_rate": 0.00025911647451783214, + "loss": 3.3389, + "step": 52850 + }, + { + "epoch": 5.6936820579055, + "grad_norm": 0.7544721364974976, + "learning_rate": 0.0002587932334877707, + "loss": 3.3585, + "step": 52900 + }, + { + "epoch": 5.699063609945108, + "grad_norm": 0.6955776810646057, + "learning_rate": 0.00025846999245770927, + "loss": 3.3371, + "step": 52950 + }, + { + "epoch": 5.704445161984716, + "grad_norm": 0.7224853038787842, + "learning_rate": 0.00025814675142764787, + "loss": 3.3236, + "step": 53000 + }, + { + "epoch": 5.704445161984716, + "eval_accuracy": 0.3824792437672086, + "eval_loss": 3.3930840492248535, + "eval_runtime": 182.5431, + "eval_samples_per_second": 98.667, + "eval_steps_per_second": 6.168, + "step": 53000 + }, + { + "epoch": 5.709826714024325, + "grad_norm": 0.6928134560585022, + "learning_rate": 0.00025782351039758646, + "loss": 3.3293, + "step": 53050 + }, + { + "epoch": 5.715208266063933, + "grad_norm": 0.7381559610366821, + "learning_rate": 0.000257500269367525, + "loss": 3.3342, + "step": 53100 + }, + { + "epoch": 5.720589818103541, + "grad_norm": 0.6981896162033081, + "learning_rate": 0.0002571770283374636, + "loss": 3.3432, + "step": 53150 + }, + { + "epoch": 5.725971370143149, + "grad_norm": 0.7084531188011169, + "learning_rate": 0.0002568537873074022, + "loss": 3.3424, + "step": 53200 + }, + { + "epoch": 5.731352922182758, + "grad_norm": 0.6834819316864014, + "learning_rate": 0.0002565305462773408, + "loss": 3.3341, + "step": 53250 + }, + { + "epoch": 5.736734474222366, + "grad_norm": 0.6777629256248474, + "learning_rate": 0.0002562073052472794, + "loss": 3.3627, + "step": 53300 + }, + { + "epoch": 5.742116026261974, + "grad_norm": 0.6871793270111084, + "learning_rate": 0.0002558840642172179, + "loss": 3.3307, + "step": 53350 + }, + { + "epoch": 5.747497578301582, + "grad_norm": 0.6921470165252686, + "learning_rate": 0.00025556082318715657, + "loss": 3.337, + "step": 53400 + }, + { + "epoch": 5.75287913034119, + "grad_norm": 0.6944039463996887, + "learning_rate": 0.0002552375821570951, + "loss": 3.357, + "step": 53450 + }, + { + "epoch": 5.758260682380799, + "grad_norm": 0.7080736756324768, + "learning_rate": 0.0002549143411270337, + "loss": 3.3487, + "step": 53500 + }, + { + "epoch": 5.763642234420407, + "grad_norm": 0.6794440746307373, + "learning_rate": 0.0002545911000969723, + "loss": 3.344, + "step": 53550 + }, + { + "epoch": 5.769023786460015, + "grad_norm": 0.7343289256095886, + "learning_rate": 0.0002542678590669109, + "loss": 3.3341, + "step": 53600 + }, + { + "epoch": 5.774405338499624, + "grad_norm": 0.7471538186073303, + "learning_rate": 0.00025394461803684943, + "loss": 3.3386, + "step": 53650 + }, + { + "epoch": 5.779786890539231, + "grad_norm": 0.7091389894485474, + "learning_rate": 0.00025362137700678803, + "loss": 3.3487, + "step": 53700 + }, + { + "epoch": 5.78516844257884, + "grad_norm": 0.7032609581947327, + "learning_rate": 0.0002532981359767266, + "loss": 3.3359, + "step": 53750 + }, + { + "epoch": 5.790549994618448, + "grad_norm": 0.7148309350013733, + "learning_rate": 0.0002529748949466652, + "loss": 3.3482, + "step": 53800 + }, + { + "epoch": 5.795931546658056, + "grad_norm": 0.6615278124809265, + "learning_rate": 0.0002526516539166038, + "loss": 3.3418, + "step": 53850 + }, + { + "epoch": 5.801313098697665, + "grad_norm": 0.6663516163825989, + "learning_rate": 0.00025232841288654235, + "loss": 3.3545, + "step": 53900 + }, + { + "epoch": 5.806694650737272, + "grad_norm": 0.680908739566803, + "learning_rate": 0.00025200517185648095, + "loss": 3.3334, + "step": 53950 + }, + { + "epoch": 5.812076202776881, + "grad_norm": 0.7598308324813843, + "learning_rate": 0.00025168193082641954, + "loss": 3.3326, + "step": 54000 + }, + { + "epoch": 5.812076202776881, + "eval_accuracy": 0.3831473503743798, + "eval_loss": 3.387981414794922, + "eval_runtime": 182.8436, + "eval_samples_per_second": 98.505, + "eval_steps_per_second": 6.158, + "step": 54000 + }, + { + "epoch": 5.817457754816489, + "grad_norm": 0.6939805746078491, + "learning_rate": 0.00025135868979635814, + "loss": 3.3218, + "step": 54050 + }, + { + "epoch": 5.822839306856097, + "grad_norm": 0.6918478608131409, + "learning_rate": 0.0002510354487662967, + "loss": 3.3544, + "step": 54100 + }, + { + "epoch": 5.828220858895706, + "grad_norm": 0.703644335269928, + "learning_rate": 0.0002507122077362353, + "loss": 3.3398, + "step": 54150 + }, + { + "epoch": 5.833602410935313, + "grad_norm": 0.6816033720970154, + "learning_rate": 0.00025038896670617387, + "loss": 3.3412, + "step": 54200 + }, + { + "epoch": 5.838983962974922, + "grad_norm": 0.7181455492973328, + "learning_rate": 0.00025006572567611246, + "loss": 3.3289, + "step": 54250 + }, + { + "epoch": 5.84436551501453, + "grad_norm": 0.6734956502914429, + "learning_rate": 0.00024974248464605106, + "loss": 3.3377, + "step": 54300 + }, + { + "epoch": 5.849747067054138, + "grad_norm": 0.703682005405426, + "learning_rate": 0.00024941924361598965, + "loss": 3.3415, + "step": 54350 + }, + { + "epoch": 5.855128619093747, + "grad_norm": 0.7012028694152832, + "learning_rate": 0.00024909600258592825, + "loss": 3.3428, + "step": 54400 + }, + { + "epoch": 5.860510171133355, + "grad_norm": 0.7386256456375122, + "learning_rate": 0.0002487727615558668, + "loss": 3.3364, + "step": 54450 + }, + { + "epoch": 5.865891723172963, + "grad_norm": 0.685370683670044, + "learning_rate": 0.0002484495205258054, + "loss": 3.3489, + "step": 54500 + }, + { + "epoch": 5.871273275212571, + "grad_norm": 0.7215905785560608, + "learning_rate": 0.0002481327443163452, + "loss": 3.3417, + "step": 54550 + }, + { + "epoch": 5.87665482725218, + "grad_norm": 0.6396303176879883, + "learning_rate": 0.0002478095032862838, + "loss": 3.3425, + "step": 54600 + }, + { + "epoch": 5.882036379291788, + "grad_norm": 0.7235729098320007, + "learning_rate": 0.0002474862622562224, + "loss": 3.3433, + "step": 54650 + }, + { + "epoch": 5.887417931331396, + "grad_norm": 0.6854497790336609, + "learning_rate": 0.00024716302122616097, + "loss": 3.3425, + "step": 54700 + }, + { + "epoch": 5.892799483371004, + "grad_norm": 0.7037459015846252, + "learning_rate": 0.0002468397801960995, + "loss": 3.343, + "step": 54750 + }, + { + "epoch": 5.898181035410612, + "grad_norm": 0.6938320398330688, + "learning_rate": 0.0002465165391660381, + "loss": 3.3392, + "step": 54800 + }, + { + "epoch": 5.903562587450221, + "grad_norm": 0.7277181148529053, + "learning_rate": 0.0002461932981359767, + "loss": 3.34, + "step": 54850 + }, + { + "epoch": 5.9089441394898286, + "grad_norm": 0.7088878750801086, + "learning_rate": 0.0002458700571059153, + "loss": 3.3342, + "step": 54900 + }, + { + "epoch": 5.914325691529437, + "grad_norm": 0.7254401445388794, + "learning_rate": 0.0002455468160758539, + "loss": 3.334, + "step": 54950 + }, + { + "epoch": 5.919707243569046, + "grad_norm": 0.6835173964500427, + "learning_rate": 0.00024522357504579243, + "loss": 3.3383, + "step": 55000 + }, + { + "epoch": 5.919707243569046, + "eval_accuracy": 0.38354132574559513, + "eval_loss": 3.384719133377075, + "eval_runtime": 183.0018, + "eval_samples_per_second": 98.42, + "eval_steps_per_second": 6.153, + "step": 55000 + }, + { + "epoch": 5.925088795608653, + "grad_norm": 0.6644113063812256, + "learning_rate": 0.000244900334015731, + "loss": 3.3335, + "step": 55050 + }, + { + "epoch": 5.930470347648262, + "grad_norm": 0.6974128484725952, + "learning_rate": 0.0002445770929856696, + "loss": 3.3146, + "step": 55100 + }, + { + "epoch": 5.93585189968787, + "grad_norm": 0.6761989593505859, + "learning_rate": 0.0002442538519556082, + "loss": 3.3408, + "step": 55150 + }, + { + "epoch": 5.941233451727478, + "grad_norm": 0.7247626185417175, + "learning_rate": 0.0002439306109255468, + "loss": 3.3309, + "step": 55200 + }, + { + "epoch": 5.946615003767087, + "grad_norm": 0.7041187882423401, + "learning_rate": 0.00024360736989548538, + "loss": 3.3517, + "step": 55250 + }, + { + "epoch": 5.951996555806694, + "grad_norm": 0.6591287851333618, + "learning_rate": 0.00024328412886542394, + "loss": 3.3237, + "step": 55300 + }, + { + "epoch": 5.957378107846303, + "grad_norm": 0.6712716817855835, + "learning_rate": 0.00024296088783536257, + "loss": 3.3449, + "step": 55350 + }, + { + "epoch": 5.962759659885911, + "grad_norm": 0.7153301239013672, + "learning_rate": 0.00024263764680530113, + "loss": 3.3493, + "step": 55400 + }, + { + "epoch": 5.968141211925519, + "grad_norm": 0.6891111731529236, + "learning_rate": 0.0002423144057752397, + "loss": 3.3462, + "step": 55450 + }, + { + "epoch": 5.973522763965128, + "grad_norm": 0.7062400579452515, + "learning_rate": 0.00024199116474517832, + "loss": 3.3469, + "step": 55500 + }, + { + "epoch": 5.978904316004736, + "grad_norm": 0.763090193271637, + "learning_rate": 0.0002416679237151169, + "loss": 3.3347, + "step": 55550 + }, + { + "epoch": 5.984285868044344, + "grad_norm": 0.7440812587738037, + "learning_rate": 0.00024134468268505546, + "loss": 3.3311, + "step": 55600 + }, + { + "epoch": 5.989667420083952, + "grad_norm": 0.7073535323143005, + "learning_rate": 0.00024102144165499405, + "loss": 3.3357, + "step": 55650 + }, + { + "epoch": 5.995048972123561, + "grad_norm": 0.7137259244918823, + "learning_rate": 0.00024069820062493265, + "loss": 3.3291, + "step": 55700 + }, + { + "epoch": 6.000430524163169, + "grad_norm": 0.7363245487213135, + "learning_rate": 0.00024037495959487121, + "loss": 3.3288, + "step": 55750 + }, + { + "epoch": 6.005812076202777, + "grad_norm": 0.6792331337928772, + "learning_rate": 0.0002400517185648098, + "loss": 3.2392, + "step": 55800 + }, + { + "epoch": 6.011193628242385, + "grad_norm": 0.7233385443687439, + "learning_rate": 0.00023972847753474838, + "loss": 3.2546, + "step": 55850 + }, + { + "epoch": 6.016575180281993, + "grad_norm": 0.721964418888092, + "learning_rate": 0.00023940523650468697, + "loss": 3.2373, + "step": 55900 + }, + { + "epoch": 6.021956732321602, + "grad_norm": 0.7094835638999939, + "learning_rate": 0.00023908199547462557, + "loss": 3.2454, + "step": 55950 + }, + { + "epoch": 6.0273382843612096, + "grad_norm": 0.9055956602096558, + "learning_rate": 0.00023875875444456413, + "loss": 3.2422, + "step": 56000 + }, + { + "epoch": 6.0273382843612096, + "eval_accuracy": 0.3834130066850862, + "eval_loss": 3.3882665634155273, + "eval_runtime": 182.8427, + "eval_samples_per_second": 98.505, + "eval_steps_per_second": 6.158, + "step": 56000 + }, + { + "epoch": 6.032719836400818, + "grad_norm": 0.703880250453949, + "learning_rate": 0.0002384355134145027, + "loss": 3.2557, + "step": 56050 + }, + { + "epoch": 6.038101388440427, + "grad_norm": 0.7022672295570374, + "learning_rate": 0.00023811227238444132, + "loss": 3.2543, + "step": 56100 + }, + { + "epoch": 6.043482940480034, + "grad_norm": 0.6952441334724426, + "learning_rate": 0.0002377890313543799, + "loss": 3.2414, + "step": 56150 + }, + { + "epoch": 6.048864492519643, + "grad_norm": 0.6822983622550964, + "learning_rate": 0.00023746579032431849, + "loss": 3.2709, + "step": 56200 + }, + { + "epoch": 6.0542460445592505, + "grad_norm": 0.7127947807312012, + "learning_rate": 0.00023714254929425708, + "loss": 3.2604, + "step": 56250 + }, + { + "epoch": 6.059627596598859, + "grad_norm": 0.7314467430114746, + "learning_rate": 0.00023681930826419565, + "loss": 3.2558, + "step": 56300 + }, + { + "epoch": 6.065009148638468, + "grad_norm": 0.7139166593551636, + "learning_rate": 0.00023649606723413424, + "loss": 3.2292, + "step": 56350 + }, + { + "epoch": 6.070390700678075, + "grad_norm": 0.7124274969100952, + "learning_rate": 0.0002361728262040728, + "loss": 3.2714, + "step": 56400 + }, + { + "epoch": 6.075772252717684, + "grad_norm": 0.718776285648346, + "learning_rate": 0.00023584958517401138, + "loss": 3.2716, + "step": 56450 + }, + { + "epoch": 6.081153804757292, + "grad_norm": 0.6894996166229248, + "learning_rate": 0.00023552634414395, + "loss": 3.2668, + "step": 56500 + }, + { + "epoch": 6.0865353567969, + "grad_norm": 0.729155421257019, + "learning_rate": 0.00023520310311388857, + "loss": 3.2687, + "step": 56550 + }, + { + "epoch": 6.091916908836509, + "grad_norm": 0.7677587866783142, + "learning_rate": 0.00023487986208382713, + "loss": 3.2677, + "step": 56600 + }, + { + "epoch": 6.097298460876116, + "grad_norm": 0.7513812184333801, + "learning_rate": 0.00023455662105376576, + "loss": 3.2547, + "step": 56650 + }, + { + "epoch": 6.102680012915725, + "grad_norm": 0.691321611404419, + "learning_rate": 0.00023423338002370432, + "loss": 3.2662, + "step": 56700 + }, + { + "epoch": 6.108061564955333, + "grad_norm": 0.70525062084198, + "learning_rate": 0.0002339101389936429, + "loss": 3.2635, + "step": 56750 + }, + { + "epoch": 6.113443116994941, + "grad_norm": 0.6724135279655457, + "learning_rate": 0.00023358689796358149, + "loss": 3.271, + "step": 56800 + }, + { + "epoch": 6.11882466903455, + "grad_norm": 0.7082234025001526, + "learning_rate": 0.00023326365693352008, + "loss": 3.256, + "step": 56850 + }, + { + "epoch": 6.124206221074158, + "grad_norm": 0.7638316750526428, + "learning_rate": 0.00023294041590345865, + "loss": 3.2694, + "step": 56900 + }, + { + "epoch": 6.129587773113766, + "grad_norm": 0.7289136052131653, + "learning_rate": 0.00023261717487339724, + "loss": 3.2692, + "step": 56950 + }, + { + "epoch": 6.134969325153374, + "grad_norm": 0.7192388772964478, + "learning_rate": 0.0002322939338433358, + "loss": 3.2646, + "step": 57000 + }, + { + "epoch": 6.134969325153374, + "eval_accuracy": 0.38360651747828384, + "eval_loss": 3.388136863708496, + "eval_runtime": 182.7928, + "eval_samples_per_second": 98.532, + "eval_steps_per_second": 6.16, + "step": 57000 + }, + { + "epoch": 6.140350877192983, + "grad_norm": 0.7573813796043396, + "learning_rate": 0.0002319706928132744, + "loss": 3.2741, + "step": 57050 + }, + { + "epoch": 6.1457324292325906, + "grad_norm": 0.732204020023346, + "learning_rate": 0.000231647451783213, + "loss": 3.2918, + "step": 57100 + }, + { + "epoch": 6.151113981272199, + "grad_norm": 0.7195457816123962, + "learning_rate": 0.00023132421075315157, + "loss": 3.2779, + "step": 57150 + }, + { + "epoch": 6.156495533311807, + "grad_norm": 0.7368000745773315, + "learning_rate": 0.0002310009697230902, + "loss": 3.2607, + "step": 57200 + }, + { + "epoch": 6.161877085351415, + "grad_norm": 0.711700975894928, + "learning_rate": 0.00023067772869302876, + "loss": 3.2661, + "step": 57250 + }, + { + "epoch": 6.167258637391024, + "grad_norm": 0.6981373429298401, + "learning_rate": 0.00023035448766296732, + "loss": 3.2747, + "step": 57300 + }, + { + "epoch": 6.1726401894306315, + "grad_norm": 0.707578182220459, + "learning_rate": 0.00023003124663290592, + "loss": 3.2662, + "step": 57350 + }, + { + "epoch": 6.17802174147024, + "grad_norm": 0.7146908044815063, + "learning_rate": 0.0002297080056028445, + "loss": 3.2686, + "step": 57400 + }, + { + "epoch": 6.183403293509849, + "grad_norm": 0.7029908895492554, + "learning_rate": 0.00022938476457278308, + "loss": 3.263, + "step": 57450 + }, + { + "epoch": 6.188784845549456, + "grad_norm": 0.8150429725646973, + "learning_rate": 0.00022906152354272168, + "loss": 3.2707, + "step": 57500 + }, + { + "epoch": 6.194166397589065, + "grad_norm": 0.7352506518363953, + "learning_rate": 0.00022873828251266024, + "loss": 3.2762, + "step": 57550 + }, + { + "epoch": 6.1995479496286725, + "grad_norm": 0.6864310503005981, + "learning_rate": 0.00022841504148259884, + "loss": 3.27, + "step": 57600 + }, + { + "epoch": 6.204929501668281, + "grad_norm": 0.737712562084198, + "learning_rate": 0.00022809180045253743, + "loss": 3.2762, + "step": 57650 + }, + { + "epoch": 6.21031105370789, + "grad_norm": 0.7097147107124329, + "learning_rate": 0.000227768559422476, + "loss": 3.2642, + "step": 57700 + }, + { + "epoch": 6.215692605747497, + "grad_norm": 0.6888893842697144, + "learning_rate": 0.00022744531839241457, + "loss": 3.282, + "step": 57750 + }, + { + "epoch": 6.221074157787106, + "grad_norm": 0.739660382270813, + "learning_rate": 0.0002271220773623532, + "loss": 3.2819, + "step": 57800 + }, + { + "epoch": 6.226455709826714, + "grad_norm": 0.7385263442993164, + "learning_rate": 0.00022679883633229176, + "loss": 3.277, + "step": 57850 + }, + { + "epoch": 6.231837261866322, + "grad_norm": 0.751966655254364, + "learning_rate": 0.00022647559530223032, + "loss": 3.2734, + "step": 57900 + }, + { + "epoch": 6.237218813905931, + "grad_norm": 0.7751827239990234, + "learning_rate": 0.00022615881909277016, + "loss": 3.2823, + "step": 57950 + }, + { + "epoch": 6.242600365945538, + "grad_norm": 0.7112472057342529, + "learning_rate": 0.00022583557806270875, + "loss": 3.2708, + "step": 58000 + }, + { + "epoch": 6.242600365945538, + "eval_accuracy": 0.38413565704194036, + "eval_loss": 3.384307384490967, + "eval_runtime": 183.0119, + "eval_samples_per_second": 98.414, + "eval_steps_per_second": 6.153, + "step": 58000 + }, + { + "epoch": 6.247981917985147, + "grad_norm": 0.7275688648223877, + "learning_rate": 0.00022551233703264732, + "loss": 3.2724, + "step": 58050 + }, + { + "epoch": 6.253363470024755, + "grad_norm": 0.7110071778297424, + "learning_rate": 0.0002251890960025859, + "loss": 3.2628, + "step": 58100 + }, + { + "epoch": 6.258745022064363, + "grad_norm": 0.7214199900627136, + "learning_rate": 0.0002248658549725245, + "loss": 3.2907, + "step": 58150 + }, + { + "epoch": 6.264126574103972, + "grad_norm": 0.6868590116500854, + "learning_rate": 0.00022454261394246308, + "loss": 3.2706, + "step": 58200 + }, + { + "epoch": 6.26950812614358, + "grad_norm": 0.7192474007606506, + "learning_rate": 0.00022421937291240164, + "loss": 3.2852, + "step": 58250 + }, + { + "epoch": 6.274889678183188, + "grad_norm": 0.6799070239067078, + "learning_rate": 0.00022389613188234027, + "loss": 3.2645, + "step": 58300 + }, + { + "epoch": 6.280271230222796, + "grad_norm": 0.7178609371185303, + "learning_rate": 0.00022357289085227883, + "loss": 3.2952, + "step": 58350 + }, + { + "epoch": 6.285652782262405, + "grad_norm": 0.7094721794128418, + "learning_rate": 0.0002232496498222174, + "loss": 3.2788, + "step": 58400 + }, + { + "epoch": 6.2910343343020125, + "grad_norm": 0.7339088320732117, + "learning_rate": 0.000222926408792156, + "loss": 3.2805, + "step": 58450 + }, + { + "epoch": 6.296415886341621, + "grad_norm": 0.7479764223098755, + "learning_rate": 0.0002226031677620946, + "loss": 3.2833, + "step": 58500 + }, + { + "epoch": 6.301797438381229, + "grad_norm": 0.7498093843460083, + "learning_rate": 0.00022227992673203316, + "loss": 3.2691, + "step": 58550 + }, + { + "epoch": 6.307178990420837, + "grad_norm": 0.6673323512077332, + "learning_rate": 0.00022195668570197175, + "loss": 3.2946, + "step": 58600 + }, + { + "epoch": 6.312560542460446, + "grad_norm": 0.7104136347770691, + "learning_rate": 0.00022163344467191032, + "loss": 3.2845, + "step": 58650 + }, + { + "epoch": 6.3179420945000535, + "grad_norm": 0.6776658296585083, + "learning_rate": 0.00022131020364184891, + "loss": 3.2848, + "step": 58700 + }, + { + "epoch": 6.323323646539662, + "grad_norm": 0.8017224073410034, + "learning_rate": 0.0002209869626117875, + "loss": 3.2985, + "step": 58750 + }, + { + "epoch": 6.328705198579271, + "grad_norm": 0.7754116654396057, + "learning_rate": 0.00022066372158172608, + "loss": 3.2727, + "step": 58800 + }, + { + "epoch": 6.334086750618878, + "grad_norm": 0.6976533532142639, + "learning_rate": 0.00022034048055166464, + "loss": 3.2717, + "step": 58850 + }, + { + "epoch": 6.339468302658487, + "grad_norm": 0.7677433490753174, + "learning_rate": 0.00022001723952160327, + "loss": 3.27, + "step": 58900 + }, + { + "epoch": 6.344849854698095, + "grad_norm": 0.7851221561431885, + "learning_rate": 0.00021969399849154183, + "loss": 3.2634, + "step": 58950 + }, + { + "epoch": 6.350231406737703, + "grad_norm": 0.7342744469642639, + "learning_rate": 0.00021937075746148043, + "loss": 3.2901, + "step": 59000 + }, + { + "epoch": 6.350231406737703, + "eval_accuracy": 0.3843829510146061, + "eval_loss": 3.38128662109375, + "eval_runtime": 183.0034, + "eval_samples_per_second": 98.419, + "eval_steps_per_second": 6.153, + "step": 59000 + }, + { + "epoch": 6.355612958777312, + "grad_norm": 0.7511043548583984, + "learning_rate": 0.00021904751643141902, + "loss": 3.2897, + "step": 59050 + }, + { + "epoch": 6.360994510816919, + "grad_norm": 0.7121841907501221, + "learning_rate": 0.0002187242754013576, + "loss": 3.2602, + "step": 59100 + }, + { + "epoch": 6.366376062856528, + "grad_norm": 0.7471939325332642, + "learning_rate": 0.00021840103437129619, + "loss": 3.2703, + "step": 59150 + }, + { + "epoch": 6.371757614896136, + "grad_norm": 0.7568403482437134, + "learning_rate": 0.00021807779334123475, + "loss": 3.2817, + "step": 59200 + }, + { + "epoch": 6.377139166935744, + "grad_norm": 0.7506998181343079, + "learning_rate": 0.00021775455231117335, + "loss": 3.2724, + "step": 59250 + }, + { + "epoch": 6.382520718975353, + "grad_norm": 0.7650529146194458, + "learning_rate": 0.00021743131128111194, + "loss": 3.2874, + "step": 59300 + }, + { + "epoch": 6.387902271014961, + "grad_norm": 0.712727963924408, + "learning_rate": 0.0002171080702510505, + "loss": 3.2699, + "step": 59350 + }, + { + "epoch": 6.393283823054569, + "grad_norm": 0.7634322643280029, + "learning_rate": 0.00021678482922098908, + "loss": 3.2916, + "step": 59400 + }, + { + "epoch": 6.398665375094177, + "grad_norm": 0.7388198375701904, + "learning_rate": 0.0002164615881909277, + "loss": 3.2963, + "step": 59450 + }, + { + "epoch": 6.404046927133785, + "grad_norm": 0.7384008169174194, + "learning_rate": 0.00021613834716086627, + "loss": 3.2789, + "step": 59500 + }, + { + "epoch": 6.4094284791733935, + "grad_norm": 0.7477920651435852, + "learning_rate": 0.00021581510613080483, + "loss": 3.2972, + "step": 59550 + }, + { + "epoch": 6.414810031213002, + "grad_norm": 0.7716207504272461, + "learning_rate": 0.00021549186510074346, + "loss": 3.2811, + "step": 59600 + }, + { + "epoch": 6.42019158325261, + "grad_norm": 0.7266528010368347, + "learning_rate": 0.00021516862407068202, + "loss": 3.2955, + "step": 59650 + }, + { + "epoch": 6.425573135292218, + "grad_norm": 0.7093421220779419, + "learning_rate": 0.0002148453830406206, + "loss": 3.2774, + "step": 59700 + }, + { + "epoch": 6.430954687331827, + "grad_norm": 0.6861085891723633, + "learning_rate": 0.00021452214201055919, + "loss": 3.2935, + "step": 59750 + }, + { + "epoch": 6.4363362393714345, + "grad_norm": 0.6963001489639282, + "learning_rate": 0.00021419890098049778, + "loss": 3.2823, + "step": 59800 + }, + { + "epoch": 6.441717791411043, + "grad_norm": 0.7377711534500122, + "learning_rate": 0.00021387565995043638, + "loss": 3.2938, + "step": 59850 + }, + { + "epoch": 6.447099343450651, + "grad_norm": 0.7165238857269287, + "learning_rate": 0.00021355241892037494, + "loss": 3.296, + "step": 59900 + }, + { + "epoch": 6.452480895490259, + "grad_norm": 0.7209922671318054, + "learning_rate": 0.0002132291778903135, + "loss": 3.2764, + "step": 59950 + }, + { + "epoch": 6.457862447529868, + "grad_norm": 0.6959686875343323, + "learning_rate": 0.00021290593686025213, + "loss": 3.3013, + "step": 60000 + }, + { + "epoch": 6.457862447529868, + "eval_accuracy": 0.38479148587278855, + "eval_loss": 3.3754682540893555, + "eval_runtime": 182.7058, + "eval_samples_per_second": 98.579, + "eval_steps_per_second": 6.163, + "step": 60000 + }, + { + "epoch": 6.4632439995694755, + "grad_norm": 0.8473927974700928, + "learning_rate": 0.0002125826958301907, + "loss": 3.2835, + "step": 60050 + }, + { + "epoch": 6.468625551609084, + "grad_norm": 0.7251399755477905, + "learning_rate": 0.00021225945480012927, + "loss": 3.3025, + "step": 60100 + }, + { + "epoch": 6.474007103648693, + "grad_norm": 0.767999529838562, + "learning_rate": 0.00021193621377006786, + "loss": 3.2906, + "step": 60150 + }, + { + "epoch": 6.4793886556883, + "grad_norm": 0.7236109972000122, + "learning_rate": 0.00021161297274000646, + "loss": 3.2944, + "step": 60200 + }, + { + "epoch": 6.484770207727909, + "grad_norm": 0.7092922925949097, + "learning_rate": 0.00021128973170994502, + "loss": 3.2888, + "step": 60250 + }, + { + "epoch": 6.490151759767517, + "grad_norm": 0.7969743609428406, + "learning_rate": 0.00021096649067988362, + "loss": 3.2951, + "step": 60300 + }, + { + "epoch": 6.495533311807125, + "grad_norm": 0.724816620349884, + "learning_rate": 0.00021064324964982219, + "loss": 3.2937, + "step": 60350 + }, + { + "epoch": 6.500914863846734, + "grad_norm": 0.792242169380188, + "learning_rate": 0.00021032000861976078, + "loss": 3.2896, + "step": 60400 + }, + { + "epoch": 6.506296415886341, + "grad_norm": 0.7577232718467712, + "learning_rate": 0.00020999676758969938, + "loss": 3.2881, + "step": 60450 + }, + { + "epoch": 6.51167796792595, + "grad_norm": 0.7519006729125977, + "learning_rate": 0.00020967352655963794, + "loss": 3.2775, + "step": 60500 + }, + { + "epoch": 6.517059519965558, + "grad_norm": 0.7319377660751343, + "learning_rate": 0.0002093502855295765, + "loss": 3.2871, + "step": 60550 + }, + { + "epoch": 6.522441072005166, + "grad_norm": 0.7363983988761902, + "learning_rate": 0.00020902704449951513, + "loss": 3.2811, + "step": 60600 + }, + { + "epoch": 6.5278226240447745, + "grad_norm": 0.7572370171546936, + "learning_rate": 0.0002087038034694537, + "loss": 3.2906, + "step": 60650 + }, + { + "epoch": 6.533204176084383, + "grad_norm": 0.7479851841926575, + "learning_rate": 0.00020838056243939227, + "loss": 3.2969, + "step": 60700 + }, + { + "epoch": 6.538585728123991, + "grad_norm": 0.754562497138977, + "learning_rate": 0.0002080573214093309, + "loss": 3.2942, + "step": 60750 + }, + { + "epoch": 6.543967280163599, + "grad_norm": 0.7065774202346802, + "learning_rate": 0.00020773408037926946, + "loss": 3.2863, + "step": 60800 + }, + { + "epoch": 6.549348832203208, + "grad_norm": 0.7242980599403381, + "learning_rate": 0.00020741083934920805, + "loss": 3.2771, + "step": 60850 + }, + { + "epoch": 6.5547303842428155, + "grad_norm": 0.7278899550437927, + "learning_rate": 0.00020708759831914662, + "loss": 3.2715, + "step": 60900 + }, + { + "epoch": 6.560111936282424, + "grad_norm": 0.738157331943512, + "learning_rate": 0.00020676435728908521, + "loss": 3.2806, + "step": 60950 + }, + { + "epoch": 6.565493488322032, + "grad_norm": 0.7449216842651367, + "learning_rate": 0.0002064411162590238, + "loss": 3.2954, + "step": 61000 + }, + { + "epoch": 6.565493488322032, + "eval_accuracy": 0.3854739346611513, + "eval_loss": 3.3736610412597656, + "eval_runtime": 182.6023, + "eval_samples_per_second": 98.635, + "eval_steps_per_second": 6.166, + "step": 61000 + }, + { + "epoch": 6.57087504036164, + "grad_norm": 0.7469400763511658, + "learning_rate": 0.00020611787522896238, + "loss": 3.2986, + "step": 61050 + }, + { + "epoch": 6.576256592401249, + "grad_norm": 0.73382169008255, + "learning_rate": 0.00020579463419890094, + "loss": 3.2728, + "step": 61100 + }, + { + "epoch": 6.5816381444408565, + "grad_norm": 0.7447705864906311, + "learning_rate": 0.00020547139316883957, + "loss": 3.298, + "step": 61150 + }, + { + "epoch": 6.587019696480465, + "grad_norm": 0.7730774283409119, + "learning_rate": 0.00020514815213877813, + "loss": 3.2885, + "step": 61200 + }, + { + "epoch": 6.592401248520073, + "grad_norm": 0.764479398727417, + "learning_rate": 0.0002048249111087167, + "loss": 3.2927, + "step": 61250 + }, + { + "epoch": 6.597782800559681, + "grad_norm": 0.7595162987709045, + "learning_rate": 0.00020450167007865532, + "loss": 3.2942, + "step": 61300 + }, + { + "epoch": 6.60316435259929, + "grad_norm": 0.7465935945510864, + "learning_rate": 0.0002041784290485939, + "loss": 3.2903, + "step": 61350 + }, + { + "epoch": 6.608545904638898, + "grad_norm": 0.7768982648849487, + "learning_rate": 0.00020385518801853246, + "loss": 3.2867, + "step": 61400 + }, + { + "epoch": 6.613927456678506, + "grad_norm": 0.7525243759155273, + "learning_rate": 0.00020353194698847105, + "loss": 3.2923, + "step": 61450 + }, + { + "epoch": 6.619309008718115, + "grad_norm": 0.7230768799781799, + "learning_rate": 0.00020320870595840965, + "loss": 3.287, + "step": 61500 + }, + { + "epoch": 6.624690560757722, + "grad_norm": 0.7975799441337585, + "learning_rate": 0.00020288546492834821, + "loss": 3.3008, + "step": 61550 + }, + { + "epoch": 6.630072112797331, + "grad_norm": 0.7720596194267273, + "learning_rate": 0.0002025622238982868, + "loss": 3.2733, + "step": 61600 + }, + { + "epoch": 6.635453664836939, + "grad_norm": 0.7760968804359436, + "learning_rate": 0.00020223898286822538, + "loss": 3.2823, + "step": 61650 + }, + { + "epoch": 6.640835216876547, + "grad_norm": 0.7323305010795593, + "learning_rate": 0.00020191574183816397, + "loss": 3.2987, + "step": 61700 + }, + { + "epoch": 6.6462167689161555, + "grad_norm": 0.7435678839683533, + "learning_rate": 0.00020159250080810257, + "loss": 3.2919, + "step": 61750 + }, + { + "epoch": 6.651598320955763, + "grad_norm": 0.779816210269928, + "learning_rate": 0.00020126925977804113, + "loss": 3.3058, + "step": 61800 + }, + { + "epoch": 6.656979872995372, + "grad_norm": 0.7927149534225464, + "learning_rate": 0.00020094601874797976, + "loss": 3.2841, + "step": 61850 + }, + { + "epoch": 6.66236142503498, + "grad_norm": 0.7235358953475952, + "learning_rate": 0.00020062277771791832, + "loss": 3.3052, + "step": 61900 + }, + { + "epoch": 6.667742977074588, + "grad_norm": 0.7375259399414062, + "learning_rate": 0.00020030600150845813, + "loss": 3.2933, + "step": 61950 + }, + { + "epoch": 6.6731245291141965, + "grad_norm": 0.7493704557418823, + "learning_rate": 0.0001999827604783967, + "loss": 3.2993, + "step": 62000 + }, + { + "epoch": 6.6731245291141965, + "eval_accuracy": 0.3856817876355404, + "eval_loss": 3.367237091064453, + "eval_runtime": 182.8132, + "eval_samples_per_second": 98.521, + "eval_steps_per_second": 6.159, + "step": 62000 + }, + { + "epoch": 6.678506081153805, + "grad_norm": 0.7092652916908264, + "learning_rate": 0.0001996595194483353, + "loss": 3.2846, + "step": 62050 + }, + { + "epoch": 6.683887633193413, + "grad_norm": 0.712834894657135, + "learning_rate": 0.00019933627841827389, + "loss": 3.2926, + "step": 62100 + }, + { + "epoch": 6.689269185233021, + "grad_norm": 0.7052910923957825, + "learning_rate": 0.00019901303738821245, + "loss": 3.2788, + "step": 62150 + }, + { + "epoch": 6.69465073727263, + "grad_norm": 0.727817177772522, + "learning_rate": 0.00019868979635815102, + "loss": 3.2872, + "step": 62200 + }, + { + "epoch": 6.7000322893122375, + "grad_norm": 0.7619044780731201, + "learning_rate": 0.00019836655532808964, + "loss": 3.2951, + "step": 62250 + }, + { + "epoch": 6.705413841351846, + "grad_norm": 0.742910623550415, + "learning_rate": 0.0001980433142980282, + "loss": 3.2846, + "step": 62300 + }, + { + "epoch": 6.710795393391454, + "grad_norm": 0.8113905191421509, + "learning_rate": 0.00019772007326796678, + "loss": 3.2751, + "step": 62350 + }, + { + "epoch": 6.716176945431062, + "grad_norm": 0.8383527994155884, + "learning_rate": 0.0001973968322379054, + "loss": 3.2939, + "step": 62400 + }, + { + "epoch": 6.721558497470671, + "grad_norm": 0.7623720169067383, + "learning_rate": 0.00019707359120784397, + "loss": 3.3112, + "step": 62450 + }, + { + "epoch": 6.7269400495102785, + "grad_norm": 0.7660823464393616, + "learning_rate": 0.00019675035017778253, + "loss": 3.2797, + "step": 62500 + }, + { + "epoch": 6.732321601549887, + "grad_norm": 0.7665824890136719, + "learning_rate": 0.00019642710914772113, + "loss": 3.2917, + "step": 62550 + }, + { + "epoch": 6.737703153589496, + "grad_norm": 0.754959762096405, + "learning_rate": 0.00019610386811765972, + "loss": 3.3053, + "step": 62600 + }, + { + "epoch": 6.743084705629103, + "grad_norm": 0.7177004218101501, + "learning_rate": 0.00019578062708759832, + "loss": 3.285, + "step": 62650 + }, + { + "epoch": 6.748466257668712, + "grad_norm": 0.741148054599762, + "learning_rate": 0.00019545738605753689, + "loss": 3.284, + "step": 62700 + }, + { + "epoch": 6.75384780970832, + "grad_norm": 0.7509447932243347, + "learning_rate": 0.00019513414502747545, + "loss": 3.2928, + "step": 62750 + }, + { + "epoch": 6.759229361747928, + "grad_norm": 0.7940971255302429, + "learning_rate": 0.00019481090399741408, + "loss": 3.2915, + "step": 62800 + }, + { + "epoch": 6.7646109137875365, + "grad_norm": 0.7626696228981018, + "learning_rate": 0.00019448766296735264, + "loss": 3.2999, + "step": 62850 + }, + { + "epoch": 6.769992465827144, + "grad_norm": 0.7458931803703308, + "learning_rate": 0.0001941644219372912, + "loss": 3.2895, + "step": 62900 + }, + { + "epoch": 6.775374017866753, + "grad_norm": 0.7777038812637329, + "learning_rate": 0.00019384118090722983, + "loss": 3.2885, + "step": 62950 + }, + { + "epoch": 6.780755569906361, + "grad_norm": 0.7893502116203308, + "learning_rate": 0.0001935179398771684, + "loss": 3.2894, + "step": 63000 + }, + { + "epoch": 6.780755569906361, + "eval_accuracy": 0.3861680093085102, + "eval_loss": 3.364060163497925, + "eval_runtime": 182.8147, + "eval_samples_per_second": 98.521, + "eval_steps_per_second": 6.159, + "step": 63000 + }, + { + "epoch": 6.786137121945969, + "grad_norm": 0.732779324054718, + "learning_rate": 0.00019319469884710697, + "loss": 3.289, + "step": 63050 + }, + { + "epoch": 6.7915186739855775, + "grad_norm": 0.7249733209609985, + "learning_rate": 0.00019287145781704556, + "loss": 3.2795, + "step": 63100 + }, + { + "epoch": 6.796900226025185, + "grad_norm": 0.7652485966682434, + "learning_rate": 0.00019254821678698416, + "loss": 3.2788, + "step": 63150 + }, + { + "epoch": 6.802281778064794, + "grad_norm": 0.7800796031951904, + "learning_rate": 0.00019222497575692272, + "loss": 3.282, + "step": 63200 + }, + { + "epoch": 6.807663330104402, + "grad_norm": 0.7617146968841553, + "learning_rate": 0.00019190173472686132, + "loss": 3.2922, + "step": 63250 + }, + { + "epoch": 6.813044882144011, + "grad_norm": 0.753646969795227, + "learning_rate": 0.0001915784936967999, + "loss": 3.2944, + "step": 63300 + }, + { + "epoch": 6.8184264341836185, + "grad_norm": 0.7656055688858032, + "learning_rate": 0.00019125525266673845, + "loss": 3.2996, + "step": 63350 + }, + { + "epoch": 6.823807986223227, + "grad_norm": 0.7957121133804321, + "learning_rate": 0.00019093201163667708, + "loss": 3.2804, + "step": 63400 + }, + { + "epoch": 6.829189538262835, + "grad_norm": 0.7702323794364929, + "learning_rate": 0.00019060877060661564, + "loss": 3.2949, + "step": 63450 + }, + { + "epoch": 6.834571090302443, + "grad_norm": 0.8194876313209534, + "learning_rate": 0.0001902855295765542, + "loss": 3.2796, + "step": 63500 + }, + { + "epoch": 6.839952642342052, + "grad_norm": 0.75143963098526, + "learning_rate": 0.00018996228854649283, + "loss": 3.3038, + "step": 63550 + }, + { + "epoch": 6.8453341943816595, + "grad_norm": 0.7525258660316467, + "learning_rate": 0.0001896390475164314, + "loss": 3.2956, + "step": 63600 + }, + { + "epoch": 6.850715746421268, + "grad_norm": 0.7692186236381531, + "learning_rate": 0.00018931580648637, + "loss": 3.2958, + "step": 63650 + }, + { + "epoch": 6.856097298460876, + "grad_norm": 0.7735649943351746, + "learning_rate": 0.00018899256545630856, + "loss": 3.2939, + "step": 63700 + }, + { + "epoch": 6.861478850500484, + "grad_norm": 0.7386011481285095, + "learning_rate": 0.00018866932442624716, + "loss": 3.3071, + "step": 63750 + }, + { + "epoch": 6.866860402540093, + "grad_norm": 0.7658208608627319, + "learning_rate": 0.00018834608339618575, + "loss": 3.2894, + "step": 63800 + }, + { + "epoch": 6.8722419545797, + "grad_norm": 0.7646046280860901, + "learning_rate": 0.00018802284236612432, + "loss": 3.3067, + "step": 63850 + }, + { + "epoch": 6.877623506619309, + "grad_norm": 0.732725203037262, + "learning_rate": 0.0001876996013360629, + "loss": 3.2885, + "step": 63900 + }, + { + "epoch": 6.8830050586589175, + "grad_norm": 0.6860552430152893, + "learning_rate": 0.00018738282512660272, + "loss": 3.3071, + "step": 63950 + }, + { + "epoch": 6.888386610698525, + "grad_norm": 0.7384079694747925, + "learning_rate": 0.0001870595840965413, + "loss": 3.294, + "step": 64000 + }, + { + "epoch": 6.888386610698525, + "eval_accuracy": 0.38654601270521677, + "eval_loss": 3.361266851425171, + "eval_runtime": 182.8473, + "eval_samples_per_second": 98.503, + "eval_steps_per_second": 6.158, + "step": 64000 + }, + { + "epoch": 6.893768162738134, + "grad_norm": 0.7515199184417725, + "learning_rate": 0.0001867363430664799, + "loss": 3.2797, + "step": 64050 + }, + { + "epoch": 6.899149714777742, + "grad_norm": 0.7795135378837585, + "learning_rate": 0.00018641310203641848, + "loss": 3.2799, + "step": 64100 + }, + { + "epoch": 6.90453126681735, + "grad_norm": 0.7016361355781555, + "learning_rate": 0.00018608986100635705, + "loss": 3.2991, + "step": 64150 + }, + { + "epoch": 6.9099128188569585, + "grad_norm": 0.7909216284751892, + "learning_rate": 0.00018576661997629564, + "loss": 3.2868, + "step": 64200 + }, + { + "epoch": 6.915294370896566, + "grad_norm": 0.7275353074073792, + "learning_rate": 0.00018544337894623423, + "loss": 3.2848, + "step": 64250 + }, + { + "epoch": 6.920675922936175, + "grad_norm": 0.7585761547088623, + "learning_rate": 0.0001851201379161728, + "loss": 3.2827, + "step": 64300 + }, + { + "epoch": 6.926057474975783, + "grad_norm": 0.7263973355293274, + "learning_rate": 0.0001847968968861114, + "loss": 3.296, + "step": 64350 + }, + { + "epoch": 6.931439027015391, + "grad_norm": 0.751746654510498, + "learning_rate": 0.00018447365585604996, + "loss": 3.2963, + "step": 64400 + }, + { + "epoch": 6.9368205790549995, + "grad_norm": 0.748279333114624, + "learning_rate": 0.00018415041482598859, + "loss": 3.2842, + "step": 64450 + }, + { + "epoch": 6.942202131094608, + "grad_norm": 0.7945259213447571, + "learning_rate": 0.00018382717379592715, + "loss": 3.3071, + "step": 64500 + }, + { + "epoch": 6.947583683134216, + "grad_norm": 0.74016934633255, + "learning_rate": 0.00018350393276586572, + "loss": 3.2925, + "step": 64550 + }, + { + "epoch": 6.952965235173824, + "grad_norm": 0.7379736304283142, + "learning_rate": 0.00018318069173580434, + "loss": 3.2959, + "step": 64600 + }, + { + "epoch": 6.958346787213433, + "grad_norm": 0.7810225486755371, + "learning_rate": 0.0001828574507057429, + "loss": 3.3001, + "step": 64650 + }, + { + "epoch": 6.9637283392530405, + "grad_norm": 0.7810648083686829, + "learning_rate": 0.00018253420967568148, + "loss": 3.2961, + "step": 64700 + }, + { + "epoch": 6.969109891292649, + "grad_norm": 0.740482747554779, + "learning_rate": 0.00018221096864562007, + "loss": 3.3029, + "step": 64750 + }, + { + "epoch": 6.974491443332257, + "grad_norm": 0.7730101943016052, + "learning_rate": 0.00018188772761555867, + "loss": 3.2891, + "step": 64800 + }, + { + "epoch": 6.979872995371865, + "grad_norm": 0.8048725128173828, + "learning_rate": 0.00018156448658549723, + "loss": 3.2735, + "step": 64850 + }, + { + "epoch": 6.985254547411474, + "grad_norm": 0.76474529504776, + "learning_rate": 0.00018124124555543583, + "loss": 3.2922, + "step": 64900 + }, + { + "epoch": 6.990636099451081, + "grad_norm": 0.7955503463745117, + "learning_rate": 0.0001809180045253744, + "loss": 3.2788, + "step": 64950 + }, + { + "epoch": 6.99601765149069, + "grad_norm": 0.7674031257629395, + "learning_rate": 0.00018059476349531296, + "loss": 3.2997, + "step": 65000 + }, + { + "epoch": 6.99601765149069, + "eval_accuracy": 0.3869485716545694, + "eval_loss": 3.357651710510254, + "eval_runtime": 182.7956, + "eval_samples_per_second": 98.531, + "eval_steps_per_second": 6.16, + "step": 65000 + }, + { + "epoch": 7.0013992035302985, + "grad_norm": 0.7507967948913574, + "learning_rate": 0.00018027152246525159, + "loss": 3.2669, + "step": 65050 + }, + { + "epoch": 7.006780755569906, + "grad_norm": 0.7749801278114319, + "learning_rate": 0.00017994828143519015, + "loss": 3.2086, + "step": 65100 + }, + { + "epoch": 7.012162307609515, + "grad_norm": 0.751576840877533, + "learning_rate": 0.00017962504040512872, + "loss": 3.1952, + "step": 65150 + }, + { + "epoch": 7.017543859649122, + "grad_norm": 0.719531774520874, + "learning_rate": 0.00017930179937506734, + "loss": 3.2096, + "step": 65200 + }, + { + "epoch": 7.022925411688731, + "grad_norm": 0.7776346802711487, + "learning_rate": 0.0001789785583450059, + "loss": 3.1961, + "step": 65250 + }, + { + "epoch": 7.0283069637283395, + "grad_norm": 0.7524562478065491, + "learning_rate": 0.00017865531731494448, + "loss": 3.212, + "step": 65300 + }, + { + "epoch": 7.033688515767947, + "grad_norm": 0.7441275119781494, + "learning_rate": 0.00017833207628488307, + "loss": 3.2052, + "step": 65350 + }, + { + "epoch": 7.039070067807556, + "grad_norm": 0.7578763365745544, + "learning_rate": 0.00017800883525482167, + "loss": 3.2185, + "step": 65400 + }, + { + "epoch": 7.044451619847164, + "grad_norm": 0.7912497520446777, + "learning_rate": 0.00017768559422476026, + "loss": 3.2265, + "step": 65450 + }, + { + "epoch": 7.049833171886772, + "grad_norm": 0.7631890177726746, + "learning_rate": 0.00017736235319469883, + "loss": 3.2046, + "step": 65500 + }, + { + "epoch": 7.0552147239263805, + "grad_norm": 0.7851689457893372, + "learning_rate": 0.0001770391121646374, + "loss": 3.2074, + "step": 65550 + }, + { + "epoch": 7.060596275965988, + "grad_norm": 0.734584629535675, + "learning_rate": 0.00017671587113457602, + "loss": 3.2278, + "step": 65600 + }, + { + "epoch": 7.065977828005597, + "grad_norm": 0.7264954447746277, + "learning_rate": 0.0001763926301045146, + "loss": 3.2161, + "step": 65650 + }, + { + "epoch": 7.071359380045205, + "grad_norm": 0.8186044692993164, + "learning_rate": 0.00017606938907445315, + "loss": 3.1974, + "step": 65700 + }, + { + "epoch": 7.076740932084813, + "grad_norm": 0.7647484540939331, + "learning_rate": 0.00017574614804439178, + "loss": 3.1959, + "step": 65750 + }, + { + "epoch": 7.0821224841244215, + "grad_norm": 0.7596768736839294, + "learning_rate": 0.00017542290701433034, + "loss": 3.216, + "step": 65800 + }, + { + "epoch": 7.08750403616403, + "grad_norm": 0.7448734641075134, + "learning_rate": 0.0001750996659842689, + "loss": 3.2196, + "step": 65850 + }, + { + "epoch": 7.092885588203638, + "grad_norm": 0.7379791736602783, + "learning_rate": 0.0001747764249542075, + "loss": 3.2247, + "step": 65900 + }, + { + "epoch": 7.098267140243246, + "grad_norm": 0.7869905829429626, + "learning_rate": 0.0001744531839241461, + "loss": 3.2226, + "step": 65950 + }, + { + "epoch": 7.103648692282855, + "grad_norm": 0.7377243638038635, + "learning_rate": 0.0001741364077146859, + "loss": 3.238, + "step": 66000 + }, + { + "epoch": 7.103648692282855, + "eval_accuracy": 0.3870309305435328, + "eval_loss": 3.3637986183166504, + "eval_runtime": 182.81, + "eval_samples_per_second": 98.523, + "eval_steps_per_second": 6.159, + "step": 66000 + }, + { + "epoch": 7.109030244322462, + "grad_norm": 0.7684816122055054, + "learning_rate": 0.00017381316668462447, + "loss": 3.2179, + "step": 66050 + }, + { + "epoch": 7.114411796362071, + "grad_norm": 0.7641874551773071, + "learning_rate": 0.00017348992565456304, + "loss": 3.2054, + "step": 66100 + }, + { + "epoch": 7.119793348401679, + "grad_norm": 0.7933626174926758, + "learning_rate": 0.00017316668462450166, + "loss": 3.2072, + "step": 66150 + }, + { + "epoch": 7.125174900441287, + "grad_norm": 0.7817862629890442, + "learning_rate": 0.00017284344359444023, + "loss": 3.2125, + "step": 66200 + }, + { + "epoch": 7.130556452480896, + "grad_norm": 0.8065374493598938, + "learning_rate": 0.00017252020256437885, + "loss": 3.236, + "step": 66250 + }, + { + "epoch": 7.135938004520503, + "grad_norm": 0.7458828091621399, + "learning_rate": 0.00017219696153431742, + "loss": 3.219, + "step": 66300 + }, + { + "epoch": 7.141319556560112, + "grad_norm": 0.7864915132522583, + "learning_rate": 0.000171873720504256, + "loss": 3.2115, + "step": 66350 + }, + { + "epoch": 7.1467011085997205, + "grad_norm": 0.7758970260620117, + "learning_rate": 0.0001715569442947958, + "loss": 3.2207, + "step": 66400 + }, + { + "epoch": 7.152082660639328, + "grad_norm": 0.7451347708702087, + "learning_rate": 0.0001712337032647344, + "loss": 3.2267, + "step": 66450 + }, + { + "epoch": 7.157464212678937, + "grad_norm": 0.7944716215133667, + "learning_rate": 0.00017091046223467298, + "loss": 3.2292, + "step": 66500 + }, + { + "epoch": 7.162845764718545, + "grad_norm": 0.7795634269714355, + "learning_rate": 0.00017058722120461155, + "loss": 3.2236, + "step": 66550 + }, + { + "epoch": 7.168227316758153, + "grad_norm": 0.8057452440261841, + "learning_rate": 0.00017026398017455012, + "loss": 3.2259, + "step": 66600 + }, + { + "epoch": 7.1736088687977615, + "grad_norm": 0.7921217679977417, + "learning_rate": 0.00016994073914448874, + "loss": 3.2398, + "step": 66650 + }, + { + "epoch": 7.178990420837369, + "grad_norm": 0.7860040664672852, + "learning_rate": 0.0001696174981144273, + "loss": 3.2137, + "step": 66700 + }, + { + "epoch": 7.184371972876978, + "grad_norm": 0.7447194457054138, + "learning_rate": 0.00016929425708436588, + "loss": 3.2191, + "step": 66750 + }, + { + "epoch": 7.189753524916586, + "grad_norm": 0.757375180721283, + "learning_rate": 0.0001689710160543045, + "loss": 3.2208, + "step": 66800 + }, + { + "epoch": 7.195135076956194, + "grad_norm": 0.7810249924659729, + "learning_rate": 0.00016864777502424307, + "loss": 3.2391, + "step": 66850 + }, + { + "epoch": 7.2005166289958025, + "grad_norm": 0.8149625062942505, + "learning_rate": 0.00016832453399418163, + "loss": 3.2079, + "step": 66900 + }, + { + "epoch": 7.205898181035411, + "grad_norm": 0.7833424210548401, + "learning_rate": 0.00016800129296412023, + "loss": 3.2331, + "step": 66950 + }, + { + "epoch": 7.211279733075019, + "grad_norm": 0.8489099740982056, + "learning_rate": 0.00016767805193405882, + "loss": 3.2202, + "step": 67000 + }, + { + "epoch": 7.211279733075019, + "eval_accuracy": 0.38730658291991815, + "eval_loss": 3.3612101078033447, + "eval_runtime": 182.5912, + "eval_samples_per_second": 98.641, + "eval_steps_per_second": 6.167, + "step": 67000 + }, + { + "epoch": 7.216661285114627, + "grad_norm": 0.7586920857429504, + "learning_rate": 0.0001673548109039974, + "loss": 3.2246, + "step": 67050 + }, + { + "epoch": 7.222042837154235, + "grad_norm": 0.8181234002113342, + "learning_rate": 0.00016703156987393598, + "loss": 3.2262, + "step": 67100 + }, + { + "epoch": 7.2274243891938434, + "grad_norm": 0.7846294641494751, + "learning_rate": 0.00016670832884387455, + "loss": 3.2376, + "step": 67150 + }, + { + "epoch": 7.232805941233452, + "grad_norm": 0.7629982233047485, + "learning_rate": 0.00016638508781381317, + "loss": 3.2267, + "step": 67200 + }, + { + "epoch": 7.23818749327306, + "grad_norm": 0.8482091426849365, + "learning_rate": 0.00016606184678375174, + "loss": 3.237, + "step": 67250 + }, + { + "epoch": 7.243569045312668, + "grad_norm": 0.8140213489532471, + "learning_rate": 0.0001657386057536903, + "loss": 3.2355, + "step": 67300 + }, + { + "epoch": 7.248950597352277, + "grad_norm": 0.7859810590744019, + "learning_rate": 0.00016541536472362893, + "loss": 3.2088, + "step": 67350 + }, + { + "epoch": 7.254332149391884, + "grad_norm": 0.7926174998283386, + "learning_rate": 0.0001650921236935675, + "loss": 3.2284, + "step": 67400 + }, + { + "epoch": 7.259713701431493, + "grad_norm": 0.8039096593856812, + "learning_rate": 0.00016476888266350607, + "loss": 3.2463, + "step": 67450 + }, + { + "epoch": 7.265095253471101, + "grad_norm": 0.7596452832221985, + "learning_rate": 0.00016444564163344466, + "loss": 3.2319, + "step": 67500 + }, + { + "epoch": 7.270476805510709, + "grad_norm": 0.7452762126922607, + "learning_rate": 0.00016412240060338326, + "loss": 3.2248, + "step": 67550 + }, + { + "epoch": 7.275858357550318, + "grad_norm": 0.7983496189117432, + "learning_rate": 0.00016379915957332182, + "loss": 3.2319, + "step": 67600 + }, + { + "epoch": 7.281239909589925, + "grad_norm": 0.7758620977401733, + "learning_rate": 0.00016347591854326042, + "loss": 3.2352, + "step": 67650 + }, + { + "epoch": 7.286621461629534, + "grad_norm": 0.8212805986404419, + "learning_rate": 0.00016315267751319898, + "loss": 3.2403, + "step": 67700 + }, + { + "epoch": 7.2920030136691425, + "grad_norm": 0.8642499446868896, + "learning_rate": 0.00016282943648313755, + "loss": 3.2181, + "step": 67750 + }, + { + "epoch": 7.29738456570875, + "grad_norm": 0.7983525991439819, + "learning_rate": 0.00016250619545307617, + "loss": 3.2292, + "step": 67800 + }, + { + "epoch": 7.302766117748359, + "grad_norm": 0.7927777171134949, + "learning_rate": 0.00016218295442301474, + "loss": 3.2408, + "step": 67850 + }, + { + "epoch": 7.308147669787967, + "grad_norm": 0.7210763692855835, + "learning_rate": 0.0001618597133929533, + "loss": 3.2259, + "step": 67900 + }, + { + "epoch": 7.313529221827575, + "grad_norm": 0.8408624529838562, + "learning_rate": 0.00016153647236289193, + "loss": 3.2333, + "step": 67950 + }, + { + "epoch": 7.3189107738671835, + "grad_norm": 0.7780632376670837, + "learning_rate": 0.0001612132313328305, + "loss": 3.2489, + "step": 68000 + }, + { + "epoch": 7.3189107738671835, + "eval_accuracy": 0.38737057947084086, + "eval_loss": 3.356868028640747, + "eval_runtime": 182.8416, + "eval_samples_per_second": 98.506, + "eval_steps_per_second": 6.158, + "step": 68000 + }, + { + "epoch": 7.324292325906791, + "grad_norm": 0.7512384653091431, + "learning_rate": 0.0001608899903027691, + "loss": 3.2333, + "step": 68050 + }, + { + "epoch": 7.3296738779464, + "grad_norm": 0.8020547032356262, + "learning_rate": 0.00016056674927270766, + "loss": 3.2387, + "step": 68100 + }, + { + "epoch": 7.335055429986008, + "grad_norm": 0.7911236882209778, + "learning_rate": 0.00016024350824264626, + "loss": 3.238, + "step": 68150 + }, + { + "epoch": 7.340436982025616, + "grad_norm": 0.8028789758682251, + "learning_rate": 0.00015992026721258485, + "loss": 3.2412, + "step": 68200 + }, + { + "epoch": 7.3458185340652244, + "grad_norm": 0.8485779166221619, + "learning_rate": 0.00015959702618252342, + "loss": 3.2367, + "step": 68250 + }, + { + "epoch": 7.351200086104833, + "grad_norm": 0.7784638404846191, + "learning_rate": 0.00015927378515246199, + "loss": 3.2297, + "step": 68300 + }, + { + "epoch": 7.356581638144441, + "grad_norm": 0.7919899821281433, + "learning_rate": 0.0001589505441224006, + "loss": 3.2273, + "step": 68350 + }, + { + "epoch": 7.361963190184049, + "grad_norm": 0.7827087044715881, + "learning_rate": 0.00015862730309233917, + "loss": 3.2288, + "step": 68400 + }, + { + "epoch": 7.367344742223658, + "grad_norm": 0.7712171077728271, + "learning_rate": 0.00015830406206227774, + "loss": 3.2382, + "step": 68450 + }, + { + "epoch": 7.372726294263265, + "grad_norm": 0.7548463940620422, + "learning_rate": 0.00015798082103221636, + "loss": 3.2307, + "step": 68500 + }, + { + "epoch": 7.378107846302874, + "grad_norm": 0.7684069275856018, + "learning_rate": 0.00015765758000215493, + "loss": 3.2362, + "step": 68550 + }, + { + "epoch": 7.383489398342482, + "grad_norm": 0.7532280087471008, + "learning_rate": 0.0001573343389720935, + "loss": 3.2386, + "step": 68600 + }, + { + "epoch": 7.38887095038209, + "grad_norm": 0.7906218767166138, + "learning_rate": 0.0001570110979420321, + "loss": 3.2553, + "step": 68650 + }, + { + "epoch": 7.394252502421699, + "grad_norm": 0.7797097563743591, + "learning_rate": 0.0001566878569119707, + "loss": 3.2411, + "step": 68700 + }, + { + "epoch": 7.399634054461306, + "grad_norm": 0.8284671902656555, + "learning_rate": 0.00015636461588190926, + "loss": 3.2456, + "step": 68750 + }, + { + "epoch": 7.405015606500915, + "grad_norm": 0.7855088114738464, + "learning_rate": 0.00015604137485184785, + "loss": 3.2456, + "step": 68800 + }, + { + "epoch": 7.4103971585405235, + "grad_norm": 0.8041961193084717, + "learning_rate": 0.00015571813382178642, + "loss": 3.2236, + "step": 68850 + }, + { + "epoch": 7.415778710580131, + "grad_norm": 0.8667538166046143, + "learning_rate": 0.000155394892791725, + "loss": 3.2492, + "step": 68900 + }, + { + "epoch": 7.42116026261974, + "grad_norm": 0.7783315777778625, + "learning_rate": 0.0001550716517616636, + "loss": 3.2429, + "step": 68950 + }, + { + "epoch": 7.426541814659347, + "grad_norm": 0.7982115149497986, + "learning_rate": 0.00015474841073160217, + "loss": 3.2278, + "step": 69000 + }, + { + "epoch": 7.426541814659347, + "eval_accuracy": 0.3878293119631936, + "eval_loss": 3.3534276485443115, + "eval_runtime": 182.9528, + "eval_samples_per_second": 98.446, + "eval_steps_per_second": 6.155, + "step": 69000 + }, + { + "epoch": 7.431923366698956, + "grad_norm": 0.8143750429153442, + "learning_rate": 0.0001544251697015408, + "loss": 3.2492, + "step": 69050 + }, + { + "epoch": 7.4373049187385645, + "grad_norm": 0.8255060315132141, + "learning_rate": 0.00015410192867147936, + "loss": 3.2362, + "step": 69100 + }, + { + "epoch": 7.442686470778172, + "grad_norm": 0.8084985613822937, + "learning_rate": 0.00015377868764141793, + "loss": 3.2162, + "step": 69150 + }, + { + "epoch": 7.448068022817781, + "grad_norm": 0.8223511576652527, + "learning_rate": 0.00015345544661135653, + "loss": 3.2509, + "step": 69200 + }, + { + "epoch": 7.453449574857389, + "grad_norm": 0.7960752844810486, + "learning_rate": 0.00015313220558129512, + "loss": 3.2334, + "step": 69250 + }, + { + "epoch": 7.458831126896997, + "grad_norm": 0.8062809705734253, + "learning_rate": 0.0001528089645512337, + "loss": 3.2391, + "step": 69300 + }, + { + "epoch": 7.4642126789366054, + "grad_norm": 0.8449177742004395, + "learning_rate": 0.00015248572352117228, + "loss": 3.2271, + "step": 69350 + }, + { + "epoch": 7.469594230976213, + "grad_norm": 0.8337926864624023, + "learning_rate": 0.00015216248249111085, + "loss": 3.2184, + "step": 69400 + }, + { + "epoch": 7.474975783015822, + "grad_norm": 0.7521905899047852, + "learning_rate": 0.00015183924146104945, + "loss": 3.2245, + "step": 69450 + }, + { + "epoch": 7.48035733505543, + "grad_norm": 0.8167929649353027, + "learning_rate": 0.00015151600043098804, + "loss": 3.2562, + "step": 69500 + }, + { + "epoch": 7.485738887095038, + "grad_norm": 0.7685818076133728, + "learning_rate": 0.0001511927594009266, + "loss": 3.2436, + "step": 69550 + }, + { + "epoch": 7.491120439134646, + "grad_norm": 0.7431442141532898, + "learning_rate": 0.00015086951837086518, + "loss": 3.2319, + "step": 69600 + }, + { + "epoch": 7.496501991174255, + "grad_norm": 0.783828854560852, + "learning_rate": 0.0001505462773408038, + "loss": 3.2341, + "step": 69650 + }, + { + "epoch": 7.501883543213863, + "grad_norm": 0.7887234687805176, + "learning_rate": 0.00015022303631074236, + "loss": 3.2257, + "step": 69700 + }, + { + "epoch": 7.507265095253471, + "grad_norm": 0.8269175887107849, + "learning_rate": 0.00014989979528068096, + "loss": 3.2514, + "step": 69750 + }, + { + "epoch": 7.51264664729308, + "grad_norm": 0.8568757772445679, + "learning_rate": 0.00014957655425061953, + "loss": 3.2363, + "step": 69800 + }, + { + "epoch": 7.518028199332687, + "grad_norm": 0.7879490852355957, + "learning_rate": 0.00014925331322055812, + "loss": 3.2664, + "step": 69850 + }, + { + "epoch": 7.523409751372296, + "grad_norm": 0.8363874554634094, + "learning_rate": 0.0001489300721904967, + "loss": 3.2386, + "step": 69900 + }, + { + "epoch": 7.528791303411904, + "grad_norm": 0.7977311015129089, + "learning_rate": 0.00014860683116043528, + "loss": 3.2544, + "step": 69950 + }, + { + "epoch": 7.534172855451512, + "grad_norm": 0.7749576568603516, + "learning_rate": 0.00014828359013037385, + "loss": 3.2375, + "step": 70000 + }, + { + "epoch": 7.534172855451512, + "eval_accuracy": 0.38801554301290764, + "eval_loss": 3.349801540374756, + "eval_runtime": 182.841, + "eval_samples_per_second": 98.506, + "eval_steps_per_second": 6.158, + "step": 70000 + }, + { + "epoch": 7.539554407491121, + "grad_norm": 0.7948490381240845, + "learning_rate": 0.00014796034910031245, + "loss": 3.2395, + "step": 70050 + }, + { + "epoch": 7.544935959530728, + "grad_norm": 0.7794623374938965, + "learning_rate": 0.00014763710807025104, + "loss": 3.2374, + "step": 70100 + }, + { + "epoch": 7.550317511570337, + "grad_norm": 0.7769473791122437, + "learning_rate": 0.0001473138670401896, + "loss": 3.2361, + "step": 70150 + }, + { + "epoch": 7.5556990636099455, + "grad_norm": 0.8074938654899597, + "learning_rate": 0.0001469906260101282, + "loss": 3.2296, + "step": 70200 + }, + { + "epoch": 7.561080615649553, + "grad_norm": 0.7983949184417725, + "learning_rate": 0.0001466673849800668, + "loss": 3.2314, + "step": 70250 + }, + { + "epoch": 7.566462167689162, + "grad_norm": 0.8049135208129883, + "learning_rate": 0.0001463441439500054, + "loss": 3.2324, + "step": 70300 + }, + { + "epoch": 7.57184371972877, + "grad_norm": 0.8315684795379639, + "learning_rate": 0.00014602090291994396, + "loss": 3.256, + "step": 70350 + }, + { + "epoch": 7.577225271768378, + "grad_norm": 0.8284087181091309, + "learning_rate": 0.00014569766188988255, + "loss": 3.2303, + "step": 70400 + }, + { + "epoch": 7.5826068238079865, + "grad_norm": 0.8085214495658875, + "learning_rate": 0.00014538088568042236, + "loss": 3.2405, + "step": 70450 + }, + { + "epoch": 7.587988375847594, + "grad_norm": 0.8195266723632812, + "learning_rate": 0.00014505764465036093, + "loss": 3.2479, + "step": 70500 + }, + { + "epoch": 7.593369927887203, + "grad_norm": 0.7887190580368042, + "learning_rate": 0.00014473440362029952, + "loss": 3.229, + "step": 70550 + }, + { + "epoch": 7.598751479926811, + "grad_norm": 0.7637575268745422, + "learning_rate": 0.00014441116259023812, + "loss": 3.2514, + "step": 70600 + }, + { + "epoch": 7.604133031966419, + "grad_norm": 0.8185248970985413, + "learning_rate": 0.00014408792156017669, + "loss": 3.2379, + "step": 70650 + }, + { + "epoch": 7.609514584006027, + "grad_norm": 0.7940756678581238, + "learning_rate": 0.00014376468053011528, + "loss": 3.2618, + "step": 70700 + }, + { + "epoch": 7.614896136045635, + "grad_norm": 0.7927145957946777, + "learning_rate": 0.00014344143950005387, + "loss": 3.2455, + "step": 70750 + }, + { + "epoch": 7.620277688085244, + "grad_norm": 0.7812114953994751, + "learning_rate": 0.00014311819846999244, + "loss": 3.2428, + "step": 70800 + }, + { + "epoch": 7.625659240124852, + "grad_norm": 0.7778552174568176, + "learning_rate": 0.00014279495743993104, + "loss": 3.2367, + "step": 70850 + }, + { + "epoch": 7.63104079216446, + "grad_norm": 0.8149660229682922, + "learning_rate": 0.00014247171640986963, + "loss": 3.244, + "step": 70900 + }, + { + "epoch": 7.636422344204068, + "grad_norm": 0.8159542679786682, + "learning_rate": 0.0001421484753798082, + "loss": 3.234, + "step": 70950 + }, + { + "epoch": 7.641803896243677, + "grad_norm": 0.7451802492141724, + "learning_rate": 0.0001418252343497468, + "loss": 3.247, + "step": 71000 + }, + { + "epoch": 7.641803896243677, + "eval_accuracy": 0.38847731778611916, + "eval_loss": 3.345766305923462, + "eval_runtime": 182.6769, + "eval_samples_per_second": 98.595, + "eval_steps_per_second": 6.164, + "step": 71000 + }, + { + "epoch": 7.647185448283285, + "grad_norm": 0.8018648624420166, + "learning_rate": 0.00014150199331968536, + "loss": 3.2532, + "step": 71050 + }, + { + "epoch": 7.652567000322893, + "grad_norm": 0.8257297277450562, + "learning_rate": 0.00014118521711022517, + "loss": 3.2431, + "step": 71100 + }, + { + "epoch": 7.657948552362502, + "grad_norm": 0.8156144618988037, + "learning_rate": 0.00014086197608016376, + "loss": 3.2349, + "step": 71150 + }, + { + "epoch": 7.663330104402109, + "grad_norm": 0.792155385017395, + "learning_rate": 0.00014053873505010236, + "loss": 3.2245, + "step": 71200 + }, + { + "epoch": 7.668711656441718, + "grad_norm": 0.8484143018722534, + "learning_rate": 0.00014021549402004092, + "loss": 3.2349, + "step": 71250 + }, + { + "epoch": 7.674093208481326, + "grad_norm": 0.8315868377685547, + "learning_rate": 0.00013989225298997952, + "loss": 3.2703, + "step": 71300 + }, + { + "epoch": 7.679474760520934, + "grad_norm": 0.7925692200660706, + "learning_rate": 0.00013956901195991811, + "loss": 3.2345, + "step": 71350 + }, + { + "epoch": 7.684856312560543, + "grad_norm": 0.8163354396820068, + "learning_rate": 0.00013924577092985668, + "loss": 3.2444, + "step": 71400 + }, + { + "epoch": 7.69023786460015, + "grad_norm": 0.82204669713974, + "learning_rate": 0.00013892252989979528, + "loss": 3.2424, + "step": 71450 + }, + { + "epoch": 7.695619416639759, + "grad_norm": 0.839684247970581, + "learning_rate": 0.00013859928886973384, + "loss": 3.2359, + "step": 71500 + }, + { + "epoch": 7.7010009686793675, + "grad_norm": 0.8018124103546143, + "learning_rate": 0.00013827604783967244, + "loss": 3.2321, + "step": 71550 + }, + { + "epoch": 7.706382520718975, + "grad_norm": 0.7850207090377808, + "learning_rate": 0.000137952806809611, + "loss": 3.2435, + "step": 71600 + }, + { + "epoch": 7.711764072758584, + "grad_norm": 0.7935765981674194, + "learning_rate": 0.0001376295657795496, + "loss": 3.2332, + "step": 71650 + }, + { + "epoch": 7.717145624798192, + "grad_norm": 0.8368144631385803, + "learning_rate": 0.0001373063247494882, + "loss": 3.2416, + "step": 71700 + }, + { + "epoch": 7.7225271768378, + "grad_norm": 0.8086535930633545, + "learning_rate": 0.00013698308371942676, + "loss": 3.2458, + "step": 71750 + }, + { + "epoch": 7.727908728877408, + "grad_norm": 0.8280001282691956, + "learning_rate": 0.00013665984268936536, + "loss": 3.2307, + "step": 71800 + }, + { + "epoch": 7.733290280917016, + "grad_norm": 0.8143231272697449, + "learning_rate": 0.00013633660165930395, + "loss": 3.2272, + "step": 71850 + }, + { + "epoch": 7.738671832956625, + "grad_norm": 0.784056544303894, + "learning_rate": 0.00013601336062924255, + "loss": 3.2341, + "step": 71900 + }, + { + "epoch": 7.744053384996233, + "grad_norm": 0.8832378387451172, + "learning_rate": 0.00013569011959918111, + "loss": 3.2553, + "step": 71950 + }, + { + "epoch": 7.749434937035841, + "grad_norm": 0.806605875492096, + "learning_rate": 0.0001353668785691197, + "loss": 3.2471, + "step": 72000 + }, + { + "epoch": 7.749434937035841, + "eval_accuracy": 0.3890513309924431, + "eval_loss": 3.342980146408081, + "eval_runtime": 182.5155, + "eval_samples_per_second": 98.682, + "eval_steps_per_second": 6.169, + "step": 72000 + }, + { + "epoch": 7.754816489075449, + "grad_norm": 0.8063587546348572, + "learning_rate": 0.00013504363753905828, + "loss": 3.2495, + "step": 72050 + }, + { + "epoch": 7.760198041115058, + "grad_norm": 0.8236513137817383, + "learning_rate": 0.00013472039650899687, + "loss": 3.2403, + "step": 72100 + }, + { + "epoch": 7.765579593154666, + "grad_norm": 0.801658034324646, + "learning_rate": 0.00013439715547893544, + "loss": 3.2375, + "step": 72150 + }, + { + "epoch": 7.770961145194274, + "grad_norm": 0.7991564273834229, + "learning_rate": 0.00013407391444887403, + "loss": 3.2362, + "step": 72200 + }, + { + "epoch": 7.776342697233883, + "grad_norm": 0.780725359916687, + "learning_rate": 0.0001337506734188126, + "loss": 3.2467, + "step": 72250 + }, + { + "epoch": 7.78172424927349, + "grad_norm": 0.7829933762550354, + "learning_rate": 0.0001334274323887512, + "loss": 3.2352, + "step": 72300 + }, + { + "epoch": 7.787105801313099, + "grad_norm": 0.8279498815536499, + "learning_rate": 0.0001331041913586898, + "loss": 3.2495, + "step": 72350 + }, + { + "epoch": 7.792487353352707, + "grad_norm": 0.8042828440666199, + "learning_rate": 0.00013278095032862838, + "loss": 3.2352, + "step": 72400 + }, + { + "epoch": 7.797868905392315, + "grad_norm": 0.8500546813011169, + "learning_rate": 0.00013245770929856695, + "loss": 3.248, + "step": 72450 + }, + { + "epoch": 7.803250457431924, + "grad_norm": 0.8876932859420776, + "learning_rate": 0.00013213446826850555, + "loss": 3.2425, + "step": 72500 + }, + { + "epoch": 7.808632009471531, + "grad_norm": 0.7689300775527954, + "learning_rate": 0.00013181122723844411, + "loss": 3.2308, + "step": 72550 + }, + { + "epoch": 7.81401356151114, + "grad_norm": 0.7971476912498474, + "learning_rate": 0.0001314879862083827, + "loss": 3.2349, + "step": 72600 + }, + { + "epoch": 7.819395113550748, + "grad_norm": 0.817987859249115, + "learning_rate": 0.00013116474517832128, + "loss": 3.2602, + "step": 72650 + }, + { + "epoch": 7.824776665590356, + "grad_norm": 0.819493293762207, + "learning_rate": 0.00013084150414825987, + "loss": 3.2283, + "step": 72700 + }, + { + "epoch": 7.830158217629965, + "grad_norm": 0.8192011117935181, + "learning_rate": 0.00013051826311819844, + "loss": 3.2344, + "step": 72750 + }, + { + "epoch": 7.835539769669572, + "grad_norm": 0.7826513051986694, + "learning_rate": 0.00013019502208813703, + "loss": 3.2499, + "step": 72800 + }, + { + "epoch": 7.840921321709181, + "grad_norm": 0.8338637351989746, + "learning_rate": 0.00012987178105807563, + "loss": 3.2288, + "step": 72850 + }, + { + "epoch": 7.846302873748789, + "grad_norm": 0.7698312997817993, + "learning_rate": 0.00012954854002801422, + "loss": 3.2517, + "step": 72900 + }, + { + "epoch": 7.851684425788397, + "grad_norm": 0.8017268180847168, + "learning_rate": 0.0001292252989979528, + "loss": 3.2544, + "step": 72950 + }, + { + "epoch": 7.857065977828006, + "grad_norm": 0.7961682677268982, + "learning_rate": 0.00012890205796789139, + "loss": 3.2406, + "step": 73000 + }, + { + "epoch": 7.857065977828006, + "eval_accuracy": 0.38943215936423287, + "eval_loss": 3.3387157917022705, + "eval_runtime": 182.7982, + "eval_samples_per_second": 98.529, + "eval_steps_per_second": 6.16, + "step": 73000 + }, + { + "epoch": 7.862447529867614, + "grad_norm": 0.8738141655921936, + "learning_rate": 0.00012857881693782998, + "loss": 3.2274, + "step": 73050 + }, + { + "epoch": 7.867829081907222, + "grad_norm": 0.7610816955566406, + "learning_rate": 0.00012825557590776855, + "loss": 3.2318, + "step": 73100 + }, + { + "epoch": 7.87321063394683, + "grad_norm": 0.8317003846168518, + "learning_rate": 0.00012793233487770714, + "loss": 3.2434, + "step": 73150 + }, + { + "epoch": 7.878592185986438, + "grad_norm": 0.8011530041694641, + "learning_rate": 0.0001276090938476457, + "loss": 3.2545, + "step": 73200 + }, + { + "epoch": 7.883973738026047, + "grad_norm": 0.8046674132347107, + "learning_rate": 0.0001272858528175843, + "loss": 3.2302, + "step": 73250 + }, + { + "epoch": 7.889355290065655, + "grad_norm": 0.8095680475234985, + "learning_rate": 0.00012696261178752287, + "loss": 3.2464, + "step": 73300 + }, + { + "epoch": 7.894736842105263, + "grad_norm": 0.7865450978279114, + "learning_rate": 0.00012663937075746147, + "loss": 3.2599, + "step": 73350 + }, + { + "epoch": 7.900118394144871, + "grad_norm": 0.7817733287811279, + "learning_rate": 0.00012631612972740006, + "loss": 3.2339, + "step": 73400 + }, + { + "epoch": 7.90549994618448, + "grad_norm": 0.8104103207588196, + "learning_rate": 0.00012599288869733863, + "loss": 3.2297, + "step": 73450 + }, + { + "epoch": 7.910881498224088, + "grad_norm": 0.8111141324043274, + "learning_rate": 0.00012566964766727722, + "loss": 3.2322, + "step": 73500 + }, + { + "epoch": 7.916263050263696, + "grad_norm": 0.8497397899627686, + "learning_rate": 0.00012534640663721582, + "loss": 3.2276, + "step": 73550 + }, + { + "epoch": 7.921644602303305, + "grad_norm": 0.8326553106307983, + "learning_rate": 0.00012502316560715439, + "loss": 3.2472, + "step": 73600 + }, + { + "epoch": 7.927026154342912, + "grad_norm": 0.8586411476135254, + "learning_rate": 0.00012469992457709298, + "loss": 3.2429, + "step": 73650 + }, + { + "epoch": 7.932407706382521, + "grad_norm": 0.7935066819190979, + "learning_rate": 0.00012437668354703158, + "loss": 3.2422, + "step": 73700 + }, + { + "epoch": 7.937789258422129, + "grad_norm": 0.7909044623374939, + "learning_rate": 0.00012405344251697014, + "loss": 3.2395, + "step": 73750 + }, + { + "epoch": 7.943170810461737, + "grad_norm": 0.8109498023986816, + "learning_rate": 0.00012373020148690874, + "loss": 3.2482, + "step": 73800 + }, + { + "epoch": 7.948552362501346, + "grad_norm": 0.8576630353927612, + "learning_rate": 0.0001234069604568473, + "loss": 3.239, + "step": 73850 + }, + { + "epoch": 7.953933914540953, + "grad_norm": 0.7980335354804993, + "learning_rate": 0.0001230837194267859, + "loss": 3.2356, + "step": 73900 + }, + { + "epoch": 7.959315466580562, + "grad_norm": 0.7563722729682922, + "learning_rate": 0.00012276047839672447, + "loss": 3.2249, + "step": 73950 + }, + { + "epoch": 7.96469701862017, + "grad_norm": 0.7781243324279785, + "learning_rate": 0.00012243723736666306, + "loss": 3.2495, + "step": 74000 + }, + { + "epoch": 7.96469701862017, + "eval_accuracy": 0.38969031862568004, + "eval_loss": 3.334991216659546, + "eval_runtime": 182.9415, + "eval_samples_per_second": 98.452, + "eval_steps_per_second": 6.155, + "step": 74000 + }, + { + "epoch": 7.970078570659778, + "grad_norm": 0.8000725507736206, + "learning_rate": 0.00012211399633660166, + "loss": 3.2204, + "step": 74050 + }, + { + "epoch": 7.975460122699387, + "grad_norm": 0.8195854425430298, + "learning_rate": 0.00012179075530654022, + "loss": 3.2446, + "step": 74100 + }, + { + "epoch": 7.980841674738995, + "grad_norm": 0.8186859488487244, + "learning_rate": 0.00012146751427647882, + "loss": 3.2262, + "step": 74150 + }, + { + "epoch": 7.986223226778603, + "grad_norm": 0.8260078430175781, + "learning_rate": 0.0001211442732464174, + "loss": 3.2422, + "step": 74200 + }, + { + "epoch": 7.991604778818211, + "grad_norm": 0.7887690663337708, + "learning_rate": 0.00012082103221635598, + "loss": 3.2354, + "step": 74250 + }, + { + "epoch": 7.996986330857819, + "grad_norm": 0.8294403553009033, + "learning_rate": 0.00012049779118629456, + "loss": 3.252, + "step": 74300 + }, + { + "epoch": 8.002367882897428, + "grad_norm": 0.830507755279541, + "learning_rate": 0.00012017455015623316, + "loss": 3.2075, + "step": 74350 + }, + { + "epoch": 8.007749434937036, + "grad_norm": 0.8426108360290527, + "learning_rate": 0.00011985130912617175, + "loss": 3.1655, + "step": 74400 + }, + { + "epoch": 8.013130986976645, + "grad_norm": 0.8193655610084534, + "learning_rate": 0.00011952806809611032, + "loss": 3.1579, + "step": 74450 + }, + { + "epoch": 8.018512539016251, + "grad_norm": 0.8287805318832397, + "learning_rate": 0.00011920482706604891, + "loss": 3.1689, + "step": 74500 + }, + { + "epoch": 8.02389409105586, + "grad_norm": 0.8014912009239197, + "learning_rate": 0.0001188815860359875, + "loss": 3.1657, + "step": 74550 + }, + { + "epoch": 8.029275643095469, + "grad_norm": 0.7986035943031311, + "learning_rate": 0.00011855834500592608, + "loss": 3.161, + "step": 74600 + }, + { + "epoch": 8.034657195135077, + "grad_norm": 0.7975103855133057, + "learning_rate": 0.00011823510397586466, + "loss": 3.1607, + "step": 74650 + }, + { + "epoch": 8.040038747174686, + "grad_norm": 0.8071293234825134, + "learning_rate": 0.00011791186294580325, + "loss": 3.1557, + "step": 74700 + }, + { + "epoch": 8.045420299214294, + "grad_norm": 0.8412976264953613, + "learning_rate": 0.00011758862191574182, + "loss": 3.1663, + "step": 74750 + }, + { + "epoch": 8.050801851253901, + "grad_norm": 0.8266987204551697, + "learning_rate": 0.00011726538088568041, + "loss": 3.1748, + "step": 74800 + }, + { + "epoch": 8.05618340329351, + "grad_norm": 0.8250101804733276, + "learning_rate": 0.000116942139855619, + "loss": 3.1774, + "step": 74850 + }, + { + "epoch": 8.061564955333118, + "grad_norm": 0.808516800403595, + "learning_rate": 0.00011661889882555759, + "loss": 3.1703, + "step": 74900 + }, + { + "epoch": 8.066946507372727, + "grad_norm": 0.8228300213813782, + "learning_rate": 0.00011629565779549616, + "loss": 3.1582, + "step": 74950 + }, + { + "epoch": 8.072328059412335, + "grad_norm": 0.8521013855934143, + "learning_rate": 0.00011597241676543475, + "loss": 3.191, + "step": 75000 + }, + { + "epoch": 8.072328059412335, + "eval_accuracy": 0.3899119705168216, + "eval_loss": 3.341465711593628, + "eval_runtime": 182.5475, + "eval_samples_per_second": 98.665, + "eval_steps_per_second": 6.168, + "step": 75000 + }, + { + "epoch": 8.077709611451942, + "grad_norm": 0.834549069404602, + "learning_rate": 0.00011564917573537335, + "loss": 3.1519, + "step": 75050 + }, + { + "epoch": 8.08309116349155, + "grad_norm": 0.8692988157272339, + "learning_rate": 0.00011533239952591314, + "loss": 3.1654, + "step": 75100 + }, + { + "epoch": 8.088472715531159, + "grad_norm": 0.8068045973777771, + "learning_rate": 0.00011500915849585173, + "loss": 3.1802, + "step": 75150 + }, + { + "epoch": 8.093854267570768, + "grad_norm": 0.8539791107177734, + "learning_rate": 0.00011468591746579033, + "loss": 3.1812, + "step": 75200 + }, + { + "epoch": 8.099235819610376, + "grad_norm": 0.8729422092437744, + "learning_rate": 0.0001143626764357289, + "loss": 3.1766, + "step": 75250 + }, + { + "epoch": 8.104617371649983, + "grad_norm": 0.8850834369659424, + "learning_rate": 0.00011403943540566749, + "loss": 3.1687, + "step": 75300 + }, + { + "epoch": 8.109998923689592, + "grad_norm": 0.8430905938148499, + "learning_rate": 0.00011371619437560607, + "loss": 3.1833, + "step": 75350 + }, + { + "epoch": 8.1153804757292, + "grad_norm": 0.7951804399490356, + "learning_rate": 0.00011339295334554464, + "loss": 3.1759, + "step": 75400 + }, + { + "epoch": 8.120762027768809, + "grad_norm": 0.8502519130706787, + "learning_rate": 0.00011306971231548323, + "loss": 3.1873, + "step": 75450 + }, + { + "epoch": 8.126143579808417, + "grad_norm": 1.6292310953140259, + "learning_rate": 0.00011274647128542183, + "loss": 3.1808, + "step": 75500 + }, + { + "epoch": 8.131525131848026, + "grad_norm": 0.8119076490402222, + "learning_rate": 0.0001124232302553604, + "loss": 3.1815, + "step": 75550 + }, + { + "epoch": 8.136906683887632, + "grad_norm": 0.8140049576759338, + "learning_rate": 0.00011209998922529899, + "loss": 3.1838, + "step": 75600 + }, + { + "epoch": 8.142288235927241, + "grad_norm": 0.8442477583885193, + "learning_rate": 0.00011177674819523757, + "loss": 3.1647, + "step": 75650 + }, + { + "epoch": 8.14766978796685, + "grad_norm": 0.8645812273025513, + "learning_rate": 0.00011145350716517617, + "loss": 3.1821, + "step": 75700 + }, + { + "epoch": 8.153051340006458, + "grad_norm": 0.778192400932312, + "learning_rate": 0.00011113026613511473, + "loss": 3.19, + "step": 75750 + }, + { + "epoch": 8.158432892046067, + "grad_norm": 0.8154362440109253, + "learning_rate": 0.00011080702510505333, + "loss": 3.1737, + "step": 75800 + }, + { + "epoch": 8.163814444085673, + "grad_norm": 0.8348283767700195, + "learning_rate": 0.00011048378407499191, + "loss": 3.1763, + "step": 75850 + }, + { + "epoch": 8.169195996125282, + "grad_norm": 0.8607150316238403, + "learning_rate": 0.00011016054304493049, + "loss": 3.1867, + "step": 75900 + }, + { + "epoch": 8.17457754816489, + "grad_norm": 0.8294938206672668, + "learning_rate": 0.00010983730201486907, + "loss": 3.179, + "step": 75950 + }, + { + "epoch": 8.1799591002045, + "grad_norm": 0.8422661423683167, + "learning_rate": 0.00010951406098480767, + "loss": 3.1735, + "step": 76000 + }, + { + "epoch": 8.1799591002045, + "eval_accuracy": 0.38991914160741736, + "eval_loss": 3.339107036590576, + "eval_runtime": 182.7233, + "eval_samples_per_second": 98.57, + "eval_steps_per_second": 6.162, + "step": 76000 + }, + { + "epoch": 8.185340652244108, + "grad_norm": 0.8388949036598206, + "learning_rate": 0.00010919081995474623, + "loss": 3.1938, + "step": 76050 + }, + { + "epoch": 8.190722204283716, + "grad_norm": 0.786255419254303, + "learning_rate": 0.00010886757892468483, + "loss": 3.1737, + "step": 76100 + }, + { + "epoch": 8.196103756323323, + "grad_norm": 0.8635239005088806, + "learning_rate": 0.00010854433789462342, + "loss": 3.1797, + "step": 76150 + }, + { + "epoch": 8.201485308362932, + "grad_norm": 0.8113918304443359, + "learning_rate": 0.000108221096864562, + "loss": 3.1891, + "step": 76200 + }, + { + "epoch": 8.20686686040254, + "grad_norm": 0.823483407497406, + "learning_rate": 0.00010789785583450059, + "loss": 3.1757, + "step": 76250 + }, + { + "epoch": 8.212248412442149, + "grad_norm": 0.7900793552398682, + "learning_rate": 0.00010757461480443917, + "loss": 3.1828, + "step": 76300 + }, + { + "epoch": 8.217629964481757, + "grad_norm": 0.8567901253700256, + "learning_rate": 0.00010725137377437776, + "loss": 3.1645, + "step": 76350 + }, + { + "epoch": 8.223011516521364, + "grad_norm": 0.8328141570091248, + "learning_rate": 0.00010692813274431633, + "loss": 3.1711, + "step": 76400 + }, + { + "epoch": 8.228393068560973, + "grad_norm": 0.8192028403282166, + "learning_rate": 0.00010660489171425492, + "loss": 3.1744, + "step": 76450 + }, + { + "epoch": 8.233774620600581, + "grad_norm": 0.8761533498764038, + "learning_rate": 0.0001062816506841935, + "loss": 3.1959, + "step": 76500 + }, + { + "epoch": 8.23915617264019, + "grad_norm": 0.8476336002349854, + "learning_rate": 0.00010595840965413209, + "loss": 3.1788, + "step": 76550 + }, + { + "epoch": 8.244537724679798, + "grad_norm": 0.8509985208511353, + "learning_rate": 0.00010563516862407067, + "loss": 3.1825, + "step": 76600 + }, + { + "epoch": 8.249919276719407, + "grad_norm": 0.8411309123039246, + "learning_rate": 0.00010531192759400926, + "loss": 3.1826, + "step": 76650 + }, + { + "epoch": 8.255300828759013, + "grad_norm": 0.8168168067932129, + "learning_rate": 0.00010498868656394784, + "loss": 3.185, + "step": 76700 + }, + { + "epoch": 8.260682380798622, + "grad_norm": 0.8766189813613892, + "learning_rate": 0.00010466544553388642, + "loss": 3.1826, + "step": 76750 + }, + { + "epoch": 8.26606393283823, + "grad_norm": 0.7768109440803528, + "learning_rate": 0.000104342204503825, + "loss": 3.193, + "step": 76800 + }, + { + "epoch": 8.27144548487784, + "grad_norm": 0.8481165170669556, + "learning_rate": 0.0001040189634737636, + "loss": 3.1845, + "step": 76850 + }, + { + "epoch": 8.276827036917448, + "grad_norm": 0.83498215675354, + "learning_rate": 0.00010369572244370217, + "loss": 3.1856, + "step": 76900 + }, + { + "epoch": 8.282208588957054, + "grad_norm": 0.8582146167755127, + "learning_rate": 0.00010337248141364076, + "loss": 3.1902, + "step": 76950 + }, + { + "epoch": 8.287590140996663, + "grad_norm": 0.8045734167098999, + "learning_rate": 0.00010304924038357936, + "loss": 3.1844, + "step": 77000 + }, + { + "epoch": 8.287590140996663, + "eval_accuracy": 0.3902144601564971, + "eval_loss": 3.3361928462982178, + "eval_runtime": 182.7919, + "eval_samples_per_second": 98.533, + "eval_steps_per_second": 6.16, + "step": 77000 + }, + { + "epoch": 8.292971693036272, + "grad_norm": 0.854202926158905, + "learning_rate": 0.00010272599935351792, + "loss": 3.1828, + "step": 77050 + }, + { + "epoch": 8.29835324507588, + "grad_norm": 0.8577749729156494, + "learning_rate": 0.00010240922314405774, + "loss": 3.1808, + "step": 77100 + }, + { + "epoch": 8.303734797115489, + "grad_norm": 0.8859609365463257, + "learning_rate": 0.00010208598211399634, + "loss": 3.1855, + "step": 77150 + }, + { + "epoch": 8.309116349155097, + "grad_norm": 0.8407235145568848, + "learning_rate": 0.0001017627410839349, + "loss": 3.1775, + "step": 77200 + }, + { + "epoch": 8.314497901194704, + "grad_norm": 0.8498949408531189, + "learning_rate": 0.0001014395000538735, + "loss": 3.1787, + "step": 77250 + }, + { + "epoch": 8.319879453234313, + "grad_norm": 0.8562375903129578, + "learning_rate": 0.00010111625902381208, + "loss": 3.1898, + "step": 77300 + }, + { + "epoch": 8.325261005273921, + "grad_norm": 0.8841971755027771, + "learning_rate": 0.00010079301799375066, + "loss": 3.2001, + "step": 77350 + }, + { + "epoch": 8.33064255731353, + "grad_norm": 0.8151336312294006, + "learning_rate": 0.00010046977696368924, + "loss": 3.1902, + "step": 77400 + }, + { + "epoch": 8.336024109353138, + "grad_norm": 0.8526255488395691, + "learning_rate": 0.00010014653593362784, + "loss": 3.1803, + "step": 77450 + }, + { + "epoch": 8.341405661392745, + "grad_norm": 0.8695437908172607, + "learning_rate": 9.982329490356642e-05, + "loss": 3.1975, + "step": 77500 + }, + { + "epoch": 8.346787213432354, + "grad_norm": 0.9299212694168091, + "learning_rate": 9.9500053873505e-05, + "loss": 3.1894, + "step": 77550 + }, + { + "epoch": 8.352168765471962, + "grad_norm": 0.8620630502700806, + "learning_rate": 9.917681284344358e-05, + "loss": 3.1769, + "step": 77600 + }, + { + "epoch": 8.35755031751157, + "grad_norm": 0.8638424277305603, + "learning_rate": 9.885357181338218e-05, + "loss": 3.1993, + "step": 77650 + }, + { + "epoch": 8.36293186955118, + "grad_norm": 0.832884669303894, + "learning_rate": 9.853033078332074e-05, + "loss": 3.1877, + "step": 77700 + }, + { + "epoch": 8.368313421590786, + "grad_norm": 0.857495129108429, + "learning_rate": 9.820708975325934e-05, + "loss": 3.1844, + "step": 77750 + }, + { + "epoch": 8.373694973630395, + "grad_norm": 0.8742907047271729, + "learning_rate": 9.788384872319793e-05, + "loss": 3.1762, + "step": 77800 + }, + { + "epoch": 8.379076525670003, + "grad_norm": 0.8619995713233948, + "learning_rate": 9.75606076931365e-05, + "loss": 3.1828, + "step": 77850 + }, + { + "epoch": 8.384458077709612, + "grad_norm": 0.8461150527000427, + "learning_rate": 9.723736666307508e-05, + "loss": 3.2053, + "step": 77900 + }, + { + "epoch": 8.38983962974922, + "grad_norm": 0.809444785118103, + "learning_rate": 9.691412563301368e-05, + "loss": 3.1613, + "step": 77950 + }, + { + "epoch": 8.395221181788829, + "grad_norm": 0.8414546251296997, + "learning_rate": 9.659088460295227e-05, + "loss": 3.1819, + "step": 78000 + }, + { + "epoch": 8.395221181788829, + "eval_accuracy": 0.3904907644502093, + "eval_loss": 3.3329615592956543, + "eval_runtime": 182.3978, + "eval_samples_per_second": 98.746, + "eval_steps_per_second": 6.173, + "step": 78000 + }, + { + "epoch": 8.400602733828435, + "grad_norm": 0.8904616832733154, + "learning_rate": 9.626764357289084e-05, + "loss": 3.1935, + "step": 78050 + }, + { + "epoch": 8.405984285868044, + "grad_norm": 0.8500921726226807, + "learning_rate": 9.594440254282943e-05, + "loss": 3.201, + "step": 78100 + }, + { + "epoch": 8.411365837907653, + "grad_norm": 0.8491564989089966, + "learning_rate": 9.562116151276802e-05, + "loss": 3.18, + "step": 78150 + }, + { + "epoch": 8.416747389947261, + "grad_norm": 0.8127497434616089, + "learning_rate": 9.52979204827066e-05, + "loss": 3.1789, + "step": 78200 + }, + { + "epoch": 8.42212894198687, + "grad_norm": 0.8613414764404297, + "learning_rate": 9.497467945264518e-05, + "loss": 3.1947, + "step": 78250 + }, + { + "epoch": 8.427510494026476, + "grad_norm": 0.8296908140182495, + "learning_rate": 9.465143842258377e-05, + "loss": 3.174, + "step": 78300 + }, + { + "epoch": 8.432892046066085, + "grad_norm": 0.8717524409294128, + "learning_rate": 9.432819739252234e-05, + "loss": 3.1924, + "step": 78350 + }, + { + "epoch": 8.438273598105694, + "grad_norm": 0.8354899287223816, + "learning_rate": 9.400495636246093e-05, + "loss": 3.1854, + "step": 78400 + }, + { + "epoch": 8.443655150145302, + "grad_norm": 0.8695220351219177, + "learning_rate": 9.368171533239952e-05, + "loss": 3.1859, + "step": 78450 + }, + { + "epoch": 8.44903670218491, + "grad_norm": 0.8374029994010925, + "learning_rate": 9.335847430233811e-05, + "loss": 3.1781, + "step": 78500 + }, + { + "epoch": 8.45441825422452, + "grad_norm": 0.8391903638839722, + "learning_rate": 9.303523327227668e-05, + "loss": 3.1786, + "step": 78550 + }, + { + "epoch": 8.459799806264126, + "grad_norm": 0.8173428177833557, + "learning_rate": 9.271199224221527e-05, + "loss": 3.1855, + "step": 78600 + }, + { + "epoch": 8.465181358303735, + "grad_norm": 0.8497678637504578, + "learning_rate": 9.238875121215387e-05, + "loss": 3.2016, + "step": 78650 + }, + { + "epoch": 8.470562910343343, + "grad_norm": 0.8355598449707031, + "learning_rate": 9.206551018209243e-05, + "loss": 3.1891, + "step": 78700 + }, + { + "epoch": 8.475944462382952, + "grad_norm": 0.9021919965744019, + "learning_rate": 9.174226915203103e-05, + "loss": 3.1931, + "step": 78750 + }, + { + "epoch": 8.48132601442256, + "grad_norm": 0.8534740805625916, + "learning_rate": 9.141902812196961e-05, + "loss": 3.1779, + "step": 78800 + }, + { + "epoch": 8.486707566462167, + "grad_norm": 0.8480976819992065, + "learning_rate": 9.109578709190818e-05, + "loss": 3.1862, + "step": 78850 + }, + { + "epoch": 8.492089118501776, + "grad_norm": 0.8286227583885193, + "learning_rate": 9.077254606184677e-05, + "loss": 3.214, + "step": 78900 + }, + { + "epoch": 8.497470670541384, + "grad_norm": 0.8237711787223816, + "learning_rate": 9.044930503178537e-05, + "loss": 3.1884, + "step": 78950 + }, + { + "epoch": 8.502852222580993, + "grad_norm": 0.8402367234230042, + "learning_rate": 9.012606400172395e-05, + "loss": 3.1854, + "step": 79000 + }, + { + "epoch": 8.502852222580993, + "eval_accuracy": 0.39106488630942104, + "eval_loss": 3.32970929145813, + "eval_runtime": 182.9601, + "eval_samples_per_second": 98.442, + "eval_steps_per_second": 6.154, + "step": 79000 + }, + { + "epoch": 8.508233774620601, + "grad_norm": 0.8203577995300293, + "learning_rate": 8.980282297166253e-05, + "loss": 3.1807, + "step": 79050 + }, + { + "epoch": 8.513615326660208, + "grad_norm": 0.8231856822967529, + "learning_rate": 8.947958194160111e-05, + "loss": 3.1975, + "step": 79100 + }, + { + "epoch": 8.518996878699816, + "grad_norm": 0.8475301861763, + "learning_rate": 8.916280573214092e-05, + "loss": 3.2022, + "step": 79150 + }, + { + "epoch": 8.524378430739425, + "grad_norm": 0.9068256616592407, + "learning_rate": 8.883956470207951e-05, + "loss": 3.1841, + "step": 79200 + }, + { + "epoch": 8.529759982779034, + "grad_norm": 0.8666347861289978, + "learning_rate": 8.851632367201809e-05, + "loss": 3.1918, + "step": 79250 + }, + { + "epoch": 8.535141534818642, + "grad_norm": 0.8289416432380676, + "learning_rate": 8.819308264195669e-05, + "loss": 3.1962, + "step": 79300 + }, + { + "epoch": 8.54052308685825, + "grad_norm": 0.8145918846130371, + "learning_rate": 8.786984161189526e-05, + "loss": 3.1855, + "step": 79350 + }, + { + "epoch": 8.545904638897857, + "grad_norm": 0.8508062958717346, + "learning_rate": 8.754660058183385e-05, + "loss": 3.1653, + "step": 79400 + }, + { + "epoch": 8.551286190937466, + "grad_norm": 0.8114455342292786, + "learning_rate": 8.722335955177243e-05, + "loss": 3.1935, + "step": 79450 + }, + { + "epoch": 8.556667742977075, + "grad_norm": 0.8585551977157593, + "learning_rate": 8.690658334231224e-05, + "loss": 3.2131, + "step": 79500 + }, + { + "epoch": 8.562049295016683, + "grad_norm": 0.8328741192817688, + "learning_rate": 8.658334231225083e-05, + "loss": 3.1715, + "step": 79550 + }, + { + "epoch": 8.567430847056292, + "grad_norm": 0.8506522178649902, + "learning_rate": 8.626010128218943e-05, + "loss": 3.2035, + "step": 79600 + }, + { + "epoch": 8.572812399095898, + "grad_norm": 0.8559414148330688, + "learning_rate": 8.5936860252128e-05, + "loss": 3.198, + "step": 79650 + }, + { + "epoch": 8.578193951135507, + "grad_norm": 0.8611463904380798, + "learning_rate": 8.561361922206658e-05, + "loss": 3.1907, + "step": 79700 + }, + { + "epoch": 8.583575503175116, + "grad_norm": 0.7964751720428467, + "learning_rate": 8.529037819200517e-05, + "loss": 3.1686, + "step": 79750 + }, + { + "epoch": 8.588957055214724, + "grad_norm": 0.8190116882324219, + "learning_rate": 8.496713716194374e-05, + "loss": 3.1866, + "step": 79800 + }, + { + "epoch": 8.594338607254333, + "grad_norm": 0.9009842872619629, + "learning_rate": 8.464389613188233e-05, + "loss": 3.1879, + "step": 79850 + }, + { + "epoch": 8.599720159293941, + "grad_norm": 0.7971447706222534, + "learning_rate": 8.432065510182093e-05, + "loss": 3.2059, + "step": 79900 + }, + { + "epoch": 8.605101711333548, + "grad_norm": 0.8355123996734619, + "learning_rate": 8.39974140717595e-05, + "loss": 3.1917, + "step": 79950 + }, + { + "epoch": 8.610483263373157, + "grad_norm": 0.9042732119560242, + "learning_rate": 8.367417304169809e-05, + "loss": 3.1847, + "step": 80000 + }, + { + "epoch": 8.610483263373157, + "eval_accuracy": 0.3913339108596497, + "eval_loss": 3.327451705932617, + "eval_runtime": 182.9461, + "eval_samples_per_second": 98.45, + "eval_steps_per_second": 6.155, + "step": 80000 + }, + { + "epoch": 8.615864815412765, + "grad_norm": 0.8184645175933838, + "learning_rate": 8.335093201163667e-05, + "loss": 3.177, + "step": 80050 + }, + { + "epoch": 8.621246367452374, + "grad_norm": 0.8244253396987915, + "learning_rate": 8.302769098157526e-05, + "loss": 3.1778, + "step": 80100 + }, + { + "epoch": 8.626627919491982, + "grad_norm": 0.934569239616394, + "learning_rate": 8.270444995151383e-05, + "loss": 3.2007, + "step": 80150 + }, + { + "epoch": 8.632009471531589, + "grad_norm": 0.798776388168335, + "learning_rate": 8.238120892145243e-05, + "loss": 3.1926, + "step": 80200 + }, + { + "epoch": 8.637391023571197, + "grad_norm": 0.8425439596176147, + "learning_rate": 8.205796789139101e-05, + "loss": 3.2076, + "step": 80250 + }, + { + "epoch": 8.642772575610806, + "grad_norm": 0.8503188490867615, + "learning_rate": 8.173472686132959e-05, + "loss": 3.1863, + "step": 80300 + }, + { + "epoch": 8.648154127650415, + "grad_norm": 0.8668044209480286, + "learning_rate": 8.141148583126817e-05, + "loss": 3.1889, + "step": 80350 + }, + { + "epoch": 8.653535679690023, + "grad_norm": 0.8459951877593994, + "learning_rate": 8.108824480120676e-05, + "loss": 3.1957, + "step": 80400 + }, + { + "epoch": 8.658917231729632, + "grad_norm": 0.911152720451355, + "learning_rate": 8.076500377114533e-05, + "loss": 3.1835, + "step": 80450 + }, + { + "epoch": 8.664298783769238, + "grad_norm": 0.8265844583511353, + "learning_rate": 8.044176274108393e-05, + "loss": 3.2005, + "step": 80500 + }, + { + "epoch": 8.669680335808847, + "grad_norm": 0.8671901226043701, + "learning_rate": 8.011852171102252e-05, + "loss": 3.2096, + "step": 80550 + }, + { + "epoch": 8.675061887848456, + "grad_norm": 0.8355448842048645, + "learning_rate": 7.97952806809611e-05, + "loss": 3.187, + "step": 80600 + }, + { + "epoch": 8.680443439888064, + "grad_norm": 0.8255912065505981, + "learning_rate": 7.947203965089967e-05, + "loss": 3.1844, + "step": 80650 + }, + { + "epoch": 8.685824991927673, + "grad_norm": 0.8755044341087341, + "learning_rate": 7.914879862083827e-05, + "loss": 3.1869, + "step": 80700 + }, + { + "epoch": 8.69120654396728, + "grad_norm": 0.8891885876655579, + "learning_rate": 7.882555759077686e-05, + "loss": 3.1692, + "step": 80750 + }, + { + "epoch": 8.696588096006888, + "grad_norm": 0.8529818058013916, + "learning_rate": 7.850231656071543e-05, + "loss": 3.188, + "step": 80800 + }, + { + "epoch": 8.701969648046497, + "grad_norm": 0.8659524321556091, + "learning_rate": 7.817907553065402e-05, + "loss": 3.1952, + "step": 80850 + }, + { + "epoch": 8.707351200086105, + "grad_norm": 0.8727506995201111, + "learning_rate": 7.78558345005926e-05, + "loss": 3.1946, + "step": 80900 + }, + { + "epoch": 8.712732752125714, + "grad_norm": 0.8414073586463928, + "learning_rate": 7.753259347053118e-05, + "loss": 3.1998, + "step": 80950 + }, + { + "epoch": 8.718114304165322, + "grad_norm": 0.8435977101325989, + "learning_rate": 7.720935244046977e-05, + "loss": 3.1935, + "step": 81000 + }, + { + "epoch": 8.718114304165322, + "eval_accuracy": 0.3915950037490679, + "eval_loss": 3.323737382888794, + "eval_runtime": 183.0449, + "eval_samples_per_second": 98.397, + "eval_steps_per_second": 6.151, + "step": 81000 + }, + { + "epoch": 8.723495856204929, + "grad_norm": 0.8313448429107666, + "learning_rate": 7.688611141040836e-05, + "loss": 3.1835, + "step": 81050 + }, + { + "epoch": 8.728877408244538, + "grad_norm": 0.8675030469894409, + "learning_rate": 7.656287038034694e-05, + "loss": 3.1836, + "step": 81100 + }, + { + "epoch": 8.734258960284146, + "grad_norm": 0.8448699116706848, + "learning_rate": 7.623962935028552e-05, + "loss": 3.2062, + "step": 81150 + }, + { + "epoch": 8.739640512323755, + "grad_norm": 0.8596765995025635, + "learning_rate": 7.59163883202241e-05, + "loss": 3.1863, + "step": 81200 + }, + { + "epoch": 8.745022064363363, + "grad_norm": 0.8727709054946899, + "learning_rate": 7.55931472901627e-05, + "loss": 3.1742, + "step": 81250 + }, + { + "epoch": 8.75040361640297, + "grad_norm": 0.8483537435531616, + "learning_rate": 7.526990626010127e-05, + "loss": 3.1818, + "step": 81300 + }, + { + "epoch": 8.755785168442578, + "grad_norm": 0.8239384293556213, + "learning_rate": 7.494666523003986e-05, + "loss": 3.1747, + "step": 81350 + }, + { + "epoch": 8.761166720482187, + "grad_norm": 0.8851037621498108, + "learning_rate": 7.462342419997844e-05, + "loss": 3.1878, + "step": 81400 + }, + { + "epoch": 8.766548272521796, + "grad_norm": 0.870302677154541, + "learning_rate": 7.430018316991704e-05, + "loss": 3.1834, + "step": 81450 + }, + { + "epoch": 8.771929824561404, + "grad_norm": 0.8676862120628357, + "learning_rate": 7.397694213985562e-05, + "loss": 3.1805, + "step": 81500 + }, + { + "epoch": 8.777311376601011, + "grad_norm": 0.8838236331939697, + "learning_rate": 7.36537011097942e-05, + "loss": 3.1878, + "step": 81550 + }, + { + "epoch": 8.78269292864062, + "grad_norm": 0.9052426218986511, + "learning_rate": 7.333046007973278e-05, + "loss": 3.176, + "step": 81600 + }, + { + "epoch": 8.788074480680228, + "grad_norm": 0.827214241027832, + "learning_rate": 7.300721904967136e-05, + "loss": 3.1943, + "step": 81650 + }, + { + "epoch": 8.793456032719837, + "grad_norm": 0.8450335264205933, + "learning_rate": 7.268397801960996e-05, + "loss": 3.1906, + "step": 81700 + }, + { + "epoch": 8.798837584759445, + "grad_norm": 0.8375703692436218, + "learning_rate": 7.236073698954854e-05, + "loss": 3.1844, + "step": 81750 + }, + { + "epoch": 8.804219136799054, + "grad_norm": 0.816857635974884, + "learning_rate": 7.203749595948712e-05, + "loss": 3.1712, + "step": 81800 + }, + { + "epoch": 8.80960068883866, + "grad_norm": 0.8428307771682739, + "learning_rate": 7.17142549294257e-05, + "loss": 3.1939, + "step": 81850 + }, + { + "epoch": 8.814982240878269, + "grad_norm": 0.8175262212753296, + "learning_rate": 7.139101389936428e-05, + "loss": 3.1919, + "step": 81900 + }, + { + "epoch": 8.820363792917878, + "grad_norm": 0.8650425672531128, + "learning_rate": 7.106777286930287e-05, + "loss": 3.2, + "step": 81950 + }, + { + "epoch": 8.825745344957486, + "grad_norm": 0.866899311542511, + "learning_rate": 7.074453183924146e-05, + "loss": 3.1881, + "step": 82000 + }, + { + "epoch": 8.825745344957486, + "eval_accuracy": 0.39193965070921544, + "eval_loss": 3.3191041946411133, + "eval_runtime": 182.564, + "eval_samples_per_second": 98.656, + "eval_steps_per_second": 6.168, + "step": 82000 + }, + { + "epoch": 8.831126896997095, + "grad_norm": 0.8628833889961243, + "learning_rate": 7.042129080918004e-05, + "loss": 3.1915, + "step": 82050 + }, + { + "epoch": 8.836508449036701, + "grad_norm": 0.9049327969551086, + "learning_rate": 7.009804977911862e-05, + "loss": 3.1922, + "step": 82100 + }, + { + "epoch": 8.84189000107631, + "grad_norm": 0.8244563937187195, + "learning_rate": 6.97748087490572e-05, + "loss": 3.1827, + "step": 82150 + }, + { + "epoch": 8.847271553115919, + "grad_norm": 0.8381883502006531, + "learning_rate": 6.945156771899579e-05, + "loss": 3.2009, + "step": 82200 + }, + { + "epoch": 8.852653105155527, + "grad_norm": 0.8337753415107727, + "learning_rate": 6.912832668893437e-05, + "loss": 3.1808, + "step": 82250 + }, + { + "epoch": 8.858034657195136, + "grad_norm": 0.8778477907180786, + "learning_rate": 6.880508565887297e-05, + "loss": 3.218, + "step": 82300 + }, + { + "epoch": 8.863416209234742, + "grad_norm": 0.8658258318901062, + "learning_rate": 6.848184462881155e-05, + "loss": 3.198, + "step": 82350 + }, + { + "epoch": 8.868797761274351, + "grad_norm": 0.861770749092102, + "learning_rate": 6.815860359875013e-05, + "loss": 3.1882, + "step": 82400 + }, + { + "epoch": 8.87417931331396, + "grad_norm": 1.062533974647522, + "learning_rate": 6.783536256868871e-05, + "loss": 3.2048, + "step": 82450 + }, + { + "epoch": 8.879560865353568, + "grad_norm": 0.878668487071991, + "learning_rate": 6.75121215386273e-05, + "loss": 3.1679, + "step": 82500 + }, + { + "epoch": 8.884942417393177, + "grad_norm": 0.8749240636825562, + "learning_rate": 6.718888050856589e-05, + "loss": 3.1839, + "step": 82550 + }, + { + "epoch": 8.890323969432785, + "grad_norm": 0.8733687996864319, + "learning_rate": 6.686563947850447e-05, + "loss": 3.1964, + "step": 82600 + }, + { + "epoch": 8.895705521472392, + "grad_norm": 0.8685417175292969, + "learning_rate": 6.654239844844305e-05, + "loss": 3.1968, + "step": 82650 + }, + { + "epoch": 8.901087073512, + "grad_norm": 0.8061568140983582, + "learning_rate": 6.621915741838163e-05, + "loss": 3.1726, + "step": 82700 + }, + { + "epoch": 8.906468625551609, + "grad_norm": 0.8258588314056396, + "learning_rate": 6.589591638832021e-05, + "loss": 3.1938, + "step": 82750 + }, + { + "epoch": 8.911850177591218, + "grad_norm": 0.8688393235206604, + "learning_rate": 6.557267535825881e-05, + "loss": 3.1881, + "step": 82800 + }, + { + "epoch": 8.917231729630826, + "grad_norm": 0.870529055595398, + "learning_rate": 6.524943432819739e-05, + "loss": 3.1997, + "step": 82850 + }, + { + "epoch": 8.922613281670433, + "grad_norm": 0.8257046341896057, + "learning_rate": 6.492619329813597e-05, + "loss": 3.1816, + "step": 82900 + }, + { + "epoch": 8.927994833710041, + "grad_norm": 0.858521580696106, + "learning_rate": 6.460295226807455e-05, + "loss": 3.1988, + "step": 82950 + }, + { + "epoch": 8.93337638574965, + "grad_norm": 0.8709513545036316, + "learning_rate": 6.427971123801313e-05, + "loss": 3.1831, + "step": 83000 + }, + { + "epoch": 8.93337638574965, + "eval_accuracy": 0.3923701334507364, + "eval_loss": 3.3174479007720947, + "eval_runtime": 182.9075, + "eval_samples_per_second": 98.471, + "eval_steps_per_second": 6.156, + "step": 83000 + }, + { + "epoch": 8.938757937789259, + "grad_norm": 0.8560492992401123, + "learning_rate": 6.395647020795173e-05, + "loss": 3.1856, + "step": 83050 + }, + { + "epoch": 8.944139489828867, + "grad_norm": 0.8757853507995605, + "learning_rate": 6.363322917789031e-05, + "loss": 3.1807, + "step": 83100 + }, + { + "epoch": 8.949521041868476, + "grad_norm": 0.852837860584259, + "learning_rate": 6.330998814782889e-05, + "loss": 3.1919, + "step": 83150 + }, + { + "epoch": 8.954902593908082, + "grad_norm": 0.8725079298019409, + "learning_rate": 6.298674711776748e-05, + "loss": 3.1904, + "step": 83200 + }, + { + "epoch": 8.960284145947691, + "grad_norm": 0.8584096431732178, + "learning_rate": 6.266350608770606e-05, + "loss": 3.1921, + "step": 83250 + }, + { + "epoch": 8.9656656979873, + "grad_norm": 0.8362786769866943, + "learning_rate": 6.234026505764465e-05, + "loss": 3.1988, + "step": 83300 + }, + { + "epoch": 8.971047250026908, + "grad_norm": 0.8816782832145691, + "learning_rate": 6.201702402758323e-05, + "loss": 3.1755, + "step": 83350 + }, + { + "epoch": 8.976428802066517, + "grad_norm": 0.8913627862930298, + "learning_rate": 6.169378299752181e-05, + "loss": 3.1949, + "step": 83400 + }, + { + "epoch": 8.981810354106123, + "grad_norm": 0.8645463585853577, + "learning_rate": 6.13705419674604e-05, + "loss": 3.1884, + "step": 83450 + }, + { + "epoch": 8.987191906145732, + "grad_norm": 0.8139950633049011, + "learning_rate": 6.104730093739898e-05, + "loss": 3.1769, + "step": 83500 + }, + { + "epoch": 8.99257345818534, + "grad_norm": 0.9117967486381531, + "learning_rate": 6.073052472793879e-05, + "loss": 3.1918, + "step": 83550 + }, + { + "epoch": 8.997955010224949, + "grad_norm": 0.8582161664962769, + "learning_rate": 6.0407283697877384e-05, + "loss": 3.1887, + "step": 83600 + }, + { + "epoch": 9.003336562264558, + "grad_norm": 0.8494151830673218, + "learning_rate": 6.0084042667815966e-05, + "loss": 3.1343, + "step": 83650 + }, + { + "epoch": 9.008718114304166, + "grad_norm": 0.8479065299034119, + "learning_rate": 5.976080163775455e-05, + "loss": 3.1253, + "step": 83700 + }, + { + "epoch": 9.014099666343773, + "grad_norm": 0.8485672473907471, + "learning_rate": 5.9437560607693135e-05, + "loss": 3.1436, + "step": 83750 + }, + { + "epoch": 9.019481218383381, + "grad_norm": 0.8757447004318237, + "learning_rate": 5.9114319577631716e-05, + "loss": 3.1274, + "step": 83800 + }, + { + "epoch": 9.02486277042299, + "grad_norm": 0.8620525598526001, + "learning_rate": 5.8791078547570304e-05, + "loss": 3.1237, + "step": 83850 + }, + { + "epoch": 9.030244322462599, + "grad_norm": 0.8854354619979858, + "learning_rate": 5.8467837517508885e-05, + "loss": 3.1347, + "step": 83900 + }, + { + "epoch": 9.035625874502207, + "grad_norm": 0.8926529884338379, + "learning_rate": 5.8144596487447466e-05, + "loss": 3.1083, + "step": 83950 + }, + { + "epoch": 9.041007426541814, + "grad_norm": 0.8925355076789856, + "learning_rate": 5.7821355457386054e-05, + "loss": 3.126, + "step": 84000 + }, + { + "epoch": 9.041007426541814, + "eval_accuracy": 0.3923292799649182, + "eval_loss": 3.3203017711639404, + "eval_runtime": 182.8076, + "eval_samples_per_second": 98.524, + "eval_steps_per_second": 6.159, + "step": 84000 + }, + { + "epoch": 9.046388978581422, + "grad_norm": 0.8431273698806763, + "learning_rate": 5.7498114427324635e-05, + "loss": 3.1304, + "step": 84050 + }, + { + "epoch": 9.051770530621031, + "grad_norm": 0.8672769665718079, + "learning_rate": 5.717487339726322e-05, + "loss": 3.1198, + "step": 84100 + }, + { + "epoch": 9.05715208266064, + "grad_norm": 0.8500916957855225, + "learning_rate": 5.6851632367201804e-05, + "loss": 3.1346, + "step": 84150 + }, + { + "epoch": 9.062533634700248, + "grad_norm": 0.8439621329307556, + "learning_rate": 5.6528391337140385e-05, + "loss": 3.1345, + "step": 84200 + }, + { + "epoch": 9.067915186739857, + "grad_norm": 0.8591272234916687, + "learning_rate": 5.620515030707897e-05, + "loss": 3.1324, + "step": 84250 + }, + { + "epoch": 9.073296738779463, + "grad_norm": 0.8671176433563232, + "learning_rate": 5.5881909277017554e-05, + "loss": 3.1263, + "step": 84300 + }, + { + "epoch": 9.078678290819072, + "grad_norm": 0.8660874962806702, + "learning_rate": 5.555866824695614e-05, + "loss": 3.1316, + "step": 84350 + }, + { + "epoch": 9.08405984285868, + "grad_norm": 0.8906688690185547, + "learning_rate": 5.523542721689472e-05, + "loss": 3.1293, + "step": 84400 + }, + { + "epoch": 9.089441394898289, + "grad_norm": 0.8381503820419312, + "learning_rate": 5.4912186186833304e-05, + "loss": 3.1144, + "step": 84450 + }, + { + "epoch": 9.094822946937898, + "grad_norm": 0.8566581606864929, + "learning_rate": 5.45889451567719e-05, + "loss": 3.1277, + "step": 84500 + }, + { + "epoch": 9.100204498977504, + "grad_norm": 0.8410578966140747, + "learning_rate": 5.426570412671048e-05, + "loss": 3.1333, + "step": 84550 + }, + { + "epoch": 9.105586051017113, + "grad_norm": 0.8373113870620728, + "learning_rate": 5.394246309664907e-05, + "loss": 3.1343, + "step": 84600 + }, + { + "epoch": 9.110967603056721, + "grad_norm": 0.8358331322669983, + "learning_rate": 5.361922206658765e-05, + "loss": 3.1296, + "step": 84650 + }, + { + "epoch": 9.11634915509633, + "grad_norm": 0.8469173908233643, + "learning_rate": 5.329598103652623e-05, + "loss": 3.1223, + "step": 84700 + }, + { + "epoch": 9.121730707135939, + "grad_norm": 0.8513491153717041, + "learning_rate": 5.297274000646482e-05, + "loss": 3.1239, + "step": 84750 + }, + { + "epoch": 9.127112259175545, + "grad_norm": 0.8536253571510315, + "learning_rate": 5.26494989764034e-05, + "loss": 3.1249, + "step": 84800 + }, + { + "epoch": 9.132493811215154, + "grad_norm": 0.9170660376548767, + "learning_rate": 5.232625794634199e-05, + "loss": 3.1293, + "step": 84850 + }, + { + "epoch": 9.137875363254762, + "grad_norm": 0.8490689992904663, + "learning_rate": 5.200301691628057e-05, + "loss": 3.1382, + "step": 84900 + }, + { + "epoch": 9.143256915294371, + "grad_norm": 0.8707136511802673, + "learning_rate": 5.167977588621915e-05, + "loss": 3.1361, + "step": 84950 + }, + { + "epoch": 9.14863846733398, + "grad_norm": 0.8578998446464539, + "learning_rate": 5.135653485615774e-05, + "loss": 3.1376, + "step": 85000 + }, + { + "epoch": 9.14863846733398, + "eval_accuracy": 0.39240283796996855, + "eval_loss": 3.3188514709472656, + "eval_runtime": 182.6043, + "eval_samples_per_second": 98.634, + "eval_steps_per_second": 6.166, + "step": 85000 + }, + { + "epoch": 9.154020019373588, + "grad_norm": 0.8587227463722229, + "learning_rate": 5.103329382609632e-05, + "loss": 3.1398, + "step": 85050 + }, + { + "epoch": 9.159401571413195, + "grad_norm": 0.8671402931213379, + "learning_rate": 5.0710052796034906e-05, + "loss": 3.1134, + "step": 85100 + }, + { + "epoch": 9.164783123452803, + "grad_norm": 0.8717479109764099, + "learning_rate": 5.038681176597349e-05, + "loss": 3.116, + "step": 85150 + }, + { + "epoch": 9.170164675492412, + "grad_norm": 0.8798679709434509, + "learning_rate": 5.006357073591207e-05, + "loss": 3.1406, + "step": 85200 + }, + { + "epoch": 9.17554622753202, + "grad_norm": 0.9084823727607727, + "learning_rate": 4.974032970585066e-05, + "loss": 3.144, + "step": 85250 + }, + { + "epoch": 9.180927779571629, + "grad_norm": 0.8945959210395813, + "learning_rate": 4.9417088675789244e-05, + "loss": 3.1382, + "step": 85300 + }, + { + "epoch": 9.186309331611236, + "grad_norm": 0.8824502229690552, + "learning_rate": 4.909384764572783e-05, + "loss": 3.1076, + "step": 85350 + }, + { + "epoch": 9.191690883650844, + "grad_norm": 0.8436898589134216, + "learning_rate": 4.877060661566641e-05, + "loss": 3.1517, + "step": 85400 + }, + { + "epoch": 9.197072435690453, + "grad_norm": 0.8343037366867065, + "learning_rate": 4.8447365585604994e-05, + "loss": 3.1237, + "step": 85450 + }, + { + "epoch": 9.202453987730062, + "grad_norm": 0.9071884751319885, + "learning_rate": 4.812412455554358e-05, + "loss": 3.1321, + "step": 85500 + }, + { + "epoch": 9.20783553976967, + "grad_norm": 0.8292504549026489, + "learning_rate": 4.780088352548216e-05, + "loss": 3.1412, + "step": 85550 + }, + { + "epoch": 9.213217091809279, + "grad_norm": 0.8744844794273376, + "learning_rate": 4.7484107316021976e-05, + "loss": 3.1387, + "step": 85600 + }, + { + "epoch": 9.218598643848885, + "grad_norm": 0.8166503310203552, + "learning_rate": 4.7160866285960564e-05, + "loss": 3.1446, + "step": 85650 + }, + { + "epoch": 9.223980195888494, + "grad_norm": 0.8761563897132874, + "learning_rate": 4.6837625255899145e-05, + "loss": 3.1335, + "step": 85700 + }, + { + "epoch": 9.229361747928102, + "grad_norm": 0.9411799907684326, + "learning_rate": 4.6514384225837726e-05, + "loss": 3.1534, + "step": 85750 + }, + { + "epoch": 9.234743299967711, + "grad_norm": 0.9132449626922607, + "learning_rate": 4.6191143195776314e-05, + "loss": 3.127, + "step": 85800 + }, + { + "epoch": 9.24012485200732, + "grad_norm": 0.8338202834129333, + "learning_rate": 4.5867902165714895e-05, + "loss": 3.129, + "step": 85850 + }, + { + "epoch": 9.245506404046926, + "grad_norm": 0.8570502400398254, + "learning_rate": 4.554466113565348e-05, + "loss": 3.1349, + "step": 85900 + }, + { + "epoch": 9.250887956086535, + "grad_norm": 0.8650547862052917, + "learning_rate": 4.5221420105592064e-05, + "loss": 3.1229, + "step": 85950 + }, + { + "epoch": 9.256269508126143, + "grad_norm": 0.8735968470573425, + "learning_rate": 4.4898179075530645e-05, + "loss": 3.1377, + "step": 86000 + }, + { + "epoch": 9.256269508126143, + "eval_accuracy": 0.3926530655706053, + "eval_loss": 3.316767692565918, + "eval_runtime": 182.8243, + "eval_samples_per_second": 98.515, + "eval_steps_per_second": 6.159, + "step": 86000 + }, + { + "epoch": 9.261651060165752, + "grad_norm": 0.8486570715904236, + "learning_rate": 4.457493804546923e-05, + "loss": 3.1238, + "step": 86050 + }, + { + "epoch": 9.26703261220536, + "grad_norm": 0.8732266426086426, + "learning_rate": 4.4251697015407814e-05, + "loss": 3.1398, + "step": 86100 + }, + { + "epoch": 9.272414164244967, + "grad_norm": 0.8636999726295471, + "learning_rate": 4.392845598534641e-05, + "loss": 3.1303, + "step": 86150 + }, + { + "epoch": 9.277795716284576, + "grad_norm": 0.8534356355667114, + "learning_rate": 4.360521495528499e-05, + "loss": 3.156, + "step": 86200 + }, + { + "epoch": 9.283177268324184, + "grad_norm": 0.8626139163970947, + "learning_rate": 4.3281973925223564e-05, + "loss": 3.1201, + "step": 86250 + }, + { + "epoch": 9.288558820363793, + "grad_norm": 0.8573952317237854, + "learning_rate": 4.295873289516216e-05, + "loss": 3.1281, + "step": 86300 + }, + { + "epoch": 9.293940372403402, + "grad_norm": 0.8308889269828796, + "learning_rate": 4.263549186510074e-05, + "loss": 3.1453, + "step": 86350 + }, + { + "epoch": 9.29932192444301, + "grad_norm": 0.813922643661499, + "learning_rate": 4.231225083503933e-05, + "loss": 3.1252, + "step": 86400 + }, + { + "epoch": 9.304703476482617, + "grad_norm": 0.8177182078361511, + "learning_rate": 4.198900980497791e-05, + "loss": 3.1361, + "step": 86450 + }, + { + "epoch": 9.310085028522225, + "grad_norm": 0.8975428938865662, + "learning_rate": 4.167223359551772e-05, + "loss": 3.1266, + "step": 86500 + }, + { + "epoch": 9.315466580561834, + "grad_norm": 0.8596468567848206, + "learning_rate": 4.1348992565456303e-05, + "loss": 3.1277, + "step": 86550 + }, + { + "epoch": 9.320848132601443, + "grad_norm": 0.8630208969116211, + "learning_rate": 4.102575153539489e-05, + "loss": 3.1461, + "step": 86600 + }, + { + "epoch": 9.326229684641051, + "grad_norm": 0.9110468029975891, + "learning_rate": 4.070251050533347e-05, + "loss": 3.1253, + "step": 86650 + }, + { + "epoch": 9.331611236680658, + "grad_norm": 0.8691954612731934, + "learning_rate": 4.037926947527206e-05, + "loss": 3.1465, + "step": 86700 + }, + { + "epoch": 9.336992788720266, + "grad_norm": 0.8495018482208252, + "learning_rate": 4.005602844521064e-05, + "loss": 3.129, + "step": 86750 + }, + { + "epoch": 9.342374340759875, + "grad_norm": 0.8317940831184387, + "learning_rate": 3.973278741514922e-05, + "loss": 3.1439, + "step": 86800 + }, + { + "epoch": 9.347755892799483, + "grad_norm": 0.8267626166343689, + "learning_rate": 3.940954638508781e-05, + "loss": 3.1323, + "step": 86850 + }, + { + "epoch": 9.353137444839092, + "grad_norm": 0.8663368225097656, + "learning_rate": 3.908630535502639e-05, + "loss": 3.1397, + "step": 86900 + }, + { + "epoch": 9.3585189968787, + "grad_norm": 0.8476428389549255, + "learning_rate": 3.876306432496498e-05, + "loss": 3.132, + "step": 86950 + }, + { + "epoch": 9.363900548918307, + "grad_norm": 0.8747643232345581, + "learning_rate": 3.843982329490356e-05, + "loss": 3.1389, + "step": 87000 + }, + { + "epoch": 9.363900548918307, + "eval_accuracy": 0.3930124893234956, + "eval_loss": 3.3138458728790283, + "eval_runtime": 182.6258, + "eval_samples_per_second": 98.622, + "eval_steps_per_second": 6.166, + "step": 87000 + }, + { + "epoch": 9.369282100957916, + "grad_norm": 0.922262966632843, + "learning_rate": 3.811658226484214e-05, + "loss": 3.1331, + "step": 87050 + }, + { + "epoch": 9.374663652997524, + "grad_norm": 0.8760948181152344, + "learning_rate": 3.7793341234780736e-05, + "loss": 3.1383, + "step": 87100 + }, + { + "epoch": 9.380045205037133, + "grad_norm": 0.8842629790306091, + "learning_rate": 3.747010020471931e-05, + "loss": 3.1441, + "step": 87150 + }, + { + "epoch": 9.385426757076742, + "grad_norm": 0.885867178440094, + "learning_rate": 3.71468591746579e-05, + "loss": 3.1412, + "step": 87200 + }, + { + "epoch": 9.390808309116348, + "grad_norm": 0.814979076385498, + "learning_rate": 3.6823618144596486e-05, + "loss": 3.1407, + "step": 87250 + }, + { + "epoch": 9.396189861155957, + "grad_norm": 0.8274505734443665, + "learning_rate": 3.650037711453507e-05, + "loss": 3.1296, + "step": 87300 + }, + { + "epoch": 9.401571413195565, + "grad_norm": 0.8518266081809998, + "learning_rate": 3.6177136084473655e-05, + "loss": 3.115, + "step": 87350 + }, + { + "epoch": 9.406952965235174, + "grad_norm": 0.8622663021087646, + "learning_rate": 3.5853895054412236e-05, + "loss": 3.1405, + "step": 87400 + }, + { + "epoch": 9.412334517274783, + "grad_norm": 0.8241939544677734, + "learning_rate": 3.553065402435082e-05, + "loss": 3.1449, + "step": 87450 + }, + { + "epoch": 9.417716069314391, + "grad_norm": 0.8843784928321838, + "learning_rate": 3.5207412994289405e-05, + "loss": 3.1569, + "step": 87500 + }, + { + "epoch": 9.423097621353998, + "grad_norm": 0.906825840473175, + "learning_rate": 3.488417196422799e-05, + "loss": 3.1392, + "step": 87550 + }, + { + "epoch": 9.428479173393606, + "grad_norm": 0.8589437007904053, + "learning_rate": 3.4560930934166574e-05, + "loss": 3.1291, + "step": 87600 + }, + { + "epoch": 9.433860725433215, + "grad_norm": 0.8674544095993042, + "learning_rate": 3.4237689904105156e-05, + "loss": 3.1476, + "step": 87650 + }, + { + "epoch": 9.439242277472824, + "grad_norm": 0.8766100406646729, + "learning_rate": 3.3914448874043743e-05, + "loss": 3.1407, + "step": 87700 + }, + { + "epoch": 9.444623829512432, + "grad_norm": 0.8393718600273132, + "learning_rate": 3.3591207843982325e-05, + "loss": 3.1521, + "step": 87750 + }, + { + "epoch": 9.450005381552039, + "grad_norm": 0.8990697860717773, + "learning_rate": 3.326796681392091e-05, + "loss": 3.1242, + "step": 87800 + }, + { + "epoch": 9.455386933591647, + "grad_norm": 0.8910731077194214, + "learning_rate": 3.2944725783859494e-05, + "loss": 3.1614, + "step": 87850 + }, + { + "epoch": 9.460768485631256, + "grad_norm": 0.8416497111320496, + "learning_rate": 3.2621484753798075e-05, + "loss": 3.1444, + "step": 87900 + }, + { + "epoch": 9.466150037670864, + "grad_norm": 0.8401027321815491, + "learning_rate": 3.229824372373666e-05, + "loss": 3.1381, + "step": 87950 + }, + { + "epoch": 9.471531589710473, + "grad_norm": 0.8736851215362549, + "learning_rate": 3.197500269367525e-05, + "loss": 3.1323, + "step": 88000 + }, + { + "epoch": 9.471531589710473, + "eval_accuracy": 0.39324837474294083, + "eval_loss": 3.312713384628296, + "eval_runtime": 182.6668, + "eval_samples_per_second": 98.6, + "eval_steps_per_second": 6.164, + "step": 88000 + }, + { + "epoch": 9.476913141750082, + "grad_norm": 0.8908987045288086, + "learning_rate": 3.165176166361383e-05, + "loss": 3.1391, + "step": 88050 + }, + { + "epoch": 9.482294693789688, + "grad_norm": 0.8490933179855347, + "learning_rate": 3.132852063355242e-05, + "loss": 3.1512, + "step": 88100 + }, + { + "epoch": 9.487676245829297, + "grad_norm": 0.9195178151130676, + "learning_rate": 3.1005279603491e-05, + "loss": 3.1404, + "step": 88150 + }, + { + "epoch": 9.493057797868905, + "grad_norm": 0.8848748803138733, + "learning_rate": 3.068203857342958e-05, + "loss": 3.149, + "step": 88200 + }, + { + "epoch": 9.498439349908514, + "grad_norm": 0.86981201171875, + "learning_rate": 3.035879754336817e-05, + "loss": 3.1165, + "step": 88250 + }, + { + "epoch": 9.503820901948123, + "grad_norm": 0.9147974848747253, + "learning_rate": 3.0035556513306754e-05, + "loss": 3.1424, + "step": 88300 + }, + { + "epoch": 9.50920245398773, + "grad_norm": 0.8942081332206726, + "learning_rate": 2.971231548324534e-05, + "loss": 3.1351, + "step": 88350 + }, + { + "epoch": 9.514584006027338, + "grad_norm": 0.8392196893692017, + "learning_rate": 2.938907445318392e-05, + "loss": 3.1381, + "step": 88400 + }, + { + "epoch": 9.519965558066946, + "grad_norm": 0.8911840915679932, + "learning_rate": 2.9065833423122504e-05, + "loss": 3.1417, + "step": 88450 + }, + { + "epoch": 9.525347110106555, + "grad_norm": 0.8524636626243591, + "learning_rate": 2.874259239306109e-05, + "loss": 3.1324, + "step": 88500 + }, + { + "epoch": 9.530728662146164, + "grad_norm": 0.8815006017684937, + "learning_rate": 2.8419351362999673e-05, + "loss": 3.1403, + "step": 88550 + }, + { + "epoch": 9.536110214185772, + "grad_norm": 0.8821752667427063, + "learning_rate": 2.809611033293826e-05, + "loss": 3.1315, + "step": 88600 + }, + { + "epoch": 9.541491766225379, + "grad_norm": 0.9411100745201111, + "learning_rate": 2.7772869302876845e-05, + "loss": 3.1516, + "step": 88650 + }, + { + "epoch": 9.546873318264987, + "grad_norm": 0.851620078086853, + "learning_rate": 2.7449628272815427e-05, + "loss": 3.1355, + "step": 88700 + }, + { + "epoch": 9.552254870304596, + "grad_norm": 0.8492972254753113, + "learning_rate": 2.712638724275401e-05, + "loss": 3.1426, + "step": 88750 + }, + { + "epoch": 9.557636422344205, + "grad_norm": 0.8770946860313416, + "learning_rate": 2.6803146212692596e-05, + "loss": 3.1521, + "step": 88800 + }, + { + "epoch": 9.563017974383813, + "grad_norm": 0.8662365078926086, + "learning_rate": 2.647990518263118e-05, + "loss": 3.1437, + "step": 88850 + }, + { + "epoch": 9.56839952642342, + "grad_norm": 0.8437976837158203, + "learning_rate": 2.6156664152569765e-05, + "loss": 3.1468, + "step": 88900 + }, + { + "epoch": 9.573781078463028, + "grad_norm": 0.8942129015922546, + "learning_rate": 2.5833423122508346e-05, + "loss": 3.1457, + "step": 88950 + }, + { + "epoch": 9.579162630502637, + "grad_norm": 0.9466322064399719, + "learning_rate": 2.5510182092446934e-05, + "loss": 3.1385, + "step": 89000 + }, + { + "epoch": 9.579162630502637, + "eval_accuracy": 0.39364148089105366, + "eval_loss": 3.309281587600708, + "eval_runtime": 182.7896, + "eval_samples_per_second": 98.534, + "eval_steps_per_second": 6.16, + "step": 89000 + }, + { + "epoch": 9.584544182542245, + "grad_norm": 0.9119086265563965, + "learning_rate": 2.5186941062385518e-05, + "loss": 3.1391, + "step": 89050 + }, + { + "epoch": 9.589925734581854, + "grad_norm": 0.9008535146713257, + "learning_rate": 2.4863700032324103e-05, + "loss": 3.1422, + "step": 89100 + }, + { + "epoch": 9.59530728662146, + "grad_norm": 0.8503806591033936, + "learning_rate": 2.4540459002262687e-05, + "loss": 3.151, + "step": 89150 + }, + { + "epoch": 9.60068883866107, + "grad_norm": 0.9381389617919922, + "learning_rate": 2.4217217972201268e-05, + "loss": 3.1439, + "step": 89200 + }, + { + "epoch": 9.606070390700678, + "grad_norm": 0.8869844079017639, + "learning_rate": 2.3893976942139853e-05, + "loss": 3.1347, + "step": 89250 + }, + { + "epoch": 9.611451942740286, + "grad_norm": 0.8782930374145508, + "learning_rate": 2.3570735912078437e-05, + "loss": 3.1562, + "step": 89300 + }, + { + "epoch": 9.616833494779895, + "grad_norm": 0.9496018290519714, + "learning_rate": 2.324749488201702e-05, + "loss": 3.1202, + "step": 89350 + }, + { + "epoch": 9.622215046819504, + "grad_norm": 0.8809753060340881, + "learning_rate": 2.292425385195561e-05, + "loss": 3.136, + "step": 89400 + }, + { + "epoch": 9.62759659885911, + "grad_norm": 0.8729392290115356, + "learning_rate": 2.260101282189419e-05, + "loss": 3.1371, + "step": 89450 + }, + { + "epoch": 9.632978150898719, + "grad_norm": 0.881260871887207, + "learning_rate": 2.2277771791832775e-05, + "loss": 3.1288, + "step": 89500 + }, + { + "epoch": 9.638359702938327, + "grad_norm": 0.8699634671211243, + "learning_rate": 2.195453076177136e-05, + "loss": 3.1467, + "step": 89550 + }, + { + "epoch": 9.643741254977936, + "grad_norm": 0.8865028619766235, + "learning_rate": 2.1631289731709944e-05, + "loss": 3.133, + "step": 89600 + }, + { + "epoch": 9.649122807017545, + "grad_norm": 0.8839409351348877, + "learning_rate": 2.130804870164853e-05, + "loss": 3.1223, + "step": 89650 + }, + { + "epoch": 9.654504359057151, + "grad_norm": 0.9245901703834534, + "learning_rate": 2.098480767158711e-05, + "loss": 3.1552, + "step": 89700 + }, + { + "epoch": 9.65988591109676, + "grad_norm": 0.8770548105239868, + "learning_rate": 2.0661566641525694e-05, + "loss": 3.131, + "step": 89750 + }, + { + "epoch": 9.665267463136368, + "grad_norm": 0.8477882146835327, + "learning_rate": 2.0338325611464282e-05, + "loss": 3.1385, + "step": 89800 + }, + { + "epoch": 9.670649015175977, + "grad_norm": 0.8711922764778137, + "learning_rate": 2.0015084581402867e-05, + "loss": 3.146, + "step": 89850 + }, + { + "epoch": 9.676030567215586, + "grad_norm": 0.8607136607170105, + "learning_rate": 1.969184355134145e-05, + "loss": 3.1414, + "step": 89900 + }, + { + "epoch": 9.681412119255192, + "grad_norm": 0.8953176736831665, + "learning_rate": 1.9368602521280032e-05, + "loss": 3.1304, + "step": 89950 + }, + { + "epoch": 9.6867936712948, + "grad_norm": 0.8567274212837219, + "learning_rate": 1.9045361491218617e-05, + "loss": 3.1358, + "step": 90000 + }, + { + "epoch": 9.6867936712948, + "eval_accuracy": 0.3937655624889378, + "eval_loss": 3.3076210021972656, + "eval_runtime": 182.9314, + "eval_samples_per_second": 98.458, + "eval_steps_per_second": 6.155, + "step": 90000 + }, + { + "epoch": 9.69217522333441, + "grad_norm": 0.8794586062431335, + "learning_rate": 1.87221204611572e-05, + "loss": 3.1459, + "step": 90050 + }, + { + "epoch": 9.697556775374018, + "grad_norm": 0.8762345314025879, + "learning_rate": 1.8398879431095786e-05, + "loss": 3.1327, + "step": 90100 + }, + { + "epoch": 9.702938327413626, + "grad_norm": 0.9184940457344055, + "learning_rate": 1.807563840103437e-05, + "loss": 3.1297, + "step": 90150 + }, + { + "epoch": 9.708319879453235, + "grad_norm": 0.8900254964828491, + "learning_rate": 1.7752397370972955e-05, + "loss": 3.1323, + "step": 90200 + }, + { + "epoch": 9.713701431492842, + "grad_norm": 0.8552128076553345, + "learning_rate": 1.742915634091154e-05, + "loss": 3.1414, + "step": 90250 + }, + { + "epoch": 9.71908298353245, + "grad_norm": 0.84034264087677, + "learning_rate": 1.7105915310850124e-05, + "loss": 3.1346, + "step": 90300 + }, + { + "epoch": 9.724464535572059, + "grad_norm": 0.8617538213729858, + "learning_rate": 1.6782674280788708e-05, + "loss": 3.1243, + "step": 90350 + }, + { + "epoch": 9.729846087611667, + "grad_norm": 0.9199374318122864, + "learning_rate": 1.645943325072729e-05, + "loss": 3.1422, + "step": 90400 + }, + { + "epoch": 9.735227639651276, + "grad_norm": 0.8754702210426331, + "learning_rate": 1.6136192220665877e-05, + "loss": 3.1472, + "step": 90450 + }, + { + "epoch": 9.740609191690883, + "grad_norm": 0.9364842772483826, + "learning_rate": 1.5819416011205687e-05, + "loss": 3.1309, + "step": 90500 + }, + { + "epoch": 9.745990743730491, + "grad_norm": 0.8289980292320251, + "learning_rate": 1.549617498114427e-05, + "loss": 3.1472, + "step": 90550 + }, + { + "epoch": 9.7513722957701, + "grad_norm": 0.8661265969276428, + "learning_rate": 1.5172933951082856e-05, + "loss": 3.1371, + "step": 90600 + }, + { + "epoch": 9.756753847809708, + "grad_norm": 0.8728821873664856, + "learning_rate": 1.4849692921021439e-05, + "loss": 3.1438, + "step": 90650 + }, + { + "epoch": 9.762135399849317, + "grad_norm": 0.8967341780662537, + "learning_rate": 1.4526451890960025e-05, + "loss": 3.1279, + "step": 90700 + }, + { + "epoch": 9.767516951888926, + "grad_norm": 0.8679084181785583, + "learning_rate": 1.420321086089861e-05, + "loss": 3.1178, + "step": 90750 + }, + { + "epoch": 9.772898503928532, + "grad_norm": 0.8503843545913696, + "learning_rate": 1.3879969830837192e-05, + "loss": 3.1446, + "step": 90800 + }, + { + "epoch": 9.77828005596814, + "grad_norm": 0.918580949306488, + "learning_rate": 1.3556728800775778e-05, + "loss": 3.13, + "step": 90850 + }, + { + "epoch": 9.78366160800775, + "grad_norm": 0.8387547135353088, + "learning_rate": 1.3233487770714361e-05, + "loss": 3.1473, + "step": 90900 + }, + { + "epoch": 9.789043160047358, + "grad_norm": 0.8897818326950073, + "learning_rate": 1.2910246740652946e-05, + "loss": 3.1323, + "step": 90950 + }, + { + "epoch": 9.794424712086967, + "grad_norm": 0.8836583495140076, + "learning_rate": 1.258700571059153e-05, + "loss": 3.1303, + "step": 91000 + }, + { + "epoch": 9.794424712086967, + "eval_accuracy": 0.3940082930403153, + "eval_loss": 3.3052875995635986, + "eval_runtime": 182.9499, + "eval_samples_per_second": 98.448, + "eval_steps_per_second": 6.155, + "step": 91000 + }, + { + "epoch": 9.799806264126573, + "grad_norm": 0.8632960319519043, + "learning_rate": 1.2263764680530113e-05, + "loss": 3.1429, + "step": 91050 + }, + { + "epoch": 9.805187816166182, + "grad_norm": 0.8691814541816711, + "learning_rate": 1.19405236504687e-05, + "loss": 3.1271, + "step": 91100 + }, + { + "epoch": 9.81056936820579, + "grad_norm": 0.8277129530906677, + "learning_rate": 1.1617282620407282e-05, + "loss": 3.1307, + "step": 91150 + }, + { + "epoch": 9.815950920245399, + "grad_norm": 0.8648361563682556, + "learning_rate": 1.1294041590345867e-05, + "loss": 3.1471, + "step": 91200 + }, + { + "epoch": 9.821332472285007, + "grad_norm": 0.8396468758583069, + "learning_rate": 1.0970800560284453e-05, + "loss": 3.1444, + "step": 91250 + }, + { + "epoch": 9.826714024324616, + "grad_norm": 0.8713970184326172, + "learning_rate": 1.0647559530223035e-05, + "loss": 3.1379, + "step": 91300 + }, + { + "epoch": 9.832095576364223, + "grad_norm": 0.8426269292831421, + "learning_rate": 1.032431850016162e-05, + "loss": 3.1476, + "step": 91350 + }, + { + "epoch": 9.837477128403831, + "grad_norm": 0.9147149324417114, + "learning_rate": 1.0001077470100203e-05, + "loss": 3.1378, + "step": 91400 + }, + { + "epoch": 9.84285868044344, + "grad_norm": 0.8848015069961548, + "learning_rate": 9.677836440038787e-06, + "loss": 3.1497, + "step": 91450 + }, + { + "epoch": 9.848240232483048, + "grad_norm": 0.9187755584716797, + "learning_rate": 9.354595409977372e-06, + "loss": 3.1406, + "step": 91500 + }, + { + "epoch": 9.853621784522657, + "grad_norm": 0.8338111639022827, + "learning_rate": 9.031354379915956e-06, + "loss": 3.1277, + "step": 91550 + }, + { + "epoch": 9.859003336562264, + "grad_norm": 0.880181074142456, + "learning_rate": 8.70811334985454e-06, + "loss": 3.1482, + "step": 91600 + }, + { + "epoch": 9.864384888601872, + "grad_norm": 0.8883482217788696, + "learning_rate": 8.384872319793125e-06, + "loss": 3.1256, + "step": 91650 + }, + { + "epoch": 9.869766440641481, + "grad_norm": 0.8639237880706787, + "learning_rate": 8.06163128973171e-06, + "loss": 3.1349, + "step": 91700 + }, + { + "epoch": 9.87514799268109, + "grad_norm": 0.8801106810569763, + "learning_rate": 7.738390259670293e-06, + "loss": 3.1469, + "step": 91750 + }, + { + "epoch": 9.880529544720698, + "grad_norm": 0.9389435648918152, + "learning_rate": 7.415149229608878e-06, + "loss": 3.1366, + "step": 91800 + }, + { + "epoch": 9.885911096760307, + "grad_norm": 0.8664288520812988, + "learning_rate": 7.091908199547462e-06, + "loss": 3.1287, + "step": 91850 + }, + { + "epoch": 9.891292648799913, + "grad_norm": 0.8594654202461243, + "learning_rate": 6.768667169486046e-06, + "loss": 3.1367, + "step": 91900 + }, + { + "epoch": 9.896674200839522, + "grad_norm": 0.852776288986206, + "learning_rate": 6.4454261394246305e-06, + "loss": 3.15, + "step": 91950 + }, + { + "epoch": 9.90205575287913, + "grad_norm": 0.871873676776886, + "learning_rate": 6.122185109363214e-06, + "loss": 3.1412, + "step": 92000 + }, + { + "epoch": 9.90205575287913, + "eval_accuracy": 0.3942072364779036, + "eval_loss": 3.303832769393921, + "eval_runtime": 183.128, + "eval_samples_per_second": 98.352, + "eval_steps_per_second": 6.149, + "step": 92000 + }, + { + "epoch": 9.907437304918739, + "grad_norm": 0.8564249277114868, + "learning_rate": 5.7989440793017995e-06, + "loss": 3.1272, + "step": 92050 + }, + { + "epoch": 9.912818856958348, + "grad_norm": 0.8567243814468384, + "learning_rate": 5.475703049240383e-06, + "loss": 3.1432, + "step": 92100 + }, + { + "epoch": 9.918200408997954, + "grad_norm": 0.8726806044578552, + "learning_rate": 5.152462019178968e-06, + "loss": 3.1318, + "step": 92150 + }, + { + "epoch": 9.923581961037563, + "grad_norm": 0.881909191608429, + "learning_rate": 4.829220989117551e-06, + "loss": 3.1186, + "step": 92200 + }, + { + "epoch": 9.928963513077171, + "grad_norm": 0.8314427137374878, + "learning_rate": 4.505979959056136e-06, + "loss": 3.1237, + "step": 92250 + }, + { + "epoch": 9.93434506511678, + "grad_norm": 0.8411848545074463, + "learning_rate": 4.18273892899472e-06, + "loss": 3.1282, + "step": 92300 + }, + { + "epoch": 9.939726617156388, + "grad_norm": 0.9116420745849609, + "learning_rate": 3.859497898933305e-06, + "loss": 3.1365, + "step": 92350 + }, + { + "epoch": 9.945108169195997, + "grad_norm": 0.8482275009155273, + "learning_rate": 3.5362568688718884e-06, + "loss": 3.1371, + "step": 92400 + }, + { + "epoch": 9.950489721235604, + "grad_norm": 0.8526927828788757, + "learning_rate": 3.2194806594117013e-06, + "loss": 3.1521, + "step": 92450 + }, + { + "epoch": 9.955871273275212, + "grad_norm": 0.8895639777183533, + "learning_rate": 2.8962396293502854e-06, + "loss": 3.1432, + "step": 92500 + }, + { + "epoch": 9.961252825314821, + "grad_norm": 0.8692976832389832, + "learning_rate": 2.5729985992888694e-06, + "loss": 3.148, + "step": 92550 + }, + { + "epoch": 9.96663437735443, + "grad_norm": 0.8431194424629211, + "learning_rate": 2.249757569227454e-06, + "loss": 3.1493, + "step": 92600 + }, + { + "epoch": 9.972015929394038, + "grad_norm": 0.8579978346824646, + "learning_rate": 1.926516539166038e-06, + "loss": 3.1416, + "step": 92650 + }, + { + "epoch": 9.977397481433645, + "grad_norm": 0.8592678308486938, + "learning_rate": 1.603275509104622e-06, + "loss": 3.1405, + "step": 92700 + }, + { + "epoch": 9.982779033473253, + "grad_norm": 0.8317117691040039, + "learning_rate": 1.2800344790432064e-06, + "loss": 3.1257, + "step": 92750 + }, + { + "epoch": 9.988160585512862, + "grad_norm": 0.8965446352958679, + "learning_rate": 9.567934489817906e-07, + "loss": 3.1337, + "step": 92800 + }, + { + "epoch": 9.99354213755247, + "grad_norm": 0.8839283585548401, + "learning_rate": 6.335524189203748e-07, + "loss": 3.1165, + "step": 92850 + }, + { + "epoch": 9.998923689592079, + "grad_norm": 0.8802606463432312, + "learning_rate": 3.103113888589591e-07, + "loss": 3.1291, + "step": 92900 + }, + { + "epoch": 10.0, + "step": 92910, + "total_flos": 7.7681859821568e+17, + "train_loss": 3.460850506426251, + "train_runtime": 80822.0047, + "train_samples_per_second": 36.784, + "train_steps_per_second": 1.15 + } + ], + "logging_steps": 50, + "max_steps": 92910, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.7681859821568e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}