{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8779360800924143, "eval_steps": 300, "global_step": 5700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00030804774740084714, "grad_norm": 1.6617624759674072, "learning_rate": 9.999997658238351e-05, "loss": 1.4818, "step": 2 }, { "epoch": 0.0006160954948016943, "grad_norm": 0.8753126263618469, "learning_rate": 9.9999906329556e-05, "loss": 1.3083, "step": 4 }, { "epoch": 0.0009241432422025414, "grad_norm": 1.0605000257492065, "learning_rate": 9.999978924158326e-05, "loss": 1.5628, "step": 6 }, { "epoch": 0.0012321909896033886, "grad_norm": 0.9619163870811462, "learning_rate": 9.999962531857496e-05, "loss": 1.3115, "step": 8 }, { "epoch": 0.0015402387370042356, "grad_norm": 2.4611310958862305, "learning_rate": 9.999941456068467e-05, "loss": 1.3254, "step": 10 }, { "epoch": 0.0018482864844050829, "grad_norm": 1.5140900611877441, "learning_rate": 9.999915696810979e-05, "loss": 1.7101, "step": 12 }, { "epoch": 0.00215633423180593, "grad_norm": 0.9794526696205139, "learning_rate": 9.999885254109161e-05, "loss": 1.439, "step": 14 }, { "epoch": 0.002464381979206777, "grad_norm": 1.0064067840576172, "learning_rate": 9.99985012799153e-05, "loss": 1.3107, "step": 16 }, { "epoch": 0.002772429726607624, "grad_norm": 1.075746774673462, "learning_rate": 9.999810318490988e-05, "loss": 1.8331, "step": 18 }, { "epoch": 0.0030804774740084712, "grad_norm": 0.9585892558097839, "learning_rate": 9.999765825644824e-05, "loss": 1.3932, "step": 20 }, { "epoch": 0.0033885252214093183, "grad_norm": 0.7582125067710876, "learning_rate": 9.999716649494715e-05, "loss": 1.1928, "step": 22 }, { "epoch": 0.0036965729688101657, "grad_norm": 0.7594268918037415, "learning_rate": 9.999662790086726e-05, "loss": 1.3234, "step": 24 }, { "epoch": 0.004004620716211013, "grad_norm": 0.8150789141654968, "learning_rate": 9.999604247471306e-05, "loss": 1.3775, "step": 26 }, { "epoch": 0.00431266846361186, "grad_norm": 0.7926575541496277, "learning_rate": 9.999541021703292e-05, "loss": 1.2174, "step": 28 }, { "epoch": 0.004620716211012707, "grad_norm": 0.9279683232307434, "learning_rate": 9.999473112841908e-05, "loss": 1.32, "step": 30 }, { "epoch": 0.004928763958413554, "grad_norm": 0.7790488004684448, "learning_rate": 9.999400520950766e-05, "loss": 1.4515, "step": 32 }, { "epoch": 0.005236811705814401, "grad_norm": 1.0332311391830444, "learning_rate": 9.99932324609786e-05, "loss": 1.3014, "step": 34 }, { "epoch": 0.005544859453215248, "grad_norm": 0.9211418628692627, "learning_rate": 9.999241288355577e-05, "loss": 1.2203, "step": 36 }, { "epoch": 0.005852907200616096, "grad_norm": 0.708522379398346, "learning_rate": 9.999154647800686e-05, "loss": 1.2018, "step": 38 }, { "epoch": 0.0061609549480169425, "grad_norm": 1.3211787939071655, "learning_rate": 9.999063324514344e-05, "loss": 0.9897, "step": 40 }, { "epoch": 0.00646900269541779, "grad_norm": 0.7706666588783264, "learning_rate": 9.998967318582092e-05, "loss": 1.2864, "step": 42 }, { "epoch": 0.0067770504428186365, "grad_norm": 0.857083797454834, "learning_rate": 9.998866630093861e-05, "loss": 1.1717, "step": 44 }, { "epoch": 0.007085098190219484, "grad_norm": 0.5936506390571594, "learning_rate": 9.998761259143967e-05, "loss": 1.1459, "step": 46 }, { "epoch": 0.0073931459376203315, "grad_norm": 0.6077166795730591, "learning_rate": 9.99865120583111e-05, "loss": 1.2088, "step": 48 }, { "epoch": 0.007701193685021178, "grad_norm": 2.3327157497406006, "learning_rate": 9.998536470258378e-05, "loss": 1.9314, "step": 50 }, { "epoch": 0.008009241432422026, "grad_norm": 0.8980337381362915, "learning_rate": 9.998417052533244e-05, "loss": 1.1304, "step": 52 }, { "epoch": 0.008317289179822872, "grad_norm": 0.7357211709022522, "learning_rate": 9.998292952767569e-05, "loss": 1.1089, "step": 54 }, { "epoch": 0.00862533692722372, "grad_norm": 0.7480604648590088, "learning_rate": 9.998164171077595e-05, "loss": 1.3085, "step": 56 }, { "epoch": 0.008933384674624567, "grad_norm": 0.7059953808784485, "learning_rate": 9.998030707583953e-05, "loss": 1.2016, "step": 58 }, { "epoch": 0.009241432422025414, "grad_norm": 0.8867676258087158, "learning_rate": 9.99789256241166e-05, "loss": 1.263, "step": 60 }, { "epoch": 0.00954948016942626, "grad_norm": 0.5447019338607788, "learning_rate": 9.997749735690117e-05, "loss": 1.1203, "step": 62 }, { "epoch": 0.009857527916827109, "grad_norm": 1.0171029567718506, "learning_rate": 9.997602227553112e-05, "loss": 1.2449, "step": 64 }, { "epoch": 0.010165575664227955, "grad_norm": 0.6860587000846863, "learning_rate": 9.997450038138811e-05, "loss": 1.1124, "step": 66 }, { "epoch": 0.010473623411628802, "grad_norm": 0.7293751835823059, "learning_rate": 9.997293167589778e-05, "loss": 1.2182, "step": 68 }, { "epoch": 0.01078167115902965, "grad_norm": 0.8685497045516968, "learning_rate": 9.997131616052949e-05, "loss": 1.3284, "step": 70 }, { "epoch": 0.011089718906430497, "grad_norm": 0.662895917892456, "learning_rate": 9.99696538367965e-05, "loss": 1.0932, "step": 72 }, { "epoch": 0.011397766653831343, "grad_norm": 0.8935220241546631, "learning_rate": 9.996794470625597e-05, "loss": 1.4038, "step": 74 }, { "epoch": 0.011705814401232192, "grad_norm": 0.6189644932746887, "learning_rate": 9.996618877050878e-05, "loss": 1.1177, "step": 76 }, { "epoch": 0.012013862148633038, "grad_norm": 0.861652135848999, "learning_rate": 9.996438603119978e-05, "loss": 1.2163, "step": 78 }, { "epoch": 0.012321909896033885, "grad_norm": 0.6359105110168457, "learning_rate": 9.996253649001759e-05, "loss": 1.0015, "step": 80 }, { "epoch": 0.012629957643434732, "grad_norm": 0.8358225226402283, "learning_rate": 9.996064014869466e-05, "loss": 1.0248, "step": 82 }, { "epoch": 0.01293800539083558, "grad_norm": 1.0179784297943115, "learning_rate": 9.995869700900732e-05, "loss": 1.2234, "step": 84 }, { "epoch": 0.013246053138236426, "grad_norm": 0.7293205261230469, "learning_rate": 9.995670707277571e-05, "loss": 1.2366, "step": 86 }, { "epoch": 0.013554100885637273, "grad_norm": 0.5179591178894043, "learning_rate": 9.995467034186383e-05, "loss": 1.0338, "step": 88 }, { "epoch": 0.013862148633038121, "grad_norm": 0.6510319113731384, "learning_rate": 9.995258681817948e-05, "loss": 1.1327, "step": 90 }, { "epoch": 0.014170196380438968, "grad_norm": 0.6923282146453857, "learning_rate": 9.995045650367432e-05, "loss": 1.3055, "step": 92 }, { "epoch": 0.014478244127839815, "grad_norm": 0.7597566843032837, "learning_rate": 9.994827940034379e-05, "loss": 1.5757, "step": 94 }, { "epoch": 0.014786291875240663, "grad_norm": 0.7768942713737488, "learning_rate": 9.994605551022724e-05, "loss": 1.4926, "step": 96 }, { "epoch": 0.01509433962264151, "grad_norm": 0.7350413203239441, "learning_rate": 9.994378483540778e-05, "loss": 1.2505, "step": 98 }, { "epoch": 0.015402387370042356, "grad_norm": 0.7640051245689392, "learning_rate": 9.994146737801235e-05, "loss": 1.2688, "step": 100 }, { "epoch": 0.015710435117443203, "grad_norm": 4.0233635902404785, "learning_rate": 9.993910314021172e-05, "loss": 2.1171, "step": 102 }, { "epoch": 0.01601848286484405, "grad_norm": 0.6492887139320374, "learning_rate": 9.99366921242205e-05, "loss": 1.0807, "step": 104 }, { "epoch": 0.0163265306122449, "grad_norm": 0.7625781893730164, "learning_rate": 9.99342343322971e-05, "loss": 1.0053, "step": 106 }, { "epoch": 0.016634578359645744, "grad_norm": 2.026451587677002, "learning_rate": 9.993172976674374e-05, "loss": 1.36, "step": 108 }, { "epoch": 0.016942626107046593, "grad_norm": 0.490477979183197, "learning_rate": 9.992917842990645e-05, "loss": 0.8772, "step": 110 }, { "epoch": 0.01725067385444744, "grad_norm": 0.7898009419441223, "learning_rate": 9.99265803241751e-05, "loss": 1.0414, "step": 112 }, { "epoch": 0.017558721601848286, "grad_norm": 1.5593082904815674, "learning_rate": 9.992393545198332e-05, "loss": 2.5199, "step": 114 }, { "epoch": 0.017866769349249134, "grad_norm": 0.9973122477531433, "learning_rate": 9.992124381580859e-05, "loss": 1.1275, "step": 116 }, { "epoch": 0.01817481709664998, "grad_norm": 0.5887067914009094, "learning_rate": 9.991850541817219e-05, "loss": 0.9591, "step": 118 }, { "epoch": 0.018482864844050827, "grad_norm": 0.6366308927536011, "learning_rate": 9.991572026163916e-05, "loss": 1.0398, "step": 120 }, { "epoch": 0.018790912591451676, "grad_norm": 1.0203287601470947, "learning_rate": 9.991288834881839e-05, "loss": 1.4016, "step": 122 }, { "epoch": 0.01909896033885252, "grad_norm": 0.9442655444145203, "learning_rate": 9.991000968236255e-05, "loss": 1.2937, "step": 124 }, { "epoch": 0.01940700808625337, "grad_norm": 1.4322446584701538, "learning_rate": 9.990708426496808e-05, "loss": 1.6756, "step": 126 }, { "epoch": 0.019715055833654217, "grad_norm": 0.6030606627464294, "learning_rate": 9.990411209937524e-05, "loss": 1.2115, "step": 128 }, { "epoch": 0.020023103581055062, "grad_norm": 0.7634426951408386, "learning_rate": 9.990109318836809e-05, "loss": 1.1997, "step": 130 }, { "epoch": 0.02033115132845591, "grad_norm": 0.6622510552406311, "learning_rate": 9.989802753477443e-05, "loss": 1.3292, "step": 132 }, { "epoch": 0.02063919907585676, "grad_norm": 0.7911828756332397, "learning_rate": 9.989491514146589e-05, "loss": 1.1769, "step": 134 }, { "epoch": 0.020947246823257604, "grad_norm": 0.8692740201950073, "learning_rate": 9.989175601135786e-05, "loss": 1.2349, "step": 136 }, { "epoch": 0.021255294570658452, "grad_norm": 0.5944163203239441, "learning_rate": 9.988855014740951e-05, "loss": 1.1596, "step": 138 }, { "epoch": 0.0215633423180593, "grad_norm": 0.9593337774276733, "learning_rate": 9.988529755262379e-05, "loss": 1.1337, "step": 140 }, { "epoch": 0.021871390065460145, "grad_norm": 0.7589074373245239, "learning_rate": 9.988199823004741e-05, "loss": 1.1172, "step": 142 }, { "epoch": 0.022179437812860994, "grad_norm": 0.6502323746681213, "learning_rate": 9.987865218277088e-05, "loss": 1.1022, "step": 144 }, { "epoch": 0.022487485560261842, "grad_norm": 0.8099236488342285, "learning_rate": 9.987525941392844e-05, "loss": 1.1416, "step": 146 }, { "epoch": 0.022795533307662687, "grad_norm": 0.9074066877365112, "learning_rate": 9.987181992669812e-05, "loss": 1.3035, "step": 148 }, { "epoch": 0.023103581055063535, "grad_norm": 0.7978343963623047, "learning_rate": 9.98683337243017e-05, "loss": 0.9997, "step": 150 }, { "epoch": 0.023411628802464383, "grad_norm": 0.5891391634941101, "learning_rate": 9.986480081000474e-05, "loss": 0.9937, "step": 152 }, { "epoch": 0.02371967654986523, "grad_norm": 0.747765302658081, "learning_rate": 9.986122118711651e-05, "loss": 1.0666, "step": 154 }, { "epoch": 0.024027724297266077, "grad_norm": 0.7870962619781494, "learning_rate": 9.985759485899009e-05, "loss": 1.0771, "step": 156 }, { "epoch": 0.024335772044666925, "grad_norm": 0.7572788596153259, "learning_rate": 9.985392182902225e-05, "loss": 1.2335, "step": 158 }, { "epoch": 0.02464381979206777, "grad_norm": 1.1162936687469482, "learning_rate": 9.985020210065353e-05, "loss": 1.1569, "step": 160 }, { "epoch": 0.024951867539468618, "grad_norm": 0.7760798335075378, "learning_rate": 9.984643567736824e-05, "loss": 1.5558, "step": 162 }, { "epoch": 0.025259915286869463, "grad_norm": 0.7158915996551514, "learning_rate": 9.984262256269441e-05, "loss": 1.1885, "step": 164 }, { "epoch": 0.02556796303427031, "grad_norm": 0.9121390581130981, "learning_rate": 9.983876276020378e-05, "loss": 1.3927, "step": 166 }, { "epoch": 0.02587601078167116, "grad_norm": 0.5669654607772827, "learning_rate": 9.983485627351187e-05, "loss": 1.0516, "step": 168 }, { "epoch": 0.026184058529072005, "grad_norm": 0.816312313079834, "learning_rate": 9.983090310627787e-05, "loss": 1.1956, "step": 170 }, { "epoch": 0.026492106276472853, "grad_norm": 0.5462418794631958, "learning_rate": 9.982690326220477e-05, "loss": 1.2407, "step": 172 }, { "epoch": 0.0268001540238737, "grad_norm": 0.9155438542366028, "learning_rate": 9.98228567450392e-05, "loss": 1.0894, "step": 174 }, { "epoch": 0.027108201771274546, "grad_norm": 0.6453492045402527, "learning_rate": 9.98187635585716e-05, "loss": 1.1709, "step": 176 }, { "epoch": 0.027416249518675394, "grad_norm": 0.8367354869842529, "learning_rate": 9.981462370663604e-05, "loss": 1.122, "step": 178 }, { "epoch": 0.027724297266076243, "grad_norm": 0.657660961151123, "learning_rate": 9.981043719311034e-05, "loss": 1.3573, "step": 180 }, { "epoch": 0.028032345013477088, "grad_norm": 0.7809942960739136, "learning_rate": 9.980620402191603e-05, "loss": 1.174, "step": 182 }, { "epoch": 0.028340392760877936, "grad_norm": 0.8295182585716248, "learning_rate": 9.980192419701837e-05, "loss": 0.9466, "step": 184 }, { "epoch": 0.028648440508278784, "grad_norm": 0.8026232719421387, "learning_rate": 9.979759772242625e-05, "loss": 1.0066, "step": 186 }, { "epoch": 0.02895648825567963, "grad_norm": 0.714900016784668, "learning_rate": 9.979322460219234e-05, "loss": 1.3128, "step": 188 }, { "epoch": 0.029264536003080478, "grad_norm": 0.8300663232803345, "learning_rate": 9.978880484041292e-05, "loss": 1.356, "step": 190 }, { "epoch": 0.029572583750481326, "grad_norm": 1.3307452201843262, "learning_rate": 9.978433844122804e-05, "loss": 1.5442, "step": 192 }, { "epoch": 0.02988063149788217, "grad_norm": 0.8927571773529053, "learning_rate": 9.977982540882136e-05, "loss": 1.1138, "step": 194 }, { "epoch": 0.03018867924528302, "grad_norm": 0.7937953472137451, "learning_rate": 9.977526574742028e-05, "loss": 1.2531, "step": 196 }, { "epoch": 0.030496726992683867, "grad_norm": 0.9369279742240906, "learning_rate": 9.977065946129586e-05, "loss": 1.1859, "step": 198 }, { "epoch": 0.030804774740084712, "grad_norm": 0.643165647983551, "learning_rate": 9.976600655476283e-05, "loss": 1.1124, "step": 200 }, { "epoch": 0.03111282248748556, "grad_norm": 0.632799506187439, "learning_rate": 9.976130703217956e-05, "loss": 2.1622, "step": 202 }, { "epoch": 0.031420870234886406, "grad_norm": 1.6191530227661133, "learning_rate": 9.975656089794816e-05, "loss": 1.1623, "step": 204 }, { "epoch": 0.031728917982287254, "grad_norm": 0.765177309513092, "learning_rate": 9.975176815651431e-05, "loss": 1.1794, "step": 206 }, { "epoch": 0.0320369657296881, "grad_norm": 0.8509706258773804, "learning_rate": 9.974692881236743e-05, "loss": 1.093, "step": 208 }, { "epoch": 0.03234501347708895, "grad_norm": 0.6735479831695557, "learning_rate": 9.974204287004055e-05, "loss": 1.1755, "step": 210 }, { "epoch": 0.0326530612244898, "grad_norm": 0.8011558055877686, "learning_rate": 9.973711033411034e-05, "loss": 1.2142, "step": 212 }, { "epoch": 0.03296110897189064, "grad_norm": 0.6858620643615723, "learning_rate": 9.973213120919714e-05, "loss": 1.0759, "step": 214 }, { "epoch": 0.03326915671929149, "grad_norm": 0.9899487495422363, "learning_rate": 9.97271054999649e-05, "loss": 1.3075, "step": 216 }, { "epoch": 0.03357720446669234, "grad_norm": 0.5964149236679077, "learning_rate": 9.972203321112126e-05, "loss": 1.0864, "step": 218 }, { "epoch": 0.033885252214093185, "grad_norm": 1.0063085556030273, "learning_rate": 9.971691434741742e-05, "loss": 1.0993, "step": 220 }, { "epoch": 0.034193299961494034, "grad_norm": 0.8755286335945129, "learning_rate": 9.971174891364827e-05, "loss": 1.1378, "step": 222 }, { "epoch": 0.03450134770889488, "grad_norm": 3.5582456588745117, "learning_rate": 9.970653691465229e-05, "loss": 2.7598, "step": 224 }, { "epoch": 0.03480939545629572, "grad_norm": 0.8937438130378723, "learning_rate": 9.970127835531158e-05, "loss": 1.0757, "step": 226 }, { "epoch": 0.03511744320369657, "grad_norm": 0.6416354179382324, "learning_rate": 9.969597324055187e-05, "loss": 1.2164, "step": 228 }, { "epoch": 0.03542549095109742, "grad_norm": 1.0077422857284546, "learning_rate": 9.969062157534246e-05, "loss": 1.0344, "step": 230 }, { "epoch": 0.03573353869849827, "grad_norm": 0.7546489238739014, "learning_rate": 9.96852233646963e-05, "loss": 1.0326, "step": 232 }, { "epoch": 0.03604158644589912, "grad_norm": 0.7511939406394958, "learning_rate": 9.967977861366991e-05, "loss": 1.3572, "step": 234 }, { "epoch": 0.03634963419329996, "grad_norm": 0.7937233448028564, "learning_rate": 9.967428732736341e-05, "loss": 1.2514, "step": 236 }, { "epoch": 0.036657681940700806, "grad_norm": 0.7672502398490906, "learning_rate": 9.966874951092053e-05, "loss": 1.2304, "step": 238 }, { "epoch": 0.036965729688101655, "grad_norm": 0.8722102046012878, "learning_rate": 9.966316516952854e-05, "loss": 1.1674, "step": 240 }, { "epoch": 0.0372737774355025, "grad_norm": 0.9118186831474304, "learning_rate": 9.965753430841835e-05, "loss": 1.3118, "step": 242 }, { "epoch": 0.03758182518290335, "grad_norm": 0.6866461038589478, "learning_rate": 9.96518569328644e-05, "loss": 0.9958, "step": 244 }, { "epoch": 0.0378898729303042, "grad_norm": 0.7743217945098877, "learning_rate": 9.964613304818472e-05, "loss": 1.1417, "step": 246 }, { "epoch": 0.03819792067770504, "grad_norm": 0.7421185374259949, "learning_rate": 9.964036265974089e-05, "loss": 1.2372, "step": 248 }, { "epoch": 0.03850596842510589, "grad_norm": 0.75697922706604, "learning_rate": 9.963454577293808e-05, "loss": 1.0522, "step": 250 }, { "epoch": 0.03881401617250674, "grad_norm": 0.600669264793396, "learning_rate": 9.962868239322495e-05, "loss": 1.1678, "step": 252 }, { "epoch": 0.039122063919907586, "grad_norm": 1.034199833869934, "learning_rate": 9.96227725260938e-05, "loss": 2.1076, "step": 254 }, { "epoch": 0.039430111667308435, "grad_norm": 0.606637716293335, "learning_rate": 9.96168161770804e-05, "loss": 0.8878, "step": 256 }, { "epoch": 0.03973815941470928, "grad_norm": 0.7937979102134705, "learning_rate": 9.961081335176412e-05, "loss": 2.2967, "step": 258 }, { "epoch": 0.040046207162110124, "grad_norm": 0.6674078702926636, "learning_rate": 9.960476405576782e-05, "loss": 1.1733, "step": 260 }, { "epoch": 0.04035425490951097, "grad_norm": 0.6720229983329773, "learning_rate": 9.959866829475789e-05, "loss": 1.1175, "step": 262 }, { "epoch": 0.04066230265691182, "grad_norm": 0.6526698470115662, "learning_rate": 9.959252607444427e-05, "loss": 1.1886, "step": 264 }, { "epoch": 0.04097035040431267, "grad_norm": 0.5717160701751709, "learning_rate": 9.958633740058042e-05, "loss": 0.9741, "step": 266 }, { "epoch": 0.04127839815171352, "grad_norm": 0.6778659820556641, "learning_rate": 9.958010227896329e-05, "loss": 1.0269, "step": 268 }, { "epoch": 0.041586445899114366, "grad_norm": 0.8337912559509277, "learning_rate": 9.957382071543332e-05, "loss": 1.1228, "step": 270 }, { "epoch": 0.04189449364651521, "grad_norm": 0.8081030249595642, "learning_rate": 9.95674927158745e-05, "loss": 1.398, "step": 272 }, { "epoch": 0.042202541393916056, "grad_norm": 0.6906352639198303, "learning_rate": 9.956111828621432e-05, "loss": 0.9677, "step": 274 }, { "epoch": 0.042510589141316904, "grad_norm": 0.5956444144248962, "learning_rate": 9.955469743242372e-05, "loss": 1.0459, "step": 276 }, { "epoch": 0.04281863688871775, "grad_norm": 0.5646885633468628, "learning_rate": 9.954823016051713e-05, "loss": 1.4604, "step": 278 }, { "epoch": 0.0431266846361186, "grad_norm": 0.5150172710418701, "learning_rate": 9.95417164765525e-05, "loss": 0.9466, "step": 280 }, { "epoch": 0.04343473238351944, "grad_norm": 0.6873899102210999, "learning_rate": 9.95351563866312e-05, "loss": 1.1715, "step": 282 }, { "epoch": 0.04374278013092029, "grad_norm": 0.7087437510490417, "learning_rate": 9.952854989689812e-05, "loss": 1.2993, "step": 284 }, { "epoch": 0.04405082787832114, "grad_norm": 0.7204945087432861, "learning_rate": 9.952189701354158e-05, "loss": 1.0131, "step": 286 }, { "epoch": 0.04435887562572199, "grad_norm": 0.8233432769775391, "learning_rate": 9.951519774279334e-05, "loss": 1.1578, "step": 288 }, { "epoch": 0.044666923373122835, "grad_norm": 1.0435971021652222, "learning_rate": 9.95084520909287e-05, "loss": 1.3136, "step": 290 }, { "epoch": 0.044974971120523684, "grad_norm": 0.7032255530357361, "learning_rate": 9.950166006426629e-05, "loss": 0.9537, "step": 292 }, { "epoch": 0.045283018867924525, "grad_norm": 0.8625809550285339, "learning_rate": 9.949482166916826e-05, "loss": 1.0124, "step": 294 }, { "epoch": 0.045591066615325374, "grad_norm": 0.9952526688575745, "learning_rate": 9.948793691204014e-05, "loss": 1.304, "step": 296 }, { "epoch": 0.04589911436272622, "grad_norm": 0.49537548422813416, "learning_rate": 9.948100579933095e-05, "loss": 0.8679, "step": 298 }, { "epoch": 0.04620716211012707, "grad_norm": 0.7977579236030579, "learning_rate": 9.947402833753307e-05, "loss": 1.1701, "step": 300 }, { "epoch": 0.04620716211012707, "eval_loss": 2.4681503772735596, "eval_runtime": 746.0395, "eval_samples_per_second": 2.681, "eval_steps_per_second": 0.67, "step": 300 }, { "epoch": 0.04651520985752792, "grad_norm": 0.5484630465507507, "learning_rate": 9.946700453318234e-05, "loss": 0.9929, "step": 302 }, { "epoch": 0.04682325760492877, "grad_norm": 0.768426239490509, "learning_rate": 9.945993439285797e-05, "loss": 1.1075, "step": 304 }, { "epoch": 0.04713130535232961, "grad_norm": 0.7085797786712646, "learning_rate": 9.945281792318259e-05, "loss": 1.2756, "step": 306 }, { "epoch": 0.04743935309973046, "grad_norm": 0.5655617117881775, "learning_rate": 9.944565513082227e-05, "loss": 1.8454, "step": 308 }, { "epoch": 0.047747400847131305, "grad_norm": 1.111232042312622, "learning_rate": 9.943844602248638e-05, "loss": 1.3407, "step": 310 }, { "epoch": 0.04805544859453215, "grad_norm": 0.8340556025505066, "learning_rate": 9.943119060492776e-05, "loss": 2.5626, "step": 312 }, { "epoch": 0.048363496341933, "grad_norm": 0.6861330270767212, "learning_rate": 9.942388888494258e-05, "loss": 1.1323, "step": 314 }, { "epoch": 0.04867154408933385, "grad_norm": 0.9066111445426941, "learning_rate": 9.94165408693704e-05, "loss": 1.1893, "step": 316 }, { "epoch": 0.04897959183673469, "grad_norm": 0.8065815567970276, "learning_rate": 9.940914656509414e-05, "loss": 1.1128, "step": 318 }, { "epoch": 0.04928763958413554, "grad_norm": 0.7174093127250671, "learning_rate": 9.940170597904006e-05, "loss": 1.0627, "step": 320 }, { "epoch": 0.04959568733153639, "grad_norm": 0.8688803315162659, "learning_rate": 9.939421911817783e-05, "loss": 2.0269, "step": 322 }, { "epoch": 0.049903735078937236, "grad_norm": 0.8032212853431702, "learning_rate": 9.93866859895204e-05, "loss": 1.1595, "step": 324 }, { "epoch": 0.050211782826338085, "grad_norm": 0.8701591491699219, "learning_rate": 9.937910660012408e-05, "loss": 1.41, "step": 326 }, { "epoch": 0.050519830573738926, "grad_norm": 0.8329717516899109, "learning_rate": 9.937148095708855e-05, "loss": 1.5113, "step": 328 }, { "epoch": 0.050827878321139774, "grad_norm": 0.6211066246032715, "learning_rate": 9.936380906755676e-05, "loss": 1.1042, "step": 330 }, { "epoch": 0.05113592606854062, "grad_norm": 0.7419208288192749, "learning_rate": 9.935609093871502e-05, "loss": 2.0661, "step": 332 }, { "epoch": 0.05144397381594147, "grad_norm": 0.7876706719398499, "learning_rate": 9.934832657779291e-05, "loss": 1.0094, "step": 334 }, { "epoch": 0.05175202156334232, "grad_norm": 0.9068336486816406, "learning_rate": 9.934051599206339e-05, "loss": 1.0648, "step": 336 }, { "epoch": 0.05206006931074317, "grad_norm": 0.7844152450561523, "learning_rate": 9.933265918884262e-05, "loss": 1.26, "step": 338 }, { "epoch": 0.05236811705814401, "grad_norm": 0.7152935266494751, "learning_rate": 9.932475617549016e-05, "loss": 1.1199, "step": 340 }, { "epoch": 0.05267616480554486, "grad_norm": 0.8740458488464355, "learning_rate": 9.931680695940873e-05, "loss": 1.2573, "step": 342 }, { "epoch": 0.052984212552945706, "grad_norm": 0.8880050778388977, "learning_rate": 9.930881154804446e-05, "loss": 1.3006, "step": 344 }, { "epoch": 0.053292260300346554, "grad_norm": 0.7322596311569214, "learning_rate": 9.930076994888666e-05, "loss": 1.1697, "step": 346 }, { "epoch": 0.0536003080477474, "grad_norm": 0.64304518699646, "learning_rate": 9.929268216946794e-05, "loss": 1.2965, "step": 348 }, { "epoch": 0.05390835579514825, "grad_norm": 0.706308126449585, "learning_rate": 9.928454821736414e-05, "loss": 1.1261, "step": 350 }, { "epoch": 0.05421640354254909, "grad_norm": 0.6644442677497864, "learning_rate": 9.927636810019441e-05, "loss": 1.3967, "step": 352 }, { "epoch": 0.05452445128994994, "grad_norm": 0.636601984500885, "learning_rate": 9.926814182562108e-05, "loss": 0.9961, "step": 354 }, { "epoch": 0.05483249903735079, "grad_norm": 0.7085344195365906, "learning_rate": 9.925986940134975e-05, "loss": 0.9479, "step": 356 }, { "epoch": 0.05514054678475164, "grad_norm": 0.8576806783676147, "learning_rate": 9.925155083512922e-05, "loss": 1.6484, "step": 358 }, { "epoch": 0.055448594532152486, "grad_norm": 0.7348802089691162, "learning_rate": 9.924318613475156e-05, "loss": 2.0183, "step": 360 }, { "epoch": 0.055756642279553334, "grad_norm": 0.6207509636878967, "learning_rate": 9.923477530805199e-05, "loss": 1.1584, "step": 362 }, { "epoch": 0.056064690026954175, "grad_norm": 0.7382807731628418, "learning_rate": 9.9226318362909e-05, "loss": 1.4034, "step": 364 }, { "epoch": 0.056372737774355024, "grad_norm": 0.6476181745529175, "learning_rate": 9.921781530724421e-05, "loss": 1.0029, "step": 366 }, { "epoch": 0.05668078552175587, "grad_norm": 0.7093887329101562, "learning_rate": 9.920926614902253e-05, "loss": 1.9992, "step": 368 }, { "epoch": 0.05698883326915672, "grad_norm": 0.7479656934738159, "learning_rate": 9.920067089625194e-05, "loss": 1.1022, "step": 370 }, { "epoch": 0.05729688101655757, "grad_norm": 0.6984477043151855, "learning_rate": 9.919202955698367e-05, "loss": 1.2939, "step": 372 }, { "epoch": 0.05760492876395841, "grad_norm": 0.6800373792648315, "learning_rate": 9.918334213931214e-05, "loss": 1.0686, "step": 374 }, { "epoch": 0.05791297651135926, "grad_norm": 1.0920754671096802, "learning_rate": 9.917460865137485e-05, "loss": 1.1059, "step": 376 }, { "epoch": 0.05822102425876011, "grad_norm": 0.7536227107048035, "learning_rate": 9.916582910135252e-05, "loss": 1.0106, "step": 378 }, { "epoch": 0.058529072006160955, "grad_norm": 0.850918173789978, "learning_rate": 9.915700349746898e-05, "loss": 1.1953, "step": 380 }, { "epoch": 0.0588371197535618, "grad_norm": 0.6495062112808228, "learning_rate": 9.914813184799123e-05, "loss": 1.0393, "step": 382 }, { "epoch": 0.05914516750096265, "grad_norm": 0.6886342763900757, "learning_rate": 9.913921416122937e-05, "loss": 1.0439, "step": 384 }, { "epoch": 0.05945321524836349, "grad_norm": 0.7080234885215759, "learning_rate": 9.913025044553666e-05, "loss": 0.9485, "step": 386 }, { "epoch": 0.05976126299576434, "grad_norm": 0.5946139693260193, "learning_rate": 9.912124070930943e-05, "loss": 1.0163, "step": 388 }, { "epoch": 0.06006931074316519, "grad_norm": 0.8572615385055542, "learning_rate": 9.911218496098717e-05, "loss": 1.3743, "step": 390 }, { "epoch": 0.06037735849056604, "grad_norm": 0.7151904106140137, "learning_rate": 9.91030832090524e-05, "loss": 1.2767, "step": 392 }, { "epoch": 0.06068540623796689, "grad_norm": 0.7724493145942688, "learning_rate": 9.909393546203082e-05, "loss": 1.1237, "step": 394 }, { "epoch": 0.060993453985367735, "grad_norm": 1.2182974815368652, "learning_rate": 9.908474172849114e-05, "loss": 1.1877, "step": 396 }, { "epoch": 0.061301501732768576, "grad_norm": 0.5870667695999146, "learning_rate": 9.907550201704519e-05, "loss": 1.1604, "step": 398 }, { "epoch": 0.061609549480169425, "grad_norm": 0.6542190909385681, "learning_rate": 9.906621633634782e-05, "loss": 1.1505, "step": 400 }, { "epoch": 0.06191759722757027, "grad_norm": 0.7185912132263184, "learning_rate": 9.9056884695097e-05, "loss": 1.1226, "step": 402 }, { "epoch": 0.06222564497497112, "grad_norm": 0.802648663520813, "learning_rate": 9.90475071020337e-05, "loss": 1.075, "step": 404 }, { "epoch": 0.06253369272237197, "grad_norm": 0.6152924299240112, "learning_rate": 9.903808356594199e-05, "loss": 1.1796, "step": 406 }, { "epoch": 0.06284174046977281, "grad_norm": 0.8860695362091064, "learning_rate": 9.90286140956489e-05, "loss": 1.1769, "step": 408 }, { "epoch": 0.06314978821717367, "grad_norm": 0.7648366093635559, "learning_rate": 9.901909870002455e-05, "loss": 1.1125, "step": 410 }, { "epoch": 0.06345783596457451, "grad_norm": 0.708521842956543, "learning_rate": 9.900953738798205e-05, "loss": 1.9526, "step": 412 }, { "epoch": 0.06376588371197536, "grad_norm": 0.5266025066375732, "learning_rate": 9.899993016847753e-05, "loss": 1.1469, "step": 414 }, { "epoch": 0.0640739314593762, "grad_norm": 0.5720192193984985, "learning_rate": 9.899027705051011e-05, "loss": 1.0054, "step": 416 }, { "epoch": 0.06438197920677705, "grad_norm": 0.7001132965087891, "learning_rate": 9.89805780431219e-05, "loss": 0.9696, "step": 418 }, { "epoch": 0.0646900269541779, "grad_norm": 0.6034771800041199, "learning_rate": 9.897083315539803e-05, "loss": 1.0963, "step": 420 }, { "epoch": 0.06499807470157874, "grad_norm": 0.6032156944274902, "learning_rate": 9.896104239646658e-05, "loss": 2.0431, "step": 422 }, { "epoch": 0.0653061224489796, "grad_norm": 0.8947567939758301, "learning_rate": 9.895120577549858e-05, "loss": 1.1282, "step": 424 }, { "epoch": 0.06561417019638044, "grad_norm": 0.6361309289932251, "learning_rate": 9.894132330170805e-05, "loss": 1.2933, "step": 426 }, { "epoch": 0.06592221794378128, "grad_norm": 0.8116483092308044, "learning_rate": 9.893139498435194e-05, "loss": 1.1211, "step": 428 }, { "epoch": 0.06623026569118214, "grad_norm": 0.594086766242981, "learning_rate": 9.892142083273017e-05, "loss": 0.8969, "step": 430 }, { "epoch": 0.06653831343858298, "grad_norm": 0.45241397619247437, "learning_rate": 9.891140085618555e-05, "loss": 0.9593, "step": 432 }, { "epoch": 0.06684636118598383, "grad_norm": 0.726063072681427, "learning_rate": 9.890133506410386e-05, "loss": 1.1369, "step": 434 }, { "epoch": 0.06715440893338467, "grad_norm": 0.8245907425880432, "learning_rate": 9.889122346591377e-05, "loss": 1.1889, "step": 436 }, { "epoch": 0.06746245668078552, "grad_norm": 0.5453060269355774, "learning_rate": 9.888106607108687e-05, "loss": 0.9628, "step": 438 }, { "epoch": 0.06777050442818637, "grad_norm": 0.7849345803260803, "learning_rate": 9.88708628891376e-05, "loss": 1.1588, "step": 440 }, { "epoch": 0.06807855217558721, "grad_norm": 0.8772358894348145, "learning_rate": 9.886061392962336e-05, "loss": 0.9983, "step": 442 }, { "epoch": 0.06838659992298807, "grad_norm": 0.7637527585029602, "learning_rate": 9.88503192021444e-05, "loss": 1.2101, "step": 444 }, { "epoch": 0.06869464767038891, "grad_norm": 0.9543507695198059, "learning_rate": 9.883997871634383e-05, "loss": 1.135, "step": 446 }, { "epoch": 0.06900269541778976, "grad_norm": 0.6680512428283691, "learning_rate": 9.882959248190764e-05, "loss": 1.1381, "step": 448 }, { "epoch": 0.0693107431651906, "grad_norm": 0.6533584594726562, "learning_rate": 9.881916050856464e-05, "loss": 0.9652, "step": 450 }, { "epoch": 0.06961879091259145, "grad_norm": 0.9764670133590698, "learning_rate": 9.880868280608654e-05, "loss": 0.9112, "step": 452 }, { "epoch": 0.0699268386599923, "grad_norm": 0.7887918949127197, "learning_rate": 9.879815938428783e-05, "loss": 1.0959, "step": 454 }, { "epoch": 0.07023488640739314, "grad_norm": 0.723955512046814, "learning_rate": 9.878759025302586e-05, "loss": 1.118, "step": 456 }, { "epoch": 0.070542934154794, "grad_norm": 0.8257218599319458, "learning_rate": 9.877697542220078e-05, "loss": 1.2431, "step": 458 }, { "epoch": 0.07085098190219484, "grad_norm": 0.6132649779319763, "learning_rate": 9.876631490175555e-05, "loss": 0.9923, "step": 460 }, { "epoch": 0.07115902964959568, "grad_norm": 0.9241247177124023, "learning_rate": 9.875560870167594e-05, "loss": 1.2972, "step": 462 }, { "epoch": 0.07146707739699654, "grad_norm": 0.6281841993331909, "learning_rate": 9.874485683199048e-05, "loss": 0.9348, "step": 464 }, { "epoch": 0.07177512514439738, "grad_norm": 0.6570271253585815, "learning_rate": 9.87340593027705e-05, "loss": 1.0261, "step": 466 }, { "epoch": 0.07208317289179823, "grad_norm": 0.6641860008239746, "learning_rate": 9.872321612413012e-05, "loss": 1.0926, "step": 468 }, { "epoch": 0.07239122063919907, "grad_norm": 0.6074772477149963, "learning_rate": 9.871232730622618e-05, "loss": 1.2501, "step": 470 }, { "epoch": 0.07269926838659992, "grad_norm": 0.8022063970565796, "learning_rate": 9.870139285925826e-05, "loss": 1.1449, "step": 472 }, { "epoch": 0.07300731613400077, "grad_norm": 0.783936619758606, "learning_rate": 9.869041279346874e-05, "loss": 1.1048, "step": 474 }, { "epoch": 0.07331536388140161, "grad_norm": 0.8866376876831055, "learning_rate": 9.867938711914269e-05, "loss": 2.1867, "step": 476 }, { "epoch": 0.07362341162880247, "grad_norm": 0.7495025992393494, "learning_rate": 9.866831584660791e-05, "loss": 1.0887, "step": 478 }, { "epoch": 0.07393145937620331, "grad_norm": 0.5893321633338928, "learning_rate": 9.86571989862349e-05, "loss": 0.9163, "step": 480 }, { "epoch": 0.07423950712360416, "grad_norm": 0.5532320737838745, "learning_rate": 9.864603654843692e-05, "loss": 1.0393, "step": 482 }, { "epoch": 0.074547554871005, "grad_norm": 0.6310046911239624, "learning_rate": 9.863482854366983e-05, "loss": 1.244, "step": 484 }, { "epoch": 0.07485560261840585, "grad_norm": 0.8252720832824707, "learning_rate": 9.862357498243223e-05, "loss": 1.3308, "step": 486 }, { "epoch": 0.0751636503658067, "grad_norm": 0.820950984954834, "learning_rate": 9.861227587526539e-05, "loss": 1.222, "step": 488 }, { "epoch": 0.07547169811320754, "grad_norm": 0.8616042733192444, "learning_rate": 9.860093123275325e-05, "loss": 1.0324, "step": 490 }, { "epoch": 0.0757797458606084, "grad_norm": 0.68876051902771, "learning_rate": 9.858954106552236e-05, "loss": 1.3133, "step": 492 }, { "epoch": 0.07608779360800924, "grad_norm": 0.66021329164505, "learning_rate": 9.857810538424195e-05, "loss": 0.9999, "step": 494 }, { "epoch": 0.07639584135541008, "grad_norm": 0.5837455987930298, "learning_rate": 9.85666241996239e-05, "loss": 1.0609, "step": 496 }, { "epoch": 0.07670388910281094, "grad_norm": 0.676024317741394, "learning_rate": 9.855509752242267e-05, "loss": 1.0629, "step": 498 }, { "epoch": 0.07701193685021178, "grad_norm": 0.8846476674079895, "learning_rate": 9.854352536343534e-05, "loss": 1.3576, "step": 500 }, { "epoch": 0.07731998459761263, "grad_norm": 0.6220236420631409, "learning_rate": 9.853190773350164e-05, "loss": 1.2044, "step": 502 }, { "epoch": 0.07762803234501348, "grad_norm": 0.7261460423469543, "learning_rate": 9.852024464350382e-05, "loss": 1.1052, "step": 504 }, { "epoch": 0.07793608009241433, "grad_norm": 0.660500168800354, "learning_rate": 9.850853610436679e-05, "loss": 1.1389, "step": 506 }, { "epoch": 0.07824412783981517, "grad_norm": 0.7806474566459656, "learning_rate": 9.849678212705796e-05, "loss": 1.0534, "step": 508 }, { "epoch": 0.07855217558721601, "grad_norm": 0.9472790956497192, "learning_rate": 9.848498272258735e-05, "loss": 1.0524, "step": 510 }, { "epoch": 0.07886022333461687, "grad_norm": 0.5467221736907959, "learning_rate": 9.847313790200751e-05, "loss": 1.1608, "step": 512 }, { "epoch": 0.07916827108201771, "grad_norm": 0.6035184860229492, "learning_rate": 9.846124767641354e-05, "loss": 0.9941, "step": 514 }, { "epoch": 0.07947631882941857, "grad_norm": 0.46280941367149353, "learning_rate": 9.844931205694308e-05, "loss": 0.879, "step": 516 }, { "epoch": 0.07978436657681941, "grad_norm": 0.8107156753540039, "learning_rate": 9.843733105477628e-05, "loss": 1.3569, "step": 518 }, { "epoch": 0.08009241432422025, "grad_norm": 0.6657843589782715, "learning_rate": 9.842530468113578e-05, "loss": 0.9241, "step": 520 }, { "epoch": 0.0804004620716211, "grad_norm": 0.6296514868736267, "learning_rate": 9.841323294728675e-05, "loss": 1.3311, "step": 522 }, { "epoch": 0.08070850981902195, "grad_norm": 0.8303132057189941, "learning_rate": 9.840111586453686e-05, "loss": 1.178, "step": 524 }, { "epoch": 0.0810165575664228, "grad_norm": 0.7254791855812073, "learning_rate": 9.838895344423621e-05, "loss": 1.0316, "step": 526 }, { "epoch": 0.08132460531382364, "grad_norm": 0.8639194369316101, "learning_rate": 9.837674569777742e-05, "loss": 1.1379, "step": 528 }, { "epoch": 0.08163265306122448, "grad_norm": 0.7651161551475525, "learning_rate": 9.836449263659551e-05, "loss": 1.3298, "step": 530 }, { "epoch": 0.08194070080862534, "grad_norm": 0.6473088264465332, "learning_rate": 9.835219427216801e-05, "loss": 1.1758, "step": 532 }, { "epoch": 0.08224874855602618, "grad_norm": 0.7255507111549377, "learning_rate": 9.833985061601485e-05, "loss": 1.2247, "step": 534 }, { "epoch": 0.08255679630342704, "grad_norm": 0.6490479707717896, "learning_rate": 9.832746167969837e-05, "loss": 0.9711, "step": 536 }, { "epoch": 0.08286484405082788, "grad_norm": 0.6560536026954651, "learning_rate": 9.831502747482337e-05, "loss": 1.0156, "step": 538 }, { "epoch": 0.08317289179822873, "grad_norm": 0.6243906021118164, "learning_rate": 9.830254801303702e-05, "loss": 1.4298, "step": 540 }, { "epoch": 0.08348093954562957, "grad_norm": 0.849922239780426, "learning_rate": 9.829002330602888e-05, "loss": 1.2591, "step": 542 }, { "epoch": 0.08378898729303041, "grad_norm": 0.8038306832313538, "learning_rate": 9.827745336553092e-05, "loss": 1.1405, "step": 544 }, { "epoch": 0.08409703504043127, "grad_norm": 0.7083746194839478, "learning_rate": 9.826483820331743e-05, "loss": 1.1005, "step": 546 }, { "epoch": 0.08440508278783211, "grad_norm": 0.643138587474823, "learning_rate": 9.825217783120513e-05, "loss": 0.8967, "step": 548 }, { "epoch": 0.08471313053523297, "grad_norm": 0.5439552068710327, "learning_rate": 9.823947226105302e-05, "loss": 1.805, "step": 550 }, { "epoch": 0.08502117828263381, "grad_norm": 1.0784913301467896, "learning_rate": 9.822672150476249e-05, "loss": 1.3968, "step": 552 }, { "epoch": 0.08532922603003465, "grad_norm": 0.6794597506523132, "learning_rate": 9.82139255742772e-05, "loss": 0.9096, "step": 554 }, { "epoch": 0.0856372737774355, "grad_norm": 0.7618829607963562, "learning_rate": 9.820108448158319e-05, "loss": 1.2315, "step": 556 }, { "epoch": 0.08594532152483635, "grad_norm": 0.6738618612289429, "learning_rate": 9.818819823870876e-05, "loss": 0.9266, "step": 558 }, { "epoch": 0.0862533692722372, "grad_norm": 0.6213587522506714, "learning_rate": 9.817526685772452e-05, "loss": 0.8314, "step": 560 }, { "epoch": 0.08656141701963804, "grad_norm": 0.6162586808204651, "learning_rate": 9.816229035074334e-05, "loss": 0.9922, "step": 562 }, { "epoch": 0.08686946476703888, "grad_norm": 0.9549304246902466, "learning_rate": 9.814926872992038e-05, "loss": 1.0369, "step": 564 }, { "epoch": 0.08717751251443974, "grad_norm": 0.919792890548706, "learning_rate": 9.813620200745307e-05, "loss": 1.1811, "step": 566 }, { "epoch": 0.08748556026184058, "grad_norm": 0.6474389433860779, "learning_rate": 9.812309019558103e-05, "loss": 0.9927, "step": 568 }, { "epoch": 0.08779360800924144, "grad_norm": 0.7442201375961304, "learning_rate": 9.81099333065862e-05, "loss": 1.2592, "step": 570 }, { "epoch": 0.08810165575664228, "grad_norm": 0.7177843451499939, "learning_rate": 9.809673135279268e-05, "loss": 1.1623, "step": 572 }, { "epoch": 0.08840970350404313, "grad_norm": 0.6520625352859497, "learning_rate": 9.80834843465668e-05, "loss": 0.9876, "step": 574 }, { "epoch": 0.08871775125144397, "grad_norm": 0.796640157699585, "learning_rate": 9.807019230031708e-05, "loss": 1.2678, "step": 576 }, { "epoch": 0.08902579899884482, "grad_norm": 0.6173284649848938, "learning_rate": 9.805685522649428e-05, "loss": 0.9507, "step": 578 }, { "epoch": 0.08933384674624567, "grad_norm": 0.6731142401695251, "learning_rate": 9.804347313759126e-05, "loss": 1.2456, "step": 580 }, { "epoch": 0.08964189449364651, "grad_norm": 0.821448802947998, "learning_rate": 9.80300460461431e-05, "loss": 1.1822, "step": 582 }, { "epoch": 0.08994994224104737, "grad_norm": 0.7288122773170471, "learning_rate": 9.801657396472702e-05, "loss": 2.3747, "step": 584 }, { "epoch": 0.09025798998844821, "grad_norm": 0.6716973781585693, "learning_rate": 9.800305690596238e-05, "loss": 1.1216, "step": 586 }, { "epoch": 0.09056603773584905, "grad_norm": 0.627241849899292, "learning_rate": 9.798949488251068e-05, "loss": 1.2473, "step": 588 }, { "epoch": 0.0908740854832499, "grad_norm": 0.7980815172195435, "learning_rate": 9.797588790707551e-05, "loss": 1.0989, "step": 590 }, { "epoch": 0.09118213323065075, "grad_norm": 0.893456220626831, "learning_rate": 9.796223599240262e-05, "loss": 1.3478, "step": 592 }, { "epoch": 0.0914901809780516, "grad_norm": 0.8465490341186523, "learning_rate": 9.794853915127978e-05, "loss": 1.0439, "step": 594 }, { "epoch": 0.09179822872545244, "grad_norm": 0.7354926466941833, "learning_rate": 9.793479739653692e-05, "loss": 1.5374, "step": 596 }, { "epoch": 0.0921062764728533, "grad_norm": 0.7580827474594116, "learning_rate": 9.792101074104598e-05, "loss": 1.0673, "step": 598 }, { "epoch": 0.09241432422025414, "grad_norm": 0.7354786396026611, "learning_rate": 9.790717919772102e-05, "loss": 1.0498, "step": 600 }, { "epoch": 0.09241432422025414, "eval_loss": 2.5175342559814453, "eval_runtime": 736.9022, "eval_samples_per_second": 2.714, "eval_steps_per_second": 0.679, "step": 600 }, { "epoch": 0.09272237196765498, "grad_norm": 0.9065431952476501, "learning_rate": 9.789330277951807e-05, "loss": 1.0872, "step": 602 }, { "epoch": 0.09303041971505584, "grad_norm": 0.5320457816123962, "learning_rate": 9.787938149943525e-05, "loss": 3.4212, "step": 604 }, { "epoch": 0.09333846746245668, "grad_norm": 0.693964421749115, "learning_rate": 9.78654153705127e-05, "loss": 1.0635, "step": 606 }, { "epoch": 0.09364651520985753, "grad_norm": 0.7583566904067993, "learning_rate": 9.785140440583256e-05, "loss": 1.0579, "step": 608 }, { "epoch": 0.09395456295725838, "grad_norm": 0.5443295240402222, "learning_rate": 9.783734861851895e-05, "loss": 2.0752, "step": 610 }, { "epoch": 0.09426261070465922, "grad_norm": 0.7155397534370422, "learning_rate": 9.7823248021738e-05, "loss": 0.9966, "step": 612 }, { "epoch": 0.09457065845206007, "grad_norm": 0.6753432154655457, "learning_rate": 9.780910262869779e-05, "loss": 1.8116, "step": 614 }, { "epoch": 0.09487870619946091, "grad_norm": 0.647979199886322, "learning_rate": 9.77949124526484e-05, "loss": 0.8303, "step": 616 }, { "epoch": 0.09518675394686177, "grad_norm": 0.7311884164810181, "learning_rate": 9.77806775068818e-05, "loss": 1.2308, "step": 618 }, { "epoch": 0.09549480169426261, "grad_norm": 0.8863371014595032, "learning_rate": 9.776639780473198e-05, "loss": 1.2158, "step": 620 }, { "epoch": 0.09580284944166345, "grad_norm": 0.8108947277069092, "learning_rate": 9.775207335957476e-05, "loss": 1.157, "step": 622 }, { "epoch": 0.0961108971890643, "grad_norm": 0.790622353553772, "learning_rate": 9.773770418482792e-05, "loss": 0.9097, "step": 624 }, { "epoch": 0.09641894493646515, "grad_norm": 1.0200419425964355, "learning_rate": 9.772329029395116e-05, "loss": 1.3464, "step": 626 }, { "epoch": 0.096726992683866, "grad_norm": 0.7055392265319824, "learning_rate": 9.770883170044603e-05, "loss": 1.2125, "step": 628 }, { "epoch": 0.09703504043126684, "grad_norm": 0.5598780512809753, "learning_rate": 9.769432841785593e-05, "loss": 1.185, "step": 630 }, { "epoch": 0.0973430881786677, "grad_norm": 0.6309000849723816, "learning_rate": 9.767978045976618e-05, "loss": 1.229, "step": 632 }, { "epoch": 0.09765113592606854, "grad_norm": 0.7420125007629395, "learning_rate": 9.766518783980393e-05, "loss": 1.0318, "step": 634 }, { "epoch": 0.09795918367346938, "grad_norm": 0.7199630737304688, "learning_rate": 9.765055057163813e-05, "loss": 1.2673, "step": 636 }, { "epoch": 0.09826723142087024, "grad_norm": 0.76442950963974, "learning_rate": 9.763586866897959e-05, "loss": 1.4397, "step": 638 }, { "epoch": 0.09857527916827108, "grad_norm": 0.8405647873878479, "learning_rate": 9.762114214558092e-05, "loss": 0.9605, "step": 640 }, { "epoch": 0.09888332691567193, "grad_norm": 0.5300808548927307, "learning_rate": 9.76063710152365e-05, "loss": 1.0489, "step": 642 }, { "epoch": 0.09919137466307278, "grad_norm": 0.7019151449203491, "learning_rate": 9.759155529178256e-05, "loss": 1.0425, "step": 644 }, { "epoch": 0.09949942241047362, "grad_norm": 0.6637164950370789, "learning_rate": 9.757669498909701e-05, "loss": 1.5364, "step": 646 }, { "epoch": 0.09980747015787447, "grad_norm": 0.5824893712997437, "learning_rate": 9.756179012109961e-05, "loss": 1.8955, "step": 648 }, { "epoch": 0.10011551790527531, "grad_norm": 0.7904444932937622, "learning_rate": 9.754684070175178e-05, "loss": 1.2757, "step": 650 }, { "epoch": 0.10042356565267617, "grad_norm": 0.8260906338691711, "learning_rate": 9.753184674505672e-05, "loss": 1.0885, "step": 652 }, { "epoch": 0.10073161340007701, "grad_norm": 0.7655699253082275, "learning_rate": 9.751680826505935e-05, "loss": 1.3115, "step": 654 }, { "epoch": 0.10103966114747785, "grad_norm": 0.9455386400222778, "learning_rate": 9.750172527584628e-05, "loss": 1.1202, "step": 656 }, { "epoch": 0.10134770889487871, "grad_norm": 0.6057466864585876, "learning_rate": 9.748659779154583e-05, "loss": 1.3526, "step": 658 }, { "epoch": 0.10165575664227955, "grad_norm": 0.5363863706588745, "learning_rate": 9.747142582632795e-05, "loss": 0.8425, "step": 660 }, { "epoch": 0.1019638043896804, "grad_norm": 0.6003315448760986, "learning_rate": 9.745620939440433e-05, "loss": 1.3913, "step": 662 }, { "epoch": 0.10227185213708125, "grad_norm": 0.6269535422325134, "learning_rate": 9.744094851002825e-05, "loss": 1.0703, "step": 664 }, { "epoch": 0.1025798998844821, "grad_norm": 1.0587024688720703, "learning_rate": 9.742564318749465e-05, "loss": 1.2466, "step": 666 }, { "epoch": 0.10288794763188294, "grad_norm": 0.966397762298584, "learning_rate": 9.741029344114011e-05, "loss": 1.1399, "step": 668 }, { "epoch": 0.10319599537928378, "grad_norm": 0.8659431338310242, "learning_rate": 9.73948992853428e-05, "loss": 1.2475, "step": 670 }, { "epoch": 0.10350404312668464, "grad_norm": 0.6009111404418945, "learning_rate": 9.737946073452249e-05, "loss": 0.9601, "step": 672 }, { "epoch": 0.10381209087408548, "grad_norm": 0.7170124650001526, "learning_rate": 9.736397780314056e-05, "loss": 1.0488, "step": 674 }, { "epoch": 0.10412013862148634, "grad_norm": 0.9177563190460205, "learning_rate": 9.734845050569994e-05, "loss": 1.3932, "step": 676 }, { "epoch": 0.10442818636888718, "grad_norm": 0.5799642205238342, "learning_rate": 9.733287885674512e-05, "loss": 2.0052, "step": 678 }, { "epoch": 0.10473623411628802, "grad_norm": 0.6074812412261963, "learning_rate": 9.731726287086211e-05, "loss": 1.152, "step": 680 }, { "epoch": 0.10504428186368887, "grad_norm": 0.673645555973053, "learning_rate": 9.730160256267853e-05, "loss": 1.3506, "step": 682 }, { "epoch": 0.10535232961108972, "grad_norm": 0.8519735336303711, "learning_rate": 9.728589794686342e-05, "loss": 1.2018, "step": 684 }, { "epoch": 0.10566037735849057, "grad_norm": 0.935734748840332, "learning_rate": 9.727014903812736e-05, "loss": 1.2358, "step": 686 }, { "epoch": 0.10596842510589141, "grad_norm": 0.7728151082992554, "learning_rate": 9.725435585122249e-05, "loss": 2.1438, "step": 688 }, { "epoch": 0.10627647285329227, "grad_norm": 0.6737799644470215, "learning_rate": 9.72385184009423e-05, "loss": 1.0234, "step": 690 }, { "epoch": 0.10658452060069311, "grad_norm": 0.7550671696662903, "learning_rate": 9.722263670212181e-05, "loss": 1.1886, "step": 692 }, { "epoch": 0.10689256834809395, "grad_norm": 0.4803139269351959, "learning_rate": 9.72067107696375e-05, "loss": 1.1422, "step": 694 }, { "epoch": 0.1072006160954948, "grad_norm": 0.7486079931259155, "learning_rate": 9.719074061840726e-05, "loss": 0.998, "step": 696 }, { "epoch": 0.10750866384289565, "grad_norm": 0.7024034261703491, "learning_rate": 9.717472626339041e-05, "loss": 1.1791, "step": 698 }, { "epoch": 0.1078167115902965, "grad_norm": 0.7519829273223877, "learning_rate": 9.715866771958766e-05, "loss": 1.3896, "step": 700 }, { "epoch": 0.10812475933769734, "grad_norm": 0.8522893190383911, "learning_rate": 9.714256500204112e-05, "loss": 0.981, "step": 702 }, { "epoch": 0.10843280708509818, "grad_norm": 0.6957355737686157, "learning_rate": 9.71264181258343e-05, "loss": 0.9747, "step": 704 }, { "epoch": 0.10874085483249904, "grad_norm": 0.6734793186187744, "learning_rate": 9.711022710609204e-05, "loss": 1.1019, "step": 706 }, { "epoch": 0.10904890257989988, "grad_norm": 1.0916528701782227, "learning_rate": 9.709399195798055e-05, "loss": 2.5579, "step": 708 }, { "epoch": 0.10935695032730074, "grad_norm": 0.662269651889801, "learning_rate": 9.707771269670736e-05, "loss": 1.076, "step": 710 }, { "epoch": 0.10966499807470158, "grad_norm": 0.6294137835502625, "learning_rate": 9.706138933752134e-05, "loss": 0.9947, "step": 712 }, { "epoch": 0.10997304582210242, "grad_norm": 0.7161690592765808, "learning_rate": 9.704502189571262e-05, "loss": 1.3204, "step": 714 }, { "epoch": 0.11028109356950327, "grad_norm": 0.7409554719924927, "learning_rate": 9.702861038661273e-05, "loss": 1.0797, "step": 716 }, { "epoch": 0.11058914131690412, "grad_norm": 0.5623186826705933, "learning_rate": 9.701215482559436e-05, "loss": 1.0801, "step": 718 }, { "epoch": 0.11089718906430497, "grad_norm": 0.6551181674003601, "learning_rate": 9.699565522807151e-05, "loss": 0.9734, "step": 720 }, { "epoch": 0.11120523681170581, "grad_norm": 0.9996515512466431, "learning_rate": 9.697911160949944e-05, "loss": 1.109, "step": 722 }, { "epoch": 0.11151328455910667, "grad_norm": 0.6170802712440491, "learning_rate": 9.696252398537462e-05, "loss": 1.0989, "step": 724 }, { "epoch": 0.11182133230650751, "grad_norm": 0.8524353504180908, "learning_rate": 9.69458923712348e-05, "loss": 1.8914, "step": 726 }, { "epoch": 0.11212938005390835, "grad_norm": 0.5866247415542603, "learning_rate": 9.692921678265883e-05, "loss": 1.0485, "step": 728 }, { "epoch": 0.1124374278013092, "grad_norm": 0.7652002573013306, "learning_rate": 9.691249723526683e-05, "loss": 1.0219, "step": 730 }, { "epoch": 0.11274547554871005, "grad_norm": 0.8469516038894653, "learning_rate": 9.689573374472011e-05, "loss": 1.249, "step": 732 }, { "epoch": 0.1130535232961109, "grad_norm": 0.9973403811454773, "learning_rate": 9.687892632672109e-05, "loss": 1.088, "step": 734 }, { "epoch": 0.11336157104351174, "grad_norm": 0.7604619264602661, "learning_rate": 9.686207499701334e-05, "loss": 1.1157, "step": 736 }, { "epoch": 0.11366961879091259, "grad_norm": 0.7274996638298035, "learning_rate": 9.684517977138159e-05, "loss": 0.9415, "step": 738 }, { "epoch": 0.11397766653831344, "grad_norm": 0.6756743788719177, "learning_rate": 9.682824066565168e-05, "loss": 0.9793, "step": 740 }, { "epoch": 0.11428571428571428, "grad_norm": 0.9891169667243958, "learning_rate": 9.681125769569056e-05, "loss": 0.965, "step": 742 }, { "epoch": 0.11459376203311514, "grad_norm": 0.8296619653701782, "learning_rate": 9.679423087740625e-05, "loss": 1.2816, "step": 744 }, { "epoch": 0.11490180978051598, "grad_norm": 0.6557073593139648, "learning_rate": 9.677716022674783e-05, "loss": 1.0373, "step": 746 }, { "epoch": 0.11520985752791682, "grad_norm": 0.7231959700584412, "learning_rate": 9.676004575970547e-05, "loss": 1.0382, "step": 748 }, { "epoch": 0.11551790527531768, "grad_norm": 0.6279606223106384, "learning_rate": 9.67428874923104e-05, "loss": 1.2397, "step": 750 }, { "epoch": 0.11582595302271852, "grad_norm": 0.7001874446868896, "learning_rate": 9.67256854406348e-05, "loss": 1.1748, "step": 752 }, { "epoch": 0.11613400077011937, "grad_norm": 0.829440712928772, "learning_rate": 9.670843962079194e-05, "loss": 0.9886, "step": 754 }, { "epoch": 0.11644204851752021, "grad_norm": 0.7666531205177307, "learning_rate": 9.669115004893606e-05, "loss": 0.9669, "step": 756 }, { "epoch": 0.11675009626492107, "grad_norm": 0.578072190284729, "learning_rate": 9.667381674126238e-05, "loss": 2.1085, "step": 758 }, { "epoch": 0.11705814401232191, "grad_norm": 0.9283096194267273, "learning_rate": 9.665643971400709e-05, "loss": 2.3508, "step": 760 }, { "epoch": 0.11736619175972275, "grad_norm": 0.7107291221618652, "learning_rate": 9.663901898344732e-05, "loss": 0.9454, "step": 762 }, { "epoch": 0.1176742395071236, "grad_norm": 0.7699968218803406, "learning_rate": 9.662155456590116e-05, "loss": 1.0085, "step": 764 }, { "epoch": 0.11798228725452445, "grad_norm": 0.688955545425415, "learning_rate": 9.660404647772763e-05, "loss": 0.9659, "step": 766 }, { "epoch": 0.1182903350019253, "grad_norm": 0.7579169273376465, "learning_rate": 9.65864947353266e-05, "loss": 1.2011, "step": 768 }, { "epoch": 0.11859838274932614, "grad_norm": 0.6933251023292542, "learning_rate": 9.656889935513889e-05, "loss": 1.0996, "step": 770 }, { "epoch": 0.11890643049672699, "grad_norm": 0.8795177340507507, "learning_rate": 9.655126035364617e-05, "loss": 1.0225, "step": 772 }, { "epoch": 0.11921447824412784, "grad_norm": 0.731309711933136, "learning_rate": 9.6533577747371e-05, "loss": 1.0972, "step": 774 }, { "epoch": 0.11952252599152868, "grad_norm": 0.5974037051200867, "learning_rate": 9.651585155287671e-05, "loss": 0.9594, "step": 776 }, { "epoch": 0.11983057373892954, "grad_norm": 0.869696855545044, "learning_rate": 9.649808178676755e-05, "loss": 1.07, "step": 778 }, { "epoch": 0.12013862148633038, "grad_norm": 0.5604158043861389, "learning_rate": 9.648026846568853e-05, "loss": 1.7745, "step": 780 }, { "epoch": 0.12044666923373124, "grad_norm": 0.8833852410316467, "learning_rate": 9.646241160632547e-05, "loss": 2.3886, "step": 782 }, { "epoch": 0.12075471698113208, "grad_norm": 0.6206603050231934, "learning_rate": 9.644451122540496e-05, "loss": 0.9324, "step": 784 }, { "epoch": 0.12106276472853292, "grad_norm": 0.8402155041694641, "learning_rate": 9.642656733969439e-05, "loss": 1.0746, "step": 786 }, { "epoch": 0.12137081247593377, "grad_norm": 0.8293070197105408, "learning_rate": 9.640857996600188e-05, "loss": 2.127, "step": 788 }, { "epoch": 0.12167886022333461, "grad_norm": 0.6803492903709412, "learning_rate": 9.639054912117628e-05, "loss": 0.9553, "step": 790 }, { "epoch": 0.12198690797073547, "grad_norm": 0.817302942276001, "learning_rate": 9.637247482210716e-05, "loss": 1.1407, "step": 792 }, { "epoch": 0.12229495571813631, "grad_norm": 0.7530799508094788, "learning_rate": 9.635435708572482e-05, "loss": 0.9892, "step": 794 }, { "epoch": 0.12260300346553715, "grad_norm": 0.6724453568458557, "learning_rate": 9.63361959290002e-05, "loss": 1.0774, "step": 796 }, { "epoch": 0.12291105121293801, "grad_norm": 0.7561922073364258, "learning_rate": 9.631799136894498e-05, "loss": 1.0483, "step": 798 }, { "epoch": 0.12321909896033885, "grad_norm": 0.7706962823867798, "learning_rate": 9.629974342261142e-05, "loss": 2.1088, "step": 800 }, { "epoch": 0.1235271467077397, "grad_norm": 1.036582350730896, "learning_rate": 9.628145210709245e-05, "loss": 1.1936, "step": 802 }, { "epoch": 0.12383519445514055, "grad_norm": 0.9285162091255188, "learning_rate": 9.626311743952167e-05, "loss": 1.2731, "step": 804 }, { "epoch": 0.12414324220254139, "grad_norm": 0.8123614192008972, "learning_rate": 9.624473943707321e-05, "loss": 1.0282, "step": 806 }, { "epoch": 0.12445128994994224, "grad_norm": 0.6215323805809021, "learning_rate": 9.622631811696187e-05, "loss": 1.0445, "step": 808 }, { "epoch": 0.12475933769734308, "grad_norm": 0.6055769324302673, "learning_rate": 9.620785349644296e-05, "loss": 0.9747, "step": 810 }, { "epoch": 0.12506738544474394, "grad_norm": 0.5595365166664124, "learning_rate": 9.618934559281237e-05, "loss": 1.0832, "step": 812 }, { "epoch": 0.12537543319214478, "grad_norm": 0.7242071628570557, "learning_rate": 9.617079442340656e-05, "loss": 1.161, "step": 814 }, { "epoch": 0.12568348093954562, "grad_norm": 0.7436997294425964, "learning_rate": 9.615220000560248e-05, "loss": 1.0146, "step": 816 }, { "epoch": 0.12599152868694646, "grad_norm": 0.569721519947052, "learning_rate": 9.613356235681762e-05, "loss": 0.9365, "step": 818 }, { "epoch": 0.12629957643434733, "grad_norm": 0.9639449715614319, "learning_rate": 9.611488149450995e-05, "loss": 0.9866, "step": 820 }, { "epoch": 0.12660762418174817, "grad_norm": 0.7013627886772156, "learning_rate": 9.60961574361779e-05, "loss": 1.0161, "step": 822 }, { "epoch": 0.12691567192914902, "grad_norm": 0.735083281993866, "learning_rate": 9.607739019936042e-05, "loss": 1.1411, "step": 824 }, { "epoch": 0.12722371967654986, "grad_norm": 0.6909815669059753, "learning_rate": 9.605857980163684e-05, "loss": 0.9881, "step": 826 }, { "epoch": 0.12753176742395073, "grad_norm": 0.7418811321258545, "learning_rate": 9.603972626062696e-05, "loss": 1.2712, "step": 828 }, { "epoch": 0.12783981517135157, "grad_norm": 0.5920977592468262, "learning_rate": 9.602082959399098e-05, "loss": 1.0488, "step": 830 }, { "epoch": 0.1281478629187524, "grad_norm": 0.7561302781105042, "learning_rate": 9.600188981942947e-05, "loss": 1.1267, "step": 832 }, { "epoch": 0.12845591066615325, "grad_norm": 0.7126810550689697, "learning_rate": 9.598290695468346e-05, "loss": 0.9252, "step": 834 }, { "epoch": 0.1287639584135541, "grad_norm": 0.7497100234031677, "learning_rate": 9.596388101753422e-05, "loss": 2.0032, "step": 836 }, { "epoch": 0.12907200616095496, "grad_norm": 0.6821487545967102, "learning_rate": 9.594481202580349e-05, "loss": 1.6951, "step": 838 }, { "epoch": 0.1293800539083558, "grad_norm": 0.8109884858131409, "learning_rate": 9.592569999735325e-05, "loss": 1.0516, "step": 840 }, { "epoch": 0.12968810165575664, "grad_norm": 0.6355708837509155, "learning_rate": 9.590654495008586e-05, "loss": 0.9621, "step": 842 }, { "epoch": 0.12999614940315748, "grad_norm": 0.574379563331604, "learning_rate": 9.58873469019439e-05, "loss": 1.0012, "step": 844 }, { "epoch": 0.13030419715055833, "grad_norm": 0.8498492240905762, "learning_rate": 9.58681058709103e-05, "loss": 1.2151, "step": 846 }, { "epoch": 0.1306122448979592, "grad_norm": 1.3300247192382812, "learning_rate": 9.584882187500822e-05, "loss": 1.2963, "step": 848 }, { "epoch": 0.13092029264536004, "grad_norm": 0.6958180069923401, "learning_rate": 9.582949493230104e-05, "loss": 1.8582, "step": 850 }, { "epoch": 0.13122834039276088, "grad_norm": 0.7322615385055542, "learning_rate": 9.581012506089243e-05, "loss": 1.1858, "step": 852 }, { "epoch": 0.13153638814016172, "grad_norm": 0.6359802484512329, "learning_rate": 9.579071227892625e-05, "loss": 2.001, "step": 854 }, { "epoch": 0.13184443588756256, "grad_norm": 0.9646987318992615, "learning_rate": 9.577125660458649e-05, "loss": 1.5951, "step": 856 }, { "epoch": 0.13215248363496343, "grad_norm": 0.5302859544754028, "learning_rate": 9.575175805609741e-05, "loss": 1.0739, "step": 858 }, { "epoch": 0.13246053138236427, "grad_norm": 1.0131480693817139, "learning_rate": 9.57322166517234e-05, "loss": 1.2304, "step": 860 }, { "epoch": 0.1327685791297651, "grad_norm": 0.762199878692627, "learning_rate": 9.571263240976897e-05, "loss": 1.0024, "step": 862 }, { "epoch": 0.13307662687716595, "grad_norm": 0.6496574282646179, "learning_rate": 9.569300534857875e-05, "loss": 1.2726, "step": 864 }, { "epoch": 0.1333846746245668, "grad_norm": 0.5563488602638245, "learning_rate": 9.567333548653753e-05, "loss": 1.2358, "step": 866 }, { "epoch": 0.13369272237196766, "grad_norm": 0.6338052153587341, "learning_rate": 9.565362284207016e-05, "loss": 1.1254, "step": 868 }, { "epoch": 0.1340007701193685, "grad_norm": 0.8334317207336426, "learning_rate": 9.563386743364156e-05, "loss": 1.1189, "step": 870 }, { "epoch": 0.13430881786676935, "grad_norm": 0.6131289601325989, "learning_rate": 9.561406927975669e-05, "loss": 0.9274, "step": 872 }, { "epoch": 0.1346168656141702, "grad_norm": 0.5694435238838196, "learning_rate": 9.559422839896061e-05, "loss": 1.1181, "step": 874 }, { "epoch": 0.13492491336157103, "grad_norm": 1.0695499181747437, "learning_rate": 9.557434480983833e-05, "loss": 1.318, "step": 876 }, { "epoch": 0.1352329611089719, "grad_norm": 0.6700896620750427, "learning_rate": 9.555441853101494e-05, "loss": 0.9522, "step": 878 }, { "epoch": 0.13554100885637274, "grad_norm": 0.7727726697921753, "learning_rate": 9.553444958115545e-05, "loss": 1.0807, "step": 880 }, { "epoch": 0.13584905660377358, "grad_norm": 0.7434378266334534, "learning_rate": 9.551443797896487e-05, "loss": 1.0798, "step": 882 }, { "epoch": 0.13615710435117442, "grad_norm": 0.7013871073722839, "learning_rate": 9.549438374318818e-05, "loss": 1.3833, "step": 884 }, { "epoch": 0.13646515209857527, "grad_norm": 0.5864063501358032, "learning_rate": 9.547428689261024e-05, "loss": 1.2405, "step": 886 }, { "epoch": 0.13677319984597613, "grad_norm": 0.6030333638191223, "learning_rate": 9.54541474460559e-05, "loss": 1.056, "step": 888 }, { "epoch": 0.13708124759337698, "grad_norm": 0.8067970275878906, "learning_rate": 9.543396542238986e-05, "loss": 1.1467, "step": 890 }, { "epoch": 0.13738929534077782, "grad_norm": 0.7694763541221619, "learning_rate": 9.541374084051673e-05, "loss": 1.2326, "step": 892 }, { "epoch": 0.13769734308817866, "grad_norm": 0.9057612419128418, "learning_rate": 9.539347371938093e-05, "loss": 1.1664, "step": 894 }, { "epoch": 0.13800539083557953, "grad_norm": 0.9390200972557068, "learning_rate": 9.537316407796681e-05, "loss": 1.0607, "step": 896 }, { "epoch": 0.13831343858298037, "grad_norm": 0.7872670292854309, "learning_rate": 9.535281193529849e-05, "loss": 1.8178, "step": 898 }, { "epoch": 0.1386214863303812, "grad_norm": 0.7058428525924683, "learning_rate": 9.53324173104399e-05, "loss": 0.9798, "step": 900 }, { "epoch": 0.1386214863303812, "eval_loss": 2.5482165813446045, "eval_runtime": 736.9952, "eval_samples_per_second": 2.714, "eval_steps_per_second": 0.678, "step": 900 }, { "epoch": 0.13892953407778205, "grad_norm": 0.9759823083877563, "learning_rate": 9.531198022249479e-05, "loss": 1.473, "step": 902 }, { "epoch": 0.1392375818251829, "grad_norm": 1.039426326751709, "learning_rate": 9.52915006906067e-05, "loss": 1.4844, "step": 904 }, { "epoch": 0.13954562957258376, "grad_norm": 0.5784148573875427, "learning_rate": 9.527097873395887e-05, "loss": 1.0574, "step": 906 }, { "epoch": 0.1398536773199846, "grad_norm": 0.7337266802787781, "learning_rate": 9.525041437177433e-05, "loss": 1.1303, "step": 908 }, { "epoch": 0.14016172506738545, "grad_norm": 0.6352843642234802, "learning_rate": 9.522980762331582e-05, "loss": 1.2027, "step": 910 }, { "epoch": 0.1404697728147863, "grad_norm": 0.6290647983551025, "learning_rate": 9.520915850788575e-05, "loss": 2.3878, "step": 912 }, { "epoch": 0.14077782056218713, "grad_norm": 0.8372937440872192, "learning_rate": 9.518846704482627e-05, "loss": 1.0772, "step": 914 }, { "epoch": 0.141085868309588, "grad_norm": 0.826770544052124, "learning_rate": 9.516773325351915e-05, "loss": 1.3271, "step": 916 }, { "epoch": 0.14139391605698884, "grad_norm": 0.6032495498657227, "learning_rate": 9.514695715338585e-05, "loss": 1.001, "step": 918 }, { "epoch": 0.14170196380438968, "grad_norm": 0.8128840923309326, "learning_rate": 9.512613876388742e-05, "loss": 1.1132, "step": 920 }, { "epoch": 0.14201001155179052, "grad_norm": 0.718402087688446, "learning_rate": 9.510527810452455e-05, "loss": 1.1008, "step": 922 }, { "epoch": 0.14231805929919136, "grad_norm": 0.9620109796524048, "learning_rate": 9.508437519483753e-05, "loss": 1.0307, "step": 924 }, { "epoch": 0.14262610704659223, "grad_norm": 0.7499354481697083, "learning_rate": 9.506343005440618e-05, "loss": 0.918, "step": 926 }, { "epoch": 0.14293415479399307, "grad_norm": 0.6020653247833252, "learning_rate": 9.504244270284994e-05, "loss": 1.8571, "step": 928 }, { "epoch": 0.14324220254139391, "grad_norm": 0.718852162361145, "learning_rate": 9.502141315982776e-05, "loss": 1.2873, "step": 930 }, { "epoch": 0.14355025028879476, "grad_norm": 0.8073559403419495, "learning_rate": 9.50003414450381e-05, "loss": 0.8581, "step": 932 }, { "epoch": 0.1438582980361956, "grad_norm": 0.6343957185745239, "learning_rate": 9.497922757821894e-05, "loss": 1.145, "step": 934 }, { "epoch": 0.14416634578359647, "grad_norm": 0.9298047423362732, "learning_rate": 9.495807157914771e-05, "loss": 1.3353, "step": 936 }, { "epoch": 0.1444743935309973, "grad_norm": 0.7584412097930908, "learning_rate": 9.493687346764137e-05, "loss": 1.1033, "step": 938 }, { "epoch": 0.14478244127839815, "grad_norm": 0.5845352411270142, "learning_rate": 9.491563326355628e-05, "loss": 0.7995, "step": 940 }, { "epoch": 0.145090489025799, "grad_norm": 0.6179757714271545, "learning_rate": 9.489435098678823e-05, "loss": 1.1081, "step": 942 }, { "epoch": 0.14539853677319983, "grad_norm": 0.7248619198799133, "learning_rate": 9.487302665727243e-05, "loss": 1.4268, "step": 944 }, { "epoch": 0.1457065845206007, "grad_norm": 0.7348257899284363, "learning_rate": 9.485166029498348e-05, "loss": 1.1433, "step": 946 }, { "epoch": 0.14601463226800154, "grad_norm": 0.711986780166626, "learning_rate": 9.483025191993535e-05, "loss": 1.2964, "step": 948 }, { "epoch": 0.14632268001540238, "grad_norm": 0.6216188073158264, "learning_rate": 9.480880155218136e-05, "loss": 1.4536, "step": 950 }, { "epoch": 0.14663072776280323, "grad_norm": 0.8497354984283447, "learning_rate": 9.478730921181419e-05, "loss": 1.3397, "step": 952 }, { "epoch": 0.1469387755102041, "grad_norm": 0.5710319876670837, "learning_rate": 9.476577491896579e-05, "loss": 0.8536, "step": 954 }, { "epoch": 0.14724682325760494, "grad_norm": 0.6521850228309631, "learning_rate": 9.474419869380745e-05, "loss": 0.872, "step": 956 }, { "epoch": 0.14755487100500578, "grad_norm": 0.6547313928604126, "learning_rate": 9.472258055654971e-05, "loss": 0.8257, "step": 958 }, { "epoch": 0.14786291875240662, "grad_norm": 0.737887978553772, "learning_rate": 9.47009205274424e-05, "loss": 0.8285, "step": 960 }, { "epoch": 0.14817096649980746, "grad_norm": 0.6487706303596497, "learning_rate": 9.467921862677454e-05, "loss": 1.2221, "step": 962 }, { "epoch": 0.14847901424720833, "grad_norm": 0.6891322135925293, "learning_rate": 9.46574748748744e-05, "loss": 0.9617, "step": 964 }, { "epoch": 0.14878706199460917, "grad_norm": 0.6219374537467957, "learning_rate": 9.463568929210949e-05, "loss": 1.0844, "step": 966 }, { "epoch": 0.14909510974201, "grad_norm": 0.6913350820541382, "learning_rate": 9.461386189888643e-05, "loss": 0.9804, "step": 968 }, { "epoch": 0.14940315748941085, "grad_norm": 0.78050297498703, "learning_rate": 9.459199271565107e-05, "loss": 1.1167, "step": 970 }, { "epoch": 0.1497112052368117, "grad_norm": 0.589933454990387, "learning_rate": 9.457008176288837e-05, "loss": 1.1252, "step": 972 }, { "epoch": 0.15001925298421256, "grad_norm": 0.7892959713935852, "learning_rate": 9.45481290611224e-05, "loss": 1.1474, "step": 974 }, { "epoch": 0.1503273007316134, "grad_norm": 0.7642476558685303, "learning_rate": 9.452613463091637e-05, "loss": 1.0619, "step": 976 }, { "epoch": 0.15063534847901425, "grad_norm": 0.5432103872299194, "learning_rate": 9.450409849287258e-05, "loss": 1.1147, "step": 978 }, { "epoch": 0.1509433962264151, "grad_norm": 1.7676953077316284, "learning_rate": 9.448202066763237e-05, "loss": 1.0471, "step": 980 }, { "epoch": 0.15125144397381593, "grad_norm": 0.5658864378929138, "learning_rate": 9.445990117587614e-05, "loss": 1.003, "step": 982 }, { "epoch": 0.1515594917212168, "grad_norm": 0.6045776605606079, "learning_rate": 9.443774003832332e-05, "loss": 1.1408, "step": 984 }, { "epoch": 0.15186753946861764, "grad_norm": 0.6377173662185669, "learning_rate": 9.441553727573236e-05, "loss": 1.0693, "step": 986 }, { "epoch": 0.15217558721601848, "grad_norm": 0.8608678579330444, "learning_rate": 9.439329290890068e-05, "loss": 1.1497, "step": 988 }, { "epoch": 0.15248363496341932, "grad_norm": 0.5447673797607422, "learning_rate": 9.437100695866469e-05, "loss": 0.9368, "step": 990 }, { "epoch": 0.15279168271082016, "grad_norm": 0.6498827934265137, "learning_rate": 9.434867944589973e-05, "loss": 0.986, "step": 992 }, { "epoch": 0.15309973045822103, "grad_norm": 0.686646044254303, "learning_rate": 9.432631039152011e-05, "loss": 0.9154, "step": 994 }, { "epoch": 0.15340777820562188, "grad_norm": 0.7855954170227051, "learning_rate": 9.430389981647901e-05, "loss": 0.9782, "step": 996 }, { "epoch": 0.15371582595302272, "grad_norm": 0.6959761381149292, "learning_rate": 9.428144774176852e-05, "loss": 1.2808, "step": 998 }, { "epoch": 0.15402387370042356, "grad_norm": 0.6581859588623047, "learning_rate": 9.425895418841961e-05, "loss": 1.1032, "step": 1000 }, { "epoch": 0.1543319214478244, "grad_norm": 0.9970901012420654, "learning_rate": 9.42364191775021e-05, "loss": 1.0597, "step": 1002 }, { "epoch": 0.15463996919522527, "grad_norm": 0.6671077013015747, "learning_rate": 9.421384273012463e-05, "loss": 0.9503, "step": 1004 }, { "epoch": 0.1549480169426261, "grad_norm": 0.6460065841674805, "learning_rate": 9.419122486743466e-05, "loss": 0.933, "step": 1006 }, { "epoch": 0.15525606469002695, "grad_norm": 0.9560624361038208, "learning_rate": 9.416856561061846e-05, "loss": 1.2742, "step": 1008 }, { "epoch": 0.1555641124374278, "grad_norm": 0.6850244402885437, "learning_rate": 9.414586498090106e-05, "loss": 1.0752, "step": 1010 }, { "epoch": 0.15587216018482866, "grad_norm": 0.5755826830863953, "learning_rate": 9.412312299954622e-05, "loss": 1.0276, "step": 1012 }, { "epoch": 0.1561802079322295, "grad_norm": 0.5836648941040039, "learning_rate": 9.41003396878565e-05, "loss": 0.9003, "step": 1014 }, { "epoch": 0.15648825567963034, "grad_norm": 0.5940999984741211, "learning_rate": 9.40775150671731e-05, "loss": 0.9651, "step": 1016 }, { "epoch": 0.1567963034270312, "grad_norm": 0.8461489081382751, "learning_rate": 9.405464915887598e-05, "loss": 1.0209, "step": 1018 }, { "epoch": 0.15710435117443203, "grad_norm": 0.660050630569458, "learning_rate": 9.403174198438372e-05, "loss": 1.0409, "step": 1020 }, { "epoch": 0.1574123989218329, "grad_norm": 1.0249272584915161, "learning_rate": 9.400879356515357e-05, "loss": 1.3634, "step": 1022 }, { "epoch": 0.15772044666923374, "grad_norm": 0.6267766952514648, "learning_rate": 9.398580392268145e-05, "loss": 1.211, "step": 1024 }, { "epoch": 0.15802849441663458, "grad_norm": 0.7840976119041443, "learning_rate": 9.396277307850184e-05, "loss": 1.1994, "step": 1026 }, { "epoch": 0.15833654216403542, "grad_norm": 0.6230096817016602, "learning_rate": 9.393970105418786e-05, "loss": 0.9114, "step": 1028 }, { "epoch": 0.15864458991143626, "grad_norm": 1.0648771524429321, "learning_rate": 9.391658787135115e-05, "loss": 1.0579, "step": 1030 }, { "epoch": 0.15895263765883713, "grad_norm": 0.6591848134994507, "learning_rate": 9.389343355164198e-05, "loss": 1.2073, "step": 1032 }, { "epoch": 0.15926068540623797, "grad_norm": 0.9383640885353088, "learning_rate": 9.387023811674909e-05, "loss": 1.2236, "step": 1034 }, { "epoch": 0.15956873315363881, "grad_norm": 0.8099064230918884, "learning_rate": 9.384700158839972e-05, "loss": 0.8945, "step": 1036 }, { "epoch": 0.15987678090103966, "grad_norm": 0.6530327796936035, "learning_rate": 9.382372398835969e-05, "loss": 1.0963, "step": 1038 }, { "epoch": 0.1601848286484405, "grad_norm": 0.6211308836936951, "learning_rate": 9.380040533843319e-05, "loss": 1.0149, "step": 1040 }, { "epoch": 0.16049287639584137, "grad_norm": 0.7817839980125427, "learning_rate": 9.377704566046295e-05, "loss": 1.1448, "step": 1042 }, { "epoch": 0.1608009241432422, "grad_norm": 0.7905937433242798, "learning_rate": 9.375364497633006e-05, "loss": 1.153, "step": 1044 }, { "epoch": 0.16110897189064305, "grad_norm": 0.8386938571929932, "learning_rate": 9.373020330795403e-05, "loss": 1.0107, "step": 1046 }, { "epoch": 0.1614170196380439, "grad_norm": 0.7169632315635681, "learning_rate": 9.370672067729284e-05, "loss": 1.3401, "step": 1048 }, { "epoch": 0.16172506738544473, "grad_norm": 0.808017909526825, "learning_rate": 9.368319710634273e-05, "loss": 1.0624, "step": 1050 }, { "epoch": 0.1620331151328456, "grad_norm": 0.7317773699760437, "learning_rate": 9.365963261713835e-05, "loss": 1.0429, "step": 1052 }, { "epoch": 0.16234116288024644, "grad_norm": 0.7492151260375977, "learning_rate": 9.363602723175268e-05, "loss": 1.078, "step": 1054 }, { "epoch": 0.16264921062764728, "grad_norm": 0.8238731622695923, "learning_rate": 9.361238097229699e-05, "loss": 1.2244, "step": 1056 }, { "epoch": 0.16295725837504813, "grad_norm": 0.899804949760437, "learning_rate": 9.358869386092084e-05, "loss": 1.2607, "step": 1058 }, { "epoch": 0.16326530612244897, "grad_norm": 0.6087053418159485, "learning_rate": 9.356496591981204e-05, "loss": 1.0546, "step": 1060 }, { "epoch": 0.16357335386984984, "grad_norm": 0.574735164642334, "learning_rate": 9.354119717119669e-05, "loss": 1.1068, "step": 1062 }, { "epoch": 0.16388140161725068, "grad_norm": 0.7853001356124878, "learning_rate": 9.351738763733906e-05, "loss": 1.0626, "step": 1064 }, { "epoch": 0.16418944936465152, "grad_norm": 0.7452929019927979, "learning_rate": 9.349353734054167e-05, "loss": 1.0014, "step": 1066 }, { "epoch": 0.16449749711205236, "grad_norm": 0.5956066250801086, "learning_rate": 9.346964630314521e-05, "loss": 1.1041, "step": 1068 }, { "epoch": 0.16480554485945323, "grad_norm": 1.5090982913970947, "learning_rate": 9.344571454752851e-05, "loss": 1.0974, "step": 1070 }, { "epoch": 0.16511359260685407, "grad_norm": 0.7556977272033691, "learning_rate": 9.342174209610857e-05, "loss": 1.0473, "step": 1072 }, { "epoch": 0.1654216403542549, "grad_norm": 0.5461626052856445, "learning_rate": 9.339772897134049e-05, "loss": 1.2671, "step": 1074 }, { "epoch": 0.16572968810165575, "grad_norm": 0.7559049725532532, "learning_rate": 9.337367519571748e-05, "loss": 1.1524, "step": 1076 }, { "epoch": 0.1660377358490566, "grad_norm": 1.1034940481185913, "learning_rate": 9.334958079177081e-05, "loss": 1.058, "step": 1078 }, { "epoch": 0.16634578359645746, "grad_norm": 0.6188117861747742, "learning_rate": 9.332544578206985e-05, "loss": 1.1278, "step": 1080 }, { "epoch": 0.1666538313438583, "grad_norm": 0.6941109895706177, "learning_rate": 9.330127018922194e-05, "loss": 1.0709, "step": 1082 }, { "epoch": 0.16696187909125915, "grad_norm": 0.6960075497627258, "learning_rate": 9.327705403587248e-05, "loss": 0.9336, "step": 1084 }, { "epoch": 0.16726992683866, "grad_norm": 0.9475935101509094, "learning_rate": 9.325279734470488e-05, "loss": 1.0948, "step": 1086 }, { "epoch": 0.16757797458606083, "grad_norm": 0.9592021107673645, "learning_rate": 9.322850013844046e-05, "loss": 2.709, "step": 1088 }, { "epoch": 0.1678860223334617, "grad_norm": 0.6552063226699829, "learning_rate": 9.320416243983856e-05, "loss": 0.8764, "step": 1090 }, { "epoch": 0.16819407008086254, "grad_norm": 0.8499904870986938, "learning_rate": 9.317978427169638e-05, "loss": 1.2129, "step": 1092 }, { "epoch": 0.16850211782826338, "grad_norm": 0.6355566382408142, "learning_rate": 9.31553656568491e-05, "loss": 0.941, "step": 1094 }, { "epoch": 0.16881016557566422, "grad_norm": 0.6042637228965759, "learning_rate": 9.313090661816972e-05, "loss": 1.2016, "step": 1096 }, { "epoch": 0.16911821332306506, "grad_norm": 0.6587199568748474, "learning_rate": 9.310640717856915e-05, "loss": 0.9431, "step": 1098 }, { "epoch": 0.16942626107046593, "grad_norm": 0.6389713883399963, "learning_rate": 9.308186736099614e-05, "loss": 2.0506, "step": 1100 }, { "epoch": 0.16973430881786677, "grad_norm": 0.8145257234573364, "learning_rate": 9.305728718843723e-05, "loss": 1.0665, "step": 1102 }, { "epoch": 0.17004235656526762, "grad_norm": 0.8680522441864014, "learning_rate": 9.303266668391679e-05, "loss": 1.1634, "step": 1104 }, { "epoch": 0.17035040431266846, "grad_norm": 0.9924330115318298, "learning_rate": 9.300800587049696e-05, "loss": 1.0959, "step": 1106 }, { "epoch": 0.1706584520600693, "grad_norm": 0.5335946083068848, "learning_rate": 9.298330477127763e-05, "loss": 1.9391, "step": 1108 }, { "epoch": 0.17096649980747017, "grad_norm": 0.6439701914787292, "learning_rate": 9.295856340939648e-05, "loss": 1.1721, "step": 1110 }, { "epoch": 0.171274547554871, "grad_norm": 0.9099013209342957, "learning_rate": 9.293378180802878e-05, "loss": 1.2054, "step": 1112 }, { "epoch": 0.17158259530227185, "grad_norm": 0.5536646842956543, "learning_rate": 9.290895999038765e-05, "loss": 0.8748, "step": 1114 }, { "epoch": 0.1718906430496727, "grad_norm": 0.7243412733078003, "learning_rate": 9.288409797972375e-05, "loss": 0.9013, "step": 1116 }, { "epoch": 0.17219869079707353, "grad_norm": 0.5222441554069519, "learning_rate": 9.285919579932548e-05, "loss": 0.9313, "step": 1118 }, { "epoch": 0.1725067385444744, "grad_norm": 0.7990663647651672, "learning_rate": 9.28342534725188e-05, "loss": 2.3069, "step": 1120 }, { "epoch": 0.17281478629187524, "grad_norm": 1.0527921915054321, "learning_rate": 9.280927102266729e-05, "loss": 0.9213, "step": 1122 }, { "epoch": 0.17312283403927609, "grad_norm": 0.8192021250724792, "learning_rate": 9.278424847317217e-05, "loss": 2.6465, "step": 1124 }, { "epoch": 0.17343088178667693, "grad_norm": 0.7312417030334473, "learning_rate": 9.275918584747216e-05, "loss": 1.1035, "step": 1126 }, { "epoch": 0.17373892953407777, "grad_norm": 0.7595751881599426, "learning_rate": 9.273408316904353e-05, "loss": 1.0814, "step": 1128 }, { "epoch": 0.17404697728147864, "grad_norm": 0.6808615326881409, "learning_rate": 9.270894046140009e-05, "loss": 1.6817, "step": 1130 }, { "epoch": 0.17435502502887948, "grad_norm": 0.7840389013290405, "learning_rate": 9.268375774809312e-05, "loss": 1.3194, "step": 1132 }, { "epoch": 0.17466307277628032, "grad_norm": 0.665709376335144, "learning_rate": 9.265853505271139e-05, "loss": 1.0137, "step": 1134 }, { "epoch": 0.17497112052368116, "grad_norm": 0.7553781270980835, "learning_rate": 9.26332723988811e-05, "loss": 0.9437, "step": 1136 }, { "epoch": 0.17527916827108203, "grad_norm": 0.6285457015037537, "learning_rate": 9.260796981026591e-05, "loss": 0.9462, "step": 1138 }, { "epoch": 0.17558721601848287, "grad_norm": 0.6842612624168396, "learning_rate": 9.258262731056688e-05, "loss": 1.334, "step": 1140 }, { "epoch": 0.1758952637658837, "grad_norm": 0.8658350110054016, "learning_rate": 9.255724492352245e-05, "loss": 1.1117, "step": 1142 }, { "epoch": 0.17620331151328456, "grad_norm": 0.6949405074119568, "learning_rate": 9.25318226729084e-05, "loss": 1.0724, "step": 1144 }, { "epoch": 0.1765113592606854, "grad_norm": 0.8136616349220276, "learning_rate": 9.250636058253788e-05, "loss": 1.1257, "step": 1146 }, { "epoch": 0.17681940700808627, "grad_norm": 0.7518190741539001, "learning_rate": 9.248085867626136e-05, "loss": 0.912, "step": 1148 }, { "epoch": 0.1771274547554871, "grad_norm": 0.783767819404602, "learning_rate": 9.245531697796656e-05, "loss": 1.1135, "step": 1150 }, { "epoch": 0.17743550250288795, "grad_norm": 0.637515127658844, "learning_rate": 9.242973551157857e-05, "loss": 1.7548, "step": 1152 }, { "epoch": 0.1777435502502888, "grad_norm": 0.6867687702178955, "learning_rate": 9.24041143010596e-05, "loss": 0.9194, "step": 1154 }, { "epoch": 0.17805159799768963, "grad_norm": 0.647327721118927, "learning_rate": 9.23784533704092e-05, "loss": 0.9544, "step": 1156 }, { "epoch": 0.1783596457450905, "grad_norm": 0.8152113556861877, "learning_rate": 9.235275274366406e-05, "loss": 1.0048, "step": 1158 }, { "epoch": 0.17866769349249134, "grad_norm": 0.5865814685821533, "learning_rate": 9.23270124448981e-05, "loss": 0.9702, "step": 1160 }, { "epoch": 0.17897574123989218, "grad_norm": 0.7674340009689331, "learning_rate": 9.230123249822236e-05, "loss": 1.5428, "step": 1162 }, { "epoch": 0.17928378898729302, "grad_norm": 1.6844772100448608, "learning_rate": 9.227541292778504e-05, "loss": 1.0052, "step": 1164 }, { "epoch": 0.17959183673469387, "grad_norm": 0.8579050898551941, "learning_rate": 9.224955375777147e-05, "loss": 1.1531, "step": 1166 }, { "epoch": 0.17989988448209474, "grad_norm": 0.7544540762901306, "learning_rate": 9.222365501240402e-05, "loss": 1.3026, "step": 1168 }, { "epoch": 0.18020793222949558, "grad_norm": 0.7298200130462646, "learning_rate": 9.21977167159422e-05, "loss": 1.0986, "step": 1170 }, { "epoch": 0.18051597997689642, "grad_norm": 0.5601779222488403, "learning_rate": 9.21717388926825e-05, "loss": 1.082, "step": 1172 }, { "epoch": 0.18082402772429726, "grad_norm": 0.6664190888404846, "learning_rate": 9.214572156695849e-05, "loss": 1.7701, "step": 1174 }, { "epoch": 0.1811320754716981, "grad_norm": 0.8289921283721924, "learning_rate": 9.211966476314072e-05, "loss": 2.7131, "step": 1176 }, { "epoch": 0.18144012321909897, "grad_norm": 0.6665297746658325, "learning_rate": 9.209356850563672e-05, "loss": 1.0969, "step": 1178 }, { "epoch": 0.1817481709664998, "grad_norm": 0.5994783043861389, "learning_rate": 9.206743281889097e-05, "loss": 1.1694, "step": 1180 }, { "epoch": 0.18205621871390065, "grad_norm": 0.6745316982269287, "learning_rate": 9.204125772738488e-05, "loss": 1.4434, "step": 1182 }, { "epoch": 0.1823642664613015, "grad_norm": 0.683441162109375, "learning_rate": 9.20150432556368e-05, "loss": 1.0758, "step": 1184 }, { "epoch": 0.18267231420870234, "grad_norm": 0.7067748308181763, "learning_rate": 9.198878942820195e-05, "loss": 1.0938, "step": 1186 }, { "epoch": 0.1829803619561032, "grad_norm": 0.845294713973999, "learning_rate": 9.196249626967237e-05, "loss": 1.2249, "step": 1188 }, { "epoch": 0.18328840970350405, "grad_norm": 0.9288262128829956, "learning_rate": 9.193616380467704e-05, "loss": 1.1201, "step": 1190 }, { "epoch": 0.1835964574509049, "grad_norm": 0.6859925985336304, "learning_rate": 9.190979205788169e-05, "loss": 1.2275, "step": 1192 }, { "epoch": 0.18390450519830573, "grad_norm": 0.6356547474861145, "learning_rate": 9.188338105398882e-05, "loss": 0.9563, "step": 1194 }, { "epoch": 0.1842125529457066, "grad_norm": 0.6628222465515137, "learning_rate": 9.185693081773777e-05, "loss": 0.9248, "step": 1196 }, { "epoch": 0.18452060069310744, "grad_norm": 0.8962293267250061, "learning_rate": 9.183044137390461e-05, "loss": 1.9412, "step": 1198 }, { "epoch": 0.18482864844050828, "grad_norm": 0.673896849155426, "learning_rate": 9.18039127473021e-05, "loss": 1.0603, "step": 1200 }, { "epoch": 0.18482864844050828, "eval_loss": 2.5936074256896973, "eval_runtime": 736.9969, "eval_samples_per_second": 2.714, "eval_steps_per_second": 0.678, "step": 1200 }, { "epoch": 0.18513669618790912, "grad_norm": 0.5971192121505737, "learning_rate": 9.177734496277975e-05, "loss": 0.9434, "step": 1202 }, { "epoch": 0.18544474393530996, "grad_norm": 0.867962658405304, "learning_rate": 9.175073804522371e-05, "loss": 1.242, "step": 1204 }, { "epoch": 0.18575279168271083, "grad_norm": 0.6973360776901245, "learning_rate": 9.17240920195568e-05, "loss": 1.0988, "step": 1206 }, { "epoch": 0.18606083943011167, "grad_norm": 0.6780734658241272, "learning_rate": 9.169740691073852e-05, "loss": 1.1157, "step": 1208 }, { "epoch": 0.18636888717751252, "grad_norm": 0.9438827633857727, "learning_rate": 9.167068274376487e-05, "loss": 1.1604, "step": 1210 }, { "epoch": 0.18667693492491336, "grad_norm": 0.7693531513214111, "learning_rate": 9.164391954366855e-05, "loss": 1.1541, "step": 1212 }, { "epoch": 0.1869849826723142, "grad_norm": 0.5881041884422302, "learning_rate": 9.161711733551877e-05, "loss": 2.0453, "step": 1214 }, { "epoch": 0.18729303041971507, "grad_norm": 0.5809153914451599, "learning_rate": 9.159027614442126e-05, "loss": 2.0997, "step": 1216 }, { "epoch": 0.1876010781671159, "grad_norm": 0.7402828335762024, "learning_rate": 9.15633959955183e-05, "loss": 1.0956, "step": 1218 }, { "epoch": 0.18790912591451675, "grad_norm": 0.7167702913284302, "learning_rate": 9.153647691398866e-05, "loss": 3.4171, "step": 1220 }, { "epoch": 0.1882171736619176, "grad_norm": 0.9422446489334106, "learning_rate": 9.150951892504754e-05, "loss": 1.1465, "step": 1222 }, { "epoch": 0.18852522140931843, "grad_norm": 0.4820753037929535, "learning_rate": 9.148252205394665e-05, "loss": 1.045, "step": 1224 }, { "epoch": 0.1888332691567193, "grad_norm": 0.768250048160553, "learning_rate": 9.145548632597408e-05, "loss": 1.2842, "step": 1226 }, { "epoch": 0.18914131690412014, "grad_norm": 0.7467925548553467, "learning_rate": 9.142841176645429e-05, "loss": 1.2167, "step": 1228 }, { "epoch": 0.18944936465152099, "grad_norm": 0.8527848720550537, "learning_rate": 9.140129840074818e-05, "loss": 0.9865, "step": 1230 }, { "epoch": 0.18975741239892183, "grad_norm": 1.1066113710403442, "learning_rate": 9.137414625425295e-05, "loss": 1.2248, "step": 1232 }, { "epoch": 0.19006546014632267, "grad_norm": 1.0038797855377197, "learning_rate": 9.134695535240216e-05, "loss": 1.1177, "step": 1234 }, { "epoch": 0.19037350789372354, "grad_norm": 0.6442514061927795, "learning_rate": 9.131972572066563e-05, "loss": 1.1056, "step": 1236 }, { "epoch": 0.19068155564112438, "grad_norm": 0.5913024544715881, "learning_rate": 9.12924573845495e-05, "loss": 1.052, "step": 1238 }, { "epoch": 0.19098960338852522, "grad_norm": 0.7442435622215271, "learning_rate": 9.126515036959613e-05, "loss": 1.023, "step": 1240 }, { "epoch": 0.19129765113592606, "grad_norm": 0.7399488687515259, "learning_rate": 9.123780470138415e-05, "loss": 1.4022, "step": 1242 }, { "epoch": 0.1916056988833269, "grad_norm": 0.6702150106430054, "learning_rate": 9.121042040552836e-05, "loss": 0.973, "step": 1244 }, { "epoch": 0.19191374663072777, "grad_norm": 0.6792922019958496, "learning_rate": 9.118299750767976e-05, "loss": 1.0224, "step": 1246 }, { "epoch": 0.1922217943781286, "grad_norm": 0.7627959847450256, "learning_rate": 9.115553603352551e-05, "loss": 0.9449, "step": 1248 }, { "epoch": 0.19252984212552945, "grad_norm": 0.7185088992118835, "learning_rate": 9.11280360087889e-05, "loss": 0.9539, "step": 1250 }, { "epoch": 0.1928378898729303, "grad_norm": 0.6071786880493164, "learning_rate": 9.110049745922933e-05, "loss": 1.1379, "step": 1252 }, { "epoch": 0.19314593762033117, "grad_norm": 0.6094626784324646, "learning_rate": 9.107292041064229e-05, "loss": 1.1505, "step": 1254 }, { "epoch": 0.193453985367732, "grad_norm": 0.6313731670379639, "learning_rate": 9.104530488885932e-05, "loss": 0.9058, "step": 1256 }, { "epoch": 0.19376203311513285, "grad_norm": 0.6403363943099976, "learning_rate": 9.1017650919748e-05, "loss": 1.212, "step": 1258 }, { "epoch": 0.1940700808625337, "grad_norm": 0.6836332678794861, "learning_rate": 9.098995852921197e-05, "loss": 1.1166, "step": 1260 }, { "epoch": 0.19437812860993453, "grad_norm": 0.5408475995063782, "learning_rate": 9.09622277431908e-05, "loss": 1.2248, "step": 1262 }, { "epoch": 0.1946861763573354, "grad_norm": 0.586330771446228, "learning_rate": 9.093445858766004e-05, "loss": 0.8889, "step": 1264 }, { "epoch": 0.19499422410473624, "grad_norm": 0.7370517253875732, "learning_rate": 9.090665108863118e-05, "loss": 1.1122, "step": 1266 }, { "epoch": 0.19530227185213708, "grad_norm": 1.0544092655181885, "learning_rate": 9.087880527215167e-05, "loss": 1.2945, "step": 1268 }, { "epoch": 0.19561031959953792, "grad_norm": 0.7124229073524475, "learning_rate": 9.085092116430479e-05, "loss": 1.0174, "step": 1270 }, { "epoch": 0.19591836734693877, "grad_norm": 0.6399815678596497, "learning_rate": 9.08229987912097e-05, "loss": 1.2766, "step": 1272 }, { "epoch": 0.19622641509433963, "grad_norm": 0.6686626076698303, "learning_rate": 9.079503817902144e-05, "loss": 0.9428, "step": 1274 }, { "epoch": 0.19653446284174048, "grad_norm": 0.9779509902000427, "learning_rate": 9.076703935393083e-05, "loss": 1.1812, "step": 1276 }, { "epoch": 0.19684251058914132, "grad_norm": 0.6352618336677551, "learning_rate": 9.073900234216452e-05, "loss": 1.0626, "step": 1278 }, { "epoch": 0.19715055833654216, "grad_norm": 0.6762117147445679, "learning_rate": 9.07109271699849e-05, "loss": 1.2162, "step": 1280 }, { "epoch": 0.197458606083943, "grad_norm": 0.6376475095748901, "learning_rate": 9.06828138636901e-05, "loss": 0.8033, "step": 1282 }, { "epoch": 0.19776665383134387, "grad_norm": 0.6817375421524048, "learning_rate": 9.065466244961402e-05, "loss": 1.1285, "step": 1284 }, { "epoch": 0.1980747015787447, "grad_norm": 0.7756999731063843, "learning_rate": 9.062647295412619e-05, "loss": 0.9947, "step": 1286 }, { "epoch": 0.19838274932614555, "grad_norm": 0.9223238825798035, "learning_rate": 9.059824540363183e-05, "loss": 1.036, "step": 1288 }, { "epoch": 0.1986907970735464, "grad_norm": 0.7431936860084534, "learning_rate": 9.056997982457185e-05, "loss": 1.1404, "step": 1290 }, { "epoch": 0.19899884482094723, "grad_norm": 0.7931793928146362, "learning_rate": 9.054167624342275e-05, "loss": 1.1076, "step": 1292 }, { "epoch": 0.1993068925683481, "grad_norm": 0.5251017808914185, "learning_rate": 9.05133346866966e-05, "loss": 0.92, "step": 1294 }, { "epoch": 0.19961494031574895, "grad_norm": 0.5351060628890991, "learning_rate": 9.048495518094109e-05, "loss": 1.1359, "step": 1296 }, { "epoch": 0.1999229880631498, "grad_norm": 0.7550303936004639, "learning_rate": 9.045653775273942e-05, "loss": 1.1408, "step": 1298 }, { "epoch": 0.20023103581055063, "grad_norm": 0.7095838189125061, "learning_rate": 9.042808242871035e-05, "loss": 1.0531, "step": 1300 }, { "epoch": 0.20053908355795147, "grad_norm": 0.5679411888122559, "learning_rate": 9.039958923550808e-05, "loss": 0.9117, "step": 1302 }, { "epoch": 0.20084713130535234, "grad_norm": 0.7613952159881592, "learning_rate": 9.037105819982234e-05, "loss": 1.1077, "step": 1304 }, { "epoch": 0.20115517905275318, "grad_norm": 0.5979565978050232, "learning_rate": 9.03424893483783e-05, "loss": 1.0161, "step": 1306 }, { "epoch": 0.20146322680015402, "grad_norm": 0.7908322811126709, "learning_rate": 9.03138827079365e-05, "loss": 1.2111, "step": 1308 }, { "epoch": 0.20177127454755486, "grad_norm": 0.7367496490478516, "learning_rate": 9.028523830529295e-05, "loss": 0.923, "step": 1310 }, { "epoch": 0.2020793222949557, "grad_norm": 0.6930207014083862, "learning_rate": 9.025655616727895e-05, "loss": 0.9931, "step": 1312 }, { "epoch": 0.20238737004235657, "grad_norm": 0.8138192296028137, "learning_rate": 9.022783632076122e-05, "loss": 1.1191, "step": 1314 }, { "epoch": 0.20269541778975741, "grad_norm": 0.7855730652809143, "learning_rate": 9.019907879264179e-05, "loss": 1.2692, "step": 1316 }, { "epoch": 0.20300346553715826, "grad_norm": 0.5641903281211853, "learning_rate": 9.017028360985794e-05, "loss": 1.5539, "step": 1318 }, { "epoch": 0.2033115132845591, "grad_norm": 0.8672736883163452, "learning_rate": 9.014145079938228e-05, "loss": 1.1793, "step": 1320 }, { "epoch": 0.20361956103195997, "grad_norm": 0.7759377360343933, "learning_rate": 9.01125803882226e-05, "loss": 1.2328, "step": 1322 }, { "epoch": 0.2039276087793608, "grad_norm": 0.8218508958816528, "learning_rate": 9.008367240342198e-05, "loss": 1.0023, "step": 1324 }, { "epoch": 0.20423565652676165, "grad_norm": 0.7159973382949829, "learning_rate": 9.005472687205867e-05, "loss": 1.7163, "step": 1326 }, { "epoch": 0.2045437042741625, "grad_norm": 0.7014909386634827, "learning_rate": 9.002574382124604e-05, "loss": 1.1984, "step": 1328 }, { "epoch": 0.20485175202156333, "grad_norm": 0.9051117897033691, "learning_rate": 8.999672327813271e-05, "loss": 1.1805, "step": 1330 }, { "epoch": 0.2051597997689642, "grad_norm": 0.6193578839302063, "learning_rate": 8.99676652699023e-05, "loss": 0.9635, "step": 1332 }, { "epoch": 0.20546784751636504, "grad_norm": 0.8723487257957458, "learning_rate": 8.993856982377362e-05, "loss": 1.2039, "step": 1334 }, { "epoch": 0.20577589526376588, "grad_norm": 0.7481098771095276, "learning_rate": 8.990943696700049e-05, "loss": 1.097, "step": 1336 }, { "epoch": 0.20608394301116673, "grad_norm": 0.8442493677139282, "learning_rate": 8.988026672687182e-05, "loss": 1.2301, "step": 1338 }, { "epoch": 0.20639199075856757, "grad_norm": 0.6437552571296692, "learning_rate": 8.985105913071148e-05, "loss": 2.1001, "step": 1340 }, { "epoch": 0.20670003850596844, "grad_norm": 0.6734112501144409, "learning_rate": 8.982181420587836e-05, "loss": 1.1627, "step": 1342 }, { "epoch": 0.20700808625336928, "grad_norm": 0.7252438068389893, "learning_rate": 8.979253197976633e-05, "loss": 0.97, "step": 1344 }, { "epoch": 0.20731613400077012, "grad_norm": 0.670896053314209, "learning_rate": 8.976321247980419e-05, "loss": 1.7838, "step": 1346 }, { "epoch": 0.20762418174817096, "grad_norm": 0.7382272481918335, "learning_rate": 8.973385573345566e-05, "loss": 2.5687, "step": 1348 }, { "epoch": 0.2079322294955718, "grad_norm": 0.6926072835922241, "learning_rate": 8.970446176821933e-05, "loss": 0.9489, "step": 1350 }, { "epoch": 0.20824027724297267, "grad_norm": 0.6148782968521118, "learning_rate": 8.967503061162865e-05, "loss": 0.753, "step": 1352 }, { "epoch": 0.2085483249903735, "grad_norm": 0.8001696467399597, "learning_rate": 8.964556229125194e-05, "loss": 1.9705, "step": 1354 }, { "epoch": 0.20885637273777435, "grad_norm": 0.7240262627601624, "learning_rate": 8.961605683469232e-05, "loss": 0.986, "step": 1356 }, { "epoch": 0.2091644204851752, "grad_norm": 0.7085553407669067, "learning_rate": 8.958651426958767e-05, "loss": 1.1624, "step": 1358 }, { "epoch": 0.20947246823257604, "grad_norm": 0.6068323850631714, "learning_rate": 8.955693462361065e-05, "loss": 1.0409, "step": 1360 }, { "epoch": 0.2097805159799769, "grad_norm": 0.6230590343475342, "learning_rate": 8.952731792446865e-05, "loss": 0.9571, "step": 1362 }, { "epoch": 0.21008856372737775, "grad_norm": 0.6568693518638611, "learning_rate": 8.949766419990379e-05, "loss": 0.9922, "step": 1364 }, { "epoch": 0.2103966114747786, "grad_norm": 0.7212346196174622, "learning_rate": 8.946797347769284e-05, "loss": 1.2634, "step": 1366 }, { "epoch": 0.21070465922217943, "grad_norm": 0.7108849287033081, "learning_rate": 8.943824578564724e-05, "loss": 1.1369, "step": 1368 }, { "epoch": 0.21101270696958027, "grad_norm": 1.3569531440734863, "learning_rate": 8.940848115161307e-05, "loss": 1.1873, "step": 1370 }, { "epoch": 0.21132075471698114, "grad_norm": 0.6778193116188049, "learning_rate": 8.937867960347095e-05, "loss": 1.0359, "step": 1372 }, { "epoch": 0.21162880246438198, "grad_norm": 0.6958675384521484, "learning_rate": 8.93488411691362e-05, "loss": 1.8977, "step": 1374 }, { "epoch": 0.21193685021178282, "grad_norm": 0.7475863695144653, "learning_rate": 8.931896587655857e-05, "loss": 1.4979, "step": 1376 }, { "epoch": 0.21224489795918366, "grad_norm": 0.7854467034339905, "learning_rate": 8.92890537537224e-05, "loss": 1.1155, "step": 1378 }, { "epoch": 0.21255294570658453, "grad_norm": 0.8435900211334229, "learning_rate": 8.925910482864652e-05, "loss": 1.1637, "step": 1380 }, { "epoch": 0.21286099345398538, "grad_norm": 0.6406680941581726, "learning_rate": 8.922911912938422e-05, "loss": 1.8449, "step": 1382 }, { "epoch": 0.21316904120138622, "grad_norm": 0.8102788329124451, "learning_rate": 8.919909668402325e-05, "loss": 0.9947, "step": 1384 }, { "epoch": 0.21347708894878706, "grad_norm": 0.7807202935218811, "learning_rate": 8.916903752068578e-05, "loss": 1.0053, "step": 1386 }, { "epoch": 0.2137851366961879, "grad_norm": 0.7852822542190552, "learning_rate": 8.913894166752835e-05, "loss": 0.9945, "step": 1388 }, { "epoch": 0.21409318444358877, "grad_norm": 0.6369873881340027, "learning_rate": 8.910880915274191e-05, "loss": 0.9943, "step": 1390 }, { "epoch": 0.2144012321909896, "grad_norm": 0.568693220615387, "learning_rate": 8.90786400045517e-05, "loss": 1.1122, "step": 1392 }, { "epoch": 0.21470927993839045, "grad_norm": 0.678931474685669, "learning_rate": 8.904843425121733e-05, "loss": 1.0143, "step": 1394 }, { "epoch": 0.2150173276857913, "grad_norm": 0.6750305891036987, "learning_rate": 8.901819192103266e-05, "loss": 0.8868, "step": 1396 }, { "epoch": 0.21532537543319213, "grad_norm": 0.7823706269264221, "learning_rate": 8.898791304232581e-05, "loss": 1.0879, "step": 1398 }, { "epoch": 0.215633423180593, "grad_norm": 0.6973862051963806, "learning_rate": 8.895759764345914e-05, "loss": 0.9778, "step": 1400 }, { "epoch": 0.21594147092799384, "grad_norm": 0.5516627430915833, "learning_rate": 8.892724575282927e-05, "loss": 1.6965, "step": 1402 }, { "epoch": 0.2162495186753947, "grad_norm": 0.5122855305671692, "learning_rate": 8.889685739886691e-05, "loss": 0.9705, "step": 1404 }, { "epoch": 0.21655756642279553, "grad_norm": 0.7519564032554626, "learning_rate": 8.886643261003697e-05, "loss": 1.0991, "step": 1406 }, { "epoch": 0.21686561417019637, "grad_norm": 0.6111971139907837, "learning_rate": 8.883597141483854e-05, "loss": 0.8201, "step": 1408 }, { "epoch": 0.21717366191759724, "grad_norm": 0.945604145526886, "learning_rate": 8.880547384180473e-05, "loss": 1.2554, "step": 1410 }, { "epoch": 0.21748170966499808, "grad_norm": 0.6552610397338867, "learning_rate": 8.877493991950276e-05, "loss": 2.228, "step": 1412 }, { "epoch": 0.21778975741239892, "grad_norm": 0.7612242102622986, "learning_rate": 8.87443696765339e-05, "loss": 0.9426, "step": 1414 }, { "epoch": 0.21809780515979976, "grad_norm": 0.8174354434013367, "learning_rate": 8.871376314153344e-05, "loss": 1.4622, "step": 1416 }, { "epoch": 0.2184058529072006, "grad_norm": 0.7058425545692444, "learning_rate": 8.868312034317067e-05, "loss": 1.4126, "step": 1418 }, { "epoch": 0.21871390065460147, "grad_norm": 0.7839975953102112, "learning_rate": 8.865244131014883e-05, "loss": 0.9144, "step": 1420 }, { "epoch": 0.21902194840200231, "grad_norm": 0.7724378108978271, "learning_rate": 8.862172607120512e-05, "loss": 0.8829, "step": 1422 }, { "epoch": 0.21932999614940316, "grad_norm": 0.8645272254943848, "learning_rate": 8.859097465511064e-05, "loss": 1.1721, "step": 1424 }, { "epoch": 0.219638043896804, "grad_norm": 0.7928360104560852, "learning_rate": 8.85601870906704e-05, "loss": 0.8773, "step": 1426 }, { "epoch": 0.21994609164420484, "grad_norm": 0.6146901249885559, "learning_rate": 8.852936340672324e-05, "loss": 0.9543, "step": 1428 }, { "epoch": 0.2202541393916057, "grad_norm": 0.7110276818275452, "learning_rate": 8.849850363214186e-05, "loss": 1.2455, "step": 1430 }, { "epoch": 0.22056218713900655, "grad_norm": 0.7560061812400818, "learning_rate": 8.846760779583274e-05, "loss": 1.1127, "step": 1432 }, { "epoch": 0.2208702348864074, "grad_norm": 0.8137415647506714, "learning_rate": 8.843667592673616e-05, "loss": 1.1296, "step": 1434 }, { "epoch": 0.22117828263380823, "grad_norm": 0.9150519371032715, "learning_rate": 8.840570805382617e-05, "loss": 2.6254, "step": 1436 }, { "epoch": 0.2214863303812091, "grad_norm": 0.5578750371932983, "learning_rate": 8.837470420611048e-05, "loss": 1.0047, "step": 1438 }, { "epoch": 0.22179437812860994, "grad_norm": 0.6566774845123291, "learning_rate": 8.834366441263056e-05, "loss": 0.9647, "step": 1440 }, { "epoch": 0.22210242587601078, "grad_norm": 0.7494180202484131, "learning_rate": 8.831258870246154e-05, "loss": 0.8414, "step": 1442 }, { "epoch": 0.22241047362341163, "grad_norm": 0.47362321615219116, "learning_rate": 8.828147710471217e-05, "loss": 0.9061, "step": 1444 }, { "epoch": 0.22271852137081247, "grad_norm": 0.8754947781562805, "learning_rate": 8.825032964852482e-05, "loss": 2.4502, "step": 1446 }, { "epoch": 0.22302656911821334, "grad_norm": 0.6758772134780884, "learning_rate": 8.821914636307547e-05, "loss": 0.9356, "step": 1448 }, { "epoch": 0.22333461686561418, "grad_norm": 0.8317671418190002, "learning_rate": 8.818792727757363e-05, "loss": 1.0184, "step": 1450 }, { "epoch": 0.22364266461301502, "grad_norm": 0.6676676273345947, "learning_rate": 8.81566724212624e-05, "loss": 2.2253, "step": 1452 }, { "epoch": 0.22395071236041586, "grad_norm": 0.7731775045394897, "learning_rate": 8.812538182341832e-05, "loss": 1.1849, "step": 1454 }, { "epoch": 0.2242587601078167, "grad_norm": 0.715045690536499, "learning_rate": 8.809405551335143e-05, "loss": 0.9324, "step": 1456 }, { "epoch": 0.22456680785521757, "grad_norm": 0.8740540742874146, "learning_rate": 8.806269352040527e-05, "loss": 1.2127, "step": 1458 }, { "epoch": 0.2248748556026184, "grad_norm": 0.760273277759552, "learning_rate": 8.803129587395673e-05, "loss": 1.9512, "step": 1460 }, { "epoch": 0.22518290335001925, "grad_norm": 0.6543803811073303, "learning_rate": 8.799986260341615e-05, "loss": 0.9039, "step": 1462 }, { "epoch": 0.2254909510974201, "grad_norm": 0.5931572914123535, "learning_rate": 8.796839373822721e-05, "loss": 1.1781, "step": 1464 }, { "epoch": 0.22579899884482094, "grad_norm": 0.7610328197479248, "learning_rate": 8.793688930786694e-05, "loss": 1.0437, "step": 1466 }, { "epoch": 0.2261070465922218, "grad_norm": 1.0328961610794067, "learning_rate": 8.790534934184569e-05, "loss": 1.1108, "step": 1468 }, { "epoch": 0.22641509433962265, "grad_norm": 0.7776150703430176, "learning_rate": 8.787377386970712e-05, "loss": 1.0043, "step": 1470 }, { "epoch": 0.2267231420870235, "grad_norm": 0.6809033751487732, "learning_rate": 8.784216292102807e-05, "loss": 0.9907, "step": 1472 }, { "epoch": 0.22703118983442433, "grad_norm": 0.7455710768699646, "learning_rate": 8.781051652541872e-05, "loss": 0.9955, "step": 1474 }, { "epoch": 0.22733923758182517, "grad_norm": 0.8081583976745605, "learning_rate": 8.777883471252235e-05, "loss": 1.1366, "step": 1476 }, { "epoch": 0.22764728532922604, "grad_norm": 0.8436789512634277, "learning_rate": 8.774711751201547e-05, "loss": 2.0628, "step": 1478 }, { "epoch": 0.22795533307662688, "grad_norm": 0.7141880393028259, "learning_rate": 8.771536495360776e-05, "loss": 1.0375, "step": 1480 }, { "epoch": 0.22826338082402772, "grad_norm": 0.709006667137146, "learning_rate": 8.768357706704196e-05, "loss": 1.0356, "step": 1482 }, { "epoch": 0.22857142857142856, "grad_norm": 0.6896849870681763, "learning_rate": 8.765175388209395e-05, "loss": 0.8338, "step": 1484 }, { "epoch": 0.2288794763188294, "grad_norm": 0.6931160092353821, "learning_rate": 8.761989542857263e-05, "loss": 1.0104, "step": 1486 }, { "epoch": 0.22918752406623027, "grad_norm": 0.6755717396736145, "learning_rate": 8.758800173631998e-05, "loss": 0.8858, "step": 1488 }, { "epoch": 0.22949557181363112, "grad_norm": 0.8900005221366882, "learning_rate": 8.755607283521097e-05, "loss": 1.0734, "step": 1490 }, { "epoch": 0.22980361956103196, "grad_norm": 0.852319598197937, "learning_rate": 8.752410875515353e-05, "loss": 0.9932, "step": 1492 }, { "epoch": 0.2301116673084328, "grad_norm": 0.7591366171836853, "learning_rate": 8.74921095260886e-05, "loss": 1.2182, "step": 1494 }, { "epoch": 0.23041971505583364, "grad_norm": 0.7753162980079651, "learning_rate": 8.746007517798999e-05, "loss": 0.9069, "step": 1496 }, { "epoch": 0.2307277628032345, "grad_norm": 0.7818796038627625, "learning_rate": 8.742800574086443e-05, "loss": 1.5481, "step": 1498 }, { "epoch": 0.23103581055063535, "grad_norm": 0.5833514332771301, "learning_rate": 8.739590124475148e-05, "loss": 0.9812, "step": 1500 }, { "epoch": 0.23103581055063535, "eval_loss": 2.57389497756958, "eval_runtime": 736.2798, "eval_samples_per_second": 2.716, "eval_steps_per_second": 0.679, "step": 1500 }, { "epoch": 0.2313438582980362, "grad_norm": 0.5849636793136597, "learning_rate": 8.73637617197236e-05, "loss": 0.9969, "step": 1502 }, { "epoch": 0.23165190604543703, "grad_norm": 0.683225154876709, "learning_rate": 8.733158719588603e-05, "loss": 1.9737, "step": 1504 }, { "epoch": 0.2319599537928379, "grad_norm": 0.6674225330352783, "learning_rate": 8.729937770337677e-05, "loss": 1.041, "step": 1506 }, { "epoch": 0.23226800154023874, "grad_norm": 0.9080517292022705, "learning_rate": 8.726713327236666e-05, "loss": 1.1746, "step": 1508 }, { "epoch": 0.23257604928763959, "grad_norm": 0.7534533143043518, "learning_rate": 8.723485393305915e-05, "loss": 0.9784, "step": 1510 }, { "epoch": 0.23288409703504043, "grad_norm": 1.0462878942489624, "learning_rate": 8.720253971569047e-05, "loss": 1.1182, "step": 1512 }, { "epoch": 0.23319214478244127, "grad_norm": 0.6824343800544739, "learning_rate": 8.71701906505295e-05, "loss": 1.0664, "step": 1514 }, { "epoch": 0.23350019252984214, "grad_norm": 0.7235549688339233, "learning_rate": 8.713780676787777e-05, "loss": 0.9804, "step": 1516 }, { "epoch": 0.23380824027724298, "grad_norm": 0.7442678809165955, "learning_rate": 8.710538809806939e-05, "loss": 1.1463, "step": 1518 }, { "epoch": 0.23411628802464382, "grad_norm": 0.47301387786865234, "learning_rate": 8.707293467147109e-05, "loss": 0.9718, "step": 1520 }, { "epoch": 0.23442433577204466, "grad_norm": 0.5563234090805054, "learning_rate": 8.704044651848215e-05, "loss": 1.0898, "step": 1522 }, { "epoch": 0.2347323835194455, "grad_norm": 0.6864919066429138, "learning_rate": 8.700792366953436e-05, "loss": 1.087, "step": 1524 }, { "epoch": 0.23504043126684637, "grad_norm": 0.537579357624054, "learning_rate": 8.697536615509206e-05, "loss": 0.935, "step": 1526 }, { "epoch": 0.2353484790142472, "grad_norm": 0.6670091152191162, "learning_rate": 8.694277400565198e-05, "loss": 1.3778, "step": 1528 }, { "epoch": 0.23565652676164806, "grad_norm": 0.8170010447502136, "learning_rate": 8.691014725174337e-05, "loss": 1.0208, "step": 1530 }, { "epoch": 0.2359645745090489, "grad_norm": 0.677753746509552, "learning_rate": 8.687748592392785e-05, "loss": 0.9473, "step": 1532 }, { "epoch": 0.23627262225644974, "grad_norm": 0.8103045225143433, "learning_rate": 8.684479005279944e-05, "loss": 1.124, "step": 1534 }, { "epoch": 0.2365806700038506, "grad_norm": 0.5518203377723694, "learning_rate": 8.681205966898451e-05, "loss": 0.9217, "step": 1536 }, { "epoch": 0.23688871775125145, "grad_norm": 0.7895511388778687, "learning_rate": 8.677929480314177e-05, "loss": 0.912, "step": 1538 }, { "epoch": 0.2371967654986523, "grad_norm": 0.7995219230651855, "learning_rate": 8.674649548596221e-05, "loss": 1.0235, "step": 1540 }, { "epoch": 0.23750481324605313, "grad_norm": 0.5603652596473694, "learning_rate": 8.671366174816913e-05, "loss": 0.8075, "step": 1542 }, { "epoch": 0.23781286099345397, "grad_norm": 0.5374646782875061, "learning_rate": 8.668079362051802e-05, "loss": 0.9074, "step": 1544 }, { "epoch": 0.23812090874085484, "grad_norm": 0.6922785043716431, "learning_rate": 8.664789113379661e-05, "loss": 1.0318, "step": 1546 }, { "epoch": 0.23842895648825568, "grad_norm": 0.9918892979621887, "learning_rate": 8.661495431882483e-05, "loss": 1.0848, "step": 1548 }, { "epoch": 0.23873700423565652, "grad_norm": 0.7347520589828491, "learning_rate": 8.658198320645473e-05, "loss": 1.0083, "step": 1550 }, { "epoch": 0.23904505198305737, "grad_norm": 0.7262596487998962, "learning_rate": 8.654897782757051e-05, "loss": 1.5047, "step": 1552 }, { "epoch": 0.2393530997304582, "grad_norm": 0.73700350522995, "learning_rate": 8.651593821308847e-05, "loss": 1.1478, "step": 1554 }, { "epoch": 0.23966114747785908, "grad_norm": 0.5896843671798706, "learning_rate": 8.648286439395697e-05, "loss": 0.9431, "step": 1556 }, { "epoch": 0.23996919522525992, "grad_norm": 0.6611576080322266, "learning_rate": 8.644975640115639e-05, "loss": 2.0998, "step": 1558 }, { "epoch": 0.24027724297266076, "grad_norm": 0.7349005937576294, "learning_rate": 8.641661426569916e-05, "loss": 1.1122, "step": 1560 }, { "epoch": 0.2405852907200616, "grad_norm": 0.9728613495826721, "learning_rate": 8.638343801862967e-05, "loss": 1.2116, "step": 1562 }, { "epoch": 0.24089333846746247, "grad_norm": 0.6161354780197144, "learning_rate": 8.635022769102428e-05, "loss": 0.9882, "step": 1564 }, { "epoch": 0.2412013862148633, "grad_norm": 0.89566570520401, "learning_rate": 8.631698331399123e-05, "loss": 1.0898, "step": 1566 }, { "epoch": 0.24150943396226415, "grad_norm": 0.5751842260360718, "learning_rate": 8.628370491867068e-05, "loss": 0.8537, "step": 1568 }, { "epoch": 0.241817481709665, "grad_norm": 0.7686675786972046, "learning_rate": 8.62503925362347e-05, "loss": 0.9874, "step": 1570 }, { "epoch": 0.24212552945706584, "grad_norm": 0.6984812021255493, "learning_rate": 8.621704619788711e-05, "loss": 0.8566, "step": 1572 }, { "epoch": 0.2424335772044667, "grad_norm": 0.7423837780952454, "learning_rate": 8.61836659348636e-05, "loss": 1.1464, "step": 1574 }, { "epoch": 0.24274162495186755, "grad_norm": 0.6439533829689026, "learning_rate": 8.615025177843163e-05, "loss": 1.0957, "step": 1576 }, { "epoch": 0.2430496726992684, "grad_norm": 0.5935307145118713, "learning_rate": 8.611680375989038e-05, "loss": 0.9401, "step": 1578 }, { "epoch": 0.24335772044666923, "grad_norm": 0.7962810397148132, "learning_rate": 8.608332191057076e-05, "loss": 1.0961, "step": 1580 }, { "epoch": 0.24366576819407007, "grad_norm": 0.6294207572937012, "learning_rate": 8.604980626183536e-05, "loss": 1.0722, "step": 1582 }, { "epoch": 0.24397381594147094, "grad_norm": 0.8451805710792542, "learning_rate": 8.60162568450785e-05, "loss": 1.2739, "step": 1584 }, { "epoch": 0.24428186368887178, "grad_norm": 0.6558196544647217, "learning_rate": 8.598267369172603e-05, "loss": 0.9931, "step": 1586 }, { "epoch": 0.24458991143627262, "grad_norm": 0.6440199017524719, "learning_rate": 8.594905683323544e-05, "loss": 0.908, "step": 1588 }, { "epoch": 0.24489795918367346, "grad_norm": 0.8013085722923279, "learning_rate": 8.591540630109583e-05, "loss": 0.9742, "step": 1590 }, { "epoch": 0.2452060069310743, "grad_norm": 0.727178156375885, "learning_rate": 8.588172212682779e-05, "loss": 2.4691, "step": 1592 }, { "epoch": 0.24551405467847517, "grad_norm": 0.9094061851501465, "learning_rate": 8.584800434198346e-05, "loss": 1.0531, "step": 1594 }, { "epoch": 0.24582210242587602, "grad_norm": 0.6858007311820984, "learning_rate": 8.581425297814641e-05, "loss": 1.0649, "step": 1596 }, { "epoch": 0.24613015017327686, "grad_norm": 0.5976001024246216, "learning_rate": 8.578046806693174e-05, "loss": 0.7737, "step": 1598 }, { "epoch": 0.2464381979206777, "grad_norm": 0.6907030344009399, "learning_rate": 8.57466496399859e-05, "loss": 0.8375, "step": 1600 }, { "epoch": 0.24674624566807854, "grad_norm": 0.7412716746330261, "learning_rate": 8.571279772898681e-05, "loss": 1.0501, "step": 1602 }, { "epoch": 0.2470542934154794, "grad_norm": 0.7598106861114502, "learning_rate": 8.567891236564368e-05, "loss": 1.006, "step": 1604 }, { "epoch": 0.24736234116288025, "grad_norm": 0.7194213271141052, "learning_rate": 8.56449935816971e-05, "loss": 0.9603, "step": 1606 }, { "epoch": 0.2476703889102811, "grad_norm": 0.6847167611122131, "learning_rate": 8.561104140891894e-05, "loss": 1.1455, "step": 1608 }, { "epoch": 0.24797843665768193, "grad_norm": 0.6343832612037659, "learning_rate": 8.557705587911238e-05, "loss": 0.8958, "step": 1610 }, { "epoch": 0.24828648440508277, "grad_norm": 0.6502009630203247, "learning_rate": 8.55430370241118e-05, "loss": 0.9663, "step": 1612 }, { "epoch": 0.24859453215248364, "grad_norm": 0.9204524159431458, "learning_rate": 8.550898487578282e-05, "loss": 1.0136, "step": 1614 }, { "epoch": 0.24890257989988449, "grad_norm": 0.6391148567199707, "learning_rate": 8.547489946602227e-05, "loss": 1.0722, "step": 1616 }, { "epoch": 0.24921062764728533, "grad_norm": 0.6553990840911865, "learning_rate": 8.54407808267581e-05, "loss": 0.9506, "step": 1618 }, { "epoch": 0.24951867539468617, "grad_norm": 0.6417995691299438, "learning_rate": 8.54066289899494e-05, "loss": 1.0027, "step": 1620 }, { "epoch": 0.24982672314208704, "grad_norm": 0.7358739972114563, "learning_rate": 8.537244398758636e-05, "loss": 0.9905, "step": 1622 }, { "epoch": 0.2501347708894879, "grad_norm": 0.5125988721847534, "learning_rate": 8.533822585169022e-05, "loss": 1.0122, "step": 1624 }, { "epoch": 0.2504428186368887, "grad_norm": 0.5452527403831482, "learning_rate": 8.530397461431325e-05, "loss": 0.8484, "step": 1626 }, { "epoch": 0.25075086638428956, "grad_norm": 0.8188121914863586, "learning_rate": 8.526969030753879e-05, "loss": 0.8833, "step": 1628 }, { "epoch": 0.2510589141316904, "grad_norm": 0.9002227187156677, "learning_rate": 8.523537296348107e-05, "loss": 1.1187, "step": 1630 }, { "epoch": 0.25136696187909124, "grad_norm": 0.6643141508102417, "learning_rate": 8.520102261428534e-05, "loss": 0.9072, "step": 1632 }, { "epoch": 0.2516750096264921, "grad_norm": 0.9124694466590881, "learning_rate": 8.516663929212769e-05, "loss": 1.2232, "step": 1634 }, { "epoch": 0.2519830573738929, "grad_norm": 0.6632004380226135, "learning_rate": 8.513222302921517e-05, "loss": 1.0628, "step": 1636 }, { "epoch": 0.2522911051212938, "grad_norm": 0.5880236625671387, "learning_rate": 8.509777385778565e-05, "loss": 0.9547, "step": 1638 }, { "epoch": 0.25259915286869467, "grad_norm": 0.9797520041465759, "learning_rate": 8.506329181010781e-05, "loss": 1.1099, "step": 1640 }, { "epoch": 0.2529072006160955, "grad_norm": 0.6555163860321045, "learning_rate": 8.502877691848117e-05, "loss": 0.8764, "step": 1642 }, { "epoch": 0.25321524836349635, "grad_norm": 0.7387675046920776, "learning_rate": 8.499422921523596e-05, "loss": 1.1055, "step": 1644 }, { "epoch": 0.2535232961108972, "grad_norm": 1.114042043685913, "learning_rate": 8.495964873273322e-05, "loss": 1.285, "step": 1646 }, { "epoch": 0.25383134385829803, "grad_norm": 0.9613330364227295, "learning_rate": 8.492503550336462e-05, "loss": 1.951, "step": 1648 }, { "epoch": 0.25413939160569887, "grad_norm": 0.7614074945449829, "learning_rate": 8.489038955955251e-05, "loss": 1.0745, "step": 1650 }, { "epoch": 0.2544474393530997, "grad_norm": 0.7193354964256287, "learning_rate": 8.485571093374995e-05, "loss": 0.9635, "step": 1652 }, { "epoch": 0.25475548710050056, "grad_norm": 0.7330077290534973, "learning_rate": 8.482099965844056e-05, "loss": 1.1106, "step": 1654 }, { "epoch": 0.25506353484790145, "grad_norm": 0.8675753474235535, "learning_rate": 8.478625576613853e-05, "loss": 1.1381, "step": 1656 }, { "epoch": 0.2553715825953023, "grad_norm": 0.7048096060752869, "learning_rate": 8.475147928938866e-05, "loss": 1.2692, "step": 1658 }, { "epoch": 0.25567963034270313, "grad_norm": 0.9633030891418457, "learning_rate": 8.471667026076621e-05, "loss": 1.1013, "step": 1660 }, { "epoch": 0.255987678090104, "grad_norm": 0.5854815244674683, "learning_rate": 8.468182871287695e-05, "loss": 0.8917, "step": 1662 }, { "epoch": 0.2562957258375048, "grad_norm": 0.7640011310577393, "learning_rate": 8.464695467835718e-05, "loss": 0.9966, "step": 1664 }, { "epoch": 0.25660377358490566, "grad_norm": 0.6665043234825134, "learning_rate": 8.461204818987349e-05, "loss": 1.7742, "step": 1666 }, { "epoch": 0.2569118213323065, "grad_norm": 11.362093925476074, "learning_rate": 8.457710928012301e-05, "loss": 1.056, "step": 1668 }, { "epoch": 0.25721986907970734, "grad_norm": 0.998767077922821, "learning_rate": 8.454213798183317e-05, "loss": 1.1667, "step": 1670 }, { "epoch": 0.2575279168271082, "grad_norm": 0.7688530683517456, "learning_rate": 8.450713432776172e-05, "loss": 1.0143, "step": 1672 }, { "epoch": 0.257835964574509, "grad_norm": 0.8430312275886536, "learning_rate": 8.447209835069678e-05, "loss": 0.857, "step": 1674 }, { "epoch": 0.2581440123219099, "grad_norm": 0.6447070837020874, "learning_rate": 8.443703008345669e-05, "loss": 1.1301, "step": 1676 }, { "epoch": 0.25845206006931076, "grad_norm": 0.8711054921150208, "learning_rate": 8.440192955889006e-05, "loss": 1.0479, "step": 1678 }, { "epoch": 0.2587601078167116, "grad_norm": 0.7437043786048889, "learning_rate": 8.436679680987571e-05, "loss": 1.3005, "step": 1680 }, { "epoch": 0.25906815556411245, "grad_norm": 0.9241888523101807, "learning_rate": 8.433163186932268e-05, "loss": 0.9172, "step": 1682 }, { "epoch": 0.2593762033115133, "grad_norm": 2.8876569271087646, "learning_rate": 8.429643477017011e-05, "loss": 0.811, "step": 1684 }, { "epoch": 0.25968425105891413, "grad_norm": 0.9509261846542358, "learning_rate": 8.42612055453873e-05, "loss": 0.9527, "step": 1686 }, { "epoch": 0.25999229880631497, "grad_norm": 0.861937403678894, "learning_rate": 8.42259442279736e-05, "loss": 1.0687, "step": 1688 }, { "epoch": 0.2603003465537158, "grad_norm": 0.7974975109100342, "learning_rate": 8.419065085095849e-05, "loss": 0.8891, "step": 1690 }, { "epoch": 0.26060839430111665, "grad_norm": 0.6367357969284058, "learning_rate": 8.41553254474014e-05, "loss": 1.2569, "step": 1692 }, { "epoch": 0.2609164420485175, "grad_norm": 0.749701738357544, "learning_rate": 8.411996805039184e-05, "loss": 2.3228, "step": 1694 }, { "epoch": 0.2612244897959184, "grad_norm": 0.7712326049804688, "learning_rate": 8.408457869304923e-05, "loss": 1.1975, "step": 1696 }, { "epoch": 0.26153253754331923, "grad_norm": 0.7463422417640686, "learning_rate": 8.404915740852292e-05, "loss": 0.9153, "step": 1698 }, { "epoch": 0.2618405852907201, "grad_norm": 0.6337231397628784, "learning_rate": 8.401370422999224e-05, "loss": 0.8179, "step": 1700 }, { "epoch": 0.2621486330381209, "grad_norm": 0.7018243670463562, "learning_rate": 8.397821919066632e-05, "loss": 2.1112, "step": 1702 }, { "epoch": 0.26245668078552176, "grad_norm": 0.6725606918334961, "learning_rate": 8.394270232378419e-05, "loss": 0.9945, "step": 1704 }, { "epoch": 0.2627647285329226, "grad_norm": 0.8441318869590759, "learning_rate": 8.390715366261461e-05, "loss": 1.2714, "step": 1706 }, { "epoch": 0.26307277628032344, "grad_norm": 1.5008949041366577, "learning_rate": 8.387157324045623e-05, "loss": 1.0337, "step": 1708 }, { "epoch": 0.2633808240277243, "grad_norm": 0.5705763697624207, "learning_rate": 8.383596109063736e-05, "loss": 0.9527, "step": 1710 }, { "epoch": 0.2636888717751251, "grad_norm": 0.7076048851013184, "learning_rate": 8.380031724651608e-05, "loss": 0.9782, "step": 1712 }, { "epoch": 0.263996919522526, "grad_norm": 0.4575868546962738, "learning_rate": 8.376464174148015e-05, "loss": 0.8988, "step": 1714 }, { "epoch": 0.26430496726992686, "grad_norm": 1.2648346424102783, "learning_rate": 8.372893460894699e-05, "loss": 1.0505, "step": 1716 }, { "epoch": 0.2646130150173277, "grad_norm": 0.8442276120185852, "learning_rate": 8.369319588236362e-05, "loss": 1.4117, "step": 1718 }, { "epoch": 0.26492106276472854, "grad_norm": 0.6854913830757141, "learning_rate": 8.365742559520669e-05, "loss": 0.8767, "step": 1720 }, { "epoch": 0.2652291105121294, "grad_norm": 0.7954949140548706, "learning_rate": 8.362162378098234e-05, "loss": 1.1106, "step": 1722 }, { "epoch": 0.2655371582595302, "grad_norm": 0.9195547103881836, "learning_rate": 8.358579047322639e-05, "loss": 0.9618, "step": 1724 }, { "epoch": 0.26584520600693107, "grad_norm": 0.8742767572402954, "learning_rate": 8.3549925705504e-05, "loss": 1.1815, "step": 1726 }, { "epoch": 0.2661532537543319, "grad_norm": 0.7814459204673767, "learning_rate": 8.351402951140988e-05, "loss": 1.2934, "step": 1728 }, { "epoch": 0.26646130150173275, "grad_norm": 0.8386595249176025, "learning_rate": 8.347810192456815e-05, "loss": 0.9509, "step": 1730 }, { "epoch": 0.2667693492491336, "grad_norm": 0.7231822609901428, "learning_rate": 8.344214297863237e-05, "loss": 1.1595, "step": 1732 }, { "epoch": 0.2670773969965345, "grad_norm": 0.6814515590667725, "learning_rate": 8.340615270728545e-05, "loss": 1.0287, "step": 1734 }, { "epoch": 0.26738544474393533, "grad_norm": 0.6391722559928894, "learning_rate": 8.337013114423962e-05, "loss": 1.2981, "step": 1736 }, { "epoch": 0.26769349249133617, "grad_norm": 0.658207893371582, "learning_rate": 8.333407832323647e-05, "loss": 1.2991, "step": 1738 }, { "epoch": 0.268001540238737, "grad_norm": 0.48568904399871826, "learning_rate": 8.329799427804683e-05, "loss": 1.9648, "step": 1740 }, { "epoch": 0.26830958798613785, "grad_norm": 0.5600195527076721, "learning_rate": 8.326187904247083e-05, "loss": 0.8692, "step": 1742 }, { "epoch": 0.2686176357335387, "grad_norm": 0.760915219783783, "learning_rate": 8.322573265033773e-05, "loss": 0.9963, "step": 1744 }, { "epoch": 0.26892568348093954, "grad_norm": 0.6481974720954895, "learning_rate": 8.318955513550604e-05, "loss": 0.79, "step": 1746 }, { "epoch": 0.2692337312283404, "grad_norm": 0.5259435176849365, "learning_rate": 8.315334653186343e-05, "loss": 0.9599, "step": 1748 }, { "epoch": 0.2695417789757412, "grad_norm": 0.967396080493927, "learning_rate": 8.311710687332665e-05, "loss": 1.147, "step": 1750 }, { "epoch": 0.26984982672314206, "grad_norm": 0.7488315105438232, "learning_rate": 8.308083619384154e-05, "loss": 1.0591, "step": 1752 }, { "epoch": 0.27015787447054296, "grad_norm": 1.1080056428909302, "learning_rate": 8.304453452738305e-05, "loss": 1.2517, "step": 1754 }, { "epoch": 0.2704659222179438, "grad_norm": 0.7297698855400085, "learning_rate": 8.300820190795508e-05, "loss": 1.1062, "step": 1756 }, { "epoch": 0.27077396996534464, "grad_norm": 0.7065379619598389, "learning_rate": 8.297183836959062e-05, "loss": 0.9981, "step": 1758 }, { "epoch": 0.2710820177127455, "grad_norm": 0.6783261895179749, "learning_rate": 8.293544394635149e-05, "loss": 0.9019, "step": 1760 }, { "epoch": 0.2713900654601463, "grad_norm": 0.7251535654067993, "learning_rate": 8.289901867232858e-05, "loss": 0.9765, "step": 1762 }, { "epoch": 0.27169811320754716, "grad_norm": 0.8736194968223572, "learning_rate": 8.286256258164158e-05, "loss": 1.1514, "step": 1764 }, { "epoch": 0.272006160954948, "grad_norm": 0.6842576861381531, "learning_rate": 8.28260757084391e-05, "loss": 1.1712, "step": 1766 }, { "epoch": 0.27231420870234885, "grad_norm": 1.0661685466766357, "learning_rate": 8.278955808689856e-05, "loss": 1.2, "step": 1768 }, { "epoch": 0.2726222564497497, "grad_norm": 0.8276075720787048, "learning_rate": 8.275300975122618e-05, "loss": 1.0937, "step": 1770 }, { "epoch": 0.27293030419715053, "grad_norm": 0.8004717826843262, "learning_rate": 8.271643073565695e-05, "loss": 2.487, "step": 1772 }, { "epoch": 0.2732383519445514, "grad_norm": 0.6467103362083435, "learning_rate": 8.267982107445463e-05, "loss": 1.0008, "step": 1774 }, { "epoch": 0.27354639969195227, "grad_norm": 0.7134955525398254, "learning_rate": 8.264318080191162e-05, "loss": 2.025, "step": 1776 }, { "epoch": 0.2738544474393531, "grad_norm": 0.8171098828315735, "learning_rate": 8.260650995234907e-05, "loss": 1.265, "step": 1778 }, { "epoch": 0.27416249518675395, "grad_norm": 0.6574395298957825, "learning_rate": 8.256980856011672e-05, "loss": 0.9239, "step": 1780 }, { "epoch": 0.2744705429341548, "grad_norm": 0.5392150282859802, "learning_rate": 8.253307665959293e-05, "loss": 0.8495, "step": 1782 }, { "epoch": 0.27477859068155563, "grad_norm": 0.7107676267623901, "learning_rate": 8.249631428518465e-05, "loss": 1.0611, "step": 1784 }, { "epoch": 0.2750866384289565, "grad_norm": 0.7121739387512207, "learning_rate": 8.245952147132736e-05, "loss": 1.0221, "step": 1786 }, { "epoch": 0.2753946861763573, "grad_norm": 0.7235463857650757, "learning_rate": 8.242269825248509e-05, "loss": 1.0936, "step": 1788 }, { "epoch": 0.27570273392375816, "grad_norm": 0.8050673604011536, "learning_rate": 8.238584466315027e-05, "loss": 0.974, "step": 1790 }, { "epoch": 0.27601078167115906, "grad_norm": 0.8345509171485901, "learning_rate": 8.234896073784389e-05, "loss": 1.991, "step": 1792 }, { "epoch": 0.2763188294185599, "grad_norm": 0.6287657022476196, "learning_rate": 8.231204651111524e-05, "loss": 1.3361, "step": 1794 }, { "epoch": 0.27662687716596074, "grad_norm": 0.6475083827972412, "learning_rate": 8.227510201754207e-05, "loss": 1.0319, "step": 1796 }, { "epoch": 0.2769349249133616, "grad_norm": 0.4755045771598816, "learning_rate": 8.223812729173045e-05, "loss": 2.33, "step": 1798 }, { "epoch": 0.2772429726607624, "grad_norm": 0.7601513266563416, "learning_rate": 8.22011223683148e-05, "loss": 0.7241, "step": 1800 }, { "epoch": 0.2772429726607624, "eval_loss": 2.5164382457733154, "eval_runtime": 736.9947, "eval_samples_per_second": 2.714, "eval_steps_per_second": 0.678, "step": 1800 }, { "epoch": 0.27755102040816326, "grad_norm": 0.7659019827842712, "learning_rate": 8.216408728195779e-05, "loss": 1.0013, "step": 1802 }, { "epoch": 0.2778590681555641, "grad_norm": 0.7980189323425293, "learning_rate": 8.212702206735036e-05, "loss": 1.0552, "step": 1804 }, { "epoch": 0.27816711590296495, "grad_norm": 0.7149158120155334, "learning_rate": 8.208992675921166e-05, "loss": 0.9471, "step": 1806 }, { "epoch": 0.2784751636503658, "grad_norm": 0.7278897166252136, "learning_rate": 8.205280139228906e-05, "loss": 1.0755, "step": 1808 }, { "epoch": 0.27878321139776663, "grad_norm": 0.79888516664505, "learning_rate": 8.201564600135803e-05, "loss": 1.0658, "step": 1810 }, { "epoch": 0.2790912591451675, "grad_norm": 0.929144561290741, "learning_rate": 8.197846062122223e-05, "loss": 1.0584, "step": 1812 }, { "epoch": 0.27939930689256837, "grad_norm": 0.8886906504631042, "learning_rate": 8.194124528671337e-05, "loss": 1.0032, "step": 1814 }, { "epoch": 0.2797073546399692, "grad_norm": 0.8825163245201111, "learning_rate": 8.190400003269121e-05, "loss": 0.7641, "step": 1816 }, { "epoch": 0.28001540238737005, "grad_norm": 0.9231512546539307, "learning_rate": 8.186672489404359e-05, "loss": 1.1552, "step": 1818 }, { "epoch": 0.2803234501347709, "grad_norm": 0.7428817749023438, "learning_rate": 8.182941990568626e-05, "loss": 1.3535, "step": 1820 }, { "epoch": 0.28063149788217173, "grad_norm": 0.6336200833320618, "learning_rate": 8.179208510256302e-05, "loss": 0.9099, "step": 1822 }, { "epoch": 0.2809395456295726, "grad_norm": 0.8053500056266785, "learning_rate": 8.175472051964552e-05, "loss": 1.0487, "step": 1824 }, { "epoch": 0.2812475933769734, "grad_norm": 1.5604395866394043, "learning_rate": 8.171732619193336e-05, "loss": 0.9007, "step": 1826 }, { "epoch": 0.28155564112437426, "grad_norm": 1.509132981300354, "learning_rate": 8.167990215445395e-05, "loss": 1.5257, "step": 1828 }, { "epoch": 0.2818636888717751, "grad_norm": 0.6117202639579773, "learning_rate": 8.164244844226261e-05, "loss": 1.0918, "step": 1830 }, { "epoch": 0.282171736619176, "grad_norm": 0.6877596378326416, "learning_rate": 8.160496509044238e-05, "loss": 1.0748, "step": 1832 }, { "epoch": 0.28247978436657684, "grad_norm": 0.7765544652938843, "learning_rate": 8.156745213410407e-05, "loss": 1.1675, "step": 1834 }, { "epoch": 0.2827878321139777, "grad_norm": 0.6919207572937012, "learning_rate": 8.152990960838628e-05, "loss": 0.981, "step": 1836 }, { "epoch": 0.2830958798613785, "grad_norm": 0.7913166284561157, "learning_rate": 8.149233754845525e-05, "loss": 0.9699, "step": 1838 }, { "epoch": 0.28340392760877936, "grad_norm": 0.7056479454040527, "learning_rate": 8.145473598950489e-05, "loss": 1.3239, "step": 1840 }, { "epoch": 0.2837119753561802, "grad_norm": 0.8406484723091125, "learning_rate": 8.141710496675679e-05, "loss": 1.8384, "step": 1842 }, { "epoch": 0.28402002310358104, "grad_norm": 0.518136739730835, "learning_rate": 8.137944451546007e-05, "loss": 0.752, "step": 1844 }, { "epoch": 0.2843280708509819, "grad_norm": 0.8231174945831299, "learning_rate": 8.134175467089146e-05, "loss": 1.0535, "step": 1846 }, { "epoch": 0.2846361185983827, "grad_norm": 0.5989777445793152, "learning_rate": 8.130403546835523e-05, "loss": 0.7501, "step": 1848 }, { "epoch": 0.2849441663457836, "grad_norm": 0.7494628429412842, "learning_rate": 8.12662869431831e-05, "loss": 0.8713, "step": 1850 }, { "epoch": 0.28525221409318446, "grad_norm": 0.6568158864974976, "learning_rate": 8.122850913073433e-05, "loss": 0.925, "step": 1852 }, { "epoch": 0.2855602618405853, "grad_norm": 0.7191964983940125, "learning_rate": 8.119070206639554e-05, "loss": 1.434, "step": 1854 }, { "epoch": 0.28586830958798615, "grad_norm": 0.6533187627792358, "learning_rate": 8.115286578558081e-05, "loss": 0.9723, "step": 1856 }, { "epoch": 0.286176357335387, "grad_norm": 0.6065565943717957, "learning_rate": 8.111500032373153e-05, "loss": 0.9806, "step": 1858 }, { "epoch": 0.28648440508278783, "grad_norm": 0.5518906116485596, "learning_rate": 8.107710571631648e-05, "loss": 1.0744, "step": 1860 }, { "epoch": 0.28679245283018867, "grad_norm": 0.9548470973968506, "learning_rate": 8.10391819988317e-05, "loss": 1.1006, "step": 1862 }, { "epoch": 0.2871005005775895, "grad_norm": 0.636380136013031, "learning_rate": 8.100122920680052e-05, "loss": 1.3742, "step": 1864 }, { "epoch": 0.28740854832499035, "grad_norm": 0.8006538152694702, "learning_rate": 8.09632473757735e-05, "loss": 0.7724, "step": 1866 }, { "epoch": 0.2877165960723912, "grad_norm": 0.7312654852867126, "learning_rate": 8.09252365413284e-05, "loss": 0.9995, "step": 1868 }, { "epoch": 0.2880246438197921, "grad_norm": 0.8602042198181152, "learning_rate": 8.088719673907013e-05, "loss": 1.32, "step": 1870 }, { "epoch": 0.28833269156719293, "grad_norm": 0.8657503128051758, "learning_rate": 8.084912800463076e-05, "loss": 1.339, "step": 1872 }, { "epoch": 0.2886407393145938, "grad_norm": 0.6329157948493958, "learning_rate": 8.081103037366944e-05, "loss": 1.0352, "step": 1874 }, { "epoch": 0.2889487870619946, "grad_norm": 0.6768485903739929, "learning_rate": 8.077290388187243e-05, "loss": 1.0781, "step": 1876 }, { "epoch": 0.28925683480939546, "grad_norm": 0.687986433506012, "learning_rate": 8.073474856495296e-05, "loss": 1.0905, "step": 1878 }, { "epoch": 0.2895648825567963, "grad_norm": 0.7147440910339355, "learning_rate": 8.06965644586513e-05, "loss": 0.9371, "step": 1880 }, { "epoch": 0.28987293030419714, "grad_norm": 0.7411069273948669, "learning_rate": 8.06583515987347e-05, "loss": 0.9088, "step": 1882 }, { "epoch": 0.290180978051598, "grad_norm": 0.5603320598602295, "learning_rate": 8.06201100209973e-05, "loss": 0.9143, "step": 1884 }, { "epoch": 0.2904890257989988, "grad_norm": 0.75848388671875, "learning_rate": 8.058183976126018e-05, "loss": 1.1381, "step": 1886 }, { "epoch": 0.29079707354639966, "grad_norm": 0.753108561038971, "learning_rate": 8.054354085537126e-05, "loss": 1.8448, "step": 1888 }, { "epoch": 0.29110512129380056, "grad_norm": 0.6318671107292175, "learning_rate": 8.05052133392053e-05, "loss": 1.0209, "step": 1890 }, { "epoch": 0.2914131690412014, "grad_norm": 0.7125961780548096, "learning_rate": 8.046685724866387e-05, "loss": 0.8734, "step": 1892 }, { "epoch": 0.29172121678860224, "grad_norm": 0.7472694516181946, "learning_rate": 8.042847261967531e-05, "loss": 1.0617, "step": 1894 }, { "epoch": 0.2920292645360031, "grad_norm": 0.6471079587936401, "learning_rate": 8.039005948819467e-05, "loss": 0.8885, "step": 1896 }, { "epoch": 0.2923373122834039, "grad_norm": 0.8556898832321167, "learning_rate": 8.03516178902037e-05, "loss": 0.9897, "step": 1898 }, { "epoch": 0.29264536003080477, "grad_norm": 0.6888934373855591, "learning_rate": 8.031314786171083e-05, "loss": 0.9881, "step": 1900 }, { "epoch": 0.2929534077782056, "grad_norm": 1.0639433860778809, "learning_rate": 8.027464943875113e-05, "loss": 1.2831, "step": 1902 }, { "epoch": 0.29326145552560645, "grad_norm": 0.7208219766616821, "learning_rate": 8.023612265738624e-05, "loss": 1.0755, "step": 1904 }, { "epoch": 0.2935695032730073, "grad_norm": 0.6373528242111206, "learning_rate": 8.019756755370437e-05, "loss": 1.0752, "step": 1906 }, { "epoch": 0.2938775510204082, "grad_norm": 0.9459060430526733, "learning_rate": 8.015898416382026e-05, "loss": 1.0827, "step": 1908 }, { "epoch": 0.29418559876780903, "grad_norm": 0.8489103317260742, "learning_rate": 8.012037252387518e-05, "loss": 0.888, "step": 1910 }, { "epoch": 0.2944936465152099, "grad_norm": 0.685509979724884, "learning_rate": 8.00817326700368e-05, "loss": 0.952, "step": 1912 }, { "epoch": 0.2948016942626107, "grad_norm": 0.8169518113136292, "learning_rate": 8.004306463849927e-05, "loss": 0.8875, "step": 1914 }, { "epoch": 0.29510974201001156, "grad_norm": 0.604669451713562, "learning_rate": 8.000436846548314e-05, "loss": 0.9481, "step": 1916 }, { "epoch": 0.2954177897574124, "grad_norm": 0.7678325772285461, "learning_rate": 7.996564418723522e-05, "loss": 1.3087, "step": 1918 }, { "epoch": 0.29572583750481324, "grad_norm": 0.7700533270835876, "learning_rate": 7.99268918400288e-05, "loss": 0.9033, "step": 1920 }, { "epoch": 0.2960338852522141, "grad_norm": 0.5778908729553223, "learning_rate": 7.988811146016336e-05, "loss": 0.9179, "step": 1922 }, { "epoch": 0.2963419329996149, "grad_norm": 0.5745398998260498, "learning_rate": 7.984930308396464e-05, "loss": 0.8747, "step": 1924 }, { "epoch": 0.29664998074701576, "grad_norm": 0.9561242461204529, "learning_rate": 7.981046674778462e-05, "loss": 1.2659, "step": 1926 }, { "epoch": 0.29695802849441666, "grad_norm": 0.8658321499824524, "learning_rate": 7.977160248800152e-05, "loss": 0.9769, "step": 1928 }, { "epoch": 0.2972660762418175, "grad_norm": 0.8289058804512024, "learning_rate": 7.973271034101966e-05, "loss": 1.0655, "step": 1930 }, { "epoch": 0.29757412398921834, "grad_norm": 0.802101194858551, "learning_rate": 7.969379034326949e-05, "loss": 1.1368, "step": 1932 }, { "epoch": 0.2978821717366192, "grad_norm": 0.8443379402160645, "learning_rate": 7.965484253120754e-05, "loss": 1.0558, "step": 1934 }, { "epoch": 0.29819021948402, "grad_norm": 0.7214971780776978, "learning_rate": 7.961586694131643e-05, "loss": 1.4491, "step": 1936 }, { "epoch": 0.29849826723142087, "grad_norm": 0.7081072926521301, "learning_rate": 7.957686361010475e-05, "loss": 0.9587, "step": 1938 }, { "epoch": 0.2988063149788217, "grad_norm": 0.6165660619735718, "learning_rate": 7.953783257410713e-05, "loss": 0.9157, "step": 1940 }, { "epoch": 0.29911436272622255, "grad_norm": 0.7237483263015747, "learning_rate": 7.94987738698841e-05, "loss": 1.1032, "step": 1942 }, { "epoch": 0.2994224104736234, "grad_norm": 0.90876704454422, "learning_rate": 7.945968753402216e-05, "loss": 1.1303, "step": 1944 }, { "epoch": 0.29973045822102423, "grad_norm": 0.6631532907485962, "learning_rate": 7.942057360313361e-05, "loss": 1.3403, "step": 1946 }, { "epoch": 0.30003850596842513, "grad_norm": 0.691525936126709, "learning_rate": 7.938143211385672e-05, "loss": 1.2834, "step": 1948 }, { "epoch": 0.30034655371582597, "grad_norm": 0.8195240497589111, "learning_rate": 7.934226310285543e-05, "loss": 1.1512, "step": 1950 }, { "epoch": 0.3006546014632268, "grad_norm": 0.6079697608947754, "learning_rate": 7.930306660681961e-05, "loss": 0.9393, "step": 1952 }, { "epoch": 0.30096264921062765, "grad_norm": 0.6469001173973083, "learning_rate": 7.926384266246477e-05, "loss": 1.0557, "step": 1954 }, { "epoch": 0.3012706969580285, "grad_norm": 0.5937241315841675, "learning_rate": 7.922459130653213e-05, "loss": 1.1243, "step": 1956 }, { "epoch": 0.30157874470542934, "grad_norm": 0.7131739258766174, "learning_rate": 7.918531257578865e-05, "loss": 0.8834, "step": 1958 }, { "epoch": 0.3018867924528302, "grad_norm": 0.8835099339485168, "learning_rate": 7.914600650702691e-05, "loss": 2.3994, "step": 1960 }, { "epoch": 0.302194840200231, "grad_norm": 0.6994355916976929, "learning_rate": 7.910667313706506e-05, "loss": 0.9667, "step": 1962 }, { "epoch": 0.30250288794763186, "grad_norm": 0.6864457130432129, "learning_rate": 7.906731250274687e-05, "loss": 0.9741, "step": 1964 }, { "epoch": 0.30281093569503276, "grad_norm": 0.5470225214958191, "learning_rate": 7.902792464094163e-05, "loss": 0.9114, "step": 1966 }, { "epoch": 0.3031189834424336, "grad_norm": 0.7735422253608704, "learning_rate": 7.898850958854412e-05, "loss": 1.099, "step": 1968 }, { "epoch": 0.30342703118983444, "grad_norm": 0.7568907737731934, "learning_rate": 7.89490673824746e-05, "loss": 0.9845, "step": 1970 }, { "epoch": 0.3037350789372353, "grad_norm": 0.6680666208267212, "learning_rate": 7.890959805967879e-05, "loss": 1.0772, "step": 1972 }, { "epoch": 0.3040431266846361, "grad_norm": 0.6780060529708862, "learning_rate": 7.887010165712778e-05, "loss": 0.9617, "step": 1974 }, { "epoch": 0.30435117443203696, "grad_norm": 0.5781919360160828, "learning_rate": 7.883057821181803e-05, "loss": 0.9055, "step": 1976 }, { "epoch": 0.3046592221794378, "grad_norm": 0.8196890354156494, "learning_rate": 7.879102776077131e-05, "loss": 1.0094, "step": 1978 }, { "epoch": 0.30496726992683865, "grad_norm": 0.9342995285987854, "learning_rate": 7.875145034103479e-05, "loss": 1.1311, "step": 1980 }, { "epoch": 0.3052753176742395, "grad_norm": 0.8569959998130798, "learning_rate": 7.871184598968073e-05, "loss": 1.0934, "step": 1982 }, { "epoch": 0.30558336542164033, "grad_norm": 0.6865665912628174, "learning_rate": 7.867221474380677e-05, "loss": 0.9035, "step": 1984 }, { "epoch": 0.3058914131690412, "grad_norm": 1.1311630010604858, "learning_rate": 7.863255664053566e-05, "loss": 1.0764, "step": 1986 }, { "epoch": 0.30619946091644207, "grad_norm": 0.7756600975990295, "learning_rate": 7.859287171701534e-05, "loss": 1.0406, "step": 1988 }, { "epoch": 0.3065075086638429, "grad_norm": 0.6949130296707153, "learning_rate": 7.855316001041886e-05, "loss": 0.8968, "step": 1990 }, { "epoch": 0.30681555641124375, "grad_norm": 0.6985813975334167, "learning_rate": 7.851342155794434e-05, "loss": 1.0365, "step": 1992 }, { "epoch": 0.3071236041586446, "grad_norm": 0.4934740364551544, "learning_rate": 7.847365639681501e-05, "loss": 0.9828, "step": 1994 }, { "epoch": 0.30743165190604543, "grad_norm": 0.7085347771644592, "learning_rate": 7.843386456427905e-05, "loss": 0.9736, "step": 1996 }, { "epoch": 0.3077396996534463, "grad_norm": 0.6113731265068054, "learning_rate": 7.839404609760969e-05, "loss": 1.1267, "step": 1998 }, { "epoch": 0.3080477474008471, "grad_norm": 0.720635712146759, "learning_rate": 7.835420103410504e-05, "loss": 0.9767, "step": 2000 }, { "epoch": 0.30835579514824796, "grad_norm": 1.2781391143798828, "learning_rate": 7.831432941108818e-05, "loss": 0.9985, "step": 2002 }, { "epoch": 0.3086638428956488, "grad_norm": 0.6825604438781738, "learning_rate": 7.827443126590701e-05, "loss": 1.0275, "step": 2004 }, { "epoch": 0.3089718906430497, "grad_norm": 0.9183567762374878, "learning_rate": 7.823450663593435e-05, "loss": 1.1826, "step": 2006 }, { "epoch": 0.30927993839045054, "grad_norm": 0.8156041502952576, "learning_rate": 7.819455555856777e-05, "loss": 0.824, "step": 2008 }, { "epoch": 0.3095879861378514, "grad_norm": 0.7667585611343384, "learning_rate": 7.815457807122962e-05, "loss": 0.9641, "step": 2010 }, { "epoch": 0.3098960338852522, "grad_norm": 0.7487335801124573, "learning_rate": 7.8114574211367e-05, "loss": 1.1025, "step": 2012 }, { "epoch": 0.31020408163265306, "grad_norm": 0.7451789975166321, "learning_rate": 7.807454401645174e-05, "loss": 0.8889, "step": 2014 }, { "epoch": 0.3105121293800539, "grad_norm": 0.7183794975280762, "learning_rate": 7.80344875239803e-05, "loss": 0.9154, "step": 2016 }, { "epoch": 0.31082017712745474, "grad_norm": 0.5591393709182739, "learning_rate": 7.799440477147376e-05, "loss": 1.2368, "step": 2018 }, { "epoch": 0.3111282248748556, "grad_norm": 0.7356299757957458, "learning_rate": 7.795429579647781e-05, "loss": 1.0495, "step": 2020 }, { "epoch": 0.3114362726222564, "grad_norm": 0.8593969345092773, "learning_rate": 7.791416063656277e-05, "loss": 1.0617, "step": 2022 }, { "epoch": 0.3117443203696573, "grad_norm": 0.8033467531204224, "learning_rate": 7.787399932932337e-05, "loss": 1.9187, "step": 2024 }, { "epoch": 0.31205236811705817, "grad_norm": 0.7294034361839294, "learning_rate": 7.783381191237895e-05, "loss": 0.9414, "step": 2026 }, { "epoch": 0.312360415864459, "grad_norm": 0.7499777674674988, "learning_rate": 7.779359842337321e-05, "loss": 0.9633, "step": 2028 }, { "epoch": 0.31266846361185985, "grad_norm": 0.9022547006607056, "learning_rate": 7.775335889997435e-05, "loss": 1.0092, "step": 2030 }, { "epoch": 0.3129765113592607, "grad_norm": 0.5967230200767517, "learning_rate": 7.771309337987487e-05, "loss": 1.0805, "step": 2032 }, { "epoch": 0.31328455910666153, "grad_norm": 0.5884339809417725, "learning_rate": 7.76728019007917e-05, "loss": 1.0213, "step": 2034 }, { "epoch": 0.3135926068540624, "grad_norm": 0.7071999311447144, "learning_rate": 7.763248450046605e-05, "loss": 1.0096, "step": 2036 }, { "epoch": 0.3139006546014632, "grad_norm": 0.7165465950965881, "learning_rate": 7.759214121666343e-05, "loss": 2.1081, "step": 2038 }, { "epoch": 0.31420870234886406, "grad_norm": 0.6906457543373108, "learning_rate": 7.755177208717356e-05, "loss": 1.1053, "step": 2040 }, { "epoch": 0.3145167500962649, "grad_norm": 0.7363939881324768, "learning_rate": 7.75113771498104e-05, "loss": 0.8927, "step": 2042 }, { "epoch": 0.3148247978436658, "grad_norm": 0.6364469528198242, "learning_rate": 7.747095644241209e-05, "loss": 0.823, "step": 2044 }, { "epoch": 0.31513284559106663, "grad_norm": 0.6343603134155273, "learning_rate": 7.743051000284087e-05, "loss": 2.1096, "step": 2046 }, { "epoch": 0.3154408933384675, "grad_norm": 0.7235949039459229, "learning_rate": 7.739003786898314e-05, "loss": 1.0984, "step": 2048 }, { "epoch": 0.3157489410858683, "grad_norm": 0.7274911403656006, "learning_rate": 7.734954007874931e-05, "loss": 1.1339, "step": 2050 }, { "epoch": 0.31605698883326916, "grad_norm": 0.7011663913726807, "learning_rate": 7.730901667007384e-05, "loss": 1.0374, "step": 2052 }, { "epoch": 0.31636503658067, "grad_norm": 0.6921355724334717, "learning_rate": 7.726846768091523e-05, "loss": 1.2023, "step": 2054 }, { "epoch": 0.31667308432807084, "grad_norm": 1.0013788938522339, "learning_rate": 7.722789314925589e-05, "loss": 1.2035, "step": 2056 }, { "epoch": 0.3169811320754717, "grad_norm": 0.7735009789466858, "learning_rate": 7.718729311310215e-05, "loss": 0.948, "step": 2058 }, { "epoch": 0.3172891798228725, "grad_norm": 0.9819526076316833, "learning_rate": 7.71466676104843e-05, "loss": 1.1703, "step": 2060 }, { "epoch": 0.31759722757027337, "grad_norm": 0.7479535937309265, "learning_rate": 7.71060166794564e-05, "loss": 0.9304, "step": 2062 }, { "epoch": 0.31790527531767426, "grad_norm": 0.6823731660842896, "learning_rate": 7.70653403580964e-05, "loss": 0.9511, "step": 2064 }, { "epoch": 0.3182133230650751, "grad_norm": 0.8437545895576477, "learning_rate": 7.702463868450596e-05, "loss": 0.9972, "step": 2066 }, { "epoch": 0.31852137081247595, "grad_norm": 1.0784193277359009, "learning_rate": 7.698391169681055e-05, "loss": 0.9541, "step": 2068 }, { "epoch": 0.3188294185598768, "grad_norm": 0.7519109845161438, "learning_rate": 7.694315943315933e-05, "loss": 1.0247, "step": 2070 }, { "epoch": 0.31913746630727763, "grad_norm": 0.9027669429779053, "learning_rate": 7.690238193172511e-05, "loss": 0.9814, "step": 2072 }, { "epoch": 0.31944551405467847, "grad_norm": 0.704380989074707, "learning_rate": 7.686157923070442e-05, "loss": 1.0821, "step": 2074 }, { "epoch": 0.3197535618020793, "grad_norm": 0.7209782004356384, "learning_rate": 7.68207513683173e-05, "loss": 1.1074, "step": 2076 }, { "epoch": 0.32006160954948015, "grad_norm": 0.9780352115631104, "learning_rate": 7.677989838280739e-05, "loss": 0.995, "step": 2078 }, { "epoch": 0.320369657296881, "grad_norm": 0.6577726602554321, "learning_rate": 7.673902031244189e-05, "loss": 0.8098, "step": 2080 }, { "epoch": 0.3206777050442819, "grad_norm": 0.8020167350769043, "learning_rate": 7.669811719551149e-05, "loss": 1.0673, "step": 2082 }, { "epoch": 0.32098575279168273, "grad_norm": 0.5862478613853455, "learning_rate": 7.665718907033031e-05, "loss": 0.9923, "step": 2084 }, { "epoch": 0.3212938005390836, "grad_norm": 0.6254997849464417, "learning_rate": 7.661623597523592e-05, "loss": 1.0672, "step": 2086 }, { "epoch": 0.3216018482864844, "grad_norm": 0.6177172064781189, "learning_rate": 7.657525794858926e-05, "loss": 1.9986, "step": 2088 }, { "epoch": 0.32190989603388526, "grad_norm": 0.7840576171875, "learning_rate": 7.653425502877469e-05, "loss": 1.025, "step": 2090 }, { "epoch": 0.3222179437812861, "grad_norm": 0.7192623019218445, "learning_rate": 7.649322725419977e-05, "loss": 0.9782, "step": 2092 }, { "epoch": 0.32252599152868694, "grad_norm": 0.7550196051597595, "learning_rate": 7.645217466329546e-05, "loss": 1.0586, "step": 2094 }, { "epoch": 0.3228340392760878, "grad_norm": 0.8093752264976501, "learning_rate": 7.641109729451588e-05, "loss": 1.2799, "step": 2096 }, { "epoch": 0.3231420870234886, "grad_norm": 0.6246629953384399, "learning_rate": 7.636999518633841e-05, "loss": 1.1575, "step": 2098 }, { "epoch": 0.32345013477088946, "grad_norm": 0.9272355437278748, "learning_rate": 7.632886837726359e-05, "loss": 1.0441, "step": 2100 }, { "epoch": 0.32345013477088946, "eval_loss": 2.473278045654297, "eval_runtime": 736.1878, "eval_samples_per_second": 2.717, "eval_steps_per_second": 0.679, "step": 2100 }, { "epoch": 0.32375818251829036, "grad_norm": 0.7700662612915039, "learning_rate": 7.628771690581508e-05, "loss": 1.0005, "step": 2102 }, { "epoch": 0.3240662302656912, "grad_norm": 0.9488754272460938, "learning_rate": 7.624654081053966e-05, "loss": 1.0601, "step": 2104 }, { "epoch": 0.32437427801309204, "grad_norm": 0.566880464553833, "learning_rate": 7.620534013000716e-05, "loss": 0.8569, "step": 2106 }, { "epoch": 0.3246823257604929, "grad_norm": 0.8669172525405884, "learning_rate": 7.616411490281048e-05, "loss": 1.013, "step": 2108 }, { "epoch": 0.3249903735078937, "grad_norm": 0.9163511991500854, "learning_rate": 7.612286516756544e-05, "loss": 0.9347, "step": 2110 }, { "epoch": 0.32529842125529457, "grad_norm": 0.5301980376243591, "learning_rate": 7.60815909629109e-05, "loss": 0.8109, "step": 2112 }, { "epoch": 0.3256064690026954, "grad_norm": 0.6251680254936218, "learning_rate": 7.604029232750858e-05, "loss": 1.0528, "step": 2114 }, { "epoch": 0.32591451675009625, "grad_norm": 0.7380292415618896, "learning_rate": 7.599896930004309e-05, "loss": 1.1757, "step": 2116 }, { "epoch": 0.3262225644974971, "grad_norm": 0.6980268955230713, "learning_rate": 7.595762191922192e-05, "loss": 1.5307, "step": 2118 }, { "epoch": 0.32653061224489793, "grad_norm": 0.580763041973114, "learning_rate": 7.591625022377537e-05, "loss": 0.9236, "step": 2120 }, { "epoch": 0.32683865999229883, "grad_norm": 0.6827017664909363, "learning_rate": 7.587485425245648e-05, "loss": 1.0231, "step": 2122 }, { "epoch": 0.32714670773969967, "grad_norm": 0.7438657283782959, "learning_rate": 7.583343404404104e-05, "loss": 1.1546, "step": 2124 }, { "epoch": 0.3274547554871005, "grad_norm": 0.6554436683654785, "learning_rate": 7.579198963732756e-05, "loss": 0.9518, "step": 2126 }, { "epoch": 0.32776280323450135, "grad_norm": 0.6709740161895752, "learning_rate": 7.575052107113722e-05, "loss": 1.3103, "step": 2128 }, { "epoch": 0.3280708509819022, "grad_norm": 0.6320585012435913, "learning_rate": 7.570902838431382e-05, "loss": 1.0456, "step": 2130 }, { "epoch": 0.32837889872930304, "grad_norm": 0.6538925170898438, "learning_rate": 7.566751161572372e-05, "loss": 0.9174, "step": 2132 }, { "epoch": 0.3286869464767039, "grad_norm": 0.6235430836677551, "learning_rate": 7.562597080425592e-05, "loss": 0.9084, "step": 2134 }, { "epoch": 0.3289949942241047, "grad_norm": 0.6424623131752014, "learning_rate": 7.558440598882185e-05, "loss": 0.9133, "step": 2136 }, { "epoch": 0.32930304197150556, "grad_norm": 0.8305116295814514, "learning_rate": 7.554281720835549e-05, "loss": 1.0305, "step": 2138 }, { "epoch": 0.32961108971890646, "grad_norm": 0.7598525285720825, "learning_rate": 7.550120450181324e-05, "loss": 0.8613, "step": 2140 }, { "epoch": 0.3299191374663073, "grad_norm": 0.9990943670272827, "learning_rate": 7.545956790817391e-05, "loss": 0.7471, "step": 2142 }, { "epoch": 0.33022718521370814, "grad_norm": 0.5743617415428162, "learning_rate": 7.54179074664387e-05, "loss": 0.8861, "step": 2144 }, { "epoch": 0.330535232961109, "grad_norm": 0.6631594300270081, "learning_rate": 7.537622321563114e-05, "loss": 1.039, "step": 2146 }, { "epoch": 0.3308432807085098, "grad_norm": 0.7509470582008362, "learning_rate": 7.533451519479704e-05, "loss": 1.086, "step": 2148 }, { "epoch": 0.33115132845591067, "grad_norm": 0.7622755765914917, "learning_rate": 7.529278344300452e-05, "loss": 1.1007, "step": 2150 }, { "epoch": 0.3314593762033115, "grad_norm": 0.5702837109565735, "learning_rate": 7.525102799934392e-05, "loss": 0.9004, "step": 2152 }, { "epoch": 0.33176742395071235, "grad_norm": 0.6059790849685669, "learning_rate": 7.52092489029277e-05, "loss": 0.9381, "step": 2154 }, { "epoch": 0.3320754716981132, "grad_norm": 0.9663304686546326, "learning_rate": 7.51674461928906e-05, "loss": 1.4107, "step": 2156 }, { "epoch": 0.33238351944551403, "grad_norm": 0.641952633857727, "learning_rate": 7.512561990838937e-05, "loss": 1.3133, "step": 2158 }, { "epoch": 0.3326915671929149, "grad_norm": 0.7226306200027466, "learning_rate": 7.508377008860294e-05, "loss": 1.0616, "step": 2160 }, { "epoch": 0.33299961494031577, "grad_norm": 0.6401228904724121, "learning_rate": 7.504189677273217e-05, "loss": 1.0341, "step": 2162 }, { "epoch": 0.3333076626877166, "grad_norm": 0.9536916017532349, "learning_rate": 7.500000000000001e-05, "loss": 0.8679, "step": 2164 }, { "epoch": 0.33361571043511745, "grad_norm": 0.9569481015205383, "learning_rate": 7.495807980965137e-05, "loss": 0.9996, "step": 2166 }, { "epoch": 0.3339237581825183, "grad_norm": 0.6950173377990723, "learning_rate": 7.491613624095307e-05, "loss": 1.0641, "step": 2168 }, { "epoch": 0.33423180592991913, "grad_norm": 0.730665385723114, "learning_rate": 7.487416933319389e-05, "loss": 0.9797, "step": 2170 }, { "epoch": 0.33453985367732, "grad_norm": 0.7115023732185364, "learning_rate": 7.483217912568437e-05, "loss": 1.022, "step": 2172 }, { "epoch": 0.3348479014247208, "grad_norm": 0.6663245558738708, "learning_rate": 7.479016565775697e-05, "loss": 1.0022, "step": 2174 }, { "epoch": 0.33515594917212166, "grad_norm": 0.7663930654525757, "learning_rate": 7.474812896876588e-05, "loss": 1.0008, "step": 2176 }, { "epoch": 0.3354639969195225, "grad_norm": 0.5541350245475769, "learning_rate": 7.47060690980871e-05, "loss": 1.1907, "step": 2178 }, { "epoch": 0.3357720446669234, "grad_norm": 0.654722273349762, "learning_rate": 7.466398608511826e-05, "loss": 0.9735, "step": 2180 }, { "epoch": 0.33608009241432424, "grad_norm": 0.6130832433700562, "learning_rate": 7.462187996927873e-05, "loss": 1.0976, "step": 2182 }, { "epoch": 0.3363881401617251, "grad_norm": 0.822195827960968, "learning_rate": 7.457975079000954e-05, "loss": 0.9524, "step": 2184 }, { "epoch": 0.3366961879091259, "grad_norm": 0.6369136571884155, "learning_rate": 7.453759858677324e-05, "loss": 1.3212, "step": 2186 }, { "epoch": 0.33700423565652676, "grad_norm": 0.7983855605125427, "learning_rate": 7.449542339905401e-05, "loss": 0.8971, "step": 2188 }, { "epoch": 0.3373122834039276, "grad_norm": 0.6447656154632568, "learning_rate": 7.445322526635756e-05, "loss": 0.9881, "step": 2190 }, { "epoch": 0.33762033115132845, "grad_norm": 0.7861150503158569, "learning_rate": 7.441100422821107e-05, "loss": 1.8758, "step": 2192 }, { "epoch": 0.3379283788987293, "grad_norm": 0.6880084872245789, "learning_rate": 7.436876032416317e-05, "loss": 0.799, "step": 2194 }, { "epoch": 0.33823642664613013, "grad_norm": 0.742363691329956, "learning_rate": 7.432649359378393e-05, "loss": 0.8949, "step": 2196 }, { "epoch": 0.33854447439353097, "grad_norm": 0.7017818689346313, "learning_rate": 7.42842040766648e-05, "loss": 0.9626, "step": 2198 }, { "epoch": 0.33885252214093187, "grad_norm": 0.6897172927856445, "learning_rate": 7.424189181241856e-05, "loss": 1.1507, "step": 2200 }, { "epoch": 0.3391605698883327, "grad_norm": 0.6582860946655273, "learning_rate": 7.419955684067929e-05, "loss": 0.822, "step": 2202 }, { "epoch": 0.33946861763573355, "grad_norm": 0.7101274728775024, "learning_rate": 7.41571992011024e-05, "loss": 1.0288, "step": 2204 }, { "epoch": 0.3397766653831344, "grad_norm": 0.702109158039093, "learning_rate": 7.411481893336446e-05, "loss": 2.2937, "step": 2206 }, { "epoch": 0.34008471313053523, "grad_norm": 0.756034255027771, "learning_rate": 7.407241607716326e-05, "loss": 0.9363, "step": 2208 }, { "epoch": 0.3403927608779361, "grad_norm": 0.6211479902267456, "learning_rate": 7.402999067221777e-05, "loss": 0.7543, "step": 2210 }, { "epoch": 0.3407008086253369, "grad_norm": 0.8365891575813293, "learning_rate": 7.398754275826801e-05, "loss": 1.1156, "step": 2212 }, { "epoch": 0.34100885637273776, "grad_norm": 0.824151873588562, "learning_rate": 7.394507237507522e-05, "loss": 1.0464, "step": 2214 }, { "epoch": 0.3413169041201386, "grad_norm": 0.9308121204376221, "learning_rate": 7.390257956242154e-05, "loss": 1.1058, "step": 2216 }, { "epoch": 0.3416249518675395, "grad_norm": 0.6541286110877991, "learning_rate": 7.386006436011026e-05, "loss": 1.2529, "step": 2218 }, { "epoch": 0.34193299961494034, "grad_norm": 0.611873209476471, "learning_rate": 7.381752680796547e-05, "loss": 0.7557, "step": 2220 }, { "epoch": 0.3422410473623412, "grad_norm": 0.8098903894424438, "learning_rate": 7.377496694583237e-05, "loss": 2.2155, "step": 2222 }, { "epoch": 0.342549095109742, "grad_norm": 0.6516982316970825, "learning_rate": 7.373238481357696e-05, "loss": 0.9124, "step": 2224 }, { "epoch": 0.34285714285714286, "grad_norm": 0.5909575819969177, "learning_rate": 7.36897804510861e-05, "loss": 1.0173, "step": 2226 }, { "epoch": 0.3431651906045437, "grad_norm": 0.6586790084838867, "learning_rate": 7.364715389826752e-05, "loss": 1.126, "step": 2228 }, { "epoch": 0.34347323835194454, "grad_norm": 0.8317116498947144, "learning_rate": 7.360450519504972e-05, "loss": 1.014, "step": 2230 }, { "epoch": 0.3437812860993454, "grad_norm": 0.7233576774597168, "learning_rate": 7.35618343813819e-05, "loss": 1.0308, "step": 2232 }, { "epoch": 0.3440893338467462, "grad_norm": 0.8280635476112366, "learning_rate": 7.351914149723404e-05, "loss": 0.9603, "step": 2234 }, { "epoch": 0.34439738159414707, "grad_norm": 0.6424171924591064, "learning_rate": 7.347642658259675e-05, "loss": 1.0437, "step": 2236 }, { "epoch": 0.34470542934154796, "grad_norm": 0.6725434064865112, "learning_rate": 7.343368967748129e-05, "loss": 0.9262, "step": 2238 }, { "epoch": 0.3450134770889488, "grad_norm": 0.8361773490905762, "learning_rate": 7.339093082191953e-05, "loss": 1.1557, "step": 2240 }, { "epoch": 0.34532152483634965, "grad_norm": 0.9322282075881958, "learning_rate": 7.334815005596387e-05, "loss": 0.9511, "step": 2242 }, { "epoch": 0.3456295725837505, "grad_norm": 0.5928962230682373, "learning_rate": 7.330534741968729e-05, "loss": 0.9221, "step": 2244 }, { "epoch": 0.34593762033115133, "grad_norm": 0.8262036442756653, "learning_rate": 7.326252295318318e-05, "loss": 1.0763, "step": 2246 }, { "epoch": 0.34624566807855217, "grad_norm": 0.6837438941001892, "learning_rate": 7.321967669656545e-05, "loss": 1.0927, "step": 2248 }, { "epoch": 0.346553715825953, "grad_norm": 0.9353243708610535, "learning_rate": 7.317680868996833e-05, "loss": 0.9783, "step": 2250 }, { "epoch": 0.34686176357335385, "grad_norm": 0.9607541561126709, "learning_rate": 7.313391897354654e-05, "loss": 1.081, "step": 2252 }, { "epoch": 0.3471698113207547, "grad_norm": 0.6798626184463501, "learning_rate": 7.309100758747506e-05, "loss": 0.9822, "step": 2254 }, { "epoch": 0.34747785906815554, "grad_norm": 1.0110447406768799, "learning_rate": 7.304807457194918e-05, "loss": 0.959, "step": 2256 }, { "epoch": 0.34778590681555643, "grad_norm": 0.9492419958114624, "learning_rate": 7.300511996718447e-05, "loss": 1.0298, "step": 2258 }, { "epoch": 0.3480939545629573, "grad_norm": 0.5809617638587952, "learning_rate": 7.29621438134167e-05, "loss": 0.8646, "step": 2260 }, { "epoch": 0.3484020023103581, "grad_norm": 0.9347992539405823, "learning_rate": 7.29191461509018e-05, "loss": 1.047, "step": 2262 }, { "epoch": 0.34871005005775896, "grad_norm": 0.6787614226341248, "learning_rate": 7.287612701991595e-05, "loss": 0.9601, "step": 2264 }, { "epoch": 0.3490180978051598, "grad_norm": 0.655129611492157, "learning_rate": 7.28330864607553e-05, "loss": 0.9365, "step": 2266 }, { "epoch": 0.34932614555256064, "grad_norm": 0.732175350189209, "learning_rate": 7.27900245137362e-05, "loss": 1.0254, "step": 2268 }, { "epoch": 0.3496341932999615, "grad_norm": 0.7303794026374817, "learning_rate": 7.274694121919495e-05, "loss": 2.1755, "step": 2270 }, { "epoch": 0.3499422410473623, "grad_norm": 0.852927565574646, "learning_rate": 7.270383661748786e-05, "loss": 1.173, "step": 2272 }, { "epoch": 0.35025028879476316, "grad_norm": 0.9458498954772949, "learning_rate": 7.266071074899124e-05, "loss": 1.1105, "step": 2274 }, { "epoch": 0.35055833654216406, "grad_norm": 1.054835557937622, "learning_rate": 7.261756365410126e-05, "loss": 1.0068, "step": 2276 }, { "epoch": 0.3508663842895649, "grad_norm": 0.8554360270500183, "learning_rate": 7.257439537323403e-05, "loss": 1.1411, "step": 2278 }, { "epoch": 0.35117443203696574, "grad_norm": 0.9421424269676208, "learning_rate": 7.253120594682547e-05, "loss": 1.161, "step": 2280 }, { "epoch": 0.3514824797843666, "grad_norm": 0.5921900272369385, "learning_rate": 7.24879954153313e-05, "loss": 1.893, "step": 2282 }, { "epoch": 0.3517905275317674, "grad_norm": 0.8463044762611389, "learning_rate": 7.244476381922708e-05, "loss": 1.1132, "step": 2284 }, { "epoch": 0.35209857527916827, "grad_norm": 0.6275113224983215, "learning_rate": 7.240151119900797e-05, "loss": 1.1475, "step": 2286 }, { "epoch": 0.3524066230265691, "grad_norm": 0.8620054125785828, "learning_rate": 7.2358237595189e-05, "loss": 1.212, "step": 2288 }, { "epoch": 0.35271467077396995, "grad_norm": 0.8541956543922424, "learning_rate": 7.231494304830465e-05, "loss": 1.0452, "step": 2290 }, { "epoch": 0.3530227185213708, "grad_norm": 0.5204923748970032, "learning_rate": 7.227162759890919e-05, "loss": 0.8911, "step": 2292 }, { "epoch": 0.35333076626877163, "grad_norm": 0.5744819045066833, "learning_rate": 7.22282912875764e-05, "loss": 2.2718, "step": 2294 }, { "epoch": 0.35363881401617253, "grad_norm": 0.6028558015823364, "learning_rate": 7.218493415489956e-05, "loss": 1.2403, "step": 2296 }, { "epoch": 0.3539468617635734, "grad_norm": 0.8307321071624756, "learning_rate": 7.214155624149156e-05, "loss": 1.0185, "step": 2298 }, { "epoch": 0.3542549095109742, "grad_norm": 1.0472615957260132, "learning_rate": 7.209815758798464e-05, "loss": 1.0723, "step": 2300 }, { "epoch": 0.35456295725837506, "grad_norm": 0.7444417476654053, "learning_rate": 7.205473823503057e-05, "loss": 0.942, "step": 2302 }, { "epoch": 0.3548710050057759, "grad_norm": 0.681360125541687, "learning_rate": 7.201129822330041e-05, "loss": 1.0413, "step": 2304 }, { "epoch": 0.35517905275317674, "grad_norm": 0.7587233185768127, "learning_rate": 7.196783759348465e-05, "loss": 0.931, "step": 2306 }, { "epoch": 0.3554871005005776, "grad_norm": 0.5804754495620728, "learning_rate": 7.192435638629307e-05, "loss": 1.9602, "step": 2308 }, { "epoch": 0.3557951482479784, "grad_norm": 0.5681557059288025, "learning_rate": 7.18808546424547e-05, "loss": 0.8623, "step": 2310 }, { "epoch": 0.35610319599537926, "grad_norm": 0.9095214009284973, "learning_rate": 7.183733240271784e-05, "loss": 1.2142, "step": 2312 }, { "epoch": 0.3564112437427801, "grad_norm": 0.8182647228240967, "learning_rate": 7.179378970784997e-05, "loss": 1.1915, "step": 2314 }, { "epoch": 0.356719291490181, "grad_norm": 0.8019623160362244, "learning_rate": 7.175022659863773e-05, "loss": 1.843, "step": 2316 }, { "epoch": 0.35702733923758184, "grad_norm": 0.7972326874732971, "learning_rate": 7.17066431158869e-05, "loss": 1.0202, "step": 2318 }, { "epoch": 0.3573353869849827, "grad_norm": 0.5969523787498474, "learning_rate": 7.166303930042233e-05, "loss": 2.6547, "step": 2320 }, { "epoch": 0.3576434347323835, "grad_norm": 1.3465994596481323, "learning_rate": 7.16194151930879e-05, "loss": 1.0466, "step": 2322 }, { "epoch": 0.35795148247978437, "grad_norm": 0.8386397957801819, "learning_rate": 7.157577083474653e-05, "loss": 1.1265, "step": 2324 }, { "epoch": 0.3582595302271852, "grad_norm": 0.675894558429718, "learning_rate": 7.153210626628007e-05, "loss": 1.0571, "step": 2326 }, { "epoch": 0.35856757797458605, "grad_norm": 0.656844973564148, "learning_rate": 7.148842152858938e-05, "loss": 1.0948, "step": 2328 }, { "epoch": 0.3588756257219869, "grad_norm": 0.665457546710968, "learning_rate": 7.144471666259409e-05, "loss": 1.0663, "step": 2330 }, { "epoch": 0.35918367346938773, "grad_norm": 0.7601264119148254, "learning_rate": 7.140099170923281e-05, "loss": 0.9879, "step": 2332 }, { "epoch": 0.35949172121678863, "grad_norm": 1.0152552127838135, "learning_rate": 7.135724670946288e-05, "loss": 1.0867, "step": 2334 }, { "epoch": 0.35979976896418947, "grad_norm": 0.9408657550811768, "learning_rate": 7.131348170426042e-05, "loss": 1.0168, "step": 2336 }, { "epoch": 0.3601078167115903, "grad_norm": 1.0523954629898071, "learning_rate": 7.126969673462037e-05, "loss": 1.1912, "step": 2338 }, { "epoch": 0.36041586445899115, "grad_norm": 0.9186645150184631, "learning_rate": 7.122589184155626e-05, "loss": 1.1382, "step": 2340 }, { "epoch": 0.360723912206392, "grad_norm": 0.7351142168045044, "learning_rate": 7.118206706610038e-05, "loss": 0.9642, "step": 2342 }, { "epoch": 0.36103195995379284, "grad_norm": 0.714341938495636, "learning_rate": 7.113822244930357e-05, "loss": 0.9253, "step": 2344 }, { "epoch": 0.3613400077011937, "grad_norm": 0.7643159627914429, "learning_rate": 7.109435803223531e-05, "loss": 1.0953, "step": 2346 }, { "epoch": 0.3616480554485945, "grad_norm": 0.8597730398178101, "learning_rate": 7.105047385598359e-05, "loss": 1.0206, "step": 2348 }, { "epoch": 0.36195610319599536, "grad_norm": 0.6703843474388123, "learning_rate": 7.100656996165493e-05, "loss": 0.919, "step": 2350 }, { "epoch": 0.3622641509433962, "grad_norm": 0.8356922268867493, "learning_rate": 7.096264639037431e-05, "loss": 0.9216, "step": 2352 }, { "epoch": 0.3625721986907971, "grad_norm": 0.7385930418968201, "learning_rate": 7.091870318328515e-05, "loss": 1.3654, "step": 2354 }, { "epoch": 0.36288024643819794, "grad_norm": 0.6730544567108154, "learning_rate": 7.087474038154924e-05, "loss": 2.8678, "step": 2356 }, { "epoch": 0.3631882941855988, "grad_norm": 0.7496978044509888, "learning_rate": 7.083075802634675e-05, "loss": 0.9139, "step": 2358 }, { "epoch": 0.3634963419329996, "grad_norm": 0.8838191628456116, "learning_rate": 7.078675615887618e-05, "loss": 0.9811, "step": 2360 }, { "epoch": 0.36380438968040046, "grad_norm": 0.642116129398346, "learning_rate": 7.074273482035424e-05, "loss": 1.0675, "step": 2362 }, { "epoch": 0.3641124374278013, "grad_norm": 0.6837906837463379, "learning_rate": 7.069869405201595e-05, "loss": 1.0903, "step": 2364 }, { "epoch": 0.36442048517520215, "grad_norm": 0.7109867334365845, "learning_rate": 7.065463389511449e-05, "loss": 0.9847, "step": 2366 }, { "epoch": 0.364728532922603, "grad_norm": 0.5639575719833374, "learning_rate": 7.061055439092126e-05, "loss": 0.865, "step": 2368 }, { "epoch": 0.36503658067000383, "grad_norm": 0.7767153382301331, "learning_rate": 7.056645558072565e-05, "loss": 1.1613, "step": 2370 }, { "epoch": 0.36534462841740467, "grad_norm": 0.7072895169258118, "learning_rate": 7.052233750583532e-05, "loss": 1.0345, "step": 2372 }, { "epoch": 0.36565267616480557, "grad_norm": 0.7484140992164612, "learning_rate": 7.047820020757579e-05, "loss": 0.905, "step": 2374 }, { "epoch": 0.3659607239122064, "grad_norm": 0.7483952641487122, "learning_rate": 7.043404372729072e-05, "loss": 1.2131, "step": 2376 }, { "epoch": 0.36626877165960725, "grad_norm": 0.7827897071838379, "learning_rate": 7.03898681063417e-05, "loss": 1.0051, "step": 2378 }, { "epoch": 0.3665768194070081, "grad_norm": 0.6883440613746643, "learning_rate": 7.034567338610819e-05, "loss": 1.1551, "step": 2380 }, { "epoch": 0.36688486715440893, "grad_norm": 0.8326652646064758, "learning_rate": 7.030145960798764e-05, "loss": 0.8967, "step": 2382 }, { "epoch": 0.3671929149018098, "grad_norm": 0.776249349117279, "learning_rate": 7.025722681339528e-05, "loss": 1.0518, "step": 2384 }, { "epoch": 0.3675009626492106, "grad_norm": 0.5774699449539185, "learning_rate": 7.021297504376418e-05, "loss": 0.9455, "step": 2386 }, { "epoch": 0.36780901039661146, "grad_norm": 0.7364729642868042, "learning_rate": 7.016870434054517e-05, "loss": 0.8656, "step": 2388 }, { "epoch": 0.3681170581440123, "grad_norm": 0.7618663311004639, "learning_rate": 7.012441474520683e-05, "loss": 1.2443, "step": 2390 }, { "epoch": 0.3684251058914132, "grad_norm": 0.6586353182792664, "learning_rate": 7.008010629923544e-05, "loss": 1.1953, "step": 2392 }, { "epoch": 0.36873315363881404, "grad_norm": 0.6616129279136658, "learning_rate": 7.003577904413492e-05, "loss": 1.0623, "step": 2394 }, { "epoch": 0.3690412013862149, "grad_norm": 0.8203920125961304, "learning_rate": 6.999143302142681e-05, "loss": 1.0939, "step": 2396 }, { "epoch": 0.3693492491336157, "grad_norm": 0.6637328863143921, "learning_rate": 6.994706827265024e-05, "loss": 1.0425, "step": 2398 }, { "epoch": 0.36965729688101656, "grad_norm": 0.7866489887237549, "learning_rate": 6.990268483936189e-05, "loss": 0.989, "step": 2400 }, { "epoch": 0.36965729688101656, "eval_loss": 2.5227484703063965, "eval_runtime": 737.0745, "eval_samples_per_second": 2.713, "eval_steps_per_second": 0.678, "step": 2400 }, { "epoch": 0.3699653446284174, "grad_norm": 0.7644304633140564, "learning_rate": 6.98582827631359e-05, "loss": 2.0393, "step": 2402 }, { "epoch": 0.37027339237581824, "grad_norm": 0.7707456350326538, "learning_rate": 6.981386208556394e-05, "loss": 0.9427, "step": 2404 }, { "epoch": 0.3705814401232191, "grad_norm": 0.7313803434371948, "learning_rate": 6.9769422848255e-05, "loss": 1.0733, "step": 2406 }, { "epoch": 0.3708894878706199, "grad_norm": 0.7003641128540039, "learning_rate": 6.972496509283562e-05, "loss": 1.4143, "step": 2408 }, { "epoch": 0.37119753561802077, "grad_norm": 0.7405939102172852, "learning_rate": 6.96804888609495e-05, "loss": 1.2404, "step": 2410 }, { "epoch": 0.37150558336542167, "grad_norm": 0.4707174003124237, "learning_rate": 6.963599419425777e-05, "loss": 0.8943, "step": 2412 }, { "epoch": 0.3718136311128225, "grad_norm": 0.955906331539154, "learning_rate": 6.959148113443879e-05, "loss": 1.1045, "step": 2414 }, { "epoch": 0.37212167886022335, "grad_norm": 0.674000084400177, "learning_rate": 6.954694972318816e-05, "loss": 0.9863, "step": 2416 }, { "epoch": 0.3724297266076242, "grad_norm": 0.9632213115692139, "learning_rate": 6.950240000221862e-05, "loss": 0.8621, "step": 2418 }, { "epoch": 0.37273777435502503, "grad_norm": 0.9839867353439331, "learning_rate": 6.945783201326015e-05, "loss": 1.1482, "step": 2420 }, { "epoch": 0.3730458221024259, "grad_norm": 0.6643813848495483, "learning_rate": 6.941324579805977e-05, "loss": 0.8935, "step": 2422 }, { "epoch": 0.3733538698498267, "grad_norm": 0.7968766689300537, "learning_rate": 6.936864139838158e-05, "loss": 1.2712, "step": 2424 }, { "epoch": 0.37366191759722756, "grad_norm": 0.7045233845710754, "learning_rate": 6.932401885600678e-05, "loss": 1.0575, "step": 2426 }, { "epoch": 0.3739699653446284, "grad_norm": 0.7721226811408997, "learning_rate": 6.927937821273344e-05, "loss": 1.0979, "step": 2428 }, { "epoch": 0.37427801309202924, "grad_norm": 0.775489330291748, "learning_rate": 6.923471951037672e-05, "loss": 1.2188, "step": 2430 }, { "epoch": 0.37458606083943013, "grad_norm": 0.6731612086296082, "learning_rate": 6.919004279076862e-05, "loss": 1.0237, "step": 2432 }, { "epoch": 0.374894108586831, "grad_norm": 1.1068634986877441, "learning_rate": 6.914534809575802e-05, "loss": 0.9837, "step": 2434 }, { "epoch": 0.3752021563342318, "grad_norm": 0.683112382888794, "learning_rate": 6.910063546721064e-05, "loss": 1.0354, "step": 2436 }, { "epoch": 0.37551020408163266, "grad_norm": 0.7034465670585632, "learning_rate": 6.905590494700905e-05, "loss": 1.2002, "step": 2438 }, { "epoch": 0.3758182518290335, "grad_norm": 0.862964928150177, "learning_rate": 6.901115657705246e-05, "loss": 0.8784, "step": 2440 }, { "epoch": 0.37612629957643434, "grad_norm": 0.6790128350257874, "learning_rate": 6.896639039925697e-05, "loss": 0.9937, "step": 2442 }, { "epoch": 0.3764343473238352, "grad_norm": 1.4234968423843384, "learning_rate": 6.892160645555521e-05, "loss": 0.9885, "step": 2444 }, { "epoch": 0.376742395071236, "grad_norm": 0.7562160491943359, "learning_rate": 6.88768047878965e-05, "loss": 1.0742, "step": 2446 }, { "epoch": 0.37705044281863687, "grad_norm": 0.6968784928321838, "learning_rate": 6.883198543824681e-05, "loss": 1.1895, "step": 2448 }, { "epoch": 0.37735849056603776, "grad_norm": 0.760648787021637, "learning_rate": 6.87871484485886e-05, "loss": 1.0247, "step": 2450 }, { "epoch": 0.3776665383134386, "grad_norm": 0.7358818650245667, "learning_rate": 6.874229386092092e-05, "loss": 1.0636, "step": 2452 }, { "epoch": 0.37797458606083945, "grad_norm": 0.6471631526947021, "learning_rate": 6.869742171725924e-05, "loss": 0.915, "step": 2454 }, { "epoch": 0.3782826338082403, "grad_norm": 0.7820512652397156, "learning_rate": 6.865253205963555e-05, "loss": 0.944, "step": 2456 }, { "epoch": 0.37859068155564113, "grad_norm": 0.7044534683227539, "learning_rate": 6.860762493009814e-05, "loss": 1.0403, "step": 2458 }, { "epoch": 0.37889872930304197, "grad_norm": 0.8854067325592041, "learning_rate": 6.856270037071176e-05, "loss": 1.0855, "step": 2460 }, { "epoch": 0.3792067770504428, "grad_norm": 0.6846293210983276, "learning_rate": 6.851775842355746e-05, "loss": 1.2417, "step": 2462 }, { "epoch": 0.37951482479784365, "grad_norm": 0.7980126738548279, "learning_rate": 6.847279913073255e-05, "loss": 1.2656, "step": 2464 }, { "epoch": 0.3798228725452445, "grad_norm": 1.008276104927063, "learning_rate": 6.842782253435065e-05, "loss": 1.2209, "step": 2466 }, { "epoch": 0.38013092029264534, "grad_norm": 0.6689044237136841, "learning_rate": 6.838282867654149e-05, "loss": 1.6123, "step": 2468 }, { "epoch": 0.38043896804004623, "grad_norm": 0.9234145879745483, "learning_rate": 6.833781759945107e-05, "loss": 1.2141, "step": 2470 }, { "epoch": 0.3807470157874471, "grad_norm": 0.6782144904136658, "learning_rate": 6.829278934524146e-05, "loss": 1.1363, "step": 2472 }, { "epoch": 0.3810550635348479, "grad_norm": 0.569631814956665, "learning_rate": 6.824774395609085e-05, "loss": 1.0716, "step": 2474 }, { "epoch": 0.38136311128224876, "grad_norm": 0.5350009799003601, "learning_rate": 6.820268147419344e-05, "loss": 1.2837, "step": 2476 }, { "epoch": 0.3816711590296496, "grad_norm": 0.6322665810585022, "learning_rate": 6.815760194175949e-05, "loss": 1.1355, "step": 2478 }, { "epoch": 0.38197920677705044, "grad_norm": 0.9357727766036987, "learning_rate": 6.811250540101517e-05, "loss": 2.2662, "step": 2480 }, { "epoch": 0.3822872545244513, "grad_norm": 0.9705163240432739, "learning_rate": 6.806739189420269e-05, "loss": 1.028, "step": 2482 }, { "epoch": 0.3825953022718521, "grad_norm": 0.8012816905975342, "learning_rate": 6.802226146358001e-05, "loss": 0.9716, "step": 2484 }, { "epoch": 0.38290335001925296, "grad_norm": 0.7949168682098389, "learning_rate": 6.797711415142105e-05, "loss": 1.0279, "step": 2486 }, { "epoch": 0.3832113977666538, "grad_norm": 0.7826275825500488, "learning_rate": 6.793195000001551e-05, "loss": 1.3541, "step": 2488 }, { "epoch": 0.3835194455140547, "grad_norm": 0.8680853247642517, "learning_rate": 6.788676905166884e-05, "loss": 1.2099, "step": 2490 }, { "epoch": 0.38382749326145554, "grad_norm": 0.8269588351249695, "learning_rate": 6.784157134870228e-05, "loss": 0.8367, "step": 2492 }, { "epoch": 0.3841355410088564, "grad_norm": 0.8308361172676086, "learning_rate": 6.779635693345268e-05, "loss": 0.9333, "step": 2494 }, { "epoch": 0.3844435887562572, "grad_norm": 0.6732942461967468, "learning_rate": 6.775112584827266e-05, "loss": 0.8547, "step": 2496 }, { "epoch": 0.38475163650365807, "grad_norm": 0.7414329051971436, "learning_rate": 6.77058781355303e-05, "loss": 1.1987, "step": 2498 }, { "epoch": 0.3850596842510589, "grad_norm": 0.7550074458122253, "learning_rate": 6.766061383760943e-05, "loss": 1.1309, "step": 2500 }, { "epoch": 0.38536773199845975, "grad_norm": 0.6689410209655762, "learning_rate": 6.761533299690927e-05, "loss": 0.8807, "step": 2502 }, { "epoch": 0.3856757797458606, "grad_norm": 0.7514511942863464, "learning_rate": 6.757003565584463e-05, "loss": 0.8829, "step": 2504 }, { "epoch": 0.38598382749326143, "grad_norm": 0.6438677310943604, "learning_rate": 6.752472185684573e-05, "loss": 0.8669, "step": 2506 }, { "epoch": 0.38629187524066233, "grad_norm": 0.9370463490486145, "learning_rate": 6.747939164235819e-05, "loss": 0.9831, "step": 2508 }, { "epoch": 0.38659992298806317, "grad_norm": 1.0750024318695068, "learning_rate": 6.743404505484308e-05, "loss": 1.2794, "step": 2510 }, { "epoch": 0.386907970735464, "grad_norm": 0.8463871479034424, "learning_rate": 6.738868213677671e-05, "loss": 2.0352, "step": 2512 }, { "epoch": 0.38721601848286485, "grad_norm": 0.8012046813964844, "learning_rate": 6.734330293065079e-05, "loss": 0.9807, "step": 2514 }, { "epoch": 0.3875240662302657, "grad_norm": 0.9211841225624084, "learning_rate": 6.729790747897219e-05, "loss": 1.1453, "step": 2516 }, { "epoch": 0.38783211397766654, "grad_norm": 0.9312162399291992, "learning_rate": 6.725249582426306e-05, "loss": 1.1544, "step": 2518 }, { "epoch": 0.3881401617250674, "grad_norm": 0.5721031427383423, "learning_rate": 6.72070680090607e-05, "loss": 1.2422, "step": 2520 }, { "epoch": 0.3884482094724682, "grad_norm": 0.8398732542991638, "learning_rate": 6.716162407591757e-05, "loss": 0.9103, "step": 2522 }, { "epoch": 0.38875625721986906, "grad_norm": 1.1913796663284302, "learning_rate": 6.711616406740121e-05, "loss": 1.1755, "step": 2524 }, { "epoch": 0.3890643049672699, "grad_norm": 0.6675463914871216, "learning_rate": 6.707068802609421e-05, "loss": 0.9846, "step": 2526 }, { "epoch": 0.3893723527146708, "grad_norm": 0.6696200370788574, "learning_rate": 6.70251959945942e-05, "loss": 0.922, "step": 2528 }, { "epoch": 0.38968040046207164, "grad_norm": 0.7850413918495178, "learning_rate": 6.697968801551378e-05, "loss": 0.9998, "step": 2530 }, { "epoch": 0.3899884482094725, "grad_norm": 0.7170053124427795, "learning_rate": 6.693416413148045e-05, "loss": 0.9936, "step": 2532 }, { "epoch": 0.3902964959568733, "grad_norm": 0.7705442309379578, "learning_rate": 6.68886243851367e-05, "loss": 0.9031, "step": 2534 }, { "epoch": 0.39060454370427417, "grad_norm": 0.7424684166908264, "learning_rate": 6.684306881913982e-05, "loss": 1.0102, "step": 2536 }, { "epoch": 0.390912591451675, "grad_norm": 0.7100473642349243, "learning_rate": 6.679749747616186e-05, "loss": 0.8898, "step": 2538 }, { "epoch": 0.39122063919907585, "grad_norm": 0.97356116771698, "learning_rate": 6.675191039888978e-05, "loss": 0.863, "step": 2540 }, { "epoch": 0.3915286869464767, "grad_norm": 0.8761134743690491, "learning_rate": 6.670630763002514e-05, "loss": 1.0494, "step": 2542 }, { "epoch": 0.39183673469387753, "grad_norm": 0.8845568895339966, "learning_rate": 6.666068921228433e-05, "loss": 1.2686, "step": 2544 }, { "epoch": 0.3921447824412784, "grad_norm": 0.6904786229133606, "learning_rate": 6.66150551883983e-05, "loss": 0.9315, "step": 2546 }, { "epoch": 0.39245283018867927, "grad_norm": 0.6124597191810608, "learning_rate": 6.656940560111267e-05, "loss": 1.2245, "step": 2548 }, { "epoch": 0.3927608779360801, "grad_norm": 0.860092282295227, "learning_rate": 6.65237404931876e-05, "loss": 1.1557, "step": 2550 }, { "epoch": 0.39306892568348095, "grad_norm": 0.7460459470748901, "learning_rate": 6.647805990739782e-05, "loss": 0.9544, "step": 2552 }, { "epoch": 0.3933769734308818, "grad_norm": 0.7506148815155029, "learning_rate": 6.643236388653255e-05, "loss": 1.2874, "step": 2554 }, { "epoch": 0.39368502117828263, "grad_norm": 0.6355990767478943, "learning_rate": 6.638665247339546e-05, "loss": 0.9506, "step": 2556 }, { "epoch": 0.3939930689256835, "grad_norm": 0.693425714969635, "learning_rate": 6.634092571080465e-05, "loss": 0.8779, "step": 2558 }, { "epoch": 0.3943011166730843, "grad_norm": 0.8063914775848389, "learning_rate": 6.629518364159259e-05, "loss": 1.1405, "step": 2560 }, { "epoch": 0.39460916442048516, "grad_norm": 0.6337122321128845, "learning_rate": 6.624942630860607e-05, "loss": 1.1013, "step": 2562 }, { "epoch": 0.394917212167886, "grad_norm": 0.9229851961135864, "learning_rate": 6.620365375470623e-05, "loss": 0.9953, "step": 2564 }, { "epoch": 0.39522525991528684, "grad_norm": 0.7974163889884949, "learning_rate": 6.615786602276843e-05, "loss": 0.9866, "step": 2566 }, { "epoch": 0.39553330766268774, "grad_norm": 0.7080705165863037, "learning_rate": 6.611206315568223e-05, "loss": 0.9946, "step": 2568 }, { "epoch": 0.3958413554100886, "grad_norm": 0.7690320611000061, "learning_rate": 6.606624519635138e-05, "loss": 0.9264, "step": 2570 }, { "epoch": 0.3961494031574894, "grad_norm": 0.8104195594787598, "learning_rate": 6.602041218769383e-05, "loss": 1.0719, "step": 2572 }, { "epoch": 0.39645745090489026, "grad_norm": 0.5908204317092896, "learning_rate": 6.597456417264151e-05, "loss": 0.9732, "step": 2574 }, { "epoch": 0.3967654986522911, "grad_norm": 0.6712586283683777, "learning_rate": 6.592870119414052e-05, "loss": 1.9431, "step": 2576 }, { "epoch": 0.39707354639969195, "grad_norm": 0.8229988813400269, "learning_rate": 6.588282329515089e-05, "loss": 0.9665, "step": 2578 }, { "epoch": 0.3973815941470928, "grad_norm": 0.6816299557685852, "learning_rate": 6.583693051864668e-05, "loss": 0.6664, "step": 2580 }, { "epoch": 0.39768964189449363, "grad_norm": 0.6721845269203186, "learning_rate": 6.579102290761586e-05, "loss": 0.9116, "step": 2582 }, { "epoch": 0.39799768964189447, "grad_norm": 0.5877856016159058, "learning_rate": 6.57451005050603e-05, "loss": 0.9351, "step": 2584 }, { "epoch": 0.39830573738929537, "grad_norm": 0.7047834396362305, "learning_rate": 6.569916335399576e-05, "loss": 0.9426, "step": 2586 }, { "epoch": 0.3986137851366962, "grad_norm": 0.7781816720962524, "learning_rate": 6.565321149745174e-05, "loss": 1.0048, "step": 2588 }, { "epoch": 0.39892183288409705, "grad_norm": 0.6085528135299683, "learning_rate": 6.560724497847159e-05, "loss": 0.8135, "step": 2590 }, { "epoch": 0.3992298806314979, "grad_norm": 0.8343983888626099, "learning_rate": 6.556126384011233e-05, "loss": 1.1167, "step": 2592 }, { "epoch": 0.39953792837889873, "grad_norm": 0.7468252182006836, "learning_rate": 6.551526812544474e-05, "loss": 1.3543, "step": 2594 }, { "epoch": 0.3998459761262996, "grad_norm": 0.7748050093650818, "learning_rate": 6.546925787755321e-05, "loss": 1.1425, "step": 2596 }, { "epoch": 0.4001540238737004, "grad_norm": 0.7804863452911377, "learning_rate": 6.542323313953574e-05, "loss": 1.0779, "step": 2598 }, { "epoch": 0.40046207162110126, "grad_norm": 0.9270213842391968, "learning_rate": 6.537719395450391e-05, "loss": 0.9406, "step": 2600 }, { "epoch": 0.4007701193685021, "grad_norm": 0.6129812002182007, "learning_rate": 6.533114036558287e-05, "loss": 0.9975, "step": 2602 }, { "epoch": 0.40107816711590294, "grad_norm": 0.7297736406326294, "learning_rate": 6.528507241591121e-05, "loss": 2.0731, "step": 2604 }, { "epoch": 0.40138621486330384, "grad_norm": 0.7494378089904785, "learning_rate": 6.523899014864102e-05, "loss": 1.9419, "step": 2606 }, { "epoch": 0.4016942626107047, "grad_norm": 0.7798280119895935, "learning_rate": 6.519289360693774e-05, "loss": 0.9775, "step": 2608 }, { "epoch": 0.4020023103581055, "grad_norm": 0.8234038352966309, "learning_rate": 6.514678283398022e-05, "loss": 1.6277, "step": 2610 }, { "epoch": 0.40231035810550636, "grad_norm": 0.7135938405990601, "learning_rate": 6.510065787296064e-05, "loss": 0.8109, "step": 2612 }, { "epoch": 0.4026184058529072, "grad_norm": 0.8007364869117737, "learning_rate": 6.505451876708448e-05, "loss": 1.1473, "step": 2614 }, { "epoch": 0.40292645360030804, "grad_norm": 0.8691009879112244, "learning_rate": 6.500836555957046e-05, "loss": 1.1948, "step": 2616 }, { "epoch": 0.4032345013477089, "grad_norm": 0.985650360584259, "learning_rate": 6.496219829365048e-05, "loss": 1.1655, "step": 2618 }, { "epoch": 0.4035425490951097, "grad_norm": 0.8488809466362, "learning_rate": 6.491601701256966e-05, "loss": 1.0303, "step": 2620 }, { "epoch": 0.40385059684251057, "grad_norm": 0.7097958922386169, "learning_rate": 6.486982175958618e-05, "loss": 1.1143, "step": 2622 }, { "epoch": 0.4041586445899114, "grad_norm": 0.6954580545425415, "learning_rate": 6.482361257797138e-05, "loss": 0.9396, "step": 2624 }, { "epoch": 0.4044666923373123, "grad_norm": 0.671699583530426, "learning_rate": 6.477738951100961e-05, "loss": 1.1556, "step": 2626 }, { "epoch": 0.40477474008471315, "grad_norm": 0.6676186323165894, "learning_rate": 6.473115260199823e-05, "loss": 1.0157, "step": 2628 }, { "epoch": 0.405082787832114, "grad_norm": 0.6902966499328613, "learning_rate": 6.468490189424759e-05, "loss": 1.0301, "step": 2630 }, { "epoch": 0.40539083557951483, "grad_norm": 0.7361807227134705, "learning_rate": 6.46386374310809e-05, "loss": 0.9511, "step": 2632 }, { "epoch": 0.40569888332691567, "grad_norm": 0.6098953485488892, "learning_rate": 6.459235925583433e-05, "loss": 1.0431, "step": 2634 }, { "epoch": 0.4060069310743165, "grad_norm": 0.9778101444244385, "learning_rate": 6.454606741185686e-05, "loss": 1.0221, "step": 2636 }, { "epoch": 0.40631497882171735, "grad_norm": 0.7961477041244507, "learning_rate": 6.449976194251026e-05, "loss": 0.8727, "step": 2638 }, { "epoch": 0.4066230265691182, "grad_norm": 0.6664597988128662, "learning_rate": 6.44534428911691e-05, "loss": 1.0133, "step": 2640 }, { "epoch": 0.40693107431651904, "grad_norm": 0.5414862632751465, "learning_rate": 6.440711030122063e-05, "loss": 0.8452, "step": 2642 }, { "epoch": 0.40723912206391993, "grad_norm": 0.6674546599388123, "learning_rate": 6.43607642160648e-05, "loss": 0.9969, "step": 2644 }, { "epoch": 0.4075471698113208, "grad_norm": 1.1050533056259155, "learning_rate": 6.431440467911424e-05, "loss": 1.2907, "step": 2646 }, { "epoch": 0.4078552175587216, "grad_norm": 0.7038127779960632, "learning_rate": 6.426803173379412e-05, "loss": 1.2864, "step": 2648 }, { "epoch": 0.40816326530612246, "grad_norm": 1.0234121084213257, "learning_rate": 6.422164542354219e-05, "loss": 1.075, "step": 2650 }, { "epoch": 0.4084713130535233, "grad_norm": 0.7568143010139465, "learning_rate": 6.417524579180873e-05, "loss": 1.0523, "step": 2652 }, { "epoch": 0.40877936080092414, "grad_norm": 1.0208611488342285, "learning_rate": 6.412883288205647e-05, "loss": 1.9574, "step": 2654 }, { "epoch": 0.409087408548325, "grad_norm": 6.761949062347412, "learning_rate": 6.408240673776065e-05, "loss": 1.047, "step": 2656 }, { "epoch": 0.4093954562957258, "grad_norm": 0.6190847754478455, "learning_rate": 6.40359674024088e-05, "loss": 1.1624, "step": 2658 }, { "epoch": 0.40970350404312667, "grad_norm": 0.5744146704673767, "learning_rate": 6.398951491950089e-05, "loss": 1.1138, "step": 2660 }, { "epoch": 0.4100115517905275, "grad_norm": 0.7018343210220337, "learning_rate": 6.394304933254916e-05, "loss": 1.5961, "step": 2662 }, { "epoch": 0.4103195995379284, "grad_norm": 0.7297990918159485, "learning_rate": 6.389657068507819e-05, "loss": 0.7845, "step": 2664 }, { "epoch": 0.41062764728532924, "grad_norm": 0.4957539439201355, "learning_rate": 6.385007902062467e-05, "loss": 0.9743, "step": 2666 }, { "epoch": 0.4109356950327301, "grad_norm": 0.6580387353897095, "learning_rate": 6.380357438273763e-05, "loss": 0.9583, "step": 2668 }, { "epoch": 0.4112437427801309, "grad_norm": 0.7626873850822449, "learning_rate": 6.375705681497813e-05, "loss": 1.061, "step": 2670 }, { "epoch": 0.41155179052753177, "grad_norm": 0.7961906790733337, "learning_rate": 6.371052636091942e-05, "loss": 1.9489, "step": 2672 }, { "epoch": 0.4118598382749326, "grad_norm": 0.9162095189094543, "learning_rate": 6.366398306414679e-05, "loss": 1.2871, "step": 2674 }, { "epoch": 0.41216788602233345, "grad_norm": 0.7108187675476074, "learning_rate": 6.361742696825755e-05, "loss": 0.992, "step": 2676 }, { "epoch": 0.4124759337697343, "grad_norm": 0.7019790410995483, "learning_rate": 6.357085811686103e-05, "loss": 1.1067, "step": 2678 }, { "epoch": 0.41278398151713513, "grad_norm": 0.6845753192901611, "learning_rate": 6.352427655357848e-05, "loss": 0.979, "step": 2680 }, { "epoch": 0.413092029264536, "grad_norm": 0.5338165760040283, "learning_rate": 6.347768232204305e-05, "loss": 1.0374, "step": 2682 }, { "epoch": 0.4134000770119369, "grad_norm": 0.6664561033248901, "learning_rate": 6.343107546589982e-05, "loss": 0.8928, "step": 2684 }, { "epoch": 0.4137081247593377, "grad_norm": 0.8769541382789612, "learning_rate": 6.33844560288056e-05, "loss": 1.0479, "step": 2686 }, { "epoch": 0.41401617250673856, "grad_norm": 0.86333167552948, "learning_rate": 6.333782405442904e-05, "loss": 0.8223, "step": 2688 }, { "epoch": 0.4143242202541394, "grad_norm": 0.7412071228027344, "learning_rate": 6.329117958645058e-05, "loss": 1.0054, "step": 2690 }, { "epoch": 0.41463226800154024, "grad_norm": 0.6570004820823669, "learning_rate": 6.324452266856225e-05, "loss": 0.8969, "step": 2692 }, { "epoch": 0.4149403157489411, "grad_norm": 0.5948647856712341, "learning_rate": 6.319785334446783e-05, "loss": 0.8415, "step": 2694 }, { "epoch": 0.4152483634963419, "grad_norm": 0.6360468864440918, "learning_rate": 6.315117165788268e-05, "loss": 0.9539, "step": 2696 }, { "epoch": 0.41555641124374276, "grad_norm": 0.5860743522644043, "learning_rate": 6.310447765253376e-05, "loss": 0.8201, "step": 2698 }, { "epoch": 0.4158644589911436, "grad_norm": 0.8867302536964417, "learning_rate": 6.30577713721596e-05, "loss": 2.5665, "step": 2700 }, { "epoch": 0.4158644589911436, "eval_loss": 2.4475715160369873, "eval_runtime": 737.1021, "eval_samples_per_second": 2.713, "eval_steps_per_second": 0.678, "step": 2700 }, { "epoch": 0.4161725067385445, "grad_norm": 0.9580145478248596, "learning_rate": 6.301105286051013e-05, "loss": 0.9133, "step": 2702 }, { "epoch": 0.41648055448594534, "grad_norm": 0.7150710225105286, "learning_rate": 6.296432216134682e-05, "loss": 0.868, "step": 2704 }, { "epoch": 0.4167886022333462, "grad_norm": 0.761711835861206, "learning_rate": 6.291757931844254e-05, "loss": 1.1175, "step": 2706 }, { "epoch": 0.417096649980747, "grad_norm": 0.8936048746109009, "learning_rate": 6.287082437558151e-05, "loss": 1.0394, "step": 2708 }, { "epoch": 0.41740469772814787, "grad_norm": 0.8021292090415955, "learning_rate": 6.282405737655933e-05, "loss": 1.5893, "step": 2710 }, { "epoch": 0.4177127454755487, "grad_norm": 0.7052441835403442, "learning_rate": 6.277727836518286e-05, "loss": 0.7865, "step": 2712 }, { "epoch": 0.41802079322294955, "grad_norm": 0.9390805959701538, "learning_rate": 6.27304873852702e-05, "loss": 1.0504, "step": 2714 }, { "epoch": 0.4183288409703504, "grad_norm": 0.8395248055458069, "learning_rate": 6.268368448065069e-05, "loss": 0.9347, "step": 2716 }, { "epoch": 0.41863688871775123, "grad_norm": 0.6875936985015869, "learning_rate": 6.263686969516483e-05, "loss": 1.0526, "step": 2718 }, { "epoch": 0.4189449364651521, "grad_norm": 0.6971736550331116, "learning_rate": 6.259004307266426e-05, "loss": 1.1244, "step": 2720 }, { "epoch": 0.41925298421255297, "grad_norm": 0.7761056423187256, "learning_rate": 6.254320465701166e-05, "loss": 0.908, "step": 2722 }, { "epoch": 0.4195610319599538, "grad_norm": 0.76072096824646, "learning_rate": 6.249635449208085e-05, "loss": 0.848, "step": 2724 }, { "epoch": 0.41986907970735465, "grad_norm": 0.7384543418884277, "learning_rate": 6.244949262175654e-05, "loss": 1.108, "step": 2726 }, { "epoch": 0.4201771274547555, "grad_norm": 0.9263654947280884, "learning_rate": 6.240261908993447e-05, "loss": 0.9928, "step": 2728 }, { "epoch": 0.42048517520215634, "grad_norm": 0.5557862520217896, "learning_rate": 6.235573394052134e-05, "loss": 0.8398, "step": 2730 }, { "epoch": 0.4207932229495572, "grad_norm": 0.7384987473487854, "learning_rate": 6.230883721743462e-05, "loss": 1.2297, "step": 2732 }, { "epoch": 0.421101270696958, "grad_norm": 0.6595007181167603, "learning_rate": 6.226192896460277e-05, "loss": 0.8903, "step": 2734 }, { "epoch": 0.42140931844435886, "grad_norm": 0.7695440649986267, "learning_rate": 6.221500922596488e-05, "loss": 0.9697, "step": 2736 }, { "epoch": 0.4217173661917597, "grad_norm": 0.933242917060852, "learning_rate": 6.216807804547097e-05, "loss": 1.0735, "step": 2738 }, { "epoch": 0.42202541393916054, "grad_norm": 0.8982356786727905, "learning_rate": 6.212113546708165e-05, "loss": 0.7696, "step": 2740 }, { "epoch": 0.42233346168656144, "grad_norm": 0.8741735816001892, "learning_rate": 6.207418153476824e-05, "loss": 1.334, "step": 2742 }, { "epoch": 0.4226415094339623, "grad_norm": 0.8034881353378296, "learning_rate": 6.202721629251278e-05, "loss": 2.425, "step": 2744 }, { "epoch": 0.4229495571813631, "grad_norm": 0.582089364528656, "learning_rate": 6.198023978430774e-05, "loss": 0.9689, "step": 2746 }, { "epoch": 0.42325760492876396, "grad_norm": 0.6534414887428284, "learning_rate": 6.193325205415629e-05, "loss": 0.9952, "step": 2748 }, { "epoch": 0.4235656526761648, "grad_norm": 0.7840991616249084, "learning_rate": 6.188625314607201e-05, "loss": 0.963, "step": 2750 }, { "epoch": 0.42387370042356565, "grad_norm": 0.7565934062004089, "learning_rate": 6.183924310407905e-05, "loss": 0.8481, "step": 2752 }, { "epoch": 0.4241817481709665, "grad_norm": 0.7571336627006531, "learning_rate": 6.17922219722119e-05, "loss": 0.9408, "step": 2754 }, { "epoch": 0.42448979591836733, "grad_norm": 0.8127564787864685, "learning_rate": 6.17451897945155e-05, "loss": 0.93, "step": 2756 }, { "epoch": 0.42479784366576817, "grad_norm": 1.051659345626831, "learning_rate": 6.169814661504509e-05, "loss": 1.2283, "step": 2758 }, { "epoch": 0.42510589141316907, "grad_norm": 0.8676214814186096, "learning_rate": 6.165109247786624e-05, "loss": 1.3913, "step": 2760 }, { "epoch": 0.4254139391605699, "grad_norm": 0.7785351872444153, "learning_rate": 6.160402742705477e-05, "loss": 0.9559, "step": 2762 }, { "epoch": 0.42572198690797075, "grad_norm": 0.7920562624931335, "learning_rate": 6.155695150669675e-05, "loss": 0.9236, "step": 2764 }, { "epoch": 0.4260300346553716, "grad_norm": 0.6295596957206726, "learning_rate": 6.150986476088841e-05, "loss": 1.2528, "step": 2766 }, { "epoch": 0.42633808240277243, "grad_norm": 0.6867069602012634, "learning_rate": 6.14627672337361e-05, "loss": 1.0019, "step": 2768 }, { "epoch": 0.4266461301501733, "grad_norm": 0.7173376083374023, "learning_rate": 6.141565896935633e-05, "loss": 1.2387, "step": 2770 }, { "epoch": 0.4269541778975741, "grad_norm": 0.7977665662765503, "learning_rate": 6.13685400118756e-05, "loss": 1.277, "step": 2772 }, { "epoch": 0.42726222564497496, "grad_norm": 0.784441351890564, "learning_rate": 6.13214104054305e-05, "loss": 0.885, "step": 2774 }, { "epoch": 0.4275702733923758, "grad_norm": 1.0195475816726685, "learning_rate": 6.127427019416748e-05, "loss": 1.2057, "step": 2776 }, { "epoch": 0.42787832113977664, "grad_norm": 1.036000370979309, "learning_rate": 6.122711942224308e-05, "loss": 1.1531, "step": 2778 }, { "epoch": 0.42818636888717754, "grad_norm": 0.7378907799720764, "learning_rate": 6.117995813382357e-05, "loss": 1.0969, "step": 2780 }, { "epoch": 0.4284944166345784, "grad_norm": 0.8123480081558228, "learning_rate": 6.113278637308519e-05, "loss": 1.0515, "step": 2782 }, { "epoch": 0.4288024643819792, "grad_norm": 0.8694273829460144, "learning_rate": 6.108560418421397e-05, "loss": 0.7927, "step": 2784 }, { "epoch": 0.42911051212938006, "grad_norm": 0.7682204842567444, "learning_rate": 6.103841161140564e-05, "loss": 0.9275, "step": 2786 }, { "epoch": 0.4294185598767809, "grad_norm": 0.8520436882972717, "learning_rate": 6.099120869886573e-05, "loss": 1.2366, "step": 2788 }, { "epoch": 0.42972660762418174, "grad_norm": 0.9056306481361389, "learning_rate": 6.0943995490809403e-05, "loss": 1.9261, "step": 2790 }, { "epoch": 0.4300346553715826, "grad_norm": 0.8186244368553162, "learning_rate": 6.0896772031461514e-05, "loss": 2.4318, "step": 2792 }, { "epoch": 0.4303427031189834, "grad_norm": 0.9430123567581177, "learning_rate": 6.08495383650565e-05, "loss": 1.4405, "step": 2794 }, { "epoch": 0.43065075086638427, "grad_norm": 0.7463430166244507, "learning_rate": 6.0802294535838344e-05, "loss": 1.1768, "step": 2796 }, { "epoch": 0.4309587986137851, "grad_norm": 0.7996833324432373, "learning_rate": 6.0755040588060565e-05, "loss": 0.9411, "step": 2798 }, { "epoch": 0.431266846361186, "grad_norm": 0.6117491722106934, "learning_rate": 6.070777656598615e-05, "loss": 0.9498, "step": 2800 }, { "epoch": 0.43157489410858685, "grad_norm": 0.6472463607788086, "learning_rate": 6.066050251388754e-05, "loss": 0.9428, "step": 2802 }, { "epoch": 0.4318829418559877, "grad_norm": 1.8328849077224731, "learning_rate": 6.061321847604655e-05, "loss": 0.8237, "step": 2804 }, { "epoch": 0.43219098960338853, "grad_norm": 0.6823046207427979, "learning_rate": 6.0565924496754366e-05, "loss": 0.9588, "step": 2806 }, { "epoch": 0.4324990373507894, "grad_norm": 0.6461644172668457, "learning_rate": 6.0518620620311475e-05, "loss": 0.851, "step": 2808 }, { "epoch": 0.4328070850981902, "grad_norm": 0.7930482625961304, "learning_rate": 6.0471306891027637e-05, "loss": 0.9376, "step": 2810 }, { "epoch": 0.43311513284559106, "grad_norm": 0.5953826308250427, "learning_rate": 6.0423983353221836e-05, "loss": 1.0449, "step": 2812 }, { "epoch": 0.4334231805929919, "grad_norm": 0.8456257581710815, "learning_rate": 6.037665005122228e-05, "loss": 0.8364, "step": 2814 }, { "epoch": 0.43373122834039274, "grad_norm": 0.6633259654045105, "learning_rate": 6.032930702936626e-05, "loss": 0.9747, "step": 2816 }, { "epoch": 0.43403927608779364, "grad_norm": 0.7752382755279541, "learning_rate": 6.0281954332000226e-05, "loss": 1.7662, "step": 2818 }, { "epoch": 0.4343473238351945, "grad_norm": 0.766176164150238, "learning_rate": 6.023459200347964e-05, "loss": 1.1785, "step": 2820 }, { "epoch": 0.4346553715825953, "grad_norm": 0.698911726474762, "learning_rate": 6.018722008816905e-05, "loss": 1.3769, "step": 2822 }, { "epoch": 0.43496341932999616, "grad_norm": 0.6189733743667603, "learning_rate": 6.013983863044195e-05, "loss": 0.8285, "step": 2824 }, { "epoch": 0.435271467077397, "grad_norm": 0.9354870915412903, "learning_rate": 6.009244767468074e-05, "loss": 0.8931, "step": 2826 }, { "epoch": 0.43557951482479784, "grad_norm": 0.8708924651145935, "learning_rate": 6.004504726527679e-05, "loss": 0.9616, "step": 2828 }, { "epoch": 0.4358875625721987, "grad_norm": 1.112870693206787, "learning_rate": 5.999763744663024e-05, "loss": 1.2036, "step": 2830 }, { "epoch": 0.4361956103195995, "grad_norm": 0.7736754417419434, "learning_rate": 5.9950218263150114e-05, "loss": 1.0429, "step": 2832 }, { "epoch": 0.43650365806700037, "grad_norm": 0.9180028438568115, "learning_rate": 5.99027897592542e-05, "loss": 0.9176, "step": 2834 }, { "epoch": 0.4368117058144012, "grad_norm": 0.9358363747596741, "learning_rate": 5.985535197936896e-05, "loss": 1.0841, "step": 2836 }, { "epoch": 0.4371197535618021, "grad_norm": 0.8135294318199158, "learning_rate": 5.9807904967929605e-05, "loss": 1.0238, "step": 2838 }, { "epoch": 0.43742780130920295, "grad_norm": 0.8550823330879211, "learning_rate": 5.976044876937997e-05, "loss": 1.2206, "step": 2840 }, { "epoch": 0.4377358490566038, "grad_norm": 0.693490743637085, "learning_rate": 5.9712983428172494e-05, "loss": 0.9696, "step": 2842 }, { "epoch": 0.43804389680400463, "grad_norm": 0.7726758718490601, "learning_rate": 5.9665508988768185e-05, "loss": 0.9961, "step": 2844 }, { "epoch": 0.43835194455140547, "grad_norm": 0.6240848302841187, "learning_rate": 5.961802549563658e-05, "loss": 0.9851, "step": 2846 }, { "epoch": 0.4386599922988063, "grad_norm": 0.8528410196304321, "learning_rate": 5.957053299325566e-05, "loss": 0.9969, "step": 2848 }, { "epoch": 0.43896804004620715, "grad_norm": 0.8000697493553162, "learning_rate": 5.952303152611191e-05, "loss": 1.1812, "step": 2850 }, { "epoch": 0.439276087793608, "grad_norm": 0.6206786036491394, "learning_rate": 5.947552113870013e-05, "loss": 1.0304, "step": 2852 }, { "epoch": 0.43958413554100884, "grad_norm": 0.8891565203666687, "learning_rate": 5.942800187552359e-05, "loss": 1.0588, "step": 2854 }, { "epoch": 0.4398921832884097, "grad_norm": 0.584948718547821, "learning_rate": 5.938047378109373e-05, "loss": 1.0238, "step": 2856 }, { "epoch": 0.4402002310358106, "grad_norm": 0.502342164516449, "learning_rate": 5.93329368999304e-05, "loss": 1.6083, "step": 2858 }, { "epoch": 0.4405082787832114, "grad_norm": 0.7828280925750732, "learning_rate": 5.9285391276561565e-05, "loss": 1.109, "step": 2860 }, { "epoch": 0.44081632653061226, "grad_norm": 0.5425410270690918, "learning_rate": 5.9237836955523484e-05, "loss": 1.9821, "step": 2862 }, { "epoch": 0.4411243742780131, "grad_norm": 0.864776611328125, "learning_rate": 5.9190273981360454e-05, "loss": 2.4012, "step": 2864 }, { "epoch": 0.44143242202541394, "grad_norm": 0.77806156873703, "learning_rate": 5.9142702398624985e-05, "loss": 1.0941, "step": 2866 }, { "epoch": 0.4417404697728148, "grad_norm": 0.6679258942604065, "learning_rate": 5.909512225187759e-05, "loss": 0.8178, "step": 2868 }, { "epoch": 0.4420485175202156, "grad_norm": 1.0282930135726929, "learning_rate": 5.9047533585686776e-05, "loss": 0.9111, "step": 2870 }, { "epoch": 0.44235656526761646, "grad_norm": 0.9571512341499329, "learning_rate": 5.8999936444629125e-05, "loss": 0.9596, "step": 2872 }, { "epoch": 0.4426646130150173, "grad_norm": 0.5731912851333618, "learning_rate": 5.895233087328904e-05, "loss": 0.7965, "step": 2874 }, { "epoch": 0.4429726607624182, "grad_norm": 0.8772318959236145, "learning_rate": 5.890471691625894e-05, "loss": 1.019, "step": 2876 }, { "epoch": 0.44328070850981904, "grad_norm": 0.5373838543891907, "learning_rate": 5.8857094618138996e-05, "loss": 1.4404, "step": 2878 }, { "epoch": 0.4435887562572199, "grad_norm": 1.4746814966201782, "learning_rate": 5.8809464023537265e-05, "loss": 1.1947, "step": 2880 }, { "epoch": 0.4438968040046207, "grad_norm": 0.8003684878349304, "learning_rate": 5.876182517706954e-05, "loss": 2.4831, "step": 2882 }, { "epoch": 0.44420485175202157, "grad_norm": 0.7508583068847656, "learning_rate": 5.8714178123359345e-05, "loss": 0.97, "step": 2884 }, { "epoch": 0.4445128994994224, "grad_norm": 0.7142212986946106, "learning_rate": 5.8666522907037905e-05, "loss": 0.9413, "step": 2886 }, { "epoch": 0.44482094724682325, "grad_norm": 0.8980920910835266, "learning_rate": 5.8618859572744065e-05, "loss": 2.0577, "step": 2888 }, { "epoch": 0.4451289949942241, "grad_norm": 0.6517657041549683, "learning_rate": 5.8571188165124316e-05, "loss": 0.8379, "step": 2890 }, { "epoch": 0.44543704274162493, "grad_norm": 0.7201358675956726, "learning_rate": 5.852350872883267e-05, "loss": 2.1127, "step": 2892 }, { "epoch": 0.4457450904890258, "grad_norm": 0.876204252243042, "learning_rate": 5.847582130853068e-05, "loss": 0.9175, "step": 2894 }, { "epoch": 0.44605313823642667, "grad_norm": 0.7286810278892517, "learning_rate": 5.842812594888737e-05, "loss": 1.0932, "step": 2896 }, { "epoch": 0.4463611859838275, "grad_norm": 0.9662947654724121, "learning_rate": 5.838042269457924e-05, "loss": 0.9577, "step": 2898 }, { "epoch": 0.44666923373122835, "grad_norm": 0.8888316750526428, "learning_rate": 5.83327115902901e-05, "loss": 1.2033, "step": 2900 }, { "epoch": 0.4469772814786292, "grad_norm": 0.6747696399688721, "learning_rate": 5.8284992680711204e-05, "loss": 1.0227, "step": 2902 }, { "epoch": 0.44728532922603004, "grad_norm": 0.6601919531822205, "learning_rate": 5.8237266010541046e-05, "loss": 0.8986, "step": 2904 }, { "epoch": 0.4475933769734309, "grad_norm": 0.6246869564056396, "learning_rate": 5.818953162448545e-05, "loss": 0.94, "step": 2906 }, { "epoch": 0.4479014247208317, "grad_norm": 0.7867985963821411, "learning_rate": 5.814178956725742e-05, "loss": 0.8662, "step": 2908 }, { "epoch": 0.44820947246823256, "grad_norm": 0.7554641366004944, "learning_rate": 5.8094039883577164e-05, "loss": 1.655, "step": 2910 }, { "epoch": 0.4485175202156334, "grad_norm": 0.7745111584663391, "learning_rate": 5.804628261817204e-05, "loss": 0.9167, "step": 2912 }, { "epoch": 0.44882556796303424, "grad_norm": 0.5894488096237183, "learning_rate": 5.79985178157765e-05, "loss": 0.8354, "step": 2914 }, { "epoch": 0.44913361571043514, "grad_norm": 0.7678777575492859, "learning_rate": 5.7950745521132044e-05, "loss": 0.8144, "step": 2916 }, { "epoch": 0.449441663457836, "grad_norm": 0.6224768757820129, "learning_rate": 5.7902965778987215e-05, "loss": 0.8566, "step": 2918 }, { "epoch": 0.4497497112052368, "grad_norm": 0.9957372546195984, "learning_rate": 5.785517863409752e-05, "loss": 0.9116, "step": 2920 }, { "epoch": 0.45005775895263767, "grad_norm": 0.722830057144165, "learning_rate": 5.7807384131225395e-05, "loss": 0.9513, "step": 2922 }, { "epoch": 0.4503658067000385, "grad_norm": 0.7828933000564575, "learning_rate": 5.775958231514018e-05, "loss": 0.9646, "step": 2924 }, { "epoch": 0.45067385444743935, "grad_norm": 0.511208176612854, "learning_rate": 5.771177323061806e-05, "loss": 0.8199, "step": 2926 }, { "epoch": 0.4509819021948402, "grad_norm": 0.8397185206413269, "learning_rate": 5.766395692244202e-05, "loss": 1.0869, "step": 2928 }, { "epoch": 0.45128994994224103, "grad_norm": 0.7995164394378662, "learning_rate": 5.761613343540182e-05, "loss": 1.1447, "step": 2930 }, { "epoch": 0.4515979976896419, "grad_norm": 0.8521782159805298, "learning_rate": 5.756830281429395e-05, "loss": 0.9796, "step": 2932 }, { "epoch": 0.45190604543704277, "grad_norm": 0.8117061257362366, "learning_rate": 5.752046510392156e-05, "loss": 0.8261, "step": 2934 }, { "epoch": 0.4522140931844436, "grad_norm": 0.7211443781852722, "learning_rate": 5.747262034909446e-05, "loss": 1.2989, "step": 2936 }, { "epoch": 0.45252214093184445, "grad_norm": 1.1251598596572876, "learning_rate": 5.7424768594629094e-05, "loss": 1.1287, "step": 2938 }, { "epoch": 0.4528301886792453, "grad_norm": 0.7493430376052856, "learning_rate": 5.737690988534836e-05, "loss": 0.9913, "step": 2940 }, { "epoch": 0.45313823642664613, "grad_norm": 1.022343635559082, "learning_rate": 5.732904426608179e-05, "loss": 1.0708, "step": 2942 }, { "epoch": 0.453446284174047, "grad_norm": 0.7727839946746826, "learning_rate": 5.728117178166528e-05, "loss": 0.9274, "step": 2944 }, { "epoch": 0.4537543319214478, "grad_norm": 1.04244065284729, "learning_rate": 5.7233292476941245e-05, "loss": 1.0758, "step": 2946 }, { "epoch": 0.45406237966884866, "grad_norm": 0.7736064791679382, "learning_rate": 5.7185406396758445e-05, "loss": 0.9561, "step": 2948 }, { "epoch": 0.4543704274162495, "grad_norm": 0.9154314398765564, "learning_rate": 5.7137513585972e-05, "loss": 1.0392, "step": 2950 }, { "epoch": 0.45467847516365034, "grad_norm": 0.8307647109031677, "learning_rate": 5.708961408944333e-05, "loss": 0.837, "step": 2952 }, { "epoch": 0.45498652291105124, "grad_norm": 0.7958799600601196, "learning_rate": 5.704170795204009e-05, "loss": 1.1606, "step": 2954 }, { "epoch": 0.4552945706584521, "grad_norm": 0.8393613696098328, "learning_rate": 5.6993795218636215e-05, "loss": 0.9094, "step": 2956 }, { "epoch": 0.4556026184058529, "grad_norm": 0.6367685794830322, "learning_rate": 5.694587593411176e-05, "loss": 0.9539, "step": 2958 }, { "epoch": 0.45591066615325376, "grad_norm": 0.774171769618988, "learning_rate": 5.689795014335296e-05, "loss": 1.0237, "step": 2960 }, { "epoch": 0.4562187139006546, "grad_norm": 0.5494070649147034, "learning_rate": 5.6850017891252125e-05, "loss": 2.4835, "step": 2962 }, { "epoch": 0.45652676164805545, "grad_norm": 0.764487624168396, "learning_rate": 5.6802079222707614e-05, "loss": 1.0823, "step": 2964 }, { "epoch": 0.4568348093954563, "grad_norm": 0.6190123558044434, "learning_rate": 5.67541341826238e-05, "loss": 0.8227, "step": 2966 }, { "epoch": 0.45714285714285713, "grad_norm": 0.7573233842849731, "learning_rate": 5.6706182815911026e-05, "loss": 0.9032, "step": 2968 }, { "epoch": 0.45745090489025797, "grad_norm": 0.7121224999427795, "learning_rate": 5.665822516748557e-05, "loss": 0.9701, "step": 2970 }, { "epoch": 0.4577589526376588, "grad_norm": 0.8833084106445312, "learning_rate": 5.661026128226956e-05, "loss": 1.0514, "step": 2972 }, { "epoch": 0.4580670003850597, "grad_norm": 0.7853366136550903, "learning_rate": 5.656229120519102e-05, "loss": 0.7792, "step": 2974 }, { "epoch": 0.45837504813246055, "grad_norm": 0.7289166450500488, "learning_rate": 5.651431498118372e-05, "loss": 0.9208, "step": 2976 }, { "epoch": 0.4586830958798614, "grad_norm": 0.7399483323097229, "learning_rate": 5.6466332655187235e-05, "loss": 0.8391, "step": 2978 }, { "epoch": 0.45899114362726223, "grad_norm": 0.6576483249664307, "learning_rate": 5.6418344272146816e-05, "loss": 1.1026, "step": 2980 }, { "epoch": 0.4592991913746631, "grad_norm": 0.9064686894416809, "learning_rate": 5.6370349877013426e-05, "loss": 0.9908, "step": 2982 }, { "epoch": 0.4596072391220639, "grad_norm": 0.6922678351402283, "learning_rate": 5.632234951474361e-05, "loss": 1.0612, "step": 2984 }, { "epoch": 0.45991528686946476, "grad_norm": 0.6031603217124939, "learning_rate": 5.6274343230299566e-05, "loss": 1.0215, "step": 2986 }, { "epoch": 0.4602233346168656, "grad_norm": 0.7492870092391968, "learning_rate": 5.622633106864895e-05, "loss": 0.8476, "step": 2988 }, { "epoch": 0.46053138236426644, "grad_norm": 0.9854443669319153, "learning_rate": 5.617831307476503e-05, "loss": 1.1229, "step": 2990 }, { "epoch": 0.4608394301116673, "grad_norm": 0.8583337068557739, "learning_rate": 5.613028929362647e-05, "loss": 0.9634, "step": 2992 }, { "epoch": 0.4611474778590682, "grad_norm": 0.7198197841644287, "learning_rate": 5.6082259770217363e-05, "loss": 0.931, "step": 2994 }, { "epoch": 0.461455525606469, "grad_norm": 0.8717345595359802, "learning_rate": 5.603422454952719e-05, "loss": 1.1268, "step": 2996 }, { "epoch": 0.46176357335386986, "grad_norm": 0.558197557926178, "learning_rate": 5.598618367655075e-05, "loss": 0.8507, "step": 2998 }, { "epoch": 0.4620716211012707, "grad_norm": 0.8464396595954895, "learning_rate": 5.593813719628819e-05, "loss": 1.0359, "step": 3000 }, { "epoch": 0.4620716211012707, "eval_loss": 2.401945114135742, "eval_runtime": 736.2006, "eval_samples_per_second": 2.717, "eval_steps_per_second": 0.679, "step": 3000 }, { "epoch": 0.46237966884867154, "grad_norm": 0.8869822025299072, "learning_rate": 5.589008515374484e-05, "loss": 1.1223, "step": 3002 }, { "epoch": 0.4626877165960724, "grad_norm": 0.6646120548248291, "learning_rate": 5.584202759393128e-05, "loss": 0.9921, "step": 3004 }, { "epoch": 0.4629957643434732, "grad_norm": 0.8584650158882141, "learning_rate": 5.5793964561863256e-05, "loss": 1.1436, "step": 3006 }, { "epoch": 0.46330381209087407, "grad_norm": 0.6617639660835266, "learning_rate": 5.5745896102561636e-05, "loss": 0.8618, "step": 3008 }, { "epoch": 0.4636118598382749, "grad_norm": 0.825599193572998, "learning_rate": 5.569782226105236e-05, "loss": 0.9942, "step": 3010 }, { "epoch": 0.4639199075856758, "grad_norm": 0.6756584644317627, "learning_rate": 5.564974308236642e-05, "loss": 0.9885, "step": 3012 }, { "epoch": 0.46422795533307665, "grad_norm": 0.6235724687576294, "learning_rate": 5.560165861153982e-05, "loss": 0.7758, "step": 3014 }, { "epoch": 0.4645360030804775, "grad_norm": 1.003424882888794, "learning_rate": 5.555356889361349e-05, "loss": 0.7478, "step": 3016 }, { "epoch": 0.46484405082787833, "grad_norm": 0.6220825910568237, "learning_rate": 5.55054739736333e-05, "loss": 2.3584, "step": 3018 }, { "epoch": 0.46515209857527917, "grad_norm": 0.7365529537200928, "learning_rate": 5.545737389664999e-05, "loss": 1.0472, "step": 3020 }, { "epoch": 0.46546014632268, "grad_norm": 1.023934006690979, "learning_rate": 5.540926870771913e-05, "loss": 0.9631, "step": 3022 }, { "epoch": 0.46576819407008085, "grad_norm": 1.0248138904571533, "learning_rate": 5.536115845190105e-05, "loss": 1.0431, "step": 3024 }, { "epoch": 0.4660762418174817, "grad_norm": 0.6595494747161865, "learning_rate": 5.531304317426089e-05, "loss": 0.9056, "step": 3026 }, { "epoch": 0.46638428956488254, "grad_norm": 0.7172659039497375, "learning_rate": 5.526492291986841e-05, "loss": 1.0834, "step": 3028 }, { "epoch": 0.4666923373122834, "grad_norm": 0.8179621696472168, "learning_rate": 5.521679773379812e-05, "loss": 0.9538, "step": 3030 }, { "epoch": 0.4670003850596843, "grad_norm": 0.6643562912940979, "learning_rate": 5.516866766112908e-05, "loss": 0.9234, "step": 3032 }, { "epoch": 0.4673084328070851, "grad_norm": 0.8943853974342346, "learning_rate": 5.5120532746944955e-05, "loss": 1.2507, "step": 3034 }, { "epoch": 0.46761648055448596, "grad_norm": 0.8801318407058716, "learning_rate": 5.507239303633396e-05, "loss": 1.0128, "step": 3036 }, { "epoch": 0.4679245283018868, "grad_norm": 1.0031734704971313, "learning_rate": 5.502424857438876e-05, "loss": 1.0573, "step": 3038 }, { "epoch": 0.46823257604928764, "grad_norm": 0.686630129814148, "learning_rate": 5.4976099406206516e-05, "loss": 0.9245, "step": 3040 }, { "epoch": 0.4685406237966885, "grad_norm": 0.4664287269115448, "learning_rate": 5.492794557688877e-05, "loss": 1.6719, "step": 3042 }, { "epoch": 0.4688486715440893, "grad_norm": 0.868158221244812, "learning_rate": 5.487978713154144e-05, "loss": 1.6452, "step": 3044 }, { "epoch": 0.46915671929149017, "grad_norm": 0.7039262056350708, "learning_rate": 5.483162411527477e-05, "loss": 0.9206, "step": 3046 }, { "epoch": 0.469464767038891, "grad_norm": 0.7789177894592285, "learning_rate": 5.4783456573203283e-05, "loss": 0.9925, "step": 3048 }, { "epoch": 0.46977281478629185, "grad_norm": 0.8716445565223694, "learning_rate": 5.473528455044572e-05, "loss": 1.0072, "step": 3050 }, { "epoch": 0.47008086253369274, "grad_norm": 0.786055862903595, "learning_rate": 5.4687108092125074e-05, "loss": 1.162, "step": 3052 }, { "epoch": 0.4703889102810936, "grad_norm": 0.9024592638015747, "learning_rate": 5.463892724336843e-05, "loss": 1.053, "step": 3054 }, { "epoch": 0.4706969580284944, "grad_norm": 1.0428143739700317, "learning_rate": 5.459074204930703e-05, "loss": 0.9698, "step": 3056 }, { "epoch": 0.47100500577589527, "grad_norm": 0.6836503148078918, "learning_rate": 5.454255255507615e-05, "loss": 1.0836, "step": 3058 }, { "epoch": 0.4713130535232961, "grad_norm": 0.7136745452880859, "learning_rate": 5.449435880581513e-05, "loss": 0.9677, "step": 3060 }, { "epoch": 0.47162110127069695, "grad_norm": 0.9150458574295044, "learning_rate": 5.444616084666729e-05, "loss": 0.9073, "step": 3062 }, { "epoch": 0.4719291490180978, "grad_norm": 0.7028246521949768, "learning_rate": 5.439795872277985e-05, "loss": 1.0398, "step": 3064 }, { "epoch": 0.47223719676549863, "grad_norm": 1.0035873651504517, "learning_rate": 5.4349752479304e-05, "loss": 1.3181, "step": 3066 }, { "epoch": 0.4725452445128995, "grad_norm": 0.91478031873703, "learning_rate": 5.430154216139471e-05, "loss": 1.049, "step": 3068 }, { "epoch": 0.4728532922603004, "grad_norm": 0.8790503740310669, "learning_rate": 5.425332781421085e-05, "loss": 0.8075, "step": 3070 }, { "epoch": 0.4731613400077012, "grad_norm": 1.371443271636963, "learning_rate": 5.4205109482915017e-05, "loss": 1.3958, "step": 3072 }, { "epoch": 0.47346938775510206, "grad_norm": 0.630165159702301, "learning_rate": 5.4156887212673535e-05, "loss": 1.0174, "step": 3074 }, { "epoch": 0.4737774355025029, "grad_norm": 0.5171129107475281, "learning_rate": 5.410866104865643e-05, "loss": 0.9023, "step": 3076 }, { "epoch": 0.47408548324990374, "grad_norm": 0.7049612998962402, "learning_rate": 5.4060431036037376e-05, "loss": 0.778, "step": 3078 }, { "epoch": 0.4743935309973046, "grad_norm": 0.6293292045593262, "learning_rate": 5.401219721999364e-05, "loss": 0.9847, "step": 3080 }, { "epoch": 0.4747015787447054, "grad_norm": 0.6555522084236145, "learning_rate": 5.3963959645706085e-05, "loss": 1.0141, "step": 3082 }, { "epoch": 0.47500962649210626, "grad_norm": 0.8215299248695374, "learning_rate": 5.3915718358359066e-05, "loss": 1.0369, "step": 3084 }, { "epoch": 0.4753176742395071, "grad_norm": 0.8922412395477295, "learning_rate": 5.386747340314041e-05, "loss": 0.9832, "step": 3086 }, { "epoch": 0.47562572198690795, "grad_norm": 0.6807952523231506, "learning_rate": 5.38192248252414e-05, "loss": 0.9139, "step": 3088 }, { "epoch": 0.47593376973430884, "grad_norm": 0.6609580516815186, "learning_rate": 5.37709726698567e-05, "loss": 1.0296, "step": 3090 }, { "epoch": 0.4762418174817097, "grad_norm": 1.3963803052902222, "learning_rate": 5.372271698218433e-05, "loss": 1.0484, "step": 3092 }, { "epoch": 0.4765498652291105, "grad_norm": 0.6297670602798462, "learning_rate": 5.367445780742559e-05, "loss": 1.0407, "step": 3094 }, { "epoch": 0.47685791297651137, "grad_norm": 0.622728705406189, "learning_rate": 5.362619519078514e-05, "loss": 0.9035, "step": 3096 }, { "epoch": 0.4771659607239122, "grad_norm": 0.5931330919265747, "learning_rate": 5.3577929177470757e-05, "loss": 0.8357, "step": 3098 }, { "epoch": 0.47747400847131305, "grad_norm": 0.9229997992515564, "learning_rate": 5.352965981269342e-05, "loss": 0.9726, "step": 3100 }, { "epoch": 0.4777820562187139, "grad_norm": 0.7359765768051147, "learning_rate": 5.348138714166731e-05, "loss": 1.0161, "step": 3102 }, { "epoch": 0.47809010396611473, "grad_norm": 0.929198682308197, "learning_rate": 5.343311120960962e-05, "loss": 0.8967, "step": 3104 }, { "epoch": 0.4783981517135156, "grad_norm": 0.8329547047615051, "learning_rate": 5.33848320617407e-05, "loss": 1.2662, "step": 3106 }, { "epoch": 0.4787061994609164, "grad_norm": 0.9614522457122803, "learning_rate": 5.333654974328378e-05, "loss": 1.0578, "step": 3108 }, { "epoch": 0.4790142472083173, "grad_norm": 0.6592017412185669, "learning_rate": 5.3288264299465196e-05, "loss": 1.0484, "step": 3110 }, { "epoch": 0.47932229495571815, "grad_norm": 0.906784176826477, "learning_rate": 5.3239975775514097e-05, "loss": 1.086, "step": 3112 }, { "epoch": 0.479630342703119, "grad_norm": 0.7839928269386292, "learning_rate": 5.319168421666261e-05, "loss": 0.8187, "step": 3114 }, { "epoch": 0.47993839045051984, "grad_norm": 0.756097137928009, "learning_rate": 5.314338966814564e-05, "loss": 0.9537, "step": 3116 }, { "epoch": 0.4802464381979207, "grad_norm": 0.846994936466217, "learning_rate": 5.309509217520092e-05, "loss": 1.0448, "step": 3118 }, { "epoch": 0.4805544859453215, "grad_norm": 0.5622255802154541, "learning_rate": 5.304679178306894e-05, "loss": 0.8573, "step": 3120 }, { "epoch": 0.48086253369272236, "grad_norm": 0.6583067774772644, "learning_rate": 5.2998488536992906e-05, "loss": 0.9072, "step": 3122 }, { "epoch": 0.4811705814401232, "grad_norm": 0.6179800629615784, "learning_rate": 5.295018248221868e-05, "loss": 1.9247, "step": 3124 }, { "epoch": 0.48147862918752404, "grad_norm": 0.759346604347229, "learning_rate": 5.290187366399478e-05, "loss": 0.8023, "step": 3126 }, { "epoch": 0.48178667693492494, "grad_norm": 1.0747575759887695, "learning_rate": 5.285356212757231e-05, "loss": 0.8941, "step": 3128 }, { "epoch": 0.4820947246823258, "grad_norm": 0.7229118943214417, "learning_rate": 5.280524791820488e-05, "loss": 0.8908, "step": 3130 }, { "epoch": 0.4824027724297266, "grad_norm": 0.9550780653953552, "learning_rate": 5.275693108114868e-05, "loss": 0.8559, "step": 3132 }, { "epoch": 0.48271082017712746, "grad_norm": 0.9661011099815369, "learning_rate": 5.2708611661662256e-05, "loss": 1.0929, "step": 3134 }, { "epoch": 0.4830188679245283, "grad_norm": 1.5283000469207764, "learning_rate": 5.2660289705006696e-05, "loss": 0.9719, "step": 3136 }, { "epoch": 0.48332691567192915, "grad_norm": 0.9643043279647827, "learning_rate": 5.261196525644535e-05, "loss": 1.1199, "step": 3138 }, { "epoch": 0.48363496341933, "grad_norm": 0.7876790761947632, "learning_rate": 5.2563638361244004e-05, "loss": 0.8679, "step": 3140 }, { "epoch": 0.48394301116673083, "grad_norm": 0.7540126442909241, "learning_rate": 5.251530906467065e-05, "loss": 0.9442, "step": 3142 }, { "epoch": 0.48425105891413167, "grad_norm": 0.7319709062576294, "learning_rate": 5.2466977411995567e-05, "loss": 0.8821, "step": 3144 }, { "epoch": 0.4845591066615325, "grad_norm": 0.8169276118278503, "learning_rate": 5.2418643448491265e-05, "loss": 1.0041, "step": 3146 }, { "epoch": 0.4848671544089334, "grad_norm": 0.8061478137969971, "learning_rate": 5.237030721943236e-05, "loss": 0.9777, "step": 3148 }, { "epoch": 0.48517520215633425, "grad_norm": 0.8491753339767456, "learning_rate": 5.2321968770095654e-05, "loss": 1.2202, "step": 3150 }, { "epoch": 0.4854832499037351, "grad_norm": 0.7268081307411194, "learning_rate": 5.2273628145759954e-05, "loss": 0.8325, "step": 3152 }, { "epoch": 0.48579129765113593, "grad_norm": 1.062333583831787, "learning_rate": 5.2225285391706194e-05, "loss": 0.768, "step": 3154 }, { "epoch": 0.4860993453985368, "grad_norm": 0.8420694470405579, "learning_rate": 5.217694055321724e-05, "loss": 1.969, "step": 3156 }, { "epoch": 0.4864073931459376, "grad_norm": 0.6028314828872681, "learning_rate": 5.212859367557793e-05, "loss": 0.8924, "step": 3158 }, { "epoch": 0.48671544089333846, "grad_norm": 0.5836811065673828, "learning_rate": 5.2080244804075e-05, "loss": 1.2632, "step": 3160 }, { "epoch": 0.4870234886407393, "grad_norm": 0.6890228390693665, "learning_rate": 5.203189398399707e-05, "loss": 1.0361, "step": 3162 }, { "epoch": 0.48733153638814014, "grad_norm": 0.8328240513801575, "learning_rate": 5.1983541260634586e-05, "loss": 1.0121, "step": 3164 }, { "epoch": 0.487639584135541, "grad_norm": 0.7056050896644592, "learning_rate": 5.1935186679279745e-05, "loss": 0.9742, "step": 3166 }, { "epoch": 0.4879476318829419, "grad_norm": 0.6008009314537048, "learning_rate": 5.188683028522654e-05, "loss": 0.8679, "step": 3168 }, { "epoch": 0.4882556796303427, "grad_norm": 0.766815185546875, "learning_rate": 5.183847212377061e-05, "loss": 0.8381, "step": 3170 }, { "epoch": 0.48856372737774356, "grad_norm": 0.7284615635871887, "learning_rate": 5.179011224020928e-05, "loss": 0.8723, "step": 3172 }, { "epoch": 0.4888717751251444, "grad_norm": 0.7973135709762573, "learning_rate": 5.174175067984145e-05, "loss": 0.8574, "step": 3174 }, { "epoch": 0.48917982287254524, "grad_norm": 0.8865051865577698, "learning_rate": 5.169338748796767e-05, "loss": 1.1238, "step": 3176 }, { "epoch": 0.4894878706199461, "grad_norm": 0.6385417580604553, "learning_rate": 5.164502270988992e-05, "loss": 0.7434, "step": 3178 }, { "epoch": 0.4897959183673469, "grad_norm": 0.7411341071128845, "learning_rate": 5.1596656390911756e-05, "loss": 0.9213, "step": 3180 }, { "epoch": 0.49010396611474777, "grad_norm": 0.49342554807662964, "learning_rate": 5.15482885763381e-05, "loss": 0.8866, "step": 3182 }, { "epoch": 0.4904120138621486, "grad_norm": 0.8545454144477844, "learning_rate": 5.149991931147531e-05, "loss": 0.9304, "step": 3184 }, { "epoch": 0.4907200616095495, "grad_norm": 0.9464917778968811, "learning_rate": 5.145154864163114e-05, "loss": 1.0994, "step": 3186 }, { "epoch": 0.49102810935695035, "grad_norm": 0.7000471949577332, "learning_rate": 5.140317661211457e-05, "loss": 1.1924, "step": 3188 }, { "epoch": 0.4913361571043512, "grad_norm": 0.6230635643005371, "learning_rate": 5.135480326823594e-05, "loss": 0.9787, "step": 3190 }, { "epoch": 0.49164420485175203, "grad_norm": 0.5892027020454407, "learning_rate": 5.130642865530676e-05, "loss": 0.9198, "step": 3192 }, { "epoch": 0.4919522525991529, "grad_norm": 0.6804258823394775, "learning_rate": 5.12580528186398e-05, "loss": 1.0212, "step": 3194 }, { "epoch": 0.4922603003465537, "grad_norm": 0.5627198219299316, "learning_rate": 5.1209675803548875e-05, "loss": 0.8279, "step": 3196 }, { "epoch": 0.49256834809395456, "grad_norm": 0.6895137429237366, "learning_rate": 5.116129765534899e-05, "loss": 0.8793, "step": 3198 }, { "epoch": 0.4928763958413554, "grad_norm": 0.6826330423355103, "learning_rate": 5.111291841935619e-05, "loss": 1.0857, "step": 3200 }, { "epoch": 0.49318444358875624, "grad_norm": 1.6944973468780518, "learning_rate": 5.106453814088753e-05, "loss": 1.0824, "step": 3202 }, { "epoch": 0.4934924913361571, "grad_norm": 0.7384228706359863, "learning_rate": 5.101615686526102e-05, "loss": 2.2589, "step": 3204 }, { "epoch": 0.493800539083558, "grad_norm": 0.6678666472434998, "learning_rate": 5.096777463779565e-05, "loss": 1.6213, "step": 3206 }, { "epoch": 0.4941085868309588, "grad_norm": 0.9313806891441345, "learning_rate": 5.091939150381127e-05, "loss": 0.8571, "step": 3208 }, { "epoch": 0.49441663457835966, "grad_norm": 0.7525261044502258, "learning_rate": 5.087100750862857e-05, "loss": 1.0053, "step": 3210 }, { "epoch": 0.4947246823257605, "grad_norm": 0.7191951274871826, "learning_rate": 5.082262269756909e-05, "loss": 0.961, "step": 3212 }, { "epoch": 0.49503273007316134, "grad_norm": 0.5987869501113892, "learning_rate": 5.0774237115955084e-05, "loss": 0.9717, "step": 3214 }, { "epoch": 0.4953407778205622, "grad_norm": 0.7908967137336731, "learning_rate": 5.072585080910958e-05, "loss": 0.7216, "step": 3216 }, { "epoch": 0.495648825567963, "grad_norm": 0.8970842957496643, "learning_rate": 5.067746382235622e-05, "loss": 0.9381, "step": 3218 }, { "epoch": 0.49595687331536387, "grad_norm": 1.010067105293274, "learning_rate": 5.0629076201019364e-05, "loss": 0.9412, "step": 3220 }, { "epoch": 0.4962649210627647, "grad_norm": 0.6909099221229553, "learning_rate": 5.058068799042387e-05, "loss": 0.7825, "step": 3222 }, { "epoch": 0.49657296881016555, "grad_norm": 0.8686575293540955, "learning_rate": 5.053229923589526e-05, "loss": 0.8715, "step": 3224 }, { "epoch": 0.49688101655756645, "grad_norm": 0.6410868167877197, "learning_rate": 5.048390998275947e-05, "loss": 0.8294, "step": 3226 }, { "epoch": 0.4971890643049673, "grad_norm": 0.7121363878250122, "learning_rate": 5.043552027634293e-05, "loss": 1.1581, "step": 3228 }, { "epoch": 0.49749711205236813, "grad_norm": 0.8368136882781982, "learning_rate": 5.0387130161972526e-05, "loss": 1.0451, "step": 3230 }, { "epoch": 0.49780515979976897, "grad_norm": 0.6324896216392517, "learning_rate": 5.0338739684975486e-05, "loss": 0.9246, "step": 3232 }, { "epoch": 0.4981132075471698, "grad_norm": 0.6032724976539612, "learning_rate": 5.029034889067943e-05, "loss": 0.9274, "step": 3234 }, { "epoch": 0.49842125529457065, "grad_norm": 0.8420709371566772, "learning_rate": 5.024195782441219e-05, "loss": 0.9483, "step": 3236 }, { "epoch": 0.4987293030419715, "grad_norm": 0.6967646479606628, "learning_rate": 5.0193566531501946e-05, "loss": 1.0132, "step": 3238 }, { "epoch": 0.49903735078937234, "grad_norm": 0.6845428347587585, "learning_rate": 5.014517505727702e-05, "loss": 1.0773, "step": 3240 }, { "epoch": 0.4993453985367732, "grad_norm": 0.6973444223403931, "learning_rate": 5.0096783447065946e-05, "loss": 0.9261, "step": 3242 }, { "epoch": 0.4996534462841741, "grad_norm": 1.047443151473999, "learning_rate": 5.004839174619736e-05, "loss": 1.0449, "step": 3244 }, { "epoch": 0.4999614940315749, "grad_norm": 0.8661796450614929, "learning_rate": 5e-05, "loss": 0.9442, "step": 3246 }, { "epoch": 0.5002695417789758, "grad_norm": 0.9085307121276855, "learning_rate": 4.995160825380265e-05, "loss": 0.9839, "step": 3248 }, { "epoch": 0.5005775895263765, "grad_norm": 0.7589093446731567, "learning_rate": 4.990321655293406e-05, "loss": 1.0097, "step": 3250 }, { "epoch": 0.5008856372737774, "grad_norm": 0.6354397535324097, "learning_rate": 4.985482494272299e-05, "loss": 0.9261, "step": 3252 }, { "epoch": 0.5011936850211783, "grad_norm": 1.0207045078277588, "learning_rate": 4.980643346849807e-05, "loss": 1.218, "step": 3254 }, { "epoch": 0.5015017327685791, "grad_norm": 0.7316938638687134, "learning_rate": 4.9758042175587824e-05, "loss": 0.9952, "step": 3256 }, { "epoch": 0.50180978051598, "grad_norm": 0.7260974645614624, "learning_rate": 4.9709651109320575e-05, "loss": 1.122, "step": 3258 }, { "epoch": 0.5021178282633808, "grad_norm": 0.6711363792419434, "learning_rate": 4.966126031502452e-05, "loss": 1.4948, "step": 3260 }, { "epoch": 0.5024258760107817, "grad_norm": 1.114310622215271, "learning_rate": 4.9612869838027485e-05, "loss": 1.2719, "step": 3262 }, { "epoch": 0.5027339237581825, "grad_norm": 0.6233516335487366, "learning_rate": 4.9564479723657075e-05, "loss": 0.9378, "step": 3264 }, { "epoch": 0.5030419715055834, "grad_norm": 0.9802983403205872, "learning_rate": 4.951609001724054e-05, "loss": 1.0383, "step": 3266 }, { "epoch": 0.5033500192529842, "grad_norm": 0.9117656350135803, "learning_rate": 4.9467700764104756e-05, "loss": 1.0586, "step": 3268 }, { "epoch": 0.5036580670003851, "grad_norm": 0.836900532245636, "learning_rate": 4.941931200957612e-05, "loss": 1.4957, "step": 3270 }, { "epoch": 0.5039661147477859, "grad_norm": 0.8332207798957825, "learning_rate": 4.937092379898065e-05, "loss": 1.1056, "step": 3272 }, { "epoch": 0.5042741624951868, "grad_norm": 0.8060922026634216, "learning_rate": 4.9322536177643794e-05, "loss": 1.0183, "step": 3274 }, { "epoch": 0.5045822102425876, "grad_norm": 0.8477973937988281, "learning_rate": 4.927414919089045e-05, "loss": 1.0103, "step": 3276 }, { "epoch": 0.5048902579899884, "grad_norm": 2.138157606124878, "learning_rate": 4.922576288404492e-05, "loss": 0.9738, "step": 3278 }, { "epoch": 0.5051983057373893, "grad_norm": 0.9014928936958313, "learning_rate": 4.917737730243093e-05, "loss": 0.9046, "step": 3280 }, { "epoch": 0.5055063534847901, "grad_norm": 0.8116535544395447, "learning_rate": 4.912899249137145e-05, "loss": 0.8913, "step": 3282 }, { "epoch": 0.505814401232191, "grad_norm": 0.7429752349853516, "learning_rate": 4.908060849618875e-05, "loss": 0.9569, "step": 3284 }, { "epoch": 0.5061224489795918, "grad_norm": 0.6032552123069763, "learning_rate": 4.9032225362204356e-05, "loss": 0.8054, "step": 3286 }, { "epoch": 0.5064304967269927, "grad_norm": 0.8589087724685669, "learning_rate": 4.898384313473899e-05, "loss": 2.1833, "step": 3288 }, { "epoch": 0.5067385444743935, "grad_norm": 0.8798043727874756, "learning_rate": 4.893546185911247e-05, "loss": 0.9825, "step": 3290 }, { "epoch": 0.5070465922217944, "grad_norm": 0.7908830046653748, "learning_rate": 4.888708158064381e-05, "loss": 1.2025, "step": 3292 }, { "epoch": 0.5073546399691953, "grad_norm": 0.9762030243873596, "learning_rate": 4.8838702344651014e-05, "loss": 1.0325, "step": 3294 }, { "epoch": 0.5076626877165961, "grad_norm": 0.8435834646224976, "learning_rate": 4.879032419645114e-05, "loss": 0.9938, "step": 3296 }, { "epoch": 0.507970735463997, "grad_norm": 0.754014253616333, "learning_rate": 4.8741947181360213e-05, "loss": 0.9847, "step": 3298 }, { "epoch": 0.5082787832113977, "grad_norm": 0.9299498796463013, "learning_rate": 4.869357134469325e-05, "loss": 0.8945, "step": 3300 }, { "epoch": 0.5082787832113977, "eval_loss": 2.3888819217681885, "eval_runtime": 736.18, "eval_samples_per_second": 2.717, "eval_steps_per_second": 0.679, "step": 3300 }, { "epoch": 0.5085868309587986, "grad_norm": 0.606257975101471, "learning_rate": 4.864519673176408e-05, "loss": 1.0335, "step": 3302 }, { "epoch": 0.5088948787061994, "grad_norm": 0.8606283068656921, "learning_rate": 4.8596823387885435e-05, "loss": 1.1664, "step": 3304 }, { "epoch": 0.5092029264536003, "grad_norm": 0.657559335231781, "learning_rate": 4.8548451358368876e-05, "loss": 0.8417, "step": 3306 }, { "epoch": 0.5095109742010011, "grad_norm": 0.7792297005653381, "learning_rate": 4.8500080688524696e-05, "loss": 1.0581, "step": 3308 }, { "epoch": 0.509819021948402, "grad_norm": 0.8086965084075928, "learning_rate": 4.8451711423661905e-05, "loss": 1.0145, "step": 3310 }, { "epoch": 0.5101270696958029, "grad_norm": 0.7681042551994324, "learning_rate": 4.8403343609088255e-05, "loss": 1.1651, "step": 3312 }, { "epoch": 0.5104351174432037, "grad_norm": 0.6708490252494812, "learning_rate": 4.835497729011009e-05, "loss": 2.3072, "step": 3314 }, { "epoch": 0.5107431651906046, "grad_norm": 0.6013543009757996, "learning_rate": 4.830661251203235e-05, "loss": 1.064, "step": 3316 }, { "epoch": 0.5110512129380054, "grad_norm": 0.9775047898292542, "learning_rate": 4.825824932015855e-05, "loss": 2.4686, "step": 3318 }, { "epoch": 0.5113592606854063, "grad_norm": 0.551460325717926, "learning_rate": 4.820988775979074e-05, "loss": 0.8143, "step": 3320 }, { "epoch": 0.511667308432807, "grad_norm": 0.9195789694786072, "learning_rate": 4.816152787622941e-05, "loss": 0.9829, "step": 3322 }, { "epoch": 0.511975356180208, "grad_norm": 1.0899276733398438, "learning_rate": 4.811316971477346e-05, "loss": 1.2838, "step": 3324 }, { "epoch": 0.5122834039276087, "grad_norm": 0.8045056462287903, "learning_rate": 4.806481332072027e-05, "loss": 1.0896, "step": 3326 }, { "epoch": 0.5125914516750096, "grad_norm": 0.6057413220405579, "learning_rate": 4.801645873936543e-05, "loss": 1.0034, "step": 3328 }, { "epoch": 0.5128994994224104, "grad_norm": 0.8745119571685791, "learning_rate": 4.796810601600293e-05, "loss": 1.0275, "step": 3330 }, { "epoch": 0.5132075471698113, "grad_norm": 0.6802326440811157, "learning_rate": 4.7919755195925014e-05, "loss": 1.0014, "step": 3332 }, { "epoch": 0.5135155949172122, "grad_norm": 0.7547101378440857, "learning_rate": 4.787140632442208e-05, "loss": 1.0106, "step": 3334 }, { "epoch": 0.513823642664613, "grad_norm": 0.883599579334259, "learning_rate": 4.782305944678277e-05, "loss": 0.9676, "step": 3336 }, { "epoch": 0.5141316904120139, "grad_norm": 0.9065552353858948, "learning_rate": 4.7774714608293804e-05, "loss": 1.0767, "step": 3338 }, { "epoch": 0.5144397381594147, "grad_norm": 0.8647933006286621, "learning_rate": 4.772637185424005e-05, "loss": 1.0344, "step": 3340 }, { "epoch": 0.5147477859068156, "grad_norm": 0.7071940302848816, "learning_rate": 4.767803122990437e-05, "loss": 1.0686, "step": 3342 }, { "epoch": 0.5150558336542164, "grad_norm": 0.70546954870224, "learning_rate": 4.762969278056765e-05, "loss": 0.9587, "step": 3344 }, { "epoch": 0.5153638814016173, "grad_norm": 0.7491280436515808, "learning_rate": 4.758135655150875e-05, "loss": 0.9854, "step": 3346 }, { "epoch": 0.515671929149018, "grad_norm": 0.647142767906189, "learning_rate": 4.7533022588004445e-05, "loss": 0.8884, "step": 3348 }, { "epoch": 0.515979976896419, "grad_norm": 0.8309057354927063, "learning_rate": 4.748469093532936e-05, "loss": 0.8062, "step": 3350 }, { "epoch": 0.5162880246438198, "grad_norm": 0.6286987066268921, "learning_rate": 4.743636163875601e-05, "loss": 0.8235, "step": 3352 }, { "epoch": 0.5165960723912206, "grad_norm": 0.6637413501739502, "learning_rate": 4.738803474355466e-05, "loss": 0.9696, "step": 3354 }, { "epoch": 0.5169041201386215, "grad_norm": 0.8996312618255615, "learning_rate": 4.733971029499333e-05, "loss": 0.9661, "step": 3356 }, { "epoch": 0.5172121678860223, "grad_norm": 0.6931231021881104, "learning_rate": 4.729138833833774e-05, "loss": 1.0826, "step": 3358 }, { "epoch": 0.5175202156334232, "grad_norm": 0.8037751317024231, "learning_rate": 4.724306891885134e-05, "loss": 1.4519, "step": 3360 }, { "epoch": 0.517828263380824, "grad_norm": 0.6060133576393127, "learning_rate": 4.719475208179513e-05, "loss": 0.9653, "step": 3362 }, { "epoch": 0.5181363111282249, "grad_norm": 0.8463549017906189, "learning_rate": 4.7146437872427694e-05, "loss": 0.8843, "step": 3364 }, { "epoch": 0.5184443588756257, "grad_norm": 0.6483150124549866, "learning_rate": 4.7098126336005224e-05, "loss": 0.7722, "step": 3366 }, { "epoch": 0.5187524066230266, "grad_norm": 1.063178300857544, "learning_rate": 4.7049817517781325e-05, "loss": 0.9337, "step": 3368 }, { "epoch": 0.5190604543704275, "grad_norm": 0.760132372379303, "learning_rate": 4.700151146300711e-05, "loss": 1.0166, "step": 3370 }, { "epoch": 0.5193685021178283, "grad_norm": 0.6914016008377075, "learning_rate": 4.6953208216931065e-05, "loss": 1.8584, "step": 3372 }, { "epoch": 0.5196765498652292, "grad_norm": 0.6557642817497253, "learning_rate": 4.690490782479909e-05, "loss": 0.8962, "step": 3374 }, { "epoch": 0.5199845976126299, "grad_norm": 0.7154399156570435, "learning_rate": 4.685661033185437e-05, "loss": 1.5047, "step": 3376 }, { "epoch": 0.5202926453600308, "grad_norm": 0.6547550559043884, "learning_rate": 4.6808315783337396e-05, "loss": 0.9173, "step": 3378 }, { "epoch": 0.5206006931074316, "grad_norm": 1.0237712860107422, "learning_rate": 4.6760024224485915e-05, "loss": 0.9651, "step": 3380 }, { "epoch": 0.5209087408548325, "grad_norm": 0.8532687425613403, "learning_rate": 4.671173570053483e-05, "loss": 0.7217, "step": 3382 }, { "epoch": 0.5212167886022333, "grad_norm": 0.9705612063407898, "learning_rate": 4.6663450256716226e-05, "loss": 1.9736, "step": 3384 }, { "epoch": 0.5215248363496342, "grad_norm": 0.9786109924316406, "learning_rate": 4.661516793825932e-05, "loss": 0.9056, "step": 3386 }, { "epoch": 0.521832884097035, "grad_norm": 0.6195076704025269, "learning_rate": 4.656688879039039e-05, "loss": 0.8693, "step": 3388 }, { "epoch": 0.5221409318444359, "grad_norm": 1.7733471393585205, "learning_rate": 4.651861285833272e-05, "loss": 1.181, "step": 3390 }, { "epoch": 0.5224489795918368, "grad_norm": 0.632230818271637, "learning_rate": 4.647034018730658e-05, "loss": 0.8432, "step": 3392 }, { "epoch": 0.5227570273392376, "grad_norm": 0.6315904855728149, "learning_rate": 4.6422070822529255e-05, "loss": 0.9303, "step": 3394 }, { "epoch": 0.5230650750866385, "grad_norm": 0.7267144918441772, "learning_rate": 4.6373804809214875e-05, "loss": 0.9445, "step": 3396 }, { "epoch": 0.5233731228340393, "grad_norm": 0.8266833424568176, "learning_rate": 4.63255421925744e-05, "loss": 1.0778, "step": 3398 }, { "epoch": 0.5236811705814401, "grad_norm": 0.9495794773101807, "learning_rate": 4.627728301781569e-05, "loss": 1.3003, "step": 3400 }, { "epoch": 0.5239892183288409, "grad_norm": 0.7062100768089294, "learning_rate": 4.6229027330143324e-05, "loss": 0.9354, "step": 3402 }, { "epoch": 0.5242972660762418, "grad_norm": 0.838807225227356, "learning_rate": 4.61807751747586e-05, "loss": 0.8519, "step": 3404 }, { "epoch": 0.5246053138236426, "grad_norm": 0.7427268028259277, "learning_rate": 4.61325265968596e-05, "loss": 0.8379, "step": 3406 }, { "epoch": 0.5249133615710435, "grad_norm": 0.6703910231590271, "learning_rate": 4.6084281641640946e-05, "loss": 0.9903, "step": 3408 }, { "epoch": 0.5252214093184444, "grad_norm": 0.7218496203422546, "learning_rate": 4.603604035429393e-05, "loss": 0.8798, "step": 3410 }, { "epoch": 0.5255294570658452, "grad_norm": 0.9973131418228149, "learning_rate": 4.598780278000637e-05, "loss": 1.1471, "step": 3412 }, { "epoch": 0.5258375048132461, "grad_norm": 1.0085009336471558, "learning_rate": 4.593956896396264e-05, "loss": 1.0237, "step": 3414 }, { "epoch": 0.5261455525606469, "grad_norm": 0.7579819560050964, "learning_rate": 4.589133895134359e-05, "loss": 1.035, "step": 3416 }, { "epoch": 0.5264536003080478, "grad_norm": 0.8549184203147888, "learning_rate": 4.584311278732647e-05, "loss": 0.9305, "step": 3418 }, { "epoch": 0.5267616480554486, "grad_norm": 1.1725854873657227, "learning_rate": 4.5794890517084995e-05, "loss": 1.007, "step": 3420 }, { "epoch": 0.5270696958028495, "grad_norm": 0.8996204733848572, "learning_rate": 4.574667218578915e-05, "loss": 0.9192, "step": 3422 }, { "epoch": 0.5273777435502502, "grad_norm": 0.8384501338005066, "learning_rate": 4.5698457838605287e-05, "loss": 0.9124, "step": 3424 }, { "epoch": 0.5276857912976511, "grad_norm": 0.702349066734314, "learning_rate": 4.565024752069601e-05, "loss": 0.9621, "step": 3426 }, { "epoch": 0.527993839045052, "grad_norm": 0.8768123388290405, "learning_rate": 4.560204127722016e-05, "loss": 0.9297, "step": 3428 }, { "epoch": 0.5283018867924528, "grad_norm": 0.8436578512191772, "learning_rate": 4.555383915333273e-05, "loss": 1.4854, "step": 3430 }, { "epoch": 0.5286099345398537, "grad_norm": 0.7559348344802856, "learning_rate": 4.550564119418487e-05, "loss": 0.9969, "step": 3432 }, { "epoch": 0.5289179822872545, "grad_norm": 0.6650515198707581, "learning_rate": 4.5457447444923854e-05, "loss": 0.9929, "step": 3434 }, { "epoch": 0.5292260300346554, "grad_norm": 0.9616847038269043, "learning_rate": 4.540925795069299e-05, "loss": 1.2402, "step": 3436 }, { "epoch": 0.5295340777820562, "grad_norm": 0.7548688054084778, "learning_rate": 4.536107275663157e-05, "loss": 0.9014, "step": 3438 }, { "epoch": 0.5298421255294571, "grad_norm": 0.7742545008659363, "learning_rate": 4.531289190787493e-05, "loss": 0.8877, "step": 3440 }, { "epoch": 0.5301501732768579, "grad_norm": 0.7939615845680237, "learning_rate": 4.526471544955429e-05, "loss": 1.3333, "step": 3442 }, { "epoch": 0.5304582210242588, "grad_norm": 0.7492351531982422, "learning_rate": 4.521654342679672e-05, "loss": 1.0736, "step": 3444 }, { "epoch": 0.5307662687716596, "grad_norm": 0.6150861382484436, "learning_rate": 4.516837588472524e-05, "loss": 0.8605, "step": 3446 }, { "epoch": 0.5310743165190605, "grad_norm": 0.8641752004623413, "learning_rate": 4.5120212868458566e-05, "loss": 0.9416, "step": 3448 }, { "epoch": 0.5313823642664613, "grad_norm": 0.9148188233375549, "learning_rate": 4.507205442311125e-05, "loss": 0.8779, "step": 3450 }, { "epoch": 0.5316904120138621, "grad_norm": 0.8277890682220459, "learning_rate": 4.502390059379349e-05, "loss": 0.9237, "step": 3452 }, { "epoch": 0.531998459761263, "grad_norm": 0.8538454174995422, "learning_rate": 4.497575142561125e-05, "loss": 1.0778, "step": 3454 }, { "epoch": 0.5323065075086638, "grad_norm": 0.7834024429321289, "learning_rate": 4.492760696366606e-05, "loss": 1.4203, "step": 3456 }, { "epoch": 0.5326145552560647, "grad_norm": 0.6071308851242065, "learning_rate": 4.487946725305504e-05, "loss": 1.1278, "step": 3458 }, { "epoch": 0.5329226030034655, "grad_norm": 0.6906324028968811, "learning_rate": 4.483133233887093e-05, "loss": 1.7408, "step": 3460 }, { "epoch": 0.5332306507508664, "grad_norm": 0.5732936263084412, "learning_rate": 4.478320226620189e-05, "loss": 1.0157, "step": 3462 }, { "epoch": 0.5335386984982672, "grad_norm": 0.913801372051239, "learning_rate": 4.473507708013158e-05, "loss": 1.2833, "step": 3464 }, { "epoch": 0.5338467462456681, "grad_norm": 0.8521076440811157, "learning_rate": 4.4686956825739115e-05, "loss": 1.0119, "step": 3466 }, { "epoch": 0.534154793993069, "grad_norm": 0.8757444620132446, "learning_rate": 4.4638841548098956e-05, "loss": 0.8007, "step": 3468 }, { "epoch": 0.5344628417404698, "grad_norm": 0.6936830282211304, "learning_rate": 4.459073129228089e-05, "loss": 0.8151, "step": 3470 }, { "epoch": 0.5347708894878707, "grad_norm": 0.7347168922424316, "learning_rate": 4.4542626103350014e-05, "loss": 0.9071, "step": 3472 }, { "epoch": 0.5350789372352714, "grad_norm": 0.87520432472229, "learning_rate": 4.449452602636671e-05, "loss": 1.0179, "step": 3474 }, { "epoch": 0.5353869849826723, "grad_norm": 0.7437530159950256, "learning_rate": 4.444643110638653e-05, "loss": 1.0384, "step": 3476 }, { "epoch": 0.5356950327300731, "grad_norm": 0.7023616433143616, "learning_rate": 4.439834138846019e-05, "loss": 0.9684, "step": 3478 }, { "epoch": 0.536003080477474, "grad_norm": 0.8628860116004944, "learning_rate": 4.4350256917633585e-05, "loss": 1.6387, "step": 3480 }, { "epoch": 0.5363111282248748, "grad_norm": 0.8111205697059631, "learning_rate": 4.4302177738947655e-05, "loss": 0.836, "step": 3482 }, { "epoch": 0.5366191759722757, "grad_norm": 0.6044906377792358, "learning_rate": 4.425410389743839e-05, "loss": 1.9801, "step": 3484 }, { "epoch": 0.5369272237196766, "grad_norm": 0.7423304915428162, "learning_rate": 4.420603543813675e-05, "loss": 0.8639, "step": 3486 }, { "epoch": 0.5372352714670774, "grad_norm": 0.7321537733078003, "learning_rate": 4.415797240606872e-05, "loss": 0.86, "step": 3488 }, { "epoch": 0.5375433192144783, "grad_norm": 0.7974177598953247, "learning_rate": 4.410991484625518e-05, "loss": 0.9389, "step": 3490 }, { "epoch": 0.5378513669618791, "grad_norm": 0.9002341628074646, "learning_rate": 4.4061862803711815e-05, "loss": 1.0236, "step": 3492 }, { "epoch": 0.53815941470928, "grad_norm": 1.1152660846710205, "learning_rate": 4.401381632344926e-05, "loss": 0.9229, "step": 3494 }, { "epoch": 0.5384674624566808, "grad_norm": 0.7044309973716736, "learning_rate": 4.3965775450472826e-05, "loss": 0.9132, "step": 3496 }, { "epoch": 0.5387755102040817, "grad_norm": 0.5965349674224854, "learning_rate": 4.391774022978264e-05, "loss": 0.9993, "step": 3498 }, { "epoch": 0.5390835579514824, "grad_norm": 0.7283958196640015, "learning_rate": 4.386971070637354e-05, "loss": 1.0205, "step": 3500 }, { "epoch": 0.5393916056988833, "grad_norm": 0.9296109676361084, "learning_rate": 4.382168692523498e-05, "loss": 1.0551, "step": 3502 }, { "epoch": 0.5396996534462841, "grad_norm": 0.6661383509635925, "learning_rate": 4.3773668931351055e-05, "loss": 0.9869, "step": 3504 }, { "epoch": 0.540007701193685, "grad_norm": 1.1538796424865723, "learning_rate": 4.372565676970045e-05, "loss": 0.9673, "step": 3506 }, { "epoch": 0.5403157489410859, "grad_norm": 0.9146564602851868, "learning_rate": 4.367765048525641e-05, "loss": 1.1098, "step": 3508 }, { "epoch": 0.5406237966884867, "grad_norm": 0.8921990394592285, "learning_rate": 4.362965012298659e-05, "loss": 0.9715, "step": 3510 }, { "epoch": 0.5409318444358876, "grad_norm": 0.7014362812042236, "learning_rate": 4.358165572785318e-05, "loss": 0.841, "step": 3512 }, { "epoch": 0.5412398921832884, "grad_norm": 0.8201600313186646, "learning_rate": 4.353366734481277e-05, "loss": 1.0627, "step": 3514 }, { "epoch": 0.5415479399306893, "grad_norm": 0.8277712464332581, "learning_rate": 4.348568501881629e-05, "loss": 1.1129, "step": 3516 }, { "epoch": 0.5418559876780901, "grad_norm": 0.787347674369812, "learning_rate": 4.343770879480899e-05, "loss": 1.1029, "step": 3518 }, { "epoch": 0.542164035425491, "grad_norm": 0.9938316345214844, "learning_rate": 4.338973871773045e-05, "loss": 0.9947, "step": 3520 }, { "epoch": 0.5424720831728918, "grad_norm": 0.9085752367973328, "learning_rate": 4.3341774832514445e-05, "loss": 0.9601, "step": 3522 }, { "epoch": 0.5427801309202926, "grad_norm": 0.5847015976905823, "learning_rate": 4.329381718408899e-05, "loss": 0.7997, "step": 3524 }, { "epoch": 0.5430881786676935, "grad_norm": 0.853076159954071, "learning_rate": 4.3245865817376206e-05, "loss": 0.9325, "step": 3526 }, { "epoch": 0.5433962264150943, "grad_norm": 0.6862278580665588, "learning_rate": 4.319792077729239e-05, "loss": 0.9502, "step": 3528 }, { "epoch": 0.5437042741624952, "grad_norm": 1.0851012468338013, "learning_rate": 4.314998210874789e-05, "loss": 0.8269, "step": 3530 }, { "epoch": 0.544012321909896, "grad_norm": 0.8321728110313416, "learning_rate": 4.310204985664703e-05, "loss": 1.1325, "step": 3532 }, { "epoch": 0.5443203696572969, "grad_norm": 0.6708217859268188, "learning_rate": 4.3054124065888244e-05, "loss": 1.1693, "step": 3534 }, { "epoch": 0.5446284174046977, "grad_norm": 0.8871041536331177, "learning_rate": 4.3006204781363803e-05, "loss": 0.8531, "step": 3536 }, { "epoch": 0.5449364651520986, "grad_norm": 0.9859650135040283, "learning_rate": 4.295829204795991e-05, "loss": 1.1135, "step": 3538 }, { "epoch": 0.5452445128994994, "grad_norm": 0.8064723014831543, "learning_rate": 4.291038591055668e-05, "loss": 0.9088, "step": 3540 }, { "epoch": 0.5455525606469003, "grad_norm": 1.2011666297912598, "learning_rate": 4.286248641402801e-05, "loss": 1.0337, "step": 3542 }, { "epoch": 0.5458606083943011, "grad_norm": 0.8110551238059998, "learning_rate": 4.281459360324156e-05, "loss": 0.8991, "step": 3544 }, { "epoch": 0.546168656141702, "grad_norm": 0.8566579818725586, "learning_rate": 4.276670752305875e-05, "loss": 0.9251, "step": 3546 }, { "epoch": 0.5464767038891029, "grad_norm": 0.7969164848327637, "learning_rate": 4.2718828218334734e-05, "loss": 1.1559, "step": 3548 }, { "epoch": 0.5467847516365036, "grad_norm": 0.6673331260681152, "learning_rate": 4.267095573391824e-05, "loss": 1.0338, "step": 3550 }, { "epoch": 0.5470927993839045, "grad_norm": 0.9492660760879517, "learning_rate": 4.262309011465164e-05, "loss": 1.0716, "step": 3552 }, { "epoch": 0.5474008471313053, "grad_norm": 0.845156729221344, "learning_rate": 4.257523140537092e-05, "loss": 1.0976, "step": 3554 }, { "epoch": 0.5477088948787062, "grad_norm": 0.5479189157485962, "learning_rate": 4.252737965090554e-05, "loss": 0.9575, "step": 3556 }, { "epoch": 0.548016942626107, "grad_norm": 0.703624427318573, "learning_rate": 4.2479534896078444e-05, "loss": 0.833, "step": 3558 }, { "epoch": 0.5483249903735079, "grad_norm": 0.9007683992385864, "learning_rate": 4.243169718570606e-05, "loss": 0.8989, "step": 3560 }, { "epoch": 0.5486330381209087, "grad_norm": 0.9709227681159973, "learning_rate": 4.2383866564598186e-05, "loss": 1.1332, "step": 3562 }, { "epoch": 0.5489410858683096, "grad_norm": 0.6905830502510071, "learning_rate": 4.2336043077557996e-05, "loss": 1.0249, "step": 3564 }, { "epoch": 0.5492491336157105, "grad_norm": 1.027398943901062, "learning_rate": 4.2288226769381944e-05, "loss": 0.9233, "step": 3566 }, { "epoch": 0.5495571813631113, "grad_norm": 0.7679511904716492, "learning_rate": 4.2240417684859826e-05, "loss": 1.0616, "step": 3568 }, { "epoch": 0.5498652291105122, "grad_norm": 0.7321463227272034, "learning_rate": 4.2192615868774624e-05, "loss": 0.9422, "step": 3570 }, { "epoch": 0.550173276857913, "grad_norm": 0.6400331258773804, "learning_rate": 4.214482136590248e-05, "loss": 0.8484, "step": 3572 }, { "epoch": 0.5504813246053138, "grad_norm": 0.9933096766471863, "learning_rate": 4.20970342210128e-05, "loss": 1.9918, "step": 3574 }, { "epoch": 0.5507893723527146, "grad_norm": 0.9462111592292786, "learning_rate": 4.2049254478867974e-05, "loss": 1.0123, "step": 3576 }, { "epoch": 0.5510974201001155, "grad_norm": 0.804648756980896, "learning_rate": 4.2001482184223505e-05, "loss": 1.8671, "step": 3578 }, { "epoch": 0.5514054678475163, "grad_norm": 0.7638084292411804, "learning_rate": 4.195371738182796e-05, "loss": 0.9511, "step": 3580 }, { "epoch": 0.5517135155949172, "grad_norm": 0.6705355048179626, "learning_rate": 4.190596011642285e-05, "loss": 1.1035, "step": 3582 }, { "epoch": 0.5520215633423181, "grad_norm": 0.732434093952179, "learning_rate": 4.185821043274259e-05, "loss": 1.0185, "step": 3584 }, { "epoch": 0.5523296110897189, "grad_norm": 0.712803304195404, "learning_rate": 4.181046837551455e-05, "loss": 0.926, "step": 3586 }, { "epoch": 0.5526376588371198, "grad_norm": 0.7874958515167236, "learning_rate": 4.1762733989458965e-05, "loss": 1.0391, "step": 3588 }, { "epoch": 0.5529457065845206, "grad_norm": 0.5894497632980347, "learning_rate": 4.1715007319288814e-05, "loss": 0.8582, "step": 3590 }, { "epoch": 0.5532537543319215, "grad_norm": 0.6469842195510864, "learning_rate": 4.1667288409709905e-05, "loss": 1.0132, "step": 3592 }, { "epoch": 0.5535618020793223, "grad_norm": 0.8849762082099915, "learning_rate": 4.1619577305420776e-05, "loss": 1.0479, "step": 3594 }, { "epoch": 0.5538698498267232, "grad_norm": 0.704217255115509, "learning_rate": 4.157187405111264e-05, "loss": 1.0849, "step": 3596 }, { "epoch": 0.554177897574124, "grad_norm": 0.8714843392372131, "learning_rate": 4.152417869146935e-05, "loss": 1.185, "step": 3598 }, { "epoch": 0.5544859453215248, "grad_norm": 0.7708948254585266, "learning_rate": 4.147649127116735e-05, "loss": 0.8242, "step": 3600 }, { "epoch": 0.5544859453215248, "eval_loss": 2.3939669132232666, "eval_runtime": 737.0426, "eval_samples_per_second": 2.714, "eval_steps_per_second": 0.678, "step": 3600 }, { "epoch": 0.5547939930689256, "grad_norm": 0.7352039217948914, "learning_rate": 4.14288118348757e-05, "loss": 1.7142, "step": 3602 }, { "epoch": 0.5551020408163265, "grad_norm": 0.5873389840126038, "learning_rate": 4.138114042725596e-05, "loss": 1.1663, "step": 3604 }, { "epoch": 0.5554100885637274, "grad_norm": 0.686549961566925, "learning_rate": 4.1333477092962114e-05, "loss": 0.8772, "step": 3606 }, { "epoch": 0.5557181363111282, "grad_norm": 0.8575116395950317, "learning_rate": 4.128582187664066e-05, "loss": 0.9958, "step": 3608 }, { "epoch": 0.5560261840585291, "grad_norm": 0.658591628074646, "learning_rate": 4.123817482293047e-05, "loss": 0.8492, "step": 3610 }, { "epoch": 0.5563342318059299, "grad_norm": 0.7009797096252441, "learning_rate": 4.1190535976462726e-05, "loss": 0.9576, "step": 3612 }, { "epoch": 0.5566422795533308, "grad_norm": 1.0068048238754272, "learning_rate": 4.114290538186101e-05, "loss": 1.1333, "step": 3614 }, { "epoch": 0.5569503273007316, "grad_norm": 0.6233420968055725, "learning_rate": 4.109528308374108e-05, "loss": 1.3391, "step": 3616 }, { "epoch": 0.5572583750481325, "grad_norm": 0.5594993233680725, "learning_rate": 4.104766912671098e-05, "loss": 0.8397, "step": 3618 }, { "epoch": 0.5575664227955333, "grad_norm": 0.9816888570785522, "learning_rate": 4.1000063555370894e-05, "loss": 1.0325, "step": 3620 }, { "epoch": 0.5578744705429342, "grad_norm": 0.9001654386520386, "learning_rate": 4.0952466414313235e-05, "loss": 0.9348, "step": 3622 }, { "epoch": 0.558182518290335, "grad_norm": 0.9082402586936951, "learning_rate": 4.0904877748122436e-05, "loss": 1.1301, "step": 3624 }, { "epoch": 0.5584905660377358, "grad_norm": 0.8761422038078308, "learning_rate": 4.085729760137501e-05, "loss": 0.8416, "step": 3626 }, { "epoch": 0.5587986137851367, "grad_norm": 1.0857791900634766, "learning_rate": 4.080972601863956e-05, "loss": 1.8213, "step": 3628 }, { "epoch": 0.5591066615325375, "grad_norm": 0.7211164236068726, "learning_rate": 4.076216304447654e-05, "loss": 1.7474, "step": 3630 }, { "epoch": 0.5594147092799384, "grad_norm": 0.7563309073448181, "learning_rate": 4.071460872343843e-05, "loss": 1.8916, "step": 3632 }, { "epoch": 0.5597227570273392, "grad_norm": 0.7382598519325256, "learning_rate": 4.066706310006961e-05, "loss": 0.8198, "step": 3634 }, { "epoch": 0.5600308047747401, "grad_norm": 0.9442710280418396, "learning_rate": 4.061952621890628e-05, "loss": 1.0136, "step": 3636 }, { "epoch": 0.5603388525221409, "grad_norm": 0.7352667450904846, "learning_rate": 4.0571998124476437e-05, "loss": 0.754, "step": 3638 }, { "epoch": 0.5606469002695418, "grad_norm": 0.768801212310791, "learning_rate": 4.052447886129986e-05, "loss": 0.9408, "step": 3640 }, { "epoch": 0.5609549480169427, "grad_norm": 0.8420480489730835, "learning_rate": 4.047696847388811e-05, "loss": 1.056, "step": 3642 }, { "epoch": 0.5612629957643435, "grad_norm": 0.7617586851119995, "learning_rate": 4.042946700674436e-05, "loss": 0.975, "step": 3644 }, { "epoch": 0.5615710435117444, "grad_norm": 0.7460455298423767, "learning_rate": 4.038197450436344e-05, "loss": 1.0729, "step": 3646 }, { "epoch": 0.5618790912591451, "grad_norm": 0.8208043575286865, "learning_rate": 4.0334491011231826e-05, "loss": 0.8507, "step": 3648 }, { "epoch": 0.562187139006546, "grad_norm": 0.7699398994445801, "learning_rate": 4.028701657182752e-05, "loss": 1.9133, "step": 3650 }, { "epoch": 0.5624951867539468, "grad_norm": 0.7481504082679749, "learning_rate": 4.0239551230620034e-05, "loss": 1.0713, "step": 3652 }, { "epoch": 0.5628032345013477, "grad_norm": 0.7889412641525269, "learning_rate": 4.0192095032070406e-05, "loss": 0.9672, "step": 3654 }, { "epoch": 0.5631112822487485, "grad_norm": 1.0069886445999146, "learning_rate": 4.014464802063105e-05, "loss": 1.0111, "step": 3656 }, { "epoch": 0.5634193299961494, "grad_norm": 0.6418771147727966, "learning_rate": 4.009721024074583e-05, "loss": 0.8536, "step": 3658 }, { "epoch": 0.5637273777435502, "grad_norm": 0.5600576400756836, "learning_rate": 4.004978173684988e-05, "loss": 0.8057, "step": 3660 }, { "epoch": 0.5640354254909511, "grad_norm": 0.8424544930458069, "learning_rate": 4.000236255336978e-05, "loss": 0.8777, "step": 3662 }, { "epoch": 0.564343473238352, "grad_norm": 1.293593406677246, "learning_rate": 3.995495273472323e-05, "loss": 1.2139, "step": 3664 }, { "epoch": 0.5646515209857528, "grad_norm": 0.7682777643203735, "learning_rate": 3.9907552325319266e-05, "loss": 1.5731, "step": 3666 }, { "epoch": 0.5649595687331537, "grad_norm": 0.7142342925071716, "learning_rate": 3.986016136955806e-05, "loss": 0.9631, "step": 3668 }, { "epoch": 0.5652676164805545, "grad_norm": 0.7129397988319397, "learning_rate": 3.981277991183096e-05, "loss": 0.7958, "step": 3670 }, { "epoch": 0.5655756642279554, "grad_norm": 0.7072484493255615, "learning_rate": 3.976540799652037e-05, "loss": 0.9842, "step": 3672 }, { "epoch": 0.5658837119753561, "grad_norm": 0.8849371075630188, "learning_rate": 3.971804566799979e-05, "loss": 0.8443, "step": 3674 }, { "epoch": 0.566191759722757, "grad_norm": 0.6128670573234558, "learning_rate": 3.967069297063376e-05, "loss": 0.9314, "step": 3676 }, { "epoch": 0.5664998074701578, "grad_norm": 1.0203518867492676, "learning_rate": 3.962334994877774e-05, "loss": 0.8948, "step": 3678 }, { "epoch": 0.5668078552175587, "grad_norm": 1.0024428367614746, "learning_rate": 3.957601664677816e-05, "loss": 0.9889, "step": 3680 }, { "epoch": 0.5671159029649596, "grad_norm": 0.6684615612030029, "learning_rate": 3.952869310897237e-05, "loss": 0.9205, "step": 3682 }, { "epoch": 0.5674239507123604, "grad_norm": 0.7785738110542297, "learning_rate": 3.948137937968854e-05, "loss": 0.9355, "step": 3684 }, { "epoch": 0.5677319984597613, "grad_norm": 0.8673728108406067, "learning_rate": 3.9434075503245646e-05, "loss": 0.8856, "step": 3686 }, { "epoch": 0.5680400462071621, "grad_norm": 0.8597670197486877, "learning_rate": 3.938678152395346e-05, "loss": 0.9154, "step": 3688 }, { "epoch": 0.568348093954563, "grad_norm": 0.7761251330375671, "learning_rate": 3.933949748611247e-05, "loss": 0.8788, "step": 3690 }, { "epoch": 0.5686561417019638, "grad_norm": 0.6056908965110779, "learning_rate": 3.929222343401385e-05, "loss": 0.7823, "step": 3692 }, { "epoch": 0.5689641894493647, "grad_norm": 0.6534133553504944, "learning_rate": 3.9244959411939447e-05, "loss": 1.9013, "step": 3694 }, { "epoch": 0.5692722371967655, "grad_norm": 0.9937305450439453, "learning_rate": 3.9197705464161674e-05, "loss": 1.0152, "step": 3696 }, { "epoch": 0.5695802849441663, "grad_norm": 0.6183207035064697, "learning_rate": 3.915046163494351e-05, "loss": 1.1256, "step": 3698 }, { "epoch": 0.5698883326915672, "grad_norm": 0.8396857380867004, "learning_rate": 3.910322796853848e-05, "loss": 1.1151, "step": 3700 }, { "epoch": 0.570196380438968, "grad_norm": 0.7862452268600464, "learning_rate": 3.905600450919061e-05, "loss": 1.0242, "step": 3702 }, { "epoch": 0.5705044281863689, "grad_norm": 0.9082566499710083, "learning_rate": 3.9008791301134294e-05, "loss": 0.7974, "step": 3704 }, { "epoch": 0.5708124759337697, "grad_norm": 0.7656323313713074, "learning_rate": 3.8961588388594366e-05, "loss": 0.8431, "step": 3706 }, { "epoch": 0.5711205236811706, "grad_norm": 0.901706337928772, "learning_rate": 3.8914395815786045e-05, "loss": 1.4954, "step": 3708 }, { "epoch": 0.5714285714285714, "grad_norm": 1.125427007675171, "learning_rate": 3.886721362691481e-05, "loss": 0.7766, "step": 3710 }, { "epoch": 0.5717366191759723, "grad_norm": 1.0342464447021484, "learning_rate": 3.8820041866176444e-05, "loss": 0.9791, "step": 3712 }, { "epoch": 0.5720446669233731, "grad_norm": 0.843343198299408, "learning_rate": 3.877288057775694e-05, "loss": 0.9627, "step": 3714 }, { "epoch": 0.572352714670774, "grad_norm": 0.6604567766189575, "learning_rate": 3.872572980583253e-05, "loss": 0.9817, "step": 3716 }, { "epoch": 0.5726607624181748, "grad_norm": 0.8827517032623291, "learning_rate": 3.8678589594569535e-05, "loss": 1.3719, "step": 3718 }, { "epoch": 0.5729688101655757, "grad_norm": 0.7129951119422913, "learning_rate": 3.86314599881244e-05, "loss": 0.9466, "step": 3720 }, { "epoch": 0.5732768579129766, "grad_norm": 0.8309202790260315, "learning_rate": 3.858434103064368e-05, "loss": 0.9355, "step": 3722 }, { "epoch": 0.5735849056603773, "grad_norm": 0.8105360865592957, "learning_rate": 3.853723276626392e-05, "loss": 0.9133, "step": 3724 }, { "epoch": 0.5738929534077782, "grad_norm": 0.7791650295257568, "learning_rate": 3.84901352391116e-05, "loss": 0.7613, "step": 3726 }, { "epoch": 0.574201001155179, "grad_norm": 0.9629634618759155, "learning_rate": 3.844304849330326e-05, "loss": 1.0993, "step": 3728 }, { "epoch": 0.5745090489025799, "grad_norm": 0.7873706817626953, "learning_rate": 3.839597257294524e-05, "loss": 0.8449, "step": 3730 }, { "epoch": 0.5748170966499807, "grad_norm": 0.6659789085388184, "learning_rate": 3.834890752213379e-05, "loss": 0.8397, "step": 3732 }, { "epoch": 0.5751251443973816, "grad_norm": 0.7648429870605469, "learning_rate": 3.8301853384954924e-05, "loss": 0.8762, "step": 3734 }, { "epoch": 0.5754331921447824, "grad_norm": 0.9787917137145996, "learning_rate": 3.825481020548451e-05, "loss": 1.344, "step": 3736 }, { "epoch": 0.5757412398921833, "grad_norm": 1.028001308441162, "learning_rate": 3.82077780277881e-05, "loss": 1.0085, "step": 3738 }, { "epoch": 0.5760492876395842, "grad_norm": 0.9443414211273193, "learning_rate": 3.816075689592095e-05, "loss": 1.051, "step": 3740 }, { "epoch": 0.576357335386985, "grad_norm": 0.6627349257469177, "learning_rate": 3.811374685392799e-05, "loss": 1.0108, "step": 3742 }, { "epoch": 0.5766653831343859, "grad_norm": 0.7638726234436035, "learning_rate": 3.806674794584374e-05, "loss": 2.2769, "step": 3744 }, { "epoch": 0.5769734308817867, "grad_norm": 0.7516840100288391, "learning_rate": 3.8019760215692266e-05, "loss": 0.8065, "step": 3746 }, { "epoch": 0.5772814786291875, "grad_norm": 1.1510311365127563, "learning_rate": 3.7972783707487234e-05, "loss": 1.1615, "step": 3748 }, { "epoch": 0.5775895263765883, "grad_norm": 0.8986920714378357, "learning_rate": 3.792581846523175e-05, "loss": 0.9316, "step": 3750 }, { "epoch": 0.5778975741239892, "grad_norm": 0.6290449500083923, "learning_rate": 3.787886453291837e-05, "loss": 1.03, "step": 3752 }, { "epoch": 0.57820562187139, "grad_norm": 0.9564099311828613, "learning_rate": 3.7831921954529035e-05, "loss": 1.0917, "step": 3754 }, { "epoch": 0.5785136696187909, "grad_norm": 0.7102187275886536, "learning_rate": 3.7784990774035124e-05, "loss": 1.1032, "step": 3756 }, { "epoch": 0.5788217173661918, "grad_norm": 0.6460309028625488, "learning_rate": 3.773807103539726e-05, "loss": 1.249, "step": 3758 }, { "epoch": 0.5791297651135926, "grad_norm": 0.8030285239219666, "learning_rate": 3.7691162782565383e-05, "loss": 0.9557, "step": 3760 }, { "epoch": 0.5794378128609935, "grad_norm": 0.9096877574920654, "learning_rate": 3.764426605947868e-05, "loss": 0.9874, "step": 3762 }, { "epoch": 0.5797458606083943, "grad_norm": 0.4688047468662262, "learning_rate": 3.7597380910065547e-05, "loss": 0.909, "step": 3764 }, { "epoch": 0.5800539083557952, "grad_norm": 0.9313383102416992, "learning_rate": 3.755050737824347e-05, "loss": 0.8925, "step": 3766 }, { "epoch": 0.580361956103196, "grad_norm": 0.7512313723564148, "learning_rate": 3.7503645507919174e-05, "loss": 0.9631, "step": 3768 }, { "epoch": 0.5806700038505969, "grad_norm": 0.8948346376419067, "learning_rate": 3.7456795342988336e-05, "loss": 1.3042, "step": 3770 }, { "epoch": 0.5809780515979976, "grad_norm": 0.8193276524543762, "learning_rate": 3.7409956927335766e-05, "loss": 0.8325, "step": 3772 }, { "epoch": 0.5812860993453985, "grad_norm": 0.8436985611915588, "learning_rate": 3.736313030483517e-05, "loss": 1.0058, "step": 3774 }, { "epoch": 0.5815941470927993, "grad_norm": 0.9698597192764282, "learning_rate": 3.731631551934932e-05, "loss": 0.931, "step": 3776 }, { "epoch": 0.5819021948402002, "grad_norm": 0.5956494808197021, "learning_rate": 3.726951261472981e-05, "loss": 0.6849, "step": 3778 }, { "epoch": 0.5822102425876011, "grad_norm": 0.5870460867881775, "learning_rate": 3.7222721634817146e-05, "loss": 0.9, "step": 3780 }, { "epoch": 0.5825182903350019, "grad_norm": 0.7760697603225708, "learning_rate": 3.7175942623440684e-05, "loss": 1.0826, "step": 3782 }, { "epoch": 0.5828263380824028, "grad_norm": 1.0230954885482788, "learning_rate": 3.71291756244185e-05, "loss": 1.0201, "step": 3784 }, { "epoch": 0.5831343858298036, "grad_norm": 0.9187666177749634, "learning_rate": 3.7082420681557476e-05, "loss": 1.0352, "step": 3786 }, { "epoch": 0.5834424335772045, "grad_norm": 0.7570469975471497, "learning_rate": 3.7035677838653195e-05, "loss": 1.8588, "step": 3788 }, { "epoch": 0.5837504813246053, "grad_norm": 0.7575215697288513, "learning_rate": 3.69889471394899e-05, "loss": 1.0532, "step": 3790 }, { "epoch": 0.5840585290720062, "grad_norm": 0.8707351088523865, "learning_rate": 3.694222862784043e-05, "loss": 2.6658, "step": 3792 }, { "epoch": 0.584366576819407, "grad_norm": 0.9095748662948608, "learning_rate": 3.689552234746623e-05, "loss": 1.0685, "step": 3794 }, { "epoch": 0.5846746245668079, "grad_norm": 0.806652843952179, "learning_rate": 3.684882834211732e-05, "loss": 1.082, "step": 3796 }, { "epoch": 0.5849826723142088, "grad_norm": 0.6337845921516418, "learning_rate": 3.6802146655532185e-05, "loss": 1.365, "step": 3798 }, { "epoch": 0.5852907200616095, "grad_norm": 0.5820702910423279, "learning_rate": 3.675547733143776e-05, "loss": 1.9037, "step": 3800 }, { "epoch": 0.5855987678090104, "grad_norm": 0.72236168384552, "learning_rate": 3.670882041354944e-05, "loss": 1.5893, "step": 3802 }, { "epoch": 0.5859068155564112, "grad_norm": 0.8915034532546997, "learning_rate": 3.666217594557097e-05, "loss": 1.1491, "step": 3804 }, { "epoch": 0.5862148633038121, "grad_norm": 0.972583532333374, "learning_rate": 3.6615543971194424e-05, "loss": 0.865, "step": 3806 }, { "epoch": 0.5865229110512129, "grad_norm": 0.6603726744651794, "learning_rate": 3.65689245341002e-05, "loss": 0.8353, "step": 3808 }, { "epoch": 0.5868309587986138, "grad_norm": 0.525361955165863, "learning_rate": 3.652231767795695e-05, "loss": 0.7387, "step": 3810 }, { "epoch": 0.5871390065460146, "grad_norm": 0.8613260388374329, "learning_rate": 3.647572344642155e-05, "loss": 0.8483, "step": 3812 }, { "epoch": 0.5874470542934155, "grad_norm": 0.8724876642227173, "learning_rate": 3.6429141883138986e-05, "loss": 1.1493, "step": 3814 }, { "epoch": 0.5877551020408164, "grad_norm": 0.7819899916648865, "learning_rate": 3.638257303174246e-05, "loss": 1.0313, "step": 3816 }, { "epoch": 0.5880631497882172, "grad_norm": 0.8212162852287292, "learning_rate": 3.6336016935853225e-05, "loss": 1.0326, "step": 3818 }, { "epoch": 0.5883711975356181, "grad_norm": 0.7977713346481323, "learning_rate": 3.628947363908058e-05, "loss": 0.8044, "step": 3820 }, { "epoch": 0.5886792452830188, "grad_norm": 0.7846778631210327, "learning_rate": 3.6242943185021875e-05, "loss": 0.8219, "step": 3822 }, { "epoch": 0.5889872930304197, "grad_norm": 0.7150901556015015, "learning_rate": 3.6196425617262385e-05, "loss": 1.035, "step": 3824 }, { "epoch": 0.5892953407778205, "grad_norm": 0.68860924243927, "learning_rate": 3.614992097937533e-05, "loss": 0.8661, "step": 3826 }, { "epoch": 0.5896033885252214, "grad_norm": 0.6898394823074341, "learning_rate": 3.610342931492182e-05, "loss": 1.3513, "step": 3828 }, { "epoch": 0.5899114362726222, "grad_norm": 0.6659755110740662, "learning_rate": 3.605695066745084e-05, "loss": 1.4634, "step": 3830 }, { "epoch": 0.5902194840200231, "grad_norm": 0.7592305541038513, "learning_rate": 3.601048508049913e-05, "loss": 0.8823, "step": 3832 }, { "epoch": 0.5905275317674239, "grad_norm": 0.761418342590332, "learning_rate": 3.5964032597591215e-05, "loss": 1.001, "step": 3834 }, { "epoch": 0.5908355795148248, "grad_norm": 0.8135924935340881, "learning_rate": 3.591759326223937e-05, "loss": 1.2993, "step": 3836 }, { "epoch": 0.5911436272622257, "grad_norm": 0.7134994268417358, "learning_rate": 3.5871167117943544e-05, "loss": 0.9866, "step": 3838 }, { "epoch": 0.5914516750096265, "grad_norm": 0.6832489967346191, "learning_rate": 3.582475420819129e-05, "loss": 1.9355, "step": 3840 }, { "epoch": 0.5917597227570274, "grad_norm": 0.6804940700531006, "learning_rate": 3.577835457645783e-05, "loss": 0.917, "step": 3842 }, { "epoch": 0.5920677705044282, "grad_norm": 0.92501300573349, "learning_rate": 3.573196826620591e-05, "loss": 1.0095, "step": 3844 }, { "epoch": 0.5923758182518291, "grad_norm": 0.6867468953132629, "learning_rate": 3.5685595320885776e-05, "loss": 0.8609, "step": 3846 }, { "epoch": 0.5926838659992298, "grad_norm": 0.5593806505203247, "learning_rate": 3.56392357839352e-05, "loss": 0.7962, "step": 3848 }, { "epoch": 0.5929919137466307, "grad_norm": 0.693376362323761, "learning_rate": 3.5592889698779385e-05, "loss": 0.9079, "step": 3850 }, { "epoch": 0.5932999614940315, "grad_norm": 0.7576355338096619, "learning_rate": 3.5546557108830925e-05, "loss": 0.8744, "step": 3852 }, { "epoch": 0.5936080092414324, "grad_norm": 0.7912231087684631, "learning_rate": 3.5500238057489746e-05, "loss": 0.9186, "step": 3854 }, { "epoch": 0.5939160569888333, "grad_norm": 0.5722372531890869, "learning_rate": 3.545393258814316e-05, "loss": 0.909, "step": 3856 }, { "epoch": 0.5942241047362341, "grad_norm": 0.537816047668457, "learning_rate": 3.540764074416568e-05, "loss": 0.867, "step": 3858 }, { "epoch": 0.594532152483635, "grad_norm": 0.7094846367835999, "learning_rate": 3.53613625689191e-05, "loss": 0.7947, "step": 3860 }, { "epoch": 0.5948402002310358, "grad_norm": 0.7573848366737366, "learning_rate": 3.5315098105752434e-05, "loss": 1.117, "step": 3862 }, { "epoch": 0.5951482479784367, "grad_norm": 0.8237606883049011, "learning_rate": 3.5268847398001766e-05, "loss": 1.0502, "step": 3864 }, { "epoch": 0.5954562957258375, "grad_norm": 0.7843457460403442, "learning_rate": 3.52226104889904e-05, "loss": 1.145, "step": 3866 }, { "epoch": 0.5957643434732384, "grad_norm": 0.932285487651825, "learning_rate": 3.5176387422028625e-05, "loss": 1.0675, "step": 3868 }, { "epoch": 0.5960723912206392, "grad_norm": 0.6036128997802734, "learning_rate": 3.5130178240413833e-05, "loss": 0.8849, "step": 3870 }, { "epoch": 0.59638043896804, "grad_norm": 0.9221835136413574, "learning_rate": 3.508398298743036e-05, "loss": 0.8411, "step": 3872 }, { "epoch": 0.596688486715441, "grad_norm": 0.7087579369544983, "learning_rate": 3.5037801706349524e-05, "loss": 0.9154, "step": 3874 }, { "epoch": 0.5969965344628417, "grad_norm": 0.9598959684371948, "learning_rate": 3.4991634440429545e-05, "loss": 1.1822, "step": 3876 }, { "epoch": 0.5973045822102426, "grad_norm": 0.848368763923645, "learning_rate": 3.494548123291552e-05, "loss": 1.7755, "step": 3878 }, { "epoch": 0.5976126299576434, "grad_norm": 0.9857352375984192, "learning_rate": 3.489934212703936e-05, "loss": 1.0044, "step": 3880 }, { "epoch": 0.5979206777050443, "grad_norm": 0.9576002359390259, "learning_rate": 3.485321716601979e-05, "loss": 0.8722, "step": 3882 }, { "epoch": 0.5982287254524451, "grad_norm": 0.7168333530426025, "learning_rate": 3.4807106393062275e-05, "loss": 0.9099, "step": 3884 }, { "epoch": 0.598536773199846, "grad_norm": 0.8737758994102478, "learning_rate": 3.476100985135901e-05, "loss": 1.0395, "step": 3886 }, { "epoch": 0.5988448209472468, "grad_norm": 0.8918794393539429, "learning_rate": 3.471492758408879e-05, "loss": 1.055, "step": 3888 }, { "epoch": 0.5991528686946477, "grad_norm": 0.8955882787704468, "learning_rate": 3.466885963441714e-05, "loss": 0.8754, "step": 3890 }, { "epoch": 0.5994609164420485, "grad_norm": 0.7722766399383545, "learning_rate": 3.462280604549611e-05, "loss": 0.763, "step": 3892 }, { "epoch": 0.5997689641894494, "grad_norm": 0.7882518172264099, "learning_rate": 3.457676686046427e-05, "loss": 0.9779, "step": 3894 }, { "epoch": 0.6000770119368503, "grad_norm": 0.8698257803916931, "learning_rate": 3.453074212244681e-05, "loss": 0.8066, "step": 3896 }, { "epoch": 0.600385059684251, "grad_norm": 0.6644619107246399, "learning_rate": 3.4484731874555274e-05, "loss": 0.9186, "step": 3898 }, { "epoch": 0.6006931074316519, "grad_norm": 0.5977016091346741, "learning_rate": 3.4438736159887665e-05, "loss": 0.911, "step": 3900 }, { "epoch": 0.6006931074316519, "eval_loss": 2.3850553035736084, "eval_runtime": 736.9149, "eval_samples_per_second": 2.714, "eval_steps_per_second": 0.679, "step": 3900 }, { "epoch": 0.6010011551790527, "grad_norm": 0.8254393339157104, "learning_rate": 3.4392755021528424e-05, "loss": 1.0326, "step": 3902 }, { "epoch": 0.6013092029264536, "grad_norm": 0.6651079654693604, "learning_rate": 3.434678850254827e-05, "loss": 0.9066, "step": 3904 }, { "epoch": 0.6016172506738544, "grad_norm": 0.8798814415931702, "learning_rate": 3.4300836646004253e-05, "loss": 1.0857, "step": 3906 }, { "epoch": 0.6019252984212553, "grad_norm": 0.8257869482040405, "learning_rate": 3.425489949493969e-05, "loss": 0.8308, "step": 3908 }, { "epoch": 0.6022333461686561, "grad_norm": 0.7323641180992126, "learning_rate": 3.420897709238414e-05, "loss": 0.91, "step": 3910 }, { "epoch": 0.602541393916057, "grad_norm": 0.9562618732452393, "learning_rate": 3.4163069481353334e-05, "loss": 1.0447, "step": 3912 }, { "epoch": 0.6028494416634579, "grad_norm": 0.6833957433700562, "learning_rate": 3.4117176704849116e-05, "loss": 0.9323, "step": 3914 }, { "epoch": 0.6031574894108587, "grad_norm": 0.9771287441253662, "learning_rate": 3.4071298805859483e-05, "loss": 0.9317, "step": 3916 }, { "epoch": 0.6034655371582596, "grad_norm": 1.0490721464157104, "learning_rate": 3.4025435827358497e-05, "loss": 0.8953, "step": 3918 }, { "epoch": 0.6037735849056604, "grad_norm": 0.9347744584083557, "learning_rate": 3.3979587812306196e-05, "loss": 1.1698, "step": 3920 }, { "epoch": 0.6040816326530613, "grad_norm": 0.8014428615570068, "learning_rate": 3.393375480364862e-05, "loss": 0.8904, "step": 3922 }, { "epoch": 0.604389680400462, "grad_norm": 0.5310930609703064, "learning_rate": 3.388793684431779e-05, "loss": 1.7, "step": 3924 }, { "epoch": 0.6046977281478629, "grad_norm": 0.8464880585670471, "learning_rate": 3.38421339772316e-05, "loss": 1.2822, "step": 3926 }, { "epoch": 0.6050057758952637, "grad_norm": 0.6187727451324463, "learning_rate": 3.3796346245293775e-05, "loss": 2.4976, "step": 3928 }, { "epoch": 0.6053138236426646, "grad_norm": 0.6648653745651245, "learning_rate": 3.375057369139394e-05, "loss": 0.8097, "step": 3930 }, { "epoch": 0.6056218713900655, "grad_norm": 0.8213443756103516, "learning_rate": 3.370481635840744e-05, "loss": 1.0321, "step": 3932 }, { "epoch": 0.6059299191374663, "grad_norm": 1.0013117790222168, "learning_rate": 3.365907428919536e-05, "loss": 0.9412, "step": 3934 }, { "epoch": 0.6062379668848672, "grad_norm": 0.7968295812606812, "learning_rate": 3.361334752660456e-05, "loss": 0.8192, "step": 3936 }, { "epoch": 0.606546014632268, "grad_norm": 0.6209965944290161, "learning_rate": 3.356763611346747e-05, "loss": 0.9079, "step": 3938 }, { "epoch": 0.6068540623796689, "grad_norm": 0.7653431296348572, "learning_rate": 3.352194009260221e-05, "loss": 1.0268, "step": 3940 }, { "epoch": 0.6071621101270697, "grad_norm": 0.830397367477417, "learning_rate": 3.3476259506812404e-05, "loss": 0.9273, "step": 3942 }, { "epoch": 0.6074701578744706, "grad_norm": 0.6765947937965393, "learning_rate": 3.343059439888735e-05, "loss": 0.9284, "step": 3944 }, { "epoch": 0.6077782056218713, "grad_norm": 0.8520391583442688, "learning_rate": 3.33849448116017e-05, "loss": 0.98, "step": 3946 }, { "epoch": 0.6080862533692722, "grad_norm": 0.8554842472076416, "learning_rate": 3.3339310787715665e-05, "loss": 0.907, "step": 3948 }, { "epoch": 0.608394301116673, "grad_norm": 1.0883864164352417, "learning_rate": 3.329369236997486e-05, "loss": 0.8451, "step": 3950 }, { "epoch": 0.6087023488640739, "grad_norm": 0.65107661485672, "learning_rate": 3.324808960111024e-05, "loss": 1.0974, "step": 3952 }, { "epoch": 0.6090103966114748, "grad_norm": 0.8793396949768066, "learning_rate": 3.320250252383814e-05, "loss": 0.9018, "step": 3954 }, { "epoch": 0.6093184443588756, "grad_norm": 0.9697176218032837, "learning_rate": 3.3156931180860195e-05, "loss": 0.8161, "step": 3956 }, { "epoch": 0.6096264921062765, "grad_norm": 0.8237152099609375, "learning_rate": 3.3111375614863305e-05, "loss": 1.2178, "step": 3958 }, { "epoch": 0.6099345398536773, "grad_norm": 0.8094208240509033, "learning_rate": 3.306583586851956e-05, "loss": 1.0218, "step": 3960 }, { "epoch": 0.6102425876010782, "grad_norm": 0.7797203063964844, "learning_rate": 3.302031198448624e-05, "loss": 0.8513, "step": 3962 }, { "epoch": 0.610550635348479, "grad_norm": 0.7247787714004517, "learning_rate": 3.297480400540581e-05, "loss": 0.9219, "step": 3964 }, { "epoch": 0.6108586830958799, "grad_norm": 0.7033014893531799, "learning_rate": 3.292931197390581e-05, "loss": 0.683, "step": 3966 }, { "epoch": 0.6111667308432807, "grad_norm": 0.6294431090354919, "learning_rate": 3.288383593259881e-05, "loss": 0.9691, "step": 3968 }, { "epoch": 0.6114747785906816, "grad_norm": 0.8306741118431091, "learning_rate": 3.283837592408244e-05, "loss": 1.1146, "step": 3970 }, { "epoch": 0.6117828263380825, "grad_norm": 0.8149116039276123, "learning_rate": 3.279293199093931e-05, "loss": 1.1058, "step": 3972 }, { "epoch": 0.6120908740854832, "grad_norm": 0.7836449146270752, "learning_rate": 3.274750417573694e-05, "loss": 1.1746, "step": 3974 }, { "epoch": 0.6123989218328841, "grad_norm": 0.836641788482666, "learning_rate": 3.270209252102782e-05, "loss": 0.7761, "step": 3976 }, { "epoch": 0.6127069695802849, "grad_norm": 0.9818074703216553, "learning_rate": 3.2656697069349224e-05, "loss": 1.2522, "step": 3978 }, { "epoch": 0.6130150173276858, "grad_norm": 0.775067150592804, "learning_rate": 3.26113178632233e-05, "loss": 0.8516, "step": 3980 }, { "epoch": 0.6133230650750866, "grad_norm": 1.0951143503189087, "learning_rate": 3.2565954945156924e-05, "loss": 1.0875, "step": 3982 }, { "epoch": 0.6136311128224875, "grad_norm": 0.8564605712890625, "learning_rate": 3.252060835764181e-05, "loss": 1.164, "step": 3984 }, { "epoch": 0.6139391605698883, "grad_norm": 0.7555924654006958, "learning_rate": 3.2475278143154284e-05, "loss": 0.8967, "step": 3986 }, { "epoch": 0.6142472083172892, "grad_norm": 0.7195894718170166, "learning_rate": 3.242996434415537e-05, "loss": 0.8661, "step": 3988 }, { "epoch": 0.6145552560646901, "grad_norm": 0.8980828523635864, "learning_rate": 3.2384667003090727e-05, "loss": 0.9645, "step": 3990 }, { "epoch": 0.6148633038120909, "grad_norm": 0.6523622274398804, "learning_rate": 3.233938616239058e-05, "loss": 1.0817, "step": 3992 }, { "epoch": 0.6151713515594918, "grad_norm": 0.997121274471283, "learning_rate": 3.229412186446969e-05, "loss": 0.9931, "step": 3994 }, { "epoch": 0.6154793993068925, "grad_norm": 0.986936628818512, "learning_rate": 3.2248874151727356e-05, "loss": 0.8143, "step": 3996 }, { "epoch": 0.6157874470542934, "grad_norm": 1.0862888097763062, "learning_rate": 3.2203643066547315e-05, "loss": 0.9638, "step": 3998 }, { "epoch": 0.6160954948016942, "grad_norm": 0.6776353716850281, "learning_rate": 3.215842865129773e-05, "loss": 1.4069, "step": 4000 }, { "epoch": 0.6164035425490951, "grad_norm": 0.6925904154777527, "learning_rate": 3.2113230948331154e-05, "loss": 0.9874, "step": 4002 }, { "epoch": 0.6167115902964959, "grad_norm": 6.114987850189209, "learning_rate": 3.20680499999845e-05, "loss": 1.1663, "step": 4004 }, { "epoch": 0.6170196380438968, "grad_norm": 0.7539072632789612, "learning_rate": 3.2022885848578966e-05, "loss": 1.1407, "step": 4006 }, { "epoch": 0.6173276857912976, "grad_norm": 0.5863259434700012, "learning_rate": 3.197773853642e-05, "loss": 0.9743, "step": 4008 }, { "epoch": 0.6176357335386985, "grad_norm": 0.9714782238006592, "learning_rate": 3.193260810579733e-05, "loss": 0.8682, "step": 4010 }, { "epoch": 0.6179437812860994, "grad_norm": 1.0604935884475708, "learning_rate": 3.188749459898482e-05, "loss": 0.874, "step": 4012 }, { "epoch": 0.6182518290335002, "grad_norm": 0.8278828263282776, "learning_rate": 3.184239805824052e-05, "loss": 1.1128, "step": 4014 }, { "epoch": 0.6185598767809011, "grad_norm": 0.7398038506507874, "learning_rate": 3.1797318525806575e-05, "loss": 0.8065, "step": 4016 }, { "epoch": 0.6188679245283019, "grad_norm": 0.6247692704200745, "learning_rate": 3.175225604390917e-05, "loss": 0.8993, "step": 4018 }, { "epoch": 0.6191759722757028, "grad_norm": 0.7333313226699829, "learning_rate": 3.1707210654758556e-05, "loss": 0.7988, "step": 4020 }, { "epoch": 0.6194840200231035, "grad_norm": 0.9650359153747559, "learning_rate": 3.166218240054893e-05, "loss": 0.9056, "step": 4022 }, { "epoch": 0.6197920677705044, "grad_norm": 1.0183436870574951, "learning_rate": 3.161717132345852e-05, "loss": 0.8781, "step": 4024 }, { "epoch": 0.6201001155179052, "grad_norm": 0.8829627633094788, "learning_rate": 3.157217746564937e-05, "loss": 0.9361, "step": 4026 }, { "epoch": 0.6204081632653061, "grad_norm": 0.8895233273506165, "learning_rate": 3.1527200869267446e-05, "loss": 1.0811, "step": 4028 }, { "epoch": 0.620716211012707, "grad_norm": 0.6992722749710083, "learning_rate": 3.148224157644256e-05, "loss": 2.264, "step": 4030 }, { "epoch": 0.6210242587601078, "grad_norm": 0.630937933921814, "learning_rate": 3.143729962928825e-05, "loss": 2.1182, "step": 4032 }, { "epoch": 0.6213323065075087, "grad_norm": 0.8037968873977661, "learning_rate": 3.139237506990188e-05, "loss": 0.8633, "step": 4034 }, { "epoch": 0.6216403542549095, "grad_norm": 0.9234476685523987, "learning_rate": 3.1347467940364466e-05, "loss": 1.0269, "step": 4036 }, { "epoch": 0.6219484020023104, "grad_norm": 0.7384040951728821, "learning_rate": 3.1302578282740764e-05, "loss": 1.135, "step": 4038 }, { "epoch": 0.6222564497497112, "grad_norm": 0.5950331091880798, "learning_rate": 3.125770613907909e-05, "loss": 0.9479, "step": 4040 }, { "epoch": 0.6225644974971121, "grad_norm": 0.9147387742996216, "learning_rate": 3.1212851551411394e-05, "loss": 0.8846, "step": 4042 }, { "epoch": 0.6228725452445129, "grad_norm": 0.782964825630188, "learning_rate": 3.1168014561753195e-05, "loss": 0.9488, "step": 4044 }, { "epoch": 0.6231805929919138, "grad_norm": 0.8128480911254883, "learning_rate": 3.1123195212103515e-05, "loss": 0.9245, "step": 4046 }, { "epoch": 0.6234886407393146, "grad_norm": 0.9903732538223267, "learning_rate": 3.1078393544444804e-05, "loss": 1.4895, "step": 4048 }, { "epoch": 0.6237966884867154, "grad_norm": 0.6795881986618042, "learning_rate": 3.103360960074304e-05, "loss": 0.9809, "step": 4050 }, { "epoch": 0.6241047362341163, "grad_norm": 0.6919244527816772, "learning_rate": 3.098884342294753e-05, "loss": 1.0142, "step": 4052 }, { "epoch": 0.6244127839815171, "grad_norm": 0.6578916907310486, "learning_rate": 3.0944095052990985e-05, "loss": 1.4133, "step": 4054 }, { "epoch": 0.624720831728918, "grad_norm": 0.7969669103622437, "learning_rate": 3.089936453278937e-05, "loss": 0.8925, "step": 4056 }, { "epoch": 0.6250288794763188, "grad_norm": 0.9621204733848572, "learning_rate": 3.0854651904241993e-05, "loss": 0.9851, "step": 4058 }, { "epoch": 0.6253369272237197, "grad_norm": 0.7093731164932251, "learning_rate": 3.08099572092314e-05, "loss": 1.0454, "step": 4060 }, { "epoch": 0.6256449749711205, "grad_norm": 0.9264700412750244, "learning_rate": 3.076528048962327e-05, "loss": 0.814, "step": 4062 }, { "epoch": 0.6259530227185214, "grad_norm": 0.9630382657051086, "learning_rate": 3.072062178726657e-05, "loss": 1.0488, "step": 4064 }, { "epoch": 0.6262610704659222, "grad_norm": 0.7568457126617432, "learning_rate": 3.067598114399325e-05, "loss": 1.015, "step": 4066 }, { "epoch": 0.6265691182133231, "grad_norm": 0.6907071471214294, "learning_rate": 3.063135860161842e-05, "loss": 0.8186, "step": 4068 }, { "epoch": 0.626877165960724, "grad_norm": 1.03848397731781, "learning_rate": 3.0586754201940235e-05, "loss": 0.8725, "step": 4070 }, { "epoch": 0.6271852137081247, "grad_norm": 0.7759397625923157, "learning_rate": 3.054216798673987e-05, "loss": 0.9653, "step": 4072 }, { "epoch": 0.6274932614555256, "grad_norm": 0.544243335723877, "learning_rate": 3.049759999778139e-05, "loss": 0.9639, "step": 4074 }, { "epoch": 0.6278013092029264, "grad_norm": 0.8743525147438049, "learning_rate": 3.0453050276811856e-05, "loss": 0.8889, "step": 4076 }, { "epoch": 0.6281093569503273, "grad_norm": 0.8232191205024719, "learning_rate": 3.0408518865561225e-05, "loss": 1.1389, "step": 4078 }, { "epoch": 0.6284174046977281, "grad_norm": 0.8645479679107666, "learning_rate": 3.0364005805742246e-05, "loss": 1.0798, "step": 4080 }, { "epoch": 0.628725452445129, "grad_norm": 0.8836220502853394, "learning_rate": 3.0319511139050504e-05, "loss": 0.9966, "step": 4082 }, { "epoch": 0.6290335001925298, "grad_norm": 0.9457396268844604, "learning_rate": 3.0275034907164396e-05, "loss": 0.9485, "step": 4084 }, { "epoch": 0.6293415479399307, "grad_norm": 0.8273984789848328, "learning_rate": 3.0230577151745006e-05, "loss": 1.7574, "step": 4086 }, { "epoch": 0.6296495956873316, "grad_norm": 0.6231035590171814, "learning_rate": 3.0186137914436085e-05, "loss": 1.0173, "step": 4088 }, { "epoch": 0.6299576434347324, "grad_norm": 0.7283669710159302, "learning_rate": 3.014171723686411e-05, "loss": 0.9442, "step": 4090 }, { "epoch": 0.6302656911821333, "grad_norm": 1.0157694816589355, "learning_rate": 3.009731516063813e-05, "loss": 1.0788, "step": 4092 }, { "epoch": 0.6305737389295341, "grad_norm": 0.7202984094619751, "learning_rate": 3.0052931727349777e-05, "loss": 0.9958, "step": 4094 }, { "epoch": 0.630881786676935, "grad_norm": 0.699341356754303, "learning_rate": 3.0008566978573206e-05, "loss": 0.8117, "step": 4096 }, { "epoch": 0.6311898344243357, "grad_norm": 0.7673215866088867, "learning_rate": 2.9964220955865095e-05, "loss": 0.7943, "step": 4098 }, { "epoch": 0.6314978821717366, "grad_norm": 0.9850109815597534, "learning_rate": 2.9919893700764566e-05, "loss": 0.9798, "step": 4100 }, { "epoch": 0.6318059299191374, "grad_norm": 0.8283570408821106, "learning_rate": 2.9875585254793163e-05, "loss": 1.0675, "step": 4102 }, { "epoch": 0.6321139776665383, "grad_norm": 0.7977241277694702, "learning_rate": 2.9831295659454838e-05, "loss": 0.9345, "step": 4104 }, { "epoch": 0.6324220254139392, "grad_norm": 1.0206035375595093, "learning_rate": 2.9787024956235837e-05, "loss": 0.9714, "step": 4106 }, { "epoch": 0.63273007316134, "grad_norm": 0.7831523418426514, "learning_rate": 2.974277318660472e-05, "loss": 0.9212, "step": 4108 }, { "epoch": 0.6330381209087409, "grad_norm": 0.7216141223907471, "learning_rate": 2.9698540392012364e-05, "loss": 0.7925, "step": 4110 }, { "epoch": 0.6333461686561417, "grad_norm": 0.9296200275421143, "learning_rate": 2.965432661389182e-05, "loss": 0.8214, "step": 4112 }, { "epoch": 0.6336542164035426, "grad_norm": 0.9630807638168335, "learning_rate": 2.9610131893658328e-05, "loss": 1.0262, "step": 4114 }, { "epoch": 0.6339622641509434, "grad_norm": 0.7839917540550232, "learning_rate": 2.9565956272709282e-05, "loss": 1.5632, "step": 4116 }, { "epoch": 0.6342703118983443, "grad_norm": 0.9187052845954895, "learning_rate": 2.952179979242422e-05, "loss": 0.9697, "step": 4118 }, { "epoch": 0.634578359645745, "grad_norm": 1.2686012983322144, "learning_rate": 2.9477662494164703e-05, "loss": 1.8759, "step": 4120 }, { "epoch": 0.634886407393146, "grad_norm": 1.2480783462524414, "learning_rate": 2.943354441927434e-05, "loss": 1.0542, "step": 4122 }, { "epoch": 0.6351944551405467, "grad_norm": 0.7378079295158386, "learning_rate": 2.938944560907876e-05, "loss": 0.7757, "step": 4124 }, { "epoch": 0.6355025028879476, "grad_norm": 0.715215802192688, "learning_rate": 2.9345366104885514e-05, "loss": 1.0343, "step": 4126 }, { "epoch": 0.6358105506353485, "grad_norm": 1.0192413330078125, "learning_rate": 2.930130594798405e-05, "loss": 1.103, "step": 4128 }, { "epoch": 0.6361185983827493, "grad_norm": 0.7151640057563782, "learning_rate": 2.9257265179645764e-05, "loss": 1.0364, "step": 4130 }, { "epoch": 0.6364266461301502, "grad_norm": 1.1779733896255493, "learning_rate": 2.921324384112384e-05, "loss": 0.9568, "step": 4132 }, { "epoch": 0.636734693877551, "grad_norm": 0.9525445103645325, "learning_rate": 2.916924197365325e-05, "loss": 1.1174, "step": 4134 }, { "epoch": 0.6370427416249519, "grad_norm": 0.894616961479187, "learning_rate": 2.9125259618450768e-05, "loss": 1.0455, "step": 4136 }, { "epoch": 0.6373507893723527, "grad_norm": 0.9701914191246033, "learning_rate": 2.9081296816714864e-05, "loss": 0.9364, "step": 4138 }, { "epoch": 0.6376588371197536, "grad_norm": 0.7523389458656311, "learning_rate": 2.9037353609625695e-05, "loss": 0.8925, "step": 4140 }, { "epoch": 0.6379668848671544, "grad_norm": 0.6569038033485413, "learning_rate": 2.899343003834508e-05, "loss": 0.8933, "step": 4142 }, { "epoch": 0.6382749326145553, "grad_norm": 1.0741729736328125, "learning_rate": 2.894952614401642e-05, "loss": 0.9418, "step": 4144 }, { "epoch": 0.6385829803619562, "grad_norm": 0.872235894203186, "learning_rate": 2.8905641967764706e-05, "loss": 1.0024, "step": 4146 }, { "epoch": 0.6388910281093569, "grad_norm": 0.6244677901268005, "learning_rate": 2.8861777550696444e-05, "loss": 1.1566, "step": 4148 }, { "epoch": 0.6391990758567578, "grad_norm": 0.7942041754722595, "learning_rate": 2.8817932933899637e-05, "loss": 0.9897, "step": 4150 }, { "epoch": 0.6395071236041586, "grad_norm": 0.8112989664077759, "learning_rate": 2.877410815844376e-05, "loss": 0.9111, "step": 4152 }, { "epoch": 0.6398151713515595, "grad_norm": 0.6369266510009766, "learning_rate": 2.8730303265379654e-05, "loss": 0.9752, "step": 4154 }, { "epoch": 0.6401232190989603, "grad_norm": 0.8934615850448608, "learning_rate": 2.8686518295739595e-05, "loss": 0.887, "step": 4156 }, { "epoch": 0.6404312668463612, "grad_norm": 0.8280193209648132, "learning_rate": 2.864275329053715e-05, "loss": 2.0003, "step": 4158 }, { "epoch": 0.640739314593762, "grad_norm": 0.8070312738418579, "learning_rate": 2.8599008290767204e-05, "loss": 0.8533, "step": 4160 }, { "epoch": 0.6410473623411629, "grad_norm": 0.9471308588981628, "learning_rate": 2.8555283337405892e-05, "loss": 1.0428, "step": 4162 }, { "epoch": 0.6413554100885638, "grad_norm": 0.8286849856376648, "learning_rate": 2.8511578471410637e-05, "loss": 0.8051, "step": 4164 }, { "epoch": 0.6416634578359646, "grad_norm": 0.6716766953468323, "learning_rate": 2.846789373371993e-05, "loss": 1.174, "step": 4166 }, { "epoch": 0.6419715055833655, "grad_norm": 1.3844709396362305, "learning_rate": 2.842422916525349e-05, "loss": 0.9692, "step": 4168 }, { "epoch": 0.6422795533307663, "grad_norm": 0.569908857345581, "learning_rate": 2.8380584806912104e-05, "loss": 3.4552, "step": 4170 }, { "epoch": 0.6425876010781671, "grad_norm": 0.8689776062965393, "learning_rate": 2.8336960699577698e-05, "loss": 0.87, "step": 4172 }, { "epoch": 0.6428956488255679, "grad_norm": 0.8701666593551636, "learning_rate": 2.829335688411312e-05, "loss": 0.9399, "step": 4174 }, { "epoch": 0.6432036965729688, "grad_norm": 0.7730369567871094, "learning_rate": 2.8249773401362267e-05, "loss": 1.0089, "step": 4176 }, { "epoch": 0.6435117443203696, "grad_norm": 0.8679829239845276, "learning_rate": 2.820621029215003e-05, "loss": 0.8542, "step": 4178 }, { "epoch": 0.6438197920677705, "grad_norm": 0.6362239122390747, "learning_rate": 2.8162667597282176e-05, "loss": 1.0184, "step": 4180 }, { "epoch": 0.6441278398151713, "grad_norm": 0.887014627456665, "learning_rate": 2.8119145357545295e-05, "loss": 0.986, "step": 4182 }, { "epoch": 0.6444358875625722, "grad_norm": 0.6491660475730896, "learning_rate": 2.8075643613706938e-05, "loss": 0.9471, "step": 4184 }, { "epoch": 0.6447439353099731, "grad_norm": 0.9777161478996277, "learning_rate": 2.8032162406515372e-05, "loss": 1.2648, "step": 4186 }, { "epoch": 0.6450519830573739, "grad_norm": 0.8161064982414246, "learning_rate": 2.7988701776699612e-05, "loss": 0.8511, "step": 4188 }, { "epoch": 0.6453600308047748, "grad_norm": 0.8995189070701599, "learning_rate": 2.7945261764969442e-05, "loss": 0.9348, "step": 4190 }, { "epoch": 0.6456680785521756, "grad_norm": 0.7900372743606567, "learning_rate": 2.7901842412015355e-05, "loss": 1.3994, "step": 4192 }, { "epoch": 0.6459761262995765, "grad_norm": 0.9890312552452087, "learning_rate": 2.785844375850847e-05, "loss": 1.2731, "step": 4194 }, { "epoch": 0.6462841740469772, "grad_norm": 0.77409827709198, "learning_rate": 2.7815065845100436e-05, "loss": 0.8889, "step": 4196 }, { "epoch": 0.6465922217943781, "grad_norm": 0.5893883109092712, "learning_rate": 2.7771708712423615e-05, "loss": 0.8937, "step": 4198 }, { "epoch": 0.6469002695417789, "grad_norm": 0.7560586333274841, "learning_rate": 2.7728372401090806e-05, "loss": 0.9525, "step": 4200 }, { "epoch": 0.6469002695417789, "eval_loss": 2.357358932495117, "eval_runtime": 737.0158, "eval_samples_per_second": 2.714, "eval_steps_per_second": 0.678, "step": 4200 }, { "epoch": 0.6472083172891798, "grad_norm": 0.5760608911514282, "learning_rate": 2.7685056951695354e-05, "loss": 0.7893, "step": 4202 }, { "epoch": 0.6475163650365807, "grad_norm": 0.5902154445648193, "learning_rate": 2.764176240481102e-05, "loss": 1.0523, "step": 4204 }, { "epoch": 0.6478244127839815, "grad_norm": 1.1871650218963623, "learning_rate": 2.7598488800992018e-05, "loss": 1.0079, "step": 4206 }, { "epoch": 0.6481324605313824, "grad_norm": 1.000658631324768, "learning_rate": 2.7555236180772937e-05, "loss": 0.9369, "step": 4208 }, { "epoch": 0.6484405082787832, "grad_norm": 0.6931748986244202, "learning_rate": 2.7512004584668694e-05, "loss": 0.9145, "step": 4210 }, { "epoch": 0.6487485560261841, "grad_norm": 0.9558520317077637, "learning_rate": 2.7468794053174547e-05, "loss": 0.9502, "step": 4212 }, { "epoch": 0.6490566037735849, "grad_norm": 0.6980428099632263, "learning_rate": 2.7425604626765988e-05, "loss": 0.7277, "step": 4214 }, { "epoch": 0.6493646515209858, "grad_norm": 0.8523775935173035, "learning_rate": 2.7382436345898754e-05, "loss": 0.9375, "step": 4216 }, { "epoch": 0.6496726992683866, "grad_norm": 0.8459200263023376, "learning_rate": 2.7339289251008782e-05, "loss": 0.8459, "step": 4218 }, { "epoch": 0.6499807470157875, "grad_norm": 1.007840871810913, "learning_rate": 2.729616338251215e-05, "loss": 0.9507, "step": 4220 }, { "epoch": 0.6502887947631883, "grad_norm": 0.9047834277153015, "learning_rate": 2.7253058780805064e-05, "loss": 0.8881, "step": 4222 }, { "epoch": 0.6505968425105891, "grad_norm": 0.7920469641685486, "learning_rate": 2.7209975486263807e-05, "loss": 1.1688, "step": 4224 }, { "epoch": 0.65090489025799, "grad_norm": 0.6454511284828186, "learning_rate": 2.71669135392447e-05, "loss": 2.2615, "step": 4226 }, { "epoch": 0.6512129380053908, "grad_norm": 0.7499375343322754, "learning_rate": 2.7123872980084066e-05, "loss": 1.1278, "step": 4228 }, { "epoch": 0.6515209857527917, "grad_norm": 0.9970037937164307, "learning_rate": 2.7080853849098198e-05, "loss": 0.8617, "step": 4230 }, { "epoch": 0.6518290335001925, "grad_norm": 0.6305007934570312, "learning_rate": 2.703785618658332e-05, "loss": 0.7885, "step": 4232 }, { "epoch": 0.6521370812475934, "grad_norm": 0.6080676317214966, "learning_rate": 2.699488003281554e-05, "loss": 0.8886, "step": 4234 }, { "epoch": 0.6524451289949942, "grad_norm": 0.5880206227302551, "learning_rate": 2.6951925428050807e-05, "loss": 0.9416, "step": 4236 }, { "epoch": 0.6527531767423951, "grad_norm": 0.8096851110458374, "learning_rate": 2.6908992412524948e-05, "loss": 0.9169, "step": 4238 }, { "epoch": 0.6530612244897959, "grad_norm": 0.9902066588401794, "learning_rate": 2.686608102645347e-05, "loss": 1.0152, "step": 4240 }, { "epoch": 0.6533692722371968, "grad_norm": 0.7456113696098328, "learning_rate": 2.682319131003166e-05, "loss": 1.0577, "step": 4242 }, { "epoch": 0.6536773199845977, "grad_norm": 0.7236586809158325, "learning_rate": 2.6780323303434586e-05, "loss": 0.8771, "step": 4244 }, { "epoch": 0.6539853677319984, "grad_norm": 1.0047794580459595, "learning_rate": 2.673747704681684e-05, "loss": 1.178, "step": 4246 }, { "epoch": 0.6542934154793993, "grad_norm": 0.9127473831176758, "learning_rate": 2.669465258031273e-05, "loss": 1.2571, "step": 4248 }, { "epoch": 0.6546014632268001, "grad_norm": 1.1883794069290161, "learning_rate": 2.6651849944036118e-05, "loss": 1.1044, "step": 4250 }, { "epoch": 0.654909510974201, "grad_norm": 0.613161027431488, "learning_rate": 2.6609069178080486e-05, "loss": 0.7232, "step": 4252 }, { "epoch": 0.6552175587216018, "grad_norm": 0.7150982618331909, "learning_rate": 2.656631032251873e-05, "loss": 1.0834, "step": 4254 }, { "epoch": 0.6555256064690027, "grad_norm": 0.6977151036262512, "learning_rate": 2.6523573417403258e-05, "loss": 0.8445, "step": 4256 }, { "epoch": 0.6558336542164035, "grad_norm": 0.626096248626709, "learning_rate": 2.648085850276597e-05, "loss": 0.8876, "step": 4258 }, { "epoch": 0.6561417019638044, "grad_norm": 0.8646566867828369, "learning_rate": 2.6438165618618127e-05, "loss": 1.1347, "step": 4260 }, { "epoch": 0.6564497497112053, "grad_norm": 0.8802438378334045, "learning_rate": 2.6395494804950316e-05, "loss": 1.6441, "step": 4262 }, { "epoch": 0.6567577974586061, "grad_norm": 0.8071707487106323, "learning_rate": 2.6352846101732474e-05, "loss": 1.0018, "step": 4264 }, { "epoch": 0.657065845206007, "grad_norm": 0.8037858605384827, "learning_rate": 2.6310219548913917e-05, "loss": 0.962, "step": 4266 }, { "epoch": 0.6573738929534078, "grad_norm": 0.5976040959358215, "learning_rate": 2.6267615186423068e-05, "loss": 0.853, "step": 4268 }, { "epoch": 0.6576819407008087, "grad_norm": 1.2724300622940063, "learning_rate": 2.6225033054167626e-05, "loss": 1.9666, "step": 4270 }, { "epoch": 0.6579899884482094, "grad_norm": 0.8453805446624756, "learning_rate": 2.6182473192034524e-05, "loss": 1.0949, "step": 4272 }, { "epoch": 0.6582980361956103, "grad_norm": 0.9200473427772522, "learning_rate": 2.613993563988978e-05, "loss": 0.8304, "step": 4274 }, { "epoch": 0.6586060839430111, "grad_norm": 0.8198157548904419, "learning_rate": 2.6097420437578447e-05, "loss": 0.7959, "step": 4276 }, { "epoch": 0.658914131690412, "grad_norm": 0.6957106590270996, "learning_rate": 2.6054927624924785e-05, "loss": 0.7985, "step": 4278 }, { "epoch": 0.6592221794378129, "grad_norm": 0.6710758805274963, "learning_rate": 2.6012457241731986e-05, "loss": 0.9831, "step": 4280 }, { "epoch": 0.6595302271852137, "grad_norm": 1.1137224435806274, "learning_rate": 2.5970009327782274e-05, "loss": 1.033, "step": 4282 }, { "epoch": 0.6598382749326146, "grad_norm": 0.8313379287719727, "learning_rate": 2.592758392283675e-05, "loss": 0.9137, "step": 4284 }, { "epoch": 0.6601463226800154, "grad_norm": 0.7043817043304443, "learning_rate": 2.5885181066635545e-05, "loss": 1.1218, "step": 4286 }, { "epoch": 0.6604543704274163, "grad_norm": 0.8582731485366821, "learning_rate": 2.58428007988976e-05, "loss": 1.0096, "step": 4288 }, { "epoch": 0.6607624181748171, "grad_norm": 0.7957725524902344, "learning_rate": 2.5800443159320696e-05, "loss": 0.9304, "step": 4290 }, { "epoch": 0.661070465922218, "grad_norm": 0.9089280962944031, "learning_rate": 2.575810818758145e-05, "loss": 1.0874, "step": 4292 }, { "epoch": 0.6613785136696188, "grad_norm": 0.9526920318603516, "learning_rate": 2.5715795923335205e-05, "loss": 1.1148, "step": 4294 }, { "epoch": 0.6616865614170196, "grad_norm": 0.8966755867004395, "learning_rate": 2.5673506406216074e-05, "loss": 0.8414, "step": 4296 }, { "epoch": 0.6619946091644204, "grad_norm": 0.9842838644981384, "learning_rate": 2.5631239675836838e-05, "loss": 1.0398, "step": 4298 }, { "epoch": 0.6623026569118213, "grad_norm": 0.7387715578079224, "learning_rate": 2.5588995771788942e-05, "loss": 1.1329, "step": 4300 }, { "epoch": 0.6626107046592222, "grad_norm": 0.8699015974998474, "learning_rate": 2.5546774733642442e-05, "loss": 1.213, "step": 4302 }, { "epoch": 0.662918752406623, "grad_norm": 0.7089219689369202, "learning_rate": 2.5504576600945994e-05, "loss": 1.0877, "step": 4304 }, { "epoch": 0.6632268001540239, "grad_norm": 0.8745829463005066, "learning_rate": 2.5462401413226766e-05, "loss": 0.9235, "step": 4306 }, { "epoch": 0.6635348479014247, "grad_norm": 0.7542290091514587, "learning_rate": 2.542024920999047e-05, "loss": 1.0047, "step": 4308 }, { "epoch": 0.6638428956488256, "grad_norm": 0.7538411617279053, "learning_rate": 2.5378120030721263e-05, "loss": 1.0855, "step": 4310 }, { "epoch": 0.6641509433962264, "grad_norm": 0.7625778317451477, "learning_rate": 2.533601391488175e-05, "loss": 1.8407, "step": 4312 }, { "epoch": 0.6644589911436273, "grad_norm": 0.8806657791137695, "learning_rate": 2.529393090191292e-05, "loss": 1.089, "step": 4314 }, { "epoch": 0.6647670388910281, "grad_norm": 0.6660029888153076, "learning_rate": 2.5251871031234108e-05, "loss": 1.1389, "step": 4316 }, { "epoch": 0.665075086638429, "grad_norm": 0.8225176930427551, "learning_rate": 2.5209834342243042e-05, "loss": 0.7636, "step": 4318 }, { "epoch": 0.6653831343858299, "grad_norm": 0.6568019986152649, "learning_rate": 2.516782087431565e-05, "loss": 1.2996, "step": 4320 }, { "epoch": 0.6656911821332306, "grad_norm": 0.860137403011322, "learning_rate": 2.5125830666806137e-05, "loss": 0.9067, "step": 4322 }, { "epoch": 0.6659992298806315, "grad_norm": 0.7698874473571777, "learning_rate": 2.5083863759046943e-05, "loss": 1.3507, "step": 4324 }, { "epoch": 0.6663072776280323, "grad_norm": 0.8295789957046509, "learning_rate": 2.5041920190348655e-05, "loss": 1.0864, "step": 4326 }, { "epoch": 0.6666153253754332, "grad_norm": 0.7729510068893433, "learning_rate": 2.500000000000001e-05, "loss": 0.9469, "step": 4328 }, { "epoch": 0.666923373122834, "grad_norm": 0.859638512134552, "learning_rate": 2.4958103227267836e-05, "loss": 2.426, "step": 4330 }, { "epoch": 0.6672314208702349, "grad_norm": 0.67510586977005, "learning_rate": 2.4916229911397083e-05, "loss": 1.0348, "step": 4332 }, { "epoch": 0.6675394686176357, "grad_norm": 0.6896673440933228, "learning_rate": 2.4874380091610627e-05, "loss": 1.0989, "step": 4334 }, { "epoch": 0.6678475163650366, "grad_norm": 0.8158189058303833, "learning_rate": 2.4832553807109392e-05, "loss": 1.0081, "step": 4336 }, { "epoch": 0.6681555641124374, "grad_norm": 0.8096099495887756, "learning_rate": 2.479075109707229e-05, "loss": 0.8653, "step": 4338 }, { "epoch": 0.6684636118598383, "grad_norm": 0.8060958385467529, "learning_rate": 2.474897200065611e-05, "loss": 0.9234, "step": 4340 }, { "epoch": 0.6687716596072392, "grad_norm": 1.3544212579727173, "learning_rate": 2.47072165569955e-05, "loss": 1.6383, "step": 4342 }, { "epoch": 0.66907970735464, "grad_norm": 0.7652481198310852, "learning_rate": 2.466548480520296e-05, "loss": 0.9152, "step": 4344 }, { "epoch": 0.6693877551020408, "grad_norm": 1.2091152667999268, "learning_rate": 2.4623776784368868e-05, "loss": 2.234, "step": 4346 }, { "epoch": 0.6696958028494416, "grad_norm": 0.8425005078315735, "learning_rate": 2.4582092533561325e-05, "loss": 0.9222, "step": 4348 }, { "epoch": 0.6700038505968425, "grad_norm": 0.8576799631118774, "learning_rate": 2.4540432091826087e-05, "loss": 1.0903, "step": 4350 }, { "epoch": 0.6703118983442433, "grad_norm": 0.7403795719146729, "learning_rate": 2.449879549818676e-05, "loss": 0.8604, "step": 4352 }, { "epoch": 0.6706199460916442, "grad_norm": 0.845689058303833, "learning_rate": 2.445718279164453e-05, "loss": 1.425, "step": 4354 }, { "epoch": 0.670927993839045, "grad_norm": 0.9343613386154175, "learning_rate": 2.441559401117815e-05, "loss": 0.9845, "step": 4356 }, { "epoch": 0.6712360415864459, "grad_norm": 0.7838619351387024, "learning_rate": 2.4374029195744093e-05, "loss": 0.8254, "step": 4358 }, { "epoch": 0.6715440893338468, "grad_norm": 0.8020996451377869, "learning_rate": 2.433248838427628e-05, "loss": 0.9853, "step": 4360 }, { "epoch": 0.6718521370812476, "grad_norm": 0.9349923729896545, "learning_rate": 2.4290971615686215e-05, "loss": 0.9348, "step": 4362 }, { "epoch": 0.6721601848286485, "grad_norm": 0.5520917773246765, "learning_rate": 2.424947892886279e-05, "loss": 1.0036, "step": 4364 }, { "epoch": 0.6724682325760493, "grad_norm": 0.8372582793235779, "learning_rate": 2.4208010362672444e-05, "loss": 0.9508, "step": 4366 }, { "epoch": 0.6727762803234502, "grad_norm": 0.8463919162750244, "learning_rate": 2.4166565955958976e-05, "loss": 1.0411, "step": 4368 }, { "epoch": 0.673084328070851, "grad_norm": 0.9879681468009949, "learning_rate": 2.4125145747543537e-05, "loss": 0.9868, "step": 4370 }, { "epoch": 0.6733923758182518, "grad_norm": 0.8925761580467224, "learning_rate": 2.408374977622464e-05, "loss": 1.018, "step": 4372 }, { "epoch": 0.6737004235656526, "grad_norm": 0.9484021067619324, "learning_rate": 2.404237808077808e-05, "loss": 0.978, "step": 4374 }, { "epoch": 0.6740084713130535, "grad_norm": 0.8627564907073975, "learning_rate": 2.4001030699956916e-05, "loss": 0.8258, "step": 4376 }, { "epoch": 0.6743165190604544, "grad_norm": 0.8027501702308655, "learning_rate": 2.3959707672491437e-05, "loss": 0.9057, "step": 4378 }, { "epoch": 0.6746245668078552, "grad_norm": 0.9469018578529358, "learning_rate": 2.3918409037089112e-05, "loss": 1.0835, "step": 4380 }, { "epoch": 0.6749326145552561, "grad_norm": 0.8287208676338196, "learning_rate": 2.3877134832434567e-05, "loss": 0.882, "step": 4382 }, { "epoch": 0.6752406623026569, "grad_norm": 0.6991237998008728, "learning_rate": 2.3835885097189535e-05, "loss": 0.9511, "step": 4384 }, { "epoch": 0.6755487100500578, "grad_norm": 0.9674092531204224, "learning_rate": 2.3794659869992848e-05, "loss": 1.1107, "step": 4386 }, { "epoch": 0.6758567577974586, "grad_norm": 0.846439003944397, "learning_rate": 2.375345918946036e-05, "loss": 0.736, "step": 4388 }, { "epoch": 0.6761648055448595, "grad_norm": 0.6624884009361267, "learning_rate": 2.3712283094184934e-05, "loss": 1.1075, "step": 4390 }, { "epoch": 0.6764728532922603, "grad_norm": 0.5450026988983154, "learning_rate": 2.3671131622736427e-05, "loss": 0.7622, "step": 4392 }, { "epoch": 0.6767809010396612, "grad_norm": 0.9617735147476196, "learning_rate": 2.36300048136616e-05, "loss": 1.0834, "step": 4394 }, { "epoch": 0.6770889487870619, "grad_norm": 0.9630662798881531, "learning_rate": 2.358890270548413e-05, "loss": 2.0419, "step": 4396 }, { "epoch": 0.6773969965344628, "grad_norm": 0.7190920114517212, "learning_rate": 2.3547825336704555e-05, "loss": 1.226, "step": 4398 }, { "epoch": 0.6777050442818637, "grad_norm": 0.7295611500740051, "learning_rate": 2.3506772745800238e-05, "loss": 0.7479, "step": 4400 }, { "epoch": 0.6780130920292645, "grad_norm": 0.7448195219039917, "learning_rate": 2.3465744971225333e-05, "loss": 0.9424, "step": 4402 }, { "epoch": 0.6783211397766654, "grad_norm": 0.897513747215271, "learning_rate": 2.342474205141073e-05, "loss": 0.9566, "step": 4404 }, { "epoch": 0.6786291875240662, "grad_norm": 0.7958559393882751, "learning_rate": 2.3383764024764105e-05, "loss": 0.8318, "step": 4406 }, { "epoch": 0.6789372352714671, "grad_norm": 0.6669363379478455, "learning_rate": 2.3342810929669712e-05, "loss": 0.7176, "step": 4408 }, { "epoch": 0.6792452830188679, "grad_norm": 0.6453452110290527, "learning_rate": 2.330188280448851e-05, "loss": 1.4984, "step": 4410 }, { "epoch": 0.6795533307662688, "grad_norm": 0.7844343781471252, "learning_rate": 2.326097968755812e-05, "loss": 0.9918, "step": 4412 }, { "epoch": 0.6798613785136696, "grad_norm": 0.7631637454032898, "learning_rate": 2.322010161719263e-05, "loss": 2.3614, "step": 4414 }, { "epoch": 0.6801694262610705, "grad_norm": 0.7821967005729675, "learning_rate": 2.3179248631682726e-05, "loss": 1.7651, "step": 4416 }, { "epoch": 0.6804774740084714, "grad_norm": 0.754566490650177, "learning_rate": 2.3138420769295577e-05, "loss": 0.964, "step": 4418 }, { "epoch": 0.6807855217558721, "grad_norm": 0.737114667892456, "learning_rate": 2.309761806827489e-05, "loss": 0.7734, "step": 4420 }, { "epoch": 0.681093569503273, "grad_norm": 0.9395533800125122, "learning_rate": 2.3056840566840688e-05, "loss": 0.9731, "step": 4422 }, { "epoch": 0.6814016172506738, "grad_norm": 0.6122387051582336, "learning_rate": 2.301608830318945e-05, "loss": 0.8247, "step": 4424 }, { "epoch": 0.6817096649980747, "grad_norm": 0.6384727358818054, "learning_rate": 2.2975361315494037e-05, "loss": 0.7968, "step": 4426 }, { "epoch": 0.6820177127454755, "grad_norm": 0.8586402535438538, "learning_rate": 2.293465964190362e-05, "loss": 1.0213, "step": 4428 }, { "epoch": 0.6823257604928764, "grad_norm": 0.8942074775695801, "learning_rate": 2.2893983320543588e-05, "loss": 0.9089, "step": 4430 }, { "epoch": 0.6826338082402772, "grad_norm": 0.9709880352020264, "learning_rate": 2.2853332389515698e-05, "loss": 1.2399, "step": 4432 }, { "epoch": 0.6829418559876781, "grad_norm": 0.8261923789978027, "learning_rate": 2.281270688689784e-05, "loss": 0.8675, "step": 4434 }, { "epoch": 0.683249903735079, "grad_norm": 0.6313756704330444, "learning_rate": 2.2772106850744136e-05, "loss": 1.0011, "step": 4436 }, { "epoch": 0.6835579514824798, "grad_norm": 0.818619966506958, "learning_rate": 2.2731532319084774e-05, "loss": 0.7433, "step": 4438 }, { "epoch": 0.6838659992298807, "grad_norm": 0.8863322138786316, "learning_rate": 2.2690983329926157e-05, "loss": 0.9186, "step": 4440 }, { "epoch": 0.6841740469772815, "grad_norm": 1.1261119842529297, "learning_rate": 2.2650459921250723e-05, "loss": 1.0289, "step": 4442 }, { "epoch": 0.6844820947246824, "grad_norm": 0.790084958076477, "learning_rate": 2.2609962131016872e-05, "loss": 1.8737, "step": 4444 }, { "epoch": 0.6847901424720831, "grad_norm": 0.6505240797996521, "learning_rate": 2.2569489997159127e-05, "loss": 0.8129, "step": 4446 }, { "epoch": 0.685098190219484, "grad_norm": 0.7163221836090088, "learning_rate": 2.2529043557587913e-05, "loss": 1.1333, "step": 4448 }, { "epoch": 0.6854062379668848, "grad_norm": 0.7663728594779968, "learning_rate": 2.24886228501896e-05, "loss": 0.8635, "step": 4450 }, { "epoch": 0.6857142857142857, "grad_norm": 0.6637657880783081, "learning_rate": 2.244822791282645e-05, "loss": 0.7793, "step": 4452 }, { "epoch": 0.6860223334616865, "grad_norm": 0.8549447655677795, "learning_rate": 2.2407858783336576e-05, "loss": 0.7412, "step": 4454 }, { "epoch": 0.6863303812090874, "grad_norm": 0.7615169286727905, "learning_rate": 2.2367515499533954e-05, "loss": 1.2909, "step": 4456 }, { "epoch": 0.6866384289564883, "grad_norm": 1.0264763832092285, "learning_rate": 2.2327198099208307e-05, "loss": 2.6198, "step": 4458 }, { "epoch": 0.6869464767038891, "grad_norm": 1.1352583169937134, "learning_rate": 2.228690662012514e-05, "loss": 1.6127, "step": 4460 }, { "epoch": 0.68725452445129, "grad_norm": 0.912841260433197, "learning_rate": 2.2246641100025667e-05, "loss": 1.0174, "step": 4462 }, { "epoch": 0.6875625721986908, "grad_norm": 0.7334194779396057, "learning_rate": 2.220640157662679e-05, "loss": 2.2198, "step": 4464 }, { "epoch": 0.6878706199460917, "grad_norm": 0.5997360944747925, "learning_rate": 2.2166188087621054e-05, "loss": 0.8775, "step": 4466 }, { "epoch": 0.6881786676934925, "grad_norm": 0.8840118050575256, "learning_rate": 2.2126000670676627e-05, "loss": 0.7739, "step": 4468 }, { "epoch": 0.6884867154408933, "grad_norm": 0.9735852479934692, "learning_rate": 2.2085839363437244e-05, "loss": 0.8629, "step": 4470 }, { "epoch": 0.6887947631882941, "grad_norm": 0.5354695320129395, "learning_rate": 2.2045704203522192e-05, "loss": 1.7126, "step": 4472 }, { "epoch": 0.689102810935695, "grad_norm": 0.7044001817703247, "learning_rate": 2.2005595228526265e-05, "loss": 2.1124, "step": 4474 }, { "epoch": 0.6894108586830959, "grad_norm": 1.0412694215774536, "learning_rate": 2.1965512476019724e-05, "loss": 1.3892, "step": 4476 }, { "epoch": 0.6897189064304967, "grad_norm": 0.6908366680145264, "learning_rate": 2.1925455983548264e-05, "loss": 0.9544, "step": 4478 }, { "epoch": 0.6900269541778976, "grad_norm": 0.7264195680618286, "learning_rate": 2.1885425788633e-05, "loss": 1.0133, "step": 4480 }, { "epoch": 0.6903350019252984, "grad_norm": 0.9055928587913513, "learning_rate": 2.1845421928770393e-05, "loss": 0.9205, "step": 4482 }, { "epoch": 0.6906430496726993, "grad_norm": 0.5684531331062317, "learning_rate": 2.1805444441432234e-05, "loss": 0.8977, "step": 4484 }, { "epoch": 0.6909510974201001, "grad_norm": 0.9478229880332947, "learning_rate": 2.1765493364065665e-05, "loss": 0.9969, "step": 4486 }, { "epoch": 0.691259145167501, "grad_norm": 0.8816278576850891, "learning_rate": 2.1725568734093e-05, "loss": 0.9541, "step": 4488 }, { "epoch": 0.6915671929149018, "grad_norm": 0.7542193531990051, "learning_rate": 2.1685670588911843e-05, "loss": 0.9754, "step": 4490 }, { "epoch": 0.6918752406623027, "grad_norm": 0.7939161062240601, "learning_rate": 2.1645798965894953e-05, "loss": 1.3108, "step": 4492 }, { "epoch": 0.6921832884097036, "grad_norm": 0.9942681193351746, "learning_rate": 2.1605953902390326e-05, "loss": 1.076, "step": 4494 }, { "epoch": 0.6924913361571043, "grad_norm": 0.7257193326950073, "learning_rate": 2.1566135435720954e-05, "loss": 0.8966, "step": 4496 }, { "epoch": 0.6927993839045052, "grad_norm": 0.8092132806777954, "learning_rate": 2.1526343603184984e-05, "loss": 1.0302, "step": 4498 }, { "epoch": 0.693107431651906, "grad_norm": 0.7978146076202393, "learning_rate": 2.1486578442055672e-05, "loss": 0.8256, "step": 4500 }, { "epoch": 0.693107431651906, "eval_loss": 2.392524003982544, "eval_runtime": 736.2048, "eval_samples_per_second": 2.717, "eval_steps_per_second": 0.679, "step": 4500 }, { "epoch": 0.6934154793993069, "grad_norm": 0.7591894865036011, "learning_rate": 2.1446839989581165e-05, "loss": 1.0619, "step": 4502 }, { "epoch": 0.6937235271467077, "grad_norm": 0.8062347173690796, "learning_rate": 2.1407128282984662e-05, "loss": 0.8574, "step": 4504 }, { "epoch": 0.6940315748941086, "grad_norm": 0.843224823474884, "learning_rate": 2.136744335946434e-05, "loss": 1.0387, "step": 4506 }, { "epoch": 0.6943396226415094, "grad_norm": 0.7246967554092407, "learning_rate": 2.132778525619325e-05, "loss": 0.956, "step": 4508 }, { "epoch": 0.6946476703889103, "grad_norm": 0.8044782280921936, "learning_rate": 2.128815401031929e-05, "loss": 1.076, "step": 4510 }, { "epoch": 0.6949557181363111, "grad_norm": 0.8453338742256165, "learning_rate": 2.124854965896522e-05, "loss": 0.9274, "step": 4512 }, { "epoch": 0.695263765883712, "grad_norm": 0.7545700073242188, "learning_rate": 2.1208972239228674e-05, "loss": 0.9612, "step": 4514 }, { "epoch": 0.6955718136311129, "grad_norm": 0.7990420460700989, "learning_rate": 2.1169421788181998e-05, "loss": 1.0143, "step": 4516 }, { "epoch": 0.6958798613785137, "grad_norm": 0.6616374850273132, "learning_rate": 2.112989834287223e-05, "loss": 1.4766, "step": 4518 }, { "epoch": 0.6961879091259146, "grad_norm": 0.9233276844024658, "learning_rate": 2.1090401940321212e-05, "loss": 1.2026, "step": 4520 }, { "epoch": 0.6964959568733153, "grad_norm": 0.7011730074882507, "learning_rate": 2.1050932617525406e-05, "loss": 1.139, "step": 4522 }, { "epoch": 0.6968040046207162, "grad_norm": 1.0608782768249512, "learning_rate": 2.1011490411455893e-05, "loss": 0.9284, "step": 4524 }, { "epoch": 0.697112052368117, "grad_norm": 0.7545628547668457, "learning_rate": 2.0972075359058378e-05, "loss": 0.8357, "step": 4526 }, { "epoch": 0.6974201001155179, "grad_norm": 0.8437629342079163, "learning_rate": 2.0932687497253133e-05, "loss": 0.87, "step": 4528 }, { "epoch": 0.6977281478629187, "grad_norm": 0.836249828338623, "learning_rate": 2.0893326862934957e-05, "loss": 0.83, "step": 4530 }, { "epoch": 0.6980361956103196, "grad_norm": 0.8689998984336853, "learning_rate": 2.0853993492973102e-05, "loss": 1.1421, "step": 4532 }, { "epoch": 0.6983442433577205, "grad_norm": 0.8435835242271423, "learning_rate": 2.081468742421135e-05, "loss": 1.7831, "step": 4534 }, { "epoch": 0.6986522911051213, "grad_norm": 0.8725165128707886, "learning_rate": 2.077540869346788e-05, "loss": 0.9537, "step": 4536 }, { "epoch": 0.6989603388525222, "grad_norm": 0.9655625820159912, "learning_rate": 2.073615733753525e-05, "loss": 1.0886, "step": 4538 }, { "epoch": 0.699268386599923, "grad_norm": 0.9760432839393616, "learning_rate": 2.0696933393180397e-05, "loss": 1.1564, "step": 4540 }, { "epoch": 0.6995764343473239, "grad_norm": 0.8607039451599121, "learning_rate": 2.0657736897144564e-05, "loss": 0.8723, "step": 4542 }, { "epoch": 0.6998844820947246, "grad_norm": 0.7545386552810669, "learning_rate": 2.0618567886143297e-05, "loss": 2.6288, "step": 4544 }, { "epoch": 0.7001925298421255, "grad_norm": 0.6643165946006775, "learning_rate": 2.057942639686639e-05, "loss": 0.8196, "step": 4546 }, { "epoch": 0.7005005775895263, "grad_norm": 0.8222883939743042, "learning_rate": 2.0540312465977863e-05, "loss": 0.7392, "step": 4548 }, { "epoch": 0.7008086253369272, "grad_norm": 0.9492761492729187, "learning_rate": 2.050122613011591e-05, "loss": 1.0447, "step": 4550 }, { "epoch": 0.7011166730843281, "grad_norm": 0.6157514452934265, "learning_rate": 2.046216742589288e-05, "loss": 0.7543, "step": 4552 }, { "epoch": 0.7014247208317289, "grad_norm": 0.5949583053588867, "learning_rate": 2.042313638989526e-05, "loss": 1.6896, "step": 4554 }, { "epoch": 0.7017327685791298, "grad_norm": 0.8937816023826599, "learning_rate": 2.0384133058683585e-05, "loss": 1.0241, "step": 4556 }, { "epoch": 0.7020408163265306, "grad_norm": 0.8714761137962341, "learning_rate": 2.034515746879247e-05, "loss": 1.918, "step": 4558 }, { "epoch": 0.7023488640739315, "grad_norm": 0.6842667460441589, "learning_rate": 2.0306209656730523e-05, "loss": 0.8398, "step": 4560 }, { "epoch": 0.7026569118213323, "grad_norm": 0.8343141674995422, "learning_rate": 2.026728965898035e-05, "loss": 0.8353, "step": 4562 }, { "epoch": 0.7029649595687332, "grad_norm": 0.9512957334518433, "learning_rate": 2.0228397511998463e-05, "loss": 0.8459, "step": 4564 }, { "epoch": 0.703273007316134, "grad_norm": 0.8050121068954468, "learning_rate": 2.0189533252215387e-05, "loss": 0.9695, "step": 4566 }, { "epoch": 0.7035810550635349, "grad_norm": 0.784359335899353, "learning_rate": 2.0150696916035388e-05, "loss": 0.9633, "step": 4568 }, { "epoch": 0.7038891028109356, "grad_norm": 0.8005040884017944, "learning_rate": 2.011188853983667e-05, "loss": 0.8726, "step": 4570 }, { "epoch": 0.7041971505583365, "grad_norm": 0.8692052960395813, "learning_rate": 2.0073108159971193e-05, "loss": 2.3908, "step": 4572 }, { "epoch": 0.7045051983057374, "grad_norm": 0.8023272156715393, "learning_rate": 2.003435581276479e-05, "loss": 0.8949, "step": 4574 }, { "epoch": 0.7048132460531382, "grad_norm": 0.7615310549736023, "learning_rate": 1.999563153451689e-05, "loss": 1.2118, "step": 4576 }, { "epoch": 0.7051212938005391, "grad_norm": 1.0422707796096802, "learning_rate": 1.9956935361500717e-05, "loss": 1.0129, "step": 4578 }, { "epoch": 0.7054293415479399, "grad_norm": 0.907596230506897, "learning_rate": 1.991826732996319e-05, "loss": 0.9199, "step": 4580 }, { "epoch": 0.7057373892953408, "grad_norm": 1.3769018650054932, "learning_rate": 1.987962747612484e-05, "loss": 1.0924, "step": 4582 }, { "epoch": 0.7060454370427416, "grad_norm": 0.8509035110473633, "learning_rate": 1.9841015836179734e-05, "loss": 1.9078, "step": 4584 }, { "epoch": 0.7063534847901425, "grad_norm": 1.1849422454833984, "learning_rate": 1.980243244629564e-05, "loss": 0.9282, "step": 4586 }, { "epoch": 0.7066615325375433, "grad_norm": 1.1115151643753052, "learning_rate": 1.9763877342613785e-05, "loss": 0.9941, "step": 4588 }, { "epoch": 0.7069695802849442, "grad_norm": 0.684323251247406, "learning_rate": 1.972535056124889e-05, "loss": 2.0089, "step": 4590 }, { "epoch": 0.7072776280323451, "grad_norm": 0.6070319414138794, "learning_rate": 1.9686852138289162e-05, "loss": 0.8668, "step": 4592 }, { "epoch": 0.7075856757797458, "grad_norm": 0.5560675263404846, "learning_rate": 1.9648382109796304e-05, "loss": 0.9991, "step": 4594 }, { "epoch": 0.7078937235271467, "grad_norm": 1.0245397090911865, "learning_rate": 1.9609940511805353e-05, "loss": 1.0232, "step": 4596 }, { "epoch": 0.7082017712745475, "grad_norm": 1.1413053274154663, "learning_rate": 1.957152738032469e-05, "loss": 1.6201, "step": 4598 }, { "epoch": 0.7085098190219484, "grad_norm": 0.8900070190429688, "learning_rate": 1.9533142751336126e-05, "loss": 0.7824, "step": 4600 }, { "epoch": 0.7088178667693492, "grad_norm": 0.9296131730079651, "learning_rate": 1.9494786660794702e-05, "loss": 0.9638, "step": 4602 }, { "epoch": 0.7091259145167501, "grad_norm": 0.848101019859314, "learning_rate": 1.9456459144628765e-05, "loss": 1.0311, "step": 4604 }, { "epoch": 0.7094339622641509, "grad_norm": 0.7903195023536682, "learning_rate": 1.941816023873983e-05, "loss": 1.0481, "step": 4606 }, { "epoch": 0.7097420100115518, "grad_norm": 0.7192170023918152, "learning_rate": 1.9379889979002704e-05, "loss": 0.8418, "step": 4608 }, { "epoch": 0.7100500577589527, "grad_norm": 0.8225452303886414, "learning_rate": 1.9341648401265307e-05, "loss": 0.9304, "step": 4610 }, { "epoch": 0.7103581055063535, "grad_norm": 0.8312641382217407, "learning_rate": 1.9303435541348695e-05, "loss": 1.0101, "step": 4612 }, { "epoch": 0.7106661532537544, "grad_norm": 0.8498125076293945, "learning_rate": 1.9265251435047044e-05, "loss": 0.9582, "step": 4614 }, { "epoch": 0.7109742010011552, "grad_norm": 0.6809362173080444, "learning_rate": 1.922709611812758e-05, "loss": 1.0944, "step": 4616 }, { "epoch": 0.7112822487485561, "grad_norm": 0.7999347448348999, "learning_rate": 1.918896962633056e-05, "loss": 1.1626, "step": 4618 }, { "epoch": 0.7115902964959568, "grad_norm": 0.6605939865112305, "learning_rate": 1.915087199536925e-05, "loss": 1.0525, "step": 4620 }, { "epoch": 0.7118983442433577, "grad_norm": 0.6720040440559387, "learning_rate": 1.9112803260929884e-05, "loss": 1.1791, "step": 4622 }, { "epoch": 0.7122063919907585, "grad_norm": 0.9006309509277344, "learning_rate": 1.907476345867162e-05, "loss": 0.9723, "step": 4624 }, { "epoch": 0.7125144397381594, "grad_norm": 0.7742815613746643, "learning_rate": 1.9036752624226506e-05, "loss": 0.7847, "step": 4626 }, { "epoch": 0.7128224874855602, "grad_norm": 0.7233114838600159, "learning_rate": 1.899877079319949e-05, "loss": 2.2617, "step": 4628 }, { "epoch": 0.7131305352329611, "grad_norm": 0.6334021091461182, "learning_rate": 1.8960818001168308e-05, "loss": 0.9077, "step": 4630 }, { "epoch": 0.713438582980362, "grad_norm": 1.047119140625, "learning_rate": 1.8922894283683533e-05, "loss": 1.5837, "step": 4632 }, { "epoch": 0.7137466307277628, "grad_norm": 0.7595541477203369, "learning_rate": 1.8884999676268476e-05, "loss": 1.6919, "step": 4634 }, { "epoch": 0.7140546784751637, "grad_norm": 0.6998812556266785, "learning_rate": 1.8847134214419205e-05, "loss": 1.0882, "step": 4636 }, { "epoch": 0.7143627262225645, "grad_norm": 0.9410682916641235, "learning_rate": 1.8809297933604446e-05, "loss": 0.8349, "step": 4638 }, { "epoch": 0.7146707739699654, "grad_norm": 0.8228951692581177, "learning_rate": 1.8771490869265686e-05, "loss": 0.978, "step": 4640 }, { "epoch": 0.7149788217173662, "grad_norm": 0.7356462478637695, "learning_rate": 1.8733713056816905e-05, "loss": 1.5331, "step": 4642 }, { "epoch": 0.715286869464767, "grad_norm": 0.7669535279273987, "learning_rate": 1.869596453164479e-05, "loss": 0.9988, "step": 4644 }, { "epoch": 0.7155949172121678, "grad_norm": 0.7517581582069397, "learning_rate": 1.8658245329108553e-05, "loss": 0.8652, "step": 4646 }, { "epoch": 0.7159029649595687, "grad_norm": 0.8972408771514893, "learning_rate": 1.862055548453995e-05, "loss": 1.0223, "step": 4648 }, { "epoch": 0.7162110127069696, "grad_norm": 0.7768507599830627, "learning_rate": 1.8582895033243232e-05, "loss": 0.7865, "step": 4650 }, { "epoch": 0.7165190604543704, "grad_norm": 0.831098735332489, "learning_rate": 1.8545264010495106e-05, "loss": 1.0744, "step": 4652 }, { "epoch": 0.7168271082017713, "grad_norm": 0.7342033982276917, "learning_rate": 1.8507662451544772e-05, "loss": 0.9346, "step": 4654 }, { "epoch": 0.7171351559491721, "grad_norm": 0.8174619674682617, "learning_rate": 1.8470090391613737e-05, "loss": 0.8898, "step": 4656 }, { "epoch": 0.717443203696573, "grad_norm": 0.7861392498016357, "learning_rate": 1.8432547865895926e-05, "loss": 0.9919, "step": 4658 }, { "epoch": 0.7177512514439738, "grad_norm": 0.8522654175758362, "learning_rate": 1.839503490955763e-05, "loss": 0.9136, "step": 4660 }, { "epoch": 0.7180592991913747, "grad_norm": 1.045430302619934, "learning_rate": 1.8357551557737407e-05, "loss": 1.2186, "step": 4662 }, { "epoch": 0.7183673469387755, "grad_norm": 0.9019323587417603, "learning_rate": 1.8320097845546058e-05, "loss": 0.9851, "step": 4664 }, { "epoch": 0.7186753946861764, "grad_norm": 0.724032998085022, "learning_rate": 1.8282673808066653e-05, "loss": 2.2028, "step": 4666 }, { "epoch": 0.7189834424335773, "grad_norm": 0.8869044184684753, "learning_rate": 1.8245279480354504e-05, "loss": 0.9707, "step": 4668 }, { "epoch": 0.719291490180978, "grad_norm": 0.9228594303131104, "learning_rate": 1.8207914897437005e-05, "loss": 0.9238, "step": 4670 }, { "epoch": 0.7195995379283789, "grad_norm": 0.7796034216880798, "learning_rate": 1.8170580094313738e-05, "loss": 1.0092, "step": 4672 }, { "epoch": 0.7199075856757797, "grad_norm": 0.6384996175765991, "learning_rate": 1.813327510595642e-05, "loss": 1.9974, "step": 4674 }, { "epoch": 0.7202156334231806, "grad_norm": 0.6626761555671692, "learning_rate": 1.80959999673088e-05, "loss": 0.7973, "step": 4676 }, { "epoch": 0.7205236811705814, "grad_norm": 0.8023983240127563, "learning_rate": 1.8058754713286636e-05, "loss": 1.0762, "step": 4678 }, { "epoch": 0.7208317289179823, "grad_norm": 1.1368682384490967, "learning_rate": 1.802153937877777e-05, "loss": 0.8167, "step": 4680 }, { "epoch": 0.7211397766653831, "grad_norm": 1.2227729558944702, "learning_rate": 1.7984353998641973e-05, "loss": 1.1872, "step": 4682 }, { "epoch": 0.721447824412784, "grad_norm": 0.8481996059417725, "learning_rate": 1.794719860771097e-05, "loss": 0.9309, "step": 4684 }, { "epoch": 0.7217558721601848, "grad_norm": 0.708870530128479, "learning_rate": 1.7910073240788346e-05, "loss": 1.3142, "step": 4686 }, { "epoch": 0.7220639199075857, "grad_norm": 0.9485694169998169, "learning_rate": 1.787297793264965e-05, "loss": 0.9818, "step": 4688 }, { "epoch": 0.7223719676549866, "grad_norm": 0.9983515739440918, "learning_rate": 1.7835912718042212e-05, "loss": 0.934, "step": 4690 }, { "epoch": 0.7226800154023874, "grad_norm": 0.9369632005691528, "learning_rate": 1.7798877631685202e-05, "loss": 0.9286, "step": 4692 }, { "epoch": 0.7229880631497883, "grad_norm": 0.730244517326355, "learning_rate": 1.776187270826955e-05, "loss": 0.9192, "step": 4694 }, { "epoch": 0.723296110897189, "grad_norm": 0.5943278670310974, "learning_rate": 1.7724897982457946e-05, "loss": 0.8079, "step": 4696 }, { "epoch": 0.7236041586445899, "grad_norm": 0.88278728723526, "learning_rate": 1.7687953488884773e-05, "loss": 0.8272, "step": 4698 }, { "epoch": 0.7239122063919907, "grad_norm": 0.6160874962806702, "learning_rate": 1.7651039262156126e-05, "loss": 0.9233, "step": 4700 }, { "epoch": 0.7242202541393916, "grad_norm": 0.5485923886299133, "learning_rate": 1.761415533684973e-05, "loss": 0.6803, "step": 4702 }, { "epoch": 0.7245283018867924, "grad_norm": 0.7884142398834229, "learning_rate": 1.7577301747514922e-05, "loss": 0.9881, "step": 4704 }, { "epoch": 0.7248363496341933, "grad_norm": 0.7269513010978699, "learning_rate": 1.7540478528672645e-05, "loss": 0.909, "step": 4706 }, { "epoch": 0.7251443973815942, "grad_norm": 0.8242033123970032, "learning_rate": 1.750368571481536e-05, "loss": 0.7748, "step": 4708 }, { "epoch": 0.725452445128995, "grad_norm": 0.6146319508552551, "learning_rate": 1.7466923340407088e-05, "loss": 1.8569, "step": 4710 }, { "epoch": 0.7257604928763959, "grad_norm": 0.825018048286438, "learning_rate": 1.7430191439883298e-05, "loss": 0.9397, "step": 4712 }, { "epoch": 0.7260685406237967, "grad_norm": 1.081148624420166, "learning_rate": 1.7393490047650944e-05, "loss": 0.9502, "step": 4714 }, { "epoch": 0.7263765883711976, "grad_norm": 0.7197492718696594, "learning_rate": 1.735681919808839e-05, "loss": 0.9597, "step": 4716 }, { "epoch": 0.7266846361185983, "grad_norm": 0.6531423330307007, "learning_rate": 1.7320178925545387e-05, "loss": 0.7852, "step": 4718 }, { "epoch": 0.7269926838659992, "grad_norm": 0.6706268191337585, "learning_rate": 1.728356926434306e-05, "loss": 0.9987, "step": 4720 }, { "epoch": 0.7273007316134, "grad_norm": 0.7522306442260742, "learning_rate": 1.724699024877383e-05, "loss": 1.0304, "step": 4722 }, { "epoch": 0.7276087793608009, "grad_norm": 0.8848979473114014, "learning_rate": 1.721044191310145e-05, "loss": 0.9923, "step": 4724 }, { "epoch": 0.7279168271082018, "grad_norm": 0.7735690474510193, "learning_rate": 1.71739242915609e-05, "loss": 2.0829, "step": 4726 }, { "epoch": 0.7282248748556026, "grad_norm": 0.6652183532714844, "learning_rate": 1.713743741835842e-05, "loss": 1.0008, "step": 4728 }, { "epoch": 0.7285329226030035, "grad_norm": 0.9963399171829224, "learning_rate": 1.710098132767143e-05, "loss": 0.9404, "step": 4730 }, { "epoch": 0.7288409703504043, "grad_norm": 0.647809624671936, "learning_rate": 1.70645560536485e-05, "loss": 1.0353, "step": 4732 }, { "epoch": 0.7291490180978052, "grad_norm": 0.8186082243919373, "learning_rate": 1.7028161630409405e-05, "loss": 1.0937, "step": 4734 }, { "epoch": 0.729457065845206, "grad_norm": 0.7456851005554199, "learning_rate": 1.699179809204493e-05, "loss": 1.1491, "step": 4736 }, { "epoch": 0.7297651135926069, "grad_norm": 0.9909383058547974, "learning_rate": 1.6955465472616973e-05, "loss": 0.9543, "step": 4738 }, { "epoch": 0.7300731613400077, "grad_norm": 0.7538608908653259, "learning_rate": 1.6919163806158455e-05, "loss": 0.7093, "step": 4740 }, { "epoch": 0.7303812090874086, "grad_norm": 0.6593775153160095, "learning_rate": 1.6882893126673372e-05, "loss": 0.7995, "step": 4742 }, { "epoch": 0.7306892568348093, "grad_norm": 0.8437679409980774, "learning_rate": 1.6846653468136588e-05, "loss": 0.9412, "step": 4744 }, { "epoch": 0.7309973045822102, "grad_norm": 1.0077937841415405, "learning_rate": 1.681044486449395e-05, "loss": 0.8965, "step": 4746 }, { "epoch": 0.7313053523296111, "grad_norm": 1.157778024673462, "learning_rate": 1.6774267349662274e-05, "loss": 1.1927, "step": 4748 }, { "epoch": 0.7316134000770119, "grad_norm": 1.2045066356658936, "learning_rate": 1.6738120957529198e-05, "loss": 0.9724, "step": 4750 }, { "epoch": 0.7319214478244128, "grad_norm": 0.8141582608222961, "learning_rate": 1.670200572195316e-05, "loss": 1.0615, "step": 4752 }, { "epoch": 0.7322294955718136, "grad_norm": 0.6798232793807983, "learning_rate": 1.6665921676763536e-05, "loss": 1.8187, "step": 4754 }, { "epoch": 0.7325375433192145, "grad_norm": 0.8349108695983887, "learning_rate": 1.6629868855760406e-05, "loss": 1.0343, "step": 4756 }, { "epoch": 0.7328455910666153, "grad_norm": 0.6482349038124084, "learning_rate": 1.6593847292714582e-05, "loss": 0.9633, "step": 4758 }, { "epoch": 0.7331536388140162, "grad_norm": 0.6155962944030762, "learning_rate": 1.655785702136764e-05, "loss": 0.8955, "step": 4760 }, { "epoch": 0.733461686561417, "grad_norm": 1.1165225505828857, "learning_rate": 1.6521898075431858e-05, "loss": 1.091, "step": 4762 }, { "epoch": 0.7337697343088179, "grad_norm": 0.5706340670585632, "learning_rate": 1.648597048859015e-05, "loss": 0.8817, "step": 4764 }, { "epoch": 0.7340777820562188, "grad_norm": 0.8465478420257568, "learning_rate": 1.645007429449601e-05, "loss": 1.3064, "step": 4766 }, { "epoch": 0.7343858298036195, "grad_norm": 0.8700278997421265, "learning_rate": 1.6414209526773616e-05, "loss": 1.0423, "step": 4768 }, { "epoch": 0.7346938775510204, "grad_norm": 0.6647862792015076, "learning_rate": 1.6378376219017648e-05, "loss": 0.81, "step": 4770 }, { "epoch": 0.7350019252984212, "grad_norm": 0.5894257426261902, "learning_rate": 1.6342574404793326e-05, "loss": 1.8308, "step": 4772 }, { "epoch": 0.7353099730458221, "grad_norm": 0.8121969103813171, "learning_rate": 1.630680411763639e-05, "loss": 1.0954, "step": 4774 }, { "epoch": 0.7356180207932229, "grad_norm": 0.8044182658195496, "learning_rate": 1.6271065391053013e-05, "loss": 0.9867, "step": 4776 }, { "epoch": 0.7359260685406238, "grad_norm": 0.7796005606651306, "learning_rate": 1.623535825851985e-05, "loss": 0.9733, "step": 4778 }, { "epoch": 0.7362341162880246, "grad_norm": 0.6308910846710205, "learning_rate": 1.6199682753483926e-05, "loss": 1.0157, "step": 4780 }, { "epoch": 0.7365421640354255, "grad_norm": 0.8022951483726501, "learning_rate": 1.6164038909362656e-05, "loss": 1.0141, "step": 4782 }, { "epoch": 0.7368502117828264, "grad_norm": 0.6543718576431274, "learning_rate": 1.6128426759543792e-05, "loss": 1.0184, "step": 4784 }, { "epoch": 0.7371582595302272, "grad_norm": 0.6891601085662842, "learning_rate": 1.60928463373854e-05, "loss": 0.875, "step": 4786 }, { "epoch": 0.7374663072776281, "grad_norm": 0.9390774965286255, "learning_rate": 1.6057297676215832e-05, "loss": 1.0448, "step": 4788 }, { "epoch": 0.7377743550250289, "grad_norm": 0.9003983736038208, "learning_rate": 1.602178080933368e-05, "loss": 1.0639, "step": 4790 }, { "epoch": 0.7380824027724298, "grad_norm": 0.7824626564979553, "learning_rate": 1.5986295770007765e-05, "loss": 1.0726, "step": 4792 }, { "epoch": 0.7383904505198305, "grad_norm": 0.7979996800422668, "learning_rate": 1.5950842591477084e-05, "loss": 0.8944, "step": 4794 }, { "epoch": 0.7386984982672314, "grad_norm": 1.148526906967163, "learning_rate": 1.591542130695079e-05, "loss": 1.0637, "step": 4796 }, { "epoch": 0.7390065460146322, "grad_norm": 0.7048614025115967, "learning_rate": 1.588003194960817e-05, "loss": 0.7536, "step": 4798 }, { "epoch": 0.7393145937620331, "grad_norm": 0.6544722318649292, "learning_rate": 1.584467455259861e-05, "loss": 0.8258, "step": 4800 }, { "epoch": 0.7393145937620331, "eval_loss": 2.364302396774292, "eval_runtime": 737.0622, "eval_samples_per_second": 2.713, "eval_steps_per_second": 0.678, "step": 4800 }, { "epoch": 0.7396226415094339, "grad_norm": 0.7770758867263794, "learning_rate": 1.580934914904153e-05, "loss": 0.903, "step": 4802 }, { "epoch": 0.7399306892568348, "grad_norm": 0.7111145853996277, "learning_rate": 1.5774055772026407e-05, "loss": 0.8476, "step": 4804 }, { "epoch": 0.7402387370042357, "grad_norm": 0.6309933066368103, "learning_rate": 1.5738794454612703e-05, "loss": 0.8852, "step": 4806 }, { "epoch": 0.7405467847516365, "grad_norm": 0.6851494908332825, "learning_rate": 1.5703565229829902e-05, "loss": 0.9008, "step": 4808 }, { "epoch": 0.7408548324990374, "grad_norm": 0.9459416270256042, "learning_rate": 1.566836813067733e-05, "loss": 0.968, "step": 4810 }, { "epoch": 0.7411628802464382, "grad_norm": 0.8359102010726929, "learning_rate": 1.563320319012428e-05, "loss": 0.9852, "step": 4812 }, { "epoch": 0.7414709279938391, "grad_norm": 0.7026764750480652, "learning_rate": 1.5598070441109965e-05, "loss": 0.8269, "step": 4814 }, { "epoch": 0.7417789757412399, "grad_norm": 0.7479056715965271, "learning_rate": 1.5562969916543336e-05, "loss": 0.8847, "step": 4816 }, { "epoch": 0.7420870234886408, "grad_norm": 0.9053459167480469, "learning_rate": 1.552790164930324e-05, "loss": 0.7555, "step": 4818 }, { "epoch": 0.7423950712360415, "grad_norm": 0.7266227006912231, "learning_rate": 1.5492865672238276e-05, "loss": 1.1028, "step": 4820 }, { "epoch": 0.7427031189834424, "grad_norm": 0.8516717553138733, "learning_rate": 1.5457862018166847e-05, "loss": 1.2673, "step": 4822 }, { "epoch": 0.7430111667308433, "grad_norm": 0.6059942245483398, "learning_rate": 1.5422890719877e-05, "loss": 0.8081, "step": 4824 }, { "epoch": 0.7433192144782441, "grad_norm": 0.8077753186225891, "learning_rate": 1.53879518101265e-05, "loss": 0.9809, "step": 4826 }, { "epoch": 0.743627262225645, "grad_norm": 0.7996209859848022, "learning_rate": 1.535304532164283e-05, "loss": 0.8094, "step": 4828 }, { "epoch": 0.7439353099730458, "grad_norm": 0.7710662484169006, "learning_rate": 1.531817128712305e-05, "loss": 1.0587, "step": 4830 }, { "epoch": 0.7442433577204467, "grad_norm": 0.792667031288147, "learning_rate": 1.5283329739233808e-05, "loss": 0.9356, "step": 4832 }, { "epoch": 0.7445514054678475, "grad_norm": 0.5613619089126587, "learning_rate": 1.5248520710611347e-05, "loss": 0.8864, "step": 4834 }, { "epoch": 0.7448594532152484, "grad_norm": 0.9855058789253235, "learning_rate": 1.5213744233861465e-05, "loss": 0.9429, "step": 4836 }, { "epoch": 0.7451675009626492, "grad_norm": 0.7374126315116882, "learning_rate": 1.5179000341559463e-05, "loss": 0.8766, "step": 4838 }, { "epoch": 0.7454755487100501, "grad_norm": 0.9540923237800598, "learning_rate": 1.5144289066250045e-05, "loss": 0.9494, "step": 4840 }, { "epoch": 0.745783596457451, "grad_norm": 0.9879464507102966, "learning_rate": 1.5109610440447486e-05, "loss": 0.8697, "step": 4842 }, { "epoch": 0.7460916442048517, "grad_norm": 0.9068211913108826, "learning_rate": 1.5074964496635407e-05, "loss": 1.1991, "step": 4844 }, { "epoch": 0.7463996919522526, "grad_norm": 0.9240580797195435, "learning_rate": 1.5040351267266783e-05, "loss": 0.8796, "step": 4846 }, { "epoch": 0.7467077396996534, "grad_norm": 0.7835018038749695, "learning_rate": 1.5005770784764034e-05, "loss": 0.914, "step": 4848 }, { "epoch": 0.7470157874470543, "grad_norm": 0.9912784695625305, "learning_rate": 1.4971223081518837e-05, "loss": 1.184, "step": 4850 }, { "epoch": 0.7473238351944551, "grad_norm": 0.8975262641906738, "learning_rate": 1.4936708189892212e-05, "loss": 0.8886, "step": 4852 }, { "epoch": 0.747631882941856, "grad_norm": 0.7419629096984863, "learning_rate": 1.4902226142214366e-05, "loss": 1.5564, "step": 4854 }, { "epoch": 0.7479399306892568, "grad_norm": 0.6214430332183838, "learning_rate": 1.4867776970784836e-05, "loss": 0.9269, "step": 4856 }, { "epoch": 0.7482479784366577, "grad_norm": 0.6790211796760559, "learning_rate": 1.4833360707872319e-05, "loss": 0.9317, "step": 4858 }, { "epoch": 0.7485560261840585, "grad_norm": 0.8692981600761414, "learning_rate": 1.479897738571468e-05, "loss": 0.9814, "step": 4860 }, { "epoch": 0.7488640739314594, "grad_norm": 0.8824407458305359, "learning_rate": 1.4764627036518936e-05, "loss": 0.9181, "step": 4862 }, { "epoch": 0.7491721216788603, "grad_norm": 1.0911660194396973, "learning_rate": 1.473030969246122e-05, "loss": 0.9681, "step": 4864 }, { "epoch": 0.7494801694262611, "grad_norm": 0.7026270627975464, "learning_rate": 1.4696025385686752e-05, "loss": 0.9831, "step": 4866 }, { "epoch": 0.749788217173662, "grad_norm": 0.7922086119651794, "learning_rate": 1.4661774148309799e-05, "loss": 0.8616, "step": 4868 }, { "epoch": 0.7500962649210627, "grad_norm": 0.8266775012016296, "learning_rate": 1.462755601241365e-05, "loss": 0.8314, "step": 4870 }, { "epoch": 0.7504043126684636, "grad_norm": 0.9757092595100403, "learning_rate": 1.4593371010050606e-05, "loss": 1.2532, "step": 4872 }, { "epoch": 0.7507123604158644, "grad_norm": 0.6591874957084656, "learning_rate": 1.45592191732419e-05, "loss": 1.854, "step": 4874 }, { "epoch": 0.7510204081632653, "grad_norm": 0.7547690868377686, "learning_rate": 1.4525100533977731e-05, "loss": 0.9278, "step": 4876 }, { "epoch": 0.7513284559106661, "grad_norm": 0.8740079998970032, "learning_rate": 1.4491015124217184e-05, "loss": 1.0041, "step": 4878 }, { "epoch": 0.751636503658067, "grad_norm": 0.9281182885169983, "learning_rate": 1.4456962975888216e-05, "loss": 0.9211, "step": 4880 }, { "epoch": 0.7519445514054679, "grad_norm": 0.6817439198493958, "learning_rate": 1.4422944120887638e-05, "loss": 0.875, "step": 4882 }, { "epoch": 0.7522525991528687, "grad_norm": 0.749985933303833, "learning_rate": 1.438895859108107e-05, "loss": 0.9464, "step": 4884 }, { "epoch": 0.7525606469002696, "grad_norm": 0.9303197264671326, "learning_rate": 1.4355006418302896e-05, "loss": 1.1307, "step": 4886 }, { "epoch": 0.7528686946476704, "grad_norm": 0.6419119238853455, "learning_rate": 1.4321087634356329e-05, "loss": 0.8779, "step": 4888 }, { "epoch": 0.7531767423950713, "grad_norm": 0.8447157144546509, "learning_rate": 1.4287202271013196e-05, "loss": 0.7976, "step": 4890 }, { "epoch": 0.753484790142472, "grad_norm": 0.9394419193267822, "learning_rate": 1.4253350360014095e-05, "loss": 1.0733, "step": 4892 }, { "epoch": 0.753792837889873, "grad_norm": 0.9550667405128479, "learning_rate": 1.4219531933068259e-05, "loss": 1.4181, "step": 4894 }, { "epoch": 0.7541008856372737, "grad_norm": 0.79128497838974, "learning_rate": 1.4185747021853601e-05, "loss": 1.0558, "step": 4896 }, { "epoch": 0.7544089333846746, "grad_norm": 0.8191699981689453, "learning_rate": 1.4151995658016565e-05, "loss": 0.8652, "step": 4898 }, { "epoch": 0.7547169811320755, "grad_norm": 0.921259880065918, "learning_rate": 1.4118277873172208e-05, "loss": 1.017, "step": 4900 }, { "epoch": 0.7550250288794763, "grad_norm": 0.7527545094490051, "learning_rate": 1.4084593698904186e-05, "loss": 1.068, "step": 4902 }, { "epoch": 0.7553330766268772, "grad_norm": 0.7523015737533569, "learning_rate": 1.4050943166764569e-05, "loss": 0.9896, "step": 4904 }, { "epoch": 0.755641124374278, "grad_norm": 1.0874977111816406, "learning_rate": 1.4017326308273975e-05, "loss": 0.9821, "step": 4906 }, { "epoch": 0.7559491721216789, "grad_norm": 0.6856980323791504, "learning_rate": 1.3983743154921503e-05, "loss": 2.4016, "step": 4908 }, { "epoch": 0.7562572198690797, "grad_norm": 1.0397601127624512, "learning_rate": 1.3950193738164646e-05, "loss": 0.7798, "step": 4910 }, { "epoch": 0.7565652676164806, "grad_norm": 0.7780575156211853, "learning_rate": 1.3916678089429264e-05, "loss": 0.8409, "step": 4912 }, { "epoch": 0.7568733153638814, "grad_norm": 0.7275823354721069, "learning_rate": 1.3883196240109631e-05, "loss": 0.9809, "step": 4914 }, { "epoch": 0.7571813631112823, "grad_norm": 0.7291345596313477, "learning_rate": 1.3849748221568371e-05, "loss": 0.8507, "step": 4916 }, { "epoch": 0.757489410858683, "grad_norm": 0.8503779768943787, "learning_rate": 1.381633406513641e-05, "loss": 0.8459, "step": 4918 }, { "epoch": 0.7577974586060839, "grad_norm": 1.0080485343933105, "learning_rate": 1.378295380211289e-05, "loss": 0.82, "step": 4920 }, { "epoch": 0.7581055063534848, "grad_norm": 1.0013842582702637, "learning_rate": 1.3749607463765308e-05, "loss": 2.36, "step": 4922 }, { "epoch": 0.7584135541008856, "grad_norm": 0.6906487345695496, "learning_rate": 1.3716295081329316e-05, "loss": 0.9252, "step": 4924 }, { "epoch": 0.7587216018482865, "grad_norm": 0.819780170917511, "learning_rate": 1.3683016686008799e-05, "loss": 1.4594, "step": 4926 }, { "epoch": 0.7590296495956873, "grad_norm": 0.7055020928382874, "learning_rate": 1.3649772308975733e-05, "loss": 1.0051, "step": 4928 }, { "epoch": 0.7593376973430882, "grad_norm": 0.6653702855110168, "learning_rate": 1.3616561981370329e-05, "loss": 0.6855, "step": 4930 }, { "epoch": 0.759645745090489, "grad_norm": 0.7256321310997009, "learning_rate": 1.3583385734300858e-05, "loss": 1.0082, "step": 4932 }, { "epoch": 0.7599537928378899, "grad_norm": 0.7310523986816406, "learning_rate": 1.3550243598843615e-05, "loss": 0.9595, "step": 4934 }, { "epoch": 0.7602618405852907, "grad_norm": 0.8937221765518188, "learning_rate": 1.3517135606043047e-05, "loss": 1.0844, "step": 4936 }, { "epoch": 0.7605698883326916, "grad_norm": 0.6377236843109131, "learning_rate": 1.348406178691154e-05, "loss": 1.0395, "step": 4938 }, { "epoch": 0.7608779360800925, "grad_norm": 1.0823004245758057, "learning_rate": 1.3451022172429495e-05, "loss": 1.2769, "step": 4940 }, { "epoch": 0.7611859838274933, "grad_norm": 0.7745267152786255, "learning_rate": 1.341801679354528e-05, "loss": 0.7941, "step": 4942 }, { "epoch": 0.7614940315748941, "grad_norm": 0.5816026926040649, "learning_rate": 1.338504568117518e-05, "loss": 0.9029, "step": 4944 }, { "epoch": 0.7618020793222949, "grad_norm": 0.9577437043190002, "learning_rate": 1.3352108866203394e-05, "loss": 0.8861, "step": 4946 }, { "epoch": 0.7621101270696958, "grad_norm": 0.7536446452140808, "learning_rate": 1.3319206379481991e-05, "loss": 1.1366, "step": 4948 }, { "epoch": 0.7624181748170966, "grad_norm": 0.7559531927108765, "learning_rate": 1.3286338251830882e-05, "loss": 1.6357, "step": 4950 }, { "epoch": 0.7627262225644975, "grad_norm": 0.8585458993911743, "learning_rate": 1.3253504514037796e-05, "loss": 0.9577, "step": 4952 }, { "epoch": 0.7630342703118983, "grad_norm": 0.9138365387916565, "learning_rate": 1.3220705196858247e-05, "loss": 0.9647, "step": 4954 }, { "epoch": 0.7633423180592992, "grad_norm": 0.7139084339141846, "learning_rate": 1.3187940331015503e-05, "loss": 1.2702, "step": 4956 }, { "epoch": 0.7636503658067001, "grad_norm": 1.0149308443069458, "learning_rate": 1.3155209947200575e-05, "loss": 1.8406, "step": 4958 }, { "epoch": 0.7639584135541009, "grad_norm": 0.6941062808036804, "learning_rate": 1.3122514076072163e-05, "loss": 0.8376, "step": 4960 }, { "epoch": 0.7642664613015018, "grad_norm": 0.7763035297393799, "learning_rate": 1.3089852748256642e-05, "loss": 0.9862, "step": 4962 }, { "epoch": 0.7645745090489026, "grad_norm": 0.9273561239242554, "learning_rate": 1.3057225994348027e-05, "loss": 1.0633, "step": 4964 }, { "epoch": 0.7648825567963035, "grad_norm": 0.8585821390151978, "learning_rate": 1.3024633844907958e-05, "loss": 1.0902, "step": 4966 }, { "epoch": 0.7651906045437042, "grad_norm": 0.8621907234191895, "learning_rate": 1.2992076330465642e-05, "loss": 0.9008, "step": 4968 }, { "epoch": 0.7654986522911051, "grad_norm": 0.6912165880203247, "learning_rate": 1.2959553481517866e-05, "loss": 0.8129, "step": 4970 }, { "epoch": 0.7658067000385059, "grad_norm": 0.8399752378463745, "learning_rate": 1.2927065328528926e-05, "loss": 1.6187, "step": 4972 }, { "epoch": 0.7661147477859068, "grad_norm": 0.8446044921875, "learning_rate": 1.2894611901930615e-05, "loss": 0.8109, "step": 4974 }, { "epoch": 0.7664227955333076, "grad_norm": 0.8228721022605896, "learning_rate": 1.286219323212225e-05, "loss": 0.9516, "step": 4976 }, { "epoch": 0.7667308432807085, "grad_norm": 0.9982367157936096, "learning_rate": 1.2829809349470512e-05, "loss": 1.016, "step": 4978 }, { "epoch": 0.7670388910281094, "grad_norm": 0.764540433883667, "learning_rate": 1.2797460284309532e-05, "loss": 0.7849, "step": 4980 }, { "epoch": 0.7673469387755102, "grad_norm": 0.718386173248291, "learning_rate": 1.2765146066940853e-05, "loss": 1.0434, "step": 4982 }, { "epoch": 0.7676549865229111, "grad_norm": 0.8509320616722107, "learning_rate": 1.2732866727633364e-05, "loss": 1.1764, "step": 4984 }, { "epoch": 0.7679630342703119, "grad_norm": 0.9963598251342773, "learning_rate": 1.2700622296623239e-05, "loss": 1.0726, "step": 4986 }, { "epoch": 0.7682710820177128, "grad_norm": 1.1380006074905396, "learning_rate": 1.2668412804113983e-05, "loss": 1.0589, "step": 4988 }, { "epoch": 0.7685791297651136, "grad_norm": 0.7528254985809326, "learning_rate": 1.263623828027642e-05, "loss": 0.8949, "step": 4990 }, { "epoch": 0.7688871775125145, "grad_norm": 0.8158612251281738, "learning_rate": 1.260409875524854e-05, "loss": 0.8413, "step": 4992 }, { "epoch": 0.7691952252599152, "grad_norm": 0.8937954306602478, "learning_rate": 1.2571994259135583e-05, "loss": 0.95, "step": 4994 }, { "epoch": 0.7695032730073161, "grad_norm": 0.8776180148124695, "learning_rate": 1.2539924822010007e-05, "loss": 0.9603, "step": 4996 }, { "epoch": 0.769811320754717, "grad_norm": 0.6999343633651733, "learning_rate": 1.250789047391141e-05, "loss": 0.8799, "step": 4998 }, { "epoch": 0.7701193685021178, "grad_norm": 0.9434583187103271, "learning_rate": 1.247589124484646e-05, "loss": 0.8455, "step": 5000 }, { "epoch": 0.7704274162495187, "grad_norm": 1.0791411399841309, "learning_rate": 1.2443927164789037e-05, "loss": 1.0607, "step": 5002 }, { "epoch": 0.7707354639969195, "grad_norm": 0.6646117568016052, "learning_rate": 1.241199826368003e-05, "loss": 1.2331, "step": 5004 }, { "epoch": 0.7710435117443204, "grad_norm": 0.898343026638031, "learning_rate": 1.2380104571427398e-05, "loss": 1.0122, "step": 5006 }, { "epoch": 0.7713515594917212, "grad_norm": 0.7997497320175171, "learning_rate": 1.2348246117906065e-05, "loss": 1.6624, "step": 5008 }, { "epoch": 0.7716596072391221, "grad_norm": 0.9120794534683228, "learning_rate": 1.2316422932958044e-05, "loss": 1.2284, "step": 5010 }, { "epoch": 0.7719676549865229, "grad_norm": 0.7209826707839966, "learning_rate": 1.2284635046392245e-05, "loss": 0.8711, "step": 5012 }, { "epoch": 0.7722757027339238, "grad_norm": 1.0334336757659912, "learning_rate": 1.2252882487984529e-05, "loss": 0.9622, "step": 5014 }, { "epoch": 0.7725837504813247, "grad_norm": 0.8697283864021301, "learning_rate": 1.222116528747766e-05, "loss": 0.8894, "step": 5016 }, { "epoch": 0.7728917982287254, "grad_norm": 1.0194828510284424, "learning_rate": 1.2189483474581292e-05, "loss": 1.0679, "step": 5018 }, { "epoch": 0.7731998459761263, "grad_norm": 0.833803117275238, "learning_rate": 1.2157837078971928e-05, "loss": 0.8022, "step": 5020 }, { "epoch": 0.7735078937235271, "grad_norm": 0.8861052393913269, "learning_rate": 1.2126226130292895e-05, "loss": 0.841, "step": 5022 }, { "epoch": 0.773815941470928, "grad_norm": 0.9703818559646606, "learning_rate": 1.209465065815431e-05, "loss": 1.0202, "step": 5024 }, { "epoch": 0.7741239892183288, "grad_norm": 0.5357424020767212, "learning_rate": 1.2063110692133068e-05, "loss": 0.9335, "step": 5026 }, { "epoch": 0.7744320369657297, "grad_norm": 1.0410465002059937, "learning_rate": 1.2031606261772805e-05, "loss": 0.8673, "step": 5028 }, { "epoch": 0.7747400847131305, "grad_norm": 0.9500507712364197, "learning_rate": 1.200013739658386e-05, "loss": 0.9811, "step": 5030 }, { "epoch": 0.7750481324605314, "grad_norm": 0.7785813212394714, "learning_rate": 1.1968704126043279e-05, "loss": 0.9822, "step": 5032 }, { "epoch": 0.7753561802079322, "grad_norm": 0.7565167546272278, "learning_rate": 1.193730647959474e-05, "loss": 0.7924, "step": 5034 }, { "epoch": 0.7756642279553331, "grad_norm": 0.9191146492958069, "learning_rate": 1.1905944486648568e-05, "loss": 1.0759, "step": 5036 }, { "epoch": 0.775972275702734, "grad_norm": 0.9619866013526917, "learning_rate": 1.1874618176581693e-05, "loss": 0.9241, "step": 5038 }, { "epoch": 0.7762803234501348, "grad_norm": 0.9265114665031433, "learning_rate": 1.1843327578737612e-05, "loss": 0.7815, "step": 5040 }, { "epoch": 0.7765883711975357, "grad_norm": 1.0196962356567383, "learning_rate": 1.1812072722426371e-05, "loss": 1.0626, "step": 5042 }, { "epoch": 0.7768964189449364, "grad_norm": 0.9532772302627563, "learning_rate": 1.1780853636924543e-05, "loss": 0.8243, "step": 5044 }, { "epoch": 0.7772044666923373, "grad_norm": 1.0061920881271362, "learning_rate": 1.1749670351475195e-05, "loss": 1.7845, "step": 5046 }, { "epoch": 0.7775125144397381, "grad_norm": 0.7302465438842773, "learning_rate": 1.1718522895287848e-05, "loss": 0.8505, "step": 5048 }, { "epoch": 0.777820562187139, "grad_norm": 0.8662822842597961, "learning_rate": 1.1687411297538469e-05, "loss": 0.9611, "step": 5050 }, { "epoch": 0.7781286099345398, "grad_norm": 0.704204261302948, "learning_rate": 1.1656335587369444e-05, "loss": 0.9925, "step": 5052 }, { "epoch": 0.7784366576819407, "grad_norm": 0.6102928519248962, "learning_rate": 1.1625295793889512e-05, "loss": 1.2709, "step": 5054 }, { "epoch": 0.7787447054293416, "grad_norm": 0.7031947374343872, "learning_rate": 1.1594291946173846e-05, "loss": 2.1372, "step": 5056 }, { "epoch": 0.7790527531767424, "grad_norm": 1.1103585958480835, "learning_rate": 1.1563324073263843e-05, "loss": 0.9965, "step": 5058 }, { "epoch": 0.7793608009241433, "grad_norm": 0.8092791438102722, "learning_rate": 1.1532392204167275e-05, "loss": 1.0353, "step": 5060 }, { "epoch": 0.7796688486715441, "grad_norm": 0.9561004042625427, "learning_rate": 1.1501496367858144e-05, "loss": 1.0016, "step": 5062 }, { "epoch": 0.779976896418945, "grad_norm": 0.6417694091796875, "learning_rate": 1.1470636593276779e-05, "loss": 0.8021, "step": 5064 }, { "epoch": 0.7802849441663458, "grad_norm": 0.8332743644714355, "learning_rate": 1.1439812909329616e-05, "loss": 1.1154, "step": 5066 }, { "epoch": 0.7805929919137466, "grad_norm": 0.95404452085495, "learning_rate": 1.1409025344889362e-05, "loss": 1.0442, "step": 5068 }, { "epoch": 0.7809010396611474, "grad_norm": 0.6904376149177551, "learning_rate": 1.1378273928794886e-05, "loss": 0.9646, "step": 5070 }, { "epoch": 0.7812090874085483, "grad_norm": 1.0550509691238403, "learning_rate": 1.134755868985119e-05, "loss": 0.8689, "step": 5072 }, { "epoch": 0.7815171351559492, "grad_norm": 0.7711645364761353, "learning_rate": 1.1316879656829338e-05, "loss": 0.985, "step": 5074 }, { "epoch": 0.78182518290335, "grad_norm": 1.0338762998580933, "learning_rate": 1.128623685846656e-05, "loss": 1.0949, "step": 5076 }, { "epoch": 0.7821332306507509, "grad_norm": 0.6583460569381714, "learning_rate": 1.1255630323466116e-05, "loss": 0.8089, "step": 5078 }, { "epoch": 0.7824412783981517, "grad_norm": 0.9409406185150146, "learning_rate": 1.1225060080497257e-05, "loss": 0.9767, "step": 5080 }, { "epoch": 0.7827493261455526, "grad_norm": 0.9006635546684265, "learning_rate": 1.1194526158195274e-05, "loss": 0.7231, "step": 5082 }, { "epoch": 0.7830573738929534, "grad_norm": 1.1432090997695923, "learning_rate": 1.1164028585161456e-05, "loss": 0.9457, "step": 5084 }, { "epoch": 0.7833654216403543, "grad_norm": 0.4902731478214264, "learning_rate": 1.1133567389963035e-05, "loss": 0.6747, "step": 5086 }, { "epoch": 0.7836734693877551, "grad_norm": 1.0363097190856934, "learning_rate": 1.1103142601133098e-05, "loss": 0.955, "step": 5088 }, { "epoch": 0.783981517135156, "grad_norm": 0.7746341228485107, "learning_rate": 1.107275424717074e-05, "loss": 0.7925, "step": 5090 }, { "epoch": 0.7842895648825567, "grad_norm": 0.8868885040283203, "learning_rate": 1.1042402356540853e-05, "loss": 0.8253, "step": 5092 }, { "epoch": 0.7845976126299576, "grad_norm": 1.0761430263519287, "learning_rate": 1.1012086957674194e-05, "loss": 1.0306, "step": 5094 }, { "epoch": 0.7849056603773585, "grad_norm": 0.7536218762397766, "learning_rate": 1.0981808078967348e-05, "loss": 0.9471, "step": 5096 }, { "epoch": 0.7852137081247593, "grad_norm": 0.8555169105529785, "learning_rate": 1.0951565748782666e-05, "loss": 2.3108, "step": 5098 }, { "epoch": 0.7855217558721602, "grad_norm": 0.698821485042572, "learning_rate": 1.092135999544831e-05, "loss": 1.2321, "step": 5100 }, { "epoch": 0.7855217558721602, "eval_loss": 2.402956485748291, "eval_runtime": 737.0859, "eval_samples_per_second": 2.713, "eval_steps_per_second": 0.678, "step": 5100 }, { "epoch": 0.785829803619561, "grad_norm": 0.6485735774040222, "learning_rate": 1.0891190847258093e-05, "loss": 0.7757, "step": 5102 }, { "epoch": 0.7861378513669619, "grad_norm": 0.811083972454071, "learning_rate": 1.0861058332471652e-05, "loss": 1.1812, "step": 5104 }, { "epoch": 0.7864458991143627, "grad_norm": 0.7928170561790466, "learning_rate": 1.0830962479314226e-05, "loss": 0.8711, "step": 5106 }, { "epoch": 0.7867539468617636, "grad_norm": 0.771323025226593, "learning_rate": 1.0800903315976757e-05, "loss": 0.9764, "step": 5108 }, { "epoch": 0.7870619946091644, "grad_norm": 0.9954501390457153, "learning_rate": 1.077088087061579e-05, "loss": 0.9129, "step": 5110 }, { "epoch": 0.7873700423565653, "grad_norm": 0.6200007796287537, "learning_rate": 1.0740895171353493e-05, "loss": 1.8826, "step": 5112 }, { "epoch": 0.7876780901039662, "grad_norm": 1.0519543886184692, "learning_rate": 1.0710946246277615e-05, "loss": 0.9654, "step": 5114 }, { "epoch": 0.787986137851367, "grad_norm": 0.6422151327133179, "learning_rate": 1.0681034123441447e-05, "loss": 1.0095, "step": 5116 }, { "epoch": 0.7882941855987678, "grad_norm": 0.7397940158843994, "learning_rate": 1.0651158830863816e-05, "loss": 0.8926, "step": 5118 }, { "epoch": 0.7886022333461686, "grad_norm": 0.7962545156478882, "learning_rate": 1.0621320396529056e-05, "loss": 0.904, "step": 5120 }, { "epoch": 0.7889102810935695, "grad_norm": 0.7353315949440002, "learning_rate": 1.0591518848386956e-05, "loss": 0.8093, "step": 5122 }, { "epoch": 0.7892183288409703, "grad_norm": 0.8774969577789307, "learning_rate": 1.0561754214352765e-05, "loss": 1.0146, "step": 5124 }, { "epoch": 0.7895263765883712, "grad_norm": 0.759262204170227, "learning_rate": 1.0532026522307164e-05, "loss": 1.0412, "step": 5126 }, { "epoch": 0.789834424335772, "grad_norm": 0.972540020942688, "learning_rate": 1.0502335800096214e-05, "loss": 1.0528, "step": 5128 }, { "epoch": 0.7901424720831729, "grad_norm": 0.7988440990447998, "learning_rate": 1.0472682075531354e-05, "loss": 0.9716, "step": 5130 }, { "epoch": 0.7904505198305737, "grad_norm": 0.6502975821495056, "learning_rate": 1.0443065376389366e-05, "loss": 0.9217, "step": 5132 }, { "epoch": 0.7907585675779746, "grad_norm": 0.9882317185401917, "learning_rate": 1.0413485730412337e-05, "loss": 1.5967, "step": 5134 }, { "epoch": 0.7910666153253755, "grad_norm": 0.7147364020347595, "learning_rate": 1.0383943165307697e-05, "loss": 1.0296, "step": 5136 }, { "epoch": 0.7913746630727763, "grad_norm": 0.9587258100509644, "learning_rate": 1.0354437708748071e-05, "loss": 1.0206, "step": 5138 }, { "epoch": 0.7916827108201772, "grad_norm": 0.8416325449943542, "learning_rate": 1.0324969388371364e-05, "loss": 0.9132, "step": 5140 }, { "epoch": 0.791990758567578, "grad_norm": 0.7498589754104614, "learning_rate": 1.0295538231780677e-05, "loss": 2.1829, "step": 5142 }, { "epoch": 0.7922988063149788, "grad_norm": 0.7145204544067383, "learning_rate": 1.0266144266544353e-05, "loss": 0.8647, "step": 5144 }, { "epoch": 0.7926068540623796, "grad_norm": 0.7027596831321716, "learning_rate": 1.0236787520195812e-05, "loss": 1.0984, "step": 5146 }, { "epoch": 0.7929149018097805, "grad_norm": 0.6741682291030884, "learning_rate": 1.0207468020233663e-05, "loss": 0.9404, "step": 5148 }, { "epoch": 0.7932229495571813, "grad_norm": 1.5651164054870605, "learning_rate": 1.0178185794121643e-05, "loss": 0.8988, "step": 5150 }, { "epoch": 0.7935309973045822, "grad_norm": 0.9429001212120056, "learning_rate": 1.0148940869288543e-05, "loss": 0.883, "step": 5152 }, { "epoch": 0.7938390450519831, "grad_norm": 0.6833556294441223, "learning_rate": 1.01197332731282e-05, "loss": 1.4627, "step": 5154 }, { "epoch": 0.7941470927993839, "grad_norm": 0.6978062391281128, "learning_rate": 1.0090563032999506e-05, "loss": 2.135, "step": 5156 }, { "epoch": 0.7944551405467848, "grad_norm": 1.2680683135986328, "learning_rate": 1.0061430176226394e-05, "loss": 1.1484, "step": 5158 }, { "epoch": 0.7947631882941856, "grad_norm": 0.7495437264442444, "learning_rate": 1.0032334730097715e-05, "loss": 0.9135, "step": 5160 }, { "epoch": 0.7950712360415865, "grad_norm": 1.005279541015625, "learning_rate": 1.00032767218673e-05, "loss": 1.0571, "step": 5162 }, { "epoch": 0.7953792837889873, "grad_norm": 0.7408681511878967, "learning_rate": 9.974256178753954e-06, "loss": 0.8986, "step": 5164 }, { "epoch": 0.7956873315363882, "grad_norm": 0.7218614816665649, "learning_rate": 9.945273127941358e-06, "loss": 0.8026, "step": 5166 }, { "epoch": 0.7959953792837889, "grad_norm": 0.8716273903846741, "learning_rate": 9.916327596578018e-06, "loss": 1.7818, "step": 5168 }, { "epoch": 0.7963034270311898, "grad_norm": 0.7888768315315247, "learning_rate": 9.887419611777405e-06, "loss": 1.3094, "step": 5170 }, { "epoch": 0.7966114747785907, "grad_norm": 0.8038328289985657, "learning_rate": 9.858549200617734e-06, "loss": 0.8248, "step": 5172 }, { "epoch": 0.7969195225259915, "grad_norm": 0.8979176878929138, "learning_rate": 9.829716390142073e-06, "loss": 1.0524, "step": 5174 }, { "epoch": 0.7972275702733924, "grad_norm": 1.2263667583465576, "learning_rate": 9.800921207358216e-06, "loss": 0.9444, "step": 5176 }, { "epoch": 0.7975356180207932, "grad_norm": 0.8946297764778137, "learning_rate": 9.772163679238778e-06, "loss": 2.0287, "step": 5178 }, { "epoch": 0.7978436657681941, "grad_norm": 0.8920976519584656, "learning_rate": 9.743443832721055e-06, "loss": 0.7952, "step": 5180 }, { "epoch": 0.7981517135155949, "grad_norm": 1.0788079500198364, "learning_rate": 9.714761694707069e-06, "loss": 1.0469, "step": 5182 }, { "epoch": 0.7984597612629958, "grad_norm": 1.0200272798538208, "learning_rate": 9.686117292063501e-06, "loss": 0.8722, "step": 5184 }, { "epoch": 0.7987678090103966, "grad_norm": 1.0531045198440552, "learning_rate": 9.657510651621709e-06, "loss": 0.9411, "step": 5186 }, { "epoch": 0.7990758567577975, "grad_norm": 0.6479781270027161, "learning_rate": 9.628941800177654e-06, "loss": 0.916, "step": 5188 }, { "epoch": 0.7993839045051983, "grad_norm": 0.8514047265052795, "learning_rate": 9.600410764491924e-06, "loss": 1.149, "step": 5190 }, { "epoch": 0.7996919522525991, "grad_norm": 0.7890281081199646, "learning_rate": 9.571917571289662e-06, "loss": 0.8978, "step": 5192 }, { "epoch": 0.8, "grad_norm": 0.9271743297576904, "learning_rate": 9.543462247260586e-06, "loss": 1.556, "step": 5194 }, { "epoch": 0.8003080477474008, "grad_norm": 0.7708587646484375, "learning_rate": 9.515044819058922e-06, "loss": 1.6046, "step": 5196 }, { "epoch": 0.8006160954948017, "grad_norm": 0.8407145142555237, "learning_rate": 9.48666531330341e-06, "loss": 0.825, "step": 5198 }, { "epoch": 0.8009241432422025, "grad_norm": 0.6378217339515686, "learning_rate": 9.458323756577264e-06, "loss": 0.691, "step": 5200 }, { "epoch": 0.8012321909896034, "grad_norm": 0.6951934099197388, "learning_rate": 9.430020175428156e-06, "loss": 0.8766, "step": 5202 }, { "epoch": 0.8015402387370042, "grad_norm": 0.9855650067329407, "learning_rate": 9.40175459636818e-06, "loss": 0.9587, "step": 5204 }, { "epoch": 0.8018482864844051, "grad_norm": 0.6208633184432983, "learning_rate": 9.37352704587383e-06, "loss": 2.1109, "step": 5206 }, { "epoch": 0.8021563342318059, "grad_norm": 0.7674615383148193, "learning_rate": 9.345337550385985e-06, "loss": 0.8624, "step": 5208 }, { "epoch": 0.8024643819792068, "grad_norm": 0.8252739906311035, "learning_rate": 9.317186136309901e-06, "loss": 0.9765, "step": 5210 }, { "epoch": 0.8027724297266077, "grad_norm": 0.7470067143440247, "learning_rate": 9.28907283001511e-06, "loss": 1.4624, "step": 5212 }, { "epoch": 0.8030804774740085, "grad_norm": 0.634072482585907, "learning_rate": 9.260997657835486e-06, "loss": 0.6964, "step": 5214 }, { "epoch": 0.8033885252214094, "grad_norm": 0.9666215777397156, "learning_rate": 9.232960646069171e-06, "loss": 0.9129, "step": 5216 }, { "epoch": 0.8036965729688101, "grad_norm": 0.8223074674606323, "learning_rate": 9.204961820978569e-06, "loss": 0.7595, "step": 5218 }, { "epoch": 0.804004620716211, "grad_norm": 0.7416442632675171, "learning_rate": 9.17700120879031e-06, "loss": 2.8862, "step": 5220 }, { "epoch": 0.8043126684636118, "grad_norm": 0.6753026843070984, "learning_rate": 9.149078835695213e-06, "loss": 1.1582, "step": 5222 }, { "epoch": 0.8046207162110127, "grad_norm": 0.8889574408531189, "learning_rate": 9.121194727848337e-06, "loss": 1.0037, "step": 5224 }, { "epoch": 0.8049287639584135, "grad_norm": 1.0888804197311401, "learning_rate": 9.093348911368816e-06, "loss": 1.1738, "step": 5226 }, { "epoch": 0.8052368117058144, "grad_norm": 0.7588722109794617, "learning_rate": 9.065541412339956e-06, "loss": 1.1241, "step": 5228 }, { "epoch": 0.8055448594532153, "grad_norm": 0.9247668385505676, "learning_rate": 9.037772256809195e-06, "loss": 1.1129, "step": 5230 }, { "epoch": 0.8058529072006161, "grad_norm": 0.7385468482971191, "learning_rate": 9.010041470788033e-06, "loss": 1.0471, "step": 5232 }, { "epoch": 0.806160954948017, "grad_norm": 0.6662497520446777, "learning_rate": 8.982349080252e-06, "loss": 0.9419, "step": 5234 }, { "epoch": 0.8064690026954178, "grad_norm": 0.9361608028411865, "learning_rate": 8.954695111140688e-06, "loss": 0.9675, "step": 5236 }, { "epoch": 0.8067770504428187, "grad_norm": 0.6301150918006897, "learning_rate": 8.927079589357722e-06, "loss": 0.8097, "step": 5238 }, { "epoch": 0.8070850981902195, "grad_norm": 0.7306633591651917, "learning_rate": 8.899502540770688e-06, "loss": 0.9882, "step": 5240 }, { "epoch": 0.8073931459376203, "grad_norm": 1.0911128520965576, "learning_rate": 8.871963991211107e-06, "loss": 0.9689, "step": 5242 }, { "epoch": 0.8077011936850211, "grad_norm": 0.9000625610351562, "learning_rate": 8.844463966474491e-06, "loss": 1.0501, "step": 5244 }, { "epoch": 0.808009241432422, "grad_norm": 0.8635241389274597, "learning_rate": 8.81700249232026e-06, "loss": 0.8923, "step": 5246 }, { "epoch": 0.8083172891798228, "grad_norm": 0.5929491519927979, "learning_rate": 8.789579594471648e-06, "loss": 0.8787, "step": 5248 }, { "epoch": 0.8086253369272237, "grad_norm": 0.6980977058410645, "learning_rate": 8.762195298615855e-06, "loss": 0.8485, "step": 5250 }, { "epoch": 0.8089333846746246, "grad_norm": 0.7201548218727112, "learning_rate": 8.734849630403873e-06, "loss": 0.9347, "step": 5252 }, { "epoch": 0.8092414324220254, "grad_norm": 0.7645498514175415, "learning_rate": 8.70754261545052e-06, "loss": 0.8769, "step": 5254 }, { "epoch": 0.8095494801694263, "grad_norm": 0.5888391137123108, "learning_rate": 8.680274279334372e-06, "loss": 0.7781, "step": 5256 }, { "epoch": 0.8098575279168271, "grad_norm": 1.011031150817871, "learning_rate": 8.653044647597846e-06, "loss": 0.95, "step": 5258 }, { "epoch": 0.810165575664228, "grad_norm": 0.564937949180603, "learning_rate": 8.625853745747048e-06, "loss": 0.8399, "step": 5260 }, { "epoch": 0.8104736234116288, "grad_norm": 0.9204162359237671, "learning_rate": 8.598701599251818e-06, "loss": 1.1062, "step": 5262 }, { "epoch": 0.8107816711590297, "grad_norm": 0.6450652480125427, "learning_rate": 8.571588233545713e-06, "loss": 2.0054, "step": 5264 }, { "epoch": 0.8110897189064304, "grad_norm": 1.0156514644622803, "learning_rate": 8.544513674025934e-06, "loss": 1.0376, "step": 5266 }, { "epoch": 0.8113977666538313, "grad_norm": 1.0980149507522583, "learning_rate": 8.517477946053353e-06, "loss": 0.8962, "step": 5268 }, { "epoch": 0.8117058144012322, "grad_norm": 1.1988979578018188, "learning_rate": 8.49048107495246e-06, "loss": 1.1929, "step": 5270 }, { "epoch": 0.812013862148633, "grad_norm": 0.8038528561592102, "learning_rate": 8.46352308601136e-06, "loss": 1.0203, "step": 5272 }, { "epoch": 0.8123219098960339, "grad_norm": 0.7679852247238159, "learning_rate": 8.436604004481713e-06, "loss": 1.1832, "step": 5274 }, { "epoch": 0.8126299576434347, "grad_norm": 0.8102967143058777, "learning_rate": 8.409723855578756e-06, "loss": 2.2757, "step": 5276 }, { "epoch": 0.8129380053908356, "grad_norm": 0.5498160123825073, "learning_rate": 8.382882664481245e-06, "loss": 0.7996, "step": 5278 }, { "epoch": 0.8132460531382364, "grad_norm": 0.8951212763786316, "learning_rate": 8.35608045633145e-06, "loss": 0.9904, "step": 5280 }, { "epoch": 0.8135541008856373, "grad_norm": 0.9070693254470825, "learning_rate": 8.32931725623513e-06, "loss": 2.2744, "step": 5282 }, { "epoch": 0.8138621486330381, "grad_norm": 0.9921269416809082, "learning_rate": 8.302593089261496e-06, "loss": 0.9546, "step": 5284 }, { "epoch": 0.814170196380439, "grad_norm": 0.9248214364051819, "learning_rate": 8.275907980443199e-06, "loss": 0.9497, "step": 5286 }, { "epoch": 0.8144782441278399, "grad_norm": 0.9269345998764038, "learning_rate": 8.2492619547763e-06, "loss": 1.0542, "step": 5288 }, { "epoch": 0.8147862918752407, "grad_norm": 0.7099084854125977, "learning_rate": 8.222655037220261e-06, "loss": 2.1591, "step": 5290 }, { "epoch": 0.8150943396226416, "grad_norm": 0.8812754154205322, "learning_rate": 8.19608725269791e-06, "loss": 0.8864, "step": 5292 }, { "epoch": 0.8154023873700423, "grad_norm": 0.8371664881706238, "learning_rate": 8.169558626095403e-06, "loss": 1.7353, "step": 5294 }, { "epoch": 0.8157104351174432, "grad_norm": 0.7795840501785278, "learning_rate": 8.143069182262226e-06, "loss": 0.9172, "step": 5296 }, { "epoch": 0.816018482864844, "grad_norm": 0.6335655450820923, "learning_rate": 8.116618946011195e-06, "loss": 1.0627, "step": 5298 }, { "epoch": 0.8163265306122449, "grad_norm": 0.7013536691665649, "learning_rate": 8.090207942118333e-06, "loss": 0.9354, "step": 5300 }, { "epoch": 0.8166345783596457, "grad_norm": 0.7837877869606018, "learning_rate": 8.063836195322954e-06, "loss": 1.2341, "step": 5302 }, { "epoch": 0.8169426261070466, "grad_norm": 0.7043695449829102, "learning_rate": 8.037503730327634e-06, "loss": 0.9237, "step": 5304 }, { "epoch": 0.8172506738544474, "grad_norm": 0.6496650576591492, "learning_rate": 8.011210571798073e-06, "loss": 0.9217, "step": 5306 }, { "epoch": 0.8175587216018483, "grad_norm": 0.8028094172477722, "learning_rate": 7.984956744363208e-06, "loss": 1.028, "step": 5308 }, { "epoch": 0.8178667693492492, "grad_norm": 0.9909071922302246, "learning_rate": 7.958742272615117e-06, "loss": 0.9005, "step": 5310 }, { "epoch": 0.81817481709665, "grad_norm": 0.7143129110336304, "learning_rate": 7.932567181109052e-06, "loss": 0.8295, "step": 5312 }, { "epoch": 0.8184828648440509, "grad_norm": 0.6799386739730835, "learning_rate": 7.906431494363298e-06, "loss": 1.0627, "step": 5314 }, { "epoch": 0.8187909125914516, "grad_norm": 0.8612110018730164, "learning_rate": 7.880335236859283e-06, "loss": 0.8716, "step": 5316 }, { "epoch": 0.8190989603388525, "grad_norm": 1.0764349699020386, "learning_rate": 7.854278433041512e-06, "loss": 1.0203, "step": 5318 }, { "epoch": 0.8194070080862533, "grad_norm": 0.8197991847991943, "learning_rate": 7.82826110731752e-06, "loss": 0.8948, "step": 5320 }, { "epoch": 0.8197150558336542, "grad_norm": 0.8864220976829529, "learning_rate": 7.802283284057815e-06, "loss": 0.8376, "step": 5322 }, { "epoch": 0.820023103581055, "grad_norm": 0.7360268235206604, "learning_rate": 7.776344987595985e-06, "loss": 0.9808, "step": 5324 }, { "epoch": 0.8203311513284559, "grad_norm": 0.7204895615577698, "learning_rate": 7.750446242228543e-06, "loss": 1.1138, "step": 5326 }, { "epoch": 0.8206391990758568, "grad_norm": 0.814656138420105, "learning_rate": 7.724587072214973e-06, "loss": 0.8013, "step": 5328 }, { "epoch": 0.8209472468232576, "grad_norm": 0.8463807106018066, "learning_rate": 7.698767501777644e-06, "loss": 0.9006, "step": 5330 }, { "epoch": 0.8212552945706585, "grad_norm": 0.6860989928245544, "learning_rate": 7.672987555101907e-06, "loss": 0.9272, "step": 5332 }, { "epoch": 0.8215633423180593, "grad_norm": 0.7387297749519348, "learning_rate": 7.647247256335955e-06, "loss": 1.1595, "step": 5334 }, { "epoch": 0.8218713900654602, "grad_norm": 0.8675945401191711, "learning_rate": 7.621546629590814e-06, "loss": 0.9349, "step": 5336 }, { "epoch": 0.822179437812861, "grad_norm": 0.6692162156105042, "learning_rate": 7.595885698940408e-06, "loss": 1.5799, "step": 5338 }, { "epoch": 0.8224874855602619, "grad_norm": 0.8025321364402771, "learning_rate": 7.570264488421447e-06, "loss": 1.1254, "step": 5340 }, { "epoch": 0.8227955333076626, "grad_norm": 0.8939410448074341, "learning_rate": 7.544683022033439e-06, "loss": 0.8886, "step": 5342 }, { "epoch": 0.8231035810550635, "grad_norm": 0.9006834030151367, "learning_rate": 7.519141323738654e-06, "loss": 1.1788, "step": 5344 }, { "epoch": 0.8234116288024644, "grad_norm": 0.690070390701294, "learning_rate": 7.493639417462122e-06, "loss": 1.1011, "step": 5346 }, { "epoch": 0.8237196765498652, "grad_norm": 0.633353054523468, "learning_rate": 7.468177327091608e-06, "loss": 1.5389, "step": 5348 }, { "epoch": 0.8240277242972661, "grad_norm": 0.6479977965354919, "learning_rate": 7.442755076477559e-06, "loss": 0.9762, "step": 5350 }, { "epoch": 0.8243357720446669, "grad_norm": 0.7376688718795776, "learning_rate": 7.417372689433122e-06, "loss": 0.8483, "step": 5352 }, { "epoch": 0.8246438197920678, "grad_norm": 0.644631028175354, "learning_rate": 7.3920301897340945e-06, "loss": 0.7513, "step": 5354 }, { "epoch": 0.8249518675394686, "grad_norm": 0.8042016625404358, "learning_rate": 7.366727601118911e-06, "loss": 0.9001, "step": 5356 }, { "epoch": 0.8252599152868695, "grad_norm": 0.8431175947189331, "learning_rate": 7.341464947288629e-06, "loss": 0.7852, "step": 5358 }, { "epoch": 0.8255679630342703, "grad_norm": 0.7508544921875, "learning_rate": 7.3162422519068966e-06, "loss": 1.2253, "step": 5360 }, { "epoch": 0.8258760107816712, "grad_norm": 0.6164590120315552, "learning_rate": 7.29105953859992e-06, "loss": 0.8867, "step": 5362 }, { "epoch": 0.826184058529072, "grad_norm": 0.8672991991043091, "learning_rate": 7.265916830956471e-06, "loss": 1.1461, "step": 5364 }, { "epoch": 0.8264921062764728, "grad_norm": 0.6648744344711304, "learning_rate": 7.240814152527842e-06, "loss": 1.064, "step": 5366 }, { "epoch": 0.8268001540238737, "grad_norm": 0.943226158618927, "learning_rate": 7.21575152682783e-06, "loss": 0.8295, "step": 5368 }, { "epoch": 0.8271082017712745, "grad_norm": 0.7448985576629639, "learning_rate": 7.190728977332706e-06, "loss": 0.8785, "step": 5370 }, { "epoch": 0.8274162495186754, "grad_norm": 1.0283031463623047, "learning_rate": 7.165746527481215e-06, "loss": 0.9735, "step": 5372 }, { "epoch": 0.8277242972660762, "grad_norm": 0.8894911408424377, "learning_rate": 7.140804200674528e-06, "loss": 0.8816, "step": 5374 }, { "epoch": 0.8280323450134771, "grad_norm": 0.8363602161407471, "learning_rate": 7.115902020276239e-06, "loss": 0.8805, "step": 5376 }, { "epoch": 0.8283403927608779, "grad_norm": 0.7517054080963135, "learning_rate": 7.09104000961236e-06, "loss": 0.8632, "step": 5378 }, { "epoch": 0.8286484405082788, "grad_norm": 0.8066534996032715, "learning_rate": 7.066218191971219e-06, "loss": 0.8924, "step": 5380 }, { "epoch": 0.8289564882556796, "grad_norm": 0.8637999892234802, "learning_rate": 7.04143659060354e-06, "loss": 1.0192, "step": 5382 }, { "epoch": 0.8292645360030805, "grad_norm": 1.3805404901504517, "learning_rate": 7.016695228722358e-06, "loss": 1.1149, "step": 5384 }, { "epoch": 0.8295725837504814, "grad_norm": 0.7475231289863586, "learning_rate": 6.991994129503054e-06, "loss": 0.9157, "step": 5386 }, { "epoch": 0.8298806314978822, "grad_norm": 0.9382774233818054, "learning_rate": 6.967333316083225e-06, "loss": 1.1835, "step": 5388 }, { "epoch": 0.8301886792452831, "grad_norm": 0.6275261640548706, "learning_rate": 6.942712811562773e-06, "loss": 0.9794, "step": 5390 }, { "epoch": 0.8304967269926838, "grad_norm": 0.7447370290756226, "learning_rate": 6.918132639003877e-06, "loss": 0.9704, "step": 5392 }, { "epoch": 0.8308047747400847, "grad_norm": 0.5516697764396667, "learning_rate": 6.893592821430856e-06, "loss": 0.9636, "step": 5394 }, { "epoch": 0.8311128224874855, "grad_norm": 0.7253577709197998, "learning_rate": 6.869093381830277e-06, "loss": 0.7356, "step": 5396 }, { "epoch": 0.8314208702348864, "grad_norm": 0.678250789642334, "learning_rate": 6.844634343150902e-06, "loss": 0.8708, "step": 5398 }, { "epoch": 0.8317289179822872, "grad_norm": 0.6933665871620178, "learning_rate": 6.820215728303625e-06, "loss": 1.0718, "step": 5400 }, { "epoch": 0.8317289179822872, "eval_loss": 2.4066405296325684, "eval_runtime": 736.1016, "eval_samples_per_second": 2.717, "eval_steps_per_second": 0.679, "step": 5400 }, { "epoch": 0.8320369657296881, "grad_norm": 0.6631490588188171, "learning_rate": 6.795837560161456e-06, "loss": 0.9066, "step": 5402 }, { "epoch": 0.832345013477089, "grad_norm": 0.9897297024726868, "learning_rate": 6.771499861559538e-06, "loss": 1.3744, "step": 5404 }, { "epoch": 0.8326530612244898, "grad_norm": 0.7145732641220093, "learning_rate": 6.747202655295126e-06, "loss": 0.705, "step": 5406 }, { "epoch": 0.8329611089718907, "grad_norm": 0.8757843375205994, "learning_rate": 6.722945964127525e-06, "loss": 0.9422, "step": 5408 }, { "epoch": 0.8332691567192915, "grad_norm": 0.7739924192428589, "learning_rate": 6.698729810778065e-06, "loss": 0.7416, "step": 5410 }, { "epoch": 0.8335772044666924, "grad_norm": 0.7572183012962341, "learning_rate": 6.674554217930162e-06, "loss": 0.8495, "step": 5412 }, { "epoch": 0.8338852522140932, "grad_norm": 0.725858747959137, "learning_rate": 6.650419208229186e-06, "loss": 0.9817, "step": 5414 }, { "epoch": 0.834193299961494, "grad_norm": 0.7561565041542053, "learning_rate": 6.626324804282525e-06, "loss": 0.8207, "step": 5416 }, { "epoch": 0.8345013477088948, "grad_norm": 0.695875883102417, "learning_rate": 6.6022710286595064e-06, "loss": 0.8971, "step": 5418 }, { "epoch": 0.8348093954562957, "grad_norm": 0.8448840379714966, "learning_rate": 6.578257903891427e-06, "loss": 0.7577, "step": 5420 }, { "epoch": 0.8351174432036965, "grad_norm": 0.7509380578994751, "learning_rate": 6.554285452471498e-06, "loss": 1.0915, "step": 5422 }, { "epoch": 0.8354254909510974, "grad_norm": 0.9362658262252808, "learning_rate": 6.530353696854791e-06, "loss": 1.0626, "step": 5424 }, { "epoch": 0.8357335386984983, "grad_norm": 0.5367106795310974, "learning_rate": 6.506462659458329e-06, "loss": 1.9706, "step": 5426 }, { "epoch": 0.8360415864458991, "grad_norm": 1.0008156299591064, "learning_rate": 6.482612362660945e-06, "loss": 0.9507, "step": 5428 }, { "epoch": 0.8363496341933, "grad_norm": 0.7898880243301392, "learning_rate": 6.458802828803323e-06, "loss": 0.9741, "step": 5430 }, { "epoch": 0.8366576819407008, "grad_norm": 0.719279944896698, "learning_rate": 6.435034080187969e-06, "loss": 1.0709, "step": 5432 }, { "epoch": 0.8369657296881017, "grad_norm": 0.6773216128349304, "learning_rate": 6.411306139079176e-06, "loss": 0.8413, "step": 5434 }, { "epoch": 0.8372737774355025, "grad_norm": 0.6237614750862122, "learning_rate": 6.387619027703018e-06, "loss": 0.9458, "step": 5436 }, { "epoch": 0.8375818251829034, "grad_norm": 0.7203165292739868, "learning_rate": 6.3639727682473225e-06, "loss": 0.9038, "step": 5438 }, { "epoch": 0.8378898729303041, "grad_norm": 0.7765570282936096, "learning_rate": 6.34036738286165e-06, "loss": 1.0212, "step": 5440 }, { "epoch": 0.838197920677705, "grad_norm": 0.8942344784736633, "learning_rate": 6.316802893657275e-06, "loss": 2.2842, "step": 5442 }, { "epoch": 0.8385059684251059, "grad_norm": 1.1004189252853394, "learning_rate": 6.293279322707169e-06, "loss": 0.8196, "step": 5444 }, { "epoch": 0.8388140161725067, "grad_norm": 0.819892168045044, "learning_rate": 6.269796692045965e-06, "loss": 0.8763, "step": 5446 }, { "epoch": 0.8391220639199076, "grad_norm": 1.0760273933410645, "learning_rate": 6.246355023669958e-06, "loss": 1.3298, "step": 5448 }, { "epoch": 0.8394301116673084, "grad_norm": 0.7805618047714233, "learning_rate": 6.222954339537063e-06, "loss": 0.9231, "step": 5450 }, { "epoch": 0.8397381594147093, "grad_norm": 1.1943398714065552, "learning_rate": 6.19959466156681e-06, "loss": 1.0681, "step": 5452 }, { "epoch": 0.8400462071621101, "grad_norm": 0.6588060259819031, "learning_rate": 6.17627601164032e-06, "loss": 0.8829, "step": 5454 }, { "epoch": 0.840354254909511, "grad_norm": 0.9226628541946411, "learning_rate": 6.152998411600269e-06, "loss": 0.7163, "step": 5456 }, { "epoch": 0.8406623026569118, "grad_norm": 0.7645328044891357, "learning_rate": 6.1297618832509285e-06, "loss": 0.7472, "step": 5458 }, { "epoch": 0.8409703504043127, "grad_norm": 0.7715820670127869, "learning_rate": 6.106566448358025e-06, "loss": 1.4482, "step": 5460 }, { "epoch": 0.8412783981517136, "grad_norm": 0.5981425046920776, "learning_rate": 6.083412128648847e-06, "loss": 0.7663, "step": 5462 }, { "epoch": 0.8415864458991144, "grad_norm": 0.7702302932739258, "learning_rate": 6.060298945812143e-06, "loss": 1.0621, "step": 5464 }, { "epoch": 0.8418944936465153, "grad_norm": 0.5986312627792358, "learning_rate": 6.037226921498168e-06, "loss": 1.6585, "step": 5466 }, { "epoch": 0.842202541393916, "grad_norm": 0.7191674113273621, "learning_rate": 6.014196077318562e-06, "loss": 0.8664, "step": 5468 }, { "epoch": 0.8425105891413169, "grad_norm": 1.0426437854766846, "learning_rate": 5.9912064348464305e-06, "loss": 1.0364, "step": 5470 }, { "epoch": 0.8428186368887177, "grad_norm": 0.9384727478027344, "learning_rate": 5.96825801561629e-06, "loss": 1.0487, "step": 5472 }, { "epoch": 0.8431266846361186, "grad_norm": 0.7399026155471802, "learning_rate": 5.945350841124037e-06, "loss": 0.8254, "step": 5474 }, { "epoch": 0.8434347323835194, "grad_norm": 0.7573807835578918, "learning_rate": 5.922484932826899e-06, "loss": 0.9785, "step": 5476 }, { "epoch": 0.8437427801309203, "grad_norm": 0.6942792534828186, "learning_rate": 5.8996603121435065e-06, "loss": 1.0023, "step": 5478 }, { "epoch": 0.8440508278783211, "grad_norm": 0.7304031252861023, "learning_rate": 5.8768770004537894e-06, "loss": 1.9075, "step": 5480 }, { "epoch": 0.844358875625722, "grad_norm": 0.8708906769752502, "learning_rate": 5.854135019098961e-06, "loss": 0.8669, "step": 5482 }, { "epoch": 0.8446669233731229, "grad_norm": 0.7126341462135315, "learning_rate": 5.831434389381546e-06, "loss": 0.925, "step": 5484 }, { "epoch": 0.8449749711205237, "grad_norm": 0.8131961226463318, "learning_rate": 5.808775132565341e-06, "loss": 0.8507, "step": 5486 }, { "epoch": 0.8452830188679246, "grad_norm": 0.852538526058197, "learning_rate": 5.786157269875386e-06, "loss": 0.7647, "step": 5488 }, { "epoch": 0.8455910666153253, "grad_norm": 0.785875141620636, "learning_rate": 5.763580822497905e-06, "loss": 0.7627, "step": 5490 }, { "epoch": 0.8458991143627262, "grad_norm": 0.9879255294799805, "learning_rate": 5.741045811580387e-06, "loss": 0.9807, "step": 5492 }, { "epoch": 0.846207162110127, "grad_norm": 0.8181496262550354, "learning_rate": 5.71855225823148e-06, "loss": 0.926, "step": 5494 }, { "epoch": 0.8465152098575279, "grad_norm": 1.0856611728668213, "learning_rate": 5.696100183521002e-06, "loss": 1.0308, "step": 5496 }, { "epoch": 0.8468232576049287, "grad_norm": 0.5956221222877502, "learning_rate": 5.673689608479893e-06, "loss": 1.0491, "step": 5498 }, { "epoch": 0.8471313053523296, "grad_norm": 0.8387410044670105, "learning_rate": 5.65132055410027e-06, "loss": 1.1071, "step": 5500 }, { "epoch": 0.8474393530997305, "grad_norm": 0.8523972034454346, "learning_rate": 5.628993041335334e-06, "loss": 0.8855, "step": 5502 }, { "epoch": 0.8477474008471313, "grad_norm": 0.5948216915130615, "learning_rate": 5.606707091099334e-06, "loss": 0.8292, "step": 5504 }, { "epoch": 0.8480554485945322, "grad_norm": 0.8574666976928711, "learning_rate": 5.584462724267653e-06, "loss": 1.0905, "step": 5506 }, { "epoch": 0.848363496341933, "grad_norm": 0.92282634973526, "learning_rate": 5.562259961676691e-06, "loss": 0.8807, "step": 5508 }, { "epoch": 0.8486715440893339, "grad_norm": 0.5856600999832153, "learning_rate": 5.540098824123874e-06, "loss": 0.7359, "step": 5510 }, { "epoch": 0.8489795918367347, "grad_norm": 0.7432599067687988, "learning_rate": 5.51797933236764e-06, "loss": 0.8005, "step": 5512 }, { "epoch": 0.8492876395841356, "grad_norm": 0.8326647281646729, "learning_rate": 5.495901507127427e-06, "loss": 0.8521, "step": 5514 }, { "epoch": 0.8495956873315363, "grad_norm": 0.9699840545654297, "learning_rate": 5.47386536908363e-06, "loss": 1.0648, "step": 5516 }, { "epoch": 0.8499037350789372, "grad_norm": 0.6920575499534607, "learning_rate": 5.451870938877607e-06, "loss": 1.1668, "step": 5518 }, { "epoch": 0.8502117828263381, "grad_norm": 0.8997227549552917, "learning_rate": 5.429918237111642e-06, "loss": 2.2459, "step": 5520 }, { "epoch": 0.8505198305737389, "grad_norm": 0.7757606506347656, "learning_rate": 5.408007284348931e-06, "loss": 0.8366, "step": 5522 }, { "epoch": 0.8508278783211398, "grad_norm": 0.7504420876502991, "learning_rate": 5.386138101113569e-06, "loss": 0.8136, "step": 5524 }, { "epoch": 0.8511359260685406, "grad_norm": 0.6319246292114258, "learning_rate": 5.364310707890518e-06, "loss": 1.1945, "step": 5526 }, { "epoch": 0.8514439738159415, "grad_norm": 0.7972545623779297, "learning_rate": 5.342525125125603e-06, "loss": 1.0876, "step": 5528 }, { "epoch": 0.8517520215633423, "grad_norm": 0.7912672162055969, "learning_rate": 5.320781373225481e-06, "loss": 1.05, "step": 5530 }, { "epoch": 0.8520600693107432, "grad_norm": 0.8491456508636475, "learning_rate": 5.299079472557622e-06, "loss": 1.2363, "step": 5532 }, { "epoch": 0.852368117058144, "grad_norm": 0.8818978071212769, "learning_rate": 5.277419443450293e-06, "loss": 0.8654, "step": 5534 }, { "epoch": 0.8526761648055449, "grad_norm": 0.7931317687034607, "learning_rate": 5.255801306192559e-06, "loss": 0.8854, "step": 5536 }, { "epoch": 0.8529842125529457, "grad_norm": 1.018884301185608, "learning_rate": 5.234225081034216e-06, "loss": 0.9068, "step": 5538 }, { "epoch": 0.8532922603003466, "grad_norm": 0.6841465830802917, "learning_rate": 5.21269078818582e-06, "loss": 0.8391, "step": 5540 }, { "epoch": 0.8536003080477474, "grad_norm": 0.8705657720565796, "learning_rate": 5.191198447818646e-06, "loss": 1.657, "step": 5542 }, { "epoch": 0.8539083557951482, "grad_norm": 0.827742874622345, "learning_rate": 5.169748080064651e-06, "loss": 0.9083, "step": 5544 }, { "epoch": 0.8542164035425491, "grad_norm": 0.5416191816329956, "learning_rate": 5.1483397050165365e-06, "loss": 0.8433, "step": 5546 }, { "epoch": 0.8545244512899499, "grad_norm": 0.8019801378250122, "learning_rate": 5.126973342727587e-06, "loss": 0.9913, "step": 5548 }, { "epoch": 0.8548324990373508, "grad_norm": 0.7593039274215698, "learning_rate": 5.105649013211777e-06, "loss": 0.8214, "step": 5550 }, { "epoch": 0.8551405467847516, "grad_norm": 1.0784088373184204, "learning_rate": 5.0843667364437246e-06, "loss": 1.0835, "step": 5552 }, { "epoch": 0.8554485945321525, "grad_norm": 0.7842487096786499, "learning_rate": 5.063126532358642e-06, "loss": 0.715, "step": 5554 }, { "epoch": 0.8557566422795533, "grad_norm": 0.9213837385177612, "learning_rate": 5.041928420852299e-06, "loss": 0.974, "step": 5556 }, { "epoch": 0.8560646900269542, "grad_norm": 0.7843840718269348, "learning_rate": 5.020772421781073e-06, "loss": 0.8333, "step": 5558 }, { "epoch": 0.8563727377743551, "grad_norm": 0.6879072785377502, "learning_rate": 4.999658554961917e-06, "loss": 0.7875, "step": 5560 }, { "epoch": 0.8566807855217559, "grad_norm": 0.9496925473213196, "learning_rate": 4.97858684017225e-06, "loss": 0.9994, "step": 5562 }, { "epoch": 0.8569888332691568, "grad_norm": 0.7569987177848816, "learning_rate": 4.957557297150056e-06, "loss": 0.878, "step": 5564 }, { "epoch": 0.8572968810165575, "grad_norm": 0.7940292954444885, "learning_rate": 4.936569945593817e-06, "loss": 0.7878, "step": 5566 }, { "epoch": 0.8576049287639584, "grad_norm": 1.0187393426895142, "learning_rate": 4.915624805162489e-06, "loss": 0.9715, "step": 5568 }, { "epoch": 0.8579129765113592, "grad_norm": 0.5202600955963135, "learning_rate": 4.894721895475452e-06, "loss": 0.94, "step": 5570 }, { "epoch": 0.8582210242587601, "grad_norm": 0.9687458872795105, "learning_rate": 4.873861236112587e-06, "loss": 0.9434, "step": 5572 }, { "epoch": 0.8585290720061609, "grad_norm": 0.7784386277198792, "learning_rate": 4.853042846614159e-06, "loss": 0.9892, "step": 5574 }, { "epoch": 0.8588371197535618, "grad_norm": 0.98586106300354, "learning_rate": 4.832266746480862e-06, "loss": 1.1127, "step": 5576 }, { "epoch": 0.8591451675009627, "grad_norm": 0.7746574282646179, "learning_rate": 4.811532955173742e-06, "loss": 0.9111, "step": 5578 }, { "epoch": 0.8594532152483635, "grad_norm": 0.850456178188324, "learning_rate": 4.790841492114256e-06, "loss": 0.9865, "step": 5580 }, { "epoch": 0.8597612629957644, "grad_norm": 0.6116123199462891, "learning_rate": 4.770192376684196e-06, "loss": 0.9438, "step": 5582 }, { "epoch": 0.8600693107431652, "grad_norm": 0.9664915204048157, "learning_rate": 4.749585628225678e-06, "loss": 0.9979, "step": 5584 }, { "epoch": 0.8603773584905661, "grad_norm": 0.6345783472061157, "learning_rate": 4.729021266041139e-06, "loss": 1.0397, "step": 5586 }, { "epoch": 0.8606854062379669, "grad_norm": 0.9745763540267944, "learning_rate": 4.70849930939331e-06, "loss": 1.0937, "step": 5588 }, { "epoch": 0.8609934539853678, "grad_norm": 1.003058910369873, "learning_rate": 4.688019777505214e-06, "loss": 1.1315, "step": 5590 }, { "epoch": 0.8613015017327685, "grad_norm": 0.6395254731178284, "learning_rate": 4.667582689560113e-06, "loss": 0.8876, "step": 5592 }, { "epoch": 0.8616095494801694, "grad_norm": 0.6765961647033691, "learning_rate": 4.64718806470153e-06, "loss": 1.0269, "step": 5594 }, { "epoch": 0.8619175972275702, "grad_norm": 0.6591092944145203, "learning_rate": 4.626835922033201e-06, "loss": 1.0236, "step": 5596 }, { "epoch": 0.8622256449749711, "grad_norm": 0.7652791738510132, "learning_rate": 4.606526280619072e-06, "loss": 0.9287, "step": 5598 }, { "epoch": 0.862533692722372, "grad_norm": 0.8166400790214539, "learning_rate": 4.586259159483286e-06, "loss": 0.817, "step": 5600 }, { "epoch": 0.8628417404697728, "grad_norm": 0.8276070952415466, "learning_rate": 4.56603457761014e-06, "loss": 2.0614, "step": 5602 }, { "epoch": 0.8631497882171737, "grad_norm": 0.8190547823905945, "learning_rate": 4.545852553944102e-06, "loss": 1.0747, "step": 5604 }, { "epoch": 0.8634578359645745, "grad_norm": 0.9670174717903137, "learning_rate": 4.525713107389762e-06, "loss": 1.0557, "step": 5606 }, { "epoch": 0.8637658837119754, "grad_norm": 0.7336003184318542, "learning_rate": 4.505616256811835e-06, "loss": 0.8337, "step": 5608 }, { "epoch": 0.8640739314593762, "grad_norm": 0.8529759049415588, "learning_rate": 4.485562021035133e-06, "loss": 0.9215, "step": 5610 }, { "epoch": 0.8643819792067771, "grad_norm": 0.7147994637489319, "learning_rate": 4.465550418844561e-06, "loss": 0.7897, "step": 5612 }, { "epoch": 0.8646900269541778, "grad_norm": 0.9193140268325806, "learning_rate": 4.445581468985066e-06, "loss": 0.8149, "step": 5614 }, { "epoch": 0.8649980747015787, "grad_norm": 0.6203229427337646, "learning_rate": 4.425655190161671e-06, "loss": 0.9411, "step": 5616 }, { "epoch": 0.8653061224489796, "grad_norm": 1.2721714973449707, "learning_rate": 4.405771601039399e-06, "loss": 0.9375, "step": 5618 }, { "epoch": 0.8656141701963804, "grad_norm": 0.9241076707839966, "learning_rate": 4.385930720243314e-06, "loss": 0.8362, "step": 5620 }, { "epoch": 0.8659222179437813, "grad_norm": 0.859778106212616, "learning_rate": 4.366132566358455e-06, "loss": 1.2562, "step": 5622 }, { "epoch": 0.8662302656911821, "grad_norm": 0.7657763957977295, "learning_rate": 4.346377157929838e-06, "loss": 0.8539, "step": 5624 }, { "epoch": 0.866538313438583, "grad_norm": 0.7467623353004456, "learning_rate": 4.32666451346247e-06, "loss": 1.0287, "step": 5626 }, { "epoch": 0.8668463611859838, "grad_norm": 1.0547900199890137, "learning_rate": 4.306994651421253e-06, "loss": 1.0037, "step": 5628 }, { "epoch": 0.8671544089333847, "grad_norm": 0.7862194180488586, "learning_rate": 4.287367590231045e-06, "loss": 0.7485, "step": 5630 }, { "epoch": 0.8674624566807855, "grad_norm": 0.6208391785621643, "learning_rate": 4.2677833482766e-06, "loss": 1.0326, "step": 5632 }, { "epoch": 0.8677705044281864, "grad_norm": 0.8460115790367126, "learning_rate": 4.248241943902592e-06, "loss": 1.4913, "step": 5634 }, { "epoch": 0.8680785521755873, "grad_norm": 0.7300686240196228, "learning_rate": 4.2287433954135205e-06, "loss": 0.978, "step": 5636 }, { "epoch": 0.8683865999229881, "grad_norm": 0.7100600004196167, "learning_rate": 4.2092877210737684e-06, "loss": 1.0427, "step": 5638 }, { "epoch": 0.868694647670389, "grad_norm": 0.7860296368598938, "learning_rate": 4.189874939107574e-06, "loss": 0.8127, "step": 5640 }, { "epoch": 0.8690026954177897, "grad_norm": 0.9475097060203552, "learning_rate": 4.170505067698977e-06, "loss": 0.9262, "step": 5642 }, { "epoch": 0.8693107431651906, "grad_norm": 0.6762198805809021, "learning_rate": 4.1511781249918e-06, "loss": 1.0889, "step": 5644 }, { "epoch": 0.8696187909125914, "grad_norm": 1.0616062879562378, "learning_rate": 4.131894129089709e-06, "loss": 0.8648, "step": 5646 }, { "epoch": 0.8699268386599923, "grad_norm": 0.8382073640823364, "learning_rate": 4.112653098056113e-06, "loss": 0.9439, "step": 5648 }, { "epoch": 0.8702348864073931, "grad_norm": 0.7744151949882507, "learning_rate": 4.0934550499141575e-06, "loss": 0.8389, "step": 5650 }, { "epoch": 0.870542934154794, "grad_norm": 0.9079608917236328, "learning_rate": 4.074300002646742e-06, "loss": 1.0208, "step": 5652 }, { "epoch": 0.8708509819021948, "grad_norm": 0.7319706678390503, "learning_rate": 4.055187974196511e-06, "loss": 0.8994, "step": 5654 }, { "epoch": 0.8711590296495957, "grad_norm": 1.06355881690979, "learning_rate": 4.036118982465787e-06, "loss": 1.0155, "step": 5656 }, { "epoch": 0.8714670773969966, "grad_norm": 0.8588384389877319, "learning_rate": 4.01709304531655e-06, "loss": 1.2075, "step": 5658 }, { "epoch": 0.8717751251443974, "grad_norm": 1.1078176498413086, "learning_rate": 3.998110180570525e-06, "loss": 0.8002, "step": 5660 }, { "epoch": 0.8720831728917983, "grad_norm": 0.8839870691299438, "learning_rate": 3.979170406009031e-06, "loss": 1.2371, "step": 5662 }, { "epoch": 0.872391220639199, "grad_norm": 1.0431289672851562, "learning_rate": 3.960273739373044e-06, "loss": 1.0638, "step": 5664 }, { "epoch": 0.8726992683866, "grad_norm": 0.6601695418357849, "learning_rate": 3.941420198363166e-06, "loss": 0.9727, "step": 5666 }, { "epoch": 0.8730073161340007, "grad_norm": 0.8522267937660217, "learning_rate": 3.922609800639587e-06, "loss": 0.7752, "step": 5668 }, { "epoch": 0.8733153638814016, "grad_norm": 0.9073500633239746, "learning_rate": 3.903842563822102e-06, "loss": 0.8728, "step": 5670 }, { "epoch": 0.8736234116288024, "grad_norm": 0.6048290133476257, "learning_rate": 3.885118505490065e-06, "loss": 0.8422, "step": 5672 }, { "epoch": 0.8739314593762033, "grad_norm": 0.5503186583518982, "learning_rate": 3.866437643182391e-06, "loss": 0.8766, "step": 5674 }, { "epoch": 0.8742395071236042, "grad_norm": 0.9213413000106812, "learning_rate": 3.847799994397527e-06, "loss": 1.0147, "step": 5676 }, { "epoch": 0.874547554871005, "grad_norm": 0.7165212035179138, "learning_rate": 3.829205576593448e-06, "loss": 1.0482, "step": 5678 }, { "epoch": 0.8748556026184059, "grad_norm": 0.979213297367096, "learning_rate": 3.810654407187636e-06, "loss": 0.9725, "step": 5680 }, { "epoch": 0.8751636503658067, "grad_norm": 0.9179180860519409, "learning_rate": 3.7921465035570537e-06, "loss": 1.2204, "step": 5682 }, { "epoch": 0.8754716981132076, "grad_norm": 0.6514227986335754, "learning_rate": 3.773681883038138e-06, "loss": 2.1307, "step": 5684 }, { "epoch": 0.8757797458606084, "grad_norm": 1.0250710248947144, "learning_rate": 3.755260562926788e-06, "loss": 0.9711, "step": 5686 }, { "epoch": 0.8760877936080093, "grad_norm": 0.9408641457557678, "learning_rate": 3.7368825604783386e-06, "loss": 1.7764, "step": 5688 }, { "epoch": 0.87639584135541, "grad_norm": 0.9162575006484985, "learning_rate": 3.7185478929075536e-06, "loss": 0.6678, "step": 5690 }, { "epoch": 0.8767038891028109, "grad_norm": 0.7940691709518433, "learning_rate": 3.7002565773886e-06, "loss": 0.8517, "step": 5692 }, { "epoch": 0.8770119368502118, "grad_norm": 0.8024888038635254, "learning_rate": 3.6820086310550395e-06, "loss": 0.9191, "step": 5694 }, { "epoch": 0.8773199845976126, "grad_norm": 0.7842996716499329, "learning_rate": 3.6638040709998044e-06, "loss": 1.1716, "step": 5696 }, { "epoch": 0.8776280323450135, "grad_norm": 0.6878358721733093, "learning_rate": 3.6456429142751823e-06, "loss": 0.7303, "step": 5698 }, { "epoch": 0.8779360800924143, "grad_norm": 0.792102038860321, "learning_rate": 3.6275251778928487e-06, "loss": 1.0849, "step": 5700 }, { "epoch": 0.8779360800924143, "eval_loss": 2.3906445503234863, "eval_runtime": 737.0453, "eval_samples_per_second": 2.714, "eval_steps_per_second": 0.678, "step": 5700 } ], "logging_steps": 2, "max_steps": 6492, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.193565032834662e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }