{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.17237485886338, "eval_steps": 200, "global_step": 8600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007527286413248024, "grad_norm": 50.4890251159668, "learning_rate": 2.0000000000000002e-07, "loss": 6.9304, "step": 1 }, { "epoch": 0.0015054572826496049, "grad_norm": 49.9339714050293, "learning_rate": 4.0000000000000003e-07, "loss": 6.9689, "step": 2 }, { "epoch": 0.002258185923974407, "grad_norm": 47.445003509521484, "learning_rate": 6.000000000000001e-07, "loss": 6.7552, "step": 3 }, { "epoch": 0.0030109145652992097, "grad_norm": 44.77499771118164, "learning_rate": 8.000000000000001e-07, "loss": 6.6353, "step": 4 }, { "epoch": 0.003763643206624012, "grad_norm": 39.52490997314453, "learning_rate": 1.0000000000000002e-06, "loss": 6.3939, "step": 5 }, { "epoch": 0.004516371847948814, "grad_norm": 33.10089111328125, "learning_rate": 1.2000000000000002e-06, "loss": 6.0928, "step": 6 }, { "epoch": 0.005269100489273617, "grad_norm": 26.767410278320312, "learning_rate": 1.4000000000000001e-06, "loss": 5.7977, "step": 7 }, { "epoch": 0.0060218291305984195, "grad_norm": 17.197263717651367, "learning_rate": 1.6000000000000001e-06, "loss": 5.5343, "step": 8 }, { "epoch": 0.006774557771923222, "grad_norm": 15.009638786315918, "learning_rate": 1.8e-06, "loss": 5.428, "step": 9 }, { "epoch": 0.007527286413248024, "grad_norm": 17.272836685180664, "learning_rate": 2.0000000000000003e-06, "loss": 5.257, "step": 10 }, { "epoch": 0.008280015054572827, "grad_norm": 14.924171447753906, "learning_rate": 2.2e-06, "loss": 5.208, "step": 11 }, { "epoch": 0.009032743695897629, "grad_norm": 9.842083930969238, "learning_rate": 2.4000000000000003e-06, "loss": 5.1321, "step": 12 }, { "epoch": 0.00978547233722243, "grad_norm": 12.939398765563965, "learning_rate": 2.6e-06, "loss": 5.0804, "step": 13 }, { "epoch": 0.010538200978547234, "grad_norm": 13.642485618591309, "learning_rate": 2.8000000000000003e-06, "loss": 5.0283, "step": 14 }, { "epoch": 0.011290929619872036, "grad_norm": 10.221881866455078, "learning_rate": 3e-06, "loss": 4.9967, "step": 15 }, { "epoch": 0.012043658261196839, "grad_norm": 11.98767375946045, "learning_rate": 3.2000000000000003e-06, "loss": 4.8755, "step": 16 }, { "epoch": 0.01279638690252164, "grad_norm": 15.661665916442871, "learning_rate": 3.4000000000000005e-06, "loss": 4.8202, "step": 17 }, { "epoch": 0.013549115543846444, "grad_norm": 10.90164852142334, "learning_rate": 3.6e-06, "loss": 4.6915, "step": 18 }, { "epoch": 0.014301844185171246, "grad_norm": 11.759597778320312, "learning_rate": 3.8e-06, "loss": 4.6141, "step": 19 }, { "epoch": 0.015054572826496047, "grad_norm": 10.210482597351074, "learning_rate": 4.000000000000001e-06, "loss": 4.5204, "step": 20 }, { "epoch": 0.01580730146782085, "grad_norm": 7.574390411376953, "learning_rate": 4.2000000000000004e-06, "loss": 4.3665, "step": 21 }, { "epoch": 0.016560030109145654, "grad_norm": 8.45850944519043, "learning_rate": 4.4e-06, "loss": 4.2701, "step": 22 }, { "epoch": 0.017312758750470454, "grad_norm": 6.602094650268555, "learning_rate": 4.6e-06, "loss": 4.1695, "step": 23 }, { "epoch": 0.018065487391795258, "grad_norm": 8.749544143676758, "learning_rate": 4.800000000000001e-06, "loss": 4.151, "step": 24 }, { "epoch": 0.01881821603312006, "grad_norm": 9.974653244018555, "learning_rate": 5e-06, "loss": 4.0055, "step": 25 }, { "epoch": 0.01957094467444486, "grad_norm": 7.017693996429443, "learning_rate": 5.2e-06, "loss": 3.8675, "step": 26 }, { "epoch": 0.020323673315769664, "grad_norm": 6.604528427124023, "learning_rate": 5.4e-06, "loss": 3.7907, "step": 27 }, { "epoch": 0.021076401957094468, "grad_norm": 5.762628555297852, "learning_rate": 5.600000000000001e-06, "loss": 3.7095, "step": 28 }, { "epoch": 0.02182913059841927, "grad_norm": 6.057140827178955, "learning_rate": 5.8e-06, "loss": 3.6605, "step": 29 }, { "epoch": 0.02258185923974407, "grad_norm": 5.276193618774414, "learning_rate": 6e-06, "loss": 3.5297, "step": 30 }, { "epoch": 0.023334587881068874, "grad_norm": 4.616048812866211, "learning_rate": 6.2e-06, "loss": 3.4556, "step": 31 }, { "epoch": 0.024087316522393678, "grad_norm": 5.253169536590576, "learning_rate": 6.4000000000000006e-06, "loss": 3.271, "step": 32 }, { "epoch": 0.024840045163718478, "grad_norm": 4.49751615524292, "learning_rate": 6.6e-06, "loss": 3.1939, "step": 33 }, { "epoch": 0.02559277380504328, "grad_norm": 3.6966333389282227, "learning_rate": 6.800000000000001e-06, "loss": 3.3101, "step": 34 }, { "epoch": 0.026345502446368085, "grad_norm": 3.549138307571411, "learning_rate": 7.000000000000001e-06, "loss": 3.1603, "step": 35 }, { "epoch": 0.027098231087692888, "grad_norm": 3.4918336868286133, "learning_rate": 7.2e-06, "loss": 3.0801, "step": 36 }, { "epoch": 0.027850959729017688, "grad_norm": 2.8425683975219727, "learning_rate": 7.4e-06, "loss": 2.9903, "step": 37 }, { "epoch": 0.02860368837034249, "grad_norm": 2.4775984287261963, "learning_rate": 7.6e-06, "loss": 3.0005, "step": 38 }, { "epoch": 0.029356417011667295, "grad_norm": 2.348798990249634, "learning_rate": 7.8e-06, "loss": 2.8099, "step": 39 }, { "epoch": 0.030109145652992095, "grad_norm": 2.4019668102264404, "learning_rate": 8.000000000000001e-06, "loss": 2.7189, "step": 40 }, { "epoch": 0.030861874294316898, "grad_norm": 2.0349526405334473, "learning_rate": 8.200000000000001e-06, "loss": 2.7032, "step": 41 }, { "epoch": 0.0316146029356417, "grad_norm": 1.9986299276351929, "learning_rate": 8.400000000000001e-06, "loss": 2.5915, "step": 42 }, { "epoch": 0.0323673315769665, "grad_norm": 1.7868865728378296, "learning_rate": 8.599999999999999e-06, "loss": 2.6067, "step": 43 }, { "epoch": 0.03312006021829131, "grad_norm": 2.0216636657714844, "learning_rate": 8.8e-06, "loss": 2.4897, "step": 44 }, { "epoch": 0.03387278885961611, "grad_norm": 1.957157015800476, "learning_rate": 9e-06, "loss": 2.5739, "step": 45 }, { "epoch": 0.03462551750094091, "grad_norm": 1.840311050415039, "learning_rate": 9.2e-06, "loss": 2.4632, "step": 46 }, { "epoch": 0.035378246142265715, "grad_norm": 1.6761062145233154, "learning_rate": 9.4e-06, "loss": 2.3948, "step": 47 }, { "epoch": 0.036130974783590515, "grad_norm": 1.4352948665618896, "learning_rate": 9.600000000000001e-06, "loss": 2.3809, "step": 48 }, { "epoch": 0.036883703424915315, "grad_norm": 1.5488179922103882, "learning_rate": 9.800000000000001e-06, "loss": 2.3257, "step": 49 }, { "epoch": 0.03763643206624012, "grad_norm": 1.4861825704574585, "learning_rate": 1e-05, "loss": 2.1993, "step": 50 }, { "epoch": 0.03838916070756492, "grad_norm": 1.8510116338729858, "learning_rate": 1.02e-05, "loss": 2.3039, "step": 51 }, { "epoch": 0.03914188934888972, "grad_norm": 1.4077510833740234, "learning_rate": 1.04e-05, "loss": 2.207, "step": 52 }, { "epoch": 0.03989461799021453, "grad_norm": 1.2293425798416138, "learning_rate": 1.06e-05, "loss": 2.2459, "step": 53 }, { "epoch": 0.04064734663153933, "grad_norm": 1.2160245180130005, "learning_rate": 1.08e-05, "loss": 2.2624, "step": 54 }, { "epoch": 0.041400075272864136, "grad_norm": 1.0257819890975952, "learning_rate": 1.1000000000000001e-05, "loss": 2.0783, "step": 55 }, { "epoch": 0.042152803914188935, "grad_norm": 1.0309265851974487, "learning_rate": 1.1200000000000001e-05, "loss": 2.0931, "step": 56 }, { "epoch": 0.042905532555513735, "grad_norm": 0.9451645612716675, "learning_rate": 1.1400000000000001e-05, "loss": 2.0292, "step": 57 }, { "epoch": 0.04365826119683854, "grad_norm": 1.0046815872192383, "learning_rate": 1.16e-05, "loss": 1.9902, "step": 58 }, { "epoch": 0.04441098983816334, "grad_norm": 1.043986439704895, "learning_rate": 1.18e-05, "loss": 2.0001, "step": 59 }, { "epoch": 0.04516371847948814, "grad_norm": 0.9532139301300049, "learning_rate": 1.2e-05, "loss": 1.96, "step": 60 }, { "epoch": 0.04591644712081295, "grad_norm": 0.8764267563819885, "learning_rate": 1.22e-05, "loss": 1.9413, "step": 61 }, { "epoch": 0.04666917576213775, "grad_norm": 0.8437471985816956, "learning_rate": 1.24e-05, "loss": 1.828, "step": 62 }, { "epoch": 0.04742190440346255, "grad_norm": 0.8598520755767822, "learning_rate": 1.2600000000000001e-05, "loss": 1.807, "step": 63 }, { "epoch": 0.048174633044787356, "grad_norm": 1.1899079084396362, "learning_rate": 1.2800000000000001e-05, "loss": 1.9349, "step": 64 }, { "epoch": 0.048927361686112156, "grad_norm": 0.8273900151252747, "learning_rate": 1.3000000000000001e-05, "loss": 1.7332, "step": 65 }, { "epoch": 0.049680090327436956, "grad_norm": 0.9177274703979492, "learning_rate": 1.32e-05, "loss": 1.8569, "step": 66 }, { "epoch": 0.05043281896876176, "grad_norm": 0.7951234579086304, "learning_rate": 1.3400000000000002e-05, "loss": 1.8277, "step": 67 }, { "epoch": 0.05118554761008656, "grad_norm": 1.3731305599212646, "learning_rate": 1.3600000000000002e-05, "loss": 1.8441, "step": 68 }, { "epoch": 0.05193827625141137, "grad_norm": 0.9802387356758118, "learning_rate": 1.3800000000000002e-05, "loss": 1.6918, "step": 69 }, { "epoch": 0.05269100489273617, "grad_norm": 0.9409193992614746, "learning_rate": 1.4000000000000001e-05, "loss": 1.7488, "step": 70 }, { "epoch": 0.05344373353406097, "grad_norm": 0.9579854011535645, "learning_rate": 1.42e-05, "loss": 1.6176, "step": 71 }, { "epoch": 0.054196462175385776, "grad_norm": 0.8916007876396179, "learning_rate": 1.44e-05, "loss": 1.6228, "step": 72 }, { "epoch": 0.054949190816710576, "grad_norm": 0.8085001111030579, "learning_rate": 1.4599999999999999e-05, "loss": 1.5769, "step": 73 }, { "epoch": 0.055701919458035376, "grad_norm": 0.829039454460144, "learning_rate": 1.48e-05, "loss": 1.5884, "step": 74 }, { "epoch": 0.05645464809936018, "grad_norm": 0.8241180777549744, "learning_rate": 1.5e-05, "loss": 1.6098, "step": 75 }, { "epoch": 0.05720737674068498, "grad_norm": 0.7877852916717529, "learning_rate": 1.52e-05, "loss": 1.5963, "step": 76 }, { "epoch": 0.05796010538200978, "grad_norm": 0.7709435820579529, "learning_rate": 1.54e-05, "loss": 1.5302, "step": 77 }, { "epoch": 0.05871283402333459, "grad_norm": 2.9911420345306396, "learning_rate": 1.56e-05, "loss": 1.6794, "step": 78 }, { "epoch": 0.05946556266465939, "grad_norm": 0.8914052844047546, "learning_rate": 1.58e-05, "loss": 1.5334, "step": 79 }, { "epoch": 0.06021829130598419, "grad_norm": 0.7065474390983582, "learning_rate": 1.6000000000000003e-05, "loss": 1.4156, "step": 80 }, { "epoch": 0.060971019947308996, "grad_norm": 0.8877649903297424, "learning_rate": 1.62e-05, "loss": 1.5349, "step": 81 }, { "epoch": 0.061723748588633796, "grad_norm": 0.6819965243339539, "learning_rate": 1.6400000000000002e-05, "loss": 1.5051, "step": 82 }, { "epoch": 0.0624764772299586, "grad_norm": 0.9046909213066101, "learning_rate": 1.66e-05, "loss": 1.652, "step": 83 }, { "epoch": 0.0632292058712834, "grad_norm": 0.5972185730934143, "learning_rate": 1.6800000000000002e-05, "loss": 1.4132, "step": 84 }, { "epoch": 0.06398193451260821, "grad_norm": 0.6016433835029602, "learning_rate": 1.7000000000000003e-05, "loss": 1.388, "step": 85 }, { "epoch": 0.064734663153933, "grad_norm": 0.7669436931610107, "learning_rate": 1.7199999999999998e-05, "loss": 1.5341, "step": 86 }, { "epoch": 0.06548739179525781, "grad_norm": 0.7781327962875366, "learning_rate": 1.74e-05, "loss": 1.4004, "step": 87 }, { "epoch": 0.06624012043658262, "grad_norm": 0.7785030007362366, "learning_rate": 1.76e-05, "loss": 1.5303, "step": 88 }, { "epoch": 0.06699284907790741, "grad_norm": 0.6395800113677979, "learning_rate": 1.78e-05, "loss": 1.3887, "step": 89 }, { "epoch": 0.06774557771923222, "grad_norm": 0.5951112508773804, "learning_rate": 1.8e-05, "loss": 1.4169, "step": 90 }, { "epoch": 0.06849830636055702, "grad_norm": 0.707750141620636, "learning_rate": 1.8200000000000002e-05, "loss": 1.3292, "step": 91 }, { "epoch": 0.06925103500188182, "grad_norm": 0.52686607837677, "learning_rate": 1.84e-05, "loss": 1.3204, "step": 92 }, { "epoch": 0.07000376364320662, "grad_norm": 0.615532636642456, "learning_rate": 1.86e-05, "loss": 1.3202, "step": 93 }, { "epoch": 0.07075649228453143, "grad_norm": 0.6026376485824585, "learning_rate": 1.88e-05, "loss": 1.3294, "step": 94 }, { "epoch": 0.07150922092585622, "grad_norm": 0.7171003818511963, "learning_rate": 1.9e-05, "loss": 1.3217, "step": 95 }, { "epoch": 0.07226194956718103, "grad_norm": 0.5249548554420471, "learning_rate": 1.9200000000000003e-05, "loss": 1.1968, "step": 96 }, { "epoch": 0.07301467820850584, "grad_norm": 0.6191391348838806, "learning_rate": 1.94e-05, "loss": 1.3065, "step": 97 }, { "epoch": 0.07376740684983063, "grad_norm": 0.5176107287406921, "learning_rate": 1.9600000000000002e-05, "loss": 1.2276, "step": 98 }, { "epoch": 0.07452013549115544, "grad_norm": 1.2667311429977417, "learning_rate": 1.9800000000000004e-05, "loss": 1.3904, "step": 99 }, { "epoch": 0.07527286413248024, "grad_norm": 0.5377724170684814, "learning_rate": 2e-05, "loss": 1.1631, "step": 100 }, { "epoch": 0.07602559277380504, "grad_norm": 0.9178743362426758, "learning_rate": 2.0200000000000003e-05, "loss": 1.3674, "step": 101 }, { "epoch": 0.07677832141512984, "grad_norm": 0.4737057387828827, "learning_rate": 2.04e-05, "loss": 1.1897, "step": 102 }, { "epoch": 0.07753105005645465, "grad_norm": 0.707305371761322, "learning_rate": 2.06e-05, "loss": 1.2715, "step": 103 }, { "epoch": 0.07828377869777944, "grad_norm": 0.6070755124092102, "learning_rate": 2.08e-05, "loss": 1.1951, "step": 104 }, { "epoch": 0.07903650733910425, "grad_norm": 0.5484482049942017, "learning_rate": 2.1e-05, "loss": 1.1751, "step": 105 }, { "epoch": 0.07978923598042906, "grad_norm": 0.6246324181556702, "learning_rate": 2.12e-05, "loss": 1.2162, "step": 106 }, { "epoch": 0.08054196462175386, "grad_norm": 0.5780453681945801, "learning_rate": 2.1400000000000002e-05, "loss": 1.238, "step": 107 }, { "epoch": 0.08129469326307866, "grad_norm": 0.6726382374763489, "learning_rate": 2.16e-05, "loss": 1.2124, "step": 108 }, { "epoch": 0.08204742190440346, "grad_norm": 0.645378828048706, "learning_rate": 2.18e-05, "loss": 1.1472, "step": 109 }, { "epoch": 0.08280015054572827, "grad_norm": 0.5620309114456177, "learning_rate": 2.2000000000000003e-05, "loss": 1.1322, "step": 110 }, { "epoch": 0.08355287918705306, "grad_norm": 0.5173045992851257, "learning_rate": 2.22e-05, "loss": 1.0799, "step": 111 }, { "epoch": 0.08430560782837787, "grad_norm": 0.4870627224445343, "learning_rate": 2.2400000000000002e-05, "loss": 1.0212, "step": 112 }, { "epoch": 0.08505833646970268, "grad_norm": 0.44173258543014526, "learning_rate": 2.26e-05, "loss": 1.0227, "step": 113 }, { "epoch": 0.08581106511102747, "grad_norm": 0.3893354535102844, "learning_rate": 2.2800000000000002e-05, "loss": 1.0262, "step": 114 }, { "epoch": 0.08656379375235228, "grad_norm": 0.43965262174606323, "learning_rate": 2.3000000000000003e-05, "loss": 1.0392, "step": 115 }, { "epoch": 0.08731652239367708, "grad_norm": 0.40328851342201233, "learning_rate": 2.32e-05, "loss": 1.0119, "step": 116 }, { "epoch": 0.08806925103500188, "grad_norm": 0.6726312637329102, "learning_rate": 2.3400000000000003e-05, "loss": 1.086, "step": 117 }, { "epoch": 0.08882197967632668, "grad_norm": 0.5165233016014099, "learning_rate": 2.36e-05, "loss": 1.0788, "step": 118 }, { "epoch": 0.08957470831765149, "grad_norm": 0.4085448384284973, "learning_rate": 2.38e-05, "loss": 1.0487, "step": 119 }, { "epoch": 0.09032743695897628, "grad_norm": 0.506576418876648, "learning_rate": 2.4e-05, "loss": 0.985, "step": 120 }, { "epoch": 0.09108016560030109, "grad_norm": 0.35858264565467834, "learning_rate": 2.4200000000000002e-05, "loss": 0.9361, "step": 121 }, { "epoch": 0.0918328942416259, "grad_norm": 0.4061872661113739, "learning_rate": 2.44e-05, "loss": 1.0226, "step": 122 }, { "epoch": 0.09258562288295069, "grad_norm": 0.3031686544418335, "learning_rate": 2.46e-05, "loss": 0.9198, "step": 123 }, { "epoch": 0.0933383515242755, "grad_norm": 0.3231579065322876, "learning_rate": 2.48e-05, "loss": 0.9161, "step": 124 }, { "epoch": 0.0940910801656003, "grad_norm": 0.32463395595550537, "learning_rate": 2.5e-05, "loss": 0.8704, "step": 125 }, { "epoch": 0.0948438088069251, "grad_norm": 0.36916303634643555, "learning_rate": 2.5200000000000003e-05, "loss": 1.01, "step": 126 }, { "epoch": 0.0955965374482499, "grad_norm": 0.6978028416633606, "learning_rate": 2.54e-05, "loss": 1.1076, "step": 127 }, { "epoch": 0.09634926608957471, "grad_norm": 0.2755505442619324, "learning_rate": 2.5600000000000002e-05, "loss": 0.8902, "step": 128 }, { "epoch": 0.0971019947308995, "grad_norm": 0.3795270323753357, "learning_rate": 2.58e-05, "loss": 0.9991, "step": 129 }, { "epoch": 0.09785472337222431, "grad_norm": 0.2842804789543152, "learning_rate": 2.6000000000000002e-05, "loss": 0.896, "step": 130 }, { "epoch": 0.09860745201354912, "grad_norm": 0.3224557340145111, "learning_rate": 2.6200000000000003e-05, "loss": 0.9503, "step": 131 }, { "epoch": 0.09936018065487391, "grad_norm": 0.288059800863266, "learning_rate": 2.64e-05, "loss": 0.9261, "step": 132 }, { "epoch": 0.10011290929619872, "grad_norm": 0.28848254680633545, "learning_rate": 2.6600000000000003e-05, "loss": 1.0214, "step": 133 }, { "epoch": 0.10086563793752353, "grad_norm": 0.2542151212692261, "learning_rate": 2.6800000000000004e-05, "loss": 0.8403, "step": 134 }, { "epoch": 0.10161836657884833, "grad_norm": 0.96816486120224, "learning_rate": 2.7000000000000002e-05, "loss": 0.9584, "step": 135 }, { "epoch": 0.10237109522017313, "grad_norm": 0.4161529242992401, "learning_rate": 2.7200000000000004e-05, "loss": 0.9456, "step": 136 }, { "epoch": 0.10312382386149793, "grad_norm": 0.3158939480781555, "learning_rate": 2.7400000000000002e-05, "loss": 0.9159, "step": 137 }, { "epoch": 0.10387655250282274, "grad_norm": 0.35430020093917847, "learning_rate": 2.7600000000000003e-05, "loss": 0.969, "step": 138 }, { "epoch": 0.10462928114414753, "grad_norm": 0.2892829477787018, "learning_rate": 2.7800000000000005e-05, "loss": 0.9632, "step": 139 }, { "epoch": 0.10538200978547234, "grad_norm": 0.47695109248161316, "learning_rate": 2.8000000000000003e-05, "loss": 0.9867, "step": 140 }, { "epoch": 0.10613473842679715, "grad_norm": 0.35889697074890137, "learning_rate": 2.8199999999999998e-05, "loss": 0.9118, "step": 141 }, { "epoch": 0.10688746706812194, "grad_norm": 0.5504655241966248, "learning_rate": 2.84e-05, "loss": 0.8921, "step": 142 }, { "epoch": 0.10764019570944675, "grad_norm": 0.28475067019462585, "learning_rate": 2.86e-05, "loss": 0.9337, "step": 143 }, { "epoch": 0.10839292435077155, "grad_norm": 0.27235740423202515, "learning_rate": 2.88e-05, "loss": 0.9006, "step": 144 }, { "epoch": 0.10914565299209635, "grad_norm": 0.4641967713832855, "learning_rate": 2.9e-05, "loss": 0.9618, "step": 145 }, { "epoch": 0.10989838163342115, "grad_norm": 0.36937063932418823, "learning_rate": 2.9199999999999998e-05, "loss": 0.9651, "step": 146 }, { "epoch": 0.11065111027474596, "grad_norm": 0.3489801287651062, "learning_rate": 2.94e-05, "loss": 0.9694, "step": 147 }, { "epoch": 0.11140383891607075, "grad_norm": 0.54405677318573, "learning_rate": 2.96e-05, "loss": 1.0111, "step": 148 }, { "epoch": 0.11215656755739556, "grad_norm": 0.37925031781196594, "learning_rate": 2.98e-05, "loss": 0.9083, "step": 149 }, { "epoch": 0.11290929619872037, "grad_norm": 0.47339364886283875, "learning_rate": 3e-05, "loss": 1.0255, "step": 150 }, { "epoch": 0.11366202484004516, "grad_norm": 0.35627296566963196, "learning_rate": 3.02e-05, "loss": 0.9266, "step": 151 }, { "epoch": 0.11441475348136997, "grad_norm": 0.29646748304367065, "learning_rate": 3.04e-05, "loss": 0.8425, "step": 152 }, { "epoch": 0.11516748212269477, "grad_norm": 0.6782879829406738, "learning_rate": 3.06e-05, "loss": 0.9709, "step": 153 }, { "epoch": 0.11592021076401957, "grad_norm": 0.3678097128868103, "learning_rate": 3.08e-05, "loss": 0.9242, "step": 154 }, { "epoch": 0.11667293940534437, "grad_norm": 0.31226426362991333, "learning_rate": 3.1e-05, "loss": 0.816, "step": 155 }, { "epoch": 0.11742566804666918, "grad_norm": 0.427293062210083, "learning_rate": 3.12e-05, "loss": 0.9697, "step": 156 }, { "epoch": 0.11817839668799397, "grad_norm": 0.26701492071151733, "learning_rate": 3.1400000000000004e-05, "loss": 0.8469, "step": 157 }, { "epoch": 0.11893112532931878, "grad_norm": 0.36263585090637207, "learning_rate": 3.16e-05, "loss": 0.8431, "step": 158 }, { "epoch": 0.11968385397064359, "grad_norm": 0.44940638542175293, "learning_rate": 3.18e-05, "loss": 0.8068, "step": 159 }, { "epoch": 0.12043658261196838, "grad_norm": 0.7073978185653687, "learning_rate": 3.2000000000000005e-05, "loss": 1.0438, "step": 160 }, { "epoch": 0.12118931125329319, "grad_norm": 0.39311057329177856, "learning_rate": 3.2200000000000003e-05, "loss": 0.8658, "step": 161 }, { "epoch": 0.12194203989461799, "grad_norm": 0.3423636257648468, "learning_rate": 3.24e-05, "loss": 0.8273, "step": 162 }, { "epoch": 0.12269476853594279, "grad_norm": 0.37941113114356995, "learning_rate": 3.26e-05, "loss": 0.8611, "step": 163 }, { "epoch": 0.12344749717726759, "grad_norm": 0.34221428632736206, "learning_rate": 3.2800000000000004e-05, "loss": 0.9434, "step": 164 }, { "epoch": 0.1242002258185924, "grad_norm": 0.31447362899780273, "learning_rate": 3.3e-05, "loss": 0.8399, "step": 165 }, { "epoch": 0.1249529544599172, "grad_norm": 0.28165510296821594, "learning_rate": 3.32e-05, "loss": 0.8028, "step": 166 }, { "epoch": 0.125705683101242, "grad_norm": 0.2968732714653015, "learning_rate": 3.3400000000000005e-05, "loss": 0.9019, "step": 167 }, { "epoch": 0.1264584117425668, "grad_norm": 0.2548545300960541, "learning_rate": 3.3600000000000004e-05, "loss": 0.8792, "step": 168 }, { "epoch": 0.1272111403838916, "grad_norm": 0.3172474205493927, "learning_rate": 3.38e-05, "loss": 0.8857, "step": 169 }, { "epoch": 0.12796386902521642, "grad_norm": 0.2785339057445526, "learning_rate": 3.4000000000000007e-05, "loss": 0.8365, "step": 170 }, { "epoch": 0.1287165976665412, "grad_norm": 0.2668098509311676, "learning_rate": 3.4200000000000005e-05, "loss": 0.8227, "step": 171 }, { "epoch": 0.129469326307866, "grad_norm": 0.5750877261161804, "learning_rate": 3.4399999999999996e-05, "loss": 0.9215, "step": 172 }, { "epoch": 0.1302220549491908, "grad_norm": 0.33541181683540344, "learning_rate": 3.46e-05, "loss": 0.8989, "step": 173 }, { "epoch": 0.13097478359051562, "grad_norm": 0.40025594830513, "learning_rate": 3.48e-05, "loss": 0.9137, "step": 174 }, { "epoch": 0.13172751223184043, "grad_norm": 0.28235867619514465, "learning_rate": 3.5e-05, "loss": 0.8015, "step": 175 }, { "epoch": 0.13248024087316523, "grad_norm": 0.2596929669380188, "learning_rate": 3.52e-05, "loss": 0.8831, "step": 176 }, { "epoch": 0.13323296951449, "grad_norm": 0.34030628204345703, "learning_rate": 3.54e-05, "loss": 0.8893, "step": 177 }, { "epoch": 0.13398569815581482, "grad_norm": 0.32196301221847534, "learning_rate": 3.56e-05, "loss": 0.9159, "step": 178 }, { "epoch": 0.13473842679713963, "grad_norm": 0.2604764699935913, "learning_rate": 3.58e-05, "loss": 0.8196, "step": 179 }, { "epoch": 0.13549115543846443, "grad_norm": 0.2337617576122284, "learning_rate": 3.6e-05, "loss": 0.8413, "step": 180 }, { "epoch": 0.13624388407978924, "grad_norm": 0.5312713384628296, "learning_rate": 3.62e-05, "loss": 0.9241, "step": 181 }, { "epoch": 0.13699661272111405, "grad_norm": 0.3514293432235718, "learning_rate": 3.6400000000000004e-05, "loss": 0.8861, "step": 182 }, { "epoch": 0.13774934136243885, "grad_norm": 0.571564257144928, "learning_rate": 3.66e-05, "loss": 0.906, "step": 183 }, { "epoch": 0.13850207000376363, "grad_norm": 0.2656637728214264, "learning_rate": 3.68e-05, "loss": 0.8541, "step": 184 }, { "epoch": 0.13925479864508844, "grad_norm": 0.3222091495990753, "learning_rate": 3.7e-05, "loss": 0.8006, "step": 185 }, { "epoch": 0.14000752728641325, "grad_norm": 0.35164767503738403, "learning_rate": 3.72e-05, "loss": 0.8862, "step": 186 }, { "epoch": 0.14076025592773805, "grad_norm": 0.2707628011703491, "learning_rate": 3.74e-05, "loss": 0.8755, "step": 187 }, { "epoch": 0.14151298456906286, "grad_norm": 0.2660597562789917, "learning_rate": 3.76e-05, "loss": 0.8548, "step": 188 }, { "epoch": 0.14226571321038767, "grad_norm": 0.3445446491241455, "learning_rate": 3.7800000000000004e-05, "loss": 0.855, "step": 189 }, { "epoch": 0.14301844185171245, "grad_norm": 0.3263220489025116, "learning_rate": 3.8e-05, "loss": 0.7909, "step": 190 }, { "epoch": 0.14377117049303725, "grad_norm": 0.39673081040382385, "learning_rate": 3.82e-05, "loss": 0.8989, "step": 191 }, { "epoch": 0.14452389913436206, "grad_norm": 0.372717946767807, "learning_rate": 3.8400000000000005e-05, "loss": 0.9119, "step": 192 }, { "epoch": 0.14527662777568687, "grad_norm": 0.3390619456768036, "learning_rate": 3.86e-05, "loss": 0.731, "step": 193 }, { "epoch": 0.14602935641701167, "grad_norm": 0.33846616744995117, "learning_rate": 3.88e-05, "loss": 0.8016, "step": 194 }, { "epoch": 0.14678208505833648, "grad_norm": 0.3176439702510834, "learning_rate": 3.9000000000000006e-05, "loss": 0.8213, "step": 195 }, { "epoch": 0.14753481369966126, "grad_norm": 0.28465113043785095, "learning_rate": 3.9200000000000004e-05, "loss": 0.7599, "step": 196 }, { "epoch": 0.14828754234098607, "grad_norm": 0.36305108666419983, "learning_rate": 3.94e-05, "loss": 0.876, "step": 197 }, { "epoch": 0.14904027098231087, "grad_norm": 0.2996470034122467, "learning_rate": 3.960000000000001e-05, "loss": 0.8247, "step": 198 }, { "epoch": 0.14979299962363568, "grad_norm": 0.3355691432952881, "learning_rate": 3.9800000000000005e-05, "loss": 0.8332, "step": 199 }, { "epoch": 0.1505457282649605, "grad_norm": 0.40371641516685486, "learning_rate": 4e-05, "loss": 0.8013, "step": 200 }, { "epoch": 0.1505457282649605, "eval_loss": 0.8203679919242859, "eval_runtime": 456.1969, "eval_samples_per_second": 21.103, "eval_steps_per_second": 0.66, "step": 200 }, { "epoch": 0.1512984569062853, "grad_norm": 0.4612388014793396, "learning_rate": 4.02e-05, "loss": 0.9646, "step": 201 }, { "epoch": 0.15205118554761007, "grad_norm": 0.3118171989917755, "learning_rate": 4.0400000000000006e-05, "loss": 0.836, "step": 202 }, { "epoch": 0.15280391418893488, "grad_norm": 0.2617281675338745, "learning_rate": 4.0600000000000004e-05, "loss": 0.8638, "step": 203 }, { "epoch": 0.1535566428302597, "grad_norm": 0.275842547416687, "learning_rate": 4.08e-05, "loss": 0.8043, "step": 204 }, { "epoch": 0.1543093714715845, "grad_norm": 0.23513798415660858, "learning_rate": 4.1e-05, "loss": 0.8157, "step": 205 }, { "epoch": 0.1550621001129093, "grad_norm": 0.3753209710121155, "learning_rate": 4.12e-05, "loss": 0.9045, "step": 206 }, { "epoch": 0.1558148287542341, "grad_norm": 0.34167638421058655, "learning_rate": 4.14e-05, "loss": 0.89, "step": 207 }, { "epoch": 0.1565675573955589, "grad_norm": 0.2409437745809555, "learning_rate": 4.16e-05, "loss": 0.85, "step": 208 }, { "epoch": 0.1573202860368837, "grad_norm": 0.34718069434165955, "learning_rate": 4.18e-05, "loss": 0.8462, "step": 209 }, { "epoch": 0.1580730146782085, "grad_norm": 0.3541075587272644, "learning_rate": 4.2e-05, "loss": 0.7713, "step": 210 }, { "epoch": 0.1588257433195333, "grad_norm": 0.26061850786209106, "learning_rate": 4.22e-05, "loss": 0.8884, "step": 211 }, { "epoch": 0.15957847196085811, "grad_norm": 0.3114808201789856, "learning_rate": 4.24e-05, "loss": 0.8849, "step": 212 }, { "epoch": 0.16033120060218292, "grad_norm": 0.3147481381893158, "learning_rate": 4.26e-05, "loss": 0.8613, "step": 213 }, { "epoch": 0.16108392924350773, "grad_norm": 0.2460867464542389, "learning_rate": 4.2800000000000004e-05, "loss": 0.7595, "step": 214 }, { "epoch": 0.1618366578848325, "grad_norm": 0.27717021107673645, "learning_rate": 4.3e-05, "loss": 0.8054, "step": 215 }, { "epoch": 0.16258938652615731, "grad_norm": 0.2939113676548004, "learning_rate": 4.32e-05, "loss": 0.8148, "step": 216 }, { "epoch": 0.16334211516748212, "grad_norm": 0.3336883783340454, "learning_rate": 4.3400000000000005e-05, "loss": 0.7845, "step": 217 }, { "epoch": 0.16409484380880693, "grad_norm": 0.3689129948616028, "learning_rate": 4.36e-05, "loss": 0.7734, "step": 218 }, { "epoch": 0.16484757245013174, "grad_norm": 0.5129280090332031, "learning_rate": 4.38e-05, "loss": 0.8676, "step": 219 }, { "epoch": 0.16560030109145654, "grad_norm": 0.3376157581806183, "learning_rate": 4.4000000000000006e-05, "loss": 0.8731, "step": 220 }, { "epoch": 0.16635302973278132, "grad_norm": 0.46399790048599243, "learning_rate": 4.4200000000000004e-05, "loss": 0.7524, "step": 221 }, { "epoch": 0.16710575837410613, "grad_norm": 0.3968639075756073, "learning_rate": 4.44e-05, "loss": 0.7867, "step": 222 }, { "epoch": 0.16785848701543093, "grad_norm": 0.32370316982269287, "learning_rate": 4.46e-05, "loss": 0.827, "step": 223 }, { "epoch": 0.16861121565675574, "grad_norm": 0.38691896200180054, "learning_rate": 4.4800000000000005e-05, "loss": 0.8368, "step": 224 }, { "epoch": 0.16936394429808055, "grad_norm": 0.42120781540870667, "learning_rate": 4.5e-05, "loss": 0.8421, "step": 225 }, { "epoch": 0.17011667293940536, "grad_norm": 0.2650003731250763, "learning_rate": 4.52e-05, "loss": 0.8395, "step": 226 }, { "epoch": 0.17086940158073013, "grad_norm": 0.6980119943618774, "learning_rate": 4.5400000000000006e-05, "loss": 0.92, "step": 227 }, { "epoch": 0.17162213022205494, "grad_norm": 0.43689489364624023, "learning_rate": 4.5600000000000004e-05, "loss": 0.8064, "step": 228 }, { "epoch": 0.17237485886337975, "grad_norm": 0.3624732494354248, "learning_rate": 4.58e-05, "loss": 0.7897, "step": 229 }, { "epoch": 0.17312758750470456, "grad_norm": 0.3105034828186035, "learning_rate": 4.600000000000001e-05, "loss": 0.7401, "step": 230 }, { "epoch": 0.17388031614602936, "grad_norm": 0.2699925899505615, "learning_rate": 4.6200000000000005e-05, "loss": 0.783, "step": 231 }, { "epoch": 0.17463304478735417, "grad_norm": 0.28839266300201416, "learning_rate": 4.64e-05, "loss": 0.8215, "step": 232 }, { "epoch": 0.17538577342867895, "grad_norm": 0.3672228157520294, "learning_rate": 4.660000000000001e-05, "loss": 0.7912, "step": 233 }, { "epoch": 0.17613850207000376, "grad_norm": 0.38578033447265625, "learning_rate": 4.6800000000000006e-05, "loss": 0.7663, "step": 234 }, { "epoch": 0.17689123071132856, "grad_norm": 0.3876879811286926, "learning_rate": 4.7e-05, "loss": 0.8104, "step": 235 }, { "epoch": 0.17764395935265337, "grad_norm": 0.38406890630722046, "learning_rate": 4.72e-05, "loss": 0.7746, "step": 236 }, { "epoch": 0.17839668799397818, "grad_norm": 0.3411761224269867, "learning_rate": 4.74e-05, "loss": 0.7418, "step": 237 }, { "epoch": 0.17914941663530298, "grad_norm": 0.45418205857276917, "learning_rate": 4.76e-05, "loss": 0.8362, "step": 238 }, { "epoch": 0.1799021452766278, "grad_norm": 0.4977017343044281, "learning_rate": 4.78e-05, "loss": 0.8028, "step": 239 }, { "epoch": 0.18065487391795257, "grad_norm": 0.4045603275299072, "learning_rate": 4.8e-05, "loss": 0.6989, "step": 240 }, { "epoch": 0.18140760255927738, "grad_norm": 0.4060465693473816, "learning_rate": 4.82e-05, "loss": 0.8118, "step": 241 }, { "epoch": 0.18216033120060218, "grad_norm": 0.36543208360671997, "learning_rate": 4.8400000000000004e-05, "loss": 0.8208, "step": 242 }, { "epoch": 0.182913059841927, "grad_norm": 0.3181208074092865, "learning_rate": 4.86e-05, "loss": 0.7889, "step": 243 }, { "epoch": 0.1836657884832518, "grad_norm": 0.9807712435722351, "learning_rate": 4.88e-05, "loss": 0.8099, "step": 244 }, { "epoch": 0.1844185171245766, "grad_norm": 0.8735480308532715, "learning_rate": 4.9e-05, "loss": 0.7819, "step": 245 }, { "epoch": 0.18517124576590138, "grad_norm": 0.49874505400657654, "learning_rate": 4.92e-05, "loss": 0.8369, "step": 246 }, { "epoch": 0.1859239744072262, "grad_norm": 0.4944124221801758, "learning_rate": 4.94e-05, "loss": 0.7247, "step": 247 }, { "epoch": 0.186676703048551, "grad_norm": 0.3712370693683624, "learning_rate": 4.96e-05, "loss": 0.8157, "step": 248 }, { "epoch": 0.1874294316898758, "grad_norm": 0.46207642555236816, "learning_rate": 4.9800000000000004e-05, "loss": 0.7924, "step": 249 }, { "epoch": 0.1881821603312006, "grad_norm": 0.4319447875022888, "learning_rate": 5e-05, "loss": 0.7869, "step": 250 }, { "epoch": 0.18893488897252542, "grad_norm": 0.44595691561698914, "learning_rate": 5.02e-05, "loss": 0.8532, "step": 251 }, { "epoch": 0.1896876176138502, "grad_norm": 0.43170109391212463, "learning_rate": 5.0400000000000005e-05, "loss": 0.8321, "step": 252 }, { "epoch": 0.190440346255175, "grad_norm": 0.3418758511543274, "learning_rate": 5.0600000000000003e-05, "loss": 0.7604, "step": 253 }, { "epoch": 0.1911930748964998, "grad_norm": 0.4122261106967926, "learning_rate": 5.08e-05, "loss": 0.7583, "step": 254 }, { "epoch": 0.19194580353782462, "grad_norm": 0.35605013370513916, "learning_rate": 5.1000000000000006e-05, "loss": 0.7653, "step": 255 }, { "epoch": 0.19269853217914942, "grad_norm": 0.3558778762817383, "learning_rate": 5.1200000000000004e-05, "loss": 0.7891, "step": 256 }, { "epoch": 0.19345126082047423, "grad_norm": 0.31115642189979553, "learning_rate": 5.14e-05, "loss": 0.7913, "step": 257 }, { "epoch": 0.194203989461799, "grad_norm": 0.3224548399448395, "learning_rate": 5.16e-05, "loss": 0.7662, "step": 258 }, { "epoch": 0.19495671810312382, "grad_norm": 0.3463912308216095, "learning_rate": 5.1800000000000005e-05, "loss": 0.7808, "step": 259 }, { "epoch": 0.19570944674444862, "grad_norm": 0.6614294648170471, "learning_rate": 5.2000000000000004e-05, "loss": 0.8056, "step": 260 }, { "epoch": 0.19646217538577343, "grad_norm": 0.39197373390197754, "learning_rate": 5.22e-05, "loss": 0.7701, "step": 261 }, { "epoch": 0.19721490402709824, "grad_norm": 0.403980016708374, "learning_rate": 5.2400000000000007e-05, "loss": 0.7937, "step": 262 }, { "epoch": 0.19796763266842304, "grad_norm": 0.27448728680610657, "learning_rate": 5.2600000000000005e-05, "loss": 0.7875, "step": 263 }, { "epoch": 0.19872036130974782, "grad_norm": 0.3036741018295288, "learning_rate": 5.28e-05, "loss": 0.7743, "step": 264 }, { "epoch": 0.19947308995107263, "grad_norm": 0.30664944648742676, "learning_rate": 5.300000000000001e-05, "loss": 0.7773, "step": 265 }, { "epoch": 0.20022581859239744, "grad_norm": 0.28989657759666443, "learning_rate": 5.3200000000000006e-05, "loss": 0.7845, "step": 266 }, { "epoch": 0.20097854723372224, "grad_norm": 0.4782477617263794, "learning_rate": 5.3400000000000004e-05, "loss": 0.8829, "step": 267 }, { "epoch": 0.20173127587504705, "grad_norm": 0.28491783142089844, "learning_rate": 5.360000000000001e-05, "loss": 0.7965, "step": 268 }, { "epoch": 0.20248400451637186, "grad_norm": 0.4176754951477051, "learning_rate": 5.380000000000001e-05, "loss": 0.7887, "step": 269 }, { "epoch": 0.20323673315769666, "grad_norm": 0.6066096425056458, "learning_rate": 5.4000000000000005e-05, "loss": 0.8195, "step": 270 }, { "epoch": 0.20398946179902144, "grad_norm": 0.3315056264400482, "learning_rate": 5.420000000000001e-05, "loss": 0.8848, "step": 271 }, { "epoch": 0.20474219044034625, "grad_norm": 0.4893193244934082, "learning_rate": 5.440000000000001e-05, "loss": 0.7442, "step": 272 }, { "epoch": 0.20549491908167106, "grad_norm": 0.39621856808662415, "learning_rate": 5.4600000000000006e-05, "loss": 0.7651, "step": 273 }, { "epoch": 0.20624764772299586, "grad_norm": 0.36131107807159424, "learning_rate": 5.4800000000000004e-05, "loss": 0.7964, "step": 274 }, { "epoch": 0.20700037636432067, "grad_norm": 0.3516080379486084, "learning_rate": 5.500000000000001e-05, "loss": 0.8108, "step": 275 }, { "epoch": 0.20775310500564548, "grad_norm": 0.31726184487342834, "learning_rate": 5.520000000000001e-05, "loss": 0.7644, "step": 276 }, { "epoch": 0.20850583364697026, "grad_norm": 0.34202656149864197, "learning_rate": 5.5400000000000005e-05, "loss": 0.8353, "step": 277 }, { "epoch": 0.20925856228829506, "grad_norm": 0.30604588985443115, "learning_rate": 5.560000000000001e-05, "loss": 0.8918, "step": 278 }, { "epoch": 0.21001129092961987, "grad_norm": 0.2831065356731415, "learning_rate": 5.580000000000001e-05, "loss": 0.7605, "step": 279 }, { "epoch": 0.21076401957094468, "grad_norm": 0.31375443935394287, "learning_rate": 5.6000000000000006e-05, "loss": 0.786, "step": 280 }, { "epoch": 0.21151674821226948, "grad_norm": 0.3313942551612854, "learning_rate": 5.620000000000001e-05, "loss": 0.7972, "step": 281 }, { "epoch": 0.2122694768535943, "grad_norm": 0.37037038803100586, "learning_rate": 5.6399999999999995e-05, "loss": 0.8255, "step": 282 }, { "epoch": 0.21302220549491907, "grad_norm": 0.2462696135044098, "learning_rate": 5.66e-05, "loss": 0.7911, "step": 283 }, { "epoch": 0.21377493413624388, "grad_norm": 0.26353734731674194, "learning_rate": 5.68e-05, "loss": 0.6817, "step": 284 }, { "epoch": 0.21452766277756868, "grad_norm": 0.2452244758605957, "learning_rate": 5.6999999999999996e-05, "loss": 0.7888, "step": 285 }, { "epoch": 0.2152803914188935, "grad_norm": 0.25994354486465454, "learning_rate": 5.72e-05, "loss": 0.7943, "step": 286 }, { "epoch": 0.2160331200602183, "grad_norm": 0.2981669008731842, "learning_rate": 5.74e-05, "loss": 0.7328, "step": 287 }, { "epoch": 0.2167858487015431, "grad_norm": 0.3885234296321869, "learning_rate": 5.76e-05, "loss": 0.8466, "step": 288 }, { "epoch": 0.21753857734286788, "grad_norm": 0.5408052802085876, "learning_rate": 5.7799999999999995e-05, "loss": 0.7548, "step": 289 }, { "epoch": 0.2182913059841927, "grad_norm": 0.5943387150764465, "learning_rate": 5.8e-05, "loss": 0.753, "step": 290 }, { "epoch": 0.2190440346255175, "grad_norm": 0.5453189015388489, "learning_rate": 5.82e-05, "loss": 0.7827, "step": 291 }, { "epoch": 0.2197967632668423, "grad_norm": 0.42057299613952637, "learning_rate": 5.8399999999999997e-05, "loss": 0.7611, "step": 292 }, { "epoch": 0.2205494919081671, "grad_norm": 0.3952324688434601, "learning_rate": 5.86e-05, "loss": 0.7639, "step": 293 }, { "epoch": 0.22130222054949192, "grad_norm": 0.3317667543888092, "learning_rate": 5.88e-05, "loss": 0.777, "step": 294 }, { "epoch": 0.2220549491908167, "grad_norm": 0.5286160707473755, "learning_rate": 5.9e-05, "loss": 0.7903, "step": 295 }, { "epoch": 0.2228076778321415, "grad_norm": 0.41035595536231995, "learning_rate": 5.92e-05, "loss": 0.7944, "step": 296 }, { "epoch": 0.2235604064734663, "grad_norm": 0.40989115834236145, "learning_rate": 5.94e-05, "loss": 0.8077, "step": 297 }, { "epoch": 0.22431313511479112, "grad_norm": 0.25725850462913513, "learning_rate": 5.96e-05, "loss": 0.7641, "step": 298 }, { "epoch": 0.22506586375611592, "grad_norm": 0.43568113446235657, "learning_rate": 5.9800000000000003e-05, "loss": 0.7207, "step": 299 }, { "epoch": 0.22581859239744073, "grad_norm": 0.4769858419895172, "learning_rate": 6e-05, "loss": 0.7622, "step": 300 }, { "epoch": 0.22657132103876554, "grad_norm": 0.2518361508846283, "learning_rate": 6.02e-05, "loss": 0.7665, "step": 301 }, { "epoch": 0.22732404968009032, "grad_norm": 0.34996485710144043, "learning_rate": 6.04e-05, "loss": 0.8803, "step": 302 }, { "epoch": 0.22807677832141512, "grad_norm": 0.3380737900733948, "learning_rate": 6.06e-05, "loss": 0.7556, "step": 303 }, { "epoch": 0.22882950696273993, "grad_norm": 0.2602279484272003, "learning_rate": 6.08e-05, "loss": 0.7542, "step": 304 }, { "epoch": 0.22958223560406474, "grad_norm": 0.29198554158210754, "learning_rate": 6.1e-05, "loss": 0.7419, "step": 305 }, { "epoch": 0.23033496424538955, "grad_norm": 0.3119921386241913, "learning_rate": 6.12e-05, "loss": 0.7921, "step": 306 }, { "epoch": 0.23108769288671435, "grad_norm": 0.3673281967639923, "learning_rate": 6.14e-05, "loss": 0.7266, "step": 307 }, { "epoch": 0.23184042152803913, "grad_norm": 0.38749566674232483, "learning_rate": 6.16e-05, "loss": 0.7325, "step": 308 }, { "epoch": 0.23259315016936394, "grad_norm": 0.3868984580039978, "learning_rate": 6.18e-05, "loss": 0.7371, "step": 309 }, { "epoch": 0.23334587881068874, "grad_norm": 0.28804677724838257, "learning_rate": 6.2e-05, "loss": 0.8236, "step": 310 }, { "epoch": 0.23409860745201355, "grad_norm": 0.3390117287635803, "learning_rate": 6.220000000000001e-05, "loss": 0.7592, "step": 311 }, { "epoch": 0.23485133609333836, "grad_norm": 0.3691128194332123, "learning_rate": 6.24e-05, "loss": 0.7263, "step": 312 }, { "epoch": 0.23560406473466317, "grad_norm": 0.30750468373298645, "learning_rate": 6.26e-05, "loss": 0.7442, "step": 313 }, { "epoch": 0.23635679337598794, "grad_norm": 0.3355133831501007, "learning_rate": 6.280000000000001e-05, "loss": 0.744, "step": 314 }, { "epoch": 0.23710952201731275, "grad_norm": 0.5015854835510254, "learning_rate": 6.3e-05, "loss": 0.7642, "step": 315 }, { "epoch": 0.23786225065863756, "grad_norm": 0.49220702052116394, "learning_rate": 6.32e-05, "loss": 0.7105, "step": 316 }, { "epoch": 0.23861497929996237, "grad_norm": 0.6501400470733643, "learning_rate": 6.340000000000001e-05, "loss": 0.8018, "step": 317 }, { "epoch": 0.23936770794128717, "grad_norm": 0.5972819924354553, "learning_rate": 6.36e-05, "loss": 0.7855, "step": 318 }, { "epoch": 0.24012043658261198, "grad_norm": 0.28038111329078674, "learning_rate": 6.38e-05, "loss": 0.7596, "step": 319 }, { "epoch": 0.24087316522393676, "grad_norm": 0.5634212493896484, "learning_rate": 6.400000000000001e-05, "loss": 0.7948, "step": 320 }, { "epoch": 0.24162589386526157, "grad_norm": 0.32602134346961975, "learning_rate": 6.42e-05, "loss": 0.7551, "step": 321 }, { "epoch": 0.24237862250658637, "grad_norm": 0.3773108124732971, "learning_rate": 6.440000000000001e-05, "loss": 0.8171, "step": 322 }, { "epoch": 0.24313135114791118, "grad_norm": 0.23797853291034698, "learning_rate": 6.460000000000001e-05, "loss": 0.7561, "step": 323 }, { "epoch": 0.24388407978923599, "grad_norm": 0.3761266767978668, "learning_rate": 6.48e-05, "loss": 0.7453, "step": 324 }, { "epoch": 0.2446368084305608, "grad_norm": 0.48819491267204285, "learning_rate": 6.500000000000001e-05, "loss": 0.8165, "step": 325 }, { "epoch": 0.24538953707188557, "grad_norm": 0.26395562291145325, "learning_rate": 6.52e-05, "loss": 0.7369, "step": 326 }, { "epoch": 0.24614226571321038, "grad_norm": 0.3274032175540924, "learning_rate": 6.54e-05, "loss": 0.7072, "step": 327 }, { "epoch": 0.24689499435453519, "grad_norm": 0.4241209924221039, "learning_rate": 6.560000000000001e-05, "loss": 0.8563, "step": 328 }, { "epoch": 0.24764772299586, "grad_norm": 0.43141239881515503, "learning_rate": 6.58e-05, "loss": 0.7884, "step": 329 }, { "epoch": 0.2484004516371848, "grad_norm": 0.3286961317062378, "learning_rate": 6.6e-05, "loss": 0.783, "step": 330 }, { "epoch": 0.2491531802785096, "grad_norm": 0.30689504742622375, "learning_rate": 6.620000000000001e-05, "loss": 0.7902, "step": 331 }, { "epoch": 0.2499059089198344, "grad_norm": 0.2803178131580353, "learning_rate": 6.64e-05, "loss": 0.7956, "step": 332 }, { "epoch": 0.2506586375611592, "grad_norm": 0.38576218485832214, "learning_rate": 6.66e-05, "loss": 0.7101, "step": 333 }, { "epoch": 0.251411366202484, "grad_norm": 0.32205697894096375, "learning_rate": 6.680000000000001e-05, "loss": 0.7461, "step": 334 }, { "epoch": 0.2521640948438088, "grad_norm": 0.2674986720085144, "learning_rate": 6.7e-05, "loss": 0.7271, "step": 335 }, { "epoch": 0.2529168234851336, "grad_norm": 0.32082071900367737, "learning_rate": 6.720000000000001e-05, "loss": 0.848, "step": 336 }, { "epoch": 0.2536695521264584, "grad_norm": 0.33319294452667236, "learning_rate": 6.740000000000001e-05, "loss": 0.6647, "step": 337 }, { "epoch": 0.2544222807677832, "grad_norm": 0.4610401690006256, "learning_rate": 6.76e-05, "loss": 0.7402, "step": 338 }, { "epoch": 0.25517500940910803, "grad_norm": 0.6135522723197937, "learning_rate": 6.780000000000001e-05, "loss": 0.6797, "step": 339 }, { "epoch": 0.25592773805043284, "grad_norm": 0.5271710157394409, "learning_rate": 6.800000000000001e-05, "loss": 0.7154, "step": 340 }, { "epoch": 0.25668046669175765, "grad_norm": 0.2914232909679413, "learning_rate": 6.82e-05, "loss": 0.6866, "step": 341 }, { "epoch": 0.2574331953330824, "grad_norm": 0.39607709646224976, "learning_rate": 6.840000000000001e-05, "loss": 0.7322, "step": 342 }, { "epoch": 0.2581859239744072, "grad_norm": 0.4765622019767761, "learning_rate": 6.860000000000001e-05, "loss": 0.7356, "step": 343 }, { "epoch": 0.258938652615732, "grad_norm": 0.4473394453525543, "learning_rate": 6.879999999999999e-05, "loss": 0.6551, "step": 344 }, { "epoch": 0.2596913812570568, "grad_norm": 0.30285343527793884, "learning_rate": 6.9e-05, "loss": 0.7281, "step": 345 }, { "epoch": 0.2604441098983816, "grad_norm": 0.6322281360626221, "learning_rate": 6.92e-05, "loss": 0.8663, "step": 346 }, { "epoch": 0.26119683853970643, "grad_norm": 0.49965527653694153, "learning_rate": 6.939999999999999e-05, "loss": 0.7759, "step": 347 }, { "epoch": 0.26194956718103124, "grad_norm": 0.37054643034935, "learning_rate": 6.96e-05, "loss": 0.6903, "step": 348 }, { "epoch": 0.26270229582235605, "grad_norm": 0.47299280762672424, "learning_rate": 6.98e-05, "loss": 0.7734, "step": 349 }, { "epoch": 0.26345502446368085, "grad_norm": 0.3119524121284485, "learning_rate": 7e-05, "loss": 0.6187, "step": 350 }, { "epoch": 0.26420775310500566, "grad_norm": 0.3282020688056946, "learning_rate": 7.02e-05, "loss": 0.7716, "step": 351 }, { "epoch": 0.26496048174633047, "grad_norm": 0.39386796951293945, "learning_rate": 7.04e-05, "loss": 0.8321, "step": 352 }, { "epoch": 0.2657132103876553, "grad_norm": 0.29106295108795166, "learning_rate": 7.06e-05, "loss": 0.8407, "step": 353 }, { "epoch": 0.26646593902898, "grad_norm": 0.3301931321620941, "learning_rate": 7.08e-05, "loss": 0.6771, "step": 354 }, { "epoch": 0.26721866767030483, "grad_norm": 0.3401299715042114, "learning_rate": 7.1e-05, "loss": 0.7359, "step": 355 }, { "epoch": 0.26797139631162964, "grad_norm": 0.26050397753715515, "learning_rate": 7.12e-05, "loss": 0.7306, "step": 356 }, { "epoch": 0.26872412495295445, "grad_norm": 0.3120352625846863, "learning_rate": 7.14e-05, "loss": 0.6834, "step": 357 }, { "epoch": 0.26947685359427925, "grad_norm": 0.2930803894996643, "learning_rate": 7.16e-05, "loss": 0.7302, "step": 358 }, { "epoch": 0.27022958223560406, "grad_norm": 0.40829208493232727, "learning_rate": 7.18e-05, "loss": 0.6707, "step": 359 }, { "epoch": 0.27098231087692887, "grad_norm": 0.3065964877605438, "learning_rate": 7.2e-05, "loss": 0.6837, "step": 360 }, { "epoch": 0.2717350395182537, "grad_norm": 0.33545950055122375, "learning_rate": 7.22e-05, "loss": 0.7597, "step": 361 }, { "epoch": 0.2724877681595785, "grad_norm": 0.2868618071079254, "learning_rate": 7.24e-05, "loss": 0.7694, "step": 362 }, { "epoch": 0.2732404968009033, "grad_norm": 0.2842622399330139, "learning_rate": 7.26e-05, "loss": 0.7072, "step": 363 }, { "epoch": 0.2739932254422281, "grad_norm": 0.27490857243537903, "learning_rate": 7.280000000000001e-05, "loss": 0.7262, "step": 364 }, { "epoch": 0.2747459540835529, "grad_norm": 0.30756035447120667, "learning_rate": 7.3e-05, "loss": 0.6565, "step": 365 }, { "epoch": 0.2754986827248777, "grad_norm": 0.32821470499038696, "learning_rate": 7.32e-05, "loss": 0.6762, "step": 366 }, { "epoch": 0.27625141136620246, "grad_norm": 0.32173722982406616, "learning_rate": 7.340000000000001e-05, "loss": 0.6997, "step": 367 }, { "epoch": 0.27700414000752727, "grad_norm": 0.3224622309207916, "learning_rate": 7.36e-05, "loss": 0.7733, "step": 368 }, { "epoch": 0.2777568686488521, "grad_norm": 0.4852927327156067, "learning_rate": 7.38e-05, "loss": 0.7414, "step": 369 }, { "epoch": 0.2785095972901769, "grad_norm": 0.34145259857177734, "learning_rate": 7.4e-05, "loss": 0.7654, "step": 370 }, { "epoch": 0.2792623259315017, "grad_norm": 0.40339773893356323, "learning_rate": 7.42e-05, "loss": 0.6566, "step": 371 }, { "epoch": 0.2800150545728265, "grad_norm": 0.3502252995967865, "learning_rate": 7.44e-05, "loss": 0.7048, "step": 372 }, { "epoch": 0.2807677832141513, "grad_norm": 0.49175596237182617, "learning_rate": 7.46e-05, "loss": 0.7076, "step": 373 }, { "epoch": 0.2815205118554761, "grad_norm": 0.39595040678977966, "learning_rate": 7.48e-05, "loss": 0.757, "step": 374 }, { "epoch": 0.2822732404968009, "grad_norm": 0.3812234103679657, "learning_rate": 7.500000000000001e-05, "loss": 0.7221, "step": 375 }, { "epoch": 0.2830259691381257, "grad_norm": 0.30208641290664673, "learning_rate": 7.52e-05, "loss": 0.7544, "step": 376 }, { "epoch": 0.28377869777945053, "grad_norm": 0.3959600329399109, "learning_rate": 7.54e-05, "loss": 0.6643, "step": 377 }, { "epoch": 0.28453142642077534, "grad_norm": 0.44491565227508545, "learning_rate": 7.560000000000001e-05, "loss": 0.7499, "step": 378 }, { "epoch": 0.2852841550621001, "grad_norm": 0.4338065981864929, "learning_rate": 7.58e-05, "loss": 0.7305, "step": 379 }, { "epoch": 0.2860368837034249, "grad_norm": 0.36957499384880066, "learning_rate": 7.6e-05, "loss": 0.661, "step": 380 }, { "epoch": 0.2867896123447497, "grad_norm": 0.5704991817474365, "learning_rate": 7.620000000000001e-05, "loss": 0.751, "step": 381 }, { "epoch": 0.2875423409860745, "grad_norm": 0.5830215811729431, "learning_rate": 7.64e-05, "loss": 0.6552, "step": 382 }, { "epoch": 0.2882950696273993, "grad_norm": 0.4083460867404938, "learning_rate": 7.66e-05, "loss": 0.7048, "step": 383 }, { "epoch": 0.2890477982687241, "grad_norm": 0.8643603324890137, "learning_rate": 7.680000000000001e-05, "loss": 0.7873, "step": 384 }, { "epoch": 0.28980052691004893, "grad_norm": 0.5665083527565002, "learning_rate": 7.7e-05, "loss": 0.6513, "step": 385 }, { "epoch": 0.29055325555137373, "grad_norm": 0.4602196216583252, "learning_rate": 7.72e-05, "loss": 0.6957, "step": 386 }, { "epoch": 0.29130598419269854, "grad_norm": 0.4623255729675293, "learning_rate": 7.740000000000001e-05, "loss": 0.714, "step": 387 }, { "epoch": 0.29205871283402335, "grad_norm": 0.6260291934013367, "learning_rate": 7.76e-05, "loss": 0.7146, "step": 388 }, { "epoch": 0.29281144147534816, "grad_norm": 0.6141284704208374, "learning_rate": 7.780000000000001e-05, "loss": 0.7824, "step": 389 }, { "epoch": 0.29356417011667296, "grad_norm": 0.3731597065925598, "learning_rate": 7.800000000000001e-05, "loss": 0.7498, "step": 390 }, { "epoch": 0.29431689875799777, "grad_norm": 0.4038994312286377, "learning_rate": 7.82e-05, "loss": 0.735, "step": 391 }, { "epoch": 0.2950696273993225, "grad_norm": 0.484579861164093, "learning_rate": 7.840000000000001e-05, "loss": 0.7906, "step": 392 }, { "epoch": 0.2958223560406473, "grad_norm": 0.3141382336616516, "learning_rate": 7.860000000000001e-05, "loss": 0.7596, "step": 393 }, { "epoch": 0.29657508468197213, "grad_norm": 0.639032244682312, "learning_rate": 7.88e-05, "loss": 0.7677, "step": 394 }, { "epoch": 0.29732781332329694, "grad_norm": 0.4770423173904419, "learning_rate": 7.900000000000001e-05, "loss": 0.7491, "step": 395 }, { "epoch": 0.29808054196462175, "grad_norm": 0.3542947471141815, "learning_rate": 7.920000000000001e-05, "loss": 0.69, "step": 396 }, { "epoch": 0.29883327060594655, "grad_norm": 0.5125464797019958, "learning_rate": 7.94e-05, "loss": 0.707, "step": 397 }, { "epoch": 0.29958599924727136, "grad_norm": 0.27063336968421936, "learning_rate": 7.960000000000001e-05, "loss": 0.7662, "step": 398 }, { "epoch": 0.30033872788859617, "grad_norm": 0.41265377402305603, "learning_rate": 7.98e-05, "loss": 0.7511, "step": 399 }, { "epoch": 0.301091456529921, "grad_norm": 0.3290537893772125, "learning_rate": 8e-05, "loss": 0.6924, "step": 400 }, { "epoch": 0.301091456529921, "eval_loss": 0.6964845657348633, "eval_runtime": 455.8184, "eval_samples_per_second": 21.12, "eval_steps_per_second": 0.66, "step": 400 }, { "epoch": 0.0007527286413248024, "grad_norm": 0.3369865417480469, "learning_rate": 8.020000000000001e-05, "loss": 0.6435, "step": 401 }, { "epoch": 0.0015054572826496049, "grad_norm": 0.3413328230381012, "learning_rate": 8.04e-05, "loss": 0.6824, "step": 402 }, { "epoch": 0.002258185923974407, "grad_norm": 0.27245426177978516, "learning_rate": 8.060000000000001e-05, "loss": 0.7822, "step": 403 }, { "epoch": 0.0030109145652992097, "grad_norm": 0.35004276037216187, "learning_rate": 8.080000000000001e-05, "loss": 0.7936, "step": 404 }, { "epoch": 0.003763643206624012, "grad_norm": 0.2878829538822174, "learning_rate": 8.1e-05, "loss": 0.7159, "step": 405 }, { "epoch": 0.004516371847948814, "grad_norm": 0.2342945635318756, "learning_rate": 8.120000000000001e-05, "loss": 0.7357, "step": 406 }, { "epoch": 0.005269100489273617, "grad_norm": 0.30416199564933777, "learning_rate": 8.14e-05, "loss": 0.6441, "step": 407 }, { "epoch": 0.0060218291305984195, "grad_norm": 0.23268623650074005, "learning_rate": 8.16e-05, "loss": 0.6749, "step": 408 }, { "epoch": 0.006774557771923222, "grad_norm": 0.29655784368515015, "learning_rate": 8.18e-05, "loss": 0.7051, "step": 409 }, { "epoch": 0.007527286413248024, "grad_norm": 0.40569770336151123, "learning_rate": 8.2e-05, "loss": 0.5874, "step": 410 }, { "epoch": 0.008280015054572827, "grad_norm": 0.4447745978832245, "learning_rate": 8.22e-05, "loss": 0.6827, "step": 411 }, { "epoch": 0.009032743695897629, "grad_norm": 0.34940043091773987, "learning_rate": 8.24e-05, "loss": 0.6988, "step": 412 }, { "epoch": 0.00978547233722243, "grad_norm": 0.3142530620098114, "learning_rate": 8.26e-05, "loss": 0.5429, "step": 413 }, { "epoch": 0.010538200978547234, "grad_norm": 0.4153319001197815, "learning_rate": 8.28e-05, "loss": 0.7171, "step": 414 }, { "epoch": 0.011290929619872036, "grad_norm": 0.39907991886138916, "learning_rate": 8.3e-05, "loss": 0.7288, "step": 415 }, { "epoch": 0.012043658261196839, "grad_norm": 0.34510931372642517, "learning_rate": 8.32e-05, "loss": 0.6934, "step": 416 }, { "epoch": 0.01279638690252164, "grad_norm": 0.3273760974407196, "learning_rate": 8.34e-05, "loss": 0.7209, "step": 417 }, { "epoch": 0.013549115543846444, "grad_norm": 0.28461000323295593, "learning_rate": 8.36e-05, "loss": 0.6812, "step": 418 }, { "epoch": 0.014301844185171246, "grad_norm": 0.32299181818962097, "learning_rate": 8.38e-05, "loss": 0.7809, "step": 419 }, { "epoch": 0.015054572826496047, "grad_norm": 0.4065423905849457, "learning_rate": 8.4e-05, "loss": 0.6599, "step": 420 }, { "epoch": 0.01580730146782085, "grad_norm": 0.33560776710510254, "learning_rate": 8.42e-05, "loss": 0.7003, "step": 421 }, { "epoch": 0.016560030109145654, "grad_norm": 0.303831547498703, "learning_rate": 8.44e-05, "loss": 0.634, "step": 422 }, { "epoch": 0.017312758750470454, "grad_norm": 0.288095623254776, "learning_rate": 8.46e-05, "loss": 0.6468, "step": 423 }, { "epoch": 0.018065487391795258, "grad_norm": 0.3228033185005188, "learning_rate": 8.48e-05, "loss": 0.7862, "step": 424 }, { "epoch": 0.01881821603312006, "grad_norm": 0.314115434885025, "learning_rate": 8.5e-05, "loss": 0.7167, "step": 425 }, { "epoch": 0.01957094467444486, "grad_norm": 0.2543393075466156, "learning_rate": 8.52e-05, "loss": 0.6662, "step": 426 }, { "epoch": 0.020323673315769664, "grad_norm": 0.30278709530830383, "learning_rate": 8.54e-05, "loss": 0.652, "step": 427 }, { "epoch": 0.021076401957094468, "grad_norm": 0.3294178247451782, "learning_rate": 8.560000000000001e-05, "loss": 0.6639, "step": 428 }, { "epoch": 0.02182913059841927, "grad_norm": 0.2630681097507477, "learning_rate": 8.58e-05, "loss": 0.7514, "step": 429 }, { "epoch": 0.02258185923974407, "grad_norm": 0.28472599387168884, "learning_rate": 8.6e-05, "loss": 0.7696, "step": 430 }, { "epoch": 0.023334587881068874, "grad_norm": 0.2664525508880615, "learning_rate": 8.620000000000001e-05, "loss": 0.689, "step": 431 }, { "epoch": 0.024087316522393678, "grad_norm": 0.308310866355896, "learning_rate": 8.64e-05, "loss": 0.623, "step": 432 }, { "epoch": 0.024840045163718478, "grad_norm": 0.3796287178993225, "learning_rate": 8.66e-05, "loss": 0.671, "step": 433 }, { "epoch": 0.02559277380504328, "grad_norm": 0.32023879885673523, "learning_rate": 8.680000000000001e-05, "loss": 0.7662, "step": 434 }, { "epoch": 0.026345502446368085, "grad_norm": 0.3553287386894226, "learning_rate": 8.7e-05, "loss": 0.707, "step": 435 }, { "epoch": 0.027098231087692888, "grad_norm": 0.3352551758289337, "learning_rate": 8.72e-05, "loss": 0.6756, "step": 436 }, { "epoch": 0.027850959729017688, "grad_norm": 0.37620532512664795, "learning_rate": 8.740000000000001e-05, "loss": 0.6611, "step": 437 }, { "epoch": 0.02860368837034249, "grad_norm": 0.3663613796234131, "learning_rate": 8.76e-05, "loss": 0.7495, "step": 438 }, { "epoch": 0.029356417011667295, "grad_norm": 0.3216564655303955, "learning_rate": 8.78e-05, "loss": 0.6407, "step": 439 }, { "epoch": 0.030109145652992095, "grad_norm": 0.3411416709423065, "learning_rate": 8.800000000000001e-05, "loss": 0.6223, "step": 440 }, { "epoch": 0.030861874294316898, "grad_norm": 0.3298925757408142, "learning_rate": 8.82e-05, "loss": 0.6682, "step": 441 }, { "epoch": 0.0316146029356417, "grad_norm": 0.4647982120513916, "learning_rate": 8.840000000000001e-05, "loss": 0.5558, "step": 442 }, { "epoch": 0.0323673315769665, "grad_norm": 0.39863890409469604, "learning_rate": 8.86e-05, "loss": 0.6691, "step": 443 }, { "epoch": 0.03312006021829131, "grad_norm": 0.3680301606655121, "learning_rate": 8.88e-05, "loss": 0.641, "step": 444 }, { "epoch": 0.03387278885961611, "grad_norm": 0.3787747621536255, "learning_rate": 8.900000000000001e-05, "loss": 0.7287, "step": 445 }, { "epoch": 0.03462551750094091, "grad_norm": 0.3175957202911377, "learning_rate": 8.92e-05, "loss": 0.7718, "step": 446 }, { "epoch": 0.035378246142265715, "grad_norm": 0.3634609282016754, "learning_rate": 8.94e-05, "loss": 0.6833, "step": 447 }, { "epoch": 0.036130974783590515, "grad_norm": 0.33546242117881775, "learning_rate": 8.960000000000001e-05, "loss": 0.7272, "step": 448 }, { "epoch": 0.036883703424915315, "grad_norm": 0.3532176911830902, "learning_rate": 8.98e-05, "loss": 0.6376, "step": 449 }, { "epoch": 0.03763643206624012, "grad_norm": 0.2973983883857727, "learning_rate": 9e-05, "loss": 0.6092, "step": 450 }, { "epoch": 0.03838916070756492, "grad_norm": 0.3353894352912903, "learning_rate": 9.020000000000001e-05, "loss": 0.6491, "step": 451 }, { "epoch": 0.03914188934888972, "grad_norm": 0.4555714726448059, "learning_rate": 9.04e-05, "loss": 0.6566, "step": 452 }, { "epoch": 0.03989461799021453, "grad_norm": 0.4091961085796356, "learning_rate": 9.06e-05, "loss": 0.7808, "step": 453 }, { "epoch": 0.04064734663153933, "grad_norm": 0.39859169721603394, "learning_rate": 9.080000000000001e-05, "loss": 0.7803, "step": 454 }, { "epoch": 0.041400075272864136, "grad_norm": 0.4155171811580658, "learning_rate": 9.1e-05, "loss": 0.7018, "step": 455 }, { "epoch": 0.042152803914188935, "grad_norm": 0.4263994097709656, "learning_rate": 9.120000000000001e-05, "loss": 0.7167, "step": 456 }, { "epoch": 0.042905532555513735, "grad_norm": 0.36459866166114807, "learning_rate": 9.140000000000001e-05, "loss": 0.6607, "step": 457 }, { "epoch": 0.04365826119683854, "grad_norm": 0.35827505588531494, "learning_rate": 9.16e-05, "loss": 0.6358, "step": 458 }, { "epoch": 0.04441098983816334, "grad_norm": 0.3631746172904968, "learning_rate": 9.180000000000001e-05, "loss": 0.6643, "step": 459 }, { "epoch": 0.04516371847948814, "grad_norm": 0.3243977725505829, "learning_rate": 9.200000000000001e-05, "loss": 0.7498, "step": 460 }, { "epoch": 0.04591644712081295, "grad_norm": 0.2600984275341034, "learning_rate": 9.22e-05, "loss": 0.7408, "step": 461 }, { "epoch": 0.04666917576213775, "grad_norm": 0.28049081563949585, "learning_rate": 9.240000000000001e-05, "loss": 0.6677, "step": 462 }, { "epoch": 0.04742190440346255, "grad_norm": 0.2740708291530609, "learning_rate": 9.260000000000001e-05, "loss": 0.6284, "step": 463 }, { "epoch": 0.048174633044787356, "grad_norm": 0.3000448942184448, "learning_rate": 9.28e-05, "loss": 0.7552, "step": 464 }, { "epoch": 0.048927361686112156, "grad_norm": 0.49258166551589966, "learning_rate": 9.300000000000001e-05, "loss": 0.548, "step": 465 }, { "epoch": 0.049680090327436956, "grad_norm": 0.839919924736023, "learning_rate": 9.320000000000002e-05, "loss": 0.7342, "step": 466 }, { "epoch": 0.05043281896876176, "grad_norm": 0.8089317083358765, "learning_rate": 9.340000000000001e-05, "loss": 0.7566, "step": 467 }, { "epoch": 0.05118554761008656, "grad_norm": 0.3999558091163635, "learning_rate": 9.360000000000001e-05, "loss": 0.7127, "step": 468 }, { "epoch": 0.05193827625141137, "grad_norm": 0.6408129930496216, "learning_rate": 9.38e-05, "loss": 0.6718, "step": 469 }, { "epoch": 0.05269100489273617, "grad_norm": 0.3740585744380951, "learning_rate": 9.4e-05, "loss": 0.7565, "step": 470 }, { "epoch": 0.05344373353406097, "grad_norm": 0.3849365711212158, "learning_rate": 9.42e-05, "loss": 0.571, "step": 471 }, { "epoch": 0.054196462175385776, "grad_norm": 0.3285852372646332, "learning_rate": 9.44e-05, "loss": 0.6837, "step": 472 }, { "epoch": 0.054949190816710576, "grad_norm": 0.32490381598472595, "learning_rate": 9.46e-05, "loss": 0.626, "step": 473 }, { "epoch": 0.055701919458035376, "grad_norm": 0.28056687116622925, "learning_rate": 9.48e-05, "loss": 0.7008, "step": 474 }, { "epoch": 0.05645464809936018, "grad_norm": 0.32396575808525085, "learning_rate": 9.5e-05, "loss": 0.6881, "step": 475 }, { "epoch": 0.05720737674068498, "grad_norm": 0.2605207562446594, "learning_rate": 9.52e-05, "loss": 0.7573, "step": 476 }, { "epoch": 0.05796010538200978, "grad_norm": 0.26924172043800354, "learning_rate": 9.54e-05, "loss": 0.6369, "step": 477 }, { "epoch": 0.05871283402333459, "grad_norm": 0.2759149372577667, "learning_rate": 9.56e-05, "loss": 0.6982, "step": 478 }, { "epoch": 0.05946556266465939, "grad_norm": 0.2598308324813843, "learning_rate": 9.58e-05, "loss": 0.6866, "step": 479 }, { "epoch": 0.06021829130598419, "grad_norm": 0.24803151190280914, "learning_rate": 9.6e-05, "loss": 0.6173, "step": 480 }, { "epoch": 0.060971019947308996, "grad_norm": 0.2768506109714508, "learning_rate": 9.620000000000001e-05, "loss": 0.6964, "step": 481 }, { "epoch": 0.061723748588633796, "grad_norm": 0.24571861326694489, "learning_rate": 9.64e-05, "loss": 0.7285, "step": 482 }, { "epoch": 0.0624764772299586, "grad_norm": 0.3785524070262909, "learning_rate": 9.66e-05, "loss": 0.7546, "step": 483 }, { "epoch": 0.0632292058712834, "grad_norm": 0.46840885281562805, "learning_rate": 9.680000000000001e-05, "loss": 0.6342, "step": 484 }, { "epoch": 0.06398193451260821, "grad_norm": 0.373477965593338, "learning_rate": 9.7e-05, "loss": 0.618, "step": 485 }, { "epoch": 0.064734663153933, "grad_norm": 0.4449579119682312, "learning_rate": 9.72e-05, "loss": 0.777, "step": 486 }, { "epoch": 0.06548739179525781, "grad_norm": 0.4039005637168884, "learning_rate": 9.74e-05, "loss": 0.6443, "step": 487 }, { "epoch": 0.06624012043658262, "grad_norm": 0.35793036222457886, "learning_rate": 9.76e-05, "loss": 0.7677, "step": 488 }, { "epoch": 0.06699284907790741, "grad_norm": 0.488740473985672, "learning_rate": 9.78e-05, "loss": 0.6209, "step": 489 }, { "epoch": 0.06774557771923222, "grad_norm": 0.4478527903556824, "learning_rate": 9.8e-05, "loss": 0.7405, "step": 490 }, { "epoch": 0.06849830636055702, "grad_norm": 0.3799767792224884, "learning_rate": 9.82e-05, "loss": 0.6736, "step": 491 }, { "epoch": 0.06925103500188182, "grad_norm": 0.3216135799884796, "learning_rate": 9.84e-05, "loss": 0.7084, "step": 492 }, { "epoch": 0.07000376364320662, "grad_norm": 0.49093541502952576, "learning_rate": 9.86e-05, "loss": 0.698, "step": 493 }, { "epoch": 0.07075649228453143, "grad_norm": 0.5390486121177673, "learning_rate": 9.88e-05, "loss": 0.7057, "step": 494 }, { "epoch": 0.07150922092585622, "grad_norm": 0.3567315638065338, "learning_rate": 9.900000000000001e-05, "loss": 0.7065, "step": 495 }, { "epoch": 0.07226194956718103, "grad_norm": 0.32907384634017944, "learning_rate": 9.92e-05, "loss": 0.5653, "step": 496 }, { "epoch": 0.07301467820850584, "grad_norm": 0.4070594906806946, "learning_rate": 9.94e-05, "loss": 0.6801, "step": 497 }, { "epoch": 0.07376740684983063, "grad_norm": 0.28839990496635437, "learning_rate": 9.960000000000001e-05, "loss": 0.6177, "step": 498 }, { "epoch": 0.07452013549115544, "grad_norm": 0.3889813721179962, "learning_rate": 9.98e-05, "loss": 0.6628, "step": 499 }, { "epoch": 0.07527286413248024, "grad_norm": 0.29964154958724976, "learning_rate": 0.0001, "loss": 0.581, "step": 500 }, { "epoch": 0.07602559277380504, "grad_norm": 0.3565109968185425, "learning_rate": 9.999999753943383e-05, "loss": 0.6893, "step": 501 }, { "epoch": 0.07677832141512984, "grad_norm": 0.33301809430122375, "learning_rate": 9.999999015773558e-05, "loss": 0.6538, "step": 502 }, { "epoch": 0.07753105005645465, "grad_norm": 0.32864364981651306, "learning_rate": 9.999997785490607e-05, "loss": 0.6081, "step": 503 }, { "epoch": 0.07828377869777944, "grad_norm": 0.39255091547966003, "learning_rate": 9.999996063094663e-05, "loss": 0.6044, "step": 504 }, { "epoch": 0.07903650733910425, "grad_norm": 0.23630954325199127, "learning_rate": 9.999993848585916e-05, "loss": 0.6361, "step": 505 }, { "epoch": 0.07978923598042906, "grad_norm": 0.2720547020435333, "learning_rate": 9.999991141964607e-05, "loss": 0.6997, "step": 506 }, { "epoch": 0.08054196462175386, "grad_norm": 0.24043364822864532, "learning_rate": 9.999987943231033e-05, "loss": 0.6939, "step": 507 }, { "epoch": 0.08129469326307866, "grad_norm": 0.37353041768074036, "learning_rate": 9.999984252385543e-05, "loss": 0.7353, "step": 508 }, { "epoch": 0.08204742190440346, "grad_norm": 0.26781246066093445, "learning_rate": 9.99998006942854e-05, "loss": 0.6458, "step": 509 }, { "epoch": 0.08280015054572827, "grad_norm": 0.26093432307243347, "learning_rate": 9.999975394360483e-05, "loss": 0.6701, "step": 510 }, { "epoch": 0.08355287918705306, "grad_norm": 0.2784143388271332, "learning_rate": 9.999970227181881e-05, "loss": 0.6118, "step": 511 }, { "epoch": 0.08430560782837787, "grad_norm": 0.31865090131759644, "learning_rate": 9.999964567893302e-05, "loss": 0.6139, "step": 512 }, { "epoch": 0.08505833646970268, "grad_norm": 0.31435245275497437, "learning_rate": 9.999958416495364e-05, "loss": 0.6048, "step": 513 }, { "epoch": 0.08581106511102747, "grad_norm": 0.3912545144557953, "learning_rate": 9.999951772988738e-05, "loss": 0.6238, "step": 514 }, { "epoch": 0.08656379375235228, "grad_norm": 0.4716023802757263, "learning_rate": 9.99994463737415e-05, "loss": 0.6855, "step": 515 }, { "epoch": 0.08731652239367708, "grad_norm": 0.4356096386909485, "learning_rate": 9.999937009652385e-05, "loss": 0.6447, "step": 516 }, { "epoch": 0.08806925103500188, "grad_norm": 0.4474468231201172, "learning_rate": 9.999928889824273e-05, "loss": 0.659, "step": 517 }, { "epoch": 0.08882197967632668, "grad_norm": 0.4059533476829529, "learning_rate": 9.999920277890703e-05, "loss": 0.719, "step": 518 }, { "epoch": 0.08957470831765149, "grad_norm": 0.2404022216796875, "learning_rate": 9.999911173852618e-05, "loss": 0.7516, "step": 519 }, { "epoch": 0.09032743695897628, "grad_norm": 0.4702737331390381, "learning_rate": 9.999901577711012e-05, "loss": 0.641, "step": 520 }, { "epoch": 0.09108016560030109, "grad_norm": 0.39990749955177307, "learning_rate": 9.999891489466934e-05, "loss": 0.5705, "step": 521 }, { "epoch": 0.0918328942416259, "grad_norm": 0.30537956953048706, "learning_rate": 9.999880909121488e-05, "loss": 0.6762, "step": 522 }, { "epoch": 0.09258562288295069, "grad_norm": 0.3641497790813446, "learning_rate": 9.999869836675833e-05, "loss": 0.5566, "step": 523 }, { "epoch": 0.0933383515242755, "grad_norm": 0.2945883274078369, "learning_rate": 9.999858272131177e-05, "loss": 0.5711, "step": 524 }, { "epoch": 0.0940910801656003, "grad_norm": 0.3143121302127838, "learning_rate": 9.999846215488786e-05, "loss": 0.5933, "step": 525 }, { "epoch": 0.0948438088069251, "grad_norm": 0.3275812268257141, "learning_rate": 9.999833666749979e-05, "loss": 0.6875, "step": 526 }, { "epoch": 0.0955965374482499, "grad_norm": 0.8408477902412415, "learning_rate": 9.999820625916127e-05, "loss": 0.7857, "step": 527 }, { "epoch": 0.09634926608957471, "grad_norm": 0.3052905797958374, "learning_rate": 9.999807092988656e-05, "loss": 0.5802, "step": 528 }, { "epoch": 0.0971019947308995, "grad_norm": 0.48404645919799805, "learning_rate": 9.999793067969047e-05, "loss": 0.6512, "step": 529 }, { "epoch": 0.09785472337222431, "grad_norm": 1.2233775854110718, "learning_rate": 9.999778550858834e-05, "loss": 0.6091, "step": 530 }, { "epoch": 0.09860745201354912, "grad_norm": 5.129741191864014, "learning_rate": 9.999763541659605e-05, "loss": 0.7719, "step": 531 }, { "epoch": 0.09936018065487391, "grad_norm": 0.8490756750106812, "learning_rate": 9.999748040372998e-05, "loss": 0.6756, "step": 532 }, { "epoch": 0.10011290929619872, "grad_norm": 3.317746877670288, "learning_rate": 9.999732047000711e-05, "loss": 0.806, "step": 533 }, { "epoch": 0.10086563793752353, "grad_norm": 13.432624816894531, "learning_rate": 9.999715561544494e-05, "loss": 0.8629, "step": 534 }, { "epoch": 0.10161836657884833, "grad_norm": 2.5957140922546387, "learning_rate": 9.999698584006149e-05, "loss": 0.786, "step": 535 }, { "epoch": 0.10237109522017313, "grad_norm": 3.6566264629364014, "learning_rate": 9.999681114387529e-05, "loss": 0.8087, "step": 536 }, { "epoch": 0.10312382386149793, "grad_norm": 1.581263542175293, "learning_rate": 9.999663152690549e-05, "loss": 0.7586, "step": 537 }, { "epoch": 0.10387655250282274, "grad_norm": 0.8524181842803955, "learning_rate": 9.999644698917173e-05, "loss": 0.7889, "step": 538 }, { "epoch": 0.10462928114414753, "grad_norm": 6.440183162689209, "learning_rate": 9.999625753069417e-05, "loss": 0.8253, "step": 539 }, { "epoch": 0.10538200978547234, "grad_norm": 1.6968436241149902, "learning_rate": 9.999606315149354e-05, "loss": 0.8095, "step": 540 }, { "epoch": 0.10613473842679715, "grad_norm": 0.6685400605201721, "learning_rate": 9.999586385159108e-05, "loss": 0.7407, "step": 541 }, { "epoch": 0.10688746706812194, "grad_norm": 1.1900933980941772, "learning_rate": 9.999565963100862e-05, "loss": 0.682, "step": 542 }, { "epoch": 0.10764019570944675, "grad_norm": 0.7587040066719055, "learning_rate": 9.999545048976846e-05, "loss": 0.753, "step": 543 }, { "epoch": 0.10839292435077155, "grad_norm": 0.6514841914176941, "learning_rate": 9.999523642789348e-05, "loss": 0.6441, "step": 544 }, { "epoch": 0.10914565299209635, "grad_norm": 0.48340731859207153, "learning_rate": 9.999501744540712e-05, "loss": 0.7737, "step": 545 }, { "epoch": 0.10989838163342115, "grad_norm": 0.4170205295085907, "learning_rate": 9.999479354233326e-05, "loss": 0.7658, "step": 546 }, { "epoch": 0.11065111027474596, "grad_norm": 0.3457631766796112, "learning_rate": 9.999456471869645e-05, "loss": 0.7625, "step": 547 }, { "epoch": 0.11140383891607075, "grad_norm": 0.3680456876754761, "learning_rate": 9.99943309745217e-05, "loss": 0.7217, "step": 548 }, { "epoch": 0.11215656755739556, "grad_norm": 0.5304470658302307, "learning_rate": 9.999409230983455e-05, "loss": 0.6764, "step": 549 }, { "epoch": 0.11290929619872037, "grad_norm": 0.30280202627182007, "learning_rate": 9.999384872466111e-05, "loss": 0.818, "step": 550 }, { "epoch": 0.11366202484004516, "grad_norm": 0.35331472754478455, "learning_rate": 9.999360021902802e-05, "loss": 0.7142, "step": 551 }, { "epoch": 0.11441475348136997, "grad_norm": 0.32121893763542175, "learning_rate": 9.999334679296246e-05, "loss": 0.5375, "step": 552 }, { "epoch": 0.11516748212269477, "grad_norm": 0.5598880648612976, "learning_rate": 9.999308844649214e-05, "loss": 0.7226, "step": 553 }, { "epoch": 0.11592021076401957, "grad_norm": 0.31906604766845703, "learning_rate": 9.999282517964532e-05, "loss": 0.6594, "step": 554 }, { "epoch": 0.11667293940534437, "grad_norm": 0.3266849219799042, "learning_rate": 9.999255699245078e-05, "loss": 0.6, "step": 555 }, { "epoch": 0.11742566804666918, "grad_norm": 0.39915433526039124, "learning_rate": 9.999228388493786e-05, "loss": 0.7649, "step": 556 }, { "epoch": 0.11817839668799397, "grad_norm": 0.268025279045105, "learning_rate": 9.99920058571364e-05, "loss": 0.6578, "step": 557 }, { "epoch": 0.11893112532931878, "grad_norm": 0.2988092005252838, "learning_rate": 9.999172290907685e-05, "loss": 0.5753, "step": 558 }, { "epoch": 0.11968385397064359, "grad_norm": 0.4389660656452179, "learning_rate": 9.999143504079011e-05, "loss": 0.5709, "step": 559 }, { "epoch": 0.12043658261196838, "grad_norm": 0.38564491271972656, "learning_rate": 9.999114225230768e-05, "loss": 0.7261, "step": 560 }, { "epoch": 0.12118931125329319, "grad_norm": 0.31115400791168213, "learning_rate": 9.999084454366159e-05, "loss": 0.5798, "step": 561 }, { "epoch": 0.12194203989461799, "grad_norm": 0.4043784439563751, "learning_rate": 9.999054191488436e-05, "loss": 0.5615, "step": 562 }, { "epoch": 0.12269476853594279, "grad_norm": 0.48461130261421204, "learning_rate": 9.999023436600911e-05, "loss": 0.6249, "step": 563 }, { "epoch": 0.12344749717726759, "grad_norm": 0.4146468937397003, "learning_rate": 9.998992189706949e-05, "loss": 0.7411, "step": 564 }, { "epoch": 0.1242002258185924, "grad_norm": 0.2936534285545349, "learning_rate": 9.998960450809965e-05, "loss": 0.5998, "step": 565 }, { "epoch": 0.1249529544599172, "grad_norm": 0.32815974950790405, "learning_rate": 9.998928219913428e-05, "loss": 0.6371, "step": 566 }, { "epoch": 0.125705683101242, "grad_norm": 0.34281450510025024, "learning_rate": 9.998895497020868e-05, "loss": 0.6663, "step": 567 }, { "epoch": 0.1264584117425668, "grad_norm": 0.24346116185188293, "learning_rate": 9.998862282135857e-05, "loss": 0.6469, "step": 568 }, { "epoch": 0.1272111403838916, "grad_norm": 0.40003523230552673, "learning_rate": 9.998828575262034e-05, "loss": 0.5895, "step": 569 }, { "epoch": 0.12796386902521642, "grad_norm": 0.37661853432655334, "learning_rate": 9.99879437640308e-05, "loss": 0.5639, "step": 570 }, { "epoch": 0.1287165976665412, "grad_norm": 0.24309441447257996, "learning_rate": 9.998759685562737e-05, "loss": 0.6024, "step": 571 }, { "epoch": 0.129469326307866, "grad_norm": 0.30160146951675415, "learning_rate": 9.9987245027448e-05, "loss": 0.6699, "step": 572 }, { "epoch": 0.1302220549491908, "grad_norm": 0.31320688128471375, "learning_rate": 9.998688827953114e-05, "loss": 0.6671, "step": 573 }, { "epoch": 0.13097478359051562, "grad_norm": 0.2995641529560089, "learning_rate": 9.99865266119158e-05, "loss": 0.7042, "step": 574 }, { "epoch": 0.13172751223184043, "grad_norm": 0.2141716033220291, "learning_rate": 9.998616002464157e-05, "loss": 0.5412, "step": 575 }, { "epoch": 0.13248024087316523, "grad_norm": 0.23864759504795074, "learning_rate": 9.99857885177485e-05, "loss": 0.6414, "step": 576 }, { "epoch": 0.13323296951449, "grad_norm": 0.2445298284292221, "learning_rate": 9.998541209127725e-05, "loss": 0.6635, "step": 577 }, { "epoch": 0.13398569815581482, "grad_norm": 0.24543794989585876, "learning_rate": 9.998503074526896e-05, "loss": 0.6843, "step": 578 }, { "epoch": 0.13473842679713963, "grad_norm": 0.20639851689338684, "learning_rate": 9.998464447976533e-05, "loss": 0.6226, "step": 579 }, { "epoch": 0.13549115543846443, "grad_norm": 0.2053362876176834, "learning_rate": 9.998425329480863e-05, "loss": 0.574, "step": 580 }, { "epoch": 0.13624388407978924, "grad_norm": 0.34019050002098083, "learning_rate": 9.99838571904416e-05, "loss": 0.6711, "step": 581 }, { "epoch": 0.13699661272111405, "grad_norm": 0.4939693510532379, "learning_rate": 9.99834561667076e-05, "loss": 0.6209, "step": 582 }, { "epoch": 0.13774934136243885, "grad_norm": 0.5609293580055237, "learning_rate": 9.998305022365047e-05, "loss": 0.6806, "step": 583 }, { "epoch": 0.13850207000376363, "grad_norm": 0.32022592425346375, "learning_rate": 9.998263936131458e-05, "loss": 0.6789, "step": 584 }, { "epoch": 0.13925479864508844, "grad_norm": 0.33377978205680847, "learning_rate": 9.998222357974488e-05, "loss": 0.5867, "step": 585 }, { "epoch": 0.14000752728641325, "grad_norm": 0.2950896918773651, "learning_rate": 9.998180287898685e-05, "loss": 0.7101, "step": 586 }, { "epoch": 0.14076025592773805, "grad_norm": 0.2724960744380951, "learning_rate": 9.99813772590865e-05, "loss": 0.6984, "step": 587 }, { "epoch": 0.14151298456906286, "grad_norm": 0.3258936405181885, "learning_rate": 9.998094672009034e-05, "loss": 0.6373, "step": 588 }, { "epoch": 0.14226571321038767, "grad_norm": 0.2463599443435669, "learning_rate": 9.998051126204548e-05, "loss": 0.6706, "step": 589 }, { "epoch": 0.14301844185171245, "grad_norm": 0.24674978852272034, "learning_rate": 9.998007088499952e-05, "loss": 0.5698, "step": 590 }, { "epoch": 0.14377117049303725, "grad_norm": 0.38178208470344543, "learning_rate": 9.997962558900065e-05, "loss": 0.672, "step": 591 }, { "epoch": 0.14452389913436206, "grad_norm": 0.24499088525772095, "learning_rate": 9.997917537409755e-05, "loss": 0.7107, "step": 592 }, { "epoch": 0.14527662777568687, "grad_norm": 0.2555769681930542, "learning_rate": 9.997872024033948e-05, "loss": 0.5004, "step": 593 }, { "epoch": 0.14602935641701167, "grad_norm": 0.24335944652557373, "learning_rate": 9.997826018777613e-05, "loss": 0.6404, "step": 594 }, { "epoch": 0.14678208505833648, "grad_norm": 0.26514455676078796, "learning_rate": 9.997779521645793e-05, "loss": 0.6261, "step": 595 }, { "epoch": 0.14753481369966126, "grad_norm": 0.23971344530582428, "learning_rate": 9.997732532643564e-05, "loss": 0.5743, "step": 596 }, { "epoch": 0.14828754234098607, "grad_norm": 0.27140405774116516, "learning_rate": 9.997685051776068e-05, "loss": 0.6657, "step": 597 }, { "epoch": 0.14904027098231087, "grad_norm": 0.24869805574417114, "learning_rate": 9.997637079048497e-05, "loss": 0.6144, "step": 598 }, { "epoch": 0.14979299962363568, "grad_norm": 0.29865697026252747, "learning_rate": 9.997588614466096e-05, "loss": 0.5892, "step": 599 }, { "epoch": 0.1505457282649605, "grad_norm": 0.307038277387619, "learning_rate": 9.997539658034168e-05, "loss": 0.6164, "step": 600 }, { "epoch": 0.1505457282649605, "eval_loss": 0.6129584908485413, "eval_runtime": 457.7884, "eval_samples_per_second": 21.029, "eval_steps_per_second": 0.658, "step": 600 }, { "epoch": 0.1512984569062853, "grad_norm": 0.34421753883361816, "learning_rate": 9.997490209758062e-05, "loss": 0.7224, "step": 601 }, { "epoch": 0.15205118554761007, "grad_norm": 0.3522794842720032, "learning_rate": 9.997440269643191e-05, "loss": 0.6657, "step": 602 }, { "epoch": 0.15280391418893488, "grad_norm": 0.30896666646003723, "learning_rate": 9.997389837695014e-05, "loss": 0.6613, "step": 603 }, { "epoch": 0.1535566428302597, "grad_norm": 0.2621694803237915, "learning_rate": 9.997338913919045e-05, "loss": 0.5983, "step": 604 }, { "epoch": 0.1543093714715845, "grad_norm": 0.2988641858100891, "learning_rate": 9.997287498320855e-05, "loss": 0.5947, "step": 605 }, { "epoch": 0.1550621001129093, "grad_norm": 0.29390087723731995, "learning_rate": 9.997235590906067e-05, "loss": 0.7023, "step": 606 }, { "epoch": 0.1558148287542341, "grad_norm": 0.26654452085494995, "learning_rate": 9.997183191680353e-05, "loss": 0.6975, "step": 607 }, { "epoch": 0.1565675573955589, "grad_norm": 0.2880792021751404, "learning_rate": 9.997130300649448e-05, "loss": 0.6632, "step": 608 }, { "epoch": 0.1573202860368837, "grad_norm": 0.262881875038147, "learning_rate": 9.997076917819135e-05, "loss": 0.6625, "step": 609 }, { "epoch": 0.1580730146782085, "grad_norm": 0.28539979457855225, "learning_rate": 9.997023043195251e-05, "loss": 0.5632, "step": 610 }, { "epoch": 0.1588257433195333, "grad_norm": 0.572927713394165, "learning_rate": 9.99696867678369e-05, "loss": 0.7381, "step": 611 }, { "epoch": 0.15957847196085811, "grad_norm": 0.532811164855957, "learning_rate": 9.996913818590394e-05, "loss": 0.6842, "step": 612 }, { "epoch": 0.16033120060218292, "grad_norm": 0.35085156559944153, "learning_rate": 9.996858468621365e-05, "loss": 0.6909, "step": 613 }, { "epoch": 0.16108392924350773, "grad_norm": 0.4276523292064667, "learning_rate": 9.996802626882653e-05, "loss": 0.5833, "step": 614 }, { "epoch": 0.1618366578848325, "grad_norm": 0.320953369140625, "learning_rate": 9.996746293380366e-05, "loss": 0.5848, "step": 615 }, { "epoch": 0.16258938652615731, "grad_norm": 0.4149932265281677, "learning_rate": 9.996689468120665e-05, "loss": 0.6007, "step": 616 }, { "epoch": 0.16334211516748212, "grad_norm": 0.30133864283561707, "learning_rate": 9.996632151109768e-05, "loss": 0.5594, "step": 617 }, { "epoch": 0.16409484380880693, "grad_norm": 0.2687797546386719, "learning_rate": 9.996574342353936e-05, "loss": 0.5611, "step": 618 }, { "epoch": 0.16484757245013174, "grad_norm": 0.32121169567108154, "learning_rate": 9.996516041859496e-05, "loss": 0.6129, "step": 619 }, { "epoch": 0.16560030109145654, "grad_norm": 0.2793751358985901, "learning_rate": 9.996457249632824e-05, "loss": 0.6824, "step": 620 }, { "epoch": 0.16635302973278132, "grad_norm": 0.3038632571697235, "learning_rate": 9.996397965680344e-05, "loss": 0.5632, "step": 621 }, { "epoch": 0.16710575837410613, "grad_norm": 0.32193195819854736, "learning_rate": 9.996338190008544e-05, "loss": 0.5916, "step": 622 }, { "epoch": 0.16785848701543093, "grad_norm": 0.4189232885837555, "learning_rate": 9.99627792262396e-05, "loss": 0.6219, "step": 623 }, { "epoch": 0.16861121565675574, "grad_norm": 0.48833906650543213, "learning_rate": 9.996217163533183e-05, "loss": 0.6408, "step": 624 }, { "epoch": 0.16936394429808055, "grad_norm": 0.39602169394493103, "learning_rate": 9.996155912742855e-05, "loss": 0.6251, "step": 625 }, { "epoch": 0.17011667293940536, "grad_norm": 0.381504088640213, "learning_rate": 9.996094170259677e-05, "loss": 0.6235, "step": 626 }, { "epoch": 0.17086940158073013, "grad_norm": 0.3411262333393097, "learning_rate": 9.996031936090401e-05, "loss": 0.7174, "step": 627 }, { "epoch": 0.17162213022205494, "grad_norm": 0.3047899007797241, "learning_rate": 9.995969210241833e-05, "loss": 0.5852, "step": 628 }, { "epoch": 0.17237485886337975, "grad_norm": 0.39231282472610474, "learning_rate": 9.995905992720829e-05, "loss": 0.5965, "step": 629 }, { "epoch": 0.17312758750470456, "grad_norm": 0.27344581484794617, "learning_rate": 9.995842283534307e-05, "loss": 0.5562, "step": 630 }, { "epoch": 0.17388031614602936, "grad_norm": 0.29848432540893555, "learning_rate": 9.995778082689233e-05, "loss": 0.5757, "step": 631 }, { "epoch": 0.17463304478735417, "grad_norm": 0.2590131461620331, "learning_rate": 9.995713390192624e-05, "loss": 0.6532, "step": 632 }, { "epoch": 0.17538577342867895, "grad_norm": 0.25024160742759705, "learning_rate": 9.995648206051563e-05, "loss": 0.5766, "step": 633 }, { "epoch": 0.17613850207000376, "grad_norm": 0.25702694058418274, "learning_rate": 9.995582530273169e-05, "loss": 0.6052, "step": 634 }, { "epoch": 0.17689123071132856, "grad_norm": 0.22395361959934235, "learning_rate": 9.995516362864629e-05, "loss": 0.5987, "step": 635 }, { "epoch": 0.17764395935265337, "grad_norm": 0.25346866250038147, "learning_rate": 9.99544970383318e-05, "loss": 0.5888, "step": 636 }, { "epoch": 0.17839668799397818, "grad_norm": 0.31770822405815125, "learning_rate": 9.99538255318611e-05, "loss": 0.5183, "step": 637 }, { "epoch": 0.17914941663530298, "grad_norm": 0.27307644486427307, "learning_rate": 9.995314910930762e-05, "loss": 0.6079, "step": 638 }, { "epoch": 0.1799021452766278, "grad_norm": 0.2620663046836853, "learning_rate": 9.995246777074535e-05, "loss": 0.6287, "step": 639 }, { "epoch": 0.18065487391795257, "grad_norm": 0.317501962184906, "learning_rate": 9.995178151624878e-05, "loss": 0.5347, "step": 640 }, { "epoch": 0.18140760255927738, "grad_norm": 0.3004167377948761, "learning_rate": 9.995109034589296e-05, "loss": 0.6466, "step": 641 }, { "epoch": 0.18216033120060218, "grad_norm": 0.39795443415641785, "learning_rate": 9.995039425975348e-05, "loss": 0.6346, "step": 642 }, { "epoch": 0.182913059841927, "grad_norm": 0.38371217250823975, "learning_rate": 9.99496932579065e-05, "loss": 0.5941, "step": 643 }, { "epoch": 0.1836657884832518, "grad_norm": 0.28919339179992676, "learning_rate": 9.994898734042863e-05, "loss": 0.5853, "step": 644 }, { "epoch": 0.1844185171245766, "grad_norm": 0.2833126187324524, "learning_rate": 9.994827650739707e-05, "loss": 0.5791, "step": 645 }, { "epoch": 0.18517124576590138, "grad_norm": 0.34486374258995056, "learning_rate": 9.994756075888956e-05, "loss": 0.6428, "step": 646 }, { "epoch": 0.1859239744072262, "grad_norm": 0.2901858389377594, "learning_rate": 9.99468400949844e-05, "loss": 0.5429, "step": 647 }, { "epoch": 0.186676703048551, "grad_norm": 0.31909388303756714, "learning_rate": 9.994611451576038e-05, "loss": 0.6093, "step": 648 }, { "epoch": 0.1874294316898758, "grad_norm": 0.3408518433570862, "learning_rate": 9.994538402129686e-05, "loss": 0.6155, "step": 649 }, { "epoch": 0.1881821603312006, "grad_norm": 0.30509328842163086, "learning_rate": 9.994464861167372e-05, "loss": 0.6039, "step": 650 }, { "epoch": 0.18893488897252542, "grad_norm": 0.2713315784931183, "learning_rate": 9.994390828697138e-05, "loss": 0.6586, "step": 651 }, { "epoch": 0.1896876176138502, "grad_norm": 0.30613139271736145, "learning_rate": 9.99431630472708e-05, "loss": 0.5983, "step": 652 }, { "epoch": 0.190440346255175, "grad_norm": 0.4343397915363312, "learning_rate": 9.994241289265347e-05, "loss": 0.5778, "step": 653 }, { "epoch": 0.1911930748964998, "grad_norm": 0.6812400221824646, "learning_rate": 9.994165782320145e-05, "loss": 0.5581, "step": 654 }, { "epoch": 0.19194580353782462, "grad_norm": 0.5225916504859924, "learning_rate": 9.994089783899728e-05, "loss": 0.5954, "step": 655 }, { "epoch": 0.19269853217914942, "grad_norm": 0.31040045619010925, "learning_rate": 9.99401329401241e-05, "loss": 0.6207, "step": 656 }, { "epoch": 0.19345126082047423, "grad_norm": 0.43051478266716003, "learning_rate": 9.993936312666557e-05, "loss": 0.6236, "step": 657 }, { "epoch": 0.194203989461799, "grad_norm": 0.27246758341789246, "learning_rate": 9.993858839870581e-05, "loss": 0.6078, "step": 658 }, { "epoch": 0.19495671810312382, "grad_norm": 0.35744139552116394, "learning_rate": 9.993780875632962e-05, "loss": 0.6045, "step": 659 }, { "epoch": 0.19570944674444862, "grad_norm": 0.32381218671798706, "learning_rate": 9.993702419962222e-05, "loss": 0.6065, "step": 660 }, { "epoch": 0.19646217538577343, "grad_norm": 0.28507664799690247, "learning_rate": 9.993623472866942e-05, "loss": 0.5684, "step": 661 }, { "epoch": 0.19721490402709824, "grad_norm": 0.30598777532577515, "learning_rate": 9.993544034355754e-05, "loss": 0.6101, "step": 662 }, { "epoch": 0.19796763266842304, "grad_norm": 0.27871644496917725, "learning_rate": 9.993464104437346e-05, "loss": 0.6234, "step": 663 }, { "epoch": 0.19872036130974782, "grad_norm": 0.2975785434246063, "learning_rate": 9.993383683120461e-05, "loss": 0.6023, "step": 664 }, { "epoch": 0.19947308995107263, "grad_norm": 0.2518141269683838, "learning_rate": 9.99330277041389e-05, "loss": 0.6124, "step": 665 }, { "epoch": 0.20022581859239744, "grad_norm": 0.3555866777896881, "learning_rate": 9.993221366326486e-05, "loss": 0.6132, "step": 666 }, { "epoch": 0.20097854723372224, "grad_norm": 0.3711056411266327, "learning_rate": 9.993139470867147e-05, "loss": 0.7169, "step": 667 }, { "epoch": 0.20173127587504705, "grad_norm": 0.4588405191898346, "learning_rate": 9.993057084044832e-05, "loss": 0.5841, "step": 668 }, { "epoch": 0.20248400451637186, "grad_norm": 0.29253700375556946, "learning_rate": 9.992974205868549e-05, "loss": 0.6548, "step": 669 }, { "epoch": 0.20323673315769666, "grad_norm": 0.3383312523365021, "learning_rate": 9.992890836347361e-05, "loss": 0.615, "step": 670 }, { "epoch": 0.20398946179902144, "grad_norm": 0.3463241159915924, "learning_rate": 9.992806975490389e-05, "loss": 0.727, "step": 671 }, { "epoch": 0.20474219044034625, "grad_norm": 0.26803091168403625, "learning_rate": 9.992722623306799e-05, "loss": 0.5667, "step": 672 }, { "epoch": 0.20549491908167106, "grad_norm": 0.3190109133720398, "learning_rate": 9.992637779805817e-05, "loss": 0.5516, "step": 673 }, { "epoch": 0.20624764772299586, "grad_norm": 0.29449352622032166, "learning_rate": 9.992552444996722e-05, "loss": 0.618, "step": 674 }, { "epoch": 0.20700037636432067, "grad_norm": 0.2536184787750244, "learning_rate": 9.992466618888847e-05, "loss": 0.6267, "step": 675 }, { "epoch": 0.20775310500564548, "grad_norm": 0.22708696126937866, "learning_rate": 9.992380301491576e-05, "loss": 0.5771, "step": 676 }, { "epoch": 0.20850583364697026, "grad_norm": 0.28020307421684265, "learning_rate": 9.992293492814351e-05, "loss": 0.6896, "step": 677 }, { "epoch": 0.20925856228829506, "grad_norm": 0.26897549629211426, "learning_rate": 9.992206192866663e-05, "loss": 0.7278, "step": 678 }, { "epoch": 0.21001129092961987, "grad_norm": 0.23852774500846863, "learning_rate": 9.99211840165806e-05, "loss": 0.5733, "step": 679 }, { "epoch": 0.21076401957094468, "grad_norm": 0.2765878438949585, "learning_rate": 9.992030119198141e-05, "loss": 0.5916, "step": 680 }, { "epoch": 0.21151674821226948, "grad_norm": 0.32082998752593994, "learning_rate": 9.991941345496562e-05, "loss": 0.6316, "step": 681 }, { "epoch": 0.2122694768535943, "grad_norm": 0.28953656554222107, "learning_rate": 9.991852080563033e-05, "loss": 0.6932, "step": 682 }, { "epoch": 0.21302220549491907, "grad_norm": 0.26258713006973267, "learning_rate": 9.991762324407312e-05, "loss": 0.6157, "step": 683 }, { "epoch": 0.21377493413624388, "grad_norm": 0.3828776478767395, "learning_rate": 9.991672077039217e-05, "loss": 0.4943, "step": 684 }, { "epoch": 0.21452766277756868, "grad_norm": 0.344385027885437, "learning_rate": 9.991581338468616e-05, "loss": 0.6088, "step": 685 }, { "epoch": 0.2152803914188935, "grad_norm": 0.2473217248916626, "learning_rate": 9.991490108705433e-05, "loss": 0.6178, "step": 686 }, { "epoch": 0.2160331200602183, "grad_norm": 0.2766319811344147, "learning_rate": 9.991398387759645e-05, "loss": 0.5639, "step": 687 }, { "epoch": 0.2167858487015431, "grad_norm": 0.2681021988391876, "learning_rate": 9.991306175641283e-05, "loss": 0.6712, "step": 688 }, { "epoch": 0.21753857734286788, "grad_norm": 0.2849178612232208, "learning_rate": 9.991213472360429e-05, "loss": 0.5776, "step": 689 }, { "epoch": 0.2182913059841927, "grad_norm": 0.3052300214767456, "learning_rate": 9.991120277927223e-05, "loss": 0.5508, "step": 690 }, { "epoch": 0.2190440346255175, "grad_norm": 0.4080527126789093, "learning_rate": 9.991026592351854e-05, "loss": 0.5798, "step": 691 }, { "epoch": 0.2197967632668423, "grad_norm": 0.3711971044540405, "learning_rate": 9.990932415644571e-05, "loss": 0.5528, "step": 692 }, { "epoch": 0.2205494919081671, "grad_norm": 0.27907392382621765, "learning_rate": 9.990837747815669e-05, "loss": 0.6219, "step": 693 }, { "epoch": 0.22130222054949192, "grad_norm": 0.2645324170589447, "learning_rate": 9.990742588875505e-05, "loss": 0.6183, "step": 694 }, { "epoch": 0.2220549491908167, "grad_norm": 0.24418014287948608, "learning_rate": 9.990646938834483e-05, "loss": 0.5747, "step": 695 }, { "epoch": 0.2228076778321415, "grad_norm": 0.30321255326271057, "learning_rate": 9.990550797703062e-05, "loss": 0.6455, "step": 696 }, { "epoch": 0.2235604064734663, "grad_norm": 0.3457355499267578, "learning_rate": 9.990454165491757e-05, "loss": 0.6087, "step": 697 }, { "epoch": 0.22431313511479112, "grad_norm": 0.29704856872558594, "learning_rate": 9.990357042211137e-05, "loss": 0.5856, "step": 698 }, { "epoch": 0.22506586375611592, "grad_norm": 0.32709524035453796, "learning_rate": 9.990259427871822e-05, "loss": 0.557, "step": 699 }, { "epoch": 0.22581859239744073, "grad_norm": 0.3203471302986145, "learning_rate": 9.990161322484486e-05, "loss": 0.5804, "step": 700 }, { "epoch": 0.22657132103876554, "grad_norm": 0.27245697379112244, "learning_rate": 9.99006272605986e-05, "loss": 0.6265, "step": 701 }, { "epoch": 0.22732404968009032, "grad_norm": 0.2761586904525757, "learning_rate": 9.989963638608722e-05, "loss": 0.7159, "step": 702 }, { "epoch": 0.22807677832141512, "grad_norm": 0.2667573094367981, "learning_rate": 9.989864060141914e-05, "loss": 0.5816, "step": 703 }, { "epoch": 0.22882950696273993, "grad_norm": 0.23678889870643616, "learning_rate": 9.989763990670322e-05, "loss": 0.595, "step": 704 }, { "epoch": 0.22958223560406474, "grad_norm": 0.2505415081977844, "learning_rate": 9.989663430204891e-05, "loss": 0.5412, "step": 705 }, { "epoch": 0.23033496424538955, "grad_norm": 0.2752394676208496, "learning_rate": 9.989562378756616e-05, "loss": 0.6122, "step": 706 }, { "epoch": 0.23108769288671435, "grad_norm": 0.24968411028385162, "learning_rate": 9.989460836336549e-05, "loss": 0.5294, "step": 707 }, { "epoch": 0.23184042152803913, "grad_norm": 0.24645258486270905, "learning_rate": 9.989358802955798e-05, "loss": 0.5783, "step": 708 }, { "epoch": 0.23259315016936394, "grad_norm": 0.25881507992744446, "learning_rate": 9.989256278625514e-05, "loss": 0.5476, "step": 709 }, { "epoch": 0.23334587881068874, "grad_norm": 0.2684710621833801, "learning_rate": 9.989153263356914e-05, "loss": 0.6917, "step": 710 }, { "epoch": 0.23409860745201355, "grad_norm": 0.20249232649803162, "learning_rate": 9.989049757161264e-05, "loss": 0.5949, "step": 711 }, { "epoch": 0.23485133609333836, "grad_norm": 0.2651006281375885, "learning_rate": 9.98894576004988e-05, "loss": 0.5239, "step": 712 }, { "epoch": 0.23560406473466317, "grad_norm": 0.37177035212516785, "learning_rate": 9.988841272034137e-05, "loss": 0.6241, "step": 713 }, { "epoch": 0.23635679337598794, "grad_norm": 0.4156702756881714, "learning_rate": 9.988736293125462e-05, "loss": 0.5747, "step": 714 }, { "epoch": 0.23710952201731275, "grad_norm": 0.29343706369400024, "learning_rate": 9.988630823335334e-05, "loss": 0.6225, "step": 715 }, { "epoch": 0.23786225065863756, "grad_norm": 0.40985676646232605, "learning_rate": 9.988524862675288e-05, "loss": 0.527, "step": 716 }, { "epoch": 0.23861497929996237, "grad_norm": 0.340462327003479, "learning_rate": 9.988418411156911e-05, "loss": 0.6442, "step": 717 }, { "epoch": 0.23936770794128717, "grad_norm": 0.2732607424259186, "learning_rate": 9.988311468791846e-05, "loss": 0.5954, "step": 718 }, { "epoch": 0.24012043658261198, "grad_norm": 0.3310803472995758, "learning_rate": 9.988204035591786e-05, "loss": 0.6049, "step": 719 }, { "epoch": 0.24087316522393676, "grad_norm": 0.2783429026603699, "learning_rate": 9.98809611156848e-05, "loss": 0.5945, "step": 720 }, { "epoch": 0.24162589386526157, "grad_norm": 0.23636628687381744, "learning_rate": 9.987987696733731e-05, "loss": 0.6037, "step": 721 }, { "epoch": 0.24237862250658637, "grad_norm": 0.2338215708732605, "learning_rate": 9.987878791099397e-05, "loss": 0.6541, "step": 722 }, { "epoch": 0.24313135114791118, "grad_norm": 0.2216925323009491, "learning_rate": 9.987769394677384e-05, "loss": 0.61, "step": 723 }, { "epoch": 0.24388407978923599, "grad_norm": 0.23414190113544464, "learning_rate": 9.987659507479657e-05, "loss": 0.6073, "step": 724 }, { "epoch": 0.2446368084305608, "grad_norm": 0.2394588142633438, "learning_rate": 9.987549129518235e-05, "loss": 0.6322, "step": 725 }, { "epoch": 0.24538953707188557, "grad_norm": 0.23294804990291595, "learning_rate": 9.987438260805186e-05, "loss": 0.5995, "step": 726 }, { "epoch": 0.24614226571321038, "grad_norm": 0.23586899042129517, "learning_rate": 9.987326901352638e-05, "loss": 0.538, "step": 727 }, { "epoch": 0.24689499435453519, "grad_norm": 0.2844596803188324, "learning_rate": 9.987215051172763e-05, "loss": 0.7016, "step": 728 }, { "epoch": 0.24764772299586, "grad_norm": 0.28851956129074097, "learning_rate": 9.987102710277798e-05, "loss": 0.6133, "step": 729 }, { "epoch": 0.2484004516371848, "grad_norm": 0.21596771478652954, "learning_rate": 9.986989878680028e-05, "loss": 0.6553, "step": 730 }, { "epoch": 0.2491531802785096, "grad_norm": 0.271456778049469, "learning_rate": 9.986876556391788e-05, "loss": 0.6226, "step": 731 }, { "epoch": 0.2499059089198344, "grad_norm": 0.2374461144208908, "learning_rate": 9.986762743425476e-05, "loss": 0.6435, "step": 732 }, { "epoch": 0.2506586375611592, "grad_norm": 0.2557368576526642, "learning_rate": 9.986648439793536e-05, "loss": 0.5486, "step": 733 }, { "epoch": 0.251411366202484, "grad_norm": 0.3115346431732178, "learning_rate": 9.986533645508467e-05, "loss": 0.5905, "step": 734 }, { "epoch": 0.2521640948438088, "grad_norm": 0.3143802881240845, "learning_rate": 9.986418360582826e-05, "loss": 0.5613, "step": 735 }, { "epoch": 0.2529168234851336, "grad_norm": 0.45340675115585327, "learning_rate": 9.986302585029216e-05, "loss": 0.709, "step": 736 }, { "epoch": 0.2536695521264584, "grad_norm": 0.5464420914649963, "learning_rate": 9.986186318860302e-05, "loss": 0.4952, "step": 737 }, { "epoch": 0.2544222807677832, "grad_norm": 0.4973028600215912, "learning_rate": 9.986069562088795e-05, "loss": 0.5914, "step": 738 }, { "epoch": 0.25517500940910803, "grad_norm": 0.3173585832118988, "learning_rate": 9.985952314727468e-05, "loss": 0.5071, "step": 739 }, { "epoch": 0.25592773805043284, "grad_norm": 0.3916628956794739, "learning_rate": 9.985834576789139e-05, "loss": 0.5505, "step": 740 }, { "epoch": 0.25668046669175765, "grad_norm": 0.4439134895801544, "learning_rate": 9.985716348286685e-05, "loss": 0.5458, "step": 741 }, { "epoch": 0.2574331953330824, "grad_norm": 0.32744377851486206, "learning_rate": 9.985597629233038e-05, "loss": 0.5777, "step": 742 }, { "epoch": 0.2581859239744072, "grad_norm": 0.3234154284000397, "learning_rate": 9.985478419641174e-05, "loss": 0.5822, "step": 743 }, { "epoch": 0.258938652615732, "grad_norm": 0.3529973328113556, "learning_rate": 9.985358719524136e-05, "loss": 0.4959, "step": 744 }, { "epoch": 0.2596913812570568, "grad_norm": 0.295553058385849, "learning_rate": 9.985238528895012e-05, "loss": 0.5602, "step": 745 }, { "epoch": 0.2604441098983816, "grad_norm": 0.2871539890766144, "learning_rate": 9.985117847766946e-05, "loss": 0.7248, "step": 746 }, { "epoch": 0.26119683853970643, "grad_norm": 0.295327365398407, "learning_rate": 9.984996676153134e-05, "loss": 0.61, "step": 747 }, { "epoch": 0.26194956718103124, "grad_norm": 0.26523783802986145, "learning_rate": 9.984875014066832e-05, "loss": 0.5094, "step": 748 }, { "epoch": 0.26270229582235605, "grad_norm": 0.34404316544532776, "learning_rate": 9.984752861521338e-05, "loss": 0.5798, "step": 749 }, { "epoch": 0.26345502446368085, "grad_norm": 0.3108694553375244, "learning_rate": 9.984630218530014e-05, "loss": 0.4624, "step": 750 }, { "epoch": 0.26420775310500566, "grad_norm": 0.3012259006500244, "learning_rate": 9.984507085106273e-05, "loss": 0.6275, "step": 751 }, { "epoch": 0.26496048174633047, "grad_norm": 0.31519338488578796, "learning_rate": 9.98438346126358e-05, "loss": 0.6768, "step": 752 }, { "epoch": 0.2657132103876553, "grad_norm": 0.2880750298500061, "learning_rate": 9.984259347015453e-05, "loss": 0.6701, "step": 753 }, { "epoch": 0.26646593902898, "grad_norm": 0.26008641719818115, "learning_rate": 9.984134742375466e-05, "loss": 0.5476, "step": 754 }, { "epoch": 0.26721866767030483, "grad_norm": 0.2975877821445465, "learning_rate": 9.984009647357244e-05, "loss": 0.5572, "step": 755 }, { "epoch": 0.26797139631162964, "grad_norm": 0.22505499422550201, "learning_rate": 9.983884061974471e-05, "loss": 0.6152, "step": 756 }, { "epoch": 0.26872412495295445, "grad_norm": 0.2617549002170563, "learning_rate": 9.983757986240877e-05, "loss": 0.5049, "step": 757 }, { "epoch": 0.26947685359427925, "grad_norm": 0.25422149896621704, "learning_rate": 9.983631420170252e-05, "loss": 0.5885, "step": 758 }, { "epoch": 0.27022958223560406, "grad_norm": 0.26040858030319214, "learning_rate": 9.983504363776435e-05, "loss": 0.4885, "step": 759 }, { "epoch": 0.27098231087692887, "grad_norm": 0.23542794585227966, "learning_rate": 9.98337681707332e-05, "loss": 0.533, "step": 760 }, { "epoch": 0.2717350395182537, "grad_norm": 0.24552997946739197, "learning_rate": 9.98324878007486e-05, "loss": 0.6253, "step": 761 }, { "epoch": 0.2724877681595785, "grad_norm": 0.23433150351047516, "learning_rate": 9.983120252795053e-05, "loss": 0.6248, "step": 762 }, { "epoch": 0.2732404968009033, "grad_norm": 0.259512722492218, "learning_rate": 9.982991235247954e-05, "loss": 0.5576, "step": 763 }, { "epoch": 0.2739932254422281, "grad_norm": 0.319949209690094, "learning_rate": 9.982861727447675e-05, "loss": 0.562, "step": 764 }, { "epoch": 0.2747459540835529, "grad_norm": 0.33442753553390503, "learning_rate": 9.982731729408377e-05, "loss": 0.5058, "step": 765 }, { "epoch": 0.2754986827248777, "grad_norm": 0.32845205068588257, "learning_rate": 9.982601241144277e-05, "loss": 0.5319, "step": 766 }, { "epoch": 0.27625141136620246, "grad_norm": 0.31467804312705994, "learning_rate": 9.982470262669643e-05, "loss": 0.5687, "step": 767 }, { "epoch": 0.27700414000752727, "grad_norm": 0.2824096977710724, "learning_rate": 9.982338793998802e-05, "loss": 0.6342, "step": 768 }, { "epoch": 0.2777568686488521, "grad_norm": 0.32189518213272095, "learning_rate": 9.982206835146131e-05, "loss": 0.582, "step": 769 }, { "epoch": 0.2785095972901769, "grad_norm": 0.2265952229499817, "learning_rate": 9.982074386126057e-05, "loss": 0.629, "step": 770 }, { "epoch": 0.2792623259315017, "grad_norm": 0.37726283073425293, "learning_rate": 9.981941446953066e-05, "loss": 0.4958, "step": 771 }, { "epoch": 0.2800150545728265, "grad_norm": 0.47976142168045044, "learning_rate": 9.981808017641699e-05, "loss": 0.5658, "step": 772 }, { "epoch": 0.2807677832141513, "grad_norm": 0.3440079391002655, "learning_rate": 9.981674098206545e-05, "loss": 0.5586, "step": 773 }, { "epoch": 0.2815205118554761, "grad_norm": 0.3813066780567169, "learning_rate": 9.98153968866225e-05, "loss": 0.5897, "step": 774 }, { "epoch": 0.2822732404968009, "grad_norm": 0.2609940469264984, "learning_rate": 9.981404789023512e-05, "loss": 0.6026, "step": 775 }, { "epoch": 0.2830259691381257, "grad_norm": 0.3683112561702728, "learning_rate": 9.981269399305084e-05, "loss": 0.6199, "step": 776 }, { "epoch": 0.28377869777945053, "grad_norm": 0.2804531753063202, "learning_rate": 9.981133519521773e-05, "loss": 0.5359, "step": 777 }, { "epoch": 0.28453142642077534, "grad_norm": 0.27813810110092163, "learning_rate": 9.980997149688437e-05, "loss": 0.6151, "step": 778 }, { "epoch": 0.2852841550621001, "grad_norm": 0.320430189371109, "learning_rate": 9.980860289819989e-05, "loss": 0.5927, "step": 779 }, { "epoch": 0.2860368837034249, "grad_norm": 0.28515303134918213, "learning_rate": 9.980722939931397e-05, "loss": 0.4989, "step": 780 }, { "epoch": 0.2867896123447497, "grad_norm": 0.2639062702655792, "learning_rate": 9.980585100037681e-05, "loss": 0.6137, "step": 781 }, { "epoch": 0.2875423409860745, "grad_norm": 0.2596417963504791, "learning_rate": 9.980446770153917e-05, "loss": 0.5075, "step": 782 }, { "epoch": 0.2882950696273993, "grad_norm": 0.25146999955177307, "learning_rate": 9.980307950295228e-05, "loss": 0.5478, "step": 783 }, { "epoch": 0.2890477982687241, "grad_norm": 0.3319234549999237, "learning_rate": 9.980168640476797e-05, "loss": 0.6092, "step": 784 }, { "epoch": 0.28980052691004893, "grad_norm": 0.24940693378448486, "learning_rate": 9.980028840713861e-05, "loss": 0.4787, "step": 785 }, { "epoch": 0.29055325555137373, "grad_norm": 0.21990908682346344, "learning_rate": 9.979888551021705e-05, "loss": 0.5492, "step": 786 }, { "epoch": 0.29130598419269854, "grad_norm": 0.23395197093486786, "learning_rate": 9.979747771415675e-05, "loss": 0.5395, "step": 787 }, { "epoch": 0.29205871283402335, "grad_norm": 0.2440863698720932, "learning_rate": 9.97960650191116e-05, "loss": 0.5619, "step": 788 }, { "epoch": 0.29281144147534816, "grad_norm": 0.31268787384033203, "learning_rate": 9.979464742523617e-05, "loss": 0.6054, "step": 789 }, { "epoch": 0.29356417011667296, "grad_norm": 0.2579202950000763, "learning_rate": 9.979322493268543e-05, "loss": 0.6338, "step": 790 }, { "epoch": 0.29431689875799777, "grad_norm": 0.2655872702598572, "learning_rate": 9.979179754161494e-05, "loss": 0.6028, "step": 791 }, { "epoch": 0.2950696273993225, "grad_norm": 0.2537118196487427, "learning_rate": 9.979036525218084e-05, "loss": 0.6492, "step": 792 }, { "epoch": 0.2958223560406473, "grad_norm": 0.3042292594909668, "learning_rate": 9.978892806453973e-05, "loss": 0.6268, "step": 793 }, { "epoch": 0.29657508468197213, "grad_norm": 0.3972797095775604, "learning_rate": 9.978748597884878e-05, "loss": 0.6267, "step": 794 }, { "epoch": 0.29732781332329694, "grad_norm": 0.34563493728637695, "learning_rate": 9.97860389952657e-05, "loss": 0.6163, "step": 795 }, { "epoch": 0.29808054196462175, "grad_norm": 0.288851261138916, "learning_rate": 9.978458711394873e-05, "loss": 0.5477, "step": 796 }, { "epoch": 0.29883327060594655, "grad_norm": 0.29261496663093567, "learning_rate": 9.978313033505665e-05, "loss": 0.5795, "step": 797 }, { "epoch": 0.29958599924727136, "grad_norm": 0.2952653467655182, "learning_rate": 9.978166865874878e-05, "loss": 0.6143, "step": 798 }, { "epoch": 0.30033872788859617, "grad_norm": 0.268827885389328, "learning_rate": 9.978020208518493e-05, "loss": 0.6051, "step": 799 }, { "epoch": 0.301091456529921, "grad_norm": 0.2764621675014496, "learning_rate": 9.977873061452552e-05, "loss": 0.524, "step": 800 }, { "epoch": 0.301091456529921, "eval_loss": 0.5531788468360901, "eval_runtime": 456.2186, "eval_samples_per_second": 21.102, "eval_steps_per_second": 0.66, "step": 800 }, { "epoch": 0.3018441851712458, "grad_norm": 0.2915211021900177, "learning_rate": 9.977725424693145e-05, "loss": 0.585, "step": 801 }, { "epoch": 0.3025969138125706, "grad_norm": 0.3197081387042999, "learning_rate": 9.977577298256417e-05, "loss": 0.4663, "step": 802 }, { "epoch": 0.3033496424538954, "grad_norm": 0.24979710578918457, "learning_rate": 9.977428682158569e-05, "loss": 0.5793, "step": 803 }, { "epoch": 0.30410237109522015, "grad_norm": 0.4540333151817322, "learning_rate": 9.977279576415853e-05, "loss": 0.5718, "step": 804 }, { "epoch": 0.30485509973654495, "grad_norm": 0.35731208324432373, "learning_rate": 9.97712998104457e-05, "loss": 0.5824, "step": 805 }, { "epoch": 0.30560782837786976, "grad_norm": 0.29730284214019775, "learning_rate": 9.976979896061086e-05, "loss": 0.6622, "step": 806 }, { "epoch": 0.30636055701919457, "grad_norm": 0.2562820017337799, "learning_rate": 9.976829321481812e-05, "loss": 0.5046, "step": 807 }, { "epoch": 0.3071132856605194, "grad_norm": 0.3767290711402893, "learning_rate": 9.976678257323213e-05, "loss": 0.638, "step": 808 }, { "epoch": 0.3078660143018442, "grad_norm": 0.33998167514801025, "learning_rate": 9.976526703601811e-05, "loss": 0.5232, "step": 809 }, { "epoch": 0.308618742943169, "grad_norm": 0.23478087782859802, "learning_rate": 9.97637466033418e-05, "loss": 0.5272, "step": 810 }, { "epoch": 0.3093714715844938, "grad_norm": 0.3138110637664795, "learning_rate": 9.976222127536944e-05, "loss": 0.4929, "step": 811 }, { "epoch": 0.3101242002258186, "grad_norm": 0.33192312717437744, "learning_rate": 9.976069105226788e-05, "loss": 0.6694, "step": 812 }, { "epoch": 0.3108769288671434, "grad_norm": 0.2693336308002472, "learning_rate": 9.975915593420444e-05, "loss": 0.5358, "step": 813 }, { "epoch": 0.3116296575084682, "grad_norm": 0.3380907475948334, "learning_rate": 9.975761592134699e-05, "loss": 0.5013, "step": 814 }, { "epoch": 0.312382386149793, "grad_norm": 0.4335043132305145, "learning_rate": 9.975607101386398e-05, "loss": 0.6873, "step": 815 }, { "epoch": 0.3131351147911178, "grad_norm": 0.3318432569503784, "learning_rate": 9.975452121192431e-05, "loss": 0.5346, "step": 816 }, { "epoch": 0.3138878434324426, "grad_norm": 0.32577165961265564, "learning_rate": 9.97529665156975e-05, "loss": 0.6463, "step": 817 }, { "epoch": 0.3146405720737674, "grad_norm": 0.3511887788772583, "learning_rate": 9.975140692535354e-05, "loss": 0.5637, "step": 818 }, { "epoch": 0.3153933007150922, "grad_norm": 0.26177850365638733, "learning_rate": 9.974984244106302e-05, "loss": 0.4808, "step": 819 }, { "epoch": 0.316146029356417, "grad_norm": 0.29404309391975403, "learning_rate": 9.974827306299701e-05, "loss": 0.5912, "step": 820 }, { "epoch": 0.3168987579977418, "grad_norm": 0.3165629804134369, "learning_rate": 9.974669879132713e-05, "loss": 0.655, "step": 821 }, { "epoch": 0.3176514866390666, "grad_norm": 0.30745553970336914, "learning_rate": 9.974511962622555e-05, "loss": 0.6384, "step": 822 }, { "epoch": 0.3184042152803914, "grad_norm": 0.4105824828147888, "learning_rate": 9.974353556786496e-05, "loss": 0.7104, "step": 823 }, { "epoch": 0.31915694392171623, "grad_norm": 0.39225050806999207, "learning_rate": 9.974194661641859e-05, "loss": 0.5792, "step": 824 }, { "epoch": 0.31990967256304104, "grad_norm": 0.299409955739975, "learning_rate": 9.974035277206021e-05, "loss": 0.5365, "step": 825 }, { "epoch": 0.32066240120436584, "grad_norm": 0.29286259412765503, "learning_rate": 9.97387540349641e-05, "loss": 0.6069, "step": 826 }, { "epoch": 0.32141512984569065, "grad_norm": 0.2778761088848114, "learning_rate": 9.973715040530514e-05, "loss": 0.6498, "step": 827 }, { "epoch": 0.32216785848701546, "grad_norm": 0.2909785509109497, "learning_rate": 9.973554188325865e-05, "loss": 0.534, "step": 828 }, { "epoch": 0.3229205871283402, "grad_norm": 0.4161387085914612, "learning_rate": 9.973392846900056e-05, "loss": 0.5883, "step": 829 }, { "epoch": 0.323673315769665, "grad_norm": 0.4464836120605469, "learning_rate": 9.973231016270731e-05, "loss": 0.6382, "step": 830 }, { "epoch": 0.3244260444109898, "grad_norm": 0.31394997239112854, "learning_rate": 9.973068696455589e-05, "loss": 0.4938, "step": 831 }, { "epoch": 0.32517877305231463, "grad_norm": 0.3363230228424072, "learning_rate": 9.972905887472377e-05, "loss": 0.4624, "step": 832 }, { "epoch": 0.32593150169363944, "grad_norm": 0.3748658299446106, "learning_rate": 9.972742589338905e-05, "loss": 0.5521, "step": 833 }, { "epoch": 0.32668423033496424, "grad_norm": 0.2635120153427124, "learning_rate": 9.972578802073026e-05, "loss": 0.4968, "step": 834 }, { "epoch": 0.32743695897628905, "grad_norm": 0.3070274293422699, "learning_rate": 9.972414525692653e-05, "loss": 0.6277, "step": 835 }, { "epoch": 0.32818968761761386, "grad_norm": 0.41728705167770386, "learning_rate": 9.972249760215754e-05, "loss": 0.5981, "step": 836 }, { "epoch": 0.32894241625893866, "grad_norm": 0.6310741901397705, "learning_rate": 9.972084505660344e-05, "loss": 0.5539, "step": 837 }, { "epoch": 0.32969514490026347, "grad_norm": 0.4112805426120758, "learning_rate": 9.971918762044496e-05, "loss": 0.4924, "step": 838 }, { "epoch": 0.3304478735415883, "grad_norm": 0.43406054377555847, "learning_rate": 9.971752529386336e-05, "loss": 0.5185, "step": 839 }, { "epoch": 0.3312006021829131, "grad_norm": 0.309025377035141, "learning_rate": 9.971585807704043e-05, "loss": 0.4968, "step": 840 }, { "epoch": 0.33195333082423784, "grad_norm": 0.33418145775794983, "learning_rate": 9.971418597015848e-05, "loss": 0.5096, "step": 841 }, { "epoch": 0.33270605946556264, "grad_norm": 0.3361812233924866, "learning_rate": 9.971250897340038e-05, "loss": 0.6035, "step": 842 }, { "epoch": 0.33345878810688745, "grad_norm": 0.28903672099113464, "learning_rate": 9.971082708694953e-05, "loss": 0.5335, "step": 843 }, { "epoch": 0.33421151674821226, "grad_norm": 0.2499721348285675, "learning_rate": 9.970914031098984e-05, "loss": 0.6851, "step": 844 }, { "epoch": 0.33496424538953706, "grad_norm": 0.3403612971305847, "learning_rate": 9.97074486457058e-05, "loss": 0.5452, "step": 845 }, { "epoch": 0.33571697403086187, "grad_norm": 0.2740510404109955, "learning_rate": 9.970575209128238e-05, "loss": 0.5379, "step": 846 }, { "epoch": 0.3364697026721867, "grad_norm": 0.267524391412735, "learning_rate": 9.970405064790513e-05, "loss": 0.5382, "step": 847 }, { "epoch": 0.3372224313135115, "grad_norm": 0.29750630259513855, "learning_rate": 9.970234431576011e-05, "loss": 0.5524, "step": 848 }, { "epoch": 0.3379751599548363, "grad_norm": 0.27484166622161865, "learning_rate": 9.970063309503394e-05, "loss": 0.5319, "step": 849 }, { "epoch": 0.3387278885961611, "grad_norm": 0.25935766100883484, "learning_rate": 9.969891698591372e-05, "loss": 0.6201, "step": 850 }, { "epoch": 0.3394806172374859, "grad_norm": 0.25876232981681824, "learning_rate": 9.969719598858715e-05, "loss": 0.5568, "step": 851 }, { "epoch": 0.3402333458788107, "grad_norm": 0.2792060077190399, "learning_rate": 9.969547010324244e-05, "loss": 0.5169, "step": 852 }, { "epoch": 0.3409860745201355, "grad_norm": 0.23810240626335144, "learning_rate": 9.96937393300683e-05, "loss": 0.5506, "step": 853 }, { "epoch": 0.34173880316146027, "grad_norm": 0.29577508568763733, "learning_rate": 9.969200366925404e-05, "loss": 0.4581, "step": 854 }, { "epoch": 0.3424915318027851, "grad_norm": 0.28710857033729553, "learning_rate": 9.969026312098942e-05, "loss": 0.5799, "step": 855 }, { "epoch": 0.3432442604441099, "grad_norm": 0.20972034335136414, "learning_rate": 9.968851768546486e-05, "loss": 0.4932, "step": 856 }, { "epoch": 0.3439969890854347, "grad_norm": 0.2887890934944153, "learning_rate": 9.968676736287116e-05, "loss": 0.5411, "step": 857 }, { "epoch": 0.3447497177267595, "grad_norm": 0.28695911169052124, "learning_rate": 9.968501215339978e-05, "loss": 0.5921, "step": 858 }, { "epoch": 0.3455024463680843, "grad_norm": 0.22573857009410858, "learning_rate": 9.968325205724265e-05, "loss": 0.533, "step": 859 }, { "epoch": 0.3462551750094091, "grad_norm": 0.38366183638572693, "learning_rate": 9.968148707459226e-05, "loss": 0.5178, "step": 860 }, { "epoch": 0.3470079036507339, "grad_norm": 0.2567957937717438, "learning_rate": 9.967971720564162e-05, "loss": 0.5313, "step": 861 }, { "epoch": 0.3477606322920587, "grad_norm": 0.4646987020969391, "learning_rate": 9.967794245058428e-05, "loss": 0.5914, "step": 862 }, { "epoch": 0.34851336093338353, "grad_norm": 0.40723684430122375, "learning_rate": 9.967616280961433e-05, "loss": 0.575, "step": 863 }, { "epoch": 0.34926608957470834, "grad_norm": 0.4007025957107544, "learning_rate": 9.967437828292637e-05, "loss": 0.593, "step": 864 }, { "epoch": 0.35001881821603315, "grad_norm": 0.3235537111759186, "learning_rate": 9.96725888707156e-05, "loss": 0.4927, "step": 865 }, { "epoch": 0.3507715468573579, "grad_norm": 0.444448322057724, "learning_rate": 9.967079457317764e-05, "loss": 0.5955, "step": 866 }, { "epoch": 0.3515242754986827, "grad_norm": 0.4435630440711975, "learning_rate": 9.966899539050877e-05, "loss": 0.6756, "step": 867 }, { "epoch": 0.3522770041400075, "grad_norm": 0.324283629655838, "learning_rate": 9.96671913229057e-05, "loss": 0.6019, "step": 868 }, { "epoch": 0.3530297327813323, "grad_norm": 0.24627013504505157, "learning_rate": 9.966538237056577e-05, "loss": 0.4833, "step": 869 }, { "epoch": 0.3537824614226571, "grad_norm": 0.34050101041793823, "learning_rate": 9.966356853368677e-05, "loss": 0.5323, "step": 870 }, { "epoch": 0.35453519006398193, "grad_norm": 0.2619081437587738, "learning_rate": 9.966174981246705e-05, "loss": 0.5527, "step": 871 }, { "epoch": 0.35528791870530674, "grad_norm": 0.2508618235588074, "learning_rate": 9.965992620710552e-05, "loss": 0.4923, "step": 872 }, { "epoch": 0.35604064734663154, "grad_norm": 0.23675216734409332, "learning_rate": 9.965809771780162e-05, "loss": 0.5627, "step": 873 }, { "epoch": 0.35679337598795635, "grad_norm": 0.37293684482574463, "learning_rate": 9.96562643447553e-05, "loss": 0.5474, "step": 874 }, { "epoch": 0.35754610462928116, "grad_norm": 0.4013768136501312, "learning_rate": 9.965442608816703e-05, "loss": 0.6003, "step": 875 }, { "epoch": 0.35829883327060597, "grad_norm": 0.36650213599205017, "learning_rate": 9.965258294823787e-05, "loss": 0.5744, "step": 876 }, { "epoch": 0.35905156191193077, "grad_norm": 0.25879546999931335, "learning_rate": 9.965073492516937e-05, "loss": 0.6357, "step": 877 }, { "epoch": 0.3598042905532556, "grad_norm": 0.29715797305107117, "learning_rate": 9.964888201916364e-05, "loss": 0.5892, "step": 878 }, { "epoch": 0.36055701919458033, "grad_norm": 0.2334136813879013, "learning_rate": 9.964702423042331e-05, "loss": 0.5679, "step": 879 }, { "epoch": 0.36130974783590514, "grad_norm": 0.3139456808567047, "learning_rate": 9.964516155915151e-05, "loss": 0.5339, "step": 880 }, { "epoch": 0.36206247647722994, "grad_norm": 0.2768210172653198, "learning_rate": 9.964329400555197e-05, "loss": 0.5067, "step": 881 }, { "epoch": 0.36281520511855475, "grad_norm": 0.3446118235588074, "learning_rate": 9.964142156982894e-05, "loss": 0.4643, "step": 882 }, { "epoch": 0.36356793375987956, "grad_norm": 0.25530895590782166, "learning_rate": 9.963954425218713e-05, "loss": 0.604, "step": 883 }, { "epoch": 0.36432066240120436, "grad_norm": 0.245814248919487, "learning_rate": 9.96376620528319e-05, "loss": 0.5575, "step": 884 }, { "epoch": 0.36507339104252917, "grad_norm": 0.26235291361808777, "learning_rate": 9.963577497196905e-05, "loss": 0.5421, "step": 885 }, { "epoch": 0.365826119683854, "grad_norm": 0.22320140898227692, "learning_rate": 9.963388300980495e-05, "loss": 0.5711, "step": 886 }, { "epoch": 0.3665788483251788, "grad_norm": 0.24791871011257172, "learning_rate": 9.963198616654653e-05, "loss": 0.6484, "step": 887 }, { "epoch": 0.3673315769665036, "grad_norm": 0.24754199385643005, "learning_rate": 9.96300844424012e-05, "loss": 0.5884, "step": 888 }, { "epoch": 0.3680843056078284, "grad_norm": 0.3465268611907959, "learning_rate": 9.962817783757693e-05, "loss": 0.5305, "step": 889 }, { "epoch": 0.3688370342491532, "grad_norm": 0.3429701626300812, "learning_rate": 9.962626635228223e-05, "loss": 0.4988, "step": 890 }, { "epoch": 0.36958976289047796, "grad_norm": 0.3547210395336151, "learning_rate": 9.962434998672614e-05, "loss": 0.5573, "step": 891 }, { "epoch": 0.37034249153180276, "grad_norm": 0.3058502972126007, "learning_rate": 9.962242874111823e-05, "loss": 0.5628, "step": 892 }, { "epoch": 0.37109522017312757, "grad_norm": 0.27874720096588135, "learning_rate": 9.962050261566859e-05, "loss": 0.5559, "step": 893 }, { "epoch": 0.3718479488144524, "grad_norm": 0.3551805317401886, "learning_rate": 9.961857161058789e-05, "loss": 0.4518, "step": 894 }, { "epoch": 0.3726006774557772, "grad_norm": 0.32730522751808167, "learning_rate": 9.961663572608725e-05, "loss": 0.65, "step": 895 }, { "epoch": 0.373353406097102, "grad_norm": 0.2779262959957123, "learning_rate": 9.961469496237844e-05, "loss": 0.5005, "step": 896 }, { "epoch": 0.3741061347384268, "grad_norm": 0.359273225069046, "learning_rate": 9.961274931967365e-05, "loss": 0.6102, "step": 897 }, { "epoch": 0.3748588633797516, "grad_norm": 0.29356783628463745, "learning_rate": 9.961079879818567e-05, "loss": 0.5792, "step": 898 }, { "epoch": 0.3756115920210764, "grad_norm": 0.2626100182533264, "learning_rate": 9.960884339812781e-05, "loss": 0.574, "step": 899 }, { "epoch": 0.3763643206624012, "grad_norm": 0.35187163949012756, "learning_rate": 9.96068831197139e-05, "loss": 0.6388, "step": 900 }, { "epoch": 0.377117049303726, "grad_norm": 0.31688231229782104, "learning_rate": 9.96049179631583e-05, "loss": 0.6209, "step": 901 }, { "epoch": 0.37786977794505083, "grad_norm": 0.29052263498306274, "learning_rate": 9.960294792867596e-05, "loss": 0.521, "step": 902 }, { "epoch": 0.3786225065863756, "grad_norm": 0.3092794120311737, "learning_rate": 9.96009730164823e-05, "loss": 0.5162, "step": 903 }, { "epoch": 0.3793752352277004, "grad_norm": 0.3965584933757782, "learning_rate": 9.959899322679326e-05, "loss": 0.5141, "step": 904 }, { "epoch": 0.3801279638690252, "grad_norm": 0.3703252673149109, "learning_rate": 9.959700855982538e-05, "loss": 0.6262, "step": 905 }, { "epoch": 0.38088069251035, "grad_norm": 0.2621668577194214, "learning_rate": 9.95950190157957e-05, "loss": 0.6416, "step": 906 }, { "epoch": 0.3816334211516748, "grad_norm": 0.2994947135448456, "learning_rate": 9.95930245949218e-05, "loss": 0.5696, "step": 907 }, { "epoch": 0.3823861497929996, "grad_norm": 0.3368871510028839, "learning_rate": 9.959102529742175e-05, "loss": 0.468, "step": 908 }, { "epoch": 0.3831388784343244, "grad_norm": 0.23110422492027283, "learning_rate": 9.958902112351423e-05, "loss": 0.5515, "step": 909 }, { "epoch": 0.38389160707564923, "grad_norm": 0.2673763632774353, "learning_rate": 9.95870120734184e-05, "loss": 0.5179, "step": 910 }, { "epoch": 0.38464433571697404, "grad_norm": 0.3125317692756653, "learning_rate": 9.958499814735397e-05, "loss": 0.4721, "step": 911 }, { "epoch": 0.38539706435829885, "grad_norm": 0.2695204019546509, "learning_rate": 9.958297934554117e-05, "loss": 0.5121, "step": 912 }, { "epoch": 0.38614979299962365, "grad_norm": 0.24736401438713074, "learning_rate": 9.958095566820078e-05, "loss": 0.618, "step": 913 }, { "epoch": 0.38690252164094846, "grad_norm": 0.23015698790550232, "learning_rate": 9.957892711555409e-05, "loss": 0.5982, "step": 914 }, { "epoch": 0.38765525028227327, "grad_norm": 0.2530623972415924, "learning_rate": 9.957689368782297e-05, "loss": 0.4794, "step": 915 }, { "epoch": 0.388407978923598, "grad_norm": 0.2949036955833435, "learning_rate": 9.957485538522978e-05, "loss": 0.4684, "step": 916 }, { "epoch": 0.3891607075649228, "grad_norm": 0.23893460631370544, "learning_rate": 9.95728122079974e-05, "loss": 0.4696, "step": 917 }, { "epoch": 0.38991343620624763, "grad_norm": 0.24412593245506287, "learning_rate": 9.95707641563493e-05, "loss": 0.499, "step": 918 }, { "epoch": 0.39066616484757244, "grad_norm": 0.26614370942115784, "learning_rate": 9.956871123050946e-05, "loss": 0.47, "step": 919 }, { "epoch": 0.39141889348889725, "grad_norm": 0.23155595362186432, "learning_rate": 9.956665343070234e-05, "loss": 0.5076, "step": 920 }, { "epoch": 0.39217162213022205, "grad_norm": 0.23865316808223724, "learning_rate": 9.956459075715305e-05, "loss": 0.4649, "step": 921 }, { "epoch": 0.39292435077154686, "grad_norm": 0.24898599088191986, "learning_rate": 9.956252321008707e-05, "loss": 0.5204, "step": 922 }, { "epoch": 0.39367707941287167, "grad_norm": 0.2693829834461212, "learning_rate": 9.956045078973058e-05, "loss": 0.6187, "step": 923 }, { "epoch": 0.3944298080541965, "grad_norm": 0.23129674792289734, "learning_rate": 9.955837349631016e-05, "loss": 0.5286, "step": 924 }, { "epoch": 0.3951825366955213, "grad_norm": 0.21596774458885193, "learning_rate": 9.955629133005302e-05, "loss": 0.5392, "step": 925 }, { "epoch": 0.3959352653368461, "grad_norm": 0.29742753505706787, "learning_rate": 9.955420429118688e-05, "loss": 0.4254, "step": 926 }, { "epoch": 0.3966879939781709, "grad_norm": 0.27692466974258423, "learning_rate": 9.955211237993989e-05, "loss": 0.4708, "step": 927 }, { "epoch": 0.39744072261949565, "grad_norm": 0.21078342199325562, "learning_rate": 9.955001559654091e-05, "loss": 0.5514, "step": 928 }, { "epoch": 0.39819345126082045, "grad_norm": 0.2530319392681122, "learning_rate": 9.95479139412192e-05, "loss": 0.5833, "step": 929 }, { "epoch": 0.39894617990214526, "grad_norm": 0.27327755093574524, "learning_rate": 9.95458074142046e-05, "loss": 0.4374, "step": 930 }, { "epoch": 0.39969890854347007, "grad_norm": 0.2324512004852295, "learning_rate": 9.954369601572747e-05, "loss": 0.5979, "step": 931 }, { "epoch": 0.4004516371847949, "grad_norm": 0.2299649566411972, "learning_rate": 9.95415797460187e-05, "loss": 0.4924, "step": 932 }, { "epoch": 0.4012043658261197, "grad_norm": 0.24139761924743652, "learning_rate": 9.953945860530976e-05, "loss": 0.5202, "step": 933 }, { "epoch": 0.4019570944674445, "grad_norm": 0.3859105408191681, "learning_rate": 9.953733259383258e-05, "loss": 0.6498, "step": 934 }, { "epoch": 0.4027098231087693, "grad_norm": 0.319953978061676, "learning_rate": 9.953520171181965e-05, "loss": 0.6319, "step": 935 }, { "epoch": 0.4034625517500941, "grad_norm": 0.32357606291770935, "learning_rate": 9.953306595950405e-05, "loss": 0.5303, "step": 936 }, { "epoch": 0.4042152803914189, "grad_norm": 0.32033729553222656, "learning_rate": 9.95309253371193e-05, "loss": 0.5001, "step": 937 }, { "epoch": 0.4049680090327437, "grad_norm": 0.3090396523475647, "learning_rate": 9.952877984489951e-05, "loss": 0.5178, "step": 938 }, { "epoch": 0.4057207376740685, "grad_norm": 0.2933139503002167, "learning_rate": 9.95266294830793e-05, "loss": 0.5243, "step": 939 }, { "epoch": 0.40647346631539333, "grad_norm": 0.2772461771965027, "learning_rate": 9.952447425189382e-05, "loss": 0.5684, "step": 940 }, { "epoch": 0.4072261949567181, "grad_norm": 0.26672911643981934, "learning_rate": 9.95223141515788e-05, "loss": 0.5074, "step": 941 }, { "epoch": 0.4079789235980429, "grad_norm": 0.31132858991622925, "learning_rate": 9.952014918237043e-05, "loss": 0.4471, "step": 942 }, { "epoch": 0.4087316522393677, "grad_norm": 0.30734890699386597, "learning_rate": 9.951797934450548e-05, "loss": 0.462, "step": 943 }, { "epoch": 0.4094843808806925, "grad_norm": 0.27107539772987366, "learning_rate": 9.951580463822124e-05, "loss": 0.5449, "step": 944 }, { "epoch": 0.4102371095220173, "grad_norm": 0.2520793080329895, "learning_rate": 9.951362506375555e-05, "loss": 0.4764, "step": 945 }, { "epoch": 0.4109898381633421, "grad_norm": 0.23660002648830414, "learning_rate": 9.951144062134673e-05, "loss": 0.4672, "step": 946 }, { "epoch": 0.4117425668046669, "grad_norm": 0.2638298273086548, "learning_rate": 9.950925131123369e-05, "loss": 0.4322, "step": 947 }, { "epoch": 0.4124952954459917, "grad_norm": 0.274772584438324, "learning_rate": 9.950705713365585e-05, "loss": 0.5339, "step": 948 }, { "epoch": 0.41324802408731653, "grad_norm": 0.23498772084712982, "learning_rate": 9.950485808885315e-05, "loss": 0.5641, "step": 949 }, { "epoch": 0.41400075272864134, "grad_norm": 0.24583180248737335, "learning_rate": 9.950265417706608e-05, "loss": 0.4856, "step": 950 }, { "epoch": 0.41475348136996615, "grad_norm": 0.250365287065506, "learning_rate": 9.950044539853567e-05, "loss": 0.4936, "step": 951 }, { "epoch": 0.41550621001129096, "grad_norm": 0.25993025302886963, "learning_rate": 9.949823175350345e-05, "loss": 0.5339, "step": 952 }, { "epoch": 0.4162589386526157, "grad_norm": 0.24438945949077606, "learning_rate": 9.949601324221151e-05, "loss": 0.6869, "step": 953 }, { "epoch": 0.4170116672939405, "grad_norm": 0.2702901363372803, "learning_rate": 9.949378986490245e-05, "loss": 0.5414, "step": 954 }, { "epoch": 0.4177643959352653, "grad_norm": 0.23533788323402405, "learning_rate": 9.949156162181944e-05, "loss": 0.6433, "step": 955 }, { "epoch": 0.4185171245765901, "grad_norm": 0.26812487840652466, "learning_rate": 9.948932851320614e-05, "loss": 0.578, "step": 956 }, { "epoch": 0.41926985321791493, "grad_norm": 0.21179401874542236, "learning_rate": 9.948709053930674e-05, "loss": 0.5051, "step": 957 }, { "epoch": 0.42002258185923974, "grad_norm": 0.2092764675617218, "learning_rate": 9.948484770036605e-05, "loss": 0.5299, "step": 958 }, { "epoch": 0.42077531050056455, "grad_norm": 0.23538509011268616, "learning_rate": 9.948259999662925e-05, "loss": 0.4389, "step": 959 }, { "epoch": 0.42152803914188935, "grad_norm": 0.31759095191955566, "learning_rate": 9.948034742834223e-05, "loss": 0.4278, "step": 960 }, { "epoch": 0.42228076778321416, "grad_norm": 0.2896108329296112, "learning_rate": 9.947808999575127e-05, "loss": 0.5762, "step": 961 }, { "epoch": 0.42303349642453897, "grad_norm": 0.2873057425022125, "learning_rate": 9.947582769910326e-05, "loss": 0.4768, "step": 962 }, { "epoch": 0.4237862250658638, "grad_norm": 0.26840507984161377, "learning_rate": 9.94735605386456e-05, "loss": 0.5615, "step": 963 }, { "epoch": 0.4245389537071886, "grad_norm": 0.2852790951728821, "learning_rate": 9.947128851462624e-05, "loss": 0.5654, "step": 964 }, { "epoch": 0.42529168234851333, "grad_norm": 0.25805985927581787, "learning_rate": 9.94690116272936e-05, "loss": 0.5555, "step": 965 }, { "epoch": 0.42604441098983814, "grad_norm": 0.2552996575832367, "learning_rate": 9.946672987689674e-05, "loss": 0.5445, "step": 966 }, { "epoch": 0.42679713963116295, "grad_norm": 0.2876366078853607, "learning_rate": 9.946444326368515e-05, "loss": 0.5725, "step": 967 }, { "epoch": 0.42754986827248775, "grad_norm": 0.29995742440223694, "learning_rate": 9.946215178790888e-05, "loss": 0.5995, "step": 968 }, { "epoch": 0.42830259691381256, "grad_norm": 0.2923589050769806, "learning_rate": 9.945985544981854e-05, "loss": 0.5083, "step": 969 }, { "epoch": 0.42905532555513737, "grad_norm": 0.29476484656333923, "learning_rate": 9.945755424966527e-05, "loss": 0.5742, "step": 970 }, { "epoch": 0.4298080541964622, "grad_norm": 0.2598770558834076, "learning_rate": 9.945524818770069e-05, "loss": 0.5192, "step": 971 }, { "epoch": 0.430560782837787, "grad_norm": 0.22467155754566193, "learning_rate": 9.945293726417702e-05, "loss": 0.3933, "step": 972 }, { "epoch": 0.4313135114791118, "grad_norm": 0.2489568442106247, "learning_rate": 9.945062147934694e-05, "loss": 0.535, "step": 973 }, { "epoch": 0.4320662401204366, "grad_norm": 0.22932292520999908, "learning_rate": 9.944830083346374e-05, "loss": 0.5088, "step": 974 }, { "epoch": 0.4328189687617614, "grad_norm": 0.2522759735584259, "learning_rate": 9.94459753267812e-05, "loss": 0.5079, "step": 975 }, { "epoch": 0.4335716974030862, "grad_norm": 0.2679159343242645, "learning_rate": 9.944364495955362e-05, "loss": 0.5498, "step": 976 }, { "epoch": 0.434324426044411, "grad_norm": 0.2641872763633728, "learning_rate": 9.944130973203584e-05, "loss": 0.499, "step": 977 }, { "epoch": 0.43507715468573577, "grad_norm": 0.28816577792167664, "learning_rate": 9.943896964448324e-05, "loss": 0.4462, "step": 978 }, { "epoch": 0.4358298833270606, "grad_norm": 0.28880366683006287, "learning_rate": 9.943662469715174e-05, "loss": 0.4767, "step": 979 }, { "epoch": 0.4365826119683854, "grad_norm": 0.27029773592948914, "learning_rate": 9.943427489029776e-05, "loss": 0.5797, "step": 980 }, { "epoch": 0.4373353406097102, "grad_norm": 0.24669088423252106, "learning_rate": 9.943192022417829e-05, "loss": 0.5462, "step": 981 }, { "epoch": 0.438088069251035, "grad_norm": 0.22207114100456238, "learning_rate": 9.942956069905083e-05, "loss": 0.5837, "step": 982 }, { "epoch": 0.4388407978923598, "grad_norm": 0.21680086851119995, "learning_rate": 9.942719631517341e-05, "loss": 0.4412, "step": 983 }, { "epoch": 0.4395935265336846, "grad_norm": 0.2495070844888687, "learning_rate": 9.94248270728046e-05, "loss": 0.45, "step": 984 }, { "epoch": 0.4403462551750094, "grad_norm": 0.2463860660791397, "learning_rate": 9.94224529722035e-05, "loss": 0.5161, "step": 985 }, { "epoch": 0.4410989838163342, "grad_norm": 0.23257146775722504, "learning_rate": 9.94200740136297e-05, "loss": 0.4629, "step": 986 }, { "epoch": 0.44185171245765903, "grad_norm": 0.39280542731285095, "learning_rate": 9.941769019734341e-05, "loss": 0.5679, "step": 987 }, { "epoch": 0.44260444109898384, "grad_norm": 0.3608780801296234, "learning_rate": 9.941530152360531e-05, "loss": 0.6182, "step": 988 }, { "epoch": 0.44335716974030864, "grad_norm": 0.33975085616111755, "learning_rate": 9.941290799267661e-05, "loss": 0.5612, "step": 989 }, { "epoch": 0.4441098983816334, "grad_norm": 0.3351368308067322, "learning_rate": 9.941050960481906e-05, "loss": 0.5748, "step": 990 }, { "epoch": 0.4448626270229582, "grad_norm": 0.25807496905326843, "learning_rate": 9.940810636029496e-05, "loss": 0.5075, "step": 991 }, { "epoch": 0.445615355664283, "grad_norm": 0.3131100535392761, "learning_rate": 9.940569825936709e-05, "loss": 0.5515, "step": 992 }, { "epoch": 0.4463680843056078, "grad_norm": 0.3781971335411072, "learning_rate": 9.940328530229883e-05, "loss": 0.4619, "step": 993 }, { "epoch": 0.4471208129469326, "grad_norm": 0.3018893003463745, "learning_rate": 9.940086748935406e-05, "loss": 0.5312, "step": 994 }, { "epoch": 0.44787354158825743, "grad_norm": 0.2982284724712372, "learning_rate": 9.939844482079718e-05, "loss": 0.5173, "step": 995 }, { "epoch": 0.44862627022958224, "grad_norm": 0.44059160351753235, "learning_rate": 9.939601729689312e-05, "loss": 0.4552, "step": 996 }, { "epoch": 0.44937899887090704, "grad_norm": 0.4979720711708069, "learning_rate": 9.939358491790735e-05, "loss": 0.5566, "step": 997 }, { "epoch": 0.45013172751223185, "grad_norm": 0.32317423820495605, "learning_rate": 9.93911476841059e-05, "loss": 0.4537, "step": 998 }, { "epoch": 0.45088445615355666, "grad_norm": 0.3036741614341736, "learning_rate": 9.938870559575526e-05, "loss": 0.4472, "step": 999 }, { "epoch": 0.45163718479488146, "grad_norm": 0.4216676354408264, "learning_rate": 9.938625865312251e-05, "loss": 0.4385, "step": 1000 }, { "epoch": 0.45163718479488146, "eval_loss": 0.5053092837333679, "eval_runtime": 455.75, "eval_samples_per_second": 21.123, "eval_steps_per_second": 0.66, "step": 1000 }, { "epoch": 0.45238991343620627, "grad_norm": 0.3278834819793701, "learning_rate": 9.938380685647525e-05, "loss": 0.5347, "step": 1001 }, { "epoch": 0.4531426420775311, "grad_norm": 0.282961905002594, "learning_rate": 9.938135020608163e-05, "loss": 0.5263, "step": 1002 }, { "epoch": 0.45389537071885583, "grad_norm": 0.33874639868736267, "learning_rate": 9.937888870221023e-05, "loss": 0.5524, "step": 1003 }, { "epoch": 0.45464809936018064, "grad_norm": 0.33344894647598267, "learning_rate": 9.937642234513032e-05, "loss": 0.5404, "step": 1004 }, { "epoch": 0.45540082800150544, "grad_norm": 0.27457481622695923, "learning_rate": 9.937395113511156e-05, "loss": 0.5493, "step": 1005 }, { "epoch": 0.45615355664283025, "grad_norm": 0.23923473060131073, "learning_rate": 9.937147507242424e-05, "loss": 0.5584, "step": 1006 }, { "epoch": 0.45690628528415506, "grad_norm": 0.2629500925540924, "learning_rate": 9.936899415733911e-05, "loss": 0.5225, "step": 1007 }, { "epoch": 0.45765901392547986, "grad_norm": 0.25259578227996826, "learning_rate": 9.936650839012749e-05, "loss": 0.5526, "step": 1008 }, { "epoch": 0.45841174256680467, "grad_norm": 0.29823940992355347, "learning_rate": 9.93640177710612e-05, "loss": 0.5816, "step": 1009 }, { "epoch": 0.4591644712081295, "grad_norm": 0.2536996603012085, "learning_rate": 9.936152230041264e-05, "loss": 0.4557, "step": 1010 }, { "epoch": 0.4599171998494543, "grad_norm": 0.27122533321380615, "learning_rate": 9.935902197845471e-05, "loss": 0.5504, "step": 1011 }, { "epoch": 0.4606699284907791, "grad_norm": 0.29913005232810974, "learning_rate": 9.93565168054608e-05, "loss": 0.5004, "step": 1012 }, { "epoch": 0.4614226571321039, "grad_norm": 0.31996914744377136, "learning_rate": 9.935400678170492e-05, "loss": 0.4715, "step": 1013 }, { "epoch": 0.4621753857734287, "grad_norm": 0.34630200266838074, "learning_rate": 9.935149190746153e-05, "loss": 0.5294, "step": 1014 }, { "epoch": 0.46292811441475346, "grad_norm": 0.24399127066135406, "learning_rate": 9.934897218300569e-05, "loss": 0.5796, "step": 1015 }, { "epoch": 0.46368084305607826, "grad_norm": 0.2412591427564621, "learning_rate": 9.934644760861292e-05, "loss": 0.5321, "step": 1016 }, { "epoch": 0.46443357169740307, "grad_norm": 0.23482368886470795, "learning_rate": 9.934391818455931e-05, "loss": 0.4772, "step": 1017 }, { "epoch": 0.4651863003387279, "grad_norm": 0.3133549094200134, "learning_rate": 9.934138391112145e-05, "loss": 0.5582, "step": 1018 }, { "epoch": 0.4659390289800527, "grad_norm": 0.24350227415561676, "learning_rate": 9.933884478857655e-05, "loss": 0.6217, "step": 1019 }, { "epoch": 0.4666917576213775, "grad_norm": 0.2260594666004181, "learning_rate": 9.933630081720224e-05, "loss": 0.6157, "step": 1020 }, { "epoch": 0.4674444862627023, "grad_norm": 0.2222922146320343, "learning_rate": 9.933375199727672e-05, "loss": 0.5343, "step": 1021 }, { "epoch": 0.4681972149040271, "grad_norm": 0.2779523432254791, "learning_rate": 9.933119832907873e-05, "loss": 0.4986, "step": 1022 }, { "epoch": 0.4689499435453519, "grad_norm": 0.20429456233978271, "learning_rate": 9.932863981288757e-05, "loss": 0.5592, "step": 1023 }, { "epoch": 0.4697026721866767, "grad_norm": 0.24868004024028778, "learning_rate": 9.932607644898299e-05, "loss": 0.5906, "step": 1024 }, { "epoch": 0.4704554008280015, "grad_norm": 0.2575432062149048, "learning_rate": 9.932350823764534e-05, "loss": 0.5966, "step": 1025 }, { "epoch": 0.47120812946932633, "grad_norm": 0.2270393967628479, "learning_rate": 9.932093517915546e-05, "loss": 0.584, "step": 1026 }, { "epoch": 0.47196085811065114, "grad_norm": 0.2206883281469345, "learning_rate": 9.931835727379474e-05, "loss": 0.5984, "step": 1027 }, { "epoch": 0.4727135867519759, "grad_norm": 0.2052450329065323, "learning_rate": 9.931577452184512e-05, "loss": 0.5349, "step": 1028 }, { "epoch": 0.4734663153933007, "grad_norm": 0.21915306150913239, "learning_rate": 9.931318692358901e-05, "loss": 0.495, "step": 1029 }, { "epoch": 0.4742190440346255, "grad_norm": 0.2277926504611969, "learning_rate": 9.93105944793094e-05, "loss": 0.5415, "step": 1030 }, { "epoch": 0.4749717726759503, "grad_norm": 0.21859507262706757, "learning_rate": 9.93079971892898e-05, "loss": 0.4328, "step": 1031 }, { "epoch": 0.4757245013172751, "grad_norm": 0.23208613693714142, "learning_rate": 9.930539505381426e-05, "loss": 0.5059, "step": 1032 }, { "epoch": 0.4764772299585999, "grad_norm": 0.23948131501674652, "learning_rate": 9.930278807316729e-05, "loss": 0.4306, "step": 1033 }, { "epoch": 0.47722995859992473, "grad_norm": 0.2437470257282257, "learning_rate": 9.930017624763406e-05, "loss": 0.566, "step": 1034 }, { "epoch": 0.47798268724124954, "grad_norm": 0.2654734253883362, "learning_rate": 9.929755957750015e-05, "loss": 0.4765, "step": 1035 }, { "epoch": 0.47873541588257434, "grad_norm": 0.2508060336112976, "learning_rate": 9.929493806305173e-05, "loss": 0.3503, "step": 1036 }, { "epoch": 0.47948814452389915, "grad_norm": 0.26740455627441406, "learning_rate": 9.929231170457546e-05, "loss": 0.5287, "step": 1037 }, { "epoch": 0.48024087316522396, "grad_norm": 0.28857776522636414, "learning_rate": 9.928968050235861e-05, "loss": 0.5635, "step": 1038 }, { "epoch": 0.48099360180654877, "grad_norm": 0.23665855824947357, "learning_rate": 9.928704445668886e-05, "loss": 0.5978, "step": 1039 }, { "epoch": 0.4817463304478735, "grad_norm": 0.23777931928634644, "learning_rate": 9.928440356785453e-05, "loss": 0.5645, "step": 1040 }, { "epoch": 0.4824990590891983, "grad_norm": 0.2791295051574707, "learning_rate": 9.928175783614438e-05, "loss": 0.5223, "step": 1041 }, { "epoch": 0.48325178773052313, "grad_norm": 0.2229326367378235, "learning_rate": 9.92791072618478e-05, "loss": 0.5503, "step": 1042 }, { "epoch": 0.48400451637184794, "grad_norm": 0.2760826349258423, "learning_rate": 9.927645184525462e-05, "loss": 0.4447, "step": 1043 }, { "epoch": 0.48475724501317274, "grad_norm": 0.33273959159851074, "learning_rate": 9.92737915866552e-05, "loss": 0.5318, "step": 1044 }, { "epoch": 0.48550997365449755, "grad_norm": 0.2699577808380127, "learning_rate": 9.927112648634053e-05, "loss": 0.5707, "step": 1045 }, { "epoch": 0.48626270229582236, "grad_norm": 0.2310740202665329, "learning_rate": 9.926845654460202e-05, "loss": 0.4728, "step": 1046 }, { "epoch": 0.48701543093714716, "grad_norm": 0.24801301956176758, "learning_rate": 9.926578176173166e-05, "loss": 0.4509, "step": 1047 }, { "epoch": 0.48776815957847197, "grad_norm": 0.3163672387599945, "learning_rate": 9.926310213802196e-05, "loss": 0.5699, "step": 1048 }, { "epoch": 0.4885208882197968, "grad_norm": 0.3586770296096802, "learning_rate": 9.926041767376594e-05, "loss": 0.5498, "step": 1049 }, { "epoch": 0.4892736168611216, "grad_norm": 0.3467787802219391, "learning_rate": 9.92577283692572e-05, "loss": 0.5394, "step": 1050 }, { "epoch": 0.4900263455024464, "grad_norm": 0.3736812174320221, "learning_rate": 9.925503422478984e-05, "loss": 0.5838, "step": 1051 }, { "epoch": 0.49077907414377114, "grad_norm": 0.2768717110157013, "learning_rate": 9.925233524065847e-05, "loss": 0.5632, "step": 1052 }, { "epoch": 0.49153180278509595, "grad_norm": 0.25592750310897827, "learning_rate": 9.924963141715824e-05, "loss": 0.4272, "step": 1053 }, { "epoch": 0.49228453142642076, "grad_norm": 0.27634164690971375, "learning_rate": 9.924692275458485e-05, "loss": 0.5106, "step": 1054 }, { "epoch": 0.49303726006774556, "grad_norm": 0.2757260799407959, "learning_rate": 9.92442092532345e-05, "loss": 0.4452, "step": 1055 }, { "epoch": 0.49378998870907037, "grad_norm": 0.21319328248500824, "learning_rate": 9.924149091340397e-05, "loss": 0.515, "step": 1056 }, { "epoch": 0.4945427173503952, "grad_norm": 0.28375956416130066, "learning_rate": 9.92387677353905e-05, "loss": 0.433, "step": 1057 }, { "epoch": 0.49529544599172, "grad_norm": 0.3011733591556549, "learning_rate": 9.923603971949189e-05, "loss": 0.5383, "step": 1058 }, { "epoch": 0.4960481746330448, "grad_norm": 0.267333060503006, "learning_rate": 9.92333068660065e-05, "loss": 0.5849, "step": 1059 }, { "epoch": 0.4968009032743696, "grad_norm": 0.2787369191646576, "learning_rate": 9.923056917523317e-05, "loss": 0.5142, "step": 1060 }, { "epoch": 0.4975536319156944, "grad_norm": 0.2665266692638397, "learning_rate": 9.922782664747129e-05, "loss": 0.4823, "step": 1061 }, { "epoch": 0.4983063605570192, "grad_norm": 0.26259467005729675, "learning_rate": 9.92250792830208e-05, "loss": 0.4579, "step": 1062 }, { "epoch": 0.499059089198344, "grad_norm": 0.3617941737174988, "learning_rate": 9.92223270821821e-05, "loss": 0.5007, "step": 1063 }, { "epoch": 0.4998118178396688, "grad_norm": 0.30651313066482544, "learning_rate": 9.921957004525622e-05, "loss": 0.4238, "step": 1064 }, { "epoch": 0.5005645464809936, "grad_norm": 0.2603810131549835, "learning_rate": 9.921680817254465e-05, "loss": 0.428, "step": 1065 }, { "epoch": 0.5013172751223184, "grad_norm": 0.3736288845539093, "learning_rate": 9.92140414643494e-05, "loss": 0.3911, "step": 1066 }, { "epoch": 0.5020700037636432, "grad_norm": 0.3069724440574646, "learning_rate": 9.921126992097306e-05, "loss": 0.487, "step": 1067 }, { "epoch": 0.502822732404968, "grad_norm": 0.2834354341030121, "learning_rate": 9.92084935427187e-05, "loss": 0.5129, "step": 1068 }, { "epoch": 0.5035754610462928, "grad_norm": 0.25631874799728394, "learning_rate": 9.920571232988996e-05, "loss": 0.4595, "step": 1069 }, { "epoch": 0.5043281896876176, "grad_norm": 0.25483864545822144, "learning_rate": 9.920292628279099e-05, "loss": 0.4136, "step": 1070 }, { "epoch": 0.5050809183289424, "grad_norm": 0.2856726348400116, "learning_rate": 9.920013540172645e-05, "loss": 0.5595, "step": 1071 }, { "epoch": 0.5058336469702672, "grad_norm": 0.29694074392318726, "learning_rate": 9.919733968700157e-05, "loss": 0.4591, "step": 1072 }, { "epoch": 0.506586375611592, "grad_norm": 0.24492426216602325, "learning_rate": 9.919453913892208e-05, "loss": 0.4667, "step": 1073 }, { "epoch": 0.5073391042529168, "grad_norm": 0.24951130151748657, "learning_rate": 9.91917337577942e-05, "loss": 0.5865, "step": 1074 }, { "epoch": 0.5080918328942416, "grad_norm": 0.23692971467971802, "learning_rate": 9.918892354392477e-05, "loss": 0.386, "step": 1075 }, { "epoch": 0.5088445615355665, "grad_norm": 0.25920066237449646, "learning_rate": 9.91861084976211e-05, "loss": 0.3868, "step": 1076 }, { "epoch": 0.5095972901768913, "grad_norm": 0.2656886577606201, "learning_rate": 9.918328861919104e-05, "loss": 0.5714, "step": 1077 }, { "epoch": 0.5103500188182161, "grad_norm": 0.2935027480125427, "learning_rate": 9.918046390894297e-05, "loss": 0.5303, "step": 1078 }, { "epoch": 0.5111027474595409, "grad_norm": 0.26146551966667175, "learning_rate": 9.917763436718579e-05, "loss": 0.6425, "step": 1079 }, { "epoch": 0.5118554761008657, "grad_norm": 0.292454332113266, "learning_rate": 9.917479999422893e-05, "loss": 0.5022, "step": 1080 }, { "epoch": 0.5126082047421905, "grad_norm": 0.2848522961139679, "learning_rate": 9.917196079038237e-05, "loss": 0.4909, "step": 1081 }, { "epoch": 0.5133609333835153, "grad_norm": 0.34108594059944153, "learning_rate": 9.916911675595656e-05, "loss": 0.5007, "step": 1082 }, { "epoch": 0.51411366202484, "grad_norm": 0.32286337018013, "learning_rate": 9.916626789126256e-05, "loss": 0.5388, "step": 1083 }, { "epoch": 0.5148663906661648, "grad_norm": 0.2838993966579437, "learning_rate": 9.916341419661193e-05, "loss": 0.4925, "step": 1084 }, { "epoch": 0.5156191193074896, "grad_norm": 0.4149876832962036, "learning_rate": 9.91605556723167e-05, "loss": 0.5815, "step": 1085 }, { "epoch": 0.5163718479488144, "grad_norm": 0.3126734495162964, "learning_rate": 9.91576923186895e-05, "loss": 0.4879, "step": 1086 }, { "epoch": 0.5171245765901392, "grad_norm": 0.3104715049266815, "learning_rate": 9.915482413604347e-05, "loss": 0.4528, "step": 1087 }, { "epoch": 0.517877305231464, "grad_norm": 0.27277445793151855, "learning_rate": 9.915195112469226e-05, "loss": 0.4283, "step": 1088 }, { "epoch": 0.5186300338727888, "grad_norm": 0.42503437399864197, "learning_rate": 9.914907328495003e-05, "loss": 0.4682, "step": 1089 }, { "epoch": 0.5193827625141136, "grad_norm": 0.38715454936027527, "learning_rate": 9.914619061713154e-05, "loss": 0.5219, "step": 1090 }, { "epoch": 0.5201354911554384, "grad_norm": 0.2989271879196167, "learning_rate": 9.914330312155202e-05, "loss": 0.639, "step": 1091 }, { "epoch": 0.5208882197967633, "grad_norm": 0.25879907608032227, "learning_rate": 9.914041079852724e-05, "loss": 0.4848, "step": 1092 }, { "epoch": 0.5216409484380881, "grad_norm": 0.3081519901752472, "learning_rate": 9.913751364837349e-05, "loss": 0.4691, "step": 1093 }, { "epoch": 0.5223936770794129, "grad_norm": 0.27176180481910706, "learning_rate": 9.91346116714076e-05, "loss": 0.5067, "step": 1094 }, { "epoch": 0.5231464057207377, "grad_norm": 0.2671797275543213, "learning_rate": 9.913170486794697e-05, "loss": 0.5393, "step": 1095 }, { "epoch": 0.5238991343620625, "grad_norm": 0.27080485224723816, "learning_rate": 9.91287932383094e-05, "loss": 0.6275, "step": 1096 }, { "epoch": 0.5246518630033873, "grad_norm": 0.2828846871852875, "learning_rate": 9.912587678281338e-05, "loss": 0.4938, "step": 1097 }, { "epoch": 0.5254045916447121, "grad_norm": 0.24295802414417267, "learning_rate": 9.91229555017778e-05, "loss": 0.5042, "step": 1098 }, { "epoch": 0.5261573202860369, "grad_norm": 0.24335606396198273, "learning_rate": 9.912002939552215e-05, "loss": 0.5111, "step": 1099 }, { "epoch": 0.5269100489273617, "grad_norm": 0.2628116309642792, "learning_rate": 9.911709846436641e-05, "loss": 0.4577, "step": 1100 }, { "epoch": 0.5276627775686865, "grad_norm": 0.3212801218032837, "learning_rate": 9.911416270863113e-05, "loss": 0.3953, "step": 1101 }, { "epoch": 0.5284155062100113, "grad_norm": 0.2955687344074249, "learning_rate": 9.911122212863734e-05, "loss": 0.518, "step": 1102 }, { "epoch": 0.5291682348513361, "grad_norm": 0.2790624797344208, "learning_rate": 9.91082767247066e-05, "loss": 0.4971, "step": 1103 }, { "epoch": 0.5299209634926609, "grad_norm": 0.3506956100463867, "learning_rate": 9.910532649716105e-05, "loss": 0.5589, "step": 1104 }, { "epoch": 0.5306736921339857, "grad_norm": 0.3324027955532074, "learning_rate": 9.91023714463233e-05, "loss": 0.5273, "step": 1105 }, { "epoch": 0.5314264207753105, "grad_norm": 0.25228431820869446, "learning_rate": 9.909941157251651e-05, "loss": 0.4544, "step": 1106 }, { "epoch": 0.5321791494166354, "grad_norm": 0.29727989435195923, "learning_rate": 9.909644687606438e-05, "loss": 0.503, "step": 1107 }, { "epoch": 0.53293187805796, "grad_norm": 0.27363112568855286, "learning_rate": 9.909347735729111e-05, "loss": 0.4858, "step": 1108 }, { "epoch": 0.5336846066992849, "grad_norm": 0.23663125932216644, "learning_rate": 9.909050301652145e-05, "loss": 0.4344, "step": 1109 }, { "epoch": 0.5344373353406097, "grad_norm": 0.24199628829956055, "learning_rate": 9.908752385408067e-05, "loss": 0.6002, "step": 1110 }, { "epoch": 0.5351900639819345, "grad_norm": 0.26234570145606995, "learning_rate": 9.908453987029459e-05, "loss": 0.5124, "step": 1111 }, { "epoch": 0.5359427926232593, "grad_norm": 0.26292750239372253, "learning_rate": 9.908155106548947e-05, "loss": 0.6118, "step": 1112 }, { "epoch": 0.5366955212645841, "grad_norm": 0.22877955436706543, "learning_rate": 9.907855743999223e-05, "loss": 0.4668, "step": 1113 }, { "epoch": 0.5374482499059089, "grad_norm": 0.22189679741859436, "learning_rate": 9.90755589941302e-05, "loss": 0.4638, "step": 1114 }, { "epoch": 0.5382009785472337, "grad_norm": 0.24692784249782562, "learning_rate": 9.907255572823133e-05, "loss": 0.5178, "step": 1115 }, { "epoch": 0.5389537071885585, "grad_norm": 0.217030867934227, "learning_rate": 9.906954764262401e-05, "loss": 0.4456, "step": 1116 }, { "epoch": 0.5397064358298833, "grad_norm": 0.2657017707824707, "learning_rate": 9.90665347376372e-05, "loss": 0.5504, "step": 1117 }, { "epoch": 0.5404591644712081, "grad_norm": 0.24807143211364746, "learning_rate": 9.906351701360044e-05, "loss": 0.4541, "step": 1118 }, { "epoch": 0.5412118931125329, "grad_norm": 0.21768467128276825, "learning_rate": 9.90604944708437e-05, "loss": 0.5205, "step": 1119 }, { "epoch": 0.5419646217538577, "grad_norm": 0.21517729759216309, "learning_rate": 9.905746710969752e-05, "loss": 0.4936, "step": 1120 }, { "epoch": 0.5427173503951825, "grad_norm": 0.21847064793109894, "learning_rate": 9.905443493049296e-05, "loss": 0.5141, "step": 1121 }, { "epoch": 0.5434700790365073, "grad_norm": 0.21460147202014923, "learning_rate": 9.905139793356167e-05, "loss": 0.5043, "step": 1122 }, { "epoch": 0.5442228076778322, "grad_norm": 0.23684439063072205, "learning_rate": 9.904835611923571e-05, "loss": 0.5057, "step": 1123 }, { "epoch": 0.544975536319157, "grad_norm": 0.3188186287879944, "learning_rate": 9.904530948784774e-05, "loss": 0.6126, "step": 1124 }, { "epoch": 0.5457282649604818, "grad_norm": 0.3356349468231201, "learning_rate": 9.904225803973094e-05, "loss": 0.4576, "step": 1125 }, { "epoch": 0.5464809936018066, "grad_norm": 0.2957243323326111, "learning_rate": 9.903920177521906e-05, "loss": 0.5299, "step": 1126 }, { "epoch": 0.5472337222431314, "grad_norm": 0.3821102976799011, "learning_rate": 9.903614069464625e-05, "loss": 0.4994, "step": 1127 }, { "epoch": 0.5479864508844562, "grad_norm": 0.25464391708374023, "learning_rate": 9.903307479834731e-05, "loss": 0.4692, "step": 1128 }, { "epoch": 0.548739179525781, "grad_norm": 0.3213479518890381, "learning_rate": 9.903000408665752e-05, "loss": 0.5244, "step": 1129 }, { "epoch": 0.5494919081671058, "grad_norm": 0.2897646725177765, "learning_rate": 9.902692855991266e-05, "loss": 0.5275, "step": 1130 }, { "epoch": 0.5502446368084306, "grad_norm": 0.2620808184146881, "learning_rate": 9.902384821844911e-05, "loss": 0.3956, "step": 1131 }, { "epoch": 0.5509973654497554, "grad_norm": 0.270736426115036, "learning_rate": 9.90207630626037e-05, "loss": 0.5331, "step": 1132 }, { "epoch": 0.5517500940910801, "grad_norm": 0.2915724515914917, "learning_rate": 9.901767309271383e-05, "loss": 0.5346, "step": 1133 }, { "epoch": 0.5525028227324049, "grad_norm": 0.23187798261642456, "learning_rate": 9.901457830911739e-05, "loss": 0.4297, "step": 1134 }, { "epoch": 0.5532555513737297, "grad_norm": 0.26172617077827454, "learning_rate": 9.901147871215286e-05, "loss": 0.5583, "step": 1135 }, { "epoch": 0.5540082800150545, "grad_norm": 0.25966599583625793, "learning_rate": 9.900837430215918e-05, "loss": 0.5541, "step": 1136 }, { "epoch": 0.5547610086563793, "grad_norm": 0.22829768061637878, "learning_rate": 9.900526507947584e-05, "loss": 0.483, "step": 1137 }, { "epoch": 0.5555137372977041, "grad_norm": 0.22879011929035187, "learning_rate": 9.90021510444429e-05, "loss": 0.4931, "step": 1138 }, { "epoch": 0.556266465939029, "grad_norm": 0.25198566913604736, "learning_rate": 9.899903219740087e-05, "loss": 0.525, "step": 1139 }, { "epoch": 0.5570191945803538, "grad_norm": 0.2345810979604721, "learning_rate": 9.899590853869082e-05, "loss": 0.5421, "step": 1140 }, { "epoch": 0.5577719232216786, "grad_norm": 0.2553841769695282, "learning_rate": 9.899278006865437e-05, "loss": 0.5071, "step": 1141 }, { "epoch": 0.5585246518630034, "grad_norm": 0.49084654450416565, "learning_rate": 9.898964678763362e-05, "loss": 0.5928, "step": 1142 }, { "epoch": 0.5592773805043282, "grad_norm": 0.2702839970588684, "learning_rate": 9.898650869597124e-05, "loss": 0.4702, "step": 1143 }, { "epoch": 0.560030109145653, "grad_norm": 0.3338477313518524, "learning_rate": 9.898336579401042e-05, "loss": 0.4426, "step": 1144 }, { "epoch": 0.5607828377869778, "grad_norm": 0.32494211196899414, "learning_rate": 9.898021808209483e-05, "loss": 0.4459, "step": 1145 }, { "epoch": 0.5615355664283026, "grad_norm": 0.37375083565711975, "learning_rate": 9.897706556056872e-05, "loss": 0.4217, "step": 1146 }, { "epoch": 0.5622882950696274, "grad_norm": 0.29669490456581116, "learning_rate": 9.897390822977682e-05, "loss": 0.4328, "step": 1147 }, { "epoch": 0.5630410237109522, "grad_norm": 0.24440810084342957, "learning_rate": 9.897074609006444e-05, "loss": 0.4551, "step": 1148 }, { "epoch": 0.563793752352277, "grad_norm": 0.3103405833244324, "learning_rate": 9.896757914177738e-05, "loss": 0.4301, "step": 1149 }, { "epoch": 0.5645464809936018, "grad_norm": 0.30300605297088623, "learning_rate": 9.896440738526198e-05, "loss": 0.5659, "step": 1150 }, { "epoch": 0.5652992096349266, "grad_norm": 0.2549082934856415, "learning_rate": 9.896123082086507e-05, "loss": 0.4989, "step": 1151 }, { "epoch": 0.5660519382762514, "grad_norm": 0.27417492866516113, "learning_rate": 9.895804944893407e-05, "loss": 0.5046, "step": 1152 }, { "epoch": 0.5668046669175762, "grad_norm": 0.2183150351047516, "learning_rate": 9.895486326981684e-05, "loss": 0.5639, "step": 1153 }, { "epoch": 0.5675573955589011, "grad_norm": 0.30387526750564575, "learning_rate": 9.895167228386188e-05, "loss": 0.4996, "step": 1154 }, { "epoch": 0.5683101242002259, "grad_norm": 0.3274269104003906, "learning_rate": 9.89484764914181e-05, "loss": 0.4214, "step": 1155 }, { "epoch": 0.5690628528415507, "grad_norm": 0.31911012530326843, "learning_rate": 9.894527589283501e-05, "loss": 0.5409, "step": 1156 }, { "epoch": 0.5698155814828755, "grad_norm": 0.2389630228281021, "learning_rate": 9.894207048846263e-05, "loss": 0.5378, "step": 1157 }, { "epoch": 0.5705683101242002, "grad_norm": 0.2983584403991699, "learning_rate": 9.893886027865148e-05, "loss": 0.4614, "step": 1158 }, { "epoch": 0.571321038765525, "grad_norm": 0.33584803342819214, "learning_rate": 9.893564526375263e-05, "loss": 0.4878, "step": 1159 }, { "epoch": 0.5720737674068498, "grad_norm": 0.24252860248088837, "learning_rate": 9.893242544411769e-05, "loss": 0.4307, "step": 1160 }, { "epoch": 0.5728264960481746, "grad_norm": 0.2726866602897644, "learning_rate": 9.892920082009872e-05, "loss": 0.5482, "step": 1161 }, { "epoch": 0.5735792246894994, "grad_norm": 0.27285417914390564, "learning_rate": 9.892597139204842e-05, "loss": 0.4748, "step": 1162 }, { "epoch": 0.5743319533308242, "grad_norm": 0.2631291449069977, "learning_rate": 9.892273716031991e-05, "loss": 0.5655, "step": 1163 }, { "epoch": 0.575084681972149, "grad_norm": 0.2863776683807373, "learning_rate": 9.891949812526691e-05, "loss": 0.4624, "step": 1164 }, { "epoch": 0.5758374106134738, "grad_norm": 0.2703903913497925, "learning_rate": 9.891625428724363e-05, "loss": 0.5841, "step": 1165 }, { "epoch": 0.5765901392547986, "grad_norm": 0.22141939401626587, "learning_rate": 9.891300564660481e-05, "loss": 0.4573, "step": 1166 }, { "epoch": 0.5773428678961234, "grad_norm": 0.23708957433700562, "learning_rate": 9.890975220370572e-05, "loss": 0.428, "step": 1167 }, { "epoch": 0.5780955965374482, "grad_norm": 0.268694669008255, "learning_rate": 9.890649395890213e-05, "loss": 0.4459, "step": 1168 }, { "epoch": 0.578848325178773, "grad_norm": 0.24643541872501373, "learning_rate": 9.890323091255039e-05, "loss": 0.5392, "step": 1169 }, { "epoch": 0.5796010538200979, "grad_norm": 0.3139578402042389, "learning_rate": 9.889996306500732e-05, "loss": 0.5575, "step": 1170 }, { "epoch": 0.5803537824614227, "grad_norm": 0.2550467848777771, "learning_rate": 9.889669041663029e-05, "loss": 0.5186, "step": 1171 }, { "epoch": 0.5811065111027475, "grad_norm": 0.22912271320819855, "learning_rate": 9.889341296777719e-05, "loss": 0.4801, "step": 1172 }, { "epoch": 0.5818592397440723, "grad_norm": 0.325141042470932, "learning_rate": 9.889013071880645e-05, "loss": 0.4377, "step": 1173 }, { "epoch": 0.5826119683853971, "grad_norm": 0.25600284337997437, "learning_rate": 9.888684367007698e-05, "loss": 0.438, "step": 1174 }, { "epoch": 0.5833646970267219, "grad_norm": 0.2556229531764984, "learning_rate": 9.888355182194829e-05, "loss": 0.3917, "step": 1175 }, { "epoch": 0.5841174256680467, "grad_norm": 0.26027339696884155, "learning_rate": 9.888025517478034e-05, "loss": 0.4473, "step": 1176 }, { "epoch": 0.5848701543093715, "grad_norm": 0.2413770705461502, "learning_rate": 9.887695372893367e-05, "loss": 0.4754, "step": 1177 }, { "epoch": 0.5856228829506963, "grad_norm": 0.25894418358802795, "learning_rate": 9.887364748476929e-05, "loss": 0.4776, "step": 1178 }, { "epoch": 0.5863756115920211, "grad_norm": 0.3487806022167206, "learning_rate": 9.88703364426488e-05, "loss": 0.504, "step": 1179 }, { "epoch": 0.5871283402333459, "grad_norm": 0.35467463731765747, "learning_rate": 9.886702060293428e-05, "loss": 0.4991, "step": 1180 }, { "epoch": 0.5878810688746707, "grad_norm": 0.2744213938713074, "learning_rate": 9.886369996598832e-05, "loss": 0.4129, "step": 1181 }, { "epoch": 0.5886337975159955, "grad_norm": 0.2817953824996948, "learning_rate": 9.88603745321741e-05, "loss": 0.5226, "step": 1182 }, { "epoch": 0.5893865261573202, "grad_norm": 0.326242595911026, "learning_rate": 9.885704430185525e-05, "loss": 0.5591, "step": 1183 }, { "epoch": 0.590139254798645, "grad_norm": 0.3206683397293091, "learning_rate": 9.885370927539598e-05, "loss": 0.5196, "step": 1184 }, { "epoch": 0.5908919834399698, "grad_norm": 0.31755152344703674, "learning_rate": 9.885036945316098e-05, "loss": 0.4636, "step": 1185 }, { "epoch": 0.5916447120812947, "grad_norm": 0.33652275800704956, "learning_rate": 9.884702483551553e-05, "loss": 0.4578, "step": 1186 }, { "epoch": 0.5923974407226195, "grad_norm": 0.3105533719062805, "learning_rate": 9.884367542282534e-05, "loss": 0.5005, "step": 1187 }, { "epoch": 0.5931501693639443, "grad_norm": 0.29773643612861633, "learning_rate": 9.884032121545675e-05, "loss": 0.5481, "step": 1188 }, { "epoch": 0.5939028980052691, "grad_norm": 0.3293536901473999, "learning_rate": 9.883696221377653e-05, "loss": 0.4225, "step": 1189 }, { "epoch": 0.5946556266465939, "grad_norm": 0.27235350012779236, "learning_rate": 9.883359841815203e-05, "loss": 0.4795, "step": 1190 }, { "epoch": 0.5954083552879187, "grad_norm": 0.28878524899482727, "learning_rate": 9.883022982895113e-05, "loss": 0.561, "step": 1191 }, { "epoch": 0.5961610839292435, "grad_norm": 0.26449763774871826, "learning_rate": 9.882685644654218e-05, "loss": 0.4674, "step": 1192 }, { "epoch": 0.5969138125705683, "grad_norm": 0.27733203768730164, "learning_rate": 9.88234782712941e-05, "loss": 0.491, "step": 1193 }, { "epoch": 0.5976665412118931, "grad_norm": 0.25534167885780334, "learning_rate": 9.882009530357632e-05, "loss": 0.4468, "step": 1194 }, { "epoch": 0.5984192698532179, "grad_norm": 0.2464534044265747, "learning_rate": 9.88167075437588e-05, "loss": 0.4588, "step": 1195 }, { "epoch": 0.5991719984945427, "grad_norm": 0.23108543455600739, "learning_rate": 9.881331499221202e-05, "loss": 0.558, "step": 1196 }, { "epoch": 0.5999247271358675, "grad_norm": 0.21600891649723053, "learning_rate": 9.880991764930699e-05, "loss": 0.3802, "step": 1197 }, { "epoch": 0.6006774557771923, "grad_norm": 0.2755924165248871, "learning_rate": 9.880651551541523e-05, "loss": 0.498, "step": 1198 }, { "epoch": 0.6014301844185171, "grad_norm": 0.21368075907230377, "learning_rate": 9.88031085909088e-05, "loss": 0.5068, "step": 1199 }, { "epoch": 0.602182913059842, "grad_norm": 0.2623206675052643, "learning_rate": 9.879969687616027e-05, "loss": 0.4666, "step": 1200 }, { "epoch": 0.602182913059842, "eval_loss": 0.4645377993583679, "eval_runtime": 456.2631, "eval_samples_per_second": 21.1, "eval_steps_per_second": 0.66, "step": 1200 }, { "epoch": 0.6029356417011668, "grad_norm": 0.2409641593694687, "learning_rate": 9.879628037154274e-05, "loss": 0.3946, "step": 1201 }, { "epoch": 0.6036883703424916, "grad_norm": 0.22716979682445526, "learning_rate": 9.879285907742984e-05, "loss": 0.4139, "step": 1202 }, { "epoch": 0.6044410989838164, "grad_norm": 0.23580877482891083, "learning_rate": 9.878943299419571e-05, "loss": 0.4699, "step": 1203 }, { "epoch": 0.6051938276251412, "grad_norm": 0.23804743587970734, "learning_rate": 9.8786002122215e-05, "loss": 0.4268, "step": 1204 }, { "epoch": 0.605946556266466, "grad_norm": 0.25705111026763916, "learning_rate": 9.878256646186298e-05, "loss": 0.3531, "step": 1205 }, { "epoch": 0.6066992849077908, "grad_norm": 0.2738749384880066, "learning_rate": 9.877912601351527e-05, "loss": 0.3309, "step": 1206 }, { "epoch": 0.6074520135491156, "grad_norm": 0.28365954756736755, "learning_rate": 9.877568077754819e-05, "loss": 0.5361, "step": 1207 }, { "epoch": 0.6082047421904403, "grad_norm": 0.25380411744117737, "learning_rate": 9.877223075433844e-05, "loss": 0.4591, "step": 1208 }, { "epoch": 0.6089574708317651, "grad_norm": 0.2364906370639801, "learning_rate": 9.876877594426339e-05, "loss": 0.4361, "step": 1209 }, { "epoch": 0.6097101994730899, "grad_norm": 0.21874341368675232, "learning_rate": 9.876531634770078e-05, "loss": 0.5142, "step": 1210 }, { "epoch": 0.6104629281144147, "grad_norm": 0.22504866123199463, "learning_rate": 9.876185196502899e-05, "loss": 0.4738, "step": 1211 }, { "epoch": 0.6112156567557395, "grad_norm": 0.22722840309143066, "learning_rate": 9.875838279662685e-05, "loss": 0.4538, "step": 1212 }, { "epoch": 0.6119683853970643, "grad_norm": 0.2507009506225586, "learning_rate": 9.875490884287377e-05, "loss": 0.4087, "step": 1213 }, { "epoch": 0.6127211140383891, "grad_norm": 0.24375686049461365, "learning_rate": 9.875143010414965e-05, "loss": 0.6847, "step": 1214 }, { "epoch": 0.6134738426797139, "grad_norm": 0.22544464468955994, "learning_rate": 9.874794658083488e-05, "loss": 0.4923, "step": 1215 }, { "epoch": 0.6142265713210388, "grad_norm": 0.2173485904932022, "learning_rate": 9.874445827331047e-05, "loss": 0.4977, "step": 1216 }, { "epoch": 0.6149792999623636, "grad_norm": 0.21548035740852356, "learning_rate": 9.874096518195788e-05, "loss": 0.5563, "step": 1217 }, { "epoch": 0.6157320286036884, "grad_norm": 0.20339354872703552, "learning_rate": 9.873746730715909e-05, "loss": 0.4624, "step": 1218 }, { "epoch": 0.6164847572450132, "grad_norm": 0.1934690773487091, "learning_rate": 9.873396464929663e-05, "loss": 0.4477, "step": 1219 }, { "epoch": 0.617237485886338, "grad_norm": 0.21802006661891937, "learning_rate": 9.873045720875356e-05, "loss": 0.5575, "step": 1220 }, { "epoch": 0.6179902145276628, "grad_norm": 0.20885612070560455, "learning_rate": 9.872694498591342e-05, "loss": 0.5478, "step": 1221 }, { "epoch": 0.6187429431689876, "grad_norm": 0.26553747057914734, "learning_rate": 9.872342798116033e-05, "loss": 0.5741, "step": 1222 }, { "epoch": 0.6194956718103124, "grad_norm": 0.2878028154373169, "learning_rate": 9.87199061948789e-05, "loss": 0.5866, "step": 1223 }, { "epoch": 0.6202484004516372, "grad_norm": 0.3177533745765686, "learning_rate": 9.871637962745425e-05, "loss": 0.397, "step": 1224 }, { "epoch": 0.621001129092962, "grad_norm": 0.3706894516944885, "learning_rate": 9.871284827927205e-05, "loss": 0.5362, "step": 1225 }, { "epoch": 0.6217538577342868, "grad_norm": 0.345068097114563, "learning_rate": 9.870931215071849e-05, "loss": 0.4612, "step": 1226 }, { "epoch": 0.6225065863756116, "grad_norm": 0.26764976978302, "learning_rate": 9.870577124218027e-05, "loss": 0.4527, "step": 1227 }, { "epoch": 0.6232593150169364, "grad_norm": 0.35887038707733154, "learning_rate": 9.87022255540446e-05, "loss": 0.6168, "step": 1228 }, { "epoch": 0.6240120436582612, "grad_norm": 0.2691974639892578, "learning_rate": 9.869867508669927e-05, "loss": 0.4637, "step": 1229 }, { "epoch": 0.624764772299586, "grad_norm": 0.31438761949539185, "learning_rate": 9.869511984053252e-05, "loss": 0.5007, "step": 1230 }, { "epoch": 0.6255175009409109, "grad_norm": 0.3147425651550293, "learning_rate": 9.869155981593317e-05, "loss": 0.5572, "step": 1231 }, { "epoch": 0.6262702295822355, "grad_norm": 0.3267611265182495, "learning_rate": 9.868799501329051e-05, "loss": 0.4528, "step": 1232 }, { "epoch": 0.6270229582235604, "grad_norm": 0.28941574692726135, "learning_rate": 9.868442543299442e-05, "loss": 0.3751, "step": 1233 }, { "epoch": 0.6277756868648852, "grad_norm": 0.24739721417427063, "learning_rate": 9.868085107543523e-05, "loss": 0.4803, "step": 1234 }, { "epoch": 0.62852841550621, "grad_norm": 0.22883157432079315, "learning_rate": 9.867727194100384e-05, "loss": 0.4697, "step": 1235 }, { "epoch": 0.6292811441475348, "grad_norm": 0.2831576466560364, "learning_rate": 9.867368803009166e-05, "loss": 0.4728, "step": 1236 }, { "epoch": 0.6300338727888596, "grad_norm": 0.24303902685642242, "learning_rate": 9.867009934309063e-05, "loss": 0.5303, "step": 1237 }, { "epoch": 0.6307866014301844, "grad_norm": 0.25681284070014954, "learning_rate": 9.866650588039318e-05, "loss": 0.5024, "step": 1238 }, { "epoch": 0.6315393300715092, "grad_norm": 0.2330607771873474, "learning_rate": 9.86629076423923e-05, "loss": 0.344, "step": 1239 }, { "epoch": 0.632292058712834, "grad_norm": 0.24641557037830353, "learning_rate": 9.86593046294815e-05, "loss": 0.3496, "step": 1240 }, { "epoch": 0.6330447873541588, "grad_norm": 0.24080790579319, "learning_rate": 9.865569684205477e-05, "loss": 0.41, "step": 1241 }, { "epoch": 0.6337975159954836, "grad_norm": 0.2378481775522232, "learning_rate": 9.865208428050668e-05, "loss": 0.5479, "step": 1242 }, { "epoch": 0.6345502446368084, "grad_norm": 0.24346303939819336, "learning_rate": 9.864846694523227e-05, "loss": 0.5327, "step": 1243 }, { "epoch": 0.6353029732781332, "grad_norm": 0.25009289383888245, "learning_rate": 9.864484483662714e-05, "loss": 0.5089, "step": 1244 }, { "epoch": 0.636055701919458, "grad_norm": 0.2739221155643463, "learning_rate": 9.864121795508742e-05, "loss": 0.416, "step": 1245 }, { "epoch": 0.6368084305607828, "grad_norm": 0.22683502733707428, "learning_rate": 9.863758630100969e-05, "loss": 0.427, "step": 1246 }, { "epoch": 0.6375611592021077, "grad_norm": 0.25140899419784546, "learning_rate": 9.863394987479114e-05, "loss": 0.4123, "step": 1247 }, { "epoch": 0.6383138878434325, "grad_norm": 0.2554682493209839, "learning_rate": 9.863030867682944e-05, "loss": 0.4976, "step": 1248 }, { "epoch": 0.6390666164847573, "grad_norm": 0.25327736139297485, "learning_rate": 9.862666270752277e-05, "loss": 0.3933, "step": 1249 }, { "epoch": 0.6398193451260821, "grad_norm": 0.27635276317596436, "learning_rate": 9.862301196726987e-05, "loss": 0.3618, "step": 1250 }, { "epoch": 0.6405720737674069, "grad_norm": 0.26549288630485535, "learning_rate": 9.861935645646997e-05, "loss": 0.5058, "step": 1251 }, { "epoch": 0.6413248024087317, "grad_norm": 0.26495036482810974, "learning_rate": 9.86156961755228e-05, "loss": 0.4168, "step": 1252 }, { "epoch": 0.6420775310500565, "grad_norm": 0.26289069652557373, "learning_rate": 9.86120311248287e-05, "loss": 0.5023, "step": 1253 }, { "epoch": 0.6428302596913813, "grad_norm": 0.25565823912620544, "learning_rate": 9.860836130478844e-05, "loss": 0.6164, "step": 1254 }, { "epoch": 0.6435829883327061, "grad_norm": 0.23190253973007202, "learning_rate": 9.860468671580336e-05, "loss": 0.4846, "step": 1255 }, { "epoch": 0.6443357169740309, "grad_norm": 0.21740113198757172, "learning_rate": 9.860100735827528e-05, "loss": 0.3876, "step": 1256 }, { "epoch": 0.6450884456153556, "grad_norm": 0.25386205315589905, "learning_rate": 9.85973232326066e-05, "loss": 0.4488, "step": 1257 }, { "epoch": 0.6458411742566804, "grad_norm": 0.22250834107398987, "learning_rate": 9.859363433920021e-05, "loss": 0.5132, "step": 1258 }, { "epoch": 0.6465939028980052, "grad_norm": 0.3215833902359009, "learning_rate": 9.85899406784595e-05, "loss": 0.5507, "step": 1259 }, { "epoch": 0.64734663153933, "grad_norm": 0.2646942436695099, "learning_rate": 9.858624225078841e-05, "loss": 0.4604, "step": 1260 }, { "epoch": 0.6480993601806548, "grad_norm": 0.24564485251903534, "learning_rate": 9.85825390565914e-05, "loss": 0.5515, "step": 1261 }, { "epoch": 0.6488520888219796, "grad_norm": 0.27368828654289246, "learning_rate": 9.857883109627344e-05, "loss": 0.515, "step": 1262 }, { "epoch": 0.6496048174633045, "grad_norm": 0.2820463478565216, "learning_rate": 9.857511837024003e-05, "loss": 0.511, "step": 1263 }, { "epoch": 0.6503575461046293, "grad_norm": 0.26797112822532654, "learning_rate": 9.857140087889719e-05, "loss": 0.5227, "step": 1264 }, { "epoch": 0.6511102747459541, "grad_norm": 0.20252685248851776, "learning_rate": 9.856767862265147e-05, "loss": 0.3805, "step": 1265 }, { "epoch": 0.6518630033872789, "grad_norm": 0.2888299822807312, "learning_rate": 9.856395160190991e-05, "loss": 0.5427, "step": 1266 }, { "epoch": 0.6526157320286037, "grad_norm": 0.30865737795829773, "learning_rate": 9.85602198170801e-05, "loss": 0.506, "step": 1267 }, { "epoch": 0.6533684606699285, "grad_norm": 0.31675729155540466, "learning_rate": 9.855648326857015e-05, "loss": 0.4646, "step": 1268 }, { "epoch": 0.6541211893112533, "grad_norm": 0.2726430296897888, "learning_rate": 9.855274195678868e-05, "loss": 0.4581, "step": 1269 }, { "epoch": 0.6548739179525781, "grad_norm": 0.2884460389614105, "learning_rate": 9.854899588214481e-05, "loss": 0.5231, "step": 1270 }, { "epoch": 0.6556266465939029, "grad_norm": 0.258710652589798, "learning_rate": 9.854524504504824e-05, "loss": 0.48, "step": 1271 }, { "epoch": 0.6563793752352277, "grad_norm": 0.2564273476600647, "learning_rate": 9.854148944590914e-05, "loss": 0.4403, "step": 1272 }, { "epoch": 0.6571321038765525, "grad_norm": 0.2598719000816345, "learning_rate": 9.853772908513822e-05, "loss": 0.5432, "step": 1273 }, { "epoch": 0.6578848325178773, "grad_norm": 0.23604260385036469, "learning_rate": 9.853396396314669e-05, "loss": 0.5439, "step": 1274 }, { "epoch": 0.6586375611592021, "grad_norm": 0.2513110339641571, "learning_rate": 9.853019408034632e-05, "loss": 0.4147, "step": 1275 }, { "epoch": 0.6593902898005269, "grad_norm": 0.22345522046089172, "learning_rate": 9.85264194371494e-05, "loss": 0.4043, "step": 1276 }, { "epoch": 0.6601430184418517, "grad_norm": 0.23904691636562347, "learning_rate": 9.852264003396866e-05, "loss": 0.3714, "step": 1277 }, { "epoch": 0.6608957470831766, "grad_norm": 0.27157941460609436, "learning_rate": 9.851885587121744e-05, "loss": 0.4727, "step": 1278 }, { "epoch": 0.6616484757245014, "grad_norm": 0.25302764773368835, "learning_rate": 9.851506694930958e-05, "loss": 0.3915, "step": 1279 }, { "epoch": 0.6624012043658262, "grad_norm": 0.28884434700012207, "learning_rate": 9.851127326865942e-05, "loss": 0.4315, "step": 1280 }, { "epoch": 0.663153933007151, "grad_norm": 0.26999300718307495, "learning_rate": 9.850747482968184e-05, "loss": 0.4641, "step": 1281 }, { "epoch": 0.6639066616484757, "grad_norm": 0.30956515669822693, "learning_rate": 9.850367163279222e-05, "loss": 0.4436, "step": 1282 }, { "epoch": 0.6646593902898005, "grad_norm": 0.26993149518966675, "learning_rate": 9.849986367840648e-05, "loss": 0.4721, "step": 1283 }, { "epoch": 0.6654121189311253, "grad_norm": 0.23367835581302643, "learning_rate": 9.849605096694105e-05, "loss": 0.3608, "step": 1284 }, { "epoch": 0.6661648475724501, "grad_norm": 0.24066036939620972, "learning_rate": 9.849223349881289e-05, "loss": 0.4565, "step": 1285 }, { "epoch": 0.6669175762137749, "grad_norm": 0.2392810583114624, "learning_rate": 9.848841127443944e-05, "loss": 0.5755, "step": 1286 }, { "epoch": 0.6676703048550997, "grad_norm": 0.2087058573961258, "learning_rate": 9.848458429423874e-05, "loss": 0.3634, "step": 1287 }, { "epoch": 0.6684230334964245, "grad_norm": 0.20621994137763977, "learning_rate": 9.848075255862927e-05, "loss": 0.5009, "step": 1288 }, { "epoch": 0.6691757621377493, "grad_norm": 0.23677192628383636, "learning_rate": 9.847691606803006e-05, "loss": 0.4241, "step": 1289 }, { "epoch": 0.6699284907790741, "grad_norm": 0.2756303548812866, "learning_rate": 9.84730748228607e-05, "loss": 0.4135, "step": 1290 }, { "epoch": 0.6706812194203989, "grad_norm": 0.30169734358787537, "learning_rate": 9.846922882354123e-05, "loss": 0.5172, "step": 1291 }, { "epoch": 0.6714339480617237, "grad_norm": 0.3121092915534973, "learning_rate": 9.846537807049223e-05, "loss": 0.3686, "step": 1292 }, { "epoch": 0.6721866767030485, "grad_norm": 0.30369246006011963, "learning_rate": 9.846152256413486e-05, "loss": 0.4526, "step": 1293 }, { "epoch": 0.6729394053443734, "grad_norm": 0.2636399567127228, "learning_rate": 9.845766230489071e-05, "loss": 0.5175, "step": 1294 }, { "epoch": 0.6736921339856982, "grad_norm": 0.24080714583396912, "learning_rate": 9.845379729318196e-05, "loss": 0.4438, "step": 1295 }, { "epoch": 0.674444862627023, "grad_norm": 0.2314116358757019, "learning_rate": 9.844992752943125e-05, "loss": 0.4363, "step": 1296 }, { "epoch": 0.6751975912683478, "grad_norm": 0.20962803065776825, "learning_rate": 9.844605301406181e-05, "loss": 0.5063, "step": 1297 }, { "epoch": 0.6759503199096726, "grad_norm": 0.2623697519302368, "learning_rate": 9.844217374749732e-05, "loss": 0.444, "step": 1298 }, { "epoch": 0.6767030485509974, "grad_norm": 0.36156249046325684, "learning_rate": 9.843828973016204e-05, "loss": 0.4732, "step": 1299 }, { "epoch": 0.6774557771923222, "grad_norm": 0.28615131974220276, "learning_rate": 9.84344009624807e-05, "loss": 0.4103, "step": 1300 }, { "epoch": 0.678208505833647, "grad_norm": 0.32240769267082214, "learning_rate": 9.843050744487857e-05, "loss": 0.5162, "step": 1301 }, { "epoch": 0.6789612344749718, "grad_norm": 0.5539597272872925, "learning_rate": 9.842660917778144e-05, "loss": 0.4896, "step": 1302 }, { "epoch": 0.6797139631162966, "grad_norm": 0.5185930132865906, "learning_rate": 9.842270616161562e-05, "loss": 0.497, "step": 1303 }, { "epoch": 0.6804666917576214, "grad_norm": 0.35417991876602173, "learning_rate": 9.841879839680794e-05, "loss": 0.4714, "step": 1304 }, { "epoch": 0.6812194203989462, "grad_norm": 0.3588627576828003, "learning_rate": 9.841488588378575e-05, "loss": 0.4324, "step": 1305 }, { "epoch": 0.681972149040271, "grad_norm": 0.330730140209198, "learning_rate": 9.841096862297691e-05, "loss": 0.4283, "step": 1306 }, { "epoch": 0.6827248776815957, "grad_norm": 0.2766088843345642, "learning_rate": 9.840704661480981e-05, "loss": 0.4834, "step": 1307 }, { "epoch": 0.6834776063229205, "grad_norm": 0.32007113099098206, "learning_rate": 9.840311985971334e-05, "loss": 0.4322, "step": 1308 }, { "epoch": 0.6842303349642453, "grad_norm": 0.26688817143440247, "learning_rate": 9.839918835811695e-05, "loss": 0.3932, "step": 1309 }, { "epoch": 0.6849830636055702, "grad_norm": 0.26363667845726013, "learning_rate": 9.839525211045058e-05, "loss": 0.4696, "step": 1310 }, { "epoch": 0.685735792246895, "grad_norm": 0.2896108627319336, "learning_rate": 9.839131111714467e-05, "loss": 0.4177, "step": 1311 }, { "epoch": 0.6864885208882198, "grad_norm": 0.2571950852870941, "learning_rate": 9.838736537863023e-05, "loss": 0.4645, "step": 1312 }, { "epoch": 0.6872412495295446, "grad_norm": 0.26054349541664124, "learning_rate": 9.838341489533873e-05, "loss": 0.505, "step": 1313 }, { "epoch": 0.6879939781708694, "grad_norm": 0.25071480870246887, "learning_rate": 9.83794596677022e-05, "loss": 0.4961, "step": 1314 }, { "epoch": 0.6887467068121942, "grad_norm": 0.2818361520767212, "learning_rate": 9.837549969615318e-05, "loss": 0.5291, "step": 1315 }, { "epoch": 0.689499435453519, "grad_norm": 0.2276887148618698, "learning_rate": 9.837153498112475e-05, "loss": 0.4546, "step": 1316 }, { "epoch": 0.6902521640948438, "grad_norm": 0.2603112459182739, "learning_rate": 9.836756552305044e-05, "loss": 0.4416, "step": 1317 }, { "epoch": 0.6910048927361686, "grad_norm": 0.23913916945457458, "learning_rate": 9.836359132236439e-05, "loss": 0.4891, "step": 1318 }, { "epoch": 0.6917576213774934, "grad_norm": 0.22872929275035858, "learning_rate": 9.835961237950115e-05, "loss": 0.3802, "step": 1319 }, { "epoch": 0.6925103500188182, "grad_norm": 0.2798226475715637, "learning_rate": 9.835562869489592e-05, "loss": 0.4398, "step": 1320 }, { "epoch": 0.693263078660143, "grad_norm": 0.26251137256622314, "learning_rate": 9.835164026898431e-05, "loss": 0.5084, "step": 1321 }, { "epoch": 0.6940158073014678, "grad_norm": 0.26231321692466736, "learning_rate": 9.834764710220251e-05, "loss": 0.4935, "step": 1322 }, { "epoch": 0.6947685359427926, "grad_norm": 0.28019005060195923, "learning_rate": 9.834364919498719e-05, "loss": 0.5181, "step": 1323 }, { "epoch": 0.6955212645841174, "grad_norm": 0.23454055190086365, "learning_rate": 9.833964654777556e-05, "loss": 0.5207, "step": 1324 }, { "epoch": 0.6962739932254423, "grad_norm": 0.24999743700027466, "learning_rate": 9.833563916100533e-05, "loss": 0.4527, "step": 1325 }, { "epoch": 0.6970267218667671, "grad_norm": 0.2273949682712555, "learning_rate": 9.833162703511475e-05, "loss": 0.4331, "step": 1326 }, { "epoch": 0.6977794505080919, "grad_norm": 0.22926314175128937, "learning_rate": 9.832761017054261e-05, "loss": 0.3449, "step": 1327 }, { "epoch": 0.6985321791494167, "grad_norm": 0.25965115427970886, "learning_rate": 9.832358856772817e-05, "loss": 0.4536, "step": 1328 }, { "epoch": 0.6992849077907415, "grad_norm": 0.2563928961753845, "learning_rate": 9.83195622271112e-05, "loss": 0.4241, "step": 1329 }, { "epoch": 0.7000376364320663, "grad_norm": 0.2311694324016571, "learning_rate": 9.831553114913204e-05, "loss": 0.4072, "step": 1330 }, { "epoch": 0.7007903650733911, "grad_norm": 0.28795236349105835, "learning_rate": 9.831149533423152e-05, "loss": 0.4715, "step": 1331 }, { "epoch": 0.7015430937147158, "grad_norm": 0.2879253923892975, "learning_rate": 9.830745478285098e-05, "loss": 0.4913, "step": 1332 }, { "epoch": 0.7022958223560406, "grad_norm": 0.26271700859069824, "learning_rate": 9.83034094954323e-05, "loss": 0.5515, "step": 1333 }, { "epoch": 0.7030485509973654, "grad_norm": 0.22904306650161743, "learning_rate": 9.829935947241786e-05, "loss": 0.5639, "step": 1334 }, { "epoch": 0.7038012796386902, "grad_norm": 0.2855572998523712, "learning_rate": 9.82953047142506e-05, "loss": 0.4839, "step": 1335 }, { "epoch": 0.704554008280015, "grad_norm": 0.23240108788013458, "learning_rate": 9.829124522137386e-05, "loss": 0.4921, "step": 1336 }, { "epoch": 0.7053067369213398, "grad_norm": 0.2729482352733612, "learning_rate": 9.828718099423166e-05, "loss": 0.5117, "step": 1337 }, { "epoch": 0.7060594655626646, "grad_norm": 0.27860167622566223, "learning_rate": 9.828311203326843e-05, "loss": 0.4754, "step": 1338 }, { "epoch": 0.7068121942039894, "grad_norm": 0.2521733045578003, "learning_rate": 9.827903833892913e-05, "loss": 0.4817, "step": 1339 }, { "epoch": 0.7075649228453142, "grad_norm": 0.2144048660993576, "learning_rate": 9.827495991165928e-05, "loss": 0.4315, "step": 1340 }, { "epoch": 0.708317651486639, "grad_norm": 0.30221468210220337, "learning_rate": 9.827087675190486e-05, "loss": 0.4741, "step": 1341 }, { "epoch": 0.7090703801279639, "grad_norm": 0.3020239472389221, "learning_rate": 9.826678886011243e-05, "loss": 0.5397, "step": 1342 }, { "epoch": 0.7098231087692887, "grad_norm": 0.27302414178848267, "learning_rate": 9.826269623672901e-05, "loss": 0.5601, "step": 1343 }, { "epoch": 0.7105758374106135, "grad_norm": 0.28655359148979187, "learning_rate": 9.825859888220216e-05, "loss": 0.4655, "step": 1344 }, { "epoch": 0.7113285660519383, "grad_norm": 0.2917414903640747, "learning_rate": 9.825449679698002e-05, "loss": 0.4641, "step": 1345 }, { "epoch": 0.7120812946932631, "grad_norm": 0.24867720901966095, "learning_rate": 9.82503899815111e-05, "loss": 0.4153, "step": 1346 }, { "epoch": 0.7128340233345879, "grad_norm": 0.24440576136112213, "learning_rate": 9.824627843624458e-05, "loss": 0.3552, "step": 1347 }, { "epoch": 0.7135867519759127, "grad_norm": 0.25679031014442444, "learning_rate": 9.824216216163006e-05, "loss": 0.532, "step": 1348 }, { "epoch": 0.7143394806172375, "grad_norm": 0.24419783055782318, "learning_rate": 9.823804115811772e-05, "loss": 0.4923, "step": 1349 }, { "epoch": 0.7150922092585623, "grad_norm": 0.25277218222618103, "learning_rate": 9.823391542615817e-05, "loss": 0.4615, "step": 1350 }, { "epoch": 0.7158449378998871, "grad_norm": 0.262210875749588, "learning_rate": 9.822978496620266e-05, "loss": 0.5025, "step": 1351 }, { "epoch": 0.7165976665412119, "grad_norm": 0.26390528678894043, "learning_rate": 9.822564977870284e-05, "loss": 0.4337, "step": 1352 }, { "epoch": 0.7173503951825367, "grad_norm": 0.27395227551460266, "learning_rate": 9.822150986411097e-05, "loss": 0.3882, "step": 1353 }, { "epoch": 0.7181031238238615, "grad_norm": 0.27112698554992676, "learning_rate": 9.821736522287974e-05, "loss": 0.3771, "step": 1354 }, { "epoch": 0.7188558524651864, "grad_norm": 0.25729942321777344, "learning_rate": 9.821321585546244e-05, "loss": 0.3682, "step": 1355 }, { "epoch": 0.7196085811065112, "grad_norm": 0.2440466284751892, "learning_rate": 9.820906176231283e-05, "loss": 0.4072, "step": 1356 }, { "epoch": 0.7203613097478359, "grad_norm": 0.23554636538028717, "learning_rate": 9.820490294388517e-05, "loss": 0.4066, "step": 1357 }, { "epoch": 0.7211140383891607, "grad_norm": 0.20465758442878723, "learning_rate": 9.82007394006343e-05, "loss": 0.3578, "step": 1358 }, { "epoch": 0.7218667670304855, "grad_norm": 0.2207801342010498, "learning_rate": 9.819657113301551e-05, "loss": 0.433, "step": 1359 }, { "epoch": 0.7226194956718103, "grad_norm": 0.25075048208236694, "learning_rate": 9.819239814148465e-05, "loss": 0.444, "step": 1360 }, { "epoch": 0.7233722243131351, "grad_norm": 0.23618640005588531, "learning_rate": 9.818822042649807e-05, "loss": 0.4153, "step": 1361 }, { "epoch": 0.7241249529544599, "grad_norm": 0.20521095395088196, "learning_rate": 9.818403798851264e-05, "loss": 0.3111, "step": 1362 }, { "epoch": 0.7248776815957847, "grad_norm": 0.2883371114730835, "learning_rate": 9.817985082798574e-05, "loss": 0.3793, "step": 1363 }, { "epoch": 0.7256304102371095, "grad_norm": 0.3059525787830353, "learning_rate": 9.817565894537526e-05, "loss": 0.5115, "step": 1364 }, { "epoch": 0.7263831388784343, "grad_norm": 0.259316623210907, "learning_rate": 9.817146234113964e-05, "loss": 0.3853, "step": 1365 }, { "epoch": 0.7271358675197591, "grad_norm": 0.24412520229816437, "learning_rate": 9.816726101573782e-05, "loss": 0.3949, "step": 1366 }, { "epoch": 0.7278885961610839, "grad_norm": 0.3108189105987549, "learning_rate": 9.816305496962923e-05, "loss": 0.5472, "step": 1367 }, { "epoch": 0.7286413248024087, "grad_norm": 0.2538655400276184, "learning_rate": 9.815884420327383e-05, "loss": 0.454, "step": 1368 }, { "epoch": 0.7293940534437335, "grad_norm": 0.2596602737903595, "learning_rate": 9.815462871713212e-05, "loss": 0.4332, "step": 1369 }, { "epoch": 0.7301467820850583, "grad_norm": 0.31265902519226074, "learning_rate": 9.81504085116651e-05, "loss": 0.4457, "step": 1370 }, { "epoch": 0.7308995107263832, "grad_norm": 0.27637195587158203, "learning_rate": 9.814618358733428e-05, "loss": 0.4354, "step": 1371 }, { "epoch": 0.731652239367708, "grad_norm": 0.26866665482521057, "learning_rate": 9.814195394460168e-05, "loss": 0.3358, "step": 1372 }, { "epoch": 0.7324049680090328, "grad_norm": 0.33015334606170654, "learning_rate": 9.813771958392989e-05, "loss": 0.3601, "step": 1373 }, { "epoch": 0.7331576966503576, "grad_norm": 0.3293009102344513, "learning_rate": 9.813348050578191e-05, "loss": 0.389, "step": 1374 }, { "epoch": 0.7339104252916824, "grad_norm": 0.39420178532600403, "learning_rate": 9.812923671062138e-05, "loss": 0.4898, "step": 1375 }, { "epoch": 0.7346631539330072, "grad_norm": 0.28922995924949646, "learning_rate": 9.812498819891235e-05, "loss": 0.4779, "step": 1376 }, { "epoch": 0.735415882574332, "grad_norm": 0.2676287591457367, "learning_rate": 9.812073497111945e-05, "loss": 0.5256, "step": 1377 }, { "epoch": 0.7361686112156568, "grad_norm": 0.3189471364021301, "learning_rate": 9.811647702770782e-05, "loss": 0.4673, "step": 1378 }, { "epoch": 0.7369213398569816, "grad_norm": 0.28738483786582947, "learning_rate": 9.811221436914307e-05, "loss": 0.4445, "step": 1379 }, { "epoch": 0.7376740684983064, "grad_norm": 0.22164154052734375, "learning_rate": 9.810794699589136e-05, "loss": 0.4903, "step": 1380 }, { "epoch": 0.7384267971396311, "grad_norm": 0.3045859634876251, "learning_rate": 9.81036749084194e-05, "loss": 0.5583, "step": 1381 }, { "epoch": 0.7391795257809559, "grad_norm": 0.2586420178413391, "learning_rate": 9.809939810719436e-05, "loss": 0.3404, "step": 1382 }, { "epoch": 0.7399322544222807, "grad_norm": 0.306570827960968, "learning_rate": 9.809511659268394e-05, "loss": 0.4422, "step": 1383 }, { "epoch": 0.7406849830636055, "grad_norm": 0.2532396614551544, "learning_rate": 9.809083036535635e-05, "loss": 0.4015, "step": 1384 }, { "epoch": 0.7414377117049303, "grad_norm": 0.25644952058792114, "learning_rate": 9.808653942568035e-05, "loss": 0.5097, "step": 1385 }, { "epoch": 0.7421904403462551, "grad_norm": 0.23469305038452148, "learning_rate": 9.808224377412517e-05, "loss": 0.4069, "step": 1386 }, { "epoch": 0.74294316898758, "grad_norm": 0.23263882100582123, "learning_rate": 9.807794341116058e-05, "loss": 0.4416, "step": 1387 }, { "epoch": 0.7436958976289048, "grad_norm": 0.24511806666851044, "learning_rate": 9.807363833725688e-05, "loss": 0.4216, "step": 1388 }, { "epoch": 0.7444486262702296, "grad_norm": 0.27566301822662354, "learning_rate": 9.806932855288485e-05, "loss": 0.3343, "step": 1389 }, { "epoch": 0.7452013549115544, "grad_norm": 0.28976359963417053, "learning_rate": 9.806501405851579e-05, "loss": 0.5772, "step": 1390 }, { "epoch": 0.7459540835528792, "grad_norm": 0.3315032720565796, "learning_rate": 9.806069485462155e-05, "loss": 0.4719, "step": 1391 }, { "epoch": 0.746706812194204, "grad_norm": 0.25928911566734314, "learning_rate": 9.805637094167446e-05, "loss": 0.4674, "step": 1392 }, { "epoch": 0.7474595408355288, "grad_norm": 0.2431102693080902, "learning_rate": 9.805204232014738e-05, "loss": 0.3713, "step": 1393 }, { "epoch": 0.7482122694768536, "grad_norm": 0.2733529508113861, "learning_rate": 9.804770899051367e-05, "loss": 0.5584, "step": 1394 }, { "epoch": 0.7489649981181784, "grad_norm": 0.23392844200134277, "learning_rate": 9.804337095324724e-05, "loss": 0.4571, "step": 1395 }, { "epoch": 0.7497177267595032, "grad_norm": 0.24645039439201355, "learning_rate": 9.803902820882247e-05, "loss": 0.4873, "step": 1396 }, { "epoch": 0.750470455400828, "grad_norm": 0.24060571193695068, "learning_rate": 9.803468075771427e-05, "loss": 0.4305, "step": 1397 }, { "epoch": 0.7512231840421528, "grad_norm": 0.22064609825611115, "learning_rate": 9.803032860039811e-05, "loss": 0.3785, "step": 1398 }, { "epoch": 0.7519759126834776, "grad_norm": 0.22054146230220795, "learning_rate": 9.802597173734989e-05, "loss": 0.4087, "step": 1399 }, { "epoch": 0.7527286413248024, "grad_norm": 0.23884668946266174, "learning_rate": 9.80216101690461e-05, "loss": 0.3867, "step": 1400 }, { "epoch": 0.7527286413248024, "eval_loss": 0.42544063925743103, "eval_runtime": 456.3401, "eval_samples_per_second": 21.096, "eval_steps_per_second": 0.66, "step": 1400 }, { "epoch": 0.7534813699661272, "grad_norm": 0.24308478832244873, "learning_rate": 9.80172438959637e-05, "loss": 0.465, "step": 1401 }, { "epoch": 0.754234098607452, "grad_norm": 0.21691569685935974, "learning_rate": 9.801287291858019e-05, "loss": 0.3672, "step": 1402 }, { "epoch": 0.7549868272487769, "grad_norm": 0.2331613451242447, "learning_rate": 9.800849723737355e-05, "loss": 0.4925, "step": 1403 }, { "epoch": 0.7557395558901017, "grad_norm": 0.2450028359889984, "learning_rate": 9.800411685282232e-05, "loss": 0.5052, "step": 1404 }, { "epoch": 0.7564922845314265, "grad_norm": 0.23671095073223114, "learning_rate": 9.799973176540554e-05, "loss": 0.4995, "step": 1405 }, { "epoch": 0.7572450131727512, "grad_norm": 0.2031913697719574, "learning_rate": 9.799534197560274e-05, "loss": 0.3824, "step": 1406 }, { "epoch": 0.757997741814076, "grad_norm": 0.24365940690040588, "learning_rate": 9.799094748389397e-05, "loss": 0.4705, "step": 1407 }, { "epoch": 0.7587504704554008, "grad_norm": 0.23133303225040436, "learning_rate": 9.798654829075983e-05, "loss": 0.3557, "step": 1408 }, { "epoch": 0.7595031990967256, "grad_norm": 0.2800910770893097, "learning_rate": 9.798214439668139e-05, "loss": 0.4057, "step": 1409 }, { "epoch": 0.7602559277380504, "grad_norm": 0.35394924879074097, "learning_rate": 9.797773580214027e-05, "loss": 0.4102, "step": 1410 }, { "epoch": 0.7610086563793752, "grad_norm": 0.24133409559726715, "learning_rate": 9.797332250761857e-05, "loss": 0.3943, "step": 1411 }, { "epoch": 0.7617613850207, "grad_norm": 0.31467917561531067, "learning_rate": 9.796890451359894e-05, "loss": 0.5024, "step": 1412 }, { "epoch": 0.7625141136620248, "grad_norm": 0.26689445972442627, "learning_rate": 9.79644818205645e-05, "loss": 0.3999, "step": 1413 }, { "epoch": 0.7632668423033496, "grad_norm": 0.27019554376602173, "learning_rate": 9.796005442899894e-05, "loss": 0.4465, "step": 1414 }, { "epoch": 0.7640195709446744, "grad_norm": 0.29798683524131775, "learning_rate": 9.795562233938643e-05, "loss": 0.4673, "step": 1415 }, { "epoch": 0.7647722995859992, "grad_norm": 0.2586008906364441, "learning_rate": 9.795118555221161e-05, "loss": 0.3785, "step": 1416 }, { "epoch": 0.765525028227324, "grad_norm": 0.26438191533088684, "learning_rate": 9.794674406795973e-05, "loss": 0.4131, "step": 1417 }, { "epoch": 0.7662777568686489, "grad_norm": 0.2900276184082031, "learning_rate": 9.794229788711648e-05, "loss": 0.4879, "step": 1418 }, { "epoch": 0.7670304855099737, "grad_norm": 0.26908060908317566, "learning_rate": 9.793784701016812e-05, "loss": 0.4413, "step": 1419 }, { "epoch": 0.7677832141512985, "grad_norm": 0.24913007020950317, "learning_rate": 9.793339143760134e-05, "loss": 0.2324, "step": 1420 }, { "epoch": 0.7685359427926233, "grad_norm": 0.2547054886817932, "learning_rate": 9.792893116990345e-05, "loss": 0.3834, "step": 1421 }, { "epoch": 0.7692886714339481, "grad_norm": 0.21681244671344757, "learning_rate": 9.792446620756216e-05, "loss": 0.5169, "step": 1422 }, { "epoch": 0.7700414000752729, "grad_norm": 0.24930457770824432, "learning_rate": 9.791999655106578e-05, "loss": 0.5592, "step": 1423 }, { "epoch": 0.7707941287165977, "grad_norm": 0.2637272775173187, "learning_rate": 9.791552220090312e-05, "loss": 0.3574, "step": 1424 }, { "epoch": 0.7715468573579225, "grad_norm": 0.28731709718704224, "learning_rate": 9.791104315756349e-05, "loss": 0.3895, "step": 1425 }, { "epoch": 0.7722995859992473, "grad_norm": 0.3502761423587799, "learning_rate": 9.790655942153669e-05, "loss": 0.5494, "step": 1426 }, { "epoch": 0.7730523146405721, "grad_norm": 0.34902265667915344, "learning_rate": 9.790207099331303e-05, "loss": 0.4236, "step": 1427 }, { "epoch": 0.7738050432818969, "grad_norm": 0.2920112609863281, "learning_rate": 9.789757787338342e-05, "loss": 0.5314, "step": 1428 }, { "epoch": 0.7745577719232217, "grad_norm": 0.28049737215042114, "learning_rate": 9.789308006223918e-05, "loss": 0.4201, "step": 1429 }, { "epoch": 0.7753105005645465, "grad_norm": 0.33215072751045227, "learning_rate": 9.78885775603722e-05, "loss": 0.3417, "step": 1430 }, { "epoch": 0.7760632292058712, "grad_norm": 0.27321913838386536, "learning_rate": 9.788407036827486e-05, "loss": 0.3691, "step": 1431 }, { "epoch": 0.776815957847196, "grad_norm": 0.32085832953453064, "learning_rate": 9.787955848644004e-05, "loss": 0.457, "step": 1432 }, { "epoch": 0.7775686864885208, "grad_norm": 0.5063838362693787, "learning_rate": 9.78750419153612e-05, "loss": 0.5489, "step": 1433 }, { "epoch": 0.7783214151298457, "grad_norm": 0.4517333209514618, "learning_rate": 9.787052065553221e-05, "loss": 0.4404, "step": 1434 }, { "epoch": 0.7790741437711705, "grad_norm": 0.24217461049556732, "learning_rate": 9.786599470744757e-05, "loss": 0.4651, "step": 1435 }, { "epoch": 0.7798268724124953, "grad_norm": 0.30190274119377136, "learning_rate": 9.786146407160215e-05, "loss": 0.411, "step": 1436 }, { "epoch": 0.7805796010538201, "grad_norm": 0.3099788725376129, "learning_rate": 9.78569287484915e-05, "loss": 0.4216, "step": 1437 }, { "epoch": 0.7813323296951449, "grad_norm": 0.2984677851200104, "learning_rate": 9.785238873861154e-05, "loss": 0.4717, "step": 1438 }, { "epoch": 0.7820850583364697, "grad_norm": 0.28427308797836304, "learning_rate": 9.784784404245877e-05, "loss": 0.5452, "step": 1439 }, { "epoch": 0.7828377869777945, "grad_norm": 0.27466389536857605, "learning_rate": 9.78432946605302e-05, "loss": 0.3835, "step": 1440 }, { "epoch": 0.7835905156191193, "grad_norm": 0.3062269389629364, "learning_rate": 9.783874059332336e-05, "loss": 0.5369, "step": 1441 }, { "epoch": 0.7843432442604441, "grad_norm": 0.22591553628444672, "learning_rate": 9.783418184133623e-05, "loss": 0.3986, "step": 1442 }, { "epoch": 0.7850959729017689, "grad_norm": 0.29234105348587036, "learning_rate": 9.782961840506738e-05, "loss": 0.4392, "step": 1443 }, { "epoch": 0.7858487015430937, "grad_norm": 0.23695768415927887, "learning_rate": 9.782505028501586e-05, "loss": 0.4812, "step": 1444 }, { "epoch": 0.7866014301844185, "grad_norm": 0.26548829674720764, "learning_rate": 9.782047748168122e-05, "loss": 0.5367, "step": 1445 }, { "epoch": 0.7873541588257433, "grad_norm": 0.2503800690174103, "learning_rate": 9.781589999556356e-05, "loss": 0.423, "step": 1446 }, { "epoch": 0.7881068874670681, "grad_norm": 0.24926361441612244, "learning_rate": 9.781131782716343e-05, "loss": 0.4981, "step": 1447 }, { "epoch": 0.788859616108393, "grad_norm": 0.24517816305160522, "learning_rate": 9.780673097698197e-05, "loss": 0.4779, "step": 1448 }, { "epoch": 0.7896123447497178, "grad_norm": 0.290567010641098, "learning_rate": 9.780213944552075e-05, "loss": 0.4543, "step": 1449 }, { "epoch": 0.7903650733910426, "grad_norm": 0.2642931044101715, "learning_rate": 9.779754323328192e-05, "loss": 0.453, "step": 1450 }, { "epoch": 0.7911178020323674, "grad_norm": 0.2401675134897232, "learning_rate": 9.77929423407681e-05, "loss": 0.3911, "step": 1451 }, { "epoch": 0.7918705306736922, "grad_norm": 0.2914387881755829, "learning_rate": 9.778833676848245e-05, "loss": 0.4495, "step": 1452 }, { "epoch": 0.792623259315017, "grad_norm": 0.22348538041114807, "learning_rate": 9.778372651692863e-05, "loss": 0.4107, "step": 1453 }, { "epoch": 0.7933759879563418, "grad_norm": 0.24289460480213165, "learning_rate": 9.777911158661077e-05, "loss": 0.3024, "step": 1454 }, { "epoch": 0.7941287165976666, "grad_norm": 0.21612803637981415, "learning_rate": 9.777449197803362e-05, "loss": 0.5069, "step": 1455 }, { "epoch": 0.7948814452389913, "grad_norm": 0.24933357536792755, "learning_rate": 9.776986769170232e-05, "loss": 0.3959, "step": 1456 }, { "epoch": 0.7956341738803161, "grad_norm": 0.29837334156036377, "learning_rate": 9.77652387281226e-05, "loss": 0.448, "step": 1457 }, { "epoch": 0.7963869025216409, "grad_norm": 0.2503390610218048, "learning_rate": 9.776060508780066e-05, "loss": 0.4875, "step": 1458 }, { "epoch": 0.7971396311629657, "grad_norm": 0.2998829185962677, "learning_rate": 9.775596677124326e-05, "loss": 0.5346, "step": 1459 }, { "epoch": 0.7978923598042905, "grad_norm": 0.24985618889331818, "learning_rate": 9.775132377895759e-05, "loss": 0.4027, "step": 1460 }, { "epoch": 0.7986450884456153, "grad_norm": 0.33576449751853943, "learning_rate": 9.774667611145143e-05, "loss": 0.4265, "step": 1461 }, { "epoch": 0.7993978170869401, "grad_norm": 0.29463598132133484, "learning_rate": 9.774202376923306e-05, "loss": 0.3799, "step": 1462 }, { "epoch": 0.8001505457282649, "grad_norm": 0.22388827800750732, "learning_rate": 9.773736675281122e-05, "loss": 0.4076, "step": 1463 }, { "epoch": 0.8009032743695897, "grad_norm": 0.25424450635910034, "learning_rate": 9.77327050626952e-05, "loss": 0.5021, "step": 1464 }, { "epoch": 0.8016560030109146, "grad_norm": 0.3301420211791992, "learning_rate": 9.772803869939481e-05, "loss": 0.421, "step": 1465 }, { "epoch": 0.8024087316522394, "grad_norm": 0.33241790533065796, "learning_rate": 9.772336766342034e-05, "loss": 0.5025, "step": 1466 }, { "epoch": 0.8031614602935642, "grad_norm": 0.2817162573337555, "learning_rate": 9.771869195528264e-05, "loss": 0.5267, "step": 1467 }, { "epoch": 0.803914188934889, "grad_norm": 0.25507187843322754, "learning_rate": 9.7714011575493e-05, "loss": 0.3714, "step": 1468 }, { "epoch": 0.8046669175762138, "grad_norm": 0.3124147355556488, "learning_rate": 9.770932652456326e-05, "loss": 0.4479, "step": 1469 }, { "epoch": 0.8054196462175386, "grad_norm": 0.24340976774692535, "learning_rate": 9.77046368030058e-05, "loss": 0.3932, "step": 1470 }, { "epoch": 0.8061723748588634, "grad_norm": 0.28956708312034607, "learning_rate": 9.769994241133345e-05, "loss": 0.3545, "step": 1471 }, { "epoch": 0.8069251035001882, "grad_norm": 0.2709154784679413, "learning_rate": 9.769524335005962e-05, "loss": 0.366, "step": 1472 }, { "epoch": 0.807677832141513, "grad_norm": 0.2535223960876465, "learning_rate": 9.769053961969814e-05, "loss": 0.3109, "step": 1473 }, { "epoch": 0.8084305607828378, "grad_norm": 0.2691313624382019, "learning_rate": 9.768583122076344e-05, "loss": 0.4959, "step": 1474 }, { "epoch": 0.8091832894241626, "grad_norm": 0.256678968667984, "learning_rate": 9.768111815377042e-05, "loss": 0.4063, "step": 1475 }, { "epoch": 0.8099360180654874, "grad_norm": 0.297098845243454, "learning_rate": 9.767640041923449e-05, "loss": 0.3608, "step": 1476 }, { "epoch": 0.8106887467068122, "grad_norm": 0.34293133020401, "learning_rate": 9.767167801767158e-05, "loss": 0.4569, "step": 1477 }, { "epoch": 0.811441475348137, "grad_norm": 0.31014469265937805, "learning_rate": 9.766695094959812e-05, "loss": 0.4361, "step": 1478 }, { "epoch": 0.8121942039894618, "grad_norm": 0.28068163990974426, "learning_rate": 9.766221921553104e-05, "loss": 0.4112, "step": 1479 }, { "epoch": 0.8129469326307867, "grad_norm": 0.262540340423584, "learning_rate": 9.765748281598781e-05, "loss": 0.5239, "step": 1480 }, { "epoch": 0.8136996612721114, "grad_norm": 0.30986449122428894, "learning_rate": 9.765274175148638e-05, "loss": 0.5146, "step": 1481 }, { "epoch": 0.8144523899134362, "grad_norm": 0.2656484842300415, "learning_rate": 9.764799602254527e-05, "loss": 0.471, "step": 1482 }, { "epoch": 0.815205118554761, "grad_norm": 0.2651226222515106, "learning_rate": 9.764324562968341e-05, "loss": 0.3952, "step": 1483 }, { "epoch": 0.8159578471960858, "grad_norm": 0.2643630802631378, "learning_rate": 9.763849057342035e-05, "loss": 0.3934, "step": 1484 }, { "epoch": 0.8167105758374106, "grad_norm": 0.22132860124111176, "learning_rate": 9.763373085427603e-05, "loss": 0.3636, "step": 1485 }, { "epoch": 0.8174633044787354, "grad_norm": 0.24194374680519104, "learning_rate": 9.762896647277102e-05, "loss": 0.4824, "step": 1486 }, { "epoch": 0.8182160331200602, "grad_norm": 0.23076246678829193, "learning_rate": 9.762419742942634e-05, "loss": 0.4141, "step": 1487 }, { "epoch": 0.818968761761385, "grad_norm": 0.217947319149971, "learning_rate": 9.76194237247635e-05, "loss": 0.4417, "step": 1488 }, { "epoch": 0.8197214904027098, "grad_norm": 0.24609766900539398, "learning_rate": 9.761464535930456e-05, "loss": 0.3404, "step": 1489 }, { "epoch": 0.8204742190440346, "grad_norm": 0.29783886671066284, "learning_rate": 9.760986233357208e-05, "loss": 0.4236, "step": 1490 }, { "epoch": 0.8212269476853594, "grad_norm": 0.23346109688282013, "learning_rate": 9.760507464808911e-05, "loss": 0.4565, "step": 1491 }, { "epoch": 0.8219796763266842, "grad_norm": 0.2368694543838501, "learning_rate": 9.760028230337923e-05, "loss": 0.5201, "step": 1492 }, { "epoch": 0.822732404968009, "grad_norm": 0.2265361249446869, "learning_rate": 9.759548529996653e-05, "loss": 0.4302, "step": 1493 }, { "epoch": 0.8234851336093338, "grad_norm": 0.2344938963651657, "learning_rate": 9.75906836383756e-05, "loss": 0.5073, "step": 1494 }, { "epoch": 0.8242378622506586, "grad_norm": 0.23537133634090424, "learning_rate": 9.758587731913156e-05, "loss": 0.4123, "step": 1495 }, { "epoch": 0.8249905908919835, "grad_norm": 0.2370491623878479, "learning_rate": 9.758106634275998e-05, "loss": 0.4131, "step": 1496 }, { "epoch": 0.8257433195333083, "grad_norm": 0.25094863772392273, "learning_rate": 9.7576250709787e-05, "loss": 0.5178, "step": 1497 }, { "epoch": 0.8264960481746331, "grad_norm": 0.22807808220386505, "learning_rate": 9.757143042073927e-05, "loss": 0.5049, "step": 1498 }, { "epoch": 0.8272487768159579, "grad_norm": 0.23259077966213226, "learning_rate": 9.756660547614391e-05, "loss": 0.4372, "step": 1499 }, { "epoch": 0.8280015054572827, "grad_norm": 0.2288927584886551, "learning_rate": 9.756177587652856e-05, "loss": 0.4341, "step": 1500 }, { "epoch": 0.8287542340986075, "grad_norm": 0.2619488835334778, "learning_rate": 9.75569416224214e-05, "loss": 0.2455, "step": 1501 }, { "epoch": 0.8295069627399323, "grad_norm": 0.22301772236824036, "learning_rate": 9.755210271435111e-05, "loss": 0.3473, "step": 1502 }, { "epoch": 0.8302596913812571, "grad_norm": 0.3104006052017212, "learning_rate": 9.754725915284681e-05, "loss": 0.4327, "step": 1503 }, { "epoch": 0.8310124200225819, "grad_norm": 0.2515698969364166, "learning_rate": 9.754241093843822e-05, "loss": 0.3916, "step": 1504 }, { "epoch": 0.8317651486639067, "grad_norm": 0.2334437519311905, "learning_rate": 9.753755807165555e-05, "loss": 0.4356, "step": 1505 }, { "epoch": 0.8325178773052314, "grad_norm": 0.2234559953212738, "learning_rate": 9.753270055302947e-05, "loss": 0.3555, "step": 1506 }, { "epoch": 0.8332706059465562, "grad_norm": 0.22368863224983215, "learning_rate": 9.752783838309123e-05, "loss": 0.4, "step": 1507 }, { "epoch": 0.834023334587881, "grad_norm": 0.20855438709259033, "learning_rate": 9.752297156237248e-05, "loss": 0.4183, "step": 1508 }, { "epoch": 0.8347760632292058, "grad_norm": 0.2460399568080902, "learning_rate": 9.751810009140554e-05, "loss": 0.4348, "step": 1509 }, { "epoch": 0.8355287918705306, "grad_norm": 0.20740154385566711, "learning_rate": 9.751322397072307e-05, "loss": 0.3877, "step": 1510 }, { "epoch": 0.8362815205118554, "grad_norm": 0.29780131578445435, "learning_rate": 9.750834320085835e-05, "loss": 0.3462, "step": 1511 }, { "epoch": 0.8370342491531803, "grad_norm": 0.30786630511283875, "learning_rate": 9.750345778234512e-05, "loss": 0.4018, "step": 1512 }, { "epoch": 0.8377869777945051, "grad_norm": 0.33579933643341064, "learning_rate": 9.749856771571766e-05, "loss": 0.4776, "step": 1513 }, { "epoch": 0.8385397064358299, "grad_norm": 0.2682308256626129, "learning_rate": 9.749367300151073e-05, "loss": 0.4509, "step": 1514 }, { "epoch": 0.8392924350771547, "grad_norm": 0.2760584056377411, "learning_rate": 9.748877364025961e-05, "loss": 0.4301, "step": 1515 }, { "epoch": 0.8400451637184795, "grad_norm": 0.3419676423072815, "learning_rate": 9.748386963250009e-05, "loss": 0.343, "step": 1516 }, { "epoch": 0.8407978923598043, "grad_norm": 0.2683289051055908, "learning_rate": 9.747896097876845e-05, "loss": 0.4472, "step": 1517 }, { "epoch": 0.8415506210011291, "grad_norm": 0.29356154799461365, "learning_rate": 9.747404767960151e-05, "loss": 0.4374, "step": 1518 }, { "epoch": 0.8423033496424539, "grad_norm": 0.2743261754512787, "learning_rate": 9.746912973553657e-05, "loss": 0.4208, "step": 1519 }, { "epoch": 0.8430560782837787, "grad_norm": 0.3436763882637024, "learning_rate": 9.746420714711145e-05, "loss": 0.4453, "step": 1520 }, { "epoch": 0.8438088069251035, "grad_norm": 0.29319220781326294, "learning_rate": 9.745927991486448e-05, "loss": 0.2762, "step": 1521 }, { "epoch": 0.8445615355664283, "grad_norm": 0.29934564232826233, "learning_rate": 9.745434803933451e-05, "loss": 0.3917, "step": 1522 }, { "epoch": 0.8453142642077531, "grad_norm": 0.29800182580947876, "learning_rate": 9.744941152106085e-05, "loss": 0.491, "step": 1523 }, { "epoch": 0.8460669928490779, "grad_norm": 0.2715909779071808, "learning_rate": 9.744447036058338e-05, "loss": 0.3592, "step": 1524 }, { "epoch": 0.8468197214904027, "grad_norm": 0.27722421288490295, "learning_rate": 9.743952455844245e-05, "loss": 0.5479, "step": 1525 }, { "epoch": 0.8475724501317276, "grad_norm": 0.27706336975097656, "learning_rate": 9.743457411517892e-05, "loss": 0.3273, "step": 1526 }, { "epoch": 0.8483251787730524, "grad_norm": 0.26040807366371155, "learning_rate": 9.742961903133415e-05, "loss": 0.4435, "step": 1527 }, { "epoch": 0.8490779074143772, "grad_norm": 0.33437103033065796, "learning_rate": 9.742465930745003e-05, "loss": 0.5386, "step": 1528 }, { "epoch": 0.849830636055702, "grad_norm": 0.29603081941604614, "learning_rate": 9.741969494406898e-05, "loss": 0.5513, "step": 1529 }, { "epoch": 0.8505833646970267, "grad_norm": 0.2501540184020996, "learning_rate": 9.741472594173384e-05, "loss": 0.4066, "step": 1530 }, { "epoch": 0.8513360933383515, "grad_norm": 0.249586820602417, "learning_rate": 9.740975230098805e-05, "loss": 0.4815, "step": 1531 }, { "epoch": 0.8520888219796763, "grad_norm": 0.26083657145500183, "learning_rate": 9.740477402237552e-05, "loss": 0.4138, "step": 1532 }, { "epoch": 0.8528415506210011, "grad_norm": 0.23608864843845367, "learning_rate": 9.739979110644066e-05, "loss": 0.4193, "step": 1533 }, { "epoch": 0.8535942792623259, "grad_norm": 0.25850221514701843, "learning_rate": 9.739480355372838e-05, "loss": 0.4497, "step": 1534 }, { "epoch": 0.8543470079036507, "grad_norm": 0.21573321521282196, "learning_rate": 9.738981136478411e-05, "loss": 0.344, "step": 1535 }, { "epoch": 0.8550997365449755, "grad_norm": 0.2614743411540985, "learning_rate": 9.738481454015382e-05, "loss": 0.3809, "step": 1536 }, { "epoch": 0.8558524651863003, "grad_norm": 0.24722614884376526, "learning_rate": 9.737981308038394e-05, "loss": 0.4316, "step": 1537 }, { "epoch": 0.8566051938276251, "grad_norm": 0.24330392479896545, "learning_rate": 9.737480698602142e-05, "loss": 0.4201, "step": 1538 }, { "epoch": 0.8573579224689499, "grad_norm": 0.25050482153892517, "learning_rate": 9.73697962576137e-05, "loss": 0.4568, "step": 1539 }, { "epoch": 0.8581106511102747, "grad_norm": 0.23777051270008087, "learning_rate": 9.736478089570876e-05, "loss": 0.3386, "step": 1540 }, { "epoch": 0.8588633797515995, "grad_norm": 0.24225662648677826, "learning_rate": 9.735976090085509e-05, "loss": 0.3419, "step": 1541 }, { "epoch": 0.8596161083929243, "grad_norm": 0.25706613063812256, "learning_rate": 9.735473627360166e-05, "loss": 0.3488, "step": 1542 }, { "epoch": 0.8603688370342492, "grad_norm": 0.19751642644405365, "learning_rate": 9.734970701449795e-05, "loss": 0.4093, "step": 1543 }, { "epoch": 0.861121565675574, "grad_norm": 0.22627240419387817, "learning_rate": 9.734467312409395e-05, "loss": 0.4513, "step": 1544 }, { "epoch": 0.8618742943168988, "grad_norm": 0.23444485664367676, "learning_rate": 9.733963460294015e-05, "loss": 0.4523, "step": 1545 }, { "epoch": 0.8626270229582236, "grad_norm": 0.2017977386713028, "learning_rate": 9.733459145158758e-05, "loss": 0.4826, "step": 1546 }, { "epoch": 0.8633797515995484, "grad_norm": 0.2235095053911209, "learning_rate": 9.732954367058772e-05, "loss": 0.317, "step": 1547 }, { "epoch": 0.8641324802408732, "grad_norm": 0.30377840995788574, "learning_rate": 9.732449126049262e-05, "loss": 0.3633, "step": 1548 }, { "epoch": 0.864885208882198, "grad_norm": 0.4098929166793823, "learning_rate": 9.73194342218548e-05, "loss": 0.4127, "step": 1549 }, { "epoch": 0.8656379375235228, "grad_norm": 0.7995094656944275, "learning_rate": 9.731437255522727e-05, "loss": 0.4134, "step": 1550 }, { "epoch": 0.8663906661648476, "grad_norm": 1.1637529134750366, "learning_rate": 9.730930626116356e-05, "loss": 0.3767, "step": 1551 }, { "epoch": 0.8671433948061724, "grad_norm": 0.43186941742897034, "learning_rate": 9.730423534021775e-05, "loss": 0.4804, "step": 1552 }, { "epoch": 0.8678961234474972, "grad_norm": 0.9768980145454407, "learning_rate": 9.729915979294434e-05, "loss": 0.4594, "step": 1553 }, { "epoch": 0.868648852088822, "grad_norm": 0.5241792798042297, "learning_rate": 9.729407961989845e-05, "loss": 0.4338, "step": 1554 }, { "epoch": 0.8694015807301467, "grad_norm": 2.553678035736084, "learning_rate": 9.728899482163557e-05, "loss": 0.4878, "step": 1555 }, { "epoch": 0.8701543093714715, "grad_norm": 0.8692716956138611, "learning_rate": 9.728390539871181e-05, "loss": 0.3752, "step": 1556 }, { "epoch": 0.8709070380127963, "grad_norm": 2.337517261505127, "learning_rate": 9.727881135168374e-05, "loss": 0.5178, "step": 1557 }, { "epoch": 0.8716597666541211, "grad_norm": 1.0947922468185425, "learning_rate": 9.72737126811084e-05, "loss": 0.4959, "step": 1558 }, { "epoch": 0.872412495295446, "grad_norm": 1.38123619556427, "learning_rate": 9.726860938754342e-05, "loss": 0.5153, "step": 1559 }, { "epoch": 0.8731652239367708, "grad_norm": 0.7988998889923096, "learning_rate": 9.726350147154684e-05, "loss": 0.3243, "step": 1560 }, { "epoch": 0.8739179525780956, "grad_norm": 1.1834385395050049, "learning_rate": 9.725838893367731e-05, "loss": 0.3925, "step": 1561 }, { "epoch": 0.8746706812194204, "grad_norm": 4.682585716247559, "learning_rate": 9.725327177449389e-05, "loss": 0.5323, "step": 1562 }, { "epoch": 0.8754234098607452, "grad_norm": 0.6164882183074951, "learning_rate": 9.724814999455619e-05, "loss": 0.4592, "step": 1563 }, { "epoch": 0.87617613850207, "grad_norm": 0.6903394460678101, "learning_rate": 9.724302359442434e-05, "loss": 0.4003, "step": 1564 }, { "epoch": 0.8769288671433948, "grad_norm": 0.46710142493247986, "learning_rate": 9.723789257465892e-05, "loss": 0.4412, "step": 1565 }, { "epoch": 0.8776815957847196, "grad_norm": 0.39138102531433105, "learning_rate": 9.72327569358211e-05, "loss": 0.4886, "step": 1566 }, { "epoch": 0.8784343244260444, "grad_norm": 0.37097540497779846, "learning_rate": 9.722761667847246e-05, "loss": 0.5223, "step": 1567 }, { "epoch": 0.8791870530673692, "grad_norm": 0.33947277069091797, "learning_rate": 9.722247180317515e-05, "loss": 0.4526, "step": 1568 }, { "epoch": 0.879939781708694, "grad_norm": 0.33941999077796936, "learning_rate": 9.72173223104918e-05, "loss": 0.355, "step": 1569 }, { "epoch": 0.8806925103500188, "grad_norm": 0.29284512996673584, "learning_rate": 9.721216820098556e-05, "loss": 0.5236, "step": 1570 }, { "epoch": 0.8814452389913436, "grad_norm": 0.3108694553375244, "learning_rate": 9.720700947522007e-05, "loss": 0.4782, "step": 1571 }, { "epoch": 0.8821979676326684, "grad_norm": 0.26843103766441345, "learning_rate": 9.720184613375947e-05, "loss": 0.425, "step": 1572 }, { "epoch": 0.8829506962739933, "grad_norm": 0.2508660554885864, "learning_rate": 9.719667817716844e-05, "loss": 0.4504, "step": 1573 }, { "epoch": 0.8837034249153181, "grad_norm": 0.2579426169395447, "learning_rate": 9.719150560601212e-05, "loss": 0.4364, "step": 1574 }, { "epoch": 0.8844561535566429, "grad_norm": 0.23563402891159058, "learning_rate": 9.71863284208562e-05, "loss": 0.3843, "step": 1575 }, { "epoch": 0.8852088821979677, "grad_norm": 0.24018298089504242, "learning_rate": 9.718114662226681e-05, "loss": 0.3848, "step": 1576 }, { "epoch": 0.8859616108392925, "grad_norm": 0.2537483274936676, "learning_rate": 9.717596021081065e-05, "loss": 0.4491, "step": 1577 }, { "epoch": 0.8867143394806173, "grad_norm": 0.26452627778053284, "learning_rate": 9.717076918705489e-05, "loss": 0.5154, "step": 1578 }, { "epoch": 0.8874670681219421, "grad_norm": 0.22395062446594238, "learning_rate": 9.716557355156721e-05, "loss": 0.4988, "step": 1579 }, { "epoch": 0.8882197967632668, "grad_norm": 0.2558358609676361, "learning_rate": 9.716037330491581e-05, "loss": 0.4532, "step": 1580 }, { "epoch": 0.8889725254045916, "grad_norm": 0.22624269127845764, "learning_rate": 9.715516844766936e-05, "loss": 0.4047, "step": 1581 }, { "epoch": 0.8897252540459164, "grad_norm": 0.2320350706577301, "learning_rate": 9.714995898039709e-05, "loss": 0.3858, "step": 1582 }, { "epoch": 0.8904779826872412, "grad_norm": 0.24495519697666168, "learning_rate": 9.714474490366866e-05, "loss": 0.3638, "step": 1583 }, { "epoch": 0.891230711328566, "grad_norm": 0.21171391010284424, "learning_rate": 9.71395262180543e-05, "loss": 0.3146, "step": 1584 }, { "epoch": 0.8919834399698908, "grad_norm": 0.2925589978694916, "learning_rate": 9.71343029241247e-05, "loss": 0.4767, "step": 1585 }, { "epoch": 0.8927361686112156, "grad_norm": 0.18320739269256592, "learning_rate": 9.712907502245107e-05, "loss": 0.4177, "step": 1586 }, { "epoch": 0.8934888972525404, "grad_norm": 0.28652724623680115, "learning_rate": 9.712384251360517e-05, "loss": 0.4936, "step": 1587 }, { "epoch": 0.8942416258938652, "grad_norm": 0.2306792438030243, "learning_rate": 9.711860539815916e-05, "loss": 0.4384, "step": 1588 }, { "epoch": 0.89499435453519, "grad_norm": 0.2611534595489502, "learning_rate": 9.711336367668579e-05, "loss": 0.441, "step": 1589 }, { "epoch": 0.8957470831765149, "grad_norm": 0.2354220300912857, "learning_rate": 9.71081173497583e-05, "loss": 0.4818, "step": 1590 }, { "epoch": 0.8964998118178397, "grad_norm": 0.2609775960445404, "learning_rate": 9.710286641795037e-05, "loss": 0.3018, "step": 1591 }, { "epoch": 0.8972525404591645, "grad_norm": 0.2336774468421936, "learning_rate": 9.709761088183631e-05, "loss": 0.3812, "step": 1592 }, { "epoch": 0.8980052691004893, "grad_norm": 0.24580392241477966, "learning_rate": 9.709235074199079e-05, "loss": 0.4009, "step": 1593 }, { "epoch": 0.8987579977418141, "grad_norm": 0.2088300734758377, "learning_rate": 9.708708599898909e-05, "loss": 0.433, "step": 1594 }, { "epoch": 0.8995107263831389, "grad_norm": 0.2327253520488739, "learning_rate": 9.708181665340693e-05, "loss": 0.3175, "step": 1595 }, { "epoch": 0.9002634550244637, "grad_norm": 0.19849275052547455, "learning_rate": 9.707654270582057e-05, "loss": 0.4674, "step": 1596 }, { "epoch": 0.9010161836657885, "grad_norm": 0.2334933876991272, "learning_rate": 9.707126415680679e-05, "loss": 0.424, "step": 1597 }, { "epoch": 0.9017689123071133, "grad_norm": 0.22519852221012115, "learning_rate": 9.706598100694279e-05, "loss": 0.3357, "step": 1598 }, { "epoch": 0.9025216409484381, "grad_norm": 0.2636829614639282, "learning_rate": 9.706069325680634e-05, "loss": 0.4815, "step": 1599 }, { "epoch": 0.9032743695897629, "grad_norm": 0.24387620389461517, "learning_rate": 9.705540090697575e-05, "loss": 0.4361, "step": 1600 }, { "epoch": 0.9032743695897629, "eval_loss": 0.3980558514595032, "eval_runtime": 456.3851, "eval_samples_per_second": 21.094, "eval_steps_per_second": 0.66, "step": 1600 }, { "epoch": 0.9040270982310877, "grad_norm": 0.2186548411846161, "learning_rate": 9.705010395802971e-05, "loss": 0.4637, "step": 1601 }, { "epoch": 0.9047798268724125, "grad_norm": 0.20491449534893036, "learning_rate": 9.704480241054755e-05, "loss": 0.3551, "step": 1602 }, { "epoch": 0.9055325555137373, "grad_norm": 0.23424534499645233, "learning_rate": 9.7039496265109e-05, "loss": 0.3253, "step": 1603 }, { "epoch": 0.9062852841550622, "grad_norm": 0.22472074627876282, "learning_rate": 9.703418552229434e-05, "loss": 0.5405, "step": 1604 }, { "epoch": 0.9070380127963868, "grad_norm": 0.20866833627223969, "learning_rate": 9.702887018268435e-05, "loss": 0.3679, "step": 1605 }, { "epoch": 0.9077907414377117, "grad_norm": 0.20031318068504333, "learning_rate": 9.70235502468603e-05, "loss": 0.4217, "step": 1606 }, { "epoch": 0.9085434700790365, "grad_norm": 0.2843174636363983, "learning_rate": 9.7018225715404e-05, "loss": 0.3522, "step": 1607 }, { "epoch": 0.9092961987203613, "grad_norm": 0.25047603249549866, "learning_rate": 9.701289658889769e-05, "loss": 0.3814, "step": 1608 }, { "epoch": 0.9100489273616861, "grad_norm": 0.20225980877876282, "learning_rate": 9.700756286792419e-05, "loss": 0.4935, "step": 1609 }, { "epoch": 0.9108016560030109, "grad_norm": 0.23489975929260254, "learning_rate": 9.700222455306675e-05, "loss": 0.3621, "step": 1610 }, { "epoch": 0.9115543846443357, "grad_norm": 0.2031397968530655, "learning_rate": 9.69968816449092e-05, "loss": 0.3374, "step": 1611 }, { "epoch": 0.9123071132856605, "grad_norm": 0.21000801026821136, "learning_rate": 9.69915341440358e-05, "loss": 0.41, "step": 1612 }, { "epoch": 0.9130598419269853, "grad_norm": 0.22803743183612823, "learning_rate": 9.698618205103138e-05, "loss": 0.4031, "step": 1613 }, { "epoch": 0.9138125705683101, "grad_norm": 0.2103380262851715, "learning_rate": 9.69808253664812e-05, "loss": 0.3889, "step": 1614 }, { "epoch": 0.9145652992096349, "grad_norm": 0.23813128471374512, "learning_rate": 9.697546409097107e-05, "loss": 0.4984, "step": 1615 }, { "epoch": 0.9153180278509597, "grad_norm": 0.24351508915424347, "learning_rate": 9.69700982250873e-05, "loss": 0.5255, "step": 1616 }, { "epoch": 0.9160707564922845, "grad_norm": 0.210999995470047, "learning_rate": 9.696472776941671e-05, "loss": 0.3498, "step": 1617 }, { "epoch": 0.9168234851336093, "grad_norm": 0.27977117896080017, "learning_rate": 9.695935272454656e-05, "loss": 0.2994, "step": 1618 }, { "epoch": 0.9175762137749341, "grad_norm": 0.2011570930480957, "learning_rate": 9.69539730910647e-05, "loss": 0.4093, "step": 1619 }, { "epoch": 0.918328942416259, "grad_norm": 0.24855642020702362, "learning_rate": 9.694858886955939e-05, "loss": 0.45, "step": 1620 }, { "epoch": 0.9190816710575838, "grad_norm": 0.25860100984573364, "learning_rate": 9.694320006061949e-05, "loss": 0.4429, "step": 1621 }, { "epoch": 0.9198343996989086, "grad_norm": 0.2508963346481323, "learning_rate": 9.693780666483429e-05, "loss": 0.3131, "step": 1622 }, { "epoch": 0.9205871283402334, "grad_norm": 0.22092033922672272, "learning_rate": 9.693240868279362e-05, "loss": 0.3852, "step": 1623 }, { "epoch": 0.9213398569815582, "grad_norm": 0.2593602240085602, "learning_rate": 9.692700611508775e-05, "loss": 0.4054, "step": 1624 }, { "epoch": 0.922092585622883, "grad_norm": 0.2571476995944977, "learning_rate": 9.692159896230756e-05, "loss": 0.5063, "step": 1625 }, { "epoch": 0.9228453142642078, "grad_norm": 0.22426651418209076, "learning_rate": 9.691618722504431e-05, "loss": 0.3697, "step": 1626 }, { "epoch": 0.9235980429055326, "grad_norm": 0.2456275373697281, "learning_rate": 9.691077090388987e-05, "loss": 0.3712, "step": 1627 }, { "epoch": 0.9243507715468574, "grad_norm": 0.21263431012630463, "learning_rate": 9.690534999943652e-05, "loss": 0.4172, "step": 1628 }, { "epoch": 0.9251035001881822, "grad_norm": 0.2495305836200714, "learning_rate": 9.68999245122771e-05, "loss": 0.469, "step": 1629 }, { "epoch": 0.9258562288295069, "grad_norm": 0.2201531082391739, "learning_rate": 9.689449444300493e-05, "loss": 0.3889, "step": 1630 }, { "epoch": 0.9266089574708317, "grad_norm": 0.21631069481372833, "learning_rate": 9.688905979221384e-05, "loss": 0.4265, "step": 1631 }, { "epoch": 0.9273616861121565, "grad_norm": 0.2056736797094345, "learning_rate": 9.688362056049813e-05, "loss": 0.4161, "step": 1632 }, { "epoch": 0.9281144147534813, "grad_norm": 0.23846890032291412, "learning_rate": 9.687817674845267e-05, "loss": 0.3128, "step": 1633 }, { "epoch": 0.9288671433948061, "grad_norm": 0.21716493368148804, "learning_rate": 9.687272835667275e-05, "loss": 0.3881, "step": 1634 }, { "epoch": 0.929619872036131, "grad_norm": 0.24315853416919708, "learning_rate": 9.686727538575422e-05, "loss": 0.3205, "step": 1635 }, { "epoch": 0.9303726006774558, "grad_norm": 0.25411105155944824, "learning_rate": 9.686181783629342e-05, "loss": 0.4009, "step": 1636 }, { "epoch": 0.9311253293187806, "grad_norm": 0.2621065378189087, "learning_rate": 9.685635570888712e-05, "loss": 0.444, "step": 1637 }, { "epoch": 0.9318780579601054, "grad_norm": 0.2522229552268982, "learning_rate": 9.68508890041327e-05, "loss": 0.3669, "step": 1638 }, { "epoch": 0.9326307866014302, "grad_norm": 0.22129859030246735, "learning_rate": 9.6845417722628e-05, "loss": 0.3738, "step": 1639 }, { "epoch": 0.933383515242755, "grad_norm": 0.19421885907649994, "learning_rate": 9.683994186497132e-05, "loss": 0.3001, "step": 1640 }, { "epoch": 0.9341362438840798, "grad_norm": 0.23069500923156738, "learning_rate": 9.683446143176151e-05, "loss": 0.4752, "step": 1641 }, { "epoch": 0.9348889725254046, "grad_norm": 0.2539099156856537, "learning_rate": 9.682897642359789e-05, "loss": 0.4089, "step": 1642 }, { "epoch": 0.9356417011667294, "grad_norm": 0.2498980313539505, "learning_rate": 9.682348684108028e-05, "loss": 0.3845, "step": 1643 }, { "epoch": 0.9363944298080542, "grad_norm": 0.22381317615509033, "learning_rate": 9.681799268480905e-05, "loss": 0.4079, "step": 1644 }, { "epoch": 0.937147158449379, "grad_norm": 0.2107037752866745, "learning_rate": 9.6812493955385e-05, "loss": 0.3921, "step": 1645 }, { "epoch": 0.9378998870907038, "grad_norm": 0.25478917360305786, "learning_rate": 9.680699065340949e-05, "loss": 0.4901, "step": 1646 }, { "epoch": 0.9386526157320286, "grad_norm": 0.21376033127307892, "learning_rate": 9.680148277948433e-05, "loss": 0.4186, "step": 1647 }, { "epoch": 0.9394053443733534, "grad_norm": 0.24150902032852173, "learning_rate": 9.679597033421186e-05, "loss": 0.4352, "step": 1648 }, { "epoch": 0.9401580730146782, "grad_norm": 0.22784559428691864, "learning_rate": 9.679045331819491e-05, "loss": 0.3712, "step": 1649 }, { "epoch": 0.940910801656003, "grad_norm": 0.21520425379276276, "learning_rate": 9.678493173203682e-05, "loss": 0.3015, "step": 1650 }, { "epoch": 0.9416635302973279, "grad_norm": 0.23121477663516998, "learning_rate": 9.677940557634142e-05, "loss": 0.4543, "step": 1651 }, { "epoch": 0.9424162589386527, "grad_norm": 0.2608775794506073, "learning_rate": 9.677387485171305e-05, "loss": 0.4226, "step": 1652 }, { "epoch": 0.9431689875799775, "grad_norm": 0.332959920167923, "learning_rate": 9.676833955875651e-05, "loss": 0.4726, "step": 1653 }, { "epoch": 0.9439217162213023, "grad_norm": 0.27395790815353394, "learning_rate": 9.676279969807717e-05, "loss": 0.4158, "step": 1654 }, { "epoch": 0.944674444862627, "grad_norm": 0.31424131989479065, "learning_rate": 9.675725527028083e-05, "loss": 0.4612, "step": 1655 }, { "epoch": 0.9454271735039518, "grad_norm": 0.26507800817489624, "learning_rate": 9.675170627597386e-05, "loss": 0.3118, "step": 1656 }, { "epoch": 0.9461799021452766, "grad_norm": 0.3031614124774933, "learning_rate": 9.674615271576305e-05, "loss": 0.4103, "step": 1657 }, { "epoch": 0.9469326307866014, "grad_norm": 0.2821958065032959, "learning_rate": 9.674059459025575e-05, "loss": 0.3357, "step": 1658 }, { "epoch": 0.9476853594279262, "grad_norm": 0.25498512387275696, "learning_rate": 9.673503190005977e-05, "loss": 0.3534, "step": 1659 }, { "epoch": 0.948438088069251, "grad_norm": 0.3063252568244934, "learning_rate": 9.672946464578345e-05, "loss": 0.5007, "step": 1660 }, { "epoch": 0.9491908167105758, "grad_norm": 0.3301064968109131, "learning_rate": 9.672389282803563e-05, "loss": 0.3839, "step": 1661 }, { "epoch": 0.9499435453519006, "grad_norm": 0.30932021141052246, "learning_rate": 9.67183164474256e-05, "loss": 0.4298, "step": 1662 }, { "epoch": 0.9506962739932254, "grad_norm": 0.27753424644470215, "learning_rate": 9.671273550456322e-05, "loss": 0.3994, "step": 1663 }, { "epoch": 0.9514490026345502, "grad_norm": 0.2265176773071289, "learning_rate": 9.670715000005878e-05, "loss": 0.3644, "step": 1664 }, { "epoch": 0.952201731275875, "grad_norm": 0.32233819365501404, "learning_rate": 9.670155993452314e-05, "loss": 0.3076, "step": 1665 }, { "epoch": 0.9529544599171998, "grad_norm": 0.32031315565109253, "learning_rate": 9.669596530856761e-05, "loss": 0.381, "step": 1666 }, { "epoch": 0.9537071885585247, "grad_norm": 0.24213753640651703, "learning_rate": 9.6690366122804e-05, "loss": 0.3834, "step": 1667 }, { "epoch": 0.9544599171998495, "grad_norm": 0.24824875593185425, "learning_rate": 9.668476237784462e-05, "loss": 0.4246, "step": 1668 }, { "epoch": 0.9552126458411743, "grad_norm": 0.2874079644680023, "learning_rate": 9.66791540743023e-05, "loss": 0.5101, "step": 1669 }, { "epoch": 0.9559653744824991, "grad_norm": 0.24253815412521362, "learning_rate": 9.667354121279035e-05, "loss": 0.4922, "step": 1670 }, { "epoch": 0.9567181031238239, "grad_norm": 0.24525709450244904, "learning_rate": 9.66679237939226e-05, "loss": 0.3966, "step": 1671 }, { "epoch": 0.9574708317651487, "grad_norm": 0.28118428587913513, "learning_rate": 9.666230181831333e-05, "loss": 0.3995, "step": 1672 }, { "epoch": 0.9582235604064735, "grad_norm": 0.25488007068634033, "learning_rate": 9.66566752865774e-05, "loss": 0.4383, "step": 1673 }, { "epoch": 0.9589762890477983, "grad_norm": 0.2522820830345154, "learning_rate": 9.665104419933009e-05, "loss": 0.3025, "step": 1674 }, { "epoch": 0.9597290176891231, "grad_norm": 0.3152531385421753, "learning_rate": 9.66454085571872e-05, "loss": 0.3541, "step": 1675 }, { "epoch": 0.9604817463304479, "grad_norm": 0.2633424401283264, "learning_rate": 9.663976836076502e-05, "loss": 0.4192, "step": 1676 }, { "epoch": 0.9612344749717727, "grad_norm": 0.24488148093223572, "learning_rate": 9.66341236106804e-05, "loss": 0.4864, "step": 1677 }, { "epoch": 0.9619872036130975, "grad_norm": 0.2409403771162033, "learning_rate": 9.66284743075506e-05, "loss": 0.4219, "step": 1678 }, { "epoch": 0.9627399322544222, "grad_norm": 0.23872989416122437, "learning_rate": 9.662282045199345e-05, "loss": 0.3544, "step": 1679 }, { "epoch": 0.963492660895747, "grad_norm": 0.21248386800289154, "learning_rate": 9.661716204462726e-05, "loss": 0.442, "step": 1680 }, { "epoch": 0.9642453895370718, "grad_norm": 0.23991034924983978, "learning_rate": 9.661149908607077e-05, "loss": 0.4052, "step": 1681 }, { "epoch": 0.9649981181783966, "grad_norm": 0.23095127940177917, "learning_rate": 9.660583157694329e-05, "loss": 0.4385, "step": 1682 }, { "epoch": 0.9657508468197215, "grad_norm": 0.2724332809448242, "learning_rate": 9.660015951786465e-05, "loss": 0.4242, "step": 1683 }, { "epoch": 0.9665035754610463, "grad_norm": 0.2710343897342682, "learning_rate": 9.65944829094551e-05, "loss": 0.4626, "step": 1684 }, { "epoch": 0.9672563041023711, "grad_norm": 0.2717602550983429, "learning_rate": 9.658880175233544e-05, "loss": 0.3796, "step": 1685 }, { "epoch": 0.9680090327436959, "grad_norm": 0.24786429107189178, "learning_rate": 9.658311604712693e-05, "loss": 0.3616, "step": 1686 }, { "epoch": 0.9687617613850207, "grad_norm": 0.25275495648384094, "learning_rate": 9.657742579445138e-05, "loss": 0.3983, "step": 1687 }, { "epoch": 0.9695144900263455, "grad_norm": 0.30711647868156433, "learning_rate": 9.657173099493107e-05, "loss": 0.4128, "step": 1688 }, { "epoch": 0.9702672186676703, "grad_norm": 0.24491199851036072, "learning_rate": 9.656603164918873e-05, "loss": 0.3749, "step": 1689 }, { "epoch": 0.9710199473089951, "grad_norm": 0.27693018317222595, "learning_rate": 9.656032775784769e-05, "loss": 0.4824, "step": 1690 }, { "epoch": 0.9717726759503199, "grad_norm": 0.2616911232471466, "learning_rate": 9.655461932153168e-05, "loss": 0.3546, "step": 1691 }, { "epoch": 0.9725254045916447, "grad_norm": 0.2252359390258789, "learning_rate": 9.654890634086497e-05, "loss": 0.4371, "step": 1692 }, { "epoch": 0.9732781332329695, "grad_norm": 0.2799287736415863, "learning_rate": 9.654318881647235e-05, "loss": 0.4576, "step": 1693 }, { "epoch": 0.9740308618742943, "grad_norm": 0.2617090344429016, "learning_rate": 9.653746674897904e-05, "loss": 0.4372, "step": 1694 }, { "epoch": 0.9747835905156191, "grad_norm": 0.20392389595508575, "learning_rate": 9.653174013901083e-05, "loss": 0.3771, "step": 1695 }, { "epoch": 0.9755363191569439, "grad_norm": 0.26043701171875, "learning_rate": 9.652600898719395e-05, "loss": 0.4634, "step": 1696 }, { "epoch": 0.9762890477982687, "grad_norm": 0.23106318712234497, "learning_rate": 9.652027329415517e-05, "loss": 0.3795, "step": 1697 }, { "epoch": 0.9770417764395936, "grad_norm": 0.30129557847976685, "learning_rate": 9.651453306052173e-05, "loss": 0.4423, "step": 1698 }, { "epoch": 0.9777945050809184, "grad_norm": 0.21886762976646423, "learning_rate": 9.650878828692137e-05, "loss": 0.3549, "step": 1699 }, { "epoch": 0.9785472337222432, "grad_norm": 0.20720849931240082, "learning_rate": 9.650303897398232e-05, "loss": 0.359, "step": 1700 }, { "epoch": 0.979299962363568, "grad_norm": 0.25404754281044006, "learning_rate": 9.649728512233333e-05, "loss": 0.3886, "step": 1701 }, { "epoch": 0.9800526910048928, "grad_norm": 0.21232563257217407, "learning_rate": 9.649152673260363e-05, "loss": 0.3284, "step": 1702 }, { "epoch": 0.9808054196462176, "grad_norm": 0.23879718780517578, "learning_rate": 9.648576380542294e-05, "loss": 0.4128, "step": 1703 }, { "epoch": 0.9815581482875423, "grad_norm": 0.2756645679473877, "learning_rate": 9.647999634142151e-05, "loss": 0.2867, "step": 1704 }, { "epoch": 0.9823108769288671, "grad_norm": 0.2354487031698227, "learning_rate": 9.647422434123004e-05, "loss": 0.3143, "step": 1705 }, { "epoch": 0.9830636055701919, "grad_norm": 0.24849584698677063, "learning_rate": 9.646844780547975e-05, "loss": 0.4012, "step": 1706 }, { "epoch": 0.9838163342115167, "grad_norm": 0.23071274161338806, "learning_rate": 9.646266673480235e-05, "loss": 0.3577, "step": 1707 }, { "epoch": 0.9845690628528415, "grad_norm": 0.2293279618024826, "learning_rate": 9.645688112983006e-05, "loss": 0.4316, "step": 1708 }, { "epoch": 0.9853217914941663, "grad_norm": 0.23169958591461182, "learning_rate": 9.645109099119556e-05, "loss": 0.5086, "step": 1709 }, { "epoch": 0.9860745201354911, "grad_norm": 0.23942865431308746, "learning_rate": 9.64452963195321e-05, "loss": 0.412, "step": 1710 }, { "epoch": 0.9868272487768159, "grad_norm": 0.228075310587883, "learning_rate": 9.643949711547333e-05, "loss": 0.4329, "step": 1711 }, { "epoch": 0.9875799774181407, "grad_norm": 1.0959014892578125, "learning_rate": 9.643369337965346e-05, "loss": 0.3853, "step": 1712 }, { "epoch": 0.9883327060594655, "grad_norm": 0.21287547051906586, "learning_rate": 9.642788511270718e-05, "loss": 0.313, "step": 1713 }, { "epoch": 0.9890854347007904, "grad_norm": 0.2858162820339203, "learning_rate": 9.642207231526968e-05, "loss": 0.4689, "step": 1714 }, { "epoch": 0.9898381633421152, "grad_norm": 0.2554374039173126, "learning_rate": 9.641625498797661e-05, "loss": 0.3026, "step": 1715 }, { "epoch": 0.99059089198344, "grad_norm": 0.4656565487384796, "learning_rate": 9.641043313146417e-05, "loss": 0.4252, "step": 1716 }, { "epoch": 0.9913436206247648, "grad_norm": 0.3562995493412018, "learning_rate": 9.640460674636902e-05, "loss": 0.4018, "step": 1717 }, { "epoch": 0.9920963492660896, "grad_norm": 0.2920820116996765, "learning_rate": 9.639877583332832e-05, "loss": 0.3395, "step": 1718 }, { "epoch": 0.9928490779074144, "grad_norm": 0.3512093424797058, "learning_rate": 9.639294039297976e-05, "loss": 0.4391, "step": 1719 }, { "epoch": 0.9936018065487392, "grad_norm": 0.2832534611225128, "learning_rate": 9.638710042596146e-05, "loss": 0.3448, "step": 1720 }, { "epoch": 0.994354535190064, "grad_norm": 0.29091310501098633, "learning_rate": 9.638125593291208e-05, "loss": 0.4119, "step": 1721 }, { "epoch": 0.9951072638313888, "grad_norm": 0.2442103624343872, "learning_rate": 9.637540691447077e-05, "loss": 0.4053, "step": 1722 }, { "epoch": 0.9958599924727136, "grad_norm": 0.22520245611667633, "learning_rate": 9.636955337127716e-05, "loss": 0.3198, "step": 1723 }, { "epoch": 0.9966127211140384, "grad_norm": 0.23781804740428925, "learning_rate": 9.636369530397141e-05, "loss": 0.3428, "step": 1724 }, { "epoch": 0.9973654497553632, "grad_norm": 0.25277626514434814, "learning_rate": 9.635783271319409e-05, "loss": 0.2427, "step": 1725 }, { "epoch": 0.998118178396688, "grad_norm": 0.34800782799720764, "learning_rate": 9.635196559958641e-05, "loss": 0.3377, "step": 1726 }, { "epoch": 0.9988709070380128, "grad_norm": 0.29896634817123413, "learning_rate": 9.634609396378992e-05, "loss": 0.4445, "step": 1727 }, { "epoch": 0.9996236356793377, "grad_norm": 0.24591395258903503, "learning_rate": 9.634021780644676e-05, "loss": 0.41, "step": 1728 }, { "epoch": 1.0003763643206625, "grad_norm": 0.27475813031196594, "learning_rate": 9.633433712819955e-05, "loss": 0.3166, "step": 1729 }, { "epoch": 1.0011290929619872, "grad_norm": 0.24888430535793304, "learning_rate": 9.632845192969136e-05, "loss": 0.4042, "step": 1730 }, { "epoch": 1.001881821603312, "grad_norm": 0.22860124707221985, "learning_rate": 9.632256221156581e-05, "loss": 0.376, "step": 1731 }, { "epoch": 1.0026345502446368, "grad_norm": 0.24244877696037292, "learning_rate": 9.631666797446696e-05, "loss": 0.3531, "step": 1732 }, { "epoch": 1.0033872788859617, "grad_norm": 0.2665669620037079, "learning_rate": 9.631076921903945e-05, "loss": 0.3591, "step": 1733 }, { "epoch": 1.0041400075272864, "grad_norm": 0.24515606462955475, "learning_rate": 9.630486594592833e-05, "loss": 0.4514, "step": 1734 }, { "epoch": 1.0048927361686113, "grad_norm": 0.22912216186523438, "learning_rate": 9.629895815577916e-05, "loss": 0.4355, "step": 1735 }, { "epoch": 1.005645464809936, "grad_norm": 0.2295413464307785, "learning_rate": 9.629304584923802e-05, "loss": 0.3749, "step": 1736 }, { "epoch": 1.006398193451261, "grad_norm": 0.24447819590568542, "learning_rate": 9.628712902695146e-05, "loss": 0.3839, "step": 1737 }, { "epoch": 1.0071509220925856, "grad_norm": 0.26793187856674194, "learning_rate": 9.628120768956655e-05, "loss": 0.4169, "step": 1738 }, { "epoch": 1.0079036507339105, "grad_norm": 0.24740248918533325, "learning_rate": 9.627528183773083e-05, "loss": 0.4165, "step": 1739 }, { "epoch": 1.0086563793752352, "grad_norm": 0.259247362613678, "learning_rate": 9.626935147209235e-05, "loss": 0.4003, "step": 1740 }, { "epoch": 1.00940910801656, "grad_norm": 0.24582067131996155, "learning_rate": 9.626341659329963e-05, "loss": 0.4375, "step": 1741 }, { "epoch": 1.0101618366578848, "grad_norm": 0.2362472116947174, "learning_rate": 9.625747720200173e-05, "loss": 0.4774, "step": 1742 }, { "epoch": 1.0109145652992095, "grad_norm": 0.30593034625053406, "learning_rate": 9.625153329884815e-05, "loss": 0.4139, "step": 1743 }, { "epoch": 1.0116672939405345, "grad_norm": 0.22290962934494019, "learning_rate": 9.624558488448889e-05, "loss": 0.3821, "step": 1744 }, { "epoch": 1.0124200225818591, "grad_norm": 0.27439287304878235, "learning_rate": 9.62396319595745e-05, "loss": 0.2593, "step": 1745 }, { "epoch": 1.013172751223184, "grad_norm": 0.3136087954044342, "learning_rate": 9.623367452475594e-05, "loss": 0.3812, "step": 1746 }, { "epoch": 1.0139254798645088, "grad_norm": 0.38075926899909973, "learning_rate": 9.622771258068475e-05, "loss": 0.374, "step": 1747 }, { "epoch": 1.0146782085058337, "grad_norm": 0.33518362045288086, "learning_rate": 9.622174612801288e-05, "loss": 0.4123, "step": 1748 }, { "epoch": 1.0154309371471584, "grad_norm": 0.2657265067100525, "learning_rate": 9.621577516739284e-05, "loss": 0.3809, "step": 1749 }, { "epoch": 1.0161836657884833, "grad_norm": 0.3064594268798828, "learning_rate": 9.620979969947759e-05, "loss": 0.4223, "step": 1750 }, { "epoch": 1.016936394429808, "grad_norm": 0.3325202167034149, "learning_rate": 9.620381972492059e-05, "loss": 0.3803, "step": 1751 }, { "epoch": 1.017689123071133, "grad_norm": 0.2278306931257248, "learning_rate": 9.619783524437583e-05, "loss": 0.3748, "step": 1752 }, { "epoch": 1.0184418517124576, "grad_norm": 0.2346266806125641, "learning_rate": 9.619184625849775e-05, "loss": 0.3016, "step": 1753 }, { "epoch": 1.0191945803537825, "grad_norm": 0.23099392652511597, "learning_rate": 9.618585276794129e-05, "loss": 0.3657, "step": 1754 }, { "epoch": 1.0199473089951072, "grad_norm": 0.2834855020046234, "learning_rate": 9.61798547733619e-05, "loss": 0.4387, "step": 1755 }, { "epoch": 1.0207000376364321, "grad_norm": 0.2243211567401886, "learning_rate": 9.61738522754155e-05, "loss": 0.3426, "step": 1756 }, { "epoch": 1.0214527662777568, "grad_norm": 0.2558087110519409, "learning_rate": 9.61678452747585e-05, "loss": 0.4395, "step": 1757 }, { "epoch": 1.0222054949190817, "grad_norm": 0.18982170522212982, "learning_rate": 9.616183377204787e-05, "loss": 0.3925, "step": 1758 }, { "epoch": 1.0229582235604064, "grad_norm": 0.2662048637866974, "learning_rate": 9.615581776794096e-05, "loss": 0.3551, "step": 1759 }, { "epoch": 1.0237109522017314, "grad_norm": 0.23518553376197815, "learning_rate": 9.61497972630957e-05, "loss": 0.3779, "step": 1760 }, { "epoch": 1.024463680843056, "grad_norm": 0.24701912701129913, "learning_rate": 9.614377225817049e-05, "loss": 0.3954, "step": 1761 }, { "epoch": 1.025216409484381, "grad_norm": 0.2650562822818756, "learning_rate": 9.613774275382419e-05, "loss": 0.3648, "step": 1762 }, { "epoch": 1.0259691381257057, "grad_norm": 0.31437668204307556, "learning_rate": 9.61317087507162e-05, "loss": 0.3562, "step": 1763 }, { "epoch": 1.0267218667670306, "grad_norm": 0.25565576553344727, "learning_rate": 9.612567024950637e-05, "loss": 0.3165, "step": 1764 }, { "epoch": 1.0274745954083553, "grad_norm": 0.23478759825229645, "learning_rate": 9.611962725085509e-05, "loss": 0.4491, "step": 1765 }, { "epoch": 1.0282273240496802, "grad_norm": 0.20070144534111023, "learning_rate": 9.611357975542319e-05, "loss": 0.4139, "step": 1766 }, { "epoch": 1.028980052691005, "grad_norm": 0.24210219085216522, "learning_rate": 9.610752776387203e-05, "loss": 0.3977, "step": 1767 }, { "epoch": 1.0297327813323296, "grad_norm": 0.2138083577156067, "learning_rate": 9.610147127686342e-05, "loss": 0.2769, "step": 1768 }, { "epoch": 1.0304855099736545, "grad_norm": 0.21234716475009918, "learning_rate": 9.609541029505972e-05, "loss": 0.4404, "step": 1769 }, { "epoch": 1.0312382386149792, "grad_norm": 0.21779559552669525, "learning_rate": 9.608934481912374e-05, "loss": 0.3145, "step": 1770 }, { "epoch": 1.0319909672563041, "grad_norm": 0.2101481854915619, "learning_rate": 9.608327484971876e-05, "loss": 0.3615, "step": 1771 }, { "epoch": 1.0327436958976288, "grad_norm": 0.2787138521671295, "learning_rate": 9.607720038750864e-05, "loss": 0.2612, "step": 1772 }, { "epoch": 1.0334964245389537, "grad_norm": 0.24551057815551758, "learning_rate": 9.607112143315763e-05, "loss": 0.3555, "step": 1773 }, { "epoch": 1.0342491531802784, "grad_norm": 0.24666017293930054, "learning_rate": 9.606503798733054e-05, "loss": 0.4704, "step": 1774 }, { "epoch": 1.0350018818216034, "grad_norm": 0.2305205911397934, "learning_rate": 9.605895005069262e-05, "loss": 0.3817, "step": 1775 }, { "epoch": 1.035754610462928, "grad_norm": 0.2019781768321991, "learning_rate": 9.605285762390967e-05, "loss": 0.3889, "step": 1776 }, { "epoch": 1.036507339104253, "grad_norm": 0.23916231095790863, "learning_rate": 9.604676070764791e-05, "loss": 0.4142, "step": 1777 }, { "epoch": 1.0372600677455777, "grad_norm": 0.21745885908603668, "learning_rate": 9.604065930257413e-05, "loss": 0.4958, "step": 1778 }, { "epoch": 1.0380127963869026, "grad_norm": 0.19505906105041504, "learning_rate": 9.603455340935557e-05, "loss": 0.3926, "step": 1779 }, { "epoch": 1.0387655250282273, "grad_norm": 0.2621769905090332, "learning_rate": 9.602844302865991e-05, "loss": 0.3357, "step": 1780 }, { "epoch": 1.0395182536695522, "grad_norm": 0.2593105137348175, "learning_rate": 9.602232816115542e-05, "loss": 0.3833, "step": 1781 }, { "epoch": 1.040270982310877, "grad_norm": 0.22006817162036896, "learning_rate": 9.60162088075108e-05, "loss": 0.3769, "step": 1782 }, { "epoch": 1.0410237109522018, "grad_norm": 0.24870184063911438, "learning_rate": 9.601008496839525e-05, "loss": 0.3592, "step": 1783 }, { "epoch": 1.0417764395935265, "grad_norm": 0.22535398602485657, "learning_rate": 9.600395664447846e-05, "loss": 0.3652, "step": 1784 }, { "epoch": 1.0425291682348514, "grad_norm": 0.2741582691669464, "learning_rate": 9.599782383643062e-05, "loss": 0.3209, "step": 1785 }, { "epoch": 1.0432818968761761, "grad_norm": 0.2148016393184662, "learning_rate": 9.59916865449224e-05, "loss": 0.4434, "step": 1786 }, { "epoch": 1.044034625517501, "grad_norm": 0.2520974576473236, "learning_rate": 9.5985544770625e-05, "loss": 0.3321, "step": 1787 }, { "epoch": 1.0447873541588257, "grad_norm": 0.2710311710834503, "learning_rate": 9.597939851421002e-05, "loss": 0.4498, "step": 1788 }, { "epoch": 1.0455400828001506, "grad_norm": 0.24417242407798767, "learning_rate": 9.597324777634962e-05, "loss": 0.3509, "step": 1789 }, { "epoch": 1.0462928114414753, "grad_norm": 0.2521495223045349, "learning_rate": 9.596709255771647e-05, "loss": 0.3671, "step": 1790 }, { "epoch": 1.0470455400828, "grad_norm": 0.27639907598495483, "learning_rate": 9.596093285898366e-05, "loss": 0.3827, "step": 1791 }, { "epoch": 1.047798268724125, "grad_norm": 0.27554816007614136, "learning_rate": 9.595476868082481e-05, "loss": 0.3614, "step": 1792 }, { "epoch": 1.0485509973654497, "grad_norm": 0.22678044438362122, "learning_rate": 9.594860002391404e-05, "loss": 0.3743, "step": 1793 }, { "epoch": 1.0493037260067746, "grad_norm": 0.2190953493118286, "learning_rate": 9.594242688892593e-05, "loss": 0.2888, "step": 1794 }, { "epoch": 1.0500564546480993, "grad_norm": 0.22869452834129333, "learning_rate": 9.593624927653557e-05, "loss": 0.3293, "step": 1795 }, { "epoch": 1.0508091832894242, "grad_norm": 0.24124588072299957, "learning_rate": 9.593006718741855e-05, "loss": 0.4196, "step": 1796 }, { "epoch": 1.0515619119307489, "grad_norm": 0.21153366565704346, "learning_rate": 9.592388062225091e-05, "loss": 0.3131, "step": 1797 }, { "epoch": 1.0523146405720738, "grad_norm": 0.18091249465942383, "learning_rate": 9.591768958170921e-05, "loss": 0.3511, "step": 1798 }, { "epoch": 1.0530673692133985, "grad_norm": 0.26631540060043335, "learning_rate": 9.591149406647051e-05, "loss": 0.3432, "step": 1799 }, { "epoch": 1.0538200978547234, "grad_norm": 0.24632111191749573, "learning_rate": 9.590529407721231e-05, "loss": 0.3648, "step": 1800 }, { "epoch": 1.0538200978547234, "eval_loss": 0.3756145238876343, "eval_runtime": 456.1398, "eval_samples_per_second": 21.105, "eval_steps_per_second": 0.66, "step": 1800 }, { "epoch": 1.054572826496048, "grad_norm": 0.20502068102359772, "learning_rate": 9.589908961461267e-05, "loss": 0.3186, "step": 1801 }, { "epoch": 1.055325555137373, "grad_norm": 0.2305462807416916, "learning_rate": 9.589288067935007e-05, "loss": 0.4012, "step": 1802 }, { "epoch": 1.0560782837786977, "grad_norm": 0.2853091061115265, "learning_rate": 9.588666727210352e-05, "loss": 0.381, "step": 1803 }, { "epoch": 1.0568310124200226, "grad_norm": 0.2769205868244171, "learning_rate": 9.58804493935525e-05, "loss": 0.3637, "step": 1804 }, { "epoch": 1.0575837410613473, "grad_norm": 0.2948039472103119, "learning_rate": 9.5874227044377e-05, "loss": 0.4319, "step": 1805 }, { "epoch": 1.0583364697026723, "grad_norm": 0.2774375081062317, "learning_rate": 9.58680002252575e-05, "loss": 0.4227, "step": 1806 }, { "epoch": 1.059089198343997, "grad_norm": 0.2947389781475067, "learning_rate": 9.586176893687494e-05, "loss": 0.3616, "step": 1807 }, { "epoch": 1.0598419269853219, "grad_norm": 0.25587254762649536, "learning_rate": 9.585553317991075e-05, "loss": 0.4547, "step": 1808 }, { "epoch": 1.0605946556266466, "grad_norm": 0.2564508020877838, "learning_rate": 9.584929295504688e-05, "loss": 0.4434, "step": 1809 }, { "epoch": 1.0613473842679715, "grad_norm": 0.2688274681568146, "learning_rate": 9.584304826296575e-05, "loss": 0.3527, "step": 1810 }, { "epoch": 1.0621001129092962, "grad_norm": 0.25446268916130066, "learning_rate": 9.583679910435026e-05, "loss": 0.3781, "step": 1811 }, { "epoch": 1.062852841550621, "grad_norm": 0.20835645496845245, "learning_rate": 9.583054547988383e-05, "loss": 0.3866, "step": 1812 }, { "epoch": 1.0636055701919458, "grad_norm": 0.2927253544330597, "learning_rate": 9.582428739025033e-05, "loss": 0.3101, "step": 1813 }, { "epoch": 1.0643582988332705, "grad_norm": 0.26922062039375305, "learning_rate": 9.581802483613414e-05, "loss": 0.4096, "step": 1814 }, { "epoch": 1.0651110274745954, "grad_norm": 0.2726929783821106, "learning_rate": 9.581175781822012e-05, "loss": 0.4183, "step": 1815 }, { "epoch": 1.0658637561159203, "grad_norm": 0.26362431049346924, "learning_rate": 9.580548633719363e-05, "loss": 0.3357, "step": 1816 }, { "epoch": 1.066616484757245, "grad_norm": 0.2584637403488159, "learning_rate": 9.57992103937405e-05, "loss": 0.4282, "step": 1817 }, { "epoch": 1.0673692133985697, "grad_norm": 0.24880722165107727, "learning_rate": 9.579292998854706e-05, "loss": 0.4078, "step": 1818 }, { "epoch": 1.0681219420398946, "grad_norm": 0.21408122777938843, "learning_rate": 9.578664512230014e-05, "loss": 0.4427, "step": 1819 }, { "epoch": 1.0688746706812193, "grad_norm": 0.2710149586200714, "learning_rate": 9.5780355795687e-05, "loss": 0.3274, "step": 1820 }, { "epoch": 1.0696273993225442, "grad_norm": 0.2757660746574402, "learning_rate": 9.57740620093955e-05, "loss": 0.3424, "step": 1821 }, { "epoch": 1.070380127963869, "grad_norm": 0.22284236550331116, "learning_rate": 9.576776376411386e-05, "loss": 0.2939, "step": 1822 }, { "epoch": 1.0711328566051939, "grad_norm": 0.23091335594654083, "learning_rate": 9.576146106053088e-05, "loss": 0.2796, "step": 1823 }, { "epoch": 1.0718855852465186, "grad_norm": 0.2327660173177719, "learning_rate": 9.575515389933579e-05, "loss": 0.4284, "step": 1824 }, { "epoch": 1.0726383138878435, "grad_norm": 0.21638572216033936, "learning_rate": 9.574884228121836e-05, "loss": 0.3036, "step": 1825 }, { "epoch": 1.0733910425291682, "grad_norm": 0.23056212067604065, "learning_rate": 9.574252620686879e-05, "loss": 0.3688, "step": 1826 }, { "epoch": 1.074143771170493, "grad_norm": 0.236062154173851, "learning_rate": 9.57362056769778e-05, "loss": 0.2639, "step": 1827 }, { "epoch": 1.0748964998118178, "grad_norm": 0.2499072402715683, "learning_rate": 9.572988069223662e-05, "loss": 0.3978, "step": 1828 }, { "epoch": 1.0756492284531427, "grad_norm": 0.2493269294500351, "learning_rate": 9.572355125333691e-05, "loss": 0.3765, "step": 1829 }, { "epoch": 1.0764019570944674, "grad_norm": 0.2127223163843155, "learning_rate": 9.571721736097089e-05, "loss": 0.4225, "step": 1830 }, { "epoch": 1.0771546857357923, "grad_norm": 0.24787212908267975, "learning_rate": 9.571087901583117e-05, "loss": 0.3441, "step": 1831 }, { "epoch": 1.077907414377117, "grad_norm": 0.2827840745449066, "learning_rate": 9.570453621861093e-05, "loss": 0.4476, "step": 1832 }, { "epoch": 1.078660143018442, "grad_norm": 0.285984605550766, "learning_rate": 9.569818897000382e-05, "loss": 0.3952, "step": 1833 }, { "epoch": 1.0794128716597666, "grad_norm": 0.26719656586647034, "learning_rate": 9.569183727070396e-05, "loss": 0.3225, "step": 1834 }, { "epoch": 1.0801656003010915, "grad_norm": 0.26205623149871826, "learning_rate": 9.568548112140593e-05, "loss": 0.3801, "step": 1835 }, { "epoch": 1.0809183289424162, "grad_norm": 0.23341670632362366, "learning_rate": 9.567912052280486e-05, "loss": 0.4253, "step": 1836 }, { "epoch": 1.0816710575837412, "grad_norm": 0.20876355469226837, "learning_rate": 9.567275547559635e-05, "loss": 0.3708, "step": 1837 }, { "epoch": 1.0824237862250659, "grad_norm": 0.24664807319641113, "learning_rate": 9.566638598047642e-05, "loss": 0.4286, "step": 1838 }, { "epoch": 1.0831765148663908, "grad_norm": 0.27060216665267944, "learning_rate": 9.566001203814169e-05, "loss": 0.3467, "step": 1839 }, { "epoch": 1.0839292435077155, "grad_norm": 0.2897794544696808, "learning_rate": 9.565363364928918e-05, "loss": 0.3285, "step": 1840 }, { "epoch": 1.0846819721490402, "grad_norm": 0.2900921404361725, "learning_rate": 9.564725081461639e-05, "loss": 0.3717, "step": 1841 }, { "epoch": 1.085434700790365, "grad_norm": 0.23845629394054413, "learning_rate": 9.564086353482137e-05, "loss": 0.3648, "step": 1842 }, { "epoch": 1.0861874294316898, "grad_norm": 0.2024158239364624, "learning_rate": 9.563447181060262e-05, "loss": 0.3638, "step": 1843 }, { "epoch": 1.0869401580730147, "grad_norm": 0.2155068963766098, "learning_rate": 9.562807564265913e-05, "loss": 0.2891, "step": 1844 }, { "epoch": 1.0876928867143394, "grad_norm": 0.22107283771038055, "learning_rate": 9.562167503169036e-05, "loss": 0.3741, "step": 1845 }, { "epoch": 1.0884456153556643, "grad_norm": 0.2716062664985657, "learning_rate": 9.56152699783963e-05, "loss": 0.3754, "step": 1846 }, { "epoch": 1.089198343996989, "grad_norm": 0.24283859133720398, "learning_rate": 9.560886048347736e-05, "loss": 0.3447, "step": 1847 }, { "epoch": 1.089951072638314, "grad_norm": 0.2946912944316864, "learning_rate": 9.560244654763449e-05, "loss": 0.3288, "step": 1848 }, { "epoch": 1.0907038012796386, "grad_norm": 0.2611449956893921, "learning_rate": 9.559602817156913e-05, "loss": 0.4205, "step": 1849 }, { "epoch": 1.0914565299209635, "grad_norm": 0.30887287855148315, "learning_rate": 9.558960535598316e-05, "loss": 0.3932, "step": 1850 }, { "epoch": 1.0922092585622882, "grad_norm": 0.2921074628829956, "learning_rate": 9.558317810157897e-05, "loss": 0.3965, "step": 1851 }, { "epoch": 1.0929619872036132, "grad_norm": 0.3431641757488251, "learning_rate": 9.557674640905943e-05, "loss": 0.3786, "step": 1852 }, { "epoch": 1.0937147158449378, "grad_norm": 0.2746260166168213, "learning_rate": 9.557031027912792e-05, "loss": 0.3908, "step": 1853 }, { "epoch": 1.0944674444862628, "grad_norm": 0.24845963716506958, "learning_rate": 9.556386971248827e-05, "loss": 0.2713, "step": 1854 }, { "epoch": 1.0952201731275875, "grad_norm": 0.2917407155036926, "learning_rate": 9.555742470984481e-05, "loss": 0.3116, "step": 1855 }, { "epoch": 1.0959729017689124, "grad_norm": 0.38210391998291016, "learning_rate": 9.555097527190237e-05, "loss": 0.3898, "step": 1856 }, { "epoch": 1.096725630410237, "grad_norm": 0.2226347178220749, "learning_rate": 9.554452139936623e-05, "loss": 0.3337, "step": 1857 }, { "epoch": 1.097478359051562, "grad_norm": 0.3141632676124573, "learning_rate": 9.553806309294221e-05, "loss": 0.2804, "step": 1858 }, { "epoch": 1.0982310876928867, "grad_norm": 0.2663570046424866, "learning_rate": 9.553160035333655e-05, "loss": 0.3362, "step": 1859 }, { "epoch": 1.0989838163342116, "grad_norm": 0.33051323890686035, "learning_rate": 9.552513318125601e-05, "loss": 0.4795, "step": 1860 }, { "epoch": 1.0997365449755363, "grad_norm": 0.3043634593486786, "learning_rate": 9.551866157740782e-05, "loss": 0.3914, "step": 1861 }, { "epoch": 1.1004892736168612, "grad_norm": 0.2749307453632355, "learning_rate": 9.551218554249973e-05, "loss": 0.483, "step": 1862 }, { "epoch": 1.101242002258186, "grad_norm": 0.23724576830863953, "learning_rate": 9.550570507723995e-05, "loss": 0.3096, "step": 1863 }, { "epoch": 1.1019947308995106, "grad_norm": 0.30395209789276123, "learning_rate": 9.549922018233714e-05, "loss": 0.3094, "step": 1864 }, { "epoch": 1.1027474595408355, "grad_norm": 0.27869486808776855, "learning_rate": 9.549273085850051e-05, "loss": 0.3346, "step": 1865 }, { "epoch": 1.1035001881821604, "grad_norm": 0.22254551947116852, "learning_rate": 9.548623710643972e-05, "loss": 0.3792, "step": 1866 }, { "epoch": 1.1042529168234851, "grad_norm": 0.2171884924173355, "learning_rate": 9.54797389268649e-05, "loss": 0.3078, "step": 1867 }, { "epoch": 1.1050056454648098, "grad_norm": 0.23858000338077545, "learning_rate": 9.54732363204867e-05, "loss": 0.257, "step": 1868 }, { "epoch": 1.1057583741061348, "grad_norm": 0.22631607949733734, "learning_rate": 9.546672928801622e-05, "loss": 0.4208, "step": 1869 }, { "epoch": 1.1065111027474595, "grad_norm": 0.20797455310821533, "learning_rate": 9.546021783016508e-05, "loss": 0.352, "step": 1870 }, { "epoch": 1.1072638313887844, "grad_norm": 0.22677908837795258, "learning_rate": 9.545370194764534e-05, "loss": 0.2384, "step": 1871 }, { "epoch": 1.108016560030109, "grad_norm": 0.2211371511220932, "learning_rate": 9.544718164116956e-05, "loss": 0.3821, "step": 1872 }, { "epoch": 1.108769288671434, "grad_norm": 0.2050095796585083, "learning_rate": 9.544065691145082e-05, "loss": 0.4142, "step": 1873 }, { "epoch": 1.1095220173127587, "grad_norm": 0.20779792964458466, "learning_rate": 9.543412775920264e-05, "loss": 0.331, "step": 1874 }, { "epoch": 1.1102747459540836, "grad_norm": 0.18408061563968658, "learning_rate": 9.542759418513906e-05, "loss": 0.3131, "step": 1875 }, { "epoch": 1.1110274745954083, "grad_norm": 0.2970980107784271, "learning_rate": 9.542105618997453e-05, "loss": 0.439, "step": 1876 }, { "epoch": 1.1117802032367332, "grad_norm": 0.27768474817276, "learning_rate": 9.541451377442409e-05, "loss": 0.3888, "step": 1877 }, { "epoch": 1.112532931878058, "grad_norm": 0.26927876472473145, "learning_rate": 9.540796693920318e-05, "loss": 0.3401, "step": 1878 }, { "epoch": 1.1132856605193828, "grad_norm": 0.2342337965965271, "learning_rate": 9.540141568502774e-05, "loss": 0.3593, "step": 1879 }, { "epoch": 1.1140383891607075, "grad_norm": 0.22564657032489777, "learning_rate": 9.539486001261425e-05, "loss": 0.359, "step": 1880 }, { "epoch": 1.1147911178020324, "grad_norm": 0.24985142052173615, "learning_rate": 9.53882999226796e-05, "loss": 0.3251, "step": 1881 }, { "epoch": 1.1155438464433571, "grad_norm": 0.2588150203227997, "learning_rate": 9.538173541594118e-05, "loss": 0.3965, "step": 1882 }, { "epoch": 1.116296575084682, "grad_norm": 0.20211276412010193, "learning_rate": 9.53751664931169e-05, "loss": 0.3431, "step": 1883 }, { "epoch": 1.1170493037260067, "grad_norm": 0.21599018573760986, "learning_rate": 9.53685931549251e-05, "loss": 0.3811, "step": 1884 }, { "epoch": 1.1178020323673317, "grad_norm": 0.19473981857299805, "learning_rate": 9.536201540208466e-05, "loss": 0.3294, "step": 1885 }, { "epoch": 1.1185547610086564, "grad_norm": 0.23802265524864197, "learning_rate": 9.535543323531489e-05, "loss": 0.325, "step": 1886 }, { "epoch": 1.1193074896499813, "grad_norm": 0.2485673725605011, "learning_rate": 9.534884665533563e-05, "loss": 0.4236, "step": 1887 }, { "epoch": 1.120060218291306, "grad_norm": 0.22147533297538757, "learning_rate": 9.534225566286715e-05, "loss": 0.359, "step": 1888 }, { "epoch": 1.120812946932631, "grad_norm": 0.2696925401687622, "learning_rate": 9.533566025863023e-05, "loss": 0.3571, "step": 1889 }, { "epoch": 1.1215656755739556, "grad_norm": 0.23082180321216583, "learning_rate": 9.532906044334616e-05, "loss": 0.3806, "step": 1890 }, { "epoch": 1.1223184042152803, "grad_norm": 0.23549918830394745, "learning_rate": 9.532245621773668e-05, "loss": 0.3515, "step": 1891 }, { "epoch": 1.1230711328566052, "grad_norm": 0.2272789031267166, "learning_rate": 9.531584758252401e-05, "loss": 0.4519, "step": 1892 }, { "epoch": 1.12382386149793, "grad_norm": 0.22452601790428162, "learning_rate": 9.530923453843086e-05, "loss": 0.4036, "step": 1893 }, { "epoch": 1.1245765901392548, "grad_norm": 0.21831907331943512, "learning_rate": 9.530261708618043e-05, "loss": 0.2641, "step": 1894 }, { "epoch": 1.1253293187805795, "grad_norm": 0.22544161975383759, "learning_rate": 9.529599522649639e-05, "loss": 0.394, "step": 1895 }, { "epoch": 1.1260820474219044, "grad_norm": 0.2585974335670471, "learning_rate": 9.528936896010288e-05, "loss": 0.5033, "step": 1896 }, { "epoch": 1.1268347760632291, "grad_norm": 0.2764986753463745, "learning_rate": 9.528273828772458e-05, "loss": 0.3679, "step": 1897 }, { "epoch": 1.127587504704554, "grad_norm": 0.28654205799102783, "learning_rate": 9.527610321008657e-05, "loss": 0.3571, "step": 1898 }, { "epoch": 1.1283402333458787, "grad_norm": 0.276490181684494, "learning_rate": 9.526946372791448e-05, "loss": 0.3424, "step": 1899 }, { "epoch": 1.1290929619872037, "grad_norm": 0.222162127494812, "learning_rate": 9.526281984193436e-05, "loss": 0.3394, "step": 1900 }, { "epoch": 1.1298456906285284, "grad_norm": 0.2948587238788605, "learning_rate": 9.52561715528728e-05, "loss": 0.435, "step": 1901 }, { "epoch": 1.1305984192698533, "grad_norm": 0.290979266166687, "learning_rate": 9.524951886145686e-05, "loss": 0.3876, "step": 1902 }, { "epoch": 1.131351147911178, "grad_norm": 0.2890782058238983, "learning_rate": 9.524286176841404e-05, "loss": 0.3026, "step": 1903 }, { "epoch": 1.1321038765525029, "grad_norm": 0.3431805372238159, "learning_rate": 9.523620027447235e-05, "loss": 0.3536, "step": 1904 }, { "epoch": 1.1328566051938276, "grad_norm": 0.26146939396858215, "learning_rate": 9.522953438036032e-05, "loss": 0.246, "step": 1905 }, { "epoch": 1.1336093338351525, "grad_norm": 0.2676650583744049, "learning_rate": 9.522286408680687e-05, "loss": 0.3805, "step": 1906 }, { "epoch": 1.1343620624764772, "grad_norm": 0.31318122148513794, "learning_rate": 9.521618939454149e-05, "loss": 0.3794, "step": 1907 }, { "epoch": 1.1351147911178021, "grad_norm": 0.24675026535987854, "learning_rate": 9.520951030429409e-05, "loss": 0.3777, "step": 1908 }, { "epoch": 1.1358675197591268, "grad_norm": 0.2319561094045639, "learning_rate": 9.520282681679513e-05, "loss": 0.2739, "step": 1909 }, { "epoch": 1.1366202484004517, "grad_norm": 0.3125397264957428, "learning_rate": 9.519613893277544e-05, "loss": 0.3938, "step": 1910 }, { "epoch": 1.1373729770417764, "grad_norm": 0.251300573348999, "learning_rate": 9.518944665296643e-05, "loss": 0.2139, "step": 1911 }, { "epoch": 1.1381257056831013, "grad_norm": 0.3202332556247711, "learning_rate": 9.518274997809998e-05, "loss": 0.3122, "step": 1912 }, { "epoch": 1.138878434324426, "grad_norm": 0.32709208130836487, "learning_rate": 9.517604890890837e-05, "loss": 0.5343, "step": 1913 }, { "epoch": 1.1396311629657507, "grad_norm": 0.25188833475112915, "learning_rate": 9.51693434461245e-05, "loss": 0.239, "step": 1914 }, { "epoch": 1.1403838916070757, "grad_norm": 0.2699223458766937, "learning_rate": 9.516263359048162e-05, "loss": 0.2801, "step": 1915 }, { "epoch": 1.1411366202484006, "grad_norm": 0.2772495746612549, "learning_rate": 9.515591934271347e-05, "loss": 0.4063, "step": 1916 }, { "epoch": 1.1418893488897253, "grad_norm": 0.28957515954971313, "learning_rate": 9.51492007035544e-05, "loss": 0.3783, "step": 1917 }, { "epoch": 1.14264207753105, "grad_norm": 0.2359600067138672, "learning_rate": 9.514247767373907e-05, "loss": 0.3421, "step": 1918 }, { "epoch": 1.1433948061723749, "grad_norm": 0.26123329997062683, "learning_rate": 9.513575025400275e-05, "loss": 0.3576, "step": 1919 }, { "epoch": 1.1441475348136996, "grad_norm": 0.2733752131462097, "learning_rate": 9.512901844508113e-05, "loss": 0.3455, "step": 1920 }, { "epoch": 1.1449002634550245, "grad_norm": 0.23970773816108704, "learning_rate": 9.512228224771038e-05, "loss": 0.3494, "step": 1921 }, { "epoch": 1.1456529920963492, "grad_norm": 0.2595024108886719, "learning_rate": 9.511554166262717e-05, "loss": 0.4143, "step": 1922 }, { "epoch": 1.146405720737674, "grad_norm": 0.23728978633880615, "learning_rate": 9.510879669056863e-05, "loss": 0.3292, "step": 1923 }, { "epoch": 1.1471584493789988, "grad_norm": 0.2604652941226959, "learning_rate": 9.510204733227239e-05, "loss": 0.3099, "step": 1924 }, { "epoch": 1.1479111780203237, "grad_norm": 0.2237216681241989, "learning_rate": 9.509529358847655e-05, "loss": 0.3971, "step": 1925 }, { "epoch": 1.1486639066616484, "grad_norm": 0.289255291223526, "learning_rate": 9.50885354599197e-05, "loss": 0.4281, "step": 1926 }, { "epoch": 1.1494166353029733, "grad_norm": 0.23782366514205933, "learning_rate": 9.508177294734086e-05, "loss": 0.3475, "step": 1927 }, { "epoch": 1.150169363944298, "grad_norm": 0.277095228433609, "learning_rate": 9.50750060514796e-05, "loss": 0.2862, "step": 1928 }, { "epoch": 1.150922092585623, "grad_norm": 0.2673790156841278, "learning_rate": 9.506823477307593e-05, "loss": 0.2085, "step": 1929 }, { "epoch": 1.1516748212269476, "grad_norm": 0.2197546511888504, "learning_rate": 9.506145911287034e-05, "loss": 0.3221, "step": 1930 }, { "epoch": 1.1524275498682726, "grad_norm": 0.28864991664886475, "learning_rate": 9.505467907160383e-05, "loss": 0.2925, "step": 1931 }, { "epoch": 1.1531802785095973, "grad_norm": 0.20368675887584686, "learning_rate": 9.504789465001783e-05, "loss": 0.3563, "step": 1932 }, { "epoch": 1.1539330071509222, "grad_norm": 0.23362097144126892, "learning_rate": 9.504110584885429e-05, "loss": 0.3092, "step": 1933 }, { "epoch": 1.1546857357922469, "grad_norm": 0.2616417407989502, "learning_rate": 9.50343126688556e-05, "loss": 0.2756, "step": 1934 }, { "epoch": 1.1554384644335718, "grad_norm": 0.25320979952812195, "learning_rate": 9.502751511076468e-05, "loss": 0.3636, "step": 1935 }, { "epoch": 1.1561911930748965, "grad_norm": 0.2412760853767395, "learning_rate": 9.502071317532488e-05, "loss": 0.2492, "step": 1936 }, { "epoch": 1.1569439217162212, "grad_norm": 0.21896690130233765, "learning_rate": 9.501390686328005e-05, "loss": 0.3179, "step": 1937 }, { "epoch": 1.157696650357546, "grad_norm": 0.23468996584415436, "learning_rate": 9.500709617537453e-05, "loss": 0.3702, "step": 1938 }, { "epoch": 1.158449378998871, "grad_norm": 0.23782171308994293, "learning_rate": 9.500028111235313e-05, "loss": 0.3002, "step": 1939 }, { "epoch": 1.1592021076401957, "grad_norm": 0.25287073850631714, "learning_rate": 9.499346167496111e-05, "loss": 0.415, "step": 1940 }, { "epoch": 1.1599548362815204, "grad_norm": 0.28867602348327637, "learning_rate": 9.498663786394427e-05, "loss": 0.4714, "step": 1941 }, { "epoch": 1.1607075649228453, "grad_norm": 0.2655486762523651, "learning_rate": 9.497980968004884e-05, "loss": 0.3775, "step": 1942 }, { "epoch": 1.16146029356417, "grad_norm": 0.25393277406692505, "learning_rate": 9.497297712402152e-05, "loss": 0.45, "step": 1943 }, { "epoch": 1.162213022205495, "grad_norm": 0.2784191370010376, "learning_rate": 9.496614019660951e-05, "loss": 0.394, "step": 1944 }, { "epoch": 1.1629657508468196, "grad_norm": 0.254817932844162, "learning_rate": 9.49592988985605e-05, "loss": 0.3219, "step": 1945 }, { "epoch": 1.1637184794881446, "grad_norm": 0.2546781599521637, "learning_rate": 9.495245323062265e-05, "loss": 0.3884, "step": 1946 }, { "epoch": 1.1644712081294692, "grad_norm": 0.2802959084510803, "learning_rate": 9.494560319354457e-05, "loss": 0.402, "step": 1947 }, { "epoch": 1.1652239367707942, "grad_norm": 0.2895089089870453, "learning_rate": 9.49387487880754e-05, "loss": 0.4042, "step": 1948 }, { "epoch": 1.1659766654121189, "grad_norm": 0.32720401883125305, "learning_rate": 9.49318900149647e-05, "loss": 0.3988, "step": 1949 }, { "epoch": 1.1667293940534438, "grad_norm": 0.2804339528083801, "learning_rate": 9.492502687496253e-05, "loss": 0.3878, "step": 1950 }, { "epoch": 1.1674821226947685, "grad_norm": 0.2623729407787323, "learning_rate": 9.491815936881947e-05, "loss": 0.3238, "step": 1951 }, { "epoch": 1.1682348513360934, "grad_norm": 0.23037992417812347, "learning_rate": 9.49112874972865e-05, "loss": 0.3389, "step": 1952 }, { "epoch": 1.168987579977418, "grad_norm": 0.2889323830604553, "learning_rate": 9.490441126111515e-05, "loss": 0.3952, "step": 1953 }, { "epoch": 1.169740308618743, "grad_norm": 0.282428115606308, "learning_rate": 9.489753066105738e-05, "loss": 0.3153, "step": 1954 }, { "epoch": 1.1704930372600677, "grad_norm": 0.23787635564804077, "learning_rate": 9.489064569786563e-05, "loss": 0.3503, "step": 1955 }, { "epoch": 1.1712457659013926, "grad_norm": 0.27298399806022644, "learning_rate": 9.488375637229285e-05, "loss": 0.3419, "step": 1956 }, { "epoch": 1.1719984945427173, "grad_norm": 0.2994718551635742, "learning_rate": 9.487686268509242e-05, "loss": 0.2959, "step": 1957 }, { "epoch": 1.1727512231840422, "grad_norm": 0.23645472526550293, "learning_rate": 9.486996463701827e-05, "loss": 0.4206, "step": 1958 }, { "epoch": 1.173503951825367, "grad_norm": 0.3118187487125397, "learning_rate": 9.48630622288247e-05, "loss": 0.4417, "step": 1959 }, { "epoch": 1.1742566804666918, "grad_norm": 0.32848626375198364, "learning_rate": 9.48561554612666e-05, "loss": 0.3363, "step": 1960 }, { "epoch": 1.1750094091080165, "grad_norm": 0.3507859706878662, "learning_rate": 9.484924433509926e-05, "loss": 0.4227, "step": 1961 }, { "epoch": 1.1757621377493415, "grad_norm": 0.28434979915618896, "learning_rate": 9.484232885107846e-05, "loss": 0.4601, "step": 1962 }, { "epoch": 1.1765148663906662, "grad_norm": 0.25928258895874023, "learning_rate": 9.483540900996049e-05, "loss": 0.3702, "step": 1963 }, { "epoch": 1.1772675950319909, "grad_norm": 0.25946956872940063, "learning_rate": 9.482848481250208e-05, "loss": 0.3794, "step": 1964 }, { "epoch": 1.1780203236733158, "grad_norm": 0.3021222651004791, "learning_rate": 9.482155625946044e-05, "loss": 0.3665, "step": 1965 }, { "epoch": 1.1787730523146407, "grad_norm": 0.24205270409584045, "learning_rate": 9.481462335159329e-05, "loss": 0.3536, "step": 1966 }, { "epoch": 1.1795257809559654, "grad_norm": 0.2689952254295349, "learning_rate": 9.48076860896588e-05, "loss": 0.3345, "step": 1967 }, { "epoch": 1.18027850959729, "grad_norm": 0.2724282443523407, "learning_rate": 9.480074447441559e-05, "loss": 0.3205, "step": 1968 }, { "epoch": 1.181031238238615, "grad_norm": 0.25282660126686096, "learning_rate": 9.479379850662281e-05, "loss": 0.2803, "step": 1969 }, { "epoch": 1.1817839668799397, "grad_norm": 0.2123008370399475, "learning_rate": 9.478684818704006e-05, "loss": 0.3422, "step": 1970 }, { "epoch": 1.1825366955212646, "grad_norm": 0.2571662664413452, "learning_rate": 9.477989351642741e-05, "loss": 0.353, "step": 1971 }, { "epoch": 1.1832894241625893, "grad_norm": 0.25313273072242737, "learning_rate": 9.477293449554542e-05, "loss": 0.3578, "step": 1972 }, { "epoch": 1.1840421528039142, "grad_norm": 0.24672441184520721, "learning_rate": 9.47659711251551e-05, "loss": 0.4336, "step": 1973 }, { "epoch": 1.184794881445239, "grad_norm": 0.27078548073768616, "learning_rate": 9.475900340601796e-05, "loss": 0.4013, "step": 1974 }, { "epoch": 1.1855476100865638, "grad_norm": 0.2414601445198059, "learning_rate": 9.4752031338896e-05, "loss": 0.4182, "step": 1975 }, { "epoch": 1.1863003387278885, "grad_norm": 0.21030642092227936, "learning_rate": 9.474505492455163e-05, "loss": 0.2708, "step": 1976 }, { "epoch": 1.1870530673692135, "grad_norm": 0.2886611819267273, "learning_rate": 9.473807416374784e-05, "loss": 0.3114, "step": 1977 }, { "epoch": 1.1878057960105382, "grad_norm": 0.2673126459121704, "learning_rate": 9.473108905724798e-05, "loss": 0.3586, "step": 1978 }, { "epoch": 1.188558524651863, "grad_norm": 0.29196131229400635, "learning_rate": 9.472409960581598e-05, "loss": 0.3214, "step": 1979 }, { "epoch": 1.1893112532931878, "grad_norm": 0.25948411226272583, "learning_rate": 9.471710581021616e-05, "loss": 0.3522, "step": 1980 }, { "epoch": 1.1900639819345127, "grad_norm": 0.21824052929878235, "learning_rate": 9.471010767121337e-05, "loss": 0.4248, "step": 1981 }, { "epoch": 1.1908167105758374, "grad_norm": 0.22691328823566437, "learning_rate": 9.47031051895729e-05, "loss": 0.3453, "step": 1982 }, { "epoch": 1.1915694392171623, "grad_norm": 0.25436869263648987, "learning_rate": 9.469609836606055e-05, "loss": 0.3627, "step": 1983 }, { "epoch": 1.192322167858487, "grad_norm": 0.2755100429058075, "learning_rate": 9.468908720144255e-05, "loss": 0.3225, "step": 1984 }, { "epoch": 1.193074896499812, "grad_norm": 0.26836904883384705, "learning_rate": 9.468207169648565e-05, "loss": 0.2987, "step": 1985 }, { "epoch": 1.1938276251411366, "grad_norm": 0.2337244600057602, "learning_rate": 9.467505185195705e-05, "loss": 0.3789, "step": 1986 }, { "epoch": 1.1945803537824613, "grad_norm": 0.2351912409067154, "learning_rate": 9.466802766862444e-05, "loss": 0.3116, "step": 1987 }, { "epoch": 1.1953330824237862, "grad_norm": 0.2956370413303375, "learning_rate": 9.466099914725594e-05, "loss": 0.3196, "step": 1988 }, { "epoch": 1.1960858110651111, "grad_norm": 0.28035420179367065, "learning_rate": 9.465396628862022e-05, "loss": 0.3089, "step": 1989 }, { "epoch": 1.1968385397064358, "grad_norm": 0.24236591160297394, "learning_rate": 9.464692909348637e-05, "loss": 0.4299, "step": 1990 }, { "epoch": 1.1975912683477605, "grad_norm": 0.26226210594177246, "learning_rate": 9.463988756262397e-05, "loss": 0.3659, "step": 1991 }, { "epoch": 1.1983439969890854, "grad_norm": 0.2771969139575958, "learning_rate": 9.463284169680305e-05, "loss": 0.358, "step": 1992 }, { "epoch": 1.1990967256304101, "grad_norm": 0.2188476324081421, "learning_rate": 9.462579149679414e-05, "loss": 0.3258, "step": 1993 }, { "epoch": 1.199849454271735, "grad_norm": 0.30125904083251953, "learning_rate": 9.461873696336825e-05, "loss": 0.3027, "step": 1994 }, { "epoch": 1.2006021829130598, "grad_norm": 0.28485575318336487, "learning_rate": 9.461167809729687e-05, "loss": 0.3028, "step": 1995 }, { "epoch": 1.2013549115543847, "grad_norm": 0.308557391166687, "learning_rate": 9.460461489935191e-05, "loss": 0.3349, "step": 1996 }, { "epoch": 1.2021076401957094, "grad_norm": 0.23459015786647797, "learning_rate": 9.459754737030582e-05, "loss": 0.3736, "step": 1997 }, { "epoch": 1.2028603688370343, "grad_norm": 0.30737558007240295, "learning_rate": 9.459047551093148e-05, "loss": 0.4134, "step": 1998 }, { "epoch": 1.203613097478359, "grad_norm": 0.24615170061588287, "learning_rate": 9.458339932200228e-05, "loss": 0.3552, "step": 1999 }, { "epoch": 1.204365826119684, "grad_norm": 0.2505848705768585, "learning_rate": 9.4576318804292e-05, "loss": 0.2714, "step": 2000 }, { "epoch": 1.204365826119684, "eval_loss": 0.3565354347229004, "eval_runtime": 456.3952, "eval_samples_per_second": 21.094, "eval_steps_per_second": 0.66, "step": 2000 }, { "epoch": 1.2051185547610086, "grad_norm": 0.24795424938201904, "learning_rate": 9.456923395857503e-05, "loss": 0.3314, "step": 2001 }, { "epoch": 1.2058712834023335, "grad_norm": 0.23128829896450043, "learning_rate": 9.456214478562612e-05, "loss": 0.4002, "step": 2002 }, { "epoch": 1.2066240120436582, "grad_norm": 0.20067931711673737, "learning_rate": 9.455505128622053e-05, "loss": 0.3348, "step": 2003 }, { "epoch": 1.2073767406849831, "grad_norm": 0.2245950549840927, "learning_rate": 9.454795346113402e-05, "loss": 0.3742, "step": 2004 }, { "epoch": 1.2081294693263078, "grad_norm": 0.30939793586730957, "learning_rate": 9.454085131114277e-05, "loss": 0.4473, "step": 2005 }, { "epoch": 1.2088821979676327, "grad_norm": 0.2316824495792389, "learning_rate": 9.453374483702346e-05, "loss": 0.3413, "step": 2006 }, { "epoch": 1.2096349266089574, "grad_norm": 0.20176313817501068, "learning_rate": 9.452663403955325e-05, "loss": 0.3684, "step": 2007 }, { "epoch": 1.2103876552502824, "grad_norm": 0.20748282968997955, "learning_rate": 9.451951891950979e-05, "loss": 0.3681, "step": 2008 }, { "epoch": 1.211140383891607, "grad_norm": 0.22521336376667023, "learning_rate": 9.451239947767115e-05, "loss": 0.2636, "step": 2009 }, { "epoch": 1.211893112532932, "grad_norm": 0.19595028460025787, "learning_rate": 9.45052757148159e-05, "loss": 0.2528, "step": 2010 }, { "epoch": 1.2126458411742567, "grad_norm": 0.23044823110103607, "learning_rate": 9.44981476317231e-05, "loss": 0.3605, "step": 2011 }, { "epoch": 1.2133985698155816, "grad_norm": 0.20656390488147736, "learning_rate": 9.449101522917225e-05, "loss": 0.3424, "step": 2012 }, { "epoch": 1.2141512984569063, "grad_norm": 0.23139584064483643, "learning_rate": 9.448387850794336e-05, "loss": 0.4299, "step": 2013 }, { "epoch": 1.214904027098231, "grad_norm": 0.21051932871341705, "learning_rate": 9.447673746881687e-05, "loss": 0.3826, "step": 2014 }, { "epoch": 1.215656755739556, "grad_norm": 0.2529768943786621, "learning_rate": 9.446959211257374e-05, "loss": 0.4112, "step": 2015 }, { "epoch": 1.2164094843808808, "grad_norm": 0.26120683550834656, "learning_rate": 9.446244243999533e-05, "loss": 0.269, "step": 2016 }, { "epoch": 1.2171622130222055, "grad_norm": 0.22994215786457062, "learning_rate": 9.445528845186357e-05, "loss": 0.2938, "step": 2017 }, { "epoch": 1.2179149416635302, "grad_norm": 0.22575058043003082, "learning_rate": 9.444813014896077e-05, "loss": 0.4147, "step": 2018 }, { "epoch": 1.2186676703048551, "grad_norm": 0.2306651771068573, "learning_rate": 9.444096753206977e-05, "loss": 0.318, "step": 2019 }, { "epoch": 1.2194203989461798, "grad_norm": 0.25292301177978516, "learning_rate": 9.443380060197387e-05, "loss": 0.329, "step": 2020 }, { "epoch": 1.2201731275875047, "grad_norm": 0.2703462541103363, "learning_rate": 9.442662935945681e-05, "loss": 0.4205, "step": 2021 }, { "epoch": 1.2209258562288294, "grad_norm": 0.2602419853210449, "learning_rate": 9.441945380530284e-05, "loss": 0.3249, "step": 2022 }, { "epoch": 1.2216785848701543, "grad_norm": 0.2566392421722412, "learning_rate": 9.441227394029668e-05, "loss": 0.3575, "step": 2023 }, { "epoch": 1.222431313511479, "grad_norm": 0.2683957815170288, "learning_rate": 9.44050897652235e-05, "loss": 0.2953, "step": 2024 }, { "epoch": 1.223184042152804, "grad_norm": 0.24580475687980652, "learning_rate": 9.439790128086894e-05, "loss": 0.3819, "step": 2025 }, { "epoch": 1.2239367707941287, "grad_norm": 0.24118399620056152, "learning_rate": 9.439070848801912e-05, "loss": 0.3414, "step": 2026 }, { "epoch": 1.2246894994354536, "grad_norm": 0.22909481823444366, "learning_rate": 9.438351138746065e-05, "loss": 0.336, "step": 2027 }, { "epoch": 1.2254422280767783, "grad_norm": 0.2255982607603073, "learning_rate": 9.437630997998059e-05, "loss": 0.2677, "step": 2028 }, { "epoch": 1.2261949567181032, "grad_norm": 0.24564379453659058, "learning_rate": 9.436910426636647e-05, "loss": 0.2953, "step": 2029 }, { "epoch": 1.2269476853594279, "grad_norm": 0.2146703004837036, "learning_rate": 9.436189424740631e-05, "loss": 0.3081, "step": 2030 }, { "epoch": 1.2277004140007528, "grad_norm": 0.21148814260959625, "learning_rate": 9.435467992388855e-05, "loss": 0.3232, "step": 2031 }, { "epoch": 1.2284531426420775, "grad_norm": 0.22534582018852234, "learning_rate": 9.434746129660219e-05, "loss": 0.3224, "step": 2032 }, { "epoch": 1.2292058712834024, "grad_norm": 0.24035534262657166, "learning_rate": 9.43402383663366e-05, "loss": 0.3663, "step": 2033 }, { "epoch": 1.2299585999247271, "grad_norm": 0.23678599298000336, "learning_rate": 9.43330111338817e-05, "loss": 0.3259, "step": 2034 }, { "epoch": 1.230711328566052, "grad_norm": 0.20425991714000702, "learning_rate": 9.432577960002783e-05, "loss": 0.2795, "step": 2035 }, { "epoch": 1.2314640572073767, "grad_norm": 0.20710602402687073, "learning_rate": 9.431854376556585e-05, "loss": 0.3803, "step": 2036 }, { "epoch": 1.2322167858487014, "grad_norm": 0.22664490342140198, "learning_rate": 9.431130363128702e-05, "loss": 0.324, "step": 2037 }, { "epoch": 1.2329695144900263, "grad_norm": 0.22310855984687805, "learning_rate": 9.430405919798311e-05, "loss": 0.3391, "step": 2038 }, { "epoch": 1.2337222431313513, "grad_norm": 0.21741393208503723, "learning_rate": 9.42968104664464e-05, "loss": 0.4798, "step": 2039 }, { "epoch": 1.234474971772676, "grad_norm": 0.24540889263153076, "learning_rate": 9.428955743746959e-05, "loss": 0.4007, "step": 2040 }, { "epoch": 1.2352277004140007, "grad_norm": 0.3217681050300598, "learning_rate": 9.428230011184583e-05, "loss": 0.3384, "step": 2041 }, { "epoch": 1.2359804290553256, "grad_norm": 0.3423099219799042, "learning_rate": 9.427503849036881e-05, "loss": 0.3841, "step": 2042 }, { "epoch": 1.2367331576966503, "grad_norm": 0.2539263665676117, "learning_rate": 9.426777257383261e-05, "loss": 0.4671, "step": 2043 }, { "epoch": 1.2374858863379752, "grad_norm": 0.2604627013206482, "learning_rate": 9.426050236303185e-05, "loss": 0.3146, "step": 2044 }, { "epoch": 1.2382386149792999, "grad_norm": 0.2884140908718109, "learning_rate": 9.425322785876158e-05, "loss": 0.3137, "step": 2045 }, { "epoch": 1.2389913436206248, "grad_norm": 0.3029756247997284, "learning_rate": 9.424594906181732e-05, "loss": 0.3688, "step": 2046 }, { "epoch": 1.2397440722619495, "grad_norm": 0.3808736801147461, "learning_rate": 9.423866597299508e-05, "loss": 0.3165, "step": 2047 }, { "epoch": 1.2404968009032744, "grad_norm": 0.3032066822052002, "learning_rate": 9.423137859309132e-05, "loss": 0.3377, "step": 2048 }, { "epoch": 1.241249529544599, "grad_norm": 0.2554698884487152, "learning_rate": 9.422408692290298e-05, "loss": 0.3825, "step": 2049 }, { "epoch": 1.242002258185924, "grad_norm": 0.297011137008667, "learning_rate": 9.421679096322747e-05, "loss": 0.3024, "step": 2050 }, { "epoch": 1.2427549868272487, "grad_norm": 0.2632114291191101, "learning_rate": 9.420949071486268e-05, "loss": 0.357, "step": 2051 }, { "epoch": 1.2435077154685736, "grad_norm": 0.19163581728935242, "learning_rate": 9.420218617860692e-05, "loss": 0.3209, "step": 2052 }, { "epoch": 1.2442604441098983, "grad_norm": 0.31174924969673157, "learning_rate": 9.419487735525901e-05, "loss": 0.3645, "step": 2053 }, { "epoch": 1.2450131727512233, "grad_norm": 0.27104127407073975, "learning_rate": 9.418756424561826e-05, "loss": 0.4481, "step": 2054 }, { "epoch": 1.245765901392548, "grad_norm": 0.23970849812030792, "learning_rate": 9.418024685048437e-05, "loss": 0.4521, "step": 2055 }, { "epoch": 1.2465186300338729, "grad_norm": 0.24068281054496765, "learning_rate": 9.417292517065762e-05, "loss": 0.3224, "step": 2056 }, { "epoch": 1.2472713586751976, "grad_norm": 0.25929686427116394, "learning_rate": 9.416559920693866e-05, "loss": 0.2531, "step": 2057 }, { "epoch": 1.2480240873165225, "grad_norm": 0.23501215875148773, "learning_rate": 9.415826896012865e-05, "loss": 0.3553, "step": 2058 }, { "epoch": 1.2487768159578472, "grad_norm": 0.23205482959747314, "learning_rate": 9.415093443102924e-05, "loss": 0.3222, "step": 2059 }, { "epoch": 1.249529544599172, "grad_norm": 0.28854116797447205, "learning_rate": 9.414359562044249e-05, "loss": 0.3328, "step": 2060 }, { "epoch": 1.2502822732404968, "grad_norm": 0.2683582603931427, "learning_rate": 9.413625252917098e-05, "loss": 0.3999, "step": 2061 }, { "epoch": 1.2510350018818217, "grad_norm": 0.26253387331962585, "learning_rate": 9.412890515801772e-05, "loss": 0.348, "step": 2062 }, { "epoch": 1.2517877305231464, "grad_norm": 0.2639695107936859, "learning_rate": 9.412155350778622e-05, "loss": 0.4041, "step": 2063 }, { "epoch": 1.252540459164471, "grad_norm": 0.2914212644100189, "learning_rate": 9.411419757928047e-05, "loss": 0.3423, "step": 2064 }, { "epoch": 1.253293187805796, "grad_norm": 0.19542481005191803, "learning_rate": 9.410683737330486e-05, "loss": 0.2811, "step": 2065 }, { "epoch": 1.254045916447121, "grad_norm": 0.24356050789356232, "learning_rate": 9.409947289066431e-05, "loss": 0.2977, "step": 2066 }, { "epoch": 1.2547986450884456, "grad_norm": 0.2475215047597885, "learning_rate": 9.40921041321642e-05, "loss": 0.3606, "step": 2067 }, { "epoch": 1.2555513737297703, "grad_norm": 0.2180887758731842, "learning_rate": 9.408473109861035e-05, "loss": 0.2618, "step": 2068 }, { "epoch": 1.2563041023710952, "grad_norm": 0.2547828257083893, "learning_rate": 9.407735379080908e-05, "loss": 0.3231, "step": 2069 }, { "epoch": 1.25705683101242, "grad_norm": 0.23132608830928802, "learning_rate": 9.406997220956713e-05, "loss": 0.3735, "step": 2070 }, { "epoch": 1.2578095596537449, "grad_norm": 0.25612178444862366, "learning_rate": 9.406258635569179e-05, "loss": 0.3969, "step": 2071 }, { "epoch": 1.2585622882950696, "grad_norm": 0.21632814407348633, "learning_rate": 9.405519622999072e-05, "loss": 0.2649, "step": 2072 }, { "epoch": 1.2593150169363945, "grad_norm": 0.22698312997817993, "learning_rate": 9.40478018332721e-05, "loss": 0.2681, "step": 2073 }, { "epoch": 1.2600677455777192, "grad_norm": 0.2496870756149292, "learning_rate": 9.404040316634459e-05, "loss": 0.2673, "step": 2074 }, { "epoch": 1.260820474219044, "grad_norm": 0.27923786640167236, "learning_rate": 9.403300023001728e-05, "loss": 0.3092, "step": 2075 }, { "epoch": 1.2615732028603688, "grad_norm": 0.2688889801502228, "learning_rate": 9.402559302509975e-05, "loss": 0.328, "step": 2076 }, { "epoch": 1.2623259315016937, "grad_norm": 0.22418636083602905, "learning_rate": 9.401818155240205e-05, "loss": 0.4018, "step": 2077 }, { "epoch": 1.2630786601430184, "grad_norm": 0.26062074303627014, "learning_rate": 9.401076581273468e-05, "loss": 0.3279, "step": 2078 }, { "epoch": 1.2638313887843433, "grad_norm": 0.24725520610809326, "learning_rate": 9.400334580690862e-05, "loss": 0.3037, "step": 2079 }, { "epoch": 1.264584117425668, "grad_norm": 0.1999514102935791, "learning_rate": 9.399592153573528e-05, "loss": 0.3009, "step": 2080 }, { "epoch": 1.265336846066993, "grad_norm": 0.2315671443939209, "learning_rate": 9.39884930000266e-05, "loss": 0.3415, "step": 2081 }, { "epoch": 1.2660895747083176, "grad_norm": 0.26525792479515076, "learning_rate": 9.398106020059494e-05, "loss": 0.3146, "step": 2082 }, { "epoch": 1.2668423033496423, "grad_norm": 0.23200149834156036, "learning_rate": 9.397362313825315e-05, "loss": 0.3293, "step": 2083 }, { "epoch": 1.2675950319909672, "grad_norm": 0.22877711057662964, "learning_rate": 9.396618181381452e-05, "loss": 0.264, "step": 2084 }, { "epoch": 1.2683477606322922, "grad_norm": 0.22888940572738647, "learning_rate": 9.395873622809284e-05, "loss": 0.3374, "step": 2085 }, { "epoch": 1.2691004892736168, "grad_norm": 0.24960963428020477, "learning_rate": 9.395128638190233e-05, "loss": 0.2915, "step": 2086 }, { "epoch": 1.2698532179149415, "grad_norm": 0.21764707565307617, "learning_rate": 9.394383227605771e-05, "loss": 0.3403, "step": 2087 }, { "epoch": 1.2706059465562665, "grad_norm": 0.25193145871162415, "learning_rate": 9.393637391137416e-05, "loss": 0.3366, "step": 2088 }, { "epoch": 1.2713586751975914, "grad_norm": 0.2218022644519806, "learning_rate": 9.392891128866727e-05, "loss": 0.3356, "step": 2089 }, { "epoch": 1.272111403838916, "grad_norm": 0.21824078261852264, "learning_rate": 9.392144440875319e-05, "loss": 0.3107, "step": 2090 }, { "epoch": 1.2728641324802408, "grad_norm": 0.26876407861709595, "learning_rate": 9.391397327244847e-05, "loss": 0.405, "step": 2091 }, { "epoch": 1.2736168611215657, "grad_norm": 0.23049265146255493, "learning_rate": 9.390649788057012e-05, "loss": 0.2975, "step": 2092 }, { "epoch": 1.2743695897628906, "grad_norm": 0.17891544103622437, "learning_rate": 9.389901823393567e-05, "loss": 0.3224, "step": 2093 }, { "epoch": 1.2751223184042153, "grad_norm": 0.23704472184181213, "learning_rate": 9.389153433336306e-05, "loss": 0.3741, "step": 2094 }, { "epoch": 1.27587504704554, "grad_norm": 0.24006801843643188, "learning_rate": 9.388404617967075e-05, "loss": 0.3442, "step": 2095 }, { "epoch": 1.276627775686865, "grad_norm": 0.2545023262500763, "learning_rate": 9.387655377367758e-05, "loss": 0.4204, "step": 2096 }, { "epoch": 1.2773805043281896, "grad_norm": 0.316413938999176, "learning_rate": 9.386905711620298e-05, "loss": 0.3274, "step": 2097 }, { "epoch": 1.2781332329695145, "grad_norm": 0.26969030499458313, "learning_rate": 9.386155620806671e-05, "loss": 0.4161, "step": 2098 }, { "epoch": 1.2788859616108392, "grad_norm": 0.2990574538707733, "learning_rate": 9.385405105008907e-05, "loss": 0.3864, "step": 2099 }, { "epoch": 1.2796386902521641, "grad_norm": 0.2314896285533905, "learning_rate": 9.384654164309083e-05, "loss": 0.3431, "step": 2100 }, { "epoch": 1.2803914188934888, "grad_norm": 0.28819701075553894, "learning_rate": 9.38390279878932e-05, "loss": 0.3484, "step": 2101 }, { "epoch": 1.2811441475348138, "grad_norm": 0.2338038682937622, "learning_rate": 9.383151008531786e-05, "loss": 0.3129, "step": 2102 }, { "epoch": 1.2818968761761385, "grad_norm": 0.2686246931552887, "learning_rate": 9.382398793618697e-05, "loss": 0.3425, "step": 2103 }, { "epoch": 1.2826496048174634, "grad_norm": 0.21517272293567657, "learning_rate": 9.381646154132312e-05, "loss": 0.3407, "step": 2104 }, { "epoch": 1.283402333458788, "grad_norm": 0.25315144658088684, "learning_rate": 9.38089309015494e-05, "loss": 0.334, "step": 2105 }, { "epoch": 1.284155062100113, "grad_norm": 0.23259314894676208, "learning_rate": 9.380139601768935e-05, "loss": 0.3564, "step": 2106 }, { "epoch": 1.2849077907414377, "grad_norm": 0.2404923141002655, "learning_rate": 9.379385689056697e-05, "loss": 0.4025, "step": 2107 }, { "epoch": 1.2856605193827626, "grad_norm": 0.2349364012479782, "learning_rate": 9.378631352100672e-05, "loss": 0.3507, "step": 2108 }, { "epoch": 1.2864132480240873, "grad_norm": 0.2192913293838501, "learning_rate": 9.377876590983353e-05, "loss": 0.2855, "step": 2109 }, { "epoch": 1.287165976665412, "grad_norm": 0.20424790680408478, "learning_rate": 9.377121405787282e-05, "loss": 0.452, "step": 2110 }, { "epoch": 1.287918705306737, "grad_norm": 0.2880214750766754, "learning_rate": 9.376365796595042e-05, "loss": 0.32, "step": 2111 }, { "epoch": 1.2886714339480618, "grad_norm": 0.2506980001926422, "learning_rate": 9.375609763489269e-05, "loss": 0.3933, "step": 2112 }, { "epoch": 1.2894241625893865, "grad_norm": 0.22485823929309845, "learning_rate": 9.374853306552636e-05, "loss": 0.3441, "step": 2113 }, { "epoch": 1.2901768912307112, "grad_norm": 0.21517890691757202, "learning_rate": 9.374096425867872e-05, "loss": 0.2784, "step": 2114 }, { "epoch": 1.2909296198720361, "grad_norm": 0.30050569772720337, "learning_rate": 9.373339121517747e-05, "loss": 0.4395, "step": 2115 }, { "epoch": 1.291682348513361, "grad_norm": 0.2680723965167999, "learning_rate": 9.372581393585081e-05, "loss": 0.4131, "step": 2116 }, { "epoch": 1.2924350771546858, "grad_norm": 0.2565757930278778, "learning_rate": 9.371823242152734e-05, "loss": 0.4231, "step": 2117 }, { "epoch": 1.2931878057960104, "grad_norm": 0.23987513780593872, "learning_rate": 9.371064667303619e-05, "loss": 0.3299, "step": 2118 }, { "epoch": 1.2939405344373354, "grad_norm": 0.2675899565219879, "learning_rate": 9.370305669120693e-05, "loss": 0.3291, "step": 2119 }, { "epoch": 1.29469326307866, "grad_norm": 0.2535725235939026, "learning_rate": 9.369546247686956e-05, "loss": 0.2893, "step": 2120 }, { "epoch": 1.295445991719985, "grad_norm": 0.21275191009044647, "learning_rate": 9.368786403085462e-05, "loss": 0.2769, "step": 2121 }, { "epoch": 1.2961987203613097, "grad_norm": 0.23695705831050873, "learning_rate": 9.368026135399301e-05, "loss": 0.3808, "step": 2122 }, { "epoch": 1.2969514490026346, "grad_norm": 0.27758046984672546, "learning_rate": 9.367265444711619e-05, "loss": 0.3051, "step": 2123 }, { "epoch": 1.2977041776439593, "grad_norm": 0.2283145785331726, "learning_rate": 9.366504331105601e-05, "loss": 0.3829, "step": 2124 }, { "epoch": 1.2984569062852842, "grad_norm": 0.25618910789489746, "learning_rate": 9.365742794664484e-05, "loss": 0.2959, "step": 2125 }, { "epoch": 1.299209634926609, "grad_norm": 0.2624264061450958, "learning_rate": 9.364980835471546e-05, "loss": 0.3032, "step": 2126 }, { "epoch": 1.2999623635679338, "grad_norm": 0.2316112518310547, "learning_rate": 9.364218453610116e-05, "loss": 0.3172, "step": 2127 }, { "epoch": 1.3007150922092585, "grad_norm": 0.2343965321779251, "learning_rate": 9.363455649163564e-05, "loss": 0.3319, "step": 2128 }, { "epoch": 1.3014678208505834, "grad_norm": 0.3077641725540161, "learning_rate": 9.362692422215312e-05, "loss": 0.4086, "step": 2129 }, { "epoch": 1.3022205494919081, "grad_norm": 0.3191680312156677, "learning_rate": 9.361928772848824e-05, "loss": 0.2852, "step": 2130 }, { "epoch": 1.302973278133233, "grad_norm": 0.2095932811498642, "learning_rate": 9.361164701147612e-05, "loss": 0.335, "step": 2131 }, { "epoch": 1.3037260067745577, "grad_norm": 0.32169461250305176, "learning_rate": 9.360400207195232e-05, "loss": 0.3306, "step": 2132 }, { "epoch": 1.3044787354158824, "grad_norm": 0.4343999922275543, "learning_rate": 9.35963529107529e-05, "loss": 0.3533, "step": 2133 }, { "epoch": 1.3052314640572074, "grad_norm": 0.3526364862918854, "learning_rate": 9.358869952871436e-05, "loss": 0.2909, "step": 2134 }, { "epoch": 1.3059841926985323, "grad_norm": 0.2900776267051697, "learning_rate": 9.358104192667365e-05, "loss": 0.2737, "step": 2135 }, { "epoch": 1.306736921339857, "grad_norm": 0.2524682581424713, "learning_rate": 9.357338010546821e-05, "loss": 0.3452, "step": 2136 }, { "epoch": 1.3074896499811817, "grad_norm": 0.2342461794614792, "learning_rate": 9.35657140659359e-05, "loss": 0.2778, "step": 2137 }, { "epoch": 1.3082423786225066, "grad_norm": 0.27111905813217163, "learning_rate": 9.35580438089151e-05, "loss": 0.3196, "step": 2138 }, { "epoch": 1.3089951072638315, "grad_norm": 0.30895522236824036, "learning_rate": 9.355036933524458e-05, "loss": 0.3556, "step": 2139 }, { "epoch": 1.3097478359051562, "grad_norm": 0.26443973183631897, "learning_rate": 9.354269064576366e-05, "loss": 0.2803, "step": 2140 }, { "epoch": 1.310500564546481, "grad_norm": 0.24280136823654175, "learning_rate": 9.3535007741312e-05, "loss": 0.2944, "step": 2141 }, { "epoch": 1.3112532931878058, "grad_norm": 0.23883363604545593, "learning_rate": 9.352732062272988e-05, "loss": 0.3195, "step": 2142 }, { "epoch": 1.3120060218291307, "grad_norm": 0.32293686270713806, "learning_rate": 9.351962929085786e-05, "loss": 0.3877, "step": 2143 }, { "epoch": 1.3127587504704554, "grad_norm": 0.21442118287086487, "learning_rate": 9.35119337465371e-05, "loss": 0.2363, "step": 2144 }, { "epoch": 1.3135114791117801, "grad_norm": 0.33590859174728394, "learning_rate": 9.350423399060917e-05, "loss": 0.3818, "step": 2145 }, { "epoch": 1.314264207753105, "grad_norm": 0.27886027097702026, "learning_rate": 9.349653002391611e-05, "loss": 0.3815, "step": 2146 }, { "epoch": 1.3150169363944297, "grad_norm": 0.27820172905921936, "learning_rate": 9.34888218473004e-05, "loss": 0.3764, "step": 2147 }, { "epoch": 1.3157696650357547, "grad_norm": 0.2572477459907532, "learning_rate": 9.3481109461605e-05, "loss": 0.3461, "step": 2148 }, { "epoch": 1.3165223936770794, "grad_norm": 0.26011690497398376, "learning_rate": 9.347339286767333e-05, "loss": 0.3179, "step": 2149 }, { "epoch": 1.3172751223184043, "grad_norm": 0.23709024488925934, "learning_rate": 9.346567206634927e-05, "loss": 0.3402, "step": 2150 }, { "epoch": 1.318027850959729, "grad_norm": 0.2321607619524002, "learning_rate": 9.345794705847713e-05, "loss": 0.2545, "step": 2151 }, { "epoch": 1.3187805796010539, "grad_norm": 0.2939753830432892, "learning_rate": 9.345021784490173e-05, "loss": 0.3131, "step": 2152 }, { "epoch": 1.3195333082423786, "grad_norm": 0.23859034478664398, "learning_rate": 9.344248442646829e-05, "loss": 0.313, "step": 2153 }, { "epoch": 1.3202860368837035, "grad_norm": 0.2159372866153717, "learning_rate": 9.343474680402255e-05, "loss": 0.3801, "step": 2154 }, { "epoch": 1.3210387655250282, "grad_norm": 0.2607409656047821, "learning_rate": 9.342700497841072e-05, "loss": 0.3797, "step": 2155 }, { "epoch": 1.321791494166353, "grad_norm": 0.24499258399009705, "learning_rate": 9.341925895047937e-05, "loss": 0.3169, "step": 2156 }, { "epoch": 1.3225442228076778, "grad_norm": 0.24547258019447327, "learning_rate": 9.341150872107564e-05, "loss": 0.3383, "step": 2157 }, { "epoch": 1.3232969514490027, "grad_norm": 0.2218824028968811, "learning_rate": 9.340375429104706e-05, "loss": 0.3502, "step": 2158 }, { "epoch": 1.3240496800903274, "grad_norm": 0.25836846232414246, "learning_rate": 9.339599566124166e-05, "loss": 0.4088, "step": 2159 }, { "epoch": 1.3248024087316521, "grad_norm": 0.23708505928516388, "learning_rate": 9.338823283250788e-05, "loss": 0.3212, "step": 2160 }, { "epoch": 1.325555137372977, "grad_norm": 0.2429940402507782, "learning_rate": 9.338046580569469e-05, "loss": 0.4381, "step": 2161 }, { "epoch": 1.326307866014302, "grad_norm": 0.30707553029060364, "learning_rate": 9.337269458165147e-05, "loss": 0.3249, "step": 2162 }, { "epoch": 1.3270605946556266, "grad_norm": 0.2581847012042999, "learning_rate": 9.336491916122806e-05, "loss": 0.3317, "step": 2163 }, { "epoch": 1.3278133232969513, "grad_norm": 0.26411598920822144, "learning_rate": 9.335713954527476e-05, "loss": 0.3497, "step": 2164 }, { "epoch": 1.3285660519382763, "grad_norm": 0.25517794489860535, "learning_rate": 9.334935573464236e-05, "loss": 0.2802, "step": 2165 }, { "epoch": 1.3293187805796012, "grad_norm": 0.3605230450630188, "learning_rate": 9.334156773018207e-05, "loss": 0.4171, "step": 2166 }, { "epoch": 1.3300715092209259, "grad_norm": 0.2707718014717102, "learning_rate": 9.333377553274558e-05, "loss": 0.2751, "step": 2167 }, { "epoch": 1.3308242378622506, "grad_norm": 0.3628377914428711, "learning_rate": 9.332597914318502e-05, "loss": 0.399, "step": 2168 }, { "epoch": 1.3315769665035755, "grad_norm": 0.2508169412612915, "learning_rate": 9.331817856235302e-05, "loss": 0.327, "step": 2169 }, { "epoch": 1.3323296951449002, "grad_norm": 0.2683089077472687, "learning_rate": 9.331037379110262e-05, "loss": 0.4096, "step": 2170 }, { "epoch": 1.333082423786225, "grad_norm": 0.25523436069488525, "learning_rate": 9.330256483028733e-05, "loss": 0.4804, "step": 2171 }, { "epoch": 1.3338351524275498, "grad_norm": 0.22862595319747925, "learning_rate": 9.329475168076114e-05, "loss": 0.3532, "step": 2172 }, { "epoch": 1.3345878810688747, "grad_norm": 0.2588556408882141, "learning_rate": 9.328693434337849e-05, "loss": 0.2762, "step": 2173 }, { "epoch": 1.3353406097101994, "grad_norm": 0.2451566606760025, "learning_rate": 9.327911281899424e-05, "loss": 0.408, "step": 2174 }, { "epoch": 1.3360933383515243, "grad_norm": 0.26131075620651245, "learning_rate": 9.327128710846379e-05, "loss": 0.2405, "step": 2175 }, { "epoch": 1.336846066992849, "grad_norm": 0.2596541941165924, "learning_rate": 9.326345721264293e-05, "loss": 0.3006, "step": 2176 }, { "epoch": 1.337598795634174, "grad_norm": 0.33668696880340576, "learning_rate": 9.32556231323879e-05, "loss": 0.3691, "step": 2177 }, { "epoch": 1.3383515242754986, "grad_norm": 0.24451009929180145, "learning_rate": 9.324778486855543e-05, "loss": 0.2786, "step": 2178 }, { "epoch": 1.3391042529168236, "grad_norm": 0.24214144051074982, "learning_rate": 9.323994242200273e-05, "loss": 0.3704, "step": 2179 }, { "epoch": 1.3398569815581483, "grad_norm": 0.26771050691604614, "learning_rate": 9.323209579358741e-05, "loss": 0.4121, "step": 2180 }, { "epoch": 1.3406097101994732, "grad_norm": 0.2689386308193207, "learning_rate": 9.322424498416757e-05, "loss": 0.2798, "step": 2181 }, { "epoch": 1.3413624388407979, "grad_norm": 0.21417716145515442, "learning_rate": 9.321638999460178e-05, "loss": 0.306, "step": 2182 }, { "epoch": 1.3421151674821226, "grad_norm": 0.26504626870155334, "learning_rate": 9.320853082574904e-05, "loss": 0.3278, "step": 2183 }, { "epoch": 1.3428678961234475, "grad_norm": 0.2268092781305313, "learning_rate": 9.32006674784688e-05, "loss": 0.3641, "step": 2184 }, { "epoch": 1.3436206247647724, "grad_norm": 0.2557710111141205, "learning_rate": 9.319279995362102e-05, "loss": 0.4359, "step": 2185 }, { "epoch": 1.344373353406097, "grad_norm": 0.26443547010421753, "learning_rate": 9.318492825206604e-05, "loss": 0.3193, "step": 2186 }, { "epoch": 1.3451260820474218, "grad_norm": 0.23638135194778442, "learning_rate": 9.317705237466472e-05, "loss": 0.2905, "step": 2187 }, { "epoch": 1.3458788106887467, "grad_norm": 0.23804926872253418, "learning_rate": 9.316917232227837e-05, "loss": 0.4609, "step": 2188 }, { "epoch": 1.3466315393300716, "grad_norm": 0.24030812084674835, "learning_rate": 9.316128809576869e-05, "loss": 0.2798, "step": 2189 }, { "epoch": 1.3473842679713963, "grad_norm": 0.2223653346300125, "learning_rate": 9.315339969599793e-05, "loss": 0.3638, "step": 2190 }, { "epoch": 1.348136996612721, "grad_norm": 0.21767067909240723, "learning_rate": 9.314550712382875e-05, "loss": 0.4369, "step": 2191 }, { "epoch": 1.348889725254046, "grad_norm": 0.2591375708580017, "learning_rate": 9.313761038012425e-05, "loss": 0.298, "step": 2192 }, { "epoch": 1.3496424538953709, "grad_norm": 0.27193838357925415, "learning_rate": 9.312970946574803e-05, "loss": 0.3616, "step": 2193 }, { "epoch": 1.3503951825366955, "grad_norm": 0.23306429386138916, "learning_rate": 9.31218043815641e-05, "loss": 0.2872, "step": 2194 }, { "epoch": 1.3511479111780202, "grad_norm": 0.22239936888217926, "learning_rate": 9.311389512843696e-05, "loss": 0.2853, "step": 2195 }, { "epoch": 1.3519006398193452, "grad_norm": 0.20678818225860596, "learning_rate": 9.310598170723156e-05, "loss": 0.2854, "step": 2196 }, { "epoch": 1.3526533684606699, "grad_norm": 0.22847089171409607, "learning_rate": 9.30980641188133e-05, "loss": 0.358, "step": 2197 }, { "epoch": 1.3534060971019948, "grad_norm": 0.23184525966644287, "learning_rate": 9.3090142364048e-05, "loss": 0.3722, "step": 2198 }, { "epoch": 1.3541588257433195, "grad_norm": 0.22172079980373383, "learning_rate": 9.308221644380202e-05, "loss": 0.3933, "step": 2199 }, { "epoch": 1.3549115543846444, "grad_norm": 0.22665448486804962, "learning_rate": 9.30742863589421e-05, "loss": 0.3006, "step": 2200 }, { "epoch": 1.3549115543846444, "eval_loss": 0.3358829617500305, "eval_runtime": 456.4522, "eval_samples_per_second": 21.091, "eval_steps_per_second": 0.659, "step": 2200 }, { "epoch": 1.355664283025969, "grad_norm": 0.19347183406352997, "learning_rate": 9.306635211033547e-05, "loss": 0.2135, "step": 2201 }, { "epoch": 1.356417011667294, "grad_norm": 0.22868503630161285, "learning_rate": 9.30584136988498e-05, "loss": 0.2105, "step": 2202 }, { "epoch": 1.3571697403086187, "grad_norm": 0.22033317387104034, "learning_rate": 9.305047112535322e-05, "loss": 0.398, "step": 2203 }, { "epoch": 1.3579224689499436, "grad_norm": 0.22077685594558716, "learning_rate": 9.304252439071434e-05, "loss": 0.3945, "step": 2204 }, { "epoch": 1.3586751975912683, "grad_norm": 0.2239653617143631, "learning_rate": 9.303457349580219e-05, "loss": 0.3762, "step": 2205 }, { "epoch": 1.3594279262325932, "grad_norm": 0.222796231508255, "learning_rate": 9.302661844148625e-05, "loss": 0.2657, "step": 2206 }, { "epoch": 1.360180654873918, "grad_norm": 0.2962053418159485, "learning_rate": 9.30186592286365e-05, "loss": 0.3201, "step": 2207 }, { "epoch": 1.3609333835152428, "grad_norm": 0.2548734247684479, "learning_rate": 9.301069585812334e-05, "loss": 0.3087, "step": 2208 }, { "epoch": 1.3616861121565675, "grad_norm": 0.23188801109790802, "learning_rate": 9.300272833081763e-05, "loss": 0.3915, "step": 2209 }, { "epoch": 1.3624388407978922, "grad_norm": 0.41651850938796997, "learning_rate": 9.299475664759069e-05, "loss": 0.3224, "step": 2210 }, { "epoch": 1.3631915694392172, "grad_norm": 0.2294134497642517, "learning_rate": 9.298678080931427e-05, "loss": 0.2295, "step": 2211 }, { "epoch": 1.363944298080542, "grad_norm": 0.5338090062141418, "learning_rate": 9.297880081686064e-05, "loss": 0.3383, "step": 2212 }, { "epoch": 1.3646970267218668, "grad_norm": 0.25546714663505554, "learning_rate": 9.29708166711024e-05, "loss": 0.2968, "step": 2213 }, { "epoch": 1.3654497553631915, "grad_norm": 0.299757182598114, "learning_rate": 9.29628283729128e-05, "loss": 0.2964, "step": 2214 }, { "epoch": 1.3662024840045164, "grad_norm": 0.2950575351715088, "learning_rate": 9.295483592316534e-05, "loss": 0.2755, "step": 2215 }, { "epoch": 1.3669552126458413, "grad_norm": 0.27366188168525696, "learning_rate": 9.294683932273408e-05, "loss": 0.3748, "step": 2216 }, { "epoch": 1.367707941287166, "grad_norm": 0.22378894686698914, "learning_rate": 9.293883857249352e-05, "loss": 0.2712, "step": 2217 }, { "epoch": 1.3684606699284907, "grad_norm": 0.27922940254211426, "learning_rate": 9.293083367331863e-05, "loss": 0.3032, "step": 2218 }, { "epoch": 1.3692133985698156, "grad_norm": 0.32656025886535645, "learning_rate": 9.292282462608479e-05, "loss": 0.3345, "step": 2219 }, { "epoch": 1.3699661272111403, "grad_norm": 0.2714836001396179, "learning_rate": 9.291481143166785e-05, "loss": 0.3478, "step": 2220 }, { "epoch": 1.3707188558524652, "grad_norm": 0.23446626961231232, "learning_rate": 9.290679409094417e-05, "loss": 0.273, "step": 2221 }, { "epoch": 1.37147158449379, "grad_norm": 0.27551937103271484, "learning_rate": 9.289877260479046e-05, "loss": 0.3426, "step": 2222 }, { "epoch": 1.3722243131351148, "grad_norm": 0.316103458404541, "learning_rate": 9.289074697408396e-05, "loss": 0.4859, "step": 2223 }, { "epoch": 1.3729770417764395, "grad_norm": 0.28574511408805847, "learning_rate": 9.288271719970235e-05, "loss": 0.3509, "step": 2224 }, { "epoch": 1.3737297704177645, "grad_norm": 0.22881989181041718, "learning_rate": 9.287468328252372e-05, "loss": 0.2903, "step": 2225 }, { "epoch": 1.3744824990590891, "grad_norm": 0.2273378223180771, "learning_rate": 9.286664522342668e-05, "loss": 0.3154, "step": 2226 }, { "epoch": 1.375235227700414, "grad_norm": 0.2437431514263153, "learning_rate": 9.285860302329026e-05, "loss": 0.3394, "step": 2227 }, { "epoch": 1.3759879563417388, "grad_norm": 0.23628878593444824, "learning_rate": 9.285055668299391e-05, "loss": 0.3832, "step": 2228 }, { "epoch": 1.3767406849830637, "grad_norm": 0.2851439118385315, "learning_rate": 9.28425062034176e-05, "loss": 0.3188, "step": 2229 }, { "epoch": 1.3774934136243884, "grad_norm": 0.23431459069252014, "learning_rate": 9.283445158544172e-05, "loss": 0.2204, "step": 2230 }, { "epoch": 1.3782461422657133, "grad_norm": 0.2479402869939804, "learning_rate": 9.28263928299471e-05, "loss": 0.3381, "step": 2231 }, { "epoch": 1.378998870907038, "grad_norm": 0.2137289047241211, "learning_rate": 9.2818329937815e-05, "loss": 0.2935, "step": 2232 }, { "epoch": 1.3797515995483627, "grad_norm": 0.22784622013568878, "learning_rate": 9.281026290992724e-05, "loss": 0.3821, "step": 2233 }, { "epoch": 1.3805043281896876, "grad_norm": 0.2505617141723633, "learning_rate": 9.280219174716594e-05, "loss": 0.3942, "step": 2234 }, { "epoch": 1.3812570568310125, "grad_norm": 0.21547222137451172, "learning_rate": 9.27941164504138e-05, "loss": 0.3255, "step": 2235 }, { "epoch": 1.3820097854723372, "grad_norm": 0.21752963960170746, "learning_rate": 9.27860370205539e-05, "loss": 0.2996, "step": 2236 }, { "epoch": 1.382762514113662, "grad_norm": 0.23758849501609802, "learning_rate": 9.27779534584698e-05, "loss": 0.3532, "step": 2237 }, { "epoch": 1.3835152427549868, "grad_norm": 0.21354016661643982, "learning_rate": 9.276986576504552e-05, "loss": 0.3512, "step": 2238 }, { "epoch": 1.3842679713963117, "grad_norm": 0.2278788536787033, "learning_rate": 9.27617739411655e-05, "loss": 0.3332, "step": 2239 }, { "epoch": 1.3850207000376364, "grad_norm": 0.25254175066947937, "learning_rate": 9.275367798771466e-05, "loss": 0.3927, "step": 2240 }, { "epoch": 1.3857734286789611, "grad_norm": 0.24895206093788147, "learning_rate": 9.274557790557835e-05, "loss": 0.3436, "step": 2241 }, { "epoch": 1.386526157320286, "grad_norm": 0.24619179964065552, "learning_rate": 9.27374736956424e-05, "loss": 0.2984, "step": 2242 }, { "epoch": 1.387278885961611, "grad_norm": 0.2299034595489502, "learning_rate": 9.272936535879305e-05, "loss": 0.4265, "step": 2243 }, { "epoch": 1.3880316146029357, "grad_norm": 0.2242114245891571, "learning_rate": 9.272125289591703e-05, "loss": 0.3521, "step": 2244 }, { "epoch": 1.3887843432442604, "grad_norm": 0.21658995747566223, "learning_rate": 9.271313630790152e-05, "loss": 0.347, "step": 2245 }, { "epoch": 1.3895370718855853, "grad_norm": 0.24740341305732727, "learning_rate": 9.270501559563412e-05, "loss": 0.3127, "step": 2246 }, { "epoch": 1.39028980052691, "grad_norm": 0.258242130279541, "learning_rate": 9.26968907600029e-05, "loss": 0.3377, "step": 2247 }, { "epoch": 1.391042529168235, "grad_norm": 0.1934036761522293, "learning_rate": 9.268876180189639e-05, "loss": 0.3702, "step": 2248 }, { "epoch": 1.3917952578095596, "grad_norm": 0.21460671722888947, "learning_rate": 9.268062872220353e-05, "loss": 0.2607, "step": 2249 }, { "epoch": 1.3925479864508845, "grad_norm": 0.2211165577173233, "learning_rate": 9.267249152181379e-05, "loss": 0.4197, "step": 2250 }, { "epoch": 1.3933007150922092, "grad_norm": 0.22512850165367126, "learning_rate": 9.2664350201617e-05, "loss": 0.2838, "step": 2251 }, { "epoch": 1.3940534437335341, "grad_norm": 0.2377622276544571, "learning_rate": 9.265620476250352e-05, "loss": 0.3302, "step": 2252 }, { "epoch": 1.3948061723748588, "grad_norm": 0.2406497746706009, "learning_rate": 9.26480552053641e-05, "loss": 0.422, "step": 2253 }, { "epoch": 1.3955589010161837, "grad_norm": 0.28919297456741333, "learning_rate": 9.263990153108994e-05, "loss": 0.3112, "step": 2254 }, { "epoch": 1.3963116296575084, "grad_norm": 0.22498033940792084, "learning_rate": 9.263174374057276e-05, "loss": 0.2857, "step": 2255 }, { "epoch": 1.3970643582988334, "grad_norm": 0.21776430308818817, "learning_rate": 9.262358183470467e-05, "loss": 0.4369, "step": 2256 }, { "epoch": 1.397817086940158, "grad_norm": 0.21669796109199524, "learning_rate": 9.261541581437822e-05, "loss": 0.2593, "step": 2257 }, { "epoch": 1.398569815581483, "grad_norm": 0.23525598645210266, "learning_rate": 9.260724568048644e-05, "loss": 0.324, "step": 2258 }, { "epoch": 1.3993225442228077, "grad_norm": 0.20519563555717468, "learning_rate": 9.259907143392284e-05, "loss": 0.3425, "step": 2259 }, { "epoch": 1.4000752728641324, "grad_norm": 0.188249871134758, "learning_rate": 9.259089307558131e-05, "loss": 0.2415, "step": 2260 }, { "epoch": 1.4008280015054573, "grad_norm": 0.1969917267560959, "learning_rate": 9.258271060635623e-05, "loss": 0.2359, "step": 2261 }, { "epoch": 1.4015807301467822, "grad_norm": 0.2488289177417755, "learning_rate": 9.257452402714242e-05, "loss": 0.2825, "step": 2262 }, { "epoch": 1.402333458788107, "grad_norm": 0.20348218083381653, "learning_rate": 9.256633333883515e-05, "loss": 0.3468, "step": 2263 }, { "epoch": 1.4030861874294316, "grad_norm": 0.21287663280963898, "learning_rate": 9.255813854233016e-05, "loss": 0.3501, "step": 2264 }, { "epoch": 1.4038389160707565, "grad_norm": 0.23406493663787842, "learning_rate": 9.254993963852359e-05, "loss": 0.3184, "step": 2265 }, { "epoch": 1.4045916447120814, "grad_norm": 0.2316550612449646, "learning_rate": 9.25417366283121e-05, "loss": 0.3658, "step": 2266 }, { "epoch": 1.4053443733534061, "grad_norm": 0.2125963717699051, "learning_rate": 9.253352951259271e-05, "loss": 0.4348, "step": 2267 }, { "epoch": 1.4060971019947308, "grad_norm": 0.22729456424713135, "learning_rate": 9.252531829226297e-05, "loss": 0.2443, "step": 2268 }, { "epoch": 1.4068498306360557, "grad_norm": 0.2619558274745941, "learning_rate": 9.251710296822085e-05, "loss": 0.268, "step": 2269 }, { "epoch": 1.4076025592773804, "grad_norm": 0.2584712505340576, "learning_rate": 9.250888354136475e-05, "loss": 0.311, "step": 2270 }, { "epoch": 1.4083552879187053, "grad_norm": 0.2292400449514389, "learning_rate": 9.250066001259353e-05, "loss": 0.3304, "step": 2271 }, { "epoch": 1.40910801656003, "grad_norm": 0.22524097561836243, "learning_rate": 9.249243238280653e-05, "loss": 0.3705, "step": 2272 }, { "epoch": 1.409860745201355, "grad_norm": 0.23808623850345612, "learning_rate": 9.248420065290348e-05, "loss": 0.2696, "step": 2273 }, { "epoch": 1.4106134738426797, "grad_norm": 0.24864903092384338, "learning_rate": 9.247596482378461e-05, "loss": 0.2745, "step": 2274 }, { "epoch": 1.4113662024840046, "grad_norm": 0.26698771119117737, "learning_rate": 9.246772489635057e-05, "loss": 0.3291, "step": 2275 }, { "epoch": 1.4121189311253293, "grad_norm": 0.22589780390262604, "learning_rate": 9.245948087150245e-05, "loss": 0.2582, "step": 2276 }, { "epoch": 1.4128716597666542, "grad_norm": 0.2015070915222168, "learning_rate": 9.245123275014185e-05, "loss": 0.2799, "step": 2277 }, { "epoch": 1.4136243884079789, "grad_norm": 0.2258615791797638, "learning_rate": 9.244298053317074e-05, "loss": 0.3246, "step": 2278 }, { "epoch": 1.4143771170493038, "grad_norm": 0.24816779792308807, "learning_rate": 9.243472422149155e-05, "loss": 0.3798, "step": 2279 }, { "epoch": 1.4151298456906285, "grad_norm": 0.25311121344566345, "learning_rate": 9.242646381600722e-05, "loss": 0.3653, "step": 2280 }, { "epoch": 1.4158825743319534, "grad_norm": 0.3033072054386139, "learning_rate": 9.241819931762108e-05, "loss": 0.2158, "step": 2281 }, { "epoch": 1.416635302973278, "grad_norm": 0.28220850229263306, "learning_rate": 9.240993072723691e-05, "loss": 0.2412, "step": 2282 }, { "epoch": 1.4173880316146028, "grad_norm": 0.27173885703086853, "learning_rate": 9.240165804575897e-05, "loss": 0.334, "step": 2283 }, { "epoch": 1.4181407602559277, "grad_norm": 0.27110421657562256, "learning_rate": 9.239338127409192e-05, "loss": 0.28, "step": 2284 }, { "epoch": 1.4188934888972526, "grad_norm": 0.2375408411026001, "learning_rate": 9.238510041314094e-05, "loss": 0.3398, "step": 2285 }, { "epoch": 1.4196462175385773, "grad_norm": 0.29758602380752563, "learning_rate": 9.237681546381157e-05, "loss": 0.3508, "step": 2286 }, { "epoch": 1.420398946179902, "grad_norm": 0.3295803368091583, "learning_rate": 9.236852642700987e-05, "loss": 0.255, "step": 2287 }, { "epoch": 1.421151674821227, "grad_norm": 0.24135807156562805, "learning_rate": 9.236023330364229e-05, "loss": 0.1965, "step": 2288 }, { "epoch": 1.4219044034625519, "grad_norm": 0.3556971549987793, "learning_rate": 9.235193609461576e-05, "loss": 0.2965, "step": 2289 }, { "epoch": 1.4226571321038766, "grad_norm": 0.35847896337509155, "learning_rate": 9.234363480083768e-05, "loss": 0.2721, "step": 2290 }, { "epoch": 1.4234098607452013, "grad_norm": 0.24697661399841309, "learning_rate": 9.233532942321581e-05, "loss": 0.2409, "step": 2291 }, { "epoch": 1.4241625893865262, "grad_norm": 0.32677072286605835, "learning_rate": 9.232701996265846e-05, "loss": 0.3272, "step": 2292 }, { "epoch": 1.424915318027851, "grad_norm": 0.35866233706474304, "learning_rate": 9.231870642007434e-05, "loss": 0.3597, "step": 2293 }, { "epoch": 1.4256680466691758, "grad_norm": 0.25059211254119873, "learning_rate": 9.23103887963726e-05, "loss": 0.4711, "step": 2294 }, { "epoch": 1.4264207753105005, "grad_norm": 0.2709040641784668, "learning_rate": 9.230206709246282e-05, "loss": 0.2686, "step": 2295 }, { "epoch": 1.4271735039518254, "grad_norm": 0.28150489926338196, "learning_rate": 9.229374130925506e-05, "loss": 0.3343, "step": 2296 }, { "epoch": 1.42792623259315, "grad_norm": 0.2280421108007431, "learning_rate": 9.228541144765983e-05, "loss": 0.3671, "step": 2297 }, { "epoch": 1.428678961234475, "grad_norm": 0.2647674083709717, "learning_rate": 9.227707750858806e-05, "loss": 0.2943, "step": 2298 }, { "epoch": 1.4294316898757997, "grad_norm": 0.21617922186851501, "learning_rate": 9.226873949295115e-05, "loss": 0.1933, "step": 2299 }, { "epoch": 1.4301844185171246, "grad_norm": 0.2923153340816498, "learning_rate": 9.226039740166091e-05, "loss": 0.3725, "step": 2300 }, { "epoch": 1.4309371471584493, "grad_norm": 0.20660170912742615, "learning_rate": 9.225205123562963e-05, "loss": 0.2219, "step": 2301 }, { "epoch": 1.4316898757997742, "grad_norm": 0.22293592989444733, "learning_rate": 9.224370099577003e-05, "loss": 0.3954, "step": 2302 }, { "epoch": 1.432442604441099, "grad_norm": 0.22829128801822662, "learning_rate": 9.22353466829953e-05, "loss": 0.3288, "step": 2303 }, { "epoch": 1.4331953330824239, "grad_norm": 0.21409843862056732, "learning_rate": 9.222698829821903e-05, "loss": 0.3187, "step": 2304 }, { "epoch": 1.4339480617237486, "grad_norm": 0.18849042057991028, "learning_rate": 9.221862584235528e-05, "loss": 0.3862, "step": 2305 }, { "epoch": 1.4347007903650733, "grad_norm": 0.24328546226024628, "learning_rate": 9.22102593163186e-05, "loss": 0.2402, "step": 2306 }, { "epoch": 1.4354535190063982, "grad_norm": 0.21795372664928436, "learning_rate": 9.220188872102386e-05, "loss": 0.2792, "step": 2307 }, { "epoch": 1.436206247647723, "grad_norm": 0.2051759511232376, "learning_rate": 9.219351405738652e-05, "loss": 0.3294, "step": 2308 }, { "epoch": 1.4369589762890478, "grad_norm": 0.20184940099716187, "learning_rate": 9.218513532632241e-05, "loss": 0.3415, "step": 2309 }, { "epoch": 1.4377117049303725, "grad_norm": 0.216873437166214, "learning_rate": 9.21767525287478e-05, "loss": 0.3415, "step": 2310 }, { "epoch": 1.4384644335716974, "grad_norm": 0.24089068174362183, "learning_rate": 9.216836566557943e-05, "loss": 0.2979, "step": 2311 }, { "epoch": 1.4392171622130223, "grad_norm": 0.29301151633262634, "learning_rate": 9.215997473773448e-05, "loss": 0.4672, "step": 2312 }, { "epoch": 1.439969890854347, "grad_norm": 0.24580562114715576, "learning_rate": 9.215157974613056e-05, "loss": 0.3616, "step": 2313 }, { "epoch": 1.4407226194956717, "grad_norm": 0.20642133057117462, "learning_rate": 9.214318069168572e-05, "loss": 0.2967, "step": 2314 }, { "epoch": 1.4414753481369966, "grad_norm": 0.27267128229141235, "learning_rate": 9.213477757531851e-05, "loss": 0.3081, "step": 2315 }, { "epoch": 1.4422280767783215, "grad_norm": 0.21920955181121826, "learning_rate": 9.212637039794783e-05, "loss": 0.2413, "step": 2316 }, { "epoch": 1.4429808054196462, "grad_norm": 0.3315853178501129, "learning_rate": 9.211795916049311e-05, "loss": 0.3493, "step": 2317 }, { "epoch": 1.443733534060971, "grad_norm": 0.43497851490974426, "learning_rate": 9.210954386387418e-05, "loss": 0.2925, "step": 2318 }, { "epoch": 1.4444862627022959, "grad_norm": 0.28558313846588135, "learning_rate": 9.210112450901134e-05, "loss": 0.2459, "step": 2319 }, { "epoch": 1.4452389913436205, "grad_norm": 0.2806636393070221, "learning_rate": 9.20927010968253e-05, "loss": 0.2949, "step": 2320 }, { "epoch": 1.4459917199849455, "grad_norm": 0.3198384642601013, "learning_rate": 9.208427362823721e-05, "loss": 0.2477, "step": 2321 }, { "epoch": 1.4467444486262702, "grad_norm": 0.22881081700325012, "learning_rate": 9.207584210416875e-05, "loss": 0.4527, "step": 2322 }, { "epoch": 1.447497177267595, "grad_norm": 0.2772582769393921, "learning_rate": 9.206740652554192e-05, "loss": 0.301, "step": 2323 }, { "epoch": 1.4482499059089198, "grad_norm": 0.21580660343170166, "learning_rate": 9.205896689327923e-05, "loss": 0.3407, "step": 2324 }, { "epoch": 1.4490026345502447, "grad_norm": 0.22288572788238525, "learning_rate": 9.205052320830367e-05, "loss": 0.1892, "step": 2325 }, { "epoch": 1.4497553631915694, "grad_norm": 0.2657682001590729, "learning_rate": 9.204207547153858e-05, "loss": 0.2756, "step": 2326 }, { "epoch": 1.4505080918328943, "grad_norm": 0.23408913612365723, "learning_rate": 9.20336236839078e-05, "loss": 0.2574, "step": 2327 }, { "epoch": 1.451260820474219, "grad_norm": 0.21756158769130707, "learning_rate": 9.202516784633563e-05, "loss": 0.2934, "step": 2328 }, { "epoch": 1.452013549115544, "grad_norm": 0.23871316015720367, "learning_rate": 9.201670795974676e-05, "loss": 0.2425, "step": 2329 }, { "epoch": 1.4527662777568686, "grad_norm": 0.21951869130134583, "learning_rate": 9.200824402506635e-05, "loss": 0.3342, "step": 2330 }, { "epoch": 1.4535190063981935, "grad_norm": 0.18909882009029388, "learning_rate": 9.199977604322003e-05, "loss": 0.3271, "step": 2331 }, { "epoch": 1.4542717350395182, "grad_norm": 0.1939571052789688, "learning_rate": 9.199130401513382e-05, "loss": 0.3481, "step": 2332 }, { "epoch": 1.455024463680843, "grad_norm": 0.22373062372207642, "learning_rate": 9.198282794173424e-05, "loss": 0.317, "step": 2333 }, { "epoch": 1.4557771923221678, "grad_norm": 0.2005942165851593, "learning_rate": 9.197434782394818e-05, "loss": 0.266, "step": 2334 }, { "epoch": 1.4565299209634928, "grad_norm": 0.24740159511566162, "learning_rate": 9.196586366270303e-05, "loss": 0.3509, "step": 2335 }, { "epoch": 1.4572826496048175, "grad_norm": 0.20673306286334991, "learning_rate": 9.195737545892662e-05, "loss": 0.2584, "step": 2336 }, { "epoch": 1.4580353782461422, "grad_norm": 0.22184468805789948, "learning_rate": 9.194888321354719e-05, "loss": 0.3324, "step": 2337 }, { "epoch": 1.458788106887467, "grad_norm": 0.20738184452056885, "learning_rate": 9.194038692749345e-05, "loss": 0.2956, "step": 2338 }, { "epoch": 1.459540835528792, "grad_norm": 0.22806833684444427, "learning_rate": 9.193188660169451e-05, "loss": 0.4238, "step": 2339 }, { "epoch": 1.4602935641701167, "grad_norm": 0.23831959068775177, "learning_rate": 9.192338223708001e-05, "loss": 0.3289, "step": 2340 }, { "epoch": 1.4610462928114414, "grad_norm": 0.2558211088180542, "learning_rate": 9.191487383457993e-05, "loss": 0.3831, "step": 2341 }, { "epoch": 1.4617990214527663, "grad_norm": 0.25036755204200745, "learning_rate": 9.190636139512473e-05, "loss": 0.2895, "step": 2342 }, { "epoch": 1.4625517500940912, "grad_norm": 0.2756526470184326, "learning_rate": 9.189784491964536e-05, "loss": 0.4461, "step": 2343 }, { "epoch": 1.463304478735416, "grad_norm": 0.25660496950149536, "learning_rate": 9.188932440907313e-05, "loss": 0.2819, "step": 2344 }, { "epoch": 1.4640572073767406, "grad_norm": 0.22646942734718323, "learning_rate": 9.188079986433985e-05, "loss": 0.2608, "step": 2345 }, { "epoch": 1.4648099360180655, "grad_norm": 0.2734328806400299, "learning_rate": 9.187227128637775e-05, "loss": 0.4595, "step": 2346 }, { "epoch": 1.4655626646593902, "grad_norm": 0.2687399685382843, "learning_rate": 9.18637386761195e-05, "loss": 0.2259, "step": 2347 }, { "epoch": 1.4663153933007151, "grad_norm": 0.26862096786499023, "learning_rate": 9.18552020344982e-05, "loss": 0.2849, "step": 2348 }, { "epoch": 1.4670681219420398, "grad_norm": 0.25728264451026917, "learning_rate": 9.184666136244743e-05, "loss": 0.3126, "step": 2349 }, { "epoch": 1.4678208505833648, "grad_norm": 0.2690734565258026, "learning_rate": 9.183811666090118e-05, "loss": 0.2833, "step": 2350 }, { "epoch": 1.4685735792246895, "grad_norm": 0.2562842071056366, "learning_rate": 9.182956793079384e-05, "loss": 0.4062, "step": 2351 }, { "epoch": 1.4693263078660144, "grad_norm": 0.22396647930145264, "learning_rate": 9.182101517306036e-05, "loss": 0.293, "step": 2352 }, { "epoch": 1.470079036507339, "grad_norm": 0.22153060138225555, "learning_rate": 9.1812458388636e-05, "loss": 0.361, "step": 2353 }, { "epoch": 1.470831765148664, "grad_norm": 0.20247215032577515, "learning_rate": 9.180389757845655e-05, "loss": 0.2558, "step": 2354 }, { "epoch": 1.4715844937899887, "grad_norm": 0.2627957761287689, "learning_rate": 9.179533274345818e-05, "loss": 0.3874, "step": 2355 }, { "epoch": 1.4723372224313134, "grad_norm": 0.21853041648864746, "learning_rate": 9.178676388457756e-05, "loss": 0.2642, "step": 2356 }, { "epoch": 1.4730899510726383, "grad_norm": 0.2268245965242386, "learning_rate": 9.177819100275173e-05, "loss": 0.3842, "step": 2357 }, { "epoch": 1.4738426797139632, "grad_norm": 0.24806080758571625, "learning_rate": 9.176961409891824e-05, "loss": 0.4, "step": 2358 }, { "epoch": 1.474595408355288, "grad_norm": 0.2820099890232086, "learning_rate": 9.176103317401503e-05, "loss": 0.324, "step": 2359 }, { "epoch": 1.4753481369966126, "grad_norm": 0.26254984736442566, "learning_rate": 9.17524482289805e-05, "loss": 0.2669, "step": 2360 }, { "epoch": 1.4761008656379375, "grad_norm": 0.1939573436975479, "learning_rate": 9.17438592647535e-05, "loss": 0.3362, "step": 2361 }, { "epoch": 1.4768535942792624, "grad_norm": 0.23504629731178284, "learning_rate": 9.173526628227329e-05, "loss": 0.2518, "step": 2362 }, { "epoch": 1.4776063229205871, "grad_norm": 0.24281716346740723, "learning_rate": 9.172666928247957e-05, "loss": 0.309, "step": 2363 }, { "epoch": 1.4783590515619118, "grad_norm": 0.2400129735469818, "learning_rate": 9.171806826631256e-05, "loss": 0.3174, "step": 2364 }, { "epoch": 1.4791117802032367, "grad_norm": 0.2389213591814041, "learning_rate": 9.170946323471275e-05, "loss": 0.2745, "step": 2365 }, { "epoch": 1.4798645088445617, "grad_norm": 0.23952391743659973, "learning_rate": 9.170085418862126e-05, "loss": 0.3004, "step": 2366 }, { "epoch": 1.4806172374858864, "grad_norm": 0.2282193899154663, "learning_rate": 9.169224112897955e-05, "loss": 0.291, "step": 2367 }, { "epoch": 1.481369966127211, "grad_norm": 0.2064734399318695, "learning_rate": 9.16836240567295e-05, "loss": 0.2969, "step": 2368 }, { "epoch": 1.482122694768536, "grad_norm": 0.2312116175889969, "learning_rate": 9.167500297281348e-05, "loss": 0.3842, "step": 2369 }, { "epoch": 1.4828754234098607, "grad_norm": 0.2006310522556305, "learning_rate": 9.166637787817427e-05, "loss": 0.2708, "step": 2370 }, { "epoch": 1.4836281520511856, "grad_norm": 0.2794744074344635, "learning_rate": 9.165774877375511e-05, "loss": 0.2941, "step": 2371 }, { "epoch": 1.4843808806925103, "grad_norm": 0.2170005440711975, "learning_rate": 9.164911566049967e-05, "loss": 0.3322, "step": 2372 }, { "epoch": 1.4851336093338352, "grad_norm": 0.24198868870735168, "learning_rate": 9.164047853935202e-05, "loss": 0.3083, "step": 2373 }, { "epoch": 1.48588633797516, "grad_norm": 0.2255159467458725, "learning_rate": 9.163183741125673e-05, "loss": 0.2801, "step": 2374 }, { "epoch": 1.4866390666164848, "grad_norm": 0.2463838905096054, "learning_rate": 9.162319227715878e-05, "loss": 0.259, "step": 2375 }, { "epoch": 1.4873917952578095, "grad_norm": 0.21443434059619904, "learning_rate": 9.161454313800357e-05, "loss": 0.2981, "step": 2376 }, { "epoch": 1.4881445238991344, "grad_norm": 0.2190503180027008, "learning_rate": 9.1605889994737e-05, "loss": 0.2849, "step": 2377 }, { "epoch": 1.4888972525404591, "grad_norm": 0.20928208529949188, "learning_rate": 9.159723284830532e-05, "loss": 0.3183, "step": 2378 }, { "epoch": 1.489649981181784, "grad_norm": 0.2629038989543915, "learning_rate": 9.158857169965527e-05, "loss": 0.3291, "step": 2379 }, { "epoch": 1.4904027098231087, "grad_norm": 0.29782384634017944, "learning_rate": 9.157990654973406e-05, "loss": 0.4405, "step": 2380 }, { "epoch": 1.4911554384644337, "grad_norm": 0.2081257700920105, "learning_rate": 9.157123739948924e-05, "loss": 0.3296, "step": 2381 }, { "epoch": 1.4919081671057584, "grad_norm": 0.2704385817050934, "learning_rate": 9.156256424986888e-05, "loss": 0.3563, "step": 2382 }, { "epoch": 1.492660895747083, "grad_norm": 0.29988160729408264, "learning_rate": 9.155388710182147e-05, "loss": 0.2565, "step": 2383 }, { "epoch": 1.493413624388408, "grad_norm": 0.23171347379684448, "learning_rate": 9.154520595629593e-05, "loss": 0.3267, "step": 2384 }, { "epoch": 1.4941663530297329, "grad_norm": 0.2646239101886749, "learning_rate": 9.15365208142416e-05, "loss": 0.2026, "step": 2385 }, { "epoch": 1.4949190816710576, "grad_norm": 0.22890794277191162, "learning_rate": 9.15278316766083e-05, "loss": 0.2532, "step": 2386 }, { "epoch": 1.4956718103123823, "grad_norm": 0.23646803200244904, "learning_rate": 9.151913854434625e-05, "loss": 0.3248, "step": 2387 }, { "epoch": 1.4964245389537072, "grad_norm": 0.2567487955093384, "learning_rate": 9.15104414184061e-05, "loss": 0.3516, "step": 2388 }, { "epoch": 1.4971772675950321, "grad_norm": 0.30610525608062744, "learning_rate": 9.150174029973897e-05, "loss": 0.2979, "step": 2389 }, { "epoch": 1.4979299962363568, "grad_norm": 0.2155969887971878, "learning_rate": 9.149303518929641e-05, "loss": 0.2757, "step": 2390 }, { "epoch": 1.4986827248776815, "grad_norm": 0.23652733862400055, "learning_rate": 9.148432608803038e-05, "loss": 0.3585, "step": 2391 }, { "epoch": 1.4994354535190064, "grad_norm": 0.247394397854805, "learning_rate": 9.14756129968933e-05, "loss": 0.3084, "step": 2392 }, { "epoch": 1.5001881821603313, "grad_norm": 0.30717596411705017, "learning_rate": 9.146689591683803e-05, "loss": 0.2828, "step": 2393 }, { "epoch": 1.500940910801656, "grad_norm": 0.21924830973148346, "learning_rate": 9.145817484881784e-05, "loss": 0.3156, "step": 2394 }, { "epoch": 1.5016936394429807, "grad_norm": 0.22489802539348602, "learning_rate": 9.144944979378648e-05, "loss": 0.2482, "step": 2395 }, { "epoch": 1.5024463680843057, "grad_norm": 0.23581472039222717, "learning_rate": 9.144072075269809e-05, "loss": 0.2718, "step": 2396 }, { "epoch": 1.5031990967256306, "grad_norm": 0.2502330541610718, "learning_rate": 9.143198772650725e-05, "loss": 0.3311, "step": 2397 }, { "epoch": 1.5039518253669553, "grad_norm": 0.2565862834453583, "learning_rate": 9.142325071616901e-05, "loss": 0.3038, "step": 2398 }, { "epoch": 1.50470455400828, "grad_norm": 0.27521470189094543, "learning_rate": 9.141450972263886e-05, "loss": 0.2729, "step": 2399 }, { "epoch": 1.5054572826496049, "grad_norm": 0.272663414478302, "learning_rate": 9.140576474687264e-05, "loss": 0.2841, "step": 2400 }, { "epoch": 1.5054572826496049, "eval_loss": 0.32078826427459717, "eval_runtime": 456.1831, "eval_samples_per_second": 21.103, "eval_steps_per_second": 0.66, "step": 2400 }, { "epoch": 1.5062100112909296, "grad_norm": 0.2303934395313263, "learning_rate": 9.139701578982673e-05, "loss": 0.2887, "step": 2401 }, { "epoch": 1.5069627399322543, "grad_norm": 0.23371680080890656, "learning_rate": 9.13882628524579e-05, "loss": 0.2939, "step": 2402 }, { "epoch": 1.5077154685735792, "grad_norm": 0.34897926449775696, "learning_rate": 9.137950593572335e-05, "loss": 0.3642, "step": 2403 }, { "epoch": 1.508468197214904, "grad_norm": 0.24513112008571625, "learning_rate": 9.137074504058074e-05, "loss": 0.3553, "step": 2404 }, { "epoch": 1.5092209258562288, "grad_norm": 0.21365682780742645, "learning_rate": 9.136198016798812e-05, "loss": 0.2785, "step": 2405 }, { "epoch": 1.5099736544975535, "grad_norm": 0.24950823187828064, "learning_rate": 9.135321131890403e-05, "loss": 0.3312, "step": 2406 }, { "epoch": 1.5107263831388784, "grad_norm": 0.25963446497917175, "learning_rate": 9.13444384942874e-05, "loss": 0.3204, "step": 2407 }, { "epoch": 1.5114791117802033, "grad_norm": 0.2262275665998459, "learning_rate": 9.133566169509763e-05, "loss": 0.3556, "step": 2408 }, { "epoch": 1.512231840421528, "grad_norm": 0.35199791193008423, "learning_rate": 9.132688092229451e-05, "loss": 0.3263, "step": 2409 }, { "epoch": 1.5129845690628527, "grad_norm": 0.26185673475265503, "learning_rate": 9.131809617683833e-05, "loss": 0.1972, "step": 2410 }, { "epoch": 1.5137372977041776, "grad_norm": 0.25399237871170044, "learning_rate": 9.130930745968974e-05, "loss": 0.3254, "step": 2411 }, { "epoch": 1.5144900263455026, "grad_norm": 0.315720796585083, "learning_rate": 9.130051477180988e-05, "loss": 0.3949, "step": 2412 }, { "epoch": 1.5152427549868273, "grad_norm": 0.2078154981136322, "learning_rate": 9.129171811416029e-05, "loss": 0.2374, "step": 2413 }, { "epoch": 1.515995483628152, "grad_norm": 0.23755858838558197, "learning_rate": 9.128291748770298e-05, "loss": 0.2548, "step": 2414 }, { "epoch": 1.5167482122694769, "grad_norm": 0.22237719595432281, "learning_rate": 9.127411289340036e-05, "loss": 0.355, "step": 2415 }, { "epoch": 1.5175009409108018, "grad_norm": 0.29350095987319946, "learning_rate": 9.126530433221531e-05, "loss": 0.3567, "step": 2416 }, { "epoch": 1.5182536695521265, "grad_norm": 0.23898622393608093, "learning_rate": 9.125649180511106e-05, "loss": 0.2558, "step": 2417 }, { "epoch": 1.5190063981934512, "grad_norm": 0.25307175517082214, "learning_rate": 9.124767531305141e-05, "loss": 0.2721, "step": 2418 }, { "epoch": 1.519759126834776, "grad_norm": 0.2169807106256485, "learning_rate": 9.123885485700049e-05, "loss": 0.2282, "step": 2419 }, { "epoch": 1.520511855476101, "grad_norm": 0.2647846043109894, "learning_rate": 9.123003043792289e-05, "loss": 0.3989, "step": 2420 }, { "epoch": 1.5212645841174257, "grad_norm": 0.20444779098033905, "learning_rate": 9.12212020567836e-05, "loss": 0.3796, "step": 2421 }, { "epoch": 1.5220173127587504, "grad_norm": 0.20447580516338348, "learning_rate": 9.121236971454814e-05, "loss": 0.2991, "step": 2422 }, { "epoch": 1.5227700414000753, "grad_norm": 0.23461106419563293, "learning_rate": 9.120353341218237e-05, "loss": 0.3569, "step": 2423 }, { "epoch": 1.5235227700414, "grad_norm": 0.22530218958854675, "learning_rate": 9.119469315065259e-05, "loss": 0.2542, "step": 2424 }, { "epoch": 1.5242754986827247, "grad_norm": 0.20622773468494415, "learning_rate": 9.118584893092563e-05, "loss": 0.3542, "step": 2425 }, { "epoch": 1.5250282273240496, "grad_norm": 0.21461868286132812, "learning_rate": 9.11770007539686e-05, "loss": 0.211, "step": 2426 }, { "epoch": 1.5257809559653746, "grad_norm": 0.24515140056610107, "learning_rate": 9.116814862074916e-05, "loss": 0.2734, "step": 2427 }, { "epoch": 1.5265336846066992, "grad_norm": 0.19179917871952057, "learning_rate": 9.11592925322354e-05, "loss": 0.3032, "step": 2428 }, { "epoch": 1.527286413248024, "grad_norm": 0.23834854364395142, "learning_rate": 9.115043248939573e-05, "loss": 0.3247, "step": 2429 }, { "epoch": 1.5280391418893489, "grad_norm": 0.26338449120521545, "learning_rate": 9.114156849319913e-05, "loss": 0.4365, "step": 2430 }, { "epoch": 1.5287918705306738, "grad_norm": 0.24647027254104614, "learning_rate": 9.113270054461495e-05, "loss": 0.2713, "step": 2431 }, { "epoch": 1.5295445991719985, "grad_norm": 0.2578597366809845, "learning_rate": 9.112382864461296e-05, "loss": 0.3301, "step": 2432 }, { "epoch": 1.5302973278133232, "grad_norm": 0.22209042310714722, "learning_rate": 9.111495279416337e-05, "loss": 0.308, "step": 2433 }, { "epoch": 1.531050056454648, "grad_norm": 0.22784487903118134, "learning_rate": 9.110607299423684e-05, "loss": 0.3285, "step": 2434 }, { "epoch": 1.531802785095973, "grad_norm": 0.24644877016544342, "learning_rate": 9.109718924580446e-05, "loss": 0.3568, "step": 2435 }, { "epoch": 1.5325555137372977, "grad_norm": 0.3250127136707306, "learning_rate": 9.108830154983773e-05, "loss": 0.4024, "step": 2436 }, { "epoch": 1.5333082423786224, "grad_norm": 0.22960500419139862, "learning_rate": 9.10794099073086e-05, "loss": 0.3487, "step": 2437 }, { "epoch": 1.5340609710199473, "grad_norm": 0.23093847930431366, "learning_rate": 9.107051431918944e-05, "loss": 0.3466, "step": 2438 }, { "epoch": 1.5348136996612722, "grad_norm": 0.2493530511856079, "learning_rate": 9.106161478645308e-05, "loss": 0.3327, "step": 2439 }, { "epoch": 1.535566428302597, "grad_norm": 0.21598801016807556, "learning_rate": 9.105271131007274e-05, "loss": 0.2968, "step": 2440 }, { "epoch": 1.5363191569439216, "grad_norm": 0.2397364377975464, "learning_rate": 9.104380389102211e-05, "loss": 0.2421, "step": 2441 }, { "epoch": 1.5370718855852465, "grad_norm": 0.22292491793632507, "learning_rate": 9.103489253027526e-05, "loss": 0.2903, "step": 2442 }, { "epoch": 1.5378246142265715, "grad_norm": 0.24619810283184052, "learning_rate": 9.102597722880674e-05, "loss": 0.2802, "step": 2443 }, { "epoch": 1.5385773428678962, "grad_norm": 0.21983550488948822, "learning_rate": 9.101705798759151e-05, "loss": 0.3047, "step": 2444 }, { "epoch": 1.5393300715092209, "grad_norm": 0.27850425243377686, "learning_rate": 9.100813480760499e-05, "loss": 0.2866, "step": 2445 }, { "epoch": 1.5400828001505458, "grad_norm": 0.3271021246910095, "learning_rate": 9.099920768982297e-05, "loss": 0.3178, "step": 2446 }, { "epoch": 1.5408355287918707, "grad_norm": 0.26494061946868896, "learning_rate": 9.099027663522171e-05, "loss": 0.2595, "step": 2447 }, { "epoch": 1.5415882574331954, "grad_norm": 0.23079097270965576, "learning_rate": 9.098134164477791e-05, "loss": 0.2991, "step": 2448 }, { "epoch": 1.54234098607452, "grad_norm": 0.25441989302635193, "learning_rate": 9.09724027194687e-05, "loss": 0.4503, "step": 2449 }, { "epoch": 1.543093714715845, "grad_norm": 0.22569157183170319, "learning_rate": 9.096345986027161e-05, "loss": 0.2649, "step": 2450 }, { "epoch": 1.5438464433571697, "grad_norm": 0.2113698273897171, "learning_rate": 9.095451306816462e-05, "loss": 0.3131, "step": 2451 }, { "epoch": 1.5445991719984944, "grad_norm": 0.223859965801239, "learning_rate": 9.094556234412614e-05, "loss": 0.4152, "step": 2452 }, { "epoch": 1.5453519006398193, "grad_norm": 0.26834869384765625, "learning_rate": 9.093660768913501e-05, "loss": 0.2718, "step": 2453 }, { "epoch": 1.5461046292811442, "grad_norm": 0.265634149312973, "learning_rate": 9.092764910417047e-05, "loss": 0.3721, "step": 2454 }, { "epoch": 1.546857357922469, "grad_norm": 0.23012201488018036, "learning_rate": 9.091868659021227e-05, "loss": 0.2752, "step": 2455 }, { "epoch": 1.5476100865637936, "grad_norm": 0.24166648089885712, "learning_rate": 9.090972014824049e-05, "loss": 0.3158, "step": 2456 }, { "epoch": 1.5483628152051185, "grad_norm": 0.28170475363731384, "learning_rate": 9.09007497792357e-05, "loss": 0.3167, "step": 2457 }, { "epoch": 1.5491155438464435, "grad_norm": 0.22723795473575592, "learning_rate": 9.08917754841789e-05, "loss": 0.2299, "step": 2458 }, { "epoch": 1.5498682724877682, "grad_norm": 0.20458394289016724, "learning_rate": 9.088279726405148e-05, "loss": 0.3194, "step": 2459 }, { "epoch": 1.5506210011290928, "grad_norm": 0.23361048102378845, "learning_rate": 9.087381511983533e-05, "loss": 0.3036, "step": 2460 }, { "epoch": 1.5513737297704178, "grad_norm": 0.22316448390483856, "learning_rate": 9.086482905251267e-05, "loss": 0.2332, "step": 2461 }, { "epoch": 1.5521264584117427, "grad_norm": 0.2567315697669983, "learning_rate": 9.085583906306623e-05, "loss": 0.3597, "step": 2462 }, { "epoch": 1.5528791870530674, "grad_norm": 0.2230456918478012, "learning_rate": 9.084684515247913e-05, "loss": 0.332, "step": 2463 }, { "epoch": 1.553631915694392, "grad_norm": 0.2650821805000305, "learning_rate": 9.083784732173496e-05, "loss": 0.2676, "step": 2464 }, { "epoch": 1.554384644335717, "grad_norm": 0.2084149569272995, "learning_rate": 9.082884557181768e-05, "loss": 0.2733, "step": 2465 }, { "epoch": 1.555137372977042, "grad_norm": 0.2110624611377716, "learning_rate": 9.081983990371171e-05, "loss": 0.3032, "step": 2466 }, { "epoch": 1.5558901016183666, "grad_norm": 0.22903336584568024, "learning_rate": 9.08108303184019e-05, "loss": 0.2877, "step": 2467 }, { "epoch": 1.5566428302596913, "grad_norm": 0.21550172567367554, "learning_rate": 9.080181681687354e-05, "loss": 0.3326, "step": 2468 }, { "epoch": 1.5573955589010162, "grad_norm": 0.23287935554981232, "learning_rate": 9.079279940011232e-05, "loss": 0.2698, "step": 2469 }, { "epoch": 1.5581482875423411, "grad_norm": 0.246581569314003, "learning_rate": 9.078377806910436e-05, "loss": 0.2952, "step": 2470 }, { "epoch": 1.5589010161836658, "grad_norm": 0.26890191435813904, "learning_rate": 9.077475282483624e-05, "loss": 0.3547, "step": 2471 }, { "epoch": 1.5596537448249905, "grad_norm": 0.2308291643857956, "learning_rate": 9.076572366829493e-05, "loss": 0.2959, "step": 2472 }, { "epoch": 1.5604064734663154, "grad_norm": 0.23841965198516846, "learning_rate": 9.075669060046785e-05, "loss": 0.2641, "step": 2473 }, { "epoch": 1.5611592021076401, "grad_norm": 0.2236277312040329, "learning_rate": 9.074765362234286e-05, "loss": 0.2474, "step": 2474 }, { "epoch": 1.5619119307489648, "grad_norm": 0.2627730965614319, "learning_rate": 9.07386127349082e-05, "loss": 0.34, "step": 2475 }, { "epoch": 1.5626646593902898, "grad_norm": 0.2559717893600464, "learning_rate": 9.07295679391526e-05, "loss": 0.363, "step": 2476 }, { "epoch": 1.5634173880316147, "grad_norm": 0.24339720606803894, "learning_rate": 9.072051923606515e-05, "loss": 0.4457, "step": 2477 }, { "epoch": 1.5641701166729394, "grad_norm": 0.19567187130451202, "learning_rate": 9.071146662663544e-05, "loss": 0.3416, "step": 2478 }, { "epoch": 1.564922845314264, "grad_norm": 0.27012771368026733, "learning_rate": 9.070241011185343e-05, "loss": 0.2919, "step": 2479 }, { "epoch": 1.565675573955589, "grad_norm": 0.19714155793190002, "learning_rate": 9.069334969270952e-05, "loss": 0.3277, "step": 2480 }, { "epoch": 1.566428302596914, "grad_norm": 0.2307066023349762, "learning_rate": 9.068428537019454e-05, "loss": 0.2752, "step": 2481 }, { "epoch": 1.5671810312382386, "grad_norm": 0.22639624774456024, "learning_rate": 9.067521714529976e-05, "loss": 0.2944, "step": 2482 }, { "epoch": 1.5679337598795633, "grad_norm": 0.2148526906967163, "learning_rate": 9.06661450190169e-05, "loss": 0.2737, "step": 2483 }, { "epoch": 1.5686864885208882, "grad_norm": 0.22315752506256104, "learning_rate": 9.065706899233803e-05, "loss": 0.2691, "step": 2484 }, { "epoch": 1.5694392171622131, "grad_norm": 0.25497910380363464, "learning_rate": 9.06479890662557e-05, "loss": 0.3401, "step": 2485 }, { "epoch": 1.5701919458035378, "grad_norm": 0.24450048804283142, "learning_rate": 9.063890524176288e-05, "loss": 0.258, "step": 2486 }, { "epoch": 1.5709446744448625, "grad_norm": 0.22587992250919342, "learning_rate": 9.062981751985296e-05, "loss": 0.3294, "step": 2487 }, { "epoch": 1.5716974030861874, "grad_norm": 0.24199070036411285, "learning_rate": 9.062072590151977e-05, "loss": 0.3519, "step": 2488 }, { "epoch": 1.5724501317275124, "grad_norm": 0.1960574984550476, "learning_rate": 9.061163038775757e-05, "loss": 0.329, "step": 2489 }, { "epoch": 1.573202860368837, "grad_norm": 0.2400839477777481, "learning_rate": 9.060253097956099e-05, "loss": 0.3577, "step": 2490 }, { "epoch": 1.5739555890101617, "grad_norm": 0.214201882481575, "learning_rate": 9.059342767792516e-05, "loss": 0.2626, "step": 2491 }, { "epoch": 1.5747083176514867, "grad_norm": 0.2520996034145355, "learning_rate": 9.058432048384558e-05, "loss": 0.3664, "step": 2492 }, { "epoch": 1.5754610462928116, "grad_norm": 0.20222438871860504, "learning_rate": 9.057520939831824e-05, "loss": 0.259, "step": 2493 }, { "epoch": 1.5762137749341363, "grad_norm": 0.2914273738861084, "learning_rate": 9.056609442233945e-05, "loss": 0.302, "step": 2494 }, { "epoch": 1.576966503575461, "grad_norm": 0.2262597680091858, "learning_rate": 9.055697555690608e-05, "loss": 0.2554, "step": 2495 }, { "epoch": 1.577719232216786, "grad_norm": 0.25220730900764465, "learning_rate": 9.05478528030153e-05, "loss": 0.4512, "step": 2496 }, { "epoch": 1.5784719608581108, "grad_norm": 0.23138195276260376, "learning_rate": 9.05387261616648e-05, "loss": 0.3035, "step": 2497 }, { "epoch": 1.5792246894994355, "grad_norm": 0.23046129941940308, "learning_rate": 9.05295956338526e-05, "loss": 0.3318, "step": 2498 }, { "epoch": 1.5799774181407602, "grad_norm": 0.24859079718589783, "learning_rate": 9.052046122057728e-05, "loss": 0.2962, "step": 2499 }, { "epoch": 1.5807301467820851, "grad_norm": 0.22272136807441711, "learning_rate": 9.051132292283771e-05, "loss": 0.3837, "step": 2500 }, { "epoch": 1.5814828754234098, "grad_norm": 0.2048509567975998, "learning_rate": 9.050218074163327e-05, "loss": 0.3575, "step": 2501 }, { "epoch": 1.5822356040647345, "grad_norm": 0.2567160725593567, "learning_rate": 9.049303467796371e-05, "loss": 0.3219, "step": 2502 }, { "epoch": 1.5829883327060594, "grad_norm": 0.2650901973247528, "learning_rate": 9.048388473282924e-05, "loss": 0.4342, "step": 2503 }, { "epoch": 1.5837410613473843, "grad_norm": 0.2553265392780304, "learning_rate": 9.047473090723049e-05, "loss": 0.2383, "step": 2504 }, { "epoch": 1.584493789988709, "grad_norm": 0.2148304581642151, "learning_rate": 9.046557320216849e-05, "loss": 0.3093, "step": 2505 }, { "epoch": 1.5852465186300337, "grad_norm": 0.21965153515338898, "learning_rate": 9.045641161864474e-05, "loss": 0.2759, "step": 2506 }, { "epoch": 1.5859992472713587, "grad_norm": 0.24225328862667084, "learning_rate": 9.04472461576611e-05, "loss": 0.2931, "step": 2507 }, { "epoch": 1.5867519759126836, "grad_norm": 0.24068161845207214, "learning_rate": 9.043807682021993e-05, "loss": 0.3697, "step": 2508 }, { "epoch": 1.5875047045540083, "grad_norm": 0.20790442824363708, "learning_rate": 9.042890360732397e-05, "loss": 0.2564, "step": 2509 }, { "epoch": 1.588257433195333, "grad_norm": 0.21544335782527924, "learning_rate": 9.041972651997637e-05, "loss": 0.3007, "step": 2510 }, { "epoch": 1.5890101618366579, "grad_norm": 0.22273875772953033, "learning_rate": 9.041054555918074e-05, "loss": 0.2595, "step": 2511 }, { "epoch": 1.5897628904779828, "grad_norm": 0.23919467628002167, "learning_rate": 9.040136072594107e-05, "loss": 0.3257, "step": 2512 }, { "epoch": 1.5905156191193075, "grad_norm": 0.2115405946969986, "learning_rate": 9.039217202126182e-05, "loss": 0.2797, "step": 2513 }, { "epoch": 1.5912683477606322, "grad_norm": 0.2346504032611847, "learning_rate": 9.038297944614785e-05, "loss": 0.3394, "step": 2514 }, { "epoch": 1.5920210764019571, "grad_norm": 0.29113027453422546, "learning_rate": 9.037378300160446e-05, "loss": 0.3124, "step": 2515 }, { "epoch": 1.592773805043282, "grad_norm": 0.26685476303100586, "learning_rate": 9.036458268863732e-05, "loss": 0.3047, "step": 2516 }, { "epoch": 1.5935265336846067, "grad_norm": 0.2899383008480072, "learning_rate": 9.035537850825261e-05, "loss": 0.2783, "step": 2517 }, { "epoch": 1.5942792623259314, "grad_norm": 0.25468945503234863, "learning_rate": 9.034617046145683e-05, "loss": 0.2985, "step": 2518 }, { "epoch": 1.5950319909672563, "grad_norm": 0.24102330207824707, "learning_rate": 9.033695854925703e-05, "loss": 0.3968, "step": 2519 }, { "epoch": 1.5957847196085813, "grad_norm": 0.25879502296447754, "learning_rate": 9.032774277266055e-05, "loss": 0.3431, "step": 2520 }, { "epoch": 1.596537448249906, "grad_norm": 0.24830487370491028, "learning_rate": 9.031852313267525e-05, "loss": 0.3491, "step": 2521 }, { "epoch": 1.5972901768912307, "grad_norm": 0.2268511801958084, "learning_rate": 9.030929963030933e-05, "loss": 0.302, "step": 2522 }, { "epoch": 1.5980429055325556, "grad_norm": 0.3260537385940552, "learning_rate": 9.030007226657151e-05, "loss": 0.3075, "step": 2523 }, { "epoch": 1.5987956341738803, "grad_norm": 0.25844141840934753, "learning_rate": 9.029084104247086e-05, "loss": 0.2885, "step": 2524 }, { "epoch": 1.599548362815205, "grad_norm": 0.23282214999198914, "learning_rate": 9.028160595901689e-05, "loss": 0.27, "step": 2525 }, { "epoch": 1.6003010914565299, "grad_norm": 0.1993257999420166, "learning_rate": 9.027236701721953e-05, "loss": 0.1948, "step": 2526 }, { "epoch": 1.6010538200978548, "grad_norm": 0.2743162214756012, "learning_rate": 9.026312421808916e-05, "loss": 0.2372, "step": 2527 }, { "epoch": 1.6018065487391795, "grad_norm": 0.22590890526771545, "learning_rate": 9.025387756263654e-05, "loss": 0.2672, "step": 2528 }, { "epoch": 1.6025592773805042, "grad_norm": 0.28408482670783997, "learning_rate": 9.024462705187287e-05, "loss": 0.3742, "step": 2529 }, { "epoch": 1.603312006021829, "grad_norm": 0.2163914442062378, "learning_rate": 9.023537268680978e-05, "loss": 0.2303, "step": 2530 }, { "epoch": 1.604064734663154, "grad_norm": 0.22939147055149078, "learning_rate": 9.022611446845929e-05, "loss": 0.2667, "step": 2531 }, { "epoch": 1.6048174633044787, "grad_norm": 0.21218882501125336, "learning_rate": 9.02168523978339e-05, "loss": 0.2776, "step": 2532 }, { "epoch": 1.6055701919458034, "grad_norm": 0.19687822461128235, "learning_rate": 9.020758647594646e-05, "loss": 0.3056, "step": 2533 }, { "epoch": 1.6063229205871283, "grad_norm": 0.19769026339054108, "learning_rate": 9.019831670381032e-05, "loss": 0.2589, "step": 2534 }, { "epoch": 1.6070756492284533, "grad_norm": 0.22986821830272675, "learning_rate": 9.018904308243917e-05, "loss": 0.3323, "step": 2535 }, { "epoch": 1.607828377869778, "grad_norm": 0.22548070549964905, "learning_rate": 9.017976561284719e-05, "loss": 0.3559, "step": 2536 }, { "epoch": 1.6085811065111026, "grad_norm": 0.21560770273208618, "learning_rate": 9.017048429604891e-05, "loss": 0.3266, "step": 2537 }, { "epoch": 1.6093338351524276, "grad_norm": 0.2301805466413498, "learning_rate": 9.016119913305939e-05, "loss": 0.3037, "step": 2538 }, { "epoch": 1.6100865637937525, "grad_norm": 0.18215470016002655, "learning_rate": 9.015191012489396e-05, "loss": 0.2782, "step": 2539 }, { "epoch": 1.6108392924350772, "grad_norm": 0.27167069911956787, "learning_rate": 9.014261727256849e-05, "loss": 0.3672, "step": 2540 }, { "epoch": 1.6115920210764019, "grad_norm": 0.2457159012556076, "learning_rate": 9.013332057709924e-05, "loss": 0.2922, "step": 2541 }, { "epoch": 1.6123447497177268, "grad_norm": 0.2162439376115799, "learning_rate": 9.012402003950286e-05, "loss": 0.2946, "step": 2542 }, { "epoch": 1.6130974783590517, "grad_norm": 0.21578140556812286, "learning_rate": 9.011471566079648e-05, "loss": 0.3773, "step": 2543 }, { "epoch": 1.6138502070003764, "grad_norm": 0.2441944181919098, "learning_rate": 9.010540744199759e-05, "loss": 0.2994, "step": 2544 }, { "epoch": 1.614602935641701, "grad_norm": 0.2049429714679718, "learning_rate": 9.00960953841241e-05, "loss": 0.2913, "step": 2545 }, { "epoch": 1.615355664283026, "grad_norm": 0.22756658494472504, "learning_rate": 9.00867794881944e-05, "loss": 0.2651, "step": 2546 }, { "epoch": 1.616108392924351, "grad_norm": 0.23695968091487885, "learning_rate": 9.007745975522723e-05, "loss": 0.391, "step": 2547 }, { "epoch": 1.6168611215656754, "grad_norm": 0.23706887662410736, "learning_rate": 9.006813618624181e-05, "loss": 0.3031, "step": 2548 }, { "epoch": 1.6176138502070003, "grad_norm": 0.241019144654274, "learning_rate": 9.005880878225774e-05, "loss": 0.2947, "step": 2549 }, { "epoch": 1.6183665788483252, "grad_norm": 0.22383929789066315, "learning_rate": 9.004947754429507e-05, "loss": 0.42, "step": 2550 }, { "epoch": 1.61911930748965, "grad_norm": 0.19085276126861572, "learning_rate": 9.004014247337422e-05, "loss": 0.2533, "step": 2551 }, { "epoch": 1.6198720361309746, "grad_norm": 0.24182604253292084, "learning_rate": 9.003080357051607e-05, "loss": 0.3462, "step": 2552 }, { "epoch": 1.6206247647722996, "grad_norm": 0.2722558379173279, "learning_rate": 9.002146083674189e-05, "loss": 0.314, "step": 2553 }, { "epoch": 1.6213774934136245, "grad_norm": 0.25891146063804626, "learning_rate": 9.001211427307343e-05, "loss": 0.3206, "step": 2554 }, { "epoch": 1.6221302220549492, "grad_norm": 0.22114117443561554, "learning_rate": 9.000276388053279e-05, "loss": 0.2773, "step": 2555 }, { "epoch": 1.6228829506962739, "grad_norm": 0.2737445533275604, "learning_rate": 8.999340966014251e-05, "loss": 0.35, "step": 2556 }, { "epoch": 1.6236356793375988, "grad_norm": 0.2879311442375183, "learning_rate": 8.998405161292557e-05, "loss": 0.3248, "step": 2557 }, { "epoch": 1.6243884079789237, "grad_norm": 0.22123301029205322, "learning_rate": 8.997468973990534e-05, "loss": 0.2836, "step": 2558 }, { "epoch": 1.6251411366202484, "grad_norm": 0.2740740180015564, "learning_rate": 8.996532404210562e-05, "loss": 0.3181, "step": 2559 }, { "epoch": 1.625893865261573, "grad_norm": 0.25353363156318665, "learning_rate": 8.995595452055063e-05, "loss": 0.3009, "step": 2560 }, { "epoch": 1.626646593902898, "grad_norm": 0.2738966643810272, "learning_rate": 8.994658117626503e-05, "loss": 0.3213, "step": 2561 }, { "epoch": 1.627399322544223, "grad_norm": 0.21941688656806946, "learning_rate": 8.993720401027383e-05, "loss": 0.326, "step": 2562 }, { "epoch": 1.6281520511855476, "grad_norm": 0.27013763785362244, "learning_rate": 8.992782302360253e-05, "loss": 0.2751, "step": 2563 }, { "epoch": 1.6289047798268723, "grad_norm": 0.29456907510757446, "learning_rate": 8.991843821727703e-05, "loss": 0.2993, "step": 2564 }, { "epoch": 1.6296575084681972, "grad_norm": 0.2223139852285385, "learning_rate": 8.990904959232362e-05, "loss": 0.28, "step": 2565 }, { "epoch": 1.6304102371095222, "grad_norm": 0.2510461211204529, "learning_rate": 8.989965714976902e-05, "loss": 0.3025, "step": 2566 }, { "epoch": 1.6311629657508468, "grad_norm": 0.3261895775794983, "learning_rate": 8.989026089064041e-05, "loss": 0.3836, "step": 2567 }, { "epoch": 1.6319156943921715, "grad_norm": 0.24971024692058563, "learning_rate": 8.98808608159653e-05, "loss": 0.3148, "step": 2568 }, { "epoch": 1.6326684230334965, "grad_norm": 0.23381651937961578, "learning_rate": 8.987145692677171e-05, "loss": 0.2314, "step": 2569 }, { "epoch": 1.6334211516748214, "grad_norm": 0.23671846091747284, "learning_rate": 8.986204922408801e-05, "loss": 0.2886, "step": 2570 }, { "epoch": 1.634173880316146, "grad_norm": 0.22928926348686218, "learning_rate": 8.985263770894302e-05, "loss": 0.2561, "step": 2571 }, { "epoch": 1.6349266089574708, "grad_norm": 0.20200057327747345, "learning_rate": 8.984322238236598e-05, "loss": 0.1714, "step": 2572 }, { "epoch": 1.6356793375987957, "grad_norm": 0.24379362165927887, "learning_rate": 8.983380324538652e-05, "loss": 0.3529, "step": 2573 }, { "epoch": 1.6364320662401204, "grad_norm": 0.24782928824424744, "learning_rate": 8.982438029903471e-05, "loss": 0.3479, "step": 2574 }, { "epoch": 1.637184794881445, "grad_norm": 0.24228744208812714, "learning_rate": 8.981495354434103e-05, "loss": 0.3331, "step": 2575 }, { "epoch": 1.63793752352277, "grad_norm": 0.21352717280387878, "learning_rate": 8.980552298233638e-05, "loss": 0.2699, "step": 2576 }, { "epoch": 1.638690252164095, "grad_norm": 0.2499847412109375, "learning_rate": 8.979608861405206e-05, "loss": 0.3204, "step": 2577 }, { "epoch": 1.6394429808054196, "grad_norm": 0.23639756441116333, "learning_rate": 8.97866504405198e-05, "loss": 0.2908, "step": 2578 }, { "epoch": 1.6401957094467443, "grad_norm": 0.19182661175727844, "learning_rate": 8.977720846277175e-05, "loss": 0.3308, "step": 2579 }, { "epoch": 1.6409484380880692, "grad_norm": 0.19825582206249237, "learning_rate": 8.976776268184046e-05, "loss": 0.3114, "step": 2580 }, { "epoch": 1.6417011667293941, "grad_norm": 0.22550438344478607, "learning_rate": 8.975831309875893e-05, "loss": 0.2365, "step": 2581 }, { "epoch": 1.6424538953707188, "grad_norm": 0.22457273304462433, "learning_rate": 8.974885971456052e-05, "loss": 0.2397, "step": 2582 }, { "epoch": 1.6432066240120435, "grad_norm": 0.23263417184352875, "learning_rate": 8.973940253027908e-05, "loss": 0.2607, "step": 2583 }, { "epoch": 1.6439593526533685, "grad_norm": 0.23470619320869446, "learning_rate": 8.972994154694881e-05, "loss": 0.2909, "step": 2584 }, { "epoch": 1.6447120812946934, "grad_norm": 0.20839248597621918, "learning_rate": 8.972047676560433e-05, "loss": 0.2135, "step": 2585 }, { "epoch": 1.645464809936018, "grad_norm": 0.2087910771369934, "learning_rate": 8.971100818728072e-05, "loss": 0.2514, "step": 2586 }, { "epoch": 1.6462175385773428, "grad_norm": 0.2628481686115265, "learning_rate": 8.970153581301344e-05, "loss": 0.3671, "step": 2587 }, { "epoch": 1.6469702672186677, "grad_norm": 0.1781458705663681, "learning_rate": 8.969205964383839e-05, "loss": 0.2752, "step": 2588 }, { "epoch": 1.6477229958599926, "grad_norm": 0.24862469732761383, "learning_rate": 8.968257968079184e-05, "loss": 0.309, "step": 2589 }, { "epoch": 1.6484757245013173, "grad_norm": 0.22986894845962524, "learning_rate": 8.967309592491052e-05, "loss": 0.3098, "step": 2590 }, { "epoch": 1.649228453142642, "grad_norm": 0.2536394000053406, "learning_rate": 8.966360837723157e-05, "loss": 0.3086, "step": 2591 }, { "epoch": 1.649981181783967, "grad_norm": 0.25014498829841614, "learning_rate": 8.965411703879251e-05, "loss": 0.2884, "step": 2592 }, { "epoch": 1.6507339104252918, "grad_norm": 0.2271837443113327, "learning_rate": 8.964462191063132e-05, "loss": 0.3541, "step": 2593 }, { "epoch": 1.6514866390666165, "grad_norm": 0.19178907573223114, "learning_rate": 8.963512299378636e-05, "loss": 0.2841, "step": 2594 }, { "epoch": 1.6522393677079412, "grad_norm": 0.2127668410539627, "learning_rate": 8.962562028929645e-05, "loss": 0.2856, "step": 2595 }, { "epoch": 1.6529920963492661, "grad_norm": 0.24281829595565796, "learning_rate": 8.961611379820072e-05, "loss": 0.2128, "step": 2596 }, { "epoch": 1.653744824990591, "grad_norm": 0.2599092423915863, "learning_rate": 8.960660352153885e-05, "loss": 0.3321, "step": 2597 }, { "epoch": 1.6544975536319155, "grad_norm": 0.21231414377689362, "learning_rate": 8.959708946035087e-05, "loss": 0.2601, "step": 2598 }, { "epoch": 1.6552502822732404, "grad_norm": 0.3054015636444092, "learning_rate": 8.958757161567716e-05, "loss": 0.2137, "step": 2599 }, { "epoch": 1.6560030109145654, "grad_norm": 0.2900407016277313, "learning_rate": 8.957804998855866e-05, "loss": 0.2583, "step": 2600 }, { "epoch": 1.6560030109145654, "eval_loss": 0.3050251007080078, "eval_runtime": 456.4243, "eval_samples_per_second": 21.092, "eval_steps_per_second": 0.659, "step": 2600 }, { "epoch": 1.65675573955589, "grad_norm": 0.24885335564613342, "learning_rate": 8.956852458003659e-05, "loss": 0.3219, "step": 2601 }, { "epoch": 1.6575084681972148, "grad_norm": 0.26510122418403625, "learning_rate": 8.955899539115264e-05, "loss": 0.3141, "step": 2602 }, { "epoch": 1.6582611968385397, "grad_norm": 0.27361753582954407, "learning_rate": 8.954946242294891e-05, "loss": 0.275, "step": 2603 }, { "epoch": 1.6590139254798646, "grad_norm": 0.24509455263614655, "learning_rate": 8.953992567646792e-05, "loss": 0.2466, "step": 2604 }, { "epoch": 1.6597666541211893, "grad_norm": 0.26684120297431946, "learning_rate": 8.953038515275258e-05, "loss": 0.2828, "step": 2605 }, { "epoch": 1.660519382762514, "grad_norm": 0.25823327898979187, "learning_rate": 8.952084085284622e-05, "loss": 0.3255, "step": 2606 }, { "epoch": 1.661272111403839, "grad_norm": 0.2253360003232956, "learning_rate": 8.951129277779263e-05, "loss": 0.2565, "step": 2607 }, { "epoch": 1.6620248400451638, "grad_norm": 0.2484254240989685, "learning_rate": 8.950174092863596e-05, "loss": 0.3062, "step": 2608 }, { "epoch": 1.6627775686864885, "grad_norm": 0.19468529522418976, "learning_rate": 8.949218530642075e-05, "loss": 0.2483, "step": 2609 }, { "epoch": 1.6635302973278132, "grad_norm": 0.22244079411029816, "learning_rate": 8.948262591219203e-05, "loss": 0.2483, "step": 2610 }, { "epoch": 1.6642830259691381, "grad_norm": 0.2530156970024109, "learning_rate": 8.947306274699516e-05, "loss": 0.2736, "step": 2611 }, { "epoch": 1.665035754610463, "grad_norm": 0.20633260905742645, "learning_rate": 8.946349581187599e-05, "loss": 0.2071, "step": 2612 }, { "epoch": 1.6657884832517877, "grad_norm": 0.2676049470901489, "learning_rate": 8.945392510788075e-05, "loss": 0.3586, "step": 2613 }, { "epoch": 1.6665412118931124, "grad_norm": 0.26020359992980957, "learning_rate": 8.944435063605604e-05, "loss": 0.2652, "step": 2614 }, { "epoch": 1.6672939405344374, "grad_norm": 0.25393983721733093, "learning_rate": 8.943477239744892e-05, "loss": 0.3768, "step": 2615 }, { "epoch": 1.6680466691757623, "grad_norm": 0.20821429789066315, "learning_rate": 8.94251903931069e-05, "loss": 0.2294, "step": 2616 }, { "epoch": 1.668799397817087, "grad_norm": 0.26414915919303894, "learning_rate": 8.941560462407778e-05, "loss": 0.3141, "step": 2617 }, { "epoch": 1.6695521264584117, "grad_norm": 0.23823434114456177, "learning_rate": 8.940601509140991e-05, "loss": 0.3098, "step": 2618 }, { "epoch": 1.6703048550997366, "grad_norm": 0.26906517148017883, "learning_rate": 8.939642179615194e-05, "loss": 0.2582, "step": 2619 }, { "epoch": 1.6710575837410615, "grad_norm": 0.2171274870634079, "learning_rate": 8.9386824739353e-05, "loss": 0.2464, "step": 2620 }, { "epoch": 1.6718103123823862, "grad_norm": 0.23699185252189636, "learning_rate": 8.937722392206261e-05, "loss": 0.2629, "step": 2621 }, { "epoch": 1.672563041023711, "grad_norm": 0.23745980858802795, "learning_rate": 8.93676193453307e-05, "loss": 0.3235, "step": 2622 }, { "epoch": 1.6733157696650358, "grad_norm": 0.2553633451461792, "learning_rate": 8.93580110102076e-05, "loss": 0.3616, "step": 2623 }, { "epoch": 1.6740684983063605, "grad_norm": 0.23559172451496124, "learning_rate": 8.934839891774408e-05, "loss": 0.3625, "step": 2624 }, { "epoch": 1.6748212269476852, "grad_norm": 0.25901275873184204, "learning_rate": 8.93387830689913e-05, "loss": 0.3463, "step": 2625 }, { "epoch": 1.6755739555890101, "grad_norm": 0.27294012904167175, "learning_rate": 8.932916346500082e-05, "loss": 0.3237, "step": 2626 }, { "epoch": 1.676326684230335, "grad_norm": 0.27732086181640625, "learning_rate": 8.931954010682464e-05, "loss": 0.3647, "step": 2627 }, { "epoch": 1.6770794128716597, "grad_norm": 0.2292364239692688, "learning_rate": 8.930991299551515e-05, "loss": 0.3338, "step": 2628 }, { "epoch": 1.6778321415129844, "grad_norm": 0.19659370183944702, "learning_rate": 8.930028213212517e-05, "loss": 0.2461, "step": 2629 }, { "epoch": 1.6785848701543093, "grad_norm": 0.25036075711250305, "learning_rate": 8.929064751770789e-05, "loss": 0.2539, "step": 2630 }, { "epoch": 1.6793375987956343, "grad_norm": 0.2365996539592743, "learning_rate": 8.928100915331698e-05, "loss": 0.3067, "step": 2631 }, { "epoch": 1.680090327436959, "grad_norm": 0.2014579176902771, "learning_rate": 8.927136704000643e-05, "loss": 0.2471, "step": 2632 }, { "epoch": 1.6808430560782837, "grad_norm": 0.2454236000776291, "learning_rate": 8.926172117883071e-05, "loss": 0.3478, "step": 2633 }, { "epoch": 1.6815957847196086, "grad_norm": 0.2125507891178131, "learning_rate": 8.925207157084466e-05, "loss": 0.3358, "step": 2634 }, { "epoch": 1.6823485133609335, "grad_norm": 0.19479209184646606, "learning_rate": 8.924241821710358e-05, "loss": 0.2865, "step": 2635 }, { "epoch": 1.6831012420022582, "grad_norm": 0.20788168907165527, "learning_rate": 8.923276111866312e-05, "loss": 0.2687, "step": 2636 }, { "epoch": 1.6838539706435829, "grad_norm": 0.22848477959632874, "learning_rate": 8.922310027657937e-05, "loss": 0.342, "step": 2637 }, { "epoch": 1.6846066992849078, "grad_norm": 0.2317168265581131, "learning_rate": 8.921343569190884e-05, "loss": 0.2331, "step": 2638 }, { "epoch": 1.6853594279262327, "grad_norm": 0.2131941169500351, "learning_rate": 8.920376736570839e-05, "loss": 0.2733, "step": 2639 }, { "epoch": 1.6861121565675574, "grad_norm": 0.2270268201828003, "learning_rate": 8.91940952990354e-05, "loss": 0.2457, "step": 2640 }, { "epoch": 1.6868648852088821, "grad_norm": 0.29687368869781494, "learning_rate": 8.918441949294752e-05, "loss": 0.3768, "step": 2641 }, { "epoch": 1.687617613850207, "grad_norm": 0.2136383354663849, "learning_rate": 8.917473994850295e-05, "loss": 0.21, "step": 2642 }, { "epoch": 1.688370342491532, "grad_norm": 0.4052406847476959, "learning_rate": 8.91650566667602e-05, "loss": 0.3858, "step": 2643 }, { "epoch": 1.6891230711328566, "grad_norm": 0.336155503988266, "learning_rate": 8.91553696487782e-05, "loss": 0.2923, "step": 2644 }, { "epoch": 1.6898757997741813, "grad_norm": 0.2664221227169037, "learning_rate": 8.914567889561636e-05, "loss": 0.291, "step": 2645 }, { "epoch": 1.6906285284155063, "grad_norm": 0.28645458817481995, "learning_rate": 8.913598440833438e-05, "loss": 0.2577, "step": 2646 }, { "epoch": 1.6913812570568312, "grad_norm": 0.24563957750797272, "learning_rate": 8.91262861879925e-05, "loss": 0.3158, "step": 2647 }, { "epoch": 1.6921339856981557, "grad_norm": 0.2918443977832794, "learning_rate": 8.911658423565125e-05, "loss": 0.2976, "step": 2648 }, { "epoch": 1.6928867143394806, "grad_norm": 0.23814664781093597, "learning_rate": 8.910687855237164e-05, "loss": 0.3089, "step": 2649 }, { "epoch": 1.6936394429808055, "grad_norm": 0.28502893447875977, "learning_rate": 8.909716913921508e-05, "loss": 0.3599, "step": 2650 }, { "epoch": 1.6943921716221302, "grad_norm": 0.2862470746040344, "learning_rate": 8.908745599724335e-05, "loss": 0.432, "step": 2651 }, { "epoch": 1.6951449002634549, "grad_norm": 0.2500116527080536, "learning_rate": 8.90777391275187e-05, "loss": 0.2204, "step": 2652 }, { "epoch": 1.6958976289047798, "grad_norm": 0.228883296251297, "learning_rate": 8.906801853110373e-05, "loss": 0.2678, "step": 2653 }, { "epoch": 1.6966503575461047, "grad_norm": 0.2516147494316101, "learning_rate": 8.905829420906145e-05, "loss": 0.2698, "step": 2654 }, { "epoch": 1.6974030861874294, "grad_norm": 0.2599347233772278, "learning_rate": 8.904856616245534e-05, "loss": 0.2625, "step": 2655 }, { "epoch": 1.698155814828754, "grad_norm": 0.21956832706928253, "learning_rate": 8.903883439234924e-05, "loss": 0.3077, "step": 2656 }, { "epoch": 1.698908543470079, "grad_norm": 0.21710915863513947, "learning_rate": 8.902909889980737e-05, "loss": 0.3559, "step": 2657 }, { "epoch": 1.699661272111404, "grad_norm": 0.23086126148700714, "learning_rate": 8.901935968589443e-05, "loss": 0.3768, "step": 2658 }, { "epoch": 1.7004140007527286, "grad_norm": 0.23709216713905334, "learning_rate": 8.900961675167543e-05, "loss": 0.2973, "step": 2659 }, { "epoch": 1.7011667293940533, "grad_norm": 0.22032243013381958, "learning_rate": 8.899987009821588e-05, "loss": 0.3049, "step": 2660 }, { "epoch": 1.7019194580353783, "grad_norm": 0.19871854782104492, "learning_rate": 8.899011972658166e-05, "loss": 0.2621, "step": 2661 }, { "epoch": 1.7026721866767032, "grad_norm": 0.23017558455467224, "learning_rate": 8.898036563783906e-05, "loss": 0.2817, "step": 2662 }, { "epoch": 1.7034249153180279, "grad_norm": 0.23770231008529663, "learning_rate": 8.897060783305476e-05, "loss": 0.3155, "step": 2663 }, { "epoch": 1.7041776439593526, "grad_norm": 0.2141065150499344, "learning_rate": 8.896084631329584e-05, "loss": 0.3952, "step": 2664 }, { "epoch": 1.7049303726006775, "grad_norm": 0.2544148564338684, "learning_rate": 8.895108107962985e-05, "loss": 0.358, "step": 2665 }, { "epoch": 1.7056831012420024, "grad_norm": 0.2240646779537201, "learning_rate": 8.894131213312467e-05, "loss": 0.3074, "step": 2666 }, { "epoch": 1.706435829883327, "grad_norm": 0.27542877197265625, "learning_rate": 8.893153947484863e-05, "loss": 0.3063, "step": 2667 }, { "epoch": 1.7071885585246518, "grad_norm": 0.22124774754047394, "learning_rate": 8.892176310587044e-05, "loss": 0.2552, "step": 2668 }, { "epoch": 1.7079412871659767, "grad_norm": 0.1895630955696106, "learning_rate": 8.891198302725925e-05, "loss": 0.2565, "step": 2669 }, { "epoch": 1.7086940158073016, "grad_norm": 0.22589519619941711, "learning_rate": 8.890219924008456e-05, "loss": 0.3065, "step": 2670 }, { "epoch": 1.7094467444486263, "grad_norm": 0.27587947249412537, "learning_rate": 8.889241174541636e-05, "loss": 0.3079, "step": 2671 }, { "epoch": 1.710199473089951, "grad_norm": 0.22763928771018982, "learning_rate": 8.888262054432496e-05, "loss": 0.2976, "step": 2672 }, { "epoch": 1.710952201731276, "grad_norm": 0.2514950931072235, "learning_rate": 8.88728256378811e-05, "loss": 0.308, "step": 2673 }, { "epoch": 1.7117049303726006, "grad_norm": 0.22386296093463898, "learning_rate": 8.886302702715598e-05, "loss": 0.2511, "step": 2674 }, { "epoch": 1.7124576590139253, "grad_norm": 0.24890896677970886, "learning_rate": 8.885322471322112e-05, "loss": 0.3719, "step": 2675 }, { "epoch": 1.7132103876552502, "grad_norm": 0.2232537418603897, "learning_rate": 8.88434186971485e-05, "loss": 0.2893, "step": 2676 }, { "epoch": 1.7139631162965752, "grad_norm": 0.22814078629016876, "learning_rate": 8.883360898001051e-05, "loss": 0.2908, "step": 2677 }, { "epoch": 1.7147158449378999, "grad_norm": 0.21909624338150024, "learning_rate": 8.88237955628799e-05, "loss": 0.2265, "step": 2678 }, { "epoch": 1.7154685735792246, "grad_norm": 0.23599159717559814, "learning_rate": 8.881397844682986e-05, "loss": 0.2752, "step": 2679 }, { "epoch": 1.7162213022205495, "grad_norm": 0.22438961267471313, "learning_rate": 8.880415763293398e-05, "loss": 0.2676, "step": 2680 }, { "epoch": 1.7169740308618744, "grad_norm": 0.22653941810131073, "learning_rate": 8.879433312226625e-05, "loss": 0.2252, "step": 2681 }, { "epoch": 1.717726759503199, "grad_norm": 0.21043463051319122, "learning_rate": 8.878450491590105e-05, "loss": 0.2192, "step": 2682 }, { "epoch": 1.7184794881445238, "grad_norm": 0.24271272122859955, "learning_rate": 8.877467301491318e-05, "loss": 0.3044, "step": 2683 }, { "epoch": 1.7192322167858487, "grad_norm": 0.27517756819725037, "learning_rate": 8.876483742037785e-05, "loss": 0.347, "step": 2684 }, { "epoch": 1.7199849454271736, "grad_norm": 0.24049878120422363, "learning_rate": 8.875499813337069e-05, "loss": 0.2799, "step": 2685 }, { "epoch": 1.7207376740684983, "grad_norm": 0.2069961279630661, "learning_rate": 8.874515515496767e-05, "loss": 0.2585, "step": 2686 }, { "epoch": 1.721490402709823, "grad_norm": 0.21640388667583466, "learning_rate": 8.873530848624521e-05, "loss": 0.3369, "step": 2687 }, { "epoch": 1.722243131351148, "grad_norm": 0.20495401322841644, "learning_rate": 8.872545812828013e-05, "loss": 0.3696, "step": 2688 }, { "epoch": 1.7229958599924728, "grad_norm": 0.2460683286190033, "learning_rate": 8.871560408214967e-05, "loss": 0.2947, "step": 2689 }, { "epoch": 1.7237485886337975, "grad_norm": 0.2082376629114151, "learning_rate": 8.870574634893143e-05, "loss": 0.2, "step": 2690 }, { "epoch": 1.7245013172751222, "grad_norm": 0.2696602940559387, "learning_rate": 8.869588492970344e-05, "loss": 0.3049, "step": 2691 }, { "epoch": 1.7252540459164472, "grad_norm": 0.28272202610969543, "learning_rate": 8.868601982554413e-05, "loss": 0.2707, "step": 2692 }, { "epoch": 1.726006774557772, "grad_norm": 0.2373596578836441, "learning_rate": 8.867615103753236e-05, "loss": 0.3128, "step": 2693 }, { "epoch": 1.7267595031990968, "grad_norm": 0.2905932664871216, "learning_rate": 8.866627856674731e-05, "loss": 0.2657, "step": 2694 }, { "epoch": 1.7275122318404215, "grad_norm": 0.2738843262195587, "learning_rate": 8.86564024142687e-05, "loss": 0.2981, "step": 2695 }, { "epoch": 1.7282649604817464, "grad_norm": 0.2526795566082001, "learning_rate": 8.86465225811765e-05, "loss": 0.3983, "step": 2696 }, { "epoch": 1.729017689123071, "grad_norm": 0.238993838429451, "learning_rate": 8.863663906855117e-05, "loss": 0.2483, "step": 2697 }, { "epoch": 1.7297704177643958, "grad_norm": 0.3193243741989136, "learning_rate": 8.862675187747356e-05, "loss": 0.3564, "step": 2698 }, { "epoch": 1.7305231464057207, "grad_norm": 0.238425150513649, "learning_rate": 8.861686100902495e-05, "loss": 0.2956, "step": 2699 }, { "epoch": 1.7312758750470456, "grad_norm": 0.22722077369689941, "learning_rate": 8.860696646428693e-05, "loss": 0.3066, "step": 2700 }, { "epoch": 1.7320286036883703, "grad_norm": 0.20125381648540497, "learning_rate": 8.85970682443416e-05, "loss": 0.2439, "step": 2701 }, { "epoch": 1.732781332329695, "grad_norm": 0.2301541119813919, "learning_rate": 8.858716635027139e-05, "loss": 0.2996, "step": 2702 }, { "epoch": 1.73353406097102, "grad_norm": 0.23395220935344696, "learning_rate": 8.857726078315918e-05, "loss": 0.2766, "step": 2703 }, { "epoch": 1.7342867896123448, "grad_norm": 0.2799210548400879, "learning_rate": 8.85673515440882e-05, "loss": 0.2224, "step": 2704 }, { "epoch": 1.7350395182536695, "grad_norm": 0.21861055493354797, "learning_rate": 8.855743863414214e-05, "loss": 0.3478, "step": 2705 }, { "epoch": 1.7357922468949942, "grad_norm": 0.24820604920387268, "learning_rate": 8.854752205440501e-05, "loss": 0.2935, "step": 2706 }, { "epoch": 1.7365449755363191, "grad_norm": 0.2766213119029999, "learning_rate": 8.853760180596134e-05, "loss": 0.3961, "step": 2707 }, { "epoch": 1.737297704177644, "grad_norm": 0.29359734058380127, "learning_rate": 8.852767788989594e-05, "loss": 0.3175, "step": 2708 }, { "epoch": 1.7380504328189688, "grad_norm": 0.25400975346565247, "learning_rate": 8.851775030729411e-05, "loss": 0.2753, "step": 2709 }, { "epoch": 1.7388031614602935, "grad_norm": 0.2604159414768219, "learning_rate": 8.85078190592415e-05, "loss": 0.3035, "step": 2710 }, { "epoch": 1.7395558901016184, "grad_norm": 0.2465360462665558, "learning_rate": 8.849788414682416e-05, "loss": 0.2586, "step": 2711 }, { "epoch": 1.7403086187429433, "grad_norm": 0.19204886257648468, "learning_rate": 8.848794557112857e-05, "loss": 0.2593, "step": 2712 }, { "epoch": 1.741061347384268, "grad_norm": 0.2010931819677353, "learning_rate": 8.847800333324162e-05, "loss": 0.2664, "step": 2713 }, { "epoch": 1.7418140760255927, "grad_norm": 0.23564037680625916, "learning_rate": 8.846805743425055e-05, "loss": 0.2746, "step": 2714 }, { "epoch": 1.7425668046669176, "grad_norm": 0.20607030391693115, "learning_rate": 8.845810787524304e-05, "loss": 0.3324, "step": 2715 }, { "epoch": 1.7433195333082425, "grad_norm": 0.22664658725261688, "learning_rate": 8.844815465730716e-05, "loss": 0.2667, "step": 2716 }, { "epoch": 1.7440722619495672, "grad_norm": 0.21509528160095215, "learning_rate": 8.843819778153137e-05, "loss": 0.2728, "step": 2717 }, { "epoch": 1.744824990590892, "grad_norm": 0.21198657155036926, "learning_rate": 8.842823724900453e-05, "loss": 0.2233, "step": 2718 }, { "epoch": 1.7455777192322168, "grad_norm": 0.2631138563156128, "learning_rate": 8.841827306081595e-05, "loss": 0.313, "step": 2719 }, { "epoch": 1.7463304478735417, "grad_norm": 0.24777041375637054, "learning_rate": 8.840830521805525e-05, "loss": 0.3073, "step": 2720 }, { "epoch": 1.7470831765148664, "grad_norm": 0.22068490087985992, "learning_rate": 8.839833372181254e-05, "loss": 0.1786, "step": 2721 }, { "epoch": 1.7478359051561911, "grad_norm": 0.22427372634410858, "learning_rate": 8.838835857317825e-05, "loss": 0.2353, "step": 2722 }, { "epoch": 1.748588633797516, "grad_norm": 0.2059476524591446, "learning_rate": 8.837837977324328e-05, "loss": 0.3007, "step": 2723 }, { "epoch": 1.7493413624388408, "grad_norm": 0.22578054666519165, "learning_rate": 8.836839732309887e-05, "loss": 0.2865, "step": 2724 }, { "epoch": 1.7500940910801654, "grad_norm": 0.24776548147201538, "learning_rate": 8.83584112238367e-05, "loss": 0.2583, "step": 2725 }, { "epoch": 1.7508468197214904, "grad_norm": 0.22081859409809113, "learning_rate": 8.834842147654883e-05, "loss": 0.3025, "step": 2726 }, { "epoch": 1.7515995483628153, "grad_norm": 0.27260810136795044, "learning_rate": 8.833842808232773e-05, "loss": 0.2656, "step": 2727 }, { "epoch": 1.75235227700414, "grad_norm": 0.2671031057834625, "learning_rate": 8.832843104226625e-05, "loss": 0.3163, "step": 2728 }, { "epoch": 1.7531050056454647, "grad_norm": 0.26227349042892456, "learning_rate": 8.831843035745765e-05, "loss": 0.3171, "step": 2729 }, { "epoch": 1.7538577342867896, "grad_norm": 0.2681552469730377, "learning_rate": 8.830842602899563e-05, "loss": 0.3911, "step": 2730 }, { "epoch": 1.7546104629281145, "grad_norm": 0.24220313131809235, "learning_rate": 8.82984180579742e-05, "loss": 0.2831, "step": 2731 }, { "epoch": 1.7553631915694392, "grad_norm": 0.22368259727954865, "learning_rate": 8.828840644548784e-05, "loss": 0.2087, "step": 2732 }, { "epoch": 1.756115920210764, "grad_norm": 0.22129781544208527, "learning_rate": 8.82783911926314e-05, "loss": 0.2639, "step": 2733 }, { "epoch": 1.7568686488520888, "grad_norm": 0.24263858795166016, "learning_rate": 8.826837230050014e-05, "loss": 0.2683, "step": 2734 }, { "epoch": 1.7576213774934137, "grad_norm": 0.24362590909004211, "learning_rate": 8.825834977018968e-05, "loss": 0.2968, "step": 2735 }, { "epoch": 1.7583741061347384, "grad_norm": 0.23811013996601105, "learning_rate": 8.824832360279612e-05, "loss": 0.2998, "step": 2736 }, { "epoch": 1.7591268347760631, "grad_norm": 0.2628886103630066, "learning_rate": 8.823829379941586e-05, "loss": 0.3799, "step": 2737 }, { "epoch": 1.759879563417388, "grad_norm": 0.21919076144695282, "learning_rate": 8.822826036114577e-05, "loss": 0.2953, "step": 2738 }, { "epoch": 1.760632292058713, "grad_norm": 0.27862560749053955, "learning_rate": 8.821822328908308e-05, "loss": 0.3937, "step": 2739 }, { "epoch": 1.7613850207000377, "grad_norm": 0.2687314450740814, "learning_rate": 8.820818258432543e-05, "loss": 0.3068, "step": 2740 }, { "epoch": 1.7621377493413624, "grad_norm": 0.20397432148456573, "learning_rate": 8.819813824797088e-05, "loss": 0.1937, "step": 2741 }, { "epoch": 1.7628904779826873, "grad_norm": 0.2430112063884735, "learning_rate": 8.818809028111783e-05, "loss": 0.2636, "step": 2742 }, { "epoch": 1.7636432066240122, "grad_norm": 0.1895109862089157, "learning_rate": 8.817803868486512e-05, "loss": 0.1747, "step": 2743 }, { "epoch": 1.764395935265337, "grad_norm": 0.23810142278671265, "learning_rate": 8.816798346031199e-05, "loss": 0.3334, "step": 2744 }, { "epoch": 1.7651486639066616, "grad_norm": 0.25433334708213806, "learning_rate": 8.815792460855806e-05, "loss": 0.3675, "step": 2745 }, { "epoch": 1.7659013925479865, "grad_norm": 0.21648821234703064, "learning_rate": 8.814786213070334e-05, "loss": 0.2844, "step": 2746 }, { "epoch": 1.7666541211893112, "grad_norm": 0.25655072927474976, "learning_rate": 8.813779602784825e-05, "loss": 0.2937, "step": 2747 }, { "epoch": 1.767406849830636, "grad_norm": 0.2102089375257492, "learning_rate": 8.812772630109363e-05, "loss": 0.2539, "step": 2748 }, { "epoch": 1.7681595784719608, "grad_norm": 0.22562134265899658, "learning_rate": 8.811765295154064e-05, "loss": 0.2195, "step": 2749 }, { "epoch": 1.7689123071132857, "grad_norm": 0.2583082616329193, "learning_rate": 8.810757598029093e-05, "loss": 0.2443, "step": 2750 }, { "epoch": 1.7696650357546104, "grad_norm": 0.2567432224750519, "learning_rate": 8.809749538844648e-05, "loss": 0.3573, "step": 2751 }, { "epoch": 1.7704177643959351, "grad_norm": 0.235284224152565, "learning_rate": 8.80874111771097e-05, "loss": 0.244, "step": 2752 }, { "epoch": 1.77117049303726, "grad_norm": 0.24928733706474304, "learning_rate": 8.807732334738338e-05, "loss": 0.3394, "step": 2753 }, { "epoch": 1.771923221678585, "grad_norm": 0.27591976523399353, "learning_rate": 8.806723190037071e-05, "loss": 0.3254, "step": 2754 }, { "epoch": 1.7726759503199097, "grad_norm": 0.22765956819057465, "learning_rate": 8.805713683717527e-05, "loss": 0.2531, "step": 2755 }, { "epoch": 1.7734286789612344, "grad_norm": 0.19960978627204895, "learning_rate": 8.804703815890105e-05, "loss": 0.2549, "step": 2756 }, { "epoch": 1.7741814076025593, "grad_norm": 0.23928768932819366, "learning_rate": 8.803693586665244e-05, "loss": 0.2768, "step": 2757 }, { "epoch": 1.7749341362438842, "grad_norm": 0.21591655910015106, "learning_rate": 8.802682996153418e-05, "loss": 0.2326, "step": 2758 }, { "epoch": 1.7756868648852089, "grad_norm": 0.2414378523826599, "learning_rate": 8.801672044465144e-05, "loss": 0.2621, "step": 2759 }, { "epoch": 1.7764395935265336, "grad_norm": 0.22833751142024994, "learning_rate": 8.800660731710981e-05, "loss": 0.3951, "step": 2760 }, { "epoch": 1.7771923221678585, "grad_norm": 0.2689658999443054, "learning_rate": 8.799649058001521e-05, "loss": 0.3278, "step": 2761 }, { "epoch": 1.7779450508091834, "grad_norm": 0.22262988984584808, "learning_rate": 8.798637023447401e-05, "loss": 0.2922, "step": 2762 }, { "epoch": 1.778697779450508, "grad_norm": 0.21302740275859833, "learning_rate": 8.797624628159296e-05, "loss": 0.2078, "step": 2763 }, { "epoch": 1.7794505080918328, "grad_norm": 0.23901310563087463, "learning_rate": 8.796611872247921e-05, "loss": 0.317, "step": 2764 }, { "epoch": 1.7802032367331577, "grad_norm": 0.31611019372940063, "learning_rate": 8.795598755824026e-05, "loss": 0.2975, "step": 2765 }, { "epoch": 1.7809559653744826, "grad_norm": 0.24268025159835815, "learning_rate": 8.794585278998407e-05, "loss": 0.1954, "step": 2766 }, { "epoch": 1.7817086940158073, "grad_norm": 0.24954324960708618, "learning_rate": 8.793571441881896e-05, "loss": 0.3239, "step": 2767 }, { "epoch": 1.782461422657132, "grad_norm": 0.22670194506645203, "learning_rate": 8.792557244585363e-05, "loss": 0.2209, "step": 2768 }, { "epoch": 1.783214151298457, "grad_norm": 0.26488196849823, "learning_rate": 8.79154268721972e-05, "loss": 0.2992, "step": 2769 }, { "epoch": 1.7839668799397819, "grad_norm": 0.2359248697757721, "learning_rate": 8.790527769895917e-05, "loss": 0.2684, "step": 2770 }, { "epoch": 1.7847196085811066, "grad_norm": 0.21801109611988068, "learning_rate": 8.789512492724945e-05, "loss": 0.2899, "step": 2771 }, { "epoch": 1.7854723372224313, "grad_norm": 0.21188554167747498, "learning_rate": 8.788496855817832e-05, "loss": 0.3157, "step": 2772 }, { "epoch": 1.7862250658637562, "grad_norm": 0.191956028342247, "learning_rate": 8.787480859285648e-05, "loss": 0.3474, "step": 2773 }, { "epoch": 1.7869777945050809, "grad_norm": 0.2132023274898529, "learning_rate": 8.7864645032395e-05, "loss": 0.3079, "step": 2774 }, { "epoch": 1.7877305231464056, "grad_norm": 0.2118808776140213, "learning_rate": 8.785447787790534e-05, "loss": 0.2234, "step": 2775 }, { "epoch": 1.7884832517877305, "grad_norm": 0.25374308228492737, "learning_rate": 8.784430713049939e-05, "loss": 0.2633, "step": 2776 }, { "epoch": 1.7892359804290554, "grad_norm": 0.2459305375814438, "learning_rate": 8.783413279128936e-05, "loss": 0.2134, "step": 2777 }, { "epoch": 1.78998870907038, "grad_norm": 0.2447367161512375, "learning_rate": 8.782395486138797e-05, "loss": 0.241, "step": 2778 }, { "epoch": 1.7907414377117048, "grad_norm": 0.23906154930591583, "learning_rate": 8.781377334190819e-05, "loss": 0.2492, "step": 2779 }, { "epoch": 1.7914941663530297, "grad_norm": 0.23140911757946014, "learning_rate": 8.780358823396352e-05, "loss": 0.2882, "step": 2780 }, { "epoch": 1.7922468949943546, "grad_norm": 0.2139889895915985, "learning_rate": 8.779339953866777e-05, "loss": 0.2724, "step": 2781 }, { "epoch": 1.7929996236356793, "grad_norm": 0.2617070972919464, "learning_rate": 8.778320725713512e-05, "loss": 0.326, "step": 2782 }, { "epoch": 1.793752352277004, "grad_norm": 0.26086899638175964, "learning_rate": 8.777301139048025e-05, "loss": 0.2923, "step": 2783 }, { "epoch": 1.794505080918329, "grad_norm": 0.2677668333053589, "learning_rate": 8.776281193981809e-05, "loss": 0.343, "step": 2784 }, { "epoch": 1.7952578095596539, "grad_norm": 0.2356543093919754, "learning_rate": 8.775260890626408e-05, "loss": 0.2678, "step": 2785 }, { "epoch": 1.7960105382009786, "grad_norm": 0.2622126638889313, "learning_rate": 8.774240229093402e-05, "loss": 0.3228, "step": 2786 }, { "epoch": 1.7967632668423033, "grad_norm": 0.2501063644886017, "learning_rate": 8.773219209494407e-05, "loss": 0.2632, "step": 2787 }, { "epoch": 1.7975159954836282, "grad_norm": 0.2331642210483551, "learning_rate": 8.772197831941079e-05, "loss": 0.2809, "step": 2788 }, { "epoch": 1.798268724124953, "grad_norm": 0.21837691962718964, "learning_rate": 8.771176096545116e-05, "loss": 0.2929, "step": 2789 }, { "epoch": 1.7990214527662778, "grad_norm": 0.23509375751018524, "learning_rate": 8.770154003418254e-05, "loss": 0.3311, "step": 2790 }, { "epoch": 1.7997741814076025, "grad_norm": 0.2369178682565689, "learning_rate": 8.769131552672267e-05, "loss": 0.3054, "step": 2791 }, { "epoch": 1.8005269100489274, "grad_norm": 0.24844875931739807, "learning_rate": 8.768108744418968e-05, "loss": 0.3038, "step": 2792 }, { "epoch": 1.8012796386902523, "grad_norm": 0.1929578185081482, "learning_rate": 8.767085578770212e-05, "loss": 0.211, "step": 2793 }, { "epoch": 1.802032367331577, "grad_norm": 0.2921646237373352, "learning_rate": 8.766062055837886e-05, "loss": 0.2318, "step": 2794 }, { "epoch": 1.8027850959729017, "grad_norm": 0.22921934723854065, "learning_rate": 8.765038175733926e-05, "loss": 0.2333, "step": 2795 }, { "epoch": 1.8035378246142266, "grad_norm": 0.21813203394412994, "learning_rate": 8.7640139385703e-05, "loss": 0.2443, "step": 2796 }, { "epoch": 1.8042905532555513, "grad_norm": 0.24138596653938293, "learning_rate": 8.762989344459016e-05, "loss": 0.2934, "step": 2797 }, { "epoch": 1.805043281896876, "grad_norm": 0.23619209229946136, "learning_rate": 8.761964393512124e-05, "loss": 0.3157, "step": 2798 }, { "epoch": 1.805796010538201, "grad_norm": 0.2213306427001953, "learning_rate": 8.76093908584171e-05, "loss": 0.2517, "step": 2799 }, { "epoch": 1.8065487391795259, "grad_norm": 0.2242913842201233, "learning_rate": 8.759913421559902e-05, "loss": 0.2356, "step": 2800 }, { "epoch": 1.8065487391795259, "eval_loss": 0.2915043830871582, "eval_runtime": 455.9877, "eval_samples_per_second": 21.112, "eval_steps_per_second": 0.66, "step": 2800 }, { "epoch": 1.8073014678208505, "grad_norm": 0.24018467962741852, "learning_rate": 8.758887400778862e-05, "loss": 0.365, "step": 2801 }, { "epoch": 1.8080541964621752, "grad_norm": 0.23290587961673737, "learning_rate": 8.757861023610794e-05, "loss": 0.3495, "step": 2802 }, { "epoch": 1.8088069251035002, "grad_norm": 0.19621914625167847, "learning_rate": 8.756834290167944e-05, "loss": 0.2298, "step": 2803 }, { "epoch": 1.809559653744825, "grad_norm": 0.2540406882762909, "learning_rate": 8.755807200562593e-05, "loss": 0.2025, "step": 2804 }, { "epoch": 1.8103123823861498, "grad_norm": 0.24094659090042114, "learning_rate": 8.75477975490706e-05, "loss": 0.2929, "step": 2805 }, { "epoch": 1.8110651110274745, "grad_norm": 0.24527770280838013, "learning_rate": 8.753751953313708e-05, "loss": 0.2692, "step": 2806 }, { "epoch": 1.8118178396687994, "grad_norm": 0.2417868822813034, "learning_rate": 8.752723795894933e-05, "loss": 0.3663, "step": 2807 }, { "epoch": 1.8125705683101243, "grad_norm": 0.27062463760375977, "learning_rate": 8.751695282763174e-05, "loss": 0.3055, "step": 2808 }, { "epoch": 1.813323296951449, "grad_norm": 0.2262653112411499, "learning_rate": 8.750666414030909e-05, "loss": 0.2835, "step": 2809 }, { "epoch": 1.8140760255927737, "grad_norm": 0.2180846631526947, "learning_rate": 8.749637189810654e-05, "loss": 0.3075, "step": 2810 }, { "epoch": 1.8148287542340986, "grad_norm": 0.25188252329826355, "learning_rate": 8.748607610214959e-05, "loss": 0.2934, "step": 2811 }, { "epoch": 1.8155814828754235, "grad_norm": 0.23703154921531677, "learning_rate": 8.74757767535642e-05, "loss": 0.2826, "step": 2812 }, { "epoch": 1.8163342115167482, "grad_norm": 0.21418799459934235, "learning_rate": 8.74654738534767e-05, "loss": 0.2442, "step": 2813 }, { "epoch": 1.817086940158073, "grad_norm": 0.2467823028564453, "learning_rate": 8.745516740301378e-05, "loss": 0.1495, "step": 2814 }, { "epoch": 1.8178396687993978, "grad_norm": 0.23477841913700104, "learning_rate": 8.744485740330256e-05, "loss": 0.3542, "step": 2815 }, { "epoch": 1.8185923974407228, "grad_norm": 0.21137480437755585, "learning_rate": 8.743454385547052e-05, "loss": 0.2599, "step": 2816 }, { "epoch": 1.8193451260820475, "grad_norm": 0.23956574499607086, "learning_rate": 8.742422676064551e-05, "loss": 0.2566, "step": 2817 }, { "epoch": 1.8200978547233722, "grad_norm": 0.21462732553482056, "learning_rate": 8.741390611995581e-05, "loss": 0.2521, "step": 2818 }, { "epoch": 1.820850583364697, "grad_norm": 0.1788559854030609, "learning_rate": 8.740358193453008e-05, "loss": 0.2554, "step": 2819 }, { "epoch": 1.821603312006022, "grad_norm": 0.2536245584487915, "learning_rate": 8.739325420549735e-05, "loss": 0.3102, "step": 2820 }, { "epoch": 1.8223560406473467, "grad_norm": 0.22241193056106567, "learning_rate": 8.738292293398705e-05, "loss": 0.3063, "step": 2821 }, { "epoch": 1.8231087692886714, "grad_norm": 0.21649517118930817, "learning_rate": 8.737258812112896e-05, "loss": 0.3066, "step": 2822 }, { "epoch": 1.8238614979299963, "grad_norm": 0.20589040219783783, "learning_rate": 8.736224976805333e-05, "loss": 0.2412, "step": 2823 }, { "epoch": 1.824614226571321, "grad_norm": 0.2370595932006836, "learning_rate": 8.735190787589069e-05, "loss": 0.335, "step": 2824 }, { "epoch": 1.8253669552126457, "grad_norm": 0.1994960755109787, "learning_rate": 8.734156244577209e-05, "loss": 0.3034, "step": 2825 }, { "epoch": 1.8261196838539706, "grad_norm": 0.21514268219470978, "learning_rate": 8.73312134788288e-05, "loss": 0.2598, "step": 2826 }, { "epoch": 1.8268724124952955, "grad_norm": 0.18540267646312714, "learning_rate": 8.732086097619265e-05, "loss": 0.266, "step": 2827 }, { "epoch": 1.8276251411366202, "grad_norm": 0.23667356371879578, "learning_rate": 8.731050493899572e-05, "loss": 0.3299, "step": 2828 }, { "epoch": 1.828377869777945, "grad_norm": 0.19919835031032562, "learning_rate": 8.730014536837055e-05, "loss": 0.2784, "step": 2829 }, { "epoch": 1.8291305984192698, "grad_norm": 0.2409314662218094, "learning_rate": 8.728978226545003e-05, "loss": 0.281, "step": 2830 }, { "epoch": 1.8298833270605948, "grad_norm": 0.2280791699886322, "learning_rate": 8.72794156313675e-05, "loss": 0.3106, "step": 2831 }, { "epoch": 1.8306360557019195, "grad_norm": 0.3088875412940979, "learning_rate": 8.726904546725658e-05, "loss": 0.2497, "step": 2832 }, { "epoch": 1.8313887843432441, "grad_norm": 0.3412451148033142, "learning_rate": 8.725867177425138e-05, "loss": 0.3176, "step": 2833 }, { "epoch": 1.832141512984569, "grad_norm": 0.30903807282447815, "learning_rate": 8.724829455348633e-05, "loss": 0.2244, "step": 2834 }, { "epoch": 1.832894241625894, "grad_norm": 0.22488762438297272, "learning_rate": 8.723791380609625e-05, "loss": 0.3262, "step": 2835 }, { "epoch": 1.8336469702672187, "grad_norm": 0.298503041267395, "learning_rate": 8.722752953321644e-05, "loss": 0.264, "step": 2836 }, { "epoch": 1.8343996989085434, "grad_norm": 0.2768026888370514, "learning_rate": 8.72171417359824e-05, "loss": 0.2273, "step": 2837 }, { "epoch": 1.8351524275498683, "grad_norm": 0.3448675572872162, "learning_rate": 8.72067504155302e-05, "loss": 0.3148, "step": 2838 }, { "epoch": 1.8359051561911932, "grad_norm": 0.2496698945760727, "learning_rate": 8.71963555729962e-05, "loss": 0.3529, "step": 2839 }, { "epoch": 1.836657884832518, "grad_norm": 0.22723986208438873, "learning_rate": 8.718595720951716e-05, "loss": 0.2742, "step": 2840 }, { "epoch": 1.8374106134738426, "grad_norm": 0.21908727288246155, "learning_rate": 8.717555532623022e-05, "loss": 0.3267, "step": 2841 }, { "epoch": 1.8381633421151675, "grad_norm": 0.23017644882202148, "learning_rate": 8.716514992427293e-05, "loss": 0.2115, "step": 2842 }, { "epoch": 1.8389160707564924, "grad_norm": 0.29185348749160767, "learning_rate": 8.715474100478321e-05, "loss": 0.2924, "step": 2843 }, { "epoch": 1.8396687993978171, "grad_norm": 0.2685585021972656, "learning_rate": 8.714432856889935e-05, "loss": 0.2987, "step": 2844 }, { "epoch": 1.8404215280391418, "grad_norm": 0.23807938396930695, "learning_rate": 8.713391261776004e-05, "loss": 0.295, "step": 2845 }, { "epoch": 1.8411742566804667, "grad_norm": 0.24327579140663147, "learning_rate": 8.712349315250436e-05, "loss": 0.2119, "step": 2846 }, { "epoch": 1.8419269853217914, "grad_norm": 0.24869495630264282, "learning_rate": 8.711307017427178e-05, "loss": 0.2559, "step": 2847 }, { "epoch": 1.8426797139631161, "grad_norm": 0.24818648397922516, "learning_rate": 8.710264368420212e-05, "loss": 0.2091, "step": 2848 }, { "epoch": 1.843432442604441, "grad_norm": 0.20934540033340454, "learning_rate": 8.709221368343562e-05, "loss": 0.2571, "step": 2849 }, { "epoch": 1.844185171245766, "grad_norm": 0.23259326815605164, "learning_rate": 8.708178017311287e-05, "loss": 0.2227, "step": 2850 }, { "epoch": 1.8449378998870907, "grad_norm": 0.22839339077472687, "learning_rate": 8.707134315437489e-05, "loss": 0.2678, "step": 2851 }, { "epoch": 1.8456906285284154, "grad_norm": 0.29071512818336487, "learning_rate": 8.706090262836301e-05, "loss": 0.3196, "step": 2852 }, { "epoch": 1.8464433571697403, "grad_norm": 0.23151472210884094, "learning_rate": 8.705045859621905e-05, "loss": 0.2596, "step": 2853 }, { "epoch": 1.8471960858110652, "grad_norm": 0.24601100385189056, "learning_rate": 8.704001105908512e-05, "loss": 0.3689, "step": 2854 }, { "epoch": 1.84794881445239, "grad_norm": 0.22173075377941132, "learning_rate": 8.702956001810375e-05, "loss": 0.2669, "step": 2855 }, { "epoch": 1.8487015430937146, "grad_norm": 0.22268569469451904, "learning_rate": 8.701910547441786e-05, "loss": 0.2989, "step": 2856 }, { "epoch": 1.8494542717350395, "grad_norm": 0.25744694471359253, "learning_rate": 8.700864742917072e-05, "loss": 0.2558, "step": 2857 }, { "epoch": 1.8502070003763644, "grad_norm": 0.22924372553825378, "learning_rate": 8.699818588350601e-05, "loss": 0.2307, "step": 2858 }, { "epoch": 1.8509597290176891, "grad_norm": 0.21253761649131775, "learning_rate": 8.698772083856782e-05, "loss": 0.3142, "step": 2859 }, { "epoch": 1.8517124576590138, "grad_norm": 0.23942196369171143, "learning_rate": 8.697725229550056e-05, "loss": 0.2164, "step": 2860 }, { "epoch": 1.8524651863003387, "grad_norm": 0.22019614279270172, "learning_rate": 8.696678025544908e-05, "loss": 0.2817, "step": 2861 }, { "epoch": 1.8532179149416637, "grad_norm": 0.2884751558303833, "learning_rate": 8.695630471955859e-05, "loss": 0.2819, "step": 2862 }, { "epoch": 1.8539706435829884, "grad_norm": 0.321354478597641, "learning_rate": 8.694582568897462e-05, "loss": 0.2327, "step": 2863 }, { "epoch": 1.854723372224313, "grad_norm": 0.26361405849456787, "learning_rate": 8.693534316484321e-05, "loss": 0.2907, "step": 2864 }, { "epoch": 1.855476100865638, "grad_norm": 0.25261208415031433, "learning_rate": 8.692485714831067e-05, "loss": 0.2732, "step": 2865 }, { "epoch": 1.8562288295069629, "grad_norm": 0.3338828384876251, "learning_rate": 8.691436764052378e-05, "loss": 0.3309, "step": 2866 }, { "epoch": 1.8569815581482876, "grad_norm": 0.2783341109752655, "learning_rate": 8.69038746426296e-05, "loss": 0.2767, "step": 2867 }, { "epoch": 1.8577342867896123, "grad_norm": 0.23239947855472565, "learning_rate": 8.689337815577564e-05, "loss": 0.2909, "step": 2868 }, { "epoch": 1.8584870154309372, "grad_norm": 0.23658406734466553, "learning_rate": 8.688287818110983e-05, "loss": 0.2499, "step": 2869 }, { "epoch": 1.8592397440722621, "grad_norm": 0.21386252343654633, "learning_rate": 8.687237471978038e-05, "loss": 0.3249, "step": 2870 }, { "epoch": 1.8599924727135866, "grad_norm": 0.23580464720726013, "learning_rate": 8.686186777293594e-05, "loss": 0.3258, "step": 2871 }, { "epoch": 1.8607452013549115, "grad_norm": 0.25041162967681885, "learning_rate": 8.685135734172557e-05, "loss": 0.2691, "step": 2872 }, { "epoch": 1.8614979299962364, "grad_norm": 0.21163125336170197, "learning_rate": 8.684084342729864e-05, "loss": 0.2709, "step": 2873 }, { "epoch": 1.8622506586375611, "grad_norm": 0.21495287120342255, "learning_rate": 8.683032603080494e-05, "loss": 0.3403, "step": 2874 }, { "epoch": 1.8630033872788858, "grad_norm": 0.22675228118896484, "learning_rate": 8.681980515339464e-05, "loss": 0.328, "step": 2875 }, { "epoch": 1.8637561159202107, "grad_norm": 0.21838876605033875, "learning_rate": 8.68092807962183e-05, "loss": 0.3523, "step": 2876 }, { "epoch": 1.8645088445615356, "grad_norm": 0.23061077296733856, "learning_rate": 8.679875296042682e-05, "loss": 0.343, "step": 2877 }, { "epoch": 1.8652615732028603, "grad_norm": 0.1981222927570343, "learning_rate": 8.678822164717155e-05, "loss": 0.2913, "step": 2878 }, { "epoch": 1.866014301844185, "grad_norm": 0.23590002954006195, "learning_rate": 8.677768685760412e-05, "loss": 0.251, "step": 2879 }, { "epoch": 1.86676703048551, "grad_norm": 0.2050963193178177, "learning_rate": 8.676714859287666e-05, "loss": 0.2526, "step": 2880 }, { "epoch": 1.8675197591268349, "grad_norm": 0.22967682778835297, "learning_rate": 8.675660685414157e-05, "loss": 0.2675, "step": 2881 }, { "epoch": 1.8682724877681596, "grad_norm": 0.2015855610370636, "learning_rate": 8.674606164255171e-05, "loss": 0.2876, "step": 2882 }, { "epoch": 1.8690252164094843, "grad_norm": 0.27875471115112305, "learning_rate": 8.673551295926028e-05, "loss": 0.3882, "step": 2883 }, { "epoch": 1.8697779450508092, "grad_norm": 0.25285956263542175, "learning_rate": 8.672496080542087e-05, "loss": 0.3025, "step": 2884 }, { "epoch": 1.870530673692134, "grad_norm": 0.2154601365327835, "learning_rate": 8.671440518218744e-05, "loss": 0.3477, "step": 2885 }, { "epoch": 1.8712834023334588, "grad_norm": 0.2295289933681488, "learning_rate": 8.670384609071435e-05, "loss": 0.2933, "step": 2886 }, { "epoch": 1.8720361309747835, "grad_norm": 0.25231748819351196, "learning_rate": 8.66932835321563e-05, "loss": 0.2481, "step": 2887 }, { "epoch": 1.8727888596161084, "grad_norm": 0.2262069582939148, "learning_rate": 8.668271750766843e-05, "loss": 0.2996, "step": 2888 }, { "epoch": 1.8735415882574333, "grad_norm": 0.23247423768043518, "learning_rate": 8.667214801840619e-05, "loss": 0.2778, "step": 2889 }, { "epoch": 1.874294316898758, "grad_norm": 0.21724571287631989, "learning_rate": 8.666157506552547e-05, "loss": 0.3166, "step": 2890 }, { "epoch": 1.8750470455400827, "grad_norm": 0.2218727320432663, "learning_rate": 8.66509986501825e-05, "loss": 0.3311, "step": 2891 }, { "epoch": 1.8757997741814076, "grad_norm": 0.2138187289237976, "learning_rate": 8.66404187735339e-05, "loss": 0.2455, "step": 2892 }, { "epoch": 1.8765525028227326, "grad_norm": 0.22879035770893097, "learning_rate": 8.662983543673669e-05, "loss": 0.2327, "step": 2893 }, { "epoch": 1.8773052314640573, "grad_norm": 0.2275388389825821, "learning_rate": 8.661924864094822e-05, "loss": 0.2901, "step": 2894 }, { "epoch": 1.878057960105382, "grad_norm": 0.2097891867160797, "learning_rate": 8.660865838732626e-05, "loss": 0.2724, "step": 2895 }, { "epoch": 1.8788106887467069, "grad_norm": 0.2008601874113083, "learning_rate": 8.659806467702894e-05, "loss": 0.23, "step": 2896 }, { "epoch": 1.8795634173880316, "grad_norm": 0.21263530850410461, "learning_rate": 8.658746751121478e-05, "loss": 0.2741, "step": 2897 }, { "epoch": 1.8803161460293563, "grad_norm": 0.24013717472553253, "learning_rate": 8.657686689104267e-05, "loss": 0.3778, "step": 2898 }, { "epoch": 1.8810688746706812, "grad_norm": 0.19852349162101746, "learning_rate": 8.656626281767184e-05, "loss": 0.2975, "step": 2899 }, { "epoch": 1.881821603312006, "grad_norm": 0.248243048787117, "learning_rate": 8.655565529226198e-05, "loss": 0.2007, "step": 2900 }, { "epoch": 1.8825743319533308, "grad_norm": 0.20241428911685944, "learning_rate": 8.654504431597312e-05, "loss": 0.2228, "step": 2901 }, { "epoch": 1.8833270605946555, "grad_norm": 0.23202480375766754, "learning_rate": 8.653442988996563e-05, "loss": 0.2626, "step": 2902 }, { "epoch": 1.8840797892359804, "grad_norm": 0.20530018210411072, "learning_rate": 8.652381201540031e-05, "loss": 0.2685, "step": 2903 }, { "epoch": 1.8848325178773053, "grad_norm": 0.19486404955387115, "learning_rate": 8.651319069343828e-05, "loss": 0.2416, "step": 2904 }, { "epoch": 1.88558524651863, "grad_norm": 0.22673244774341583, "learning_rate": 8.650256592524112e-05, "loss": 0.286, "step": 2905 }, { "epoch": 1.8863379751599547, "grad_norm": 0.2040712684392929, "learning_rate": 8.649193771197068e-05, "loss": 0.2036, "step": 2906 }, { "epoch": 1.8870907038012796, "grad_norm": 0.29954782128334045, "learning_rate": 8.648130605478931e-05, "loss": 0.2809, "step": 2907 }, { "epoch": 1.8878434324426046, "grad_norm": 0.2695918083190918, "learning_rate": 8.647067095485963e-05, "loss": 0.3324, "step": 2908 }, { "epoch": 1.8885961610839292, "grad_norm": 0.233209490776062, "learning_rate": 8.646003241334468e-05, "loss": 0.294, "step": 2909 }, { "epoch": 1.889348889725254, "grad_norm": 0.23592136800289154, "learning_rate": 8.64493904314079e-05, "loss": 0.2974, "step": 2910 }, { "epoch": 1.8901016183665789, "grad_norm": 0.21285352110862732, "learning_rate": 8.643874501021307e-05, "loss": 0.2182, "step": 2911 }, { "epoch": 1.8908543470079038, "grad_norm": 0.20924341678619385, "learning_rate": 8.642809615092435e-05, "loss": 0.3227, "step": 2912 }, { "epoch": 1.8916070756492285, "grad_norm": 0.2423587441444397, "learning_rate": 8.641744385470628e-05, "loss": 0.2606, "step": 2913 }, { "epoch": 1.8923598042905532, "grad_norm": 0.23832157254219055, "learning_rate": 8.640678812272378e-05, "loss": 0.2846, "step": 2914 }, { "epoch": 1.893112532931878, "grad_norm": 0.23278239369392395, "learning_rate": 8.639612895614216e-05, "loss": 0.2206, "step": 2915 }, { "epoch": 1.893865261573203, "grad_norm": 0.2235506922006607, "learning_rate": 8.638546635612708e-05, "loss": 0.2483, "step": 2916 }, { "epoch": 1.8946179902145277, "grad_norm": 0.19948139786720276, "learning_rate": 8.637480032384459e-05, "loss": 0.1926, "step": 2917 }, { "epoch": 1.8953707188558524, "grad_norm": 0.23911461234092712, "learning_rate": 8.636413086046109e-05, "loss": 0.3223, "step": 2918 }, { "epoch": 1.8961234474971773, "grad_norm": 0.18584205210208893, "learning_rate": 8.63534579671434e-05, "loss": 0.2799, "step": 2919 }, { "epoch": 1.8968761761385022, "grad_norm": 0.20499111711978912, "learning_rate": 8.63427816450587e-05, "loss": 0.3185, "step": 2920 }, { "epoch": 1.8976289047798267, "grad_norm": 0.2011103481054306, "learning_rate": 8.633210189537452e-05, "loss": 0.1872, "step": 2921 }, { "epoch": 1.8983816334211516, "grad_norm": 0.21738362312316895, "learning_rate": 8.632141871925877e-05, "loss": 0.254, "step": 2922 }, { "epoch": 1.8991343620624765, "grad_norm": 0.2145114243030548, "learning_rate": 8.631073211787978e-05, "loss": 0.243, "step": 2923 }, { "epoch": 1.8998870907038012, "grad_norm": 0.20909902453422546, "learning_rate": 8.630004209240619e-05, "loss": 0.329, "step": 2924 }, { "epoch": 1.900639819345126, "grad_norm": 0.21982713043689728, "learning_rate": 8.628934864400706e-05, "loss": 0.2496, "step": 2925 }, { "epoch": 1.9013925479864509, "grad_norm": 0.21098129451274872, "learning_rate": 8.627865177385178e-05, "loss": 0.174, "step": 2926 }, { "epoch": 1.9021452766277758, "grad_norm": 0.23815716803073883, "learning_rate": 8.626795148311022e-05, "loss": 0.2853, "step": 2927 }, { "epoch": 1.9028980052691005, "grad_norm": 0.27293330430984497, "learning_rate": 8.625724777295245e-05, "loss": 0.2549, "step": 2928 }, { "epoch": 1.9036507339104252, "grad_norm": 0.20729248225688934, "learning_rate": 8.624654064454907e-05, "loss": 0.3652, "step": 2929 }, { "epoch": 1.90440346255175, "grad_norm": 0.2872817814350128, "learning_rate": 8.623583009907099e-05, "loss": 0.4245, "step": 2930 }, { "epoch": 1.905156191193075, "grad_norm": 0.25292328000068665, "learning_rate": 8.62251161376895e-05, "loss": 0.295, "step": 2931 }, { "epoch": 1.9059089198343997, "grad_norm": 0.2863334119319916, "learning_rate": 8.621439876157622e-05, "loss": 0.3072, "step": 2932 }, { "epoch": 1.9066616484757244, "grad_norm": 0.22648480534553528, "learning_rate": 8.620367797190327e-05, "loss": 0.282, "step": 2933 }, { "epoch": 1.9074143771170493, "grad_norm": 0.268633633852005, "learning_rate": 8.619295376984297e-05, "loss": 0.2837, "step": 2934 }, { "epoch": 1.9081671057583742, "grad_norm": 0.2321457713842392, "learning_rate": 8.618222615656816e-05, "loss": 0.185, "step": 2935 }, { "epoch": 1.908919834399699, "grad_norm": 0.24460190534591675, "learning_rate": 8.617149513325198e-05, "loss": 0.287, "step": 2936 }, { "epoch": 1.9096725630410236, "grad_norm": 0.24697184562683105, "learning_rate": 8.616076070106796e-05, "loss": 0.3049, "step": 2937 }, { "epoch": 1.9104252916823485, "grad_norm": 0.21430233120918274, "learning_rate": 8.615002286119e-05, "loss": 0.3179, "step": 2938 }, { "epoch": 1.9111780203236735, "grad_norm": 0.2456596940755844, "learning_rate": 8.613928161479237e-05, "loss": 0.3956, "step": 2939 }, { "epoch": 1.9119307489649982, "grad_norm": 0.22907528281211853, "learning_rate": 8.612853696304972e-05, "loss": 0.2682, "step": 2940 }, { "epoch": 1.9126834776063228, "grad_norm": 0.22149233520030975, "learning_rate": 8.611778890713707e-05, "loss": 0.2842, "step": 2941 }, { "epoch": 1.9134362062476478, "grad_norm": 0.1800977736711502, "learning_rate": 8.610703744822981e-05, "loss": 0.1835, "step": 2942 }, { "epoch": 1.9141889348889727, "grad_norm": 0.24327979981899261, "learning_rate": 8.60962825875037e-05, "loss": 0.3572, "step": 2943 }, { "epoch": 1.9149416635302974, "grad_norm": 0.19312527775764465, "learning_rate": 8.608552432613488e-05, "loss": 0.2937, "step": 2944 }, { "epoch": 1.915694392171622, "grad_norm": 0.21501602232456207, "learning_rate": 8.607476266529987e-05, "loss": 0.2028, "step": 2945 }, { "epoch": 1.916447120812947, "grad_norm": 0.21633580327033997, "learning_rate": 8.606399760617552e-05, "loss": 0.2078, "step": 2946 }, { "epoch": 1.9171998494542717, "grad_norm": 0.21256864070892334, "learning_rate": 8.605322914993909e-05, "loss": 0.281, "step": 2947 }, { "epoch": 1.9179525780955964, "grad_norm": 0.22933848202228546, "learning_rate": 8.604245729776822e-05, "loss": 0.2595, "step": 2948 }, { "epoch": 1.9187053067369213, "grad_norm": 0.2019030749797821, "learning_rate": 8.60316820508409e-05, "loss": 0.2021, "step": 2949 }, { "epoch": 1.9194580353782462, "grad_norm": 0.20405729115009308, "learning_rate": 8.602090341033547e-05, "loss": 0.2269, "step": 2950 }, { "epoch": 1.920210764019571, "grad_norm": 0.25741708278656006, "learning_rate": 8.601012137743069e-05, "loss": 0.2171, "step": 2951 }, { "epoch": 1.9209634926608956, "grad_norm": 0.20995643734931946, "learning_rate": 8.599933595330566e-05, "loss": 0.3128, "step": 2952 }, { "epoch": 1.9217162213022205, "grad_norm": 0.24055533111095428, "learning_rate": 8.598854713913985e-05, "loss": 0.4239, "step": 2953 }, { "epoch": 1.9224689499435454, "grad_norm": 0.2122773379087448, "learning_rate": 8.597775493611311e-05, "loss": 0.3003, "step": 2954 }, { "epoch": 1.9232216785848701, "grad_norm": 0.27335125207901, "learning_rate": 8.596695934540567e-05, "loss": 0.283, "step": 2955 }, { "epoch": 1.9239744072261948, "grad_norm": 0.21074801683425903, "learning_rate": 8.595616036819812e-05, "loss": 0.1974, "step": 2956 }, { "epoch": 1.9247271358675198, "grad_norm": 0.24287545680999756, "learning_rate": 8.594535800567142e-05, "loss": 0.3073, "step": 2957 }, { "epoch": 1.9254798645088447, "grad_norm": 0.24772042036056519, "learning_rate": 8.593455225900688e-05, "loss": 0.2948, "step": 2958 }, { "epoch": 1.9262325931501694, "grad_norm": 0.19711074233055115, "learning_rate": 8.592374312938623e-05, "loss": 0.3064, "step": 2959 }, { "epoch": 1.926985321791494, "grad_norm": 0.219325989484787, "learning_rate": 8.591293061799151e-05, "loss": 0.2863, "step": 2960 }, { "epoch": 1.927738050432819, "grad_norm": 0.23715239763259888, "learning_rate": 8.590211472600518e-05, "loss": 0.3043, "step": 2961 }, { "epoch": 1.928490779074144, "grad_norm": 0.22474516928195953, "learning_rate": 8.589129545461003e-05, "loss": 0.288, "step": 2962 }, { "epoch": 1.9292435077154686, "grad_norm": 0.21535854041576385, "learning_rate": 8.588047280498926e-05, "loss": 0.2485, "step": 2963 }, { "epoch": 1.9299962363567933, "grad_norm": 0.2367725819349289, "learning_rate": 8.586964677832643e-05, "loss": 0.3467, "step": 2964 }, { "epoch": 1.9307489649981182, "grad_norm": 0.2519165277481079, "learning_rate": 8.585881737580543e-05, "loss": 0.3053, "step": 2965 }, { "epoch": 1.9315016936394431, "grad_norm": 0.19480787217617035, "learning_rate": 8.584798459861055e-05, "loss": 0.1565, "step": 2966 }, { "epoch": 1.9322544222807678, "grad_norm": 0.20121853053569794, "learning_rate": 8.583714844792646e-05, "loss": 0.2644, "step": 2967 }, { "epoch": 1.9330071509220925, "grad_norm": 0.20641329884529114, "learning_rate": 8.582630892493818e-05, "loss": 0.2734, "step": 2968 }, { "epoch": 1.9337598795634174, "grad_norm": 0.19879822432994843, "learning_rate": 8.58154660308311e-05, "loss": 0.2341, "step": 2969 }, { "epoch": 1.9345126082047424, "grad_norm": 0.22249239683151245, "learning_rate": 8.5804619766791e-05, "loss": 0.2772, "step": 2970 }, { "epoch": 1.9352653368460668, "grad_norm": 0.19953785836696625, "learning_rate": 8.579377013400398e-05, "loss": 0.2769, "step": 2971 }, { "epoch": 1.9360180654873917, "grad_norm": 0.2153954654932022, "learning_rate": 8.578291713365656e-05, "loss": 0.2756, "step": 2972 }, { "epoch": 1.9367707941287167, "grad_norm": 0.2132946103811264, "learning_rate": 8.57720607669356e-05, "loss": 0.3362, "step": 2973 }, { "epoch": 1.9375235227700414, "grad_norm": 0.2018149197101593, "learning_rate": 8.576120103502834e-05, "loss": 0.2678, "step": 2974 }, { "epoch": 1.938276251411366, "grad_norm": 0.2617453634738922, "learning_rate": 8.575033793912239e-05, "loss": 0.3848, "step": 2975 }, { "epoch": 1.939028980052691, "grad_norm": 0.20273295044898987, "learning_rate": 8.57394714804057e-05, "loss": 0.2539, "step": 2976 }, { "epoch": 1.939781708694016, "grad_norm": 0.2055341899394989, "learning_rate": 8.572860166006665e-05, "loss": 0.2407, "step": 2977 }, { "epoch": 1.9405344373353406, "grad_norm": 0.2045820653438568, "learning_rate": 8.57177284792939e-05, "loss": 0.2333, "step": 2978 }, { "epoch": 1.9412871659766653, "grad_norm": 0.23753294348716736, "learning_rate": 8.570685193927655e-05, "loss": 0.2353, "step": 2979 }, { "epoch": 1.9420398946179902, "grad_norm": 0.20989519357681274, "learning_rate": 8.569597204120405e-05, "loss": 0.2881, "step": 2980 }, { "epoch": 1.9427926232593151, "grad_norm": 0.22021034359931946, "learning_rate": 8.568508878626618e-05, "loss": 0.2317, "step": 2981 }, { "epoch": 1.9435453519006398, "grad_norm": 1.5946131944656372, "learning_rate": 8.567420217565315e-05, "loss": 0.2252, "step": 2982 }, { "epoch": 1.9442980805419645, "grad_norm": 0.2584066092967987, "learning_rate": 8.566331221055549e-05, "loss": 0.27, "step": 2983 }, { "epoch": 1.9450508091832894, "grad_norm": 0.220820352435112, "learning_rate": 8.56524188921641e-05, "loss": 0.2552, "step": 2984 }, { "epoch": 1.9458035378246143, "grad_norm": 0.2744828462600708, "learning_rate": 8.564152222167027e-05, "loss": 0.2759, "step": 2985 }, { "epoch": 1.946556266465939, "grad_norm": 0.20876671373844147, "learning_rate": 8.563062220026564e-05, "loss": 0.2131, "step": 2986 }, { "epoch": 1.9473089951072637, "grad_norm": 0.23026233911514282, "learning_rate": 8.561971882914223e-05, "loss": 0.3245, "step": 2987 }, { "epoch": 1.9480617237485887, "grad_norm": 0.2811889350414276, "learning_rate": 8.560881210949238e-05, "loss": 0.262, "step": 2988 }, { "epoch": 1.9488144523899136, "grad_norm": 0.22063106298446655, "learning_rate": 8.559790204250887e-05, "loss": 0.3295, "step": 2989 }, { "epoch": 1.9495671810312383, "grad_norm": 0.20668788254261017, "learning_rate": 8.55869886293848e-05, "loss": 0.2091, "step": 2990 }, { "epoch": 1.950319909672563, "grad_norm": 0.2421753853559494, "learning_rate": 8.557607187131364e-05, "loss": 0.2995, "step": 2991 }, { "epoch": 1.9510726383138879, "grad_norm": 0.22836843132972717, "learning_rate": 8.55651517694892e-05, "loss": 0.3348, "step": 2992 }, { "epoch": 1.9518253669552128, "grad_norm": 0.26765379309654236, "learning_rate": 8.555422832510576e-05, "loss": 0.325, "step": 2993 }, { "epoch": 1.9525780955965375, "grad_norm": 0.2882736325263977, "learning_rate": 8.554330153935782e-05, "loss": 0.2464, "step": 2994 }, { "epoch": 1.9533308242378622, "grad_norm": 0.2746528387069702, "learning_rate": 8.553237141344035e-05, "loss": 0.2763, "step": 2995 }, { "epoch": 1.9540835528791871, "grad_norm": 0.2537713050842285, "learning_rate": 8.552143794854865e-05, "loss": 0.307, "step": 2996 }, { "epoch": 1.9548362815205118, "grad_norm": 0.2654704451560974, "learning_rate": 8.55105011458784e-05, "loss": 0.3186, "step": 2997 }, { "epoch": 1.9555890101618365, "grad_norm": 0.2683500051498413, "learning_rate": 8.54995610066256e-05, "loss": 0.296, "step": 2998 }, { "epoch": 1.9563417388031614, "grad_norm": 0.2821999490261078, "learning_rate": 8.548861753198665e-05, "loss": 0.3464, "step": 2999 }, { "epoch": 1.9570944674444863, "grad_norm": 0.22154122591018677, "learning_rate": 8.547767072315835e-05, "loss": 0.3027, "step": 3000 }, { "epoch": 1.9570944674444863, "eval_loss": 0.2772587537765503, "eval_runtime": 456.6837, "eval_samples_per_second": 21.08, "eval_steps_per_second": 0.659, "step": 3000 }, { "epoch": 1.957847196085811, "grad_norm": 0.21864928305149078, "learning_rate": 8.546672058133779e-05, "loss": 0.2215, "step": 3001 }, { "epoch": 1.9585999247271357, "grad_norm": 0.2190273106098175, "learning_rate": 8.545576710772248e-05, "loss": 0.2536, "step": 3002 }, { "epoch": 1.9593526533684607, "grad_norm": 0.24375633895397186, "learning_rate": 8.544481030351028e-05, "loss": 0.3518, "step": 3003 }, { "epoch": 1.9601053820097856, "grad_norm": 0.23248916864395142, "learning_rate": 8.543385016989937e-05, "loss": 0.2288, "step": 3004 }, { "epoch": 1.9608581106511103, "grad_norm": 0.23642736673355103, "learning_rate": 8.542288670808838e-05, "loss": 0.2402, "step": 3005 }, { "epoch": 1.961610839292435, "grad_norm": 0.22408750653266907, "learning_rate": 8.541191991927624e-05, "loss": 0.2685, "step": 3006 }, { "epoch": 1.9623635679337599, "grad_norm": 0.20452839136123657, "learning_rate": 8.540094980466225e-05, "loss": 0.2633, "step": 3007 }, { "epoch": 1.9631162965750848, "grad_norm": 0.20500244200229645, "learning_rate": 8.53899763654461e-05, "loss": 0.3048, "step": 3008 }, { "epoch": 1.9638690252164095, "grad_norm": 0.24516703188419342, "learning_rate": 8.537899960282783e-05, "loss": 0.1974, "step": 3009 }, { "epoch": 1.9646217538577342, "grad_norm": 0.21500150859355927, "learning_rate": 8.536801951800784e-05, "loss": 0.3058, "step": 3010 }, { "epoch": 1.965374482499059, "grad_norm": 0.23131513595581055, "learning_rate": 8.535703611218688e-05, "loss": 0.2691, "step": 3011 }, { "epoch": 1.966127211140384, "grad_norm": 0.2054387629032135, "learning_rate": 8.53460493865661e-05, "loss": 0.3272, "step": 3012 }, { "epoch": 1.9668799397817087, "grad_norm": 0.24086467921733856, "learning_rate": 8.533505934234699e-05, "loss": 0.3632, "step": 3013 }, { "epoch": 1.9676326684230334, "grad_norm": 0.24434475600719452, "learning_rate": 8.53240659807314e-05, "loss": 0.2587, "step": 3014 }, { "epoch": 1.9683853970643583, "grad_norm": 0.18835920095443726, "learning_rate": 8.531306930292153e-05, "loss": 0.1922, "step": 3015 }, { "epoch": 1.9691381257056833, "grad_norm": 0.2346513420343399, "learning_rate": 8.530206931012e-05, "loss": 0.2121, "step": 3016 }, { "epoch": 1.969890854347008, "grad_norm": 0.23613634705543518, "learning_rate": 8.52910660035297e-05, "loss": 0.3836, "step": 3017 }, { "epoch": 1.9706435829883326, "grad_norm": 0.2180359810590744, "learning_rate": 8.528005938435398e-05, "loss": 0.243, "step": 3018 }, { "epoch": 1.9713963116296576, "grad_norm": 0.21022672951221466, "learning_rate": 8.52690494537965e-05, "loss": 0.2962, "step": 3019 }, { "epoch": 1.9721490402709823, "grad_norm": 0.24418272078037262, "learning_rate": 8.525803621306125e-05, "loss": 0.3334, "step": 3020 }, { "epoch": 1.972901768912307, "grad_norm": 0.21116766333580017, "learning_rate": 8.524701966335268e-05, "loss": 0.2024, "step": 3021 }, { "epoch": 1.9736544975536319, "grad_norm": 0.23705634474754333, "learning_rate": 8.523599980587551e-05, "loss": 0.2593, "step": 3022 }, { "epoch": 1.9744072261949568, "grad_norm": 0.1656016856431961, "learning_rate": 8.522497664183486e-05, "loss": 0.2107, "step": 3023 }, { "epoch": 1.9751599548362815, "grad_norm": 0.2452172338962555, "learning_rate": 8.521395017243621e-05, "loss": 0.2841, "step": 3024 }, { "epoch": 1.9759126834776062, "grad_norm": 0.18848060071468353, "learning_rate": 8.520292039888539e-05, "loss": 0.219, "step": 3025 }, { "epoch": 1.976665412118931, "grad_norm": 0.2830885648727417, "learning_rate": 8.51918873223886e-05, "loss": 0.3241, "step": 3026 }, { "epoch": 1.977418140760256, "grad_norm": 0.22855377197265625, "learning_rate": 8.51808509441524e-05, "loss": 0.368, "step": 3027 }, { "epoch": 1.9781708694015807, "grad_norm": 0.2616623044013977, "learning_rate": 8.516981126538374e-05, "loss": 0.2949, "step": 3028 }, { "epoch": 1.9789235980429054, "grad_norm": 0.21715864539146423, "learning_rate": 8.515876828728985e-05, "loss": 0.2809, "step": 3029 }, { "epoch": 1.9796763266842303, "grad_norm": 0.23090210556983948, "learning_rate": 8.514772201107842e-05, "loss": 0.323, "step": 3030 }, { "epoch": 1.9804290553255552, "grad_norm": 0.2105245590209961, "learning_rate": 8.513667243795744e-05, "loss": 0.3534, "step": 3031 }, { "epoch": 1.98118178396688, "grad_norm": 0.21090847253799438, "learning_rate": 8.512561956913528e-05, "loss": 0.2435, "step": 3032 }, { "epoch": 1.9819345126082046, "grad_norm": 0.29027748107910156, "learning_rate": 8.511456340582065e-05, "loss": 0.316, "step": 3033 }, { "epoch": 1.9826872412495296, "grad_norm": 0.22997485101222992, "learning_rate": 8.510350394922263e-05, "loss": 0.245, "step": 3034 }, { "epoch": 1.9834399698908545, "grad_norm": 0.24350984394550323, "learning_rate": 8.50924412005507e-05, "loss": 0.2242, "step": 3035 }, { "epoch": 1.9841926985321792, "grad_norm": 0.19305388629436493, "learning_rate": 8.508137516101461e-05, "loss": 0.255, "step": 3036 }, { "epoch": 1.9849454271735039, "grad_norm": 0.2158816009759903, "learning_rate": 8.507030583182458e-05, "loss": 0.2553, "step": 3037 }, { "epoch": 1.9856981558148288, "grad_norm": 0.2297336906194687, "learning_rate": 8.505923321419111e-05, "loss": 0.3256, "step": 3038 }, { "epoch": 1.9864508844561537, "grad_norm": 0.21916307508945465, "learning_rate": 8.504815730932509e-05, "loss": 0.2744, "step": 3039 }, { "epoch": 1.9872036130974784, "grad_norm": 0.22350212931632996, "learning_rate": 8.503707811843776e-05, "loss": 0.2826, "step": 3040 }, { "epoch": 1.987956341738803, "grad_norm": 0.1920575648546219, "learning_rate": 8.502599564274075e-05, "loss": 0.2934, "step": 3041 }, { "epoch": 1.988709070380128, "grad_norm": 0.2508493959903717, "learning_rate": 8.501490988344598e-05, "loss": 0.2158, "step": 3042 }, { "epoch": 1.989461799021453, "grad_norm": 0.23814593255519867, "learning_rate": 8.500382084176579e-05, "loss": 0.347, "step": 3043 }, { "epoch": 1.9902145276627776, "grad_norm": 0.21692967414855957, "learning_rate": 8.499272851891287e-05, "loss": 0.2398, "step": 3044 }, { "epoch": 1.9909672563041023, "grad_norm": 0.21261534094810486, "learning_rate": 8.498163291610024e-05, "loss": 0.2463, "step": 3045 }, { "epoch": 1.9917199849454272, "grad_norm": 0.202280193567276, "learning_rate": 8.497053403454133e-05, "loss": 0.2662, "step": 3046 }, { "epoch": 1.992472713586752, "grad_norm": 0.19758263230323792, "learning_rate": 8.495943187544987e-05, "loss": 0.2328, "step": 3047 }, { "epoch": 1.9932254422280766, "grad_norm": 0.19490082561969757, "learning_rate": 8.494832644003997e-05, "loss": 0.1693, "step": 3048 }, { "epoch": 1.9939781708694015, "grad_norm": 0.207685649394989, "learning_rate": 8.493721772952613e-05, "loss": 0.2615, "step": 3049 }, { "epoch": 1.9947308995107265, "grad_norm": 0.19758422672748566, "learning_rate": 8.492610574512317e-05, "loss": 0.3012, "step": 3050 }, { "epoch": 1.9954836281520512, "grad_norm": 0.20222823321819305, "learning_rate": 8.491499048804628e-05, "loss": 0.2561, "step": 3051 }, { "epoch": 1.9962363567933759, "grad_norm": 0.19872619211673737, "learning_rate": 8.4903871959511e-05, "loss": 0.2837, "step": 3052 }, { "epoch": 1.9969890854347008, "grad_norm": 0.22462403774261475, "learning_rate": 8.489275016073324e-05, "loss": 0.335, "step": 3053 }, { "epoch": 1.9977418140760257, "grad_norm": 0.19991037249565125, "learning_rate": 8.488162509292928e-05, "loss": 0.3476, "step": 3054 }, { "epoch": 1.9984945427173504, "grad_norm": 0.2648589313030243, "learning_rate": 8.487049675731573e-05, "loss": 0.2655, "step": 3055 }, { "epoch": 1.999247271358675, "grad_norm": 0.2169906347990036, "learning_rate": 8.485936515510954e-05, "loss": 0.2455, "step": 3056 }, { "epoch": 2.0, "grad_norm": 0.21649512648582458, "learning_rate": 8.484823028752809e-05, "loss": 0.2378, "step": 3057 }, { "epoch": 2.000752728641325, "grad_norm": 0.2526013255119324, "learning_rate": 8.483709215578905e-05, "loss": 0.2678, "step": 3058 }, { "epoch": 2.0015054572826494, "grad_norm": 0.24242833256721497, "learning_rate": 8.482595076111045e-05, "loss": 0.2796, "step": 3059 }, { "epoch": 2.0022581859239743, "grad_norm": 0.21256661415100098, "learning_rate": 8.481480610471074e-05, "loss": 0.2798, "step": 3060 }, { "epoch": 2.0030109145652992, "grad_norm": 0.24776628613471985, "learning_rate": 8.480365818780865e-05, "loss": 0.2464, "step": 3061 }, { "epoch": 2.003763643206624, "grad_norm": 0.2043236792087555, "learning_rate": 8.479250701162333e-05, "loss": 0.1404, "step": 3062 }, { "epoch": 2.0045163718479486, "grad_norm": 0.2450585663318634, "learning_rate": 8.47813525773742e-05, "loss": 0.262, "step": 3063 }, { "epoch": 2.0052691004892735, "grad_norm": 0.24167458713054657, "learning_rate": 8.477019488628113e-05, "loss": 0.2143, "step": 3064 }, { "epoch": 2.0060218291305985, "grad_norm": 0.22167405486106873, "learning_rate": 8.475903393956434e-05, "loss": 0.2062, "step": 3065 }, { "epoch": 2.0067745577719234, "grad_norm": 0.2464701235294342, "learning_rate": 8.474786973844432e-05, "loss": 0.2433, "step": 3066 }, { "epoch": 2.007527286413248, "grad_norm": 0.2204335778951645, "learning_rate": 8.4736702284142e-05, "loss": 0.3022, "step": 3067 }, { "epoch": 2.0082800150545728, "grad_norm": 0.26866358518600464, "learning_rate": 8.472553157787861e-05, "loss": 0.2905, "step": 3068 }, { "epoch": 2.0090327436958977, "grad_norm": 0.25410428643226624, "learning_rate": 8.47143576208758e-05, "loss": 0.1891, "step": 3069 }, { "epoch": 2.0097854723372226, "grad_norm": 0.21995723247528076, "learning_rate": 8.470318041435551e-05, "loss": 0.2738, "step": 3070 }, { "epoch": 2.010538200978547, "grad_norm": 0.24845771491527557, "learning_rate": 8.469199995954005e-05, "loss": 0.2197, "step": 3071 }, { "epoch": 2.011290929619872, "grad_norm": 0.2389020174741745, "learning_rate": 8.468081625765212e-05, "loss": 0.2882, "step": 3072 }, { "epoch": 2.012043658261197, "grad_norm": 0.2571491301059723, "learning_rate": 8.466962930991475e-05, "loss": 0.2334, "step": 3073 }, { "epoch": 2.012796386902522, "grad_norm": 0.21783322095870972, "learning_rate": 8.465843911755134e-05, "loss": 0.2239, "step": 3074 }, { "epoch": 2.0135491155438463, "grad_norm": 0.19444601237773895, "learning_rate": 8.46472456817856e-05, "loss": 0.1795, "step": 3075 }, { "epoch": 2.014301844185171, "grad_norm": 0.223035529255867, "learning_rate": 8.463604900384167e-05, "loss": 0.2251, "step": 3076 }, { "epoch": 2.015054572826496, "grad_norm": 0.1799081563949585, "learning_rate": 8.462484908494396e-05, "loss": 0.1667, "step": 3077 }, { "epoch": 2.015807301467821, "grad_norm": 0.22759632766246796, "learning_rate": 8.46136459263173e-05, "loss": 0.3205, "step": 3078 }, { "epoch": 2.0165600301091455, "grad_norm": 0.22614285349845886, "learning_rate": 8.460243952918684e-05, "loss": 0.2706, "step": 3079 }, { "epoch": 2.0173127587504704, "grad_norm": 0.22202995419502258, "learning_rate": 8.459122989477812e-05, "loss": 0.2772, "step": 3080 }, { "epoch": 2.0180654873917954, "grad_norm": 0.20238982141017914, "learning_rate": 8.458001702431695e-05, "loss": 0.201, "step": 3081 }, { "epoch": 2.01881821603312, "grad_norm": 0.24960406124591827, "learning_rate": 8.456880091902962e-05, "loss": 0.2715, "step": 3082 }, { "epoch": 2.0195709446744448, "grad_norm": 0.24364593625068665, "learning_rate": 8.455758158014267e-05, "loss": 0.2824, "step": 3083 }, { "epoch": 2.0203236733157697, "grad_norm": 0.24672295153141022, "learning_rate": 8.454635900888305e-05, "loss": 0.2787, "step": 3084 }, { "epoch": 2.0210764019570946, "grad_norm": 0.23593957722187042, "learning_rate": 8.453513320647801e-05, "loss": 0.235, "step": 3085 }, { "epoch": 2.021829130598419, "grad_norm": 0.2489534616470337, "learning_rate": 8.452390417415522e-05, "loss": 0.2572, "step": 3086 }, { "epoch": 2.022581859239744, "grad_norm": 0.22961026430130005, "learning_rate": 8.451267191314266e-05, "loss": 0.2194, "step": 3087 }, { "epoch": 2.023334587881069, "grad_norm": 0.21747447550296783, "learning_rate": 8.450143642466867e-05, "loss": 0.2458, "step": 3088 }, { "epoch": 2.024087316522394, "grad_norm": 0.2379179298877716, "learning_rate": 8.449019770996194e-05, "loss": 0.3091, "step": 3089 }, { "epoch": 2.0248400451637183, "grad_norm": 0.20735161006450653, "learning_rate": 8.447895577025152e-05, "loss": 0.1908, "step": 3090 }, { "epoch": 2.025592773805043, "grad_norm": 0.1991339921951294, "learning_rate": 8.446771060676683e-05, "loss": 0.1842, "step": 3091 }, { "epoch": 2.026345502446368, "grad_norm": 0.2657207250595093, "learning_rate": 8.445646222073762e-05, "loss": 0.3308, "step": 3092 }, { "epoch": 2.027098231087693, "grad_norm": 0.254769891500473, "learning_rate": 8.444521061339399e-05, "loss": 0.3362, "step": 3093 }, { "epoch": 2.0278509597290175, "grad_norm": 0.22943389415740967, "learning_rate": 8.443395578596637e-05, "loss": 0.2304, "step": 3094 }, { "epoch": 2.0286036883703424, "grad_norm": 0.261860728263855, "learning_rate": 8.442269773968562e-05, "loss": 0.2688, "step": 3095 }, { "epoch": 2.0293564170116674, "grad_norm": 0.24655160307884216, "learning_rate": 8.441143647578287e-05, "loss": 0.2705, "step": 3096 }, { "epoch": 2.0301091456529923, "grad_norm": 0.2639143466949463, "learning_rate": 8.440017199548965e-05, "loss": 0.3696, "step": 3097 }, { "epoch": 2.0308618742943167, "grad_norm": 0.22802570462226868, "learning_rate": 8.438890430003784e-05, "loss": 0.2137, "step": 3098 }, { "epoch": 2.0316146029356417, "grad_norm": 0.21889019012451172, "learning_rate": 8.437763339065963e-05, "loss": 0.177, "step": 3099 }, { "epoch": 2.0323673315769666, "grad_norm": 0.20913393795490265, "learning_rate": 8.436635926858759e-05, "loss": 0.2092, "step": 3100 }, { "epoch": 2.0331200602182915, "grad_norm": 0.20546704530715942, "learning_rate": 8.435508193505466e-05, "loss": 0.1992, "step": 3101 }, { "epoch": 2.033872788859616, "grad_norm": 0.2081167846918106, "learning_rate": 8.434380139129412e-05, "loss": 0.1728, "step": 3102 }, { "epoch": 2.034625517500941, "grad_norm": 0.17705516517162323, "learning_rate": 8.433251763853955e-05, "loss": 0.1726, "step": 3103 }, { "epoch": 2.035378246142266, "grad_norm": 0.26447081565856934, "learning_rate": 8.432123067802496e-05, "loss": 0.336, "step": 3104 }, { "epoch": 2.0361309747835907, "grad_norm": 0.23668836057186127, "learning_rate": 8.430994051098468e-05, "loss": 0.2521, "step": 3105 }, { "epoch": 2.036883703424915, "grad_norm": 0.24291667342185974, "learning_rate": 8.429864713865336e-05, "loss": 0.3844, "step": 3106 }, { "epoch": 2.03763643206624, "grad_norm": 0.2463139444589615, "learning_rate": 8.428735056226604e-05, "loss": 0.2733, "step": 3107 }, { "epoch": 2.038389160707565, "grad_norm": 0.20904818177223206, "learning_rate": 8.42760507830581e-05, "loss": 0.2909, "step": 3108 }, { "epoch": 2.0391418893488895, "grad_norm": 0.24151405692100525, "learning_rate": 8.426474780226527e-05, "loss": 0.312, "step": 3109 }, { "epoch": 2.0398946179902144, "grad_norm": 0.2462405413389206, "learning_rate": 8.425344162112359e-05, "loss": 0.3149, "step": 3110 }, { "epoch": 2.0406473466315393, "grad_norm": 0.2704375088214874, "learning_rate": 8.424213224086954e-05, "loss": 0.2386, "step": 3111 }, { "epoch": 2.0414000752728643, "grad_norm": 0.6131048798561096, "learning_rate": 8.423081966273987e-05, "loss": 0.2697, "step": 3112 }, { "epoch": 2.0421528039141887, "grad_norm": 0.7876570820808411, "learning_rate": 8.42195038879717e-05, "loss": 0.3263, "step": 3113 }, { "epoch": 2.0429055325555137, "grad_norm": 0.8777849674224854, "learning_rate": 8.420818491780254e-05, "loss": 0.2543, "step": 3114 }, { "epoch": 2.0436582611968386, "grad_norm": 0.30478984117507935, "learning_rate": 8.419686275347018e-05, "loss": 0.3517, "step": 3115 }, { "epoch": 2.0444109898381635, "grad_norm": 0.8108913898468018, "learning_rate": 8.418553739621278e-05, "loss": 0.2395, "step": 3116 }, { "epoch": 2.045163718479488, "grad_norm": 0.273945152759552, "learning_rate": 8.417420884726892e-05, "loss": 0.2158, "step": 3117 }, { "epoch": 2.045916447120813, "grad_norm": 0.579669177532196, "learning_rate": 8.416287710787744e-05, "loss": 0.2264, "step": 3118 }, { "epoch": 2.046669175762138, "grad_norm": 0.3009595572948456, "learning_rate": 8.415154217927755e-05, "loss": 0.291, "step": 3119 }, { "epoch": 2.0474219044034627, "grad_norm": 0.32675179839134216, "learning_rate": 8.414020406270885e-05, "loss": 0.3097, "step": 3120 }, { "epoch": 2.048174633044787, "grad_norm": 0.2686757743358612, "learning_rate": 8.412886275941124e-05, "loss": 0.2461, "step": 3121 }, { "epoch": 2.048927361686112, "grad_norm": 0.326274037361145, "learning_rate": 8.4117518270625e-05, "loss": 0.2798, "step": 3122 }, { "epoch": 2.049680090327437, "grad_norm": 0.2625609338283539, "learning_rate": 8.410617059759073e-05, "loss": 0.3159, "step": 3123 }, { "epoch": 2.050432818968762, "grad_norm": 0.24872569739818573, "learning_rate": 8.409481974154942e-05, "loss": 0.2227, "step": 3124 }, { "epoch": 2.0511855476100864, "grad_norm": 0.2685079574584961, "learning_rate": 8.408346570374233e-05, "loss": 0.3036, "step": 3125 }, { "epoch": 2.0519382762514113, "grad_norm": 0.22159776091575623, "learning_rate": 8.40721084854112e-05, "loss": 0.2294, "step": 3126 }, { "epoch": 2.0526910048927363, "grad_norm": 0.21542218327522278, "learning_rate": 8.406074808779797e-05, "loss": 0.271, "step": 3127 }, { "epoch": 2.053443733534061, "grad_norm": 0.21301595866680145, "learning_rate": 8.404938451214503e-05, "loss": 0.2402, "step": 3128 }, { "epoch": 2.0541964621753857, "grad_norm": 0.22471852600574493, "learning_rate": 8.403801775969508e-05, "loss": 0.2408, "step": 3129 }, { "epoch": 2.0549491908167106, "grad_norm": 0.20845641195774078, "learning_rate": 8.402664783169115e-05, "loss": 0.255, "step": 3130 }, { "epoch": 2.0557019194580355, "grad_norm": 0.22264792025089264, "learning_rate": 8.401527472937667e-05, "loss": 0.2364, "step": 3131 }, { "epoch": 2.0564546480993604, "grad_norm": 0.20260700583457947, "learning_rate": 8.400389845399537e-05, "loss": 0.3004, "step": 3132 }, { "epoch": 2.057207376740685, "grad_norm": 0.2353879064321518, "learning_rate": 8.399251900679131e-05, "loss": 0.2161, "step": 3133 }, { "epoch": 2.05796010538201, "grad_norm": 0.2536441385746002, "learning_rate": 8.398113638900901e-05, "loss": 0.2535, "step": 3134 }, { "epoch": 2.0587128340233347, "grad_norm": 0.19049693644046783, "learning_rate": 8.396975060189316e-05, "loss": 0.189, "step": 3135 }, { "epoch": 2.059465562664659, "grad_norm": 0.20505200326442719, "learning_rate": 8.395836164668897e-05, "loss": 0.2072, "step": 3136 }, { "epoch": 2.060218291305984, "grad_norm": 0.18374526500701904, "learning_rate": 8.394696952464187e-05, "loss": 0.2478, "step": 3137 }, { "epoch": 2.060971019947309, "grad_norm": 0.24766355752944946, "learning_rate": 8.393557423699772e-05, "loss": 0.2644, "step": 3138 }, { "epoch": 2.061723748588634, "grad_norm": 0.1804666370153427, "learning_rate": 8.392417578500267e-05, "loss": 0.3033, "step": 3139 }, { "epoch": 2.0624764772299584, "grad_norm": 0.20188137888908386, "learning_rate": 8.391277416990325e-05, "loss": 0.2114, "step": 3140 }, { "epoch": 2.0632292058712833, "grad_norm": 0.22274582087993622, "learning_rate": 8.390136939294631e-05, "loss": 0.303, "step": 3141 }, { "epoch": 2.0639819345126083, "grad_norm": 0.1922132521867752, "learning_rate": 8.388996145537905e-05, "loss": 0.2491, "step": 3142 }, { "epoch": 2.064734663153933, "grad_norm": 0.21991661190986633, "learning_rate": 8.387855035844905e-05, "loss": 0.2883, "step": 3143 }, { "epoch": 2.0654873917952576, "grad_norm": 0.21454522013664246, "learning_rate": 8.386713610340422e-05, "loss": 0.2496, "step": 3144 }, { "epoch": 2.0662401204365826, "grad_norm": 0.195308119058609, "learning_rate": 8.385571869149277e-05, "loss": 0.2434, "step": 3145 }, { "epoch": 2.0669928490779075, "grad_norm": 0.22687305510044098, "learning_rate": 8.384429812396332e-05, "loss": 0.1867, "step": 3146 }, { "epoch": 2.0677455777192324, "grad_norm": 0.20586279034614563, "learning_rate": 8.383287440206477e-05, "loss": 0.3065, "step": 3147 }, { "epoch": 2.068498306360557, "grad_norm": 0.21664854884147644, "learning_rate": 8.382144752704645e-05, "loss": 0.2959, "step": 3148 }, { "epoch": 2.069251035001882, "grad_norm": 0.20049773156642914, "learning_rate": 8.381001750015794e-05, "loss": 0.2831, "step": 3149 }, { "epoch": 2.0700037636432067, "grad_norm": 0.18743447959423065, "learning_rate": 8.379858432264925e-05, "loss": 0.253, "step": 3150 }, { "epoch": 2.0707564922845316, "grad_norm": 0.22087723016738892, "learning_rate": 8.378714799577067e-05, "loss": 0.2684, "step": 3151 }, { "epoch": 2.071509220925856, "grad_norm": 0.18619434535503387, "learning_rate": 8.377570852077286e-05, "loss": 0.1338, "step": 3152 }, { "epoch": 2.072261949567181, "grad_norm": 0.20112580060958862, "learning_rate": 8.376426589890684e-05, "loss": 0.2507, "step": 3153 }, { "epoch": 2.073014678208506, "grad_norm": 0.2202584594488144, "learning_rate": 8.375282013142393e-05, "loss": 0.3627, "step": 3154 }, { "epoch": 2.0737674068498304, "grad_norm": 0.18617333471775055, "learning_rate": 8.374137121957585e-05, "loss": 0.2564, "step": 3155 }, { "epoch": 2.0745201354911553, "grad_norm": 0.19248661398887634, "learning_rate": 8.372991916461463e-05, "loss": 0.2601, "step": 3156 }, { "epoch": 2.0752728641324802, "grad_norm": 0.20293618738651276, "learning_rate": 8.371846396779265e-05, "loss": 0.251, "step": 3157 }, { "epoch": 2.076025592773805, "grad_norm": 0.1892383098602295, "learning_rate": 8.370700563036261e-05, "loss": 0.1767, "step": 3158 }, { "epoch": 2.0767783214151296, "grad_norm": 0.19285933673381805, "learning_rate": 8.369554415357762e-05, "loss": 0.1824, "step": 3159 }, { "epoch": 2.0775310500564546, "grad_norm": 0.19573533535003662, "learning_rate": 8.368407953869104e-05, "loss": 0.2174, "step": 3160 }, { "epoch": 2.0782837786977795, "grad_norm": 0.19922420382499695, "learning_rate": 8.367261178695666e-05, "loss": 0.2961, "step": 3161 }, { "epoch": 2.0790365073391044, "grad_norm": 0.19644443690776825, "learning_rate": 8.366114089962856e-05, "loss": 0.2156, "step": 3162 }, { "epoch": 2.079789235980429, "grad_norm": 0.1890203058719635, "learning_rate": 8.36496668779612e-05, "loss": 0.2944, "step": 3163 }, { "epoch": 2.080541964621754, "grad_norm": 0.2009144127368927, "learning_rate": 8.363818972320931e-05, "loss": 0.2376, "step": 3164 }, { "epoch": 2.0812946932630787, "grad_norm": 0.22323733568191528, "learning_rate": 8.362670943662807e-05, "loss": 0.2738, "step": 3165 }, { "epoch": 2.0820474219044036, "grad_norm": 0.22979046404361725, "learning_rate": 8.361522601947292e-05, "loss": 0.2538, "step": 3166 }, { "epoch": 2.082800150545728, "grad_norm": 0.18332116305828094, "learning_rate": 8.360373947299967e-05, "loss": 0.2454, "step": 3167 }, { "epoch": 2.083552879187053, "grad_norm": 0.19105564057826996, "learning_rate": 8.359224979846449e-05, "loss": 0.2458, "step": 3168 }, { "epoch": 2.084305607828378, "grad_norm": 0.46045681834220886, "learning_rate": 8.358075699712385e-05, "loss": 0.2081, "step": 3169 }, { "epoch": 2.085058336469703, "grad_norm": 0.19315551221370697, "learning_rate": 8.356926107023459e-05, "loss": 0.2735, "step": 3170 }, { "epoch": 2.0858110651110273, "grad_norm": 0.19450809061527252, "learning_rate": 8.35577620190539e-05, "loss": 0.2641, "step": 3171 }, { "epoch": 2.0865637937523522, "grad_norm": 0.2580905258655548, "learning_rate": 8.35462598448393e-05, "loss": 0.2408, "step": 3172 }, { "epoch": 2.087316522393677, "grad_norm": 0.24045264720916748, "learning_rate": 8.353475454884863e-05, "loss": 0.2498, "step": 3173 }, { "epoch": 2.088069251035002, "grad_norm": 0.28329914808273315, "learning_rate": 8.352324613234012e-05, "loss": 0.3186, "step": 3174 }, { "epoch": 2.0888219796763265, "grad_norm": 0.2480863481760025, "learning_rate": 8.351173459657227e-05, "loss": 0.2101, "step": 3175 }, { "epoch": 2.0895747083176515, "grad_norm": 0.19257177412509918, "learning_rate": 8.350021994280401e-05, "loss": 0.266, "step": 3176 }, { "epoch": 2.0903274369589764, "grad_norm": 0.185734823346138, "learning_rate": 8.348870217229454e-05, "loss": 0.1929, "step": 3177 }, { "epoch": 2.0910801656003013, "grad_norm": 0.23393894731998444, "learning_rate": 8.347718128630345e-05, "loss": 0.2724, "step": 3178 }, { "epoch": 2.0918328942416258, "grad_norm": 0.23600000143051147, "learning_rate": 8.34656572860906e-05, "loss": 0.2339, "step": 3179 }, { "epoch": 2.0925856228829507, "grad_norm": 0.2836434543132782, "learning_rate": 8.345413017291629e-05, "loss": 0.1545, "step": 3180 }, { "epoch": 2.0933383515242756, "grad_norm": 0.20875509083271027, "learning_rate": 8.344259994804107e-05, "loss": 0.1736, "step": 3181 }, { "epoch": 2.0940910801656, "grad_norm": 0.20782718062400818, "learning_rate": 8.34310666127259e-05, "loss": 0.3578, "step": 3182 }, { "epoch": 2.094843808806925, "grad_norm": 0.23103170096874237, "learning_rate": 8.341953016823201e-05, "loss": 0.2758, "step": 3183 }, { "epoch": 2.09559653744825, "grad_norm": 0.22485248744487762, "learning_rate": 8.340799061582103e-05, "loss": 0.1988, "step": 3184 }, { "epoch": 2.096349266089575, "grad_norm": 0.3846145570278168, "learning_rate": 8.339644795675493e-05, "loss": 0.2015, "step": 3185 }, { "epoch": 2.0971019947308993, "grad_norm": 0.20867139101028442, "learning_rate": 8.338490219229595e-05, "loss": 0.1832, "step": 3186 }, { "epoch": 2.0978547233722242, "grad_norm": 0.195817232131958, "learning_rate": 8.337335332370676e-05, "loss": 0.1895, "step": 3187 }, { "epoch": 2.098607452013549, "grad_norm": 0.20878785848617554, "learning_rate": 8.336180135225031e-05, "loss": 0.1789, "step": 3188 }, { "epoch": 2.099360180654874, "grad_norm": 0.2610277831554413, "learning_rate": 8.335024627918988e-05, "loss": 0.3198, "step": 3189 }, { "epoch": 2.1001129092961985, "grad_norm": 0.2093379646539688, "learning_rate": 8.333868810578917e-05, "loss": 0.2324, "step": 3190 }, { "epoch": 2.1008656379375235, "grad_norm": 0.23377637565135956, "learning_rate": 8.332712683331212e-05, "loss": 0.2816, "step": 3191 }, { "epoch": 2.1016183665788484, "grad_norm": 0.26196780800819397, "learning_rate": 8.331556246302307e-05, "loss": 0.3021, "step": 3192 }, { "epoch": 2.1023710952201733, "grad_norm": 0.20811069011688232, "learning_rate": 8.330399499618668e-05, "loss": 0.2392, "step": 3193 }, { "epoch": 2.1031238238614978, "grad_norm": 0.2142556756734848, "learning_rate": 8.329242443406794e-05, "loss": 0.2212, "step": 3194 }, { "epoch": 2.1038765525028227, "grad_norm": 0.20339788496494293, "learning_rate": 8.328085077793222e-05, "loss": 0.2094, "step": 3195 }, { "epoch": 2.1046292811441476, "grad_norm": 0.25524669885635376, "learning_rate": 8.326927402904519e-05, "loss": 0.2379, "step": 3196 }, { "epoch": 2.1053820097854725, "grad_norm": 0.21403126418590546, "learning_rate": 8.325769418867283e-05, "loss": 0.1991, "step": 3197 }, { "epoch": 2.106134738426797, "grad_norm": 0.23916861414909363, "learning_rate": 8.324611125808153e-05, "loss": 0.2225, "step": 3198 }, { "epoch": 2.106887467068122, "grad_norm": 0.21650661528110504, "learning_rate": 8.323452523853796e-05, "loss": 0.2813, "step": 3199 }, { "epoch": 2.107640195709447, "grad_norm": 0.23711706697940826, "learning_rate": 8.322293613130917e-05, "loss": 0.1789, "step": 3200 }, { "epoch": 2.107640195709447, "eval_loss": 0.26774588227272034, "eval_runtime": 456.5622, "eval_samples_per_second": 21.086, "eval_steps_per_second": 0.659, "step": 3200 }, { "epoch": 2.1083929243507717, "grad_norm": 0.22150985896587372, "learning_rate": 8.321134393766252e-05, "loss": 0.3397, "step": 3201 }, { "epoch": 2.109145652992096, "grad_norm": 0.20984090864658356, "learning_rate": 8.31997486588657e-05, "loss": 0.154, "step": 3202 }, { "epoch": 2.109898381633421, "grad_norm": 0.25227871537208557, "learning_rate": 8.318815029618677e-05, "loss": 0.2493, "step": 3203 }, { "epoch": 2.110651110274746, "grad_norm": 0.22852428257465363, "learning_rate": 8.317654885089411e-05, "loss": 0.2833, "step": 3204 }, { "epoch": 2.111403838916071, "grad_norm": 0.2395143359899521, "learning_rate": 8.316494432425641e-05, "loss": 0.2404, "step": 3205 }, { "epoch": 2.1121565675573954, "grad_norm": 0.20180048048496246, "learning_rate": 8.315333671754276e-05, "loss": 0.2415, "step": 3206 }, { "epoch": 2.1129092961987204, "grad_norm": 0.2313382476568222, "learning_rate": 8.314172603202254e-05, "loss": 0.2313, "step": 3207 }, { "epoch": 2.1136620248400453, "grad_norm": 0.24695464968681335, "learning_rate": 8.313011226896547e-05, "loss": 0.2326, "step": 3208 }, { "epoch": 2.1144147534813698, "grad_norm": 0.23284128308296204, "learning_rate": 8.31184954296416e-05, "loss": 0.201, "step": 3209 }, { "epoch": 2.1151674821226947, "grad_norm": 0.18865197896957397, "learning_rate": 8.310687551532136e-05, "loss": 0.1677, "step": 3210 }, { "epoch": 2.1159202107640196, "grad_norm": 0.26229727268218994, "learning_rate": 8.309525252727546e-05, "loss": 0.2932, "step": 3211 }, { "epoch": 2.1166729394053445, "grad_norm": 0.2626563608646393, "learning_rate": 8.3083626466775e-05, "loss": 0.2953, "step": 3212 }, { "epoch": 2.117425668046669, "grad_norm": 0.25163185596466064, "learning_rate": 8.307199733509136e-05, "loss": 0.3534, "step": 3213 }, { "epoch": 2.118178396687994, "grad_norm": 0.2161298543214798, "learning_rate": 8.306036513349632e-05, "loss": 0.1976, "step": 3214 }, { "epoch": 2.118931125329319, "grad_norm": 0.2344430834054947, "learning_rate": 8.30487298632619e-05, "loss": 0.2641, "step": 3215 }, { "epoch": 2.1196838539706437, "grad_norm": 0.23923276364803314, "learning_rate": 8.303709152566058e-05, "loss": 0.2413, "step": 3216 }, { "epoch": 2.120436582611968, "grad_norm": 0.2612529993057251, "learning_rate": 8.302545012196506e-05, "loss": 0.2662, "step": 3217 }, { "epoch": 2.121189311253293, "grad_norm": 0.208196222782135, "learning_rate": 8.301380565344847e-05, "loss": 0.2132, "step": 3218 }, { "epoch": 2.121942039894618, "grad_norm": 0.21866323053836823, "learning_rate": 8.30021581213842e-05, "loss": 0.2431, "step": 3219 }, { "epoch": 2.122694768535943, "grad_norm": 0.23672500252723694, "learning_rate": 8.299050752704604e-05, "loss": 0.1925, "step": 3220 }, { "epoch": 2.1234474971772674, "grad_norm": 0.23357392847537994, "learning_rate": 8.297885387170804e-05, "loss": 0.2489, "step": 3221 }, { "epoch": 2.1242002258185924, "grad_norm": 0.23137471079826355, "learning_rate": 8.296719715664465e-05, "loss": 0.2257, "step": 3222 }, { "epoch": 2.1249529544599173, "grad_norm": 0.17968469858169556, "learning_rate": 8.295553738313064e-05, "loss": 0.0994, "step": 3223 }, { "epoch": 2.125705683101242, "grad_norm": 0.22640042006969452, "learning_rate": 8.294387455244108e-05, "loss": 0.2048, "step": 3224 }, { "epoch": 2.1264584117425667, "grad_norm": 0.19910666346549988, "learning_rate": 8.29322086658514e-05, "loss": 0.2371, "step": 3225 }, { "epoch": 2.1272111403838916, "grad_norm": 0.20964199304580688, "learning_rate": 8.29205397246374e-05, "loss": 0.1966, "step": 3226 }, { "epoch": 2.1279638690252165, "grad_norm": 0.17588475346565247, "learning_rate": 8.290886773007513e-05, "loss": 0.1815, "step": 3227 }, { "epoch": 2.128716597666541, "grad_norm": 0.25179323554039, "learning_rate": 8.289719268344106e-05, "loss": 0.2027, "step": 3228 }, { "epoch": 2.129469326307866, "grad_norm": 0.20770396292209625, "learning_rate": 8.288551458601193e-05, "loss": 0.2979, "step": 3229 }, { "epoch": 2.130222054949191, "grad_norm": 0.22137857973575592, "learning_rate": 8.287383343906488e-05, "loss": 0.1953, "step": 3230 }, { "epoch": 2.1309747835905157, "grad_norm": 0.2246575653553009, "learning_rate": 8.286214924387727e-05, "loss": 0.2617, "step": 3231 }, { "epoch": 2.1317275122318406, "grad_norm": 0.2106168568134308, "learning_rate": 8.285046200172694e-05, "loss": 0.2501, "step": 3232 }, { "epoch": 2.132480240873165, "grad_norm": 0.20016349852085114, "learning_rate": 8.283877171389195e-05, "loss": 0.2928, "step": 3233 }, { "epoch": 2.13323296951449, "grad_norm": 0.24323831498622894, "learning_rate": 8.282707838165074e-05, "loss": 0.2369, "step": 3234 }, { "epoch": 2.133985698155815, "grad_norm": 0.2702217996120453, "learning_rate": 8.281538200628209e-05, "loss": 0.2609, "step": 3235 }, { "epoch": 2.1347384267971394, "grad_norm": 0.19558002054691315, "learning_rate": 8.280368258906505e-05, "loss": 0.2206, "step": 3236 }, { "epoch": 2.1354911554384644, "grad_norm": 0.1899026483297348, "learning_rate": 8.27919801312791e-05, "loss": 0.2089, "step": 3237 }, { "epoch": 2.1362438840797893, "grad_norm": 0.2210087925195694, "learning_rate": 8.2780274634204e-05, "loss": 0.2025, "step": 3238 }, { "epoch": 2.136996612721114, "grad_norm": 0.2172725945711136, "learning_rate": 8.276856609911983e-05, "loss": 0.2278, "step": 3239 }, { "epoch": 2.1377493413624387, "grad_norm": 0.2392117828130722, "learning_rate": 8.2756854527307e-05, "loss": 0.2482, "step": 3240 }, { "epoch": 2.1385020700037636, "grad_norm": 0.23081719875335693, "learning_rate": 8.27451399200463e-05, "loss": 0.2924, "step": 3241 }, { "epoch": 2.1392547986450885, "grad_norm": 0.21378399431705475, "learning_rate": 8.273342227861882e-05, "loss": 0.2887, "step": 3242 }, { "epoch": 2.1400075272864134, "grad_norm": 0.1880919486284256, "learning_rate": 8.272170160430597e-05, "loss": 0.2365, "step": 3243 }, { "epoch": 2.140760255927738, "grad_norm": 0.19117216765880585, "learning_rate": 8.270997789838953e-05, "loss": 0.2037, "step": 3244 }, { "epoch": 2.141512984569063, "grad_norm": 0.22724789381027222, "learning_rate": 8.269825116215154e-05, "loss": 0.2029, "step": 3245 }, { "epoch": 2.1422657132103877, "grad_norm": 0.2088402807712555, "learning_rate": 8.268652139687446e-05, "loss": 0.1996, "step": 3246 }, { "epoch": 2.1430184418517126, "grad_norm": 0.19625619053840637, "learning_rate": 8.267478860384102e-05, "loss": 0.2645, "step": 3247 }, { "epoch": 2.143771170493037, "grad_norm": 0.24798248708248138, "learning_rate": 8.266305278433432e-05, "loss": 0.2575, "step": 3248 }, { "epoch": 2.144523899134362, "grad_norm": 0.2171613574028015, "learning_rate": 8.265131393963776e-05, "loss": 0.2156, "step": 3249 }, { "epoch": 2.145276627775687, "grad_norm": 0.25986841320991516, "learning_rate": 8.263957207103507e-05, "loss": 0.2583, "step": 3250 }, { "epoch": 2.146029356417012, "grad_norm": 0.21877890825271606, "learning_rate": 8.262782717981032e-05, "loss": 0.2566, "step": 3251 }, { "epoch": 2.1467820850583363, "grad_norm": 0.2554135322570801, "learning_rate": 8.261607926724795e-05, "loss": 0.2685, "step": 3252 }, { "epoch": 2.1475348136996613, "grad_norm": 0.26804280281066895, "learning_rate": 8.260432833463266e-05, "loss": 0.2468, "step": 3253 }, { "epoch": 2.148287542340986, "grad_norm": 0.24024823307991028, "learning_rate": 8.259257438324954e-05, "loss": 0.235, "step": 3254 }, { "epoch": 2.1490402709823107, "grad_norm": 0.227446049451828, "learning_rate": 8.258081741438395e-05, "loss": 0.3106, "step": 3255 }, { "epoch": 2.1497929996236356, "grad_norm": 0.21625013649463654, "learning_rate": 8.256905742932164e-05, "loss": 0.2776, "step": 3256 }, { "epoch": 2.1505457282649605, "grad_norm": 0.249135360121727, "learning_rate": 8.255729442934866e-05, "loss": 0.2376, "step": 3257 }, { "epoch": 2.1512984569062854, "grad_norm": 0.2197861671447754, "learning_rate": 8.254552841575141e-05, "loss": 0.1861, "step": 3258 }, { "epoch": 2.15205118554761, "grad_norm": 0.251925528049469, "learning_rate": 8.253375938981655e-05, "loss": 0.2482, "step": 3259 }, { "epoch": 2.152803914188935, "grad_norm": 0.2473144233226776, "learning_rate": 8.25219873528312e-05, "loss": 0.283, "step": 3260 }, { "epoch": 2.1535566428302597, "grad_norm": 0.1862538903951645, "learning_rate": 8.251021230608267e-05, "loss": 0.2158, "step": 3261 }, { "epoch": 2.1543093714715846, "grad_norm": 0.21416614949703217, "learning_rate": 8.249843425085868e-05, "loss": 0.2599, "step": 3262 }, { "epoch": 2.155062100112909, "grad_norm": 0.20743905007839203, "learning_rate": 8.248665318844728e-05, "loss": 0.2669, "step": 3263 }, { "epoch": 2.155814828754234, "grad_norm": 0.24642720818519592, "learning_rate": 8.24748691201368e-05, "loss": 0.2069, "step": 3264 }, { "epoch": 2.156567557395559, "grad_norm": 0.20844870805740356, "learning_rate": 8.246308204721594e-05, "loss": 0.2594, "step": 3265 }, { "epoch": 2.157320286036884, "grad_norm": 0.2164532095193863, "learning_rate": 8.245129197097373e-05, "loss": 0.1992, "step": 3266 }, { "epoch": 2.1580730146782083, "grad_norm": 0.2608882188796997, "learning_rate": 8.243949889269949e-05, "loss": 0.2413, "step": 3267 }, { "epoch": 2.1588257433195333, "grad_norm": 0.1900991052389145, "learning_rate": 8.242770281368292e-05, "loss": 0.2151, "step": 3268 }, { "epoch": 2.159578471960858, "grad_norm": 0.23943254351615906, "learning_rate": 8.2415903735214e-05, "loss": 0.27, "step": 3269 }, { "epoch": 2.160331200602183, "grad_norm": 0.2102481871843338, "learning_rate": 8.240410165858306e-05, "loss": 0.2816, "step": 3270 }, { "epoch": 2.1610839292435076, "grad_norm": 0.21121765673160553, "learning_rate": 8.239229658508077e-05, "loss": 0.2036, "step": 3271 }, { "epoch": 2.1618366578848325, "grad_norm": 0.22765162587165833, "learning_rate": 8.23804885159981e-05, "loss": 0.2938, "step": 3272 }, { "epoch": 2.1625893865261574, "grad_norm": 0.23167742788791656, "learning_rate": 8.236867745262638e-05, "loss": 0.2509, "step": 3273 }, { "epoch": 2.1633421151674823, "grad_norm": 0.19294175505638123, "learning_rate": 8.235686339625725e-05, "loss": 0.1747, "step": 3274 }, { "epoch": 2.164094843808807, "grad_norm": 0.26540717482566833, "learning_rate": 8.234504634818267e-05, "loss": 0.2454, "step": 3275 }, { "epoch": 2.1648475724501317, "grad_norm": 0.21051125228405, "learning_rate": 8.233322630969491e-05, "loss": 0.2506, "step": 3276 }, { "epoch": 2.1656003010914566, "grad_norm": 0.20805968344211578, "learning_rate": 8.232140328208664e-05, "loss": 0.1585, "step": 3277 }, { "epoch": 2.1663530297327815, "grad_norm": 0.24636778235435486, "learning_rate": 8.230957726665077e-05, "loss": 0.2255, "step": 3278 }, { "epoch": 2.167105758374106, "grad_norm": 0.21515226364135742, "learning_rate": 8.229774826468059e-05, "loss": 0.2843, "step": 3279 }, { "epoch": 2.167858487015431, "grad_norm": 0.23964112997055054, "learning_rate": 8.228591627746971e-05, "loss": 0.3022, "step": 3280 }, { "epoch": 2.168611215656756, "grad_norm": 0.2066589593887329, "learning_rate": 8.227408130631204e-05, "loss": 0.2233, "step": 3281 }, { "epoch": 2.1693639442980803, "grad_norm": 0.2150086760520935, "learning_rate": 8.226224335250185e-05, "loss": 0.191, "step": 3282 }, { "epoch": 2.1701166729394052, "grad_norm": 0.22427336871623993, "learning_rate": 8.225040241733373e-05, "loss": 0.2352, "step": 3283 }, { "epoch": 2.17086940158073, "grad_norm": 0.2690707743167877, "learning_rate": 8.223855850210256e-05, "loss": 0.3002, "step": 3284 }, { "epoch": 2.171622130222055, "grad_norm": 0.21908652782440186, "learning_rate": 8.222671160810357e-05, "loss": 0.2288, "step": 3285 }, { "epoch": 2.1723748588633796, "grad_norm": 0.24725233018398285, "learning_rate": 8.221486173663234e-05, "loss": 0.2733, "step": 3286 }, { "epoch": 2.1731275875047045, "grad_norm": 0.21616244316101074, "learning_rate": 8.220300888898476e-05, "loss": 0.3365, "step": 3287 }, { "epoch": 2.1738803161460294, "grad_norm": 0.2131408154964447, "learning_rate": 8.219115306645701e-05, "loss": 0.299, "step": 3288 }, { "epoch": 2.1746330447873543, "grad_norm": 0.22922706604003906, "learning_rate": 8.217929427034565e-05, "loss": 0.2395, "step": 3289 }, { "epoch": 2.175385773428679, "grad_norm": 0.2031833827495575, "learning_rate": 8.216743250194753e-05, "loss": 0.1392, "step": 3290 }, { "epoch": 2.1761385020700037, "grad_norm": 0.23285149037837982, "learning_rate": 8.215556776255984e-05, "loss": 0.3261, "step": 3291 }, { "epoch": 2.1768912307113286, "grad_norm": 0.22266627848148346, "learning_rate": 8.214370005348009e-05, "loss": 0.2344, "step": 3292 }, { "epoch": 2.1776439593526535, "grad_norm": 0.238388791680336, "learning_rate": 8.213182937600612e-05, "loss": 0.3563, "step": 3293 }, { "epoch": 2.178396687993978, "grad_norm": 0.21419058740139008, "learning_rate": 8.211995573143605e-05, "loss": 0.1983, "step": 3294 }, { "epoch": 2.179149416635303, "grad_norm": 0.21899908781051636, "learning_rate": 8.210807912106844e-05, "loss": 0.27, "step": 3295 }, { "epoch": 2.179902145276628, "grad_norm": 0.23763418197631836, "learning_rate": 8.209619954620203e-05, "loss": 0.2144, "step": 3296 }, { "epoch": 2.1806548739179528, "grad_norm": 0.23252993822097778, "learning_rate": 8.208431700813596e-05, "loss": 0.1898, "step": 3297 }, { "epoch": 2.1814076025592772, "grad_norm": 0.20055760443210602, "learning_rate": 8.207243150816973e-05, "loss": 0.2165, "step": 3298 }, { "epoch": 2.182160331200602, "grad_norm": 0.23720313608646393, "learning_rate": 8.206054304760308e-05, "loss": 0.267, "step": 3299 }, { "epoch": 2.182913059841927, "grad_norm": 0.18638090789318085, "learning_rate": 8.204865162773613e-05, "loss": 0.2135, "step": 3300 }, { "epoch": 2.183665788483252, "grad_norm": 0.23922251164913177, "learning_rate": 8.203675724986931e-05, "loss": 0.2694, "step": 3301 }, { "epoch": 2.1844185171245765, "grad_norm": 0.2627762258052826, "learning_rate": 8.202485991530335e-05, "loss": 0.2889, "step": 3302 }, { "epoch": 2.1851712457659014, "grad_norm": 0.2152594029903412, "learning_rate": 8.201295962533936e-05, "loss": 0.1713, "step": 3303 }, { "epoch": 2.1859239744072263, "grad_norm": 0.23959752917289734, "learning_rate": 8.20010563812787e-05, "loss": 0.2558, "step": 3304 }, { "epoch": 2.186676703048551, "grad_norm": 0.2778724730014801, "learning_rate": 8.198915018442313e-05, "loss": 0.2664, "step": 3305 }, { "epoch": 2.1874294316898757, "grad_norm": 0.2410106509923935, "learning_rate": 8.197724103607468e-05, "loss": 0.2598, "step": 3306 }, { "epoch": 2.1881821603312006, "grad_norm": 0.24949754774570465, "learning_rate": 8.196532893753568e-05, "loss": 0.2855, "step": 3307 }, { "epoch": 2.1889348889725255, "grad_norm": 0.21430666744709015, "learning_rate": 8.195341389010886e-05, "loss": 0.1895, "step": 3308 }, { "epoch": 2.18968761761385, "grad_norm": 0.2276395857334137, "learning_rate": 8.194149589509724e-05, "loss": 0.2582, "step": 3309 }, { "epoch": 2.190440346255175, "grad_norm": 0.20919466018676758, "learning_rate": 8.192957495380411e-05, "loss": 0.1987, "step": 3310 }, { "epoch": 2.1911930748965, "grad_norm": 0.2529458999633789, "learning_rate": 8.191765106753315e-05, "loss": 0.2911, "step": 3311 }, { "epoch": 2.1919458035378248, "grad_norm": 0.19918297231197357, "learning_rate": 8.190572423758835e-05, "loss": 0.1585, "step": 3312 }, { "epoch": 2.1926985321791492, "grad_norm": 0.24594852328300476, "learning_rate": 8.1893794465274e-05, "loss": 0.1687, "step": 3313 }, { "epoch": 2.193451260820474, "grad_norm": 0.24248607456684113, "learning_rate": 8.188186175189472e-05, "loss": 0.227, "step": 3314 }, { "epoch": 2.194203989461799, "grad_norm": 0.2383016049861908, "learning_rate": 8.186992609875546e-05, "loss": 0.2507, "step": 3315 }, { "epoch": 2.194956718103124, "grad_norm": 0.21092231571674347, "learning_rate": 8.185798750716148e-05, "loss": 0.2643, "step": 3316 }, { "epoch": 2.1957094467444485, "grad_norm": 0.1938280612230301, "learning_rate": 8.184604597841835e-05, "loss": 0.2291, "step": 3317 }, { "epoch": 2.1964621753857734, "grad_norm": 0.21830938756465912, "learning_rate": 8.183410151383201e-05, "loss": 0.2794, "step": 3318 }, { "epoch": 2.1972149040270983, "grad_norm": 0.23728503286838531, "learning_rate": 8.182215411470867e-05, "loss": 0.2942, "step": 3319 }, { "epoch": 2.197967632668423, "grad_norm": 0.2166655957698822, "learning_rate": 8.181020378235487e-05, "loss": 0.2359, "step": 3320 }, { "epoch": 2.1987203613097477, "grad_norm": 0.21103085577487946, "learning_rate": 8.17982505180775e-05, "loss": 0.2197, "step": 3321 }, { "epoch": 2.1994730899510726, "grad_norm": 0.22757332026958466, "learning_rate": 8.178629432318376e-05, "loss": 0.3042, "step": 3322 }, { "epoch": 2.2002258185923975, "grad_norm": 0.21513241529464722, "learning_rate": 8.177433519898113e-05, "loss": 0.2334, "step": 3323 }, { "epoch": 2.2009785472337224, "grad_norm": 0.18190249800682068, "learning_rate": 8.176237314677745e-05, "loss": 0.2112, "step": 3324 }, { "epoch": 2.201731275875047, "grad_norm": 0.25936567783355713, "learning_rate": 8.17504081678809e-05, "loss": 0.2485, "step": 3325 }, { "epoch": 2.202484004516372, "grad_norm": 0.20578239858150482, "learning_rate": 8.173844026359992e-05, "loss": 0.1608, "step": 3326 }, { "epoch": 2.2032367331576967, "grad_norm": 0.21313446760177612, "learning_rate": 8.172646943524332e-05, "loss": 0.2801, "step": 3327 }, { "epoch": 2.203989461799021, "grad_norm": 0.24483945965766907, "learning_rate": 8.171449568412021e-05, "loss": 0.3258, "step": 3328 }, { "epoch": 2.204742190440346, "grad_norm": 0.23057398200035095, "learning_rate": 8.170251901154e-05, "loss": 0.2449, "step": 3329 }, { "epoch": 2.205494919081671, "grad_norm": 0.20785552263259888, "learning_rate": 8.169053941881247e-05, "loss": 0.2084, "step": 3330 }, { "epoch": 2.206247647722996, "grad_norm": 0.1861601024866104, "learning_rate": 8.167855690724767e-05, "loss": 0.2518, "step": 3331 }, { "epoch": 2.207000376364321, "grad_norm": 0.2061607539653778, "learning_rate": 8.166657147815599e-05, "loss": 0.1962, "step": 3332 }, { "epoch": 2.2077531050056454, "grad_norm": 0.20225684344768524, "learning_rate": 8.165458313284818e-05, "loss": 0.2248, "step": 3333 }, { "epoch": 2.2085058336469703, "grad_norm": 0.21387457847595215, "learning_rate": 8.16425918726352e-05, "loss": 0.2562, "step": 3334 }, { "epoch": 2.209258562288295, "grad_norm": 0.2651998996734619, "learning_rate": 8.163059769882844e-05, "loss": 0.2431, "step": 3335 }, { "epoch": 2.2100112909296197, "grad_norm": 0.20324982702732086, "learning_rate": 8.161860061273955e-05, "loss": 0.3166, "step": 3336 }, { "epoch": 2.2107640195709446, "grad_norm": 0.209008127450943, "learning_rate": 8.160660061568051e-05, "loss": 0.1676, "step": 3337 }, { "epoch": 2.2115167482122695, "grad_norm": 0.21025201678276062, "learning_rate": 8.159459770896365e-05, "loss": 0.3028, "step": 3338 }, { "epoch": 2.2122694768535944, "grad_norm": 0.21859999001026154, "learning_rate": 8.158259189390155e-05, "loss": 0.2259, "step": 3339 }, { "epoch": 2.213022205494919, "grad_norm": 0.23396062850952148, "learning_rate": 8.157058317180717e-05, "loss": 0.2717, "step": 3340 }, { "epoch": 2.213774934136244, "grad_norm": 0.22087980806827545, "learning_rate": 8.155857154399377e-05, "loss": 0.2096, "step": 3341 }, { "epoch": 2.2145276627775687, "grad_norm": 0.23048946261405945, "learning_rate": 8.154655701177492e-05, "loss": 0.3511, "step": 3342 }, { "epoch": 2.2152803914188937, "grad_norm": 0.23921675980091095, "learning_rate": 8.153453957646449e-05, "loss": 0.2129, "step": 3343 }, { "epoch": 2.216033120060218, "grad_norm": 0.24303455650806427, "learning_rate": 8.152251923937672e-05, "loss": 0.2317, "step": 3344 }, { "epoch": 2.216785848701543, "grad_norm": 0.2261659950017929, "learning_rate": 8.151049600182612e-05, "loss": 0.2492, "step": 3345 }, { "epoch": 2.217538577342868, "grad_norm": 0.19360123574733734, "learning_rate": 8.149846986512753e-05, "loss": 0.2559, "step": 3346 }, { "epoch": 2.218291305984193, "grad_norm": 0.2506018877029419, "learning_rate": 8.148644083059612e-05, "loss": 0.1758, "step": 3347 }, { "epoch": 2.2190440346255174, "grad_norm": 0.20701034367084503, "learning_rate": 8.147440889954736e-05, "loss": 0.2363, "step": 3348 }, { "epoch": 2.2197967632668423, "grad_norm": 0.2069149613380432, "learning_rate": 8.146237407329705e-05, "loss": 0.2868, "step": 3349 }, { "epoch": 2.220549491908167, "grad_norm": 0.2103297859430313, "learning_rate": 8.14503363531613e-05, "loss": 0.2662, "step": 3350 }, { "epoch": 2.221302220549492, "grad_norm": 0.21240226924419403, "learning_rate": 8.143829574045653e-05, "loss": 0.2797, "step": 3351 }, { "epoch": 2.2220549491908166, "grad_norm": 0.22252631187438965, "learning_rate": 8.142625223649947e-05, "loss": 0.2682, "step": 3352 }, { "epoch": 2.2228076778321415, "grad_norm": 0.2057277262210846, "learning_rate": 8.141420584260723e-05, "loss": 0.2281, "step": 3353 }, { "epoch": 2.2235604064734664, "grad_norm": 0.21349406242370605, "learning_rate": 8.140215656009713e-05, "loss": 0.2622, "step": 3354 }, { "epoch": 2.224313135114791, "grad_norm": 0.21586178243160248, "learning_rate": 8.139010439028688e-05, "loss": 0.2559, "step": 3355 }, { "epoch": 2.225065863756116, "grad_norm": 0.21856218576431274, "learning_rate": 8.137804933449452e-05, "loss": 0.2172, "step": 3356 }, { "epoch": 2.2258185923974407, "grad_norm": 0.19837480783462524, "learning_rate": 8.13659913940383e-05, "loss": 0.2527, "step": 3357 }, { "epoch": 2.2265713210387656, "grad_norm": 0.23260511457920074, "learning_rate": 8.135393057023693e-05, "loss": 0.2472, "step": 3358 }, { "epoch": 2.22732404968009, "grad_norm": 0.19104206562042236, "learning_rate": 8.134186686440935e-05, "loss": 0.2142, "step": 3359 }, { "epoch": 2.228076778321415, "grad_norm": 0.20176677405834198, "learning_rate": 8.132980027787478e-05, "loss": 0.1996, "step": 3360 }, { "epoch": 2.22882950696274, "grad_norm": 0.2191828340291977, "learning_rate": 8.131773081195285e-05, "loss": 0.27, "step": 3361 }, { "epoch": 2.229582235604065, "grad_norm": 0.2229592353105545, "learning_rate": 8.130565846796344e-05, "loss": 0.3128, "step": 3362 }, { "epoch": 2.2303349642453894, "grad_norm": 0.2329380065202713, "learning_rate": 8.12935832472268e-05, "loss": 0.2999, "step": 3363 }, { "epoch": 2.2310876928867143, "grad_norm": 0.22686998546123505, "learning_rate": 8.128150515106339e-05, "loss": 0.2684, "step": 3364 }, { "epoch": 2.231840421528039, "grad_norm": 0.19529923796653748, "learning_rate": 8.126942418079411e-05, "loss": 0.1435, "step": 3365 }, { "epoch": 2.232593150169364, "grad_norm": 0.2597903907299042, "learning_rate": 8.12573403377401e-05, "loss": 0.2851, "step": 3366 }, { "epoch": 2.2333458788106886, "grad_norm": 0.18923404812812805, "learning_rate": 8.12452536232228e-05, "loss": 0.194, "step": 3367 }, { "epoch": 2.2340986074520135, "grad_norm": 0.21525020897388458, "learning_rate": 8.123316403856406e-05, "loss": 0.2434, "step": 3368 }, { "epoch": 2.2348513360933384, "grad_norm": 0.21236379444599152, "learning_rate": 8.122107158508592e-05, "loss": 0.1871, "step": 3369 }, { "epoch": 2.2356040647346633, "grad_norm": 0.19793696701526642, "learning_rate": 8.120897626411082e-05, "loss": 0.2831, "step": 3370 }, { "epoch": 2.236356793375988, "grad_norm": 0.22913958132266998, "learning_rate": 8.11968780769615e-05, "loss": 0.2614, "step": 3371 }, { "epoch": 2.2371095220173127, "grad_norm": 0.21227027475833893, "learning_rate": 8.118477702496096e-05, "loss": 0.2555, "step": 3372 }, { "epoch": 2.2378622506586376, "grad_norm": 0.21185733377933502, "learning_rate": 8.117267310943257e-05, "loss": 0.1968, "step": 3373 }, { "epoch": 2.2386149792999626, "grad_norm": 0.20729376375675201, "learning_rate": 8.116056633169999e-05, "loss": 0.294, "step": 3374 }, { "epoch": 2.239367707941287, "grad_norm": 0.21860986948013306, "learning_rate": 8.114845669308723e-05, "loss": 0.3055, "step": 3375 }, { "epoch": 2.240120436582612, "grad_norm": 0.20461149513721466, "learning_rate": 8.113634419491854e-05, "loss": 0.2083, "step": 3376 }, { "epoch": 2.240873165223937, "grad_norm": 0.18312129378318787, "learning_rate": 8.112422883851856e-05, "loss": 0.2406, "step": 3377 }, { "epoch": 2.241625893865262, "grad_norm": 0.21600469946861267, "learning_rate": 8.111211062521219e-05, "loss": 0.2774, "step": 3378 }, { "epoch": 2.2423786225065863, "grad_norm": 0.2378561645746231, "learning_rate": 8.109998955632466e-05, "loss": 0.2536, "step": 3379 }, { "epoch": 2.243131351147911, "grad_norm": 0.21523502469062805, "learning_rate": 8.108786563318149e-05, "loss": 0.2182, "step": 3380 }, { "epoch": 2.243884079789236, "grad_norm": 0.22365331649780273, "learning_rate": 8.107573885710858e-05, "loss": 0.2941, "step": 3381 }, { "epoch": 2.2446368084305606, "grad_norm": 0.18422484397888184, "learning_rate": 8.106360922943209e-05, "loss": 0.1928, "step": 3382 }, { "epoch": 2.2453895370718855, "grad_norm": 0.19864730536937714, "learning_rate": 8.105147675147844e-05, "loss": 0.1758, "step": 3383 }, { "epoch": 2.2461422657132104, "grad_norm": 0.20420445501804352, "learning_rate": 8.103934142457449e-05, "loss": 0.1826, "step": 3384 }, { "epoch": 2.2468949943545353, "grad_norm": 0.22424954175949097, "learning_rate": 8.10272032500473e-05, "loss": 0.2891, "step": 3385 }, { "epoch": 2.24764772299586, "grad_norm": 0.1909472793340683, "learning_rate": 8.10150622292243e-05, "loss": 0.2516, "step": 3386 }, { "epoch": 2.2484004516371847, "grad_norm": 0.28217944502830505, "learning_rate": 8.10029183634332e-05, "loss": 0.2135, "step": 3387 }, { "epoch": 2.2491531802785096, "grad_norm": 0.22023217380046844, "learning_rate": 8.099077165400204e-05, "loss": 0.2444, "step": 3388 }, { "epoch": 2.2499059089198346, "grad_norm": 0.19089950621128082, "learning_rate": 8.097862210225919e-05, "loss": 0.1886, "step": 3389 }, { "epoch": 2.250658637561159, "grad_norm": 0.2235802710056305, "learning_rate": 8.096646970953326e-05, "loss": 0.2806, "step": 3390 }, { "epoch": 2.251411366202484, "grad_norm": 0.22878322005271912, "learning_rate": 8.095431447715325e-05, "loss": 0.2889, "step": 3391 }, { "epoch": 2.252164094843809, "grad_norm": 0.2026931643486023, "learning_rate": 8.094215640644845e-05, "loss": 0.1894, "step": 3392 }, { "epoch": 2.252916823485134, "grad_norm": 0.2179899960756302, "learning_rate": 8.09299954987484e-05, "loss": 0.257, "step": 3393 }, { "epoch": 2.2536695521264583, "grad_norm": 0.1819714456796646, "learning_rate": 8.091783175538303e-05, "loss": 0.2607, "step": 3394 }, { "epoch": 2.254422280767783, "grad_norm": 0.23573939502239227, "learning_rate": 8.090566517768256e-05, "loss": 0.2993, "step": 3395 }, { "epoch": 2.255175009409108, "grad_norm": 0.2860209047794342, "learning_rate": 8.089349576697749e-05, "loss": 0.2644, "step": 3396 }, { "epoch": 2.255927738050433, "grad_norm": 0.20026277005672455, "learning_rate": 8.088132352459865e-05, "loss": 0.2577, "step": 3397 }, { "epoch": 2.2566804666917575, "grad_norm": 0.2297220677137375, "learning_rate": 8.086914845187719e-05, "loss": 0.2259, "step": 3398 }, { "epoch": 2.2574331953330824, "grad_norm": 0.2623372972011566, "learning_rate": 8.085697055014454e-05, "loss": 0.2242, "step": 3399 }, { "epoch": 2.2581859239744073, "grad_norm": 0.2355906367301941, "learning_rate": 8.084478982073247e-05, "loss": 0.2856, "step": 3400 }, { "epoch": 2.2581859239744073, "eval_loss": 0.25720107555389404, "eval_runtime": 456.6022, "eval_samples_per_second": 21.084, "eval_steps_per_second": 0.659, "step": 3400 }, { "epoch": 2.258938652615732, "grad_norm": 0.19931893050670624, "learning_rate": 8.083260626497304e-05, "loss": 0.2363, "step": 3401 }, { "epoch": 2.2596913812570567, "grad_norm": 0.25287333130836487, "learning_rate": 8.082041988419862e-05, "loss": 0.2869, "step": 3402 }, { "epoch": 2.2604441098983816, "grad_norm": 0.23005792498588562, "learning_rate": 8.080823067974193e-05, "loss": 0.291, "step": 3403 }, { "epoch": 2.2611968385397065, "grad_norm": 0.25087034702301025, "learning_rate": 8.07960386529359e-05, "loss": 0.1922, "step": 3404 }, { "epoch": 2.2619495671810315, "grad_norm": 0.3009170889854431, "learning_rate": 8.078384380511389e-05, "loss": 0.2626, "step": 3405 }, { "epoch": 2.262702295822356, "grad_norm": 0.25530266761779785, "learning_rate": 8.077164613760948e-05, "loss": 0.1952, "step": 3406 }, { "epoch": 2.263455024463681, "grad_norm": 0.22784970700740814, "learning_rate": 8.075944565175659e-05, "loss": 0.1284, "step": 3407 }, { "epoch": 2.2642077531050058, "grad_norm": 0.25269633531570435, "learning_rate": 8.074724234888947e-05, "loss": 0.2795, "step": 3408 }, { "epoch": 2.2649604817463302, "grad_norm": 0.260029137134552, "learning_rate": 8.073503623034262e-05, "loss": 0.2548, "step": 3409 }, { "epoch": 2.265713210387655, "grad_norm": 0.29623138904571533, "learning_rate": 8.07228272974509e-05, "loss": 0.2131, "step": 3410 }, { "epoch": 2.26646593902898, "grad_norm": 0.27541160583496094, "learning_rate": 8.071061555154946e-05, "loss": 0.288, "step": 3411 }, { "epoch": 2.267218667670305, "grad_norm": 0.2265140265226364, "learning_rate": 8.069840099397376e-05, "loss": 0.1228, "step": 3412 }, { "epoch": 2.2679713963116295, "grad_norm": 0.2932869493961334, "learning_rate": 8.068618362605957e-05, "loss": 0.2573, "step": 3413 }, { "epoch": 2.2687241249529544, "grad_norm": 0.2440403550863266, "learning_rate": 8.067396344914297e-05, "loss": 0.2682, "step": 3414 }, { "epoch": 2.2694768535942793, "grad_norm": 0.2398448884487152, "learning_rate": 8.066174046456029e-05, "loss": 0.2967, "step": 3415 }, { "epoch": 2.2702295822356042, "grad_norm": 0.2457093447446823, "learning_rate": 8.064951467364828e-05, "loss": 0.172, "step": 3416 }, { "epoch": 2.2709823108769287, "grad_norm": 0.19004780054092407, "learning_rate": 8.063728607774393e-05, "loss": 0.1729, "step": 3417 }, { "epoch": 2.2717350395182536, "grad_norm": 0.24990634620189667, "learning_rate": 8.062505467818447e-05, "loss": 0.1911, "step": 3418 }, { "epoch": 2.2724877681595785, "grad_norm": 0.23250849545001984, "learning_rate": 8.06128204763076e-05, "loss": 0.199, "step": 3419 }, { "epoch": 2.2732404968009035, "grad_norm": 0.21658489108085632, "learning_rate": 8.060058347345116e-05, "loss": 0.2532, "step": 3420 }, { "epoch": 2.273993225442228, "grad_norm": 0.2677745521068573, "learning_rate": 8.058834367095343e-05, "loss": 0.2698, "step": 3421 }, { "epoch": 2.274745954083553, "grad_norm": 0.22018860280513763, "learning_rate": 8.057610107015288e-05, "loss": 0.2673, "step": 3422 }, { "epoch": 2.2754986827248778, "grad_norm": 0.21756963431835175, "learning_rate": 8.05638556723884e-05, "loss": 0.2889, "step": 3423 }, { "epoch": 2.2762514113662027, "grad_norm": 0.24859803915023804, "learning_rate": 8.055160747899907e-05, "loss": 0.2008, "step": 3424 }, { "epoch": 2.277004140007527, "grad_norm": 0.19479088485240936, "learning_rate": 8.053935649132437e-05, "loss": 0.2129, "step": 3425 }, { "epoch": 2.277756868648852, "grad_norm": 0.1980551928281784, "learning_rate": 8.052710271070405e-05, "loss": 0.2101, "step": 3426 }, { "epoch": 2.278509597290177, "grad_norm": 0.24560539424419403, "learning_rate": 8.051484613847815e-05, "loss": 0.2131, "step": 3427 }, { "epoch": 2.2792623259315015, "grad_norm": 0.2001018524169922, "learning_rate": 8.050258677598703e-05, "loss": 0.1383, "step": 3428 }, { "epoch": 2.2800150545728264, "grad_norm": 0.18486446142196655, "learning_rate": 8.049032462457137e-05, "loss": 0.1703, "step": 3429 }, { "epoch": 2.2807677832141513, "grad_norm": 0.2507585287094116, "learning_rate": 8.047805968557214e-05, "loss": 0.2553, "step": 3430 }, { "epoch": 2.281520511855476, "grad_norm": 0.2093372792005539, "learning_rate": 8.04657919603306e-05, "loss": 0.2044, "step": 3431 }, { "epoch": 2.282273240496801, "grad_norm": 0.23499730229377747, "learning_rate": 8.045352145018835e-05, "loss": 0.1883, "step": 3432 }, { "epoch": 2.2830259691381256, "grad_norm": 0.21208029985427856, "learning_rate": 8.044124815648727e-05, "loss": 0.3387, "step": 3433 }, { "epoch": 2.2837786977794505, "grad_norm": 0.2195087969303131, "learning_rate": 8.042897208056952e-05, "loss": 0.1784, "step": 3434 }, { "epoch": 2.2845314264207754, "grad_norm": 0.225613072514534, "learning_rate": 8.041669322377761e-05, "loss": 0.1834, "step": 3435 }, { "epoch": 2.2852841550621, "grad_norm": 0.19705727696418762, "learning_rate": 8.040441158745437e-05, "loss": 0.2404, "step": 3436 }, { "epoch": 2.286036883703425, "grad_norm": 0.25471675395965576, "learning_rate": 8.039212717294284e-05, "loss": 0.1827, "step": 3437 }, { "epoch": 2.2867896123447498, "grad_norm": 0.2108861654996872, "learning_rate": 8.03798399815865e-05, "loss": 0.1851, "step": 3438 }, { "epoch": 2.2875423409860747, "grad_norm": 0.23347444832324982, "learning_rate": 8.036755001472899e-05, "loss": 0.2831, "step": 3439 }, { "epoch": 2.288295069627399, "grad_norm": 0.20753051340579987, "learning_rate": 8.035525727371436e-05, "loss": 0.2447, "step": 3440 }, { "epoch": 2.289047798268724, "grad_norm": 0.2516370117664337, "learning_rate": 8.03429617598869e-05, "loss": 0.2724, "step": 3441 }, { "epoch": 2.289800526910049, "grad_norm": 0.2203025221824646, "learning_rate": 8.033066347459125e-05, "loss": 0.2263, "step": 3442 }, { "epoch": 2.290553255551374, "grad_norm": 0.23109307885169983, "learning_rate": 8.031836241917234e-05, "loss": 0.1865, "step": 3443 }, { "epoch": 2.2913059841926984, "grad_norm": 0.22276292741298676, "learning_rate": 8.030605859497537e-05, "loss": 0.2638, "step": 3444 }, { "epoch": 2.2920587128340233, "grad_norm": 0.19562552869319916, "learning_rate": 8.029375200334588e-05, "loss": 0.14, "step": 3445 }, { "epoch": 2.292811441475348, "grad_norm": 0.2486756592988968, "learning_rate": 8.028144264562972e-05, "loss": 0.2732, "step": 3446 }, { "epoch": 2.293564170116673, "grad_norm": 0.23385153710842133, "learning_rate": 8.026913052317296e-05, "loss": 0.2396, "step": 3447 }, { "epoch": 2.2943168987579976, "grad_norm": 0.21921378374099731, "learning_rate": 8.025681563732211e-05, "loss": 0.2503, "step": 3448 }, { "epoch": 2.2950696273993225, "grad_norm": 0.23268820345401764, "learning_rate": 8.024449798942387e-05, "loss": 0.3101, "step": 3449 }, { "epoch": 2.2958223560406474, "grad_norm": 0.20966053009033203, "learning_rate": 8.023217758082529e-05, "loss": 0.2174, "step": 3450 }, { "epoch": 2.2965750846819724, "grad_norm": 0.23738014698028564, "learning_rate": 8.021985441287369e-05, "loss": 0.206, "step": 3451 }, { "epoch": 2.297327813323297, "grad_norm": 0.21454080939292908, "learning_rate": 8.020752848691673e-05, "loss": 0.2736, "step": 3452 }, { "epoch": 2.2980805419646217, "grad_norm": 0.20659993588924408, "learning_rate": 8.019519980430236e-05, "loss": 0.272, "step": 3453 }, { "epoch": 2.2988332706059467, "grad_norm": 0.22665786743164062, "learning_rate": 8.018286836637882e-05, "loss": 0.2301, "step": 3454 }, { "epoch": 2.299585999247271, "grad_norm": 0.23268432915210724, "learning_rate": 8.017053417449466e-05, "loss": 0.2106, "step": 3455 }, { "epoch": 2.300338727888596, "grad_norm": 0.1913755238056183, "learning_rate": 8.015819722999872e-05, "loss": 0.2516, "step": 3456 }, { "epoch": 2.301091456529921, "grad_norm": 0.24007105827331543, "learning_rate": 8.014585753424016e-05, "loss": 0.2567, "step": 3457 }, { "epoch": 2.301844185171246, "grad_norm": 0.24707677960395813, "learning_rate": 8.013351508856842e-05, "loss": 0.2162, "step": 3458 }, { "epoch": 2.302596913812571, "grad_norm": 0.2107446789741516, "learning_rate": 8.012116989433328e-05, "loss": 0.1735, "step": 3459 }, { "epoch": 2.3033496424538953, "grad_norm": 0.24099643528461456, "learning_rate": 8.010882195288474e-05, "loss": 0.2919, "step": 3460 }, { "epoch": 2.30410237109522, "grad_norm": 0.21043506264686584, "learning_rate": 8.00964712655732e-05, "loss": 0.1442, "step": 3461 }, { "epoch": 2.304855099736545, "grad_norm": 0.23150359094142914, "learning_rate": 8.008411783374928e-05, "loss": 0.2595, "step": 3462 }, { "epoch": 2.3056078283778696, "grad_norm": 0.22191643714904785, "learning_rate": 8.007176165876395e-05, "loss": 0.2134, "step": 3463 }, { "epoch": 2.3063605570191945, "grad_norm": 0.19727656245231628, "learning_rate": 8.005940274196846e-05, "loss": 0.2289, "step": 3464 }, { "epoch": 2.3071132856605194, "grad_norm": 0.2212575078010559, "learning_rate": 8.004704108471437e-05, "loss": 0.2389, "step": 3465 }, { "epoch": 2.3078660143018443, "grad_norm": 0.2579392194747925, "learning_rate": 8.003467668835353e-05, "loss": 0.2367, "step": 3466 }, { "epoch": 2.308618742943169, "grad_norm": 0.20262131094932556, "learning_rate": 8.002230955423808e-05, "loss": 0.1653, "step": 3467 }, { "epoch": 2.3093714715844937, "grad_norm": 0.24121646583080292, "learning_rate": 8.000993968372047e-05, "loss": 0.2882, "step": 3468 }, { "epoch": 2.3101242002258187, "grad_norm": 0.26251912117004395, "learning_rate": 7.999756707815345e-05, "loss": 0.2664, "step": 3469 }, { "epoch": 2.3108769288671436, "grad_norm": 0.2645053565502167, "learning_rate": 7.998519173889009e-05, "loss": 0.1815, "step": 3470 }, { "epoch": 2.311629657508468, "grad_norm": 0.20649076998233795, "learning_rate": 7.997281366728374e-05, "loss": 0.2324, "step": 3471 }, { "epoch": 2.312382386149793, "grad_norm": 0.2325669676065445, "learning_rate": 7.996043286468802e-05, "loss": 0.2439, "step": 3472 }, { "epoch": 2.313135114791118, "grad_norm": 0.2240188866853714, "learning_rate": 7.99480493324569e-05, "loss": 0.2215, "step": 3473 }, { "epoch": 2.3138878434324424, "grad_norm": 0.18034660816192627, "learning_rate": 7.99356630719446e-05, "loss": 0.1977, "step": 3474 }, { "epoch": 2.3146405720737673, "grad_norm": 0.25509026646614075, "learning_rate": 7.992327408450569e-05, "loss": 0.2991, "step": 3475 }, { "epoch": 2.315393300715092, "grad_norm": 0.21873173117637634, "learning_rate": 7.991088237149497e-05, "loss": 0.2243, "step": 3476 }, { "epoch": 2.316146029356417, "grad_norm": 0.21728385984897614, "learning_rate": 7.989848793426764e-05, "loss": 0.2272, "step": 3477 }, { "epoch": 2.316898757997742, "grad_norm": 0.24483580887317657, "learning_rate": 7.988609077417908e-05, "loss": 0.3054, "step": 3478 }, { "epoch": 2.3176514866390665, "grad_norm": 0.25953882932662964, "learning_rate": 7.987369089258506e-05, "loss": 0.2515, "step": 3479 }, { "epoch": 2.3184042152803914, "grad_norm": 0.23823650181293488, "learning_rate": 7.986128829084161e-05, "loss": 0.2529, "step": 3480 }, { "epoch": 2.3191569439217163, "grad_norm": 0.2242562174797058, "learning_rate": 7.984888297030502e-05, "loss": 0.2789, "step": 3481 }, { "epoch": 2.319909672563041, "grad_norm": 0.2006361335515976, "learning_rate": 7.983647493233199e-05, "loss": 0.1754, "step": 3482 }, { "epoch": 2.3206624012043657, "grad_norm": 0.1914907544851303, "learning_rate": 7.982406417827936e-05, "loss": 0.2524, "step": 3483 }, { "epoch": 2.3214151298456907, "grad_norm": 0.25433361530303955, "learning_rate": 7.981165070950441e-05, "loss": 0.2482, "step": 3484 }, { "epoch": 2.3221678584870156, "grad_norm": 0.2724166512489319, "learning_rate": 7.979923452736464e-05, "loss": 0.2747, "step": 3485 }, { "epoch": 2.32292058712834, "grad_norm": 0.20013456046581268, "learning_rate": 7.978681563321788e-05, "loss": 0.1816, "step": 3486 }, { "epoch": 2.323673315769665, "grad_norm": 0.23627135157585144, "learning_rate": 7.977439402842223e-05, "loss": 0.3211, "step": 3487 }, { "epoch": 2.32442604441099, "grad_norm": 0.17235304415225983, "learning_rate": 7.976196971433608e-05, "loss": 0.2393, "step": 3488 }, { "epoch": 2.325178773052315, "grad_norm": 0.22328251600265503, "learning_rate": 7.974954269231817e-05, "loss": 0.2591, "step": 3489 }, { "epoch": 2.3259315016936393, "grad_norm": 0.2366732805967331, "learning_rate": 7.973711296372748e-05, "loss": 0.3056, "step": 3490 }, { "epoch": 2.326684230334964, "grad_norm": 0.21634416282176971, "learning_rate": 7.97246805299233e-05, "loss": 0.202, "step": 3491 }, { "epoch": 2.327436958976289, "grad_norm": 0.25269946455955505, "learning_rate": 7.971224539226522e-05, "loss": 0.3109, "step": 3492 }, { "epoch": 2.328189687617614, "grad_norm": 0.21751768887043, "learning_rate": 7.969980755211315e-05, "loss": 0.2372, "step": 3493 }, { "epoch": 2.3289424162589385, "grad_norm": 0.24160189926624298, "learning_rate": 7.968736701082726e-05, "loss": 0.3597, "step": 3494 }, { "epoch": 2.3296951449002634, "grad_norm": 0.2009546011686325, "learning_rate": 7.967492376976803e-05, "loss": 0.1791, "step": 3495 }, { "epoch": 2.3304478735415883, "grad_norm": 0.20677600800991058, "learning_rate": 7.966247783029623e-05, "loss": 0.24, "step": 3496 }, { "epoch": 2.3312006021829133, "grad_norm": 0.19954098761081696, "learning_rate": 7.965002919377294e-05, "loss": 0.2152, "step": 3497 }, { "epoch": 2.3319533308242377, "grad_norm": 0.2153128981590271, "learning_rate": 7.963757786155952e-05, "loss": 0.2539, "step": 3498 }, { "epoch": 2.3327060594655626, "grad_norm": 0.17147773504257202, "learning_rate": 7.962512383501764e-05, "loss": 0.2009, "step": 3499 }, { "epoch": 2.3334587881068876, "grad_norm": 0.20578043162822723, "learning_rate": 7.961266711550922e-05, "loss": 0.2238, "step": 3500 }, { "epoch": 2.334211516748212, "grad_norm": 0.16992244124412537, "learning_rate": 7.960020770439653e-05, "loss": 0.2596, "step": 3501 }, { "epoch": 2.334964245389537, "grad_norm": 0.22063565254211426, "learning_rate": 7.958774560304213e-05, "loss": 0.2632, "step": 3502 }, { "epoch": 2.335716974030862, "grad_norm": 0.19499792158603668, "learning_rate": 7.957528081280882e-05, "loss": 0.2654, "step": 3503 }, { "epoch": 2.336469702672187, "grad_norm": 0.1766320914030075, "learning_rate": 7.956281333505975e-05, "loss": 0.2013, "step": 3504 }, { "epoch": 2.3372224313135117, "grad_norm": 0.18747951090335846, "learning_rate": 7.955034317115834e-05, "loss": 0.2024, "step": 3505 }, { "epoch": 2.337975159954836, "grad_norm": 0.20715457201004028, "learning_rate": 7.953787032246832e-05, "loss": 0.278, "step": 3506 }, { "epoch": 2.338727888596161, "grad_norm": 0.19330427050590515, "learning_rate": 7.95253947903537e-05, "loss": 0.2647, "step": 3507 }, { "epoch": 2.339480617237486, "grad_norm": 0.2123214602470398, "learning_rate": 7.951291657617876e-05, "loss": 0.23, "step": 3508 }, { "epoch": 2.3402333458788105, "grad_norm": 0.18789102137088776, "learning_rate": 7.950043568130812e-05, "loss": 0.174, "step": 3509 }, { "epoch": 2.3409860745201354, "grad_norm": 0.2133096307516098, "learning_rate": 7.948795210710669e-05, "loss": 0.2637, "step": 3510 }, { "epoch": 2.3417388031614603, "grad_norm": 0.21861448884010315, "learning_rate": 7.94754658549396e-05, "loss": 0.2685, "step": 3511 }, { "epoch": 2.3424915318027852, "grad_norm": 0.2123902440071106, "learning_rate": 7.94629769261724e-05, "loss": 0.2042, "step": 3512 }, { "epoch": 2.3432442604441097, "grad_norm": 0.21659554541110992, "learning_rate": 7.94504853221708e-05, "loss": 0.1973, "step": 3513 }, { "epoch": 2.3439969890854346, "grad_norm": 0.2474660873413086, "learning_rate": 7.943799104430088e-05, "loss": 0.2879, "step": 3514 }, { "epoch": 2.3447497177267596, "grad_norm": 0.23123005032539368, "learning_rate": 7.942549409392903e-05, "loss": 0.2045, "step": 3515 }, { "epoch": 2.3455024463680845, "grad_norm": 0.2099221795797348, "learning_rate": 7.941299447242186e-05, "loss": 0.2339, "step": 3516 }, { "epoch": 2.346255175009409, "grad_norm": 0.19841338694095612, "learning_rate": 7.940049218114631e-05, "loss": 0.2255, "step": 3517 }, { "epoch": 2.347007903650734, "grad_norm": 0.2549242377281189, "learning_rate": 7.938798722146962e-05, "loss": 0.2754, "step": 3518 }, { "epoch": 2.347760632292059, "grad_norm": 0.20617328584194183, "learning_rate": 7.937547959475932e-05, "loss": 0.2421, "step": 3519 }, { "epoch": 2.3485133609333837, "grad_norm": 0.27425137162208557, "learning_rate": 7.936296930238321e-05, "loss": 0.3603, "step": 3520 }, { "epoch": 2.349266089574708, "grad_norm": 0.2211214005947113, "learning_rate": 7.935045634570941e-05, "loss": 0.2326, "step": 3521 }, { "epoch": 2.350018818216033, "grad_norm": 0.23768185079097748, "learning_rate": 7.933794072610632e-05, "loss": 0.2151, "step": 3522 }, { "epoch": 2.350771546857358, "grad_norm": 0.2858554720878601, "learning_rate": 7.932542244494262e-05, "loss": 0.2599, "step": 3523 }, { "epoch": 2.351524275498683, "grad_norm": 0.21860752999782562, "learning_rate": 7.93129015035873e-05, "loss": 0.2169, "step": 3524 }, { "epoch": 2.3522770041400074, "grad_norm": 0.22120510041713715, "learning_rate": 7.930037790340963e-05, "loss": 0.2249, "step": 3525 }, { "epoch": 2.3530297327813323, "grad_norm": 0.23272040486335754, "learning_rate": 7.928785164577916e-05, "loss": 0.1438, "step": 3526 }, { "epoch": 2.3537824614226572, "grad_norm": 0.2355942577123642, "learning_rate": 7.927532273206574e-05, "loss": 0.1953, "step": 3527 }, { "epoch": 2.3545351900639817, "grad_norm": 0.20182572305202484, "learning_rate": 7.926279116363954e-05, "loss": 0.1871, "step": 3528 }, { "epoch": 2.3552879187053066, "grad_norm": 0.2055741548538208, "learning_rate": 7.925025694187097e-05, "loss": 0.2515, "step": 3529 }, { "epoch": 2.3560406473466315, "grad_norm": 0.22935563325881958, "learning_rate": 7.923772006813075e-05, "loss": 0.3143, "step": 3530 }, { "epoch": 2.3567933759879565, "grad_norm": 0.2549310326576233, "learning_rate": 7.922518054378992e-05, "loss": 0.2553, "step": 3531 }, { "epoch": 2.3575461046292814, "grad_norm": 0.22819143533706665, "learning_rate": 7.921263837021976e-05, "loss": 0.1855, "step": 3532 }, { "epoch": 2.358298833270606, "grad_norm": 0.20884191989898682, "learning_rate": 7.920009354879188e-05, "loss": 0.3405, "step": 3533 }, { "epoch": 2.3590515619119308, "grad_norm": 0.21802668273448944, "learning_rate": 7.918754608087813e-05, "loss": 0.2283, "step": 3534 }, { "epoch": 2.3598042905532557, "grad_norm": 0.24331103265285492, "learning_rate": 7.917499596785072e-05, "loss": 0.2875, "step": 3535 }, { "epoch": 2.36055701919458, "grad_norm": 0.1988477259874344, "learning_rate": 7.916244321108212e-05, "loss": 0.2027, "step": 3536 }, { "epoch": 2.361309747835905, "grad_norm": 0.1855904757976532, "learning_rate": 7.914988781194502e-05, "loss": 0.2377, "step": 3537 }, { "epoch": 2.36206247647723, "grad_norm": 0.1874285787343979, "learning_rate": 7.913732977181252e-05, "loss": 0.1548, "step": 3538 }, { "epoch": 2.362815205118555, "grad_norm": 0.244818776845932, "learning_rate": 7.912476909205791e-05, "loss": 0.21, "step": 3539 }, { "epoch": 2.3635679337598794, "grad_norm": 0.21268151700496674, "learning_rate": 7.911220577405484e-05, "loss": 0.2027, "step": 3540 }, { "epoch": 2.3643206624012043, "grad_norm": 0.21127718687057495, "learning_rate": 7.909963981917721e-05, "loss": 0.1777, "step": 3541 }, { "epoch": 2.3650733910425292, "grad_norm": 0.23806901276111603, "learning_rate": 7.908707122879918e-05, "loss": 0.2525, "step": 3542 }, { "epoch": 2.365826119683854, "grad_norm": 0.20171697437763214, "learning_rate": 7.907450000429526e-05, "loss": 0.1537, "step": 3543 }, { "epoch": 2.3665788483251786, "grad_norm": 0.20511925220489502, "learning_rate": 7.906192614704022e-05, "loss": 0.2342, "step": 3544 }, { "epoch": 2.3673315769665035, "grad_norm": 0.22347357869148254, "learning_rate": 7.904934965840911e-05, "loss": 0.2436, "step": 3545 }, { "epoch": 2.3680843056078285, "grad_norm": 0.20105582475662231, "learning_rate": 7.903677053977728e-05, "loss": 0.2989, "step": 3546 }, { "epoch": 2.3688370342491534, "grad_norm": 0.24721446633338928, "learning_rate": 7.902418879252036e-05, "loss": 0.2702, "step": 3547 }, { "epoch": 2.369589762890478, "grad_norm": 0.20413453876972198, "learning_rate": 7.901160441801427e-05, "loss": 0.1787, "step": 3548 }, { "epoch": 2.3703424915318028, "grad_norm": 0.20126815140247345, "learning_rate": 7.899901741763523e-05, "loss": 0.2066, "step": 3549 }, { "epoch": 2.3710952201731277, "grad_norm": 0.2130788266658783, "learning_rate": 7.898642779275972e-05, "loss": 0.2049, "step": 3550 }, { "epoch": 2.3718479488144526, "grad_norm": 0.21701934933662415, "learning_rate": 7.897383554476454e-05, "loss": 0.2054, "step": 3551 }, { "epoch": 2.372600677455777, "grad_norm": 0.22819887101650238, "learning_rate": 7.896124067502673e-05, "loss": 0.2608, "step": 3552 }, { "epoch": 2.373353406097102, "grad_norm": 0.18686643242835999, "learning_rate": 7.894864318492369e-05, "loss": 0.219, "step": 3553 }, { "epoch": 2.374106134738427, "grad_norm": 0.19374196231365204, "learning_rate": 7.893604307583303e-05, "loss": 0.1926, "step": 3554 }, { "epoch": 2.3748588633797514, "grad_norm": 0.22111903131008148, "learning_rate": 7.892344034913268e-05, "loss": 0.2544, "step": 3555 }, { "epoch": 2.3756115920210763, "grad_norm": 0.20688702166080475, "learning_rate": 7.891083500620085e-05, "loss": 0.2832, "step": 3556 }, { "epoch": 2.376364320662401, "grad_norm": 0.26079678535461426, "learning_rate": 7.889822704841607e-05, "loss": 0.247, "step": 3557 }, { "epoch": 2.377117049303726, "grad_norm": 0.23507142066955566, "learning_rate": 7.888561647715711e-05, "loss": 0.2017, "step": 3558 }, { "epoch": 2.377869777945051, "grad_norm": 0.21099036931991577, "learning_rate": 7.887300329380304e-05, "loss": 0.2077, "step": 3559 }, { "epoch": 2.3786225065863755, "grad_norm": 0.19221696257591248, "learning_rate": 7.886038749973323e-05, "loss": 0.1634, "step": 3560 }, { "epoch": 2.3793752352277004, "grad_norm": 0.250676691532135, "learning_rate": 7.88477690963273e-05, "loss": 0.2899, "step": 3561 }, { "epoch": 2.3801279638690254, "grad_norm": 0.18468178808689117, "learning_rate": 7.883514808496519e-05, "loss": 0.2108, "step": 3562 }, { "epoch": 2.38088069251035, "grad_norm": 0.2087111622095108, "learning_rate": 7.882252446702715e-05, "loss": 0.2171, "step": 3563 }, { "epoch": 2.3816334211516748, "grad_norm": 0.22475086152553558, "learning_rate": 7.880989824389364e-05, "loss": 0.2869, "step": 3564 }, { "epoch": 2.3823861497929997, "grad_norm": 0.21768946945667267, "learning_rate": 7.879726941694544e-05, "loss": 0.2562, "step": 3565 }, { "epoch": 2.3831388784343246, "grad_norm": 0.21526214480400085, "learning_rate": 7.878463798756365e-05, "loss": 0.2946, "step": 3566 }, { "epoch": 2.383891607075649, "grad_norm": 0.19574370980262756, "learning_rate": 7.877200395712961e-05, "loss": 0.2682, "step": 3567 }, { "epoch": 2.384644335716974, "grad_norm": 0.2263791263103485, "learning_rate": 7.875936732702495e-05, "loss": 0.2852, "step": 3568 }, { "epoch": 2.385397064358299, "grad_norm": 0.21176151931285858, "learning_rate": 7.874672809863162e-05, "loss": 0.2976, "step": 3569 }, { "epoch": 2.386149792999624, "grad_norm": 0.26495373249053955, "learning_rate": 7.87340862733318e-05, "loss": 0.2629, "step": 3570 }, { "epoch": 2.3869025216409483, "grad_norm": 0.27862781286239624, "learning_rate": 7.872144185250797e-05, "loss": 0.1889, "step": 3571 }, { "epoch": 2.387655250282273, "grad_norm": 0.22650767862796783, "learning_rate": 7.870879483754293e-05, "loss": 0.2477, "step": 3572 }, { "epoch": 2.388407978923598, "grad_norm": 0.23630303144454956, "learning_rate": 7.869614522981975e-05, "loss": 0.2111, "step": 3573 }, { "epoch": 2.3891607075649226, "grad_norm": 0.24191449582576752, "learning_rate": 7.868349303072174e-05, "loss": 0.2624, "step": 3574 }, { "epoch": 2.3899134362062475, "grad_norm": 0.21889804303646088, "learning_rate": 7.867083824163254e-05, "loss": 0.1494, "step": 3575 }, { "epoch": 2.3906661648475724, "grad_norm": 0.2369832694530487, "learning_rate": 7.865818086393606e-05, "loss": 0.2731, "step": 3576 }, { "epoch": 2.3914188934888974, "grad_norm": 0.2407459318637848, "learning_rate": 7.864552089901648e-05, "loss": 0.2983, "step": 3577 }, { "epoch": 2.3921716221302223, "grad_norm": 0.22939927875995636, "learning_rate": 7.863285834825832e-05, "loss": 0.1661, "step": 3578 }, { "epoch": 2.3929243507715467, "grad_norm": 0.20866723358631134, "learning_rate": 7.862019321304627e-05, "loss": 0.2255, "step": 3579 }, { "epoch": 2.3936770794128717, "grad_norm": 0.20657241344451904, "learning_rate": 7.860752549476542e-05, "loss": 0.1968, "step": 3580 }, { "epoch": 2.3944298080541966, "grad_norm": 0.24193885922431946, "learning_rate": 7.859485519480107e-05, "loss": 0.2865, "step": 3581 }, { "epoch": 2.395182536695521, "grad_norm": 0.21017791330814362, "learning_rate": 7.858218231453882e-05, "loss": 0.2985, "step": 3582 }, { "epoch": 2.395935265336846, "grad_norm": 0.21247486770153046, "learning_rate": 7.856950685536458e-05, "loss": 0.2938, "step": 3583 }, { "epoch": 2.396687993978171, "grad_norm": 0.22558197379112244, "learning_rate": 7.855682881866448e-05, "loss": 0.2023, "step": 3584 }, { "epoch": 2.397440722619496, "grad_norm": 0.22757388651371002, "learning_rate": 7.854414820582503e-05, "loss": 0.195, "step": 3585 }, { "epoch": 2.3981934512608203, "grad_norm": 0.29916465282440186, "learning_rate": 7.853146501823292e-05, "loss": 0.2984, "step": 3586 }, { "epoch": 2.398946179902145, "grad_norm": 0.19998562335968018, "learning_rate": 7.851877925727517e-05, "loss": 0.1933, "step": 3587 }, { "epoch": 2.39969890854347, "grad_norm": 0.23472170531749725, "learning_rate": 7.850609092433909e-05, "loss": 0.1548, "step": 3588 }, { "epoch": 2.400451637184795, "grad_norm": 0.26466113328933716, "learning_rate": 7.849340002081223e-05, "loss": 0.3117, "step": 3589 }, { "epoch": 2.4012043658261195, "grad_norm": 0.238702192902565, "learning_rate": 7.848070654808249e-05, "loss": 0.2483, "step": 3590 }, { "epoch": 2.4019570944674444, "grad_norm": 0.23315422236919403, "learning_rate": 7.846801050753796e-05, "loss": 0.3182, "step": 3591 }, { "epoch": 2.4027098231087693, "grad_norm": 0.24940195679664612, "learning_rate": 7.84553119005671e-05, "loss": 0.173, "step": 3592 }, { "epoch": 2.4034625517500943, "grad_norm": 0.20796126127243042, "learning_rate": 7.844261072855859e-05, "loss": 0.2352, "step": 3593 }, { "epoch": 2.4042152803914187, "grad_norm": 0.21470265090465546, "learning_rate": 7.842990699290142e-05, "loss": 0.1927, "step": 3594 }, { "epoch": 2.4049680090327437, "grad_norm": 0.1916157603263855, "learning_rate": 7.841720069498485e-05, "loss": 0.2049, "step": 3595 }, { "epoch": 2.4057207376740686, "grad_norm": 0.21635901927947998, "learning_rate": 7.840449183619841e-05, "loss": 0.1571, "step": 3596 }, { "epoch": 2.4064734663153935, "grad_norm": 0.24256305396556854, "learning_rate": 7.839178041793193e-05, "loss": 0.2129, "step": 3597 }, { "epoch": 2.407226194956718, "grad_norm": 0.22143420577049255, "learning_rate": 7.837906644157554e-05, "loss": 0.2145, "step": 3598 }, { "epoch": 2.407978923598043, "grad_norm": 0.2058362513780594, "learning_rate": 7.836634990851956e-05, "loss": 0.2341, "step": 3599 }, { "epoch": 2.408731652239368, "grad_norm": 0.2354547679424286, "learning_rate": 7.835363082015468e-05, "loss": 0.3477, "step": 3600 }, { "epoch": 2.408731652239368, "eval_loss": 0.2508648633956909, "eval_runtime": 456.4837, "eval_samples_per_second": 21.089, "eval_steps_per_second": 0.659, "step": 3600 }, { "epoch": 2.4094843808806923, "grad_norm": 0.2616819143295288, "learning_rate": 7.834090917787186e-05, "loss": 0.1366, "step": 3601 }, { "epoch": 2.410237109522017, "grad_norm": 0.24325218796730042, "learning_rate": 7.832818498306233e-05, "loss": 0.2863, "step": 3602 }, { "epoch": 2.410989838163342, "grad_norm": 0.20645950734615326, "learning_rate": 7.831545823711752e-05, "loss": 0.2041, "step": 3603 }, { "epoch": 2.411742566804667, "grad_norm": 0.227847620844841, "learning_rate": 7.830272894142928e-05, "loss": 0.3332, "step": 3604 }, { "epoch": 2.412495295445992, "grad_norm": 0.20173196494579315, "learning_rate": 7.828999709738964e-05, "loss": 0.166, "step": 3605 }, { "epoch": 2.4132480240873164, "grad_norm": 0.18818047642707825, "learning_rate": 7.827726270639091e-05, "loss": 0.3297, "step": 3606 }, { "epoch": 2.4140007527286413, "grad_norm": 0.2360728532075882, "learning_rate": 7.826452576982574e-05, "loss": 0.1163, "step": 3607 }, { "epoch": 2.4147534813699663, "grad_norm": 0.20919352769851685, "learning_rate": 7.8251786289087e-05, "loss": 0.2429, "step": 3608 }, { "epoch": 2.4155062100112907, "grad_norm": 0.21313443779945374, "learning_rate": 7.823904426556789e-05, "loss": 0.2425, "step": 3609 }, { "epoch": 2.4162589386526157, "grad_norm": 0.1886582374572754, "learning_rate": 7.822629970066181e-05, "loss": 0.1906, "step": 3610 }, { "epoch": 2.4170116672939406, "grad_norm": 0.20943370461463928, "learning_rate": 7.821355259576256e-05, "loss": 0.2817, "step": 3611 }, { "epoch": 2.4177643959352655, "grad_norm": 0.17996490001678467, "learning_rate": 7.820080295226406e-05, "loss": 0.1542, "step": 3612 }, { "epoch": 2.41851712457659, "grad_norm": 0.18838150799274445, "learning_rate": 7.818805077156063e-05, "loss": 0.2445, "step": 3613 }, { "epoch": 2.419269853217915, "grad_norm": 0.19396783411502838, "learning_rate": 7.817529605504684e-05, "loss": 0.1747, "step": 3614 }, { "epoch": 2.42002258185924, "grad_norm": 0.17619027197360992, "learning_rate": 7.816253880411753e-05, "loss": 0.1973, "step": 3615 }, { "epoch": 2.4207753105005647, "grad_norm": 0.20814360678195953, "learning_rate": 7.814977902016779e-05, "loss": 0.3141, "step": 3616 }, { "epoch": 2.421528039141889, "grad_norm": 0.20647026598453522, "learning_rate": 7.813701670459302e-05, "loss": 0.3309, "step": 3617 }, { "epoch": 2.422280767783214, "grad_norm": 0.2082589566707611, "learning_rate": 7.81242518587889e-05, "loss": 0.2219, "step": 3618 }, { "epoch": 2.423033496424539, "grad_norm": 0.2380771040916443, "learning_rate": 7.811148448415134e-05, "loss": 0.3112, "step": 3619 }, { "epoch": 2.423786225065864, "grad_norm": 0.20092543959617615, "learning_rate": 7.809871458207661e-05, "loss": 0.2565, "step": 3620 }, { "epoch": 2.4245389537071884, "grad_norm": 0.18121238052845, "learning_rate": 7.808594215396119e-05, "loss": 0.1976, "step": 3621 }, { "epoch": 2.4252916823485133, "grad_norm": 0.23144963383674622, "learning_rate": 7.80731672012018e-05, "loss": 0.2162, "step": 3622 }, { "epoch": 2.4260444109898383, "grad_norm": 0.17867790162563324, "learning_rate": 7.806038972519556e-05, "loss": 0.2451, "step": 3623 }, { "epoch": 2.426797139631163, "grad_norm": 0.21787407994270325, "learning_rate": 7.804760972733979e-05, "loss": 0.1966, "step": 3624 }, { "epoch": 2.4275498682724876, "grad_norm": 0.20421351492404938, "learning_rate": 7.803482720903205e-05, "loss": 0.2361, "step": 3625 }, { "epoch": 2.4283025969138126, "grad_norm": 0.19517141580581665, "learning_rate": 7.802204217167024e-05, "loss": 0.2694, "step": 3626 }, { "epoch": 2.4290553255551375, "grad_norm": 0.2097918689250946, "learning_rate": 7.800925461665253e-05, "loss": 0.1569, "step": 3627 }, { "epoch": 2.429808054196462, "grad_norm": 0.22231954336166382, "learning_rate": 7.799646454537731e-05, "loss": 0.2205, "step": 3628 }, { "epoch": 2.430560782837787, "grad_norm": 0.22477521002292633, "learning_rate": 7.79836719592433e-05, "loss": 0.2496, "step": 3629 }, { "epoch": 2.431313511479112, "grad_norm": 0.2251272052526474, "learning_rate": 7.797087685964948e-05, "loss": 0.2417, "step": 3630 }, { "epoch": 2.4320662401204367, "grad_norm": 0.214249387383461, "learning_rate": 7.79580792479951e-05, "loss": 0.2228, "step": 3631 }, { "epoch": 2.4328189687617616, "grad_norm": 0.2161981463432312, "learning_rate": 7.794527912567971e-05, "loss": 0.2018, "step": 3632 }, { "epoch": 2.433571697403086, "grad_norm": 0.2054065763950348, "learning_rate": 7.793247649410307e-05, "loss": 0.1592, "step": 3633 }, { "epoch": 2.434324426044411, "grad_norm": 0.205178365111351, "learning_rate": 7.791967135466528e-05, "loss": 0.188, "step": 3634 }, { "epoch": 2.435077154685736, "grad_norm": 0.19672410190105438, "learning_rate": 7.790686370876671e-05, "loss": 0.1899, "step": 3635 }, { "epoch": 2.4358298833270604, "grad_norm": 0.1863369196653366, "learning_rate": 7.789405355780795e-05, "loss": 0.1922, "step": 3636 }, { "epoch": 2.4365826119683853, "grad_norm": 0.21264994144439697, "learning_rate": 7.788124090318993e-05, "loss": 0.1686, "step": 3637 }, { "epoch": 2.4373353406097102, "grad_norm": 0.20834572613239288, "learning_rate": 7.786842574631378e-05, "loss": 0.1969, "step": 3638 }, { "epoch": 2.438088069251035, "grad_norm": 0.2022891342639923, "learning_rate": 7.7855608088581e-05, "loss": 0.2272, "step": 3639 }, { "epoch": 2.4388407978923596, "grad_norm": 0.1827555000782013, "learning_rate": 7.784278793139327e-05, "loss": 0.1799, "step": 3640 }, { "epoch": 2.4395935265336846, "grad_norm": 0.2077038437128067, "learning_rate": 7.782996527615259e-05, "loss": 0.1861, "step": 3641 }, { "epoch": 2.4403462551750095, "grad_norm": 0.21200031042099, "learning_rate": 7.781714012426124e-05, "loss": 0.2015, "step": 3642 }, { "epoch": 2.4410989838163344, "grad_norm": 0.18905869126319885, "learning_rate": 7.780431247712175e-05, "loss": 0.2056, "step": 3643 }, { "epoch": 2.441851712457659, "grad_norm": 0.24332134425640106, "learning_rate": 7.779148233613692e-05, "loss": 0.3456, "step": 3644 }, { "epoch": 2.442604441098984, "grad_norm": 0.19180460274219513, "learning_rate": 7.777864970270986e-05, "loss": 0.1732, "step": 3645 }, { "epoch": 2.4433571697403087, "grad_norm": 0.25233930349349976, "learning_rate": 7.776581457824393e-05, "loss": 0.252, "step": 3646 }, { "epoch": 2.444109898381633, "grad_norm": 0.22061382234096527, "learning_rate": 7.775297696414272e-05, "loss": 0.2138, "step": 3647 }, { "epoch": 2.444862627022958, "grad_norm": 0.21331921219825745, "learning_rate": 7.774013686181016e-05, "loss": 0.3548, "step": 3648 }, { "epoch": 2.445615355664283, "grad_norm": 0.22955384850502014, "learning_rate": 7.772729427265045e-05, "loss": 0.3325, "step": 3649 }, { "epoch": 2.446368084305608, "grad_norm": 0.20494921505451202, "learning_rate": 7.771444919806798e-05, "loss": 0.1804, "step": 3650 }, { "epoch": 2.447120812946933, "grad_norm": 0.27983590960502625, "learning_rate": 7.770160163946749e-05, "loss": 0.1777, "step": 3651 }, { "epoch": 2.4478735415882573, "grad_norm": 0.29870668053627014, "learning_rate": 7.768875159825399e-05, "loss": 0.279, "step": 3652 }, { "epoch": 2.4486262702295822, "grad_norm": 0.21428662538528442, "learning_rate": 7.767589907583274e-05, "loss": 0.1905, "step": 3653 }, { "epoch": 2.449378998870907, "grad_norm": 0.26995545625686646, "learning_rate": 7.766304407360924e-05, "loss": 0.261, "step": 3654 }, { "epoch": 2.4501317275122316, "grad_norm": 0.2637564241886139, "learning_rate": 7.765018659298933e-05, "loss": 0.1757, "step": 3655 }, { "epoch": 2.4508844561535565, "grad_norm": 0.23375459015369415, "learning_rate": 7.763732663537905e-05, "loss": 0.2478, "step": 3656 }, { "epoch": 2.4516371847948815, "grad_norm": 0.25322824716567993, "learning_rate": 7.762446420218476e-05, "loss": 0.2269, "step": 3657 }, { "epoch": 2.4523899134362064, "grad_norm": 0.19578297436237335, "learning_rate": 7.761159929481312e-05, "loss": 0.1745, "step": 3658 }, { "epoch": 2.4531426420775313, "grad_norm": 0.20845438539981842, "learning_rate": 7.759873191467094e-05, "loss": 0.3041, "step": 3659 }, { "epoch": 2.4538953707188558, "grad_norm": 0.21165019273757935, "learning_rate": 7.758586206316541e-05, "loss": 0.181, "step": 3660 }, { "epoch": 2.4546480993601807, "grad_norm": 0.20684681832790375, "learning_rate": 7.7572989741704e-05, "loss": 0.2014, "step": 3661 }, { "epoch": 2.4554008280015056, "grad_norm": 0.28627049922943115, "learning_rate": 7.756011495169435e-05, "loss": 0.3317, "step": 3662 }, { "epoch": 2.45615355664283, "grad_norm": 0.19719067215919495, "learning_rate": 7.754723769454443e-05, "loss": 0.2033, "step": 3663 }, { "epoch": 2.456906285284155, "grad_norm": 0.21146976947784424, "learning_rate": 7.753435797166252e-05, "loss": 0.1656, "step": 3664 }, { "epoch": 2.45765901392548, "grad_norm": 0.22395001351833344, "learning_rate": 7.75214757844571e-05, "loss": 0.1557, "step": 3665 }, { "epoch": 2.458411742566805, "grad_norm": 0.20422405004501343, "learning_rate": 7.750859113433693e-05, "loss": 0.1622, "step": 3666 }, { "epoch": 2.4591644712081293, "grad_norm": 0.22181221842765808, "learning_rate": 7.749570402271108e-05, "loss": 0.2223, "step": 3667 }, { "epoch": 2.4599171998494542, "grad_norm": 0.23564663529396057, "learning_rate": 7.748281445098886e-05, "loss": 0.2247, "step": 3668 }, { "epoch": 2.460669928490779, "grad_norm": 0.22702592611312866, "learning_rate": 7.746992242057986e-05, "loss": 0.2224, "step": 3669 }, { "epoch": 2.461422657132104, "grad_norm": 0.20692119002342224, "learning_rate": 7.745702793289393e-05, "loss": 0.2043, "step": 3670 }, { "epoch": 2.4621753857734285, "grad_norm": 0.24131622910499573, "learning_rate": 7.744413098934119e-05, "loss": 0.2472, "step": 3671 }, { "epoch": 2.4629281144147535, "grad_norm": 0.2142266035079956, "learning_rate": 7.743123159133202e-05, "loss": 0.2148, "step": 3672 }, { "epoch": 2.4636808430560784, "grad_norm": 0.21657690405845642, "learning_rate": 7.741832974027709e-05, "loss": 0.1558, "step": 3673 }, { "epoch": 2.464433571697403, "grad_norm": 0.24962307512760162, "learning_rate": 7.740542543758734e-05, "loss": 0.293, "step": 3674 }, { "epoch": 2.4651863003387278, "grad_norm": 0.22020789980888367, "learning_rate": 7.739251868467393e-05, "loss": 0.2862, "step": 3675 }, { "epoch": 2.4659390289800527, "grad_norm": 0.1988322138786316, "learning_rate": 7.737960948294834e-05, "loss": 0.234, "step": 3676 }, { "epoch": 2.4666917576213776, "grad_norm": 0.21606336534023285, "learning_rate": 7.736669783382233e-05, "loss": 0.2247, "step": 3677 }, { "epoch": 2.4674444862627025, "grad_norm": 0.20544560253620148, "learning_rate": 7.735378373870785e-05, "loss": 0.2143, "step": 3678 }, { "epoch": 2.468197214904027, "grad_norm": 0.19925644993782043, "learning_rate": 7.73408671990172e-05, "loss": 0.1699, "step": 3679 }, { "epoch": 2.468949943545352, "grad_norm": 0.19100408256053925, "learning_rate": 7.73279482161629e-05, "loss": 0.2566, "step": 3680 }, { "epoch": 2.469702672186677, "grad_norm": 0.19020560383796692, "learning_rate": 7.731502679155774e-05, "loss": 0.2113, "step": 3681 }, { "epoch": 2.4704554008280013, "grad_norm": 0.2072339951992035, "learning_rate": 7.730210292661482e-05, "loss": 0.1922, "step": 3682 }, { "epoch": 2.471208129469326, "grad_norm": 0.1930442750453949, "learning_rate": 7.728917662274745e-05, "loss": 0.1663, "step": 3683 }, { "epoch": 2.471960858110651, "grad_norm": 0.1725897341966629, "learning_rate": 7.727624788136922e-05, "loss": 0.1416, "step": 3684 }, { "epoch": 2.472713586751976, "grad_norm": 0.25980716943740845, "learning_rate": 7.726331670389404e-05, "loss": 0.2167, "step": 3685 }, { "epoch": 2.4734663153933005, "grad_norm": 0.2657911479473114, "learning_rate": 7.7250383091736e-05, "loss": 0.2872, "step": 3686 }, { "epoch": 2.4742190440346254, "grad_norm": 0.20042838156223297, "learning_rate": 7.723744704630952e-05, "loss": 0.2144, "step": 3687 }, { "epoch": 2.4749717726759504, "grad_norm": 0.24049535393714905, "learning_rate": 7.722450856902926e-05, "loss": 0.2924, "step": 3688 }, { "epoch": 2.4757245013172753, "grad_norm": 0.2111189216375351, "learning_rate": 7.721156766131017e-05, "loss": 0.2192, "step": 3689 }, { "epoch": 2.4764772299585998, "grad_norm": 0.23955978453159332, "learning_rate": 7.719862432456743e-05, "loss": 0.2646, "step": 3690 }, { "epoch": 2.4772299585999247, "grad_norm": 0.26805588603019714, "learning_rate": 7.718567856021651e-05, "loss": 0.2728, "step": 3691 }, { "epoch": 2.4779826872412496, "grad_norm": 0.2126118242740631, "learning_rate": 7.717273036967312e-05, "loss": 0.2167, "step": 3692 }, { "epoch": 2.4787354158825745, "grad_norm": 0.24336732923984528, "learning_rate": 7.715977975435329e-05, "loss": 0.1445, "step": 3693 }, { "epoch": 2.479488144523899, "grad_norm": 0.2286604344844818, "learning_rate": 7.714682671567326e-05, "loss": 0.197, "step": 3694 }, { "epoch": 2.480240873165224, "grad_norm": 0.23163644969463348, "learning_rate": 7.713387125504956e-05, "loss": 0.2439, "step": 3695 }, { "epoch": 2.480993601806549, "grad_norm": 0.3871770203113556, "learning_rate": 7.712091337389898e-05, "loss": 0.2378, "step": 3696 }, { "epoch": 2.4817463304478737, "grad_norm": 0.26879438757896423, "learning_rate": 7.710795307363857e-05, "loss": 0.1654, "step": 3697 }, { "epoch": 2.482499059089198, "grad_norm": 0.2778233289718628, "learning_rate": 7.709499035568563e-05, "loss": 0.2646, "step": 3698 }, { "epoch": 2.483251787730523, "grad_norm": 0.26235437393188477, "learning_rate": 7.70820252214578e-05, "loss": 0.2556, "step": 3699 }, { "epoch": 2.484004516371848, "grad_norm": 0.20623525977134705, "learning_rate": 7.706905767237288e-05, "loss": 0.1361, "step": 3700 }, { "epoch": 2.4847572450131725, "grad_norm": 0.22860655188560486, "learning_rate": 7.705608770984898e-05, "loss": 0.2201, "step": 3701 }, { "epoch": 2.4855099736544974, "grad_norm": 0.23013924062252045, "learning_rate": 7.70431153353045e-05, "loss": 0.1998, "step": 3702 }, { "epoch": 2.4862627022958224, "grad_norm": 0.1956527680158615, "learning_rate": 7.703014055015806e-05, "loss": 0.2048, "step": 3703 }, { "epoch": 2.4870154309371473, "grad_norm": 0.23374012112617493, "learning_rate": 7.701716335582856e-05, "loss": 0.2363, "step": 3704 }, { "epoch": 2.487768159578472, "grad_norm": 0.2300632894039154, "learning_rate": 7.70041837537352e-05, "loss": 0.1935, "step": 3705 }, { "epoch": 2.4885208882197967, "grad_norm": 0.18439806997776031, "learning_rate": 7.699120174529738e-05, "loss": 0.1537, "step": 3706 }, { "epoch": 2.4892736168611216, "grad_norm": 0.23806142807006836, "learning_rate": 7.69782173319348e-05, "loss": 0.19, "step": 3707 }, { "epoch": 2.4900263455024465, "grad_norm": 0.20737536251544952, "learning_rate": 7.696523051506742e-05, "loss": 0.2414, "step": 3708 }, { "epoch": 2.490779074143771, "grad_norm": 0.2335195243358612, "learning_rate": 7.695224129611546e-05, "loss": 0.3173, "step": 3709 }, { "epoch": 2.491531802785096, "grad_norm": 0.22369538247585297, "learning_rate": 7.69392496764994e-05, "loss": 0.2787, "step": 3710 }, { "epoch": 2.492284531426421, "grad_norm": 0.1837736964225769, "learning_rate": 7.692625565763996e-05, "loss": 0.2136, "step": 3711 }, { "epoch": 2.4930372600677457, "grad_norm": 0.25332826375961304, "learning_rate": 7.691325924095818e-05, "loss": 0.1997, "step": 3712 }, { "epoch": 2.49378998870907, "grad_norm": 0.21434617042541504, "learning_rate": 7.690026042787531e-05, "loss": 0.2061, "step": 3713 }, { "epoch": 2.494542717350395, "grad_norm": 0.21487776935100555, "learning_rate": 7.688725921981288e-05, "loss": 0.1987, "step": 3714 }, { "epoch": 2.49529544599172, "grad_norm": 0.1968320608139038, "learning_rate": 7.68742556181927e-05, "loss": 0.1778, "step": 3715 }, { "epoch": 2.496048174633045, "grad_norm": 0.23964406549930573, "learning_rate": 7.686124962443681e-05, "loss": 0.3346, "step": 3716 }, { "epoch": 2.4968009032743694, "grad_norm": 0.22431516647338867, "learning_rate": 7.684824123996754e-05, "loss": 0.1442, "step": 3717 }, { "epoch": 2.4975536319156943, "grad_norm": 0.19611038267612457, "learning_rate": 7.683523046620744e-05, "loss": 0.2195, "step": 3718 }, { "epoch": 2.4983063605570193, "grad_norm": 0.21121782064437866, "learning_rate": 7.682221730457936e-05, "loss": 0.1381, "step": 3719 }, { "epoch": 2.499059089198344, "grad_norm": 0.23984679579734802, "learning_rate": 7.680920175650643e-05, "loss": 0.1992, "step": 3720 }, { "epoch": 2.4998118178396687, "grad_norm": 0.2069931924343109, "learning_rate": 7.679618382341196e-05, "loss": 0.1766, "step": 3721 }, { "epoch": 2.5005645464809936, "grad_norm": 0.21220578253269196, "learning_rate": 7.678316350671961e-05, "loss": 0.2144, "step": 3722 }, { "epoch": 2.5013172751223185, "grad_norm": 0.22183552384376526, "learning_rate": 7.677014080785325e-05, "loss": 0.3146, "step": 3723 }, { "epoch": 2.5020700037636434, "grad_norm": 0.22808001935482025, "learning_rate": 7.675711572823702e-05, "loss": 0.2348, "step": 3724 }, { "epoch": 2.502822732404968, "grad_norm": 0.20788004994392395, "learning_rate": 7.674408826929534e-05, "loss": 0.2139, "step": 3725 }, { "epoch": 2.503575461046293, "grad_norm": 0.192410409450531, "learning_rate": 7.673105843245283e-05, "loss": 0.2138, "step": 3726 }, { "epoch": 2.5043281896876177, "grad_norm": 0.19615495204925537, "learning_rate": 7.671802621913447e-05, "loss": 0.2184, "step": 3727 }, { "epoch": 2.505080918328942, "grad_norm": 0.170883446931839, "learning_rate": 7.670499163076538e-05, "loss": 0.2264, "step": 3728 }, { "epoch": 2.505833646970267, "grad_norm": 0.24118447303771973, "learning_rate": 7.669195466877107e-05, "loss": 0.3699, "step": 3729 }, { "epoch": 2.506586375611592, "grad_norm": 0.21963229775428772, "learning_rate": 7.667891533457719e-05, "loss": 0.214, "step": 3730 }, { "epoch": 2.507339104252917, "grad_norm": 0.2486133873462677, "learning_rate": 7.666587362960973e-05, "loss": 0.3252, "step": 3731 }, { "epoch": 2.508091832894242, "grad_norm": 0.2235121726989746, "learning_rate": 7.665282955529491e-05, "loss": 0.2423, "step": 3732 }, { "epoch": 2.5088445615355663, "grad_norm": 0.2250540405511856, "learning_rate": 7.663978311305919e-05, "loss": 0.2165, "step": 3733 }, { "epoch": 2.5095972901768913, "grad_norm": 0.22325046360492706, "learning_rate": 7.662673430432935e-05, "loss": 0.1946, "step": 3734 }, { "epoch": 2.510350018818216, "grad_norm": 0.24403901398181915, "learning_rate": 7.661368313053235e-05, "loss": 0.1737, "step": 3735 }, { "epoch": 2.5111027474595407, "grad_norm": 0.20776696503162384, "learning_rate": 7.660062959309545e-05, "loss": 0.2261, "step": 3736 }, { "epoch": 2.5118554761008656, "grad_norm": 0.22763392329216003, "learning_rate": 7.658757369344617e-05, "loss": 0.1524, "step": 3737 }, { "epoch": 2.5126082047421905, "grad_norm": 0.2572057545185089, "learning_rate": 7.65745154330123e-05, "loss": 0.2203, "step": 3738 }, { "epoch": 2.5133609333835154, "grad_norm": 0.192934051156044, "learning_rate": 7.656145481322186e-05, "loss": 0.2173, "step": 3739 }, { "epoch": 2.51411366202484, "grad_norm": 0.1929541826248169, "learning_rate": 7.654839183550313e-05, "loss": 0.1783, "step": 3740 }, { "epoch": 2.514866390666165, "grad_norm": 0.23435384035110474, "learning_rate": 7.653532650128468e-05, "loss": 0.206, "step": 3741 }, { "epoch": 2.5156191193074897, "grad_norm": 0.18455259501934052, "learning_rate": 7.652225881199528e-05, "loss": 0.1617, "step": 3742 }, { "epoch": 2.5163718479488146, "grad_norm": 0.2881541848182678, "learning_rate": 7.650918876906403e-05, "loss": 0.2754, "step": 3743 }, { "epoch": 2.517124576590139, "grad_norm": 0.214601531624794, "learning_rate": 7.649611637392023e-05, "loss": 0.2244, "step": 3744 }, { "epoch": 2.517877305231464, "grad_norm": 0.17072993516921997, "learning_rate": 7.648304162799347e-05, "loss": 0.1298, "step": 3745 }, { "epoch": 2.518630033872789, "grad_norm": 0.23161496222019196, "learning_rate": 7.646996453271356e-05, "loss": 0.2271, "step": 3746 }, { "epoch": 2.5193827625141134, "grad_norm": 0.1923999935388565, "learning_rate": 7.645688508951062e-05, "loss": 0.1738, "step": 3747 }, { "epoch": 2.5201354911554383, "grad_norm": 0.26543140411376953, "learning_rate": 7.6443803299815e-05, "loss": 0.2891, "step": 3748 }, { "epoch": 2.5208882197967633, "grad_norm": 0.22059115767478943, "learning_rate": 7.643071916505726e-05, "loss": 0.2288, "step": 3749 }, { "epoch": 2.521640948438088, "grad_norm": 0.20722655951976776, "learning_rate": 7.641763268666831e-05, "loss": 0.2674, "step": 3750 }, { "epoch": 2.522393677079413, "grad_norm": 0.27529454231262207, "learning_rate": 7.640454386607924e-05, "loss": 0.2755, "step": 3751 }, { "epoch": 2.5231464057207376, "grad_norm": 0.2267308533191681, "learning_rate": 7.639145270472147e-05, "loss": 0.2223, "step": 3752 }, { "epoch": 2.5238991343620625, "grad_norm": 0.2728780210018158, "learning_rate": 7.637835920402656e-05, "loss": 0.3137, "step": 3753 }, { "epoch": 2.5246518630033874, "grad_norm": 0.29566875100135803, "learning_rate": 7.636526336542644e-05, "loss": 0.2321, "step": 3754 }, { "epoch": 2.525404591644712, "grad_norm": 0.24214419722557068, "learning_rate": 7.635216519035323e-05, "loss": 0.2259, "step": 3755 }, { "epoch": 2.526157320286037, "grad_norm": 0.2599342465400696, "learning_rate": 7.633906468023933e-05, "loss": 0.2412, "step": 3756 }, { "epoch": 2.5269100489273617, "grad_norm": 0.1871301531791687, "learning_rate": 7.632596183651744e-05, "loss": 0.1644, "step": 3757 }, { "epoch": 2.5276627775686866, "grad_norm": 0.28924760222435, "learning_rate": 7.63128566606204e-05, "loss": 0.1829, "step": 3758 }, { "epoch": 2.5284155062100115, "grad_norm": 0.25680986046791077, "learning_rate": 7.629974915398139e-05, "loss": 0.1853, "step": 3759 }, { "epoch": 2.529168234851336, "grad_norm": 0.20106446743011475, "learning_rate": 7.628663931803385e-05, "loss": 0.1743, "step": 3760 }, { "epoch": 2.529920963492661, "grad_norm": 0.24685335159301758, "learning_rate": 7.627352715421143e-05, "loss": 0.1658, "step": 3761 }, { "epoch": 2.530673692133986, "grad_norm": 0.1741587519645691, "learning_rate": 7.626041266394806e-05, "loss": 0.1175, "step": 3762 }, { "epoch": 2.5314264207753103, "grad_norm": 0.22208158671855927, "learning_rate": 7.624729584867794e-05, "loss": 0.213, "step": 3763 }, { "epoch": 2.5321791494166352, "grad_norm": 0.20096546411514282, "learning_rate": 7.623417670983549e-05, "loss": 0.1923, "step": 3764 }, { "epoch": 2.53293187805796, "grad_norm": 0.27819278836250305, "learning_rate": 7.622105524885539e-05, "loss": 0.2427, "step": 3765 }, { "epoch": 2.5336846066992846, "grad_norm": 0.24590706825256348, "learning_rate": 7.62079314671726e-05, "loss": 0.1947, "step": 3766 }, { "epoch": 2.5344373353406096, "grad_norm": 0.21119582653045654, "learning_rate": 7.619480536622232e-05, "loss": 0.2743, "step": 3767 }, { "epoch": 2.5351900639819345, "grad_norm": 0.24461963772773743, "learning_rate": 7.618167694743998e-05, "loss": 0.2036, "step": 3768 }, { "epoch": 2.5359427926232594, "grad_norm": 0.19228582084178925, "learning_rate": 7.61685462122613e-05, "loss": 0.2941, "step": 3769 }, { "epoch": 2.5366955212645843, "grad_norm": 0.25193044543266296, "learning_rate": 7.615541316212222e-05, "loss": 0.2055, "step": 3770 }, { "epoch": 2.537448249905909, "grad_norm": 0.21255388855934143, "learning_rate": 7.614227779845898e-05, "loss": 0.2331, "step": 3771 }, { "epoch": 2.5382009785472337, "grad_norm": 0.22163358330726624, "learning_rate": 7.6129140122708e-05, "loss": 0.204, "step": 3772 }, { "epoch": 2.5389537071885586, "grad_norm": 0.23526592552661896, "learning_rate": 7.611600013630605e-05, "loss": 0.184, "step": 3773 }, { "epoch": 2.539706435829883, "grad_norm": 0.20821405947208405, "learning_rate": 7.610285784069005e-05, "loss": 0.1998, "step": 3774 }, { "epoch": 2.540459164471208, "grad_norm": 0.22506867349147797, "learning_rate": 7.608971323729728e-05, "loss": 0.2737, "step": 3775 }, { "epoch": 2.541211893112533, "grad_norm": 0.21418273448944092, "learning_rate": 7.607656632756513e-05, "loss": 0.2874, "step": 3776 }, { "epoch": 2.541964621753858, "grad_norm": 0.2107906937599182, "learning_rate": 7.606341711293141e-05, "loss": 0.2387, "step": 3777 }, { "epoch": 2.5427173503951828, "grad_norm": 0.1954883486032486, "learning_rate": 7.605026559483404e-05, "loss": 0.2642, "step": 3778 }, { "epoch": 2.5434700790365072, "grad_norm": 0.19011136889457703, "learning_rate": 7.60371117747113e-05, "loss": 0.264, "step": 3779 }, { "epoch": 2.544222807677832, "grad_norm": 0.21257631480693817, "learning_rate": 7.602395565400164e-05, "loss": 0.3341, "step": 3780 }, { "epoch": 2.544975536319157, "grad_norm": 0.22884339094161987, "learning_rate": 7.601079723414379e-05, "loss": 0.168, "step": 3781 }, { "epoch": 2.5457282649604815, "grad_norm": 0.23676614463329315, "learning_rate": 7.599763651657675e-05, "loss": 0.1714, "step": 3782 }, { "epoch": 2.5464809936018065, "grad_norm": 0.4603070914745331, "learning_rate": 7.598447350273974e-05, "loss": 0.177, "step": 3783 }, { "epoch": 2.5472337222431314, "grad_norm": 0.4376383423805237, "learning_rate": 7.597130819407227e-05, "loss": 0.2256, "step": 3784 }, { "epoch": 2.5479864508844563, "grad_norm": 0.280752956867218, "learning_rate": 7.595814059201404e-05, "loss": 0.2385, "step": 3785 }, { "epoch": 2.548739179525781, "grad_norm": 0.2681352198123932, "learning_rate": 7.59449706980051e-05, "loss": 0.3021, "step": 3786 }, { "epoch": 2.5494919081671057, "grad_norm": 0.2456662505865097, "learning_rate": 7.593179851348563e-05, "loss": 0.1628, "step": 3787 }, { "epoch": 2.5502446368084306, "grad_norm": 0.2706602215766907, "learning_rate": 7.591862403989615e-05, "loss": 0.2561, "step": 3788 }, { "epoch": 2.5509973654497555, "grad_norm": 0.234159454703331, "learning_rate": 7.59054472786774e-05, "loss": 0.2314, "step": 3789 }, { "epoch": 2.55175009409108, "grad_norm": 0.2407209724187851, "learning_rate": 7.589226823127038e-05, "loss": 0.134, "step": 3790 }, { "epoch": 2.552502822732405, "grad_norm": 0.23127056658267975, "learning_rate": 7.587908689911629e-05, "loss": 0.1873, "step": 3791 }, { "epoch": 2.55325555137373, "grad_norm": 0.24240395426750183, "learning_rate": 7.586590328365666e-05, "loss": 0.1818, "step": 3792 }, { "epoch": 2.5540082800150543, "grad_norm": 0.23913376033306122, "learning_rate": 7.585271738633323e-05, "loss": 0.1938, "step": 3793 }, { "epoch": 2.5547610086563792, "grad_norm": 0.2414291650056839, "learning_rate": 7.583952920858795e-05, "loss": 0.2076, "step": 3794 }, { "epoch": 2.555513737297704, "grad_norm": 0.20781253278255463, "learning_rate": 7.582633875186311e-05, "loss": 0.1555, "step": 3795 }, { "epoch": 2.556266465939029, "grad_norm": 0.20794199407100677, "learning_rate": 7.581314601760118e-05, "loss": 0.1908, "step": 3796 }, { "epoch": 2.557019194580354, "grad_norm": 0.22092172503471375, "learning_rate": 7.579995100724487e-05, "loss": 0.2843, "step": 3797 }, { "epoch": 2.5577719232216785, "grad_norm": 0.21034294366836548, "learning_rate": 7.578675372223721e-05, "loss": 0.1958, "step": 3798 }, { "epoch": 2.5585246518630034, "grad_norm": 0.20932407677173615, "learning_rate": 7.57735541640214e-05, "loss": 0.2302, "step": 3799 }, { "epoch": 2.5592773805043283, "grad_norm": 0.17171363532543182, "learning_rate": 7.576035233404096e-05, "loss": 0.1172, "step": 3800 }, { "epoch": 2.5592773805043283, "eval_loss": 0.24481447041034698, "eval_runtime": 456.8333, "eval_samples_per_second": 21.073, "eval_steps_per_second": 0.659, "step": 3800 }, { "epoch": 2.5600301091456528, "grad_norm": 0.23643691837787628, "learning_rate": 7.574714823373958e-05, "loss": 0.2555, "step": 3801 }, { "epoch": 2.5607828377869777, "grad_norm": 0.2108502984046936, "learning_rate": 7.573394186456128e-05, "loss": 0.275, "step": 3802 }, { "epoch": 2.5615355664283026, "grad_norm": 0.18610846996307373, "learning_rate": 7.572073322795025e-05, "loss": 0.2354, "step": 3803 }, { "epoch": 2.5622882950696275, "grad_norm": 0.2016407996416092, "learning_rate": 7.5707522325351e-05, "loss": 0.2481, "step": 3804 }, { "epoch": 2.5630410237109524, "grad_norm": 0.21068565547466278, "learning_rate": 7.569430915820825e-05, "loss": 0.2823, "step": 3805 }, { "epoch": 2.563793752352277, "grad_norm": 0.16341310739517212, "learning_rate": 7.568109372796697e-05, "loss": 0.159, "step": 3806 }, { "epoch": 2.564546480993602, "grad_norm": 0.24071888625621796, "learning_rate": 7.566787603607234e-05, "loss": 0.2492, "step": 3807 }, { "epoch": 2.5652992096349267, "grad_norm": 0.1835266500711441, "learning_rate": 7.565465608396989e-05, "loss": 0.1602, "step": 3808 }, { "epoch": 2.566051938276251, "grad_norm": 0.19388772547245026, "learning_rate": 7.564143387310529e-05, "loss": 0.193, "step": 3809 }, { "epoch": 2.566804666917576, "grad_norm": 0.2250831574201584, "learning_rate": 7.562820940492453e-05, "loss": 0.1729, "step": 3810 }, { "epoch": 2.567557395558901, "grad_norm": 0.22259627282619476, "learning_rate": 7.561498268087378e-05, "loss": 0.2319, "step": 3811 }, { "epoch": 2.568310124200226, "grad_norm": 0.19448669254779816, "learning_rate": 7.560175370239955e-05, "loss": 0.228, "step": 3812 }, { "epoch": 2.569062852841551, "grad_norm": 0.22058871388435364, "learning_rate": 7.558852247094847e-05, "loss": 0.2299, "step": 3813 }, { "epoch": 2.5698155814828754, "grad_norm": 0.1889081746339798, "learning_rate": 7.557528898796757e-05, "loss": 0.1934, "step": 3814 }, { "epoch": 2.5705683101242003, "grad_norm": 0.2028103768825531, "learning_rate": 7.556205325490396e-05, "loss": 0.1586, "step": 3815 }, { "epoch": 2.571321038765525, "grad_norm": 0.19444605708122253, "learning_rate": 7.554881527320514e-05, "loss": 0.1021, "step": 3816 }, { "epoch": 2.5720737674068497, "grad_norm": 0.23077113926410675, "learning_rate": 7.553557504431879e-05, "loss": 0.1738, "step": 3817 }, { "epoch": 2.5728264960481746, "grad_norm": 0.18889518082141876, "learning_rate": 7.552233256969281e-05, "loss": 0.2762, "step": 3818 }, { "epoch": 2.5735792246894995, "grad_norm": 0.19928297400474548, "learning_rate": 7.550908785077539e-05, "loss": 0.174, "step": 3819 }, { "epoch": 2.574331953330824, "grad_norm": 0.18208296597003937, "learning_rate": 7.549584088901496e-05, "loss": 0.1765, "step": 3820 }, { "epoch": 2.575084681972149, "grad_norm": 0.186554953455925, "learning_rate": 7.54825916858602e-05, "loss": 0.177, "step": 3821 }, { "epoch": 2.575837410613474, "grad_norm": 0.20892009139060974, "learning_rate": 7.546934024275998e-05, "loss": 0.2469, "step": 3822 }, { "epoch": 2.5765901392547987, "grad_norm": 0.20249336957931519, "learning_rate": 7.545608656116353e-05, "loss": 0.1971, "step": 3823 }, { "epoch": 2.5773428678961237, "grad_norm": 0.17917561531066895, "learning_rate": 7.544283064252018e-05, "loss": 0.2, "step": 3824 }, { "epoch": 2.578095596537448, "grad_norm": 0.19737380743026733, "learning_rate": 7.542957248827961e-05, "loss": 0.2287, "step": 3825 }, { "epoch": 2.578848325178773, "grad_norm": 0.21617649495601654, "learning_rate": 7.54163120998917e-05, "loss": 0.2548, "step": 3826 }, { "epoch": 2.579601053820098, "grad_norm": 0.1898624151945114, "learning_rate": 7.540304947880661e-05, "loss": 0.1407, "step": 3827 }, { "epoch": 2.5803537824614224, "grad_norm": 0.17849871516227722, "learning_rate": 7.53897846264747e-05, "loss": 0.1162, "step": 3828 }, { "epoch": 2.5811065111027474, "grad_norm": 0.20294179022312164, "learning_rate": 7.537651754434659e-05, "loss": 0.2506, "step": 3829 }, { "epoch": 2.5818592397440723, "grad_norm": 0.17542365193367004, "learning_rate": 7.536324823387317e-05, "loss": 0.1939, "step": 3830 }, { "epoch": 2.582611968385397, "grad_norm": 0.221299946308136, "learning_rate": 7.534997669650553e-05, "loss": 0.306, "step": 3831 }, { "epoch": 2.583364697026722, "grad_norm": 0.18837092816829681, "learning_rate": 7.533670293369504e-05, "loss": 0.1789, "step": 3832 }, { "epoch": 2.5841174256680466, "grad_norm": 0.23823924362659454, "learning_rate": 7.53234269468933e-05, "loss": 0.258, "step": 3833 }, { "epoch": 2.5848701543093715, "grad_norm": 0.2641289234161377, "learning_rate": 7.531014873755214e-05, "loss": 0.2584, "step": 3834 }, { "epoch": 2.5856228829506964, "grad_norm": 0.23287619650363922, "learning_rate": 7.529686830712362e-05, "loss": 0.2, "step": 3835 }, { "epoch": 2.586375611592021, "grad_norm": 0.2110084444284439, "learning_rate": 7.528358565706014e-05, "loss": 0.2567, "step": 3836 }, { "epoch": 2.587128340233346, "grad_norm": 0.2074791043996811, "learning_rate": 7.52703007888142e-05, "loss": 0.2277, "step": 3837 }, { "epoch": 2.5878810688746707, "grad_norm": 0.22816972434520721, "learning_rate": 7.525701370383867e-05, "loss": 0.2427, "step": 3838 }, { "epoch": 2.5886337975159956, "grad_norm": 0.19277869164943695, "learning_rate": 7.524372440358655e-05, "loss": 0.2711, "step": 3839 }, { "epoch": 2.58938652615732, "grad_norm": 0.2287030816078186, "learning_rate": 7.523043288951118e-05, "loss": 0.2835, "step": 3840 }, { "epoch": 2.590139254798645, "grad_norm": 0.19028164446353912, "learning_rate": 7.521713916306609e-05, "loss": 0.1612, "step": 3841 }, { "epoch": 2.59089198343997, "grad_norm": 0.22501789033412933, "learning_rate": 7.520384322570503e-05, "loss": 0.2959, "step": 3842 }, { "epoch": 2.591644712081295, "grad_norm": 0.19877465069293976, "learning_rate": 7.519054507888209e-05, "loss": 0.2119, "step": 3843 }, { "epoch": 2.5923974407226194, "grad_norm": 0.2117142230272293, "learning_rate": 7.517724472405146e-05, "loss": 0.2002, "step": 3844 }, { "epoch": 2.5931501693639443, "grad_norm": 0.20207436382770538, "learning_rate": 7.516394216266769e-05, "loss": 0.1596, "step": 3845 }, { "epoch": 2.593902898005269, "grad_norm": 0.23256640136241913, "learning_rate": 7.515063739618554e-05, "loss": 0.202, "step": 3846 }, { "epoch": 2.5946556266465937, "grad_norm": 0.21981073915958405, "learning_rate": 7.513733042605996e-05, "loss": 0.1562, "step": 3847 }, { "epoch": 2.5954083552879186, "grad_norm": 0.20520026981830597, "learning_rate": 7.512402125374621e-05, "loss": 0.1973, "step": 3848 }, { "epoch": 2.5961610839292435, "grad_norm": 0.19443842768669128, "learning_rate": 7.511070988069975e-05, "loss": 0.2119, "step": 3849 }, { "epoch": 2.5969138125705684, "grad_norm": 0.18750935792922974, "learning_rate": 7.50973963083763e-05, "loss": 0.1948, "step": 3850 }, { "epoch": 2.5976665412118933, "grad_norm": 0.22693203389644623, "learning_rate": 7.508408053823179e-05, "loss": 0.1843, "step": 3851 }, { "epoch": 2.598419269853218, "grad_norm": 0.21885192394256592, "learning_rate": 7.507076257172245e-05, "loss": 0.2179, "step": 3852 }, { "epoch": 2.5991719984945427, "grad_norm": 0.20871855318546295, "learning_rate": 7.505744241030468e-05, "loss": 0.2136, "step": 3853 }, { "epoch": 2.5999247271358676, "grad_norm": 0.186286062002182, "learning_rate": 7.504412005543515e-05, "loss": 0.1652, "step": 3854 }, { "epoch": 2.600677455777192, "grad_norm": 0.24189011752605438, "learning_rate": 7.50307955085708e-05, "loss": 0.2396, "step": 3855 }, { "epoch": 2.601430184418517, "grad_norm": 0.1747843623161316, "learning_rate": 7.501746877116879e-05, "loss": 0.1765, "step": 3856 }, { "epoch": 2.602182913059842, "grad_norm": 0.23080894351005554, "learning_rate": 7.500413984468647e-05, "loss": 0.2078, "step": 3857 }, { "epoch": 2.602935641701167, "grad_norm": 0.24538524448871613, "learning_rate": 7.499080873058148e-05, "loss": 0.253, "step": 3858 }, { "epoch": 2.603688370342492, "grad_norm": 0.2452792227268219, "learning_rate": 7.497747543031172e-05, "loss": 0.2393, "step": 3859 }, { "epoch": 2.6044410989838163, "grad_norm": 0.1759076863527298, "learning_rate": 7.496413994533529e-05, "loss": 0.235, "step": 3860 }, { "epoch": 2.605193827625141, "grad_norm": 0.20822246372699738, "learning_rate": 7.495080227711051e-05, "loss": 0.1626, "step": 3861 }, { "epoch": 2.605946556266466, "grad_norm": 0.21709619462490082, "learning_rate": 7.493746242709601e-05, "loss": 0.2787, "step": 3862 }, { "epoch": 2.6066992849077906, "grad_norm": 0.22224190831184387, "learning_rate": 7.492412039675058e-05, "loss": 0.2229, "step": 3863 }, { "epoch": 2.6074520135491155, "grad_norm": 0.22325249016284943, "learning_rate": 7.49107761875333e-05, "loss": 0.1784, "step": 3864 }, { "epoch": 2.6082047421904404, "grad_norm": 0.21124683320522308, "learning_rate": 7.489742980090347e-05, "loss": 0.1734, "step": 3865 }, { "epoch": 2.608957470831765, "grad_norm": 0.23735783994197845, "learning_rate": 7.488408123832065e-05, "loss": 0.1549, "step": 3866 }, { "epoch": 2.60971019947309, "grad_norm": 0.20672984421253204, "learning_rate": 7.487073050124458e-05, "loss": 0.2139, "step": 3867 }, { "epoch": 2.6104629281144147, "grad_norm": 0.19651922583580017, "learning_rate": 7.48573775911353e-05, "loss": 0.1685, "step": 3868 }, { "epoch": 2.6112156567557396, "grad_norm": 0.19641168415546417, "learning_rate": 7.484402250945307e-05, "loss": 0.268, "step": 3869 }, { "epoch": 2.6119683853970646, "grad_norm": 0.19103099405765533, "learning_rate": 7.483066525765836e-05, "loss": 0.1695, "step": 3870 }, { "epoch": 2.612721114038389, "grad_norm": 0.23258911073207855, "learning_rate": 7.481730583721193e-05, "loss": 0.3312, "step": 3871 }, { "epoch": 2.613473842679714, "grad_norm": 0.19892513751983643, "learning_rate": 7.480394424957471e-05, "loss": 0.218, "step": 3872 }, { "epoch": 2.614226571321039, "grad_norm": 0.16738320887088776, "learning_rate": 7.479058049620794e-05, "loss": 0.1608, "step": 3873 }, { "epoch": 2.6149792999623633, "grad_norm": 0.20214539766311646, "learning_rate": 7.477721457857301e-05, "loss": 0.2942, "step": 3874 }, { "epoch": 2.6157320286036883, "grad_norm": 0.18163107335567474, "learning_rate": 7.476384649813167e-05, "loss": 0.2155, "step": 3875 }, { "epoch": 2.616484757245013, "grad_norm": 0.21525007486343384, "learning_rate": 7.475047625634576e-05, "loss": 0.1796, "step": 3876 }, { "epoch": 2.617237485886338, "grad_norm": 0.23423993587493896, "learning_rate": 7.473710385467748e-05, "loss": 0.2707, "step": 3877 }, { "epoch": 2.617990214527663, "grad_norm": 0.18947401642799377, "learning_rate": 7.472372929458919e-05, "loss": 0.1514, "step": 3878 }, { "epoch": 2.6187429431689875, "grad_norm": 0.18526117503643036, "learning_rate": 7.471035257754353e-05, "loss": 0.138, "step": 3879 }, { "epoch": 2.6194956718103124, "grad_norm": 0.20182232558727264, "learning_rate": 7.469697370500334e-05, "loss": 0.1465, "step": 3880 }, { "epoch": 2.6202484004516373, "grad_norm": 0.2033928483724594, "learning_rate": 7.46835926784317e-05, "loss": 0.183, "step": 3881 }, { "epoch": 2.621001129092962, "grad_norm": 0.24777594208717346, "learning_rate": 7.4670209499292e-05, "loss": 0.2493, "step": 3882 }, { "epoch": 2.6217538577342867, "grad_norm": 0.2020854651927948, "learning_rate": 7.465682416904772e-05, "loss": 0.2349, "step": 3883 }, { "epoch": 2.6225065863756116, "grad_norm": 0.18965643644332886, "learning_rate": 7.464343668916273e-05, "loss": 0.203, "step": 3884 }, { "epoch": 2.6232593150169365, "grad_norm": 0.20662979781627655, "learning_rate": 7.463004706110102e-05, "loss": 0.208, "step": 3885 }, { "epoch": 2.6240120436582615, "grad_norm": 0.19908560812473297, "learning_rate": 7.461665528632687e-05, "loss": 0.2619, "step": 3886 }, { "epoch": 2.624764772299586, "grad_norm": 0.2018863558769226, "learning_rate": 7.460326136630482e-05, "loss": 0.2406, "step": 3887 }, { "epoch": 2.625517500940911, "grad_norm": 0.19039562344551086, "learning_rate": 7.458986530249956e-05, "loss": 0.1553, "step": 3888 }, { "epoch": 2.6262702295822358, "grad_norm": 0.16675041615962982, "learning_rate": 7.457646709637609e-05, "loss": 0.2256, "step": 3889 }, { "epoch": 2.6270229582235602, "grad_norm": 0.22578659653663635, "learning_rate": 7.456306674939961e-05, "loss": 0.2647, "step": 3890 }, { "epoch": 2.627775686864885, "grad_norm": 0.20813728868961334, "learning_rate": 7.454966426303556e-05, "loss": 0.1665, "step": 3891 }, { "epoch": 2.62852841550621, "grad_norm": 0.18180251121520996, "learning_rate": 7.453625963874963e-05, "loss": 0.2082, "step": 3892 }, { "epoch": 2.6292811441475346, "grad_norm": 0.19454823434352875, "learning_rate": 7.45228528780077e-05, "loss": 0.137, "step": 3893 }, { "epoch": 2.6300338727888595, "grad_norm": 0.2083379477262497, "learning_rate": 7.450944398227595e-05, "loss": 0.2076, "step": 3894 }, { "epoch": 2.6307866014301844, "grad_norm": 0.19300007820129395, "learning_rate": 7.449603295302072e-05, "loss": 0.1832, "step": 3895 }, { "epoch": 2.6315393300715093, "grad_norm": 0.21803374588489532, "learning_rate": 7.448261979170865e-05, "loss": 0.2465, "step": 3896 }, { "epoch": 2.6322920587128342, "grad_norm": 0.18714262545108795, "learning_rate": 7.446920449980656e-05, "loss": 0.1718, "step": 3897 }, { "epoch": 2.6330447873541587, "grad_norm": 0.22734151780605316, "learning_rate": 7.445578707878155e-05, "loss": 0.19, "step": 3898 }, { "epoch": 2.6337975159954836, "grad_norm": 0.2575781047344208, "learning_rate": 7.44423675301009e-05, "loss": 0.3092, "step": 3899 }, { "epoch": 2.6345502446368085, "grad_norm": 0.23040412366390228, "learning_rate": 7.442894585523218e-05, "loss": 0.2464, "step": 3900 }, { "epoch": 2.635302973278133, "grad_norm": 0.2262657880783081, "learning_rate": 7.441552205564317e-05, "loss": 0.2639, "step": 3901 }, { "epoch": 2.636055701919458, "grad_norm": 0.2601373791694641, "learning_rate": 7.440209613280183e-05, "loss": 0.3336, "step": 3902 }, { "epoch": 2.636808430560783, "grad_norm": 0.21924567222595215, "learning_rate": 7.438866808817644e-05, "loss": 0.2158, "step": 3903 }, { "epoch": 2.6375611592021078, "grad_norm": 0.17857368290424347, "learning_rate": 7.437523792323545e-05, "loss": 0.1737, "step": 3904 }, { "epoch": 2.6383138878434327, "grad_norm": 0.22084935009479523, "learning_rate": 7.436180563944758e-05, "loss": 0.1751, "step": 3905 }, { "epoch": 2.639066616484757, "grad_norm": 0.20943158864974976, "learning_rate": 7.434837123828176e-05, "loss": 0.1848, "step": 3906 }, { "epoch": 2.639819345126082, "grad_norm": 0.20792077481746674, "learning_rate": 7.433493472120714e-05, "loss": 0.2153, "step": 3907 }, { "epoch": 2.640572073767407, "grad_norm": 0.24010708928108215, "learning_rate": 7.432149608969314e-05, "loss": 0.2886, "step": 3908 }, { "epoch": 2.6413248024087315, "grad_norm": 0.22815172374248505, "learning_rate": 7.430805534520937e-05, "loss": 0.2918, "step": 3909 }, { "epoch": 2.6420775310500564, "grad_norm": 0.2297409623861313, "learning_rate": 7.42946124892257e-05, "loss": 0.2144, "step": 3910 }, { "epoch": 2.6428302596913813, "grad_norm": 0.17343026399612427, "learning_rate": 7.42811675232122e-05, "loss": 0.1634, "step": 3911 }, { "epoch": 2.643582988332706, "grad_norm": 0.2728060781955719, "learning_rate": 7.426772044863922e-05, "loss": 0.2046, "step": 3912 }, { "epoch": 2.644335716974031, "grad_norm": 0.23055274784564972, "learning_rate": 7.42542712669773e-05, "loss": 0.2084, "step": 3913 }, { "epoch": 2.6450884456153556, "grad_norm": 0.2338220179080963, "learning_rate": 7.424081997969723e-05, "loss": 0.2445, "step": 3914 }, { "epoch": 2.6458411742566805, "grad_norm": 0.21371068060398102, "learning_rate": 7.422736658827001e-05, "loss": 0.2057, "step": 3915 }, { "epoch": 2.6465939028980054, "grad_norm": 0.22131125628948212, "learning_rate": 7.421391109416688e-05, "loss": 0.1696, "step": 3916 }, { "epoch": 2.64734663153933, "grad_norm": 0.22530117630958557, "learning_rate": 7.420045349885934e-05, "loss": 0.197, "step": 3917 }, { "epoch": 2.648099360180655, "grad_norm": 0.24999894201755524, "learning_rate": 7.418699380381905e-05, "loss": 0.2394, "step": 3918 }, { "epoch": 2.6488520888219798, "grad_norm": 0.26310476660728455, "learning_rate": 7.417353201051798e-05, "loss": 0.1821, "step": 3919 }, { "epoch": 2.6496048174633042, "grad_norm": 0.21669477224349976, "learning_rate": 7.416006812042828e-05, "loss": 0.186, "step": 3920 }, { "epoch": 2.650357546104629, "grad_norm": 0.23323671519756317, "learning_rate": 7.414660213502231e-05, "loss": 0.2297, "step": 3921 }, { "epoch": 2.651110274745954, "grad_norm": 0.177214115858078, "learning_rate": 7.413313405577275e-05, "loss": 0.2011, "step": 3922 }, { "epoch": 2.651863003387279, "grad_norm": 0.22797101736068726, "learning_rate": 7.41196638841524e-05, "loss": 0.1725, "step": 3923 }, { "epoch": 2.652615732028604, "grad_norm": 0.20133991539478302, "learning_rate": 7.410619162163436e-05, "loss": 0.2093, "step": 3924 }, { "epoch": 2.6533684606699284, "grad_norm": 0.20074687898159027, "learning_rate": 7.409271726969192e-05, "loss": 0.234, "step": 3925 }, { "epoch": 2.6541211893112533, "grad_norm": 0.2095559686422348, "learning_rate": 7.407924082979863e-05, "loss": 0.1959, "step": 3926 }, { "epoch": 2.654873917952578, "grad_norm": 0.18460752069950104, "learning_rate": 7.406576230342826e-05, "loss": 0.17, "step": 3927 }, { "epoch": 2.6556266465939027, "grad_norm": 0.2252594232559204, "learning_rate": 7.405228169205478e-05, "loss": 0.2077, "step": 3928 }, { "epoch": 2.6563793752352276, "grad_norm": 0.1777852326631546, "learning_rate": 7.40387989971524e-05, "loss": 0.1445, "step": 3929 }, { "epoch": 2.6571321038765525, "grad_norm": 0.19648633897304535, "learning_rate": 7.402531422019561e-05, "loss": 0.119, "step": 3930 }, { "epoch": 2.6578848325178774, "grad_norm": 0.21094046533107758, "learning_rate": 7.401182736265905e-05, "loss": 0.1553, "step": 3931 }, { "epoch": 2.6586375611592024, "grad_norm": 0.21077080070972443, "learning_rate": 7.399833842601764e-05, "loss": 0.3157, "step": 3932 }, { "epoch": 2.659390289800527, "grad_norm": 0.2153317779302597, "learning_rate": 7.39848474117465e-05, "loss": 0.1894, "step": 3933 }, { "epoch": 2.6601430184418517, "grad_norm": 0.22191748023033142, "learning_rate": 7.397135432132098e-05, "loss": 0.1593, "step": 3934 }, { "epoch": 2.6608957470831767, "grad_norm": 0.22513347864151, "learning_rate": 7.395785915621671e-05, "loss": 0.1938, "step": 3935 }, { "epoch": 2.661648475724501, "grad_norm": 0.19688525795936584, "learning_rate": 7.394436191790943e-05, "loss": 0.2969, "step": 3936 }, { "epoch": 2.662401204365826, "grad_norm": 0.2538803815841675, "learning_rate": 7.393086260787523e-05, "loss": 0.2297, "step": 3937 }, { "epoch": 2.663153933007151, "grad_norm": 0.22375214099884033, "learning_rate": 7.391736122759035e-05, "loss": 0.1925, "step": 3938 }, { "epoch": 2.6639066616484754, "grad_norm": 0.24104298651218414, "learning_rate": 7.39038577785313e-05, "loss": 0.1406, "step": 3939 }, { "epoch": 2.6646593902898004, "grad_norm": 0.1809138059616089, "learning_rate": 7.389035226217476e-05, "loss": 0.2512, "step": 3940 }, { "epoch": 2.6654121189311253, "grad_norm": 0.18753363192081451, "learning_rate": 7.387684467999771e-05, "loss": 0.1879, "step": 3941 }, { "epoch": 2.66616484757245, "grad_norm": 0.20786164700984955, "learning_rate": 7.386333503347731e-05, "loss": 0.1669, "step": 3942 }, { "epoch": 2.666917576213775, "grad_norm": 0.20969605445861816, "learning_rate": 7.384982332409096e-05, "loss": 0.1957, "step": 3943 }, { "epoch": 2.6676703048550996, "grad_norm": 0.18346580862998962, "learning_rate": 7.383630955331625e-05, "loss": 0.2425, "step": 3944 }, { "epoch": 2.6684230334964245, "grad_norm": 0.21008391678333282, "learning_rate": 7.382279372263105e-05, "loss": 0.1504, "step": 3945 }, { "epoch": 2.6691757621377494, "grad_norm": 0.19977979362010956, "learning_rate": 7.380927583351346e-05, "loss": 0.2169, "step": 3946 }, { "epoch": 2.669928490779074, "grad_norm": 0.2305300384759903, "learning_rate": 7.37957558874417e-05, "loss": 0.2476, "step": 3947 }, { "epoch": 2.670681219420399, "grad_norm": 0.2187517285346985, "learning_rate": 7.378223388589437e-05, "loss": 0.2394, "step": 3948 }, { "epoch": 2.6714339480617237, "grad_norm": 0.23824508488178253, "learning_rate": 7.376870983035015e-05, "loss": 0.1921, "step": 3949 }, { "epoch": 2.6721866767030487, "grad_norm": 0.2170172780752182, "learning_rate": 7.375518372228806e-05, "loss": 0.2847, "step": 3950 }, { "epoch": 2.6729394053443736, "grad_norm": 0.18001706898212433, "learning_rate": 7.374165556318728e-05, "loss": 0.1552, "step": 3951 }, { "epoch": 2.673692133985698, "grad_norm": 0.2160939872264862, "learning_rate": 7.372812535452724e-05, "loss": 0.2258, "step": 3952 }, { "epoch": 2.674444862627023, "grad_norm": 0.2025119662284851, "learning_rate": 7.371459309778754e-05, "loss": 0.2011, "step": 3953 }, { "epoch": 2.675197591268348, "grad_norm": 0.24110950529575348, "learning_rate": 7.370105879444809e-05, "loss": 0.2667, "step": 3954 }, { "epoch": 2.6759503199096724, "grad_norm": 0.20967668294906616, "learning_rate": 7.368752244598895e-05, "loss": 0.1701, "step": 3955 }, { "epoch": 2.6767030485509973, "grad_norm": 0.23871465027332306, "learning_rate": 7.367398405389047e-05, "loss": 0.249, "step": 3956 }, { "epoch": 2.677455777192322, "grad_norm": 0.2243688851594925, "learning_rate": 7.366044361963316e-05, "loss": 0.2149, "step": 3957 }, { "epoch": 2.678208505833647, "grad_norm": 0.1868487298488617, "learning_rate": 7.36469011446978e-05, "loss": 0.1641, "step": 3958 }, { "epoch": 2.678961234474972, "grad_norm": 0.1791503131389618, "learning_rate": 7.363335663056535e-05, "loss": 0.287, "step": 3959 }, { "epoch": 2.6797139631162965, "grad_norm": 0.20990873873233795, "learning_rate": 7.361981007871704e-05, "loss": 0.2465, "step": 3960 }, { "epoch": 2.6804666917576214, "grad_norm": 0.2018563449382782, "learning_rate": 7.360626149063431e-05, "loss": 0.277, "step": 3961 }, { "epoch": 2.6812194203989463, "grad_norm": 0.19716013967990875, "learning_rate": 7.359271086779878e-05, "loss": 0.1607, "step": 3962 }, { "epoch": 2.681972149040271, "grad_norm": 0.20272794365882874, "learning_rate": 7.357915821169234e-05, "loss": 0.2038, "step": 3963 }, { "epoch": 2.6827248776815957, "grad_norm": 0.1777925044298172, "learning_rate": 7.35656035237971e-05, "loss": 0.1377, "step": 3964 }, { "epoch": 2.6834776063229206, "grad_norm": 0.16366760432720184, "learning_rate": 7.355204680559537e-05, "loss": 0.1765, "step": 3965 }, { "epoch": 2.684230334964245, "grad_norm": 0.21724705398082733, "learning_rate": 7.353848805856971e-05, "loss": 0.2944, "step": 3966 }, { "epoch": 2.68498306360557, "grad_norm": 0.19669124484062195, "learning_rate": 7.352492728420284e-05, "loss": 0.2441, "step": 3967 }, { "epoch": 2.685735792246895, "grad_norm": 0.19303439557552338, "learning_rate": 7.351136448397778e-05, "loss": 0.3072, "step": 3968 }, { "epoch": 2.68648852088822, "grad_norm": 0.20942525565624237, "learning_rate": 7.349779965937775e-05, "loss": 0.2034, "step": 3969 }, { "epoch": 2.687241249529545, "grad_norm": 0.22114813327789307, "learning_rate": 7.348423281188616e-05, "loss": 0.3329, "step": 3970 }, { "epoch": 2.6879939781708693, "grad_norm": 0.19572381675243378, "learning_rate": 7.347066394298664e-05, "loss": 0.2977, "step": 3971 }, { "epoch": 2.688746706812194, "grad_norm": 0.2084863781929016, "learning_rate": 7.34570930541631e-05, "loss": 0.2757, "step": 3972 }, { "epoch": 2.689499435453519, "grad_norm": 0.17044022679328918, "learning_rate": 7.344352014689959e-05, "loss": 0.1721, "step": 3973 }, { "epoch": 2.6902521640948436, "grad_norm": 0.2236500084400177, "learning_rate": 7.342994522268046e-05, "loss": 0.1899, "step": 3974 }, { "epoch": 2.6910048927361685, "grad_norm": 0.1991288661956787, "learning_rate": 7.341636828299023e-05, "loss": 0.2403, "step": 3975 }, { "epoch": 2.6917576213774934, "grad_norm": 0.17177072167396545, "learning_rate": 7.340278932931366e-05, "loss": 0.2359, "step": 3976 }, { "epoch": 2.6925103500188183, "grad_norm": 0.20460401475429535, "learning_rate": 7.338920836313572e-05, "loss": 0.2771, "step": 3977 }, { "epoch": 2.6932630786601433, "grad_norm": 0.19695624709129333, "learning_rate": 7.33756253859416e-05, "loss": 0.1906, "step": 3978 }, { "epoch": 2.6940158073014677, "grad_norm": 0.19986043870449066, "learning_rate": 7.336204039921668e-05, "loss": 0.1976, "step": 3979 }, { "epoch": 2.6947685359427926, "grad_norm": 0.19343532621860504, "learning_rate": 7.334845340444667e-05, "loss": 0.1965, "step": 3980 }, { "epoch": 2.6955212645841176, "grad_norm": 0.2194896936416626, "learning_rate": 7.333486440311738e-05, "loss": 0.1733, "step": 3981 }, { "epoch": 2.696273993225442, "grad_norm": 0.21338285505771637, "learning_rate": 7.332127339671485e-05, "loss": 0.1867, "step": 3982 }, { "epoch": 2.697026721866767, "grad_norm": 0.1801009178161621, "learning_rate": 7.330768038672543e-05, "loss": 0.2015, "step": 3983 }, { "epoch": 2.697779450508092, "grad_norm": 0.20104457437992096, "learning_rate": 7.329408537463562e-05, "loss": 0.1909, "step": 3984 }, { "epoch": 2.698532179149417, "grad_norm": 0.22252696752548218, "learning_rate": 7.328048836193211e-05, "loss": 0.1784, "step": 3985 }, { "epoch": 2.6992849077907417, "grad_norm": 0.22085684537887573, "learning_rate": 7.32668893501019e-05, "loss": 0.1539, "step": 3986 }, { "epoch": 2.700037636432066, "grad_norm": 0.22593483328819275, "learning_rate": 7.325328834063214e-05, "loss": 0.2982, "step": 3987 }, { "epoch": 2.700790365073391, "grad_norm": 0.22753597795963287, "learning_rate": 7.323968533501019e-05, "loss": 0.1709, "step": 3988 }, { "epoch": 2.701543093714716, "grad_norm": 0.1894913762807846, "learning_rate": 7.322608033472368e-05, "loss": 0.2877, "step": 3989 }, { "epoch": 2.7022958223560405, "grad_norm": 0.2350333034992218, "learning_rate": 7.321247334126042e-05, "loss": 0.2672, "step": 3990 }, { "epoch": 2.7030485509973654, "grad_norm": 0.2190520167350769, "learning_rate": 7.319886435610847e-05, "loss": 0.2066, "step": 3991 }, { "epoch": 2.7038012796386903, "grad_norm": 0.22769001126289368, "learning_rate": 7.318525338075607e-05, "loss": 0.2564, "step": 3992 }, { "epoch": 2.704554008280015, "grad_norm": 0.20165081322193146, "learning_rate": 7.317164041669172e-05, "loss": 0.2104, "step": 3993 }, { "epoch": 2.7053067369213397, "grad_norm": 0.21682733297348022, "learning_rate": 7.315802546540408e-05, "loss": 0.1971, "step": 3994 }, { "epoch": 2.7060594655626646, "grad_norm": 0.2197018563747406, "learning_rate": 7.314440852838207e-05, "loss": 0.1771, "step": 3995 }, { "epoch": 2.7068121942039896, "grad_norm": 0.21024414896965027, "learning_rate": 7.313078960711483e-05, "loss": 0.2541, "step": 3996 }, { "epoch": 2.7075649228453145, "grad_norm": 0.18707334995269775, "learning_rate": 7.311716870309168e-05, "loss": 0.1346, "step": 3997 }, { "epoch": 2.708317651486639, "grad_norm": 0.22857263684272766, "learning_rate": 7.310354581780222e-05, "loss": 0.2189, "step": 3998 }, { "epoch": 2.709070380127964, "grad_norm": 0.21414758265018463, "learning_rate": 7.30899209527362e-05, "loss": 0.203, "step": 3999 }, { "epoch": 2.709823108769289, "grad_norm": 0.18185731768608093, "learning_rate": 7.307629410938363e-05, "loss": 0.2264, "step": 4000 }, { "epoch": 2.709823108769289, "eval_loss": 0.2342778444290161, "eval_runtime": 456.3005, "eval_samples_per_second": 21.098, "eval_steps_per_second": 0.66, "step": 4000 }, { "epoch": 2.7105758374106133, "grad_norm": 0.2187558263540268, "learning_rate": 7.306266528923471e-05, "loss": 0.3099, "step": 4001 }, { "epoch": 2.711328566051938, "grad_norm": 0.21113891899585724, "learning_rate": 7.304903449377987e-05, "loss": 0.2755, "step": 4002 }, { "epoch": 2.712081294693263, "grad_norm": 0.20582100749015808, "learning_rate": 7.303540172450976e-05, "loss": 0.1956, "step": 4003 }, { "epoch": 2.712834023334588, "grad_norm": 0.2102072685956955, "learning_rate": 7.302176698291521e-05, "loss": 0.1785, "step": 4004 }, { "epoch": 2.713586751975913, "grad_norm": 0.18530936539173126, "learning_rate": 7.300813027048734e-05, "loss": 0.1608, "step": 4005 }, { "epoch": 2.7143394806172374, "grad_norm": 0.18501073122024536, "learning_rate": 7.299449158871742e-05, "loss": 0.159, "step": 4006 }, { "epoch": 2.7150922092585623, "grad_norm": 0.20853127539157867, "learning_rate": 7.298085093909693e-05, "loss": 0.2372, "step": 4007 }, { "epoch": 2.7158449378998872, "grad_norm": 0.23979389667510986, "learning_rate": 7.296720832311765e-05, "loss": 0.1839, "step": 4008 }, { "epoch": 2.7165976665412117, "grad_norm": 0.19307631254196167, "learning_rate": 7.295356374227146e-05, "loss": 0.1205, "step": 4009 }, { "epoch": 2.7173503951825366, "grad_norm": 0.21617518365383148, "learning_rate": 7.293991719805053e-05, "loss": 0.2172, "step": 4010 }, { "epoch": 2.7181031238238615, "grad_norm": 0.22143112123012543, "learning_rate": 7.292626869194723e-05, "loss": 0.2467, "step": 4011 }, { "epoch": 2.7188558524651865, "grad_norm": 0.22397463023662567, "learning_rate": 7.291261822545416e-05, "loss": 0.2688, "step": 4012 }, { "epoch": 2.7196085811065114, "grad_norm": 0.23933494091033936, "learning_rate": 7.289896580006406e-05, "loss": 0.2524, "step": 4013 }, { "epoch": 2.720361309747836, "grad_norm": 0.20004895329475403, "learning_rate": 7.288531141726997e-05, "loss": 0.2215, "step": 4014 }, { "epoch": 2.7211140383891608, "grad_norm": 0.1942901611328125, "learning_rate": 7.287165507856512e-05, "loss": 0.1674, "step": 4015 }, { "epoch": 2.7218667670304857, "grad_norm": 0.19625058770179749, "learning_rate": 7.285799678544296e-05, "loss": 0.1859, "step": 4016 }, { "epoch": 2.72261949567181, "grad_norm": 0.21093742549419403, "learning_rate": 7.284433653939711e-05, "loss": 0.2505, "step": 4017 }, { "epoch": 2.723372224313135, "grad_norm": 0.1798355132341385, "learning_rate": 7.283067434192143e-05, "loss": 0.1756, "step": 4018 }, { "epoch": 2.72412495295446, "grad_norm": 0.2005462497472763, "learning_rate": 7.281701019451004e-05, "loss": 0.2354, "step": 4019 }, { "epoch": 2.7248776815957845, "grad_norm": 0.20067466795444489, "learning_rate": 7.280334409865717e-05, "loss": 0.235, "step": 4020 }, { "epoch": 2.7256304102371094, "grad_norm": 0.16921159625053406, "learning_rate": 7.278967605585738e-05, "loss": 0.2009, "step": 4021 }, { "epoch": 2.7263831388784343, "grad_norm": 0.21999390423297882, "learning_rate": 7.277600606760536e-05, "loss": 0.1887, "step": 4022 }, { "epoch": 2.7271358675197592, "grad_norm": 0.18909712135791779, "learning_rate": 7.276233413539602e-05, "loss": 0.1793, "step": 4023 }, { "epoch": 2.727888596161084, "grad_norm": 0.18333075940608978, "learning_rate": 7.274866026072457e-05, "loss": 0.2495, "step": 4024 }, { "epoch": 2.7286413248024086, "grad_norm": 0.18032582104206085, "learning_rate": 7.273498444508628e-05, "loss": 0.1979, "step": 4025 }, { "epoch": 2.7293940534437335, "grad_norm": 0.16857612133026123, "learning_rate": 7.272130668997677e-05, "loss": 0.1715, "step": 4026 }, { "epoch": 2.7301467820850585, "grad_norm": 0.21462611854076385, "learning_rate": 7.270762699689179e-05, "loss": 0.194, "step": 4027 }, { "epoch": 2.730899510726383, "grad_norm": 0.16639599204063416, "learning_rate": 7.269394536732735e-05, "loss": 0.1549, "step": 4028 }, { "epoch": 2.731652239367708, "grad_norm": 0.18595734238624573, "learning_rate": 7.268026180277967e-05, "loss": 0.2372, "step": 4029 }, { "epoch": 2.7324049680090328, "grad_norm": 0.1781461536884308, "learning_rate": 7.266657630474512e-05, "loss": 0.2544, "step": 4030 }, { "epoch": 2.7331576966503577, "grad_norm": 0.24928471446037292, "learning_rate": 7.265288887472035e-05, "loss": 0.219, "step": 4031 }, { "epoch": 2.7339104252916826, "grad_norm": 0.20164507627487183, "learning_rate": 7.263919951420219e-05, "loss": 0.1776, "step": 4032 }, { "epoch": 2.734663153933007, "grad_norm": 0.1791689246892929, "learning_rate": 7.262550822468769e-05, "loss": 0.2302, "step": 4033 }, { "epoch": 2.735415882574332, "grad_norm": 0.205842986702919, "learning_rate": 7.261181500767413e-05, "loss": 0.1951, "step": 4034 }, { "epoch": 2.736168611215657, "grad_norm": 0.19081637263298035, "learning_rate": 7.259811986465893e-05, "loss": 0.1664, "step": 4035 }, { "epoch": 2.7369213398569814, "grad_norm": 0.21601806581020355, "learning_rate": 7.258442279713981e-05, "loss": 0.1597, "step": 4036 }, { "epoch": 2.7376740684983063, "grad_norm": 0.21792374551296234, "learning_rate": 7.257072380661467e-05, "loss": 0.1915, "step": 4037 }, { "epoch": 2.738426797139631, "grad_norm": 0.21966631710529327, "learning_rate": 7.255702289458157e-05, "loss": 0.2128, "step": 4038 }, { "epoch": 2.7391795257809557, "grad_norm": 0.18238991498947144, "learning_rate": 7.254332006253885e-05, "loss": 0.2265, "step": 4039 }, { "epoch": 2.7399322544222806, "grad_norm": 0.191605344414711, "learning_rate": 7.252961531198501e-05, "loss": 0.3028, "step": 4040 }, { "epoch": 2.7406849830636055, "grad_norm": 0.20377182960510254, "learning_rate": 7.25159086444188e-05, "loss": 0.306, "step": 4041 }, { "epoch": 2.7414377117049304, "grad_norm": 0.2041080743074417, "learning_rate": 7.250220006133916e-05, "loss": 0.2557, "step": 4042 }, { "epoch": 2.7421904403462554, "grad_norm": 0.19815057516098022, "learning_rate": 7.248848956424524e-05, "loss": 0.2106, "step": 4043 }, { "epoch": 2.74294316898758, "grad_norm": 0.18480534851551056, "learning_rate": 7.24747771546364e-05, "loss": 0.2442, "step": 4044 }, { "epoch": 2.7436958976289048, "grad_norm": 0.23194289207458496, "learning_rate": 7.24610628340122e-05, "loss": 0.2533, "step": 4045 }, { "epoch": 2.7444486262702297, "grad_norm": 0.20626406371593475, "learning_rate": 7.244734660387241e-05, "loss": 0.241, "step": 4046 }, { "epoch": 2.745201354911554, "grad_norm": 0.1960395872592926, "learning_rate": 7.243362846571705e-05, "loss": 0.244, "step": 4047 }, { "epoch": 2.745954083552879, "grad_norm": 0.18449053168296814, "learning_rate": 7.241990842104628e-05, "loss": 0.1973, "step": 4048 }, { "epoch": 2.746706812194204, "grad_norm": 0.18600419163703918, "learning_rate": 7.24061864713605e-05, "loss": 0.1878, "step": 4049 }, { "epoch": 2.747459540835529, "grad_norm": 0.21788015961647034, "learning_rate": 7.239246261816035e-05, "loss": 0.2259, "step": 4050 }, { "epoch": 2.748212269476854, "grad_norm": 0.1896744668483734, "learning_rate": 7.237873686294665e-05, "loss": 0.1883, "step": 4051 }, { "epoch": 2.7489649981181783, "grad_norm": 0.2154119312763214, "learning_rate": 7.23650092072204e-05, "loss": 0.2451, "step": 4052 }, { "epoch": 2.749717726759503, "grad_norm": 0.1822153478860855, "learning_rate": 7.235127965248285e-05, "loss": 0.1628, "step": 4053 }, { "epoch": 2.750470455400828, "grad_norm": 0.21352434158325195, "learning_rate": 7.233754820023545e-05, "loss": 0.2196, "step": 4054 }, { "epoch": 2.7512231840421526, "grad_norm": 0.23711860179901123, "learning_rate": 7.232381485197984e-05, "loss": 0.2365, "step": 4055 }, { "epoch": 2.7519759126834775, "grad_norm": 0.19793111085891724, "learning_rate": 7.231007960921789e-05, "loss": 0.1741, "step": 4056 }, { "epoch": 2.7527286413248024, "grad_norm": 0.21732565760612488, "learning_rate": 7.229634247345166e-05, "loss": 0.198, "step": 4057 }, { "epoch": 2.7534813699661274, "grad_norm": 0.20762692391872406, "learning_rate": 7.228260344618341e-05, "loss": 0.258, "step": 4058 }, { "epoch": 2.7542340986074523, "grad_norm": 0.22617091238498688, "learning_rate": 7.226886252891563e-05, "loss": 0.2008, "step": 4059 }, { "epoch": 2.7549868272487767, "grad_norm": 0.22660255432128906, "learning_rate": 7.225511972315104e-05, "loss": 0.2442, "step": 4060 }, { "epoch": 2.7557395558901017, "grad_norm": 0.23736301064491272, "learning_rate": 7.224137503039247e-05, "loss": 0.2897, "step": 4061 }, { "epoch": 2.7564922845314266, "grad_norm": 0.21054567396640778, "learning_rate": 7.222762845214305e-05, "loss": 0.1867, "step": 4062 }, { "epoch": 2.757245013172751, "grad_norm": 0.21896779537200928, "learning_rate": 7.221387998990608e-05, "loss": 0.171, "step": 4063 }, { "epoch": 2.757997741814076, "grad_norm": 0.1918601244688034, "learning_rate": 7.220012964518508e-05, "loss": 0.2058, "step": 4064 }, { "epoch": 2.758750470455401, "grad_norm": 0.20985718071460724, "learning_rate": 7.218637741948376e-05, "loss": 0.1914, "step": 4065 }, { "epoch": 2.7595031990967254, "grad_norm": 0.19526509940624237, "learning_rate": 7.217262331430607e-05, "loss": 0.2167, "step": 4066 }, { "epoch": 2.7602559277380503, "grad_norm": 0.2240877002477646, "learning_rate": 7.21588673311561e-05, "loss": 0.2647, "step": 4067 }, { "epoch": 2.761008656379375, "grad_norm": 0.18200890719890594, "learning_rate": 7.214510947153817e-05, "loss": 0.1886, "step": 4068 }, { "epoch": 2.7617613850207, "grad_norm": 0.20760466158390045, "learning_rate": 7.213134973695687e-05, "loss": 0.1919, "step": 4069 }, { "epoch": 2.762514113662025, "grad_norm": 0.22027285397052765, "learning_rate": 7.211758812891692e-05, "loss": 0.1956, "step": 4070 }, { "epoch": 2.7632668423033495, "grad_norm": 0.24052369594573975, "learning_rate": 7.210382464892326e-05, "loss": 0.2381, "step": 4071 }, { "epoch": 2.7640195709446744, "grad_norm": 0.17994870245456696, "learning_rate": 7.209005929848107e-05, "loss": 0.1634, "step": 4072 }, { "epoch": 2.7647722995859993, "grad_norm": 0.20920665562152863, "learning_rate": 7.207629207909567e-05, "loss": 0.1912, "step": 4073 }, { "epoch": 2.765525028227324, "grad_norm": 0.21384982764720917, "learning_rate": 7.206252299227263e-05, "loss": 0.2382, "step": 4074 }, { "epoch": 2.7662777568686487, "grad_norm": 0.18836753070354462, "learning_rate": 7.204875203951774e-05, "loss": 0.1989, "step": 4075 }, { "epoch": 2.7670304855099737, "grad_norm": 0.19784097373485565, "learning_rate": 7.203497922233699e-05, "loss": 0.1644, "step": 4076 }, { "epoch": 2.7677832141512986, "grad_norm": 0.2121383547782898, "learning_rate": 7.202120454223648e-05, "loss": 0.1745, "step": 4077 }, { "epoch": 2.7685359427926235, "grad_norm": 0.2105753868818283, "learning_rate": 7.200742800072265e-05, "loss": 0.1698, "step": 4078 }, { "epoch": 2.769288671433948, "grad_norm": 0.22498154640197754, "learning_rate": 7.199364959930207e-05, "loss": 0.1964, "step": 4079 }, { "epoch": 2.770041400075273, "grad_norm": 0.19777818024158478, "learning_rate": 7.197986933948152e-05, "loss": 0.1465, "step": 4080 }, { "epoch": 2.770794128716598, "grad_norm": 0.19903530180454254, "learning_rate": 7.196608722276796e-05, "loss": 0.2071, "step": 4081 }, { "epoch": 2.7715468573579223, "grad_norm": 0.2055453360080719, "learning_rate": 7.195230325066864e-05, "loss": 0.1477, "step": 4082 }, { "epoch": 2.772299585999247, "grad_norm": 0.19637075066566467, "learning_rate": 7.193851742469093e-05, "loss": 0.2169, "step": 4083 }, { "epoch": 2.773052314640572, "grad_norm": 0.17449063062667847, "learning_rate": 7.19247297463424e-05, "loss": 0.2362, "step": 4084 }, { "epoch": 2.773805043281897, "grad_norm": 0.20932510495185852, "learning_rate": 7.191094021713087e-05, "loss": 0.3066, "step": 4085 }, { "epoch": 2.774557771923222, "grad_norm": 0.2086806744337082, "learning_rate": 7.189714883856435e-05, "loss": 0.1544, "step": 4086 }, { "epoch": 2.7753105005645464, "grad_norm": 0.18483279645442963, "learning_rate": 7.188335561215104e-05, "loss": 0.2054, "step": 4087 }, { "epoch": 2.7760632292058713, "grad_norm": 0.26411595940589905, "learning_rate": 7.186956053939933e-05, "loss": 0.2733, "step": 4088 }, { "epoch": 2.7768159578471963, "grad_norm": 0.21569685637950897, "learning_rate": 7.185576362181787e-05, "loss": 0.1915, "step": 4089 }, { "epoch": 2.7775686864885207, "grad_norm": 0.19978095591068268, "learning_rate": 7.184196486091541e-05, "loss": 0.2602, "step": 4090 }, { "epoch": 2.7783214151298457, "grad_norm": 0.24849672615528107, "learning_rate": 7.182816425820101e-05, "loss": 0.2186, "step": 4091 }, { "epoch": 2.7790741437711706, "grad_norm": 0.22288596630096436, "learning_rate": 7.181436181518388e-05, "loss": 0.1954, "step": 4092 }, { "epoch": 2.779826872412495, "grad_norm": 0.20979353785514832, "learning_rate": 7.180055753337342e-05, "loss": 0.1978, "step": 4093 }, { "epoch": 2.78057960105382, "grad_norm": 0.23210223019123077, "learning_rate": 7.178675141427923e-05, "loss": 0.2057, "step": 4094 }, { "epoch": 2.781332329695145, "grad_norm": 0.20510779321193695, "learning_rate": 7.177294345941116e-05, "loss": 0.216, "step": 4095 }, { "epoch": 2.78208505833647, "grad_norm": 0.21382887661457062, "learning_rate": 7.175913367027919e-05, "loss": 0.2003, "step": 4096 }, { "epoch": 2.7828377869777947, "grad_norm": 0.19189123809337616, "learning_rate": 7.174532204839357e-05, "loss": 0.1883, "step": 4097 }, { "epoch": 2.783590515619119, "grad_norm": 0.20297138392925262, "learning_rate": 7.173150859526472e-05, "loss": 0.2463, "step": 4098 }, { "epoch": 2.784343244260444, "grad_norm": 0.18362143635749817, "learning_rate": 7.171769331240323e-05, "loss": 0.3259, "step": 4099 }, { "epoch": 2.785095972901769, "grad_norm": 0.1970796138048172, "learning_rate": 7.170387620131993e-05, "loss": 0.1145, "step": 4100 }, { "epoch": 2.7858487015430935, "grad_norm": 0.21652311086654663, "learning_rate": 7.169005726352586e-05, "loss": 0.2225, "step": 4101 }, { "epoch": 2.7866014301844184, "grad_norm": 0.1909678429365158, "learning_rate": 7.167623650053221e-05, "loss": 0.1852, "step": 4102 }, { "epoch": 2.7873541588257433, "grad_norm": 0.19843624532222748, "learning_rate": 7.166241391385041e-05, "loss": 0.2531, "step": 4103 }, { "epoch": 2.7881068874670683, "grad_norm": 0.1771330088376999, "learning_rate": 7.164858950499208e-05, "loss": 0.263, "step": 4104 }, { "epoch": 2.788859616108393, "grad_norm": 0.2058352380990982, "learning_rate": 7.163476327546903e-05, "loss": 0.2604, "step": 4105 }, { "epoch": 2.7896123447497176, "grad_norm": 0.20137959718704224, "learning_rate": 7.162093522679327e-05, "loss": 0.1874, "step": 4106 }, { "epoch": 2.7903650733910426, "grad_norm": 0.211899995803833, "learning_rate": 7.160710536047704e-05, "loss": 0.265, "step": 4107 }, { "epoch": 2.7911178020323675, "grad_norm": 0.22711992263793945, "learning_rate": 7.159327367803273e-05, "loss": 0.1851, "step": 4108 }, { "epoch": 2.791870530673692, "grad_norm": 0.2048376053571701, "learning_rate": 7.157944018097294e-05, "loss": 0.1737, "step": 4109 }, { "epoch": 2.792623259315017, "grad_norm": 0.2647079527378082, "learning_rate": 7.156560487081053e-05, "loss": 0.2242, "step": 4110 }, { "epoch": 2.793375987956342, "grad_norm": 0.2319507598876953, "learning_rate": 7.155176774905845e-05, "loss": 0.2733, "step": 4111 }, { "epoch": 2.7941287165976667, "grad_norm": 0.2039070874452591, "learning_rate": 7.153792881722993e-05, "loss": 0.2016, "step": 4112 }, { "epoch": 2.794881445238991, "grad_norm": 0.20124702155590057, "learning_rate": 7.152408807683839e-05, "loss": 0.2429, "step": 4113 }, { "epoch": 2.795634173880316, "grad_norm": 0.21189774572849274, "learning_rate": 7.151024552939743e-05, "loss": 0.175, "step": 4114 }, { "epoch": 2.796386902521641, "grad_norm": 0.20332001149654388, "learning_rate": 7.149640117642084e-05, "loss": 0.2241, "step": 4115 }, { "epoch": 2.797139631162966, "grad_norm": 0.23892542719841003, "learning_rate": 7.14825550194226e-05, "loss": 0.2293, "step": 4116 }, { "epoch": 2.7978923598042904, "grad_norm": 0.2088896483182907, "learning_rate": 7.146870705991695e-05, "loss": 0.1159, "step": 4117 }, { "epoch": 2.7986450884456153, "grad_norm": 0.21131113171577454, "learning_rate": 7.145485729941826e-05, "loss": 0.2756, "step": 4118 }, { "epoch": 2.7993978170869402, "grad_norm": 0.23264238238334656, "learning_rate": 7.144100573944109e-05, "loss": 0.2017, "step": 4119 }, { "epoch": 2.8001505457282647, "grad_norm": 0.2415085881948471, "learning_rate": 7.142715238150026e-05, "loss": 0.1604, "step": 4120 }, { "epoch": 2.8009032743695896, "grad_norm": 0.18976320326328278, "learning_rate": 7.141329722711075e-05, "loss": 0.2335, "step": 4121 }, { "epoch": 2.8016560030109146, "grad_norm": 0.2006174772977829, "learning_rate": 7.139944027778774e-05, "loss": 0.1679, "step": 4122 }, { "epoch": 2.8024087316522395, "grad_norm": 0.22209542989730835, "learning_rate": 7.138558153504658e-05, "loss": 0.256, "step": 4123 }, { "epoch": 2.8031614602935644, "grad_norm": 0.20740917325019836, "learning_rate": 7.137172100040286e-05, "loss": 0.2291, "step": 4124 }, { "epoch": 2.803914188934889, "grad_norm": 0.21642014384269714, "learning_rate": 7.135785867537234e-05, "loss": 0.2706, "step": 4125 }, { "epoch": 2.804666917576214, "grad_norm": 0.20401348173618317, "learning_rate": 7.1343994561471e-05, "loss": 0.2382, "step": 4126 }, { "epoch": 2.8054196462175387, "grad_norm": 0.18512065708637238, "learning_rate": 7.133012866021499e-05, "loss": 0.1785, "step": 4127 }, { "epoch": 2.806172374858863, "grad_norm": 0.22699366509914398, "learning_rate": 7.131626097312067e-05, "loss": 0.1941, "step": 4128 }, { "epoch": 2.806925103500188, "grad_norm": 0.21471361815929413, "learning_rate": 7.130239150170455e-05, "loss": 0.2224, "step": 4129 }, { "epoch": 2.807677832141513, "grad_norm": 0.20011363923549652, "learning_rate": 7.128852024748344e-05, "loss": 0.2452, "step": 4130 }, { "epoch": 2.808430560782838, "grad_norm": 0.1881280392408371, "learning_rate": 7.127464721197422e-05, "loss": 0.2013, "step": 4131 }, { "epoch": 2.809183289424163, "grad_norm": 0.20967651903629303, "learning_rate": 7.126077239669407e-05, "loss": 0.2596, "step": 4132 }, { "epoch": 2.8099360180654873, "grad_norm": 0.19989483058452606, "learning_rate": 7.124689580316029e-05, "loss": 0.222, "step": 4133 }, { "epoch": 2.8106887467068122, "grad_norm": 0.20068137347698212, "learning_rate": 7.12330174328904e-05, "loss": 0.0987, "step": 4134 }, { "epoch": 2.811441475348137, "grad_norm": 0.19853109121322632, "learning_rate": 7.121913728740215e-05, "loss": 0.268, "step": 4135 }, { "epoch": 2.8121942039894616, "grad_norm": 0.2174428254365921, "learning_rate": 7.120525536821341e-05, "loss": 0.2038, "step": 4136 }, { "epoch": 2.8129469326307865, "grad_norm": 0.19836845993995667, "learning_rate": 7.119137167684233e-05, "loss": 0.273, "step": 4137 }, { "epoch": 2.8136996612721115, "grad_norm": 0.18627138435840607, "learning_rate": 7.117748621480717e-05, "loss": 0.1952, "step": 4138 }, { "epoch": 2.814452389913436, "grad_norm": 0.2441500425338745, "learning_rate": 7.116359898362643e-05, "loss": 0.2123, "step": 4139 }, { "epoch": 2.815205118554761, "grad_norm": 0.1788860559463501, "learning_rate": 7.114970998481883e-05, "loss": 0.1702, "step": 4140 }, { "epoch": 2.8159578471960858, "grad_norm": 0.18823952972888947, "learning_rate": 7.113581921990321e-05, "loss": 0.1198, "step": 4141 }, { "epoch": 2.8167105758374107, "grad_norm": 0.22817754745483398, "learning_rate": 7.112192669039868e-05, "loss": 0.13, "step": 4142 }, { "epoch": 2.8174633044787356, "grad_norm": 0.21559199690818787, "learning_rate": 7.110803239782448e-05, "loss": 0.1808, "step": 4143 }, { "epoch": 2.81821603312006, "grad_norm": 0.24065914750099182, "learning_rate": 7.109413634370006e-05, "loss": 0.2794, "step": 4144 }, { "epoch": 2.818968761761385, "grad_norm": 0.24360133707523346, "learning_rate": 7.108023852954509e-05, "loss": 0.2109, "step": 4145 }, { "epoch": 2.81972149040271, "grad_norm": 0.18222936987876892, "learning_rate": 7.106633895687942e-05, "loss": 0.2327, "step": 4146 }, { "epoch": 2.8204742190440344, "grad_norm": 0.2579769790172577, "learning_rate": 7.105243762722309e-05, "loss": 0.2262, "step": 4147 }, { "epoch": 2.8212269476853593, "grad_norm": 0.24795041978359222, "learning_rate": 7.103853454209628e-05, "loss": 0.2347, "step": 4148 }, { "epoch": 2.8219796763266842, "grad_norm": 0.22451961040496826, "learning_rate": 7.102462970301947e-05, "loss": 0.172, "step": 4149 }, { "epoch": 2.822732404968009, "grad_norm": 0.17986081540584564, "learning_rate": 7.101072311151324e-05, "loss": 0.1555, "step": 4150 }, { "epoch": 2.823485133609334, "grad_norm": 0.19475214183330536, "learning_rate": 7.09968147690984e-05, "loss": 0.2979, "step": 4151 }, { "epoch": 2.8242378622506585, "grad_norm": 0.23687666654586792, "learning_rate": 7.098290467729597e-05, "loss": 0.2355, "step": 4152 }, { "epoch": 2.8249905908919835, "grad_norm": 0.15531004965305328, "learning_rate": 7.09689928376271e-05, "loss": 0.1142, "step": 4153 }, { "epoch": 2.8257433195333084, "grad_norm": 0.21120527386665344, "learning_rate": 7.095507925161318e-05, "loss": 0.1751, "step": 4154 }, { "epoch": 2.826496048174633, "grad_norm": 0.21039408445358276, "learning_rate": 7.094116392077578e-05, "loss": 0.2005, "step": 4155 }, { "epoch": 2.8272487768159578, "grad_norm": 0.20630592107772827, "learning_rate": 7.092724684663666e-05, "loss": 0.2278, "step": 4156 }, { "epoch": 2.8280015054572827, "grad_norm": 0.23127156496047974, "learning_rate": 7.091332803071777e-05, "loss": 0.275, "step": 4157 }, { "epoch": 2.8287542340986076, "grad_norm": 0.23224374651908875, "learning_rate": 7.089940747454126e-05, "loss": 0.2353, "step": 4158 }, { "epoch": 2.8295069627399325, "grad_norm": 0.19060979783535004, "learning_rate": 7.088548517962945e-05, "loss": 0.1841, "step": 4159 }, { "epoch": 2.830259691381257, "grad_norm": 0.21244965493679047, "learning_rate": 7.087156114750487e-05, "loss": 0.1485, "step": 4160 }, { "epoch": 2.831012420022582, "grad_norm": 0.2060585618019104, "learning_rate": 7.08576353796902e-05, "loss": 0.1805, "step": 4161 }, { "epoch": 2.831765148663907, "grad_norm": 0.1818520575761795, "learning_rate": 7.084370787770838e-05, "loss": 0.1378, "step": 4162 }, { "epoch": 2.8325178773052313, "grad_norm": 0.20414479076862335, "learning_rate": 7.08297786430825e-05, "loss": 0.1689, "step": 4163 }, { "epoch": 2.833270605946556, "grad_norm": 0.1980145275592804, "learning_rate": 7.081584767733583e-05, "loss": 0.1868, "step": 4164 }, { "epoch": 2.834023334587881, "grad_norm": 0.17307540774345398, "learning_rate": 7.080191498199181e-05, "loss": 0.2322, "step": 4165 }, { "epoch": 2.8347760632292056, "grad_norm": 0.21632753312587738, "learning_rate": 7.078798055857414e-05, "loss": 0.2412, "step": 4166 }, { "epoch": 2.8355287918705305, "grad_norm": 0.2024582475423813, "learning_rate": 7.077404440860666e-05, "loss": 0.1395, "step": 4167 }, { "epoch": 2.8362815205118554, "grad_norm": 0.17951330542564392, "learning_rate": 7.076010653361339e-05, "loss": 0.2679, "step": 4168 }, { "epoch": 2.8370342491531804, "grad_norm": 0.18788619339466095, "learning_rate": 7.074616693511858e-05, "loss": 0.2281, "step": 4169 }, { "epoch": 2.8377869777945053, "grad_norm": 0.22668562829494476, "learning_rate": 7.073222561464661e-05, "loss": 0.2198, "step": 4170 }, { "epoch": 2.8385397064358298, "grad_norm": 0.2143806368112564, "learning_rate": 7.07182825737221e-05, "loss": 0.1685, "step": 4171 }, { "epoch": 2.8392924350771547, "grad_norm": 0.24012957513332367, "learning_rate": 7.070433781386985e-05, "loss": 0.2356, "step": 4172 }, { "epoch": 2.8400451637184796, "grad_norm": 0.206886887550354, "learning_rate": 7.069039133661481e-05, "loss": 0.2385, "step": 4173 }, { "epoch": 2.840797892359804, "grad_norm": 0.19245079159736633, "learning_rate": 7.06764431434822e-05, "loss": 0.1273, "step": 4174 }, { "epoch": 2.841550621001129, "grad_norm": 0.1881428211927414, "learning_rate": 7.06624932359973e-05, "loss": 0.1838, "step": 4175 }, { "epoch": 2.842303349642454, "grad_norm": 0.2520883083343506, "learning_rate": 7.064854161568571e-05, "loss": 0.27, "step": 4176 }, { "epoch": 2.843056078283779, "grad_norm": 0.23108357191085815, "learning_rate": 7.063458828407312e-05, "loss": 0.1762, "step": 4177 }, { "epoch": 2.8438088069251037, "grad_norm": 0.18131820857524872, "learning_rate": 7.062063324268549e-05, "loss": 0.1591, "step": 4178 }, { "epoch": 2.844561535566428, "grad_norm": 0.22046983242034912, "learning_rate": 7.060667649304886e-05, "loss": 0.2423, "step": 4179 }, { "epoch": 2.845314264207753, "grad_norm": 0.22487439215183258, "learning_rate": 7.059271803668956e-05, "loss": 0.2034, "step": 4180 }, { "epoch": 2.846066992849078, "grad_norm": 0.21439431607723236, "learning_rate": 7.057875787513407e-05, "loss": 0.3513, "step": 4181 }, { "epoch": 2.8468197214904025, "grad_norm": 0.21111531555652618, "learning_rate": 7.056479600990904e-05, "loss": 0.1699, "step": 4182 }, { "epoch": 2.8475724501317274, "grad_norm": 0.2042197585105896, "learning_rate": 7.055083244254131e-05, "loss": 0.1461, "step": 4183 }, { "epoch": 2.8483251787730524, "grad_norm": 0.23181144893169403, "learning_rate": 7.053686717455793e-05, "loss": 0.1339, "step": 4184 }, { "epoch": 2.8490779074143773, "grad_norm": 0.17638517916202545, "learning_rate": 7.052290020748612e-05, "loss": 0.1452, "step": 4185 }, { "epoch": 2.849830636055702, "grad_norm": 0.22687886655330658, "learning_rate": 7.050893154285327e-05, "loss": 0.2175, "step": 4186 }, { "epoch": 2.8505833646970267, "grad_norm": 0.1954822987318039, "learning_rate": 7.049496118218699e-05, "loss": 0.1869, "step": 4187 }, { "epoch": 2.8513360933383516, "grad_norm": 0.2145870178937912, "learning_rate": 7.048098912701507e-05, "loss": 0.2129, "step": 4188 }, { "epoch": 2.8520888219796765, "grad_norm": 0.19493263959884644, "learning_rate": 7.04670153788654e-05, "loss": 0.2132, "step": 4189 }, { "epoch": 2.852841550621001, "grad_norm": 0.18423865735530853, "learning_rate": 7.045303993926625e-05, "loss": 0.118, "step": 4190 }, { "epoch": 2.853594279262326, "grad_norm": 0.18757866322994232, "learning_rate": 7.043906280974584e-05, "loss": 0.1571, "step": 4191 }, { "epoch": 2.854347007903651, "grad_norm": 0.2169368416070938, "learning_rate": 7.042508399183274e-05, "loss": 0.1652, "step": 4192 }, { "epoch": 2.8550997365449753, "grad_norm": 0.20347028970718384, "learning_rate": 7.041110348705568e-05, "loss": 0.1117, "step": 4193 }, { "epoch": 2.8558524651863, "grad_norm": 0.21203577518463135, "learning_rate": 7.039712129694347e-05, "loss": 0.2466, "step": 4194 }, { "epoch": 2.856605193827625, "grad_norm": 0.31271040439605713, "learning_rate": 7.038313742302524e-05, "loss": 0.3532, "step": 4195 }, { "epoch": 2.85735792246895, "grad_norm": 0.22844557464122772, "learning_rate": 7.036915186683024e-05, "loss": 0.1808, "step": 4196 }, { "epoch": 2.858110651110275, "grad_norm": 0.2653273940086365, "learning_rate": 7.035516462988789e-05, "loss": 0.2709, "step": 4197 }, { "epoch": 2.8588633797515994, "grad_norm": 0.21930110454559326, "learning_rate": 7.034117571372781e-05, "loss": 0.2056, "step": 4198 }, { "epoch": 2.8596161083929243, "grad_norm": 0.25351089239120483, "learning_rate": 7.032718511987984e-05, "loss": 0.1698, "step": 4199 }, { "epoch": 2.8603688370342493, "grad_norm": 0.22725455462932587, "learning_rate": 7.031319284987394e-05, "loss": 0.242, "step": 4200 }, { "epoch": 2.8603688370342493, "eval_loss": 0.22624364495277405, "eval_runtime": 456.0222, "eval_samples_per_second": 21.111, "eval_steps_per_second": 0.66, "step": 4200 }, { "epoch": 2.8611215656755737, "grad_norm": 0.2514975070953369, "learning_rate": 7.02991989052403e-05, "loss": 0.1917, "step": 4201 }, { "epoch": 2.8618742943168987, "grad_norm": 0.22979161143302917, "learning_rate": 7.028520328750926e-05, "loss": 0.1858, "step": 4202 }, { "epoch": 2.8626270229582236, "grad_norm": 0.20565655827522278, "learning_rate": 7.027120599821137e-05, "loss": 0.1348, "step": 4203 }, { "epoch": 2.8633797515995485, "grad_norm": 0.22252504527568817, "learning_rate": 7.025720703887738e-05, "loss": 0.1993, "step": 4204 }, { "epoch": 2.8641324802408734, "grad_norm": 0.2249191403388977, "learning_rate": 7.024320641103812e-05, "loss": 0.2374, "step": 4205 }, { "epoch": 2.864885208882198, "grad_norm": 0.20734496414661407, "learning_rate": 7.022920411622476e-05, "loss": 0.2215, "step": 4206 }, { "epoch": 2.865637937523523, "grad_norm": 0.19036096334457397, "learning_rate": 7.021520015596849e-05, "loss": 0.1115, "step": 4207 }, { "epoch": 2.8663906661648477, "grad_norm": 0.2381809800863266, "learning_rate": 7.020119453180084e-05, "loss": 0.2065, "step": 4208 }, { "epoch": 2.867143394806172, "grad_norm": 0.2133529633283615, "learning_rate": 7.018718724525341e-05, "loss": 0.2325, "step": 4209 }, { "epoch": 2.867896123447497, "grad_norm": 0.1789158135652542, "learning_rate": 7.017317829785803e-05, "loss": 0.1981, "step": 4210 }, { "epoch": 2.868648852088822, "grad_norm": 0.23069550096988678, "learning_rate": 7.015916769114667e-05, "loss": 0.2543, "step": 4211 }, { "epoch": 2.8694015807301465, "grad_norm": 0.21497997641563416, "learning_rate": 7.014515542665152e-05, "loss": 0.2757, "step": 4212 }, { "epoch": 2.8701543093714714, "grad_norm": 0.20783580839633942, "learning_rate": 7.013114150590494e-05, "loss": 0.1836, "step": 4213 }, { "epoch": 2.8709070380127963, "grad_norm": 0.18959778547286987, "learning_rate": 7.011712593043948e-05, "loss": 0.1981, "step": 4214 }, { "epoch": 2.8716597666541213, "grad_norm": 0.2059171348810196, "learning_rate": 7.010310870178786e-05, "loss": 0.2493, "step": 4215 }, { "epoch": 2.872412495295446, "grad_norm": 0.17727530002593994, "learning_rate": 7.008908982148297e-05, "loss": 0.1968, "step": 4216 }, { "epoch": 2.8731652239367707, "grad_norm": 0.17751607298851013, "learning_rate": 7.00750692910579e-05, "loss": 0.2041, "step": 4217 }, { "epoch": 2.8739179525780956, "grad_norm": 0.1857912540435791, "learning_rate": 7.006104711204593e-05, "loss": 0.1883, "step": 4218 }, { "epoch": 2.8746706812194205, "grad_norm": 0.20768322050571442, "learning_rate": 7.00470232859805e-05, "loss": 0.2013, "step": 4219 }, { "epoch": 2.875423409860745, "grad_norm": 0.20268702507019043, "learning_rate": 7.00329978143952e-05, "loss": 0.2459, "step": 4220 }, { "epoch": 2.87617613850207, "grad_norm": 0.17254628241062164, "learning_rate": 7.001897069882389e-05, "loss": 0.2309, "step": 4221 }, { "epoch": 2.876928867143395, "grad_norm": 0.2276782989501953, "learning_rate": 7.00049419408005e-05, "loss": 0.1707, "step": 4222 }, { "epoch": 2.8776815957847197, "grad_norm": 0.19859950244426727, "learning_rate": 6.999091154185923e-05, "loss": 0.2044, "step": 4223 }, { "epoch": 2.8784343244260446, "grad_norm": 0.16999514400959015, "learning_rate": 6.99768795035344e-05, "loss": 0.1694, "step": 4224 }, { "epoch": 2.879187053067369, "grad_norm": 0.19033432006835938, "learning_rate": 6.996284582736056e-05, "loss": 0.2015, "step": 4225 }, { "epoch": 2.879939781708694, "grad_norm": 0.21768514811992645, "learning_rate": 6.99488105148724e-05, "loss": 0.1897, "step": 4226 }, { "epoch": 2.880692510350019, "grad_norm": 0.18653592467308044, "learning_rate": 6.993477356760479e-05, "loss": 0.1315, "step": 4227 }, { "epoch": 2.8814452389913434, "grad_norm": 0.22105886042118073, "learning_rate": 6.992073498709279e-05, "loss": 0.197, "step": 4228 }, { "epoch": 2.8821979676326683, "grad_norm": 0.21438047289848328, "learning_rate": 6.990669477487165e-05, "loss": 0.2337, "step": 4229 }, { "epoch": 2.8829506962739933, "grad_norm": 0.1777166873216629, "learning_rate": 6.989265293247678e-05, "loss": 0.112, "step": 4230 }, { "epoch": 2.883703424915318, "grad_norm": 0.19685626029968262, "learning_rate": 6.987860946144377e-05, "loss": 0.1879, "step": 4231 }, { "epoch": 2.884456153556643, "grad_norm": 0.21755346655845642, "learning_rate": 6.986456436330841e-05, "loss": 0.2506, "step": 4232 }, { "epoch": 2.8852088821979676, "grad_norm": 0.1735159307718277, "learning_rate": 6.985051763960664e-05, "loss": 0.1855, "step": 4233 }, { "epoch": 2.8859616108392925, "grad_norm": 0.2214490920305252, "learning_rate": 6.983646929187458e-05, "loss": 0.2038, "step": 4234 }, { "epoch": 2.8867143394806174, "grad_norm": 0.20045320689678192, "learning_rate": 6.982241932164855e-05, "loss": 0.2119, "step": 4235 }, { "epoch": 2.887467068121942, "grad_norm": 0.21021047234535217, "learning_rate": 6.980836773046504e-05, "loss": 0.2411, "step": 4236 }, { "epoch": 2.888219796763267, "grad_norm": 0.2053939700126648, "learning_rate": 6.979431451986067e-05, "loss": 0.1358, "step": 4237 }, { "epoch": 2.8889725254045917, "grad_norm": 0.2050567865371704, "learning_rate": 6.978025969137233e-05, "loss": 0.3071, "step": 4238 }, { "epoch": 2.889725254045916, "grad_norm": 0.22427614033222198, "learning_rate": 6.976620324653701e-05, "loss": 0.1472, "step": 4239 }, { "epoch": 2.890477982687241, "grad_norm": 0.2079395204782486, "learning_rate": 6.975214518689189e-05, "loss": 0.1893, "step": 4240 }, { "epoch": 2.891230711328566, "grad_norm": 0.1994532346725464, "learning_rate": 6.973808551397438e-05, "loss": 0.1876, "step": 4241 }, { "epoch": 2.891983439969891, "grad_norm": 0.2557363212108612, "learning_rate": 6.972402422932197e-05, "loss": 0.1946, "step": 4242 }, { "epoch": 2.892736168611216, "grad_norm": 0.20036454498767853, "learning_rate": 6.97099613344724e-05, "loss": 0.2141, "step": 4243 }, { "epoch": 2.8934888972525403, "grad_norm": 0.22082215547561646, "learning_rate": 6.969589683096361e-05, "loss": 0.2857, "step": 4244 }, { "epoch": 2.8942416258938652, "grad_norm": 0.19461220502853394, "learning_rate": 6.968183072033361e-05, "loss": 0.2261, "step": 4245 }, { "epoch": 2.89499435453519, "grad_norm": 0.17930714786052704, "learning_rate": 6.96677630041207e-05, "loss": 0.2696, "step": 4246 }, { "epoch": 2.8957470831765146, "grad_norm": 0.18807564675807953, "learning_rate": 6.965369368386326e-05, "loss": 0.1411, "step": 4247 }, { "epoch": 2.8964998118178396, "grad_norm": 0.23097383975982666, "learning_rate": 6.96396227610999e-05, "loss": 0.1971, "step": 4248 }, { "epoch": 2.8972525404591645, "grad_norm": 2.3850767612457275, "learning_rate": 6.962555023736943e-05, "loss": 0.1971, "step": 4249 }, { "epoch": 2.8980052691004894, "grad_norm": 0.23563244938850403, "learning_rate": 6.961147611421075e-05, "loss": 0.1808, "step": 4250 }, { "epoch": 2.8987579977418143, "grad_norm": 0.2515755593776703, "learning_rate": 6.959740039316303e-05, "loss": 0.192, "step": 4251 }, { "epoch": 2.899510726383139, "grad_norm": 0.22625257074832916, "learning_rate": 6.958332307576556e-05, "loss": 0.1846, "step": 4252 }, { "epoch": 2.9002634550244637, "grad_norm": 0.24821162223815918, "learning_rate": 6.956924416355779e-05, "loss": 0.2469, "step": 4253 }, { "epoch": 2.9010161836657886, "grad_norm": 0.23299270868301392, "learning_rate": 6.955516365807936e-05, "loss": 0.182, "step": 4254 }, { "epoch": 2.901768912307113, "grad_norm": 0.20775116980075836, "learning_rate": 6.954108156087015e-05, "loss": 0.1599, "step": 4255 }, { "epoch": 2.902521640948438, "grad_norm": 0.1956559121608734, "learning_rate": 6.95269978734701e-05, "loss": 0.2046, "step": 4256 }, { "epoch": 2.903274369589763, "grad_norm": 0.20665663480758667, "learning_rate": 6.951291259741942e-05, "loss": 0.1315, "step": 4257 }, { "epoch": 2.904027098231088, "grad_norm": 0.19699618220329285, "learning_rate": 6.949882573425843e-05, "loss": 0.2197, "step": 4258 }, { "epoch": 2.9047798268724128, "grad_norm": 0.2485676258802414, "learning_rate": 6.948473728552766e-05, "loss": 0.2627, "step": 4259 }, { "epoch": 2.9055325555137372, "grad_norm": 0.21098953485488892, "learning_rate": 6.947064725276779e-05, "loss": 0.2774, "step": 4260 }, { "epoch": 2.906285284155062, "grad_norm": 0.21947340667247772, "learning_rate": 6.945655563751971e-05, "loss": 0.2445, "step": 4261 }, { "epoch": 2.907038012796387, "grad_norm": 0.16476722061634064, "learning_rate": 6.944246244132443e-05, "loss": 0.1458, "step": 4262 }, { "epoch": 2.9077907414377115, "grad_norm": 0.2101641148328781, "learning_rate": 6.942836766572317e-05, "loss": 0.184, "step": 4263 }, { "epoch": 2.9085434700790365, "grad_norm": 0.19516828656196594, "learning_rate": 6.941427131225731e-05, "loss": 0.1944, "step": 4264 }, { "epoch": 2.9092961987203614, "grad_norm": 0.20434384047985077, "learning_rate": 6.940017338246841e-05, "loss": 0.3085, "step": 4265 }, { "epoch": 2.910048927361686, "grad_norm": 0.19359831511974335, "learning_rate": 6.938607387789823e-05, "loss": 0.1693, "step": 4266 }, { "epoch": 2.9108016560030108, "grad_norm": 0.21420453488826752, "learning_rate": 6.937197280008861e-05, "loss": 0.1595, "step": 4267 }, { "epoch": 2.9115543846443357, "grad_norm": 0.1992887705564499, "learning_rate": 6.935787015058165e-05, "loss": 0.1829, "step": 4268 }, { "epoch": 2.9123071132856606, "grad_norm": 0.19192397594451904, "learning_rate": 6.934376593091962e-05, "loss": 0.1576, "step": 4269 }, { "epoch": 2.9130598419269855, "grad_norm": 0.19630394876003265, "learning_rate": 6.932966014264491e-05, "loss": 0.2477, "step": 4270 }, { "epoch": 2.91381257056831, "grad_norm": 0.19048866629600525, "learning_rate": 6.93155527873001e-05, "loss": 0.0897, "step": 4271 }, { "epoch": 2.914565299209635, "grad_norm": 0.20200829207897186, "learning_rate": 6.930144386642794e-05, "loss": 0.2198, "step": 4272 }, { "epoch": 2.91531802785096, "grad_norm": 0.2042773962020874, "learning_rate": 6.92873333815714e-05, "loss": 0.2014, "step": 4273 }, { "epoch": 2.9160707564922843, "grad_norm": 0.19930243492126465, "learning_rate": 6.927322133427358e-05, "loss": 0.2275, "step": 4274 }, { "epoch": 2.9168234851336092, "grad_norm": 0.20853814482688904, "learning_rate": 6.92591077260777e-05, "loss": 0.2138, "step": 4275 }, { "epoch": 2.917576213774934, "grad_norm": 0.20521613955497742, "learning_rate": 6.924499255852726e-05, "loss": 0.301, "step": 4276 }, { "epoch": 2.918328942416259, "grad_norm": 0.1610574573278427, "learning_rate": 6.923087583316584e-05, "loss": 0.1874, "step": 4277 }, { "epoch": 2.919081671057584, "grad_norm": 0.20671460032463074, "learning_rate": 6.921675755153724e-05, "loss": 0.1348, "step": 4278 }, { "epoch": 2.9198343996989085, "grad_norm": 0.2094217836856842, "learning_rate": 6.92026377151854e-05, "loss": 0.2507, "step": 4279 }, { "epoch": 2.9205871283402334, "grad_norm": 0.17085964977741241, "learning_rate": 6.918851632565448e-05, "loss": 0.176, "step": 4280 }, { "epoch": 2.9213398569815583, "grad_norm": 0.22808057069778442, "learning_rate": 6.917439338448872e-05, "loss": 0.1737, "step": 4281 }, { "epoch": 2.9220925856228828, "grad_norm": 0.18153977394104004, "learning_rate": 6.916026889323261e-05, "loss": 0.1801, "step": 4282 }, { "epoch": 2.9228453142642077, "grad_norm": 0.2037578523159027, "learning_rate": 6.91461428534308e-05, "loss": 0.1984, "step": 4283 }, { "epoch": 2.9235980429055326, "grad_norm": 0.19902996718883514, "learning_rate": 6.913201526662806e-05, "loss": 0.1256, "step": 4284 }, { "epoch": 2.9243507715468575, "grad_norm": 0.17290227115154266, "learning_rate": 6.91178861343694e-05, "loss": 0.1744, "step": 4285 }, { "epoch": 2.9251035001881824, "grad_norm": 0.2184361219406128, "learning_rate": 6.910375545819992e-05, "loss": 0.1448, "step": 4286 }, { "epoch": 2.925856228829507, "grad_norm": 0.18413780629634857, "learning_rate": 6.908962323966494e-05, "loss": 0.2023, "step": 4287 }, { "epoch": 2.926608957470832, "grad_norm": 0.1854666918516159, "learning_rate": 6.907548948030997e-05, "loss": 0.1795, "step": 4288 }, { "epoch": 2.9273616861121567, "grad_norm": 0.21037957072257996, "learning_rate": 6.906135418168062e-05, "loss": 0.1861, "step": 4289 }, { "epoch": 2.928114414753481, "grad_norm": 0.19074971973896027, "learning_rate": 6.904721734532272e-05, "loss": 0.1395, "step": 4290 }, { "epoch": 2.928867143394806, "grad_norm": 0.204942986369133, "learning_rate": 6.903307897278224e-05, "loss": 0.1623, "step": 4291 }, { "epoch": 2.929619872036131, "grad_norm": 0.20939549803733826, "learning_rate": 6.901893906560536e-05, "loss": 0.2007, "step": 4292 }, { "epoch": 2.9303726006774555, "grad_norm": 0.1800510585308075, "learning_rate": 6.900479762533838e-05, "loss": 0.1544, "step": 4293 }, { "epoch": 2.9311253293187804, "grad_norm": 0.18257400393486023, "learning_rate": 6.899065465352776e-05, "loss": 0.1965, "step": 4294 }, { "epoch": 2.9318780579601054, "grad_norm": 0.21738429367542267, "learning_rate": 6.897651015172021e-05, "loss": 0.2377, "step": 4295 }, { "epoch": 2.9326307866014303, "grad_norm": 0.18889708817005157, "learning_rate": 6.896236412146254e-05, "loss": 0.259, "step": 4296 }, { "epoch": 2.933383515242755, "grad_norm": 0.22398008406162262, "learning_rate": 6.894821656430167e-05, "loss": 0.1551, "step": 4297 }, { "epoch": 2.9341362438840797, "grad_norm": 0.23659104108810425, "learning_rate": 6.893406748178484e-05, "loss": 0.2338, "step": 4298 }, { "epoch": 2.9348889725254046, "grad_norm": 0.21192416548728943, "learning_rate": 6.891991687545934e-05, "loss": 0.234, "step": 4299 }, { "epoch": 2.9356417011667295, "grad_norm": 0.2009848803281784, "learning_rate": 6.890576474687263e-05, "loss": 0.2329, "step": 4300 }, { "epoch": 2.936394429808054, "grad_norm": 0.20331114530563354, "learning_rate": 6.889161109757242e-05, "loss": 0.0893, "step": 4301 }, { "epoch": 2.937147158449379, "grad_norm": 0.18041284382343292, "learning_rate": 6.88774559291065e-05, "loss": 0.1453, "step": 4302 }, { "epoch": 2.937899887090704, "grad_norm": 0.22581720352172852, "learning_rate": 6.886329924302287e-05, "loss": 0.2503, "step": 4303 }, { "epoch": 2.9386526157320287, "grad_norm": 0.18021483719348907, "learning_rate": 6.884914104086968e-05, "loss": 0.151, "step": 4304 }, { "epoch": 2.9394053443733537, "grad_norm": 0.1905880719423294, "learning_rate": 6.883498132419524e-05, "loss": 0.1326, "step": 4305 }, { "epoch": 2.940158073014678, "grad_norm": 0.19561423361301422, "learning_rate": 6.882082009454804e-05, "loss": 0.2449, "step": 4306 }, { "epoch": 2.940910801656003, "grad_norm": 0.19352087378501892, "learning_rate": 6.880665735347674e-05, "loss": 0.222, "step": 4307 }, { "epoch": 2.941663530297328, "grad_norm": 0.2236386090517044, "learning_rate": 6.879249310253014e-05, "loss": 0.2083, "step": 4308 }, { "epoch": 2.9424162589386524, "grad_norm": 0.19785577058792114, "learning_rate": 6.877832734325725e-05, "loss": 0.1708, "step": 4309 }, { "epoch": 2.9431689875799774, "grad_norm": 0.19746166467666626, "learning_rate": 6.876416007720718e-05, "loss": 0.1262, "step": 4310 }, { "epoch": 2.9439217162213023, "grad_norm": 0.20306934416294098, "learning_rate": 6.874999130592927e-05, "loss": 0.1274, "step": 4311 }, { "epoch": 2.9446744448626267, "grad_norm": 0.20691721141338348, "learning_rate": 6.873582103097298e-05, "loss": 0.1904, "step": 4312 }, { "epoch": 2.9454271735039517, "grad_norm": 0.20971684157848358, "learning_rate": 6.872164925388795e-05, "loss": 0.1561, "step": 4313 }, { "epoch": 2.9461799021452766, "grad_norm": 0.1893453150987625, "learning_rate": 6.870747597622399e-05, "loss": 0.263, "step": 4314 }, { "epoch": 2.9469326307866015, "grad_norm": 0.18577218055725098, "learning_rate": 6.869330119953108e-05, "loss": 0.1637, "step": 4315 }, { "epoch": 2.9476853594279264, "grad_norm": 0.20406261086463928, "learning_rate": 6.867912492535931e-05, "loss": 0.1533, "step": 4316 }, { "epoch": 2.948438088069251, "grad_norm": 0.17519520223140717, "learning_rate": 6.866494715525901e-05, "loss": 0.1841, "step": 4317 }, { "epoch": 2.949190816710576, "grad_norm": 0.20746609568595886, "learning_rate": 6.865076789078067e-05, "loss": 0.183, "step": 4318 }, { "epoch": 2.9499435453519007, "grad_norm": 0.2882871925830841, "learning_rate": 6.863658713347484e-05, "loss": 0.3021, "step": 4319 }, { "epoch": 2.950696273993225, "grad_norm": 0.17758572101593018, "learning_rate": 6.862240488489234e-05, "loss": 0.1737, "step": 4320 }, { "epoch": 2.95144900263455, "grad_norm": 0.22803421318531036, "learning_rate": 6.860822114658415e-05, "loss": 0.1489, "step": 4321 }, { "epoch": 2.952201731275875, "grad_norm": 0.2170223742723465, "learning_rate": 6.859403592010133e-05, "loss": 0.1941, "step": 4322 }, { "epoch": 2.9529544599172, "grad_norm": 0.19303452968597412, "learning_rate": 6.857984920699519e-05, "loss": 0.1589, "step": 4323 }, { "epoch": 2.953707188558525, "grad_norm": 0.2155197411775589, "learning_rate": 6.856566100881716e-05, "loss": 0.1489, "step": 4324 }, { "epoch": 2.9544599171998494, "grad_norm": 0.23057647049427032, "learning_rate": 6.855147132711884e-05, "loss": 0.2521, "step": 4325 }, { "epoch": 2.9552126458411743, "grad_norm": 0.22432270646095276, "learning_rate": 6.853728016345199e-05, "loss": 0.1774, "step": 4326 }, { "epoch": 2.955965374482499, "grad_norm": 0.20261302590370178, "learning_rate": 6.852308751936852e-05, "loss": 0.2171, "step": 4327 }, { "epoch": 2.9567181031238237, "grad_norm": 0.19898667931556702, "learning_rate": 6.850889339642055e-05, "loss": 0.2247, "step": 4328 }, { "epoch": 2.9574708317651486, "grad_norm": 0.20822754502296448, "learning_rate": 6.849469779616029e-05, "loss": 0.2234, "step": 4329 }, { "epoch": 2.9582235604064735, "grad_norm": 0.20489035546779633, "learning_rate": 6.848050072014018e-05, "loss": 0.1597, "step": 4330 }, { "epoch": 2.9589762890477984, "grad_norm": 0.16833214461803436, "learning_rate": 6.84663021699128e-05, "loss": 0.0816, "step": 4331 }, { "epoch": 2.9597290176891233, "grad_norm": 0.21276375651359558, "learning_rate": 6.845210214703081e-05, "loss": 0.259, "step": 4332 }, { "epoch": 2.960481746330448, "grad_norm": 0.21217520534992218, "learning_rate": 6.843790065304719e-05, "loss": 0.1058, "step": 4333 }, { "epoch": 2.9612344749717727, "grad_norm": 0.22389431297779083, "learning_rate": 6.842369768951496e-05, "loss": 0.1891, "step": 4334 }, { "epoch": 2.9619872036130976, "grad_norm": 0.20761743187904358, "learning_rate": 6.840949325798731e-05, "loss": 0.2178, "step": 4335 }, { "epoch": 2.962739932254422, "grad_norm": 0.2229475975036621, "learning_rate": 6.839528736001766e-05, "loss": 0.1866, "step": 4336 }, { "epoch": 2.963492660895747, "grad_norm": 0.17950376868247986, "learning_rate": 6.838107999715951e-05, "loss": 0.1883, "step": 4337 }, { "epoch": 2.964245389537072, "grad_norm": 0.21085020899772644, "learning_rate": 6.836687117096657e-05, "loss": 0.1594, "step": 4338 }, { "epoch": 2.9649981181783964, "grad_norm": 0.24854303896427155, "learning_rate": 6.83526608829927e-05, "loss": 0.1247, "step": 4339 }, { "epoch": 2.9657508468197213, "grad_norm": 0.19233910739421844, "learning_rate": 6.833844913479192e-05, "loss": 0.1993, "step": 4340 }, { "epoch": 2.9665035754610463, "grad_norm": 0.20644725859165192, "learning_rate": 6.832423592791839e-05, "loss": 0.1971, "step": 4341 }, { "epoch": 2.967256304102371, "grad_norm": 0.1710130125284195, "learning_rate": 6.831002126392645e-05, "loss": 0.1489, "step": 4342 }, { "epoch": 2.968009032743696, "grad_norm": 0.19910001754760742, "learning_rate": 6.829580514437059e-05, "loss": 0.3099, "step": 4343 }, { "epoch": 2.9687617613850206, "grad_norm": 0.17538632452487946, "learning_rate": 6.828158757080548e-05, "loss": 0.178, "step": 4344 }, { "epoch": 2.9695144900263455, "grad_norm": 0.21650297939777374, "learning_rate": 6.826736854478592e-05, "loss": 0.159, "step": 4345 }, { "epoch": 2.9702672186676704, "grad_norm": 0.20496520400047302, "learning_rate": 6.825314806786687e-05, "loss": 0.1989, "step": 4346 }, { "epoch": 2.971019947308995, "grad_norm": 0.2000892013311386, "learning_rate": 6.82389261416035e-05, "loss": 0.2418, "step": 4347 }, { "epoch": 2.97177267595032, "grad_norm": 0.19121423363685608, "learning_rate": 6.822470276755104e-05, "loss": 0.2374, "step": 4348 }, { "epoch": 2.9725254045916447, "grad_norm": 0.1974058449268341, "learning_rate": 6.821047794726499e-05, "loss": 0.2626, "step": 4349 }, { "epoch": 2.9732781332329696, "grad_norm": 0.1865338236093521, "learning_rate": 6.819625168230093e-05, "loss": 0.1847, "step": 4350 }, { "epoch": 2.9740308618742946, "grad_norm": 0.1751868575811386, "learning_rate": 6.818202397421461e-05, "loss": 0.1481, "step": 4351 }, { "epoch": 2.974783590515619, "grad_norm": 0.18907558917999268, "learning_rate": 6.816779482456197e-05, "loss": 0.1846, "step": 4352 }, { "epoch": 2.975536319156944, "grad_norm": 0.18928150832653046, "learning_rate": 6.81535642348991e-05, "loss": 0.1487, "step": 4353 }, { "epoch": 2.976289047798269, "grad_norm": 0.20386208593845367, "learning_rate": 6.813933220678222e-05, "loss": 0.223, "step": 4354 }, { "epoch": 2.9770417764395933, "grad_norm": 0.20005494356155396, "learning_rate": 6.812509874176772e-05, "loss": 0.2486, "step": 4355 }, { "epoch": 2.9777945050809183, "grad_norm": 0.20799283683300018, "learning_rate": 6.811086384141216e-05, "loss": 0.2959, "step": 4356 }, { "epoch": 2.978547233722243, "grad_norm": 0.18755221366882324, "learning_rate": 6.809662750727222e-05, "loss": 0.1303, "step": 4357 }, { "epoch": 2.979299962363568, "grad_norm": 0.18971297144889832, "learning_rate": 6.808238974090482e-05, "loss": 0.1288, "step": 4358 }, { "epoch": 2.980052691004893, "grad_norm": 0.20005005598068237, "learning_rate": 6.806815054386694e-05, "loss": 0.198, "step": 4359 }, { "epoch": 2.9808054196462175, "grad_norm": 0.20786207914352417, "learning_rate": 6.805390991771577e-05, "loss": 0.1851, "step": 4360 }, { "epoch": 2.9815581482875424, "grad_norm": 0.18755866587162018, "learning_rate": 6.80396678640086e-05, "loss": 0.2672, "step": 4361 }, { "epoch": 2.9823108769288673, "grad_norm": 0.1920134425163269, "learning_rate": 6.802542438430298e-05, "loss": 0.257, "step": 4362 }, { "epoch": 2.983063605570192, "grad_norm": 0.19270747900009155, "learning_rate": 6.801117948015655e-05, "loss": 0.1166, "step": 4363 }, { "epoch": 2.9838163342115167, "grad_norm": 0.17651857435703278, "learning_rate": 6.799693315312707e-05, "loss": 0.1549, "step": 4364 }, { "epoch": 2.9845690628528416, "grad_norm": 0.18895721435546875, "learning_rate": 6.798268540477253e-05, "loss": 0.2275, "step": 4365 }, { "epoch": 2.985321791494166, "grad_norm": 0.18161794543266296, "learning_rate": 6.796843623665103e-05, "loss": 0.2322, "step": 4366 }, { "epoch": 2.986074520135491, "grad_norm": 0.19175222516059875, "learning_rate": 6.795418565032083e-05, "loss": 0.1586, "step": 4367 }, { "epoch": 2.986827248776816, "grad_norm": 0.22736939787864685, "learning_rate": 6.793993364734038e-05, "loss": 0.2057, "step": 4368 }, { "epoch": 2.987579977418141, "grad_norm": 0.1939079314470291, "learning_rate": 6.792568022926826e-05, "loss": 0.1404, "step": 4369 }, { "epoch": 2.9883327060594658, "grad_norm": 0.19247670471668243, "learning_rate": 6.791142539766315e-05, "loss": 0.1745, "step": 4370 }, { "epoch": 2.9890854347007902, "grad_norm": 0.19553911685943604, "learning_rate": 6.789716915408397e-05, "loss": 0.1254, "step": 4371 }, { "epoch": 2.989838163342115, "grad_norm": 0.1829730123281479, "learning_rate": 6.788291150008977e-05, "loss": 0.1776, "step": 4372 }, { "epoch": 2.99059089198344, "grad_norm": 0.2137947529554367, "learning_rate": 6.786865243723974e-05, "loss": 0.2485, "step": 4373 }, { "epoch": 2.9913436206247646, "grad_norm": 0.19727857410907745, "learning_rate": 6.785439196709324e-05, "loss": 0.2292, "step": 4374 }, { "epoch": 2.9920963492660895, "grad_norm": 0.2025640904903412, "learning_rate": 6.784013009120974e-05, "loss": 0.2529, "step": 4375 }, { "epoch": 2.9928490779074144, "grad_norm": 0.18039792776107788, "learning_rate": 6.782586681114894e-05, "loss": 0.2286, "step": 4376 }, { "epoch": 2.9936018065487393, "grad_norm": 0.20918971300125122, "learning_rate": 6.78116021284706e-05, "loss": 0.257, "step": 4377 }, { "epoch": 2.9943545351900642, "grad_norm": 0.1689852625131607, "learning_rate": 6.779733604473474e-05, "loss": 0.1571, "step": 4378 }, { "epoch": 2.9951072638313887, "grad_norm": 0.21134252846240997, "learning_rate": 6.778306856150144e-05, "loss": 0.2067, "step": 4379 }, { "epoch": 2.9958599924727136, "grad_norm": 0.16774924099445343, "learning_rate": 6.776879968033098e-05, "loss": 0.1546, "step": 4380 }, { "epoch": 2.9966127211140385, "grad_norm": 0.21244753897190094, "learning_rate": 6.775452940278378e-05, "loss": 0.165, "step": 4381 }, { "epoch": 2.997365449755363, "grad_norm": 0.19245153665542603, "learning_rate": 6.774025773042043e-05, "loss": 0.1897, "step": 4382 }, { "epoch": 2.998118178396688, "grad_norm": 0.18469640612602234, "learning_rate": 6.772598466480163e-05, "loss": 0.262, "step": 4383 }, { "epoch": 2.998870907038013, "grad_norm": 0.22851444780826569, "learning_rate": 6.771171020748831e-05, "loss": 0.2462, "step": 4384 }, { "epoch": 2.9996236356793378, "grad_norm": 0.20034493505954742, "learning_rate": 6.769743436004146e-05, "loss": 0.2376, "step": 4385 }, { "epoch": 3.0003763643206622, "grad_norm": 0.19328442215919495, "learning_rate": 6.768315712402227e-05, "loss": 0.2546, "step": 4386 }, { "epoch": 3.001129092961987, "grad_norm": 0.19451870024204254, "learning_rate": 6.76688785009921e-05, "loss": 0.2017, "step": 4387 }, { "epoch": 3.001881821603312, "grad_norm": 0.24998190999031067, "learning_rate": 6.765459849251243e-05, "loss": 0.2794, "step": 4388 }, { "epoch": 3.002634550244637, "grad_norm": 0.2054111808538437, "learning_rate": 6.764031710014487e-05, "loss": 0.2521, "step": 4389 }, { "epoch": 3.0033872788859615, "grad_norm": 0.1825685352087021, "learning_rate": 6.762603432545125e-05, "loss": 0.0891, "step": 4390 }, { "epoch": 3.0041400075272864, "grad_norm": 0.2183891236782074, "learning_rate": 6.761175016999351e-05, "loss": 0.1493, "step": 4391 }, { "epoch": 3.0048927361686113, "grad_norm": 0.18579979240894318, "learning_rate": 6.759746463533372e-05, "loss": 0.1739, "step": 4392 }, { "epoch": 3.005645464809936, "grad_norm": 0.22727985680103302, "learning_rate": 6.758317772303413e-05, "loss": 0.2217, "step": 4393 }, { "epoch": 3.0063981934512607, "grad_norm": 0.2406698614358902, "learning_rate": 6.756888943465715e-05, "loss": 0.2447, "step": 4394 }, { "epoch": 3.0071509220925856, "grad_norm": 0.21038201451301575, "learning_rate": 6.755459977176533e-05, "loss": 0.1462, "step": 4395 }, { "epoch": 3.0079036507339105, "grad_norm": 0.23915015161037445, "learning_rate": 6.754030873592134e-05, "loss": 0.2208, "step": 4396 }, { "epoch": 3.0086563793752354, "grad_norm": 0.20293091237545013, "learning_rate": 6.752601632868805e-05, "loss": 0.2199, "step": 4397 }, { "epoch": 3.00940910801656, "grad_norm": 0.223184734582901, "learning_rate": 6.751172255162843e-05, "loss": 0.1969, "step": 4398 }, { "epoch": 3.010161836657885, "grad_norm": 0.2111319750547409, "learning_rate": 6.749742740630567e-05, "loss": 0.1604, "step": 4399 }, { "epoch": 3.0109145652992098, "grad_norm": 0.2077082097530365, "learning_rate": 6.7483130894283e-05, "loss": 0.1877, "step": 4400 }, { "epoch": 3.0109145652992098, "eval_loss": 0.21962113678455353, "eval_runtime": 456.4644, "eval_samples_per_second": 21.09, "eval_steps_per_second": 0.659, "step": 4400 }, { "epoch": 3.0116672939405342, "grad_norm": 0.18183985352516174, "learning_rate": 6.746883301712393e-05, "loss": 0.1267, "step": 4401 }, { "epoch": 3.012420022581859, "grad_norm": 0.204061821103096, "learning_rate": 6.745453377639203e-05, "loss": 0.1125, "step": 4402 }, { "epoch": 3.013172751223184, "grad_norm": 0.1954290270805359, "learning_rate": 6.744023317365103e-05, "loss": 0.1892, "step": 4403 }, { "epoch": 3.013925479864509, "grad_norm": 0.22706051170825958, "learning_rate": 6.742593121046484e-05, "loss": 0.1777, "step": 4404 }, { "epoch": 3.0146782085058335, "grad_norm": 0.2056218683719635, "learning_rate": 6.74116278883975e-05, "loss": 0.1671, "step": 4405 }, { "epoch": 3.0154309371471584, "grad_norm": 0.20497314631938934, "learning_rate": 6.73973232090132e-05, "loss": 0.2018, "step": 4406 }, { "epoch": 3.0161836657884833, "grad_norm": 0.19175416231155396, "learning_rate": 6.738301717387626e-05, "loss": 0.2365, "step": 4407 }, { "epoch": 3.016936394429808, "grad_norm": 0.18490532040596008, "learning_rate": 6.736870978455117e-05, "loss": 0.1486, "step": 4408 }, { "epoch": 3.0176891230711327, "grad_norm": 0.20826345682144165, "learning_rate": 6.735440104260259e-05, "loss": 0.2292, "step": 4409 }, { "epoch": 3.0184418517124576, "grad_norm": 0.18871371448040009, "learning_rate": 6.734009094959528e-05, "loss": 0.1809, "step": 4410 }, { "epoch": 3.0191945803537825, "grad_norm": 0.19773776829242706, "learning_rate": 6.732577950709419e-05, "loss": 0.2148, "step": 4411 }, { "epoch": 3.0199473089951074, "grad_norm": 0.22971422970294952, "learning_rate": 6.731146671666437e-05, "loss": 0.1982, "step": 4412 }, { "epoch": 3.020700037636432, "grad_norm": 0.18276451528072357, "learning_rate": 6.729715257987106e-05, "loss": 0.1899, "step": 4413 }, { "epoch": 3.021452766277757, "grad_norm": 0.16280093789100647, "learning_rate": 6.728283709827963e-05, "loss": 0.156, "step": 4414 }, { "epoch": 3.0222054949190817, "grad_norm": 0.19230026006698608, "learning_rate": 6.726852027345562e-05, "loss": 0.167, "step": 4415 }, { "epoch": 3.0229582235604067, "grad_norm": 0.16921035945415497, "learning_rate": 6.725420210696467e-05, "loss": 0.1336, "step": 4416 }, { "epoch": 3.023710952201731, "grad_norm": 0.18384990096092224, "learning_rate": 6.72398826003726e-05, "loss": 0.1938, "step": 4417 }, { "epoch": 3.024463680843056, "grad_norm": 0.20195527374744415, "learning_rate": 6.722556175524536e-05, "loss": 0.2425, "step": 4418 }, { "epoch": 3.025216409484381, "grad_norm": 0.18543286621570587, "learning_rate": 6.721123957314908e-05, "loss": 0.2043, "step": 4419 }, { "epoch": 3.025969138125706, "grad_norm": 0.20079639554023743, "learning_rate": 6.719691605565001e-05, "loss": 0.177, "step": 4420 }, { "epoch": 3.0267218667670304, "grad_norm": 0.1792934536933899, "learning_rate": 6.71825912043145e-05, "loss": 0.2277, "step": 4421 }, { "epoch": 3.0274745954083553, "grad_norm": 0.18234451115131378, "learning_rate": 6.716826502070916e-05, "loss": 0.1427, "step": 4422 }, { "epoch": 3.02822732404968, "grad_norm": 0.20199054479599, "learning_rate": 6.715393750640065e-05, "loss": 0.1979, "step": 4423 }, { "epoch": 3.0289800526910047, "grad_norm": 0.19331932067871094, "learning_rate": 6.71396086629558e-05, "loss": 0.1785, "step": 4424 }, { "epoch": 3.0297327813323296, "grad_norm": 0.19060653448104858, "learning_rate": 6.712527849194162e-05, "loss": 0.1804, "step": 4425 }, { "epoch": 3.0304855099736545, "grad_norm": 0.17681483924388885, "learning_rate": 6.711094699492519e-05, "loss": 0.2154, "step": 4426 }, { "epoch": 3.0312382386149794, "grad_norm": 0.19340817630290985, "learning_rate": 6.709661417347382e-05, "loss": 0.2195, "step": 4427 }, { "epoch": 3.031990967256304, "grad_norm": 0.19048744440078735, "learning_rate": 6.708228002915489e-05, "loss": 0.1739, "step": 4428 }, { "epoch": 3.032743695897629, "grad_norm": 0.19508136808872223, "learning_rate": 6.706794456353602e-05, "loss": 0.1306, "step": 4429 }, { "epoch": 3.0334964245389537, "grad_norm": 0.22007504105567932, "learning_rate": 6.705360777818483e-05, "loss": 0.1479, "step": 4430 }, { "epoch": 3.0342491531802787, "grad_norm": 0.1841956526041031, "learning_rate": 6.703926967466924e-05, "loss": 0.14, "step": 4431 }, { "epoch": 3.035001881821603, "grad_norm": 0.1749778687953949, "learning_rate": 6.702493025455722e-05, "loss": 0.1455, "step": 4432 }, { "epoch": 3.035754610462928, "grad_norm": 0.18711023032665253, "learning_rate": 6.701058951941691e-05, "loss": 0.152, "step": 4433 }, { "epoch": 3.036507339104253, "grad_norm": 0.1886516958475113, "learning_rate": 6.699624747081658e-05, "loss": 0.205, "step": 4434 }, { "epoch": 3.037260067745578, "grad_norm": 0.18549612164497375, "learning_rate": 6.698190411032467e-05, "loss": 0.1509, "step": 4435 }, { "epoch": 3.0380127963869024, "grad_norm": 0.19605602324008942, "learning_rate": 6.696755943950973e-05, "loss": 0.1636, "step": 4436 }, { "epoch": 3.0387655250282273, "grad_norm": 0.16924218833446503, "learning_rate": 6.695321345994047e-05, "loss": 0.2348, "step": 4437 }, { "epoch": 3.039518253669552, "grad_norm": 0.19498597085475922, "learning_rate": 6.693886617318579e-05, "loss": 0.2612, "step": 4438 }, { "epoch": 3.040270982310877, "grad_norm": 0.19372664391994476, "learning_rate": 6.692451758081464e-05, "loss": 0.1443, "step": 4439 }, { "epoch": 3.0410237109522016, "grad_norm": 0.21253621578216553, "learning_rate": 6.691016768439618e-05, "loss": 0.1913, "step": 4440 }, { "epoch": 3.0417764395935265, "grad_norm": 0.17907705903053284, "learning_rate": 6.689581648549968e-05, "loss": 0.1191, "step": 4441 }, { "epoch": 3.0425291682348514, "grad_norm": 0.18482369184494019, "learning_rate": 6.688146398569458e-05, "loss": 0.2145, "step": 4442 }, { "epoch": 3.0432818968761763, "grad_norm": 0.19623015820980072, "learning_rate": 6.686711018655044e-05, "loss": 0.2146, "step": 4443 }, { "epoch": 3.044034625517501, "grad_norm": 0.1856631636619568, "learning_rate": 6.685275508963698e-05, "loss": 0.1972, "step": 4444 }, { "epoch": 3.0447873541588257, "grad_norm": 0.19673699140548706, "learning_rate": 6.683839869652402e-05, "loss": 0.2034, "step": 4445 }, { "epoch": 3.0455400828001506, "grad_norm": 0.18375012278556824, "learning_rate": 6.68240410087816e-05, "loss": 0.1686, "step": 4446 }, { "epoch": 3.046292811441475, "grad_norm": 0.18671445548534393, "learning_rate": 6.680968202797983e-05, "loss": 0.2554, "step": 4447 }, { "epoch": 3.0470455400828, "grad_norm": 0.19192546606063843, "learning_rate": 6.679532175568897e-05, "loss": 0.2369, "step": 4448 }, { "epoch": 3.047798268724125, "grad_norm": 0.19655272364616394, "learning_rate": 6.678096019347948e-05, "loss": 0.1711, "step": 4449 }, { "epoch": 3.04855099736545, "grad_norm": 0.19771328568458557, "learning_rate": 6.676659734292189e-05, "loss": 0.1307, "step": 4450 }, { "epoch": 3.0493037260067744, "grad_norm": 0.19069808721542358, "learning_rate": 6.67522332055869e-05, "loss": 0.1725, "step": 4451 }, { "epoch": 3.0500564546480993, "grad_norm": 0.20759007334709167, "learning_rate": 6.673786778304537e-05, "loss": 0.2482, "step": 4452 }, { "epoch": 3.050809183289424, "grad_norm": 0.17127661406993866, "learning_rate": 6.672350107686825e-05, "loss": 0.1473, "step": 4453 }, { "epoch": 3.051561911930749, "grad_norm": 0.19736090302467346, "learning_rate": 6.670913308862668e-05, "loss": 0.2385, "step": 4454 }, { "epoch": 3.0523146405720736, "grad_norm": 0.1983635276556015, "learning_rate": 6.669476381989192e-05, "loss": 0.2808, "step": 4455 }, { "epoch": 3.0530673692133985, "grad_norm": 0.16431362926959991, "learning_rate": 6.668039327223538e-05, "loss": 0.1624, "step": 4456 }, { "epoch": 3.0538200978547234, "grad_norm": 0.1984284222126007, "learning_rate": 6.666602144722858e-05, "loss": 0.2111, "step": 4457 }, { "epoch": 3.0545728264960483, "grad_norm": 0.19934943318367004, "learning_rate": 6.665164834644325e-05, "loss": 0.1513, "step": 4458 }, { "epoch": 3.055325555137373, "grad_norm": 0.20447304844856262, "learning_rate": 6.663727397145114e-05, "loss": 0.1879, "step": 4459 }, { "epoch": 3.0560782837786977, "grad_norm": 0.17370302975177765, "learning_rate": 6.662289832382425e-05, "loss": 0.1908, "step": 4460 }, { "epoch": 3.0568310124200226, "grad_norm": 0.1948656141757965, "learning_rate": 6.660852140513468e-05, "loss": 0.2239, "step": 4461 }, { "epoch": 3.0575837410613476, "grad_norm": 0.1726793646812439, "learning_rate": 6.659414321695465e-05, "loss": 0.1967, "step": 4462 }, { "epoch": 3.058336469702672, "grad_norm": 0.18215268850326538, "learning_rate": 6.657976376085656e-05, "loss": 0.1419, "step": 4463 }, { "epoch": 3.059089198343997, "grad_norm": 0.19702550768852234, "learning_rate": 6.656538303841292e-05, "loss": 0.1598, "step": 4464 }, { "epoch": 3.059841926985322, "grad_norm": 0.21430185437202454, "learning_rate": 6.655100105119637e-05, "loss": 0.2065, "step": 4465 }, { "epoch": 3.060594655626647, "grad_norm": 0.19512839615345, "learning_rate": 6.65366178007797e-05, "loss": 0.1028, "step": 4466 }, { "epoch": 3.0613473842679713, "grad_norm": 0.1820981353521347, "learning_rate": 6.652223328873587e-05, "loss": 0.1582, "step": 4467 }, { "epoch": 3.062100112909296, "grad_norm": 0.25088444352149963, "learning_rate": 6.650784751663793e-05, "loss": 0.2203, "step": 4468 }, { "epoch": 3.062852841550621, "grad_norm": 0.19721217453479767, "learning_rate": 6.649346048605909e-05, "loss": 0.1825, "step": 4469 }, { "epoch": 3.063605570191946, "grad_norm": 0.18889591097831726, "learning_rate": 6.64790721985727e-05, "loss": 0.1512, "step": 4470 }, { "epoch": 3.0643582988332705, "grad_norm": 0.19932778179645538, "learning_rate": 6.646468265575219e-05, "loss": 0.2198, "step": 4471 }, { "epoch": 3.0651110274745954, "grad_norm": 0.20650820434093475, "learning_rate": 6.645029185917126e-05, "loss": 0.1551, "step": 4472 }, { "epoch": 3.0658637561159203, "grad_norm": 0.2055724710226059, "learning_rate": 6.643589981040362e-05, "loss": 0.1933, "step": 4473 }, { "epoch": 3.066616484757245, "grad_norm": 0.16260144114494324, "learning_rate": 6.642150651102317e-05, "loss": 0.1131, "step": 4474 }, { "epoch": 3.0673692133985697, "grad_norm": 0.17203858494758606, "learning_rate": 6.640711196260393e-05, "loss": 0.1388, "step": 4475 }, { "epoch": 3.0681219420398946, "grad_norm": 0.21464678645133972, "learning_rate": 6.63927161667201e-05, "loss": 0.1488, "step": 4476 }, { "epoch": 3.0688746706812196, "grad_norm": 0.20589259266853333, "learning_rate": 6.637831912494593e-05, "loss": 0.1516, "step": 4477 }, { "epoch": 3.069627399322544, "grad_norm": 0.1885376125574112, "learning_rate": 6.636392083885589e-05, "loss": 0.2368, "step": 4478 }, { "epoch": 3.070380127963869, "grad_norm": 0.22106371819972992, "learning_rate": 6.634952131002458e-05, "loss": 0.2221, "step": 4479 }, { "epoch": 3.071132856605194, "grad_norm": 0.21540246903896332, "learning_rate": 6.633512054002667e-05, "loss": 0.1896, "step": 4480 }, { "epoch": 3.071885585246519, "grad_norm": 0.19105574488639832, "learning_rate": 6.632071853043702e-05, "loss": 0.221, "step": 4481 }, { "epoch": 3.0726383138878433, "grad_norm": 0.22492535412311554, "learning_rate": 6.630631528283059e-05, "loss": 0.1602, "step": 4482 }, { "epoch": 3.073391042529168, "grad_norm": 0.20946253836154938, "learning_rate": 6.629191079878254e-05, "loss": 0.1632, "step": 4483 }, { "epoch": 3.074143771170493, "grad_norm": 0.18402431905269623, "learning_rate": 6.627750507986811e-05, "loss": 0.1797, "step": 4484 }, { "epoch": 3.074896499811818, "grad_norm": 0.18692615628242493, "learning_rate": 6.626309812766266e-05, "loss": 0.1567, "step": 4485 }, { "epoch": 3.0756492284531425, "grad_norm": 0.22160230576992035, "learning_rate": 6.624868994374175e-05, "loss": 0.1713, "step": 4486 }, { "epoch": 3.0764019570944674, "grad_norm": 0.2000580132007599, "learning_rate": 6.623428052968103e-05, "loss": 0.1862, "step": 4487 }, { "epoch": 3.0771546857357923, "grad_norm": 0.18684588372707367, "learning_rate": 6.621986988705625e-05, "loss": 0.1924, "step": 4488 }, { "epoch": 3.0779074143771172, "grad_norm": 0.2216147929430008, "learning_rate": 6.620545801744338e-05, "loss": 0.1698, "step": 4489 }, { "epoch": 3.0786601430184417, "grad_norm": 0.18102677166461945, "learning_rate": 6.619104492241848e-05, "loss": 0.1898, "step": 4490 }, { "epoch": 3.0794128716597666, "grad_norm": 0.21473775804042816, "learning_rate": 6.61766306035577e-05, "loss": 0.2455, "step": 4491 }, { "epoch": 3.0801656003010915, "grad_norm": 0.2275240272283554, "learning_rate": 6.616221506243744e-05, "loss": 0.1574, "step": 4492 }, { "epoch": 3.0809183289424165, "grad_norm": 0.20099467039108276, "learning_rate": 6.614779830063411e-05, "loss": 0.1399, "step": 4493 }, { "epoch": 3.081671057583741, "grad_norm": 0.2303713858127594, "learning_rate": 6.613338031972432e-05, "loss": 0.1754, "step": 4494 }, { "epoch": 3.082423786225066, "grad_norm": 0.23113207519054413, "learning_rate": 6.611896112128478e-05, "loss": 0.2721, "step": 4495 }, { "epoch": 3.0831765148663908, "grad_norm": 0.24186556041240692, "learning_rate": 6.610454070689238e-05, "loss": 0.1821, "step": 4496 }, { "epoch": 3.0839292435077157, "grad_norm": 0.18949733674526215, "learning_rate": 6.609011907812411e-05, "loss": 0.1301, "step": 4497 }, { "epoch": 3.08468197214904, "grad_norm": 0.2235642820596695, "learning_rate": 6.607569623655709e-05, "loss": 0.1224, "step": 4498 }, { "epoch": 3.085434700790365, "grad_norm": 0.18476930260658264, "learning_rate": 6.606127218376857e-05, "loss": 0.2229, "step": 4499 }, { "epoch": 3.08618742943169, "grad_norm": 0.25459328293800354, "learning_rate": 6.604684692133597e-05, "loss": 0.1956, "step": 4500 }, { "epoch": 3.0869401580730145, "grad_norm": 0.1817692369222641, "learning_rate": 6.603242045083678e-05, "loss": 0.2408, "step": 4501 }, { "epoch": 3.0876928867143394, "grad_norm": 0.24357552826404572, "learning_rate": 6.601799277384869e-05, "loss": 0.1279, "step": 4502 }, { "epoch": 3.0884456153556643, "grad_norm": 0.18993601202964783, "learning_rate": 6.600356389194948e-05, "loss": 0.1881, "step": 4503 }, { "epoch": 3.0891983439969892, "grad_norm": 0.2034701108932495, "learning_rate": 6.598913380671706e-05, "loss": 0.2051, "step": 4504 }, { "epoch": 3.0899510726383137, "grad_norm": 0.20418807864189148, "learning_rate": 6.59747025197295e-05, "loss": 0.2484, "step": 4505 }, { "epoch": 3.0907038012796386, "grad_norm": 0.1986466497182846, "learning_rate": 6.596027003256495e-05, "loss": 0.209, "step": 4506 }, { "epoch": 3.0914565299209635, "grad_norm": 0.18169760704040527, "learning_rate": 6.594583634680176e-05, "loss": 0.1799, "step": 4507 }, { "epoch": 3.0922092585622885, "grad_norm": 0.23022198677062988, "learning_rate": 6.593140146401836e-05, "loss": 0.2439, "step": 4508 }, { "epoch": 3.092961987203613, "grad_norm": 0.18260669708251953, "learning_rate": 6.591696538579334e-05, "loss": 0.1719, "step": 4509 }, { "epoch": 3.093714715844938, "grad_norm": 0.19539281725883484, "learning_rate": 6.590252811370539e-05, "loss": 0.2055, "step": 4510 }, { "epoch": 3.0944674444862628, "grad_norm": 0.20509622991085052, "learning_rate": 6.588808964933336e-05, "loss": 0.2133, "step": 4511 }, { "epoch": 3.0952201731275877, "grad_norm": 0.23759812116622925, "learning_rate": 6.587364999425621e-05, "loss": 0.195, "step": 4512 }, { "epoch": 3.095972901768912, "grad_norm": 0.18803557753562927, "learning_rate": 6.585920915005306e-05, "loss": 0.1864, "step": 4513 }, { "epoch": 3.096725630410237, "grad_norm": 0.25724494457244873, "learning_rate": 6.584476711830311e-05, "loss": 0.1835, "step": 4514 }, { "epoch": 3.097478359051562, "grad_norm": 0.1993395984172821, "learning_rate": 6.583032390058575e-05, "loss": 0.222, "step": 4515 }, { "epoch": 3.098231087692887, "grad_norm": 0.21519356966018677, "learning_rate": 6.581587949848045e-05, "loss": 0.2118, "step": 4516 }, { "epoch": 3.0989838163342114, "grad_norm": 0.19367535412311554, "learning_rate": 6.58014339135668e-05, "loss": 0.2421, "step": 4517 }, { "epoch": 3.0997365449755363, "grad_norm": 0.22811830043792725, "learning_rate": 6.578698714742459e-05, "loss": 0.1408, "step": 4518 }, { "epoch": 3.100489273616861, "grad_norm": 0.2023441195487976, "learning_rate": 6.57725392016337e-05, "loss": 0.2011, "step": 4519 }, { "epoch": 3.101242002258186, "grad_norm": 0.20500941574573517, "learning_rate": 6.575809007777409e-05, "loss": 0.1482, "step": 4520 }, { "epoch": 3.1019947308995106, "grad_norm": 0.2331056296825409, "learning_rate": 6.574363977742595e-05, "loss": 0.1478, "step": 4521 }, { "epoch": 3.1027474595408355, "grad_norm": 0.16812296211719513, "learning_rate": 6.57291883021695e-05, "loss": 0.1761, "step": 4522 }, { "epoch": 3.1035001881821604, "grad_norm": 0.18687239289283752, "learning_rate": 6.571473565358517e-05, "loss": 0.1657, "step": 4523 }, { "epoch": 3.104252916823485, "grad_norm": 0.2298731803894043, "learning_rate": 6.570028183325346e-05, "loss": 0.1674, "step": 4524 }, { "epoch": 3.10500564546481, "grad_norm": 0.20975860953330994, "learning_rate": 6.5685826842755e-05, "loss": 0.2257, "step": 4525 }, { "epoch": 3.1057583741061348, "grad_norm": 0.1846039891242981, "learning_rate": 6.567137068367057e-05, "loss": 0.1433, "step": 4526 }, { "epoch": 3.1065111027474597, "grad_norm": 0.19397671520709991, "learning_rate": 6.56569133575811e-05, "loss": 0.194, "step": 4527 }, { "epoch": 3.107263831388784, "grad_norm": 0.1652570366859436, "learning_rate": 6.564245486606762e-05, "loss": 0.1947, "step": 4528 }, { "epoch": 3.108016560030109, "grad_norm": 0.2412247657775879, "learning_rate": 6.562799521071126e-05, "loss": 0.2409, "step": 4529 }, { "epoch": 3.108769288671434, "grad_norm": 0.2442944049835205, "learning_rate": 6.561353439309332e-05, "loss": 0.1797, "step": 4530 }, { "epoch": 3.109522017312759, "grad_norm": 0.22019632160663605, "learning_rate": 6.559907241479524e-05, "loss": 0.1706, "step": 4531 }, { "epoch": 3.1102747459540834, "grad_norm": 0.23052066564559937, "learning_rate": 6.558460927739853e-05, "loss": 0.2654, "step": 4532 }, { "epoch": 3.1110274745954083, "grad_norm": 0.18933841586112976, "learning_rate": 6.557014498248482e-05, "loss": 0.2632, "step": 4533 }, { "epoch": 3.111780203236733, "grad_norm": 0.20786820352077484, "learning_rate": 6.555567953163599e-05, "loss": 0.1191, "step": 4534 }, { "epoch": 3.112532931878058, "grad_norm": 0.20181892812252045, "learning_rate": 6.554121292643392e-05, "loss": 0.1733, "step": 4535 }, { "epoch": 3.1132856605193826, "grad_norm": 0.21472764015197754, "learning_rate": 6.552674516846064e-05, "loss": 0.1471, "step": 4536 }, { "epoch": 3.1140383891607075, "grad_norm": 0.19868768751621246, "learning_rate": 6.551227625929832e-05, "loss": 0.2023, "step": 4537 }, { "epoch": 3.1147911178020324, "grad_norm": 0.1881140023469925, "learning_rate": 6.54978062005293e-05, "loss": 0.1728, "step": 4538 }, { "epoch": 3.1155438464433574, "grad_norm": 0.21118319034576416, "learning_rate": 6.548333499373595e-05, "loss": 0.2449, "step": 4539 }, { "epoch": 3.116296575084682, "grad_norm": 0.22746676206588745, "learning_rate": 6.546886264050088e-05, "loss": 0.1671, "step": 4540 }, { "epoch": 3.1170493037260067, "grad_norm": 0.2139521688222885, "learning_rate": 6.54543891424067e-05, "loss": 0.2444, "step": 4541 }, { "epoch": 3.1178020323673317, "grad_norm": 0.19213296473026276, "learning_rate": 6.543991450103625e-05, "loss": 0.1645, "step": 4542 }, { "epoch": 3.1185547610086566, "grad_norm": 0.22556522488594055, "learning_rate": 6.542543871797244e-05, "loss": 0.1784, "step": 4543 }, { "epoch": 3.119307489649981, "grad_norm": 0.1902981847524643, "learning_rate": 6.541096179479836e-05, "loss": 0.2079, "step": 4544 }, { "epoch": 3.120060218291306, "grad_norm": 0.21139103174209595, "learning_rate": 6.53964837330971e-05, "loss": 0.1931, "step": 4545 }, { "epoch": 3.120812946932631, "grad_norm": 0.20243380963802338, "learning_rate": 6.538200453445204e-05, "loss": 0.2257, "step": 4546 }, { "epoch": 3.1215656755739554, "grad_norm": 0.18636776506900787, "learning_rate": 6.536752420044659e-05, "loss": 0.1854, "step": 4547 }, { "epoch": 3.1223184042152803, "grad_norm": 0.19577458500862122, "learning_rate": 6.535304273266426e-05, "loss": 0.1764, "step": 4548 }, { "epoch": 3.123071132856605, "grad_norm": 0.23597295582294464, "learning_rate": 6.533856013268873e-05, "loss": 0.2163, "step": 4549 }, { "epoch": 3.12382386149793, "grad_norm": 0.17164437472820282, "learning_rate": 6.532407640210383e-05, "loss": 0.2342, "step": 4550 }, { "epoch": 3.1245765901392546, "grad_norm": 0.20801863074302673, "learning_rate": 6.530959154249344e-05, "loss": 0.2005, "step": 4551 }, { "epoch": 3.1253293187805795, "grad_norm": 0.2337392270565033, "learning_rate": 6.529510555544163e-05, "loss": 0.2109, "step": 4552 }, { "epoch": 3.1260820474219044, "grad_norm": 0.17024563252925873, "learning_rate": 6.528061844253255e-05, "loss": 0.161, "step": 4553 }, { "epoch": 3.1268347760632293, "grad_norm": 0.2124379724264145, "learning_rate": 6.526613020535051e-05, "loss": 0.1845, "step": 4554 }, { "epoch": 3.127587504704554, "grad_norm": 0.18556775152683258, "learning_rate": 6.525164084547987e-05, "loss": 0.196, "step": 4555 }, { "epoch": 3.1283402333458787, "grad_norm": 0.21198005974292755, "learning_rate": 6.523715036450523e-05, "loss": 0.2031, "step": 4556 }, { "epoch": 3.1290929619872037, "grad_norm": 0.19532892107963562, "learning_rate": 6.522265876401122e-05, "loss": 0.139, "step": 4557 }, { "epoch": 3.1298456906285286, "grad_norm": 0.20067797601222992, "learning_rate": 6.52081660455826e-05, "loss": 0.2004, "step": 4558 }, { "epoch": 3.130598419269853, "grad_norm": 0.19656619429588318, "learning_rate": 6.51936722108043e-05, "loss": 0.2338, "step": 4559 }, { "epoch": 3.131351147911178, "grad_norm": 0.1963535100221634, "learning_rate": 6.517917726126131e-05, "loss": 0.1864, "step": 4560 }, { "epoch": 3.132103876552503, "grad_norm": 0.2268427163362503, "learning_rate": 6.516468119853882e-05, "loss": 0.2144, "step": 4561 }, { "epoch": 3.132856605193828, "grad_norm": 0.17830835282802582, "learning_rate": 6.515018402422209e-05, "loss": 0.145, "step": 4562 }, { "epoch": 3.1336093338351523, "grad_norm": 0.22223037481307983, "learning_rate": 6.513568573989647e-05, "loss": 0.191, "step": 4563 }, { "epoch": 3.134362062476477, "grad_norm": 0.1784520447254181, "learning_rate": 6.51211863471475e-05, "loss": 0.1784, "step": 4564 }, { "epoch": 3.135114791117802, "grad_norm": 0.2200336456298828, "learning_rate": 6.510668584756082e-05, "loss": 0.139, "step": 4565 }, { "epoch": 3.135867519759127, "grad_norm": 0.18361134827136993, "learning_rate": 6.509218424272216e-05, "loss": 0.1515, "step": 4566 }, { "epoch": 3.1366202484004515, "grad_norm": 0.19770751893520355, "learning_rate": 6.507768153421741e-05, "loss": 0.1347, "step": 4567 }, { "epoch": 3.1373729770417764, "grad_norm": 0.2272072434425354, "learning_rate": 6.506317772363255e-05, "loss": 0.1728, "step": 4568 }, { "epoch": 3.1381257056831013, "grad_norm": 0.1903294026851654, "learning_rate": 6.504867281255371e-05, "loss": 0.1406, "step": 4569 }, { "epoch": 3.1388784343244263, "grad_norm": 0.22582951188087463, "learning_rate": 6.503416680256713e-05, "loss": 0.2235, "step": 4570 }, { "epoch": 3.1396311629657507, "grad_norm": 0.16613997519016266, "learning_rate": 6.501965969525912e-05, "loss": 0.0939, "step": 4571 }, { "epoch": 3.1403838916070757, "grad_norm": 0.18336035311222076, "learning_rate": 6.500515149221624e-05, "loss": 0.148, "step": 4572 }, { "epoch": 3.1411366202484006, "grad_norm": 0.17193503677845, "learning_rate": 6.499064219502501e-05, "loss": 0.1243, "step": 4573 }, { "epoch": 3.141889348889725, "grad_norm": 0.1787504106760025, "learning_rate": 6.497613180527216e-05, "loss": 0.1587, "step": 4574 }, { "epoch": 3.14264207753105, "grad_norm": 0.188293918967247, "learning_rate": 6.496162032454454e-05, "loss": 0.1193, "step": 4575 }, { "epoch": 3.143394806172375, "grad_norm": 0.22220668196678162, "learning_rate": 6.494710775442912e-05, "loss": 0.2149, "step": 4576 }, { "epoch": 3.1441475348137, "grad_norm": 0.19711042940616608, "learning_rate": 6.493259409651292e-05, "loss": 0.1274, "step": 4577 }, { "epoch": 3.1449002634550243, "grad_norm": 0.19542346894741058, "learning_rate": 6.49180793523832e-05, "loss": 0.1973, "step": 4578 }, { "epoch": 3.145652992096349, "grad_norm": 0.1818191260099411, "learning_rate": 6.490356352362721e-05, "loss": 0.1894, "step": 4579 }, { "epoch": 3.146405720737674, "grad_norm": 0.21466794610023499, "learning_rate": 6.488904661183239e-05, "loss": 0.19, "step": 4580 }, { "epoch": 3.147158449378999, "grad_norm": 0.23974566161632538, "learning_rate": 6.487452861858633e-05, "loss": 0.1963, "step": 4581 }, { "epoch": 3.1479111780203235, "grad_norm": 0.23397457599639893, "learning_rate": 6.486000954547666e-05, "loss": 0.2253, "step": 4582 }, { "epoch": 3.1486639066616484, "grad_norm": 0.20011714100837708, "learning_rate": 6.484548939409115e-05, "loss": 0.2312, "step": 4583 }, { "epoch": 3.1494166353029733, "grad_norm": 0.1986476480960846, "learning_rate": 6.483096816601772e-05, "loss": 0.1594, "step": 4584 }, { "epoch": 3.1501693639442983, "grad_norm": 0.16775734722614288, "learning_rate": 6.481644586284442e-05, "loss": 0.2077, "step": 4585 }, { "epoch": 3.1509220925856227, "grad_norm": 0.20539997518062592, "learning_rate": 6.480192248615935e-05, "loss": 0.2299, "step": 4586 }, { "epoch": 3.1516748212269476, "grad_norm": 0.19280879199504852, "learning_rate": 6.478739803755077e-05, "loss": 0.1713, "step": 4587 }, { "epoch": 3.1524275498682726, "grad_norm": 0.2111886888742447, "learning_rate": 6.477287251860706e-05, "loss": 0.1607, "step": 4588 }, { "epoch": 3.1531802785095975, "grad_norm": 0.1985040009021759, "learning_rate": 6.47583459309167e-05, "loss": 0.1532, "step": 4589 }, { "epoch": 3.153933007150922, "grad_norm": 0.19518227875232697, "learning_rate": 6.47438182760683e-05, "loss": 0.1273, "step": 4590 }, { "epoch": 3.154685735792247, "grad_norm": 0.18148919939994812, "learning_rate": 6.472928955565056e-05, "loss": 0.1538, "step": 4591 }, { "epoch": 3.155438464433572, "grad_norm": 0.20611371099948883, "learning_rate": 6.471475977125239e-05, "loss": 0.2363, "step": 4592 }, { "epoch": 3.1561911930748967, "grad_norm": 0.15981027483940125, "learning_rate": 6.470022892446265e-05, "loss": 0.1244, "step": 4593 }, { "epoch": 3.156943921716221, "grad_norm": 0.19369371235370636, "learning_rate": 6.46856970168705e-05, "loss": 0.1996, "step": 4594 }, { "epoch": 3.157696650357546, "grad_norm": 0.24888227880001068, "learning_rate": 6.467116405006505e-05, "loss": 0.1749, "step": 4595 }, { "epoch": 3.158449378998871, "grad_norm": 0.19038806855678558, "learning_rate": 6.465663002563566e-05, "loss": 0.1865, "step": 4596 }, { "epoch": 3.159202107640196, "grad_norm": 0.21058283746242523, "learning_rate": 6.464209494517174e-05, "loss": 0.1774, "step": 4597 }, { "epoch": 3.1599548362815204, "grad_norm": 0.17356081306934357, "learning_rate": 6.46275588102628e-05, "loss": 0.1693, "step": 4598 }, { "epoch": 3.1607075649228453, "grad_norm": 0.184470072388649, "learning_rate": 6.46130216224985e-05, "loss": 0.1402, "step": 4599 }, { "epoch": 3.1614602935641702, "grad_norm": 0.1891617774963379, "learning_rate": 6.459848338346861e-05, "loss": 0.1363, "step": 4600 }, { "epoch": 3.1614602935641702, "eval_loss": 0.21445819735527039, "eval_runtime": 456.5725, "eval_samples_per_second": 21.085, "eval_steps_per_second": 0.659, "step": 4600 }, { "epoch": 3.1622130222054947, "grad_norm": 0.1822735220193863, "learning_rate": 6.458394409476301e-05, "loss": 0.1397, "step": 4601 }, { "epoch": 3.1629657508468196, "grad_norm": 0.20368030667304993, "learning_rate": 6.45694037579717e-05, "loss": 0.1627, "step": 4602 }, { "epoch": 3.1637184794881446, "grad_norm": 0.2382420003414154, "learning_rate": 6.455486237468477e-05, "loss": 0.2284, "step": 4603 }, { "epoch": 3.1644712081294695, "grad_norm": 0.2248864769935608, "learning_rate": 6.454031994649247e-05, "loss": 0.1958, "step": 4604 }, { "epoch": 3.165223936770794, "grad_norm": 0.19834795594215393, "learning_rate": 6.452577647498511e-05, "loss": 0.1536, "step": 4605 }, { "epoch": 3.165976665412119, "grad_norm": 0.20843371748924255, "learning_rate": 6.451123196175317e-05, "loss": 0.2319, "step": 4606 }, { "epoch": 3.166729394053444, "grad_norm": 0.20890496671199799, "learning_rate": 6.44966864083872e-05, "loss": 0.1423, "step": 4607 }, { "epoch": 3.1674821226947687, "grad_norm": 0.1974012851715088, "learning_rate": 6.448213981647788e-05, "loss": 0.1728, "step": 4608 }, { "epoch": 3.168234851336093, "grad_norm": 0.20087195932865143, "learning_rate": 6.446759218761599e-05, "loss": 0.2361, "step": 4609 }, { "epoch": 3.168987579977418, "grad_norm": 0.1936747431755066, "learning_rate": 6.445304352339248e-05, "loss": 0.2154, "step": 4610 }, { "epoch": 3.169740308618743, "grad_norm": 0.19505073130130768, "learning_rate": 6.443849382539832e-05, "loss": 0.0923, "step": 4611 }, { "epoch": 3.170493037260068, "grad_norm": 0.18098075687885284, "learning_rate": 6.442394309522467e-05, "loss": 0.2245, "step": 4612 }, { "epoch": 3.1712457659013924, "grad_norm": 0.22874751687049866, "learning_rate": 6.440939133446277e-05, "loss": 0.2348, "step": 4613 }, { "epoch": 3.1719984945427173, "grad_norm": 0.19023646414279938, "learning_rate": 6.439483854470398e-05, "loss": 0.1509, "step": 4614 }, { "epoch": 3.1727512231840422, "grad_norm": 0.20794445276260376, "learning_rate": 6.438028472753978e-05, "loss": 0.2374, "step": 4615 }, { "epoch": 3.173503951825367, "grad_norm": 0.19605451822280884, "learning_rate": 6.436572988456175e-05, "loss": 0.1685, "step": 4616 }, { "epoch": 3.1742566804666916, "grad_norm": 0.1958169788122177, "learning_rate": 6.435117401736158e-05, "loss": 0.1549, "step": 4617 }, { "epoch": 3.1750094091080165, "grad_norm": 0.22646884620189667, "learning_rate": 6.433661712753107e-05, "loss": 0.1685, "step": 4618 }, { "epoch": 3.1757621377493415, "grad_norm": 0.18099772930145264, "learning_rate": 6.432205921666216e-05, "loss": 0.1643, "step": 4619 }, { "epoch": 3.176514866390666, "grad_norm": 0.18968334794044495, "learning_rate": 6.430750028634686e-05, "loss": 0.2325, "step": 4620 }, { "epoch": 3.177267595031991, "grad_norm": 0.17781959474086761, "learning_rate": 6.429294033817733e-05, "loss": 0.1989, "step": 4621 }, { "epoch": 3.1780203236733158, "grad_norm": 0.21953408420085907, "learning_rate": 6.427837937374581e-05, "loss": 0.1958, "step": 4622 }, { "epoch": 3.1787730523146407, "grad_norm": 0.16064058244228363, "learning_rate": 6.426381739464466e-05, "loss": 0.1137, "step": 4623 }, { "epoch": 3.1795257809559656, "grad_norm": 0.18792614340782166, "learning_rate": 6.424925440246638e-05, "loss": 0.2292, "step": 4624 }, { "epoch": 3.18027850959729, "grad_norm": 0.193759486079216, "learning_rate": 6.423469039880354e-05, "loss": 0.2029, "step": 4625 }, { "epoch": 3.181031238238615, "grad_norm": 0.16859039664268494, "learning_rate": 6.422012538524885e-05, "loss": 0.1579, "step": 4626 }, { "epoch": 3.18178396687994, "grad_norm": 0.17391587793827057, "learning_rate": 6.420555936339512e-05, "loss": 0.1648, "step": 4627 }, { "epoch": 3.1825366955212644, "grad_norm": 0.193020299077034, "learning_rate": 6.419099233483523e-05, "loss": 0.1662, "step": 4628 }, { "epoch": 3.1832894241625893, "grad_norm": 0.18836212158203125, "learning_rate": 6.417642430116227e-05, "loss": 0.1895, "step": 4629 }, { "epoch": 3.1840421528039142, "grad_norm": 0.1971048265695572, "learning_rate": 6.416185526396932e-05, "loss": 0.2788, "step": 4630 }, { "epoch": 3.184794881445239, "grad_norm": 0.2108762115240097, "learning_rate": 6.414728522484965e-05, "loss": 0.1899, "step": 4631 }, { "epoch": 3.1855476100865636, "grad_norm": 0.19434207677841187, "learning_rate": 6.413271418539664e-05, "loss": 0.202, "step": 4632 }, { "epoch": 3.1863003387278885, "grad_norm": 0.15663807094097137, "learning_rate": 6.411814214720371e-05, "loss": 0.1229, "step": 4633 }, { "epoch": 3.1870530673692135, "grad_norm": 0.19165927171707153, "learning_rate": 6.410356911186449e-05, "loss": 0.1731, "step": 4634 }, { "epoch": 3.1878057960105384, "grad_norm": 0.19041705131530762, "learning_rate": 6.408899508097262e-05, "loss": 0.2214, "step": 4635 }, { "epoch": 3.188558524651863, "grad_norm": 0.1820852905511856, "learning_rate": 6.407442005612194e-05, "loss": 0.1891, "step": 4636 }, { "epoch": 3.1893112532931878, "grad_norm": 0.2070588320493698, "learning_rate": 6.40598440389063e-05, "loss": 0.1823, "step": 4637 }, { "epoch": 3.1900639819345127, "grad_norm": 0.22485360503196716, "learning_rate": 6.404526703091976e-05, "loss": 0.152, "step": 4638 }, { "epoch": 3.1908167105758376, "grad_norm": 0.18833927810192108, "learning_rate": 6.403068903375638e-05, "loss": 0.2139, "step": 4639 }, { "epoch": 3.191569439217162, "grad_norm": 0.19176998734474182, "learning_rate": 6.401611004901046e-05, "loss": 0.2295, "step": 4640 }, { "epoch": 3.192322167858487, "grad_norm": 0.18893636763095856, "learning_rate": 6.400153007827629e-05, "loss": 0.2037, "step": 4641 }, { "epoch": 3.193074896499812, "grad_norm": 0.15935064852237701, "learning_rate": 6.398694912314831e-05, "loss": 0.1555, "step": 4642 }, { "epoch": 3.193827625141137, "grad_norm": 0.2088753879070282, "learning_rate": 6.39723671852211e-05, "loss": 0.1886, "step": 4643 }, { "epoch": 3.1945803537824613, "grad_norm": 0.21411770582199097, "learning_rate": 6.395778426608931e-05, "loss": 0.1716, "step": 4644 }, { "epoch": 3.195333082423786, "grad_norm": 0.19564911723136902, "learning_rate": 6.39432003673477e-05, "loss": 0.1809, "step": 4645 }, { "epoch": 3.196085811065111, "grad_norm": 0.1975521743297577, "learning_rate": 6.392861549059114e-05, "loss": 0.1791, "step": 4646 }, { "epoch": 3.1968385397064356, "grad_norm": 0.1845906376838684, "learning_rate": 6.391402963741459e-05, "loss": 0.166, "step": 4647 }, { "epoch": 3.1975912683477605, "grad_norm": 0.18016786873340607, "learning_rate": 6.38994428094132e-05, "loss": 0.1109, "step": 4648 }, { "epoch": 3.1983439969890854, "grad_norm": 0.17870868742465973, "learning_rate": 6.388485500818211e-05, "loss": 0.1433, "step": 4649 }, { "epoch": 3.1990967256304104, "grad_norm": 0.22172188758850098, "learning_rate": 6.387026623531661e-05, "loss": 0.1971, "step": 4650 }, { "epoch": 3.199849454271735, "grad_norm": 0.20632247626781464, "learning_rate": 6.385567649241216e-05, "loss": 0.1185, "step": 4651 }, { "epoch": 3.2006021829130598, "grad_norm": 0.1866328865289688, "learning_rate": 6.384108578106423e-05, "loss": 0.1063, "step": 4652 }, { "epoch": 3.2013549115543847, "grad_norm": 0.18351592123508453, "learning_rate": 6.382649410286848e-05, "loss": 0.1608, "step": 4653 }, { "epoch": 3.2021076401957096, "grad_norm": 0.19682841002941132, "learning_rate": 6.381190145942057e-05, "loss": 0.189, "step": 4654 }, { "epoch": 3.202860368837034, "grad_norm": 0.21620753407478333, "learning_rate": 6.379730785231637e-05, "loss": 0.1778, "step": 4655 }, { "epoch": 3.203613097478359, "grad_norm": 0.1770668774843216, "learning_rate": 6.378271328315183e-05, "loss": 0.1212, "step": 4656 }, { "epoch": 3.204365826119684, "grad_norm": 0.17312844097614288, "learning_rate": 6.376811775352293e-05, "loss": 0.1388, "step": 4657 }, { "epoch": 3.205118554761009, "grad_norm": 0.20466788113117218, "learning_rate": 6.375352126502589e-05, "loss": 0.1906, "step": 4658 }, { "epoch": 3.2058712834023333, "grad_norm": 0.17761079967021942, "learning_rate": 6.373892381925692e-05, "loss": 0.1449, "step": 4659 }, { "epoch": 3.206624012043658, "grad_norm": 0.18959423899650574, "learning_rate": 6.372432541781238e-05, "loss": 0.2156, "step": 4660 }, { "epoch": 3.207376740684983, "grad_norm": 0.14687654376029968, "learning_rate": 6.370972606228872e-05, "loss": 0.1763, "step": 4661 }, { "epoch": 3.208129469326308, "grad_norm": 0.19420979917049408, "learning_rate": 6.369512575428251e-05, "loss": 0.1864, "step": 4662 }, { "epoch": 3.2088821979676325, "grad_norm": 0.17795109748840332, "learning_rate": 6.368052449539044e-05, "loss": 0.1556, "step": 4663 }, { "epoch": 3.2096349266089574, "grad_norm": 0.18707360327243805, "learning_rate": 6.366592228720923e-05, "loss": 0.1365, "step": 4664 }, { "epoch": 3.2103876552502824, "grad_norm": 0.1721252053976059, "learning_rate": 6.365131913133582e-05, "loss": 0.1525, "step": 4665 }, { "epoch": 3.2111403838916073, "grad_norm": 0.18144027888774872, "learning_rate": 6.363671502936715e-05, "loss": 0.1356, "step": 4666 }, { "epoch": 3.2118931125329317, "grad_norm": 0.19083134829998016, "learning_rate": 6.36221099829003e-05, "loss": 0.1729, "step": 4667 }, { "epoch": 3.2126458411742567, "grad_norm": 0.20780222117900848, "learning_rate": 6.360750399353247e-05, "loss": 0.1951, "step": 4668 }, { "epoch": 3.2133985698155816, "grad_norm": 0.20391014218330383, "learning_rate": 6.359289706286093e-05, "loss": 0.1824, "step": 4669 }, { "epoch": 3.2141512984569065, "grad_norm": 0.18324242532253265, "learning_rate": 6.35782891924831e-05, "loss": 0.1817, "step": 4670 }, { "epoch": 3.214904027098231, "grad_norm": 0.21003897488117218, "learning_rate": 6.356368038399649e-05, "loss": 0.1433, "step": 4671 }, { "epoch": 3.215656755739556, "grad_norm": 0.20303110778331757, "learning_rate": 6.354907063899864e-05, "loss": 0.2061, "step": 4672 }, { "epoch": 3.216409484380881, "grad_norm": 0.19846008718013763, "learning_rate": 6.353445995908726e-05, "loss": 0.2465, "step": 4673 }, { "epoch": 3.2171622130222053, "grad_norm": 0.21698135137557983, "learning_rate": 6.35198483458602e-05, "loss": 0.1822, "step": 4674 }, { "epoch": 3.21791494166353, "grad_norm": 0.20064522325992584, "learning_rate": 6.350523580091532e-05, "loss": 0.1836, "step": 4675 }, { "epoch": 3.218667670304855, "grad_norm": 0.19532865285873413, "learning_rate": 6.349062232585063e-05, "loss": 0.1863, "step": 4676 }, { "epoch": 3.21942039894618, "grad_norm": 0.22366708517074585, "learning_rate": 6.347600792226429e-05, "loss": 0.1998, "step": 4677 }, { "epoch": 3.2201731275875045, "grad_norm": 0.2003994584083557, "learning_rate": 6.346139259175442e-05, "loss": 0.2027, "step": 4678 }, { "epoch": 3.2209258562288294, "grad_norm": 0.18749278783798218, "learning_rate": 6.34467763359194e-05, "loss": 0.1625, "step": 4679 }, { "epoch": 3.2216785848701543, "grad_norm": 0.16943515837192535, "learning_rate": 6.343215915635762e-05, "loss": 0.1286, "step": 4680 }, { "epoch": 3.2224313135114793, "grad_norm": 0.25707218050956726, "learning_rate": 6.341754105466759e-05, "loss": 0.2652, "step": 4681 }, { "epoch": 3.2231840421528037, "grad_norm": 0.197025328874588, "learning_rate": 6.340292203244791e-05, "loss": 0.207, "step": 4682 }, { "epoch": 3.2239367707941287, "grad_norm": 0.19399450719356537, "learning_rate": 6.338830209129734e-05, "loss": 0.1612, "step": 4683 }, { "epoch": 3.2246894994354536, "grad_norm": 0.2058074027299881, "learning_rate": 6.337368123281464e-05, "loss": 0.1463, "step": 4684 }, { "epoch": 3.2254422280767785, "grad_norm": 0.20488257706165314, "learning_rate": 6.335905945859876e-05, "loss": 0.1514, "step": 4685 }, { "epoch": 3.226194956718103, "grad_norm": 0.22363929450511932, "learning_rate": 6.33444367702487e-05, "loss": 0.1855, "step": 4686 }, { "epoch": 3.226947685359428, "grad_norm": 0.25517722964286804, "learning_rate": 6.332981316936357e-05, "loss": 0.2039, "step": 4687 }, { "epoch": 3.227700414000753, "grad_norm": 0.19564609229564667, "learning_rate": 6.331518865754262e-05, "loss": 0.1879, "step": 4688 }, { "epoch": 3.2284531426420777, "grad_norm": 0.19771243631839752, "learning_rate": 6.330056323638514e-05, "loss": 0.213, "step": 4689 }, { "epoch": 3.229205871283402, "grad_norm": 0.19725561141967773, "learning_rate": 6.328593690749052e-05, "loss": 0.2714, "step": 4690 }, { "epoch": 3.229958599924727, "grad_norm": 0.2344287782907486, "learning_rate": 6.327130967245833e-05, "loss": 0.2528, "step": 4691 }, { "epoch": 3.230711328566052, "grad_norm": 0.19455170631408691, "learning_rate": 6.325668153288811e-05, "loss": 0.2573, "step": 4692 }, { "epoch": 3.2314640572073765, "grad_norm": 0.19224749505519867, "learning_rate": 6.324205249037964e-05, "loss": 0.2041, "step": 4693 }, { "epoch": 3.2322167858487014, "grad_norm": 0.19759956002235413, "learning_rate": 6.322742254653271e-05, "loss": 0.245, "step": 4694 }, { "epoch": 3.2329695144900263, "grad_norm": 0.2052445262670517, "learning_rate": 6.321279170294718e-05, "loss": 0.2138, "step": 4695 }, { "epoch": 3.2337222431313513, "grad_norm": 0.20258815586566925, "learning_rate": 6.319815996122314e-05, "loss": 0.1978, "step": 4696 }, { "epoch": 3.234474971772676, "grad_norm": 0.18045562505722046, "learning_rate": 6.318352732296063e-05, "loss": 0.1213, "step": 4697 }, { "epoch": 3.2352277004140007, "grad_norm": 0.19980540871620178, "learning_rate": 6.316889378975987e-05, "loss": 0.1532, "step": 4698 }, { "epoch": 3.2359804290553256, "grad_norm": 0.1736702024936676, "learning_rate": 6.315425936322118e-05, "loss": 0.194, "step": 4699 }, { "epoch": 3.2367331576966505, "grad_norm": 0.1834656447172165, "learning_rate": 6.313962404494496e-05, "loss": 0.1941, "step": 4700 }, { "epoch": 3.237485886337975, "grad_norm": 0.18737372756004333, "learning_rate": 6.312498783653166e-05, "loss": 0.1458, "step": 4701 }, { "epoch": 3.2382386149793, "grad_norm": 0.20622701942920685, "learning_rate": 6.311035073958192e-05, "loss": 0.2062, "step": 4702 }, { "epoch": 3.238991343620625, "grad_norm": 0.17516976594924927, "learning_rate": 6.309571275569643e-05, "loss": 0.1174, "step": 4703 }, { "epoch": 3.2397440722619497, "grad_norm": 0.18967661261558533, "learning_rate": 6.308107388647595e-05, "loss": 0.173, "step": 4704 }, { "epoch": 3.240496800903274, "grad_norm": 0.2004508078098297, "learning_rate": 6.306643413352138e-05, "loss": 0.1899, "step": 4705 }, { "epoch": 3.241249529544599, "grad_norm": 0.17779800295829773, "learning_rate": 6.30517934984337e-05, "loss": 0.2061, "step": 4706 }, { "epoch": 3.242002258185924, "grad_norm": 0.18336834013462067, "learning_rate": 6.303715198281399e-05, "loss": 0.1151, "step": 4707 }, { "epoch": 3.242754986827249, "grad_norm": 0.17025627195835114, "learning_rate": 6.302250958826343e-05, "loss": 0.1398, "step": 4708 }, { "epoch": 3.2435077154685734, "grad_norm": 0.17571812868118286, "learning_rate": 6.300786631638327e-05, "loss": 0.1214, "step": 4709 }, { "epoch": 3.2442604441098983, "grad_norm": 0.17428156733512878, "learning_rate": 6.299322216877488e-05, "loss": 0.1778, "step": 4710 }, { "epoch": 3.2450131727512233, "grad_norm": 0.1981152594089508, "learning_rate": 6.297857714703977e-05, "loss": 0.1918, "step": 4711 }, { "epoch": 3.245765901392548, "grad_norm": 0.20587041974067688, "learning_rate": 6.296393125277941e-05, "loss": 0.119, "step": 4712 }, { "epoch": 3.2465186300338726, "grad_norm": 0.20384235680103302, "learning_rate": 6.294928448759555e-05, "loss": 0.1921, "step": 4713 }, { "epoch": 3.2472713586751976, "grad_norm": 0.23089508712291718, "learning_rate": 6.293463685308985e-05, "loss": 0.2054, "step": 4714 }, { "epoch": 3.2480240873165225, "grad_norm": 0.1763884723186493, "learning_rate": 6.291998835086423e-05, "loss": 0.1079, "step": 4715 }, { "epoch": 3.2487768159578474, "grad_norm": 0.1994752287864685, "learning_rate": 6.290533898252058e-05, "loss": 0.2087, "step": 4716 }, { "epoch": 3.249529544599172, "grad_norm": 0.20196372270584106, "learning_rate": 6.289068874966092e-05, "loss": 0.14, "step": 4717 }, { "epoch": 3.250282273240497, "grad_norm": 0.21460670232772827, "learning_rate": 6.287603765388743e-05, "loss": 0.1438, "step": 4718 }, { "epoch": 3.2510350018818217, "grad_norm": 0.20358332991600037, "learning_rate": 6.286138569680229e-05, "loss": 0.1815, "step": 4719 }, { "epoch": 3.251787730523146, "grad_norm": 0.19143632054328918, "learning_rate": 6.284673288000783e-05, "loss": 0.1593, "step": 4720 }, { "epoch": 3.252540459164471, "grad_norm": 0.2232847809791565, "learning_rate": 6.283207920510646e-05, "loss": 0.1946, "step": 4721 }, { "epoch": 3.253293187805796, "grad_norm": 0.23224112391471863, "learning_rate": 6.281742467370068e-05, "loss": 0.1462, "step": 4722 }, { "epoch": 3.254045916447121, "grad_norm": 0.1732797622680664, "learning_rate": 6.280276928739312e-05, "loss": 0.1317, "step": 4723 }, { "epoch": 3.254798645088446, "grad_norm": 0.23456670343875885, "learning_rate": 6.27881130477864e-05, "loss": 0.2213, "step": 4724 }, { "epoch": 3.2555513737297703, "grad_norm": 0.17638368904590607, "learning_rate": 6.277345595648337e-05, "loss": 0.1791, "step": 4725 }, { "epoch": 3.2563041023710952, "grad_norm": 0.21282044053077698, "learning_rate": 6.275879801508688e-05, "loss": 0.3369, "step": 4726 }, { "epoch": 3.25705683101242, "grad_norm": 0.20811055600643158, "learning_rate": 6.27441392251999e-05, "loss": 0.1742, "step": 4727 }, { "epoch": 3.2578095596537446, "grad_norm": 0.20685669779777527, "learning_rate": 6.27294795884255e-05, "loss": 0.1896, "step": 4728 }, { "epoch": 3.2585622882950696, "grad_norm": 0.1898210495710373, "learning_rate": 6.271481910636684e-05, "loss": 0.1574, "step": 4729 }, { "epoch": 3.2593150169363945, "grad_norm": 0.21166200935840607, "learning_rate": 6.270015778062715e-05, "loss": 0.126, "step": 4730 }, { "epoch": 3.2600677455777194, "grad_norm": 0.19097602367401123, "learning_rate": 6.268549561280979e-05, "loss": 0.1587, "step": 4731 }, { "epoch": 3.260820474219044, "grad_norm": 0.1626196801662445, "learning_rate": 6.267083260451819e-05, "loss": 0.1556, "step": 4732 }, { "epoch": 3.261573202860369, "grad_norm": 0.16846977174282074, "learning_rate": 6.265616875735585e-05, "loss": 0.1415, "step": 4733 }, { "epoch": 3.2623259315016937, "grad_norm": 0.20201344788074493, "learning_rate": 6.264150407292641e-05, "loss": 0.1881, "step": 4734 }, { "epoch": 3.2630786601430186, "grad_norm": 0.3998895287513733, "learning_rate": 6.26268385528336e-05, "loss": 0.1326, "step": 4735 }, { "epoch": 3.263831388784343, "grad_norm": 0.16424348950386047, "learning_rate": 6.261217219868117e-05, "loss": 0.0838, "step": 4736 }, { "epoch": 3.264584117425668, "grad_norm": 0.17401570081710815, "learning_rate": 6.259750501207302e-05, "loss": 0.1243, "step": 4737 }, { "epoch": 3.265336846066993, "grad_norm": 0.1984204798936844, "learning_rate": 6.258283699461318e-05, "loss": 0.1435, "step": 4738 }, { "epoch": 3.266089574708318, "grad_norm": 0.1871737539768219, "learning_rate": 6.256816814790566e-05, "loss": 0.1521, "step": 4739 }, { "epoch": 3.2668423033496423, "grad_norm": 0.16600817441940308, "learning_rate": 6.255349847355465e-05, "loss": 0.155, "step": 4740 }, { "epoch": 3.2675950319909672, "grad_norm": 0.19609519839286804, "learning_rate": 6.253882797316441e-05, "loss": 0.2051, "step": 4741 }, { "epoch": 3.268347760632292, "grad_norm": 0.2052372395992279, "learning_rate": 6.252415664833928e-05, "loss": 0.1497, "step": 4742 }, { "epoch": 3.269100489273617, "grad_norm": 0.21677805483341217, "learning_rate": 6.25094845006837e-05, "loss": 0.2145, "step": 4743 }, { "epoch": 3.2698532179149415, "grad_norm": 0.1989600956439972, "learning_rate": 6.249481153180217e-05, "loss": 0.2874, "step": 4744 }, { "epoch": 3.2706059465562665, "grad_norm": 0.19842439889907837, "learning_rate": 6.248013774329932e-05, "loss": 0.196, "step": 4745 }, { "epoch": 3.2713586751975914, "grad_norm": 0.18449734151363373, "learning_rate": 6.246546313677986e-05, "loss": 0.1617, "step": 4746 }, { "epoch": 3.272111403838916, "grad_norm": 0.17659103870391846, "learning_rate": 6.245078771384858e-05, "loss": 0.1648, "step": 4747 }, { "epoch": 3.2728641324802408, "grad_norm": 0.1811683177947998, "learning_rate": 6.243611147611035e-05, "loss": 0.1388, "step": 4748 }, { "epoch": 3.2736168611215657, "grad_norm": 0.18005210161209106, "learning_rate": 6.242143442517013e-05, "loss": 0.2383, "step": 4749 }, { "epoch": 3.2743695897628906, "grad_norm": 0.174529030919075, "learning_rate": 6.240675656263303e-05, "loss": 0.153, "step": 4750 }, { "epoch": 3.2751223184042155, "grad_norm": 0.18394576013088226, "learning_rate": 6.239207789010416e-05, "loss": 0.1816, "step": 4751 }, { "epoch": 3.27587504704554, "grad_norm": 0.2046894133090973, "learning_rate": 6.237739840918875e-05, "loss": 0.1936, "step": 4752 }, { "epoch": 3.276627775686865, "grad_norm": 0.2115330845117569, "learning_rate": 6.236271812149216e-05, "loss": 0.2607, "step": 4753 }, { "epoch": 3.27738050432819, "grad_norm": 0.15641799569129944, "learning_rate": 6.234803702861979e-05, "loss": 0.1791, "step": 4754 }, { "epoch": 3.2781332329695143, "grad_norm": 0.15921571850776672, "learning_rate": 6.233335513217711e-05, "loss": 0.1304, "step": 4755 }, { "epoch": 3.2788859616108392, "grad_norm": 0.19671767950057983, "learning_rate": 6.231867243376977e-05, "loss": 0.1982, "step": 4756 }, { "epoch": 3.279638690252164, "grad_norm": 0.19083505868911743, "learning_rate": 6.23039889350034e-05, "loss": 0.208, "step": 4757 }, { "epoch": 3.280391418893489, "grad_norm": 0.183948814868927, "learning_rate": 6.228930463748378e-05, "loss": 0.1097, "step": 4758 }, { "epoch": 3.2811441475348135, "grad_norm": 0.18074122071266174, "learning_rate": 6.227461954281677e-05, "loss": 0.1779, "step": 4759 }, { "epoch": 3.2818968761761385, "grad_norm": 0.18766562640666962, "learning_rate": 6.22599336526083e-05, "loss": 0.1866, "step": 4760 }, { "epoch": 3.2826496048174634, "grad_norm": 0.1908964067697525, "learning_rate": 6.22452469684644e-05, "loss": 0.1979, "step": 4761 }, { "epoch": 3.2834023334587883, "grad_norm": 0.18319222331047058, "learning_rate": 6.223055949199118e-05, "loss": 0.1299, "step": 4762 }, { "epoch": 3.2841550621001128, "grad_norm": 0.18285229802131653, "learning_rate": 6.221587122479486e-05, "loss": 0.1507, "step": 4763 }, { "epoch": 3.2849077907414377, "grad_norm": 0.20451080799102783, "learning_rate": 6.220118216848173e-05, "loss": 0.1745, "step": 4764 }, { "epoch": 3.2856605193827626, "grad_norm": 0.16088837385177612, "learning_rate": 6.21864923246581e-05, "loss": 0.1427, "step": 4765 }, { "epoch": 3.286413248024087, "grad_norm": 0.25192540884017944, "learning_rate": 6.217180169493049e-05, "loss": 0.2449, "step": 4766 }, { "epoch": 3.287165976665412, "grad_norm": 0.1852893829345703, "learning_rate": 6.215711028090544e-05, "loss": 0.1763, "step": 4767 }, { "epoch": 3.287918705306737, "grad_norm": 0.19891943037509918, "learning_rate": 6.214241808418956e-05, "loss": 0.1814, "step": 4768 }, { "epoch": 3.288671433948062, "grad_norm": 0.18417364358901978, "learning_rate": 6.212772510638956e-05, "loss": 0.1285, "step": 4769 }, { "epoch": 3.2894241625893867, "grad_norm": 0.1907896101474762, "learning_rate": 6.211303134911229e-05, "loss": 0.1459, "step": 4770 }, { "epoch": 3.290176891230711, "grad_norm": 0.18533289432525635, "learning_rate": 6.209833681396459e-05, "loss": 0.1343, "step": 4771 }, { "epoch": 3.290929619872036, "grad_norm": 0.17557038366794586, "learning_rate": 6.208364150255345e-05, "loss": 0.1735, "step": 4772 }, { "epoch": 3.291682348513361, "grad_norm": 0.1821289211511612, "learning_rate": 6.206894541648595e-05, "loss": 0.142, "step": 4773 }, { "epoch": 3.2924350771546855, "grad_norm": 0.19677112996578217, "learning_rate": 6.205424855736917e-05, "loss": 0.1546, "step": 4774 }, { "epoch": 3.2931878057960104, "grad_norm": 0.20692162215709686, "learning_rate": 6.203955092681039e-05, "loss": 0.2701, "step": 4775 }, { "epoch": 3.2939405344373354, "grad_norm": 0.16722612082958221, "learning_rate": 6.202485252641692e-05, "loss": 0.1403, "step": 4776 }, { "epoch": 3.2946932630786603, "grad_norm": 0.21250614523887634, "learning_rate": 6.201015335779612e-05, "loss": 0.1836, "step": 4777 }, { "epoch": 3.2954459917199848, "grad_norm": 0.22311818599700928, "learning_rate": 6.199545342255549e-05, "loss": 0.2204, "step": 4778 }, { "epoch": 3.2961987203613097, "grad_norm": 0.19404184818267822, "learning_rate": 6.198075272230258e-05, "loss": 0.201, "step": 4779 }, { "epoch": 3.2969514490026346, "grad_norm": 0.21300208568572998, "learning_rate": 6.196605125864507e-05, "loss": 0.1805, "step": 4780 }, { "epoch": 3.2977041776439595, "grad_norm": 0.17813514173030853, "learning_rate": 6.195134903319066e-05, "loss": 0.1286, "step": 4781 }, { "epoch": 3.298456906285284, "grad_norm": 0.1813291311264038, "learning_rate": 6.193664604754716e-05, "loss": 0.126, "step": 4782 }, { "epoch": 3.299209634926609, "grad_norm": 0.18521572649478912, "learning_rate": 6.192194230332248e-05, "loss": 0.2124, "step": 4783 }, { "epoch": 3.299962363567934, "grad_norm": 0.18094705045223236, "learning_rate": 6.19072378021246e-05, "loss": 0.137, "step": 4784 }, { "epoch": 3.3007150922092587, "grad_norm": 0.20759542286396027, "learning_rate": 6.189253254556159e-05, "loss": 0.2216, "step": 4785 }, { "epoch": 3.301467820850583, "grad_norm": 0.1763729602098465, "learning_rate": 6.187782653524157e-05, "loss": 0.1361, "step": 4786 }, { "epoch": 3.302220549491908, "grad_norm": 0.18265850841999054, "learning_rate": 6.186311977277278e-05, "loss": 0.1491, "step": 4787 }, { "epoch": 3.302973278133233, "grad_norm": 0.18708747625350952, "learning_rate": 6.184841225976354e-05, "loss": 0.1657, "step": 4788 }, { "epoch": 3.303726006774558, "grad_norm": 0.20207493007183075, "learning_rate": 6.183370399782224e-05, "loss": 0.2161, "step": 4789 }, { "epoch": 3.3044787354158824, "grad_norm": 0.2126210778951645, "learning_rate": 6.181899498855732e-05, "loss": 0.1776, "step": 4790 }, { "epoch": 3.3052314640572074, "grad_norm": 0.19466866552829742, "learning_rate": 6.180428523357738e-05, "loss": 0.1302, "step": 4791 }, { "epoch": 3.3059841926985323, "grad_norm": 0.15739630162715912, "learning_rate": 6.178957473449102e-05, "loss": 0.1918, "step": 4792 }, { "epoch": 3.3067369213398567, "grad_norm": 0.19697900116443634, "learning_rate": 6.1774863492907e-05, "loss": 0.1906, "step": 4793 }, { "epoch": 3.3074896499811817, "grad_norm": 0.15402117371559143, "learning_rate": 6.176015151043407e-05, "loss": 0.1105, "step": 4794 }, { "epoch": 3.3082423786225066, "grad_norm": 0.1826791763305664, "learning_rate": 6.174543878868114e-05, "loss": 0.1599, "step": 4795 }, { "epoch": 3.3089951072638315, "grad_norm": 0.18497861921787262, "learning_rate": 6.17307253292572e-05, "loss": 0.1584, "step": 4796 }, { "epoch": 3.3097478359051564, "grad_norm": 0.22330087423324585, "learning_rate": 6.17160111337712e-05, "loss": 0.2919, "step": 4797 }, { "epoch": 3.310500564546481, "grad_norm": 0.19389857351779938, "learning_rate": 6.170129620383235e-05, "loss": 0.1925, "step": 4798 }, { "epoch": 3.311253293187806, "grad_norm": 0.17863425612449646, "learning_rate": 6.168658054104982e-05, "loss": 0.1335, "step": 4799 }, { "epoch": 3.3120060218291307, "grad_norm": 0.21079805493354797, "learning_rate": 6.167186414703289e-05, "loss": 0.1731, "step": 4800 }, { "epoch": 3.3120060218291307, "eval_loss": 0.20823362469673157, "eval_runtime": 455.8958, "eval_samples_per_second": 21.117, "eval_steps_per_second": 0.66, "step": 4800 }, { "epoch": 3.312758750470455, "grad_norm": 0.22168156504631042, "learning_rate": 6.165714702339093e-05, "loss": 0.2373, "step": 4801 }, { "epoch": 3.31351147911178, "grad_norm": 0.18873058259487152, "learning_rate": 6.164242917173339e-05, "loss": 0.1382, "step": 4802 }, { "epoch": 3.314264207753105, "grad_norm": 0.1717524528503418, "learning_rate": 6.162771059366976e-05, "loss": 0.1559, "step": 4803 }, { "epoch": 3.31501693639443, "grad_norm": 0.22441449761390686, "learning_rate": 6.16129912908097e-05, "loss": 0.1562, "step": 4804 }, { "epoch": 3.3157696650357544, "grad_norm": 0.20449934899806976, "learning_rate": 6.159827126476282e-05, "loss": 0.1527, "step": 4805 }, { "epoch": 3.3165223936770794, "grad_norm": 0.19912491738796234, "learning_rate": 6.158355051713894e-05, "loss": 0.1137, "step": 4806 }, { "epoch": 3.3172751223184043, "grad_norm": 0.2089005708694458, "learning_rate": 6.156882904954785e-05, "loss": 0.2094, "step": 4807 }, { "epoch": 3.318027850959729, "grad_norm": 0.176935076713562, "learning_rate": 6.15541068635995e-05, "loss": 0.1592, "step": 4808 }, { "epoch": 3.3187805796010537, "grad_norm": 0.18406526744365692, "learning_rate": 6.153938396090389e-05, "loss": 0.2294, "step": 4809 }, { "epoch": 3.3195333082423786, "grad_norm": 0.17631487548351288, "learning_rate": 6.152466034307107e-05, "loss": 0.1785, "step": 4810 }, { "epoch": 3.3202860368837035, "grad_norm": 0.17986898124217987, "learning_rate": 6.150993601171121e-05, "loss": 0.1925, "step": 4811 }, { "epoch": 3.3210387655250284, "grad_norm": 0.1941087394952774, "learning_rate": 6.149521096843454e-05, "loss": 0.1522, "step": 4812 }, { "epoch": 3.321791494166353, "grad_norm": 0.19979460537433624, "learning_rate": 6.148048521485134e-05, "loss": 0.1796, "step": 4813 }, { "epoch": 3.322544222807678, "grad_norm": 0.18790863454341888, "learning_rate": 6.146575875257205e-05, "loss": 0.1217, "step": 4814 }, { "epoch": 3.3232969514490027, "grad_norm": 0.21025025844573975, "learning_rate": 6.145103158320708e-05, "loss": 0.1636, "step": 4815 }, { "epoch": 3.3240496800903276, "grad_norm": 0.21443144977092743, "learning_rate": 6.143630370836702e-05, "loss": 0.2237, "step": 4816 }, { "epoch": 3.324802408731652, "grad_norm": 0.231980100274086, "learning_rate": 6.142157512966245e-05, "loss": 0.2015, "step": 4817 }, { "epoch": 3.325555137372977, "grad_norm": 0.18538667261600494, "learning_rate": 6.140684584870408e-05, "loss": 0.0942, "step": 4818 }, { "epoch": 3.326307866014302, "grad_norm": 0.19289861619472504, "learning_rate": 6.139211586710267e-05, "loss": 0.1694, "step": 4819 }, { "epoch": 3.3270605946556264, "grad_norm": 0.19856204092502594, "learning_rate": 6.13773851864691e-05, "loss": 0.2323, "step": 4820 }, { "epoch": 3.3278133232969513, "grad_norm": 0.19443252682685852, "learning_rate": 6.136265380841425e-05, "loss": 0.1365, "step": 4821 }, { "epoch": 3.3285660519382763, "grad_norm": 0.19710507988929749, "learning_rate": 6.134792173454915e-05, "loss": 0.1582, "step": 4822 }, { "epoch": 3.329318780579601, "grad_norm": 0.21354971826076508, "learning_rate": 6.133318896648489e-05, "loss": 0.214, "step": 4823 }, { "epoch": 3.330071509220926, "grad_norm": 0.19793692231178284, "learning_rate": 6.13184555058326e-05, "loss": 0.1668, "step": 4824 }, { "epoch": 3.3308242378622506, "grad_norm": 0.1887993961572647, "learning_rate": 6.130372135420351e-05, "loss": 0.1096, "step": 4825 }, { "epoch": 3.3315769665035755, "grad_norm": 0.20928895473480225, "learning_rate": 6.128898651320893e-05, "loss": 0.226, "step": 4826 }, { "epoch": 3.3323296951449004, "grad_norm": 0.22564125061035156, "learning_rate": 6.127425098446026e-05, "loss": 0.184, "step": 4827 }, { "epoch": 3.333082423786225, "grad_norm": 0.1875276118516922, "learning_rate": 6.125951476956891e-05, "loss": 0.1726, "step": 4828 }, { "epoch": 3.33383515242755, "grad_norm": 0.19694404304027557, "learning_rate": 6.124477787014644e-05, "loss": 0.1138, "step": 4829 }, { "epoch": 3.3345878810688747, "grad_norm": 0.20832620561122894, "learning_rate": 6.123004028780445e-05, "loss": 0.179, "step": 4830 }, { "epoch": 3.3353406097101996, "grad_norm": 0.18927790224552155, "learning_rate": 6.121530202415463e-05, "loss": 0.2291, "step": 4831 }, { "epoch": 3.336093338351524, "grad_norm": 0.20648564398288727, "learning_rate": 6.120056308080872e-05, "loss": 0.203, "step": 4832 }, { "epoch": 3.336846066992849, "grad_norm": 0.1991475224494934, "learning_rate": 6.118582345937854e-05, "loss": 0.1511, "step": 4833 }, { "epoch": 3.337598795634174, "grad_norm": 0.20210270583629608, "learning_rate": 6.117108316147604e-05, "loss": 0.2213, "step": 4834 }, { "epoch": 3.338351524275499, "grad_norm": 0.22356632351875305, "learning_rate": 6.115634218871314e-05, "loss": 0.1887, "step": 4835 }, { "epoch": 3.3391042529168233, "grad_norm": 0.19674772024154663, "learning_rate": 6.114160054270191e-05, "loss": 0.1723, "step": 4836 }, { "epoch": 3.3398569815581483, "grad_norm": 0.20833362638950348, "learning_rate": 6.11268582250545e-05, "loss": 0.1663, "step": 4837 }, { "epoch": 3.340609710199473, "grad_norm": 0.19646352529525757, "learning_rate": 6.111211523738307e-05, "loss": 0.157, "step": 4838 }, { "epoch": 3.3413624388407976, "grad_norm": 0.19837777316570282, "learning_rate": 6.109737158129991e-05, "loss": 0.1601, "step": 4839 }, { "epoch": 3.3421151674821226, "grad_norm": 0.16965851187705994, "learning_rate": 6.108262725841736e-05, "loss": 0.1246, "step": 4840 }, { "epoch": 3.3428678961234475, "grad_norm": 0.1630457639694214, "learning_rate": 6.106788227034785e-05, "loss": 0.1125, "step": 4841 }, { "epoch": 3.3436206247647724, "grad_norm": 0.22628186643123627, "learning_rate": 6.105313661870384e-05, "loss": 0.2261, "step": 4842 }, { "epoch": 3.3443733534060973, "grad_norm": 0.19424013793468475, "learning_rate": 6.103839030509793e-05, "loss": 0.1546, "step": 4843 }, { "epoch": 3.345126082047422, "grad_norm": 0.1979076862335205, "learning_rate": 6.102364333114272e-05, "loss": 0.154, "step": 4844 }, { "epoch": 3.3458788106887467, "grad_norm": 0.20769770443439484, "learning_rate": 6.100889569845093e-05, "loss": 0.1981, "step": 4845 }, { "epoch": 3.3466315393300716, "grad_norm": 0.18503305315971375, "learning_rate": 6.0994147408635346e-05, "loss": 0.1291, "step": 4846 }, { "epoch": 3.347384267971396, "grad_norm": 0.19132985174655914, "learning_rate": 6.097939846330882e-05, "loss": 0.1548, "step": 4847 }, { "epoch": 3.348136996612721, "grad_norm": 0.2158019095659256, "learning_rate": 6.096464886408426e-05, "loss": 0.2326, "step": 4848 }, { "epoch": 3.348889725254046, "grad_norm": 0.17837822437286377, "learning_rate": 6.094989861257466e-05, "loss": 0.1766, "step": 4849 }, { "epoch": 3.349642453895371, "grad_norm": 0.15252086520195007, "learning_rate": 6.0935147710393117e-05, "loss": 0.1031, "step": 4850 }, { "epoch": 3.3503951825366958, "grad_norm": 0.22910049557685852, "learning_rate": 6.0920396159152716e-05, "loss": 0.156, "step": 4851 }, { "epoch": 3.3511479111780202, "grad_norm": 0.19137181341648102, "learning_rate": 6.0905643960466704e-05, "loss": 0.1586, "step": 4852 }, { "epoch": 3.351900639819345, "grad_norm": 0.18632861971855164, "learning_rate": 6.0890891115948353e-05, "loss": 0.1798, "step": 4853 }, { "epoch": 3.35265336846067, "grad_norm": 0.18358854949474335, "learning_rate": 6.0876137627211004e-05, "loss": 0.1861, "step": 4854 }, { "epoch": 3.3534060971019946, "grad_norm": 0.21665984392166138, "learning_rate": 6.0861383495868076e-05, "loss": 0.2557, "step": 4855 }, { "epoch": 3.3541588257433195, "grad_norm": 0.22371074557304382, "learning_rate": 6.084662872353306e-05, "loss": 0.2256, "step": 4856 }, { "epoch": 3.3549115543846444, "grad_norm": 0.16305944323539734, "learning_rate": 6.083187331181951e-05, "loss": 0.1655, "step": 4857 }, { "epoch": 3.3556642830259693, "grad_norm": 0.17689016461372375, "learning_rate": 6.0817117262341083e-05, "loss": 0.1892, "step": 4858 }, { "epoch": 3.356417011667294, "grad_norm": 0.19227847456932068, "learning_rate": 6.080236057671144e-05, "loss": 0.187, "step": 4859 }, { "epoch": 3.3571697403086187, "grad_norm": 0.1889904886484146, "learning_rate": 6.078760325654438e-05, "loss": 0.1957, "step": 4860 }, { "epoch": 3.3579224689499436, "grad_norm": 0.19437485933303833, "learning_rate": 6.077284530345372e-05, "loss": 0.1749, "step": 4861 }, { "epoch": 3.3586751975912685, "grad_norm": 0.20790977776050568, "learning_rate": 6.075808671905339e-05, "loss": 0.2252, "step": 4862 }, { "epoch": 3.359427926232593, "grad_norm": 0.2308814376592636, "learning_rate": 6.0743327504957335e-05, "loss": 0.1996, "step": 4863 }, { "epoch": 3.360180654873918, "grad_norm": 0.17640991508960724, "learning_rate": 6.0728567662779635e-05, "loss": 0.1598, "step": 4864 }, { "epoch": 3.360933383515243, "grad_norm": 0.18564154207706451, "learning_rate": 6.071380719413439e-05, "loss": 0.1573, "step": 4865 }, { "epoch": 3.3616861121565673, "grad_norm": 0.17296002805233002, "learning_rate": 6.069904610063577e-05, "loss": 0.1357, "step": 4866 }, { "epoch": 3.3624388407978922, "grad_norm": 0.2124079465866089, "learning_rate": 6.0684284383898026e-05, "loss": 0.1297, "step": 4867 }, { "epoch": 3.363191569439217, "grad_norm": 0.2160569429397583, "learning_rate": 6.066952204553551e-05, "loss": 0.2159, "step": 4868 }, { "epoch": 3.363944298080542, "grad_norm": 0.18709415197372437, "learning_rate": 6.0654759087162586e-05, "loss": 0.1376, "step": 4869 }, { "epoch": 3.364697026721867, "grad_norm": 0.1866178959608078, "learning_rate": 6.06399955103937e-05, "loss": 0.1412, "step": 4870 }, { "epoch": 3.3654497553631915, "grad_norm": 0.19732236862182617, "learning_rate": 6.0625231316843375e-05, "loss": 0.2282, "step": 4871 }, { "epoch": 3.3662024840045164, "grad_norm": 0.21223555505275726, "learning_rate": 6.061046650812623e-05, "loss": 0.1006, "step": 4872 }, { "epoch": 3.3669552126458413, "grad_norm": 0.17765672504901886, "learning_rate": 6.0595701085856895e-05, "loss": 0.1649, "step": 4873 }, { "epoch": 3.3677079412871658, "grad_norm": 0.21199952065944672, "learning_rate": 6.05809350516501e-05, "loss": 0.1782, "step": 4874 }, { "epoch": 3.3684606699284907, "grad_norm": 0.19238907098770142, "learning_rate": 6.056616840712065e-05, "loss": 0.1859, "step": 4875 }, { "epoch": 3.3692133985698156, "grad_norm": 0.1729966551065445, "learning_rate": 6.055140115388338e-05, "loss": 0.1689, "step": 4876 }, { "epoch": 3.3699661272111405, "grad_norm": 0.1997547298669815, "learning_rate": 6.0536633293553236e-05, "loss": 0.1354, "step": 4877 }, { "epoch": 3.370718855852465, "grad_norm": 0.21082064509391785, "learning_rate": 6.052186482774519e-05, "loss": 0.1861, "step": 4878 }, { "epoch": 3.37147158449379, "grad_norm": 0.17605343461036682, "learning_rate": 6.05070957580743e-05, "loss": 0.19, "step": 4879 }, { "epoch": 3.372224313135115, "grad_norm": 0.20440395176410675, "learning_rate": 6.049232608615571e-05, "loss": 0.189, "step": 4880 }, { "epoch": 3.3729770417764398, "grad_norm": 0.21502059698104858, "learning_rate": 6.0477555813604604e-05, "loss": 0.1496, "step": 4881 }, { "epoch": 3.3737297704177642, "grad_norm": 0.2085476666688919, "learning_rate": 6.046278494203622e-05, "loss": 0.1952, "step": 4882 }, { "epoch": 3.374482499059089, "grad_norm": 0.22234423458576202, "learning_rate": 6.044801347306589e-05, "loss": 0.1716, "step": 4883 }, { "epoch": 3.375235227700414, "grad_norm": 0.22486817836761475, "learning_rate": 6.0433241408308995e-05, "loss": 0.1616, "step": 4884 }, { "epoch": 3.375987956341739, "grad_norm": 0.20510534942150116, "learning_rate": 6.0418468749380996e-05, "loss": 0.1484, "step": 4885 }, { "epoch": 3.3767406849830635, "grad_norm": 0.17492778599262238, "learning_rate": 6.04036954978974e-05, "loss": 0.1279, "step": 4886 }, { "epoch": 3.3774934136243884, "grad_norm": 0.18785730004310608, "learning_rate": 6.0388921655473774e-05, "loss": 0.1774, "step": 4887 }, { "epoch": 3.3782461422657133, "grad_norm": 0.20936524868011475, "learning_rate": 6.0374147223725786e-05, "loss": 0.2961, "step": 4888 }, { "epoch": 3.378998870907038, "grad_norm": 0.20216108858585358, "learning_rate": 6.035937220426915e-05, "loss": 0.1442, "step": 4889 }, { "epoch": 3.3797515995483627, "grad_norm": 0.17251093685626984, "learning_rate": 6.03445965987196e-05, "loss": 0.1113, "step": 4890 }, { "epoch": 3.3805043281896876, "grad_norm": 0.1925877034664154, "learning_rate": 6.0329820408693025e-05, "loss": 0.1296, "step": 4891 }, { "epoch": 3.3812570568310125, "grad_norm": 0.21280813217163086, "learning_rate": 6.0315043635805304e-05, "loss": 0.1644, "step": 4892 }, { "epoch": 3.382009785472337, "grad_norm": 0.19690364599227905, "learning_rate": 6.0300266281672394e-05, "loss": 0.1365, "step": 4893 }, { "epoch": 3.382762514113662, "grad_norm": 0.14824122190475464, "learning_rate": 6.028548834791034e-05, "loss": 0.1076, "step": 4894 }, { "epoch": 3.383515242754987, "grad_norm": 0.22728797793388367, "learning_rate": 6.027070983613522e-05, "loss": 0.204, "step": 4895 }, { "epoch": 3.3842679713963117, "grad_norm": 0.17758288979530334, "learning_rate": 6.025593074796318e-05, "loss": 0.1702, "step": 4896 }, { "epoch": 3.3850207000376367, "grad_norm": 0.1952756643295288, "learning_rate": 6.0241151085010484e-05, "loss": 0.089, "step": 4897 }, { "epoch": 3.385773428678961, "grad_norm": 0.18838992714881897, "learning_rate": 6.022637084889338e-05, "loss": 0.1198, "step": 4898 }, { "epoch": 3.386526157320286, "grad_norm": 0.16244810819625854, "learning_rate": 6.021159004122821e-05, "loss": 0.1548, "step": 4899 }, { "epoch": 3.387278885961611, "grad_norm": 0.19997312128543854, "learning_rate": 6.019680866363139e-05, "loss": 0.1992, "step": 4900 }, { "epoch": 3.3880316146029354, "grad_norm": 0.21864870190620422, "learning_rate": 6.0182026717719373e-05, "loss": 0.1803, "step": 4901 }, { "epoch": 3.3887843432442604, "grad_norm": 0.20181266963481903, "learning_rate": 6.016724420510872e-05, "loss": 0.1703, "step": 4902 }, { "epoch": 3.3895370718855853, "grad_norm": 0.20725156366825104, "learning_rate": 6.015246112741601e-05, "loss": 0.1223, "step": 4903 }, { "epoch": 3.39028980052691, "grad_norm": 0.15834467113018036, "learning_rate": 6.0137677486257894e-05, "loss": 0.1579, "step": 4904 }, { "epoch": 3.3910425291682347, "grad_norm": 0.197950080037117, "learning_rate": 6.012289328325109e-05, "loss": 0.2522, "step": 4905 }, { "epoch": 3.3917952578095596, "grad_norm": 0.1803826093673706, "learning_rate": 6.010810852001239e-05, "loss": 0.1275, "step": 4906 }, { "epoch": 3.3925479864508845, "grad_norm": 0.19327928125858307, "learning_rate": 6.009332319815863e-05, "loss": 0.1684, "step": 4907 }, { "epoch": 3.3933007150922094, "grad_norm": 0.1823642998933792, "learning_rate": 6.007853731930667e-05, "loss": 0.1431, "step": 4908 }, { "epoch": 3.394053443733534, "grad_norm": 0.19026875495910645, "learning_rate": 6.0063750885073525e-05, "loss": 0.2097, "step": 4909 }, { "epoch": 3.394806172374859, "grad_norm": 0.24933920800685883, "learning_rate": 6.00489638970762e-05, "loss": 0.2415, "step": 4910 }, { "epoch": 3.3955589010161837, "grad_norm": 0.17171624302864075, "learning_rate": 6.003417635693177e-05, "loss": 0.1151, "step": 4911 }, { "epoch": 3.3963116296575087, "grad_norm": 0.19787774980068207, "learning_rate": 6.0019388266257384e-05, "loss": 0.124, "step": 4912 }, { "epoch": 3.397064358298833, "grad_norm": 0.18684810400009155, "learning_rate": 6.000459962667025e-05, "loss": 0.091, "step": 4913 }, { "epoch": 3.397817086940158, "grad_norm": 0.2112104743719101, "learning_rate": 5.998981043978762e-05, "loss": 0.182, "step": 4914 }, { "epoch": 3.398569815581483, "grad_norm": 0.19936513900756836, "learning_rate": 5.99750207072268e-05, "loss": 0.1883, "step": 4915 }, { "epoch": 3.399322544222808, "grad_norm": 0.17050285637378693, "learning_rate": 5.996023043060522e-05, "loss": 0.0815, "step": 4916 }, { "epoch": 3.4000752728641324, "grad_norm": 0.17872600257396698, "learning_rate": 5.99454396115403e-05, "loss": 0.2148, "step": 4917 }, { "epoch": 3.4008280015054573, "grad_norm": 0.19786395132541656, "learning_rate": 5.993064825164952e-05, "loss": 0.1613, "step": 4918 }, { "epoch": 3.401580730146782, "grad_norm": 0.1826239377260208, "learning_rate": 5.991585635255048e-05, "loss": 0.1883, "step": 4919 }, { "epoch": 3.4023334587881067, "grad_norm": 0.1505897492170334, "learning_rate": 5.9901063915860765e-05, "loss": 0.0864, "step": 4920 }, { "epoch": 3.4030861874294316, "grad_norm": 0.22853100299835205, "learning_rate": 5.9886270943198076e-05, "loss": 0.2175, "step": 4921 }, { "epoch": 3.4038389160707565, "grad_norm": 0.19685551524162292, "learning_rate": 5.987147743618016e-05, "loss": 0.2656, "step": 4922 }, { "epoch": 3.4045916447120814, "grad_norm": 0.16236674785614014, "learning_rate": 5.9856683396424775e-05, "loss": 0.1905, "step": 4923 }, { "epoch": 3.4053443733534063, "grad_norm": 0.2234807163476944, "learning_rate": 5.984188882554979e-05, "loss": 0.1991, "step": 4924 }, { "epoch": 3.406097101994731, "grad_norm": 0.16246476769447327, "learning_rate": 5.982709372517313e-05, "loss": 0.1746, "step": 4925 }, { "epoch": 3.4068498306360557, "grad_norm": 0.20921722054481506, "learning_rate": 5.981229809691276e-05, "loss": 0.1397, "step": 4926 }, { "epoch": 3.4076025592773806, "grad_norm": 0.20399849116802216, "learning_rate": 5.97975019423867e-05, "loss": 0.1728, "step": 4927 }, { "epoch": 3.408355287918705, "grad_norm": 0.19403685629367828, "learning_rate": 5.978270526321305e-05, "loss": 0.182, "step": 4928 }, { "epoch": 3.40910801656003, "grad_norm": 0.18095527589321136, "learning_rate": 5.9767908061009936e-05, "loss": 0.1239, "step": 4929 }, { "epoch": 3.409860745201355, "grad_norm": 0.15592530369758606, "learning_rate": 5.975311033739557e-05, "loss": 0.1166, "step": 4930 }, { "epoch": 3.41061347384268, "grad_norm": 0.17650268971920013, "learning_rate": 5.973831209398819e-05, "loss": 0.1403, "step": 4931 }, { "epoch": 3.4113662024840044, "grad_norm": 0.2019435614347458, "learning_rate": 5.972351333240615e-05, "loss": 0.1848, "step": 4932 }, { "epoch": 3.4121189311253293, "grad_norm": 0.16715586185455322, "learning_rate": 5.970871405426778e-05, "loss": 0.122, "step": 4933 }, { "epoch": 3.412871659766654, "grad_norm": 0.16581548750400543, "learning_rate": 5.9693914261191506e-05, "loss": 0.0799, "step": 4934 }, { "epoch": 3.413624388407979, "grad_norm": 0.19840790331363678, "learning_rate": 5.967911395479584e-05, "loss": 0.2684, "step": 4935 }, { "epoch": 3.4143771170493036, "grad_norm": 0.16279301047325134, "learning_rate": 5.96643131366993e-05, "loss": 0.1264, "step": 4936 }, { "epoch": 3.4151298456906285, "grad_norm": 0.18364174664020538, "learning_rate": 5.964951180852049e-05, "loss": 0.146, "step": 4937 }, { "epoch": 3.4158825743319534, "grad_norm": 0.18611261248588562, "learning_rate": 5.9634709971878067e-05, "loss": 0.2423, "step": 4938 }, { "epoch": 3.416635302973278, "grad_norm": 0.1752510964870453, "learning_rate": 5.961990762839074e-05, "loss": 0.1294, "step": 4939 }, { "epoch": 3.417388031614603, "grad_norm": 0.17209841310977936, "learning_rate": 5.9605104779677233e-05, "loss": 0.1225, "step": 4940 }, { "epoch": 3.4181407602559277, "grad_norm": 0.18778952956199646, "learning_rate": 5.95903014273564e-05, "loss": 0.2742, "step": 4941 }, { "epoch": 3.4188934888972526, "grad_norm": 0.16963286697864532, "learning_rate": 5.9575497573047114e-05, "loss": 0.1739, "step": 4942 }, { "epoch": 3.4196462175385776, "grad_norm": 0.18881727755069733, "learning_rate": 5.956069321836828e-05, "loss": 0.1418, "step": 4943 }, { "epoch": 3.420398946179902, "grad_norm": 0.18244700133800507, "learning_rate": 5.954588836493891e-05, "loss": 0.1308, "step": 4944 }, { "epoch": 3.421151674821227, "grad_norm": 0.19764074683189392, "learning_rate": 5.953108301437801e-05, "loss": 0.1576, "step": 4945 }, { "epoch": 3.421904403462552, "grad_norm": 0.1841246485710144, "learning_rate": 5.951627716830467e-05, "loss": 0.1596, "step": 4946 }, { "epoch": 3.4226571321038763, "grad_norm": 0.1894700974225998, "learning_rate": 5.950147082833807e-05, "loss": 0.2153, "step": 4947 }, { "epoch": 3.4234098607452013, "grad_norm": 0.17112088203430176, "learning_rate": 5.948666399609739e-05, "loss": 0.1166, "step": 4948 }, { "epoch": 3.424162589386526, "grad_norm": 0.2237524837255478, "learning_rate": 5.9471856673201866e-05, "loss": 0.2518, "step": 4949 }, { "epoch": 3.424915318027851, "grad_norm": 0.1925959438085556, "learning_rate": 5.9457048861270834e-05, "loss": 0.2013, "step": 4950 }, { "epoch": 3.4256680466691756, "grad_norm": 0.1548018455505371, "learning_rate": 5.944224056192362e-05, "loss": 0.167, "step": 4951 }, { "epoch": 3.4264207753105005, "grad_norm": 0.20382064580917358, "learning_rate": 5.942743177677969e-05, "loss": 0.1853, "step": 4952 }, { "epoch": 3.4271735039518254, "grad_norm": 0.1577003002166748, "learning_rate": 5.9412622507458436e-05, "loss": 0.0787, "step": 4953 }, { "epoch": 3.4279262325931503, "grad_norm": 0.22419928014278412, "learning_rate": 5.939781275557944e-05, "loss": 0.1836, "step": 4954 }, { "epoch": 3.428678961234475, "grad_norm": 0.17283174395561218, "learning_rate": 5.938300252276226e-05, "loss": 0.1485, "step": 4955 }, { "epoch": 3.4294316898757997, "grad_norm": 0.18534357845783234, "learning_rate": 5.93681918106265e-05, "loss": 0.1421, "step": 4956 }, { "epoch": 3.4301844185171246, "grad_norm": 0.1699354648590088, "learning_rate": 5.935338062079187e-05, "loss": 0.1788, "step": 4957 }, { "epoch": 3.4309371471584496, "grad_norm": 0.19891119003295898, "learning_rate": 5.933856895487807e-05, "loss": 0.1471, "step": 4958 }, { "epoch": 3.431689875799774, "grad_norm": 0.1843203902244568, "learning_rate": 5.9323756814504886e-05, "loss": 0.1545, "step": 4959 }, { "epoch": 3.432442604441099, "grad_norm": 0.20580625534057617, "learning_rate": 5.930894420129216e-05, "loss": 0.2231, "step": 4960 }, { "epoch": 3.433195333082424, "grad_norm": 0.17726154625415802, "learning_rate": 5.929413111685977e-05, "loss": 0.1895, "step": 4961 }, { "epoch": 3.433948061723749, "grad_norm": 0.16405045986175537, "learning_rate": 5.9279317562827664e-05, "loss": 0.137, "step": 4962 }, { "epoch": 3.4347007903650733, "grad_norm": 0.18787282705307007, "learning_rate": 5.926450354081583e-05, "loss": 0.1987, "step": 4963 }, { "epoch": 3.435453519006398, "grad_norm": 0.1978227198123932, "learning_rate": 5.924968905244429e-05, "loss": 0.1919, "step": 4964 }, { "epoch": 3.436206247647723, "grad_norm": 0.172968789935112, "learning_rate": 5.923487409933316e-05, "loss": 0.1688, "step": 4965 }, { "epoch": 3.4369589762890476, "grad_norm": 0.1775856614112854, "learning_rate": 5.922005868310254e-05, "loss": 0.1306, "step": 4966 }, { "epoch": 3.4377117049303725, "grad_norm": 0.22026221454143524, "learning_rate": 5.920524280537268e-05, "loss": 0.1435, "step": 4967 }, { "epoch": 3.4384644335716974, "grad_norm": 0.17950472235679626, "learning_rate": 5.919042646776376e-05, "loss": 0.1271, "step": 4968 }, { "epoch": 3.4392171622130223, "grad_norm": 0.1919325888156891, "learning_rate": 5.9175609671896106e-05, "loss": 0.1934, "step": 4969 }, { "epoch": 3.4399698908543472, "grad_norm": 0.1655593365430832, "learning_rate": 5.9160792419390064e-05, "loss": 0.1195, "step": 4970 }, { "epoch": 3.4407226194956717, "grad_norm": 0.18479353189468384, "learning_rate": 5.9145974711866026e-05, "loss": 0.1184, "step": 4971 }, { "epoch": 3.4414753481369966, "grad_norm": 0.1782495081424713, "learning_rate": 5.913115655094441e-05, "loss": 0.1904, "step": 4972 }, { "epoch": 3.4422280767783215, "grad_norm": 0.15754364430904388, "learning_rate": 5.911633793824573e-05, "loss": 0.1184, "step": 4973 }, { "epoch": 3.442980805419646, "grad_norm": 0.18602730333805084, "learning_rate": 5.910151887539052e-05, "loss": 0.2582, "step": 4974 }, { "epoch": 3.443733534060971, "grad_norm": 0.20007091760635376, "learning_rate": 5.9086699363999373e-05, "loss": 0.2124, "step": 4975 }, { "epoch": 3.444486262702296, "grad_norm": 0.18909958004951477, "learning_rate": 5.907187940569293e-05, "loss": 0.179, "step": 4976 }, { "epoch": 3.4452389913436208, "grad_norm": 0.18261563777923584, "learning_rate": 5.905705900209189e-05, "loss": 0.2371, "step": 4977 }, { "epoch": 3.4459917199849452, "grad_norm": 0.19367040693759918, "learning_rate": 5.9042238154816954e-05, "loss": 0.1707, "step": 4978 }, { "epoch": 3.44674444862627, "grad_norm": 0.20780456066131592, "learning_rate": 5.902741686548895e-05, "loss": 0.1814, "step": 4979 }, { "epoch": 3.447497177267595, "grad_norm": 0.18349173665046692, "learning_rate": 5.9012595135728686e-05, "loss": 0.2531, "step": 4980 }, { "epoch": 3.44824990590892, "grad_norm": 0.18331663310527802, "learning_rate": 5.899777296715704e-05, "loss": 0.1577, "step": 4981 }, { "epoch": 3.4490026345502445, "grad_norm": 0.1722344160079956, "learning_rate": 5.898295036139496e-05, "loss": 0.2183, "step": 4982 }, { "epoch": 3.4497553631915694, "grad_norm": 0.18670758605003357, "learning_rate": 5.8968127320063436e-05, "loss": 0.2641, "step": 4983 }, { "epoch": 3.4505080918328943, "grad_norm": 0.16142895817756653, "learning_rate": 5.8953303844783456e-05, "loss": 0.1561, "step": 4984 }, { "epoch": 3.4512608204742192, "grad_norm": 0.16939102113246918, "learning_rate": 5.8938479937176105e-05, "loss": 0.12, "step": 4985 }, { "epoch": 3.4520135491155437, "grad_norm": 0.15788590908050537, "learning_rate": 5.892365559886253e-05, "loss": 0.1105, "step": 4986 }, { "epoch": 3.4527662777568686, "grad_norm": 0.176722452044487, "learning_rate": 5.8908830831463854e-05, "loss": 0.2396, "step": 4987 }, { "epoch": 3.4535190063981935, "grad_norm": 0.17823180556297302, "learning_rate": 5.889400563660132e-05, "loss": 0.1659, "step": 4988 }, { "epoch": 3.4542717350395185, "grad_norm": 0.19223827123641968, "learning_rate": 5.8879180015896195e-05, "loss": 0.1632, "step": 4989 }, { "epoch": 3.455024463680843, "grad_norm": 0.19482989609241486, "learning_rate": 5.8864353970969775e-05, "loss": 0.1597, "step": 4990 }, { "epoch": 3.455777192322168, "grad_norm": 0.21217189729213715, "learning_rate": 5.8849527503443404e-05, "loss": 0.1562, "step": 4991 }, { "epoch": 3.4565299209634928, "grad_norm": 0.18786723911762238, "learning_rate": 5.883470061493851e-05, "loss": 0.2086, "step": 4992 }, { "epoch": 3.4572826496048172, "grad_norm": 0.1819467842578888, "learning_rate": 5.881987330707651e-05, "loss": 0.1346, "step": 4993 }, { "epoch": 3.458035378246142, "grad_norm": 0.24074451625347137, "learning_rate": 5.88050455814789e-05, "loss": 0.1865, "step": 4994 }, { "epoch": 3.458788106887467, "grad_norm": 0.18295684456825256, "learning_rate": 5.8790217439767246e-05, "loss": 0.1457, "step": 4995 }, { "epoch": 3.459540835528792, "grad_norm": 0.24615167081356049, "learning_rate": 5.877538888356311e-05, "loss": 0.2323, "step": 4996 }, { "epoch": 3.460293564170117, "grad_norm": 0.19171595573425293, "learning_rate": 5.87605599144881e-05, "loss": 0.1422, "step": 4997 }, { "epoch": 3.4610462928114414, "grad_norm": 0.2576714754104614, "learning_rate": 5.874573053416393e-05, "loss": 0.2032, "step": 4998 }, { "epoch": 3.4617990214527663, "grad_norm": 0.2371441274881363, "learning_rate": 5.873090074421229e-05, "loss": 0.2483, "step": 4999 }, { "epoch": 3.462551750094091, "grad_norm": 0.18093179166316986, "learning_rate": 5.8716070546254966e-05, "loss": 0.2431, "step": 5000 }, { "epoch": 3.462551750094091, "eval_loss": 0.20292453467845917, "eval_runtime": 456.4024, "eval_samples_per_second": 21.093, "eval_steps_per_second": 0.66, "step": 5000 }, { "epoch": 3.4633044787354157, "grad_norm": 0.2266332358121872, "learning_rate": 5.8701239941913736e-05, "loss": 0.184, "step": 5001 }, { "epoch": 3.4640572073767406, "grad_norm": 0.16618049144744873, "learning_rate": 5.8686408932810486e-05, "loss": 0.1619, "step": 5002 }, { "epoch": 3.4648099360180655, "grad_norm": 0.16752442717552185, "learning_rate": 5.86715775205671e-05, "loss": 0.0504, "step": 5003 }, { "epoch": 3.4655626646593904, "grad_norm": 0.2047964632511139, "learning_rate": 5.8656745706805505e-05, "loss": 0.191, "step": 5004 }, { "epoch": 3.466315393300715, "grad_norm": 0.1767398715019226, "learning_rate": 5.8641913493147705e-05, "loss": 0.2036, "step": 5005 }, { "epoch": 3.46706812194204, "grad_norm": 0.20784786343574524, "learning_rate": 5.8627080881215726e-05, "loss": 0.1268, "step": 5006 }, { "epoch": 3.4678208505833648, "grad_norm": 0.18284998834133148, "learning_rate": 5.861224787263162e-05, "loss": 0.1494, "step": 5007 }, { "epoch": 3.4685735792246897, "grad_norm": 0.18405577540397644, "learning_rate": 5.859741446901752e-05, "loss": 0.1489, "step": 5008 }, { "epoch": 3.469326307866014, "grad_norm": 0.18039801716804504, "learning_rate": 5.8582580671995604e-05, "loss": 0.1505, "step": 5009 }, { "epoch": 3.470079036507339, "grad_norm": 0.2152279019355774, "learning_rate": 5.856774648318801e-05, "loss": 0.2788, "step": 5010 }, { "epoch": 3.470831765148664, "grad_norm": 0.21487830579280853, "learning_rate": 5.855291190421705e-05, "loss": 0.2444, "step": 5011 }, { "epoch": 3.471584493789989, "grad_norm": 0.2024735063314438, "learning_rate": 5.8538076936705e-05, "loss": 0.214, "step": 5012 }, { "epoch": 3.4723372224313134, "grad_norm": 0.1973320096731186, "learning_rate": 5.852324158227416e-05, "loss": 0.2347, "step": 5013 }, { "epoch": 3.4730899510726383, "grad_norm": 0.19767718017101288, "learning_rate": 5.850840584254692e-05, "loss": 0.1717, "step": 5014 }, { "epoch": 3.473842679713963, "grad_norm": 0.21294622123241425, "learning_rate": 5.849356971914569e-05, "loss": 0.2544, "step": 5015 }, { "epoch": 3.474595408355288, "grad_norm": 0.18098916113376617, "learning_rate": 5.847873321369294e-05, "loss": 0.1103, "step": 5016 }, { "epoch": 3.4753481369966126, "grad_norm": 0.18889200687408447, "learning_rate": 5.846389632781113e-05, "loss": 0.0837, "step": 5017 }, { "epoch": 3.4761008656379375, "grad_norm": 0.18960940837860107, "learning_rate": 5.8449059063122844e-05, "loss": 0.169, "step": 5018 }, { "epoch": 3.4768535942792624, "grad_norm": 0.19943659007549286, "learning_rate": 5.8434221421250655e-05, "loss": 0.1449, "step": 5019 }, { "epoch": 3.477606322920587, "grad_norm": 0.17035162448883057, "learning_rate": 5.841938340381714e-05, "loss": 0.1007, "step": 5020 }, { "epoch": 3.478359051561912, "grad_norm": 0.16866987943649292, "learning_rate": 5.8404545012445024e-05, "loss": 0.1148, "step": 5021 }, { "epoch": 3.4791117802032367, "grad_norm": 0.2062644064426422, "learning_rate": 5.838970624875698e-05, "loss": 0.1711, "step": 5022 }, { "epoch": 3.4798645088445617, "grad_norm": 0.19216415286064148, "learning_rate": 5.837486711437575e-05, "loss": 0.1855, "step": 5023 }, { "epoch": 3.4806172374858866, "grad_norm": 0.19011013209819794, "learning_rate": 5.836002761092413e-05, "loss": 0.2071, "step": 5024 }, { "epoch": 3.481369966127211, "grad_norm": 0.17220629751682281, "learning_rate": 5.8345187740024954e-05, "loss": 0.1705, "step": 5025 }, { "epoch": 3.482122694768536, "grad_norm": 0.21144182980060577, "learning_rate": 5.833034750330105e-05, "loss": 0.2496, "step": 5026 }, { "epoch": 3.482875423409861, "grad_norm": 0.25342899560928345, "learning_rate": 5.8315506902375374e-05, "loss": 0.1991, "step": 5027 }, { "epoch": 3.4836281520511854, "grad_norm": 0.2190820723772049, "learning_rate": 5.8300665938870834e-05, "loss": 0.1513, "step": 5028 }, { "epoch": 3.4843808806925103, "grad_norm": 0.1990685611963272, "learning_rate": 5.8285824614410435e-05, "loss": 0.1982, "step": 5029 }, { "epoch": 3.485133609333835, "grad_norm": 0.2135271281003952, "learning_rate": 5.82709829306172e-05, "loss": 0.2375, "step": 5030 }, { "epoch": 3.48588633797516, "grad_norm": 0.20636428892612457, "learning_rate": 5.825614088911418e-05, "loss": 0.1699, "step": 5031 }, { "epoch": 3.4866390666164846, "grad_norm": 0.2015216052532196, "learning_rate": 5.82412984915245e-05, "loss": 0.2383, "step": 5032 }, { "epoch": 3.4873917952578095, "grad_norm": 0.19108635187149048, "learning_rate": 5.822645573947129e-05, "loss": 0.1109, "step": 5033 }, { "epoch": 3.4881445238991344, "grad_norm": 0.2125907689332962, "learning_rate": 5.821161263457773e-05, "loss": 0.2309, "step": 5034 }, { "epoch": 3.4888972525404593, "grad_norm": 0.1686941236257553, "learning_rate": 5.819676917846702e-05, "loss": 0.211, "step": 5035 }, { "epoch": 3.489649981181784, "grad_norm": 0.21831297874450684, "learning_rate": 5.818192537276246e-05, "loss": 0.0939, "step": 5036 }, { "epoch": 3.4904027098231087, "grad_norm": 0.15648585557937622, "learning_rate": 5.8167081219087324e-05, "loss": 0.0999, "step": 5037 }, { "epoch": 3.4911554384644337, "grad_norm": 0.15885932743549347, "learning_rate": 5.8152236719064945e-05, "loss": 0.1157, "step": 5038 }, { "epoch": 3.491908167105758, "grad_norm": 0.1634446084499359, "learning_rate": 5.813739187431869e-05, "loss": 0.1023, "step": 5039 }, { "epoch": 3.492660895747083, "grad_norm": 0.18838131427764893, "learning_rate": 5.812254668647199e-05, "loss": 0.1571, "step": 5040 }, { "epoch": 3.493413624388408, "grad_norm": 0.1919534057378769, "learning_rate": 5.8107701157148277e-05, "loss": 0.17, "step": 5041 }, { "epoch": 3.494166353029733, "grad_norm": 0.16051927208900452, "learning_rate": 5.809285528797103e-05, "loss": 0.2053, "step": 5042 }, { "epoch": 3.494919081671058, "grad_norm": 0.16879719495773315, "learning_rate": 5.807800908056378e-05, "loss": 0.1834, "step": 5043 }, { "epoch": 3.4956718103123823, "grad_norm": 0.2351587414741516, "learning_rate": 5.8063162536550086e-05, "loss": 0.2328, "step": 5044 }, { "epoch": 3.496424538953707, "grad_norm": 0.1783425360918045, "learning_rate": 5.804831565755353e-05, "loss": 0.1471, "step": 5045 }, { "epoch": 3.497177267595032, "grad_norm": 0.20542176067829132, "learning_rate": 5.803346844519778e-05, "loss": 0.1189, "step": 5046 }, { "epoch": 3.4979299962363566, "grad_norm": 0.17644043266773224, "learning_rate": 5.8018620901106455e-05, "loss": 0.1786, "step": 5047 }, { "epoch": 3.4986827248776815, "grad_norm": 0.25251805782318115, "learning_rate": 5.800377302690332e-05, "loss": 0.2552, "step": 5048 }, { "epoch": 3.4994354535190064, "grad_norm": 0.20878936350345612, "learning_rate": 5.798892482421205e-05, "loss": 0.1472, "step": 5049 }, { "epoch": 3.5001881821603313, "grad_norm": 0.1724919080734253, "learning_rate": 5.7974076294656476e-05, "loss": 0.1566, "step": 5050 }, { "epoch": 3.5009409108016563, "grad_norm": 0.1899658441543579, "learning_rate": 5.795922743986039e-05, "loss": 0.1405, "step": 5051 }, { "epoch": 3.5016936394429807, "grad_norm": 0.15459871292114258, "learning_rate": 5.794437826144763e-05, "loss": 0.1232, "step": 5052 }, { "epoch": 3.5024463680843057, "grad_norm": 0.1896207630634308, "learning_rate": 5.792952876104209e-05, "loss": 0.1485, "step": 5053 }, { "epoch": 3.5031990967256306, "grad_norm": 0.1930207461118698, "learning_rate": 5.791467894026771e-05, "loss": 0.1435, "step": 5054 }, { "epoch": 3.503951825366955, "grad_norm": 0.18299229443073273, "learning_rate": 5.789982880074839e-05, "loss": 0.1889, "step": 5055 }, { "epoch": 3.50470455400828, "grad_norm": 0.19634130597114563, "learning_rate": 5.788497834410818e-05, "loss": 0.1553, "step": 5056 }, { "epoch": 3.505457282649605, "grad_norm": 0.1713162511587143, "learning_rate": 5.787012757197107e-05, "loss": 0.1048, "step": 5057 }, { "epoch": 3.5062100112909294, "grad_norm": 0.2274683713912964, "learning_rate": 5.785527648596111e-05, "loss": 0.1979, "step": 5058 }, { "epoch": 3.5069627399322543, "grad_norm": 0.2098996341228485, "learning_rate": 5.784042508770242e-05, "loss": 0.1964, "step": 5059 }, { "epoch": 3.507715468573579, "grad_norm": 0.2010815441608429, "learning_rate": 5.782557337881911e-05, "loss": 0.199, "step": 5060 }, { "epoch": 3.508468197214904, "grad_norm": 0.18839584290981293, "learning_rate": 5.781072136093534e-05, "loss": 0.188, "step": 5061 }, { "epoch": 3.509220925856229, "grad_norm": 0.17156419157981873, "learning_rate": 5.779586903567531e-05, "loss": 0.1376, "step": 5062 }, { "epoch": 3.5099736544975535, "grad_norm": 0.19937503337860107, "learning_rate": 5.7781016404663246e-05, "loss": 0.1056, "step": 5063 }, { "epoch": 3.5107263831388784, "grad_norm": 0.1661362648010254, "learning_rate": 5.77661634695234e-05, "loss": 0.126, "step": 5064 }, { "epoch": 3.5114791117802033, "grad_norm": 0.16394929587841034, "learning_rate": 5.77513102318801e-05, "loss": 0.121, "step": 5065 }, { "epoch": 3.512231840421528, "grad_norm": 0.21057796478271484, "learning_rate": 5.7736456693357635e-05, "loss": 0.1775, "step": 5066 }, { "epoch": 3.5129845690628527, "grad_norm": 0.183758944272995, "learning_rate": 5.772160285558037e-05, "loss": 0.1189, "step": 5067 }, { "epoch": 3.5137372977041776, "grad_norm": 0.23364543914794922, "learning_rate": 5.7706748720172734e-05, "loss": 0.2349, "step": 5068 }, { "epoch": 3.5144900263455026, "grad_norm": 0.16960583627223969, "learning_rate": 5.769189428875911e-05, "loss": 0.1659, "step": 5069 }, { "epoch": 3.5152427549868275, "grad_norm": 0.18330906331539154, "learning_rate": 5.7677039562963965e-05, "loss": 0.1853, "step": 5070 }, { "epoch": 3.515995483628152, "grad_norm": 0.17397430539131165, "learning_rate": 5.7662184544411814e-05, "loss": 0.1528, "step": 5071 }, { "epoch": 3.516748212269477, "grad_norm": 0.2064015120267868, "learning_rate": 5.7647329234727153e-05, "loss": 0.2251, "step": 5072 }, { "epoch": 3.517500940910802, "grad_norm": 0.1516033411026001, "learning_rate": 5.7632473635534553e-05, "loss": 0.087, "step": 5073 }, { "epoch": 3.5182536695521263, "grad_norm": 0.20152334868907928, "learning_rate": 5.7617617748458575e-05, "loss": 0.1647, "step": 5074 }, { "epoch": 3.519006398193451, "grad_norm": 0.19072836637496948, "learning_rate": 5.760276157512389e-05, "loss": 0.1844, "step": 5075 }, { "epoch": 3.519759126834776, "grad_norm": 0.19148238003253937, "learning_rate": 5.75879051171551e-05, "loss": 0.1443, "step": 5076 }, { "epoch": 3.520511855476101, "grad_norm": 0.187281996011734, "learning_rate": 5.757304837617688e-05, "loss": 0.1485, "step": 5077 }, { "epoch": 3.521264584117426, "grad_norm": 0.20146070420742035, "learning_rate": 5.7558191353813986e-05, "loss": 0.1571, "step": 5078 }, { "epoch": 3.5220173127587504, "grad_norm": 0.18914271891117096, "learning_rate": 5.754333405169111e-05, "loss": 0.1617, "step": 5079 }, { "epoch": 3.5227700414000753, "grad_norm": 0.17944183945655823, "learning_rate": 5.7528476471433044e-05, "loss": 0.1604, "step": 5080 }, { "epoch": 3.5235227700414002, "grad_norm": 0.18367013335227966, "learning_rate": 5.751361861466461e-05, "loss": 0.176, "step": 5081 }, { "epoch": 3.5242754986827247, "grad_norm": 0.1992434859275818, "learning_rate": 5.7498760483010614e-05, "loss": 0.1425, "step": 5082 }, { "epoch": 3.5250282273240496, "grad_norm": 0.18830175697803497, "learning_rate": 5.748390207809593e-05, "loss": 0.1838, "step": 5083 }, { "epoch": 3.5257809559653746, "grad_norm": 0.18795332312583923, "learning_rate": 5.7469043401545455e-05, "loss": 0.1493, "step": 5084 }, { "epoch": 3.526533684606699, "grad_norm": 0.19339853525161743, "learning_rate": 5.7454184454984106e-05, "loss": 0.2541, "step": 5085 }, { "epoch": 3.527286413248024, "grad_norm": 0.170549675822258, "learning_rate": 5.7439325240036824e-05, "loss": 0.1717, "step": 5086 }, { "epoch": 3.528039141889349, "grad_norm": 0.18748289346694946, "learning_rate": 5.7424465758328604e-05, "loss": 0.254, "step": 5087 }, { "epoch": 3.528791870530674, "grad_norm": 0.1802220195531845, "learning_rate": 5.740960601148445e-05, "loss": 0.1904, "step": 5088 }, { "epoch": 3.5295445991719987, "grad_norm": 0.19761425256729126, "learning_rate": 5.739474600112942e-05, "loss": 0.1548, "step": 5089 }, { "epoch": 3.530297327813323, "grad_norm": 0.19675593078136444, "learning_rate": 5.737988572888856e-05, "loss": 0.1712, "step": 5090 }, { "epoch": 3.531050056454648, "grad_norm": 0.17263716459274292, "learning_rate": 5.7365025196386966e-05, "loss": 0.1463, "step": 5091 }, { "epoch": 3.531802785095973, "grad_norm": 0.20296916365623474, "learning_rate": 5.735016440524978e-05, "loss": 0.1, "step": 5092 }, { "epoch": 3.5325555137372975, "grad_norm": 0.18516996502876282, "learning_rate": 5.733530335710215e-05, "loss": 0.206, "step": 5093 }, { "epoch": 3.5333082423786224, "grad_norm": 0.18100960552692413, "learning_rate": 5.732044205356926e-05, "loss": 0.2597, "step": 5094 }, { "epoch": 3.5340609710199473, "grad_norm": 0.18607550859451294, "learning_rate": 5.730558049627629e-05, "loss": 0.2516, "step": 5095 }, { "epoch": 3.5348136996612722, "grad_norm": 0.20903608202934265, "learning_rate": 5.729071868684852e-05, "loss": 0.161, "step": 5096 }, { "epoch": 3.535566428302597, "grad_norm": 0.2027125358581543, "learning_rate": 5.727585662691121e-05, "loss": 0.2229, "step": 5097 }, { "epoch": 3.5363191569439216, "grad_norm": 0.1953122764825821, "learning_rate": 5.726099431808963e-05, "loss": 0.292, "step": 5098 }, { "epoch": 3.5370718855852465, "grad_norm": 0.18765360116958618, "learning_rate": 5.7246131762009114e-05, "loss": 0.163, "step": 5099 }, { "epoch": 3.5378246142265715, "grad_norm": 0.164918914437294, "learning_rate": 5.7231268960295e-05, "loss": 0.1251, "step": 5100 }, { "epoch": 3.538577342867896, "grad_norm": 0.19366511702537537, "learning_rate": 5.721640591457268e-05, "loss": 0.2482, "step": 5101 }, { "epoch": 3.539330071509221, "grad_norm": 0.17176315188407898, "learning_rate": 5.720154262646753e-05, "loss": 0.1413, "step": 5102 }, { "epoch": 3.5400828001505458, "grad_norm": 0.20558133721351624, "learning_rate": 5.7186679097605e-05, "loss": 0.1445, "step": 5103 }, { "epoch": 3.5408355287918707, "grad_norm": 0.16786310076713562, "learning_rate": 5.717181532961052e-05, "loss": 0.1271, "step": 5104 }, { "epoch": 3.5415882574331956, "grad_norm": 0.19288766384124756, "learning_rate": 5.715695132410959e-05, "loss": 0.1643, "step": 5105 }, { "epoch": 3.54234098607452, "grad_norm": 0.20699892938137054, "learning_rate": 5.714208708272769e-05, "loss": 0.2435, "step": 5106 }, { "epoch": 3.543093714715845, "grad_norm": 0.19849184155464172, "learning_rate": 5.712722260709038e-05, "loss": 0.2303, "step": 5107 }, { "epoch": 3.54384644335717, "grad_norm": 0.1532972753047943, "learning_rate": 5.711235789882321e-05, "loss": 0.1295, "step": 5108 }, { "epoch": 3.5445991719984944, "grad_norm": 0.17116108536720276, "learning_rate": 5.7097492959551745e-05, "loss": 0.1601, "step": 5109 }, { "epoch": 3.5453519006398193, "grad_norm": 0.1838022619485855, "learning_rate": 5.70826277909016e-05, "loss": 0.1636, "step": 5110 }, { "epoch": 3.5461046292811442, "grad_norm": 0.17055048048496246, "learning_rate": 5.706776239449841e-05, "loss": 0.0973, "step": 5111 }, { "epoch": 3.5468573579224687, "grad_norm": 0.20833827555179596, "learning_rate": 5.7052896771967844e-05, "loss": 0.2449, "step": 5112 }, { "epoch": 3.5476100865637936, "grad_norm": 0.19291625916957855, "learning_rate": 5.7038030924935584e-05, "loss": 0.1832, "step": 5113 }, { "epoch": 3.5483628152051185, "grad_norm": 0.19823767244815826, "learning_rate": 5.702316485502731e-05, "loss": 0.188, "step": 5114 }, { "epoch": 3.5491155438464435, "grad_norm": 0.1800530105829239, "learning_rate": 5.700829856386876e-05, "loss": 0.0958, "step": 5115 }, { "epoch": 3.5498682724877684, "grad_norm": 0.1981290876865387, "learning_rate": 5.6993432053085706e-05, "loss": 0.1948, "step": 5116 }, { "epoch": 3.550621001129093, "grad_norm": 0.1859399676322937, "learning_rate": 5.6978565324303926e-05, "loss": 0.1901, "step": 5117 }, { "epoch": 3.5513737297704178, "grad_norm": 0.18858954310417175, "learning_rate": 5.696369837914921e-05, "loss": 0.1281, "step": 5118 }, { "epoch": 3.5521264584117427, "grad_norm": 0.17109008133411407, "learning_rate": 5.69488312192474e-05, "loss": 0.1445, "step": 5119 }, { "epoch": 3.552879187053067, "grad_norm": 0.15881377458572388, "learning_rate": 5.6933963846224335e-05, "loss": 0.1105, "step": 5120 }, { "epoch": 3.553631915694392, "grad_norm": 0.2053428590297699, "learning_rate": 5.69190962617059e-05, "loss": 0.1685, "step": 5121 }, { "epoch": 3.554384644335717, "grad_norm": 0.17354312539100647, "learning_rate": 5.690422846731796e-05, "loss": 0.148, "step": 5122 }, { "epoch": 3.555137372977042, "grad_norm": 0.2034069448709488, "learning_rate": 5.688936046468647e-05, "loss": 0.2278, "step": 5123 }, { "epoch": 3.555890101618367, "grad_norm": 0.1642206460237503, "learning_rate": 5.6874492255437365e-05, "loss": 0.1806, "step": 5124 }, { "epoch": 3.5566428302596913, "grad_norm": 0.17660337686538696, "learning_rate": 5.6859623841196595e-05, "loss": 0.2028, "step": 5125 }, { "epoch": 3.557395558901016, "grad_norm": 0.22705259919166565, "learning_rate": 5.6844755223590174e-05, "loss": 0.1859, "step": 5126 }, { "epoch": 3.558148287542341, "grad_norm": 0.22869597375392914, "learning_rate": 5.682988640424408e-05, "loss": 0.293, "step": 5127 }, { "epoch": 3.5589010161836656, "grad_norm": 0.17999866604804993, "learning_rate": 5.6815017384784365e-05, "loss": 0.0961, "step": 5128 }, { "epoch": 3.5596537448249905, "grad_norm": 0.22899124026298523, "learning_rate": 5.6800148166837074e-05, "loss": 0.1766, "step": 5129 }, { "epoch": 3.5604064734663154, "grad_norm": 0.18691842257976532, "learning_rate": 5.6785278752028295e-05, "loss": 0.1663, "step": 5130 }, { "epoch": 3.56115920210764, "grad_norm": 0.18931254744529724, "learning_rate": 5.6770409141984114e-05, "loss": 0.1217, "step": 5131 }, { "epoch": 3.561911930748965, "grad_norm": 0.21891483664512634, "learning_rate": 5.6755539338330644e-05, "loss": 0.1538, "step": 5132 }, { "epoch": 3.5626646593902898, "grad_norm": 0.22224318981170654, "learning_rate": 5.674066934269404e-05, "loss": 0.1449, "step": 5133 }, { "epoch": 3.5634173880316147, "grad_norm": 0.1950511485338211, "learning_rate": 5.672579915670044e-05, "loss": 0.209, "step": 5134 }, { "epoch": 3.5641701166729396, "grad_norm": 0.17047007381916046, "learning_rate": 5.6710928781976056e-05, "loss": 0.2033, "step": 5135 }, { "epoch": 3.564922845314264, "grad_norm": 0.2021760195493698, "learning_rate": 5.669605822014706e-05, "loss": 0.1528, "step": 5136 }, { "epoch": 3.565675573955589, "grad_norm": 0.188362717628479, "learning_rate": 5.668118747283969e-05, "loss": 0.172, "step": 5137 }, { "epoch": 3.566428302596914, "grad_norm": 0.24885620176792145, "learning_rate": 5.66663165416802e-05, "loss": 0.2397, "step": 5138 }, { "epoch": 3.5671810312382384, "grad_norm": 0.20452342927455902, "learning_rate": 5.665144542829481e-05, "loss": 0.2167, "step": 5139 }, { "epoch": 3.5679337598795633, "grad_norm": 0.186635360121727, "learning_rate": 5.663657413430985e-05, "loss": 0.1026, "step": 5140 }, { "epoch": 3.568686488520888, "grad_norm": 0.21929696202278137, "learning_rate": 5.662170266135161e-05, "loss": 0.2635, "step": 5141 }, { "epoch": 3.569439217162213, "grad_norm": 0.1915663182735443, "learning_rate": 5.660683101104638e-05, "loss": 0.1585, "step": 5142 }, { "epoch": 3.570191945803538, "grad_norm": 0.2705668807029724, "learning_rate": 5.659195918502055e-05, "loss": 0.2968, "step": 5143 }, { "epoch": 3.5709446744448625, "grad_norm": 0.19505712389945984, "learning_rate": 5.657708718490044e-05, "loss": 0.1392, "step": 5144 }, { "epoch": 3.5716974030861874, "grad_norm": 0.20815888047218323, "learning_rate": 5.656221501231246e-05, "loss": 0.2069, "step": 5145 }, { "epoch": 3.5724501317275124, "grad_norm": 0.20947878062725067, "learning_rate": 5.654734266888299e-05, "loss": 0.1864, "step": 5146 }, { "epoch": 3.573202860368837, "grad_norm": 0.18674705922603607, "learning_rate": 5.653247015623844e-05, "loss": 0.159, "step": 5147 }, { "epoch": 3.5739555890101617, "grad_norm": 0.2053951770067215, "learning_rate": 5.651759747600528e-05, "loss": 0.2869, "step": 5148 }, { "epoch": 3.5747083176514867, "grad_norm": 0.20271708071231842, "learning_rate": 5.6502724629809934e-05, "loss": 0.1058, "step": 5149 }, { "epoch": 3.5754610462928116, "grad_norm": 0.21514296531677246, "learning_rate": 5.648785161927888e-05, "loss": 0.181, "step": 5150 }, { "epoch": 3.5762137749341365, "grad_norm": 0.20904918015003204, "learning_rate": 5.6472978446038605e-05, "loss": 0.1733, "step": 5151 }, { "epoch": 3.576966503575461, "grad_norm": 0.20295259356498718, "learning_rate": 5.645810511171562e-05, "loss": 0.1798, "step": 5152 }, { "epoch": 3.577719232216786, "grad_norm": 0.18717022240161896, "learning_rate": 5.6443231617936454e-05, "loss": 0.1182, "step": 5153 }, { "epoch": 3.578471960858111, "grad_norm": 0.22272692620754242, "learning_rate": 5.6428357966327664e-05, "loss": 0.1775, "step": 5154 }, { "epoch": 3.5792246894994353, "grad_norm": 0.18669886887073517, "learning_rate": 5.641348415851577e-05, "loss": 0.1672, "step": 5155 }, { "epoch": 3.57997741814076, "grad_norm": 0.2034304291009903, "learning_rate": 5.639861019612741e-05, "loss": 0.2968, "step": 5156 }, { "epoch": 3.580730146782085, "grad_norm": 0.2393789440393448, "learning_rate": 5.638373608078911e-05, "loss": 0.299, "step": 5157 }, { "epoch": 3.5814828754234096, "grad_norm": 0.17040537297725677, "learning_rate": 5.6368861814127515e-05, "loss": 0.1053, "step": 5158 }, { "epoch": 3.5822356040647345, "grad_norm": 0.18126237392425537, "learning_rate": 5.635398739776926e-05, "loss": 0.2025, "step": 5159 }, { "epoch": 3.5829883327060594, "grad_norm": 0.18407569825649261, "learning_rate": 5.633911283334097e-05, "loss": 0.1323, "step": 5160 }, { "epoch": 3.5837410613473843, "grad_norm": 0.20061807334423065, "learning_rate": 5.632423812246932e-05, "loss": 0.1099, "step": 5161 }, { "epoch": 3.5844937899887093, "grad_norm": 0.18895559012889862, "learning_rate": 5.630936326678098e-05, "loss": 0.1706, "step": 5162 }, { "epoch": 3.5852465186300337, "grad_norm": 0.216936394572258, "learning_rate": 5.629448826790263e-05, "loss": 0.2299, "step": 5163 }, { "epoch": 3.5859992472713587, "grad_norm": 0.20713888108730316, "learning_rate": 5.627961312746101e-05, "loss": 0.1253, "step": 5164 }, { "epoch": 3.5867519759126836, "grad_norm": 0.2100488394498825, "learning_rate": 5.6264737847082805e-05, "loss": 0.1203, "step": 5165 }, { "epoch": 3.587504704554008, "grad_norm": 0.21663644909858704, "learning_rate": 5.6249862428394775e-05, "loss": 0.1878, "step": 5166 }, { "epoch": 3.588257433195333, "grad_norm": 0.19040851294994354, "learning_rate": 5.6234986873023667e-05, "loss": 0.221, "step": 5167 }, { "epoch": 3.589010161836658, "grad_norm": 0.1918703317642212, "learning_rate": 5.622011118259626e-05, "loss": 0.2096, "step": 5168 }, { "epoch": 3.589762890477983, "grad_norm": 0.18611305952072144, "learning_rate": 5.6205235358739315e-05, "loss": 0.2291, "step": 5169 }, { "epoch": 3.5905156191193077, "grad_norm": 0.16951565444469452, "learning_rate": 5.6190359403079653e-05, "loss": 0.1346, "step": 5170 }, { "epoch": 3.591268347760632, "grad_norm": 0.18101274967193604, "learning_rate": 5.617548331724408e-05, "loss": 0.1522, "step": 5171 }, { "epoch": 3.592021076401957, "grad_norm": 0.18991892039775848, "learning_rate": 5.616060710285942e-05, "loss": 0.2454, "step": 5172 }, { "epoch": 3.592773805043282, "grad_norm": 0.2158268392086029, "learning_rate": 5.6145730761552505e-05, "loss": 0.1901, "step": 5173 }, { "epoch": 3.5935265336846065, "grad_norm": 0.15574201941490173, "learning_rate": 5.61308542949502e-05, "loss": 0.1555, "step": 5174 }, { "epoch": 3.5942792623259314, "grad_norm": 0.17916011810302734, "learning_rate": 5.611597770467936e-05, "loss": 0.1716, "step": 5175 }, { "epoch": 3.5950319909672563, "grad_norm": 0.15276247262954712, "learning_rate": 5.6101100992366885e-05, "loss": 0.1479, "step": 5176 }, { "epoch": 3.5957847196085813, "grad_norm": 0.17461450397968292, "learning_rate": 5.608622415963965e-05, "loss": 0.1573, "step": 5177 }, { "epoch": 3.596537448249906, "grad_norm": 0.14414243400096893, "learning_rate": 5.60713472081246e-05, "loss": 0.123, "step": 5178 }, { "epoch": 3.5972901768912307, "grad_norm": 0.17083479464054108, "learning_rate": 5.6056470139448605e-05, "loss": 0.1481, "step": 5179 }, { "epoch": 3.5980429055325556, "grad_norm": 0.19555193185806274, "learning_rate": 5.604159295523863e-05, "loss": 0.1816, "step": 5180 }, { "epoch": 3.5987956341738805, "grad_norm": 0.19650425016880035, "learning_rate": 5.602671565712162e-05, "loss": 0.1808, "step": 5181 }, { "epoch": 3.599548362815205, "grad_norm": 0.19692230224609375, "learning_rate": 5.6011838246724525e-05, "loss": 0.1665, "step": 5182 }, { "epoch": 3.60030109145653, "grad_norm": 0.1871134489774704, "learning_rate": 5.599696072567432e-05, "loss": 0.1252, "step": 5183 }, { "epoch": 3.601053820097855, "grad_norm": 0.1901259571313858, "learning_rate": 5.598208309559799e-05, "loss": 0.1849, "step": 5184 }, { "epoch": 3.6018065487391793, "grad_norm": 0.22242605686187744, "learning_rate": 5.596720535812253e-05, "loss": 0.1673, "step": 5185 }, { "epoch": 3.602559277380504, "grad_norm": 0.17299850285053253, "learning_rate": 5.595232751487496e-05, "loss": 0.1506, "step": 5186 }, { "epoch": 3.603312006021829, "grad_norm": 0.21883754432201385, "learning_rate": 5.593744956748227e-05, "loss": 0.2178, "step": 5187 }, { "epoch": 3.604064734663154, "grad_norm": 0.2063785344362259, "learning_rate": 5.59225715175715e-05, "loss": 0.1927, "step": 5188 }, { "epoch": 3.604817463304479, "grad_norm": 0.17144113779067993, "learning_rate": 5.59076933667697e-05, "loss": 0.1447, "step": 5189 }, { "epoch": 3.6055701919458034, "grad_norm": 0.17081138491630554, "learning_rate": 5.589281511670391e-05, "loss": 0.061, "step": 5190 }, { "epoch": 3.6063229205871283, "grad_norm": 0.17130210995674133, "learning_rate": 5.5877936769001224e-05, "loss": 0.2398, "step": 5191 }, { "epoch": 3.6070756492284533, "grad_norm": 0.20298534631729126, "learning_rate": 5.5863058325288674e-05, "loss": 0.1506, "step": 5192 }, { "epoch": 3.6078283778697777, "grad_norm": 0.19320270419120789, "learning_rate": 5.584817978719338e-05, "loss": 0.1141, "step": 5193 }, { "epoch": 3.6085811065111026, "grad_norm": 0.18179337680339813, "learning_rate": 5.583330115634241e-05, "loss": 0.1125, "step": 5194 }, { "epoch": 3.6093338351524276, "grad_norm": 0.16011942923069, "learning_rate": 5.5818422434362874e-05, "loss": 0.1088, "step": 5195 }, { "epoch": 3.6100865637937525, "grad_norm": 0.23013761639595032, "learning_rate": 5.580354362288189e-05, "loss": 0.2439, "step": 5196 }, { "epoch": 3.6108392924350774, "grad_norm": 0.22833789885044098, "learning_rate": 5.57886647235266e-05, "loss": 0.1773, "step": 5197 }, { "epoch": 3.611592021076402, "grad_norm": 0.21439515054225922, "learning_rate": 5.57737857379241e-05, "loss": 0.2101, "step": 5198 }, { "epoch": 3.612344749717727, "grad_norm": 0.22174686193466187, "learning_rate": 5.575890666770156e-05, "loss": 0.1936, "step": 5199 }, { "epoch": 3.6130974783590517, "grad_norm": 0.18489575386047363, "learning_rate": 5.574402751448614e-05, "loss": 0.2004, "step": 5200 }, { "epoch": 3.6130974783590517, "eval_loss": 0.19653339684009552, "eval_runtime": 456.3179, "eval_samples_per_second": 21.097, "eval_steps_per_second": 0.66, "step": 5200 }, { "epoch": 3.613850207000376, "grad_norm": 0.1759132742881775, "learning_rate": 5.572914827990498e-05, "loss": 0.1211, "step": 5201 }, { "epoch": 3.614602935641701, "grad_norm": 0.19443516433238983, "learning_rate": 5.571426896558526e-05, "loss": 0.1439, "step": 5202 }, { "epoch": 3.615355664283026, "grad_norm": 0.19836537539958954, "learning_rate": 5.569938957315416e-05, "loss": 0.1854, "step": 5203 }, { "epoch": 3.616108392924351, "grad_norm": 0.17287534475326538, "learning_rate": 5.568451010423886e-05, "loss": 0.1166, "step": 5204 }, { "epoch": 3.6168611215656754, "grad_norm": 0.19607391953468323, "learning_rate": 5.566963056046659e-05, "loss": 0.2643, "step": 5205 }, { "epoch": 3.6176138502070003, "grad_norm": 0.2216663509607315, "learning_rate": 5.5654750943464515e-05, "loss": 0.2039, "step": 5206 }, { "epoch": 3.6183665788483252, "grad_norm": 0.19845305383205414, "learning_rate": 5.5639871254859864e-05, "loss": 0.1062, "step": 5207 }, { "epoch": 3.61911930748965, "grad_norm": 0.23112821578979492, "learning_rate": 5.562499149627984e-05, "loss": 0.202, "step": 5208 }, { "epoch": 3.6198720361309746, "grad_norm": 0.1981322020292282, "learning_rate": 5.5610111669351694e-05, "loss": 0.189, "step": 5209 }, { "epoch": 3.6206247647722996, "grad_norm": 0.17765240371227264, "learning_rate": 5.559523177570266e-05, "loss": 0.2186, "step": 5210 }, { "epoch": 3.6213774934136245, "grad_norm": 0.21943986415863037, "learning_rate": 5.558035181695998e-05, "loss": 0.1854, "step": 5211 }, { "epoch": 3.622130222054949, "grad_norm": 0.20339475572109222, "learning_rate": 5.556547179475088e-05, "loss": 0.1715, "step": 5212 }, { "epoch": 3.622882950696274, "grad_norm": 0.19944876432418823, "learning_rate": 5.555059171070264e-05, "loss": 0.2174, "step": 5213 }, { "epoch": 3.623635679337599, "grad_norm": 0.17451219260692596, "learning_rate": 5.553571156644252e-05, "loss": 0.1165, "step": 5214 }, { "epoch": 3.6243884079789237, "grad_norm": 0.2010069191455841, "learning_rate": 5.552083136359778e-05, "loss": 0.1793, "step": 5215 }, { "epoch": 3.6251411366202486, "grad_norm": 0.2595932185649872, "learning_rate": 5.5505951103795715e-05, "loss": 0.1932, "step": 5216 }, { "epoch": 3.625893865261573, "grad_norm": 0.21191556751728058, "learning_rate": 5.549107078866359e-05, "loss": 0.1685, "step": 5217 }, { "epoch": 3.626646593902898, "grad_norm": 0.211817666888237, "learning_rate": 5.547619041982871e-05, "loss": 0.1844, "step": 5218 }, { "epoch": 3.627399322544223, "grad_norm": 0.1932569444179535, "learning_rate": 5.5461309998918364e-05, "loss": 0.182, "step": 5219 }, { "epoch": 3.6281520511855474, "grad_norm": 0.17915311455726624, "learning_rate": 5.544642952755984e-05, "loss": 0.2146, "step": 5220 }, { "epoch": 3.6289047798268723, "grad_norm": 0.21822528541088104, "learning_rate": 5.543154900738047e-05, "loss": 0.2011, "step": 5221 }, { "epoch": 3.6296575084681972, "grad_norm": 0.1783531904220581, "learning_rate": 5.5416668440007534e-05, "loss": 0.1531, "step": 5222 }, { "epoch": 3.630410237109522, "grad_norm": 0.24949924647808075, "learning_rate": 5.540178782706836e-05, "loss": 0.24, "step": 5223 }, { "epoch": 3.631162965750847, "grad_norm": 0.1891869306564331, "learning_rate": 5.5386907170190294e-05, "loss": 0.1678, "step": 5224 }, { "epoch": 3.6319156943921715, "grad_norm": 0.21742910146713257, "learning_rate": 5.537202647100063e-05, "loss": 0.1757, "step": 5225 }, { "epoch": 3.6326684230334965, "grad_norm": 0.20000849664211273, "learning_rate": 5.5357145731126716e-05, "loss": 0.1877, "step": 5226 }, { "epoch": 3.6334211516748214, "grad_norm": 0.2056545466184616, "learning_rate": 5.5342264952195864e-05, "loss": 0.1552, "step": 5227 }, { "epoch": 3.634173880316146, "grad_norm": 0.19001416862010956, "learning_rate": 5.5327384135835446e-05, "loss": 0.1247, "step": 5228 }, { "epoch": 3.6349266089574708, "grad_norm": 0.16407151520252228, "learning_rate": 5.53125032836728e-05, "loss": 0.1238, "step": 5229 }, { "epoch": 3.6356793375987957, "grad_norm": 0.1823868304491043, "learning_rate": 5.5297622397335245e-05, "loss": 0.1533, "step": 5230 }, { "epoch": 3.63643206624012, "grad_norm": 0.19373047351837158, "learning_rate": 5.528274147845016e-05, "loss": 0.1935, "step": 5231 }, { "epoch": 3.637184794881445, "grad_norm": 0.19885437190532684, "learning_rate": 5.52678605286449e-05, "loss": 0.1402, "step": 5232 }, { "epoch": 3.63793752352277, "grad_norm": 0.18390029668807983, "learning_rate": 5.52529795495468e-05, "loss": 0.2787, "step": 5233 }, { "epoch": 3.638690252164095, "grad_norm": 0.19383470714092255, "learning_rate": 5.523809854278324e-05, "loss": 0.2406, "step": 5234 }, { "epoch": 3.63944298080542, "grad_norm": 0.17947958409786224, "learning_rate": 5.5223217509981593e-05, "loss": 0.1884, "step": 5235 }, { "epoch": 3.6401957094467443, "grad_norm": 0.16577447950839996, "learning_rate": 5.520833645276921e-05, "loss": 0.1204, "step": 5236 }, { "epoch": 3.6409484380880692, "grad_norm": 0.19772230088710785, "learning_rate": 5.5193455372773464e-05, "loss": 0.1927, "step": 5237 }, { "epoch": 3.641701166729394, "grad_norm": 0.16144177317619324, "learning_rate": 5.517857427162172e-05, "loss": 0.1047, "step": 5238 }, { "epoch": 3.6424538953707186, "grad_norm": 0.17827686667442322, "learning_rate": 5.5163693150941375e-05, "loss": 0.1689, "step": 5239 }, { "epoch": 3.6432066240120435, "grad_norm": 0.19826774299144745, "learning_rate": 5.5148812012359785e-05, "loss": 0.116, "step": 5240 }, { "epoch": 3.6439593526533685, "grad_norm": 0.16640982031822205, "learning_rate": 5.513393085750435e-05, "loss": 0.1456, "step": 5241 }, { "epoch": 3.6447120812946934, "grad_norm": 0.17815405130386353, "learning_rate": 5.511904968800242e-05, "loss": 0.1437, "step": 5242 }, { "epoch": 3.6454648099360183, "grad_norm": 0.21178008615970612, "learning_rate": 5.5104168505481405e-05, "loss": 0.1624, "step": 5243 }, { "epoch": 3.6462175385773428, "grad_norm": 0.21380163729190826, "learning_rate": 5.508928731156867e-05, "loss": 0.2251, "step": 5244 }, { "epoch": 3.6469702672186677, "grad_norm": 0.1917026787996292, "learning_rate": 5.507440610789163e-05, "loss": 0.2179, "step": 5245 }, { "epoch": 3.6477229958599926, "grad_norm": 0.1920931041240692, "learning_rate": 5.505952489607763e-05, "loss": 0.1094, "step": 5246 }, { "epoch": 3.648475724501317, "grad_norm": 0.208588644862175, "learning_rate": 5.504464367775409e-05, "loss": 0.128, "step": 5247 }, { "epoch": 3.649228453142642, "grad_norm": 0.16279126703739166, "learning_rate": 5.502976245454838e-05, "loss": 0.1183, "step": 5248 }, { "epoch": 3.649981181783967, "grad_norm": 0.18126364052295685, "learning_rate": 5.5014881228087875e-05, "loss": 0.159, "step": 5249 }, { "epoch": 3.650733910425292, "grad_norm": 0.22472310066223145, "learning_rate": 5.500000000000001e-05, "loss": 0.1916, "step": 5250 }, { "epoch": 3.6514866390666167, "grad_norm": 0.2557971179485321, "learning_rate": 5.4985118771912135e-05, "loss": 0.1612, "step": 5251 }, { "epoch": 3.652239367707941, "grad_norm": 0.2192237228155136, "learning_rate": 5.497023754545164e-05, "loss": 0.135, "step": 5252 }, { "epoch": 3.652992096349266, "grad_norm": 0.1877821385860443, "learning_rate": 5.4955356322245945e-05, "loss": 0.1601, "step": 5253 }, { "epoch": 3.653744824990591, "grad_norm": 0.23534882068634033, "learning_rate": 5.4940475103922376e-05, "loss": 0.2113, "step": 5254 }, { "epoch": 3.6544975536319155, "grad_norm": 0.23633642494678497, "learning_rate": 5.4925593892108384e-05, "loss": 0.1003, "step": 5255 }, { "epoch": 3.6552502822732404, "grad_norm": 0.21281671524047852, "learning_rate": 5.491071268843133e-05, "loss": 0.1553, "step": 5256 }, { "epoch": 3.6560030109145654, "grad_norm": 0.21767736971378326, "learning_rate": 5.489583149451861e-05, "loss": 0.1294, "step": 5257 }, { "epoch": 3.65675573955589, "grad_norm": 0.2295861393213272, "learning_rate": 5.48809503119976e-05, "loss": 0.2171, "step": 5258 }, { "epoch": 3.6575084681972148, "grad_norm": 0.19856160879135132, "learning_rate": 5.486606914249567e-05, "loss": 0.1998, "step": 5259 }, { "epoch": 3.6582611968385397, "grad_norm": 0.20444701611995697, "learning_rate": 5.485118798764022e-05, "loss": 0.2334, "step": 5260 }, { "epoch": 3.6590139254798646, "grad_norm": 0.1453421711921692, "learning_rate": 5.4836306849058635e-05, "loss": 0.1333, "step": 5261 }, { "epoch": 3.6597666541211895, "grad_norm": 0.1869845986366272, "learning_rate": 5.4821425728378294e-05, "loss": 0.1041, "step": 5262 }, { "epoch": 3.660519382762514, "grad_norm": 0.13990288972854614, "learning_rate": 5.4806544627226554e-05, "loss": 0.0647, "step": 5263 }, { "epoch": 3.661272111403839, "grad_norm": 0.1933494359254837, "learning_rate": 5.479166354723081e-05, "loss": 0.166, "step": 5264 }, { "epoch": 3.662024840045164, "grad_norm": 0.16002000868320465, "learning_rate": 5.477678249001843e-05, "loss": 0.0954, "step": 5265 }, { "epoch": 3.6627775686864883, "grad_norm": 0.23540614545345306, "learning_rate": 5.476190145721677e-05, "loss": 0.1651, "step": 5266 }, { "epoch": 3.663530297327813, "grad_norm": 0.2084127813577652, "learning_rate": 5.47470204504532e-05, "loss": 0.1427, "step": 5267 }, { "epoch": 3.664283025969138, "grad_norm": 0.2202567160129547, "learning_rate": 5.4732139471355116e-05, "loss": 0.1701, "step": 5268 }, { "epoch": 3.665035754610463, "grad_norm": 0.18429292738437653, "learning_rate": 5.4717258521549855e-05, "loss": 0.1157, "step": 5269 }, { "epoch": 3.665788483251788, "grad_norm": 0.18935775756835938, "learning_rate": 5.4702377602664766e-05, "loss": 0.1269, "step": 5270 }, { "epoch": 3.6665412118931124, "grad_norm": 0.17983771860599518, "learning_rate": 5.468749671632722e-05, "loss": 0.0974, "step": 5271 }, { "epoch": 3.6672939405344374, "grad_norm": 0.19650907814502716, "learning_rate": 5.467261586416458e-05, "loss": 0.1785, "step": 5272 }, { "epoch": 3.6680466691757623, "grad_norm": 0.16783402860164642, "learning_rate": 5.465773504780414e-05, "loss": 0.0604, "step": 5273 }, { "epoch": 3.6687993978170867, "grad_norm": 0.21839889883995056, "learning_rate": 5.46428542688733e-05, "loss": 0.1993, "step": 5274 }, { "epoch": 3.6695521264584117, "grad_norm": 0.18298299610614777, "learning_rate": 5.462797352899939e-05, "loss": 0.112, "step": 5275 }, { "epoch": 3.6703048550997366, "grad_norm": 0.19711187481880188, "learning_rate": 5.461309282980973e-05, "loss": 0.1125, "step": 5276 }, { "epoch": 3.6710575837410615, "grad_norm": 0.1853863000869751, "learning_rate": 5.459821217293165e-05, "loss": 0.174, "step": 5277 }, { "epoch": 3.6718103123823864, "grad_norm": 0.1668562889099121, "learning_rate": 5.458333155999249e-05, "loss": 0.1811, "step": 5278 }, { "epoch": 3.672563041023711, "grad_norm": 0.19258007407188416, "learning_rate": 5.456845099261955e-05, "loss": 0.1076, "step": 5279 }, { "epoch": 3.673315769665036, "grad_norm": 0.1767362803220749, "learning_rate": 5.4553570472440165e-05, "loss": 0.1249, "step": 5280 }, { "epoch": 3.6740684983063607, "grad_norm": 0.18455566465854645, "learning_rate": 5.453869000108165e-05, "loss": 0.2445, "step": 5281 }, { "epoch": 3.674821226947685, "grad_norm": 0.20809103548526764, "learning_rate": 5.4523809580171307e-05, "loss": 0.1664, "step": 5282 }, { "epoch": 3.67557395558901, "grad_norm": 0.19587387144565582, "learning_rate": 5.450892921133642e-05, "loss": 0.2131, "step": 5283 }, { "epoch": 3.676326684230335, "grad_norm": 0.18880581855773926, "learning_rate": 5.4494048896204296e-05, "loss": 0.1618, "step": 5284 }, { "epoch": 3.6770794128716595, "grad_norm": 0.20947349071502686, "learning_rate": 5.447916863640223e-05, "loss": 0.1946, "step": 5285 }, { "epoch": 3.6778321415129844, "grad_norm": 0.1581653654575348, "learning_rate": 5.44642884335575e-05, "loss": 0.1223, "step": 5286 }, { "epoch": 3.6785848701543093, "grad_norm": 0.19387446343898773, "learning_rate": 5.444940828929738e-05, "loss": 0.1783, "step": 5287 }, { "epoch": 3.6793375987956343, "grad_norm": 0.2085724174976349, "learning_rate": 5.443452820524913e-05, "loss": 0.2007, "step": 5288 }, { "epoch": 3.680090327436959, "grad_norm": 0.16678382456302643, "learning_rate": 5.4419648183040053e-05, "loss": 0.1798, "step": 5289 }, { "epoch": 3.6808430560782837, "grad_norm": 0.16631093621253967, "learning_rate": 5.440476822429735e-05, "loss": 0.1385, "step": 5290 }, { "epoch": 3.6815957847196086, "grad_norm": 0.20038007199764252, "learning_rate": 5.438988833064832e-05, "loss": 0.1039, "step": 5291 }, { "epoch": 3.6823485133609335, "grad_norm": 0.17623674869537354, "learning_rate": 5.437500850372016e-05, "loss": 0.1055, "step": 5292 }, { "epoch": 3.683101242002258, "grad_norm": 0.1696634143590927, "learning_rate": 5.436012874514015e-05, "loss": 0.089, "step": 5293 }, { "epoch": 3.683853970643583, "grad_norm": 0.19968102872371674, "learning_rate": 5.4345249056535495e-05, "loss": 0.1994, "step": 5294 }, { "epoch": 3.684606699284908, "grad_norm": 0.16261036694049835, "learning_rate": 5.4330369439533425e-05, "loss": 0.1347, "step": 5295 }, { "epoch": 3.6853594279262327, "grad_norm": 0.19292744994163513, "learning_rate": 5.431548989576114e-05, "loss": 0.2047, "step": 5296 }, { "epoch": 3.6861121565675576, "grad_norm": 0.1778174638748169, "learning_rate": 5.4300610426845856e-05, "loss": 0.2035, "step": 5297 }, { "epoch": 3.686864885208882, "grad_norm": 0.19551308453083038, "learning_rate": 5.4285731034414744e-05, "loss": 0.1708, "step": 5298 }, { "epoch": 3.687617613850207, "grad_norm": 0.2131575345993042, "learning_rate": 5.4270851720095026e-05, "loss": 0.1787, "step": 5299 }, { "epoch": 3.688370342491532, "grad_norm": 0.15762147307395935, "learning_rate": 5.425597248551387e-05, "loss": 0.1399, "step": 5300 }, { "epoch": 3.6891230711328564, "grad_norm": 0.17360176146030426, "learning_rate": 5.424109333229845e-05, "loss": 0.2357, "step": 5301 }, { "epoch": 3.6898757997741813, "grad_norm": 0.17691434919834137, "learning_rate": 5.422621426207592e-05, "loss": 0.1922, "step": 5302 }, { "epoch": 3.6906285284155063, "grad_norm": 0.1692945510149002, "learning_rate": 5.421133527647343e-05, "loss": 0.1242, "step": 5303 }, { "epoch": 3.691381257056831, "grad_norm": 0.20659847557544708, "learning_rate": 5.419645637711813e-05, "loss": 0.1678, "step": 5304 }, { "epoch": 3.6921339856981557, "grad_norm": 0.19324584305286407, "learning_rate": 5.418157756563713e-05, "loss": 0.1792, "step": 5305 }, { "epoch": 3.6928867143394806, "grad_norm": 0.17799024283885956, "learning_rate": 5.41666988436576e-05, "loss": 0.1243, "step": 5306 }, { "epoch": 3.6936394429808055, "grad_norm": 0.1968451291322708, "learning_rate": 5.4151820212806633e-05, "loss": 0.1417, "step": 5307 }, { "epoch": 3.6943921716221304, "grad_norm": 0.1886061728000641, "learning_rate": 5.413694167471134e-05, "loss": 0.1818, "step": 5308 }, { "epoch": 3.695144900263455, "grad_norm": 0.1796746850013733, "learning_rate": 5.412206323099879e-05, "loss": 0.1446, "step": 5309 }, { "epoch": 3.69589762890478, "grad_norm": 0.20257766544818878, "learning_rate": 5.4107184883296094e-05, "loss": 0.1472, "step": 5310 }, { "epoch": 3.6966503575461047, "grad_norm": 0.172605961561203, "learning_rate": 5.4092306633230305e-05, "loss": 0.1307, "step": 5311 }, { "epoch": 3.697403086187429, "grad_norm": 0.18220770359039307, "learning_rate": 5.407742848242852e-05, "loss": 0.1433, "step": 5312 }, { "epoch": 3.698155814828754, "grad_norm": 0.20301663875579834, "learning_rate": 5.406255043251775e-05, "loss": 0.1561, "step": 5313 }, { "epoch": 3.698908543470079, "grad_norm": 0.20938719809055328, "learning_rate": 5.404767248512507e-05, "loss": 0.177, "step": 5314 }, { "epoch": 3.699661272111404, "grad_norm": 0.21828190982341766, "learning_rate": 5.403279464187748e-05, "loss": 0.2539, "step": 5315 }, { "epoch": 3.700414000752729, "grad_norm": 0.17433610558509827, "learning_rate": 5.401791690440202e-05, "loss": 0.2348, "step": 5316 }, { "epoch": 3.7011667293940533, "grad_norm": 0.20037029683589935, "learning_rate": 5.4003039274325685e-05, "loss": 0.209, "step": 5317 }, { "epoch": 3.7019194580353783, "grad_norm": 0.19511082768440247, "learning_rate": 5.398816175327548e-05, "loss": 0.1668, "step": 5318 }, { "epoch": 3.702672186676703, "grad_norm": 0.1824859231710434, "learning_rate": 5.397328434287839e-05, "loss": 0.1247, "step": 5319 }, { "epoch": 3.7034249153180276, "grad_norm": 0.22733363509178162, "learning_rate": 5.395840704476138e-05, "loss": 0.151, "step": 5320 }, { "epoch": 3.7041776439593526, "grad_norm": 0.22032125294208527, "learning_rate": 5.394352986055141e-05, "loss": 0.2408, "step": 5321 }, { "epoch": 3.7049303726006775, "grad_norm": 0.20146037638187408, "learning_rate": 5.3928652791875435e-05, "loss": 0.2197, "step": 5322 }, { "epoch": 3.7056831012420024, "grad_norm": 0.20835357904434204, "learning_rate": 5.391377584036036e-05, "loss": 0.1535, "step": 5323 }, { "epoch": 3.7064358298833273, "grad_norm": 0.18746602535247803, "learning_rate": 5.389889900763313e-05, "loss": 0.1658, "step": 5324 }, { "epoch": 3.707188558524652, "grad_norm": 0.18351200222969055, "learning_rate": 5.388402229532065e-05, "loss": 0.1562, "step": 5325 }, { "epoch": 3.7079412871659767, "grad_norm": 0.1886044144630432, "learning_rate": 5.3869145705049814e-05, "loss": 0.1221, "step": 5326 }, { "epoch": 3.7086940158073016, "grad_norm": 0.19364695250988007, "learning_rate": 5.385426923844752e-05, "loss": 0.217, "step": 5327 }, { "epoch": 3.709446744448626, "grad_norm": 0.18579281866550446, "learning_rate": 5.383939289714061e-05, "loss": 0.1515, "step": 5328 }, { "epoch": 3.710199473089951, "grad_norm": 0.19593875110149384, "learning_rate": 5.3824516682755945e-05, "loss": 0.2103, "step": 5329 }, { "epoch": 3.710952201731276, "grad_norm": 0.18191418051719666, "learning_rate": 5.380964059692035e-05, "loss": 0.2073, "step": 5330 }, { "epoch": 3.7117049303726004, "grad_norm": 0.19011171162128448, "learning_rate": 5.379476464126069e-05, "loss": 0.1446, "step": 5331 }, { "epoch": 3.7124576590139253, "grad_norm": 0.18823669850826263, "learning_rate": 5.377988881740376e-05, "loss": 0.1392, "step": 5332 }, { "epoch": 3.7132103876552502, "grad_norm": 0.18639038503170013, "learning_rate": 5.376501312697635e-05, "loss": 0.1397, "step": 5333 }, { "epoch": 3.713963116296575, "grad_norm": 0.21722687780857086, "learning_rate": 5.375013757160524e-05, "loss": 0.1961, "step": 5334 }, { "epoch": 3.7147158449379, "grad_norm": 0.17001508176326752, "learning_rate": 5.373526215291721e-05, "loss": 0.174, "step": 5335 }, { "epoch": 3.7154685735792246, "grad_norm": 0.22354158759117126, "learning_rate": 5.372038687253901e-05, "loss": 0.1289, "step": 5336 }, { "epoch": 3.7162213022205495, "grad_norm": 0.18682073056697845, "learning_rate": 5.370551173209738e-05, "loss": 0.1168, "step": 5337 }, { "epoch": 3.7169740308618744, "grad_norm": 0.1821158081293106, "learning_rate": 5.369063673321904e-05, "loss": 0.1304, "step": 5338 }, { "epoch": 3.717726759503199, "grad_norm": 0.14638465642929077, "learning_rate": 5.367576187753069e-05, "loss": 0.1642, "step": 5339 }, { "epoch": 3.718479488144524, "grad_norm": 0.1786772459745407, "learning_rate": 5.366088716665905e-05, "loss": 0.1711, "step": 5340 }, { "epoch": 3.7192322167858487, "grad_norm": 0.1579362154006958, "learning_rate": 5.364601260223076e-05, "loss": 0.138, "step": 5341 }, { "epoch": 3.7199849454271736, "grad_norm": 0.16318649053573608, "learning_rate": 5.363113818587251e-05, "loss": 0.1276, "step": 5342 }, { "epoch": 3.7207376740684985, "grad_norm": 0.2003650814294815, "learning_rate": 5.36162639192109e-05, "loss": 0.1551, "step": 5343 }, { "epoch": 3.721490402709823, "grad_norm": 0.17183135449886322, "learning_rate": 5.3601389803872615e-05, "loss": 0.1013, "step": 5344 }, { "epoch": 3.722243131351148, "grad_norm": 0.17500624060630798, "learning_rate": 5.358651584148423e-05, "loss": 0.1381, "step": 5345 }, { "epoch": 3.722995859992473, "grad_norm": 0.20010921359062195, "learning_rate": 5.3571642033672354e-05, "loss": 0.1717, "step": 5346 }, { "epoch": 3.7237485886337973, "grad_norm": 0.1960614025592804, "learning_rate": 5.355676838206355e-05, "loss": 0.1379, "step": 5347 }, { "epoch": 3.7245013172751222, "grad_norm": 0.12947995960712433, "learning_rate": 5.3541894888284394e-05, "loss": 0.09, "step": 5348 }, { "epoch": 3.725254045916447, "grad_norm": 0.17467373609542847, "learning_rate": 5.35270215539614e-05, "loss": 0.1165, "step": 5349 }, { "epoch": 3.726006774557772, "grad_norm": 0.19784381985664368, "learning_rate": 5.351214838072113e-05, "loss": 0.152, "step": 5350 }, { "epoch": 3.726759503199097, "grad_norm": 0.16104181110858917, "learning_rate": 5.349727537019008e-05, "loss": 0.1354, "step": 5351 }, { "epoch": 3.7275122318404215, "grad_norm": 0.1581774353981018, "learning_rate": 5.3482402523994734e-05, "loss": 0.17, "step": 5352 }, { "epoch": 3.7282649604817464, "grad_norm": 0.22043758630752563, "learning_rate": 5.3467529843761566e-05, "loss": 0.102, "step": 5353 }, { "epoch": 3.7290176891230713, "grad_norm": 0.20559576153755188, "learning_rate": 5.345265733111703e-05, "loss": 0.2021, "step": 5354 }, { "epoch": 3.7297704177643958, "grad_norm": 0.19651120901107788, "learning_rate": 5.3437784987687554e-05, "loss": 0.1674, "step": 5355 }, { "epoch": 3.7305231464057207, "grad_norm": 0.1931537687778473, "learning_rate": 5.342291281509957e-05, "loss": 0.1924, "step": 5356 }, { "epoch": 3.7312758750470456, "grad_norm": 0.174251988530159, "learning_rate": 5.340804081497946e-05, "loss": 0.0987, "step": 5357 }, { "epoch": 3.73202860368837, "grad_norm": 0.1895965039730072, "learning_rate": 5.339316898895364e-05, "loss": 0.1416, "step": 5358 }, { "epoch": 3.732781332329695, "grad_norm": 0.16127867996692657, "learning_rate": 5.337829733864841e-05, "loss": 0.0953, "step": 5359 }, { "epoch": 3.73353406097102, "grad_norm": 0.1768840104341507, "learning_rate": 5.336342586569016e-05, "loss": 0.077, "step": 5360 }, { "epoch": 3.734286789612345, "grad_norm": 0.22269852459430695, "learning_rate": 5.33485545717052e-05, "loss": 0.2295, "step": 5361 }, { "epoch": 3.7350395182536698, "grad_norm": 0.16859859228134155, "learning_rate": 5.3333683458319825e-05, "loss": 0.1972, "step": 5362 }, { "epoch": 3.7357922468949942, "grad_norm": 0.20677775144577026, "learning_rate": 5.3318812527160314e-05, "loss": 0.2002, "step": 5363 }, { "epoch": 3.736544975536319, "grad_norm": 0.16182449460029602, "learning_rate": 5.330394177985295e-05, "loss": 0.1659, "step": 5364 }, { "epoch": 3.737297704177644, "grad_norm": 0.19546851515769958, "learning_rate": 5.3289071218023955e-05, "loss": 0.1384, "step": 5365 }, { "epoch": 3.7380504328189685, "grad_norm": 0.178078293800354, "learning_rate": 5.327420084329957e-05, "loss": 0.1612, "step": 5366 }, { "epoch": 3.7388031614602935, "grad_norm": 0.18986831605434418, "learning_rate": 5.3259330657305985e-05, "loss": 0.108, "step": 5367 }, { "epoch": 3.7395558901016184, "grad_norm": 0.16627748310565948, "learning_rate": 5.324446066166936e-05, "loss": 0.1924, "step": 5368 }, { "epoch": 3.7403086187429433, "grad_norm": 0.20752990245819092, "learning_rate": 5.32295908580159e-05, "loss": 0.1891, "step": 5369 }, { "epoch": 3.741061347384268, "grad_norm": 0.1814156472682953, "learning_rate": 5.321472124797171e-05, "loss": 0.1997, "step": 5370 }, { "epoch": 3.7418140760255927, "grad_norm": 0.19348861277103424, "learning_rate": 5.319985183316293e-05, "loss": 0.2209, "step": 5371 }, { "epoch": 3.7425668046669176, "grad_norm": 0.1698899120092392, "learning_rate": 5.318498261521565e-05, "loss": 0.0957, "step": 5372 }, { "epoch": 3.7433195333082425, "grad_norm": 0.21195414662361145, "learning_rate": 5.317011359575593e-05, "loss": 0.139, "step": 5373 }, { "epoch": 3.744072261949567, "grad_norm": 0.20829322934150696, "learning_rate": 5.315524477640985e-05, "loss": 0.1455, "step": 5374 }, { "epoch": 3.744824990590892, "grad_norm": 0.21588672697544098, "learning_rate": 5.314037615880341e-05, "loss": 0.1654, "step": 5375 }, { "epoch": 3.745577719232217, "grad_norm": 0.1955781877040863, "learning_rate": 5.312550774456264e-05, "loss": 0.1375, "step": 5376 }, { "epoch": 3.7463304478735417, "grad_norm": 0.2255009561777115, "learning_rate": 5.3110639535313546e-05, "loss": 0.1991, "step": 5377 }, { "epoch": 3.7470831765148667, "grad_norm": 0.21603597700595856, "learning_rate": 5.309577153268206e-05, "loss": 0.1797, "step": 5378 }, { "epoch": 3.747835905156191, "grad_norm": 0.1874963492155075, "learning_rate": 5.308090373829413e-05, "loss": 0.1578, "step": 5379 }, { "epoch": 3.748588633797516, "grad_norm": 0.24479591846466064, "learning_rate": 5.3066036153775675e-05, "loss": 0.1405, "step": 5380 }, { "epoch": 3.749341362438841, "grad_norm": 0.1903408169746399, "learning_rate": 5.3051168780752616e-05, "loss": 0.166, "step": 5381 }, { "epoch": 3.7500940910801654, "grad_norm": 0.23021996021270752, "learning_rate": 5.303630162085079e-05, "loss": 0.1804, "step": 5382 }, { "epoch": 3.7508468197214904, "grad_norm": 0.16214938461780548, "learning_rate": 5.302143467569609e-05, "loss": 0.1532, "step": 5383 }, { "epoch": 3.7515995483628153, "grad_norm": 0.24179798364639282, "learning_rate": 5.30065679469143e-05, "loss": 0.1402, "step": 5384 }, { "epoch": 3.7523522770041398, "grad_norm": 0.16564473509788513, "learning_rate": 5.2991701436131246e-05, "loss": 0.1327, "step": 5385 }, { "epoch": 3.7531050056454647, "grad_norm": 0.20544643700122833, "learning_rate": 5.297683514497271e-05, "loss": 0.2161, "step": 5386 }, { "epoch": 3.7538577342867896, "grad_norm": 0.1802041083574295, "learning_rate": 5.296196907506443e-05, "loss": 0.1647, "step": 5387 }, { "epoch": 3.7546104629281145, "grad_norm": 0.1844628006219864, "learning_rate": 5.294710322803216e-05, "loss": 0.1424, "step": 5388 }, { "epoch": 3.7553631915694394, "grad_norm": 0.19919666647911072, "learning_rate": 5.2932237605501586e-05, "loss": 0.1417, "step": 5389 }, { "epoch": 3.756115920210764, "grad_norm": 0.18044975399971008, "learning_rate": 5.291737220909842e-05, "loss": 0.1698, "step": 5390 }, { "epoch": 3.756868648852089, "grad_norm": 0.22867731750011444, "learning_rate": 5.290250704044828e-05, "loss": 0.1693, "step": 5391 }, { "epoch": 3.7576213774934137, "grad_norm": 0.1963941901922226, "learning_rate": 5.288764210117682e-05, "loss": 0.2326, "step": 5392 }, { "epoch": 3.758374106134738, "grad_norm": 0.1740790456533432, "learning_rate": 5.2872777392909646e-05, "loss": 0.178, "step": 5393 }, { "epoch": 3.759126834776063, "grad_norm": 0.22368307411670685, "learning_rate": 5.2857912917272315e-05, "loss": 0.2397, "step": 5394 }, { "epoch": 3.759879563417388, "grad_norm": 0.19032247364521027, "learning_rate": 5.284304867589043e-05, "loss": 0.1501, "step": 5395 }, { "epoch": 3.760632292058713, "grad_norm": 0.17606361210346222, "learning_rate": 5.282818467038949e-05, "loss": 0.1771, "step": 5396 }, { "epoch": 3.761385020700038, "grad_norm": 0.216240793466568, "learning_rate": 5.281332090239503e-05, "loss": 0.1389, "step": 5397 }, { "epoch": 3.7621377493413624, "grad_norm": 0.17718175053596497, "learning_rate": 5.2798457373532485e-05, "loss": 0.1666, "step": 5398 }, { "epoch": 3.7628904779826873, "grad_norm": 0.2199307680130005, "learning_rate": 5.278359408542734e-05, "loss": 0.2548, "step": 5399 }, { "epoch": 3.763643206624012, "grad_norm": 0.1866772472858429, "learning_rate": 5.2768731039705e-05, "loss": 0.2081, "step": 5400 }, { "epoch": 3.763643206624012, "eval_loss": 0.19213122129440308, "eval_runtime": 456.9222, "eval_samples_per_second": 21.069, "eval_steps_per_second": 0.659, "step": 5400 }, { "epoch": 3.7643959352653367, "grad_norm": 0.17177683115005493, "learning_rate": 5.275386823799089e-05, "loss": 0.1276, "step": 5401 }, { "epoch": 3.7651486639066616, "grad_norm": 0.17000891268253326, "learning_rate": 5.273900568191038e-05, "loss": 0.1812, "step": 5402 }, { "epoch": 3.7659013925479865, "grad_norm": 0.18587850034236908, "learning_rate": 5.272414337308881e-05, "loss": 0.2288, "step": 5403 }, { "epoch": 3.766654121189311, "grad_norm": 0.19673044979572296, "learning_rate": 5.2709281313151484e-05, "loss": 0.1407, "step": 5404 }, { "epoch": 3.767406849830636, "grad_norm": 0.18690834939479828, "learning_rate": 5.269441950372371e-05, "loss": 0.1254, "step": 5405 }, { "epoch": 3.768159578471961, "grad_norm": 0.1974131464958191, "learning_rate": 5.267955794643076e-05, "loss": 0.1525, "step": 5406 }, { "epoch": 3.7689123071132857, "grad_norm": 0.2230214625597, "learning_rate": 5.266469664289787e-05, "loss": 0.1882, "step": 5407 }, { "epoch": 3.7696650357546106, "grad_norm": 0.19612661004066467, "learning_rate": 5.264983559475023e-05, "loss": 0.0986, "step": 5408 }, { "epoch": 3.770417764395935, "grad_norm": 0.1783437281847, "learning_rate": 5.2634974803613045e-05, "loss": 0.1696, "step": 5409 }, { "epoch": 3.77117049303726, "grad_norm": 0.2005731761455536, "learning_rate": 5.262011427111147e-05, "loss": 0.2096, "step": 5410 }, { "epoch": 3.771923221678585, "grad_norm": 0.15857014060020447, "learning_rate": 5.260525399887061e-05, "loss": 0.0867, "step": 5411 }, { "epoch": 3.7726759503199094, "grad_norm": 0.2021746188402176, "learning_rate": 5.259039398851556e-05, "loss": 0.1933, "step": 5412 }, { "epoch": 3.7734286789612344, "grad_norm": 0.18489062786102295, "learning_rate": 5.25755342416714e-05, "loss": 0.1585, "step": 5413 }, { "epoch": 3.7741814076025593, "grad_norm": 0.1901915967464447, "learning_rate": 5.2560674759963194e-05, "loss": 0.105, "step": 5414 }, { "epoch": 3.774934136243884, "grad_norm": 0.18282127380371094, "learning_rate": 5.254581554501591e-05, "loss": 0.1601, "step": 5415 }, { "epoch": 3.775686864885209, "grad_norm": 0.16706493496894836, "learning_rate": 5.253095659845456e-05, "loss": 0.0936, "step": 5416 }, { "epoch": 3.7764395935265336, "grad_norm": 0.16587352752685547, "learning_rate": 5.251609792190408e-05, "loss": 0.1664, "step": 5417 }, { "epoch": 3.7771923221678585, "grad_norm": 0.20567044615745544, "learning_rate": 5.250123951698941e-05, "loss": 0.2358, "step": 5418 }, { "epoch": 3.7779450508091834, "grad_norm": 0.17385558784008026, "learning_rate": 5.2486381385335394e-05, "loss": 0.1622, "step": 5419 }, { "epoch": 3.778697779450508, "grad_norm": 0.18037647008895874, "learning_rate": 5.247152352856696e-05, "loss": 0.2102, "step": 5420 }, { "epoch": 3.779450508091833, "grad_norm": 0.21705631911754608, "learning_rate": 5.24566659483089e-05, "loss": 0.1273, "step": 5421 }, { "epoch": 3.7802032367331577, "grad_norm": 0.2024896889925003, "learning_rate": 5.244180864618604e-05, "loss": 0.1734, "step": 5422 }, { "epoch": 3.7809559653744826, "grad_norm": 0.17708832025527954, "learning_rate": 5.242695162382313e-05, "loss": 0.1953, "step": 5423 }, { "epoch": 3.7817086940158076, "grad_norm": 0.1742921620607376, "learning_rate": 5.2412094882844934e-05, "loss": 0.2165, "step": 5424 }, { "epoch": 3.782461422657132, "grad_norm": 0.20241037011146545, "learning_rate": 5.239723842487613e-05, "loss": 0.1259, "step": 5425 }, { "epoch": 3.783214151298457, "grad_norm": 0.13644464313983917, "learning_rate": 5.2382382251541415e-05, "loss": 0.0592, "step": 5426 }, { "epoch": 3.783966879939782, "grad_norm": 0.1801811158657074, "learning_rate": 5.236752636446547e-05, "loss": 0.1479, "step": 5427 }, { "epoch": 3.7847196085811063, "grad_norm": 0.18447665870189667, "learning_rate": 5.235267076527287e-05, "loss": 0.1353, "step": 5428 }, { "epoch": 3.7854723372224313, "grad_norm": 0.19251637160778046, "learning_rate": 5.2337815455588204e-05, "loss": 0.0987, "step": 5429 }, { "epoch": 3.786225065863756, "grad_norm": 0.2092086225748062, "learning_rate": 5.232296043703605e-05, "loss": 0.2758, "step": 5430 }, { "epoch": 3.7869777945050807, "grad_norm": 0.15464644134044647, "learning_rate": 5.2308105711240916e-05, "loss": 0.0981, "step": 5431 }, { "epoch": 3.7877305231464056, "grad_norm": 0.16940337419509888, "learning_rate": 5.229325127982729e-05, "loss": 0.1793, "step": 5432 }, { "epoch": 3.7884832517877305, "grad_norm": 0.19703231751918793, "learning_rate": 5.2278397144419635e-05, "loss": 0.1844, "step": 5433 }, { "epoch": 3.7892359804290554, "grad_norm": 0.1536184549331665, "learning_rate": 5.226354330664238e-05, "loss": 0.0956, "step": 5434 }, { "epoch": 3.7899887090703803, "grad_norm": 0.19047437608242035, "learning_rate": 5.224868976811992e-05, "loss": 0.191, "step": 5435 }, { "epoch": 3.790741437711705, "grad_norm": 0.2139551043510437, "learning_rate": 5.223383653047661e-05, "loss": 0.1455, "step": 5436 }, { "epoch": 3.7914941663530297, "grad_norm": 0.20845972001552582, "learning_rate": 5.221898359533677e-05, "loss": 0.1455, "step": 5437 }, { "epoch": 3.7922468949943546, "grad_norm": 0.1640598475933075, "learning_rate": 5.2204130964324695e-05, "loss": 0.1235, "step": 5438 }, { "epoch": 3.792999623635679, "grad_norm": 0.20314091444015503, "learning_rate": 5.2189278639064666e-05, "loss": 0.1742, "step": 5439 }, { "epoch": 3.793752352277004, "grad_norm": 0.17720352113246918, "learning_rate": 5.2174426621180906e-05, "loss": 0.1946, "step": 5440 }, { "epoch": 3.794505080918329, "grad_norm": 0.18817782402038574, "learning_rate": 5.215957491229759e-05, "loss": 0.1856, "step": 5441 }, { "epoch": 3.795257809559654, "grad_norm": 0.15980221331119537, "learning_rate": 5.21447235140389e-05, "loss": 0.1226, "step": 5442 }, { "epoch": 3.796010538200979, "grad_norm": 0.17705294489860535, "learning_rate": 5.212987242802896e-05, "loss": 0.178, "step": 5443 }, { "epoch": 3.7967632668423033, "grad_norm": 0.17602728307247162, "learning_rate": 5.211502165589184e-05, "loss": 0.1339, "step": 5444 }, { "epoch": 3.797515995483628, "grad_norm": 0.1776461899280548, "learning_rate": 5.21001711992516e-05, "loss": 0.2, "step": 5445 }, { "epoch": 3.798268724124953, "grad_norm": 0.19782474637031555, "learning_rate": 5.208532105973231e-05, "loss": 0.154, "step": 5446 }, { "epoch": 3.7990214527662776, "grad_norm": 0.18478217720985413, "learning_rate": 5.207047123895792e-05, "loss": 0.1665, "step": 5447 }, { "epoch": 3.7997741814076025, "grad_norm": 0.17922237515449524, "learning_rate": 5.205562173855238e-05, "loss": 0.146, "step": 5448 }, { "epoch": 3.8005269100489274, "grad_norm": 0.14374940097332, "learning_rate": 5.204077256013963e-05, "loss": 0.1165, "step": 5449 }, { "epoch": 3.8012796386902523, "grad_norm": 0.17213982343673706, "learning_rate": 5.2025923705343535e-05, "loss": 0.2155, "step": 5450 }, { "epoch": 3.8020323673315772, "grad_norm": 0.19447895884513855, "learning_rate": 5.201107517578795e-05, "loss": 0.1196, "step": 5451 }, { "epoch": 3.8027850959729017, "grad_norm": 0.2245078980922699, "learning_rate": 5.19962269730967e-05, "loss": 0.2863, "step": 5452 }, { "epoch": 3.8035378246142266, "grad_norm": 0.18872109055519104, "learning_rate": 5.198137909889355e-05, "loss": 0.1295, "step": 5453 }, { "epoch": 3.8042905532555515, "grad_norm": 0.2691500782966614, "learning_rate": 5.196653155480223e-05, "loss": 0.1586, "step": 5454 }, { "epoch": 3.805043281896876, "grad_norm": 0.1733262836933136, "learning_rate": 5.195168434244647e-05, "loss": 0.2098, "step": 5455 }, { "epoch": 3.805796010538201, "grad_norm": 0.1855417788028717, "learning_rate": 5.1936837463449925e-05, "loss": 0.1606, "step": 5456 }, { "epoch": 3.806548739179526, "grad_norm": 0.17817793786525726, "learning_rate": 5.1921990919436234e-05, "loss": 0.1344, "step": 5457 }, { "epoch": 3.8073014678208503, "grad_norm": 0.2089070826768875, "learning_rate": 5.190714471202898e-05, "loss": 0.2582, "step": 5458 }, { "epoch": 3.8080541964621752, "grad_norm": 0.17886687815189362, "learning_rate": 5.189229884285174e-05, "loss": 0.2092, "step": 5459 }, { "epoch": 3.8088069251035, "grad_norm": 0.19514812529087067, "learning_rate": 5.187745331352802e-05, "loss": 0.1462, "step": 5460 }, { "epoch": 3.809559653744825, "grad_norm": 0.20899391174316406, "learning_rate": 5.186260812568132e-05, "loss": 0.23, "step": 5461 }, { "epoch": 3.81031238238615, "grad_norm": 0.2028789222240448, "learning_rate": 5.184776328093508e-05, "loss": 0.1377, "step": 5462 }, { "epoch": 3.8110651110274745, "grad_norm": 0.1688479334115982, "learning_rate": 5.18329187809127e-05, "loss": 0.0873, "step": 5463 }, { "epoch": 3.8118178396687994, "grad_norm": 0.19856636226177216, "learning_rate": 5.181807462723755e-05, "loss": 0.1118, "step": 5464 }, { "epoch": 3.8125705683101243, "grad_norm": 0.19294849038124084, "learning_rate": 5.180323082153298e-05, "loss": 0.1598, "step": 5465 }, { "epoch": 3.813323296951449, "grad_norm": 0.17532788217067719, "learning_rate": 5.17883873654223e-05, "loss": 0.1548, "step": 5466 }, { "epoch": 3.8140760255927737, "grad_norm": 0.16419808566570282, "learning_rate": 5.177354426052873e-05, "loss": 0.0906, "step": 5467 }, { "epoch": 3.8148287542340986, "grad_norm": 0.16860421001911163, "learning_rate": 5.175870150847552e-05, "loss": 0.1353, "step": 5468 }, { "epoch": 3.8155814828754235, "grad_norm": 0.1740824580192566, "learning_rate": 5.174385911088584e-05, "loss": 0.1658, "step": 5469 }, { "epoch": 3.8163342115167485, "grad_norm": 0.18234539031982422, "learning_rate": 5.17290170693828e-05, "loss": 0.2083, "step": 5470 }, { "epoch": 3.817086940158073, "grad_norm": 0.1639629751443863, "learning_rate": 5.171417538558957e-05, "loss": 0.1445, "step": 5471 }, { "epoch": 3.817839668799398, "grad_norm": 0.15640610456466675, "learning_rate": 5.169933406112917e-05, "loss": 0.1222, "step": 5472 }, { "epoch": 3.8185923974407228, "grad_norm": 0.16791021823883057, "learning_rate": 5.1684493097624644e-05, "loss": 0.1683, "step": 5473 }, { "epoch": 3.8193451260820472, "grad_norm": 0.16085907816886902, "learning_rate": 5.166965249669896e-05, "loss": 0.0864, "step": 5474 }, { "epoch": 3.820097854723372, "grad_norm": 0.15336690843105316, "learning_rate": 5.165481225997507e-05, "loss": 0.1943, "step": 5475 }, { "epoch": 3.820850583364697, "grad_norm": 0.18072471022605896, "learning_rate": 5.163997238907589e-05, "loss": 0.2174, "step": 5476 }, { "epoch": 3.821603312006022, "grad_norm": 0.19400781393051147, "learning_rate": 5.162513288562425e-05, "loss": 0.2194, "step": 5477 }, { "epoch": 3.822356040647347, "grad_norm": 0.1884496957063675, "learning_rate": 5.161029375124303e-05, "loss": 0.1384, "step": 5478 }, { "epoch": 3.8231087692886714, "grad_norm": 0.16545067727565765, "learning_rate": 5.159545498755498e-05, "loss": 0.1496, "step": 5479 }, { "epoch": 3.8238614979299963, "grad_norm": 0.1656220555305481, "learning_rate": 5.158061659618286e-05, "loss": 0.1439, "step": 5480 }, { "epoch": 3.824614226571321, "grad_norm": 0.17456191778182983, "learning_rate": 5.156577857874938e-05, "loss": 0.1508, "step": 5481 }, { "epoch": 3.8253669552126457, "grad_norm": 0.16888447105884552, "learning_rate": 5.155094093687717e-05, "loss": 0.122, "step": 5482 }, { "epoch": 3.8261196838539706, "grad_norm": 0.170962855219841, "learning_rate": 5.1536103672188865e-05, "loss": 0.1172, "step": 5483 }, { "epoch": 3.8268724124952955, "grad_norm": 0.16534298658370972, "learning_rate": 5.152126678630708e-05, "loss": 0.1413, "step": 5484 }, { "epoch": 3.82762514113662, "grad_norm": 0.1662570685148239, "learning_rate": 5.150643028085431e-05, "loss": 0.1778, "step": 5485 }, { "epoch": 3.828377869777945, "grad_norm": 0.205159991979599, "learning_rate": 5.1491594157453095e-05, "loss": 0.1886, "step": 5486 }, { "epoch": 3.82913059841927, "grad_norm": 0.15973663330078125, "learning_rate": 5.147675841772586e-05, "loss": 0.089, "step": 5487 }, { "epoch": 3.8298833270605948, "grad_norm": 0.2331833839416504, "learning_rate": 5.1461923063295024e-05, "loss": 0.1349, "step": 5488 }, { "epoch": 3.8306360557019197, "grad_norm": 0.1523866206407547, "learning_rate": 5.1447088095782944e-05, "loss": 0.1098, "step": 5489 }, { "epoch": 3.831388784343244, "grad_norm": 0.23449061810970306, "learning_rate": 5.143225351681198e-05, "loss": 0.2013, "step": 5490 }, { "epoch": 3.832141512984569, "grad_norm": 0.17600645124912262, "learning_rate": 5.141741932800442e-05, "loss": 0.166, "step": 5491 }, { "epoch": 3.832894241625894, "grad_norm": 0.19061972200870514, "learning_rate": 5.1402585530982485e-05, "loss": 0.1601, "step": 5492 }, { "epoch": 3.8336469702672185, "grad_norm": 0.23552276194095612, "learning_rate": 5.13877521273684e-05, "loss": 0.1578, "step": 5493 }, { "epoch": 3.8343996989085434, "grad_norm": 0.20264212787151337, "learning_rate": 5.1372919118784305e-05, "loss": 0.1922, "step": 5494 }, { "epoch": 3.8351524275498683, "grad_norm": 0.18261215090751648, "learning_rate": 5.135808650685232e-05, "loss": 0.2229, "step": 5495 }, { "epoch": 3.835905156191193, "grad_norm": 0.18798880279064178, "learning_rate": 5.13432542931945e-05, "loss": 0.1592, "step": 5496 }, { "epoch": 3.836657884832518, "grad_norm": 0.1631343960762024, "learning_rate": 5.1328422479432915e-05, "loss": 0.1712, "step": 5497 }, { "epoch": 3.8374106134738426, "grad_norm": 0.19148829579353333, "learning_rate": 5.1313591067189525e-05, "loss": 0.1906, "step": 5498 }, { "epoch": 3.8381633421151675, "grad_norm": 0.1650860458612442, "learning_rate": 5.129876005808627e-05, "loss": 0.1207, "step": 5499 }, { "epoch": 3.8389160707564924, "grad_norm": 0.1877177208662033, "learning_rate": 5.128392945374505e-05, "loss": 0.1195, "step": 5500 }, { "epoch": 3.839668799397817, "grad_norm": 0.23256826400756836, "learning_rate": 5.1269099255787725e-05, "loss": 0.1748, "step": 5501 }, { "epoch": 3.840421528039142, "grad_norm": 0.17633099853992462, "learning_rate": 5.125426946583608e-05, "loss": 0.2251, "step": 5502 }, { "epoch": 3.8411742566804667, "grad_norm": 0.1696244776248932, "learning_rate": 5.123944008551191e-05, "loss": 0.1337, "step": 5503 }, { "epoch": 3.841926985321791, "grad_norm": 0.20304381847381592, "learning_rate": 5.122461111643692e-05, "loss": 0.1412, "step": 5504 }, { "epoch": 3.842679713963116, "grad_norm": 0.15329620242118835, "learning_rate": 5.1209782560232765e-05, "loss": 0.1005, "step": 5505 }, { "epoch": 3.843432442604441, "grad_norm": 0.1839240938425064, "learning_rate": 5.11949544185211e-05, "loss": 0.1653, "step": 5506 }, { "epoch": 3.844185171245766, "grad_norm": 0.18411104381084442, "learning_rate": 5.1180126692923515e-05, "loss": 0.1332, "step": 5507 }, { "epoch": 3.844937899887091, "grad_norm": 0.20269116759300232, "learning_rate": 5.1165299385061496e-05, "loss": 0.1396, "step": 5508 }, { "epoch": 3.8456906285284154, "grad_norm": 0.17911048233509064, "learning_rate": 5.11504724965566e-05, "loss": 0.1446, "step": 5509 }, { "epoch": 3.8464433571697403, "grad_norm": 0.22531037032604218, "learning_rate": 5.1135646029030236e-05, "loss": 0.1971, "step": 5510 }, { "epoch": 3.847196085811065, "grad_norm": 0.15924705564975739, "learning_rate": 5.1120819984103816e-05, "loss": 0.0872, "step": 5511 }, { "epoch": 3.8479488144523897, "grad_norm": 0.1996069699525833, "learning_rate": 5.110599436339869e-05, "loss": 0.1446, "step": 5512 }, { "epoch": 3.8487015430937146, "grad_norm": 0.1675160974264145, "learning_rate": 5.109116916853617e-05, "loss": 0.119, "step": 5513 }, { "epoch": 3.8494542717350395, "grad_norm": 0.23837320506572723, "learning_rate": 5.107634440113751e-05, "loss": 0.2637, "step": 5514 }, { "epoch": 3.8502070003763644, "grad_norm": 0.20101051032543182, "learning_rate": 5.10615200628239e-05, "loss": 0.1333, "step": 5515 }, { "epoch": 3.8509597290176893, "grad_norm": 0.21643894910812378, "learning_rate": 5.104669615521657e-05, "loss": 0.1989, "step": 5516 }, { "epoch": 3.851712457659014, "grad_norm": 0.21060416102409363, "learning_rate": 5.103187267993659e-05, "loss": 0.2288, "step": 5517 }, { "epoch": 3.8524651863003387, "grad_norm": 0.17827077209949493, "learning_rate": 5.1017049638605044e-05, "loss": 0.1095, "step": 5518 }, { "epoch": 3.8532179149416637, "grad_norm": 0.22666989266872406, "learning_rate": 5.1002227032842976e-05, "loss": 0.1269, "step": 5519 }, { "epoch": 3.853970643582988, "grad_norm": 0.21276147663593292, "learning_rate": 5.098740486427134e-05, "loss": 0.1398, "step": 5520 }, { "epoch": 3.854723372224313, "grad_norm": 0.17461122572422028, "learning_rate": 5.097258313451106e-05, "loss": 0.0666, "step": 5521 }, { "epoch": 3.855476100865638, "grad_norm": 0.17997239530086517, "learning_rate": 5.095776184518305e-05, "loss": 0.1431, "step": 5522 }, { "epoch": 3.856228829506963, "grad_norm": 0.17068101465702057, "learning_rate": 5.094294099790813e-05, "loss": 0.1441, "step": 5523 }, { "epoch": 3.856981558148288, "grad_norm": 0.18549509346485138, "learning_rate": 5.092812059430708e-05, "loss": 0.1398, "step": 5524 }, { "epoch": 3.8577342867896123, "grad_norm": 0.22579117119312286, "learning_rate": 5.0913300636000624e-05, "loss": 0.1721, "step": 5525 }, { "epoch": 3.858487015430937, "grad_norm": 0.24943433701992035, "learning_rate": 5.0898481124609485e-05, "loss": 0.203, "step": 5526 }, { "epoch": 3.859239744072262, "grad_norm": 0.24256478250026703, "learning_rate": 5.088366206175428e-05, "loss": 0.1916, "step": 5527 }, { "epoch": 3.8599924727135866, "grad_norm": 0.20794977247714996, "learning_rate": 5.08688434490556e-05, "loss": 0.1474, "step": 5528 }, { "epoch": 3.8607452013549115, "grad_norm": 0.1851341426372528, "learning_rate": 5.085402528813399e-05, "loss": 0.1841, "step": 5529 }, { "epoch": 3.8614979299962364, "grad_norm": 0.22855518758296967, "learning_rate": 5.083920758060995e-05, "loss": 0.1448, "step": 5530 }, { "epoch": 3.862250658637561, "grad_norm": 0.16604267060756683, "learning_rate": 5.082439032810391e-05, "loss": 0.1367, "step": 5531 }, { "epoch": 3.863003387278886, "grad_norm": 0.16825447976589203, "learning_rate": 5.0809573532236264e-05, "loss": 0.1145, "step": 5532 }, { "epoch": 3.8637561159202107, "grad_norm": 0.20263521373271942, "learning_rate": 5.0794757194627354e-05, "loss": 0.1619, "step": 5533 }, { "epoch": 3.8645088445615356, "grad_norm": 0.18418656289577484, "learning_rate": 5.077994131689746e-05, "loss": 0.2024, "step": 5534 }, { "epoch": 3.8652615732028606, "grad_norm": 0.21938222646713257, "learning_rate": 5.076512590066685e-05, "loss": 0.1831, "step": 5535 }, { "epoch": 3.866014301844185, "grad_norm": 0.20541448891162872, "learning_rate": 5.0750310947555716e-05, "loss": 0.2303, "step": 5536 }, { "epoch": 3.86676703048551, "grad_norm": 0.17903806269168854, "learning_rate": 5.073549645918418e-05, "loss": 0.1195, "step": 5537 }, { "epoch": 3.867519759126835, "grad_norm": 0.17044906318187714, "learning_rate": 5.072068243717234e-05, "loss": 0.104, "step": 5538 }, { "epoch": 3.8682724877681594, "grad_norm": 0.18774886429309845, "learning_rate": 5.070586888314025e-05, "loss": 0.0973, "step": 5539 }, { "epoch": 3.8690252164094843, "grad_norm": 0.1689593493938446, "learning_rate": 5.0691055798707844e-05, "loss": 0.128, "step": 5540 }, { "epoch": 3.869777945050809, "grad_norm": 0.1910451054573059, "learning_rate": 5.067624318549512e-05, "loss": 0.085, "step": 5541 }, { "epoch": 3.870530673692134, "grad_norm": 0.18809866905212402, "learning_rate": 5.066143104512194e-05, "loss": 0.1784, "step": 5542 }, { "epoch": 3.871283402333459, "grad_norm": 0.17588992416858673, "learning_rate": 5.064661937920815e-05, "loss": 0.1925, "step": 5543 }, { "epoch": 3.8720361309747835, "grad_norm": 0.19011704623699188, "learning_rate": 5.063180818937351e-05, "loss": 0.1762, "step": 5544 }, { "epoch": 3.8727888596161084, "grad_norm": 0.19696520268917084, "learning_rate": 5.061699747723776e-05, "loss": 0.1618, "step": 5545 }, { "epoch": 3.8735415882574333, "grad_norm": 0.2028598040342331, "learning_rate": 5.0602187244420565e-05, "loss": 0.1375, "step": 5546 }, { "epoch": 3.874294316898758, "grad_norm": 0.143503800034523, "learning_rate": 5.058737749254157e-05, "loss": 0.1442, "step": 5547 }, { "epoch": 3.8750470455400827, "grad_norm": 0.18804767727851868, "learning_rate": 5.057256822322034e-05, "loss": 0.0704, "step": 5548 }, { "epoch": 3.8757997741814076, "grad_norm": 0.1785336583852768, "learning_rate": 5.0557759438076394e-05, "loss": 0.1931, "step": 5549 }, { "epoch": 3.8765525028227326, "grad_norm": 0.17008760571479797, "learning_rate": 5.054295113872918e-05, "loss": 0.1647, "step": 5550 }, { "epoch": 3.8773052314640575, "grad_norm": 0.18146558105945587, "learning_rate": 5.052814332679815e-05, "loss": 0.1518, "step": 5551 }, { "epoch": 3.878057960105382, "grad_norm": 0.20342278480529785, "learning_rate": 5.0513336003902635e-05, "loss": 0.2362, "step": 5552 }, { "epoch": 3.878810688746707, "grad_norm": 0.23679795861244202, "learning_rate": 5.0498529171661944e-05, "loss": 0.1747, "step": 5553 }, { "epoch": 3.879563417388032, "grad_norm": 0.16843636333942413, "learning_rate": 5.048372283169532e-05, "loss": 0.1597, "step": 5554 }, { "epoch": 3.8803161460293563, "grad_norm": 0.18239332735538483, "learning_rate": 5.0468916985622e-05, "loss": 0.1852, "step": 5555 }, { "epoch": 3.881068874670681, "grad_norm": 0.17118492722511292, "learning_rate": 5.045411163506112e-05, "loss": 0.1252, "step": 5556 }, { "epoch": 3.881821603312006, "grad_norm": 0.173936665058136, "learning_rate": 5.043930678163174e-05, "loss": 0.1569, "step": 5557 }, { "epoch": 3.8825743319533306, "grad_norm": 0.2199733555316925, "learning_rate": 5.042450242695291e-05, "loss": 0.1479, "step": 5558 }, { "epoch": 3.8833270605946555, "grad_norm": 0.2346131056547165, "learning_rate": 5.0409698572643604e-05, "loss": 0.1554, "step": 5559 }, { "epoch": 3.8840797892359804, "grad_norm": 0.22555670142173767, "learning_rate": 5.039489522032278e-05, "loss": 0.2333, "step": 5560 }, { "epoch": 3.8848325178773053, "grad_norm": 0.21902860701084137, "learning_rate": 5.0380092371609285e-05, "loss": 0.266, "step": 5561 }, { "epoch": 3.8855852465186302, "grad_norm": 0.15691551566123962, "learning_rate": 5.0365290028121944e-05, "loss": 0.1959, "step": 5562 }, { "epoch": 3.8863379751599547, "grad_norm": 0.16143685579299927, "learning_rate": 5.0350488191479515e-05, "loss": 0.1517, "step": 5563 }, { "epoch": 3.8870907038012796, "grad_norm": 0.17459887266159058, "learning_rate": 5.033568686330071e-05, "loss": 0.1132, "step": 5564 }, { "epoch": 3.8878434324426046, "grad_norm": 0.17813917994499207, "learning_rate": 5.0320886045204186e-05, "loss": 0.1619, "step": 5565 }, { "epoch": 3.888596161083929, "grad_norm": 0.18875645101070404, "learning_rate": 5.0306085738808505e-05, "loss": 0.1485, "step": 5566 }, { "epoch": 3.889348889725254, "grad_norm": 0.16573403775691986, "learning_rate": 5.0291285945732245e-05, "loss": 0.0743, "step": 5567 }, { "epoch": 3.890101618366579, "grad_norm": 0.23036405444145203, "learning_rate": 5.027648666759388e-05, "loss": 0.162, "step": 5568 }, { "epoch": 3.890854347007904, "grad_norm": 0.20022965967655182, "learning_rate": 5.026168790601182e-05, "loss": 0.1406, "step": 5569 }, { "epoch": 3.8916070756492287, "grad_norm": 0.15441563725471497, "learning_rate": 5.024688966260446e-05, "loss": 0.0665, "step": 5570 }, { "epoch": 3.892359804290553, "grad_norm": 0.17915195226669312, "learning_rate": 5.0232091938990075e-05, "loss": 0.0666, "step": 5571 }, { "epoch": 3.893112532931878, "grad_norm": 0.1826554536819458, "learning_rate": 5.0217294736786955e-05, "loss": 0.1791, "step": 5572 }, { "epoch": 3.893865261573203, "grad_norm": 0.1766299158334732, "learning_rate": 5.020249805761331e-05, "loss": 0.1939, "step": 5573 }, { "epoch": 3.8946179902145275, "grad_norm": 0.19572578370571136, "learning_rate": 5.018770190308726e-05, "loss": 0.1614, "step": 5574 }, { "epoch": 3.8953707188558524, "grad_norm": 0.17809663712978363, "learning_rate": 5.017290627482688e-05, "loss": 0.1976, "step": 5575 }, { "epoch": 3.8961234474971773, "grad_norm": 0.1683858036994934, "learning_rate": 5.015811117445023e-05, "loss": 0.1674, "step": 5576 }, { "epoch": 3.8968761761385022, "grad_norm": 0.2023787796497345, "learning_rate": 5.014331660357525e-05, "loss": 0.1379, "step": 5577 }, { "epoch": 3.8976289047798267, "grad_norm": 0.19207806885242462, "learning_rate": 5.012852256381987e-05, "loss": 0.0959, "step": 5578 }, { "epoch": 3.8983816334211516, "grad_norm": 0.1670522689819336, "learning_rate": 5.011372905680193e-05, "loss": 0.1439, "step": 5579 }, { "epoch": 3.8991343620624765, "grad_norm": 0.18237614631652832, "learning_rate": 5.009893608413924e-05, "loss": 0.1899, "step": 5580 }, { "epoch": 3.8998870907038015, "grad_norm": 0.18145188689231873, "learning_rate": 5.008414364744953e-05, "loss": 0.1636, "step": 5581 }, { "epoch": 3.900639819345126, "grad_norm": 0.1630517989397049, "learning_rate": 5.006935174835049e-05, "loss": 0.1477, "step": 5582 }, { "epoch": 3.901392547986451, "grad_norm": 0.18013162910938263, "learning_rate": 5.005456038845973e-05, "loss": 0.1657, "step": 5583 }, { "epoch": 3.9021452766277758, "grad_norm": 0.1763620376586914, "learning_rate": 5.00397695693948e-05, "loss": 0.1406, "step": 5584 }, { "epoch": 3.9028980052691002, "grad_norm": 0.19642247259616852, "learning_rate": 5.0024979292773197e-05, "loss": 0.1885, "step": 5585 }, { "epoch": 3.903650733910425, "grad_norm": 0.19991780817508698, "learning_rate": 5.001018956021239e-05, "loss": 0.1556, "step": 5586 }, { "epoch": 3.90440346255175, "grad_norm": 0.16463792324066162, "learning_rate": 4.999540037332977e-05, "loss": 0.1394, "step": 5587 }, { "epoch": 3.905156191193075, "grad_norm": 0.20666934549808502, "learning_rate": 4.998061173374263e-05, "loss": 0.2144, "step": 5588 }, { "epoch": 3.9059089198344, "grad_norm": 0.21359170973300934, "learning_rate": 4.9965823643068245e-05, "loss": 0.1641, "step": 5589 }, { "epoch": 3.9066616484757244, "grad_norm": 0.16680647432804108, "learning_rate": 4.995103610292382e-05, "loss": 0.1649, "step": 5590 }, { "epoch": 3.9074143771170493, "grad_norm": 0.1926252841949463, "learning_rate": 4.993624911492647e-05, "loss": 0.1619, "step": 5591 }, { "epoch": 3.9081671057583742, "grad_norm": 0.1912795603275299, "learning_rate": 4.992146268069333e-05, "loss": 0.1367, "step": 5592 }, { "epoch": 3.9089198343996987, "grad_norm": 0.19530072808265686, "learning_rate": 4.990667680184139e-05, "loss": 0.1486, "step": 5593 }, { "epoch": 3.9096725630410236, "grad_norm": 0.17908675968647003, "learning_rate": 4.9891891479987616e-05, "loss": 0.0927, "step": 5594 }, { "epoch": 3.9104252916823485, "grad_norm": 0.1526760756969452, "learning_rate": 4.987710671674892e-05, "loss": 0.1015, "step": 5595 }, { "epoch": 3.9111780203236735, "grad_norm": 0.18550710380077362, "learning_rate": 4.986232251374211e-05, "loss": 0.1571, "step": 5596 }, { "epoch": 3.9119307489649984, "grad_norm": 0.2258579432964325, "learning_rate": 4.9847538872584e-05, "loss": 0.1688, "step": 5597 }, { "epoch": 3.912683477606323, "grad_norm": 0.17378713190555573, "learning_rate": 4.9832755794891285e-05, "loss": 0.1521, "step": 5598 }, { "epoch": 3.9134362062476478, "grad_norm": 0.21581187844276428, "learning_rate": 4.981797328228064e-05, "loss": 0.1188, "step": 5599 }, { "epoch": 3.9141889348889727, "grad_norm": 0.19749973714351654, "learning_rate": 4.980319133636863e-05, "loss": 0.142, "step": 5600 }, { "epoch": 3.9141889348889727, "eval_loss": 0.1907777190208435, "eval_runtime": 456.8255, "eval_samples_per_second": 21.074, "eval_steps_per_second": 0.659, "step": 5600 }, { "epoch": 3.914941663530297, "grad_norm": 0.1902584731578827, "learning_rate": 4.978840995877181e-05, "loss": 0.1289, "step": 5601 }, { "epoch": 3.915694392171622, "grad_norm": 0.21376074850559235, "learning_rate": 4.9773629151106646e-05, "loss": 0.1748, "step": 5602 }, { "epoch": 3.916447120812947, "grad_norm": 0.17716923356056213, "learning_rate": 4.975884891498954e-05, "loss": 0.1928, "step": 5603 }, { "epoch": 3.9171998494542715, "grad_norm": 0.17546722292900085, "learning_rate": 4.974406925203682e-05, "loss": 0.1272, "step": 5604 }, { "epoch": 3.9179525780955964, "grad_norm": 0.19311591982841492, "learning_rate": 4.9729290163864796e-05, "loss": 0.1118, "step": 5605 }, { "epoch": 3.9187053067369213, "grad_norm": 0.17118874192237854, "learning_rate": 4.971451165208968e-05, "loss": 0.0613, "step": 5606 }, { "epoch": 3.919458035378246, "grad_norm": 0.17441439628601074, "learning_rate": 4.9699733718327624e-05, "loss": 0.0911, "step": 5607 }, { "epoch": 3.920210764019571, "grad_norm": 0.18452978134155273, "learning_rate": 4.9684956364194713e-05, "loss": 0.142, "step": 5608 }, { "epoch": 3.9209634926608956, "grad_norm": 0.1989646553993225, "learning_rate": 4.9670179591306985e-05, "loss": 0.139, "step": 5609 }, { "epoch": 3.9217162213022205, "grad_norm": 0.19966000318527222, "learning_rate": 4.9655403401280396e-05, "loss": 0.1386, "step": 5610 }, { "epoch": 3.9224689499435454, "grad_norm": 0.25986340641975403, "learning_rate": 4.9640627795730866e-05, "loss": 0.1716, "step": 5611 }, { "epoch": 3.92322167858487, "grad_norm": 0.15773652493953705, "learning_rate": 4.962585277627422e-05, "loss": 0.1807, "step": 5612 }, { "epoch": 3.923974407226195, "grad_norm": 0.20271751284599304, "learning_rate": 4.961107834452624e-05, "loss": 0.1587, "step": 5613 }, { "epoch": 3.9247271358675198, "grad_norm": 0.14047613739967346, "learning_rate": 4.959630450210263e-05, "loss": 0.103, "step": 5614 }, { "epoch": 3.9254798645088447, "grad_norm": 0.204386904835701, "learning_rate": 4.958153125061903e-05, "loss": 0.1719, "step": 5615 }, { "epoch": 3.9262325931501696, "grad_norm": 0.18300989270210266, "learning_rate": 4.956675859169102e-05, "loss": 0.1623, "step": 5616 }, { "epoch": 3.926985321791494, "grad_norm": 0.19719748198986053, "learning_rate": 4.955198652693411e-05, "loss": 0.1334, "step": 5617 }, { "epoch": 3.927738050432819, "grad_norm": 0.1655561923980713, "learning_rate": 4.953721505796379e-05, "loss": 0.1078, "step": 5618 }, { "epoch": 3.928490779074144, "grad_norm": 0.18891353905200958, "learning_rate": 4.952244418639541e-05, "loss": 0.1639, "step": 5619 }, { "epoch": 3.9292435077154684, "grad_norm": 0.19476942718029022, "learning_rate": 4.95076739138443e-05, "loss": 0.2013, "step": 5620 }, { "epoch": 3.9299962363567933, "grad_norm": 0.18363676965236664, "learning_rate": 4.9492904241925694e-05, "loss": 0.1178, "step": 5621 }, { "epoch": 3.930748964998118, "grad_norm": 0.19702577590942383, "learning_rate": 4.947813517225483e-05, "loss": 0.1921, "step": 5622 }, { "epoch": 3.931501693639443, "grad_norm": 0.15430386364459991, "learning_rate": 4.946336670644678e-05, "loss": 0.1006, "step": 5623 }, { "epoch": 3.932254422280768, "grad_norm": 0.1873263120651245, "learning_rate": 4.9448598846116635e-05, "loss": 0.1587, "step": 5624 }, { "epoch": 3.9330071509220925, "grad_norm": 0.17300963401794434, "learning_rate": 4.9433831592879355e-05, "loss": 0.1171, "step": 5625 }, { "epoch": 3.9337598795634174, "grad_norm": 0.1741252839565277, "learning_rate": 4.94190649483499e-05, "loss": 0.1233, "step": 5626 }, { "epoch": 3.9345126082047424, "grad_norm": 0.17193397879600525, "learning_rate": 4.940429891414312e-05, "loss": 0.1703, "step": 5627 }, { "epoch": 3.935265336846067, "grad_norm": 0.15222813189029694, "learning_rate": 4.938953349187378e-05, "loss": 0.1227, "step": 5628 }, { "epoch": 3.9360180654873917, "grad_norm": 0.18563447892665863, "learning_rate": 4.9374768683156615e-05, "loss": 0.1638, "step": 5629 }, { "epoch": 3.9367707941287167, "grad_norm": 0.20521798729896545, "learning_rate": 4.936000448960631e-05, "loss": 0.153, "step": 5630 }, { "epoch": 3.937523522770041, "grad_norm": 0.15773384273052216, "learning_rate": 4.934524091283743e-05, "loss": 0.1272, "step": 5631 }, { "epoch": 3.938276251411366, "grad_norm": 0.2164810746908188, "learning_rate": 4.93304779544645e-05, "loss": 0.1904, "step": 5632 }, { "epoch": 3.939028980052691, "grad_norm": 0.19384844601154327, "learning_rate": 4.931571561610198e-05, "loss": 0.242, "step": 5633 }, { "epoch": 3.939781708694016, "grad_norm": 0.18863673508167267, "learning_rate": 4.9300953899364255e-05, "loss": 0.1527, "step": 5634 }, { "epoch": 3.940534437335341, "grad_norm": 0.18415243923664093, "learning_rate": 4.9286192805865636e-05, "loss": 0.1042, "step": 5635 }, { "epoch": 3.9412871659766653, "grad_norm": 0.162299245595932, "learning_rate": 4.927143233722037e-05, "loss": 0.078, "step": 5636 }, { "epoch": 3.94203989461799, "grad_norm": 0.1674528867006302, "learning_rate": 4.925667249504267e-05, "loss": 0.1289, "step": 5637 }, { "epoch": 3.942792623259315, "grad_norm": 0.18776971101760864, "learning_rate": 4.924191328094663e-05, "loss": 0.0854, "step": 5638 }, { "epoch": 3.9435453519006396, "grad_norm": 0.17734220623970032, "learning_rate": 4.92271546965463e-05, "loss": 0.1916, "step": 5639 }, { "epoch": 3.9442980805419645, "grad_norm": 0.1859501153230667, "learning_rate": 4.9212396743455637e-05, "loss": 0.1768, "step": 5640 }, { "epoch": 3.9450508091832894, "grad_norm": 0.1682136207818985, "learning_rate": 4.919763942328858e-05, "loss": 0.1292, "step": 5641 }, { "epoch": 3.9458035378246143, "grad_norm": 0.21898455917835236, "learning_rate": 4.9182882737658934e-05, "loss": 0.2374, "step": 5642 }, { "epoch": 3.9465562664659393, "grad_norm": 0.1586892157793045, "learning_rate": 4.9168126688180496e-05, "loss": 0.1152, "step": 5643 }, { "epoch": 3.9473089951072637, "grad_norm": 0.1747422069311142, "learning_rate": 4.9153371276466956e-05, "loss": 0.1482, "step": 5644 }, { "epoch": 3.9480617237485887, "grad_norm": 0.18626976013183594, "learning_rate": 4.913861650413194e-05, "loss": 0.2287, "step": 5645 }, { "epoch": 3.9488144523899136, "grad_norm": 0.1595603972673416, "learning_rate": 4.912386237278901e-05, "loss": 0.098, "step": 5646 }, { "epoch": 3.949567181031238, "grad_norm": 0.1853874772787094, "learning_rate": 4.910910888405167e-05, "loss": 0.1883, "step": 5647 }, { "epoch": 3.950319909672563, "grad_norm": 0.17647913098335266, "learning_rate": 4.9094356039533306e-05, "loss": 0.1407, "step": 5648 }, { "epoch": 3.951072638313888, "grad_norm": 0.14522375166416168, "learning_rate": 4.907960384084729e-05, "loss": 0.1782, "step": 5649 }, { "epoch": 3.951825366955213, "grad_norm": 0.17584176361560822, "learning_rate": 4.90648522896069e-05, "loss": 0.1951, "step": 5650 }, { "epoch": 3.9525780955965377, "grad_norm": 0.1663157045841217, "learning_rate": 4.905010138742534e-05, "loss": 0.1819, "step": 5651 }, { "epoch": 3.953330824237862, "grad_norm": 0.1859639585018158, "learning_rate": 4.903535113591576e-05, "loss": 0.1507, "step": 5652 }, { "epoch": 3.954083552879187, "grad_norm": 0.16495336592197418, "learning_rate": 4.90206015366912e-05, "loss": 0.101, "step": 5653 }, { "epoch": 3.954836281520512, "grad_norm": 0.16853058338165283, "learning_rate": 4.900585259136468e-05, "loss": 0.1314, "step": 5654 }, { "epoch": 3.9555890101618365, "grad_norm": 0.20489123463630676, "learning_rate": 4.899110430154907e-05, "loss": 0.124, "step": 5655 }, { "epoch": 3.9563417388031614, "grad_norm": 0.18771392107009888, "learning_rate": 4.897635666885729e-05, "loss": 0.2283, "step": 5656 }, { "epoch": 3.9570944674444863, "grad_norm": 0.24027958512306213, "learning_rate": 4.896160969490209e-05, "loss": 0.138, "step": 5657 }, { "epoch": 3.957847196085811, "grad_norm": 0.21181146800518036, "learning_rate": 4.894686338129617e-05, "loss": 0.1735, "step": 5658 }, { "epoch": 3.9585999247271357, "grad_norm": 0.15058162808418274, "learning_rate": 4.893211772965217e-05, "loss": 0.1092, "step": 5659 }, { "epoch": 3.9593526533684607, "grad_norm": 0.17465995252132416, "learning_rate": 4.8917372741582655e-05, "loss": 0.0661, "step": 5660 }, { "epoch": 3.9601053820097856, "grad_norm": 0.1905631422996521, "learning_rate": 4.8902628418700093e-05, "loss": 0.1347, "step": 5661 }, { "epoch": 3.9608581106511105, "grad_norm": 0.19959789514541626, "learning_rate": 4.888788476261693e-05, "loss": 0.1067, "step": 5662 }, { "epoch": 3.961610839292435, "grad_norm": 0.19276750087738037, "learning_rate": 4.887314177494552e-05, "loss": 0.2012, "step": 5663 }, { "epoch": 3.96236356793376, "grad_norm": 0.17160388827323914, "learning_rate": 4.8858399457298096e-05, "loss": 0.161, "step": 5664 }, { "epoch": 3.963116296575085, "grad_norm": 0.20397332310676575, "learning_rate": 4.884365781128688e-05, "loss": 0.1579, "step": 5665 }, { "epoch": 3.9638690252164093, "grad_norm": 0.1650495082139969, "learning_rate": 4.8828916838523986e-05, "loss": 0.1984, "step": 5666 }, { "epoch": 3.964621753857734, "grad_norm": 0.1829240471124649, "learning_rate": 4.881417654062146e-05, "loss": 0.1491, "step": 5667 }, { "epoch": 3.965374482499059, "grad_norm": 0.20719635486602783, "learning_rate": 4.87994369191913e-05, "loss": 0.1389, "step": 5668 }, { "epoch": 3.966127211140384, "grad_norm": 0.1903148591518402, "learning_rate": 4.878469797584539e-05, "loss": 0.1566, "step": 5669 }, { "epoch": 3.966879939781709, "grad_norm": 0.18661320209503174, "learning_rate": 4.8769959712195565e-05, "loss": 0.1867, "step": 5670 }, { "epoch": 3.9676326684230334, "grad_norm": 0.1780858337879181, "learning_rate": 4.875522212985357e-05, "loss": 0.1182, "step": 5671 }, { "epoch": 3.9683853970643583, "grad_norm": 0.18948091566562653, "learning_rate": 4.874048523043111e-05, "loss": 0.1505, "step": 5672 }, { "epoch": 3.9691381257056833, "grad_norm": 0.14244864881038666, "learning_rate": 4.8725749015539776e-05, "loss": 0.0699, "step": 5673 }, { "epoch": 3.9698908543470077, "grad_norm": 0.16354772448539734, "learning_rate": 4.871101348679108e-05, "loss": 0.1378, "step": 5674 }, { "epoch": 3.9706435829883326, "grad_norm": 0.19668184220790863, "learning_rate": 4.86962786457965e-05, "loss": 0.1559, "step": 5675 }, { "epoch": 3.9713963116296576, "grad_norm": 0.17507174611091614, "learning_rate": 4.868154449416741e-05, "loss": 0.1077, "step": 5676 }, { "epoch": 3.972149040270982, "grad_norm": 0.1879604011774063, "learning_rate": 4.8666811033515125e-05, "loss": 0.1653, "step": 5677 }, { "epoch": 3.972901768912307, "grad_norm": 0.17299090325832367, "learning_rate": 4.865207826545085e-05, "loss": 0.1422, "step": 5678 }, { "epoch": 3.973654497553632, "grad_norm": 0.17580071091651917, "learning_rate": 4.863734619158576e-05, "loss": 0.1342, "step": 5679 }, { "epoch": 3.974407226194957, "grad_norm": 0.1647251844406128, "learning_rate": 4.8622614813530906e-05, "loss": 0.2063, "step": 5680 }, { "epoch": 3.9751599548362817, "grad_norm": 0.18508218228816986, "learning_rate": 4.860788413289733e-05, "loss": 0.2052, "step": 5681 }, { "epoch": 3.975912683477606, "grad_norm": 0.15737618505954742, "learning_rate": 4.8593154151295936e-05, "loss": 0.1153, "step": 5682 }, { "epoch": 3.976665412118931, "grad_norm": 0.18452706933021545, "learning_rate": 4.857842487033757e-05, "loss": 0.1025, "step": 5683 }, { "epoch": 3.977418140760256, "grad_norm": 0.17154644429683685, "learning_rate": 4.8563696291632996e-05, "loss": 0.1491, "step": 5684 }, { "epoch": 3.9781708694015805, "grad_norm": 0.1895587146282196, "learning_rate": 4.854896841679293e-05, "loss": 0.1694, "step": 5685 }, { "epoch": 3.9789235980429054, "grad_norm": 0.19853425025939941, "learning_rate": 4.853424124742797e-05, "loss": 0.1376, "step": 5686 }, { "epoch": 3.9796763266842303, "grad_norm": 0.16577750444412231, "learning_rate": 4.851951478514866e-05, "loss": 0.2014, "step": 5687 }, { "epoch": 3.9804290553255552, "grad_norm": 0.17969395220279694, "learning_rate": 4.850478903156547e-05, "loss": 0.0863, "step": 5688 }, { "epoch": 3.98118178396688, "grad_norm": 0.1727467179298401, "learning_rate": 4.84900639882888e-05, "loss": 0.1587, "step": 5689 }, { "epoch": 3.9819345126082046, "grad_norm": 0.15889611840248108, "learning_rate": 4.8475339656928945e-05, "loss": 0.0775, "step": 5690 }, { "epoch": 3.9826872412495296, "grad_norm": 0.22546085715293884, "learning_rate": 4.846061603909613e-05, "loss": 0.2046, "step": 5691 }, { "epoch": 3.9834399698908545, "grad_norm": 0.19342882931232452, "learning_rate": 4.84458931364005e-05, "loss": 0.1584, "step": 5692 }, { "epoch": 3.984192698532179, "grad_norm": 0.18945707380771637, "learning_rate": 4.8431170950452156e-05, "loss": 0.1665, "step": 5693 }, { "epoch": 3.984945427173504, "grad_norm": 0.16674473881721497, "learning_rate": 4.841644948286108e-05, "loss": 0.116, "step": 5694 }, { "epoch": 3.985698155814829, "grad_norm": 0.16175498068332672, "learning_rate": 4.8401728735237186e-05, "loss": 0.1343, "step": 5695 }, { "epoch": 3.9864508844561537, "grad_norm": 0.1907111257314682, "learning_rate": 4.838700870919032e-05, "loss": 0.1571, "step": 5696 }, { "epoch": 3.9872036130974786, "grad_norm": 0.18060241639614105, "learning_rate": 4.837228940633024e-05, "loss": 0.122, "step": 5697 }, { "epoch": 3.987956341738803, "grad_norm": 0.18058839440345764, "learning_rate": 4.835757082826663e-05, "loss": 0.1993, "step": 5698 }, { "epoch": 3.988709070380128, "grad_norm": 0.1724442094564438, "learning_rate": 4.834285297660906e-05, "loss": 0.0979, "step": 5699 }, { "epoch": 3.989461799021453, "grad_norm": 0.21920369565486908, "learning_rate": 4.83281358529671e-05, "loss": 0.1336, "step": 5700 }, { "epoch": 3.9902145276627774, "grad_norm": 0.19072528183460236, "learning_rate": 4.8313419458950195e-05, "loss": 0.1093, "step": 5701 }, { "epoch": 3.9909672563041023, "grad_norm": 0.21890991926193237, "learning_rate": 4.8298703796167664e-05, "loss": 0.1942, "step": 5702 }, { "epoch": 3.9917199849454272, "grad_norm": 0.16622662544250488, "learning_rate": 4.8283988866228816e-05, "loss": 0.125, "step": 5703 }, { "epoch": 3.9924727135867517, "grad_norm": 0.15869247913360596, "learning_rate": 4.826927467074285e-05, "loss": 0.0985, "step": 5704 }, { "epoch": 3.9932254422280766, "grad_norm": 0.18602195382118225, "learning_rate": 4.8254561211318874e-05, "loss": 0.1664, "step": 5705 }, { "epoch": 3.9939781708694015, "grad_norm": 0.16133493185043335, "learning_rate": 4.823984848956593e-05, "loss": 0.106, "step": 5706 }, { "epoch": 3.9947308995107265, "grad_norm": 0.23966698348522186, "learning_rate": 4.8225136507093017e-05, "loss": 0.2143, "step": 5707 }, { "epoch": 3.9954836281520514, "grad_norm": 0.18238122761249542, "learning_rate": 4.8210425265508986e-05, "loss": 0.132, "step": 5708 }, { "epoch": 3.996236356793376, "grad_norm": 0.15670894086360931, "learning_rate": 4.819571476642264e-05, "loss": 0.1001, "step": 5709 }, { "epoch": 3.9969890854347008, "grad_norm": 0.20413705706596375, "learning_rate": 4.8181005011442695e-05, "loss": 0.2473, "step": 5710 }, { "epoch": 3.9977418140760257, "grad_norm": 0.1849728524684906, "learning_rate": 4.816629600217779e-05, "loss": 0.1226, "step": 5711 }, { "epoch": 3.99849454271735, "grad_norm": 0.1854354292154312, "learning_rate": 4.815158774023646e-05, "loss": 0.2107, "step": 5712 }, { "epoch": 3.999247271358675, "grad_norm": 0.18034955859184265, "learning_rate": 4.813688022722722e-05, "loss": 0.1965, "step": 5713 }, { "epoch": 4.0, "grad_norm": 0.20551380515098572, "learning_rate": 4.812217346475844e-05, "loss": 0.2627, "step": 5714 }, { "epoch": 4.0007527286413245, "grad_norm": 0.1864585429430008, "learning_rate": 4.8107467454438424e-05, "loss": 0.1665, "step": 5715 }, { "epoch": 4.00150545728265, "grad_norm": 0.18791599571704865, "learning_rate": 4.80927621978754e-05, "loss": 0.1653, "step": 5716 }, { "epoch": 4.002258185923974, "grad_norm": 0.20557962357997894, "learning_rate": 4.8078057696677523e-05, "loss": 0.1433, "step": 5717 }, { "epoch": 4.003010914565299, "grad_norm": 0.19142182171344757, "learning_rate": 4.806335395245285e-05, "loss": 0.1554, "step": 5718 }, { "epoch": 4.003763643206624, "grad_norm": 0.18141745030879974, "learning_rate": 4.804865096680936e-05, "loss": 0.2092, "step": 5719 }, { "epoch": 4.004516371847949, "grad_norm": 0.18464472889900208, "learning_rate": 4.803394874135494e-05, "loss": 0.1838, "step": 5720 }, { "epoch": 4.005269100489274, "grad_norm": 0.1784563958644867, "learning_rate": 4.801924727769742e-05, "loss": 0.1343, "step": 5721 }, { "epoch": 4.0060218291305985, "grad_norm": 0.16021579504013062, "learning_rate": 4.800454657744453e-05, "loss": 0.1435, "step": 5722 }, { "epoch": 4.006774557771923, "grad_norm": 0.1402646154165268, "learning_rate": 4.7989846642203906e-05, "loss": 0.1241, "step": 5723 }, { "epoch": 4.007527286413248, "grad_norm": 0.154257133603096, "learning_rate": 4.797514747358312e-05, "loss": 0.0598, "step": 5724 }, { "epoch": 4.008280015054573, "grad_norm": 0.18476907908916473, "learning_rate": 4.7960449073189606e-05, "loss": 0.1092, "step": 5725 }, { "epoch": 4.009032743695897, "grad_norm": 0.1798776537179947, "learning_rate": 4.7945751442630826e-05, "loss": 0.1205, "step": 5726 }, { "epoch": 4.009785472337223, "grad_norm": 0.17341963946819305, "learning_rate": 4.7931054583514065e-05, "loss": 0.1464, "step": 5727 }, { "epoch": 4.010538200978547, "grad_norm": 0.1801205426454544, "learning_rate": 4.791635849744656e-05, "loss": 0.1102, "step": 5728 }, { "epoch": 4.011290929619872, "grad_norm": 0.19738906621932983, "learning_rate": 4.790166318603542e-05, "loss": 0.2013, "step": 5729 }, { "epoch": 4.012043658261197, "grad_norm": 0.16556300222873688, "learning_rate": 4.788696865088772e-05, "loss": 0.1319, "step": 5730 }, { "epoch": 4.012796386902521, "grad_norm": 0.1561334878206253, "learning_rate": 4.787227489361042e-05, "loss": 0.1142, "step": 5731 }, { "epoch": 4.013549115543847, "grad_norm": 0.18928024172782898, "learning_rate": 4.7857581915810446e-05, "loss": 0.1892, "step": 5732 }, { "epoch": 4.014301844185171, "grad_norm": 0.17250679433345795, "learning_rate": 4.7842889719094576e-05, "loss": 0.0803, "step": 5733 }, { "epoch": 4.015054572826496, "grad_norm": 0.167672261595726, "learning_rate": 4.782819830506952e-05, "loss": 0.164, "step": 5734 }, { "epoch": 4.015807301467821, "grad_norm": 0.1783701330423355, "learning_rate": 4.7813507675341915e-05, "loss": 0.1855, "step": 5735 }, { "epoch": 4.0165600301091455, "grad_norm": 0.19087247550487518, "learning_rate": 4.779881783151832e-05, "loss": 0.1623, "step": 5736 }, { "epoch": 4.017312758750471, "grad_norm": 0.16503185033798218, "learning_rate": 4.778412877520515e-05, "loss": 0.1576, "step": 5737 }, { "epoch": 4.018065487391795, "grad_norm": 0.17453327775001526, "learning_rate": 4.776944050800881e-05, "loss": 0.1757, "step": 5738 }, { "epoch": 4.01881821603312, "grad_norm": 0.15414519608020782, "learning_rate": 4.77547530315356e-05, "loss": 0.0797, "step": 5739 }, { "epoch": 4.019570944674445, "grad_norm": 0.1866644322872162, "learning_rate": 4.774006634739171e-05, "loss": 0.2111, "step": 5740 }, { "epoch": 4.02032367331577, "grad_norm": 0.14785970747470856, "learning_rate": 4.7725380457183235e-05, "loss": 0.1042, "step": 5741 }, { "epoch": 4.021076401957094, "grad_norm": 0.17310237884521484, "learning_rate": 4.7710695362516234e-05, "loss": 0.1098, "step": 5742 }, { "epoch": 4.0218291305984195, "grad_norm": 0.1807207316160202, "learning_rate": 4.7696011064996615e-05, "loss": 0.2389, "step": 5743 }, { "epoch": 4.022581859239744, "grad_norm": 0.1949743628501892, "learning_rate": 4.768132756623024e-05, "loss": 0.2403, "step": 5744 }, { "epoch": 4.0233345878810685, "grad_norm": 0.1821610927581787, "learning_rate": 4.7666644867822885e-05, "loss": 0.1261, "step": 5745 }, { "epoch": 4.024087316522394, "grad_norm": 0.17006918787956238, "learning_rate": 4.765196297138022e-05, "loss": 0.1059, "step": 5746 }, { "epoch": 4.024840045163718, "grad_norm": 0.19878308475017548, "learning_rate": 4.7637281878507845e-05, "loss": 0.1341, "step": 5747 }, { "epoch": 4.025592773805044, "grad_norm": 0.19255760312080383, "learning_rate": 4.762260159081126e-05, "loss": 0.1315, "step": 5748 }, { "epoch": 4.026345502446368, "grad_norm": 0.18504811823368073, "learning_rate": 4.7607922109895866e-05, "loss": 0.1747, "step": 5749 }, { "epoch": 4.027098231087693, "grad_norm": 0.15562427043914795, "learning_rate": 4.7593243437366975e-05, "loss": 0.1844, "step": 5750 }, { "epoch": 4.027850959729018, "grad_norm": 0.16413185000419617, "learning_rate": 4.757856557482986e-05, "loss": 0.1086, "step": 5751 }, { "epoch": 4.028603688370342, "grad_norm": 0.17087048292160034, "learning_rate": 4.756388852388967e-05, "loss": 0.1053, "step": 5752 }, { "epoch": 4.029356417011667, "grad_norm": 0.20201411843299866, "learning_rate": 4.7549212286151435e-05, "loss": 0.1726, "step": 5753 }, { "epoch": 4.030109145652992, "grad_norm": 0.17801110446453094, "learning_rate": 4.7534536863220156e-05, "loss": 0.1341, "step": 5754 }, { "epoch": 4.030861874294317, "grad_norm": 0.17877498269081116, "learning_rate": 4.7519862256700695e-05, "loss": 0.1314, "step": 5755 }, { "epoch": 4.031614602935642, "grad_norm": 0.15985682606697083, "learning_rate": 4.750518846819785e-05, "loss": 0.102, "step": 5756 }, { "epoch": 4.032367331576967, "grad_norm": 0.17898310720920563, "learning_rate": 4.749051549931631e-05, "loss": 0.1578, "step": 5757 }, { "epoch": 4.033120060218291, "grad_norm": 0.23445042967796326, "learning_rate": 4.747584335166072e-05, "loss": 0.1433, "step": 5758 }, { "epoch": 4.033872788859616, "grad_norm": 0.17147037386894226, "learning_rate": 4.74611720268356e-05, "loss": 0.1249, "step": 5759 }, { "epoch": 4.034625517500941, "grad_norm": 0.16535772383213043, "learning_rate": 4.7446501526445366e-05, "loss": 0.1241, "step": 5760 }, { "epoch": 4.035378246142265, "grad_norm": 0.1563752442598343, "learning_rate": 4.7431831852094356e-05, "loss": 0.1392, "step": 5761 }, { "epoch": 4.036130974783591, "grad_norm": 0.20672722160816193, "learning_rate": 4.741716300538684e-05, "loss": 0.1787, "step": 5762 }, { "epoch": 4.036883703424915, "grad_norm": 0.19029271602630615, "learning_rate": 4.740249498792698e-05, "loss": 0.1981, "step": 5763 }, { "epoch": 4.03763643206624, "grad_norm": 0.16739846765995026, "learning_rate": 4.738782780131885e-05, "loss": 0.094, "step": 5764 }, { "epoch": 4.038389160707565, "grad_norm": 0.15827247500419617, "learning_rate": 4.737316144716643e-05, "loss": 0.155, "step": 5765 }, { "epoch": 4.0391418893488895, "grad_norm": 0.19977083802223206, "learning_rate": 4.735849592707359e-05, "loss": 0.1638, "step": 5766 }, { "epoch": 4.039894617990215, "grad_norm": 0.15228235721588135, "learning_rate": 4.734383124264415e-05, "loss": 0.135, "step": 5767 }, { "epoch": 4.040647346631539, "grad_norm": 0.19353917241096497, "learning_rate": 4.7329167395481835e-05, "loss": 0.1308, "step": 5768 }, { "epoch": 4.041400075272864, "grad_norm": 0.16834484040737152, "learning_rate": 4.731450438719022e-05, "loss": 0.1679, "step": 5769 }, { "epoch": 4.042152803914189, "grad_norm": 0.17108118534088135, "learning_rate": 4.7299842219372845e-05, "loss": 0.1183, "step": 5770 }, { "epoch": 4.042905532555514, "grad_norm": 0.17577342689037323, "learning_rate": 4.728518089363316e-05, "loss": 0.1299, "step": 5771 }, { "epoch": 4.043658261196838, "grad_norm": 0.17100414633750916, "learning_rate": 4.7270520411574504e-05, "loss": 0.1067, "step": 5772 }, { "epoch": 4.0444109898381635, "grad_norm": 0.20865245163440704, "learning_rate": 4.725586077480011e-05, "loss": 0.1489, "step": 5773 }, { "epoch": 4.045163718479488, "grad_norm": 0.18425783514976501, "learning_rate": 4.724120198491314e-05, "loss": 0.1599, "step": 5774 }, { "epoch": 4.045916447120813, "grad_norm": 0.17818871140480042, "learning_rate": 4.722654404351665e-05, "loss": 0.1786, "step": 5775 }, { "epoch": 4.046669175762138, "grad_norm": 0.16580499708652496, "learning_rate": 4.7211886952213605e-05, "loss": 0.0819, "step": 5776 }, { "epoch": 4.047421904403462, "grad_norm": 0.17185798287391663, "learning_rate": 4.71972307126069e-05, "loss": 0.1627, "step": 5777 }, { "epoch": 4.048174633044788, "grad_norm": 0.19773200154304504, "learning_rate": 4.7182575326299314e-05, "loss": 0.1463, "step": 5778 }, { "epoch": 4.048927361686112, "grad_norm": 0.14295506477355957, "learning_rate": 4.716792079489355e-05, "loss": 0.0865, "step": 5779 }, { "epoch": 4.049680090327437, "grad_norm": 0.20357537269592285, "learning_rate": 4.715326711999218e-05, "loss": 0.1491, "step": 5780 }, { "epoch": 4.050432818968762, "grad_norm": 0.21214893460273743, "learning_rate": 4.7138614303197735e-05, "loss": 0.1998, "step": 5781 }, { "epoch": 4.051185547610086, "grad_norm": 0.18833661079406738, "learning_rate": 4.7123962346112584e-05, "loss": 0.2034, "step": 5782 }, { "epoch": 4.051938276251412, "grad_norm": 0.1816614270210266, "learning_rate": 4.7109311250339086e-05, "loss": 0.1196, "step": 5783 }, { "epoch": 4.052691004892736, "grad_norm": 0.19414061307907104, "learning_rate": 4.709466101747945e-05, "loss": 0.1667, "step": 5784 }, { "epoch": 4.053443733534061, "grad_norm": 0.1694689840078354, "learning_rate": 4.708001164913579e-05, "loss": 0.1164, "step": 5785 }, { "epoch": 4.054196462175386, "grad_norm": 0.17324745655059814, "learning_rate": 4.7065363146910144e-05, "loss": 0.1374, "step": 5786 }, { "epoch": 4.054949190816711, "grad_norm": 0.18793518841266632, "learning_rate": 4.7050715512404465e-05, "loss": 0.137, "step": 5787 }, { "epoch": 4.055701919458035, "grad_norm": 0.18351754546165466, "learning_rate": 4.7036068747220585e-05, "loss": 0.1273, "step": 5788 }, { "epoch": 4.05645464809936, "grad_norm": 0.18169736862182617, "learning_rate": 4.702142285296025e-05, "loss": 0.101, "step": 5789 }, { "epoch": 4.057207376740685, "grad_norm": 0.16844701766967773, "learning_rate": 4.700677783122511e-05, "loss": 0.1577, "step": 5790 }, { "epoch": 4.057960105382009, "grad_norm": 0.1791927069425583, "learning_rate": 4.6992133683616735e-05, "loss": 0.1167, "step": 5791 }, { "epoch": 4.058712834023335, "grad_norm": 0.1620166301727295, "learning_rate": 4.697749041173659e-05, "loss": 0.1291, "step": 5792 }, { "epoch": 4.059465562664659, "grad_norm": 0.17525166273117065, "learning_rate": 4.696284801718602e-05, "loss": 0.1222, "step": 5793 }, { "epoch": 4.0602182913059846, "grad_norm": 0.19008898735046387, "learning_rate": 4.694820650156632e-05, "loss": 0.1453, "step": 5794 }, { "epoch": 4.060971019947309, "grad_norm": 0.17576877772808075, "learning_rate": 4.6933565866478626e-05, "loss": 0.1495, "step": 5795 }, { "epoch": 4.0617237485886335, "grad_norm": 0.17976658046245575, "learning_rate": 4.6918926113524065e-05, "loss": 0.1586, "step": 5796 }, { "epoch": 4.062476477229959, "grad_norm": 0.16725578904151917, "learning_rate": 4.690428724430359e-05, "loss": 0.1221, "step": 5797 }, { "epoch": 4.063229205871283, "grad_norm": 0.18088538944721222, "learning_rate": 4.688964926041809e-05, "loss": 0.1858, "step": 5798 }, { "epoch": 4.063981934512608, "grad_norm": 0.15115304291248322, "learning_rate": 4.687501216346835e-05, "loss": 0.1376, "step": 5799 }, { "epoch": 4.064734663153933, "grad_norm": 0.20512884855270386, "learning_rate": 4.686037595505507e-05, "loss": 0.1531, "step": 5800 }, { "epoch": 4.064734663153933, "eval_loss": 0.18588578701019287, "eval_runtime": 456.4592, "eval_samples_per_second": 21.091, "eval_steps_per_second": 0.659, "step": 5800 }, { "epoch": 4.065487391795258, "grad_norm": 0.1805153489112854, "learning_rate": 4.684574063677881e-05, "loss": 0.1409, "step": 5801 }, { "epoch": 4.066240120436583, "grad_norm": 0.15403695404529572, "learning_rate": 4.6831106210240126e-05, "loss": 0.1224, "step": 5802 }, { "epoch": 4.0669928490779075, "grad_norm": 0.19203364849090576, "learning_rate": 4.681647267703938e-05, "loss": 0.1596, "step": 5803 }, { "epoch": 4.067745577719232, "grad_norm": 0.1335964947938919, "learning_rate": 4.6801840038776876e-05, "loss": 0.0825, "step": 5804 }, { "epoch": 4.068498306360557, "grad_norm": 0.2086905688047409, "learning_rate": 4.678720829705282e-05, "loss": 0.1321, "step": 5805 }, { "epoch": 4.069251035001882, "grad_norm": 0.18237310647964478, "learning_rate": 4.6772577453467325e-05, "loss": 0.1226, "step": 5806 }, { "epoch": 4.070003763643206, "grad_norm": 0.17685599625110626, "learning_rate": 4.675794750962038e-05, "loss": 0.1656, "step": 5807 }, { "epoch": 4.070756492284532, "grad_norm": 0.21454057097434998, "learning_rate": 4.674331846711188e-05, "loss": 0.1695, "step": 5808 }, { "epoch": 4.071509220925856, "grad_norm": 0.19954508543014526, "learning_rate": 4.672869032754169e-05, "loss": 0.1789, "step": 5809 }, { "epoch": 4.0722619495671815, "grad_norm": 0.16723476350307465, "learning_rate": 4.671406309250949e-05, "loss": 0.1708, "step": 5810 }, { "epoch": 4.073014678208506, "grad_norm": 0.16819843649864197, "learning_rate": 4.669943676361488e-05, "loss": 0.1475, "step": 5811 }, { "epoch": 4.07376740684983, "grad_norm": 0.17827291786670685, "learning_rate": 4.668481134245739e-05, "loss": 0.2361, "step": 5812 }, { "epoch": 4.074520135491156, "grad_norm": 0.18766191601753235, "learning_rate": 4.667018683063643e-05, "loss": 0.1248, "step": 5813 }, { "epoch": 4.07527286413248, "grad_norm": 0.18335288763046265, "learning_rate": 4.665556322975131e-05, "loss": 0.2124, "step": 5814 }, { "epoch": 4.076025592773805, "grad_norm": 0.14748363196849823, "learning_rate": 4.664094054140126e-05, "loss": 0.0874, "step": 5815 }, { "epoch": 4.07677832141513, "grad_norm": 0.16133499145507812, "learning_rate": 4.6626318767185374e-05, "loss": 0.1178, "step": 5816 }, { "epoch": 4.0775310500564546, "grad_norm": 0.18606917560100555, "learning_rate": 4.6611697908702677e-05, "loss": 0.1839, "step": 5817 }, { "epoch": 4.078283778697779, "grad_norm": 0.15313780307769775, "learning_rate": 4.6597077967552094e-05, "loss": 0.0567, "step": 5818 }, { "epoch": 4.079036507339104, "grad_norm": 0.16853581368923187, "learning_rate": 4.658245894533243e-05, "loss": 0.1491, "step": 5819 }, { "epoch": 4.079789235980429, "grad_norm": 0.1645069569349289, "learning_rate": 4.6567840843642384e-05, "loss": 0.1695, "step": 5820 }, { "epoch": 4.080541964621754, "grad_norm": 0.1694507896900177, "learning_rate": 4.65532236640806e-05, "loss": 0.2021, "step": 5821 }, { "epoch": 4.081294693263079, "grad_norm": 0.18085011839866638, "learning_rate": 4.653860740824558e-05, "loss": 0.1069, "step": 5822 }, { "epoch": 4.082047421904403, "grad_norm": 0.18112625181674957, "learning_rate": 4.652399207773573e-05, "loss": 0.1369, "step": 5823 }, { "epoch": 4.0828001505457285, "grad_norm": 0.18522632122039795, "learning_rate": 4.6509377674149365e-05, "loss": 0.1164, "step": 5824 }, { "epoch": 4.083552879187053, "grad_norm": 0.1694992184638977, "learning_rate": 4.6494764199084695e-05, "loss": 0.1353, "step": 5825 }, { "epoch": 4.0843056078283775, "grad_norm": 0.15565884113311768, "learning_rate": 4.6480151654139823e-05, "loss": 0.1476, "step": 5826 }, { "epoch": 4.085058336469703, "grad_norm": 0.20036929845809937, "learning_rate": 4.6465540040912735e-05, "loss": 0.2219, "step": 5827 }, { "epoch": 4.085811065111027, "grad_norm": 0.15732599794864655, "learning_rate": 4.645092936100138e-05, "loss": 0.1925, "step": 5828 }, { "epoch": 4.086563793752353, "grad_norm": 0.19026345014572144, "learning_rate": 4.643631961600353e-05, "loss": 0.0887, "step": 5829 }, { "epoch": 4.087316522393677, "grad_norm": 0.17627814412117004, "learning_rate": 4.6421710807516906e-05, "loss": 0.1232, "step": 5830 }, { "epoch": 4.088069251035002, "grad_norm": 0.19706441462039948, "learning_rate": 4.640710293713908e-05, "loss": 0.1029, "step": 5831 }, { "epoch": 4.088821979676327, "grad_norm": 0.16869360208511353, "learning_rate": 4.639249600646755e-05, "loss": 0.1744, "step": 5832 }, { "epoch": 4.0895747083176515, "grad_norm": 0.17555688321590424, "learning_rate": 4.637789001709971e-05, "loss": 0.1303, "step": 5833 }, { "epoch": 4.090327436958976, "grad_norm": 0.15570487082004547, "learning_rate": 4.6363284970632855e-05, "loss": 0.1188, "step": 5834 }, { "epoch": 4.091080165600301, "grad_norm": 0.15611949563026428, "learning_rate": 4.6348680868664194e-05, "loss": 0.1469, "step": 5835 }, { "epoch": 4.091832894241626, "grad_norm": 0.14751552045345306, "learning_rate": 4.6334077712790766e-05, "loss": 0.142, "step": 5836 }, { "epoch": 4.09258562288295, "grad_norm": 0.17311671376228333, "learning_rate": 4.631947550460958e-05, "loss": 0.1152, "step": 5837 }, { "epoch": 4.093338351524276, "grad_norm": 0.15811438858509064, "learning_rate": 4.63048742457175e-05, "loss": 0.1358, "step": 5838 }, { "epoch": 4.0940910801656, "grad_norm": 0.203317791223526, "learning_rate": 4.629027393771129e-05, "loss": 0.2165, "step": 5839 }, { "epoch": 4.0948438088069254, "grad_norm": 0.20561546087265015, "learning_rate": 4.6275674582187636e-05, "loss": 0.1799, "step": 5840 }, { "epoch": 4.09559653744825, "grad_norm": 0.19961780309677124, "learning_rate": 4.626107618074309e-05, "loss": 0.2143, "step": 5841 }, { "epoch": 4.096349266089574, "grad_norm": 0.1734989732503891, "learning_rate": 4.624647873497412e-05, "loss": 0.1014, "step": 5842 }, { "epoch": 4.0971019947309, "grad_norm": 0.19941557943820953, "learning_rate": 4.623188224647708e-05, "loss": 0.255, "step": 5843 }, { "epoch": 4.097854723372224, "grad_norm": 0.1707039773464203, "learning_rate": 4.621728671684821e-05, "loss": 0.1202, "step": 5844 }, { "epoch": 4.098607452013549, "grad_norm": 0.16130025684833527, "learning_rate": 4.620269214768366e-05, "loss": 0.1398, "step": 5845 }, { "epoch": 4.099360180654874, "grad_norm": 0.16131971776485443, "learning_rate": 4.6188098540579436e-05, "loss": 0.0849, "step": 5846 }, { "epoch": 4.1001129092961985, "grad_norm": 0.17572568356990814, "learning_rate": 4.617350589713154e-05, "loss": 0.2252, "step": 5847 }, { "epoch": 4.100865637937524, "grad_norm": 0.19865316152572632, "learning_rate": 4.615891421893578e-05, "loss": 0.2089, "step": 5848 }, { "epoch": 4.101618366578848, "grad_norm": 0.18316151201725006, "learning_rate": 4.614432350758785e-05, "loss": 0.2871, "step": 5849 }, { "epoch": 4.102371095220173, "grad_norm": 0.17599104344844818, "learning_rate": 4.612973376468339e-05, "loss": 0.1914, "step": 5850 }, { "epoch": 4.103123823861498, "grad_norm": 0.169012188911438, "learning_rate": 4.6115144991817924e-05, "loss": 0.109, "step": 5851 }, { "epoch": 4.103876552502823, "grad_norm": 0.17222759127616882, "learning_rate": 4.610055719058681e-05, "loss": 0.123, "step": 5852 }, { "epoch": 4.104629281144147, "grad_norm": 0.175465390086174, "learning_rate": 4.6085970362585406e-05, "loss": 0.1175, "step": 5853 }, { "epoch": 4.1053820097854725, "grad_norm": 0.1625763177871704, "learning_rate": 4.607138450940888e-05, "loss": 0.1672, "step": 5854 }, { "epoch": 4.106134738426797, "grad_norm": 0.15800411999225616, "learning_rate": 4.605679963265232e-05, "loss": 0.1225, "step": 5855 }, { "epoch": 4.106887467068122, "grad_norm": 0.19370901584625244, "learning_rate": 4.6042215733910707e-05, "loss": 0.1603, "step": 5856 }, { "epoch": 4.107640195709447, "grad_norm": 0.19209516048431396, "learning_rate": 4.602763281477891e-05, "loss": 0.136, "step": 5857 }, { "epoch": 4.108392924350771, "grad_norm": 0.23309226334095, "learning_rate": 4.601305087685169e-05, "loss": 0.223, "step": 5858 }, { "epoch": 4.109145652992097, "grad_norm": 0.1709754914045334, "learning_rate": 4.599846992172372e-05, "loss": 0.1619, "step": 5859 }, { "epoch": 4.109898381633421, "grad_norm": 0.19852402806282043, "learning_rate": 4.5983889950989554e-05, "loss": 0.1863, "step": 5860 }, { "epoch": 4.110651110274746, "grad_norm": 0.17732569575309753, "learning_rate": 4.596931096624362e-05, "loss": 0.1569, "step": 5861 }, { "epoch": 4.111403838916071, "grad_norm": 0.17658069729804993, "learning_rate": 4.595473296908026e-05, "loss": 0.1218, "step": 5862 }, { "epoch": 4.1121565675573954, "grad_norm": 0.18307454884052277, "learning_rate": 4.5940155961093714e-05, "loss": 0.1675, "step": 5863 }, { "epoch": 4.112909296198721, "grad_norm": 0.1691858470439911, "learning_rate": 4.592557994387808e-05, "loss": 0.0996, "step": 5864 }, { "epoch": 4.113662024840045, "grad_norm": 0.14593233168125153, "learning_rate": 4.591100491902738e-05, "loss": 0.1297, "step": 5865 }, { "epoch": 4.11441475348137, "grad_norm": 0.1652403026819229, "learning_rate": 4.589643088813551e-05, "loss": 0.13, "step": 5866 }, { "epoch": 4.115167482122695, "grad_norm": 0.17502997815608978, "learning_rate": 4.588185785279628e-05, "loss": 0.1242, "step": 5867 }, { "epoch": 4.11592021076402, "grad_norm": 0.17634883522987366, "learning_rate": 4.5867285814603375e-05, "loss": 0.136, "step": 5868 }, { "epoch": 4.116672939405344, "grad_norm": 0.1764715611934662, "learning_rate": 4.5852714775150364e-05, "loss": 0.12, "step": 5869 }, { "epoch": 4.117425668046669, "grad_norm": 0.19295911490917206, "learning_rate": 4.583814473603071e-05, "loss": 0.1849, "step": 5870 }, { "epoch": 4.118178396687994, "grad_norm": 0.18716385960578918, "learning_rate": 4.582357569883775e-05, "loss": 0.1769, "step": 5871 }, { "epoch": 4.118931125329318, "grad_norm": 0.19514279067516327, "learning_rate": 4.580900766516477e-05, "loss": 0.2393, "step": 5872 }, { "epoch": 4.119683853970644, "grad_norm": 0.17051160335540771, "learning_rate": 4.57944406366049e-05, "loss": 0.151, "step": 5873 }, { "epoch": 4.120436582611968, "grad_norm": 0.1699560135602951, "learning_rate": 4.5779874614751164e-05, "loss": 0.1129, "step": 5874 }, { "epoch": 4.121189311253294, "grad_norm": 0.15141527354717255, "learning_rate": 4.576530960119646e-05, "loss": 0.0585, "step": 5875 }, { "epoch": 4.121942039894618, "grad_norm": 0.19122165441513062, "learning_rate": 4.575074559753364e-05, "loss": 0.2102, "step": 5876 }, { "epoch": 4.1226947685359425, "grad_norm": 0.183305561542511, "learning_rate": 4.573618260535536e-05, "loss": 0.2039, "step": 5877 }, { "epoch": 4.123447497177268, "grad_norm": 0.1606471836566925, "learning_rate": 4.5721620626254203e-05, "loss": 0.144, "step": 5878 }, { "epoch": 4.124200225818592, "grad_norm": 0.17135797441005707, "learning_rate": 4.5707059661822684e-05, "loss": 0.1022, "step": 5879 }, { "epoch": 4.124952954459917, "grad_norm": 0.16108731925487518, "learning_rate": 4.5692499713653157e-05, "loss": 0.1211, "step": 5880 }, { "epoch": 4.125705683101242, "grad_norm": 0.14899316430091858, "learning_rate": 4.567794078333786e-05, "loss": 0.092, "step": 5881 }, { "epoch": 4.126458411742567, "grad_norm": 0.1604616641998291, "learning_rate": 4.566338287246894e-05, "loss": 0.1197, "step": 5882 }, { "epoch": 4.127211140383892, "grad_norm": 0.16288958489894867, "learning_rate": 4.5648825982638434e-05, "loss": 0.1792, "step": 5883 }, { "epoch": 4.1279638690252165, "grad_norm": 0.19071830809116364, "learning_rate": 4.563427011543825e-05, "loss": 0.141, "step": 5884 }, { "epoch": 4.128716597666541, "grad_norm": 0.19427193701267242, "learning_rate": 4.561971527246022e-05, "loss": 0.1121, "step": 5885 }, { "epoch": 4.129469326307866, "grad_norm": 0.17912134528160095, "learning_rate": 4.560516145529602e-05, "loss": 0.1343, "step": 5886 }, { "epoch": 4.130222054949191, "grad_norm": 0.21211038529872894, "learning_rate": 4.5590608665537224e-05, "loss": 0.2029, "step": 5887 }, { "epoch": 4.130974783590515, "grad_norm": 0.17002485692501068, "learning_rate": 4.5576056904775344e-05, "loss": 0.1089, "step": 5888 }, { "epoch": 4.131727512231841, "grad_norm": 0.16657444834709167, "learning_rate": 4.55615061746017e-05, "loss": 0.1387, "step": 5889 }, { "epoch": 4.132480240873165, "grad_norm": 0.18633335828781128, "learning_rate": 4.554695647660755e-05, "loss": 0.2159, "step": 5890 }, { "epoch": 4.13323296951449, "grad_norm": 0.16789846122264862, "learning_rate": 4.553240781238402e-05, "loss": 0.1204, "step": 5891 }, { "epoch": 4.133985698155815, "grad_norm": 0.20468252897262573, "learning_rate": 4.551786018352214e-05, "loss": 0.1666, "step": 5892 }, { "epoch": 4.134738426797139, "grad_norm": 0.1550436168909073, "learning_rate": 4.550331359161283e-05, "loss": 0.1154, "step": 5893 }, { "epoch": 4.135491155438465, "grad_norm": 0.15350443124771118, "learning_rate": 4.5488768038246854e-05, "loss": 0.1076, "step": 5894 }, { "epoch": 4.136243884079789, "grad_norm": 0.2029803991317749, "learning_rate": 4.54742235250149e-05, "loss": 0.1988, "step": 5895 }, { "epoch": 4.136996612721114, "grad_norm": 0.17672862112522125, "learning_rate": 4.545968005350756e-05, "loss": 0.0889, "step": 5896 }, { "epoch": 4.137749341362439, "grad_norm": 0.16977933049201965, "learning_rate": 4.544513762531523e-05, "loss": 0.1517, "step": 5897 }, { "epoch": 4.138502070003764, "grad_norm": 0.17704162001609802, "learning_rate": 4.543059624202831e-05, "loss": 0.1495, "step": 5898 }, { "epoch": 4.139254798645088, "grad_norm": 0.18113239109516144, "learning_rate": 4.5416055905236996e-05, "loss": 0.1481, "step": 5899 }, { "epoch": 4.140007527286413, "grad_norm": 0.17903698980808258, "learning_rate": 4.54015166165314e-05, "loss": 0.1454, "step": 5900 }, { "epoch": 4.140760255927738, "grad_norm": 0.16487807035446167, "learning_rate": 4.538697837750151e-05, "loss": 0.1591, "step": 5901 }, { "epoch": 4.141512984569063, "grad_norm": 0.1583976298570633, "learning_rate": 4.537244118973722e-05, "loss": 0.0977, "step": 5902 }, { "epoch": 4.142265713210388, "grad_norm": 0.18734385073184967, "learning_rate": 4.535790505482826e-05, "loss": 0.1966, "step": 5903 }, { "epoch": 4.143018441851712, "grad_norm": 0.17976996302604675, "learning_rate": 4.534336997436433e-05, "loss": 0.1065, "step": 5904 }, { "epoch": 4.143771170493038, "grad_norm": 0.16749387979507446, "learning_rate": 4.532883594993494e-05, "loss": 0.2311, "step": 5905 }, { "epoch": 4.144523899134362, "grad_norm": 0.20189575850963593, "learning_rate": 4.531430298312952e-05, "loss": 0.2064, "step": 5906 }, { "epoch": 4.1452766277756865, "grad_norm": 0.2026253640651703, "learning_rate": 4.529977107553736e-05, "loss": 0.1142, "step": 5907 }, { "epoch": 4.146029356417012, "grad_norm": 0.15241225063800812, "learning_rate": 4.5285240228747625e-05, "loss": 0.1584, "step": 5908 }, { "epoch": 4.146782085058336, "grad_norm": 0.1833658665418625, "learning_rate": 4.527071044434944e-05, "loss": 0.1243, "step": 5909 }, { "epoch": 4.147534813699661, "grad_norm": 0.21256481111049652, "learning_rate": 4.525618172393172e-05, "loss": 0.173, "step": 5910 }, { "epoch": 4.148287542340986, "grad_norm": 0.1496952921152115, "learning_rate": 4.524165406908333e-05, "loss": 0.0977, "step": 5911 }, { "epoch": 4.149040270982311, "grad_norm": 0.1626449078321457, "learning_rate": 4.522712748139296e-05, "loss": 0.125, "step": 5912 }, { "epoch": 4.149792999623636, "grad_norm": 0.18362939357757568, "learning_rate": 4.521260196244924e-05, "loss": 0.1159, "step": 5913 }, { "epoch": 4.1505457282649605, "grad_norm": 0.17328417301177979, "learning_rate": 4.5198077513840666e-05, "loss": 0.2095, "step": 5914 }, { "epoch": 4.151298456906285, "grad_norm": 0.14735014736652374, "learning_rate": 4.5183554137155606e-05, "loss": 0.1278, "step": 5915 }, { "epoch": 4.15205118554761, "grad_norm": 0.17738351225852966, "learning_rate": 4.516903183398227e-05, "loss": 0.1199, "step": 5916 }, { "epoch": 4.152803914188935, "grad_norm": 0.16447600722312927, "learning_rate": 4.515451060590885e-05, "loss": 0.108, "step": 5917 }, { "epoch": 4.153556642830259, "grad_norm": 0.1580411046743393, "learning_rate": 4.513999045452336e-05, "loss": 0.1381, "step": 5918 }, { "epoch": 4.154309371471585, "grad_norm": 0.1693682223558426, "learning_rate": 4.5125471381413685e-05, "loss": 0.1701, "step": 5919 }, { "epoch": 4.155062100112909, "grad_norm": 0.15514957904815674, "learning_rate": 4.511095338816761e-05, "loss": 0.0805, "step": 5920 }, { "epoch": 4.1558148287542345, "grad_norm": 0.18044894933700562, "learning_rate": 4.5096436476372816e-05, "loss": 0.1306, "step": 5921 }, { "epoch": 4.156567557395559, "grad_norm": 0.19138836860656738, "learning_rate": 4.5081920647616815e-05, "loss": 0.1747, "step": 5922 }, { "epoch": 4.157320286036883, "grad_norm": 0.1636093109846115, "learning_rate": 4.506740590348707e-05, "loss": 0.0912, "step": 5923 }, { "epoch": 4.158073014678209, "grad_norm": 0.17723070085048676, "learning_rate": 4.5052892245570886e-05, "loss": 0.1366, "step": 5924 }, { "epoch": 4.158825743319533, "grad_norm": 0.14402246475219727, "learning_rate": 4.5038379675455455e-05, "loss": 0.11, "step": 5925 }, { "epoch": 4.159578471960858, "grad_norm": 0.1658335030078888, "learning_rate": 4.502386819472785e-05, "loss": 0.1244, "step": 5926 }, { "epoch": 4.160331200602183, "grad_norm": 0.1636679768562317, "learning_rate": 4.5009357804975016e-05, "loss": 0.1486, "step": 5927 }, { "epoch": 4.161083929243508, "grad_norm": 0.19227559864521027, "learning_rate": 4.499484850778378e-05, "loss": 0.153, "step": 5928 }, { "epoch": 4.161836657884833, "grad_norm": 0.14630897343158722, "learning_rate": 4.498034030474086e-05, "loss": 0.0828, "step": 5929 }, { "epoch": 4.162589386526157, "grad_norm": 0.16549213230609894, "learning_rate": 4.496583319743288e-05, "loss": 0.1733, "step": 5930 }, { "epoch": 4.163342115167482, "grad_norm": 0.17449039220809937, "learning_rate": 4.49513271874463e-05, "loss": 0.1584, "step": 5931 }, { "epoch": 4.164094843808807, "grad_norm": 0.16663417220115662, "learning_rate": 4.4936822276367464e-05, "loss": 0.1388, "step": 5932 }, { "epoch": 4.164847572450132, "grad_norm": 0.17514318227767944, "learning_rate": 4.4922318465782595e-05, "loss": 0.1067, "step": 5933 }, { "epoch": 4.165600301091456, "grad_norm": 0.18453770875930786, "learning_rate": 4.490781575727786e-05, "loss": 0.1494, "step": 5934 }, { "epoch": 4.1663530297327815, "grad_norm": 0.16950581967830658, "learning_rate": 4.489331415243919e-05, "loss": 0.1152, "step": 5935 }, { "epoch": 4.167105758374106, "grad_norm": 0.1752113550901413, "learning_rate": 4.487881365285251e-05, "loss": 0.1299, "step": 5936 }, { "epoch": 4.167858487015431, "grad_norm": 0.14836914837360382, "learning_rate": 4.486431426010353e-05, "loss": 0.122, "step": 5937 }, { "epoch": 4.168611215656756, "grad_norm": 0.1889505386352539, "learning_rate": 4.484981597577793e-05, "loss": 0.0872, "step": 5938 }, { "epoch": 4.16936394429808, "grad_norm": 0.16884252429008484, "learning_rate": 4.483531880146119e-05, "loss": 0.1698, "step": 5939 }, { "epoch": 4.170116672939406, "grad_norm": 0.1865503489971161, "learning_rate": 4.48208227387387e-05, "loss": 0.1319, "step": 5940 }, { "epoch": 4.17086940158073, "grad_norm": 0.17360152304172516, "learning_rate": 4.4806327789195714e-05, "loss": 0.1432, "step": 5941 }, { "epoch": 4.171622130222055, "grad_norm": 0.17165648937225342, "learning_rate": 4.4791833954417417e-05, "loss": 0.1371, "step": 5942 }, { "epoch": 4.17237485886338, "grad_norm": 0.20785078406333923, "learning_rate": 4.47773412359888e-05, "loss": 0.1916, "step": 5943 }, { "epoch": 4.1731275875047045, "grad_norm": 0.21280932426452637, "learning_rate": 4.4762849635494775e-05, "loss": 0.149, "step": 5944 }, { "epoch": 4.173880316146029, "grad_norm": 0.17065276205539703, "learning_rate": 4.4748359154520136e-05, "loss": 0.1151, "step": 5945 }, { "epoch": 4.174633044787354, "grad_norm": 0.1725679337978363, "learning_rate": 4.473386979464952e-05, "loss": 0.1461, "step": 5946 }, { "epoch": 4.175385773428679, "grad_norm": 0.17101162672042847, "learning_rate": 4.471938155746747e-05, "loss": 0.0887, "step": 5947 }, { "epoch": 4.176138502070004, "grad_norm": 0.17599117755889893, "learning_rate": 4.4704894444558374e-05, "loss": 0.1184, "step": 5948 }, { "epoch": 4.176891230711329, "grad_norm": 0.16723361611366272, "learning_rate": 4.469040845750656e-05, "loss": 0.1524, "step": 5949 }, { "epoch": 4.177643959352653, "grad_norm": 0.18231891095638275, "learning_rate": 4.467592359789618e-05, "loss": 0.1517, "step": 5950 }, { "epoch": 4.1783966879939785, "grad_norm": 0.18061409890651703, "learning_rate": 4.466143986731128e-05, "loss": 0.1225, "step": 5951 }, { "epoch": 4.179149416635303, "grad_norm": 0.16150423884391785, "learning_rate": 4.4646957267335765e-05, "loss": 0.1398, "step": 5952 }, { "epoch": 4.179902145276627, "grad_norm": 0.1725875288248062, "learning_rate": 4.463247579955344e-05, "loss": 0.1648, "step": 5953 }, { "epoch": 4.180654873917953, "grad_norm": 0.17127257585525513, "learning_rate": 4.4617995465547954e-05, "loss": 0.1212, "step": 5954 }, { "epoch": 4.181407602559277, "grad_norm": 0.16477900743484497, "learning_rate": 4.460351626690289e-05, "loss": 0.1186, "step": 5955 }, { "epoch": 4.182160331200603, "grad_norm": 0.17478491365909576, "learning_rate": 4.458903820520167e-05, "loss": 0.1332, "step": 5956 }, { "epoch": 4.182913059841927, "grad_norm": 0.18907879292964935, "learning_rate": 4.4574561282027574e-05, "loss": 0.2155, "step": 5957 }, { "epoch": 4.1836657884832515, "grad_norm": 0.1687215119600296, "learning_rate": 4.456008549896376e-05, "loss": 0.1454, "step": 5958 }, { "epoch": 4.184418517124577, "grad_norm": 0.177315816283226, "learning_rate": 4.454561085759331e-05, "loss": 0.1149, "step": 5959 }, { "epoch": 4.185171245765901, "grad_norm": 0.15064264833927155, "learning_rate": 4.453113735949914e-05, "loss": 0.1109, "step": 5960 }, { "epoch": 4.185923974407226, "grad_norm": 0.1803930103778839, "learning_rate": 4.451666500626406e-05, "loss": 0.1996, "step": 5961 }, { "epoch": 4.186676703048551, "grad_norm": 0.15828640758991241, "learning_rate": 4.4502193799470715e-05, "loss": 0.1363, "step": 5962 }, { "epoch": 4.187429431689876, "grad_norm": 0.1713629961013794, "learning_rate": 4.448772374070168e-05, "loss": 0.2134, "step": 5963 }, { "epoch": 4.1881821603312, "grad_norm": 0.17848150432109833, "learning_rate": 4.447325483153938e-05, "loss": 0.1134, "step": 5964 }, { "epoch": 4.1889348889725255, "grad_norm": 0.169595405459404, "learning_rate": 4.445878707356611e-05, "loss": 0.1346, "step": 5965 }, { "epoch": 4.18968761761385, "grad_norm": 0.1577257663011551, "learning_rate": 4.444432046836402e-05, "loss": 0.1188, "step": 5966 }, { "epoch": 4.190440346255175, "grad_norm": 0.17330259084701538, "learning_rate": 4.442985501751517e-05, "loss": 0.1756, "step": 5967 }, { "epoch": 4.1911930748965, "grad_norm": 0.14673961699008942, "learning_rate": 4.4415390722601504e-05, "loss": 0.1094, "step": 5968 }, { "epoch": 4.191945803537824, "grad_norm": 0.14841286838054657, "learning_rate": 4.4400927585204774e-05, "loss": 0.1318, "step": 5969 }, { "epoch": 4.19269853217915, "grad_norm": 0.17357051372528076, "learning_rate": 4.438646560690668e-05, "loss": 0.1546, "step": 5970 }, { "epoch": 4.193451260820474, "grad_norm": 0.160536527633667, "learning_rate": 4.437200478928876e-05, "loss": 0.147, "step": 5971 }, { "epoch": 4.194203989461799, "grad_norm": 0.1499924659729004, "learning_rate": 4.43575451339324e-05, "loss": 0.14, "step": 5972 }, { "epoch": 4.194956718103124, "grad_norm": 0.18712210655212402, "learning_rate": 4.43430866424189e-05, "loss": 0.159, "step": 5973 }, { "epoch": 4.1957094467444485, "grad_norm": 0.17205657064914703, "learning_rate": 4.432862931632943e-05, "loss": 0.1022, "step": 5974 }, { "epoch": 4.196462175385774, "grad_norm": 0.17229880392551422, "learning_rate": 4.431417315724502e-05, "loss": 0.1371, "step": 5975 }, { "epoch": 4.197214904027098, "grad_norm": 0.15081487596035004, "learning_rate": 4.429971816674656e-05, "loss": 0.1595, "step": 5976 }, { "epoch": 4.197967632668423, "grad_norm": 0.2238783985376358, "learning_rate": 4.428526434641485e-05, "loss": 0.1649, "step": 5977 }, { "epoch": 4.198720361309748, "grad_norm": 0.15650519728660583, "learning_rate": 4.4270811697830506e-05, "loss": 0.0944, "step": 5978 }, { "epoch": 4.199473089951073, "grad_norm": 0.21226324141025543, "learning_rate": 4.425636022257406e-05, "loss": 0.1577, "step": 5979 }, { "epoch": 4.200225818592397, "grad_norm": 0.15472173690795898, "learning_rate": 4.42419099222259e-05, "loss": 0.0742, "step": 5980 }, { "epoch": 4.200978547233722, "grad_norm": 0.2004079520702362, "learning_rate": 4.422746079836632e-05, "loss": 0.1499, "step": 5981 }, { "epoch": 4.201731275875047, "grad_norm": 0.18042083084583282, "learning_rate": 4.421301285257542e-05, "loss": 0.1467, "step": 5982 }, { "epoch": 4.202484004516372, "grad_norm": 0.15604569017887115, "learning_rate": 4.4198566086433215e-05, "loss": 0.1807, "step": 5983 }, { "epoch": 4.203236733157697, "grad_norm": 0.17300249636173248, "learning_rate": 4.418412050151958e-05, "loss": 0.1365, "step": 5984 }, { "epoch": 4.203989461799021, "grad_norm": 0.1959831416606903, "learning_rate": 4.416967609941427e-05, "loss": 0.1529, "step": 5985 }, { "epoch": 4.204742190440347, "grad_norm": 0.19006088376045227, "learning_rate": 4.4155232881696884e-05, "loss": 0.1882, "step": 5986 }, { "epoch": 4.205494919081671, "grad_norm": 0.16873888671398163, "learning_rate": 4.4140790849946946e-05, "loss": 0.1384, "step": 5987 }, { "epoch": 4.2062476477229955, "grad_norm": 0.178487166762352, "learning_rate": 4.4126350005743786e-05, "loss": 0.1097, "step": 5988 }, { "epoch": 4.207000376364321, "grad_norm": 0.17820782959461212, "learning_rate": 4.4111910350666654e-05, "loss": 0.0872, "step": 5989 }, { "epoch": 4.207753105005645, "grad_norm": 0.1463438868522644, "learning_rate": 4.409747188629463e-05, "loss": 0.1217, "step": 5990 }, { "epoch": 4.20850583364697, "grad_norm": 0.15692543983459473, "learning_rate": 4.4083034614206674e-05, "loss": 0.0963, "step": 5991 }, { "epoch": 4.209258562288295, "grad_norm": 0.19286929070949554, "learning_rate": 4.4068598535981634e-05, "loss": 0.1048, "step": 5992 }, { "epoch": 4.21001129092962, "grad_norm": 0.17654557526111603, "learning_rate": 4.405416365319824e-05, "loss": 0.1457, "step": 5993 }, { "epoch": 4.210764019570945, "grad_norm": 0.1719130426645279, "learning_rate": 4.403972996743506e-05, "loss": 0.0993, "step": 5994 }, { "epoch": 4.2115167482122695, "grad_norm": 0.1993575394153595, "learning_rate": 4.402529748027052e-05, "loss": 0.1578, "step": 5995 }, { "epoch": 4.212269476853594, "grad_norm": 0.15788808465003967, "learning_rate": 4.4010866193282953e-05, "loss": 0.0868, "step": 5996 }, { "epoch": 4.213022205494919, "grad_norm": 0.1398990899324417, "learning_rate": 4.399643610805054e-05, "loss": 0.0554, "step": 5997 }, { "epoch": 4.213774934136244, "grad_norm": 0.22327551245689392, "learning_rate": 4.3982007226151325e-05, "loss": 0.1925, "step": 5998 }, { "epoch": 4.214527662777568, "grad_norm": 0.18686525523662567, "learning_rate": 4.396757954916322e-05, "loss": 0.11, "step": 5999 }, { "epoch": 4.215280391418894, "grad_norm": 0.20672962069511414, "learning_rate": 4.395315307866405e-05, "loss": 0.1697, "step": 6000 }, { "epoch": 4.215280391418894, "eval_loss": 0.18062029778957367, "eval_runtime": 455.9725, "eval_samples_per_second": 21.113, "eval_steps_per_second": 0.66, "step": 6000 }, { "epoch": 4.216033120060218, "grad_norm": 0.2210468053817749, "learning_rate": 4.393872781623144e-05, "loss": 0.2061, "step": 6001 }, { "epoch": 4.2167858487015435, "grad_norm": 0.20658008754253387, "learning_rate": 4.392430376344293e-05, "loss": 0.1542, "step": 6002 }, { "epoch": 4.217538577342868, "grad_norm": 0.2149086445569992, "learning_rate": 4.39098809218759e-05, "loss": 0.1911, "step": 6003 }, { "epoch": 4.218291305984192, "grad_norm": 0.188877671957016, "learning_rate": 4.3895459293107635e-05, "loss": 0.1659, "step": 6004 }, { "epoch": 4.219044034625518, "grad_norm": 0.16734513640403748, "learning_rate": 4.388103887871522e-05, "loss": 0.1358, "step": 6005 }, { "epoch": 4.219796763266842, "grad_norm": 0.2045632153749466, "learning_rate": 4.38666196802757e-05, "loss": 0.1686, "step": 6006 }, { "epoch": 4.220549491908167, "grad_norm": 0.17185251414775848, "learning_rate": 4.38522016993659e-05, "loss": 0.1746, "step": 6007 }, { "epoch": 4.221302220549492, "grad_norm": 0.24257054924964905, "learning_rate": 4.3837784937562574e-05, "loss": 0.1312, "step": 6008 }, { "epoch": 4.222054949190817, "grad_norm": 0.18196554481983185, "learning_rate": 4.382336939644229e-05, "loss": 0.1422, "step": 6009 }, { "epoch": 4.222807677832142, "grad_norm": 0.16543954610824585, "learning_rate": 4.380895507758155e-05, "loss": 0.1743, "step": 6010 }, { "epoch": 4.223560406473466, "grad_norm": 0.18536902964115143, "learning_rate": 4.379454198255663e-05, "loss": 0.1929, "step": 6011 }, { "epoch": 4.224313135114791, "grad_norm": 0.19731159508228302, "learning_rate": 4.378013011294375e-05, "loss": 0.1559, "step": 6012 }, { "epoch": 4.225065863756116, "grad_norm": 0.17511878907680511, "learning_rate": 4.3765719470318986e-05, "loss": 0.1343, "step": 6013 }, { "epoch": 4.225818592397441, "grad_norm": 0.16421107947826385, "learning_rate": 4.375131005625826e-05, "loss": 0.1003, "step": 6014 }, { "epoch": 4.226571321038765, "grad_norm": 0.18731820583343506, "learning_rate": 4.3736901872337346e-05, "loss": 0.1474, "step": 6015 }, { "epoch": 4.227324049680091, "grad_norm": 0.1743442565202713, "learning_rate": 4.372249492013191e-05, "loss": 0.1145, "step": 6016 }, { "epoch": 4.228076778321415, "grad_norm": 0.17005954682826996, "learning_rate": 4.370808920121747e-05, "loss": 0.2001, "step": 6017 }, { "epoch": 4.2288295069627395, "grad_norm": 0.1504928320646286, "learning_rate": 4.36936847171694e-05, "loss": 0.1088, "step": 6018 }, { "epoch": 4.229582235604065, "grad_norm": 0.1507776975631714, "learning_rate": 4.3679281469562996e-05, "loss": 0.0851, "step": 6019 }, { "epoch": 4.230334964245389, "grad_norm": 0.18933556973934174, "learning_rate": 4.366487945997335e-05, "loss": 0.2096, "step": 6020 }, { "epoch": 4.231087692886715, "grad_norm": 0.1695169359445572, "learning_rate": 4.365047868997544e-05, "loss": 0.1832, "step": 6021 }, { "epoch": 4.231840421528039, "grad_norm": 0.16591688990592957, "learning_rate": 4.3636079161144105e-05, "loss": 0.0867, "step": 6022 }, { "epoch": 4.232593150169364, "grad_norm": 0.18616902828216553, "learning_rate": 4.3621680875054086e-05, "loss": 0.1886, "step": 6023 }, { "epoch": 4.233345878810689, "grad_norm": 0.19624730944633484, "learning_rate": 4.360728383327991e-05, "loss": 0.1652, "step": 6024 }, { "epoch": 4.2340986074520135, "grad_norm": 0.18987587094306946, "learning_rate": 4.359288803739607e-05, "loss": 0.1785, "step": 6025 }, { "epoch": 4.234851336093338, "grad_norm": 0.17313413321971893, "learning_rate": 4.3578493488976844e-05, "loss": 0.1244, "step": 6026 }, { "epoch": 4.235604064734663, "grad_norm": 0.13979767262935638, "learning_rate": 4.356410018959639e-05, "loss": 0.077, "step": 6027 }, { "epoch": 4.236356793375988, "grad_norm": 0.17670957744121552, "learning_rate": 4.354970814082875e-05, "loss": 0.1215, "step": 6028 }, { "epoch": 4.237109522017313, "grad_norm": 0.16361941397190094, "learning_rate": 4.353531734424782e-05, "loss": 0.0683, "step": 6029 }, { "epoch": 4.237862250658638, "grad_norm": 0.1670798510313034, "learning_rate": 4.3520927801427324e-05, "loss": 0.1268, "step": 6030 }, { "epoch": 4.238614979299962, "grad_norm": 0.1711176186800003, "learning_rate": 4.350653951394093e-05, "loss": 0.148, "step": 6031 }, { "epoch": 4.2393677079412875, "grad_norm": 0.18109959363937378, "learning_rate": 4.349215248336207e-05, "loss": 0.1108, "step": 6032 }, { "epoch": 4.240120436582612, "grad_norm": 0.16205191612243652, "learning_rate": 4.3477766711264136e-05, "loss": 0.0929, "step": 6033 }, { "epoch": 4.240873165223936, "grad_norm": 0.1750706434249878, "learning_rate": 4.3463382199220305e-05, "loss": 0.1543, "step": 6034 }, { "epoch": 4.241625893865262, "grad_norm": 0.17630481719970703, "learning_rate": 4.3448998948803656e-05, "loss": 0.2231, "step": 6035 }, { "epoch": 4.242378622506586, "grad_norm": 0.17236259579658508, "learning_rate": 4.34346169615871e-05, "loss": 0.0812, "step": 6036 }, { "epoch": 4.243131351147911, "grad_norm": 0.18609313666820526, "learning_rate": 4.342023623914344e-05, "loss": 0.1457, "step": 6037 }, { "epoch": 4.243884079789236, "grad_norm": 0.1662972867488861, "learning_rate": 4.340585678304535e-05, "loss": 0.1346, "step": 6038 }, { "epoch": 4.244636808430561, "grad_norm": 0.19805125892162323, "learning_rate": 4.339147859486534e-05, "loss": 0.1131, "step": 6039 }, { "epoch": 4.245389537071886, "grad_norm": 0.1911008656024933, "learning_rate": 4.337710167617577e-05, "loss": 0.1566, "step": 6040 }, { "epoch": 4.24614226571321, "grad_norm": 0.18207360804080963, "learning_rate": 4.336272602854888e-05, "loss": 0.1568, "step": 6041 }, { "epoch": 4.246894994354535, "grad_norm": 0.16821713745594025, "learning_rate": 4.3348351653556784e-05, "loss": 0.0931, "step": 6042 }, { "epoch": 4.24764772299586, "grad_norm": 0.15358254313468933, "learning_rate": 4.3333978552771406e-05, "loss": 0.1259, "step": 6043 }, { "epoch": 4.248400451637185, "grad_norm": 0.17968876659870148, "learning_rate": 4.331960672776462e-05, "loss": 0.1224, "step": 6044 }, { "epoch": 4.249153180278509, "grad_norm": 0.16545841097831726, "learning_rate": 4.330523618010808e-05, "loss": 0.1288, "step": 6045 }, { "epoch": 4.2499059089198346, "grad_norm": 0.17467093467712402, "learning_rate": 4.329086691137333e-05, "loss": 0.1252, "step": 6046 }, { "epoch": 4.250658637561159, "grad_norm": 0.15062950551509857, "learning_rate": 4.327649892313177e-05, "loss": 0.1144, "step": 6047 }, { "epoch": 4.251411366202484, "grad_norm": 0.16213667392730713, "learning_rate": 4.3262132216954656e-05, "loss": 0.1022, "step": 6048 }, { "epoch": 4.252164094843809, "grad_norm": 0.16928835213184357, "learning_rate": 4.32477667944131e-05, "loss": 0.1377, "step": 6049 }, { "epoch": 4.252916823485133, "grad_norm": 0.1748395562171936, "learning_rate": 4.3233402657078115e-05, "loss": 0.1316, "step": 6050 }, { "epoch": 4.253669552126459, "grad_norm": 0.16120494902133942, "learning_rate": 4.321903980652052e-05, "loss": 0.1795, "step": 6051 }, { "epoch": 4.254422280767783, "grad_norm": 0.14396393299102783, "learning_rate": 4.3204678244311024e-05, "loss": 0.1082, "step": 6052 }, { "epoch": 4.255175009409108, "grad_norm": 0.1686316281557083, "learning_rate": 4.319031797202018e-05, "loss": 0.1363, "step": 6053 }, { "epoch": 4.255927738050433, "grad_norm": 0.15393033623695374, "learning_rate": 4.317595899121841e-05, "loss": 0.158, "step": 6054 }, { "epoch": 4.2566804666917575, "grad_norm": 0.15350539982318878, "learning_rate": 4.316160130347599e-05, "loss": 0.1492, "step": 6055 }, { "epoch": 4.257433195333082, "grad_norm": 0.16997715830802917, "learning_rate": 4.314724491036304e-05, "loss": 0.1494, "step": 6056 }, { "epoch": 4.258185923974407, "grad_norm": 0.16819174587726593, "learning_rate": 4.313288981344956e-05, "loss": 0.1588, "step": 6057 }, { "epoch": 4.258938652615732, "grad_norm": 0.14647620916366577, "learning_rate": 4.3118536014305433e-05, "loss": 0.0789, "step": 6058 }, { "epoch": 4.259691381257057, "grad_norm": 0.15481923520565033, "learning_rate": 4.310418351450033e-05, "loss": 0.1205, "step": 6059 }, { "epoch": 4.260444109898382, "grad_norm": 0.17010824382305145, "learning_rate": 4.308983231560384e-05, "loss": 0.2072, "step": 6060 }, { "epoch": 4.261196838539706, "grad_norm": 0.17932280898094177, "learning_rate": 4.307548241918537e-05, "loss": 0.1554, "step": 6061 }, { "epoch": 4.2619495671810315, "grad_norm": 0.14995069801807404, "learning_rate": 4.3061133826814214e-05, "loss": 0.1047, "step": 6062 }, { "epoch": 4.262702295822356, "grad_norm": 0.17249447107315063, "learning_rate": 4.3046786540059524e-05, "loss": 0.0945, "step": 6063 }, { "epoch": 4.263455024463681, "grad_norm": 0.1743735820055008, "learning_rate": 4.303244056049028e-05, "loss": 0.1653, "step": 6064 }, { "epoch": 4.264207753105006, "grad_norm": 0.15289205312728882, "learning_rate": 4.3018095889675344e-05, "loss": 0.1323, "step": 6065 }, { "epoch": 4.26496048174633, "grad_norm": 0.1909562200307846, "learning_rate": 4.300375252918344e-05, "loss": 0.1445, "step": 6066 }, { "epoch": 4.265713210387656, "grad_norm": 0.18411900103092194, "learning_rate": 4.2989410480583116e-05, "loss": 0.112, "step": 6067 }, { "epoch": 4.26646593902898, "grad_norm": 0.1874161809682846, "learning_rate": 4.297506974544281e-05, "loss": 0.1555, "step": 6068 }, { "epoch": 4.2672186676703046, "grad_norm": 0.17491701245307922, "learning_rate": 4.296073032533076e-05, "loss": 0.108, "step": 6069 }, { "epoch": 4.26797139631163, "grad_norm": 0.18439613282680511, "learning_rate": 4.2946392221815165e-05, "loss": 0.1189, "step": 6070 }, { "epoch": 4.268724124952954, "grad_norm": 0.16691374778747559, "learning_rate": 4.2932055436464015e-05, "loss": 0.1428, "step": 6071 }, { "epoch": 4.269476853594279, "grad_norm": 0.14587706327438354, "learning_rate": 4.291771997084512e-05, "loss": 0.1118, "step": 6072 }, { "epoch": 4.270229582235604, "grad_norm": 0.1665160208940506, "learning_rate": 4.290338582652621e-05, "loss": 0.1526, "step": 6073 }, { "epoch": 4.270982310876929, "grad_norm": 0.15455174446105957, "learning_rate": 4.288905300507482e-05, "loss": 0.0738, "step": 6074 }, { "epoch": 4.271735039518254, "grad_norm": 0.15376369655132294, "learning_rate": 4.2874721508058394e-05, "loss": 0.141, "step": 6075 }, { "epoch": 4.2724877681595785, "grad_norm": 0.15382680296897888, "learning_rate": 4.2860391337044204e-05, "loss": 0.1237, "step": 6076 }, { "epoch": 4.273240496800903, "grad_norm": 0.17916695773601532, "learning_rate": 4.284606249359936e-05, "loss": 0.1807, "step": 6077 }, { "epoch": 4.273993225442228, "grad_norm": 0.16859067976474762, "learning_rate": 4.283173497929084e-05, "loss": 0.1132, "step": 6078 }, { "epoch": 4.274745954083553, "grad_norm": 0.15379570424556732, "learning_rate": 4.28174087956855e-05, "loss": 0.1384, "step": 6079 }, { "epoch": 4.275498682724877, "grad_norm": 0.21806707978248596, "learning_rate": 4.2803083944350016e-05, "loss": 0.1778, "step": 6080 }, { "epoch": 4.276251411366203, "grad_norm": 0.15511134266853333, "learning_rate": 4.278876042685093e-05, "loss": 0.0672, "step": 6081 }, { "epoch": 4.277004140007527, "grad_norm": 0.16948537528514862, "learning_rate": 4.277443824475464e-05, "loss": 0.1458, "step": 6082 }, { "epoch": 4.2777568686488525, "grad_norm": 0.15187832713127136, "learning_rate": 4.276011739962741e-05, "loss": 0.1153, "step": 6083 }, { "epoch": 4.278509597290177, "grad_norm": 0.1485111117362976, "learning_rate": 4.2745797893035346e-05, "loss": 0.1321, "step": 6084 }, { "epoch": 4.2792623259315015, "grad_norm": 0.17091618478298187, "learning_rate": 4.2731479726544386e-05, "loss": 0.1626, "step": 6085 }, { "epoch": 4.280015054572827, "grad_norm": 0.1950063556432724, "learning_rate": 4.271716290172038e-05, "loss": 0.1474, "step": 6086 }, { "epoch": 4.280767783214151, "grad_norm": 0.15657338500022888, "learning_rate": 4.2702847420128954e-05, "loss": 0.0814, "step": 6087 }, { "epoch": 4.281520511855476, "grad_norm": 0.19885994493961334, "learning_rate": 4.268853328333563e-05, "loss": 0.0965, "step": 6088 }, { "epoch": 4.282273240496801, "grad_norm": 0.14869806170463562, "learning_rate": 4.2674220492905826e-05, "loss": 0.1348, "step": 6089 }, { "epoch": 4.283025969138126, "grad_norm": 0.17493411898612976, "learning_rate": 4.265990905040472e-05, "loss": 0.1959, "step": 6090 }, { "epoch": 4.28377869777945, "grad_norm": 0.17392049729824066, "learning_rate": 4.2645598957397415e-05, "loss": 0.1143, "step": 6091 }, { "epoch": 4.2845314264207754, "grad_norm": 0.2189701497554779, "learning_rate": 4.263129021544884e-05, "loss": 0.2032, "step": 6092 }, { "epoch": 4.2852841550621, "grad_norm": 0.17626039683818817, "learning_rate": 4.261698282612376e-05, "loss": 0.1458, "step": 6093 }, { "epoch": 4.286036883703425, "grad_norm": 0.16640132665634155, "learning_rate": 4.260267679098681e-05, "loss": 0.1541, "step": 6094 }, { "epoch": 4.28678961234475, "grad_norm": 0.18809254467487335, "learning_rate": 4.2588372111602504e-05, "loss": 0.1696, "step": 6095 }, { "epoch": 4.287542340986074, "grad_norm": 0.1885201781988144, "learning_rate": 4.257406878953516e-05, "loss": 0.1173, "step": 6096 }, { "epoch": 4.2882950696274, "grad_norm": 0.1757071167230606, "learning_rate": 4.255976682634898e-05, "loss": 0.1703, "step": 6097 }, { "epoch": 4.289047798268724, "grad_norm": 0.16948570311069489, "learning_rate": 4.2545466223607985e-05, "loss": 0.1024, "step": 6098 }, { "epoch": 4.2898005269100485, "grad_norm": 0.1648169904947281, "learning_rate": 4.253116698287607e-05, "loss": 0.1289, "step": 6099 }, { "epoch": 4.290553255551374, "grad_norm": 0.18119949102401733, "learning_rate": 4.2516869105717004e-05, "loss": 0.1414, "step": 6100 }, { "epoch": 4.291305984192698, "grad_norm": 0.17972812056541443, "learning_rate": 4.250257259369434e-05, "loss": 0.1372, "step": 6101 }, { "epoch": 4.292058712834024, "grad_norm": 0.17546944320201874, "learning_rate": 4.248827744837157e-05, "loss": 0.1323, "step": 6102 }, { "epoch": 4.292811441475348, "grad_norm": 0.17619597911834717, "learning_rate": 4.2473983671311965e-05, "loss": 0.1564, "step": 6103 }, { "epoch": 4.293564170116673, "grad_norm": 0.19961139559745789, "learning_rate": 4.245969126407868e-05, "loss": 0.1292, "step": 6104 }, { "epoch": 4.294316898757998, "grad_norm": 0.16226400434970856, "learning_rate": 4.2445400228234686e-05, "loss": 0.1381, "step": 6105 }, { "epoch": 4.2950696273993225, "grad_norm": 0.1563248485326767, "learning_rate": 4.2431110565342866e-05, "loss": 0.1225, "step": 6106 }, { "epoch": 4.295822356040647, "grad_norm": 0.18709824979305267, "learning_rate": 4.241682227696587e-05, "loss": 0.1254, "step": 6107 }, { "epoch": 4.296575084681972, "grad_norm": 0.15146656334400177, "learning_rate": 4.240253536466629e-05, "loss": 0.0629, "step": 6108 }, { "epoch": 4.297327813323297, "grad_norm": 0.17470978200435638, "learning_rate": 4.238824983000651e-05, "loss": 0.1842, "step": 6109 }, { "epoch": 4.298080541964621, "grad_norm": 0.1642577201128006, "learning_rate": 4.2373965674548764e-05, "loss": 0.1573, "step": 6110 }, { "epoch": 4.298833270605947, "grad_norm": 0.14976118505001068, "learning_rate": 4.2359682899855146e-05, "loss": 0.065, "step": 6111 }, { "epoch": 4.299585999247271, "grad_norm": 0.14880113303661346, "learning_rate": 4.2345401507487606e-05, "loss": 0.0647, "step": 6112 }, { "epoch": 4.3003387278885965, "grad_norm": 0.1900012046098709, "learning_rate": 4.233112149900791e-05, "loss": 0.1552, "step": 6113 }, { "epoch": 4.301091456529921, "grad_norm": 0.1446414589881897, "learning_rate": 4.2316842875977726e-05, "loss": 0.1689, "step": 6114 }, { "epoch": 4.3018441851712454, "grad_norm": 0.1719525009393692, "learning_rate": 4.2302565639958547e-05, "loss": 0.1368, "step": 6115 }, { "epoch": 4.302596913812571, "grad_norm": 0.15247297286987305, "learning_rate": 4.22882897925117e-05, "loss": 0.0983, "step": 6116 }, { "epoch": 4.303349642453895, "grad_norm": 0.19085709750652313, "learning_rate": 4.227401533519837e-05, "loss": 0.1406, "step": 6117 }, { "epoch": 4.30410237109522, "grad_norm": 0.1807686984539032, "learning_rate": 4.225974226957959e-05, "loss": 0.124, "step": 6118 }, { "epoch": 4.304855099736545, "grad_norm": 0.17725270986557007, "learning_rate": 4.224547059721624e-05, "loss": 0.0847, "step": 6119 }, { "epoch": 4.30560782837787, "grad_norm": 0.18161942064762115, "learning_rate": 4.223120031966903e-05, "loss": 0.1108, "step": 6120 }, { "epoch": 4.306360557019195, "grad_norm": 0.14498066902160645, "learning_rate": 4.221693143849857e-05, "loss": 0.0485, "step": 6121 }, { "epoch": 4.307113285660519, "grad_norm": 0.1548694670200348, "learning_rate": 4.2202663955265286e-05, "loss": 0.0845, "step": 6122 }, { "epoch": 4.307866014301844, "grad_norm": 0.18466123938560486, "learning_rate": 4.218839787152941e-05, "loss": 0.1588, "step": 6123 }, { "epoch": 4.308618742943169, "grad_norm": 0.16905130445957184, "learning_rate": 4.217413318885108e-05, "loss": 0.126, "step": 6124 }, { "epoch": 4.309371471584494, "grad_norm": 0.19543185830116272, "learning_rate": 4.215986990879027e-05, "loss": 0.1978, "step": 6125 }, { "epoch": 4.310124200225818, "grad_norm": 0.1441568285226822, "learning_rate": 4.214560803290677e-05, "loss": 0.0848, "step": 6126 }, { "epoch": 4.310876928867144, "grad_norm": 0.20885904133319855, "learning_rate": 4.213134756276027e-05, "loss": 0.1522, "step": 6127 }, { "epoch": 4.311629657508468, "grad_norm": 0.16268819570541382, "learning_rate": 4.211708849991023e-05, "loss": 0.12, "step": 6128 }, { "epoch": 4.312382386149793, "grad_norm": 0.15200984477996826, "learning_rate": 4.210283084591603e-05, "loss": 0.0656, "step": 6129 }, { "epoch": 4.313135114791118, "grad_norm": 0.18041911721229553, "learning_rate": 4.208857460233687e-05, "loss": 0.2044, "step": 6130 }, { "epoch": 4.313887843432442, "grad_norm": 0.15432709455490112, "learning_rate": 4.2074319770731775e-05, "loss": 0.1237, "step": 6131 }, { "epoch": 4.314640572073768, "grad_norm": 0.16255255043506622, "learning_rate": 4.206006635265962e-05, "loss": 0.1311, "step": 6132 }, { "epoch": 4.315393300715092, "grad_norm": 0.16784879565238953, "learning_rate": 4.204581434967917e-05, "loss": 0.1071, "step": 6133 }, { "epoch": 4.316146029356417, "grad_norm": 0.1560693234205246, "learning_rate": 4.203156376334898e-05, "loss": 0.1475, "step": 6134 }, { "epoch": 4.316898757997742, "grad_norm": 0.1341804563999176, "learning_rate": 4.201731459522749e-05, "loss": 0.0563, "step": 6135 }, { "epoch": 4.3176514866390665, "grad_norm": 0.20070384442806244, "learning_rate": 4.200306684687295e-05, "loss": 0.1353, "step": 6136 }, { "epoch": 4.318404215280392, "grad_norm": 0.17830151319503784, "learning_rate": 4.198882051984349e-05, "loss": 0.1341, "step": 6137 }, { "epoch": 4.319156943921716, "grad_norm": 0.18546420335769653, "learning_rate": 4.197457561569703e-05, "loss": 0.1472, "step": 6138 }, { "epoch": 4.319909672563041, "grad_norm": 0.17295536398887634, "learning_rate": 4.19603321359914e-05, "loss": 0.1181, "step": 6139 }, { "epoch": 4.320662401204366, "grad_norm": 0.16861537098884583, "learning_rate": 4.1946090082284264e-05, "loss": 0.1247, "step": 6140 }, { "epoch": 4.321415129845691, "grad_norm": 0.16706021130084991, "learning_rate": 4.1931849456133074e-05, "loss": 0.1093, "step": 6141 }, { "epoch": 4.322167858487015, "grad_norm": 0.16438138484954834, "learning_rate": 4.1917610259095195e-05, "loss": 0.0975, "step": 6142 }, { "epoch": 4.3229205871283405, "grad_norm": 0.2026090919971466, "learning_rate": 4.190337249272778e-05, "loss": 0.1513, "step": 6143 }, { "epoch": 4.323673315769665, "grad_norm": 0.1385471373796463, "learning_rate": 4.188913615858786e-05, "loss": 0.1503, "step": 6144 }, { "epoch": 4.324426044410989, "grad_norm": 0.20210599899291992, "learning_rate": 4.187490125823228e-05, "loss": 0.1072, "step": 6145 }, { "epoch": 4.325178773052315, "grad_norm": 0.1677982658147812, "learning_rate": 4.186066779321779e-05, "loss": 0.1265, "step": 6146 }, { "epoch": 4.325931501693639, "grad_norm": 0.2267083376646042, "learning_rate": 4.18464357651009e-05, "loss": 0.1561, "step": 6147 }, { "epoch": 4.326684230334965, "grad_norm": 0.14806091785430908, "learning_rate": 4.1832205175438036e-05, "loss": 0.09, "step": 6148 }, { "epoch": 4.327436958976289, "grad_norm": 0.19683410227298737, "learning_rate": 4.18179760257854e-05, "loss": 0.1511, "step": 6149 }, { "epoch": 4.328189687617614, "grad_norm": 0.16287082433700562, "learning_rate": 4.18037483176991e-05, "loss": 0.1374, "step": 6150 }, { "epoch": 4.328942416258939, "grad_norm": 0.17992915213108063, "learning_rate": 4.178952205273503e-05, "loss": 0.1122, "step": 6151 }, { "epoch": 4.329695144900263, "grad_norm": 0.15723855793476105, "learning_rate": 4.177529723244897e-05, "loss": 0.094, "step": 6152 }, { "epoch": 4.330447873541588, "grad_norm": 0.1743958592414856, "learning_rate": 4.1761073858396526e-05, "loss": 0.1535, "step": 6153 }, { "epoch": 4.331200602182913, "grad_norm": 0.17067857086658478, "learning_rate": 4.174685193213313e-05, "loss": 0.1243, "step": 6154 }, { "epoch": 4.331953330824238, "grad_norm": 0.18023928999900818, "learning_rate": 4.17326314552141e-05, "loss": 0.1076, "step": 6155 }, { "epoch": 4.332706059465563, "grad_norm": 0.16378480195999146, "learning_rate": 4.1718412429194545e-05, "loss": 0.0953, "step": 6156 }, { "epoch": 4.333458788106888, "grad_norm": 0.19918425381183624, "learning_rate": 4.1704194855629434e-05, "loss": 0.1576, "step": 6157 }, { "epoch": 4.334211516748212, "grad_norm": 0.17436039447784424, "learning_rate": 4.168997873607355e-05, "loss": 0.202, "step": 6158 }, { "epoch": 4.334964245389537, "grad_norm": 0.175679013133049, "learning_rate": 4.167576407208162e-05, "loss": 0.1421, "step": 6159 }, { "epoch": 4.335716974030862, "grad_norm": 0.1708555817604065, "learning_rate": 4.166155086520809e-05, "loss": 0.1968, "step": 6160 }, { "epoch": 4.336469702672186, "grad_norm": 0.19506897032260895, "learning_rate": 4.16473391170073e-05, "loss": 0.1352, "step": 6161 }, { "epoch": 4.337222431313512, "grad_norm": 0.15694350004196167, "learning_rate": 4.163312882903344e-05, "loss": 0.0775, "step": 6162 }, { "epoch": 4.337975159954836, "grad_norm": 0.1534508466720581, "learning_rate": 4.16189200028405e-05, "loss": 0.0965, "step": 6163 }, { "epoch": 4.338727888596161, "grad_norm": 0.17239461839199066, "learning_rate": 4.160471263998235e-05, "loss": 0.2241, "step": 6164 }, { "epoch": 4.339480617237486, "grad_norm": 0.16014187037944794, "learning_rate": 4.159050674201269e-05, "loss": 0.1403, "step": 6165 }, { "epoch": 4.3402333458788105, "grad_norm": 0.16464847326278687, "learning_rate": 4.157630231048506e-05, "loss": 0.1474, "step": 6166 }, { "epoch": 4.340986074520136, "grad_norm": 0.16507360339164734, "learning_rate": 4.1562099346952824e-05, "loss": 0.0997, "step": 6167 }, { "epoch": 4.34173880316146, "grad_norm": 0.16794128715991974, "learning_rate": 4.15478978529692e-05, "loss": 0.0794, "step": 6168 }, { "epoch": 4.342491531802785, "grad_norm": 0.1807040423154831, "learning_rate": 4.153369783008724e-05, "loss": 0.1684, "step": 6169 }, { "epoch": 4.34324426044411, "grad_norm": 0.1713259369134903, "learning_rate": 4.151949927985983e-05, "loss": 0.1203, "step": 6170 }, { "epoch": 4.343996989085435, "grad_norm": 0.1774558126926422, "learning_rate": 4.15053022038397e-05, "loss": 0.1658, "step": 6171 }, { "epoch": 4.344749717726759, "grad_norm": 0.183060884475708, "learning_rate": 4.1491106603579466e-05, "loss": 0.1655, "step": 6172 }, { "epoch": 4.3455024463680845, "grad_norm": 0.16808100044727325, "learning_rate": 4.1476912480631494e-05, "loss": 0.1021, "step": 6173 }, { "epoch": 4.346255175009409, "grad_norm": 0.18391826748847961, "learning_rate": 4.1462719836548025e-05, "loss": 0.2146, "step": 6174 }, { "epoch": 4.347007903650734, "grad_norm": 0.16362504661083221, "learning_rate": 4.144852867288117e-05, "loss": 0.2117, "step": 6175 }, { "epoch": 4.347760632292059, "grad_norm": 0.17888981103897095, "learning_rate": 4.143433899118285e-05, "loss": 0.1396, "step": 6176 }, { "epoch": 4.348513360933383, "grad_norm": 0.1958789825439453, "learning_rate": 4.142015079300482e-05, "loss": 0.1794, "step": 6177 }, { "epoch": 4.349266089574709, "grad_norm": 0.1975773274898529, "learning_rate": 4.140596407989867e-05, "loss": 0.1382, "step": 6178 }, { "epoch": 4.350018818216033, "grad_norm": 0.18112188577651978, "learning_rate": 4.139177885341586e-05, "loss": 0.1393, "step": 6179 }, { "epoch": 4.350771546857358, "grad_norm": 0.15854214131832123, "learning_rate": 4.137759511510766e-05, "loss": 0.1166, "step": 6180 }, { "epoch": 4.351524275498683, "grad_norm": 0.17539627850055695, "learning_rate": 4.1363412866525185e-05, "loss": 0.1402, "step": 6181 }, { "epoch": 4.352277004140007, "grad_norm": 0.20765873789787292, "learning_rate": 4.134923210921937e-05, "loss": 0.1326, "step": 6182 }, { "epoch": 4.353029732781332, "grad_norm": 0.18251709640026093, "learning_rate": 4.133505284474098e-05, "loss": 0.1903, "step": 6183 }, { "epoch": 4.353782461422657, "grad_norm": 0.16335590183734894, "learning_rate": 4.132087507464068e-05, "loss": 0.07, "step": 6184 }, { "epoch": 4.354535190063982, "grad_norm": 0.19234564900398254, "learning_rate": 4.1306698800468934e-05, "loss": 0.1722, "step": 6185 }, { "epoch": 4.355287918705307, "grad_norm": 0.1701175719499588, "learning_rate": 4.129252402377602e-05, "loss": 0.1578, "step": 6186 }, { "epoch": 4.3560406473466315, "grad_norm": 0.1657828390598297, "learning_rate": 4.127835074611206e-05, "loss": 0.0947, "step": 6187 }, { "epoch": 4.356793375987956, "grad_norm": 0.1643206775188446, "learning_rate": 4.1264178969027034e-05, "loss": 0.1057, "step": 6188 }, { "epoch": 4.357546104629281, "grad_norm": 0.1638907939195633, "learning_rate": 4.125000869407074e-05, "loss": 0.1381, "step": 6189 }, { "epoch": 4.358298833270606, "grad_norm": 0.17990510165691376, "learning_rate": 4.123583992279282e-05, "loss": 0.1099, "step": 6190 }, { "epoch": 4.359051561911931, "grad_norm": 0.16746394336223602, "learning_rate": 4.122167265674276e-05, "loss": 0.1732, "step": 6191 }, { "epoch": 4.359804290553256, "grad_norm": 0.1997857391834259, "learning_rate": 4.120750689746986e-05, "loss": 0.1969, "step": 6192 }, { "epoch": 4.36055701919458, "grad_norm": 0.1481809914112091, "learning_rate": 4.1193342646523266e-05, "loss": 0.1189, "step": 6193 }, { "epoch": 4.3613097478359055, "grad_norm": 0.21273252367973328, "learning_rate": 4.117917990545197e-05, "loss": 0.1476, "step": 6194 }, { "epoch": 4.36206247647723, "grad_norm": 0.16093914210796356, "learning_rate": 4.116501867580477e-05, "loss": 0.1678, "step": 6195 }, { "epoch": 4.3628152051185545, "grad_norm": 0.186054065823555, "learning_rate": 4.1150858959130336e-05, "loss": 0.1919, "step": 6196 }, { "epoch": 4.36356793375988, "grad_norm": 0.17949771881103516, "learning_rate": 4.1136700756977145e-05, "loss": 0.0941, "step": 6197 }, { "epoch": 4.364320662401204, "grad_norm": 0.1560443937778473, "learning_rate": 4.112254407089351e-05, "loss": 0.1785, "step": 6198 }, { "epoch": 4.365073391042529, "grad_norm": 0.1840757131576538, "learning_rate": 4.110838890242759e-05, "loss": 0.1448, "step": 6199 }, { "epoch": 4.365826119683854, "grad_norm": 0.19623781740665436, "learning_rate": 4.109423525312738e-05, "loss": 0.1604, "step": 6200 }, { "epoch": 4.365826119683854, "eval_loss": 0.17568251490592957, "eval_runtime": 456.6125, "eval_samples_per_second": 21.084, "eval_steps_per_second": 0.659, "step": 6200 }, { "epoch": 4.366578848325179, "grad_norm": 0.17881731688976288, "learning_rate": 4.108008312454069e-05, "loss": 0.1467, "step": 6201 }, { "epoch": 4.367331576966504, "grad_norm": 0.21320374310016632, "learning_rate": 4.106593251821518e-05, "loss": 0.1187, "step": 6202 }, { "epoch": 4.3680843056078285, "grad_norm": 0.2020149827003479, "learning_rate": 4.105178343569833e-05, "loss": 0.1789, "step": 6203 }, { "epoch": 4.368837034249153, "grad_norm": 0.1753395050764084, "learning_rate": 4.1037635878537495e-05, "loss": 0.1548, "step": 6204 }, { "epoch": 4.369589762890478, "grad_norm": 0.15373702347278595, "learning_rate": 4.102348984827981e-05, "loss": 0.0681, "step": 6205 }, { "epoch": 4.370342491531803, "grad_norm": 0.16374151408672333, "learning_rate": 4.100934534647224e-05, "loss": 0.1199, "step": 6206 }, { "epoch": 4.371095220173127, "grad_norm": 0.15764541923999786, "learning_rate": 4.099520237466165e-05, "loss": 0.0899, "step": 6207 }, { "epoch": 4.371847948814453, "grad_norm": 0.21810728311538696, "learning_rate": 4.0981060934394655e-05, "loss": 0.1163, "step": 6208 }, { "epoch": 4.372600677455777, "grad_norm": 0.18505467474460602, "learning_rate": 4.096692102721775e-05, "loss": 0.1311, "step": 6209 }, { "epoch": 4.373353406097102, "grad_norm": 0.16754992306232452, "learning_rate": 4.095278265467729e-05, "loss": 0.1843, "step": 6210 }, { "epoch": 4.374106134738427, "grad_norm": 0.17180804908275604, "learning_rate": 4.093864581831939e-05, "loss": 0.1591, "step": 6211 }, { "epoch": 4.374858863379751, "grad_norm": 0.16791851818561554, "learning_rate": 4.092451051969005e-05, "loss": 0.1436, "step": 6212 }, { "epoch": 4.375611592021077, "grad_norm": 0.1814541220664978, "learning_rate": 4.091037676033507e-05, "loss": 0.1421, "step": 6213 }, { "epoch": 4.376364320662401, "grad_norm": 0.19288577139377594, "learning_rate": 4.08962445418001e-05, "loss": 0.1552, "step": 6214 }, { "epoch": 4.377117049303726, "grad_norm": 0.19003693759441376, "learning_rate": 4.0882113865630614e-05, "loss": 0.1702, "step": 6215 }, { "epoch": 4.377869777945051, "grad_norm": 0.19789527356624603, "learning_rate": 4.086798473337193e-05, "loss": 0.246, "step": 6216 }, { "epoch": 4.3786225065863755, "grad_norm": 0.18008887767791748, "learning_rate": 4.085385714656921e-05, "loss": 0.11, "step": 6217 }, { "epoch": 4.3793752352277, "grad_norm": 0.13805995881557465, "learning_rate": 4.08397311067674e-05, "loss": 0.0689, "step": 6218 }, { "epoch": 4.380127963869025, "grad_norm": 0.18281449377536774, "learning_rate": 4.0825606615511305e-05, "loss": 0.128, "step": 6219 }, { "epoch": 4.38088069251035, "grad_norm": 0.16130933165550232, "learning_rate": 4.081148367434554e-05, "loss": 0.181, "step": 6220 }, { "epoch": 4.381633421151675, "grad_norm": 0.24205784499645233, "learning_rate": 4.07973622848146e-05, "loss": 0.1774, "step": 6221 }, { "epoch": 4.382386149793, "grad_norm": 0.19109636545181274, "learning_rate": 4.078324244846277e-05, "loss": 0.1354, "step": 6222 }, { "epoch": 4.383138878434324, "grad_norm": 0.20246995985507965, "learning_rate": 4.076912416683417e-05, "loss": 0.1478, "step": 6223 }, { "epoch": 4.3838916070756495, "grad_norm": 0.19419147074222565, "learning_rate": 4.075500744147275e-05, "loss": 0.1694, "step": 6224 }, { "epoch": 4.384644335716974, "grad_norm": 0.1728624552488327, "learning_rate": 4.07408922739223e-05, "loss": 0.1265, "step": 6225 }, { "epoch": 4.3853970643582985, "grad_norm": 0.2080737054347992, "learning_rate": 4.072677866572645e-05, "loss": 0.2398, "step": 6226 }, { "epoch": 4.386149792999624, "grad_norm": 0.1990901231765747, "learning_rate": 4.071266661842861e-05, "loss": 0.1617, "step": 6227 }, { "epoch": 4.386902521640948, "grad_norm": 0.19564121961593628, "learning_rate": 4.0698556133572064e-05, "loss": 0.1598, "step": 6228 }, { "epoch": 4.387655250282274, "grad_norm": 0.1878281831741333, "learning_rate": 4.068444721269993e-05, "loss": 0.1984, "step": 6229 }, { "epoch": 4.388407978923598, "grad_norm": 0.16103895008563995, "learning_rate": 4.0670339857355124e-05, "loss": 0.1092, "step": 6230 }, { "epoch": 4.389160707564923, "grad_norm": 0.1981949359178543, "learning_rate": 4.06562340690804e-05, "loss": 0.1738, "step": 6231 }, { "epoch": 4.389913436206248, "grad_norm": 0.19864477217197418, "learning_rate": 4.0642129849418356e-05, "loss": 0.2047, "step": 6232 }, { "epoch": 4.390666164847572, "grad_norm": 0.163682758808136, "learning_rate": 4.062802719991141e-05, "loss": 0.1253, "step": 6233 }, { "epoch": 4.391418893488897, "grad_norm": 0.16840966045856476, "learning_rate": 4.0613926122101783e-05, "loss": 0.1014, "step": 6234 }, { "epoch": 4.392171622130222, "grad_norm": 0.19362780451774597, "learning_rate": 4.059982661753158e-05, "loss": 0.1295, "step": 6235 }, { "epoch": 4.392924350771547, "grad_norm": 0.1470218151807785, "learning_rate": 4.0585728687742686e-05, "loss": 0.0982, "step": 6236 }, { "epoch": 4.393677079412871, "grad_norm": 0.17439976334571838, "learning_rate": 4.057163233427684e-05, "loss": 0.1093, "step": 6237 }, { "epoch": 4.394429808054197, "grad_norm": 0.18043072521686554, "learning_rate": 4.0557537558675583e-05, "loss": 0.2129, "step": 6238 }, { "epoch": 4.395182536695521, "grad_norm": 0.17388981580734253, "learning_rate": 4.054344436248031e-05, "loss": 0.0963, "step": 6239 }, { "epoch": 4.395935265336846, "grad_norm": 0.17125709354877472, "learning_rate": 4.052935274723222e-05, "loss": 0.1089, "step": 6240 }, { "epoch": 4.396687993978171, "grad_norm": 0.19145222008228302, "learning_rate": 4.0515262714472336e-05, "loss": 0.1245, "step": 6241 }, { "epoch": 4.397440722619495, "grad_norm": 0.16442568600177765, "learning_rate": 4.050117426574157e-05, "loss": 0.0706, "step": 6242 }, { "epoch": 4.398193451260821, "grad_norm": 0.1726178526878357, "learning_rate": 4.048708740258059e-05, "loss": 0.1073, "step": 6243 }, { "epoch": 4.398946179902145, "grad_norm": 0.14921444654464722, "learning_rate": 4.04730021265299e-05, "loss": 0.1099, "step": 6244 }, { "epoch": 4.39969890854347, "grad_norm": 0.17558614909648895, "learning_rate": 4.0458918439129855e-05, "loss": 0.109, "step": 6245 }, { "epoch": 4.400451637184795, "grad_norm": 0.13440091907978058, "learning_rate": 4.0444836341920646e-05, "loss": 0.0853, "step": 6246 }, { "epoch": 4.4012043658261195, "grad_norm": 0.18047575652599335, "learning_rate": 4.043075583644223e-05, "loss": 0.1143, "step": 6247 }, { "epoch": 4.401957094467445, "grad_norm": 0.19202980399131775, "learning_rate": 4.041667692423447e-05, "loss": 0.1847, "step": 6248 }, { "epoch": 4.402709823108769, "grad_norm": 0.16258330643177032, "learning_rate": 4.0402599606836964e-05, "loss": 0.1084, "step": 6249 }, { "epoch": 4.403462551750094, "grad_norm": 0.1701095700263977, "learning_rate": 4.0388523885789256e-05, "loss": 0.1072, "step": 6250 }, { "epoch": 4.404215280391419, "grad_norm": 0.15820379555225372, "learning_rate": 4.0374449762630585e-05, "loss": 0.1118, "step": 6251 }, { "epoch": 4.404968009032744, "grad_norm": 0.21457776427268982, "learning_rate": 4.0360377238900104e-05, "loss": 0.2349, "step": 6252 }, { "epoch": 4.405720737674068, "grad_norm": 0.1544119268655777, "learning_rate": 4.034630631613675e-05, "loss": 0.0867, "step": 6253 }, { "epoch": 4.4064734663153935, "grad_norm": 0.1768496185541153, "learning_rate": 4.0332236995879315e-05, "loss": 0.1472, "step": 6254 }, { "epoch": 4.407226194956718, "grad_norm": 0.21047097444534302, "learning_rate": 4.03181692796664e-05, "loss": 0.21, "step": 6255 }, { "epoch": 4.407978923598042, "grad_norm": 0.18687081336975098, "learning_rate": 4.03041031690364e-05, "loss": 0.157, "step": 6256 }, { "epoch": 4.408731652239368, "grad_norm": 0.18857231736183167, "learning_rate": 4.0290038665527596e-05, "loss": 0.1551, "step": 6257 }, { "epoch": 4.409484380880692, "grad_norm": 0.18036708235740662, "learning_rate": 4.027597577067804e-05, "loss": 0.1159, "step": 6258 }, { "epoch": 4.410237109522018, "grad_norm": 0.16034428775310516, "learning_rate": 4.026191448602564e-05, "loss": 0.1413, "step": 6259 }, { "epoch": 4.410989838163342, "grad_norm": 0.19168175756931305, "learning_rate": 4.02478548131081e-05, "loss": 0.1801, "step": 6260 }, { "epoch": 4.411742566804667, "grad_norm": 0.14244700968265533, "learning_rate": 4.0233796753463e-05, "loss": 0.1487, "step": 6261 }, { "epoch": 4.412495295445992, "grad_norm": 0.17857100069522858, "learning_rate": 4.0219740308627676e-05, "loss": 0.1611, "step": 6262 }, { "epoch": 4.413248024087316, "grad_norm": 0.15734249353408813, "learning_rate": 4.020568548013933e-05, "loss": 0.1397, "step": 6263 }, { "epoch": 4.414000752728642, "grad_norm": 0.1574215441942215, "learning_rate": 4.019163226953498e-05, "loss": 0.1434, "step": 6264 }, { "epoch": 4.414753481369966, "grad_norm": 0.18428672850131989, "learning_rate": 4.017758067835147e-05, "loss": 0.1735, "step": 6265 }, { "epoch": 4.415506210011291, "grad_norm": 0.15696342289447784, "learning_rate": 4.016353070812542e-05, "loss": 0.1041, "step": 6266 }, { "epoch": 4.416258938652616, "grad_norm": 0.1708754450082779, "learning_rate": 4.014948236039337e-05, "loss": 0.1083, "step": 6267 }, { "epoch": 4.417011667293941, "grad_norm": 0.1652987152338028, "learning_rate": 4.01354356366916e-05, "loss": 0.1186, "step": 6268 }, { "epoch": 4.417764395935265, "grad_norm": 0.1691039353609085, "learning_rate": 4.012139053855624e-05, "loss": 0.1494, "step": 6269 }, { "epoch": 4.41851712457659, "grad_norm": 0.1524382382631302, "learning_rate": 4.010734706752323e-05, "loss": 0.1252, "step": 6270 }, { "epoch": 4.419269853217915, "grad_norm": 0.16020861268043518, "learning_rate": 4.0093305225128365e-05, "loss": 0.1216, "step": 6271 }, { "epoch": 4.420022581859239, "grad_norm": 0.1702273190021515, "learning_rate": 4.007926501290722e-05, "loss": 0.0857, "step": 6272 }, { "epoch": 4.420775310500565, "grad_norm": 0.16321787238121033, "learning_rate": 4.006522643239523e-05, "loss": 0.1322, "step": 6273 }, { "epoch": 4.421528039141889, "grad_norm": 0.16147352755069733, "learning_rate": 4.0051189485127614e-05, "loss": 0.1782, "step": 6274 }, { "epoch": 4.4222807677832146, "grad_norm": 0.14993469417095184, "learning_rate": 4.003715417263945e-05, "loss": 0.0739, "step": 6275 }, { "epoch": 4.423033496424539, "grad_norm": 0.18490011990070343, "learning_rate": 4.00231204964656e-05, "loss": 0.1715, "step": 6276 }, { "epoch": 4.4237862250658635, "grad_norm": 0.15404506027698517, "learning_rate": 4.00090884581408e-05, "loss": 0.1696, "step": 6277 }, { "epoch": 4.424538953707189, "grad_norm": 0.18066783249378204, "learning_rate": 3.999505805919952e-05, "loss": 0.2048, "step": 6278 }, { "epoch": 4.425291682348513, "grad_norm": 0.1843128502368927, "learning_rate": 3.998102930117613e-05, "loss": 0.1681, "step": 6279 }, { "epoch": 4.426044410989838, "grad_norm": 0.15153959393501282, "learning_rate": 3.9967002185604805e-05, "loss": 0.0898, "step": 6280 }, { "epoch": 4.426797139631163, "grad_norm": 0.17098450660705566, "learning_rate": 3.995297671401953e-05, "loss": 0.1198, "step": 6281 }, { "epoch": 4.427549868272488, "grad_norm": 0.16850391030311584, "learning_rate": 3.9938952887954085e-05, "loss": 0.1357, "step": 6282 }, { "epoch": 4.428302596913813, "grad_norm": 0.1466527283191681, "learning_rate": 3.992493070894211e-05, "loss": 0.1199, "step": 6283 }, { "epoch": 4.4290553255551375, "grad_norm": 0.1899995356798172, "learning_rate": 3.9910910178517044e-05, "loss": 0.1338, "step": 6284 }, { "epoch": 4.429808054196462, "grad_norm": 0.17737463116645813, "learning_rate": 3.989689129821215e-05, "loss": 0.1421, "step": 6285 }, { "epoch": 4.430560782837787, "grad_norm": 0.15661688148975372, "learning_rate": 3.988287406956053e-05, "loss": 0.1583, "step": 6286 }, { "epoch": 4.431313511479112, "grad_norm": 0.16209976375102997, "learning_rate": 3.986885849409507e-05, "loss": 0.1324, "step": 6287 }, { "epoch": 4.432066240120436, "grad_norm": 0.18701419234275818, "learning_rate": 3.9854844573348495e-05, "loss": 0.1294, "step": 6288 }, { "epoch": 4.432818968761762, "grad_norm": 0.17546485364437103, "learning_rate": 3.984083230885336e-05, "loss": 0.1244, "step": 6289 }, { "epoch": 4.433571697403086, "grad_norm": 0.17413561046123505, "learning_rate": 3.982682170214199e-05, "loss": 0.1099, "step": 6290 }, { "epoch": 4.434324426044411, "grad_norm": 0.1712554693222046, "learning_rate": 3.98128127547466e-05, "loss": 0.1295, "step": 6291 }, { "epoch": 4.435077154685736, "grad_norm": 0.16138656437397003, "learning_rate": 3.979880546819915e-05, "loss": 0.1052, "step": 6292 }, { "epoch": 4.43582988332706, "grad_norm": 0.1747066080570221, "learning_rate": 3.9784799844031506e-05, "loss": 0.1438, "step": 6293 }, { "epoch": 4.436582611968386, "grad_norm": 0.16948571801185608, "learning_rate": 3.977079588377526e-05, "loss": 0.1184, "step": 6294 }, { "epoch": 4.43733534060971, "grad_norm": 0.1486743539571762, "learning_rate": 3.9756793588961896e-05, "loss": 0.1344, "step": 6295 }, { "epoch": 4.438088069251035, "grad_norm": 0.18574796617031097, "learning_rate": 3.9742792961122654e-05, "loss": 0.1551, "step": 6296 }, { "epoch": 4.43884079789236, "grad_norm": 0.16875390708446503, "learning_rate": 3.972879400178865e-05, "loss": 0.1315, "step": 6297 }, { "epoch": 4.4395935265336846, "grad_norm": 0.18809349834918976, "learning_rate": 3.9714796712490746e-05, "loss": 0.1943, "step": 6298 }, { "epoch": 4.440346255175009, "grad_norm": 0.1742015928030014, "learning_rate": 3.970080109475971e-05, "loss": 0.1526, "step": 6299 }, { "epoch": 4.441098983816334, "grad_norm": 0.1513558030128479, "learning_rate": 3.968680715012606e-05, "loss": 0.0741, "step": 6300 }, { "epoch": 4.441851712457659, "grad_norm": 0.14654704928398132, "learning_rate": 3.967281488012017e-05, "loss": 0.111, "step": 6301 }, { "epoch": 4.442604441098984, "grad_norm": 0.1992480754852295, "learning_rate": 3.965882428627219e-05, "loss": 0.155, "step": 6302 }, { "epoch": 4.443357169740309, "grad_norm": 0.15405592322349548, "learning_rate": 3.964483537011213e-05, "loss": 0.101, "step": 6303 }, { "epoch": 4.444109898381633, "grad_norm": 0.1665678322315216, "learning_rate": 3.963084813316976e-05, "loss": 0.149, "step": 6304 }, { "epoch": 4.4448626270229585, "grad_norm": 0.1837223470211029, "learning_rate": 3.9616862576974754e-05, "loss": 0.1328, "step": 6305 }, { "epoch": 4.445615355664283, "grad_norm": 0.18791106343269348, "learning_rate": 3.9602878703056536e-05, "loss": 0.1128, "step": 6306 }, { "epoch": 4.4463680843056075, "grad_norm": 0.15488508343696594, "learning_rate": 3.958889651294434e-05, "loss": 0.1372, "step": 6307 }, { "epoch": 4.447120812946933, "grad_norm": 0.20371203124523163, "learning_rate": 3.9574916008167264e-05, "loss": 0.1428, "step": 6308 }, { "epoch": 4.447873541588257, "grad_norm": 0.15902793407440186, "learning_rate": 3.9560937190254175e-05, "loss": 0.1306, "step": 6309 }, { "epoch": 4.448626270229582, "grad_norm": 0.14277005195617676, "learning_rate": 3.9546960060733783e-05, "loss": 0.0414, "step": 6310 }, { "epoch": 4.449378998870907, "grad_norm": 0.18986831605434418, "learning_rate": 3.9532984621134585e-05, "loss": 0.1649, "step": 6311 }, { "epoch": 4.450131727512232, "grad_norm": 0.15447701513767242, "learning_rate": 3.951901087298495e-05, "loss": 0.0986, "step": 6312 }, { "epoch": 4.450884456153557, "grad_norm": 0.17034076154232025, "learning_rate": 3.950503881781302e-05, "loss": 0.0933, "step": 6313 }, { "epoch": 4.4516371847948815, "grad_norm": 0.1921168714761734, "learning_rate": 3.949106845714674e-05, "loss": 0.1353, "step": 6314 }, { "epoch": 4.452389913436206, "grad_norm": 0.18449243903160095, "learning_rate": 3.9477099792513894e-05, "loss": 0.2013, "step": 6315 }, { "epoch": 4.453142642077531, "grad_norm": 0.16013969480991364, "learning_rate": 3.946313282544208e-05, "loss": 0.1576, "step": 6316 }, { "epoch": 4.453895370718856, "grad_norm": 0.1955496072769165, "learning_rate": 3.94491675574587e-05, "loss": 0.13, "step": 6317 }, { "epoch": 4.45464809936018, "grad_norm": 0.1542782336473465, "learning_rate": 3.9435203990090984e-05, "loss": 0.0728, "step": 6318 }, { "epoch": 4.455400828001506, "grad_norm": 0.1569531410932541, "learning_rate": 3.942124212486594e-05, "loss": 0.1431, "step": 6319 }, { "epoch": 4.45615355664283, "grad_norm": 0.18324920535087585, "learning_rate": 3.940728196331045e-05, "loss": 0.1609, "step": 6320 }, { "epoch": 4.4569062852841554, "grad_norm": 0.1685354858636856, "learning_rate": 3.9393323506951154e-05, "loss": 0.1049, "step": 6321 }, { "epoch": 4.45765901392548, "grad_norm": 0.18507236242294312, "learning_rate": 3.937936675731454e-05, "loss": 0.1145, "step": 6322 }, { "epoch": 4.458411742566804, "grad_norm": 0.14993806183338165, "learning_rate": 3.936541171592687e-05, "loss": 0.0971, "step": 6323 }, { "epoch": 4.45916447120813, "grad_norm": 0.1611398458480835, "learning_rate": 3.93514583843143e-05, "loss": 0.1468, "step": 6324 }, { "epoch": 4.459917199849454, "grad_norm": 0.1810864806175232, "learning_rate": 3.9337506764002696e-05, "loss": 0.0583, "step": 6325 }, { "epoch": 4.460669928490779, "grad_norm": 0.18146663904190063, "learning_rate": 3.9323556856517816e-05, "loss": 0.1322, "step": 6326 }, { "epoch": 4.461422657132104, "grad_norm": 0.15978772938251495, "learning_rate": 3.930960866338519e-05, "loss": 0.0824, "step": 6327 }, { "epoch": 4.4621753857734285, "grad_norm": 0.20676976442337036, "learning_rate": 3.9295662186130164e-05, "loss": 0.1223, "step": 6328 }, { "epoch": 4.462928114414753, "grad_norm": 0.1529691368341446, "learning_rate": 3.9281717426277916e-05, "loss": 0.0848, "step": 6329 }, { "epoch": 4.463680843056078, "grad_norm": 0.15489567816257477, "learning_rate": 3.92677743853534e-05, "loss": 0.0792, "step": 6330 }, { "epoch": 4.464433571697403, "grad_norm": 0.17041532695293427, "learning_rate": 3.925383306488144e-05, "loss": 0.0879, "step": 6331 }, { "epoch": 4.465186300338728, "grad_norm": 0.15552623569965363, "learning_rate": 3.923989346638661e-05, "loss": 0.1322, "step": 6332 }, { "epoch": 4.465939028980053, "grad_norm": 0.15980295836925507, "learning_rate": 3.922595559139336e-05, "loss": 0.0781, "step": 6333 }, { "epoch": 4.466691757621377, "grad_norm": 0.15516874194145203, "learning_rate": 3.9212019441425876e-05, "loss": 0.1039, "step": 6334 }, { "epoch": 4.4674444862627025, "grad_norm": 0.1594931185245514, "learning_rate": 3.9198085018008204e-05, "loss": 0.0971, "step": 6335 }, { "epoch": 4.468197214904027, "grad_norm": 0.17518064379692078, "learning_rate": 3.918415232266419e-05, "loss": 0.1624, "step": 6336 }, { "epoch": 4.468949943545352, "grad_norm": 0.17374998331069946, "learning_rate": 3.9170221356917516e-05, "loss": 0.1829, "step": 6337 }, { "epoch": 4.469702672186677, "grad_norm": 0.19681109488010406, "learning_rate": 3.915629212229163e-05, "loss": 0.1383, "step": 6338 }, { "epoch": 4.470455400828001, "grad_norm": 0.16157273948192596, "learning_rate": 3.9142364620309815e-05, "loss": 0.1409, "step": 6339 }, { "epoch": 4.471208129469327, "grad_norm": 0.15885214507579803, "learning_rate": 3.912843885249515e-05, "loss": 0.1237, "step": 6340 }, { "epoch": 4.471960858110651, "grad_norm": 0.15861526131629944, "learning_rate": 3.911451482037057e-05, "loss": 0.0686, "step": 6341 }, { "epoch": 4.472713586751976, "grad_norm": 0.16992001235485077, "learning_rate": 3.910059252545876e-05, "loss": 0.1049, "step": 6342 }, { "epoch": 4.473466315393301, "grad_norm": 0.17689234018325806, "learning_rate": 3.908667196928224e-05, "loss": 0.1158, "step": 6343 }, { "epoch": 4.4742190440346254, "grad_norm": 0.14759084582328796, "learning_rate": 3.907275315336335e-05, "loss": 0.1197, "step": 6344 }, { "epoch": 4.47497177267595, "grad_norm": 0.20076313614845276, "learning_rate": 3.9058836079224236e-05, "loss": 0.1324, "step": 6345 }, { "epoch": 4.475724501317275, "grad_norm": 0.20128671824932098, "learning_rate": 3.904492074838684e-05, "loss": 0.147, "step": 6346 }, { "epoch": 4.4764772299586, "grad_norm": 0.13970522582530975, "learning_rate": 3.903100716237292e-05, "loss": 0.122, "step": 6347 }, { "epoch": 4.477229958599925, "grad_norm": 0.14109456539154053, "learning_rate": 3.901709532270406e-05, "loss": 0.1073, "step": 6348 }, { "epoch": 4.47798268724125, "grad_norm": 0.16257700324058533, "learning_rate": 3.900318523090159e-05, "loss": 0.1293, "step": 6349 }, { "epoch": 4.478735415882574, "grad_norm": 0.15215818583965302, "learning_rate": 3.898927688848676e-05, "loss": 0.078, "step": 6350 }, { "epoch": 4.479488144523899, "grad_norm": 0.14492423832416534, "learning_rate": 3.897537029698054e-05, "loss": 0.1172, "step": 6351 }, { "epoch": 4.480240873165224, "grad_norm": 0.1733461320400238, "learning_rate": 3.896146545790372e-05, "loss": 0.1527, "step": 6352 }, { "epoch": 4.480993601806548, "grad_norm": 0.15164172649383545, "learning_rate": 3.894756237277693e-05, "loss": 0.1957, "step": 6353 }, { "epoch": 4.481746330447874, "grad_norm": 0.1475290060043335, "learning_rate": 3.8933661043120594e-05, "loss": 0.11, "step": 6354 }, { "epoch": 4.482499059089198, "grad_norm": 0.18010276556015015, "learning_rate": 3.8919761470454906e-05, "loss": 0.1359, "step": 6355 }, { "epoch": 4.483251787730524, "grad_norm": 0.16956068575382233, "learning_rate": 3.890586365629995e-05, "loss": 0.2074, "step": 6356 }, { "epoch": 4.484004516371848, "grad_norm": 0.15374711155891418, "learning_rate": 3.889196760217554e-05, "loss": 0.1905, "step": 6357 }, { "epoch": 4.4847572450131725, "grad_norm": 0.15983757376670837, "learning_rate": 3.887807330960134e-05, "loss": 0.1621, "step": 6358 }, { "epoch": 4.485509973654498, "grad_norm": 0.1642637550830841, "learning_rate": 3.886418078009679e-05, "loss": 0.1327, "step": 6359 }, { "epoch": 4.486262702295822, "grad_norm": 0.18329736590385437, "learning_rate": 3.885029001518119e-05, "loss": 0.1693, "step": 6360 }, { "epoch": 4.487015430937147, "grad_norm": 0.15813548862934113, "learning_rate": 3.883640101637357e-05, "loss": 0.1356, "step": 6361 }, { "epoch": 4.487768159578472, "grad_norm": 0.15894529223442078, "learning_rate": 3.8822513785192846e-05, "loss": 0.1211, "step": 6362 }, { "epoch": 4.488520888219797, "grad_norm": 0.15210819244384766, "learning_rate": 3.880862832315769e-05, "loss": 0.1166, "step": 6363 }, { "epoch": 4.489273616861121, "grad_norm": 0.16110031306743622, "learning_rate": 3.8794744631786605e-05, "loss": 0.0934, "step": 6364 }, { "epoch": 4.4900263455024465, "grad_norm": 0.1669272631406784, "learning_rate": 3.8780862712597864e-05, "loss": 0.154, "step": 6365 }, { "epoch": 4.490779074143771, "grad_norm": 0.16166914999485016, "learning_rate": 3.876698256710961e-05, "loss": 0.1612, "step": 6366 }, { "epoch": 4.491531802785096, "grad_norm": 0.17576448619365692, "learning_rate": 3.875310419683973e-05, "loss": 0.1349, "step": 6367 }, { "epoch": 4.492284531426421, "grad_norm": 0.1625903993844986, "learning_rate": 3.8739227603305936e-05, "loss": 0.1239, "step": 6368 }, { "epoch": 4.493037260067745, "grad_norm": 0.16623760759830475, "learning_rate": 3.872535278802578e-05, "loss": 0.0917, "step": 6369 }, { "epoch": 4.493789988709071, "grad_norm": 0.18332228064537048, "learning_rate": 3.8711479752516565e-05, "loss": 0.1282, "step": 6370 }, { "epoch": 4.494542717350395, "grad_norm": 0.12292250245809555, "learning_rate": 3.8697608498295445e-05, "loss": 0.0529, "step": 6371 }, { "epoch": 4.49529544599172, "grad_norm": 0.1500219851732254, "learning_rate": 3.8683739026879354e-05, "loss": 0.112, "step": 6372 }, { "epoch": 4.496048174633045, "grad_norm": 0.16113020479679108, "learning_rate": 3.8669871339785025e-05, "loss": 0.1872, "step": 6373 }, { "epoch": 4.496800903274369, "grad_norm": 0.19026319682598114, "learning_rate": 3.8656005438529e-05, "loss": 0.2378, "step": 6374 }, { "epoch": 4.497553631915695, "grad_norm": 0.19603043794631958, "learning_rate": 3.864214132462765e-05, "loss": 0.1508, "step": 6375 }, { "epoch": 4.498306360557019, "grad_norm": 0.13917076587677002, "learning_rate": 3.8628278999597144e-05, "loss": 0.0831, "step": 6376 }, { "epoch": 4.499059089198344, "grad_norm": 0.19345727562904358, "learning_rate": 3.861441846495344e-05, "loss": 0.1253, "step": 6377 }, { "epoch": 4.499811817839669, "grad_norm": 0.16832076013088226, "learning_rate": 3.860055972221228e-05, "loss": 0.087, "step": 6378 }, { "epoch": 4.500564546480994, "grad_norm": 0.15084987878799438, "learning_rate": 3.858670277288927e-05, "loss": 0.1542, "step": 6379 }, { "epoch": 4.501317275122318, "grad_norm": 0.1920982003211975, "learning_rate": 3.857284761849975e-05, "loss": 0.1152, "step": 6380 }, { "epoch": 4.502070003763643, "grad_norm": 0.19474630057811737, "learning_rate": 3.8558994260558913e-05, "loss": 0.183, "step": 6381 }, { "epoch": 4.502822732404968, "grad_norm": 0.15641792118549347, "learning_rate": 3.854514270058176e-05, "loss": 0.2247, "step": 6382 }, { "epoch": 4.503575461046292, "grad_norm": 0.1591908037662506, "learning_rate": 3.853129294008305e-05, "loss": 0.1183, "step": 6383 }, { "epoch": 4.504328189687618, "grad_norm": 0.20498347282409668, "learning_rate": 3.85174449805774e-05, "loss": 0.1361, "step": 6384 }, { "epoch": 4.505080918328942, "grad_norm": 0.1915442943572998, "learning_rate": 3.850359882357918e-05, "loss": 0.139, "step": 6385 }, { "epoch": 4.505833646970268, "grad_norm": 0.1571298986673355, "learning_rate": 3.848975447060258e-05, "loss": 0.109, "step": 6386 }, { "epoch": 4.506586375611592, "grad_norm": 0.15455257892608643, "learning_rate": 3.847591192316161e-05, "loss": 0.1643, "step": 6387 }, { "epoch": 4.5073391042529165, "grad_norm": 0.13280096650123596, "learning_rate": 3.846207118277008e-05, "loss": 0.0456, "step": 6388 }, { "epoch": 4.508091832894242, "grad_norm": 0.16574504971504211, "learning_rate": 3.844823225094157e-05, "loss": 0.1732, "step": 6389 }, { "epoch": 4.508844561535566, "grad_norm": 0.16791215538978577, "learning_rate": 3.843439512918949e-05, "loss": 0.1439, "step": 6390 }, { "epoch": 4.509597290176892, "grad_norm": 0.15531718730926514, "learning_rate": 3.842055981902707e-05, "loss": 0.1747, "step": 6391 }, { "epoch": 4.510350018818216, "grad_norm": 0.18591348826885223, "learning_rate": 3.8406726321967303e-05, "loss": 0.0984, "step": 6392 }, { "epoch": 4.511102747459541, "grad_norm": 0.16994038224220276, "learning_rate": 3.839289463952298e-05, "loss": 0.0779, "step": 6393 }, { "epoch": 4.511855476100866, "grad_norm": 0.19417628645896912, "learning_rate": 3.837906477320673e-05, "loss": 0.1326, "step": 6394 }, { "epoch": 4.5126082047421905, "grad_norm": 0.18982720375061035, "learning_rate": 3.8365236724530985e-05, "loss": 0.2068, "step": 6395 }, { "epoch": 4.513360933383515, "grad_norm": 0.14412803947925568, "learning_rate": 3.835141049500793e-05, "loss": 0.0806, "step": 6396 }, { "epoch": 4.51411366202484, "grad_norm": 0.16490277647972107, "learning_rate": 3.83375860861496e-05, "loss": 0.157, "step": 6397 }, { "epoch": 4.514866390666165, "grad_norm": 0.1673283725976944, "learning_rate": 3.83237634994678e-05, "loss": 0.1187, "step": 6398 }, { "epoch": 4.515619119307489, "grad_norm": 0.15945428609848022, "learning_rate": 3.830994273647416e-05, "loss": 0.1406, "step": 6399 }, { "epoch": 4.516371847948815, "grad_norm": 0.18222138285636902, "learning_rate": 3.829612379868006e-05, "loss": 0.1499, "step": 6400 }, { "epoch": 4.516371847948815, "eval_loss": 0.17409518361091614, "eval_runtime": 456.3165, "eval_samples_per_second": 21.097, "eval_steps_per_second": 0.66, "step": 6400 }, { "epoch": 4.517124576590139, "grad_norm": 0.18639817833900452, "learning_rate": 3.8282306687596783e-05, "loss": 0.1135, "step": 6401 }, { "epoch": 4.517877305231464, "grad_norm": 0.21426062285900116, "learning_rate": 3.82684914047353e-05, "loss": 0.1384, "step": 6402 }, { "epoch": 4.518630033872789, "grad_norm": 0.1621747612953186, "learning_rate": 3.825467795160643e-05, "loss": 0.1243, "step": 6403 }, { "epoch": 4.519382762514113, "grad_norm": 0.16025742888450623, "learning_rate": 3.824086632972082e-05, "loss": 0.129, "step": 6404 }, { "epoch": 4.520135491155439, "grad_norm": 0.15969286859035492, "learning_rate": 3.8227056540588866e-05, "loss": 0.1638, "step": 6405 }, { "epoch": 4.520888219796763, "grad_norm": 0.1596067100763321, "learning_rate": 3.8213248585720776e-05, "loss": 0.1269, "step": 6406 }, { "epoch": 4.521640948438088, "grad_norm": 0.159266859292984, "learning_rate": 3.81994424666266e-05, "loss": 0.0691, "step": 6407 }, { "epoch": 4.522393677079413, "grad_norm": 0.1448194682598114, "learning_rate": 3.818563818481613e-05, "loss": 0.1413, "step": 6408 }, { "epoch": 4.523146405720738, "grad_norm": 0.1738073229789734, "learning_rate": 3.817183574179899e-05, "loss": 0.1725, "step": 6409 }, { "epoch": 4.523899134362063, "grad_norm": 0.1681404858827591, "learning_rate": 3.815803513908459e-05, "loss": 0.1184, "step": 6410 }, { "epoch": 4.524651863003387, "grad_norm": 0.20104022324085236, "learning_rate": 3.8144236378182144e-05, "loss": 0.1586, "step": 6411 }, { "epoch": 4.525404591644712, "grad_norm": 0.16573314368724823, "learning_rate": 3.813043946060068e-05, "loss": 0.1464, "step": 6412 }, { "epoch": 4.526157320286037, "grad_norm": 0.15722380578517914, "learning_rate": 3.811664438784897e-05, "loss": 0.1386, "step": 6413 }, { "epoch": 4.526910048927362, "grad_norm": 0.16850373148918152, "learning_rate": 3.810285116143566e-05, "loss": 0.1475, "step": 6414 }, { "epoch": 4.527662777568686, "grad_norm": 0.20359468460083008, "learning_rate": 3.808905978286913e-05, "loss": 0.2017, "step": 6415 }, { "epoch": 4.5284155062100115, "grad_norm": 0.17804652452468872, "learning_rate": 3.807527025365761e-05, "loss": 0.1504, "step": 6416 }, { "epoch": 4.529168234851336, "grad_norm": 0.18677136301994324, "learning_rate": 3.806148257530909e-05, "loss": 0.1969, "step": 6417 }, { "epoch": 4.5299209634926605, "grad_norm": 0.1459231674671173, "learning_rate": 3.804769674933138e-05, "loss": 0.1222, "step": 6418 }, { "epoch": 4.530673692133986, "grad_norm": 0.17868241667747498, "learning_rate": 3.8033912777232026e-05, "loss": 0.1393, "step": 6419 }, { "epoch": 4.53142642077531, "grad_norm": 0.18075226247310638, "learning_rate": 3.8020130660518495e-05, "loss": 0.2004, "step": 6420 }, { "epoch": 4.532179149416636, "grad_norm": 0.16459451615810394, "learning_rate": 3.800635040069794e-05, "loss": 0.1248, "step": 6421 }, { "epoch": 4.53293187805796, "grad_norm": 0.12919506430625916, "learning_rate": 3.799257199927736e-05, "loss": 0.0948, "step": 6422 }, { "epoch": 4.533684606699285, "grad_norm": 0.16659994423389435, "learning_rate": 3.797879545776353e-05, "loss": 0.101, "step": 6423 }, { "epoch": 4.53443733534061, "grad_norm": 0.1814403086900711, "learning_rate": 3.7965020777663044e-05, "loss": 0.1289, "step": 6424 }, { "epoch": 4.5351900639819345, "grad_norm": 0.1697511076927185, "learning_rate": 3.795124796048225e-05, "loss": 0.1059, "step": 6425 }, { "epoch": 4.535942792623259, "grad_norm": 0.16820725798606873, "learning_rate": 3.793747700772737e-05, "loss": 0.1024, "step": 6426 }, { "epoch": 4.536695521264584, "grad_norm": 0.17847733199596405, "learning_rate": 3.792370792090435e-05, "loss": 0.2081, "step": 6427 }, { "epoch": 4.537448249905909, "grad_norm": 0.1993570476770401, "learning_rate": 3.790994070151895e-05, "loss": 0.2148, "step": 6428 }, { "epoch": 4.538200978547234, "grad_norm": 0.16773416101932526, "learning_rate": 3.7896175351076755e-05, "loss": 0.1018, "step": 6429 }, { "epoch": 4.538953707188559, "grad_norm": 0.16588816046714783, "learning_rate": 3.78824118710831e-05, "loss": 0.1273, "step": 6430 }, { "epoch": 4.539706435829883, "grad_norm": 0.15000009536743164, "learning_rate": 3.786865026304315e-05, "loss": 0.1309, "step": 6431 }, { "epoch": 4.5404591644712085, "grad_norm": 0.1854223906993866, "learning_rate": 3.785489052846183e-05, "loss": 0.143, "step": 6432 }, { "epoch": 4.541211893112533, "grad_norm": 0.1418498307466507, "learning_rate": 3.784113266884393e-05, "loss": 0.0529, "step": 6433 }, { "epoch": 4.541964621753857, "grad_norm": 0.15955212712287903, "learning_rate": 3.782737668569395e-05, "loss": 0.1384, "step": 6434 }, { "epoch": 4.542717350395183, "grad_norm": 0.16215479373931885, "learning_rate": 3.781362258051624e-05, "loss": 0.1709, "step": 6435 }, { "epoch": 4.543470079036507, "grad_norm": 0.15688370168209076, "learning_rate": 3.779987035481493e-05, "loss": 0.1529, "step": 6436 }, { "epoch": 4.544222807677832, "grad_norm": 0.17807821929454803, "learning_rate": 3.778612001009393e-05, "loss": 0.18, "step": 6437 }, { "epoch": 4.544975536319157, "grad_norm": 0.16640415787696838, "learning_rate": 3.777237154785696e-05, "loss": 0.1534, "step": 6438 }, { "epoch": 4.5457282649604815, "grad_norm": 0.2162265032529831, "learning_rate": 3.7758624969607546e-05, "loss": 0.1004, "step": 6439 }, { "epoch": 4.546480993601807, "grad_norm": 0.18404938280582428, "learning_rate": 3.774488027684898e-05, "loss": 0.1769, "step": 6440 }, { "epoch": 4.547233722243131, "grad_norm": 0.16341856122016907, "learning_rate": 3.773113747108436e-05, "loss": 0.1092, "step": 6441 }, { "epoch": 4.547986450884456, "grad_norm": 0.1555493324995041, "learning_rate": 3.77173965538166e-05, "loss": 0.1001, "step": 6442 }, { "epoch": 4.548739179525781, "grad_norm": 0.17604370415210724, "learning_rate": 3.770365752654836e-05, "loss": 0.1696, "step": 6443 }, { "epoch": 4.549491908167106, "grad_norm": 0.1799277514219284, "learning_rate": 3.768992039078212e-05, "loss": 0.1348, "step": 6444 }, { "epoch": 4.550244636808431, "grad_norm": 0.16205424070358276, "learning_rate": 3.767618514802016e-05, "loss": 0.1249, "step": 6445 }, { "epoch": 4.5509973654497555, "grad_norm": 0.1711907535791397, "learning_rate": 3.766245179976455e-05, "loss": 0.0774, "step": 6446 }, { "epoch": 4.55175009409108, "grad_norm": 0.18459618091583252, "learning_rate": 3.7648720347517166e-05, "loss": 0.1371, "step": 6447 }, { "epoch": 4.552502822732405, "grad_norm": 0.20232070982456207, "learning_rate": 3.7634990792779613e-05, "loss": 0.0895, "step": 6448 }, { "epoch": 4.55325555137373, "grad_norm": 0.18904294073581696, "learning_rate": 3.762126313705338e-05, "loss": 0.2197, "step": 6449 }, { "epoch": 4.554008280015054, "grad_norm": 0.2094869166612625, "learning_rate": 3.7607537381839664e-05, "loss": 0.1628, "step": 6450 }, { "epoch": 4.55476100865638, "grad_norm": 0.15155641734600067, "learning_rate": 3.75938135286395e-05, "loss": 0.0825, "step": 6451 }, { "epoch": 4.555513737297704, "grad_norm": 0.14527134597301483, "learning_rate": 3.758009157895373e-05, "loss": 0.0888, "step": 6452 }, { "epoch": 4.556266465939029, "grad_norm": 0.16387426853179932, "learning_rate": 3.756637153428296e-05, "loss": 0.0648, "step": 6453 }, { "epoch": 4.557019194580354, "grad_norm": 0.1759496033191681, "learning_rate": 3.755265339612759e-05, "loss": 0.104, "step": 6454 }, { "epoch": 4.5577719232216785, "grad_norm": 0.17583394050598145, "learning_rate": 3.753893716598781e-05, "loss": 0.1333, "step": 6455 }, { "epoch": 4.558524651863003, "grad_norm": 0.19770587980747223, "learning_rate": 3.752522284536362e-05, "loss": 0.1669, "step": 6456 }, { "epoch": 4.559277380504328, "grad_norm": 0.17465820908546448, "learning_rate": 3.751151043575475e-05, "loss": 0.1264, "step": 6457 }, { "epoch": 4.560030109145653, "grad_norm": 0.1708873063325882, "learning_rate": 3.749779993866083e-05, "loss": 0.1315, "step": 6458 }, { "epoch": 4.560782837786978, "grad_norm": 0.16422630846500397, "learning_rate": 3.74840913555812e-05, "loss": 0.1245, "step": 6459 }, { "epoch": 4.561535566428303, "grad_norm": 0.15810665488243103, "learning_rate": 3.7470384688015e-05, "loss": 0.1443, "step": 6460 }, { "epoch": 4.562288295069627, "grad_norm": 0.17115771770477295, "learning_rate": 3.745667993746117e-05, "loss": 0.1548, "step": 6461 }, { "epoch": 4.563041023710952, "grad_norm": 0.14902448654174805, "learning_rate": 3.744297710541845e-05, "loss": 0.1363, "step": 6462 }, { "epoch": 4.563793752352277, "grad_norm": 0.19665050506591797, "learning_rate": 3.742927619338535e-05, "loss": 0.2295, "step": 6463 }, { "epoch": 4.564546480993602, "grad_norm": 0.12589101493358612, "learning_rate": 3.7415577202860195e-05, "loss": 0.0686, "step": 6464 }, { "epoch": 4.565299209634927, "grad_norm": 0.18403322994709015, "learning_rate": 3.740188013534107e-05, "loss": 0.1501, "step": 6465 }, { "epoch": 4.566051938276251, "grad_norm": 0.16448698937892914, "learning_rate": 3.738818499232589e-05, "loss": 0.1524, "step": 6466 }, { "epoch": 4.566804666917577, "grad_norm": 0.15690690279006958, "learning_rate": 3.7374491775312324e-05, "loss": 0.0905, "step": 6467 }, { "epoch": 4.567557395558901, "grad_norm": 0.1692558079957962, "learning_rate": 3.7360800485797834e-05, "loss": 0.1327, "step": 6468 }, { "epoch": 4.5683101242002255, "grad_norm": 0.16621828079223633, "learning_rate": 3.734711112527968e-05, "loss": 0.122, "step": 6469 }, { "epoch": 4.569062852841551, "grad_norm": 0.18231721222400665, "learning_rate": 3.733342369525489e-05, "loss": 0.1717, "step": 6470 }, { "epoch": 4.569815581482875, "grad_norm": 0.1632997840642929, "learning_rate": 3.7319738197220344e-05, "loss": 0.1458, "step": 6471 }, { "epoch": 4.5705683101242, "grad_norm": 0.1639719158411026, "learning_rate": 3.730605463267265e-05, "loss": 0.1825, "step": 6472 }, { "epoch": 4.571321038765525, "grad_norm": 0.15792813897132874, "learning_rate": 3.7292373003108226e-05, "loss": 0.0802, "step": 6473 }, { "epoch": 4.57207376740685, "grad_norm": 0.14618724584579468, "learning_rate": 3.727869331002325e-05, "loss": 0.1656, "step": 6474 }, { "epoch": 4.572826496048174, "grad_norm": 0.19831261038780212, "learning_rate": 3.726501555491374e-05, "loss": 0.1333, "step": 6475 }, { "epoch": 4.5735792246894995, "grad_norm": 0.16459906101226807, "learning_rate": 3.725133973927545e-05, "loss": 0.1887, "step": 6476 }, { "epoch": 4.574331953330824, "grad_norm": 0.15730559825897217, "learning_rate": 3.723766586460398e-05, "loss": 0.1834, "step": 6477 }, { "epoch": 4.575084681972149, "grad_norm": 0.14457641541957855, "learning_rate": 3.722399393239465e-05, "loss": 0.1015, "step": 6478 }, { "epoch": 4.575837410613474, "grad_norm": 0.158308744430542, "learning_rate": 3.721032394414263e-05, "loss": 0.1114, "step": 6479 }, { "epoch": 4.576590139254798, "grad_norm": 0.1653943955898285, "learning_rate": 3.7196655901342836e-05, "loss": 0.1634, "step": 6480 }, { "epoch": 4.577342867896124, "grad_norm": 0.1967770755290985, "learning_rate": 3.718298980548999e-05, "loss": 0.1317, "step": 6481 }, { "epoch": 4.578095596537448, "grad_norm": 0.16946561634540558, "learning_rate": 3.716932565807858e-05, "loss": 0.1441, "step": 6482 }, { "epoch": 4.5788483251787735, "grad_norm": 0.1476416438817978, "learning_rate": 3.7155663460602896e-05, "loss": 0.1151, "step": 6483 }, { "epoch": 4.579601053820098, "grad_norm": 0.16012245416641235, "learning_rate": 3.714200321455705e-05, "loss": 0.16, "step": 6484 }, { "epoch": 4.580353782461422, "grad_norm": 0.14770372211933136, "learning_rate": 3.712834492143488e-05, "loss": 0.1201, "step": 6485 }, { "epoch": 4.581106511102748, "grad_norm": 0.17550547420978546, "learning_rate": 3.711468858273003e-05, "loss": 0.1131, "step": 6486 }, { "epoch": 4.581859239744072, "grad_norm": 0.15985333919525146, "learning_rate": 3.7101034199935956e-05, "loss": 0.2017, "step": 6487 }, { "epoch": 4.582611968385397, "grad_norm": 0.16035400331020355, "learning_rate": 3.708738177454587e-05, "loss": 0.1145, "step": 6488 }, { "epoch": 4.583364697026722, "grad_norm": 0.1562260538339615, "learning_rate": 3.7073731308052786e-05, "loss": 0.1524, "step": 6489 }, { "epoch": 4.584117425668047, "grad_norm": 0.17214572429656982, "learning_rate": 3.706008280194948e-05, "loss": 0.1241, "step": 6490 }, { "epoch": 4.584870154309371, "grad_norm": 0.15797831118106842, "learning_rate": 3.7046436257728555e-05, "loss": 0.2481, "step": 6491 }, { "epoch": 4.585622882950696, "grad_norm": 0.16617341339588165, "learning_rate": 3.703279167688237e-05, "loss": 0.0728, "step": 6492 }, { "epoch": 4.586375611592021, "grad_norm": 0.16351985931396484, "learning_rate": 3.701914906090307e-05, "loss": 0.1099, "step": 6493 }, { "epoch": 4.587128340233346, "grad_norm": 0.1507348120212555, "learning_rate": 3.7005508411282606e-05, "loss": 0.105, "step": 6494 }, { "epoch": 4.587881068874671, "grad_norm": 0.18160709738731384, "learning_rate": 3.6991869729512665e-05, "loss": 0.1647, "step": 6495 }, { "epoch": 4.588633797515995, "grad_norm": 0.16674502193927765, "learning_rate": 3.69782330170848e-05, "loss": 0.1146, "step": 6496 }, { "epoch": 4.589386526157321, "grad_norm": 0.23277825117111206, "learning_rate": 3.696459827549026e-05, "loss": 0.1492, "step": 6497 }, { "epoch": 4.590139254798645, "grad_norm": 0.17874902486801147, "learning_rate": 3.695096550622015e-05, "loss": 0.1096, "step": 6498 }, { "epoch": 4.5908919834399695, "grad_norm": 0.14544832706451416, "learning_rate": 3.6937334710765305e-05, "loss": 0.1053, "step": 6499 }, { "epoch": 4.591644712081295, "grad_norm": 0.16598199307918549, "learning_rate": 3.692370589061639e-05, "loss": 0.1275, "step": 6500 }, { "epoch": 4.592397440722619, "grad_norm": 0.18802545964717865, "learning_rate": 3.6910079047263815e-05, "loss": 0.135, "step": 6501 }, { "epoch": 4.593150169363945, "grad_norm": 0.1659085601568222, "learning_rate": 3.689645418219778e-05, "loss": 0.1775, "step": 6502 }, { "epoch": 4.593902898005269, "grad_norm": 0.17456702888011932, "learning_rate": 3.6882831296908314e-05, "loss": 0.1938, "step": 6503 }, { "epoch": 4.594655626646594, "grad_norm": 0.15545697510242462, "learning_rate": 3.686921039288519e-05, "loss": 0.1607, "step": 6504 }, { "epoch": 4.595408355287919, "grad_norm": 0.18547891080379486, "learning_rate": 3.6855591471617945e-05, "loss": 0.1742, "step": 6505 }, { "epoch": 4.5961610839292435, "grad_norm": 0.17092540860176086, "learning_rate": 3.684197453459595e-05, "loss": 0.1859, "step": 6506 }, { "epoch": 4.596913812570568, "grad_norm": 0.16151289641857147, "learning_rate": 3.68283595833083e-05, "loss": 0.133, "step": 6507 }, { "epoch": 4.597666541211893, "grad_norm": 0.25049710273742676, "learning_rate": 3.6814746619243925e-05, "loss": 0.1228, "step": 6508 }, { "epoch": 4.598419269853218, "grad_norm": 0.16220520436763763, "learning_rate": 3.680113564389153e-05, "loss": 0.163, "step": 6509 }, { "epoch": 4.599171998494542, "grad_norm": 0.16789549589157104, "learning_rate": 3.678752665873958e-05, "loss": 0.0779, "step": 6510 }, { "epoch": 4.599924727135868, "grad_norm": 0.17461107671260834, "learning_rate": 3.677391966527632e-05, "loss": 0.1817, "step": 6511 }, { "epoch": 4.600677455777192, "grad_norm": 0.1611729860305786, "learning_rate": 3.6760314664989825e-05, "loss": 0.1758, "step": 6512 }, { "epoch": 4.6014301844185175, "grad_norm": 0.1706939935684204, "learning_rate": 3.674671165936788e-05, "loss": 0.14, "step": 6513 }, { "epoch": 4.602182913059842, "grad_norm": 0.16242654621601105, "learning_rate": 3.673311064989811e-05, "loss": 0.0947, "step": 6514 }, { "epoch": 4.602935641701166, "grad_norm": 0.16074107587337494, "learning_rate": 3.671951163806788e-05, "loss": 0.1317, "step": 6515 }, { "epoch": 4.603688370342492, "grad_norm": 0.1858910471200943, "learning_rate": 3.670591462536439e-05, "loss": 0.1107, "step": 6516 }, { "epoch": 4.604441098983816, "grad_norm": 0.1600446254014969, "learning_rate": 3.6692319613274574e-05, "loss": 0.0879, "step": 6517 }, { "epoch": 4.605193827625142, "grad_norm": 0.14218397438526154, "learning_rate": 3.667872660328515e-05, "loss": 0.0873, "step": 6518 }, { "epoch": 4.605946556266466, "grad_norm": 0.1578713208436966, "learning_rate": 3.666513559688265e-05, "loss": 0.0873, "step": 6519 }, { "epoch": 4.606699284907791, "grad_norm": 0.1834711730480194, "learning_rate": 3.665154659555336e-05, "loss": 0.1196, "step": 6520 }, { "epoch": 4.607452013549116, "grad_norm": 0.1631365418434143, "learning_rate": 3.663795960078331e-05, "loss": 0.0626, "step": 6521 }, { "epoch": 4.60820474219044, "grad_norm": 0.1700088083744049, "learning_rate": 3.662437461405842e-05, "loss": 0.1765, "step": 6522 }, { "epoch": 4.608957470831765, "grad_norm": 0.168487548828125, "learning_rate": 3.661079163686431e-05, "loss": 0.0706, "step": 6523 }, { "epoch": 4.60971019947309, "grad_norm": 0.16326437890529633, "learning_rate": 3.6597210670686356e-05, "loss": 0.1355, "step": 6524 }, { "epoch": 4.610462928114415, "grad_norm": 0.14800409972667694, "learning_rate": 3.658363171700978e-05, "loss": 0.1044, "step": 6525 }, { "epoch": 4.611215656755739, "grad_norm": 0.19617974758148193, "learning_rate": 3.657005477731955e-05, "loss": 0.2181, "step": 6526 }, { "epoch": 4.6119683853970646, "grad_norm": 0.1762249916791916, "learning_rate": 3.655647985310041e-05, "loss": 0.1442, "step": 6527 }, { "epoch": 4.612721114038389, "grad_norm": 0.17995774745941162, "learning_rate": 3.6542906945836916e-05, "loss": 0.1253, "step": 6528 }, { "epoch": 4.6134738426797135, "grad_norm": 0.1554146260023117, "learning_rate": 3.652933605701336e-05, "loss": 0.1058, "step": 6529 }, { "epoch": 4.614226571321039, "grad_norm": 0.15418271720409393, "learning_rate": 3.651576718811387e-05, "loss": 0.1143, "step": 6530 }, { "epoch": 4.614979299962363, "grad_norm": 0.14562806487083435, "learning_rate": 3.650220034062226e-05, "loss": 0.0858, "step": 6531 }, { "epoch": 4.615732028603689, "grad_norm": 0.16806122660636902, "learning_rate": 3.648863551602222e-05, "loss": 0.1125, "step": 6532 }, { "epoch": 4.616484757245013, "grad_norm": 0.17794087529182434, "learning_rate": 3.647507271579717e-05, "loss": 0.0759, "step": 6533 }, { "epoch": 4.617237485886338, "grad_norm": 0.16646616160869598, "learning_rate": 3.6461511941430304e-05, "loss": 0.1101, "step": 6534 }, { "epoch": 4.617990214527663, "grad_norm": 0.15984906256198883, "learning_rate": 3.644795319440464e-05, "loss": 0.1009, "step": 6535 }, { "epoch": 4.6187429431689875, "grad_norm": 0.15753233432769775, "learning_rate": 3.6434396476202906e-05, "loss": 0.086, "step": 6536 }, { "epoch": 4.619495671810313, "grad_norm": 0.14180032908916473, "learning_rate": 3.6420841788307666e-05, "loss": 0.0366, "step": 6537 }, { "epoch": 4.620248400451637, "grad_norm": 0.16200049221515656, "learning_rate": 3.640728913220124e-05, "loss": 0.1121, "step": 6538 }, { "epoch": 4.621001129092962, "grad_norm": 0.18272805213928223, "learning_rate": 3.639373850936572e-05, "loss": 0.1239, "step": 6539 }, { "epoch": 4.621753857734287, "grad_norm": 0.1687462478876114, "learning_rate": 3.638018992128296e-05, "loss": 0.1317, "step": 6540 }, { "epoch": 4.622506586375612, "grad_norm": 0.19056852161884308, "learning_rate": 3.636664336943466e-05, "loss": 0.0894, "step": 6541 }, { "epoch": 4.623259315016936, "grad_norm": 0.1497277468442917, "learning_rate": 3.6353098855302215e-05, "loss": 0.0717, "step": 6542 }, { "epoch": 4.6240120436582615, "grad_norm": 0.1691354513168335, "learning_rate": 3.633955638036686e-05, "loss": 0.1118, "step": 6543 }, { "epoch": 4.624764772299586, "grad_norm": 0.16049782931804657, "learning_rate": 3.632601594610955e-05, "loss": 0.1366, "step": 6544 }, { "epoch": 4.62551750094091, "grad_norm": 0.17691722512245178, "learning_rate": 3.631247755401107e-05, "loss": 0.129, "step": 6545 }, { "epoch": 4.626270229582236, "grad_norm": 0.16645075380802155, "learning_rate": 3.6298941205551916e-05, "loss": 0.1296, "step": 6546 }, { "epoch": 4.62702295822356, "grad_norm": 0.18845289945602417, "learning_rate": 3.628540690221247e-05, "loss": 0.1577, "step": 6547 }, { "epoch": 4.627775686864885, "grad_norm": 0.16311414539813995, "learning_rate": 3.627187464547278e-05, "loss": 0.0757, "step": 6548 }, { "epoch": 4.62852841550621, "grad_norm": 0.1908630132675171, "learning_rate": 3.625834443681272e-05, "loss": 0.2008, "step": 6549 }, { "epoch": 4.6292811441475346, "grad_norm": 0.18150943517684937, "learning_rate": 3.624481627771195e-05, "loss": 0.1721, "step": 6550 }, { "epoch": 4.63003387278886, "grad_norm": 0.16222216188907623, "learning_rate": 3.623129016964984e-05, "loss": 0.0801, "step": 6551 }, { "epoch": 4.630786601430184, "grad_norm": 0.13582973182201385, "learning_rate": 3.6217766114105656e-05, "loss": 0.0656, "step": 6552 }, { "epoch": 4.631539330071509, "grad_norm": 0.18268728256225586, "learning_rate": 3.620424411255828e-05, "loss": 0.1618, "step": 6553 }, { "epoch": 4.632292058712834, "grad_norm": 0.18054677546024323, "learning_rate": 3.619072416648657e-05, "loss": 0.1282, "step": 6554 }, { "epoch": 4.633044787354159, "grad_norm": 0.15170659124851227, "learning_rate": 3.617720627736895e-05, "loss": 0.0862, "step": 6555 }, { "epoch": 4.633797515995484, "grad_norm": 0.200566366314888, "learning_rate": 3.616369044668376e-05, "loss": 0.1987, "step": 6556 }, { "epoch": 4.6345502446368085, "grad_norm": 0.1651342511177063, "learning_rate": 3.6150176675909055e-05, "loss": 0.1373, "step": 6557 }, { "epoch": 4.635302973278133, "grad_norm": 0.1632736474275589, "learning_rate": 3.613666496652272e-05, "loss": 0.0728, "step": 6558 }, { "epoch": 4.636055701919458, "grad_norm": 0.17578265070915222, "learning_rate": 3.612315532000229e-05, "loss": 0.1346, "step": 6559 }, { "epoch": 4.636808430560783, "grad_norm": 0.16155420243740082, "learning_rate": 3.6109647737825246e-05, "loss": 0.1611, "step": 6560 }, { "epoch": 4.637561159202107, "grad_norm": 0.17195823788642883, "learning_rate": 3.609614222146872e-05, "loss": 0.1819, "step": 6561 }, { "epoch": 4.638313887843433, "grad_norm": 0.162298321723938, "learning_rate": 3.608263877240967e-05, "loss": 0.1685, "step": 6562 }, { "epoch": 4.639066616484757, "grad_norm": 0.17705513536930084, "learning_rate": 3.606913739212479e-05, "loss": 0.0964, "step": 6563 }, { "epoch": 4.639819345126082, "grad_norm": 0.16987718641757965, "learning_rate": 3.605563808209059e-05, "loss": 0.11, "step": 6564 }, { "epoch": 4.640572073767407, "grad_norm": 0.20167234539985657, "learning_rate": 3.60421408437833e-05, "loss": 0.129, "step": 6565 }, { "epoch": 4.6413248024087315, "grad_norm": 0.1941881626844406, "learning_rate": 3.6028645678679015e-05, "loss": 0.1666, "step": 6566 }, { "epoch": 4.642077531050057, "grad_norm": 0.17669524252414703, "learning_rate": 3.6015152588253495e-05, "loss": 0.1543, "step": 6567 }, { "epoch": 4.642830259691381, "grad_norm": 0.1527896523475647, "learning_rate": 3.600166157398237e-05, "loss": 0.1481, "step": 6568 }, { "epoch": 4.643582988332706, "grad_norm": 0.172234907746315, "learning_rate": 3.5988172637340955e-05, "loss": 0.0789, "step": 6569 }, { "epoch": 4.644335716974031, "grad_norm": 0.16336730122566223, "learning_rate": 3.5974685779804395e-05, "loss": 0.134, "step": 6570 }, { "epoch": 4.645088445615356, "grad_norm": 0.17566755414009094, "learning_rate": 3.596120100284761e-05, "loss": 0.1716, "step": 6571 }, { "epoch": 4.64584117425668, "grad_norm": 0.14104314148426056, "learning_rate": 3.594771830794524e-05, "loss": 0.1299, "step": 6572 }, { "epoch": 4.6465939028980054, "grad_norm": 0.1745671182870865, "learning_rate": 3.593423769657175e-05, "loss": 0.1096, "step": 6573 }, { "epoch": 4.64734663153933, "grad_norm": 0.15358677506446838, "learning_rate": 3.592075917020137e-05, "loss": 0.0748, "step": 6574 }, { "epoch": 4.648099360180655, "grad_norm": 0.13389921188354492, "learning_rate": 3.590728273030809e-05, "loss": 0.0876, "step": 6575 }, { "epoch": 4.64885208882198, "grad_norm": 0.17862167954444885, "learning_rate": 3.589380837836565e-05, "loss": 0.1479, "step": 6576 }, { "epoch": 4.649604817463304, "grad_norm": 0.20082318782806396, "learning_rate": 3.588033611584763e-05, "loss": 0.1457, "step": 6577 }, { "epoch": 4.65035754610463, "grad_norm": 0.16960519552230835, "learning_rate": 3.5866865944227245e-05, "loss": 0.1519, "step": 6578 }, { "epoch": 4.651110274745954, "grad_norm": 0.1423761397600174, "learning_rate": 3.585339786497769e-05, "loss": 0.1193, "step": 6579 }, { "epoch": 4.6518630033872785, "grad_norm": 0.17330721020698547, "learning_rate": 3.583993187957173e-05, "loss": 0.1405, "step": 6580 }, { "epoch": 4.652615732028604, "grad_norm": 0.19288718700408936, "learning_rate": 3.582646798948204e-05, "loss": 0.1462, "step": 6581 }, { "epoch": 4.653368460669928, "grad_norm": 0.18783973157405853, "learning_rate": 3.5813006196180956e-05, "loss": 0.1277, "step": 6582 }, { "epoch": 4.654121189311253, "grad_norm": 0.15505287051200867, "learning_rate": 3.579954650114068e-05, "loss": 0.1494, "step": 6583 }, { "epoch": 4.654873917952578, "grad_norm": 0.1603105366230011, "learning_rate": 3.5786088905833114e-05, "loss": 0.1162, "step": 6584 }, { "epoch": 4.655626646593903, "grad_norm": 0.14555256068706512, "learning_rate": 3.5772633411730004e-05, "loss": 0.0725, "step": 6585 }, { "epoch": 4.656379375235228, "grad_norm": 0.16142232716083527, "learning_rate": 3.575918002030278e-05, "loss": 0.1344, "step": 6586 }, { "epoch": 4.6571321038765525, "grad_norm": 0.13907206058502197, "learning_rate": 3.574572873302271e-05, "loss": 0.095, "step": 6587 }, { "epoch": 4.657884832517877, "grad_norm": 0.1534367799758911, "learning_rate": 3.5732279551360795e-05, "loss": 0.0895, "step": 6588 }, { "epoch": 4.658637561159202, "grad_norm": 0.18257318437099457, "learning_rate": 3.571883247678782e-05, "loss": 0.134, "step": 6589 }, { "epoch": 4.659390289800527, "grad_norm": 0.1841505765914917, "learning_rate": 3.5705387510774335e-05, "loss": 0.1657, "step": 6590 }, { "epoch": 4.660143018441852, "grad_norm": 0.15747769176959991, "learning_rate": 3.569194465479065e-05, "loss": 0.1726, "step": 6591 }, { "epoch": 4.660895747083177, "grad_norm": 0.16902080178260803, "learning_rate": 3.567850391030687e-05, "loss": 0.1459, "step": 6592 }, { "epoch": 4.661648475724501, "grad_norm": 0.14813607931137085, "learning_rate": 3.5665065278792874e-05, "loss": 0.1462, "step": 6593 }, { "epoch": 4.6624012043658265, "grad_norm": 0.16747474670410156, "learning_rate": 3.5651628761718257e-05, "loss": 0.1151, "step": 6594 }, { "epoch": 4.663153933007151, "grad_norm": 0.17464837431907654, "learning_rate": 3.5638194360552426e-05, "loss": 0.1336, "step": 6595 }, { "epoch": 4.6639066616484754, "grad_norm": 0.16605442762374878, "learning_rate": 3.562476207676457e-05, "loss": 0.2549, "step": 6596 }, { "epoch": 4.664659390289801, "grad_norm": 0.16681843996047974, "learning_rate": 3.5611331911823575e-05, "loss": 0.0986, "step": 6597 }, { "epoch": 4.665412118931125, "grad_norm": 0.17252756655216217, "learning_rate": 3.559790386719818e-05, "loss": 0.1144, "step": 6598 }, { "epoch": 4.66616484757245, "grad_norm": 0.17376203835010529, "learning_rate": 3.5584477944356845e-05, "loss": 0.1673, "step": 6599 }, { "epoch": 4.666917576213775, "grad_norm": 0.18115375936031342, "learning_rate": 3.557105414476782e-05, "loss": 0.0887, "step": 6600 }, { "epoch": 4.666917576213775, "eval_loss": 0.168090358376503, "eval_runtime": 456.5948, "eval_samples_per_second": 21.084, "eval_steps_per_second": 0.659, "step": 6600 }, { "epoch": 4.6676703048551, "grad_norm": 0.15402305126190186, "learning_rate": 3.5557632469899104e-05, "loss": 0.1077, "step": 6601 }, { "epoch": 4.668423033496424, "grad_norm": 0.1576068252325058, "learning_rate": 3.5544212921218475e-05, "loss": 0.1218, "step": 6602 }, { "epoch": 4.669175762137749, "grad_norm": 0.17636942863464355, "learning_rate": 3.553079550019346e-05, "loss": 0.1735, "step": 6603 }, { "epoch": 4.669928490779074, "grad_norm": 0.15867599844932556, "learning_rate": 3.551738020829136e-05, "loss": 0.131, "step": 6604 }, { "epoch": 4.670681219420399, "grad_norm": 0.1753733605146408, "learning_rate": 3.5503967046979294e-05, "loss": 0.1334, "step": 6605 }, { "epoch": 4.671433948061724, "grad_norm": 0.14842575788497925, "learning_rate": 3.549055601772408e-05, "loss": 0.1138, "step": 6606 }, { "epoch": 4.672186676703048, "grad_norm": 0.1712091714143753, "learning_rate": 3.5477147121992316e-05, "loss": 0.1688, "step": 6607 }, { "epoch": 4.672939405344374, "grad_norm": 0.16190020740032196, "learning_rate": 3.54637403612504e-05, "loss": 0.0662, "step": 6608 }, { "epoch": 4.673692133985698, "grad_norm": 0.15109367668628693, "learning_rate": 3.5450335736964456e-05, "loss": 0.0489, "step": 6609 }, { "epoch": 4.674444862627023, "grad_norm": 0.1720573604106903, "learning_rate": 3.54369332506004e-05, "loss": 0.1391, "step": 6610 }, { "epoch": 4.675197591268348, "grad_norm": 0.1543080359697342, "learning_rate": 3.5423532903623914e-05, "loss": 0.1184, "step": 6611 }, { "epoch": 4.675950319909672, "grad_norm": 0.16092021763324738, "learning_rate": 3.541013469750045e-05, "loss": 0.1005, "step": 6612 }, { "epoch": 4.676703048550998, "grad_norm": 0.16009648144245148, "learning_rate": 3.53967386336952e-05, "loss": 0.168, "step": 6613 }, { "epoch": 4.677455777192322, "grad_norm": 0.142743781208992, "learning_rate": 3.538334471367314e-05, "loss": 0.0991, "step": 6614 }, { "epoch": 4.678208505833647, "grad_norm": 0.15337371826171875, "learning_rate": 3.5369952938899e-05, "loss": 0.1484, "step": 6615 }, { "epoch": 4.678961234474972, "grad_norm": 0.16042488813400269, "learning_rate": 3.5356563310837285e-05, "loss": 0.1212, "step": 6616 }, { "epoch": 4.6797139631162965, "grad_norm": 0.18873566389083862, "learning_rate": 3.534317583095228e-05, "loss": 0.1462, "step": 6617 }, { "epoch": 4.680466691757621, "grad_norm": 0.15600618720054626, "learning_rate": 3.532979050070804e-05, "loss": 0.1921, "step": 6618 }, { "epoch": 4.681219420398946, "grad_norm": 0.1368323266506195, "learning_rate": 3.53164073215683e-05, "loss": 0.0786, "step": 6619 }, { "epoch": 4.681972149040271, "grad_norm": 0.16691707074642181, "learning_rate": 3.530302629499667e-05, "loss": 0.1531, "step": 6620 }, { "epoch": 4.682724877681595, "grad_norm": 0.14293193817138672, "learning_rate": 3.528964742245649e-05, "loss": 0.1022, "step": 6621 }, { "epoch": 4.683477606322921, "grad_norm": 0.19054530560970306, "learning_rate": 3.5276270705410805e-05, "loss": 0.1725, "step": 6622 }, { "epoch": 4.684230334964245, "grad_norm": 0.18136170506477356, "learning_rate": 3.526289614532252e-05, "loss": 0.1167, "step": 6623 }, { "epoch": 4.6849830636055705, "grad_norm": 0.16729502379894257, "learning_rate": 3.524952374365422e-05, "loss": 0.1741, "step": 6624 }, { "epoch": 4.685735792246895, "grad_norm": 0.15895779430866241, "learning_rate": 3.523615350186834e-05, "loss": 0.1511, "step": 6625 }, { "epoch": 4.686488520888219, "grad_norm": 0.15354222059249878, "learning_rate": 3.5222785421426975e-05, "loss": 0.1154, "step": 6626 }, { "epoch": 4.687241249529545, "grad_norm": 0.17114098370075226, "learning_rate": 3.5209419503792084e-05, "loss": 0.1166, "step": 6627 }, { "epoch": 4.687993978170869, "grad_norm": 0.13734759390354156, "learning_rate": 3.5196055750425306e-05, "loss": 0.1357, "step": 6628 }, { "epoch": 4.688746706812195, "grad_norm": 0.13777509331703186, "learning_rate": 3.5182694162788085e-05, "loss": 0.0883, "step": 6629 }, { "epoch": 4.689499435453519, "grad_norm": 0.1566717028617859, "learning_rate": 3.516933474234164e-05, "loss": 0.0934, "step": 6630 }, { "epoch": 4.690252164094844, "grad_norm": 0.13564114272594452, "learning_rate": 3.515597749054695e-05, "loss": 0.106, "step": 6631 }, { "epoch": 4.691004892736169, "grad_norm": 0.19012759625911713, "learning_rate": 3.514262240886471e-05, "loss": 0.1433, "step": 6632 }, { "epoch": 4.691757621377493, "grad_norm": 0.1737040877342224, "learning_rate": 3.512926949875546e-05, "loss": 0.1276, "step": 6633 }, { "epoch": 4.692510350018818, "grad_norm": 0.1647167205810547, "learning_rate": 3.511591876167938e-05, "loss": 0.0858, "step": 6634 }, { "epoch": 4.693263078660143, "grad_norm": 0.16731484234333038, "learning_rate": 3.510257019909654e-05, "loss": 0.0737, "step": 6635 }, { "epoch": 4.694015807301468, "grad_norm": 0.17547565698623657, "learning_rate": 3.508922381246671e-05, "loss": 0.188, "step": 6636 }, { "epoch": 4.694768535942792, "grad_norm": 0.16327962279319763, "learning_rate": 3.507587960324944e-05, "loss": 0.1346, "step": 6637 }, { "epoch": 4.695521264584118, "grad_norm": 0.1618780791759491, "learning_rate": 3.506253757290401e-05, "loss": 0.1227, "step": 6638 }, { "epoch": 4.696273993225442, "grad_norm": 0.17493264377117157, "learning_rate": 3.50491977228895e-05, "loss": 0.1313, "step": 6639 }, { "epoch": 4.697026721866767, "grad_norm": 0.20187830924987793, "learning_rate": 3.503586005466474e-05, "loss": 0.1261, "step": 6640 }, { "epoch": 4.697779450508092, "grad_norm": 0.1894235759973526, "learning_rate": 3.502252456968828e-05, "loss": 0.1039, "step": 6641 }, { "epoch": 4.698532179149416, "grad_norm": 0.15062560141086578, "learning_rate": 3.500919126941852e-05, "loss": 0.1281, "step": 6642 }, { "epoch": 4.699284907790742, "grad_norm": 0.1896868795156479, "learning_rate": 3.499586015531355e-05, "loss": 0.1165, "step": 6643 }, { "epoch": 4.700037636432066, "grad_norm": 0.1731378734111786, "learning_rate": 3.498253122883123e-05, "loss": 0.1987, "step": 6644 }, { "epoch": 4.7007903650733915, "grad_norm": 0.16473466157913208, "learning_rate": 3.496920449142919e-05, "loss": 0.0869, "step": 6645 }, { "epoch": 4.701543093714716, "grad_norm": 0.1459127515554428, "learning_rate": 3.495587994456485e-05, "loss": 0.1144, "step": 6646 }, { "epoch": 4.7022958223560405, "grad_norm": 0.16073881089687347, "learning_rate": 3.494255758969533e-05, "loss": 0.137, "step": 6647 }, { "epoch": 4.703048550997366, "grad_norm": 0.17139023542404175, "learning_rate": 3.4929237428277557e-05, "loss": 0.0661, "step": 6648 }, { "epoch": 4.70380127963869, "grad_norm": 0.1714189499616623, "learning_rate": 3.49159194617682e-05, "loss": 0.1119, "step": 6649 }, { "epoch": 4.704554008280015, "grad_norm": 0.15990635752677917, "learning_rate": 3.4902603691623715e-05, "loss": 0.1054, "step": 6650 }, { "epoch": 4.70530673692134, "grad_norm": 0.14974547922611237, "learning_rate": 3.488929011930025e-05, "loss": 0.1017, "step": 6651 }, { "epoch": 4.706059465562665, "grad_norm": 0.164729043841362, "learning_rate": 3.487597874625381e-05, "loss": 0.0985, "step": 6652 }, { "epoch": 4.706812194203989, "grad_norm": 0.17307604849338531, "learning_rate": 3.4862669573940055e-05, "loss": 0.1508, "step": 6653 }, { "epoch": 4.7075649228453145, "grad_norm": 0.1588771492242813, "learning_rate": 3.484936260381447e-05, "loss": 0.1199, "step": 6654 }, { "epoch": 4.708317651486639, "grad_norm": 0.12409298866987228, "learning_rate": 3.4836057837332305e-05, "loss": 0.0543, "step": 6655 }, { "epoch": 4.709070380127963, "grad_norm": 0.1596575528383255, "learning_rate": 3.482275527594856e-05, "loss": 0.1265, "step": 6656 }, { "epoch": 4.709823108769289, "grad_norm": 0.15800164639949799, "learning_rate": 3.480945492111793e-05, "loss": 0.1256, "step": 6657 }, { "epoch": 4.710575837410613, "grad_norm": 0.16864565014839172, "learning_rate": 3.4796156774294986e-05, "loss": 0.1497, "step": 6658 }, { "epoch": 4.711328566051939, "grad_norm": 0.15053126215934753, "learning_rate": 3.478286083693394e-05, "loss": 0.0947, "step": 6659 }, { "epoch": 4.712081294693263, "grad_norm": 0.13776452839374542, "learning_rate": 3.476956711048885e-05, "loss": 0.1206, "step": 6660 }, { "epoch": 4.712834023334588, "grad_norm": 0.15809226036071777, "learning_rate": 3.475627559641346e-05, "loss": 0.0712, "step": 6661 }, { "epoch": 4.713586751975913, "grad_norm": 0.14197300374507904, "learning_rate": 3.4742986296161354e-05, "loss": 0.1818, "step": 6662 }, { "epoch": 4.714339480617237, "grad_norm": 0.13520024716854095, "learning_rate": 3.472969921118581e-05, "loss": 0.0478, "step": 6663 }, { "epoch": 4.715092209258563, "grad_norm": 0.17139504849910736, "learning_rate": 3.4716414342939886e-05, "loss": 0.12, "step": 6664 }, { "epoch": 4.715844937899887, "grad_norm": 0.15304173529148102, "learning_rate": 3.470313169287638e-05, "loss": 0.13, "step": 6665 }, { "epoch": 4.716597666541212, "grad_norm": 0.20076321065425873, "learning_rate": 3.468985126244788e-05, "loss": 0.1403, "step": 6666 }, { "epoch": 4.717350395182537, "grad_norm": 0.17020712792873383, "learning_rate": 3.467657305310671e-05, "loss": 0.1205, "step": 6667 }, { "epoch": 4.7181031238238615, "grad_norm": 0.17992833256721497, "learning_rate": 3.466329706630497e-05, "loss": 0.1737, "step": 6668 }, { "epoch": 4.718855852465186, "grad_norm": 0.16561196744441986, "learning_rate": 3.465002330349447e-05, "loss": 0.1093, "step": 6669 }, { "epoch": 4.719608581106511, "grad_norm": 0.1673620045185089, "learning_rate": 3.4636751766126826e-05, "loss": 0.1414, "step": 6670 }, { "epoch": 4.720361309747836, "grad_norm": 0.18613582849502563, "learning_rate": 3.462348245565342e-05, "loss": 0.2075, "step": 6671 }, { "epoch": 4.72111403838916, "grad_norm": 0.14822149276733398, "learning_rate": 3.461021537352531e-05, "loss": 0.0797, "step": 6672 }, { "epoch": 4.721866767030486, "grad_norm": 0.14926433563232422, "learning_rate": 3.459695052119341e-05, "loss": 0.1749, "step": 6673 }, { "epoch": 4.72261949567181, "grad_norm": 0.1542149782180786, "learning_rate": 3.4583687900108286e-05, "loss": 0.1169, "step": 6674 }, { "epoch": 4.723372224313135, "grad_norm": 0.19394345581531525, "learning_rate": 3.45704275117204e-05, "loss": 0.1303, "step": 6675 }, { "epoch": 4.72412495295446, "grad_norm": 0.14623519778251648, "learning_rate": 3.455716935747983e-05, "loss": 0.1069, "step": 6676 }, { "epoch": 4.7248776815957845, "grad_norm": 0.18761995434761047, "learning_rate": 3.4543913438836496e-05, "loss": 0.1213, "step": 6677 }, { "epoch": 4.72563041023711, "grad_norm": 0.1776212602853775, "learning_rate": 3.4530659757240014e-05, "loss": 0.1015, "step": 6678 }, { "epoch": 4.726383138878434, "grad_norm": 0.19419491291046143, "learning_rate": 3.4517408314139825e-05, "loss": 0.1285, "step": 6679 }, { "epoch": 4.727135867519759, "grad_norm": 0.164367213845253, "learning_rate": 3.4504159110985035e-05, "loss": 0.1002, "step": 6680 }, { "epoch": 4.727888596161084, "grad_norm": 0.17490528523921967, "learning_rate": 3.449091214922462e-05, "loss": 0.1253, "step": 6681 }, { "epoch": 4.728641324802409, "grad_norm": 0.15859133005142212, "learning_rate": 3.4477667430307205e-05, "loss": 0.1493, "step": 6682 }, { "epoch": 4.729394053443734, "grad_norm": 0.1372327357530594, "learning_rate": 3.446442495568124e-05, "loss": 0.1007, "step": 6683 }, { "epoch": 4.7301467820850585, "grad_norm": 0.17368347942829132, "learning_rate": 3.4451184726794875e-05, "loss": 0.1283, "step": 6684 }, { "epoch": 4.730899510726383, "grad_norm": 0.16076289117336273, "learning_rate": 3.443794674509606e-05, "loss": 0.2016, "step": 6685 }, { "epoch": 4.731652239367708, "grad_norm": 0.1567259132862091, "learning_rate": 3.442471101203245e-05, "loss": 0.1544, "step": 6686 }, { "epoch": 4.732404968009033, "grad_norm": 0.14565208554267883, "learning_rate": 3.441147752905153e-05, "loss": 0.0612, "step": 6687 }, { "epoch": 4.733157696650357, "grad_norm": 0.15666575729846954, "learning_rate": 3.439824629760047e-05, "loss": 0.1427, "step": 6688 }, { "epoch": 4.733910425291683, "grad_norm": 0.17264969646930695, "learning_rate": 3.438501731912623e-05, "loss": 0.1381, "step": 6689 }, { "epoch": 4.734663153933007, "grad_norm": 0.16712380945682526, "learning_rate": 3.437179059507549e-05, "loss": 0.1336, "step": 6690 }, { "epoch": 4.7354158825743315, "grad_norm": 0.17690786719322205, "learning_rate": 3.435856612689472e-05, "loss": 0.1794, "step": 6691 }, { "epoch": 4.736168611215657, "grad_norm": 0.17633037269115448, "learning_rate": 3.434534391603014e-05, "loss": 0.159, "step": 6692 }, { "epoch": 4.736921339856981, "grad_norm": 0.16436141729354858, "learning_rate": 3.433212396392767e-05, "loss": 0.1575, "step": 6693 }, { "epoch": 4.737674068498307, "grad_norm": 0.1523650735616684, "learning_rate": 3.431890627203305e-05, "loss": 0.1394, "step": 6694 }, { "epoch": 4.738426797139631, "grad_norm": 0.17069880664348602, "learning_rate": 3.430569084179175e-05, "loss": 0.1586, "step": 6695 }, { "epoch": 4.739179525780956, "grad_norm": 0.1802436113357544, "learning_rate": 3.4292477674649004e-05, "loss": 0.18, "step": 6696 }, { "epoch": 4.739932254422281, "grad_norm": 0.14401522278785706, "learning_rate": 3.4279266772049744e-05, "loss": 0.1494, "step": 6697 }, { "epoch": 4.7406849830636055, "grad_norm": 0.1851218193769455, "learning_rate": 3.4266058135438744e-05, "loss": 0.1203, "step": 6698 }, { "epoch": 4.74143771170493, "grad_norm": 0.16491831839084625, "learning_rate": 3.425285176626041e-05, "loss": 0.185, "step": 6699 }, { "epoch": 4.742190440346255, "grad_norm": 0.17284135520458221, "learning_rate": 3.423964766595906e-05, "loss": 0.1329, "step": 6700 }, { "epoch": 4.74294316898758, "grad_norm": 0.16025333106517792, "learning_rate": 3.4226445835978596e-05, "loss": 0.0717, "step": 6701 }, { "epoch": 4.743695897628905, "grad_norm": 0.16364771127700806, "learning_rate": 3.4213246277762804e-05, "loss": 0.1726, "step": 6702 }, { "epoch": 4.74444862627023, "grad_norm": 0.21655888855457306, "learning_rate": 3.420004899275514e-05, "loss": 0.1764, "step": 6703 }, { "epoch": 4.745201354911554, "grad_norm": 0.153486967086792, "learning_rate": 3.4186853982398856e-05, "loss": 0.1357, "step": 6704 }, { "epoch": 4.7459540835528795, "grad_norm": 0.16340371966362, "learning_rate": 3.417366124813689e-05, "loss": 0.1943, "step": 6705 }, { "epoch": 4.746706812194204, "grad_norm": 0.13297796249389648, "learning_rate": 3.416047079141206e-05, "loss": 0.0585, "step": 6706 }, { "epoch": 4.7474595408355285, "grad_norm": 0.15777923166751862, "learning_rate": 3.4147282613666786e-05, "loss": 0.1048, "step": 6707 }, { "epoch": 4.748212269476854, "grad_norm": 0.18185819685459137, "learning_rate": 3.4134096716343365e-05, "loss": 0.082, "step": 6708 }, { "epoch": 4.748964998118178, "grad_norm": 0.13794918358325958, "learning_rate": 3.412091310088372e-05, "loss": 0.0975, "step": 6709 }, { "epoch": 4.749717726759503, "grad_norm": 0.16356809437274933, "learning_rate": 3.4107731768729654e-05, "loss": 0.141, "step": 6710 }, { "epoch": 4.750470455400828, "grad_norm": 0.1677178144454956, "learning_rate": 3.409455272132261e-05, "loss": 0.1185, "step": 6711 }, { "epoch": 4.751223184042153, "grad_norm": 0.1915438324213028, "learning_rate": 3.408137596010386e-05, "loss": 0.1661, "step": 6712 }, { "epoch": 4.751975912683478, "grad_norm": 0.15976232290267944, "learning_rate": 3.4068201486514376e-05, "loss": 0.1947, "step": 6713 }, { "epoch": 4.752728641324802, "grad_norm": 0.19858376681804657, "learning_rate": 3.405502930199493e-05, "loss": 0.1404, "step": 6714 }, { "epoch": 4.753481369966127, "grad_norm": 0.14910584688186646, "learning_rate": 3.404185940798597e-05, "loss": 0.1103, "step": 6715 }, { "epoch": 4.754234098607452, "grad_norm": 0.1732732504606247, "learning_rate": 3.402869180592775e-05, "loss": 0.203, "step": 6716 }, { "epoch": 4.754986827248777, "grad_norm": 0.16581128537654877, "learning_rate": 3.401552649726028e-05, "loss": 0.0971, "step": 6717 }, { "epoch": 4.755739555890102, "grad_norm": 0.18672318756580353, "learning_rate": 3.400236348342325e-05, "loss": 0.0939, "step": 6718 }, { "epoch": 4.756492284531427, "grad_norm": 0.16240467131137848, "learning_rate": 3.398920276585622e-05, "loss": 0.1307, "step": 6719 }, { "epoch": 4.757245013172751, "grad_norm": 0.1723724752664566, "learning_rate": 3.3976044345998365e-05, "loss": 0.1192, "step": 6720 }, { "epoch": 4.757997741814076, "grad_norm": 0.15461811423301697, "learning_rate": 3.396288822528871e-05, "loss": 0.1872, "step": 6721 }, { "epoch": 4.758750470455401, "grad_norm": 0.1816888004541397, "learning_rate": 3.394973440516594e-05, "loss": 0.1084, "step": 6722 }, { "epoch": 4.759503199096725, "grad_norm": 0.1374325305223465, "learning_rate": 3.3936582887068604e-05, "loss": 0.0956, "step": 6723 }, { "epoch": 4.760255927738051, "grad_norm": 0.1816074103116989, "learning_rate": 3.392343367243487e-05, "loss": 0.113, "step": 6724 }, { "epoch": 4.761008656379375, "grad_norm": 0.16573674976825714, "learning_rate": 3.391028676270274e-05, "loss": 0.0793, "step": 6725 }, { "epoch": 4.7617613850207, "grad_norm": 0.19022469222545624, "learning_rate": 3.389714215930995e-05, "loss": 0.1764, "step": 6726 }, { "epoch": 4.762514113662025, "grad_norm": 0.15210048854351044, "learning_rate": 3.388399986369397e-05, "loss": 0.1646, "step": 6727 }, { "epoch": 4.7632668423033495, "grad_norm": 0.1629887968301773, "learning_rate": 3.3870859877292e-05, "loss": 0.0968, "step": 6728 }, { "epoch": 4.764019570944674, "grad_norm": 0.17847628891468048, "learning_rate": 3.385772220154106e-05, "loss": 0.1809, "step": 6729 }, { "epoch": 4.764772299585999, "grad_norm": 0.1804787516593933, "learning_rate": 3.38445868378778e-05, "loss": 0.2049, "step": 6730 }, { "epoch": 4.765525028227324, "grad_norm": 0.17672127485275269, "learning_rate": 3.3831453787738724e-05, "loss": 0.1281, "step": 6731 }, { "epoch": 4.766277756868649, "grad_norm": 0.12331103533506393, "learning_rate": 3.381832305256004e-05, "loss": 0.0401, "step": 6732 }, { "epoch": 4.767030485509974, "grad_norm": 0.16329629719257355, "learning_rate": 3.38051946337777e-05, "loss": 0.1047, "step": 6733 }, { "epoch": 4.767783214151298, "grad_norm": 0.15889444947242737, "learning_rate": 3.37920685328274e-05, "loss": 0.1098, "step": 6734 }, { "epoch": 4.7685359427926235, "grad_norm": 0.17704513669013977, "learning_rate": 3.3778944751144626e-05, "loss": 0.1264, "step": 6735 }, { "epoch": 4.769288671433948, "grad_norm": 0.14847446978092194, "learning_rate": 3.376582329016453e-05, "loss": 0.079, "step": 6736 }, { "epoch": 4.770041400075273, "grad_norm": 0.1930818259716034, "learning_rate": 3.3752704151322074e-05, "loss": 0.1576, "step": 6737 }, { "epoch": 4.770794128716598, "grad_norm": 0.1386289745569229, "learning_rate": 3.3739587336051935e-05, "loss": 0.0458, "step": 6738 }, { "epoch": 4.771546857357922, "grad_norm": 0.17304721474647522, "learning_rate": 3.3726472845788574e-05, "loss": 0.1643, "step": 6739 }, { "epoch": 4.772299585999248, "grad_norm": 0.16277363896369934, "learning_rate": 3.371336068196617e-05, "loss": 0.1349, "step": 6740 }, { "epoch": 4.773052314640572, "grad_norm": 0.1582077592611313, "learning_rate": 3.370025084601862e-05, "loss": 0.097, "step": 6741 }, { "epoch": 4.773805043281897, "grad_norm": 0.16052764654159546, "learning_rate": 3.3687143339379636e-05, "loss": 0.0764, "step": 6742 }, { "epoch": 4.774557771923222, "grad_norm": 0.17705321311950684, "learning_rate": 3.3674038163482584e-05, "loss": 0.1361, "step": 6743 }, { "epoch": 4.775310500564546, "grad_norm": 0.14528557658195496, "learning_rate": 3.366093531976066e-05, "loss": 0.078, "step": 6744 }, { "epoch": 4.776063229205871, "grad_norm": 0.1602514684200287, "learning_rate": 3.364783480964677e-05, "loss": 0.1806, "step": 6745 }, { "epoch": 4.776815957847196, "grad_norm": 0.16741247475147247, "learning_rate": 3.363473663457358e-05, "loss": 0.1547, "step": 6746 }, { "epoch": 4.777568686488521, "grad_norm": 0.1359017789363861, "learning_rate": 3.3621640795973455e-05, "loss": 0.0895, "step": 6747 }, { "epoch": 4.778321415129845, "grad_norm": 0.16506066918373108, "learning_rate": 3.360854729527856e-05, "loss": 0.0989, "step": 6748 }, { "epoch": 4.779074143771171, "grad_norm": 0.18193016946315765, "learning_rate": 3.3595456133920754e-05, "loss": 0.2203, "step": 6749 }, { "epoch": 4.779826872412495, "grad_norm": 0.1533985286951065, "learning_rate": 3.358236731333169e-05, "loss": 0.1664, "step": 6750 }, { "epoch": 4.78057960105382, "grad_norm": 0.16347266733646393, "learning_rate": 3.356928083494274e-05, "loss": 0.1423, "step": 6751 }, { "epoch": 4.781332329695145, "grad_norm": 0.15275564789772034, "learning_rate": 3.3556196700185025e-05, "loss": 0.0587, "step": 6752 }, { "epoch": 4.782085058336469, "grad_norm": 0.1673596054315567, "learning_rate": 3.354311491048938e-05, "loss": 0.1237, "step": 6753 }, { "epoch": 4.782837786977795, "grad_norm": 0.15623334050178528, "learning_rate": 3.3530035467286456e-05, "loss": 0.1317, "step": 6754 }, { "epoch": 4.783590515619119, "grad_norm": 0.15841786563396454, "learning_rate": 3.3516958372006546e-05, "loss": 0.1035, "step": 6755 }, { "epoch": 4.7843432442604445, "grad_norm": 0.15792132914066315, "learning_rate": 3.350388362607977e-05, "loss": 0.1729, "step": 6756 }, { "epoch": 4.785095972901769, "grad_norm": 0.16302140057086945, "learning_rate": 3.3490811230935976e-05, "loss": 0.0764, "step": 6757 }, { "epoch": 4.7858487015430935, "grad_norm": 0.15518638491630554, "learning_rate": 3.347774118800473e-05, "loss": 0.1675, "step": 6758 }, { "epoch": 4.786601430184419, "grad_norm": 0.147507905960083, "learning_rate": 3.346467349871534e-05, "loss": 0.0645, "step": 6759 }, { "epoch": 4.787354158825743, "grad_norm": 0.20508308708667755, "learning_rate": 3.345160816449687e-05, "loss": 0.1101, "step": 6760 }, { "epoch": 4.788106887467068, "grad_norm": 0.16182805597782135, "learning_rate": 3.3438545186778156e-05, "loss": 0.1406, "step": 6761 }, { "epoch": 4.788859616108393, "grad_norm": 0.1515103131532669, "learning_rate": 3.342548456698771e-05, "loss": 0.1145, "step": 6762 }, { "epoch": 4.789612344749718, "grad_norm": 0.14195814728736877, "learning_rate": 3.341242630655383e-05, "loss": 0.1052, "step": 6763 }, { "epoch": 4.790365073391042, "grad_norm": 0.16303299367427826, "learning_rate": 3.3399370406904554e-05, "loss": 0.1322, "step": 6764 }, { "epoch": 4.7911178020323675, "grad_norm": 0.14862670004367828, "learning_rate": 3.3386316869467665e-05, "loss": 0.1544, "step": 6765 }, { "epoch": 4.791870530673692, "grad_norm": 0.13522081077098846, "learning_rate": 3.3373265695670665e-05, "loss": 0.078, "step": 6766 }, { "epoch": 4.792623259315017, "grad_norm": 0.16407085955142975, "learning_rate": 3.336021688694082e-05, "loss": 0.0984, "step": 6767 }, { "epoch": 4.793375987956342, "grad_norm": 0.13194814324378967, "learning_rate": 3.33471704447051e-05, "loss": 0.0634, "step": 6768 }, { "epoch": 4.794128716597666, "grad_norm": 0.15197797119617462, "learning_rate": 3.333412637039027e-05, "loss": 0.0772, "step": 6769 }, { "epoch": 4.794881445238992, "grad_norm": 0.17935138940811157, "learning_rate": 3.3321084665422807e-05, "loss": 0.1435, "step": 6770 }, { "epoch": 4.795634173880316, "grad_norm": 0.1774897575378418, "learning_rate": 3.330804533122895e-05, "loss": 0.0908, "step": 6771 }, { "epoch": 4.796386902521641, "grad_norm": 0.16095462441444397, "learning_rate": 3.329500836923462e-05, "loss": 0.1568, "step": 6772 }, { "epoch": 4.797139631162966, "grad_norm": 0.18086692690849304, "learning_rate": 3.328197378086557e-05, "loss": 0.1767, "step": 6773 }, { "epoch": 4.79789235980429, "grad_norm": 0.17139165103435516, "learning_rate": 3.3268941567547184e-05, "loss": 0.1761, "step": 6774 }, { "epoch": 4.798645088445616, "grad_norm": 0.161950021982193, "learning_rate": 3.3255911730704686e-05, "loss": 0.1307, "step": 6775 }, { "epoch": 4.79939781708694, "grad_norm": 0.1651322990655899, "learning_rate": 3.324288427176298e-05, "loss": 0.1686, "step": 6776 }, { "epoch": 4.800150545728265, "grad_norm": 0.14135465025901794, "learning_rate": 3.322985919214676e-05, "loss": 0.0779, "step": 6777 }, { "epoch": 4.80090327436959, "grad_norm": 0.12805892527103424, "learning_rate": 3.321683649328039e-05, "loss": 0.1008, "step": 6778 }, { "epoch": 4.8016560030109146, "grad_norm": 0.16781030595302582, "learning_rate": 3.320381617658805e-05, "loss": 0.1744, "step": 6779 }, { "epoch": 4.802408731652239, "grad_norm": 0.16126379370689392, "learning_rate": 3.3190798243493595e-05, "loss": 0.0724, "step": 6780 }, { "epoch": 4.803161460293564, "grad_norm": 0.18079590797424316, "learning_rate": 3.317778269542063e-05, "loss": 0.1632, "step": 6781 }, { "epoch": 4.803914188934889, "grad_norm": 0.1772768348455429, "learning_rate": 3.316476953379256e-05, "loss": 0.171, "step": 6782 }, { "epoch": 4.804666917576213, "grad_norm": 0.14223888516426086, "learning_rate": 3.315175876003249e-05, "loss": 0.1026, "step": 6783 }, { "epoch": 4.805419646217539, "grad_norm": 0.15164919197559357, "learning_rate": 3.31387503755632e-05, "loss": 0.0652, "step": 6784 }, { "epoch": 4.806172374858863, "grad_norm": 0.1705583781003952, "learning_rate": 3.31257443818073e-05, "loss": 0.1479, "step": 6785 }, { "epoch": 4.8069251035001885, "grad_norm": 0.1683659851551056, "learning_rate": 3.311274078018712e-05, "loss": 0.1577, "step": 6786 }, { "epoch": 4.807677832141513, "grad_norm": 0.16623879969120026, "learning_rate": 3.30997395721247e-05, "loss": 0.1711, "step": 6787 }, { "epoch": 4.8084305607828375, "grad_norm": 0.1307930052280426, "learning_rate": 3.3086740759041835e-05, "loss": 0.0871, "step": 6788 }, { "epoch": 4.809183289424163, "grad_norm": 0.16772231459617615, "learning_rate": 3.307374434236003e-05, "loss": 0.1108, "step": 6789 }, { "epoch": 4.809936018065487, "grad_norm": 0.1677577793598175, "learning_rate": 3.306075032350062e-05, "loss": 0.1138, "step": 6790 }, { "epoch": 4.810688746706813, "grad_norm": 0.17211149632930756, "learning_rate": 3.3047758703884546e-05, "loss": 0.1693, "step": 6791 }, { "epoch": 4.811441475348137, "grad_norm": 0.17025113105773926, "learning_rate": 3.303476948493259e-05, "loss": 0.1638, "step": 6792 }, { "epoch": 4.812194203989462, "grad_norm": 0.15626582503318787, "learning_rate": 3.3021782668065206e-05, "loss": 0.1469, "step": 6793 }, { "epoch": 4.812946932630787, "grad_norm": 0.16219979524612427, "learning_rate": 3.3008798254702634e-05, "loss": 0.1378, "step": 6794 }, { "epoch": 4.8136996612721115, "grad_norm": 0.16432195901870728, "learning_rate": 3.299581624626479e-05, "loss": 0.1146, "step": 6795 }, { "epoch": 4.814452389913436, "grad_norm": 0.15453895926475525, "learning_rate": 3.2982836644171433e-05, "loss": 0.1343, "step": 6796 }, { "epoch": 4.815205118554761, "grad_norm": 0.14150096476078033, "learning_rate": 3.2969859449841945e-05, "loss": 0.1354, "step": 6797 }, { "epoch": 4.815957847196086, "grad_norm": 0.16724441945552826, "learning_rate": 3.295688466469552e-05, "loss": 0.1248, "step": 6798 }, { "epoch": 4.81671057583741, "grad_norm": 0.17284725606441498, "learning_rate": 3.294391229015103e-05, "loss": 0.1537, "step": 6799 }, { "epoch": 4.817463304478736, "grad_norm": 0.13841265439987183, "learning_rate": 3.293094232762715e-05, "loss": 0.1234, "step": 6800 }, { "epoch": 4.817463304478736, "eval_loss": 0.16682064533233643, "eval_runtime": 456.6221, "eval_samples_per_second": 21.083, "eval_steps_per_second": 0.659, "step": 6800 }, { "epoch": 4.81821603312006, "grad_norm": 0.15098240971565247, "learning_rate": 3.29179747785422e-05, "loss": 0.1301, "step": 6801 }, { "epoch": 4.818968761761385, "grad_norm": 0.136410191655159, "learning_rate": 3.2905009644314363e-05, "loss": 0.0979, "step": 6802 }, { "epoch": 4.81972149040271, "grad_norm": 0.18152154982089996, "learning_rate": 3.289204692636144e-05, "loss": 0.1851, "step": 6803 }, { "epoch": 4.820474219044034, "grad_norm": 0.159526064991951, "learning_rate": 3.287908662610104e-05, "loss": 0.1215, "step": 6804 }, { "epoch": 4.82122694768536, "grad_norm": 0.1395968347787857, "learning_rate": 3.286612874495045e-05, "loss": 0.0894, "step": 6805 }, { "epoch": 4.821979676326684, "grad_norm": 0.1413000375032425, "learning_rate": 3.2853173284326746e-05, "loss": 0.0561, "step": 6806 }, { "epoch": 4.822732404968009, "grad_norm": 0.1485811471939087, "learning_rate": 3.284022024564671e-05, "loss": 0.0757, "step": 6807 }, { "epoch": 4.823485133609334, "grad_norm": 0.1886320263147354, "learning_rate": 3.2827269630326885e-05, "loss": 0.1359, "step": 6808 }, { "epoch": 4.8242378622506585, "grad_norm": 0.17497992515563965, "learning_rate": 3.281432143978351e-05, "loss": 0.1155, "step": 6809 }, { "epoch": 4.824990590891984, "grad_norm": 0.15819616615772247, "learning_rate": 3.2801375675432575e-05, "loss": 0.1753, "step": 6810 }, { "epoch": 4.825743319533308, "grad_norm": 0.19009925425052643, "learning_rate": 3.278843233868985e-05, "loss": 0.161, "step": 6811 }, { "epoch": 4.826496048174633, "grad_norm": 0.19988274574279785, "learning_rate": 3.2775491430970736e-05, "loss": 0.1538, "step": 6812 }, { "epoch": 4.827248776815958, "grad_norm": 0.14805395901203156, "learning_rate": 3.276255295369051e-05, "loss": 0.1055, "step": 6813 }, { "epoch": 4.828001505457283, "grad_norm": 0.17003856599330902, "learning_rate": 3.2749616908263996e-05, "loss": 0.1138, "step": 6814 }, { "epoch": 4.828754234098607, "grad_norm": 0.2095632255077362, "learning_rate": 3.2736683296105976e-05, "loss": 0.1205, "step": 6815 }, { "epoch": 4.8295069627399325, "grad_norm": 0.12723539769649506, "learning_rate": 3.272375211863078e-05, "loss": 0.0901, "step": 6816 }, { "epoch": 4.830259691381257, "grad_norm": 0.14109541475772858, "learning_rate": 3.271082337725257e-05, "loss": 0.0514, "step": 6817 }, { "epoch": 4.8310124200225815, "grad_norm": 0.168239027261734, "learning_rate": 3.269789707338519e-05, "loss": 0.1376, "step": 6818 }, { "epoch": 4.831765148663907, "grad_norm": 0.12468380481004715, "learning_rate": 3.268497320844227e-05, "loss": 0.0744, "step": 6819 }, { "epoch": 4.832517877305231, "grad_norm": 0.1635095775127411, "learning_rate": 3.26720517838371e-05, "loss": 0.1506, "step": 6820 }, { "epoch": 4.833270605946556, "grad_norm": 0.13978229463100433, "learning_rate": 3.2659132800982817e-05, "loss": 0.0807, "step": 6821 }, { "epoch": 4.834023334587881, "grad_norm": 0.14701151847839355, "learning_rate": 3.264621626129215e-05, "loss": 0.1173, "step": 6822 }, { "epoch": 4.834776063229206, "grad_norm": 0.16601629555225372, "learning_rate": 3.263330216617769e-05, "loss": 0.1472, "step": 6823 }, { "epoch": 4.835528791870531, "grad_norm": 0.16077332198619843, "learning_rate": 3.262039051705167e-05, "loss": 0.1546, "step": 6824 }, { "epoch": 4.8362815205118554, "grad_norm": 0.1569555699825287, "learning_rate": 3.260748131532609e-05, "loss": 0.0747, "step": 6825 }, { "epoch": 4.83703424915318, "grad_norm": 0.17267559468746185, "learning_rate": 3.259457456241267e-05, "loss": 0.1249, "step": 6826 }, { "epoch": 4.837786977794505, "grad_norm": 0.1902802288532257, "learning_rate": 3.258167025972292e-05, "loss": 0.1659, "step": 6827 }, { "epoch": 4.83853970643583, "grad_norm": 0.1798456609249115, "learning_rate": 3.2568768408667984e-05, "loss": 0.1003, "step": 6828 }, { "epoch": 4.839292435077155, "grad_norm": 0.14999857544898987, "learning_rate": 3.255586901065883e-05, "loss": 0.0981, "step": 6829 }, { "epoch": 4.84004516371848, "grad_norm": 0.13553568720817566, "learning_rate": 3.2542972067106084e-05, "loss": 0.084, "step": 6830 }, { "epoch": 4.840797892359804, "grad_norm": 0.1804509460926056, "learning_rate": 3.2530077579420144e-05, "loss": 0.0839, "step": 6831 }, { "epoch": 4.841550621001129, "grad_norm": 0.13545311987400055, "learning_rate": 3.2517185549011154e-05, "loss": 0.0642, "step": 6832 }, { "epoch": 4.842303349642454, "grad_norm": 0.15685170888900757, "learning_rate": 3.2504295977288935e-05, "loss": 0.1532, "step": 6833 }, { "epoch": 4.843056078283778, "grad_norm": 0.1640803962945938, "learning_rate": 3.249140886566308e-05, "loss": 0.1328, "step": 6834 }, { "epoch": 4.843808806925104, "grad_norm": 0.18641425669193268, "learning_rate": 3.247852421554292e-05, "loss": 0.1518, "step": 6835 }, { "epoch": 4.844561535566428, "grad_norm": 0.1684206873178482, "learning_rate": 3.246564202833749e-05, "loss": 0.1266, "step": 6836 }, { "epoch": 4.845314264207753, "grad_norm": 0.13844768702983856, "learning_rate": 3.2452762305455576e-05, "loss": 0.0838, "step": 6837 }, { "epoch": 4.846066992849078, "grad_norm": 0.16493220627307892, "learning_rate": 3.2439885048305686e-05, "loss": 0.1011, "step": 6838 }, { "epoch": 4.8468197214904025, "grad_norm": 0.1726355403661728, "learning_rate": 3.2427010258295996e-05, "loss": 0.1233, "step": 6839 }, { "epoch": 4.847572450131728, "grad_norm": 0.190715491771698, "learning_rate": 3.241413793683458e-05, "loss": 0.1651, "step": 6840 }, { "epoch": 4.848325178773052, "grad_norm": 0.14312276244163513, "learning_rate": 3.240126808532906e-05, "loss": 0.1626, "step": 6841 }, { "epoch": 4.849077907414377, "grad_norm": 0.17034722864627838, "learning_rate": 3.23884007051869e-05, "loss": 0.0836, "step": 6842 }, { "epoch": 4.849830636055702, "grad_norm": 0.17587243020534515, "learning_rate": 3.2375535797815235e-05, "loss": 0.1711, "step": 6843 }, { "epoch": 4.850583364697027, "grad_norm": 0.16681353747844696, "learning_rate": 3.2362673364620965e-05, "loss": 0.1099, "step": 6844 }, { "epoch": 4.851336093338351, "grad_norm": 0.18132957816123962, "learning_rate": 3.234981340701069e-05, "loss": 0.08, "step": 6845 }, { "epoch": 4.8520888219796765, "grad_norm": 0.1624220907688141, "learning_rate": 3.233695592639077e-05, "loss": 0.1604, "step": 6846 }, { "epoch": 4.852841550621001, "grad_norm": 0.1690622717142105, "learning_rate": 3.232410092416727e-05, "loss": 0.1045, "step": 6847 }, { "epoch": 4.853594279262326, "grad_norm": 0.14241762459278107, "learning_rate": 3.231124840174602e-05, "loss": 0.083, "step": 6848 }, { "epoch": 4.854347007903651, "grad_norm": 0.18111227452754974, "learning_rate": 3.229839836053251e-05, "loss": 0.1351, "step": 6849 }, { "epoch": 4.855099736544975, "grad_norm": 0.16936208307743073, "learning_rate": 3.2285550801932046e-05, "loss": 0.1339, "step": 6850 }, { "epoch": 4.855852465186301, "grad_norm": 0.15409071743488312, "learning_rate": 3.2272705727349584e-05, "loss": 0.0797, "step": 6851 }, { "epoch": 4.856605193827625, "grad_norm": 0.18733640015125275, "learning_rate": 3.2259863138189845e-05, "loss": 0.1262, "step": 6852 }, { "epoch": 4.85735792246895, "grad_norm": 0.149435356259346, "learning_rate": 3.224702303585729e-05, "loss": 0.1567, "step": 6853 }, { "epoch": 4.858110651110275, "grad_norm": 0.17341765761375427, "learning_rate": 3.2234185421756105e-05, "loss": 0.1323, "step": 6854 }, { "epoch": 4.858863379751599, "grad_norm": 0.1382197141647339, "learning_rate": 3.2221350297290146e-05, "loss": 0.1032, "step": 6855 }, { "epoch": 4.859616108392924, "grad_norm": 0.1858421117067337, "learning_rate": 3.2208517663863084e-05, "loss": 0.1504, "step": 6856 }, { "epoch": 4.860368837034249, "grad_norm": 0.14341402053833008, "learning_rate": 3.2195687522878275e-05, "loss": 0.1211, "step": 6857 }, { "epoch": 4.861121565675574, "grad_norm": 0.1420661360025406, "learning_rate": 3.218285987573877e-05, "loss": 0.1751, "step": 6858 }, { "epoch": 4.861874294316899, "grad_norm": 0.17365173995494843, "learning_rate": 3.217003472384742e-05, "loss": 0.0984, "step": 6859 }, { "epoch": 4.862627022958224, "grad_norm": 0.16793102025985718, "learning_rate": 3.215721206860673e-05, "loss": 0.1893, "step": 6860 }, { "epoch": 4.863379751599548, "grad_norm": 0.15047118067741394, "learning_rate": 3.214439191141902e-05, "loss": 0.0847, "step": 6861 }, { "epoch": 4.864132480240873, "grad_norm": 0.1761917769908905, "learning_rate": 3.2131574253686217e-05, "loss": 0.1649, "step": 6862 }, { "epoch": 4.864885208882198, "grad_norm": 0.16810911893844604, "learning_rate": 3.211875909681009e-05, "loss": 0.1251, "step": 6863 }, { "epoch": 4.865637937523523, "grad_norm": 0.15243487060070038, "learning_rate": 3.210594644219206e-05, "loss": 0.0959, "step": 6864 }, { "epoch": 4.866390666164848, "grad_norm": 0.18088802695274353, "learning_rate": 3.2093136291233296e-05, "loss": 0.1203, "step": 6865 }, { "epoch": 4.867143394806172, "grad_norm": 0.1491638720035553, "learning_rate": 3.208032864533471e-05, "loss": 0.1787, "step": 6866 }, { "epoch": 4.867896123447498, "grad_norm": 0.14527933299541473, "learning_rate": 3.206752350589694e-05, "loss": 0.1237, "step": 6867 }, { "epoch": 4.868648852088822, "grad_norm": 0.15837906301021576, "learning_rate": 3.20547208743203e-05, "loss": 0.1287, "step": 6868 }, { "epoch": 4.8694015807301465, "grad_norm": 0.16621950268745422, "learning_rate": 3.2041920752004914e-05, "loss": 0.0642, "step": 6869 }, { "epoch": 4.870154309371472, "grad_norm": 0.1555330604314804, "learning_rate": 3.202912314035054e-05, "loss": 0.1326, "step": 6870 }, { "epoch": 4.870907038012796, "grad_norm": 0.15158866345882416, "learning_rate": 3.2016328040756715e-05, "loss": 0.0775, "step": 6871 }, { "epoch": 4.871659766654121, "grad_norm": 0.1519072949886322, "learning_rate": 3.2003535454622703e-05, "loss": 0.1386, "step": 6872 }, { "epoch": 4.872412495295446, "grad_norm": 0.14780037105083466, "learning_rate": 3.1990745383347496e-05, "loss": 0.0918, "step": 6873 }, { "epoch": 4.873165223936771, "grad_norm": 0.14508749544620514, "learning_rate": 3.197795782832976e-05, "loss": 0.0562, "step": 6874 }, { "epoch": 4.873917952578095, "grad_norm": 0.16371002793312073, "learning_rate": 3.1965172790967965e-05, "loss": 0.1426, "step": 6875 }, { "epoch": 4.8746706812194205, "grad_norm": 0.17849569022655487, "learning_rate": 3.1952390272660226e-05, "loss": 0.1134, "step": 6876 }, { "epoch": 4.875423409860745, "grad_norm": 0.14413288235664368, "learning_rate": 3.193961027480443e-05, "loss": 0.0998, "step": 6877 }, { "epoch": 4.87617613850207, "grad_norm": 0.14418688416481018, "learning_rate": 3.192683279879819e-05, "loss": 0.1094, "step": 6878 }, { "epoch": 4.876928867143395, "grad_norm": 0.13053128123283386, "learning_rate": 3.191405784603884e-05, "loss": 0.0857, "step": 6879 }, { "epoch": 4.877681595784719, "grad_norm": 0.16649220883846283, "learning_rate": 3.19012854179234e-05, "loss": 0.1168, "step": 6880 }, { "epoch": 4.878434324426045, "grad_norm": 0.14631931483745575, "learning_rate": 3.188851551584865e-05, "loss": 0.1177, "step": 6881 }, { "epoch": 4.879187053067369, "grad_norm": 0.18435519933700562, "learning_rate": 3.1875748141211114e-05, "loss": 0.1413, "step": 6882 }, { "epoch": 4.8799397817086945, "grad_norm": 0.15145976841449738, "learning_rate": 3.186298329540698e-05, "loss": 0.1195, "step": 6883 }, { "epoch": 4.880692510350019, "grad_norm": 0.16652178764343262, "learning_rate": 3.185022097983221e-05, "loss": 0.1397, "step": 6884 }, { "epoch": 4.881445238991343, "grad_norm": 0.1497916728258133, "learning_rate": 3.183746119588247e-05, "loss": 0.1238, "step": 6885 }, { "epoch": 4.882197967632669, "grad_norm": 0.1564020812511444, "learning_rate": 3.182470394495316e-05, "loss": 0.1387, "step": 6886 }, { "epoch": 4.882950696273993, "grad_norm": 0.15549336373806, "learning_rate": 3.1811949228439365e-05, "loss": 0.1539, "step": 6887 }, { "epoch": 4.883703424915318, "grad_norm": 0.1595834344625473, "learning_rate": 3.179919704773596e-05, "loss": 0.1471, "step": 6888 }, { "epoch": 4.884456153556643, "grad_norm": 0.15255172550678253, "learning_rate": 3.178644740423746e-05, "loss": 0.1407, "step": 6889 }, { "epoch": 4.885208882197968, "grad_norm": 0.15569067001342773, "learning_rate": 3.177370029933818e-05, "loss": 0.1602, "step": 6890 }, { "epoch": 4.885961610839292, "grad_norm": 0.17806147038936615, "learning_rate": 3.176095573443212e-05, "loss": 0.1016, "step": 6891 }, { "epoch": 4.886714339480617, "grad_norm": 0.18106549978256226, "learning_rate": 3.174821371091301e-05, "loss": 0.1149, "step": 6892 }, { "epoch": 4.887467068121942, "grad_norm": 0.15795771777629852, "learning_rate": 3.1735474230174275e-05, "loss": 0.1517, "step": 6893 }, { "epoch": 4.888219796763266, "grad_norm": 0.16255199909210205, "learning_rate": 3.172273729360911e-05, "loss": 0.1553, "step": 6894 }, { "epoch": 4.888972525404592, "grad_norm": 0.1644502729177475, "learning_rate": 3.171000290261039e-05, "loss": 0.1514, "step": 6895 }, { "epoch": 4.889725254045916, "grad_norm": 0.12769944965839386, "learning_rate": 3.1697271058570735e-05, "loss": 0.1613, "step": 6896 }, { "epoch": 4.8904779826872415, "grad_norm": 0.16077914834022522, "learning_rate": 3.168454176288248e-05, "loss": 0.1152, "step": 6897 }, { "epoch": 4.891230711328566, "grad_norm": 0.1768026202917099, "learning_rate": 3.16718150169377e-05, "loss": 0.1623, "step": 6898 }, { "epoch": 4.8919834399698905, "grad_norm": 0.16180789470672607, "learning_rate": 3.165909082212815e-05, "loss": 0.1038, "step": 6899 }, { "epoch": 4.892736168611216, "grad_norm": 0.19754645228385925, "learning_rate": 3.164636917984534e-05, "loss": 0.1779, "step": 6900 }, { "epoch": 4.89348889725254, "grad_norm": 0.18295013904571533, "learning_rate": 3.1633650091480465e-05, "loss": 0.106, "step": 6901 }, { "epoch": 4.894241625893866, "grad_norm": 0.14156390726566315, "learning_rate": 3.1620933558424486e-05, "loss": 0.0838, "step": 6902 }, { "epoch": 4.89499435453519, "grad_norm": 0.17929890751838684, "learning_rate": 3.160821958206807e-05, "loss": 0.1494, "step": 6903 }, { "epoch": 4.895747083176515, "grad_norm": 0.15715031325817108, "learning_rate": 3.1595508163801604e-05, "loss": 0.0652, "step": 6904 }, { "epoch": 4.89649981181784, "grad_norm": 0.1409827023744583, "learning_rate": 3.1582799305015164e-05, "loss": 0.1102, "step": 6905 }, { "epoch": 4.8972525404591645, "grad_norm": 0.14739975333213806, "learning_rate": 3.157009300709859e-05, "loss": 0.1284, "step": 6906 }, { "epoch": 4.898005269100489, "grad_norm": 0.15717554092407227, "learning_rate": 3.1557389271441425e-05, "loss": 0.1909, "step": 6907 }, { "epoch": 4.898757997741814, "grad_norm": 0.16743265092372894, "learning_rate": 3.15446880994329e-05, "loss": 0.0837, "step": 6908 }, { "epoch": 4.899510726383139, "grad_norm": 0.14165766537189484, "learning_rate": 3.153198949246205e-05, "loss": 0.1268, "step": 6909 }, { "epoch": 4.900263455024463, "grad_norm": 0.17500536143779755, "learning_rate": 3.151929345191752e-05, "loss": 0.1332, "step": 6910 }, { "epoch": 4.901016183665789, "grad_norm": 0.1728186458349228, "learning_rate": 3.150659997918777e-05, "loss": 0.1082, "step": 6911 }, { "epoch": 4.901768912307113, "grad_norm": 0.17428600788116455, "learning_rate": 3.149390907566092e-05, "loss": 0.1956, "step": 6912 }, { "epoch": 4.9025216409484385, "grad_norm": 0.19368214905261993, "learning_rate": 3.1481220742724845e-05, "loss": 0.1349, "step": 6913 }, { "epoch": 4.903274369589763, "grad_norm": 0.15109916031360626, "learning_rate": 3.1468534981767095e-05, "loss": 0.111, "step": 6914 }, { "epoch": 4.904027098231087, "grad_norm": 0.1523405909538269, "learning_rate": 3.145585179417499e-05, "loss": 0.1595, "step": 6915 }, { "epoch": 4.904779826872413, "grad_norm": 0.17919482290744781, "learning_rate": 3.144317118133551e-05, "loss": 0.1424, "step": 6916 }, { "epoch": 4.905532555513737, "grad_norm": 0.14279179275035858, "learning_rate": 3.143049314463544e-05, "loss": 0.1129, "step": 6917 }, { "epoch": 4.906285284155063, "grad_norm": 0.156419575214386, "learning_rate": 3.141781768546119e-05, "loss": 0.1011, "step": 6918 }, { "epoch": 4.907038012796387, "grad_norm": 0.15106549859046936, "learning_rate": 3.140514480519896e-05, "loss": 0.1583, "step": 6919 }, { "epoch": 4.9077907414377115, "grad_norm": 0.1411212682723999, "learning_rate": 3.1392474505234604e-05, "loss": 0.1695, "step": 6920 }, { "epoch": 4.908543470079037, "grad_norm": 0.14897173643112183, "learning_rate": 3.137980678695375e-05, "loss": 0.0927, "step": 6921 }, { "epoch": 4.909296198720361, "grad_norm": 0.1626075655221939, "learning_rate": 3.1367141651741694e-05, "loss": 0.2233, "step": 6922 }, { "epoch": 4.910048927361686, "grad_norm": 0.16776274144649506, "learning_rate": 3.135447910098352e-05, "loss": 0.1075, "step": 6923 }, { "epoch": 4.910801656003011, "grad_norm": 0.16411501169204712, "learning_rate": 3.134181913606394e-05, "loss": 0.1548, "step": 6924 }, { "epoch": 4.911554384644336, "grad_norm": 0.16289377212524414, "learning_rate": 3.1329161758367474e-05, "loss": 0.0303, "step": 6925 }, { "epoch": 4.91230711328566, "grad_norm": 0.18776950240135193, "learning_rate": 3.131650696927827e-05, "loss": 0.1198, "step": 6926 }, { "epoch": 4.9130598419269855, "grad_norm": 0.1558430790901184, "learning_rate": 3.130385477018026e-05, "loss": 0.2101, "step": 6927 }, { "epoch": 4.91381257056831, "grad_norm": 0.1409333199262619, "learning_rate": 3.129120516245706e-05, "loss": 0.168, "step": 6928 }, { "epoch": 4.9145652992096345, "grad_norm": 0.145340234041214, "learning_rate": 3.1278558147492045e-05, "loss": 0.1178, "step": 6929 }, { "epoch": 4.91531802785096, "grad_norm": 0.1823035627603531, "learning_rate": 3.1265913726668214e-05, "loss": 0.1129, "step": 6930 }, { "epoch": 4.916070756492284, "grad_norm": 0.16100311279296875, "learning_rate": 3.1253271901368385e-05, "loss": 0.1652, "step": 6931 }, { "epoch": 4.91682348513361, "grad_norm": 0.1469346582889557, "learning_rate": 3.124063267297506e-05, "loss": 0.1254, "step": 6932 }, { "epoch": 4.917576213774934, "grad_norm": 0.18832813203334808, "learning_rate": 3.122799604287039e-05, "loss": 0.1823, "step": 6933 }, { "epoch": 4.918328942416259, "grad_norm": 0.1404862105846405, "learning_rate": 3.1215362012436363e-05, "loss": 0.1119, "step": 6934 }, { "epoch": 4.919081671057584, "grad_norm": 0.13200528919696808, "learning_rate": 3.1202730583054554e-05, "loss": 0.0741, "step": 6935 }, { "epoch": 4.9198343996989085, "grad_norm": 0.1607540398836136, "learning_rate": 3.1190101756106374e-05, "loss": 0.1102, "step": 6936 }, { "epoch": 4.920587128340234, "grad_norm": 0.1548173427581787, "learning_rate": 3.117747553297285e-05, "loss": 0.0716, "step": 6937 }, { "epoch": 4.921339856981558, "grad_norm": 0.17651520669460297, "learning_rate": 3.116485191503481e-05, "loss": 0.0965, "step": 6938 }, { "epoch": 4.922092585622883, "grad_norm": 0.150996595621109, "learning_rate": 3.115223090367271e-05, "loss": 0.0806, "step": 6939 }, { "epoch": 4.922845314264208, "grad_norm": 0.15479031205177307, "learning_rate": 3.11396125002668e-05, "loss": 0.1365, "step": 6940 }, { "epoch": 4.923598042905533, "grad_norm": 0.1606520265340805, "learning_rate": 3.112699670619696e-05, "loss": 0.1198, "step": 6941 }, { "epoch": 4.924350771546857, "grad_norm": 0.16568107903003693, "learning_rate": 3.1114383522842906e-05, "loss": 0.1304, "step": 6942 }, { "epoch": 4.925103500188182, "grad_norm": 0.16037113964557648, "learning_rate": 3.110177295158393e-05, "loss": 0.1094, "step": 6943 }, { "epoch": 4.925856228829507, "grad_norm": 0.18273119628429413, "learning_rate": 3.108916499379916e-05, "loss": 0.1446, "step": 6944 }, { "epoch": 4.926608957470831, "grad_norm": 0.18994255363941193, "learning_rate": 3.107655965086734e-05, "loss": 0.1672, "step": 6945 }, { "epoch": 4.927361686112157, "grad_norm": 0.1714843362569809, "learning_rate": 3.1063956924167005e-05, "loss": 0.12, "step": 6946 }, { "epoch": 4.928114414753481, "grad_norm": 0.15288592875003815, "learning_rate": 3.105135681507632e-05, "loss": 0.0951, "step": 6947 }, { "epoch": 4.928867143394806, "grad_norm": 0.17972677946090698, "learning_rate": 3.103875932497327e-05, "loss": 0.1286, "step": 6948 }, { "epoch": 4.929619872036131, "grad_norm": 0.1695440262556076, "learning_rate": 3.1026164455235465e-05, "loss": 0.1882, "step": 6949 }, { "epoch": 4.9303726006774555, "grad_norm": 0.1901710480451584, "learning_rate": 3.101357220724029e-05, "loss": 0.1283, "step": 6950 }, { "epoch": 4.931125329318781, "grad_norm": 0.15556374192237854, "learning_rate": 3.100098258236478e-05, "loss": 0.1328, "step": 6951 }, { "epoch": 4.931878057960105, "grad_norm": 0.14302958548069, "learning_rate": 3.098839558198573e-05, "loss": 0.0521, "step": 6952 }, { "epoch": 4.93263078660143, "grad_norm": 0.15893813967704773, "learning_rate": 3.0975811207479656e-05, "loss": 0.0728, "step": 6953 }, { "epoch": 4.933383515242755, "grad_norm": 0.20220647752285004, "learning_rate": 3.096322946022273e-05, "loss": 0.1569, "step": 6954 }, { "epoch": 4.93413624388408, "grad_norm": 0.14436380565166473, "learning_rate": 3.095065034159089e-05, "loss": 0.1536, "step": 6955 }, { "epoch": 4.934888972525405, "grad_norm": 0.1915082037448883, "learning_rate": 3.093807385295978e-05, "loss": 0.1621, "step": 6956 }, { "epoch": 4.9356417011667295, "grad_norm": 0.16313819587230682, "learning_rate": 3.092549999570476e-05, "loss": 0.142, "step": 6957 }, { "epoch": 4.936394429808054, "grad_norm": 0.15942969918251038, "learning_rate": 3.091292877120082e-05, "loss": 0.1376, "step": 6958 }, { "epoch": 4.937147158449379, "grad_norm": 0.13713113963603973, "learning_rate": 3.0900360180822815e-05, "loss": 0.0815, "step": 6959 }, { "epoch": 4.937899887090704, "grad_norm": 0.151334747672081, "learning_rate": 3.088779422594514e-05, "loss": 0.1164, "step": 6960 }, { "epoch": 4.938652615732028, "grad_norm": 0.18475262820720673, "learning_rate": 3.087523090794208e-05, "loss": 0.1785, "step": 6961 }, { "epoch": 4.939405344373354, "grad_norm": 0.16872431337833405, "learning_rate": 3.0862670228187474e-05, "loss": 0.0699, "step": 6962 }, { "epoch": 4.940158073014678, "grad_norm": 0.15107187628746033, "learning_rate": 3.0850112188054986e-05, "loss": 0.1032, "step": 6963 }, { "epoch": 4.940910801656003, "grad_norm": 0.15983320772647858, "learning_rate": 3.08375567889179e-05, "loss": 0.1199, "step": 6964 }, { "epoch": 4.941663530297328, "grad_norm": 0.16944155097007751, "learning_rate": 3.082500403214929e-05, "loss": 0.1246, "step": 6965 }, { "epoch": 4.942416258938652, "grad_norm": 0.1683775782585144, "learning_rate": 3.081245391912186e-05, "loss": 0.1035, "step": 6966 }, { "epoch": 4.943168987579978, "grad_norm": 0.126759335398674, "learning_rate": 3.0799906451208135e-05, "loss": 0.1476, "step": 6967 }, { "epoch": 4.943921716221302, "grad_norm": 0.16131389141082764, "learning_rate": 3.078736162978025e-05, "loss": 0.0904, "step": 6968 }, { "epoch": 4.944674444862627, "grad_norm": 0.15500858426094055, "learning_rate": 3.07748194562101e-05, "loss": 0.118, "step": 6969 }, { "epoch": 4.945427173503952, "grad_norm": 0.17458190023899078, "learning_rate": 3.076227993186925e-05, "loss": 0.1442, "step": 6970 }, { "epoch": 4.946179902145277, "grad_norm": 0.17329192161560059, "learning_rate": 3.074974305812906e-05, "loss": 0.1796, "step": 6971 }, { "epoch": 4.946932630786601, "grad_norm": 0.16822192072868347, "learning_rate": 3.0737208836360484e-05, "loss": 0.1013, "step": 6972 }, { "epoch": 4.947685359427926, "grad_norm": 0.16863353550434113, "learning_rate": 3.0724677267934266e-05, "loss": 0.1336, "step": 6973 }, { "epoch": 4.948438088069251, "grad_norm": 0.1577104926109314, "learning_rate": 3.071214835422085e-05, "loss": 0.1224, "step": 6974 }, { "epoch": 4.949190816710576, "grad_norm": 0.17759647965431213, "learning_rate": 3.069962209659039e-05, "loss": 0.1199, "step": 6975 }, { "epoch": 4.949943545351901, "grad_norm": 0.16226261854171753, "learning_rate": 3.0687098496412706e-05, "loss": 0.1284, "step": 6976 }, { "epoch": 4.950696273993225, "grad_norm": 0.14591951668262482, "learning_rate": 3.067457755505738e-05, "loss": 0.1009, "step": 6977 }, { "epoch": 4.951449002634551, "grad_norm": 0.1286843866109848, "learning_rate": 3.066205927389369e-05, "loss": 0.1328, "step": 6978 }, { "epoch": 4.952201731275875, "grad_norm": 0.1353140026330948, "learning_rate": 3.064954365429059e-05, "loss": 0.0739, "step": 6979 }, { "epoch": 4.9529544599171995, "grad_norm": 0.15487204492092133, "learning_rate": 3.063703069761679e-05, "loss": 0.2014, "step": 6980 }, { "epoch": 4.953707188558525, "grad_norm": 0.1542646437883377, "learning_rate": 3.062452040524069e-05, "loss": 0.1569, "step": 6981 }, { "epoch": 4.954459917199849, "grad_norm": 0.14440420269966125, "learning_rate": 3.0612012778530395e-05, "loss": 0.0642, "step": 6982 }, { "epoch": 4.955212645841174, "grad_norm": 0.1661776602268219, "learning_rate": 3.05995078188537e-05, "loss": 0.0933, "step": 6983 }, { "epoch": 4.955965374482499, "grad_norm": 0.1941545158624649, "learning_rate": 3.058700552757817e-05, "loss": 0.1892, "step": 6984 }, { "epoch": 4.956718103123824, "grad_norm": 0.15013717114925385, "learning_rate": 3.0574505906070985e-05, "loss": 0.054, "step": 6985 }, { "epoch": 4.957470831765149, "grad_norm": 0.7394866943359375, "learning_rate": 3.0562008955699114e-05, "loss": 0.1186, "step": 6986 }, { "epoch": 4.9582235604064735, "grad_norm": 0.17661379277706146, "learning_rate": 3.0549514677829206e-05, "loss": 0.1548, "step": 6987 }, { "epoch": 4.958976289047798, "grad_norm": 0.14886051416397095, "learning_rate": 3.053702307382762e-05, "loss": 0.15, "step": 6988 }, { "epoch": 4.959729017689123, "grad_norm": 0.23053039610385895, "learning_rate": 3.0524534145060405e-05, "loss": 0.0816, "step": 6989 }, { "epoch": 4.960481746330448, "grad_norm": 0.1832948923110962, "learning_rate": 3.0512047892893348e-05, "loss": 0.1521, "step": 6990 }, { "epoch": 4.961234474971773, "grad_norm": 0.14149555563926697, "learning_rate": 3.0499564318691898e-05, "loss": 0.0715, "step": 6991 }, { "epoch": 4.961987203613098, "grad_norm": 0.1931566298007965, "learning_rate": 3.0487083423821255e-05, "loss": 0.1462, "step": 6992 }, { "epoch": 4.962739932254422, "grad_norm": 0.15378941595554352, "learning_rate": 3.0474605209646324e-05, "loss": 0.0961, "step": 6993 }, { "epoch": 4.9634926608957475, "grad_norm": 0.20154748857021332, "learning_rate": 3.0462129677531704e-05, "loss": 0.0931, "step": 6994 }, { "epoch": 4.964245389537072, "grad_norm": 0.5886436700820923, "learning_rate": 3.044965682884168e-05, "loss": 0.1361, "step": 6995 }, { "epoch": 4.964998118178396, "grad_norm": 0.9214186072349548, "learning_rate": 3.043718666494028e-05, "loss": 0.0698, "step": 6996 }, { "epoch": 4.965750846819722, "grad_norm": 0.1730402410030365, "learning_rate": 3.0424719187191207e-05, "loss": 0.148, "step": 6997 }, { "epoch": 4.966503575461046, "grad_norm": 0.15078668296337128, "learning_rate": 3.0412254396957896e-05, "loss": 0.098, "step": 6998 }, { "epoch": 4.967256304102371, "grad_norm": 0.16935691237449646, "learning_rate": 3.039979229560348e-05, "loss": 0.1596, "step": 6999 }, { "epoch": 4.968009032743696, "grad_norm": 0.14986057579517365, "learning_rate": 3.0387332884490805e-05, "loss": 0.1302, "step": 7000 }, { "epoch": 4.968009032743696, "eval_loss": 0.1642126739025116, "eval_runtime": 456.6557, "eval_samples_per_second": 21.082, "eval_steps_per_second": 0.659, "step": 7000 }, { "epoch": 4.968761761385021, "grad_norm": 0.1868821084499359, "learning_rate": 3.037487616498238e-05, "loss": 0.1498, "step": 7001 }, { "epoch": 4.969514490026345, "grad_norm": 0.1836196482181549, "learning_rate": 3.0362422138440477e-05, "loss": 0.1996, "step": 7002 }, { "epoch": 4.97026721866767, "grad_norm": 0.17659923434257507, "learning_rate": 3.0349970806227068e-05, "loss": 0.0914, "step": 7003 }, { "epoch": 4.971019947308995, "grad_norm": 0.20035074651241302, "learning_rate": 3.0337522169703768e-05, "loss": 0.1135, "step": 7004 }, { "epoch": 4.97177267595032, "grad_norm": 0.15603692829608917, "learning_rate": 3.0325076230231974e-05, "loss": 0.1007, "step": 7005 }, { "epoch": 4.972525404591645, "grad_norm": 0.17288386821746826, "learning_rate": 3.031263298917274e-05, "loss": 0.1464, "step": 7006 }, { "epoch": 4.973278133232969, "grad_norm": 0.16954351961612701, "learning_rate": 3.030019244788686e-05, "loss": 0.127, "step": 7007 }, { "epoch": 4.9740308618742946, "grad_norm": 0.13633808493614197, "learning_rate": 3.0287754607734782e-05, "loss": 0.0675, "step": 7008 }, { "epoch": 4.974783590515619, "grad_norm": 0.17681308090686798, "learning_rate": 3.0275319470076723e-05, "loss": 0.0988, "step": 7009 }, { "epoch": 4.975536319156944, "grad_norm": 0.17419788241386414, "learning_rate": 3.0262887036272535e-05, "loss": 0.1047, "step": 7010 }, { "epoch": 4.976289047798269, "grad_norm": 0.1672329157590866, "learning_rate": 3.0250457307681835e-05, "loss": 0.1509, "step": 7011 }, { "epoch": 4.977041776439593, "grad_norm": 0.1864711344242096, "learning_rate": 3.023803028566391e-05, "loss": 0.1753, "step": 7012 }, { "epoch": 4.977794505080919, "grad_norm": 0.13890327513217926, "learning_rate": 3.0225605971577786e-05, "loss": 0.0676, "step": 7013 }, { "epoch": 4.978547233722243, "grad_norm": 0.15017466247081757, "learning_rate": 3.0213184366782122e-05, "loss": 0.093, "step": 7014 }, { "epoch": 4.979299962363568, "grad_norm": 0.19147349894046783, "learning_rate": 3.020076547263536e-05, "loss": 0.1814, "step": 7015 }, { "epoch": 4.980052691004893, "grad_norm": 0.15407083928585052, "learning_rate": 3.0188349290495588e-05, "loss": 0.1319, "step": 7016 }, { "epoch": 4.9808054196462175, "grad_norm": 0.15905922651290894, "learning_rate": 3.0175935821720648e-05, "loss": 0.0818, "step": 7017 }, { "epoch": 4.981558148287542, "grad_norm": 0.13793905079364777, "learning_rate": 3.016352506766803e-05, "loss": 0.1089, "step": 7018 }, { "epoch": 4.982310876928867, "grad_norm": 0.1565445214509964, "learning_rate": 3.0151117029694986e-05, "loss": 0.1301, "step": 7019 }, { "epoch": 4.983063605570192, "grad_norm": 0.16261804103851318, "learning_rate": 3.0138711709158406e-05, "loss": 0.0992, "step": 7020 }, { "epoch": 4.983816334211516, "grad_norm": 0.18329647183418274, "learning_rate": 3.0126309107414963e-05, "loss": 0.1954, "step": 7021 }, { "epoch": 4.984569062852842, "grad_norm": 0.15827351808547974, "learning_rate": 3.0113909225820936e-05, "loss": 0.0853, "step": 7022 }, { "epoch": 4.985321791494166, "grad_norm": 0.13918112218379974, "learning_rate": 3.0101512065732375e-05, "loss": 0.0691, "step": 7023 }, { "epoch": 4.9860745201354915, "grad_norm": 0.1717829704284668, "learning_rate": 3.0089117628505026e-05, "loss": 0.1433, "step": 7024 }, { "epoch": 4.986827248776816, "grad_norm": 0.1375225931406021, "learning_rate": 3.0076725915494342e-05, "loss": 0.1061, "step": 7025 }, { "epoch": 4.98757997741814, "grad_norm": 0.14804361760616302, "learning_rate": 3.0064336928055414e-05, "loss": 0.1204, "step": 7026 }, { "epoch": 4.988332706059466, "grad_norm": 0.1463770717382431, "learning_rate": 3.0051950667543118e-05, "loss": 0.0813, "step": 7027 }, { "epoch": 4.98908543470079, "grad_norm": 0.126630499958992, "learning_rate": 3.0039567135311998e-05, "loss": 0.1145, "step": 7028 }, { "epoch": 4.989838163342116, "grad_norm": 0.15941543877124786, "learning_rate": 3.0027186332716274e-05, "loss": 0.0951, "step": 7029 }, { "epoch": 4.99059089198344, "grad_norm": 0.14235493540763855, "learning_rate": 3.00148082611099e-05, "loss": 0.1523, "step": 7030 }, { "epoch": 4.9913436206247646, "grad_norm": 0.15622974932193756, "learning_rate": 3.0002432921846536e-05, "loss": 0.0974, "step": 7031 }, { "epoch": 4.99209634926609, "grad_norm": 0.15929274260997772, "learning_rate": 2.9990060316279544e-05, "loss": 0.1807, "step": 7032 }, { "epoch": 4.992849077907414, "grad_norm": 0.16486269235610962, "learning_rate": 2.9977690445761936e-05, "loss": 0.1532, "step": 7033 }, { "epoch": 4.993601806548739, "grad_norm": 0.14746595919132233, "learning_rate": 2.9965323311646493e-05, "loss": 0.1297, "step": 7034 }, { "epoch": 4.994354535190064, "grad_norm": 0.19972771406173706, "learning_rate": 2.9952958915285633e-05, "loss": 0.1742, "step": 7035 }, { "epoch": 4.995107263831389, "grad_norm": 0.1575494259595871, "learning_rate": 2.994059725803156e-05, "loss": 0.1465, "step": 7036 }, { "epoch": 4.995859992472713, "grad_norm": 0.14110711216926575, "learning_rate": 2.992823834123605e-05, "loss": 0.0678, "step": 7037 }, { "epoch": 4.9966127211140385, "grad_norm": 0.17959220707416534, "learning_rate": 2.991588216625074e-05, "loss": 0.1173, "step": 7038 }, { "epoch": 4.997365449755363, "grad_norm": 0.1842116117477417, "learning_rate": 2.9903528734426818e-05, "loss": 0.1552, "step": 7039 }, { "epoch": 4.998118178396688, "grad_norm": 0.15804138779640198, "learning_rate": 2.9891178047115285e-05, "loss": 0.137, "step": 7040 }, { "epoch": 4.998870907038013, "grad_norm": 0.16885307431221008, "learning_rate": 2.9878830105666755e-05, "loss": 0.1577, "step": 7041 }, { "epoch": 4.999623635679337, "grad_norm": 0.13311000168323517, "learning_rate": 2.9866484911431604e-05, "loss": 0.1113, "step": 7042 }, { "epoch": 5.000376364320663, "grad_norm": 0.17322903871536255, "learning_rate": 2.9854142465759844e-05, "loss": 0.1486, "step": 7043 }, { "epoch": 5.001129092961987, "grad_norm": 0.14564268290996552, "learning_rate": 2.98418027700013e-05, "loss": 0.1663, "step": 7044 }, { "epoch": 5.001881821603312, "grad_norm": 0.17054383456707, "learning_rate": 2.9829465825505355e-05, "loss": 0.2, "step": 7045 }, { "epoch": 5.002634550244637, "grad_norm": 0.16507838666439056, "learning_rate": 2.9817131633621198e-05, "loss": 0.0905, "step": 7046 }, { "epoch": 5.0033872788859615, "grad_norm": 0.1399974226951599, "learning_rate": 2.9804800195697658e-05, "loss": 0.0916, "step": 7047 }, { "epoch": 5.004140007527287, "grad_norm": 0.16654035449028015, "learning_rate": 2.979247151308328e-05, "loss": 0.084, "step": 7048 }, { "epoch": 5.004892736168611, "grad_norm": 0.15545068681240082, "learning_rate": 2.9780145587126318e-05, "loss": 0.0949, "step": 7049 }, { "epoch": 5.005645464809936, "grad_norm": 0.14549799263477325, "learning_rate": 2.9767822419174735e-05, "loss": 0.0716, "step": 7050 }, { "epoch": 5.006398193451261, "grad_norm": 0.14292076230049133, "learning_rate": 2.975550201057614e-05, "loss": 0.0823, "step": 7051 }, { "epoch": 5.007150922092586, "grad_norm": 0.14863218367099762, "learning_rate": 2.9743184362677895e-05, "loss": 0.1361, "step": 7052 }, { "epoch": 5.00790365073391, "grad_norm": 0.17119742929935455, "learning_rate": 2.973086947682705e-05, "loss": 0.0886, "step": 7053 }, { "epoch": 5.0086563793752354, "grad_norm": 0.18176765739917755, "learning_rate": 2.9718557354370304e-05, "loss": 0.1455, "step": 7054 }, { "epoch": 5.00940910801656, "grad_norm": 0.13634295761585236, "learning_rate": 2.9706247996654137e-05, "loss": 0.0914, "step": 7055 }, { "epoch": 5.010161836657884, "grad_norm": 0.160157710313797, "learning_rate": 2.969394140502463e-05, "loss": 0.1401, "step": 7056 }, { "epoch": 5.01091456529921, "grad_norm": 0.13451270759105682, "learning_rate": 2.968163758082767e-05, "loss": 0.0892, "step": 7057 }, { "epoch": 5.011667293940534, "grad_norm": 0.13328367471694946, "learning_rate": 2.9669336525408753e-05, "loss": 0.0838, "step": 7058 }, { "epoch": 5.01242002258186, "grad_norm": 0.15515917539596558, "learning_rate": 2.9657038240113124e-05, "loss": 0.1911, "step": 7059 }, { "epoch": 5.013172751223184, "grad_norm": 0.1534973382949829, "learning_rate": 2.9644742726285662e-05, "loss": 0.1045, "step": 7060 }, { "epoch": 5.0139254798645085, "grad_norm": 0.17394328117370605, "learning_rate": 2.9632449985271037e-05, "loss": 0.1391, "step": 7061 }, { "epoch": 5.014678208505834, "grad_norm": 0.15901686251163483, "learning_rate": 2.9620160018413512e-05, "loss": 0.1185, "step": 7062 }, { "epoch": 5.015430937147158, "grad_norm": 0.13061872124671936, "learning_rate": 2.960787282705716e-05, "loss": 0.0557, "step": 7063 }, { "epoch": 5.016183665788483, "grad_norm": 0.1504867821931839, "learning_rate": 2.9595588412545644e-05, "loss": 0.1093, "step": 7064 }, { "epoch": 5.016936394429808, "grad_norm": 0.16193073987960815, "learning_rate": 2.9583306776222397e-05, "loss": 0.1097, "step": 7065 }, { "epoch": 5.017689123071133, "grad_norm": 0.14607234299182892, "learning_rate": 2.9571027919430496e-05, "loss": 0.173, "step": 7066 }, { "epoch": 5.018441851712458, "grad_norm": 0.13932549953460693, "learning_rate": 2.9558751843512768e-05, "loss": 0.1288, "step": 7067 }, { "epoch": 5.0191945803537825, "grad_norm": 0.17860187590122223, "learning_rate": 2.954647854981165e-05, "loss": 0.0981, "step": 7068 }, { "epoch": 5.019947308995107, "grad_norm": 0.15166261792182922, "learning_rate": 2.9534208039669403e-05, "loss": 0.1097, "step": 7069 }, { "epoch": 5.020700037636432, "grad_norm": 0.136325865983963, "learning_rate": 2.9521940314427865e-05, "loss": 0.0954, "step": 7070 }, { "epoch": 5.021452766277757, "grad_norm": 0.1493123471736908, "learning_rate": 2.9509675375428636e-05, "loss": 0.1217, "step": 7071 }, { "epoch": 5.022205494919081, "grad_norm": 0.14670880138874054, "learning_rate": 2.9497413224012976e-05, "loss": 0.1117, "step": 7072 }, { "epoch": 5.022958223560407, "grad_norm": 0.14565347135066986, "learning_rate": 2.9485153861521857e-05, "loss": 0.0838, "step": 7073 }, { "epoch": 5.023710952201731, "grad_norm": 0.14042901992797852, "learning_rate": 2.947289728929597e-05, "loss": 0.1786, "step": 7074 }, { "epoch": 5.0244636808430565, "grad_norm": 0.13235658407211304, "learning_rate": 2.9460643508675646e-05, "loss": 0.0857, "step": 7075 }, { "epoch": 5.025216409484381, "grad_norm": 0.17983263731002808, "learning_rate": 2.944839252100094e-05, "loss": 0.1021, "step": 7076 }, { "epoch": 5.0259691381257054, "grad_norm": 0.13668864965438843, "learning_rate": 2.9436144327611613e-05, "loss": 0.0768, "step": 7077 }, { "epoch": 5.026721866767031, "grad_norm": 0.16095824539661407, "learning_rate": 2.9423898929847127e-05, "loss": 0.166, "step": 7078 }, { "epoch": 5.027474595408355, "grad_norm": 0.15043413639068604, "learning_rate": 2.941165632904659e-05, "loss": 0.0546, "step": 7079 }, { "epoch": 5.02822732404968, "grad_norm": 0.13431133329868317, "learning_rate": 2.9399416526548852e-05, "loss": 0.1135, "step": 7080 }, { "epoch": 5.028980052691005, "grad_norm": 0.14765144884586334, "learning_rate": 2.9387179523692405e-05, "loss": 0.1062, "step": 7081 }, { "epoch": 5.02973278133233, "grad_norm": 0.15602032840251923, "learning_rate": 2.9374945321815527e-05, "loss": 0.0952, "step": 7082 }, { "epoch": 5.030485509973654, "grad_norm": 0.16782937943935394, "learning_rate": 2.9362713922256085e-05, "loss": 0.1106, "step": 7083 }, { "epoch": 5.031238238614979, "grad_norm": 0.16191545128822327, "learning_rate": 2.9350485326351717e-05, "loss": 0.103, "step": 7084 }, { "epoch": 5.031990967256304, "grad_norm": 0.15845079720020294, "learning_rate": 2.9338259535439705e-05, "loss": 0.1615, "step": 7085 }, { "epoch": 5.032743695897629, "grad_norm": 0.1668773889541626, "learning_rate": 2.932603655085706e-05, "loss": 0.123, "step": 7086 }, { "epoch": 5.033496424538954, "grad_norm": 0.15571917593479156, "learning_rate": 2.9313816373940415e-05, "loss": 0.1541, "step": 7087 }, { "epoch": 5.034249153180278, "grad_norm": 0.1459919959306717, "learning_rate": 2.9301599006026238e-05, "loss": 0.0761, "step": 7088 }, { "epoch": 5.035001881821604, "grad_norm": 0.1688573658466339, "learning_rate": 2.928938444845054e-05, "loss": 0.1361, "step": 7089 }, { "epoch": 5.035754610462928, "grad_norm": 0.13540273904800415, "learning_rate": 2.9277172702549115e-05, "loss": 0.0939, "step": 7090 }, { "epoch": 5.0365073391042525, "grad_norm": 0.15993830561637878, "learning_rate": 2.9264963769657393e-05, "loss": 0.0851, "step": 7091 }, { "epoch": 5.037260067745578, "grad_norm": 0.16207779943943024, "learning_rate": 2.9252757651110562e-05, "loss": 0.1448, "step": 7092 }, { "epoch": 5.038012796386902, "grad_norm": 0.13398078083992004, "learning_rate": 2.924055434824342e-05, "loss": 0.0849, "step": 7093 }, { "epoch": 5.038765525028228, "grad_norm": 0.20409195125102997, "learning_rate": 2.922835386239053e-05, "loss": 0.144, "step": 7094 }, { "epoch": 5.039518253669552, "grad_norm": 0.15551216900348663, "learning_rate": 2.9216156194886113e-05, "loss": 0.1344, "step": 7095 }, { "epoch": 5.040270982310877, "grad_norm": 0.15646420419216156, "learning_rate": 2.9203961347064114e-05, "loss": 0.1004, "step": 7096 }, { "epoch": 5.041023710952202, "grad_norm": 0.14065103232860565, "learning_rate": 2.9191769320258088e-05, "loss": 0.1094, "step": 7097 }, { "epoch": 5.0417764395935265, "grad_norm": 0.16595567762851715, "learning_rate": 2.9179580115801374e-05, "loss": 0.129, "step": 7098 }, { "epoch": 5.042529168234851, "grad_norm": 0.14509844779968262, "learning_rate": 2.9167393735026977e-05, "loss": 0.0697, "step": 7099 }, { "epoch": 5.043281896876176, "grad_norm": 0.14138662815093994, "learning_rate": 2.9155210179267546e-05, "loss": 0.0893, "step": 7100 }, { "epoch": 5.044034625517501, "grad_norm": 0.18116554617881775, "learning_rate": 2.9143029449855463e-05, "loss": 0.161, "step": 7101 }, { "epoch": 5.044787354158826, "grad_norm": 0.14873042702674866, "learning_rate": 2.913085154812281e-05, "loss": 0.1225, "step": 7102 }, { "epoch": 5.045540082800151, "grad_norm": 0.1468791514635086, "learning_rate": 2.9118676475401362e-05, "loss": 0.0972, "step": 7103 }, { "epoch": 5.046292811441475, "grad_norm": 0.1499404013156891, "learning_rate": 2.9106504233022512e-05, "loss": 0.0888, "step": 7104 }, { "epoch": 5.0470455400828005, "grad_norm": 0.173076331615448, "learning_rate": 2.909433482231746e-05, "loss": 0.173, "step": 7105 }, { "epoch": 5.047798268724125, "grad_norm": 0.14347650110721588, "learning_rate": 2.9082168244616976e-05, "loss": 0.1985, "step": 7106 }, { "epoch": 5.048550997365449, "grad_norm": 0.1895107626914978, "learning_rate": 2.9070004501251613e-05, "loss": 0.1392, "step": 7107 }, { "epoch": 5.049303726006775, "grad_norm": 0.180327907204628, "learning_rate": 2.905784359355157e-05, "loss": 0.2221, "step": 7108 }, { "epoch": 5.050056454648099, "grad_norm": 0.15450634062290192, "learning_rate": 2.9045685522846765e-05, "loss": 0.0817, "step": 7109 }, { "epoch": 5.050809183289424, "grad_norm": 0.1756366342306137, "learning_rate": 2.903353029046675e-05, "loss": 0.1438, "step": 7110 }, { "epoch": 5.051561911930749, "grad_norm": 0.14995943009853363, "learning_rate": 2.9021377897740836e-05, "loss": 0.1236, "step": 7111 }, { "epoch": 5.052314640572074, "grad_norm": 0.1629970669746399, "learning_rate": 2.900922834599797e-05, "loss": 0.1427, "step": 7112 }, { "epoch": 5.053067369213399, "grad_norm": 0.1628483384847641, "learning_rate": 2.8997081636566804e-05, "loss": 0.1655, "step": 7113 }, { "epoch": 5.053820097854723, "grad_norm": 0.14910314977169037, "learning_rate": 2.898493777077571e-05, "loss": 0.1097, "step": 7114 }, { "epoch": 5.054572826496048, "grad_norm": 0.13490867614746094, "learning_rate": 2.8972796749952713e-05, "loss": 0.0974, "step": 7115 }, { "epoch": 5.055325555137373, "grad_norm": 0.13946200907230377, "learning_rate": 2.8960658575425525e-05, "loss": 0.0826, "step": 7116 }, { "epoch": 5.056078283778698, "grad_norm": 0.17798039317131042, "learning_rate": 2.894852324852157e-05, "loss": 0.1843, "step": 7117 }, { "epoch": 5.056831012420022, "grad_norm": 0.17447108030319214, "learning_rate": 2.8936390770567935e-05, "loss": 0.1431, "step": 7118 }, { "epoch": 5.057583741061348, "grad_norm": 0.15291059017181396, "learning_rate": 2.892426114289142e-05, "loss": 0.1669, "step": 7119 }, { "epoch": 5.058336469702672, "grad_norm": 0.152509406208992, "learning_rate": 2.89121343668185e-05, "loss": 0.1258, "step": 7120 }, { "epoch": 5.059089198343997, "grad_norm": 0.16710606217384338, "learning_rate": 2.890001044367536e-05, "loss": 0.1505, "step": 7121 }, { "epoch": 5.059841926985322, "grad_norm": 0.15008066594600677, "learning_rate": 2.8887889374787825e-05, "loss": 0.1586, "step": 7122 }, { "epoch": 5.060594655626646, "grad_norm": 0.1463220715522766, "learning_rate": 2.8875771161481447e-05, "loss": 0.1398, "step": 7123 }, { "epoch": 5.061347384267972, "grad_norm": 0.14572590589523315, "learning_rate": 2.886365580508147e-05, "loss": 0.1867, "step": 7124 }, { "epoch": 5.062100112909296, "grad_norm": 0.15580280125141144, "learning_rate": 2.885154330691278e-05, "loss": 0.0887, "step": 7125 }, { "epoch": 5.062852841550621, "grad_norm": 0.16969838738441467, "learning_rate": 2.8839433668300007e-05, "loss": 0.0937, "step": 7126 }, { "epoch": 5.063605570191946, "grad_norm": 0.1598753184080124, "learning_rate": 2.8827326890567434e-05, "loss": 0.1635, "step": 7127 }, { "epoch": 5.0643582988332705, "grad_norm": 0.16170206665992737, "learning_rate": 2.8815222975039058e-05, "loss": 0.1477, "step": 7128 }, { "epoch": 5.065111027474595, "grad_norm": 0.1371079683303833, "learning_rate": 2.8803121923038518e-05, "loss": 0.0523, "step": 7129 }, { "epoch": 5.06586375611592, "grad_norm": 0.16972002387046814, "learning_rate": 2.879102373588919e-05, "loss": 0.1348, "step": 7130 }, { "epoch": 5.066616484757245, "grad_norm": 0.14911295473575592, "learning_rate": 2.8778928414914085e-05, "loss": 0.1185, "step": 7131 }, { "epoch": 5.06736921339857, "grad_norm": 0.16433517634868622, "learning_rate": 2.8766835961435944e-05, "loss": 0.1057, "step": 7132 }, { "epoch": 5.068121942039895, "grad_norm": 0.14667950570583344, "learning_rate": 2.8754746376777186e-05, "loss": 0.0836, "step": 7133 }, { "epoch": 5.068874670681219, "grad_norm": 0.15199579298496246, "learning_rate": 2.874265966225992e-05, "loss": 0.0829, "step": 7134 }, { "epoch": 5.0696273993225445, "grad_norm": 0.16677331924438477, "learning_rate": 2.8730575819205896e-05, "loss": 0.0844, "step": 7135 }, { "epoch": 5.070380127963869, "grad_norm": 0.1655752956867218, "learning_rate": 2.8718494848936623e-05, "loss": 0.0718, "step": 7136 }, { "epoch": 5.071132856605193, "grad_norm": 0.17383252084255219, "learning_rate": 2.8706416752773224e-05, "loss": 0.1573, "step": 7137 }, { "epoch": 5.071885585246519, "grad_norm": 0.1536787748336792, "learning_rate": 2.8694341532036562e-05, "loss": 0.1473, "step": 7138 }, { "epoch": 5.072638313887843, "grad_norm": 0.16692304611206055, "learning_rate": 2.8682269188047152e-05, "loss": 0.116, "step": 7139 }, { "epoch": 5.073391042529169, "grad_norm": 0.15328215062618256, "learning_rate": 2.8670199722125236e-05, "loss": 0.084, "step": 7140 }, { "epoch": 5.074143771170493, "grad_norm": 0.1446951925754547, "learning_rate": 2.8658133135590676e-05, "loss": 0.1376, "step": 7141 }, { "epoch": 5.074896499811818, "grad_norm": 0.14997118711471558, "learning_rate": 2.864606942976309e-05, "loss": 0.1251, "step": 7142 }, { "epoch": 5.075649228453143, "grad_norm": 0.17661145329475403, "learning_rate": 2.8634008605961705e-05, "loss": 0.1228, "step": 7143 }, { "epoch": 5.076401957094467, "grad_norm": 0.17061376571655273, "learning_rate": 2.8621950665505504e-05, "loss": 0.1365, "step": 7144 }, { "epoch": 5.077154685735792, "grad_norm": 0.1398148089647293, "learning_rate": 2.860989560971312e-05, "loss": 0.109, "step": 7145 }, { "epoch": 5.077907414377117, "grad_norm": 0.13908705115318298, "learning_rate": 2.859784343990289e-05, "loss": 0.0971, "step": 7146 }, { "epoch": 5.078660143018442, "grad_norm": 0.14806890487670898, "learning_rate": 2.8585794157392786e-05, "loss": 0.1074, "step": 7147 }, { "epoch": 5.079412871659767, "grad_norm": 0.1391461044549942, "learning_rate": 2.8573747763500526e-05, "loss": 0.094, "step": 7148 }, { "epoch": 5.0801656003010915, "grad_norm": 0.13642765581607819, "learning_rate": 2.8561704259543488e-05, "loss": 0.1528, "step": 7149 }, { "epoch": 5.080918328942416, "grad_norm": 0.14263252913951874, "learning_rate": 2.854966364683872e-05, "loss": 0.0526, "step": 7150 }, { "epoch": 5.081671057583741, "grad_norm": 0.16660597920417786, "learning_rate": 2.853762592670295e-05, "loss": 0.1784, "step": 7151 }, { "epoch": 5.082423786225066, "grad_norm": 0.18424829840660095, "learning_rate": 2.8525591100452638e-05, "loss": 0.1109, "step": 7152 }, { "epoch": 5.08317651486639, "grad_norm": 0.1748976707458496, "learning_rate": 2.851355916940389e-05, "loss": 0.1349, "step": 7153 }, { "epoch": 5.083929243507716, "grad_norm": 0.15457551181316376, "learning_rate": 2.8501530134872474e-05, "loss": 0.0781, "step": 7154 }, { "epoch": 5.08468197214904, "grad_norm": 0.1247653141617775, "learning_rate": 2.84895039981739e-05, "loss": 0.0429, "step": 7155 }, { "epoch": 5.085434700790365, "grad_norm": 0.14887423813343048, "learning_rate": 2.8477480760623293e-05, "loss": 0.0551, "step": 7156 }, { "epoch": 5.08618742943169, "grad_norm": 0.13176922500133514, "learning_rate": 2.846546042353553e-05, "loss": 0.0938, "step": 7157 }, { "epoch": 5.0869401580730145, "grad_norm": 0.15432384610176086, "learning_rate": 2.845344298822509e-05, "loss": 0.0785, "step": 7158 }, { "epoch": 5.08769288671434, "grad_norm": 0.1443355828523636, "learning_rate": 2.8441428456006247e-05, "loss": 0.1426, "step": 7159 }, { "epoch": 5.088445615355664, "grad_norm": 0.14869101345539093, "learning_rate": 2.8429416828192834e-05, "loss": 0.0561, "step": 7160 }, { "epoch": 5.089198343996989, "grad_norm": 0.14932632446289062, "learning_rate": 2.8417408106098465e-05, "loss": 0.0865, "step": 7161 }, { "epoch": 5.089951072638314, "grad_norm": 0.2125512957572937, "learning_rate": 2.8405402291036377e-05, "loss": 0.1122, "step": 7162 }, { "epoch": 5.090703801279639, "grad_norm": 0.17668336629867554, "learning_rate": 2.8393399384319508e-05, "loss": 0.1292, "step": 7163 }, { "epoch": 5.091456529920963, "grad_norm": 0.17535677552223206, "learning_rate": 2.838139938726046e-05, "loss": 0.1378, "step": 7164 }, { "epoch": 5.0922092585622885, "grad_norm": 0.13209329545497894, "learning_rate": 2.8369402301171575e-05, "loss": 0.083, "step": 7165 }, { "epoch": 5.092961987203613, "grad_norm": 0.18096771836280823, "learning_rate": 2.8357408127364816e-05, "loss": 0.1149, "step": 7166 }, { "epoch": 5.093714715844938, "grad_norm": 0.14501528441905975, "learning_rate": 2.8345416867151852e-05, "loss": 0.0696, "step": 7167 }, { "epoch": 5.094467444486263, "grad_norm": 0.16403120756149292, "learning_rate": 2.8333428521844007e-05, "loss": 0.1204, "step": 7168 }, { "epoch": 5.095220173127587, "grad_norm": 0.16623565554618835, "learning_rate": 2.8321443092752338e-05, "loss": 0.1092, "step": 7169 }, { "epoch": 5.095972901768913, "grad_norm": 0.15796664357185364, "learning_rate": 2.830946058118753e-05, "loss": 0.0664, "step": 7170 }, { "epoch": 5.096725630410237, "grad_norm": 0.16837149858474731, "learning_rate": 2.8297480988460012e-05, "loss": 0.0975, "step": 7171 }, { "epoch": 5.0974783590515615, "grad_norm": 0.14416061341762543, "learning_rate": 2.8285504315879806e-05, "loss": 0.1077, "step": 7172 }, { "epoch": 5.098231087692887, "grad_norm": 0.14509513974189758, "learning_rate": 2.8273530564756678e-05, "loss": 0.0823, "step": 7173 }, { "epoch": 5.098983816334211, "grad_norm": 0.1733979731798172, "learning_rate": 2.8261559736400085e-05, "loss": 0.0934, "step": 7174 }, { "epoch": 5.099736544975537, "grad_norm": 0.17188163101673126, "learning_rate": 2.82495918321191e-05, "loss": 0.1019, "step": 7175 }, { "epoch": 5.100489273616861, "grad_norm": 0.15566492080688477, "learning_rate": 2.8237626853222556e-05, "loss": 0.1267, "step": 7176 }, { "epoch": 5.101242002258186, "grad_norm": 0.15751823782920837, "learning_rate": 2.822566480101886e-05, "loss": 0.1271, "step": 7177 }, { "epoch": 5.101994730899511, "grad_norm": 0.17700576782226562, "learning_rate": 2.821370567681625e-05, "loss": 0.2347, "step": 7178 }, { "epoch": 5.1027474595408355, "grad_norm": 0.16089574992656708, "learning_rate": 2.820174948192249e-05, "loss": 0.1106, "step": 7179 }, { "epoch": 5.10350018818216, "grad_norm": 0.1474921554327011, "learning_rate": 2.8189796217645138e-05, "loss": 0.1059, "step": 7180 }, { "epoch": 5.104252916823485, "grad_norm": 0.16502755880355835, "learning_rate": 2.817784588529134e-05, "loss": 0.1224, "step": 7181 }, { "epoch": 5.10500564546481, "grad_norm": 0.1847693771123886, "learning_rate": 2.8165898486168006e-05, "loss": 0.1662, "step": 7182 }, { "epoch": 5.105758374106134, "grad_norm": 0.1559734344482422, "learning_rate": 2.8153954021581647e-05, "loss": 0.1193, "step": 7183 }, { "epoch": 5.10651110274746, "grad_norm": 0.1699748933315277, "learning_rate": 2.8142012492838536e-05, "loss": 0.1053, "step": 7184 }, { "epoch": 5.107263831388784, "grad_norm": 0.15713094174861908, "learning_rate": 2.8130073901244546e-05, "loss": 0.1418, "step": 7185 }, { "epoch": 5.1080165600301095, "grad_norm": 0.17868471145629883, "learning_rate": 2.8118138248105296e-05, "loss": 0.1074, "step": 7186 }, { "epoch": 5.108769288671434, "grad_norm": 0.1747101992368698, "learning_rate": 2.8106205534726003e-05, "loss": 0.1455, "step": 7187 }, { "epoch": 5.1095220173127585, "grad_norm": 0.1609479784965515, "learning_rate": 2.809427576241167e-05, "loss": 0.1084, "step": 7188 }, { "epoch": 5.110274745954084, "grad_norm": 0.16136698424816132, "learning_rate": 2.8082348932466847e-05, "loss": 0.117, "step": 7189 }, { "epoch": 5.111027474595408, "grad_norm": 0.15552926063537598, "learning_rate": 2.807042504619591e-05, "loss": 0.0699, "step": 7190 }, { "epoch": 5.111780203236733, "grad_norm": 0.15043744444847107, "learning_rate": 2.8058504104902784e-05, "loss": 0.1642, "step": 7191 }, { "epoch": 5.112532931878058, "grad_norm": 0.15840116143226624, "learning_rate": 2.804658610989116e-05, "loss": 0.1137, "step": 7192 }, { "epoch": 5.113285660519383, "grad_norm": 0.17163196206092834, "learning_rate": 2.8034671062464335e-05, "loss": 0.1622, "step": 7193 }, { "epoch": 5.114038389160708, "grad_norm": 0.13865703344345093, "learning_rate": 2.8022758963925338e-05, "loss": 0.0684, "step": 7194 }, { "epoch": 5.114791117802032, "grad_norm": 0.1859186589717865, "learning_rate": 2.8010849815576888e-05, "loss": 0.0942, "step": 7195 }, { "epoch": 5.115543846443357, "grad_norm": 0.13439764082431793, "learning_rate": 2.7998943618721303e-05, "loss": 0.0891, "step": 7196 }, { "epoch": 5.116296575084682, "grad_norm": 0.17668946087360382, "learning_rate": 2.7987040374660644e-05, "loss": 0.1145, "step": 7197 }, { "epoch": 5.117049303726007, "grad_norm": 0.1555139720439911, "learning_rate": 2.797514008469664e-05, "loss": 0.1271, "step": 7198 }, { "epoch": 5.117802032367331, "grad_norm": 0.15773636102676392, "learning_rate": 2.796324275013071e-05, "loss": 0.146, "step": 7199 }, { "epoch": 5.118554761008657, "grad_norm": 0.18611769378185272, "learning_rate": 2.7951348372263875e-05, "loss": 0.1228, "step": 7200 }, { "epoch": 5.118554761008657, "eval_loss": 0.16159974038600922, "eval_runtime": 456.5992, "eval_samples_per_second": 21.084, "eval_steps_per_second": 0.659, "step": 7200 }, { "epoch": 5.119307489649981, "grad_norm": 0.157170832157135, "learning_rate": 2.7939456952396936e-05, "loss": 0.1117, "step": 7201 }, { "epoch": 5.120060218291306, "grad_norm": 0.15518218278884888, "learning_rate": 2.792756849183027e-05, "loss": 0.1119, "step": 7202 }, { "epoch": 5.120812946932631, "grad_norm": 0.13598978519439697, "learning_rate": 2.7915682991864033e-05, "loss": 0.0856, "step": 7203 }, { "epoch": 5.121565675573955, "grad_norm": 0.14310310781002045, "learning_rate": 2.7903800453797978e-05, "loss": 0.0965, "step": 7204 }, { "epoch": 5.122318404215281, "grad_norm": 0.12229315936565399, "learning_rate": 2.7891920878931576e-05, "loss": 0.0597, "step": 7205 }, { "epoch": 5.123071132856605, "grad_norm": 0.1556524783372879, "learning_rate": 2.7880044268563943e-05, "loss": 0.1303, "step": 7206 }, { "epoch": 5.12382386149793, "grad_norm": 0.15365192294120789, "learning_rate": 2.7868170623993905e-05, "loss": 0.086, "step": 7207 }, { "epoch": 5.124576590139255, "grad_norm": 0.1752943992614746, "learning_rate": 2.7856299946519905e-05, "loss": 0.1925, "step": 7208 }, { "epoch": 5.1253293187805795, "grad_norm": 0.13933652639389038, "learning_rate": 2.784443223744016e-05, "loss": 0.0768, "step": 7209 }, { "epoch": 5.126082047421904, "grad_norm": 0.1552446335554123, "learning_rate": 2.7832567498052464e-05, "loss": 0.1486, "step": 7210 }, { "epoch": 5.126834776063229, "grad_norm": 0.13136501610279083, "learning_rate": 2.782070572965436e-05, "loss": 0.1185, "step": 7211 }, { "epoch": 5.127587504704554, "grad_norm": 0.16151641309261322, "learning_rate": 2.7808846933542997e-05, "loss": 0.0771, "step": 7212 }, { "epoch": 5.128340233345879, "grad_norm": 0.1450287252664566, "learning_rate": 2.779699111101527e-05, "loss": 0.0704, "step": 7213 }, { "epoch": 5.129092961987204, "grad_norm": 0.1355522722005844, "learning_rate": 2.7785138263367673e-05, "loss": 0.0933, "step": 7214 }, { "epoch": 5.129845690628528, "grad_norm": 0.16983261704444885, "learning_rate": 2.7773288391896447e-05, "loss": 0.1376, "step": 7215 }, { "epoch": 5.1305984192698535, "grad_norm": 0.15673278272151947, "learning_rate": 2.7761441497897463e-05, "loss": 0.1182, "step": 7216 }, { "epoch": 5.131351147911178, "grad_norm": 0.16346175968647003, "learning_rate": 2.7749597582666298e-05, "loss": 0.164, "step": 7217 }, { "epoch": 5.132103876552502, "grad_norm": 0.13900041580200195, "learning_rate": 2.773775664749816e-05, "loss": 0.0798, "step": 7218 }, { "epoch": 5.132856605193828, "grad_norm": 0.14483939111232758, "learning_rate": 2.7725918693687963e-05, "loss": 0.0898, "step": 7219 }, { "epoch": 5.133609333835152, "grad_norm": 0.15938611328601837, "learning_rate": 2.7714083722530305e-05, "loss": 0.1612, "step": 7220 }, { "epoch": 5.134362062476478, "grad_norm": 0.15156422555446625, "learning_rate": 2.770225173531942e-05, "loss": 0.0929, "step": 7221 }, { "epoch": 5.135114791117802, "grad_norm": 0.17150942981243134, "learning_rate": 2.7690422733349235e-05, "loss": 0.1921, "step": 7222 }, { "epoch": 5.135867519759127, "grad_norm": 0.13095104694366455, "learning_rate": 2.7678596717913368e-05, "loss": 0.1051, "step": 7223 }, { "epoch": 5.136620248400452, "grad_norm": 0.16740241646766663, "learning_rate": 2.7666773690305103e-05, "loss": 0.1407, "step": 7224 }, { "epoch": 5.137372977041776, "grad_norm": 0.13445648550987244, "learning_rate": 2.765495365181735e-05, "loss": 0.1242, "step": 7225 }, { "epoch": 5.138125705683101, "grad_norm": 0.162644162774086, "learning_rate": 2.764313660374277e-05, "loss": 0.1241, "step": 7226 }, { "epoch": 5.138878434324426, "grad_norm": 0.16309313476085663, "learning_rate": 2.7631322547373627e-05, "loss": 0.1522, "step": 7227 }, { "epoch": 5.139631162965751, "grad_norm": 0.16139696538448334, "learning_rate": 2.7619511484001903e-05, "loss": 0.1225, "step": 7228 }, { "epoch": 5.140383891607076, "grad_norm": 0.138424813747406, "learning_rate": 2.7607703414919233e-05, "loss": 0.0939, "step": 7229 }, { "epoch": 5.141136620248401, "grad_norm": 0.15374843776226044, "learning_rate": 2.7595898341416954e-05, "loss": 0.1094, "step": 7230 }, { "epoch": 5.141889348889725, "grad_norm": 0.15369683504104614, "learning_rate": 2.7584096264786014e-05, "loss": 0.1088, "step": 7231 }, { "epoch": 5.14264207753105, "grad_norm": 0.1462237536907196, "learning_rate": 2.7572297186317103e-05, "loss": 0.0635, "step": 7232 }, { "epoch": 5.143394806172375, "grad_norm": 0.28469228744506836, "learning_rate": 2.756050110730052e-05, "loss": 0.1604, "step": 7233 }, { "epoch": 5.144147534813699, "grad_norm": 0.1467081904411316, "learning_rate": 2.7548708029026282e-05, "loss": 0.0637, "step": 7234 }, { "epoch": 5.144900263455025, "grad_norm": 0.1487559676170349, "learning_rate": 2.753691795278406e-05, "loss": 0.0725, "step": 7235 }, { "epoch": 5.145652992096349, "grad_norm": 0.16135238111019135, "learning_rate": 2.7525130879863213e-05, "loss": 0.1673, "step": 7236 }, { "epoch": 5.146405720737674, "grad_norm": 0.10944251716136932, "learning_rate": 2.751334681155273e-05, "loss": 0.0517, "step": 7237 }, { "epoch": 5.147158449378999, "grad_norm": 0.18259170651435852, "learning_rate": 2.7501565749141335e-05, "loss": 0.1131, "step": 7238 }, { "epoch": 5.1479111780203235, "grad_norm": 0.16551470756530762, "learning_rate": 2.7489787693917346e-05, "loss": 0.1322, "step": 7239 }, { "epoch": 5.148663906661649, "grad_norm": 0.15589208900928497, "learning_rate": 2.7478012647168815e-05, "loss": 0.0904, "step": 7240 }, { "epoch": 5.149416635302973, "grad_norm": 0.16041299700737, "learning_rate": 2.7466240610183437e-05, "loss": 0.1684, "step": 7241 }, { "epoch": 5.150169363944298, "grad_norm": 0.16206172108650208, "learning_rate": 2.7454471584248616e-05, "loss": 0.1343, "step": 7242 }, { "epoch": 5.150922092585623, "grad_norm": 0.1545562446117401, "learning_rate": 2.7442705570651346e-05, "loss": 0.1031, "step": 7243 }, { "epoch": 5.151674821226948, "grad_norm": 0.19996368885040283, "learning_rate": 2.7430942570678365e-05, "loss": 0.1193, "step": 7244 }, { "epoch": 5.152427549868272, "grad_norm": 0.1631331741809845, "learning_rate": 2.741918258561607e-05, "loss": 0.1351, "step": 7245 }, { "epoch": 5.1531802785095975, "grad_norm": 0.16984935104846954, "learning_rate": 2.7407425616750482e-05, "loss": 0.0812, "step": 7246 }, { "epoch": 5.153933007150922, "grad_norm": 0.15482854843139648, "learning_rate": 2.739567166536735e-05, "loss": 0.0826, "step": 7247 }, { "epoch": 5.154685735792247, "grad_norm": 0.1823095977306366, "learning_rate": 2.7383920732752057e-05, "loss": 0.1738, "step": 7248 }, { "epoch": 5.155438464433572, "grad_norm": 0.13626250624656677, "learning_rate": 2.737217282018969e-05, "loss": 0.0643, "step": 7249 }, { "epoch": 5.156191193074896, "grad_norm": 0.1395110785961151, "learning_rate": 2.736042792896495e-05, "loss": 0.0825, "step": 7250 }, { "epoch": 5.156943921716222, "grad_norm": 0.1812230497598648, "learning_rate": 2.734868606036227e-05, "loss": 0.14, "step": 7251 }, { "epoch": 5.157696650357546, "grad_norm": 0.15767402946949005, "learning_rate": 2.73369472156657e-05, "loss": 0.1179, "step": 7252 }, { "epoch": 5.158449378998871, "grad_norm": 0.13961626589298248, "learning_rate": 2.732521139615899e-05, "loss": 0.0646, "step": 7253 }, { "epoch": 5.159202107640196, "grad_norm": 0.15086904168128967, "learning_rate": 2.7313478603125543e-05, "loss": 0.1395, "step": 7254 }, { "epoch": 5.15995483628152, "grad_norm": 0.1639776974916458, "learning_rate": 2.7301748837848472e-05, "loss": 0.1293, "step": 7255 }, { "epoch": 5.160707564922845, "grad_norm": 0.18278305232524872, "learning_rate": 2.729002210161049e-05, "loss": 0.1077, "step": 7256 }, { "epoch": 5.16146029356417, "grad_norm": 0.14485681056976318, "learning_rate": 2.7278298395694047e-05, "loss": 0.1104, "step": 7257 }, { "epoch": 5.162213022205495, "grad_norm": 0.1221807673573494, "learning_rate": 2.726657772138119e-05, "loss": 0.0729, "step": 7258 }, { "epoch": 5.16296575084682, "grad_norm": 0.16513945162296295, "learning_rate": 2.7254860079953703e-05, "loss": 0.1121, "step": 7259 }, { "epoch": 5.1637184794881446, "grad_norm": 0.15141747891902924, "learning_rate": 2.724314547269301e-05, "loss": 0.1183, "step": 7260 }, { "epoch": 5.164471208129469, "grad_norm": 0.15176527202129364, "learning_rate": 2.72314339008802e-05, "loss": 0.1092, "step": 7261 }, { "epoch": 5.165223936770794, "grad_norm": 0.14512556791305542, "learning_rate": 2.7219725365796018e-05, "loss": 0.1131, "step": 7262 }, { "epoch": 5.165976665412119, "grad_norm": 0.1615726202726364, "learning_rate": 2.7208019868720917e-05, "loss": 0.1103, "step": 7263 }, { "epoch": 5.166729394053443, "grad_norm": 0.13471852242946625, "learning_rate": 2.7196317410934964e-05, "loss": 0.13, "step": 7264 }, { "epoch": 5.167482122694769, "grad_norm": 0.1944904327392578, "learning_rate": 2.7184617993717936e-05, "loss": 0.2583, "step": 7265 }, { "epoch": 5.168234851336093, "grad_norm": 0.14542147517204285, "learning_rate": 2.7172921618349267e-05, "loss": 0.118, "step": 7266 }, { "epoch": 5.1689875799774185, "grad_norm": 0.13463620841503143, "learning_rate": 2.7161228286108065e-05, "loss": 0.1186, "step": 7267 }, { "epoch": 5.169740308618743, "grad_norm": 0.1869378387928009, "learning_rate": 2.7149537998273068e-05, "loss": 0.1868, "step": 7268 }, { "epoch": 5.1704930372600675, "grad_norm": 0.13109764456748962, "learning_rate": 2.713785075612273e-05, "loss": 0.0693, "step": 7269 }, { "epoch": 5.171245765901393, "grad_norm": 0.1538110375404358, "learning_rate": 2.712616656093515e-05, "loss": 0.1075, "step": 7270 }, { "epoch": 5.171998494542717, "grad_norm": 0.13271823525428772, "learning_rate": 2.7114485413988066e-05, "loss": 0.0937, "step": 7271 }, { "epoch": 5.172751223184042, "grad_norm": 0.20584438741207123, "learning_rate": 2.7102807316558943e-05, "loss": 0.1858, "step": 7272 }, { "epoch": 5.173503951825367, "grad_norm": 0.1456284523010254, "learning_rate": 2.7091132269924867e-05, "loss": 0.1202, "step": 7273 }, { "epoch": 5.174256680466692, "grad_norm": 0.12264365702867508, "learning_rate": 2.7079460275362623e-05, "loss": 0.0548, "step": 7274 }, { "epoch": 5.175009409108017, "grad_norm": 0.1591559648513794, "learning_rate": 2.70677913341486e-05, "loss": 0.1688, "step": 7275 }, { "epoch": 5.1757621377493415, "grad_norm": 0.16535386443138123, "learning_rate": 2.7056125447558943e-05, "loss": 0.1789, "step": 7276 }, { "epoch": 5.176514866390666, "grad_norm": 0.14007754623889923, "learning_rate": 2.704446261686938e-05, "loss": 0.1353, "step": 7277 }, { "epoch": 5.177267595031991, "grad_norm": 0.14374788105487823, "learning_rate": 2.7032802843355352e-05, "loss": 0.095, "step": 7278 }, { "epoch": 5.178020323673316, "grad_norm": 0.156447172164917, "learning_rate": 2.7021146128291963e-05, "loss": 0.0745, "step": 7279 }, { "epoch": 5.17877305231464, "grad_norm": 0.1629287749528885, "learning_rate": 2.700949247295398e-05, "loss": 0.0931, "step": 7280 }, { "epoch": 5.179525780955966, "grad_norm": 0.1380356103181839, "learning_rate": 2.6997841878615797e-05, "loss": 0.0764, "step": 7281 }, { "epoch": 5.18027850959729, "grad_norm": 0.1432749181985855, "learning_rate": 2.6986194346551546e-05, "loss": 0.0347, "step": 7282 }, { "epoch": 5.1810312382386146, "grad_norm": 0.17067937552928925, "learning_rate": 2.697454987803495e-05, "loss": 0.1171, "step": 7283 }, { "epoch": 5.18178396687994, "grad_norm": 0.15342791378498077, "learning_rate": 2.6962908474339447e-05, "loss": 0.0947, "step": 7284 }, { "epoch": 5.182536695521264, "grad_norm": 0.16275940835475922, "learning_rate": 2.69512701367381e-05, "loss": 0.0814, "step": 7285 }, { "epoch": 5.18328942416259, "grad_norm": 0.16392594575881958, "learning_rate": 2.693963486650371e-05, "loss": 0.2108, "step": 7286 }, { "epoch": 5.184042152803914, "grad_norm": 0.1543111503124237, "learning_rate": 2.692800266490865e-05, "loss": 0.2086, "step": 7287 }, { "epoch": 5.184794881445239, "grad_norm": 0.15121135115623474, "learning_rate": 2.6916373533225026e-05, "loss": 0.1366, "step": 7288 }, { "epoch": 5.185547610086564, "grad_norm": 0.15511207282543182, "learning_rate": 2.6904747472724546e-05, "loss": 0.0893, "step": 7289 }, { "epoch": 5.1863003387278885, "grad_norm": 0.16831058263778687, "learning_rate": 2.689312448467865e-05, "loss": 0.1395, "step": 7290 }, { "epoch": 5.187053067369213, "grad_norm": 0.16912120580673218, "learning_rate": 2.6881504570358394e-05, "loss": 0.1344, "step": 7291 }, { "epoch": 5.187805796010538, "grad_norm": 0.14858727157115936, "learning_rate": 2.686988773103455e-05, "loss": 0.0678, "step": 7292 }, { "epoch": 5.188558524651863, "grad_norm": 0.14840678870677948, "learning_rate": 2.685827396797747e-05, "loss": 0.1058, "step": 7293 }, { "epoch": 5.189311253293188, "grad_norm": 0.13972145318984985, "learning_rate": 2.6846663282457235e-05, "loss": 0.1389, "step": 7294 }, { "epoch": 5.190063981934513, "grad_norm": 0.133751779794693, "learning_rate": 2.6835055675743593e-05, "loss": 0.1214, "step": 7295 }, { "epoch": 5.190816710575837, "grad_norm": 0.1611785590648651, "learning_rate": 2.6823451149105904e-05, "loss": 0.0859, "step": 7296 }, { "epoch": 5.1915694392171625, "grad_norm": 0.17554375529289246, "learning_rate": 2.6811849703813253e-05, "loss": 0.1, "step": 7297 }, { "epoch": 5.192322167858487, "grad_norm": 0.14949148893356323, "learning_rate": 2.6800251341134297e-05, "loss": 0.1681, "step": 7298 }, { "epoch": 5.1930748964998115, "grad_norm": 0.1322239637374878, "learning_rate": 2.6788656062337498e-05, "loss": 0.1058, "step": 7299 }, { "epoch": 5.193827625141137, "grad_norm": 0.16880235075950623, "learning_rate": 2.677706386869083e-05, "loss": 0.0666, "step": 7300 }, { "epoch": 5.194580353782461, "grad_norm": 0.13539457321166992, "learning_rate": 2.6765474761462056e-05, "loss": 0.1299, "step": 7301 }, { "epoch": 5.195333082423787, "grad_norm": 0.14348332583904266, "learning_rate": 2.6753888741918488e-05, "loss": 0.1515, "step": 7302 }, { "epoch": 5.196085811065111, "grad_norm": 0.1728876233100891, "learning_rate": 2.6742305811327195e-05, "loss": 0.1188, "step": 7303 }, { "epoch": 5.196838539706436, "grad_norm": 0.14384959638118744, "learning_rate": 2.6730725970954816e-05, "loss": 0.087, "step": 7304 }, { "epoch": 5.197591268347761, "grad_norm": 0.16934189200401306, "learning_rate": 2.6719149222067787e-05, "loss": 0.1728, "step": 7305 }, { "epoch": 5.1983439969890854, "grad_norm": 0.14272604882717133, "learning_rate": 2.6707575565932053e-05, "loss": 0.0907, "step": 7306 }, { "epoch": 5.19909672563041, "grad_norm": 0.15680630505084991, "learning_rate": 2.669600500381334e-05, "loss": 0.1522, "step": 7307 }, { "epoch": 5.199849454271735, "grad_norm": 0.12221116572618484, "learning_rate": 2.6684437536976946e-05, "loss": 0.0641, "step": 7308 }, { "epoch": 5.20060218291306, "grad_norm": 0.15937328338623047, "learning_rate": 2.6672873166687905e-05, "loss": 0.0989, "step": 7309 }, { "epoch": 5.201354911554384, "grad_norm": 0.15216898918151855, "learning_rate": 2.6661311894210838e-05, "loss": 0.084, "step": 7310 }, { "epoch": 5.20210764019571, "grad_norm": 0.14574357867240906, "learning_rate": 2.6649753720810132e-05, "loss": 0.1211, "step": 7311 }, { "epoch": 5.202860368837034, "grad_norm": 0.15528889000415802, "learning_rate": 2.6638198647749713e-05, "loss": 0.0492, "step": 7312 }, { "epoch": 5.203613097478359, "grad_norm": 0.19219733774662018, "learning_rate": 2.6626646676293266e-05, "loss": 0.1153, "step": 7313 }, { "epoch": 5.204365826119684, "grad_norm": 0.15179213881492615, "learning_rate": 2.661509780770406e-05, "loss": 0.0955, "step": 7314 }, { "epoch": 5.205118554761008, "grad_norm": 0.1619102954864502, "learning_rate": 2.6603552043245083e-05, "loss": 0.1296, "step": 7315 }, { "epoch": 5.205871283402334, "grad_norm": 0.13290493190288544, "learning_rate": 2.6592009384178983e-05, "loss": 0.0587, "step": 7316 }, { "epoch": 5.206624012043658, "grad_norm": 0.15267863869667053, "learning_rate": 2.658046983176799e-05, "loss": 0.1291, "step": 7317 }, { "epoch": 5.207376740684983, "grad_norm": 0.1521773636341095, "learning_rate": 2.656893338727412e-05, "loss": 0.1217, "step": 7318 }, { "epoch": 5.208129469326308, "grad_norm": 0.14922891557216644, "learning_rate": 2.6557400051958936e-05, "loss": 0.1085, "step": 7319 }, { "epoch": 5.2088821979676325, "grad_norm": 0.15516528487205505, "learning_rate": 2.654586982708373e-05, "loss": 0.079, "step": 7320 }, { "epoch": 5.209634926608958, "grad_norm": 0.1505041867494583, "learning_rate": 2.65343427139094e-05, "loss": 0.1308, "step": 7321 }, { "epoch": 5.210387655250282, "grad_norm": 0.14219944179058075, "learning_rate": 2.6522818713696584e-05, "loss": 0.141, "step": 7322 }, { "epoch": 5.211140383891607, "grad_norm": 0.14026077091693878, "learning_rate": 2.651129782770545e-05, "loss": 0.0773, "step": 7323 }, { "epoch": 5.211893112532932, "grad_norm": 0.1510370373725891, "learning_rate": 2.6499780057196e-05, "loss": 0.1209, "step": 7324 }, { "epoch": 5.212645841174257, "grad_norm": 0.14144016802310944, "learning_rate": 2.648826540342773e-05, "loss": 0.08, "step": 7325 }, { "epoch": 5.213398569815581, "grad_norm": 0.16962090134620667, "learning_rate": 2.6476753867659902e-05, "loss": 0.1432, "step": 7326 }, { "epoch": 5.2141512984569065, "grad_norm": 0.13496403396129608, "learning_rate": 2.6465245451151376e-05, "loss": 0.0957, "step": 7327 }, { "epoch": 5.214904027098231, "grad_norm": 0.15720544755458832, "learning_rate": 2.6453740155160722e-05, "loss": 0.0995, "step": 7328 }, { "epoch": 5.2156567557395555, "grad_norm": 0.15988244116306305, "learning_rate": 2.6442237980946095e-05, "loss": 0.1166, "step": 7329 }, { "epoch": 5.216409484380881, "grad_norm": 0.1653236448764801, "learning_rate": 2.6430738929765418e-05, "loss": 0.1085, "step": 7330 }, { "epoch": 5.217162213022205, "grad_norm": 0.14246395230293274, "learning_rate": 2.641924300287616e-05, "loss": 0.1096, "step": 7331 }, { "epoch": 5.217914941663531, "grad_norm": 0.15282389521598816, "learning_rate": 2.640775020153553e-05, "loss": 0.0854, "step": 7332 }, { "epoch": 5.218667670304855, "grad_norm": 0.14771245419979095, "learning_rate": 2.639626052700034e-05, "loss": 0.1242, "step": 7333 }, { "epoch": 5.21942039894618, "grad_norm": 0.175876185297966, "learning_rate": 2.638477398052711e-05, "loss": 0.2158, "step": 7334 }, { "epoch": 5.220173127587505, "grad_norm": 0.14037342369556427, "learning_rate": 2.6373290563371953e-05, "loss": 0.0886, "step": 7335 }, { "epoch": 5.220925856228829, "grad_norm": 0.13823463022708893, "learning_rate": 2.63618102767907e-05, "loss": 0.1286, "step": 7336 }, { "epoch": 5.221678584870154, "grad_norm": 0.1423354148864746, "learning_rate": 2.6350333122038824e-05, "loss": 0.0932, "step": 7337 }, { "epoch": 5.222431313511479, "grad_norm": 0.1349625289440155, "learning_rate": 2.6338859100371443e-05, "loss": 0.0394, "step": 7338 }, { "epoch": 5.223184042152804, "grad_norm": 0.18523170053958893, "learning_rate": 2.6327388213043354e-05, "loss": 0.1517, "step": 7339 }, { "epoch": 5.223936770794129, "grad_norm": 0.18416795134544373, "learning_rate": 2.6315920461308964e-05, "loss": 0.1462, "step": 7340 }, { "epoch": 5.224689499435454, "grad_norm": 0.15151414275169373, "learning_rate": 2.6304455846422405e-05, "loss": 0.1548, "step": 7341 }, { "epoch": 5.225442228076778, "grad_norm": 0.14683160185813904, "learning_rate": 2.629299436963738e-05, "loss": 0.0684, "step": 7342 }, { "epoch": 5.226194956718103, "grad_norm": 0.13092096149921417, "learning_rate": 2.628153603220736e-05, "loss": 0.1032, "step": 7343 }, { "epoch": 5.226947685359428, "grad_norm": 0.17003509402275085, "learning_rate": 2.627008083538537e-05, "loss": 0.1889, "step": 7344 }, { "epoch": 5.227700414000752, "grad_norm": 0.14715181291103363, "learning_rate": 2.6258628780424156e-05, "loss": 0.0762, "step": 7345 }, { "epoch": 5.228453142642078, "grad_norm": 0.13507682085037231, "learning_rate": 2.6247179868576076e-05, "loss": 0.1294, "step": 7346 }, { "epoch": 5.229205871283402, "grad_norm": 0.1386154294013977, "learning_rate": 2.6235734101093186e-05, "loss": 0.0498, "step": 7347 }, { "epoch": 5.229958599924728, "grad_norm": 0.1616613268852234, "learning_rate": 2.6224291479227153e-05, "loss": 0.1603, "step": 7348 }, { "epoch": 5.230711328566052, "grad_norm": 0.16248944401741028, "learning_rate": 2.6212852004229344e-05, "loss": 0.1165, "step": 7349 }, { "epoch": 5.2314640572073765, "grad_norm": 0.15052518248558044, "learning_rate": 2.6201415677350754e-05, "loss": 0.1072, "step": 7350 }, { "epoch": 5.232216785848702, "grad_norm": 0.13378046452999115, "learning_rate": 2.618998249984207e-05, "loss": 0.1149, "step": 7351 }, { "epoch": 5.232969514490026, "grad_norm": 0.14718708395957947, "learning_rate": 2.6178552472953564e-05, "loss": 0.099, "step": 7352 }, { "epoch": 5.233722243131351, "grad_norm": 0.15595507621765137, "learning_rate": 2.6167125597935242e-05, "loss": 0.0899, "step": 7353 }, { "epoch": 5.234474971772676, "grad_norm": 0.16663134098052979, "learning_rate": 2.6155701876036704e-05, "loss": 0.1114, "step": 7354 }, { "epoch": 5.235227700414001, "grad_norm": 0.16621175408363342, "learning_rate": 2.6144281308507236e-05, "loss": 0.1544, "step": 7355 }, { "epoch": 5.235980429055326, "grad_norm": 0.14963626861572266, "learning_rate": 2.6132863896595783e-05, "loss": 0.1318, "step": 7356 }, { "epoch": 5.2367331576966505, "grad_norm": 0.15774467587471008, "learning_rate": 2.6121449641550955e-05, "loss": 0.0918, "step": 7357 }, { "epoch": 5.237485886337975, "grad_norm": 0.15514472126960754, "learning_rate": 2.611003854462095e-05, "loss": 0.1275, "step": 7358 }, { "epoch": 5.2382386149793, "grad_norm": 0.1300552785396576, "learning_rate": 2.6098630607053704e-05, "loss": 0.1114, "step": 7359 }, { "epoch": 5.238991343620625, "grad_norm": 0.1536857783794403, "learning_rate": 2.6087225830096772e-05, "loss": 0.0854, "step": 7360 }, { "epoch": 5.239744072261949, "grad_norm": 0.1570596843957901, "learning_rate": 2.607582421499734e-05, "loss": 0.1386, "step": 7361 }, { "epoch": 5.240496800903275, "grad_norm": 0.1720636934041977, "learning_rate": 2.6064425763002287e-05, "loss": 0.18, "step": 7362 }, { "epoch": 5.241249529544599, "grad_norm": 0.1353028565645218, "learning_rate": 2.6053030475358127e-05, "loss": 0.1073, "step": 7363 }, { "epoch": 5.242002258185924, "grad_norm": 0.14189231395721436, "learning_rate": 2.6041638353311048e-05, "loss": 0.1118, "step": 7364 }, { "epoch": 5.242754986827249, "grad_norm": 0.15023921430110931, "learning_rate": 2.6030249398106838e-05, "loss": 0.1352, "step": 7365 }, { "epoch": 5.243507715468573, "grad_norm": 0.1696515530347824, "learning_rate": 2.6018863610991023e-05, "loss": 0.1127, "step": 7366 }, { "epoch": 5.244260444109899, "grad_norm": 0.1601075679063797, "learning_rate": 2.6007480993208692e-05, "loss": 0.1336, "step": 7367 }, { "epoch": 5.245013172751223, "grad_norm": 0.16432428359985352, "learning_rate": 2.5996101546004647e-05, "loss": 0.0936, "step": 7368 }, { "epoch": 5.245765901392548, "grad_norm": 0.15291796624660492, "learning_rate": 2.5984725270623337e-05, "loss": 0.0917, "step": 7369 }, { "epoch": 5.246518630033873, "grad_norm": 0.15400049090385437, "learning_rate": 2.597335216830886e-05, "loss": 0.12, "step": 7370 }, { "epoch": 5.247271358675198, "grad_norm": 0.16664113104343414, "learning_rate": 2.596198224030493e-05, "loss": 0.1124, "step": 7371 }, { "epoch": 5.248024087316522, "grad_norm": 0.15370064973831177, "learning_rate": 2.5950615487854985e-05, "loss": 0.0835, "step": 7372 }, { "epoch": 5.248776815957847, "grad_norm": 0.14772486686706543, "learning_rate": 2.593925191220204e-05, "loss": 0.1048, "step": 7373 }, { "epoch": 5.249529544599172, "grad_norm": 0.12308787554502487, "learning_rate": 2.5927891514588814e-05, "loss": 0.0818, "step": 7374 }, { "epoch": 5.250282273240497, "grad_norm": 0.20423544943332672, "learning_rate": 2.5916534296257655e-05, "loss": 0.1314, "step": 7375 }, { "epoch": 5.251035001881822, "grad_norm": 0.1818690150976181, "learning_rate": 2.5905180258450602e-05, "loss": 0.1384, "step": 7376 }, { "epoch": 5.251787730523146, "grad_norm": 0.16663767397403717, "learning_rate": 2.5893829402409275e-05, "loss": 0.1318, "step": 7377 }, { "epoch": 5.2525404591644715, "grad_norm": 0.15545806288719177, "learning_rate": 2.588248172937502e-05, "loss": 0.1292, "step": 7378 }, { "epoch": 5.253293187805796, "grad_norm": 0.11848124116659164, "learning_rate": 2.5871137240588773e-05, "loss": 0.0738, "step": 7379 }, { "epoch": 5.2540459164471205, "grad_norm": 0.16678005456924438, "learning_rate": 2.5859795937291154e-05, "loss": 0.1451, "step": 7380 }, { "epoch": 5.254798645088446, "grad_norm": 0.16266821324825287, "learning_rate": 2.5848457820722448e-05, "loss": 0.1445, "step": 7381 }, { "epoch": 5.25555137372977, "grad_norm": 0.15697534382343292, "learning_rate": 2.583712289212258e-05, "loss": 0.1544, "step": 7382 }, { "epoch": 5.256304102371095, "grad_norm": 0.15510553121566772, "learning_rate": 2.5825791152731084e-05, "loss": 0.1271, "step": 7383 }, { "epoch": 5.25705683101242, "grad_norm": 0.14458493888378143, "learning_rate": 2.5814462603787214e-05, "loss": 0.0758, "step": 7384 }, { "epoch": 5.257809559653745, "grad_norm": 0.158015638589859, "learning_rate": 2.5803137246529845e-05, "loss": 0.0999, "step": 7385 }, { "epoch": 5.25856228829507, "grad_norm": 0.17072731256484985, "learning_rate": 2.579181508219748e-05, "loss": 0.144, "step": 7386 }, { "epoch": 5.2593150169363945, "grad_norm": 0.17635777592658997, "learning_rate": 2.578049611202829e-05, "loss": 0.1369, "step": 7387 }, { "epoch": 5.260067745577719, "grad_norm": 0.13895131647586823, "learning_rate": 2.5769180337260123e-05, "loss": 0.0795, "step": 7388 }, { "epoch": 5.260820474219044, "grad_norm": 0.16905398666858673, "learning_rate": 2.575786775913046e-05, "loss": 0.0907, "step": 7389 }, { "epoch": 5.261573202860369, "grad_norm": 0.16489428281784058, "learning_rate": 2.5746558378876408e-05, "loss": 0.1794, "step": 7390 }, { "epoch": 5.262325931501693, "grad_norm": 0.1478123962879181, "learning_rate": 2.5735252197734762e-05, "loss": 0.148, "step": 7391 }, { "epoch": 5.263078660143019, "grad_norm": 0.18468450009822845, "learning_rate": 2.5723949216941913e-05, "loss": 0.1799, "step": 7392 }, { "epoch": 5.263831388784343, "grad_norm": 0.14482401311397552, "learning_rate": 2.571264943773396e-05, "loss": 0.0973, "step": 7393 }, { "epoch": 5.2645841174256685, "grad_norm": 0.13869979977607727, "learning_rate": 2.5701352861346638e-05, "loss": 0.0776, "step": 7394 }, { "epoch": 5.265336846066993, "grad_norm": 0.13110870122909546, "learning_rate": 2.569005948901534e-05, "loss": 0.0899, "step": 7395 }, { "epoch": 5.266089574708317, "grad_norm": 0.14152513444423676, "learning_rate": 2.5678769321975042e-05, "loss": 0.1145, "step": 7396 }, { "epoch": 5.266842303349643, "grad_norm": 0.15052363276481628, "learning_rate": 2.5667482361460467e-05, "loss": 0.0962, "step": 7397 }, { "epoch": 5.267595031990967, "grad_norm": 0.1538558453321457, "learning_rate": 2.5656198608705907e-05, "loss": 0.0823, "step": 7398 }, { "epoch": 5.268347760632292, "grad_norm": 0.16480262577533722, "learning_rate": 2.5644918064945345e-05, "loss": 0.1423, "step": 7399 }, { "epoch": 5.269100489273617, "grad_norm": 0.13362501561641693, "learning_rate": 2.5633640731412412e-05, "loss": 0.0956, "step": 7400 }, { "epoch": 5.269100489273617, "eval_loss": 0.15784592926502228, "eval_runtime": 456.1829, "eval_samples_per_second": 21.103, "eval_steps_per_second": 0.66, "step": 7400 }, { "epoch": 5.2698532179149415, "grad_norm": 0.13016252219676971, "learning_rate": 2.5622366609340392e-05, "loss": 0.1013, "step": 7401 }, { "epoch": 5.270605946556266, "grad_norm": 0.1594800502061844, "learning_rate": 2.5611095699962172e-05, "loss": 0.0894, "step": 7402 }, { "epoch": 5.271358675197591, "grad_norm": 0.14370611310005188, "learning_rate": 2.5599828004510357e-05, "loss": 0.1013, "step": 7403 }, { "epoch": 5.272111403838916, "grad_norm": 0.14267592132091522, "learning_rate": 2.5588563524217134e-05, "loss": 0.1433, "step": 7404 }, { "epoch": 5.272864132480241, "grad_norm": 0.12969526648521423, "learning_rate": 2.557730226031439e-05, "loss": 0.1018, "step": 7405 }, { "epoch": 5.273616861121566, "grad_norm": 0.1799166351556778, "learning_rate": 2.556604421403363e-05, "loss": 0.1228, "step": 7406 }, { "epoch": 5.27436958976289, "grad_norm": 0.16465482115745544, "learning_rate": 2.555478938660604e-05, "loss": 0.1254, "step": 7407 }, { "epoch": 5.2751223184042155, "grad_norm": 0.12536446750164032, "learning_rate": 2.5543537779262395e-05, "loss": 0.0568, "step": 7408 }, { "epoch": 5.27587504704554, "grad_norm": 0.135859414935112, "learning_rate": 2.5532289393233165e-05, "loss": 0.1292, "step": 7409 }, { "epoch": 5.2766277756868645, "grad_norm": 0.1275719553232193, "learning_rate": 2.552104422974848e-05, "loss": 0.0737, "step": 7410 }, { "epoch": 5.27738050432819, "grad_norm": 0.15452034771442413, "learning_rate": 2.550980229003807e-05, "loss": 0.1897, "step": 7411 }, { "epoch": 5.278133232969514, "grad_norm": 0.14988264441490173, "learning_rate": 2.549856357533134e-05, "loss": 0.0754, "step": 7412 }, { "epoch": 5.27888596161084, "grad_norm": 0.1815156191587448, "learning_rate": 2.5487328086857343e-05, "loss": 0.1353, "step": 7413 }, { "epoch": 5.279638690252164, "grad_norm": 0.13913100957870483, "learning_rate": 2.5476095825844792e-05, "loss": 0.0847, "step": 7414 }, { "epoch": 5.280391418893489, "grad_norm": 0.16793783009052277, "learning_rate": 2.546486679352199e-05, "loss": 0.1359, "step": 7415 }, { "epoch": 5.281144147534814, "grad_norm": 0.16847634315490723, "learning_rate": 2.5453640991116967e-05, "loss": 0.1079, "step": 7416 }, { "epoch": 5.2818968761761385, "grad_norm": 0.15983663499355316, "learning_rate": 2.5442418419857333e-05, "loss": 0.0657, "step": 7417 }, { "epoch": 5.282649604817463, "grad_norm": 0.16255876421928406, "learning_rate": 2.5431199080970392e-05, "loss": 0.1402, "step": 7418 }, { "epoch": 5.283402333458788, "grad_norm": 0.1423829346895218, "learning_rate": 2.5419982975683037e-05, "loss": 0.097, "step": 7419 }, { "epoch": 5.284155062100113, "grad_norm": 0.14834430813789368, "learning_rate": 2.5408770105221903e-05, "loss": 0.0729, "step": 7420 }, { "epoch": 5.284907790741438, "grad_norm": 0.155939981341362, "learning_rate": 2.5397560470813163e-05, "loss": 0.1132, "step": 7421 }, { "epoch": 5.285660519382763, "grad_norm": 0.18352240324020386, "learning_rate": 2.5386354073682717e-05, "loss": 0.0726, "step": 7422 }, { "epoch": 5.286413248024087, "grad_norm": 0.14586147665977478, "learning_rate": 2.537515091505605e-05, "loss": 0.0849, "step": 7423 }, { "epoch": 5.287165976665412, "grad_norm": 0.14570793509483337, "learning_rate": 2.536395099615836e-05, "loss": 0.1255, "step": 7424 }, { "epoch": 5.287918705306737, "grad_norm": 0.14147087931632996, "learning_rate": 2.5352754318214388e-05, "loss": 0.0817, "step": 7425 }, { "epoch": 5.288671433948061, "grad_norm": 0.13477067649364471, "learning_rate": 2.5341560882448667e-05, "loss": 0.0926, "step": 7426 }, { "epoch": 5.289424162589387, "grad_norm": 0.13143251836299896, "learning_rate": 2.533037069008525e-05, "loss": 0.0926, "step": 7427 }, { "epoch": 5.290176891230711, "grad_norm": 0.17307361960411072, "learning_rate": 2.5319183742347895e-05, "loss": 0.1224, "step": 7428 }, { "epoch": 5.290929619872037, "grad_norm": 0.14201287925243378, "learning_rate": 2.5308000040459968e-05, "loss": 0.143, "step": 7429 }, { "epoch": 5.291682348513361, "grad_norm": 0.15426142513751984, "learning_rate": 2.529681958564451e-05, "loss": 0.1266, "step": 7430 }, { "epoch": 5.2924350771546855, "grad_norm": 0.16488607227802277, "learning_rate": 2.5285642379124208e-05, "loss": 0.1266, "step": 7431 }, { "epoch": 5.293187805796011, "grad_norm": 0.19643715023994446, "learning_rate": 2.52744684221214e-05, "loss": 0.17, "step": 7432 }, { "epoch": 5.293940534437335, "grad_norm": 0.1441100686788559, "learning_rate": 2.5263297715858014e-05, "loss": 0.0718, "step": 7433 }, { "epoch": 5.29469326307866, "grad_norm": 0.1534847915172577, "learning_rate": 2.5252130261555684e-05, "loss": 0.1392, "step": 7434 }, { "epoch": 5.295445991719985, "grad_norm": 0.16585050523281097, "learning_rate": 2.5240966060435677e-05, "loss": 0.0857, "step": 7435 }, { "epoch": 5.29619872036131, "grad_norm": 0.1596667766571045, "learning_rate": 2.5229805113718863e-05, "loss": 0.1296, "step": 7436 }, { "epoch": 5.296951449002634, "grad_norm": 0.18326760828495026, "learning_rate": 2.521864742262582e-05, "loss": 0.1566, "step": 7437 }, { "epoch": 5.2977041776439595, "grad_norm": 0.1567734330892563, "learning_rate": 2.5207492988376678e-05, "loss": 0.1406, "step": 7438 }, { "epoch": 5.298456906285284, "grad_norm": 0.15677505731582642, "learning_rate": 2.5196341812191355e-05, "loss": 0.1434, "step": 7439 }, { "epoch": 5.299209634926609, "grad_norm": 0.16622120141983032, "learning_rate": 2.5185193895289266e-05, "loss": 0.1406, "step": 7440 }, { "epoch": 5.299962363567934, "grad_norm": 0.16680632531642914, "learning_rate": 2.5174049238889553e-05, "loss": 0.1129, "step": 7441 }, { "epoch": 5.300715092209258, "grad_norm": 0.1560552716255188, "learning_rate": 2.516290784421097e-05, "loss": 0.1304, "step": 7442 }, { "epoch": 5.301467820850584, "grad_norm": 0.14121291041374207, "learning_rate": 2.5151769712471935e-05, "loss": 0.107, "step": 7443 }, { "epoch": 5.302220549491908, "grad_norm": 0.13136087357997894, "learning_rate": 2.5140634844890453e-05, "loss": 0.1104, "step": 7444 }, { "epoch": 5.302973278133233, "grad_norm": 0.14478671550750732, "learning_rate": 2.5129503242684283e-05, "loss": 0.1081, "step": 7445 }, { "epoch": 5.303726006774558, "grad_norm": 0.1542777270078659, "learning_rate": 2.511837490707072e-05, "loss": 0.1306, "step": 7446 }, { "epoch": 5.304478735415882, "grad_norm": 0.16024905443191528, "learning_rate": 2.510724983926676e-05, "loss": 0.1837, "step": 7447 }, { "epoch": 5.305231464057208, "grad_norm": 0.1510675996541977, "learning_rate": 2.509612804048901e-05, "loss": 0.0844, "step": 7448 }, { "epoch": 5.305984192698532, "grad_norm": 0.1353428065776825, "learning_rate": 2.508500951195374e-05, "loss": 0.1536, "step": 7449 }, { "epoch": 5.306736921339857, "grad_norm": 0.15161307156085968, "learning_rate": 2.507389425487683e-05, "loss": 0.1236, "step": 7450 }, { "epoch": 5.307489649981182, "grad_norm": 0.13508734107017517, "learning_rate": 2.5062782270473874e-05, "loss": 0.0783, "step": 7451 }, { "epoch": 5.308242378622507, "grad_norm": 0.15807554125785828, "learning_rate": 2.5051673559960026e-05, "loss": 0.1269, "step": 7452 }, { "epoch": 5.308995107263831, "grad_norm": 0.16603843867778778, "learning_rate": 2.5040568124550152e-05, "loss": 0.1369, "step": 7453 }, { "epoch": 5.309747835905156, "grad_norm": 0.16461217403411865, "learning_rate": 2.5029465965458683e-05, "loss": 0.1463, "step": 7454 }, { "epoch": 5.310500564546481, "grad_norm": 0.1856953501701355, "learning_rate": 2.5018367083899763e-05, "loss": 0.191, "step": 7455 }, { "epoch": 5.311253293187805, "grad_norm": 0.15597055852413177, "learning_rate": 2.5007271481087146e-05, "loss": 0.0781, "step": 7456 }, { "epoch": 5.312006021829131, "grad_norm": 0.17854717373847961, "learning_rate": 2.499617915823422e-05, "loss": 0.1019, "step": 7457 }, { "epoch": 5.312758750470455, "grad_norm": 0.13976025581359863, "learning_rate": 2.498509011655403e-05, "loss": 0.1656, "step": 7458 }, { "epoch": 5.313511479111781, "grad_norm": 0.1455927938222885, "learning_rate": 2.4974004357259255e-05, "loss": 0.0942, "step": 7459 }, { "epoch": 5.314264207753105, "grad_norm": 0.154214009642601, "learning_rate": 2.4962921881562244e-05, "loss": 0.1139, "step": 7460 }, { "epoch": 5.3150169363944295, "grad_norm": 0.1566934436559677, "learning_rate": 2.4951842690674913e-05, "loss": 0.136, "step": 7461 }, { "epoch": 5.315769665035755, "grad_norm": 0.1730629801750183, "learning_rate": 2.49407667858089e-05, "loss": 0.0765, "step": 7462 }, { "epoch": 5.316522393677079, "grad_norm": 0.15445460379123688, "learning_rate": 2.492969416817542e-05, "loss": 0.0957, "step": 7463 }, { "epoch": 5.317275122318404, "grad_norm": 0.1556812822818756, "learning_rate": 2.4918624838985394e-05, "loss": 0.1416, "step": 7464 }, { "epoch": 5.318027850959729, "grad_norm": 0.196661576628685, "learning_rate": 2.490755879944932e-05, "loss": 0.1348, "step": 7465 }, { "epoch": 5.318780579601054, "grad_norm": 0.15156190097332, "learning_rate": 2.4896496050777386e-05, "loss": 0.0939, "step": 7466 }, { "epoch": 5.319533308242379, "grad_norm": 0.13001342117786407, "learning_rate": 2.488543659417937e-05, "loss": 0.0771, "step": 7467 }, { "epoch": 5.3202860368837035, "grad_norm": 0.17937977612018585, "learning_rate": 2.487438043086475e-05, "loss": 0.1679, "step": 7468 }, { "epoch": 5.321038765525028, "grad_norm": 0.12270326912403107, "learning_rate": 2.486332756204257e-05, "loss": 0.0691, "step": 7469 }, { "epoch": 5.321791494166353, "grad_norm": 0.16655008494853973, "learning_rate": 2.485227798892158e-05, "loss": 0.1273, "step": 7470 }, { "epoch": 5.322544222807678, "grad_norm": 0.17286011576652527, "learning_rate": 2.4841231712710144e-05, "loss": 0.092, "step": 7471 }, { "epoch": 5.323296951449002, "grad_norm": 0.16206660866737366, "learning_rate": 2.483018873461628e-05, "loss": 0.0717, "step": 7472 }, { "epoch": 5.324049680090328, "grad_norm": 0.16565930843353271, "learning_rate": 2.48191490558476e-05, "loss": 0.0991, "step": 7473 }, { "epoch": 5.324802408731652, "grad_norm": 0.1559310108423233, "learning_rate": 2.4808112677611423e-05, "loss": 0.1741, "step": 7474 }, { "epoch": 5.325555137372977, "grad_norm": 0.15226033329963684, "learning_rate": 2.4797079601114633e-05, "loss": 0.1385, "step": 7475 }, { "epoch": 5.326307866014302, "grad_norm": 0.11593233793973923, "learning_rate": 2.4786049827563807e-05, "loss": 0.0544, "step": 7476 }, { "epoch": 5.327060594655626, "grad_norm": 0.16389331221580505, "learning_rate": 2.4775023358165146e-05, "loss": 0.0758, "step": 7477 }, { "epoch": 5.327813323296952, "grad_norm": 0.16570326685905457, "learning_rate": 2.4764000194124504e-05, "loss": 0.0789, "step": 7478 }, { "epoch": 5.328566051938276, "grad_norm": 0.13994210958480835, "learning_rate": 2.4752980336647326e-05, "loss": 0.0889, "step": 7479 }, { "epoch": 5.329318780579601, "grad_norm": 0.1712230145931244, "learning_rate": 2.4741963786938745e-05, "loss": 0.1661, "step": 7480 }, { "epoch": 5.330071509220926, "grad_norm": 0.18590262532234192, "learning_rate": 2.4730950546203522e-05, "loss": 0.1625, "step": 7481 }, { "epoch": 5.330824237862251, "grad_norm": 0.11651252955198288, "learning_rate": 2.4719940615646023e-05, "loss": 0.0793, "step": 7482 }, { "epoch": 5.331576966503576, "grad_norm": 0.16746293008327484, "learning_rate": 2.4708933996470298e-05, "loss": 0.1174, "step": 7483 }, { "epoch": 5.3323296951449, "grad_norm": 0.13603104650974274, "learning_rate": 2.4697930689880012e-05, "loss": 0.0874, "step": 7484 }, { "epoch": 5.333082423786225, "grad_norm": 0.14195677638053894, "learning_rate": 2.4686930697078474e-05, "loss": 0.0621, "step": 7485 }, { "epoch": 5.33383515242755, "grad_norm": 0.1405058205127716, "learning_rate": 2.4675934019268605e-05, "loss": 0.0927, "step": 7486 }, { "epoch": 5.334587881068875, "grad_norm": 0.13098564743995667, "learning_rate": 2.466494065765302e-05, "loss": 0.0609, "step": 7487 }, { "epoch": 5.335340609710199, "grad_norm": 0.16193434596061707, "learning_rate": 2.4653950613433903e-05, "loss": 0.0961, "step": 7488 }, { "epoch": 5.3360933383515246, "grad_norm": 0.11815910041332245, "learning_rate": 2.4642963887813118e-05, "loss": 0.0771, "step": 7489 }, { "epoch": 5.336846066992849, "grad_norm": 0.17995014786720276, "learning_rate": 2.463198048199216e-05, "loss": 0.1217, "step": 7490 }, { "epoch": 5.3375987956341735, "grad_norm": 0.15625499188899994, "learning_rate": 2.4621000397172177e-05, "loss": 0.0737, "step": 7491 }, { "epoch": 5.338351524275499, "grad_norm": 0.1612446904182434, "learning_rate": 2.46100236345539e-05, "loss": 0.1513, "step": 7492 }, { "epoch": 5.339104252916823, "grad_norm": 0.1347866654396057, "learning_rate": 2.4599050195337767e-05, "loss": 0.0646, "step": 7493 }, { "epoch": 5.339856981558149, "grad_norm": 0.1528194099664688, "learning_rate": 2.458808008072378e-05, "loss": 0.1804, "step": 7494 }, { "epoch": 5.340609710199473, "grad_norm": 0.16970011591911316, "learning_rate": 2.4577113291911627e-05, "loss": 0.1156, "step": 7495 }, { "epoch": 5.341362438840798, "grad_norm": 0.13838131725788116, "learning_rate": 2.456614983010063e-05, "loss": 0.0622, "step": 7496 }, { "epoch": 5.342115167482123, "grad_norm": 0.1409309059381485, "learning_rate": 2.4555189696489742e-05, "loss": 0.1521, "step": 7497 }, { "epoch": 5.3428678961234475, "grad_norm": 0.1294088214635849, "learning_rate": 2.4544232892277526e-05, "loss": 0.084, "step": 7498 }, { "epoch": 5.343620624764772, "grad_norm": 0.13339772820472717, "learning_rate": 2.453327941866222e-05, "loss": 0.1283, "step": 7499 }, { "epoch": 5.344373353406097, "grad_norm": 0.1578311026096344, "learning_rate": 2.4522329276841663e-05, "loss": 0.1021, "step": 7500 }, { "epoch": 5.345126082047422, "grad_norm": 0.16132758557796478, "learning_rate": 2.451138246801335e-05, "loss": 0.1442, "step": 7501 }, { "epoch": 5.345878810688747, "grad_norm": 0.1341070681810379, "learning_rate": 2.4500438993374407e-05, "loss": 0.0675, "step": 7502 }, { "epoch": 5.346631539330072, "grad_norm": 0.18249933421611786, "learning_rate": 2.448949885412162e-05, "loss": 0.1179, "step": 7503 }, { "epoch": 5.347384267971396, "grad_norm": 0.16187427937984467, "learning_rate": 2.4478562051451352e-05, "loss": 0.1372, "step": 7504 }, { "epoch": 5.3481369966127215, "grad_norm": 0.17513199150562286, "learning_rate": 2.446762858655965e-05, "loss": 0.0792, "step": 7505 }, { "epoch": 5.348889725254046, "grad_norm": 0.14435972273349762, "learning_rate": 2.4456698460642192e-05, "loss": 0.0889, "step": 7506 }, { "epoch": 5.34964245389537, "grad_norm": 0.1367531418800354, "learning_rate": 2.4445771674894256e-05, "loss": 0.0853, "step": 7507 }, { "epoch": 5.350395182536696, "grad_norm": 0.14345327019691467, "learning_rate": 2.4434848230510794e-05, "loss": 0.0626, "step": 7508 }, { "epoch": 5.35114791117802, "grad_norm": 0.17929668724536896, "learning_rate": 2.442392812868637e-05, "loss": 0.1738, "step": 7509 }, { "epoch": 5.351900639819345, "grad_norm": 0.18643862009048462, "learning_rate": 2.441301137061522e-05, "loss": 0.0888, "step": 7510 }, { "epoch": 5.35265336846067, "grad_norm": 0.18326589465141296, "learning_rate": 2.440209795749114e-05, "loss": 0.1037, "step": 7511 }, { "epoch": 5.3534060971019946, "grad_norm": 0.17039936780929565, "learning_rate": 2.4391187890507634e-05, "loss": 0.1848, "step": 7512 }, { "epoch": 5.35415882574332, "grad_norm": 0.13251939415931702, "learning_rate": 2.438028117085779e-05, "loss": 0.1054, "step": 7513 }, { "epoch": 5.354911554384644, "grad_norm": 0.15464791655540466, "learning_rate": 2.4369377799734364e-05, "loss": 0.0973, "step": 7514 }, { "epoch": 5.355664283025969, "grad_norm": 0.1722010374069214, "learning_rate": 2.4358477778329723e-05, "loss": 0.1758, "step": 7515 }, { "epoch": 5.356417011667294, "grad_norm": 0.16281723976135254, "learning_rate": 2.4347581107835905e-05, "loss": 0.1036, "step": 7516 }, { "epoch": 5.357169740308619, "grad_norm": 0.1829470992088318, "learning_rate": 2.4336687789444513e-05, "loss": 0.1709, "step": 7517 }, { "epoch": 5.357922468949943, "grad_norm": 0.14087460935115814, "learning_rate": 2.4325797824346856e-05, "loss": 0.0993, "step": 7518 }, { "epoch": 5.3586751975912685, "grad_norm": 0.1526247262954712, "learning_rate": 2.431491121373382e-05, "loss": 0.116, "step": 7519 }, { "epoch": 5.359427926232593, "grad_norm": 0.13399788737297058, "learning_rate": 2.4304027958795957e-05, "loss": 0.1381, "step": 7520 }, { "epoch": 5.360180654873918, "grad_norm": 0.16692312061786652, "learning_rate": 2.4293148060723444e-05, "loss": 0.1026, "step": 7521 }, { "epoch": 5.360933383515243, "grad_norm": 0.14469663798809052, "learning_rate": 2.4282271520706108e-05, "loss": 0.1294, "step": 7522 }, { "epoch": 5.361686112156567, "grad_norm": 0.16282999515533447, "learning_rate": 2.4271398339933364e-05, "loss": 0.1943, "step": 7523 }, { "epoch": 5.362438840797893, "grad_norm": 0.14163310825824738, "learning_rate": 2.4260528519594312e-05, "loss": 0.1375, "step": 7524 }, { "epoch": 5.363191569439217, "grad_norm": 0.14029118418693542, "learning_rate": 2.4249662060877625e-05, "loss": 0.0558, "step": 7525 }, { "epoch": 5.363944298080542, "grad_norm": 0.14999514818191528, "learning_rate": 2.423879896497167e-05, "loss": 0.0565, "step": 7526 }, { "epoch": 5.364697026721867, "grad_norm": 0.16895245015621185, "learning_rate": 2.4227939233064404e-05, "loss": 0.1165, "step": 7527 }, { "epoch": 5.3654497553631915, "grad_norm": 0.1644132137298584, "learning_rate": 2.421708286634346e-05, "loss": 0.1836, "step": 7528 }, { "epoch": 5.366202484004516, "grad_norm": 0.1328694373369217, "learning_rate": 2.420622986599603e-05, "loss": 0.0789, "step": 7529 }, { "epoch": 5.366955212645841, "grad_norm": 0.15813282132148743, "learning_rate": 2.4195380233209008e-05, "loss": 0.1165, "step": 7530 }, { "epoch": 5.367707941287166, "grad_norm": 0.14039382338523865, "learning_rate": 2.4184533969168904e-05, "loss": 0.1546, "step": 7531 }, { "epoch": 5.368460669928491, "grad_norm": 0.164057657122612, "learning_rate": 2.417369107506182e-05, "loss": 0.1289, "step": 7532 }, { "epoch": 5.369213398569816, "grad_norm": 0.13526473939418793, "learning_rate": 2.4162851552073535e-05, "loss": 0.0789, "step": 7533 }, { "epoch": 5.36996612721114, "grad_norm": 0.15550990402698517, "learning_rate": 2.4152015401389445e-05, "loss": 0.1618, "step": 7534 }, { "epoch": 5.3707188558524654, "grad_norm": 0.1372520923614502, "learning_rate": 2.4141182624194577e-05, "loss": 0.1434, "step": 7535 }, { "epoch": 5.37147158449379, "grad_norm": 0.1427537500858307, "learning_rate": 2.413035322167357e-05, "loss": 0.1208, "step": 7536 }, { "epoch": 5.372224313135114, "grad_norm": 0.1521807163953781, "learning_rate": 2.4119527195010738e-05, "loss": 0.1079, "step": 7537 }, { "epoch": 5.37297704177644, "grad_norm": 0.14699789881706238, "learning_rate": 2.4108704545389972e-05, "loss": 0.0978, "step": 7538 }, { "epoch": 5.373729770417764, "grad_norm": 0.14530816674232483, "learning_rate": 2.4097885273994844e-05, "loss": 0.0852, "step": 7539 }, { "epoch": 5.37448249905909, "grad_norm": 0.18665096163749695, "learning_rate": 2.4087069382008495e-05, "loss": 0.1345, "step": 7540 }, { "epoch": 5.375235227700414, "grad_norm": 0.13211417198181152, "learning_rate": 2.407625687061379e-05, "loss": 0.162, "step": 7541 }, { "epoch": 5.3759879563417385, "grad_norm": 0.16621114313602448, "learning_rate": 2.4065447740993123e-05, "loss": 0.1252, "step": 7542 }, { "epoch": 5.376740684983064, "grad_norm": 0.1410694420337677, "learning_rate": 2.40546419943286e-05, "loss": 0.0828, "step": 7543 }, { "epoch": 5.377493413624388, "grad_norm": 0.15563589334487915, "learning_rate": 2.4043839631801884e-05, "loss": 0.1518, "step": 7544 }, { "epoch": 5.378246142265713, "grad_norm": 0.146842360496521, "learning_rate": 2.4033040654594344e-05, "loss": 0.0831, "step": 7545 }, { "epoch": 5.378998870907038, "grad_norm": 0.1421828418970108, "learning_rate": 2.402224506388689e-05, "loss": 0.0974, "step": 7546 }, { "epoch": 5.379751599548363, "grad_norm": 0.14172515273094177, "learning_rate": 2.4011452860860168e-05, "loss": 0.0928, "step": 7547 }, { "epoch": 5.380504328189687, "grad_norm": 0.13941575586795807, "learning_rate": 2.4000664046694355e-05, "loss": 0.1207, "step": 7548 }, { "epoch": 5.3812570568310125, "grad_norm": 0.1517096906900406, "learning_rate": 2.398987862256933e-05, "loss": 0.1328, "step": 7549 }, { "epoch": 5.382009785472337, "grad_norm": 0.1395435333251953, "learning_rate": 2.397909658966454e-05, "loss": 0.0852, "step": 7550 }, { "epoch": 5.382762514113662, "grad_norm": 0.15458008646965027, "learning_rate": 2.3968317949159112e-05, "loss": 0.1473, "step": 7551 }, { "epoch": 5.383515242754987, "grad_norm": 0.13126058876514435, "learning_rate": 2.395754270223178e-05, "loss": 0.1173, "step": 7552 }, { "epoch": 5.384267971396311, "grad_norm": 0.138570174574852, "learning_rate": 2.3946770850060914e-05, "loss": 0.1203, "step": 7553 }, { "epoch": 5.385020700037637, "grad_norm": 0.1425439715385437, "learning_rate": 2.3936002393824487e-05, "loss": 0.0766, "step": 7554 }, { "epoch": 5.385773428678961, "grad_norm": 0.1316847801208496, "learning_rate": 2.3925237334700136e-05, "loss": 0.0468, "step": 7555 }, { "epoch": 5.3865261573202865, "grad_norm": 0.13792641460895538, "learning_rate": 2.3914475673865126e-05, "loss": 0.0529, "step": 7556 }, { "epoch": 5.387278885961611, "grad_norm": 0.1716773509979248, "learning_rate": 2.39037174124963e-05, "loss": 0.1066, "step": 7557 }, { "epoch": 5.3880316146029354, "grad_norm": 0.16533389687538147, "learning_rate": 2.3892962551770204e-05, "loss": 0.0784, "step": 7558 }, { "epoch": 5.388784343244261, "grad_norm": 0.14196573197841644, "learning_rate": 2.3882211092862926e-05, "loss": 0.0857, "step": 7559 }, { "epoch": 5.389537071885585, "grad_norm": 0.149057537317276, "learning_rate": 2.3871463036950283e-05, "loss": 0.1308, "step": 7560 }, { "epoch": 5.39028980052691, "grad_norm": 0.13626578450202942, "learning_rate": 2.3860718385207633e-05, "loss": 0.0885, "step": 7561 }, { "epoch": 5.391042529168235, "grad_norm": 0.15436142683029175, "learning_rate": 2.3849977138810012e-05, "loss": 0.1414, "step": 7562 }, { "epoch": 5.39179525780956, "grad_norm": 0.1570853590965271, "learning_rate": 2.383923929893205e-05, "loss": 0.143, "step": 7563 }, { "epoch": 5.392547986450884, "grad_norm": 0.17325298488140106, "learning_rate": 2.3828504866748034e-05, "loss": 0.2173, "step": 7564 }, { "epoch": 5.393300715092209, "grad_norm": 0.13101699948310852, "learning_rate": 2.3817773843431838e-05, "loss": 0.1094, "step": 7565 }, { "epoch": 5.394053443733534, "grad_norm": 0.1435564011335373, "learning_rate": 2.3807046230157036e-05, "loss": 0.0722, "step": 7566 }, { "epoch": 5.394806172374859, "grad_norm": 0.194674551486969, "learning_rate": 2.3796322028096748e-05, "loss": 0.1216, "step": 7567 }, { "epoch": 5.395558901016184, "grad_norm": 0.16635124385356903, "learning_rate": 2.3785601238423787e-05, "loss": 0.1473, "step": 7568 }, { "epoch": 5.396311629657508, "grad_norm": 0.14073067903518677, "learning_rate": 2.3774883862310527e-05, "loss": 0.0809, "step": 7569 }, { "epoch": 5.397064358298834, "grad_norm": 0.1567569226026535, "learning_rate": 2.3764169900929038e-05, "loss": 0.0977, "step": 7570 }, { "epoch": 5.397817086940158, "grad_norm": 0.12961725890636444, "learning_rate": 2.3753459355450935e-05, "loss": 0.0871, "step": 7571 }, { "epoch": 5.3985698155814825, "grad_norm": 0.1512342244386673, "learning_rate": 2.374275222704757e-05, "loss": 0.105, "step": 7572 }, { "epoch": 5.399322544222808, "grad_norm": 0.1455184519290924, "learning_rate": 2.3732048516889808e-05, "loss": 0.117, "step": 7573 }, { "epoch": 5.400075272864132, "grad_norm": 0.14611977338790894, "learning_rate": 2.3721348226148223e-05, "loss": 0.1072, "step": 7574 }, { "epoch": 5.400828001505458, "grad_norm": 0.16105806827545166, "learning_rate": 2.3710651355992965e-05, "loss": 0.1379, "step": 7575 }, { "epoch": 5.401580730146782, "grad_norm": 0.1594434678554535, "learning_rate": 2.3699957907593827e-05, "loss": 0.11, "step": 7576 }, { "epoch": 5.402333458788107, "grad_norm": 0.16102227568626404, "learning_rate": 2.3689267882120244e-05, "loss": 0.147, "step": 7577 }, { "epoch": 5.403086187429432, "grad_norm": 0.16141411662101746, "learning_rate": 2.367858128074124e-05, "loss": 0.1046, "step": 7578 }, { "epoch": 5.4038389160707565, "grad_norm": 0.16681726276874542, "learning_rate": 2.3667898104625495e-05, "loss": 0.139, "step": 7579 }, { "epoch": 5.404591644712081, "grad_norm": 0.1336016207933426, "learning_rate": 2.3657218354941303e-05, "loss": 0.0767, "step": 7580 }, { "epoch": 5.405344373353406, "grad_norm": 0.15668120980262756, "learning_rate": 2.3646542032856604e-05, "loss": 0.0839, "step": 7581 }, { "epoch": 5.406097101994731, "grad_norm": 0.15034620463848114, "learning_rate": 2.3635869139538914e-05, "loss": 0.098, "step": 7582 }, { "epoch": 5.406849830636055, "grad_norm": 0.1531384438276291, "learning_rate": 2.3625199676155437e-05, "loss": 0.0948, "step": 7583 }, { "epoch": 5.407602559277381, "grad_norm": 0.15787766873836517, "learning_rate": 2.361453364387292e-05, "loss": 0.1158, "step": 7584 }, { "epoch": 5.408355287918705, "grad_norm": 0.13259372115135193, "learning_rate": 2.360387104385785e-05, "loss": 0.1414, "step": 7585 }, { "epoch": 5.4091080165600305, "grad_norm": 0.1501416265964508, "learning_rate": 2.359321187727622e-05, "loss": 0.158, "step": 7586 }, { "epoch": 5.409860745201355, "grad_norm": 0.13937018811702728, "learning_rate": 2.358255614529374e-05, "loss": 0.1393, "step": 7587 }, { "epoch": 5.410613473842679, "grad_norm": 0.1252278983592987, "learning_rate": 2.3571903849075664e-05, "loss": 0.0674, "step": 7588 }, { "epoch": 5.411366202484005, "grad_norm": 0.14535585045814514, "learning_rate": 2.356125498978695e-05, "loss": 0.1286, "step": 7589 }, { "epoch": 5.412118931125329, "grad_norm": 0.11988440901041031, "learning_rate": 2.3550609568592088e-05, "loss": 0.0885, "step": 7590 }, { "epoch": 5.412871659766654, "grad_norm": 0.1556290090084076, "learning_rate": 2.353996758665532e-05, "loss": 0.1361, "step": 7591 }, { "epoch": 5.413624388407979, "grad_norm": 0.14955662190914154, "learning_rate": 2.3529329045140376e-05, "loss": 0.1844, "step": 7592 }, { "epoch": 5.414377117049304, "grad_norm": 0.1649152785539627, "learning_rate": 2.3518693945210706e-05, "loss": 0.1402, "step": 7593 }, { "epoch": 5.415129845690629, "grad_norm": 0.11566943675279617, "learning_rate": 2.350806228802932e-05, "loss": 0.0797, "step": 7594 }, { "epoch": 5.415882574331953, "grad_norm": 0.1520291119813919, "learning_rate": 2.349743407475891e-05, "loss": 0.1602, "step": 7595 }, { "epoch": 5.416635302973278, "grad_norm": 0.14471538364887238, "learning_rate": 2.3486809306561733e-05, "loss": 0.1045, "step": 7596 }, { "epoch": 5.417388031614603, "grad_norm": 0.15470635890960693, "learning_rate": 2.347618798459971e-05, "loss": 0.135, "step": 7597 }, { "epoch": 5.418140760255928, "grad_norm": 0.13311196863651276, "learning_rate": 2.346557011003437e-05, "loss": 0.1425, "step": 7598 }, { "epoch": 5.418893488897252, "grad_norm": 0.14499607682228088, "learning_rate": 2.3454955684026893e-05, "loss": 0.1621, "step": 7599 }, { "epoch": 5.419646217538578, "grad_norm": 0.16720278561115265, "learning_rate": 2.3444344707738015e-05, "loss": 0.1754, "step": 7600 }, { "epoch": 5.419646217538578, "eval_loss": 0.15607230365276337, "eval_runtime": 456.7584, "eval_samples_per_second": 21.077, "eval_steps_per_second": 0.659, "step": 7600 }, { "epoch": 5.420398946179902, "grad_norm": 0.1628710925579071, "learning_rate": 2.3433737182328162e-05, "loss": 0.0868, "step": 7601 }, { "epoch": 5.4211516748212265, "grad_norm": 0.15817131102085114, "learning_rate": 2.342313310895736e-05, "loss": 0.1429, "step": 7602 }, { "epoch": 5.421904403462552, "grad_norm": 0.14884941279888153, "learning_rate": 2.341253248878524e-05, "loss": 0.1411, "step": 7603 }, { "epoch": 5.422657132103876, "grad_norm": 0.13039050996303558, "learning_rate": 2.340193532297107e-05, "loss": 0.0826, "step": 7604 }, { "epoch": 5.423409860745202, "grad_norm": 0.13175886869430542, "learning_rate": 2.3391341612673743e-05, "loss": 0.0815, "step": 7605 }, { "epoch": 5.424162589386526, "grad_norm": 0.133148193359375, "learning_rate": 2.3380751359051795e-05, "loss": 0.1209, "step": 7606 }, { "epoch": 5.424915318027851, "grad_norm": 0.1547599732875824, "learning_rate": 2.337016456326332e-05, "loss": 0.0677, "step": 7607 }, { "epoch": 5.425668046669176, "grad_norm": 0.1595575213432312, "learning_rate": 2.3359581226466114e-05, "loss": 0.1518, "step": 7608 }, { "epoch": 5.4264207753105005, "grad_norm": 0.14691078662872314, "learning_rate": 2.334900134981751e-05, "loss": 0.0937, "step": 7609 }, { "epoch": 5.427173503951825, "grad_norm": 0.15160684287548065, "learning_rate": 2.333842493447454e-05, "loss": 0.1388, "step": 7610 }, { "epoch": 5.42792623259315, "grad_norm": 0.1436777561903, "learning_rate": 2.3327851981593814e-05, "loss": 0.1297, "step": 7611 }, { "epoch": 5.428678961234475, "grad_norm": 0.1638592928647995, "learning_rate": 2.3317282492331597e-05, "loss": 0.1186, "step": 7612 }, { "epoch": 5.4294316898758, "grad_norm": 0.1547795534133911, "learning_rate": 2.3306716467843712e-05, "loss": 0.1684, "step": 7613 }, { "epoch": 5.430184418517125, "grad_norm": 0.17247465252876282, "learning_rate": 2.3296153909285678e-05, "loss": 0.1732, "step": 7614 }, { "epoch": 5.430937147158449, "grad_norm": 0.17563912272453308, "learning_rate": 2.3285594817812577e-05, "loss": 0.1324, "step": 7615 }, { "epoch": 5.4316898757997745, "grad_norm": 0.1307399570941925, "learning_rate": 2.327503919457914e-05, "loss": 0.0565, "step": 7616 }, { "epoch": 5.432442604441099, "grad_norm": 0.14884141087532043, "learning_rate": 2.326448704073972e-05, "loss": 0.2114, "step": 7617 }, { "epoch": 5.433195333082423, "grad_norm": 0.1425616294145584, "learning_rate": 2.32539383574483e-05, "loss": 0.0939, "step": 7618 }, { "epoch": 5.433948061723749, "grad_norm": 0.1814597100019455, "learning_rate": 2.3243393145858432e-05, "loss": 0.1519, "step": 7619 }, { "epoch": 5.434700790365073, "grad_norm": 0.14407412707805634, "learning_rate": 2.3232851407123366e-05, "loss": 0.1075, "step": 7620 }, { "epoch": 5.435453519006398, "grad_norm": 0.15630677342414856, "learning_rate": 2.322231314239589e-05, "loss": 0.1115, "step": 7621 }, { "epoch": 5.436206247647723, "grad_norm": 0.13668285310268402, "learning_rate": 2.321177835282847e-05, "loss": 0.1029, "step": 7622 }, { "epoch": 5.436958976289048, "grad_norm": 0.15175727009773254, "learning_rate": 2.3201247039573182e-05, "loss": 0.1622, "step": 7623 }, { "epoch": 5.437711704930373, "grad_norm": 0.14669455587863922, "learning_rate": 2.319071920378172e-05, "loss": 0.1068, "step": 7624 }, { "epoch": 5.438464433571697, "grad_norm": 0.14300425350666046, "learning_rate": 2.3180194846605367e-05, "loss": 0.1163, "step": 7625 }, { "epoch": 5.439217162213022, "grad_norm": 0.12608231604099274, "learning_rate": 2.3169673969195067e-05, "loss": 0.0595, "step": 7626 }, { "epoch": 5.439969890854347, "grad_norm": 0.1399114727973938, "learning_rate": 2.3159156572701378e-05, "loss": 0.0788, "step": 7627 }, { "epoch": 5.440722619495672, "grad_norm": 0.16099727153778076, "learning_rate": 2.314864265827444e-05, "loss": 0.0855, "step": 7628 }, { "epoch": 5.441475348136997, "grad_norm": 0.14960704743862152, "learning_rate": 2.3138132227064057e-05, "loss": 0.1239, "step": 7629 }, { "epoch": 5.4422280767783215, "grad_norm": 0.15816763043403625, "learning_rate": 2.312762528021962e-05, "loss": 0.0729, "step": 7630 }, { "epoch": 5.442980805419646, "grad_norm": 0.15502819418907166, "learning_rate": 2.311712181889019e-05, "loss": 0.1322, "step": 7631 }, { "epoch": 5.443733534060971, "grad_norm": 0.15580283105373383, "learning_rate": 2.310662184422436e-05, "loss": 0.091, "step": 7632 }, { "epoch": 5.444486262702296, "grad_norm": 0.12179707735776901, "learning_rate": 2.309612535737043e-05, "loss": 0.096, "step": 7633 }, { "epoch": 5.44523899134362, "grad_norm": 0.16598497331142426, "learning_rate": 2.3085632359476252e-05, "loss": 0.1491, "step": 7634 }, { "epoch": 5.445991719984946, "grad_norm": 0.14547289907932281, "learning_rate": 2.3075142851689328e-05, "loss": 0.1074, "step": 7635 }, { "epoch": 5.44674444862627, "grad_norm": 0.15657682716846466, "learning_rate": 2.3064656835156796e-05, "loss": 0.1414, "step": 7636 }, { "epoch": 5.447497177267595, "grad_norm": 0.14257453382015228, "learning_rate": 2.3054174311025388e-05, "loss": 0.1326, "step": 7637 }, { "epoch": 5.44824990590892, "grad_norm": 0.13979020714759827, "learning_rate": 2.3043695280441432e-05, "loss": 0.1212, "step": 7638 }, { "epoch": 5.4490026345502445, "grad_norm": 0.19185958802700043, "learning_rate": 2.3033219744550926e-05, "loss": 0.1169, "step": 7639 }, { "epoch": 5.44975536319157, "grad_norm": 0.14483891427516937, "learning_rate": 2.3022747704499438e-05, "loss": 0.1195, "step": 7640 }, { "epoch": 5.450508091832894, "grad_norm": 0.19459174573421478, "learning_rate": 2.3012279161432183e-05, "loss": 0.2017, "step": 7641 }, { "epoch": 5.451260820474219, "grad_norm": 0.15990135073661804, "learning_rate": 2.3001814116493985e-05, "loss": 0.0564, "step": 7642 }, { "epoch": 5.452013549115544, "grad_norm": 0.1329207867383957, "learning_rate": 2.2991352570829303e-05, "loss": 0.0942, "step": 7643 }, { "epoch": 5.452766277756869, "grad_norm": 0.1708296537399292, "learning_rate": 2.298089452558216e-05, "loss": 0.1485, "step": 7644 }, { "epoch": 5.453519006398193, "grad_norm": 0.17081435024738312, "learning_rate": 2.2970439981896272e-05, "loss": 0.1686, "step": 7645 }, { "epoch": 5.4542717350395185, "grad_norm": 0.13854114711284637, "learning_rate": 2.2959988940914895e-05, "loss": 0.1227, "step": 7646 }, { "epoch": 5.455024463680843, "grad_norm": 0.2024560570716858, "learning_rate": 2.2949541403780958e-05, "loss": 0.1771, "step": 7647 }, { "epoch": 5.455777192322168, "grad_norm": 0.13782788813114166, "learning_rate": 2.2939097371636983e-05, "loss": 0.131, "step": 7648 }, { "epoch": 5.456529920963493, "grad_norm": 0.16083337366580963, "learning_rate": 2.292865684562513e-05, "loss": 0.1214, "step": 7649 }, { "epoch": 5.457282649604817, "grad_norm": 0.18157848715782166, "learning_rate": 2.2918219826887135e-05, "loss": 0.181, "step": 7650 }, { "epoch": 5.458035378246143, "grad_norm": 0.1679810732603073, "learning_rate": 2.2907786316564388e-05, "loss": 0.0957, "step": 7651 }, { "epoch": 5.458788106887467, "grad_norm": 0.1552109569311142, "learning_rate": 2.289735631579789e-05, "loss": 0.0482, "step": 7652 }, { "epoch": 5.4595408355287915, "grad_norm": 0.17063507437705994, "learning_rate": 2.288692982572822e-05, "loss": 0.1447, "step": 7653 }, { "epoch": 5.460293564170117, "grad_norm": 0.12334732711315155, "learning_rate": 2.2876506847495634e-05, "loss": 0.047, "step": 7654 }, { "epoch": 5.461046292811441, "grad_norm": 0.15970680117607117, "learning_rate": 2.286608738223995e-05, "loss": 0.1627, "step": 7655 }, { "epoch": 5.461799021452766, "grad_norm": 0.1734439730644226, "learning_rate": 2.285567143110066e-05, "loss": 0.1715, "step": 7656 }, { "epoch": 5.462551750094091, "grad_norm": 0.13156193494796753, "learning_rate": 2.28452589952168e-05, "loss": 0.1177, "step": 7657 }, { "epoch": 5.463304478735416, "grad_norm": 0.15389087796211243, "learning_rate": 2.2834850075727084e-05, "loss": 0.0764, "step": 7658 }, { "epoch": 5.464057207376741, "grad_norm": 0.14763544499874115, "learning_rate": 2.2824444673769788e-05, "loss": 0.1404, "step": 7659 }, { "epoch": 5.4648099360180655, "grad_norm": 0.17866025865077972, "learning_rate": 2.2814042790482863e-05, "loss": 0.1038, "step": 7660 }, { "epoch": 5.46556266465939, "grad_norm": 0.14690649509429932, "learning_rate": 2.2803644427003803e-05, "loss": 0.1046, "step": 7661 }, { "epoch": 5.466315393300715, "grad_norm": 0.12017080932855606, "learning_rate": 2.2793249584469804e-05, "loss": 0.1221, "step": 7662 }, { "epoch": 5.46706812194204, "grad_norm": 0.16115835309028625, "learning_rate": 2.2782858264017598e-05, "loss": 0.0969, "step": 7663 }, { "epoch": 5.467820850583364, "grad_norm": 0.182642862200737, "learning_rate": 2.2772470466783588e-05, "loss": 0.111, "step": 7664 }, { "epoch": 5.46857357922469, "grad_norm": 0.15447887778282166, "learning_rate": 2.276208619390374e-05, "loss": 0.0848, "step": 7665 }, { "epoch": 5.469326307866014, "grad_norm": 0.1456609070301056, "learning_rate": 2.2751705446513694e-05, "loss": 0.1756, "step": 7666 }, { "epoch": 5.4700790365073395, "grad_norm": 0.13966234028339386, "learning_rate": 2.2741328225748624e-05, "loss": 0.0936, "step": 7667 }, { "epoch": 5.470831765148664, "grad_norm": 0.1577167958021164, "learning_rate": 2.273095453274343e-05, "loss": 0.1309, "step": 7668 }, { "epoch": 5.4715844937899885, "grad_norm": 0.17583030462265015, "learning_rate": 2.2720584368632513e-05, "loss": 0.1245, "step": 7669 }, { "epoch": 5.472337222431314, "grad_norm": 0.1315109133720398, "learning_rate": 2.2710217734549975e-05, "loss": 0.0993, "step": 7670 }, { "epoch": 5.473089951072638, "grad_norm": 0.14894984662532806, "learning_rate": 2.269985463162947e-05, "loss": 0.0764, "step": 7671 }, { "epoch": 5.473842679713963, "grad_norm": 0.16225658357143402, "learning_rate": 2.2689495061004292e-05, "loss": 0.101, "step": 7672 }, { "epoch": 5.474595408355288, "grad_norm": 0.1554919332265854, "learning_rate": 2.267913902380736e-05, "loss": 0.1322, "step": 7673 }, { "epoch": 5.475348136996613, "grad_norm": 0.16473914682865143, "learning_rate": 2.26687865211712e-05, "loss": 0.0975, "step": 7674 }, { "epoch": 5.476100865637937, "grad_norm": 0.13918405771255493, "learning_rate": 2.265843755422793e-05, "loss": 0.1274, "step": 7675 }, { "epoch": 5.476853594279262, "grad_norm": 0.17155703902244568, "learning_rate": 2.26480921241093e-05, "loss": 0.1206, "step": 7676 }, { "epoch": 5.477606322920587, "grad_norm": 0.1683661937713623, "learning_rate": 2.263775023194669e-05, "loss": 0.1549, "step": 7677 }, { "epoch": 5.478359051561912, "grad_norm": 0.1345725953578949, "learning_rate": 2.2627411878871045e-05, "loss": 0.0584, "step": 7678 }, { "epoch": 5.479111780203237, "grad_norm": 0.13744117319583893, "learning_rate": 2.2617077066012975e-05, "loss": 0.0618, "step": 7679 }, { "epoch": 5.479864508844561, "grad_norm": 0.13983015716075897, "learning_rate": 2.260674579450265e-05, "loss": 0.1051, "step": 7680 }, { "epoch": 5.480617237485887, "grad_norm": 0.17332333326339722, "learning_rate": 2.2596418065469923e-05, "loss": 0.1318, "step": 7681 }, { "epoch": 5.481369966127211, "grad_norm": 0.14789903163909912, "learning_rate": 2.258609388004419e-05, "loss": 0.1209, "step": 7682 }, { "epoch": 5.4821226947685355, "grad_norm": 0.16437946259975433, "learning_rate": 2.257577323935451e-05, "loss": 0.0918, "step": 7683 }, { "epoch": 5.482875423409861, "grad_norm": 0.15368373692035675, "learning_rate": 2.2565456144529502e-05, "loss": 0.0874, "step": 7684 }, { "epoch": 5.483628152051185, "grad_norm": 0.16783533990383148, "learning_rate": 2.255514259669746e-05, "loss": 0.1097, "step": 7685 }, { "epoch": 5.484380880692511, "grad_norm": 0.1405455768108368, "learning_rate": 2.2544832596986218e-05, "loss": 0.1327, "step": 7686 }, { "epoch": 5.485133609333835, "grad_norm": 0.15590892732143402, "learning_rate": 2.2534526146523317e-05, "loss": 0.0834, "step": 7687 }, { "epoch": 5.48588633797516, "grad_norm": 0.1790473610162735, "learning_rate": 2.2524223246435806e-05, "loss": 0.1072, "step": 7688 }, { "epoch": 5.486639066616485, "grad_norm": 0.13494907319545746, "learning_rate": 2.2513923897850432e-05, "loss": 0.0606, "step": 7689 }, { "epoch": 5.4873917952578095, "grad_norm": 0.13440102338790894, "learning_rate": 2.2503628101893485e-05, "loss": 0.1324, "step": 7690 }, { "epoch": 5.488144523899134, "grad_norm": 0.17239104211330414, "learning_rate": 2.249333585969092e-05, "loss": 0.0706, "step": 7691 }, { "epoch": 5.488897252540459, "grad_norm": 0.1661268174648285, "learning_rate": 2.248304717236825e-05, "loss": 0.0652, "step": 7692 }, { "epoch": 5.489649981181784, "grad_norm": 0.13056455552577972, "learning_rate": 2.2472762041050678e-05, "loss": 0.0949, "step": 7693 }, { "epoch": 5.490402709823109, "grad_norm": 0.15914514660835266, "learning_rate": 2.2462480466862933e-05, "loss": 0.1396, "step": 7694 }, { "epoch": 5.491155438464434, "grad_norm": 0.13217374682426453, "learning_rate": 2.2452202450929415e-05, "loss": 0.1163, "step": 7695 }, { "epoch": 5.491908167105758, "grad_norm": 0.149682879447937, "learning_rate": 2.244192799437409e-05, "loss": 0.0767, "step": 7696 }, { "epoch": 5.4926608957470835, "grad_norm": 0.17173443734645844, "learning_rate": 2.243165709832057e-05, "loss": 0.1552, "step": 7697 }, { "epoch": 5.493413624388408, "grad_norm": 0.1457231044769287, "learning_rate": 2.242138976389208e-05, "loss": 0.0787, "step": 7698 }, { "epoch": 5.494166353029732, "grad_norm": 0.14605532586574554, "learning_rate": 2.2411125992211407e-05, "loss": 0.1197, "step": 7699 }, { "epoch": 5.494919081671058, "grad_norm": 0.148235023021698, "learning_rate": 2.2400865784401e-05, "loss": 0.0754, "step": 7700 }, { "epoch": 5.495671810312382, "grad_norm": 0.16947518289089203, "learning_rate": 2.2390609141582902e-05, "loss": 0.0905, "step": 7701 }, { "epoch": 5.496424538953708, "grad_norm": 0.19195494055747986, "learning_rate": 2.2380356064878766e-05, "loss": 0.2063, "step": 7702 }, { "epoch": 5.497177267595032, "grad_norm": 0.13682378828525543, "learning_rate": 2.237010655540984e-05, "loss": 0.0912, "step": 7703 }, { "epoch": 5.497929996236357, "grad_norm": 0.19966939091682434, "learning_rate": 2.2359860614297018e-05, "loss": 0.1005, "step": 7704 }, { "epoch": 5.498682724877682, "grad_norm": 0.17037861049175262, "learning_rate": 2.2349618242660732e-05, "loss": 0.1329, "step": 7705 }, { "epoch": 5.499435453519006, "grad_norm": 0.16232101619243622, "learning_rate": 2.233937944162114e-05, "loss": 0.1042, "step": 7706 }, { "epoch": 5.500188182160331, "grad_norm": 0.17713090777397156, "learning_rate": 2.2329144212297888e-05, "loss": 0.1591, "step": 7707 }, { "epoch": 5.500940910801656, "grad_norm": 0.12997257709503174, "learning_rate": 2.2318912555810328e-05, "loss": 0.087, "step": 7708 }, { "epoch": 5.501693639442981, "grad_norm": 0.1405528336763382, "learning_rate": 2.2308684473277337e-05, "loss": 0.0867, "step": 7709 }, { "epoch": 5.502446368084305, "grad_norm": 0.16607427597045898, "learning_rate": 2.229845996581747e-05, "loss": 0.0816, "step": 7710 }, { "epoch": 5.503199096725631, "grad_norm": 0.16824765503406525, "learning_rate": 2.228823903454883e-05, "loss": 0.1439, "step": 7711 }, { "epoch": 5.503951825366955, "grad_norm": 0.15088388323783875, "learning_rate": 2.227802168058922e-05, "loss": 0.0721, "step": 7712 }, { "epoch": 5.5047045540082795, "grad_norm": 0.15591658651828766, "learning_rate": 2.2267807905055944e-05, "loss": 0.1732, "step": 7713 }, { "epoch": 5.505457282649605, "grad_norm": 0.1756925731897354, "learning_rate": 2.2257597709066002e-05, "loss": 0.1703, "step": 7714 }, { "epoch": 5.506210011290929, "grad_norm": 0.1101757287979126, "learning_rate": 2.2247391093735924e-05, "loss": 0.1342, "step": 7715 }, { "epoch": 5.506962739932255, "grad_norm": 0.16053858399391174, "learning_rate": 2.2237188060181937e-05, "loss": 0.1772, "step": 7716 }, { "epoch": 5.507715468573579, "grad_norm": 0.14591525495052338, "learning_rate": 2.222698860951979e-05, "loss": 0.0961, "step": 7717 }, { "epoch": 5.508468197214904, "grad_norm": 0.17863665521144867, "learning_rate": 2.2216792742864888e-05, "loss": 0.133, "step": 7718 }, { "epoch": 5.509220925856229, "grad_norm": 0.1822928488254547, "learning_rate": 2.220660046133225e-05, "loss": 0.1542, "step": 7719 }, { "epoch": 5.5099736544975535, "grad_norm": 0.12559135258197784, "learning_rate": 2.219641176603649e-05, "loss": 0.0838, "step": 7720 }, { "epoch": 5.510726383138879, "grad_norm": 0.1301291435956955, "learning_rate": 2.218622665809181e-05, "loss": 0.0782, "step": 7721 }, { "epoch": 5.511479111780203, "grad_norm": 0.1448577493429184, "learning_rate": 2.2176045138612045e-05, "loss": 0.099, "step": 7722 }, { "epoch": 5.512231840421528, "grad_norm": 0.14690056443214417, "learning_rate": 2.2165867208710653e-05, "loss": 0.1102, "step": 7723 }, { "epoch": 5.512984569062853, "grad_norm": 0.13975003361701965, "learning_rate": 2.215569286950064e-05, "loss": 0.089, "step": 7724 }, { "epoch": 5.513737297704178, "grad_norm": 0.16580407321453094, "learning_rate": 2.2145522122094677e-05, "loss": 0.1389, "step": 7725 }, { "epoch": 5.514490026345502, "grad_norm": 0.13816474378108978, "learning_rate": 2.213535496760501e-05, "loss": 0.1107, "step": 7726 }, { "epoch": 5.5152427549868275, "grad_norm": 0.13922446966171265, "learning_rate": 2.2125191407143535e-05, "loss": 0.1445, "step": 7727 }, { "epoch": 5.515995483628152, "grad_norm": 0.15846773982048035, "learning_rate": 2.2115031441821685e-05, "loss": 0.0914, "step": 7728 }, { "epoch": 5.516748212269476, "grad_norm": 0.1623041033744812, "learning_rate": 2.210487507275057e-05, "loss": 0.1769, "step": 7729 }, { "epoch": 5.517500940910802, "grad_norm": 0.15593558549880981, "learning_rate": 2.2094722301040836e-05, "loss": 0.0946, "step": 7730 }, { "epoch": 5.518253669552126, "grad_norm": 0.1708330512046814, "learning_rate": 2.208457312780281e-05, "loss": 0.1082, "step": 7731 }, { "epoch": 5.519006398193452, "grad_norm": 0.15006716549396515, "learning_rate": 2.2074427554146376e-05, "loss": 0.1192, "step": 7732 }, { "epoch": 5.519759126834776, "grad_norm": 0.13436046242713928, "learning_rate": 2.2064285581181056e-05, "loss": 0.1148, "step": 7733 }, { "epoch": 5.520511855476101, "grad_norm": 0.16617175936698914, "learning_rate": 2.2054147210015935e-05, "loss": 0.1649, "step": 7734 }, { "epoch": 5.521264584117426, "grad_norm": 0.2014179229736328, "learning_rate": 2.2044012441759748e-05, "loss": 0.1067, "step": 7735 }, { "epoch": 5.52201731275875, "grad_norm": 0.1296422779560089, "learning_rate": 2.2033881277520805e-05, "loss": 0.0613, "step": 7736 }, { "epoch": 5.522770041400076, "grad_norm": 0.1471654623746872, "learning_rate": 2.2023753718407043e-05, "loss": 0.2044, "step": 7737 }, { "epoch": 5.5235227700414, "grad_norm": 0.1533699631690979, "learning_rate": 2.201362976552599e-05, "loss": 0.0922, "step": 7738 }, { "epoch": 5.524275498682725, "grad_norm": 0.13477826118469238, "learning_rate": 2.200350941998481e-05, "loss": 0.0831, "step": 7739 }, { "epoch": 5.52502822732405, "grad_norm": 0.13820743560791016, "learning_rate": 2.1993392682890214e-05, "loss": 0.1407, "step": 7740 }, { "epoch": 5.5257809559653746, "grad_norm": 0.14675122499465942, "learning_rate": 2.1983279555348586e-05, "loss": 0.1169, "step": 7741 }, { "epoch": 5.526533684606699, "grad_norm": 0.17832238972187042, "learning_rate": 2.1973170038465846e-05, "loss": 0.1072, "step": 7742 }, { "epoch": 5.527286413248024, "grad_norm": 0.15121306478977203, "learning_rate": 2.196306413334758e-05, "loss": 0.1151, "step": 7743 }, { "epoch": 5.528039141889349, "grad_norm": 0.13406214118003845, "learning_rate": 2.1952961841098944e-05, "loss": 0.069, "step": 7744 }, { "epoch": 5.528791870530673, "grad_norm": 0.15514910221099854, "learning_rate": 2.1942863162824735e-05, "loss": 0.1487, "step": 7745 }, { "epoch": 5.529544599171999, "grad_norm": 0.16488681733608246, "learning_rate": 2.1932768099629296e-05, "loss": 0.1186, "step": 7746 }, { "epoch": 5.530297327813323, "grad_norm": 0.15047073364257812, "learning_rate": 2.192267665261662e-05, "loss": 0.1288, "step": 7747 }, { "epoch": 5.531050056454648, "grad_norm": 0.14478440582752228, "learning_rate": 2.191258882289031e-05, "loss": 0.0953, "step": 7748 }, { "epoch": 5.531802785095973, "grad_norm": 0.14361904561519623, "learning_rate": 2.190250461155353e-05, "loss": 0.053, "step": 7749 }, { "epoch": 5.5325555137372975, "grad_norm": 0.1526450514793396, "learning_rate": 2.189242401970908e-05, "loss": 0.0896, "step": 7750 }, { "epoch": 5.533308242378623, "grad_norm": 0.14996446669101715, "learning_rate": 2.1882347048459362e-05, "loss": 0.1029, "step": 7751 }, { "epoch": 5.534060971019947, "grad_norm": 0.12664341926574707, "learning_rate": 2.1872273698906394e-05, "loss": 0.102, "step": 7752 }, { "epoch": 5.534813699661272, "grad_norm": 0.11801361292600632, "learning_rate": 2.1862203972151752e-05, "loss": 0.0474, "step": 7753 }, { "epoch": 5.535566428302597, "grad_norm": 0.1686408668756485, "learning_rate": 2.1852137869296678e-05, "loss": 0.1363, "step": 7754 }, { "epoch": 5.536319156943922, "grad_norm": 0.13685083389282227, "learning_rate": 2.1842075391441952e-05, "loss": 0.0905, "step": 7755 }, { "epoch": 5.537071885585247, "grad_norm": 0.1479780673980713, "learning_rate": 2.1832016539688012e-05, "loss": 0.141, "step": 7756 }, { "epoch": 5.5378246142265715, "grad_norm": 0.12322358787059784, "learning_rate": 2.1821961315134876e-05, "loss": 0.0617, "step": 7757 }, { "epoch": 5.538577342867896, "grad_norm": 0.15533509850502014, "learning_rate": 2.181190971888218e-05, "loss": 0.0594, "step": 7758 }, { "epoch": 5.539330071509221, "grad_norm": 0.16506251692771912, "learning_rate": 2.1801861752029128e-05, "loss": 0.1776, "step": 7759 }, { "epoch": 5.540082800150546, "grad_norm": 0.1533549726009369, "learning_rate": 2.1791817415674575e-05, "loss": 0.0995, "step": 7760 }, { "epoch": 5.54083552879187, "grad_norm": 0.14790290594100952, "learning_rate": 2.1781776710916928e-05, "loss": 0.1248, "step": 7761 }, { "epoch": 5.541588257433196, "grad_norm": 0.14214998483657837, "learning_rate": 2.1771739638854238e-05, "loss": 0.0825, "step": 7762 }, { "epoch": 5.54234098607452, "grad_norm": 0.14815092086791992, "learning_rate": 2.1761706200584143e-05, "loss": 0.0744, "step": 7763 }, { "epoch": 5.5430937147158446, "grad_norm": 0.12753501534461975, "learning_rate": 2.1751676397203898e-05, "loss": 0.0894, "step": 7764 }, { "epoch": 5.54384644335717, "grad_norm": 0.11361797899007797, "learning_rate": 2.1741650229810324e-05, "loss": 0.0949, "step": 7765 }, { "epoch": 5.544599171998494, "grad_norm": 0.17631860077381134, "learning_rate": 2.1731627699499883e-05, "loss": 0.0995, "step": 7766 }, { "epoch": 5.545351900639819, "grad_norm": 0.15818293392658234, "learning_rate": 2.172160880736861e-05, "loss": 0.1635, "step": 7767 }, { "epoch": 5.546104629281144, "grad_norm": 0.16398882865905762, "learning_rate": 2.1711593554512167e-05, "loss": 0.1033, "step": 7768 }, { "epoch": 5.546857357922469, "grad_norm": 0.13376326858997345, "learning_rate": 2.1701581942025795e-05, "loss": 0.1036, "step": 7769 }, { "epoch": 5.547610086563794, "grad_norm": 0.14820776879787445, "learning_rate": 2.1691573971004378e-05, "loss": 0.0913, "step": 7770 }, { "epoch": 5.5483628152051185, "grad_norm": 0.14738251268863678, "learning_rate": 2.1681569642542342e-05, "loss": 0.1215, "step": 7771 }, { "epoch": 5.549115543846443, "grad_norm": 0.15457892417907715, "learning_rate": 2.1671568957733753e-05, "loss": 0.1485, "step": 7772 }, { "epoch": 5.549868272487768, "grad_norm": 0.14627674221992493, "learning_rate": 2.166157191767229e-05, "loss": 0.1369, "step": 7773 }, { "epoch": 5.550621001129093, "grad_norm": 0.13878233730793, "learning_rate": 2.1651578523451187e-05, "loss": 0.0781, "step": 7774 }, { "epoch": 5.551373729770418, "grad_norm": 0.13471581041812897, "learning_rate": 2.1641588776163313e-05, "loss": 0.1327, "step": 7775 }, { "epoch": 5.552126458411743, "grad_norm": 0.15699532628059387, "learning_rate": 2.1631602676901142e-05, "loss": 0.0919, "step": 7776 }, { "epoch": 5.552879187053067, "grad_norm": 0.1705121546983719, "learning_rate": 2.1621620226756745e-05, "loss": 0.0909, "step": 7777 }, { "epoch": 5.5536319156943925, "grad_norm": 0.148708313703537, "learning_rate": 2.1611641426821756e-05, "loss": 0.0954, "step": 7778 }, { "epoch": 5.554384644335717, "grad_norm": 0.12300820648670197, "learning_rate": 2.1601666278187484e-05, "loss": 0.0934, "step": 7779 }, { "epoch": 5.5551373729770415, "grad_norm": 0.16511212289333344, "learning_rate": 2.159169478194476e-05, "loss": 0.2053, "step": 7780 }, { "epoch": 5.555890101618367, "grad_norm": 0.14838775992393494, "learning_rate": 2.1581726939184062e-05, "loss": 0.1005, "step": 7781 }, { "epoch": 5.556642830259691, "grad_norm": 0.12777592241764069, "learning_rate": 2.1571762750995462e-05, "loss": 0.0907, "step": 7782 }, { "epoch": 5.557395558901016, "grad_norm": 0.14914824068546295, "learning_rate": 2.1561802218468645e-05, "loss": 0.1089, "step": 7783 }, { "epoch": 5.558148287542341, "grad_norm": 0.14019016921520233, "learning_rate": 2.155184534269285e-05, "loss": 0.1494, "step": 7784 }, { "epoch": 5.558901016183666, "grad_norm": 0.1516193449497223, "learning_rate": 2.1541892124756975e-05, "loss": 0.0719, "step": 7785 }, { "epoch": 5.559653744824991, "grad_norm": 0.13814623653888702, "learning_rate": 2.153194256574946e-05, "loss": 0.0581, "step": 7786 }, { "epoch": 5.5604064734663154, "grad_norm": 0.1627146601676941, "learning_rate": 2.1521996666758397e-05, "loss": 0.0769, "step": 7787 }, { "epoch": 5.56115920210764, "grad_norm": 0.14365795254707336, "learning_rate": 2.151205442887142e-05, "loss": 0.0813, "step": 7788 }, { "epoch": 5.561911930748965, "grad_norm": 0.18133021891117096, "learning_rate": 2.1502115853175854e-05, "loss": 0.1258, "step": 7789 }, { "epoch": 5.56266465939029, "grad_norm": 0.1480162888765335, "learning_rate": 2.1492180940758518e-05, "loss": 0.1333, "step": 7790 }, { "epoch": 5.563417388031614, "grad_norm": 0.1768949329853058, "learning_rate": 2.1482249692705902e-05, "loss": 0.1793, "step": 7791 }, { "epoch": 5.56417011667294, "grad_norm": 0.1790185421705246, "learning_rate": 2.1472322110104062e-05, "loss": 0.15, "step": 7792 }, { "epoch": 5.564922845314264, "grad_norm": 0.15783792734146118, "learning_rate": 2.1462398194038668e-05, "loss": 0.11, "step": 7793 }, { "epoch": 5.565675573955589, "grad_norm": 0.15060624480247498, "learning_rate": 2.1452477945594984e-05, "loss": 0.0893, "step": 7794 }, { "epoch": 5.566428302596914, "grad_norm": 0.13534320890903473, "learning_rate": 2.1442561365857884e-05, "loss": 0.1297, "step": 7795 }, { "epoch": 5.567181031238238, "grad_norm": 0.1368139237165451, "learning_rate": 2.1432648455911808e-05, "loss": 0.1213, "step": 7796 }, { "epoch": 5.567933759879564, "grad_norm": 0.17152991890907288, "learning_rate": 2.1422739216840828e-05, "loss": 0.0811, "step": 7797 }, { "epoch": 5.568686488520888, "grad_norm": 0.12677054107189178, "learning_rate": 2.1412833649728623e-05, "loss": 0.05, "step": 7798 }, { "epoch": 5.569439217162213, "grad_norm": 0.17308573424816132, "learning_rate": 2.140293175565841e-05, "loss": 0.1177, "step": 7799 }, { "epoch": 5.570191945803538, "grad_norm": 0.15338030457496643, "learning_rate": 2.1393033535713093e-05, "loss": 0.0963, "step": 7800 }, { "epoch": 5.570191945803538, "eval_loss": 0.1547217071056366, "eval_runtime": 456.1165, "eval_samples_per_second": 21.106, "eval_steps_per_second": 0.66, "step": 7800 }, { "epoch": 5.5709446744448625, "grad_norm": 0.16665200889110565, "learning_rate": 2.1383138990975062e-05, "loss": 0.1329, "step": 7801 }, { "epoch": 5.571697403086187, "grad_norm": 0.1481386423110962, "learning_rate": 2.137324812252644e-05, "loss": 0.1216, "step": 7802 }, { "epoch": 5.572450131727512, "grad_norm": 0.14953571557998657, "learning_rate": 2.136336093144884e-05, "loss": 0.1605, "step": 7803 }, { "epoch": 5.573202860368837, "grad_norm": 0.19721561670303345, "learning_rate": 2.135347741882352e-05, "loss": 0.1688, "step": 7804 }, { "epoch": 5.573955589010162, "grad_norm": 0.15717175602912903, "learning_rate": 2.1343597585731318e-05, "loss": 0.1381, "step": 7805 }, { "epoch": 5.574708317651487, "grad_norm": 0.13859190046787262, "learning_rate": 2.1333721433252693e-05, "loss": 0.0427, "step": 7806 }, { "epoch": 5.575461046292811, "grad_norm": 0.1357063502073288, "learning_rate": 2.1323848962467646e-05, "loss": 0.0587, "step": 7807 }, { "epoch": 5.5762137749341365, "grad_norm": 0.17948107421398163, "learning_rate": 2.1313980174455876e-05, "loss": 0.1161, "step": 7808 }, { "epoch": 5.576966503575461, "grad_norm": 0.12828190624713898, "learning_rate": 2.1304115070296565e-05, "loss": 0.08, "step": 7809 }, { "epoch": 5.577719232216786, "grad_norm": 0.14726455509662628, "learning_rate": 2.1294253651068597e-05, "loss": 0.0557, "step": 7810 }, { "epoch": 5.578471960858111, "grad_norm": 0.16297371685504913, "learning_rate": 2.128439591785035e-05, "loss": 0.1167, "step": 7811 }, { "epoch": 5.579224689499435, "grad_norm": 0.14641954004764557, "learning_rate": 2.127454187171989e-05, "loss": 0.1348, "step": 7812 }, { "epoch": 5.579977418140761, "grad_norm": 0.11848026514053345, "learning_rate": 2.1264691513754802e-05, "loss": 0.0698, "step": 7813 }, { "epoch": 5.580730146782085, "grad_norm": 0.14295591413974762, "learning_rate": 2.1254844845032353e-05, "loss": 0.0954, "step": 7814 }, { "epoch": 5.58148287542341, "grad_norm": 0.13853958249092102, "learning_rate": 2.1245001866629322e-05, "loss": 0.1134, "step": 7815 }, { "epoch": 5.582235604064735, "grad_norm": 0.14856480062007904, "learning_rate": 2.1235162579622154e-05, "loss": 0.0937, "step": 7816 }, { "epoch": 5.582988332706059, "grad_norm": 0.15707938373088837, "learning_rate": 2.1225326985086825e-05, "loss": 0.1182, "step": 7817 }, { "epoch": 5.583741061347384, "grad_norm": 0.16106067597866058, "learning_rate": 2.1215495084098957e-05, "loss": 0.156, "step": 7818 }, { "epoch": 5.584493789988709, "grad_norm": 0.13646547496318817, "learning_rate": 2.120566687773377e-05, "loss": 0.1102, "step": 7819 }, { "epoch": 5.585246518630034, "grad_norm": 0.14048007130622864, "learning_rate": 2.1195842367066033e-05, "loss": 0.0536, "step": 7820 }, { "epoch": 5.585999247271358, "grad_norm": 0.13369877636432648, "learning_rate": 2.1186021553170148e-05, "loss": 0.052, "step": 7821 }, { "epoch": 5.586751975912684, "grad_norm": 0.12528640031814575, "learning_rate": 2.1176204437120107e-05, "loss": 0.0598, "step": 7822 }, { "epoch": 5.587504704554008, "grad_norm": 0.16548603773117065, "learning_rate": 2.1166391019989503e-05, "loss": 0.0903, "step": 7823 }, { "epoch": 5.588257433195333, "grad_norm": 0.1598331183195114, "learning_rate": 2.11565813028515e-05, "loss": 0.1431, "step": 7824 }, { "epoch": 5.589010161836658, "grad_norm": 0.13925306499004364, "learning_rate": 2.1146775286778902e-05, "loss": 0.1031, "step": 7825 }, { "epoch": 5.589762890477982, "grad_norm": 0.12072675675153732, "learning_rate": 2.1136972972844026e-05, "loss": 0.0736, "step": 7826 }, { "epoch": 5.590515619119308, "grad_norm": 0.16759660840034485, "learning_rate": 2.1127174362118903e-05, "loss": 0.1043, "step": 7827 }, { "epoch": 5.591268347760632, "grad_norm": 0.1396959275007248, "learning_rate": 2.111737945567505e-05, "loss": 0.0476, "step": 7828 }, { "epoch": 5.592021076401958, "grad_norm": 0.1678948998451233, "learning_rate": 2.1107588254583655e-05, "loss": 0.1648, "step": 7829 }, { "epoch": 5.592773805043282, "grad_norm": 0.146683469414711, "learning_rate": 2.1097800759915436e-05, "loss": 0.0701, "step": 7830 }, { "epoch": 5.5935265336846065, "grad_norm": 0.14213062822818756, "learning_rate": 2.1088016972740767e-05, "loss": 0.1126, "step": 7831 }, { "epoch": 5.594279262325932, "grad_norm": 0.1572636365890503, "learning_rate": 2.107823689412955e-05, "loss": 0.1575, "step": 7832 }, { "epoch": 5.595031990967256, "grad_norm": 0.17100737988948822, "learning_rate": 2.1068460525151378e-05, "loss": 0.1518, "step": 7833 }, { "epoch": 5.595784719608581, "grad_norm": 0.14204001426696777, "learning_rate": 2.1058687866875328e-05, "loss": 0.0991, "step": 7834 }, { "epoch": 5.596537448249906, "grad_norm": 0.13982917368412018, "learning_rate": 2.1048918920370158e-05, "loss": 0.095, "step": 7835 }, { "epoch": 5.597290176891231, "grad_norm": 0.1692177653312683, "learning_rate": 2.1039153686704154e-05, "loss": 0.1396, "step": 7836 }, { "epoch": 5.598042905532555, "grad_norm": 0.1394149661064148, "learning_rate": 2.102939216694526e-05, "loss": 0.1451, "step": 7837 }, { "epoch": 5.5987956341738805, "grad_norm": 0.13459272682666779, "learning_rate": 2.1019634362160952e-05, "loss": 0.0817, "step": 7838 }, { "epoch": 5.599548362815205, "grad_norm": 0.14721041917800903, "learning_rate": 2.1009880273418345e-05, "loss": 0.1295, "step": 7839 }, { "epoch": 5.600301091456529, "grad_norm": 0.17639806866645813, "learning_rate": 2.1000129901784122e-05, "loss": 0.1032, "step": 7840 }, { "epoch": 5.601053820097855, "grad_norm": 0.15126235783100128, "learning_rate": 2.099038324832459e-05, "loss": 0.1134, "step": 7841 }, { "epoch": 5.601806548739179, "grad_norm": 0.16770963370800018, "learning_rate": 2.0980640314105597e-05, "loss": 0.1335, "step": 7842 }, { "epoch": 5.602559277380505, "grad_norm": 0.1567005217075348, "learning_rate": 2.0970901100192636e-05, "loss": 0.1291, "step": 7843 }, { "epoch": 5.603312006021829, "grad_norm": 0.15425221621990204, "learning_rate": 2.096116560765078e-05, "loss": 0.1031, "step": 7844 }, { "epoch": 5.604064734663154, "grad_norm": 0.1541232466697693, "learning_rate": 2.0951433837544658e-05, "loss": 0.1074, "step": 7845 }, { "epoch": 5.604817463304479, "grad_norm": 0.14612272381782532, "learning_rate": 2.094170579093855e-05, "loss": 0.1512, "step": 7846 }, { "epoch": 5.605570191945803, "grad_norm": 0.17154796421527863, "learning_rate": 2.0931981468896284e-05, "loss": 0.1397, "step": 7847 }, { "epoch": 5.606322920587129, "grad_norm": 0.1064739003777504, "learning_rate": 2.0922260872481326e-05, "loss": 0.0645, "step": 7848 }, { "epoch": 5.607075649228453, "grad_norm": 0.14074738323688507, "learning_rate": 2.0912544002756662e-05, "loss": 0.0951, "step": 7849 }, { "epoch": 5.607828377869778, "grad_norm": 0.1537979543209076, "learning_rate": 2.090283086078495e-05, "loss": 0.0635, "step": 7850 }, { "epoch": 5.608581106511103, "grad_norm": 0.1658768653869629, "learning_rate": 2.089312144762838e-05, "loss": 0.1364, "step": 7851 }, { "epoch": 5.609333835152428, "grad_norm": 0.1642216593027115, "learning_rate": 2.0883415764348767e-05, "loss": 0.1198, "step": 7852 }, { "epoch": 5.610086563793752, "grad_norm": 0.1615837961435318, "learning_rate": 2.0873713812007517e-05, "loss": 0.1832, "step": 7853 }, { "epoch": 5.610839292435077, "grad_norm": 0.1343601495027542, "learning_rate": 2.0864015591665626e-05, "loss": 0.1074, "step": 7854 }, { "epoch": 5.611592021076402, "grad_norm": 0.13949230313301086, "learning_rate": 2.0854321104383653e-05, "loss": 0.066, "step": 7855 }, { "epoch": 5.612344749717726, "grad_norm": 0.13001736998558044, "learning_rate": 2.0844630351221804e-05, "loss": 0.077, "step": 7856 }, { "epoch": 5.613097478359052, "grad_norm": 0.1532149463891983, "learning_rate": 2.083494333323981e-05, "loss": 0.1102, "step": 7857 }, { "epoch": 5.613850207000376, "grad_norm": 0.13968215882778168, "learning_rate": 2.0825260051497055e-05, "loss": 0.0863, "step": 7858 }, { "epoch": 5.6146029356417015, "grad_norm": 0.16089586913585663, "learning_rate": 2.0815580507052472e-05, "loss": 0.0873, "step": 7859 }, { "epoch": 5.615355664283026, "grad_norm": 0.13487793505191803, "learning_rate": 2.0805904700964624e-05, "loss": 0.0717, "step": 7860 }, { "epoch": 5.6161083929243505, "grad_norm": 0.1472185254096985, "learning_rate": 2.0796232634291614e-05, "loss": 0.1078, "step": 7861 }, { "epoch": 5.616861121565676, "grad_norm": 0.17152653634548187, "learning_rate": 2.0786564308091196e-05, "loss": 0.0835, "step": 7862 }, { "epoch": 5.617613850207, "grad_norm": 0.12198150902986526, "learning_rate": 2.0776899723420647e-05, "loss": 0.0424, "step": 7863 }, { "epoch": 5.618366578848325, "grad_norm": 0.14959099888801575, "learning_rate": 2.0767238881336898e-05, "loss": 0.1296, "step": 7864 }, { "epoch": 5.61911930748965, "grad_norm": 0.17132210731506348, "learning_rate": 2.075758178289643e-05, "loss": 0.102, "step": 7865 }, { "epoch": 5.619872036130975, "grad_norm": 0.14995494484901428, "learning_rate": 2.074792842915535e-05, "loss": 0.0914, "step": 7866 }, { "epoch": 5.6206247647723, "grad_norm": 0.1558380424976349, "learning_rate": 2.073827882116931e-05, "loss": 0.1641, "step": 7867 }, { "epoch": 5.6213774934136245, "grad_norm": 0.1441742479801178, "learning_rate": 2.0728632959993584e-05, "loss": 0.1222, "step": 7868 }, { "epoch": 5.622130222054949, "grad_norm": 0.14467015862464905, "learning_rate": 2.071899084668304e-05, "loss": 0.1023, "step": 7869 }, { "epoch": 5.622882950696274, "grad_norm": 0.13036790490150452, "learning_rate": 2.070935248229211e-05, "loss": 0.0806, "step": 7870 }, { "epoch": 5.623635679337599, "grad_norm": 0.1564972996711731, "learning_rate": 2.0699717867874834e-05, "loss": 0.0795, "step": 7871 }, { "epoch": 5.624388407978923, "grad_norm": 0.19823040068149567, "learning_rate": 2.0690087004484844e-05, "loss": 0.2195, "step": 7872 }, { "epoch": 5.625141136620249, "grad_norm": 0.13274279236793518, "learning_rate": 2.0680459893175366e-05, "loss": 0.0866, "step": 7873 }, { "epoch": 5.625893865261573, "grad_norm": 0.14304956793785095, "learning_rate": 2.0670836534999186e-05, "loss": 0.0982, "step": 7874 }, { "epoch": 5.626646593902898, "grad_norm": 0.14640608429908752, "learning_rate": 2.0661216931008714e-05, "loss": 0.1362, "step": 7875 }, { "epoch": 5.627399322544223, "grad_norm": 0.12799763679504395, "learning_rate": 2.065160108225593e-05, "loss": 0.1335, "step": 7876 }, { "epoch": 5.628152051185547, "grad_norm": 0.18645308911800385, "learning_rate": 2.06419889897924e-05, "loss": 0.1586, "step": 7877 }, { "epoch": 5.628904779826873, "grad_norm": 0.1497257798910141, "learning_rate": 2.06323806546693e-05, "loss": 0.1228, "step": 7878 }, { "epoch": 5.629657508468197, "grad_norm": 0.14605973660945892, "learning_rate": 2.0622776077937402e-05, "loss": 0.1394, "step": 7879 }, { "epoch": 5.630410237109522, "grad_norm": 0.12283100932836533, "learning_rate": 2.0613175260647e-05, "loss": 0.0818, "step": 7880 }, { "epoch": 5.631162965750847, "grad_norm": 0.15321855247020721, "learning_rate": 2.060357820384807e-05, "loss": 0.1685, "step": 7881 }, { "epoch": 5.6319156943921715, "grad_norm": 0.14307954907417297, "learning_rate": 2.0593984908590104e-05, "loss": 0.1655, "step": 7882 }, { "epoch": 5.632668423033497, "grad_norm": 0.12794429063796997, "learning_rate": 2.0584395375922216e-05, "loss": 0.0779, "step": 7883 }, { "epoch": 5.633421151674821, "grad_norm": 0.1354195922613144, "learning_rate": 2.0574809606893113e-05, "loss": 0.05, "step": 7884 }, { "epoch": 5.634173880316146, "grad_norm": 0.15371212363243103, "learning_rate": 2.0565227602551078e-05, "loss": 0.132, "step": 7885 }, { "epoch": 5.634926608957471, "grad_norm": 0.15156911313533783, "learning_rate": 2.055564936394397e-05, "loss": 0.1307, "step": 7886 }, { "epoch": 5.635679337598796, "grad_norm": 0.14159630239009857, "learning_rate": 2.0546074892119276e-05, "loss": 0.0838, "step": 7887 }, { "epoch": 5.63643206624012, "grad_norm": 0.1496749073266983, "learning_rate": 2.053650418812402e-05, "loss": 0.1089, "step": 7888 }, { "epoch": 5.6371847948814455, "grad_norm": 0.16925877332687378, "learning_rate": 2.0526937253004848e-05, "loss": 0.155, "step": 7889 }, { "epoch": 5.63793752352277, "grad_norm": 0.151058167219162, "learning_rate": 2.0517374087807986e-05, "loss": 0.1597, "step": 7890 }, { "epoch": 5.6386902521640945, "grad_norm": 0.12702524662017822, "learning_rate": 2.0507814693579263e-05, "loss": 0.0854, "step": 7891 }, { "epoch": 5.63944298080542, "grad_norm": 0.15507711470127106, "learning_rate": 2.0498259071364055e-05, "loss": 0.1245, "step": 7892 }, { "epoch": 5.640195709446744, "grad_norm": 0.15177997946739197, "learning_rate": 2.048870722220737e-05, "loss": 0.1817, "step": 7893 }, { "epoch": 5.640948438088069, "grad_norm": 0.1384911835193634, "learning_rate": 2.047915914715378e-05, "loss": 0.0832, "step": 7894 }, { "epoch": 5.641701166729394, "grad_norm": 0.13672876358032227, "learning_rate": 2.0469614847247436e-05, "loss": 0.128, "step": 7895 }, { "epoch": 5.642453895370719, "grad_norm": 0.15469105541706085, "learning_rate": 2.0460074323532098e-05, "loss": 0.0829, "step": 7896 }, { "epoch": 5.643206624012044, "grad_norm": 0.14251771569252014, "learning_rate": 2.0450537577051097e-05, "loss": 0.1504, "step": 7897 }, { "epoch": 5.6439593526533685, "grad_norm": 0.12761333584785461, "learning_rate": 2.0441004608847375e-05, "loss": 0.0587, "step": 7898 }, { "epoch": 5.644712081294693, "grad_norm": 0.141331747174263, "learning_rate": 2.0431475419963425e-05, "loss": 0.0594, "step": 7899 }, { "epoch": 5.645464809936018, "grad_norm": 0.14178292453289032, "learning_rate": 2.0421950011441354e-05, "loss": 0.132, "step": 7900 }, { "epoch": 5.646217538577343, "grad_norm": 0.13737720251083374, "learning_rate": 2.041242838432283e-05, "loss": 0.0969, "step": 7901 }, { "epoch": 5.646970267218668, "grad_norm": 0.13927139341831207, "learning_rate": 2.040291053964914e-05, "loss": 0.0974, "step": 7902 }, { "epoch": 5.647722995859993, "grad_norm": 0.1277513951063156, "learning_rate": 2.0393396478461142e-05, "loss": 0.0744, "step": 7903 }, { "epoch": 5.648475724501317, "grad_norm": 0.15400876104831696, "learning_rate": 2.038388620179928e-05, "loss": 0.1354, "step": 7904 }, { "epoch": 5.649228453142642, "grad_norm": 0.14758671820163727, "learning_rate": 2.037437971070357e-05, "loss": 0.0867, "step": 7905 }, { "epoch": 5.649981181783967, "grad_norm": 0.15665999054908752, "learning_rate": 2.036487700621365e-05, "loss": 0.1078, "step": 7906 }, { "epoch": 5.650733910425291, "grad_norm": 0.1517195999622345, "learning_rate": 2.0355378089368686e-05, "loss": 0.054, "step": 7907 }, { "epoch": 5.651486639066617, "grad_norm": 0.16216151416301727, "learning_rate": 2.034588296120751e-05, "loss": 0.1191, "step": 7908 }, { "epoch": 5.652239367707941, "grad_norm": 0.14656423032283783, "learning_rate": 2.0336391622768438e-05, "loss": 0.1871, "step": 7909 }, { "epoch": 5.652992096349266, "grad_norm": 0.15606558322906494, "learning_rate": 2.0326904075089492e-05, "loss": 0.0966, "step": 7910 }, { "epoch": 5.653744824990591, "grad_norm": 0.1449403017759323, "learning_rate": 2.0317420319208175e-05, "loss": 0.0826, "step": 7911 }, { "epoch": 5.6544975536319155, "grad_norm": 0.1566641926765442, "learning_rate": 2.0307940356161632e-05, "loss": 0.1041, "step": 7912 }, { "epoch": 5.65525028227324, "grad_norm": 0.1461658626794815, "learning_rate": 2.029846418698657e-05, "loss": 0.1419, "step": 7913 }, { "epoch": 5.656003010914565, "grad_norm": 0.17861032485961914, "learning_rate": 2.0288991812719287e-05, "loss": 0.152, "step": 7914 }, { "epoch": 5.65675573955589, "grad_norm": 0.1277836263179779, "learning_rate": 2.027952323439567e-05, "loss": 0.0871, "step": 7915 }, { "epoch": 5.657508468197215, "grad_norm": 0.17787621915340424, "learning_rate": 2.0270058453051194e-05, "loss": 0.1199, "step": 7916 }, { "epoch": 5.65826119683854, "grad_norm": 0.17399145662784576, "learning_rate": 2.0260597469720924e-05, "loss": 0.1256, "step": 7917 }, { "epoch": 5.659013925479864, "grad_norm": 0.1539890319108963, "learning_rate": 2.0251140285439467e-05, "loss": 0.1053, "step": 7918 }, { "epoch": 5.6597666541211895, "grad_norm": 0.1317221224308014, "learning_rate": 2.0241686901241082e-05, "loss": 0.0754, "step": 7919 }, { "epoch": 5.660519382762514, "grad_norm": 0.1289588361978531, "learning_rate": 2.023223731815954e-05, "loss": 0.0657, "step": 7920 }, { "epoch": 5.661272111403839, "grad_norm": 0.1318669617176056, "learning_rate": 2.0222791537228276e-05, "loss": 0.0626, "step": 7921 }, { "epoch": 5.662024840045164, "grad_norm": 0.16379202902317047, "learning_rate": 2.0213349559480206e-05, "loss": 0.1321, "step": 7922 }, { "epoch": 5.662777568686488, "grad_norm": 0.14889991283416748, "learning_rate": 2.020391138594796e-05, "loss": 0.0651, "step": 7923 }, { "epoch": 5.663530297327814, "grad_norm": 0.17187118530273438, "learning_rate": 2.0194477017663634e-05, "loss": 0.1413, "step": 7924 }, { "epoch": 5.664283025969138, "grad_norm": 0.14510297775268555, "learning_rate": 2.0185046455658985e-05, "loss": 0.1032, "step": 7925 }, { "epoch": 5.665035754610463, "grad_norm": 0.14806774258613586, "learning_rate": 2.01756197009653e-05, "loss": 0.0748, "step": 7926 }, { "epoch": 5.665788483251788, "grad_norm": 0.15356458723545074, "learning_rate": 2.0166196754613497e-05, "loss": 0.1343, "step": 7927 }, { "epoch": 5.666541211893112, "grad_norm": 0.16569212079048157, "learning_rate": 2.015677761763402e-05, "loss": 0.1693, "step": 7928 }, { "epoch": 5.667293940534437, "grad_norm": 0.15594381093978882, "learning_rate": 2.0147362291056983e-05, "loss": 0.0968, "step": 7929 }, { "epoch": 5.668046669175762, "grad_norm": 0.13457104563713074, "learning_rate": 2.0137950775911994e-05, "loss": 0.0866, "step": 7930 }, { "epoch": 5.668799397817087, "grad_norm": 0.14589756727218628, "learning_rate": 2.0128543073228305e-05, "loss": 0.1119, "step": 7931 }, { "epoch": 5.669552126458412, "grad_norm": 0.17068037390708923, "learning_rate": 2.0119139184034708e-05, "loss": 0.1659, "step": 7932 }, { "epoch": 5.670304855099737, "grad_norm": 0.12966041266918182, "learning_rate": 2.0109739109359616e-05, "loss": 0.0909, "step": 7933 }, { "epoch": 5.671057583741061, "grad_norm": 0.17176111042499542, "learning_rate": 2.0100342850230974e-05, "loss": 0.0981, "step": 7934 }, { "epoch": 5.671810312382386, "grad_norm": 0.1421305537223816, "learning_rate": 2.0090950407676396e-05, "loss": 0.0761, "step": 7935 }, { "epoch": 5.672563041023711, "grad_norm": 0.14029330015182495, "learning_rate": 2.0081561782722978e-05, "loss": 0.1373, "step": 7936 }, { "epoch": 5.673315769665035, "grad_norm": 0.147517129778862, "learning_rate": 2.0072176976397465e-05, "loss": 0.0757, "step": 7937 }, { "epoch": 5.674068498306361, "grad_norm": 0.16940538585186005, "learning_rate": 2.006279598972618e-05, "loss": 0.1589, "step": 7938 }, { "epoch": 5.674821226947685, "grad_norm": 0.15997497737407684, "learning_rate": 2.0053418823734987e-05, "loss": 0.1601, "step": 7939 }, { "epoch": 5.675573955589011, "grad_norm": 0.16442443430423737, "learning_rate": 2.0044045479449376e-05, "loss": 0.1251, "step": 7940 }, { "epoch": 5.676326684230335, "grad_norm": 0.1669875681400299, "learning_rate": 2.003467595789438e-05, "loss": 0.1091, "step": 7941 }, { "epoch": 5.6770794128716595, "grad_norm": 0.14617709815502167, "learning_rate": 2.002531026009467e-05, "loss": 0.0742, "step": 7942 }, { "epoch": 5.677832141512985, "grad_norm": 0.14485280215740204, "learning_rate": 2.0015948387074437e-05, "loss": 0.1068, "step": 7943 }, { "epoch": 5.678584870154309, "grad_norm": 0.1732160747051239, "learning_rate": 2.00065903398575e-05, "loss": 0.1222, "step": 7944 }, { "epoch": 5.679337598795634, "grad_norm": 0.14760839939117432, "learning_rate": 1.999723611946722e-05, "loss": 0.0831, "step": 7945 }, { "epoch": 5.680090327436959, "grad_norm": 0.1578211784362793, "learning_rate": 1.998788572692659e-05, "loss": 0.1356, "step": 7946 }, { "epoch": 5.680843056078284, "grad_norm": 0.14319542050361633, "learning_rate": 1.9978539163258105e-05, "loss": 0.0847, "step": 7947 }, { "epoch": 5.681595784719608, "grad_norm": 0.17248493432998657, "learning_rate": 1.996919642948395e-05, "loss": 0.1271, "step": 7948 }, { "epoch": 5.6823485133609335, "grad_norm": 0.17218931019306183, "learning_rate": 1.99598575266258e-05, "loss": 0.137, "step": 7949 }, { "epoch": 5.683101242002258, "grad_norm": 0.11841360479593277, "learning_rate": 1.9950522455704944e-05, "loss": 0.0949, "step": 7950 }, { "epoch": 5.683853970643583, "grad_norm": 0.15514816343784332, "learning_rate": 1.994119121774226e-05, "loss": 0.1159, "step": 7951 }, { "epoch": 5.684606699284908, "grad_norm": 0.15705431997776031, "learning_rate": 1.99318638137582e-05, "loss": 0.16, "step": 7952 }, { "epoch": 5.685359427926232, "grad_norm": 0.14481103420257568, "learning_rate": 1.992254024477277e-05, "loss": 0.0974, "step": 7953 }, { "epoch": 5.686112156567558, "grad_norm": 0.15628765523433685, "learning_rate": 1.9913220511805617e-05, "loss": 0.1435, "step": 7954 }, { "epoch": 5.686864885208882, "grad_norm": 0.13550545275211334, "learning_rate": 1.9903904615875906e-05, "loss": 0.1076, "step": 7955 }, { "epoch": 5.6876176138502075, "grad_norm": 0.1507897824048996, "learning_rate": 1.9894592558002438e-05, "loss": 0.1482, "step": 7956 }, { "epoch": 5.688370342491532, "grad_norm": 0.17347675561904907, "learning_rate": 1.988528433920353e-05, "loss": 0.1293, "step": 7957 }, { "epoch": 5.689123071132856, "grad_norm": 0.14439810812473297, "learning_rate": 1.9875979960497137e-05, "loss": 0.0616, "step": 7958 }, { "epoch": 5.689875799774182, "grad_norm": 0.14251136779785156, "learning_rate": 1.986667942290078e-05, "loss": 0.1064, "step": 7959 }, { "epoch": 5.690628528415506, "grad_norm": 0.14390982687473297, "learning_rate": 1.9857382727431524e-05, "loss": 0.1514, "step": 7960 }, { "epoch": 5.691381257056831, "grad_norm": 0.1312781572341919, "learning_rate": 1.9848089875106052e-05, "loss": 0.0403, "step": 7961 }, { "epoch": 5.692133985698156, "grad_norm": 0.13596384227275848, "learning_rate": 1.9838800866940628e-05, "loss": 0.1233, "step": 7962 }, { "epoch": 5.692886714339481, "grad_norm": 0.15595513582229614, "learning_rate": 1.9829515703951086e-05, "loss": 0.061, "step": 7963 }, { "epoch": 5.693639442980805, "grad_norm": 0.1738234907388687, "learning_rate": 1.982023438715282e-05, "loss": 0.1376, "step": 7964 }, { "epoch": 5.69439217162213, "grad_norm": 0.17286111414432526, "learning_rate": 1.9810956917560834e-05, "loss": 0.1324, "step": 7965 }, { "epoch": 5.695144900263455, "grad_norm": 0.13065479695796967, "learning_rate": 1.980168329618968e-05, "loss": 0.0695, "step": 7966 }, { "epoch": 5.695897628904779, "grad_norm": 0.1587604135274887, "learning_rate": 1.9792413524053538e-05, "loss": 0.0913, "step": 7967 }, { "epoch": 5.696650357546105, "grad_norm": 0.1474868804216385, "learning_rate": 1.978314760216611e-05, "loss": 0.1111, "step": 7968 }, { "epoch": 5.697403086187429, "grad_norm": 0.13051055371761322, "learning_rate": 1.9773885531540723e-05, "loss": 0.0913, "step": 7969 }, { "epoch": 5.6981558148287546, "grad_norm": 0.15761971473693848, "learning_rate": 1.9764627313190237e-05, "loss": 0.1538, "step": 7970 }, { "epoch": 5.698908543470079, "grad_norm": 0.13349325954914093, "learning_rate": 1.975537294812715e-05, "loss": 0.0796, "step": 7971 }, { "epoch": 5.6996612721114035, "grad_norm": 0.12991823256015778, "learning_rate": 1.9746122437363474e-05, "loss": 0.0651, "step": 7972 }, { "epoch": 5.700414000752729, "grad_norm": 0.1382962018251419, "learning_rate": 1.9736875781910848e-05, "loss": 0.1026, "step": 7973 }, { "epoch": 5.701166729394053, "grad_norm": 0.12488594651222229, "learning_rate": 1.9727632982780465e-05, "loss": 0.1087, "step": 7974 }, { "epoch": 5.701919458035379, "grad_norm": 0.15987907350063324, "learning_rate": 1.9718394040983118e-05, "loss": 0.1056, "step": 7975 }, { "epoch": 5.702672186676703, "grad_norm": 0.14649611711502075, "learning_rate": 1.9709158957529147e-05, "loss": 0.1334, "step": 7976 }, { "epoch": 5.703424915318028, "grad_norm": 0.1265386939048767, "learning_rate": 1.9699927733428504e-05, "loss": 0.1428, "step": 7977 }, { "epoch": 5.704177643959353, "grad_norm": 0.1446673572063446, "learning_rate": 1.9690700369690675e-05, "loss": 0.1034, "step": 7978 }, { "epoch": 5.7049303726006775, "grad_norm": 0.13123755156993866, "learning_rate": 1.9681476867324772e-05, "loss": 0.1322, "step": 7979 }, { "epoch": 5.705683101242002, "grad_norm": 0.15382979810237885, "learning_rate": 1.9672257227339452e-05, "loss": 0.0768, "step": 7980 }, { "epoch": 5.706435829883327, "grad_norm": 0.1719665378332138, "learning_rate": 1.9663041450742986e-05, "loss": 0.102, "step": 7981 }, { "epoch": 5.707188558524652, "grad_norm": 0.17602869868278503, "learning_rate": 1.9653829538543162e-05, "loss": 0.1243, "step": 7982 }, { "epoch": 5.707941287165976, "grad_norm": 0.14951536059379578, "learning_rate": 1.9644621491747402e-05, "loss": 0.0943, "step": 7983 }, { "epoch": 5.708694015807302, "grad_norm": 0.17795290052890778, "learning_rate": 1.963541731136269e-05, "loss": 0.1218, "step": 7984 }, { "epoch": 5.709446744448626, "grad_norm": 0.14466983079910278, "learning_rate": 1.962621699839556e-05, "loss": 0.1406, "step": 7985 }, { "epoch": 5.7101994730899515, "grad_norm": 0.15972723066806793, "learning_rate": 1.961702055385215e-05, "loss": 0.1015, "step": 7986 }, { "epoch": 5.710952201731276, "grad_norm": 0.17588333785533905, "learning_rate": 1.960782797873818e-05, "loss": 0.0957, "step": 7987 }, { "epoch": 5.7117049303726, "grad_norm": 0.14453096687793732, "learning_rate": 1.959863927405894e-05, "loss": 0.104, "step": 7988 }, { "epoch": 5.712457659013926, "grad_norm": 0.14824803173542023, "learning_rate": 1.958945444081927e-05, "loss": 0.1091, "step": 7989 }, { "epoch": 5.71321038765525, "grad_norm": 0.17523346841335297, "learning_rate": 1.958027348002364e-05, "loss": 0.0869, "step": 7990 }, { "epoch": 5.713963116296575, "grad_norm": 0.1520814597606659, "learning_rate": 1.9571096392676037e-05, "loss": 0.1412, "step": 7991 }, { "epoch": 5.7147158449379, "grad_norm": 0.14518865942955017, "learning_rate": 1.9561923179780066e-05, "loss": 0.0884, "step": 7992 }, { "epoch": 5.7154685735792246, "grad_norm": 0.14456649124622345, "learning_rate": 1.955275384233889e-05, "loss": 0.1113, "step": 7993 }, { "epoch": 5.71622130222055, "grad_norm": 0.14868883788585663, "learning_rate": 1.9543588381355277e-05, "loss": 0.0814, "step": 7994 }, { "epoch": 5.716974030861874, "grad_norm": 0.14864298701286316, "learning_rate": 1.953442679783152e-05, "loss": 0.0673, "step": 7995 }, { "epoch": 5.717726759503199, "grad_norm": 0.15109829604625702, "learning_rate": 1.952526909276953e-05, "loss": 0.128, "step": 7996 }, { "epoch": 5.718479488144524, "grad_norm": 0.14259928464889526, "learning_rate": 1.9516115267170775e-05, "loss": 0.086, "step": 7997 }, { "epoch": 5.719232216785849, "grad_norm": 0.17995230853557587, "learning_rate": 1.95069653220363e-05, "loss": 0.1088, "step": 7998 }, { "epoch": 5.719984945427173, "grad_norm": 0.15971626341342926, "learning_rate": 1.949781925836674e-05, "loss": 0.1244, "step": 7999 }, { "epoch": 5.7207376740684985, "grad_norm": 0.14115947484970093, "learning_rate": 1.9488677077162295e-05, "loss": 0.0465, "step": 8000 }, { "epoch": 5.7207376740684985, "eval_loss": 0.1536247283220291, "eval_runtime": 456.6932, "eval_samples_per_second": 21.08, "eval_steps_per_second": 0.659, "step": 8000 }, { "epoch": 5.721490402709823, "grad_norm": 0.14027763903141022, "learning_rate": 1.9479538779422723e-05, "loss": 0.1025, "step": 8001 }, { "epoch": 5.7222431313511475, "grad_norm": 0.1404503732919693, "learning_rate": 1.9470404366147403e-05, "loss": 0.0773, "step": 8002 }, { "epoch": 5.722995859992473, "grad_norm": 0.14812572300434113, "learning_rate": 1.9461273838335226e-05, "loss": 0.0635, "step": 8003 }, { "epoch": 5.723748588633797, "grad_norm": 0.16264088451862335, "learning_rate": 1.945214719698471e-05, "loss": 0.1273, "step": 8004 }, { "epoch": 5.724501317275123, "grad_norm": 0.13548356294631958, "learning_rate": 1.944302444309393e-05, "loss": 0.1339, "step": 8005 }, { "epoch": 5.725254045916447, "grad_norm": 0.14800913631916046, "learning_rate": 1.9433905577660557e-05, "loss": 0.08, "step": 8006 }, { "epoch": 5.726006774557772, "grad_norm": 0.1475498229265213, "learning_rate": 1.942479060168178e-05, "loss": 0.1404, "step": 8007 }, { "epoch": 5.726759503199097, "grad_norm": 0.16002437472343445, "learning_rate": 1.9415679516154418e-05, "loss": 0.1095, "step": 8008 }, { "epoch": 5.7275122318404215, "grad_norm": 0.16105623543262482, "learning_rate": 1.9406572322074854e-05, "loss": 0.0915, "step": 8009 }, { "epoch": 5.728264960481747, "grad_norm": 0.13245601952075958, "learning_rate": 1.9397469020439014e-05, "loss": 0.0747, "step": 8010 }, { "epoch": 5.729017689123071, "grad_norm": 0.1505684107542038, "learning_rate": 1.9388369612242442e-05, "loss": 0.0657, "step": 8011 }, { "epoch": 5.729770417764396, "grad_norm": 0.13156673312187195, "learning_rate": 1.9379274098480216e-05, "loss": 0.0956, "step": 8012 }, { "epoch": 5.730523146405721, "grad_norm": 0.17587141692638397, "learning_rate": 1.9370182480147037e-05, "loss": 0.1351, "step": 8013 }, { "epoch": 5.731275875047046, "grad_norm": 0.13266079127788544, "learning_rate": 1.9361094758237125e-05, "loss": 0.0845, "step": 8014 }, { "epoch": 5.73202860368837, "grad_norm": 0.1473855972290039, "learning_rate": 1.9352010933744318e-05, "loss": 0.0786, "step": 8015 }, { "epoch": 5.7327813323296954, "grad_norm": 0.1468549370765686, "learning_rate": 1.9342931007661984e-05, "loss": 0.0713, "step": 8016 }, { "epoch": 5.73353406097102, "grad_norm": 0.17436794936656952, "learning_rate": 1.933385498098311e-05, "loss": 0.0698, "step": 8017 }, { "epoch": 5.734286789612344, "grad_norm": 0.1117362454533577, "learning_rate": 1.932478285470023e-05, "loss": 0.0423, "step": 8018 }, { "epoch": 5.73503951825367, "grad_norm": 0.16389727592468262, "learning_rate": 1.931571462980547e-05, "loss": 0.1465, "step": 8019 }, { "epoch": 5.735792246894994, "grad_norm": 0.1509234607219696, "learning_rate": 1.9306650307290498e-05, "loss": 0.1266, "step": 8020 }, { "epoch": 5.736544975536319, "grad_norm": 0.15308190882205963, "learning_rate": 1.9297589888146595e-05, "loss": 0.091, "step": 8021 }, { "epoch": 5.737297704177644, "grad_norm": 0.14472143352031708, "learning_rate": 1.9288533373364578e-05, "loss": 0.1305, "step": 8022 }, { "epoch": 5.7380504328189685, "grad_norm": 0.13099436461925507, "learning_rate": 1.927948076393485e-05, "loss": 0.123, "step": 8023 }, { "epoch": 5.738803161460294, "grad_norm": 0.15387855470180511, "learning_rate": 1.927043206084741e-05, "loss": 0.1386, "step": 8024 }, { "epoch": 5.739555890101618, "grad_norm": 0.12635841965675354, "learning_rate": 1.9261387265091808e-05, "loss": 0.0866, "step": 8025 }, { "epoch": 5.740308618742943, "grad_norm": 0.1521710306406021, "learning_rate": 1.925234637765715e-05, "loss": 0.1012, "step": 8026 }, { "epoch": 5.741061347384268, "grad_norm": 0.1494724601507187, "learning_rate": 1.9243309399532157e-05, "loss": 0.1188, "step": 8027 }, { "epoch": 5.741814076025593, "grad_norm": 0.14811067283153534, "learning_rate": 1.9234276331705083e-05, "loss": 0.1137, "step": 8028 }, { "epoch": 5.742566804666918, "grad_norm": 0.1317632496356964, "learning_rate": 1.9225247175163773e-05, "loss": 0.0542, "step": 8029 }, { "epoch": 5.7433195333082425, "grad_norm": 0.15473110973834991, "learning_rate": 1.921622193089564e-05, "loss": 0.1361, "step": 8030 }, { "epoch": 5.744072261949567, "grad_norm": 0.1400936394929886, "learning_rate": 1.9207200599887698e-05, "loss": 0.0942, "step": 8031 }, { "epoch": 5.744824990590892, "grad_norm": 0.12504251301288605, "learning_rate": 1.9198183183126465e-05, "loss": 0.0593, "step": 8032 }, { "epoch": 5.745577719232217, "grad_norm": 0.11538319289684296, "learning_rate": 1.9189169681598096e-05, "loss": 0.0603, "step": 8033 }, { "epoch": 5.746330447873541, "grad_norm": 0.1436959058046341, "learning_rate": 1.9180160096288304e-05, "loss": 0.1123, "step": 8034 }, { "epoch": 5.747083176514867, "grad_norm": 0.13150475919246674, "learning_rate": 1.9171154428182332e-05, "loss": 0.0992, "step": 8035 }, { "epoch": 5.747835905156191, "grad_norm": 0.11911530792713165, "learning_rate": 1.916215267826505e-05, "loss": 0.0753, "step": 8036 }, { "epoch": 5.748588633797516, "grad_norm": 0.1617174595594406, "learning_rate": 1.9153154847520864e-05, "loss": 0.1367, "step": 8037 }, { "epoch": 5.749341362438841, "grad_norm": 0.16608478128910065, "learning_rate": 1.9144160936933785e-05, "loss": 0.1269, "step": 8038 }, { "epoch": 5.7500940910801654, "grad_norm": 0.1340460181236267, "learning_rate": 1.9135170947487342e-05, "loss": 0.0539, "step": 8039 }, { "epoch": 5.75084681972149, "grad_norm": 0.1347663700580597, "learning_rate": 1.9126184880164695e-05, "loss": 0.1188, "step": 8040 }, { "epoch": 5.751599548362815, "grad_norm": 0.134500652551651, "learning_rate": 1.911720273594852e-05, "loss": 0.0892, "step": 8041 }, { "epoch": 5.75235227700414, "grad_norm": 0.1665424406528473, "learning_rate": 1.910822451582112e-05, "loss": 0.135, "step": 8042 }, { "epoch": 5.753105005645465, "grad_norm": 0.13803622126579285, "learning_rate": 1.9099250220764303e-05, "loss": 0.1183, "step": 8043 }, { "epoch": 5.75385773428679, "grad_norm": 0.16777575016021729, "learning_rate": 1.9090279851759525e-05, "loss": 0.0946, "step": 8044 }, { "epoch": 5.754610462928114, "grad_norm": 0.166524276137352, "learning_rate": 1.9081313409787742e-05, "loss": 0.1385, "step": 8045 }, { "epoch": 5.755363191569439, "grad_norm": 0.11999715864658356, "learning_rate": 1.9072350895829533e-05, "loss": 0.089, "step": 8046 }, { "epoch": 5.756115920210764, "grad_norm": 0.1484576165676117, "learning_rate": 1.9063392310865008e-05, "loss": 0.0902, "step": 8047 }, { "epoch": 5.756868648852089, "grad_norm": 0.14898031949996948, "learning_rate": 1.9054437655873874e-05, "loss": 0.1037, "step": 8048 }, { "epoch": 5.757621377493414, "grad_norm": 0.1357576996088028, "learning_rate": 1.9045486931835373e-05, "loss": 0.0825, "step": 8049 }, { "epoch": 5.758374106134738, "grad_norm": 0.14948482811450958, "learning_rate": 1.903654013972839e-05, "loss": 0.0956, "step": 8050 }, { "epoch": 5.759126834776064, "grad_norm": 0.1527787446975708, "learning_rate": 1.9027597280531294e-05, "loss": 0.0867, "step": 8051 }, { "epoch": 5.759879563417388, "grad_norm": 0.15181180834770203, "learning_rate": 1.9018658355222084e-05, "loss": 0.1135, "step": 8052 }, { "epoch": 5.7606322920587125, "grad_norm": 0.14927184581756592, "learning_rate": 1.9009723364778294e-05, "loss": 0.1152, "step": 8053 }, { "epoch": 5.761385020700038, "grad_norm": 0.1756245642900467, "learning_rate": 1.9000792310177046e-05, "loss": 0.0991, "step": 8054 }, { "epoch": 5.762137749341362, "grad_norm": 0.1359812617301941, "learning_rate": 1.8991865192395024e-05, "loss": 0.0962, "step": 8055 }, { "epoch": 5.762890477982687, "grad_norm": 0.1317620873451233, "learning_rate": 1.89829420124085e-05, "loss": 0.1021, "step": 8056 }, { "epoch": 5.763643206624012, "grad_norm": 0.12400168925523758, "learning_rate": 1.897402277119327e-05, "loss": 0.0668, "step": 8057 }, { "epoch": 5.764395935265337, "grad_norm": 0.14241451025009155, "learning_rate": 1.8965107469724748e-05, "loss": 0.1128, "step": 8058 }, { "epoch": 5.765148663906662, "grad_norm": 0.13901515305042267, "learning_rate": 1.8956196108977913e-05, "loss": 0.1297, "step": 8059 }, { "epoch": 5.7659013925479865, "grad_norm": 0.17318229377269745, "learning_rate": 1.8947288689927263e-05, "loss": 0.131, "step": 8060 }, { "epoch": 5.766654121189311, "grad_norm": 0.15105895698070526, "learning_rate": 1.8938385213546927e-05, "loss": 0.0761, "step": 8061 }, { "epoch": 5.767406849830636, "grad_norm": 0.15221452713012695, "learning_rate": 1.892948568081055e-05, "loss": 0.1316, "step": 8062 }, { "epoch": 5.768159578471961, "grad_norm": 0.12266235798597336, "learning_rate": 1.8920590092691404e-05, "loss": 0.0701, "step": 8063 }, { "epoch": 5.768912307113285, "grad_norm": 0.1430060714483261, "learning_rate": 1.891169845016228e-05, "loss": 0.1013, "step": 8064 }, { "epoch": 5.769665035754611, "grad_norm": 0.16917534172534943, "learning_rate": 1.8902810754195553e-05, "loss": 0.1292, "step": 8065 }, { "epoch": 5.770417764395935, "grad_norm": 0.15226785838603973, "learning_rate": 1.8893927005763167e-05, "loss": 0.1615, "step": 8066 }, { "epoch": 5.7711704930372605, "grad_norm": 0.1655978560447693, "learning_rate": 1.888504720583665e-05, "loss": 0.126, "step": 8067 }, { "epoch": 5.771923221678585, "grad_norm": 0.1476081907749176, "learning_rate": 1.887617135538705e-05, "loss": 0.1562, "step": 8068 }, { "epoch": 5.772675950319909, "grad_norm": 0.15430226922035217, "learning_rate": 1.8867299455385064e-05, "loss": 0.1013, "step": 8069 }, { "epoch": 5.773428678961235, "grad_norm": 0.15100251138210297, "learning_rate": 1.885843150680087e-05, "loss": 0.1345, "step": 8070 }, { "epoch": 5.774181407602559, "grad_norm": 0.13996368646621704, "learning_rate": 1.884956751060428e-05, "loss": 0.0983, "step": 8071 }, { "epoch": 5.774934136243884, "grad_norm": 0.13651540875434875, "learning_rate": 1.8840707467764624e-05, "loss": 0.0779, "step": 8072 }, { "epoch": 5.775686864885209, "grad_norm": 0.13698793947696686, "learning_rate": 1.8831851379250848e-05, "loss": 0.056, "step": 8073 }, { "epoch": 5.776439593526534, "grad_norm": 0.12188001722097397, "learning_rate": 1.8822999246031402e-05, "loss": 0.049, "step": 8074 }, { "epoch": 5.777192322167858, "grad_norm": 0.14357203245162964, "learning_rate": 1.881415106907439e-05, "loss": 0.0585, "step": 8075 }, { "epoch": 5.777945050809183, "grad_norm": 0.13644762337207794, "learning_rate": 1.8805306849347405e-05, "loss": 0.0655, "step": 8076 }, { "epoch": 5.778697779450508, "grad_norm": 0.16188158094882965, "learning_rate": 1.8796466587817652e-05, "loss": 0.1266, "step": 8077 }, { "epoch": 5.779450508091833, "grad_norm": 0.1310592144727707, "learning_rate": 1.8787630285451867e-05, "loss": 0.0958, "step": 8078 }, { "epoch": 5.780203236733158, "grad_norm": 0.15619805455207825, "learning_rate": 1.8778797943216397e-05, "loss": 0.1507, "step": 8079 }, { "epoch": 5.780955965374482, "grad_norm": 0.1492375284433365, "learning_rate": 1.8769969562077132e-05, "loss": 0.0939, "step": 8080 }, { "epoch": 5.781708694015808, "grad_norm": 0.12128317356109619, "learning_rate": 1.8761145142999516e-05, "loss": 0.0537, "step": 8081 }, { "epoch": 5.782461422657132, "grad_norm": 0.15336835384368896, "learning_rate": 1.8752324686948585e-05, "loss": 0.1053, "step": 8082 }, { "epoch": 5.783214151298457, "grad_norm": 0.16788633167743683, "learning_rate": 1.8743508194888925e-05, "loss": 0.1465, "step": 8083 }, { "epoch": 5.783966879939782, "grad_norm": 0.15158165991306305, "learning_rate": 1.8734695667784714e-05, "loss": 0.1029, "step": 8084 }, { "epoch": 5.784719608581106, "grad_norm": 0.1450381577014923, "learning_rate": 1.8725887106599643e-05, "loss": 0.0606, "step": 8085 }, { "epoch": 5.785472337222432, "grad_norm": 0.15140853822231293, "learning_rate": 1.871708251229704e-05, "loss": 0.1084, "step": 8086 }, { "epoch": 5.786225065863756, "grad_norm": 0.1357652246952057, "learning_rate": 1.8708281885839714e-05, "loss": 0.0866, "step": 8087 }, { "epoch": 5.786977794505081, "grad_norm": 0.1273203045129776, "learning_rate": 1.869948522819014e-05, "loss": 0.1267, "step": 8088 }, { "epoch": 5.787730523146406, "grad_norm": 0.14703957736492157, "learning_rate": 1.8690692540310274e-05, "loss": 0.0772, "step": 8089 }, { "epoch": 5.7884832517877305, "grad_norm": 0.14016947150230408, "learning_rate": 1.8681903823161695e-05, "loss": 0.135, "step": 8090 }, { "epoch": 5.789235980429055, "grad_norm": 0.12208443880081177, "learning_rate": 1.86731190777055e-05, "loss": 0.1301, "step": 8091 }, { "epoch": 5.78998870907038, "grad_norm": 0.14178039133548737, "learning_rate": 1.8664338304902395e-05, "loss": 0.1326, "step": 8092 }, { "epoch": 5.790741437711705, "grad_norm": 0.15585796535015106, "learning_rate": 1.8655561505712593e-05, "loss": 0.0777, "step": 8093 }, { "epoch": 5.791494166353029, "grad_norm": 0.11745943129062653, "learning_rate": 1.8646788681095974e-05, "loss": 0.0735, "step": 8094 }, { "epoch": 5.792246894994355, "grad_norm": 0.11537425220012665, "learning_rate": 1.8638019832011874e-05, "loss": 0.0245, "step": 8095 }, { "epoch": 5.792999623635679, "grad_norm": 0.1635482758283615, "learning_rate": 1.8629254959419266e-05, "loss": 0.1059, "step": 8096 }, { "epoch": 5.7937523522770045, "grad_norm": 0.158845454454422, "learning_rate": 1.862049406427665e-05, "loss": 0.1761, "step": 8097 }, { "epoch": 5.794505080918329, "grad_norm": 0.13355940580368042, "learning_rate": 1.8611737147542112e-05, "loss": 0.0854, "step": 8098 }, { "epoch": 5.795257809559653, "grad_norm": 0.12717632949352264, "learning_rate": 1.8602984210173273e-05, "loss": 0.0906, "step": 8099 }, { "epoch": 5.796010538200979, "grad_norm": 0.14287850260734558, "learning_rate": 1.8594235253127375e-05, "loss": 0.1064, "step": 8100 }, { "epoch": 5.796763266842303, "grad_norm": 0.13667897880077362, "learning_rate": 1.8585490277361155e-05, "loss": 0.093, "step": 8101 }, { "epoch": 5.797515995483629, "grad_norm": 0.16386428475379944, "learning_rate": 1.8576749283830995e-05, "loss": 0.1461, "step": 8102 }, { "epoch": 5.798268724124953, "grad_norm": 0.13855798542499542, "learning_rate": 1.8568012273492758e-05, "loss": 0.135, "step": 8103 }, { "epoch": 5.799021452766278, "grad_norm": 0.12794652581214905, "learning_rate": 1.855927924730192e-05, "loss": 0.1008, "step": 8104 }, { "epoch": 5.799774181407603, "grad_norm": 0.17096419632434845, "learning_rate": 1.8550550206213534e-05, "loss": 0.0951, "step": 8105 }, { "epoch": 5.800526910048927, "grad_norm": 0.123019739985466, "learning_rate": 1.8541825151182158e-05, "loss": 0.0398, "step": 8106 }, { "epoch": 5.801279638690252, "grad_norm": 0.16247591376304626, "learning_rate": 1.8533104083161974e-05, "loss": 0.0875, "step": 8107 }, { "epoch": 5.802032367331577, "grad_norm": 0.15749892592430115, "learning_rate": 1.8524387003106696e-05, "loss": 0.1378, "step": 8108 }, { "epoch": 5.802785095972902, "grad_norm": 0.14606693387031555, "learning_rate": 1.8515673911969626e-05, "loss": 0.1435, "step": 8109 }, { "epoch": 5.803537824614226, "grad_norm": 0.1458355188369751, "learning_rate": 1.85069648107036e-05, "loss": 0.1036, "step": 8110 }, { "epoch": 5.8042905532555515, "grad_norm": 0.16779005527496338, "learning_rate": 1.849825970026104e-05, "loss": 0.1306, "step": 8111 }, { "epoch": 5.805043281896876, "grad_norm": 0.141150563955307, "learning_rate": 1.8489558581593914e-05, "loss": 0.1113, "step": 8112 }, { "epoch": 5.8057960105382005, "grad_norm": 0.12675845623016357, "learning_rate": 1.8480861455653762e-05, "loss": 0.0343, "step": 8113 }, { "epoch": 5.806548739179526, "grad_norm": 0.16048961877822876, "learning_rate": 1.8472168323391702e-05, "loss": 0.1089, "step": 8114 }, { "epoch": 5.80730146782085, "grad_norm": 0.1342146396636963, "learning_rate": 1.8463479185758404e-05, "loss": 0.0568, "step": 8115 }, { "epoch": 5.808054196462176, "grad_norm": 0.1540834754705429, "learning_rate": 1.8454794043704076e-05, "loss": 0.1199, "step": 8116 }, { "epoch": 5.8088069251035, "grad_norm": 0.1447027176618576, "learning_rate": 1.844611289817854e-05, "loss": 0.1158, "step": 8117 }, { "epoch": 5.809559653744825, "grad_norm": 0.14778032898902893, "learning_rate": 1.843743575013113e-05, "loss": 0.0908, "step": 8118 }, { "epoch": 5.81031238238615, "grad_norm": 0.1532103717327118, "learning_rate": 1.8428762600510772e-05, "loss": 0.0871, "step": 8119 }, { "epoch": 5.8110651110274745, "grad_norm": 0.13496503233909607, "learning_rate": 1.8420093450265958e-05, "loss": 0.107, "step": 8120 }, { "epoch": 5.8118178396688, "grad_norm": 0.17172634601593018, "learning_rate": 1.8411428300344735e-05, "loss": 0.1751, "step": 8121 }, { "epoch": 5.812570568310124, "grad_norm": 0.16214996576309204, "learning_rate": 1.840276715169469e-05, "loss": 0.103, "step": 8122 }, { "epoch": 5.813323296951449, "grad_norm": 0.14067350327968597, "learning_rate": 1.839411000526302e-05, "loss": 0.0927, "step": 8123 }, { "epoch": 5.814076025592774, "grad_norm": 0.15269415080547333, "learning_rate": 1.838545686199643e-05, "loss": 0.1179, "step": 8124 }, { "epoch": 5.814828754234099, "grad_norm": 0.1505337506532669, "learning_rate": 1.837680772284123e-05, "loss": 0.1988, "step": 8125 }, { "epoch": 5.815581482875423, "grad_norm": 0.1613852083683014, "learning_rate": 1.8368162588743275e-05, "loss": 0.0786, "step": 8126 }, { "epoch": 5.8163342115167485, "grad_norm": 0.15689367055892944, "learning_rate": 1.8359521460647993e-05, "loss": 0.0981, "step": 8127 }, { "epoch": 5.817086940158073, "grad_norm": 0.16263778507709503, "learning_rate": 1.835088433950035e-05, "loss": 0.147, "step": 8128 }, { "epoch": 5.817839668799397, "grad_norm": 0.14705036580562592, "learning_rate": 1.8342251226244894e-05, "loss": 0.1468, "step": 8129 }, { "epoch": 5.818592397440723, "grad_norm": 0.13136141002178192, "learning_rate": 1.8333622121825737e-05, "loss": 0.088, "step": 8130 }, { "epoch": 5.819345126082047, "grad_norm": 0.16258037090301514, "learning_rate": 1.832499702718653e-05, "loss": 0.1594, "step": 8131 }, { "epoch": 5.820097854723373, "grad_norm": 0.15644578635692596, "learning_rate": 1.8316375943270504e-05, "loss": 0.1544, "step": 8132 }, { "epoch": 5.820850583364697, "grad_norm": 0.13619263470172882, "learning_rate": 1.8307758871020452e-05, "loss": 0.1124, "step": 8133 }, { "epoch": 5.8216033120060215, "grad_norm": 0.1423894613981247, "learning_rate": 1.829914581137874e-05, "loss": 0.1057, "step": 8134 }, { "epoch": 5.822356040647347, "grad_norm": 0.12870603799819946, "learning_rate": 1.8290536765287247e-05, "loss": 0.0818, "step": 8135 }, { "epoch": 5.823108769288671, "grad_norm": 0.1364857256412506, "learning_rate": 1.8281931733687474e-05, "loss": 0.0901, "step": 8136 }, { "epoch": 5.823861497929996, "grad_norm": 0.14336664974689484, "learning_rate": 1.8273330717520433e-05, "loss": 0.0802, "step": 8137 }, { "epoch": 5.824614226571321, "grad_norm": 0.13314343988895416, "learning_rate": 1.8264733717726722e-05, "loss": 0.1273, "step": 8138 }, { "epoch": 5.825366955212646, "grad_norm": 0.1347598135471344, "learning_rate": 1.82561407352465e-05, "loss": 0.0758, "step": 8139 }, { "epoch": 5.826119683853971, "grad_norm": 0.17933151125907898, "learning_rate": 1.8247551771019506e-05, "loss": 0.1391, "step": 8140 }, { "epoch": 5.8268724124952955, "grad_norm": 0.16160006821155548, "learning_rate": 1.8238966825984978e-05, "loss": 0.0707, "step": 8141 }, { "epoch": 5.82762514113662, "grad_norm": 0.12902988493442535, "learning_rate": 1.8230385901081777e-05, "loss": 0.0883, "step": 8142 }, { "epoch": 5.828377869777945, "grad_norm": 0.11169037222862244, "learning_rate": 1.8221808997248278e-05, "loss": 0.0772, "step": 8143 }, { "epoch": 5.82913059841927, "grad_norm": 0.13766905665397644, "learning_rate": 1.8213236115422454e-05, "loss": 0.0683, "step": 8144 }, { "epoch": 5.829883327060594, "grad_norm": 0.13324542343616486, "learning_rate": 1.820466725654182e-05, "loss": 0.0654, "step": 8145 }, { "epoch": 5.83063605570192, "grad_norm": 0.14619180560112, "learning_rate": 1.8196102421543472e-05, "loss": 0.0824, "step": 8146 }, { "epoch": 5.831388784343244, "grad_norm": 0.16952838003635406, "learning_rate": 1.8187541611364008e-05, "loss": 0.1579, "step": 8147 }, { "epoch": 5.832141512984569, "grad_norm": 0.14711755514144897, "learning_rate": 1.8178984826939665e-05, "loss": 0.0887, "step": 8148 }, { "epoch": 5.832894241625894, "grad_norm": 0.14484228193759918, "learning_rate": 1.817043206920617e-05, "loss": 0.1122, "step": 8149 }, { "epoch": 5.8336469702672185, "grad_norm": 0.14400839805603027, "learning_rate": 1.8161883339098846e-05, "loss": 0.0714, "step": 8150 }, { "epoch": 5.834399698908544, "grad_norm": 0.11858576536178589, "learning_rate": 1.8153338637552574e-05, "loss": 0.0715, "step": 8151 }, { "epoch": 5.835152427549868, "grad_norm": 0.10864059627056122, "learning_rate": 1.814479796550181e-05, "loss": 0.0836, "step": 8152 }, { "epoch": 5.835905156191193, "grad_norm": 0.1230667382478714, "learning_rate": 1.813626132388051e-05, "loss": 0.0964, "step": 8153 }, { "epoch": 5.836657884832518, "grad_norm": 0.15011250972747803, "learning_rate": 1.812772871362225e-05, "loss": 0.092, "step": 8154 }, { "epoch": 5.837410613473843, "grad_norm": 0.15447014570236206, "learning_rate": 1.8119200135660157e-05, "loss": 0.0769, "step": 8155 }, { "epoch": 5.838163342115168, "grad_norm": 0.1746802031993866, "learning_rate": 1.811067559092687e-05, "loss": 0.1424, "step": 8156 }, { "epoch": 5.838916070756492, "grad_norm": 0.15553732216358185, "learning_rate": 1.8102155080354642e-05, "loss": 0.085, "step": 8157 }, { "epoch": 5.839668799397817, "grad_norm": 0.15472765266895294, "learning_rate": 1.8093638604875263e-05, "loss": 0.1022, "step": 8158 }, { "epoch": 5.840421528039142, "grad_norm": 0.13880957663059235, "learning_rate": 1.8085126165420085e-05, "loss": 0.1276, "step": 8159 }, { "epoch": 5.841174256680467, "grad_norm": 0.16450554132461548, "learning_rate": 1.807661776292e-05, "loss": 0.0929, "step": 8160 }, { "epoch": 5.841926985321791, "grad_norm": 0.14944514632225037, "learning_rate": 1.8068113398305493e-05, "loss": 0.1612, "step": 8161 }, { "epoch": 5.842679713963117, "grad_norm": 0.15927903354167938, "learning_rate": 1.8059613072506564e-05, "loss": 0.1769, "step": 8162 }, { "epoch": 5.843432442604441, "grad_norm": 0.1383674293756485, "learning_rate": 1.8051116786452827e-05, "loss": 0.1068, "step": 8163 }, { "epoch": 5.8441851712457655, "grad_norm": 0.13779142498970032, "learning_rate": 1.8042624541073383e-05, "loss": 0.0843, "step": 8164 }, { "epoch": 5.844937899887091, "grad_norm": 0.1299549639225006, "learning_rate": 1.8034136337296976e-05, "loss": 0.1187, "step": 8165 }, { "epoch": 5.845690628528415, "grad_norm": 0.15239156782627106, "learning_rate": 1.802565217605183e-05, "loss": 0.0869, "step": 8166 }, { "epoch": 5.84644335716974, "grad_norm": 0.13128562271595, "learning_rate": 1.801717205826578e-05, "loss": 0.0945, "step": 8167 }, { "epoch": 5.847196085811065, "grad_norm": 0.13412301242351532, "learning_rate": 1.800869598486618e-05, "loss": 0.1113, "step": 8168 }, { "epoch": 5.84794881445239, "grad_norm": 0.16750414669513702, "learning_rate": 1.800022395677998e-05, "loss": 0.2024, "step": 8169 }, { "epoch": 5.848701543093715, "grad_norm": 0.14374011754989624, "learning_rate": 1.7991755974933648e-05, "loss": 0.0769, "step": 8170 }, { "epoch": 5.8494542717350395, "grad_norm": 0.15470872819423676, "learning_rate": 1.7983292040253258e-05, "loss": 0.1205, "step": 8171 }, { "epoch": 5.850207000376364, "grad_norm": 0.1472492814064026, "learning_rate": 1.797483215366439e-05, "loss": 0.165, "step": 8172 }, { "epoch": 5.850959729017689, "grad_norm": 0.14362215995788574, "learning_rate": 1.7966376316092217e-05, "loss": 0.1105, "step": 8173 }, { "epoch": 5.851712457659014, "grad_norm": 0.1573462337255478, "learning_rate": 1.795792452846144e-05, "loss": 0.1098, "step": 8174 }, { "epoch": 5.852465186300339, "grad_norm": 0.15891695022583008, "learning_rate": 1.794947679169634e-05, "loss": 0.1034, "step": 8175 }, { "epoch": 5.853217914941664, "grad_norm": 0.13696277141571045, "learning_rate": 1.7941033106720768e-05, "loss": 0.1142, "step": 8176 }, { "epoch": 5.853970643582988, "grad_norm": 0.13033074140548706, "learning_rate": 1.7932593474458098e-05, "loss": 0.0716, "step": 8177 }, { "epoch": 5.8547233722243135, "grad_norm": 0.1442013680934906, "learning_rate": 1.7924157895831263e-05, "loss": 0.0947, "step": 8178 }, { "epoch": 5.855476100865638, "grad_norm": 0.13596712052822113, "learning_rate": 1.7915726371762784e-05, "loss": 0.0412, "step": 8179 }, { "epoch": 5.856228829506962, "grad_norm": 0.15335501730442047, "learning_rate": 1.7907298903174723e-05, "loss": 0.1208, "step": 8180 }, { "epoch": 5.856981558148288, "grad_norm": 0.15094460546970367, "learning_rate": 1.7898875490988676e-05, "loss": 0.0715, "step": 8181 }, { "epoch": 5.857734286789612, "grad_norm": 0.14790797233581543, "learning_rate": 1.7890456136125832e-05, "loss": 0.1658, "step": 8182 }, { "epoch": 5.858487015430937, "grad_norm": 0.13244003057479858, "learning_rate": 1.7882040839506892e-05, "loss": 0.0533, "step": 8183 }, { "epoch": 5.859239744072262, "grad_norm": 0.13732796907424927, "learning_rate": 1.7873629602052178e-05, "loss": 0.1022, "step": 8184 }, { "epoch": 5.859992472713587, "grad_norm": 0.14413154125213623, "learning_rate": 1.7865222424681506e-05, "loss": 0.102, "step": 8185 }, { "epoch": 5.860745201354911, "grad_norm": 0.1362769901752472, "learning_rate": 1.7856819308314287e-05, "loss": 0.1043, "step": 8186 }, { "epoch": 5.861497929996236, "grad_norm": 0.14957447350025177, "learning_rate": 1.7848420253869457e-05, "loss": 0.0938, "step": 8187 }, { "epoch": 5.862250658637561, "grad_norm": 0.1516997069120407, "learning_rate": 1.7840025262265538e-05, "loss": 0.129, "step": 8188 }, { "epoch": 5.863003387278886, "grad_norm": 0.1501161754131317, "learning_rate": 1.7831634334420564e-05, "loss": 0.1199, "step": 8189 }, { "epoch": 5.863756115920211, "grad_norm": 0.12848034501075745, "learning_rate": 1.7823247471252202e-05, "loss": 0.0827, "step": 8190 }, { "epoch": 5.864508844561535, "grad_norm": 0.14874430000782013, "learning_rate": 1.781486467367759e-05, "loss": 0.0892, "step": 8191 }, { "epoch": 5.865261573202861, "grad_norm": 0.1593005359172821, "learning_rate": 1.780648594261349e-05, "loss": 0.1157, "step": 8192 }, { "epoch": 5.866014301844185, "grad_norm": 0.1388554722070694, "learning_rate": 1.7798111278976147e-05, "loss": 0.1087, "step": 8193 }, { "epoch": 5.86676703048551, "grad_norm": 0.16016796231269836, "learning_rate": 1.7789740683681433e-05, "loss": 0.091, "step": 8194 }, { "epoch": 5.867519759126835, "grad_norm": 0.14345493912696838, "learning_rate": 1.7781374157644715e-05, "loss": 0.1919, "step": 8195 }, { "epoch": 5.868272487768159, "grad_norm": 0.13150890171527863, "learning_rate": 1.7773011701780984e-05, "loss": 0.0832, "step": 8196 }, { "epoch": 5.869025216409485, "grad_norm": 0.23883482813835144, "learning_rate": 1.776465331700471e-05, "loss": 0.1151, "step": 8197 }, { "epoch": 5.869777945050809, "grad_norm": 0.1539314240217209, "learning_rate": 1.775629900422998e-05, "loss": 0.0887, "step": 8198 }, { "epoch": 5.870530673692134, "grad_norm": 0.15152743458747864, "learning_rate": 1.774794876437038e-05, "loss": 0.1041, "step": 8199 }, { "epoch": 5.871283402333459, "grad_norm": 0.14528115093708038, "learning_rate": 1.77396025983391e-05, "loss": 0.0982, "step": 8200 }, { "epoch": 5.871283402333459, "eval_loss": 0.14935405552387238, "eval_runtime": 456.012, "eval_samples_per_second": 21.111, "eval_steps_per_second": 0.66, "step": 8200 }, { "epoch": 5.8720361309747835, "grad_norm": 0.13897185027599335, "learning_rate": 1.773126050704887e-05, "loss": 0.0587, "step": 8201 }, { "epoch": 5.872788859616108, "grad_norm": 0.13152873516082764, "learning_rate": 1.7722922491411944e-05, "loss": 0.1121, "step": 8202 }, { "epoch": 5.873541588257433, "grad_norm": 0.15156397223472595, "learning_rate": 1.771458855234017e-05, "loss": 0.1209, "step": 8203 }, { "epoch": 5.874294316898758, "grad_norm": 0.18470406532287598, "learning_rate": 1.7706258690744942e-05, "loss": 0.0968, "step": 8204 }, { "epoch": 5.875047045540083, "grad_norm": 0.1468113362789154, "learning_rate": 1.7697932907537195e-05, "loss": 0.1069, "step": 8205 }, { "epoch": 5.875799774181408, "grad_norm": 0.13064731657505035, "learning_rate": 1.7689611203627416e-05, "loss": 0.0531, "step": 8206 }, { "epoch": 5.876552502822732, "grad_norm": 0.16127394139766693, "learning_rate": 1.768129357992567e-05, "loss": 0.1256, "step": 8207 }, { "epoch": 5.8773052314640575, "grad_norm": 0.1469523012638092, "learning_rate": 1.767298003734153e-05, "loss": 0.1839, "step": 8208 }, { "epoch": 5.878057960105382, "grad_norm": 0.15478621423244476, "learning_rate": 1.7664670576784194e-05, "loss": 0.1431, "step": 8209 }, { "epoch": 5.878810688746706, "grad_norm": 0.14343099296092987, "learning_rate": 1.7656365199162332e-05, "loss": 0.0725, "step": 8210 }, { "epoch": 5.879563417388032, "grad_norm": 0.15430624783039093, "learning_rate": 1.7648063905384247e-05, "loss": 0.133, "step": 8211 }, { "epoch": 5.880316146029356, "grad_norm": 0.19114334881305695, "learning_rate": 1.7639766696357723e-05, "loss": 0.1023, "step": 8212 }, { "epoch": 5.881068874670682, "grad_norm": 0.14078204333782196, "learning_rate": 1.7631473572990153e-05, "loss": 0.1173, "step": 8213 }, { "epoch": 5.881821603312006, "grad_norm": 0.12617644667625427, "learning_rate": 1.7623184536188424e-05, "loss": 0.0647, "step": 8214 }, { "epoch": 5.882574331953331, "grad_norm": 0.15218955278396606, "learning_rate": 1.761489958685907e-05, "loss": 0.0987, "step": 8215 }, { "epoch": 5.883327060594656, "grad_norm": 0.13722452521324158, "learning_rate": 1.7606618725908074e-05, "loss": 0.0784, "step": 8216 }, { "epoch": 5.88407978923598, "grad_norm": 0.10977627336978912, "learning_rate": 1.759834195424104e-05, "loss": 0.0463, "step": 8217 }, { "epoch": 5.884832517877305, "grad_norm": 0.1652236133813858, "learning_rate": 1.759006927276309e-05, "loss": 0.1043, "step": 8218 }, { "epoch": 5.88558524651863, "grad_norm": 0.15311676263809204, "learning_rate": 1.7581800682378936e-05, "loss": 0.1661, "step": 8219 }, { "epoch": 5.886337975159955, "grad_norm": 0.12697221338748932, "learning_rate": 1.7573536183992784e-05, "loss": 0.0406, "step": 8220 }, { "epoch": 5.887090703801279, "grad_norm": 0.14380812644958496, "learning_rate": 1.7565275778508444e-05, "loss": 0.0791, "step": 8221 }, { "epoch": 5.8878434324426046, "grad_norm": 0.14369139075279236, "learning_rate": 1.7557019466829266e-05, "loss": 0.1261, "step": 8222 }, { "epoch": 5.888596161083929, "grad_norm": 0.13528132438659668, "learning_rate": 1.7548767249858155e-05, "loss": 0.1425, "step": 8223 }, { "epoch": 5.889348889725254, "grad_norm": 0.14667855203151703, "learning_rate": 1.754051912849754e-05, "loss": 0.1512, "step": 8224 }, { "epoch": 5.890101618366579, "grad_norm": 0.15883907675743103, "learning_rate": 1.7532275103649436e-05, "loss": 0.1714, "step": 8225 }, { "epoch": 5.890854347007903, "grad_norm": 0.12446555495262146, "learning_rate": 1.7524035176215402e-05, "loss": 0.065, "step": 8226 }, { "epoch": 5.891607075649229, "grad_norm": 0.1172725260257721, "learning_rate": 1.7515799347096527e-05, "loss": 0.0678, "step": 8227 }, { "epoch": 5.892359804290553, "grad_norm": 0.13590693473815918, "learning_rate": 1.7507567617193476e-05, "loss": 0.105, "step": 8228 }, { "epoch": 5.8931125329318785, "grad_norm": 0.11437670141458511, "learning_rate": 1.7499339987406466e-05, "loss": 0.0438, "step": 8229 }, { "epoch": 5.893865261573203, "grad_norm": 0.13753147423267365, "learning_rate": 1.7491116458635266e-05, "loss": 0.0969, "step": 8230 }, { "epoch": 5.8946179902145275, "grad_norm": 0.11383163183927536, "learning_rate": 1.7482897031779162e-05, "loss": 0.0865, "step": 8231 }, { "epoch": 5.895370718855853, "grad_norm": 0.12678885459899902, "learning_rate": 1.7474681707737046e-05, "loss": 0.1033, "step": 8232 }, { "epoch": 5.896123447497177, "grad_norm": 0.15237100422382355, "learning_rate": 1.74664704874073e-05, "loss": 0.0749, "step": 8233 }, { "epoch": 5.896876176138502, "grad_norm": 0.12280140817165375, "learning_rate": 1.7458263371687917e-05, "loss": 0.0449, "step": 8234 }, { "epoch": 5.897628904779827, "grad_norm": 0.1552223265171051, "learning_rate": 1.745006036147641e-05, "loss": 0.1815, "step": 8235 }, { "epoch": 5.898381633421152, "grad_norm": 0.1319049596786499, "learning_rate": 1.7441861457669855e-05, "loss": 0.1241, "step": 8236 }, { "epoch": 5.899134362062476, "grad_norm": 0.13077403604984283, "learning_rate": 1.7433666661164854e-05, "loss": 0.0784, "step": 8237 }, { "epoch": 5.8998870907038015, "grad_norm": 0.15962527692317963, "learning_rate": 1.7425475972857597e-05, "loss": 0.1363, "step": 8238 }, { "epoch": 5.900639819345126, "grad_norm": 0.1678323894739151, "learning_rate": 1.7417289393643783e-05, "loss": 0.1448, "step": 8239 }, { "epoch": 5.90139254798645, "grad_norm": 0.1281220018863678, "learning_rate": 1.7409106924418696e-05, "loss": 0.0839, "step": 8240 }, { "epoch": 5.902145276627776, "grad_norm": 0.146963968873024, "learning_rate": 1.740092856607716e-05, "loss": 0.116, "step": 8241 }, { "epoch": 5.9028980052691, "grad_norm": 0.14050434529781342, "learning_rate": 1.7392754319513562e-05, "loss": 0.0922, "step": 8242 }, { "epoch": 5.903650733910426, "grad_norm": 0.15716002881526947, "learning_rate": 1.738458418562179e-05, "loss": 0.0751, "step": 8243 }, { "epoch": 5.90440346255175, "grad_norm": 0.15642093122005463, "learning_rate": 1.7376418165295357e-05, "loss": 0.143, "step": 8244 }, { "epoch": 5.9051561911930746, "grad_norm": 0.13412156701087952, "learning_rate": 1.7368256259427253e-05, "loss": 0.1018, "step": 8245 }, { "epoch": 5.9059089198344, "grad_norm": 0.13534897565841675, "learning_rate": 1.7360098468910065e-05, "loss": 0.1103, "step": 8246 }, { "epoch": 5.906661648475724, "grad_norm": 0.14235053956508636, "learning_rate": 1.7351944794635915e-05, "loss": 0.0821, "step": 8247 }, { "epoch": 5.90741437711705, "grad_norm": 0.11846716701984406, "learning_rate": 1.7343795237496496e-05, "loss": 0.0654, "step": 8248 }, { "epoch": 5.908167105758374, "grad_norm": 0.13037513196468353, "learning_rate": 1.7335649798383e-05, "loss": 0.1117, "step": 8249 }, { "epoch": 5.908919834399699, "grad_norm": 0.14736823737621307, "learning_rate": 1.7327508478186218e-05, "loss": 0.0631, "step": 8250 }, { "epoch": 5.909672563041024, "grad_norm": 0.1256551891565323, "learning_rate": 1.7319371277796476e-05, "loss": 0.0753, "step": 8251 }, { "epoch": 5.9104252916823485, "grad_norm": 0.12942464649677277, "learning_rate": 1.7311238198103627e-05, "loss": 0.0957, "step": 8252 }, { "epoch": 5.911178020323673, "grad_norm": 0.1193857342004776, "learning_rate": 1.7303109239997107e-05, "loss": 0.0599, "step": 8253 }, { "epoch": 5.911930748964998, "grad_norm": 0.13558274507522583, "learning_rate": 1.7294984404365888e-05, "loss": 0.0681, "step": 8254 }, { "epoch": 5.912683477606323, "grad_norm": 0.16767151653766632, "learning_rate": 1.7286863692098486e-05, "loss": 0.1338, "step": 8255 }, { "epoch": 5.913436206247647, "grad_norm": 0.13644510507583618, "learning_rate": 1.7278747104082968e-05, "loss": 0.0527, "step": 8256 }, { "epoch": 5.914188934888973, "grad_norm": 0.17464926838874817, "learning_rate": 1.7270634641206963e-05, "loss": 0.1287, "step": 8257 }, { "epoch": 5.914941663530297, "grad_norm": 0.12856708467006683, "learning_rate": 1.7262526304357614e-05, "loss": 0.0915, "step": 8258 }, { "epoch": 5.9156943921716225, "grad_norm": 0.1398400068283081, "learning_rate": 1.725442209442166e-05, "loss": 0.0898, "step": 8259 }, { "epoch": 5.916447120812947, "grad_norm": 0.15326553583145142, "learning_rate": 1.7246322012285347e-05, "loss": 0.1024, "step": 8260 }, { "epoch": 5.9171998494542715, "grad_norm": 0.1194678321480751, "learning_rate": 1.7238226058834507e-05, "loss": 0.1077, "step": 8261 }, { "epoch": 5.917952578095597, "grad_norm": 0.13189633190631866, "learning_rate": 1.7230134234954487e-05, "loss": 0.0388, "step": 8262 }, { "epoch": 5.918705306736921, "grad_norm": 0.1494700312614441, "learning_rate": 1.7222046541530208e-05, "loss": 0.0671, "step": 8263 }, { "epoch": 5.919458035378246, "grad_norm": 0.1589982509613037, "learning_rate": 1.7213962979446112e-05, "loss": 0.1281, "step": 8264 }, { "epoch": 5.920210764019571, "grad_norm": 0.15097187459468842, "learning_rate": 1.720588354958621e-05, "loss": 0.1093, "step": 8265 }, { "epoch": 5.920963492660896, "grad_norm": 0.16567887365818024, "learning_rate": 1.7197808252834065e-05, "loss": 0.1311, "step": 8266 }, { "epoch": 5.921716221302221, "grad_norm": 0.14240312576293945, "learning_rate": 1.7189737090072783e-05, "loss": 0.0794, "step": 8267 }, { "epoch": 5.9224689499435454, "grad_norm": 0.15327578783035278, "learning_rate": 1.7181670062184995e-05, "loss": 0.0784, "step": 8268 }, { "epoch": 5.92322167858487, "grad_norm": 0.14286495745182037, "learning_rate": 1.7173607170052923e-05, "loss": 0.0919, "step": 8269 }, { "epoch": 5.923974407226195, "grad_norm": 0.15626336634159088, "learning_rate": 1.7165548414558292e-05, "loss": 0.1375, "step": 8270 }, { "epoch": 5.92472713586752, "grad_norm": 0.1393461674451828, "learning_rate": 1.7157493796582398e-05, "loss": 0.0811, "step": 8271 }, { "epoch": 5.925479864508844, "grad_norm": 0.16345980763435364, "learning_rate": 1.7149443317006085e-05, "loss": 0.1189, "step": 8272 }, { "epoch": 5.92623259315017, "grad_norm": 0.1365913450717926, "learning_rate": 1.7141396976709755e-05, "loss": 0.117, "step": 8273 }, { "epoch": 5.926985321791494, "grad_norm": 0.1577150672674179, "learning_rate": 1.7133354776573323e-05, "loss": 0.1153, "step": 8274 }, { "epoch": 5.9277380504328185, "grad_norm": 0.14149390161037445, "learning_rate": 1.712531671747628e-05, "loss": 0.1569, "step": 8275 }, { "epoch": 5.928490779074144, "grad_norm": 0.13688603043556213, "learning_rate": 1.7117282800297674e-05, "loss": 0.0641, "step": 8276 }, { "epoch": 5.929243507715468, "grad_norm": 0.164977565407753, "learning_rate": 1.7109253025916044e-05, "loss": 0.1029, "step": 8277 }, { "epoch": 5.929996236356794, "grad_norm": 0.12066667526960373, "learning_rate": 1.7101227395209546e-05, "loss": 0.0353, "step": 8278 }, { "epoch": 5.930748964998118, "grad_norm": 0.14276742935180664, "learning_rate": 1.709320590905583e-05, "loss": 0.0603, "step": 8279 }, { "epoch": 5.931501693639443, "grad_norm": 0.11976196616888046, "learning_rate": 1.7085188568332146e-05, "loss": 0.0912, "step": 8280 }, { "epoch": 5.932254422280768, "grad_norm": 0.1594960242509842, "learning_rate": 1.7077175373915215e-05, "loss": 0.1185, "step": 8281 }, { "epoch": 5.9330071509220925, "grad_norm": 0.15736332535743713, "learning_rate": 1.7069166326681383e-05, "loss": 0.1288, "step": 8282 }, { "epoch": 5.933759879563418, "grad_norm": 0.13330475986003876, "learning_rate": 1.7061161427506482e-05, "loss": 0.1089, "step": 8283 }, { "epoch": 5.934512608204742, "grad_norm": 0.13872553408145905, "learning_rate": 1.705316067726594e-05, "loss": 0.093, "step": 8284 }, { "epoch": 5.935265336846067, "grad_norm": 0.1446668952703476, "learning_rate": 1.704516407683467e-05, "loss": 0.0694, "step": 8285 }, { "epoch": 5.936018065487392, "grad_norm": 0.13718879222869873, "learning_rate": 1.7037171627087216e-05, "loss": 0.0679, "step": 8286 }, { "epoch": 5.936770794128717, "grad_norm": 0.1482827514410019, "learning_rate": 1.7029183328897584e-05, "loss": 0.0967, "step": 8287 }, { "epoch": 5.937523522770041, "grad_norm": 0.14843811094760895, "learning_rate": 1.7021199183139385e-05, "loss": 0.1441, "step": 8288 }, { "epoch": 5.9382762514113665, "grad_norm": 0.15514954924583435, "learning_rate": 1.7013219190685738e-05, "loss": 0.1085, "step": 8289 }, { "epoch": 5.939028980052691, "grad_norm": 0.13207566738128662, "learning_rate": 1.7005243352409334e-05, "loss": 0.0374, "step": 8290 }, { "epoch": 5.9397817086940154, "grad_norm": 0.16188475489616394, "learning_rate": 1.6997271669182372e-05, "loss": 0.1289, "step": 8291 }, { "epoch": 5.940534437335341, "grad_norm": 0.16238872706890106, "learning_rate": 1.6989304141876664e-05, "loss": 0.1673, "step": 8292 }, { "epoch": 5.941287165976665, "grad_norm": 0.1614462435245514, "learning_rate": 1.69813407713635e-05, "loss": 0.153, "step": 8293 }, { "epoch": 5.94203989461799, "grad_norm": 0.124757319688797, "learning_rate": 1.6973381558513753e-05, "loss": 0.1051, "step": 8294 }, { "epoch": 5.942792623259315, "grad_norm": 0.1544596254825592, "learning_rate": 1.6965426504197822e-05, "loss": 0.1421, "step": 8295 }, { "epoch": 5.94354535190064, "grad_norm": 0.15890516340732574, "learning_rate": 1.6957475609285666e-05, "loss": 0.2326, "step": 8296 }, { "epoch": 5.944298080541965, "grad_norm": 0.16244232654571533, "learning_rate": 1.6949528874646774e-05, "loss": 0.0966, "step": 8297 }, { "epoch": 5.945050809183289, "grad_norm": 0.15080654621124268, "learning_rate": 1.6941586301150214e-05, "loss": 0.0683, "step": 8298 }, { "epoch": 5.945803537824614, "grad_norm": 0.1512879580259323, "learning_rate": 1.6933647889664538e-05, "loss": 0.1241, "step": 8299 }, { "epoch": 5.946556266465939, "grad_norm": 0.14079295098781586, "learning_rate": 1.6925713641057904e-05, "loss": 0.1004, "step": 8300 }, { "epoch": 5.947308995107264, "grad_norm": 0.15511097013950348, "learning_rate": 1.691778355619799e-05, "loss": 0.1248, "step": 8301 }, { "epoch": 5.948061723748589, "grad_norm": 0.14761626720428467, "learning_rate": 1.6909857635952003e-05, "loss": 0.1369, "step": 8302 }, { "epoch": 5.948814452389914, "grad_norm": 0.12095820903778076, "learning_rate": 1.690193588118672e-05, "loss": 0.0739, "step": 8303 }, { "epoch": 5.949567181031238, "grad_norm": 0.1428757756948471, "learning_rate": 1.6894018292768438e-05, "loss": 0.0825, "step": 8304 }, { "epoch": 5.950319909672563, "grad_norm": 0.13716310262680054, "learning_rate": 1.6886104871563037e-05, "loss": 0.1047, "step": 8305 }, { "epoch": 5.951072638313888, "grad_norm": 0.12864315509796143, "learning_rate": 1.6878195618435895e-05, "loss": 0.102, "step": 8306 }, { "epoch": 5.951825366955212, "grad_norm": 0.15862637758255005, "learning_rate": 1.687029053425198e-05, "loss": 0.1102, "step": 8307 }, { "epoch": 5.952578095596538, "grad_norm": 0.15298426151275635, "learning_rate": 1.6862389619875753e-05, "loss": 0.0783, "step": 8308 }, { "epoch": 5.953330824237862, "grad_norm": 0.13546201586723328, "learning_rate": 1.6854492876171264e-05, "loss": 0.0871, "step": 8309 }, { "epoch": 5.954083552879187, "grad_norm": 0.1242193728685379, "learning_rate": 1.684660030400207e-05, "loss": 0.1846, "step": 8310 }, { "epoch": 5.954836281520512, "grad_norm": 0.16183003783226013, "learning_rate": 1.683871190423132e-05, "loss": 0.1068, "step": 8311 }, { "epoch": 5.9555890101618365, "grad_norm": 0.12552635371685028, "learning_rate": 1.6830827677721648e-05, "loss": 0.0872, "step": 8312 }, { "epoch": 5.956341738803161, "grad_norm": 0.16162227094173431, "learning_rate": 1.682294762533529e-05, "loss": 0.0969, "step": 8313 }, { "epoch": 5.957094467444486, "grad_norm": 0.15501371026039124, "learning_rate": 1.681507174793397e-05, "loss": 0.1347, "step": 8314 }, { "epoch": 5.957847196085811, "grad_norm": 0.13323603570461273, "learning_rate": 1.6807200046379e-05, "loss": 0.1014, "step": 8315 }, { "epoch": 5.958599924727136, "grad_norm": 0.12665994465351105, "learning_rate": 1.6799332521531194e-05, "loss": 0.0936, "step": 8316 }, { "epoch": 5.959352653368461, "grad_norm": 0.12363064289093018, "learning_rate": 1.6791469174250973e-05, "loss": 0.0536, "step": 8317 }, { "epoch": 5.960105382009785, "grad_norm": 0.14921978116035461, "learning_rate": 1.6783610005398223e-05, "loss": 0.1154, "step": 8318 }, { "epoch": 5.9608581106511105, "grad_norm": 0.1388511210680008, "learning_rate": 1.6775755015832435e-05, "loss": 0.0715, "step": 8319 }, { "epoch": 5.961610839292435, "grad_norm": 0.13706578314304352, "learning_rate": 1.67679042064126e-05, "loss": 0.1045, "step": 8320 }, { "epoch": 5.96236356793376, "grad_norm": 0.13842958211898804, "learning_rate": 1.676005757799728e-05, "loss": 0.0608, "step": 8321 }, { "epoch": 5.963116296575085, "grad_norm": 0.15261653065681458, "learning_rate": 1.6752215131444578e-05, "loss": 0.1231, "step": 8322 }, { "epoch": 5.963869025216409, "grad_norm": 0.16016054153442383, "learning_rate": 1.674437686761212e-05, "loss": 0.1399, "step": 8323 }, { "epoch": 5.964621753857735, "grad_norm": 0.15305717289447784, "learning_rate": 1.673654278735709e-05, "loss": 0.1457, "step": 8324 }, { "epoch": 5.965374482499059, "grad_norm": 0.14990250766277313, "learning_rate": 1.6728712891536215e-05, "loss": 0.1561, "step": 8325 }, { "epoch": 5.966127211140384, "grad_norm": 0.1524270921945572, "learning_rate": 1.6720887181005763e-05, "loss": 0.0966, "step": 8326 }, { "epoch": 5.966879939781709, "grad_norm": 0.13004638254642487, "learning_rate": 1.6713065656621525e-05, "loss": 0.0838, "step": 8327 }, { "epoch": 5.967632668423033, "grad_norm": 0.14670787751674652, "learning_rate": 1.6705248319238876e-05, "loss": 0.0505, "step": 8328 }, { "epoch": 5.968385397064358, "grad_norm": 0.13828735053539276, "learning_rate": 1.6697435169712676e-05, "loss": 0.0538, "step": 8329 }, { "epoch": 5.969138125705683, "grad_norm": 0.16154678165912628, "learning_rate": 1.6689626208897398e-05, "loss": 0.1347, "step": 8330 }, { "epoch": 5.969890854347008, "grad_norm": 0.12441177666187286, "learning_rate": 1.668182143764699e-05, "loss": 0.0744, "step": 8331 }, { "epoch": 5.970643582988333, "grad_norm": 0.1301904022693634, "learning_rate": 1.667402085681499e-05, "loss": 0.1038, "step": 8332 }, { "epoch": 5.971396311629658, "grad_norm": 0.13762913644313812, "learning_rate": 1.6666224467254438e-05, "loss": 0.1206, "step": 8333 }, { "epoch": 5.972149040270982, "grad_norm": 0.1295517534017563, "learning_rate": 1.665843226981795e-05, "loss": 0.076, "step": 8334 }, { "epoch": 5.972901768912307, "grad_norm": 0.1342104822397232, "learning_rate": 1.6650644265357645e-05, "loss": 0.0874, "step": 8335 }, { "epoch": 5.973654497553632, "grad_norm": 0.1497512310743332, "learning_rate": 1.6642860454725247e-05, "loss": 0.1224, "step": 8336 }, { "epoch": 5.974407226194956, "grad_norm": 0.14590050280094147, "learning_rate": 1.6635080838771948e-05, "loss": 0.1078, "step": 8337 }, { "epoch": 5.975159954836282, "grad_norm": 0.12805376946926117, "learning_rate": 1.662730541834854e-05, "loss": 0.0899, "step": 8338 }, { "epoch": 5.975912683477606, "grad_norm": 0.12927719950675964, "learning_rate": 1.6619534194305308e-05, "loss": 0.1389, "step": 8339 }, { "epoch": 5.9766654121189315, "grad_norm": 0.1621437519788742, "learning_rate": 1.6611767167492124e-05, "loss": 0.1332, "step": 8340 }, { "epoch": 5.977418140760256, "grad_norm": 0.15120142698287964, "learning_rate": 1.6604004338758352e-05, "loss": 0.1097, "step": 8341 }, { "epoch": 5.9781708694015805, "grad_norm": 0.1288755089044571, "learning_rate": 1.659624570895294e-05, "loss": 0.0912, "step": 8342 }, { "epoch": 5.978923598042906, "grad_norm": 0.14687924087047577, "learning_rate": 1.6588491278924362e-05, "loss": 0.1319, "step": 8343 }, { "epoch": 5.97967632668423, "grad_norm": 0.1287355273962021, "learning_rate": 1.6580741049520637e-05, "loss": 0.043, "step": 8344 }, { "epoch": 5.980429055325555, "grad_norm": 0.12914720177650452, "learning_rate": 1.6572995021589293e-05, "loss": 0.0724, "step": 8345 }, { "epoch": 5.98118178396688, "grad_norm": 0.1504610776901245, "learning_rate": 1.6565253195977444e-05, "loss": 0.2044, "step": 8346 }, { "epoch": 5.981934512608205, "grad_norm": 0.14852078258991241, "learning_rate": 1.6557515573531724e-05, "loss": 0.1273, "step": 8347 }, { "epoch": 5.982687241249529, "grad_norm": 0.12337923794984818, "learning_rate": 1.6549782155098298e-05, "loss": 0.1011, "step": 8348 }, { "epoch": 5.9834399698908545, "grad_norm": 0.14464950561523438, "learning_rate": 1.6542052941522885e-05, "loss": 0.0899, "step": 8349 }, { "epoch": 5.984192698532179, "grad_norm": 0.13289225101470947, "learning_rate": 1.653432793365074e-05, "loss": 0.0577, "step": 8350 }, { "epoch": 5.984945427173504, "grad_norm": 0.15262562036514282, "learning_rate": 1.6526607132326676e-05, "loss": 0.0886, "step": 8351 }, { "epoch": 5.985698155814829, "grad_norm": 0.1320917159318924, "learning_rate": 1.6518890538395e-05, "loss": 0.0473, "step": 8352 }, { "epoch": 5.986450884456153, "grad_norm": 0.1716724932193756, "learning_rate": 1.6511178152699602e-05, "loss": 0.1559, "step": 8353 }, { "epoch": 5.987203613097479, "grad_norm": 0.15302297472953796, "learning_rate": 1.6503469976083893e-05, "loss": 0.0635, "step": 8354 }, { "epoch": 5.987956341738803, "grad_norm": 0.13482330739498138, "learning_rate": 1.6495766009390823e-05, "loss": 0.1669, "step": 8355 }, { "epoch": 5.9887090703801285, "grad_norm": 0.17673076689243317, "learning_rate": 1.6488066253462896e-05, "loss": 0.1078, "step": 8356 }, { "epoch": 5.989461799021453, "grad_norm": 0.14343379437923431, "learning_rate": 1.648037070914215e-05, "loss": 0.1391, "step": 8357 }, { "epoch": 5.990214527662777, "grad_norm": 0.1505752056837082, "learning_rate": 1.647267937727014e-05, "loss": 0.1647, "step": 8358 }, { "epoch": 5.990967256304103, "grad_norm": 0.11697009950876236, "learning_rate": 1.6464992258687996e-05, "loss": 0.0752, "step": 8359 }, { "epoch": 5.991719984945427, "grad_norm": 0.12386713176965714, "learning_rate": 1.6457309354236362e-05, "loss": 0.0734, "step": 8360 }, { "epoch": 5.992472713586752, "grad_norm": 0.1405567079782486, "learning_rate": 1.6449630664755416e-05, "loss": 0.1113, "step": 8361 }, { "epoch": 5.993225442228077, "grad_norm": 0.1287328451871872, "learning_rate": 1.6441956191084906e-05, "loss": 0.1097, "step": 8362 }, { "epoch": 5.9939781708694015, "grad_norm": 0.1412249207496643, "learning_rate": 1.6434285934064107e-05, "loss": 0.1824, "step": 8363 }, { "epoch": 5.994730899510726, "grad_norm": 0.15650053322315216, "learning_rate": 1.6426619894531802e-05, "loss": 0.121, "step": 8364 }, { "epoch": 5.995483628152051, "grad_norm": 0.1736263781785965, "learning_rate": 1.6418958073326363e-05, "loss": 0.1089, "step": 8365 }, { "epoch": 5.996236356793376, "grad_norm": 0.12692083418369293, "learning_rate": 1.6411300471285656e-05, "loss": 0.1194, "step": 8366 }, { "epoch": 5.9969890854347, "grad_norm": 0.14487962424755096, "learning_rate": 1.640364708924711e-05, "loss": 0.0931, "step": 8367 }, { "epoch": 5.997741814076026, "grad_norm": 0.1511600762605667, "learning_rate": 1.6395997928047687e-05, "loss": 0.1002, "step": 8368 }, { "epoch": 5.99849454271735, "grad_norm": 0.16088086366653442, "learning_rate": 1.63883529885239e-05, "loss": 0.0878, "step": 8369 }, { "epoch": 5.9992472713586755, "grad_norm": 0.1308668702840805, "learning_rate": 1.6380712271511776e-05, "loss": 0.083, "step": 8370 }, { "epoch": 6.0, "grad_norm": 0.14222902059555054, "learning_rate": 1.6373075777846892e-05, "loss": 0.1535, "step": 8371 }, { "epoch": 6.0007527286413245, "grad_norm": 0.14824321866035461, "learning_rate": 1.6365443508364372e-05, "loss": 0.0773, "step": 8372 }, { "epoch": 6.00150545728265, "grad_norm": 0.1547854095697403, "learning_rate": 1.6357815463898856e-05, "loss": 0.1457, "step": 8373 }, { "epoch": 6.002258185923974, "grad_norm": 0.13945075869560242, "learning_rate": 1.6350191645284546e-05, "loss": 0.103, "step": 8374 }, { "epoch": 6.003010914565299, "grad_norm": 0.1565263718366623, "learning_rate": 1.6342572053355166e-05, "loss": 0.1045, "step": 8375 }, { "epoch": 6.003763643206624, "grad_norm": 0.13873037695884705, "learning_rate": 1.6334956688943994e-05, "loss": 0.084, "step": 8376 }, { "epoch": 6.004516371847949, "grad_norm": 0.1253778636455536, "learning_rate": 1.6327345552883817e-05, "loss": 0.0807, "step": 8377 }, { "epoch": 6.005269100489274, "grad_norm": 0.16399404406547546, "learning_rate": 1.6319738646007e-05, "loss": 0.1038, "step": 8378 }, { "epoch": 6.0060218291305985, "grad_norm": 0.1563863903284073, "learning_rate": 1.63121359691454e-05, "loss": 0.1601, "step": 8379 }, { "epoch": 6.006774557771923, "grad_norm": 0.13897232711315155, "learning_rate": 1.630453752313044e-05, "loss": 0.1647, "step": 8380 }, { "epoch": 6.007527286413248, "grad_norm": 0.14397470653057098, "learning_rate": 1.629694330879308e-05, "loss": 0.1054, "step": 8381 }, { "epoch": 6.008280015054573, "grad_norm": 0.15497656166553497, "learning_rate": 1.628935332696382e-05, "loss": 0.1027, "step": 8382 }, { "epoch": 6.009032743695897, "grad_norm": 0.12761829793453217, "learning_rate": 1.628176757847267e-05, "loss": 0.0783, "step": 8383 }, { "epoch": 6.009785472337223, "grad_norm": 0.130618616938591, "learning_rate": 1.6274186064149223e-05, "loss": 0.0723, "step": 8384 }, { "epoch": 6.010538200978547, "grad_norm": 0.13028286397457123, "learning_rate": 1.6266608784822544e-05, "loss": 0.1379, "step": 8385 }, { "epoch": 6.011290929619872, "grad_norm": 0.14778739213943481, "learning_rate": 1.6259035741321298e-05, "loss": 0.1004, "step": 8386 }, { "epoch": 6.012043658261197, "grad_norm": 0.1521182656288147, "learning_rate": 1.625146693447365e-05, "loss": 0.1321, "step": 8387 }, { "epoch": 6.012796386902521, "grad_norm": 0.14663439989089966, "learning_rate": 1.624390236510734e-05, "loss": 0.0884, "step": 8388 }, { "epoch": 6.013549115543847, "grad_norm": 0.13587038218975067, "learning_rate": 1.6236342034049583e-05, "loss": 0.1064, "step": 8389 }, { "epoch": 6.014301844185171, "grad_norm": 0.14220401644706726, "learning_rate": 1.62287859421272e-05, "loss": 0.1536, "step": 8390 }, { "epoch": 6.015054572826496, "grad_norm": 0.15494313836097717, "learning_rate": 1.6221234090166477e-05, "loss": 0.0635, "step": 8391 }, { "epoch": 6.015807301467821, "grad_norm": 0.14451099932193756, "learning_rate": 1.621368647899329e-05, "loss": 0.0984, "step": 8392 }, { "epoch": 6.0165600301091455, "grad_norm": 0.1513853371143341, "learning_rate": 1.6206143109433036e-05, "loss": 0.0655, "step": 8393 }, { "epoch": 6.017312758750471, "grad_norm": 0.14627979695796967, "learning_rate": 1.6198603982310657e-05, "loss": 0.0956, "step": 8394 }, { "epoch": 6.018065487391795, "grad_norm": 0.16426631808280945, "learning_rate": 1.61910690984506e-05, "loss": 0.1128, "step": 8395 }, { "epoch": 6.01881821603312, "grad_norm": 0.15332679450511932, "learning_rate": 1.6183538458676878e-05, "loss": 0.1531, "step": 8396 }, { "epoch": 6.019570944674445, "grad_norm": 0.13737961649894714, "learning_rate": 1.617601206381304e-05, "loss": 0.0945, "step": 8397 }, { "epoch": 6.02032367331577, "grad_norm": 0.13587404787540436, "learning_rate": 1.6168489914682144e-05, "loss": 0.0922, "step": 8398 }, { "epoch": 6.021076401957094, "grad_norm": 0.12795406579971313, "learning_rate": 1.6160972012106805e-05, "loss": 0.1232, "step": 8399 }, { "epoch": 6.0218291305984195, "grad_norm": 0.14465034008026123, "learning_rate": 1.6153458356909176e-05, "loss": 0.0818, "step": 8400 }, { "epoch": 6.0218291305984195, "eval_loss": 0.14897465705871582, "eval_runtime": 456.2384, "eval_samples_per_second": 21.101, "eval_steps_per_second": 0.66, "step": 8400 }, { "epoch": 6.022581859239744, "grad_norm": 0.15920722484588623, "learning_rate": 1.6145948949910942e-05, "loss": 0.1384, "step": 8401 }, { "epoch": 6.0233345878810685, "grad_norm": 0.123938649892807, "learning_rate": 1.6138443791933307e-05, "loss": 0.0891, "step": 8402 }, { "epoch": 6.024087316522394, "grad_norm": 0.16664329171180725, "learning_rate": 1.6130942883797043e-05, "loss": 0.1775, "step": 8403 }, { "epoch": 6.024840045163718, "grad_norm": 0.12942758202552795, "learning_rate": 1.6123446226322414e-05, "loss": 0.0649, "step": 8404 }, { "epoch": 6.025592773805044, "grad_norm": 0.13521020114421844, "learning_rate": 1.611595382032926e-05, "loss": 0.0899, "step": 8405 }, { "epoch": 6.026345502446368, "grad_norm": 0.15466421842575073, "learning_rate": 1.610846566663693e-05, "loss": 0.1021, "step": 8406 }, { "epoch": 6.027098231087693, "grad_norm": 0.1519104689359665, "learning_rate": 1.6100981766064334e-05, "loss": 0.137, "step": 8407 }, { "epoch": 6.027850959729018, "grad_norm": 0.13770659267902374, "learning_rate": 1.609350211942988e-05, "loss": 0.0941, "step": 8408 }, { "epoch": 6.028603688370342, "grad_norm": 0.12822140753269196, "learning_rate": 1.6086026727551546e-05, "loss": 0.0496, "step": 8409 }, { "epoch": 6.029356417011667, "grad_norm": 0.11240702867507935, "learning_rate": 1.6078555591246814e-05, "loss": 0.1138, "step": 8410 }, { "epoch": 6.030109145652992, "grad_norm": 0.12020906060934067, "learning_rate": 1.6071088711332736e-05, "loss": 0.0796, "step": 8411 }, { "epoch": 6.030861874294317, "grad_norm": 0.150872603058815, "learning_rate": 1.6063626088625848e-05, "loss": 0.0751, "step": 8412 }, { "epoch": 6.031614602935642, "grad_norm": 0.13255678117275238, "learning_rate": 1.6056167723942286e-05, "loss": 0.1102, "step": 8413 }, { "epoch": 6.032367331576967, "grad_norm": 0.12785232067108154, "learning_rate": 1.6048713618097668e-05, "loss": 0.0642, "step": 8414 }, { "epoch": 6.033120060218291, "grad_norm": 0.15118511021137238, "learning_rate": 1.604126377190717e-05, "loss": 0.1395, "step": 8415 }, { "epoch": 6.033872788859616, "grad_norm": 0.1409517079591751, "learning_rate": 1.6033818186185484e-05, "loss": 0.06, "step": 8416 }, { "epoch": 6.034625517500941, "grad_norm": 0.15508398413658142, "learning_rate": 1.6026376861746862e-05, "loss": 0.1021, "step": 8417 }, { "epoch": 6.035378246142265, "grad_norm": 0.16952843964099884, "learning_rate": 1.6018939799405067e-05, "loss": 0.1976, "step": 8418 }, { "epoch": 6.036130974783591, "grad_norm": 0.13953962922096252, "learning_rate": 1.6011506999973417e-05, "loss": 0.1056, "step": 8419 }, { "epoch": 6.036883703424915, "grad_norm": 0.12427588552236557, "learning_rate": 1.6004078464264733e-05, "loss": 0.0726, "step": 8420 }, { "epoch": 6.03763643206624, "grad_norm": 0.11791505664587021, "learning_rate": 1.59966541930914e-05, "loss": 0.0926, "step": 8421 }, { "epoch": 6.038389160707565, "grad_norm": 0.16485126316547394, "learning_rate": 1.598923418726533e-05, "loss": 0.1266, "step": 8422 }, { "epoch": 6.0391418893488895, "grad_norm": 0.14807391166687012, "learning_rate": 1.598181844759795e-05, "loss": 0.1558, "step": 8423 }, { "epoch": 6.039894617990215, "grad_norm": 0.14800679683685303, "learning_rate": 1.5974406974900255e-05, "loss": 0.1127, "step": 8424 }, { "epoch": 6.040647346631539, "grad_norm": 0.15524527430534363, "learning_rate": 1.5966999769982712e-05, "loss": 0.0828, "step": 8425 }, { "epoch": 6.041400075272864, "grad_norm": 0.15262936055660248, "learning_rate": 1.595959683365541e-05, "loss": 0.1304, "step": 8426 }, { "epoch": 6.042152803914189, "grad_norm": 0.11426268517971039, "learning_rate": 1.5952198166727903e-05, "loss": 0.0732, "step": 8427 }, { "epoch": 6.042905532555514, "grad_norm": 0.15358032286167145, "learning_rate": 1.59448037700093e-05, "loss": 0.1083, "step": 8428 }, { "epoch": 6.043658261196838, "grad_norm": 0.12339655309915543, "learning_rate": 1.5937413644308225e-05, "loss": 0.0582, "step": 8429 }, { "epoch": 6.0444109898381635, "grad_norm": 0.1599564403295517, "learning_rate": 1.5930027790432876e-05, "loss": 0.2327, "step": 8430 }, { "epoch": 6.045163718479488, "grad_norm": 0.10399775207042694, "learning_rate": 1.592264620919093e-05, "loss": 0.0461, "step": 8431 }, { "epoch": 6.045916447120813, "grad_norm": 0.14145022630691528, "learning_rate": 1.5915268901389653e-05, "loss": 0.1204, "step": 8432 }, { "epoch": 6.046669175762138, "grad_norm": 0.10217520594596863, "learning_rate": 1.5907895867835806e-05, "loss": 0.0942, "step": 8433 }, { "epoch": 6.047421904403462, "grad_norm": 0.12612862884998322, "learning_rate": 1.59005271093357e-05, "loss": 0.0844, "step": 8434 }, { "epoch": 6.048174633044788, "grad_norm": 0.1268192082643509, "learning_rate": 1.5893162626695156e-05, "loss": 0.0906, "step": 8435 }, { "epoch": 6.048927361686112, "grad_norm": 0.12479416280984879, "learning_rate": 1.5885802420719556e-05, "loss": 0.1092, "step": 8436 }, { "epoch": 6.049680090327437, "grad_norm": 0.14975470304489136, "learning_rate": 1.587844649221378e-05, "loss": 0.1866, "step": 8437 }, { "epoch": 6.050432818968762, "grad_norm": 0.13642381131649017, "learning_rate": 1.587109484198229e-05, "loss": 0.106, "step": 8438 }, { "epoch": 6.051185547610086, "grad_norm": 0.1356101632118225, "learning_rate": 1.5863747470829038e-05, "loss": 0.0991, "step": 8439 }, { "epoch": 6.051938276251412, "grad_norm": 0.1590784192085266, "learning_rate": 1.585640437955753e-05, "loss": 0.1091, "step": 8440 }, { "epoch": 6.052691004892736, "grad_norm": 0.15127728879451752, "learning_rate": 1.5849065568970772e-05, "loss": 0.0758, "step": 8441 }, { "epoch": 6.053443733534061, "grad_norm": 0.15822657942771912, "learning_rate": 1.5841731039871348e-05, "loss": 0.1361, "step": 8442 }, { "epoch": 6.054196462175386, "grad_norm": 0.1459118127822876, "learning_rate": 1.583440079306135e-05, "loss": 0.0549, "step": 8443 }, { "epoch": 6.054949190816711, "grad_norm": 0.12444296479225159, "learning_rate": 1.5827074829342388e-05, "loss": 0.0686, "step": 8444 }, { "epoch": 6.055701919458035, "grad_norm": 0.15001444518566132, "learning_rate": 1.5819753149515625e-05, "loss": 0.0939, "step": 8445 }, { "epoch": 6.05645464809936, "grad_norm": 0.13925379514694214, "learning_rate": 1.5812435754381756e-05, "loss": 0.0823, "step": 8446 }, { "epoch": 6.057207376740685, "grad_norm": 0.13865192234516144, "learning_rate": 1.5805122644741e-05, "loss": 0.1567, "step": 8447 }, { "epoch": 6.057960105382009, "grad_norm": 0.16178923845291138, "learning_rate": 1.5797813821393096e-05, "loss": 0.1702, "step": 8448 }, { "epoch": 6.058712834023335, "grad_norm": 0.12770749628543854, "learning_rate": 1.579050928513734e-05, "loss": 0.0649, "step": 8449 }, { "epoch": 6.059465562664659, "grad_norm": 0.14625513553619385, "learning_rate": 1.578320903677252e-05, "loss": 0.1326, "step": 8450 }, { "epoch": 6.0602182913059846, "grad_norm": 0.14861273765563965, "learning_rate": 1.5775913077097015e-05, "loss": 0.0781, "step": 8451 }, { "epoch": 6.060971019947309, "grad_norm": 0.13794492185115814, "learning_rate": 1.576862140690868e-05, "loss": 0.1237, "step": 8452 }, { "epoch": 6.0617237485886335, "grad_norm": 0.1542590707540512, "learning_rate": 1.576133402700493e-05, "loss": 0.0741, "step": 8453 }, { "epoch": 6.062476477229959, "grad_norm": 0.13517162203788757, "learning_rate": 1.5754050938182687e-05, "loss": 0.0549, "step": 8454 }, { "epoch": 6.063229205871283, "grad_norm": 0.13406860828399658, "learning_rate": 1.5746772141238437e-05, "loss": 0.1389, "step": 8455 }, { "epoch": 6.063981934512608, "grad_norm": 0.14062464237213135, "learning_rate": 1.573949763696815e-05, "loss": 0.1166, "step": 8456 }, { "epoch": 6.064734663153933, "grad_norm": 0.13699211180210114, "learning_rate": 1.573222742616739e-05, "loss": 0.0632, "step": 8457 }, { "epoch": 6.065487391795258, "grad_norm": 0.13012488186359406, "learning_rate": 1.57249615096312e-05, "loss": 0.1368, "step": 8458 }, { "epoch": 6.066240120436583, "grad_norm": 0.17600101232528687, "learning_rate": 1.5717699888154172e-05, "loss": 0.1016, "step": 8459 }, { "epoch": 6.0669928490779075, "grad_norm": 0.14572201669216156, "learning_rate": 1.5710442562530416e-05, "loss": 0.0962, "step": 8460 }, { "epoch": 6.067745577719232, "grad_norm": 0.13774099946022034, "learning_rate": 1.5703189533553605e-05, "loss": 0.1115, "step": 8461 }, { "epoch": 6.068498306360557, "grad_norm": 0.16142533719539642, "learning_rate": 1.5695940802016893e-05, "loss": 0.1029, "step": 8462 }, { "epoch": 6.069251035001882, "grad_norm": 0.1372368335723877, "learning_rate": 1.5688696368712998e-05, "loss": 0.0984, "step": 8463 }, { "epoch": 6.070003763643206, "grad_norm": 0.11940954625606537, "learning_rate": 1.5681456234434168e-05, "loss": 0.0773, "step": 8464 }, { "epoch": 6.070756492284532, "grad_norm": 0.13074304163455963, "learning_rate": 1.5674220399972174e-05, "loss": 0.1043, "step": 8465 }, { "epoch": 6.071509220925856, "grad_norm": 0.1397981345653534, "learning_rate": 1.566698886611831e-05, "loss": 0.1024, "step": 8466 }, { "epoch": 6.0722619495671815, "grad_norm": 0.14895933866500854, "learning_rate": 1.5659761633663402e-05, "loss": 0.1354, "step": 8467 }, { "epoch": 6.073014678208506, "grad_norm": 0.13820357620716095, "learning_rate": 1.5652538703397823e-05, "loss": 0.0794, "step": 8468 }, { "epoch": 6.07376740684983, "grad_norm": 0.15801076591014862, "learning_rate": 1.564532007611145e-05, "loss": 0.1533, "step": 8469 }, { "epoch": 6.074520135491156, "grad_norm": 0.12149565666913986, "learning_rate": 1.56381057525937e-05, "loss": 0.0644, "step": 8470 }, { "epoch": 6.07527286413248, "grad_norm": 0.14028972387313843, "learning_rate": 1.5630895733633524e-05, "loss": 0.1061, "step": 8471 }, { "epoch": 6.076025592773805, "grad_norm": 0.1513431817293167, "learning_rate": 1.5623690020019417e-05, "loss": 0.1236, "step": 8472 }, { "epoch": 6.07677832141513, "grad_norm": 0.13838694989681244, "learning_rate": 1.5616488612539353e-05, "loss": 0.1147, "step": 8473 }, { "epoch": 6.0775310500564546, "grad_norm": 0.13986170291900635, "learning_rate": 1.560929151198089e-05, "loss": 0.1241, "step": 8474 }, { "epoch": 6.078283778697779, "grad_norm": 0.1381123960018158, "learning_rate": 1.5602098719131076e-05, "loss": 0.1156, "step": 8475 }, { "epoch": 6.079036507339104, "grad_norm": 0.1287272721529007, "learning_rate": 1.5594910234776512e-05, "loss": 0.0491, "step": 8476 }, { "epoch": 6.079789235980429, "grad_norm": 0.1636016070842743, "learning_rate": 1.558772605970332e-05, "loss": 0.1534, "step": 8477 }, { "epoch": 6.080541964621754, "grad_norm": 0.14652353525161743, "learning_rate": 1.5580546194697164e-05, "loss": 0.0872, "step": 8478 }, { "epoch": 6.081294693263079, "grad_norm": 0.12038799375295639, "learning_rate": 1.5573370640543198e-05, "loss": 0.1067, "step": 8479 }, { "epoch": 6.082047421904403, "grad_norm": 0.12949836254119873, "learning_rate": 1.556619939802615e-05, "loss": 0.1102, "step": 8480 }, { "epoch": 6.0828001505457285, "grad_norm": 0.15080460906028748, "learning_rate": 1.555903246793024e-05, "loss": 0.1512, "step": 8481 }, { "epoch": 6.083552879187053, "grad_norm": 0.16316227614879608, "learning_rate": 1.5551869851039235e-05, "loss": 0.1316, "step": 8482 }, { "epoch": 6.0843056078283775, "grad_norm": 0.1428239345550537, "learning_rate": 1.5544711548136444e-05, "loss": 0.1046, "step": 8483 }, { "epoch": 6.085058336469703, "grad_norm": 0.13767720758914948, "learning_rate": 1.553755756000468e-05, "loss": 0.0844, "step": 8484 }, { "epoch": 6.085811065111027, "grad_norm": 0.15485058724880219, "learning_rate": 1.553040788742628e-05, "loss": 0.1191, "step": 8485 }, { "epoch": 6.086563793752353, "grad_norm": 0.13817985355854034, "learning_rate": 1.5523262531183147e-05, "loss": 0.1139, "step": 8486 }, { "epoch": 6.087316522393677, "grad_norm": 0.15378886461257935, "learning_rate": 1.5516121492056658e-05, "loss": 0.1112, "step": 8487 }, { "epoch": 6.088069251035002, "grad_norm": 0.1330643892288208, "learning_rate": 1.5508984770827758e-05, "loss": 0.0899, "step": 8488 }, { "epoch": 6.088821979676327, "grad_norm": 0.1309652179479599, "learning_rate": 1.5501852368276915e-05, "loss": 0.082, "step": 8489 }, { "epoch": 6.0895747083176515, "grad_norm": 0.13858874142169952, "learning_rate": 1.5494724285184116e-05, "loss": 0.0987, "step": 8490 }, { "epoch": 6.090327436958976, "grad_norm": 0.13336192071437836, "learning_rate": 1.5487600522328865e-05, "loss": 0.0978, "step": 8491 }, { "epoch": 6.091080165600301, "grad_norm": 0.13951122760772705, "learning_rate": 1.548048108049022e-05, "loss": 0.1373, "step": 8492 }, { "epoch": 6.091832894241626, "grad_norm": 0.12310218065977097, "learning_rate": 1.547336596044675e-05, "loss": 0.0635, "step": 8493 }, { "epoch": 6.09258562288295, "grad_norm": 0.15464027225971222, "learning_rate": 1.5466255162976547e-05, "loss": 0.0857, "step": 8494 }, { "epoch": 6.093338351524276, "grad_norm": 0.14190199971199036, "learning_rate": 1.545914868885724e-05, "loss": 0.1074, "step": 8495 }, { "epoch": 6.0940910801656, "grad_norm": 0.20937389135360718, "learning_rate": 1.5452046538865982e-05, "loss": 0.0654, "step": 8496 }, { "epoch": 6.0948438088069254, "grad_norm": 0.171937495470047, "learning_rate": 1.5444948713779472e-05, "loss": 0.1454, "step": 8497 }, { "epoch": 6.09559653744825, "grad_norm": 0.1658196598291397, "learning_rate": 1.543785521437388e-05, "loss": 0.1101, "step": 8498 }, { "epoch": 6.096349266089574, "grad_norm": 0.13265518844127655, "learning_rate": 1.5430766041424978e-05, "loss": 0.1208, "step": 8499 }, { "epoch": 6.0971019947309, "grad_norm": 0.15606585144996643, "learning_rate": 1.5423681195707997e-05, "loss": 0.0646, "step": 8500 }, { "epoch": 6.097854723372224, "grad_norm": 0.1421165019273758, "learning_rate": 1.541660067799774e-05, "loss": 0.1011, "step": 8501 }, { "epoch": 6.098607452013549, "grad_norm": 0.13547813892364502, "learning_rate": 1.5409524489068517e-05, "loss": 0.0867, "step": 8502 }, { "epoch": 6.099360180654874, "grad_norm": 0.12701581418514252, "learning_rate": 1.5402452629694185e-05, "loss": 0.0997, "step": 8503 }, { "epoch": 6.1001129092961985, "grad_norm": 0.16440843045711517, "learning_rate": 1.5395385100648088e-05, "loss": 0.0561, "step": 8504 }, { "epoch": 6.100865637937524, "grad_norm": 0.13807980716228485, "learning_rate": 1.5388321902703145e-05, "loss": 0.0675, "step": 8505 }, { "epoch": 6.101618366578848, "grad_norm": 0.1330270916223526, "learning_rate": 1.5381263036631753e-05, "loss": 0.1433, "step": 8506 }, { "epoch": 6.102371095220173, "grad_norm": 0.13426831364631653, "learning_rate": 1.5374208503205866e-05, "loss": 0.1028, "step": 8507 }, { "epoch": 6.103123823861498, "grad_norm": 0.15061360597610474, "learning_rate": 1.5367158303196964e-05, "loss": 0.0536, "step": 8508 }, { "epoch": 6.103876552502823, "grad_norm": 0.1649656593799591, "learning_rate": 1.536011243737605e-05, "loss": 0.1271, "step": 8509 }, { "epoch": 6.104629281144147, "grad_norm": 0.15908199548721313, "learning_rate": 1.5353070906513632e-05, "loss": 0.1407, "step": 8510 }, { "epoch": 6.1053820097854725, "grad_norm": 0.14732839167118073, "learning_rate": 1.5346033711379783e-05, "loss": 0.088, "step": 8511 }, { "epoch": 6.106134738426797, "grad_norm": 0.14346620440483093, "learning_rate": 1.5339000852744064e-05, "loss": 0.141, "step": 8512 }, { "epoch": 6.106887467068122, "grad_norm": 0.13691866397857666, "learning_rate": 1.5331972331375574e-05, "loss": 0.0823, "step": 8513 }, { "epoch": 6.107640195709447, "grad_norm": 0.1432054191827774, "learning_rate": 1.532494814804295e-05, "loss": 0.1427, "step": 8514 }, { "epoch": 6.108392924350771, "grad_norm": 0.13628168404102325, "learning_rate": 1.531792830351435e-05, "loss": 0.1752, "step": 8515 }, { "epoch": 6.109145652992097, "grad_norm": 0.13260933756828308, "learning_rate": 1.531091279855746e-05, "loss": 0.0762, "step": 8516 }, { "epoch": 6.109898381633421, "grad_norm": 0.15797919034957886, "learning_rate": 1.530390163393946e-05, "loss": 0.1284, "step": 8517 }, { "epoch": 6.110651110274746, "grad_norm": 0.14227303862571716, "learning_rate": 1.529689481042711e-05, "loss": 0.1217, "step": 8518 }, { "epoch": 6.111403838916071, "grad_norm": 0.1533435434103012, "learning_rate": 1.528989232878664e-05, "loss": 0.0528, "step": 8519 }, { "epoch": 6.1121565675573954, "grad_norm": 0.12001246958971024, "learning_rate": 1.5282894189783843e-05, "loss": 0.0583, "step": 8520 }, { "epoch": 6.112909296198721, "grad_norm": 0.16535545885562897, "learning_rate": 1.5275900394184018e-05, "loss": 0.0944, "step": 8521 }, { "epoch": 6.113662024840045, "grad_norm": 0.12358351051807404, "learning_rate": 1.5268910942752013e-05, "loss": 0.0344, "step": 8522 }, { "epoch": 6.11441475348137, "grad_norm": 0.1417740285396576, "learning_rate": 1.5261925836252165e-05, "loss": 0.169, "step": 8523 }, { "epoch": 6.115167482122695, "grad_norm": 0.14118708670139313, "learning_rate": 1.5254945075448369e-05, "loss": 0.1274, "step": 8524 }, { "epoch": 6.11592021076402, "grad_norm": 0.13458387553691864, "learning_rate": 1.5247968661104017e-05, "loss": 0.0897, "step": 8525 }, { "epoch": 6.116672939405344, "grad_norm": 0.15689678490161896, "learning_rate": 1.5240996593982047e-05, "loss": 0.2133, "step": 8526 }, { "epoch": 6.117425668046669, "grad_norm": 0.11770981550216675, "learning_rate": 1.5234028874844913e-05, "loss": 0.0591, "step": 8527 }, { "epoch": 6.118178396687994, "grad_norm": 0.14686621725559235, "learning_rate": 1.5227065504454596e-05, "loss": 0.1092, "step": 8528 }, { "epoch": 6.118931125329318, "grad_norm": 0.1483919620513916, "learning_rate": 1.5220106483572597e-05, "loss": 0.0858, "step": 8529 }, { "epoch": 6.119683853970644, "grad_norm": 0.12109559029340744, "learning_rate": 1.5213151812959947e-05, "loss": 0.1032, "step": 8530 }, { "epoch": 6.120436582611968, "grad_norm": 0.12596020102500916, "learning_rate": 1.5206201493377193e-05, "loss": 0.0725, "step": 8531 }, { "epoch": 6.121189311253294, "grad_norm": 0.15154790878295898, "learning_rate": 1.5199255525584422e-05, "loss": 0.1136, "step": 8532 }, { "epoch": 6.121942039894618, "grad_norm": 0.13639919459819794, "learning_rate": 1.5192313910341207e-05, "loss": 0.1785, "step": 8533 }, { "epoch": 6.1226947685359425, "grad_norm": 0.14700278639793396, "learning_rate": 1.5185376648406716e-05, "loss": 0.0899, "step": 8534 }, { "epoch": 6.123447497177268, "grad_norm": 0.15029162168502808, "learning_rate": 1.5178443740539558e-05, "loss": 0.131, "step": 8535 }, { "epoch": 6.124200225818592, "grad_norm": 0.13352052867412567, "learning_rate": 1.5171515187497929e-05, "loss": 0.1003, "step": 8536 }, { "epoch": 6.124952954459917, "grad_norm": 0.149620920419693, "learning_rate": 1.516459099003952e-05, "loss": 0.1026, "step": 8537 }, { "epoch": 6.125705683101242, "grad_norm": 0.1456947773694992, "learning_rate": 1.5157671148921546e-05, "loss": 0.1018, "step": 8538 }, { "epoch": 6.126458411742567, "grad_norm": 0.15615329146385193, "learning_rate": 1.5150755664900746e-05, "loss": 0.1403, "step": 8539 }, { "epoch": 6.127211140383892, "grad_norm": 0.13218717277050018, "learning_rate": 1.5143844538733398e-05, "loss": 0.1627, "step": 8540 }, { "epoch": 6.1279638690252165, "grad_norm": 0.13176903128623962, "learning_rate": 1.51369377711753e-05, "loss": 0.0862, "step": 8541 }, { "epoch": 6.128716597666541, "grad_norm": 0.13387298583984375, "learning_rate": 1.5130035362981743e-05, "loss": 0.0796, "step": 8542 }, { "epoch": 6.129469326307866, "grad_norm": 0.11886104196310043, "learning_rate": 1.5123137314907584e-05, "loss": 0.1198, "step": 8543 }, { "epoch": 6.130222054949191, "grad_norm": 0.1465590000152588, "learning_rate": 1.5116243627707163e-05, "loss": 0.1098, "step": 8544 }, { "epoch": 6.130974783590515, "grad_norm": 0.1245419904589653, "learning_rate": 1.5109354302134384e-05, "loss": 0.0822, "step": 8545 }, { "epoch": 6.131727512231841, "grad_norm": 0.17780596017837524, "learning_rate": 1.5102469338942626e-05, "loss": 0.1239, "step": 8546 }, { "epoch": 6.132480240873165, "grad_norm": 0.1388605684041977, "learning_rate": 1.5095588738884862e-05, "loss": 0.0999, "step": 8547 }, { "epoch": 6.13323296951449, "grad_norm": 0.1313129961490631, "learning_rate": 1.5088712502713501e-05, "loss": 0.0817, "step": 8548 }, { "epoch": 6.133985698155815, "grad_norm": 0.13043057918548584, "learning_rate": 1.5081840631180544e-05, "loss": 0.0593, "step": 8549 }, { "epoch": 6.134738426797139, "grad_norm": 0.15348440408706665, "learning_rate": 1.507497312503747e-05, "loss": 0.1224, "step": 8550 }, { "epoch": 6.135491155438465, "grad_norm": 0.11553870886564255, "learning_rate": 1.5068109985035323e-05, "loss": 0.1149, "step": 8551 }, { "epoch": 6.136243884079789, "grad_norm": 0.11611542850732803, "learning_rate": 1.5061251211924606e-05, "loss": 0.0431, "step": 8552 }, { "epoch": 6.136996612721114, "grad_norm": 0.1260402649641037, "learning_rate": 1.5054396806455431e-05, "loss": 0.0382, "step": 8553 }, { "epoch": 6.137749341362439, "grad_norm": 0.17234084010124207, "learning_rate": 1.5047546769377358e-05, "loss": 0.1099, "step": 8554 }, { "epoch": 6.138502070003764, "grad_norm": 0.14920568466186523, "learning_rate": 1.5040701101439511e-05, "loss": 0.0825, "step": 8555 }, { "epoch": 6.139254798645088, "grad_norm": 0.12931300699710846, "learning_rate": 1.50338598033905e-05, "loss": 0.0611, "step": 8556 }, { "epoch": 6.140007527286413, "grad_norm": 0.14447233080863953, "learning_rate": 1.5027022875978496e-05, "loss": 0.1273, "step": 8557 }, { "epoch": 6.140760255927738, "grad_norm": 0.114203542470932, "learning_rate": 1.5020190319951175e-05, "loss": 0.0393, "step": 8558 }, { "epoch": 6.141512984569063, "grad_norm": 0.16689422726631165, "learning_rate": 1.5013362136055731e-05, "loss": 0.1455, "step": 8559 }, { "epoch": 6.142265713210388, "grad_norm": 0.13410398364067078, "learning_rate": 1.5006538325038883e-05, "loss": 0.0926, "step": 8560 }, { "epoch": 6.143018441851712, "grad_norm": 0.13475270569324493, "learning_rate": 1.4999718887646875e-05, "loss": 0.1115, "step": 8561 }, { "epoch": 6.143771170493038, "grad_norm": 0.14643539488315582, "learning_rate": 1.4992903824625478e-05, "loss": 0.1008, "step": 8562 }, { "epoch": 6.144523899134362, "grad_norm": 0.12144535779953003, "learning_rate": 1.4986093136719959e-05, "loss": 0.0613, "step": 8563 }, { "epoch": 6.1452766277756865, "grad_norm": 0.1380467265844345, "learning_rate": 1.4979286824675142e-05, "loss": 0.0673, "step": 8564 }, { "epoch": 6.146029356417012, "grad_norm": 0.13543280959129333, "learning_rate": 1.4972484889235333e-05, "loss": 0.0974, "step": 8565 }, { "epoch": 6.146782085058336, "grad_norm": 0.11147205531597137, "learning_rate": 1.4965687331144414e-05, "loss": 0.0994, "step": 8566 }, { "epoch": 6.147534813699661, "grad_norm": 0.14200840890407562, "learning_rate": 1.4958894151145725e-05, "loss": 0.1571, "step": 8567 }, { "epoch": 6.148287542340986, "grad_norm": 0.1523379385471344, "learning_rate": 1.4952105349982181e-05, "loss": 0.0986, "step": 8568 }, { "epoch": 6.149040270982311, "grad_norm": 0.12885600328445435, "learning_rate": 1.4945320928396175e-05, "loss": 0.0944, "step": 8569 }, { "epoch": 6.149792999623636, "grad_norm": 0.1387719213962555, "learning_rate": 1.4938540887129663e-05, "loss": 0.081, "step": 8570 }, { "epoch": 6.1505457282649605, "grad_norm": 0.14931154251098633, "learning_rate": 1.493176522692407e-05, "loss": 0.1458, "step": 8571 }, { "epoch": 6.151298456906285, "grad_norm": 0.12481219321489334, "learning_rate": 1.4924993948520404e-05, "loss": 0.0613, "step": 8572 }, { "epoch": 6.15205118554761, "grad_norm": 0.12303578853607178, "learning_rate": 1.4918227052659145e-05, "loss": 0.0805, "step": 8573 }, { "epoch": 6.152803914188935, "grad_norm": 0.1552879363298416, "learning_rate": 1.4911464540080317e-05, "loss": 0.1071, "step": 8574 }, { "epoch": 6.153556642830259, "grad_norm": 0.13735081255435944, "learning_rate": 1.490470641152345e-05, "loss": 0.1262, "step": 8575 }, { "epoch": 6.154309371471585, "grad_norm": 0.15618804097175598, "learning_rate": 1.4897952667727613e-05, "loss": 0.0947, "step": 8576 }, { "epoch": 6.155062100112909, "grad_norm": 0.13905631005764008, "learning_rate": 1.4891203309431365e-05, "loss": 0.0901, "step": 8577 }, { "epoch": 6.1558148287542345, "grad_norm": 0.12338857352733612, "learning_rate": 1.488445833737284e-05, "loss": 0.1326, "step": 8578 }, { "epoch": 6.156567557395559, "grad_norm": 0.12333880364894867, "learning_rate": 1.4877717752289622e-05, "loss": 0.0767, "step": 8579 }, { "epoch": 6.157320286036883, "grad_norm": 0.11295074969530106, "learning_rate": 1.487098155491888e-05, "loss": 0.0663, "step": 8580 }, { "epoch": 6.158073014678209, "grad_norm": 0.15168510377407074, "learning_rate": 1.4864249745997255e-05, "loss": 0.0953, "step": 8581 }, { "epoch": 6.158825743319533, "grad_norm": 0.14058391749858856, "learning_rate": 1.4857522326260933e-05, "loss": 0.0966, "step": 8582 }, { "epoch": 6.159578471960858, "grad_norm": 0.10839954763650894, "learning_rate": 1.4850799296445625e-05, "loss": 0.0809, "step": 8583 }, { "epoch": 6.160331200602183, "grad_norm": 0.11807286739349365, "learning_rate": 1.4844080657286535e-05, "loss": 0.067, "step": 8584 }, { "epoch": 6.161083929243508, "grad_norm": 0.15285511314868927, "learning_rate": 1.4837366409518405e-05, "loss": 0.0813, "step": 8585 }, { "epoch": 6.161836657884833, "grad_norm": 0.17338739335536957, "learning_rate": 1.483065655387551e-05, "loss": 0.0894, "step": 8586 }, { "epoch": 6.162589386526157, "grad_norm": 0.1310047209262848, "learning_rate": 1.482395109109162e-05, "loss": 0.0716, "step": 8587 }, { "epoch": 6.163342115167482, "grad_norm": 0.1269054114818573, "learning_rate": 1.4817250021900031e-05, "loss": 0.0577, "step": 8588 }, { "epoch": 6.164094843808807, "grad_norm": 0.14707332849502563, "learning_rate": 1.4810553347033576e-05, "loss": 0.082, "step": 8589 }, { "epoch": 6.164847572450132, "grad_norm": 0.15091881155967712, "learning_rate": 1.4803861067224564e-05, "loss": 0.111, "step": 8590 }, { "epoch": 6.165600301091456, "grad_norm": 0.15064483880996704, "learning_rate": 1.479717318320489e-05, "loss": 0.1092, "step": 8591 }, { "epoch": 6.1663530297327815, "grad_norm": 0.12649096548557281, "learning_rate": 1.4790489695705903e-05, "loss": 0.1195, "step": 8592 }, { "epoch": 6.167105758374106, "grad_norm": 0.13556455075740814, "learning_rate": 1.4783810605458517e-05, "loss": 0.1163, "step": 8593 }, { "epoch": 6.167858487015431, "grad_norm": 0.11989344656467438, "learning_rate": 1.4777135913193132e-05, "loss": 0.0911, "step": 8594 }, { "epoch": 6.168611215656756, "grad_norm": 0.12950661778450012, "learning_rate": 1.4770465619639695e-05, "loss": 0.0699, "step": 8595 }, { "epoch": 6.16936394429808, "grad_norm": 0.14974206686019897, "learning_rate": 1.476379972552765e-05, "loss": 0.1432, "step": 8596 }, { "epoch": 6.170116672939406, "grad_norm": 0.15410451591014862, "learning_rate": 1.4757138231585967e-05, "loss": 0.0893, "step": 8597 }, { "epoch": 6.17086940158073, "grad_norm": 0.12645742297172546, "learning_rate": 1.4750481138543145e-05, "loss": 0.0446, "step": 8598 }, { "epoch": 6.171622130222055, "grad_norm": 0.14282892644405365, "learning_rate": 1.4743828447127203e-05, "loss": 0.0956, "step": 8599 }, { "epoch": 6.17237485886338, "grad_norm": 0.14834453165531158, "learning_rate": 1.4737180158065644e-05, "loss": 0.1167, "step": 8600 }, { "epoch": 6.17237485886338, "eval_loss": 0.14948692917823792, "eval_runtime": 456.7675, "eval_samples_per_second": 21.076, "eval_steps_per_second": 0.659, "step": 8600 } ], "logging_steps": 1, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 76, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.6653348537631244e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }