diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 2.2200326828799386, + "epoch": 2.664039219455926, "eval_steps": 4619, - "global_step": 23095, + "global_step": 27714, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -161780,6 +161780,32357 @@ "eval_test_samples_per_second": 12.594, "eval_test_steps_per_second": 0.787, "step": 23095 + }, + { + "epoch": 2.2201288089974045, + "grad_norm": 1.2289260625839233, + "learning_rate": 0.00014889655420059745, + "loss": 2.0948, + "step": 23096 + }, + { + "epoch": 2.220224935114871, + "grad_norm": 1.25767982006073, + "learning_rate": 0.00014888409880768223, + "loss": 2.0615, + "step": 23097 + }, + { + "epoch": 2.2203210612323367, + "grad_norm": 1.1182098388671875, + "learning_rate": 0.00014887164343317552, + "loss": 1.9915, + "step": 23098 + }, + { + "epoch": 2.220417187349803, + "grad_norm": 1.1862187385559082, + "learning_rate": 0.0001488591880771649, + "loss": 1.8894, + "step": 23099 + }, + { + "epoch": 2.220513313467269, + "grad_norm": 1.1674007177352905, + "learning_rate": 0.00014884673273973803, + "loss": 2.1106, + "step": 23100 + }, + { + "epoch": 2.2206094395847353, + "grad_norm": 1.5437126159667969, + "learning_rate": 0.00014883427742098257, + "loss": 2.1438, + "step": 23101 + }, + { + "epoch": 2.2207055657022012, + "grad_norm": 1.197005033493042, + "learning_rate": 0.0001488218221209862, + "loss": 2.1298, + "step": 23102 + }, + { + "epoch": 2.2208016918196676, + "grad_norm": 1.1193761825561523, + "learning_rate": 0.0001488093668398365, + "loss": 1.974, + "step": 23103 + }, + { + "epoch": 2.2208978179371335, + "grad_norm": 1.2357580661773682, + "learning_rate": 0.00014879691157762117, + "loss": 2.1069, + "step": 23104 + }, + { + "epoch": 2.2209939440546, + "grad_norm": 1.2137351036071777, + "learning_rate": 0.00014878445633442785, + "loss": 2.0271, + "step": 23105 + }, + { + "epoch": 2.2210900701720657, + "grad_norm": 1.1114758253097534, + "learning_rate": 0.0001487720011103442, + "loss": 1.8098, + "step": 23106 + }, + { + "epoch": 2.221186196289532, + "grad_norm": 1.110046625137329, + "learning_rate": 0.00014875954590545782, + "loss": 1.9509, + "step": 23107 + }, + { + "epoch": 2.221282322406998, + "grad_norm": 1.2123384475708008, + "learning_rate": 0.00014874709071985642, + "loss": 2.015, + "step": 23108 + }, + { + "epoch": 2.2213784485244643, + "grad_norm": 1.2503612041473389, + "learning_rate": 0.00014873463555362763, + "loss": 1.9472, + "step": 23109 + }, + { + "epoch": 2.22147457464193, + "grad_norm": 1.3220947980880737, + "learning_rate": 0.00014872218040685905, + "loss": 1.9874, + "step": 23110 + }, + { + "epoch": 2.221570700759396, + "grad_norm": 1.0503687858581543, + "learning_rate": 0.0001487097252796384, + "loss": 1.8159, + "step": 23111 + }, + { + "epoch": 2.2216668268768625, + "grad_norm": 1.0980985164642334, + "learning_rate": 0.0001486972701720533, + "loss": 1.8613, + "step": 23112 + }, + { + "epoch": 2.2217629529943284, + "grad_norm": 1.1806389093399048, + "learning_rate": 0.0001486848150841914, + "loss": 1.9349, + "step": 23113 + }, + { + "epoch": 2.2218590791117947, + "grad_norm": 1.1297290325164795, + "learning_rate": 0.00014867236001614036, + "loss": 2.0513, + "step": 23114 + }, + { + "epoch": 2.2219552052292606, + "grad_norm": 1.2105220556259155, + "learning_rate": 0.0001486599049679878, + "loss": 2.2087, + "step": 23115 + }, + { + "epoch": 2.222051331346727, + "grad_norm": 1.1228086948394775, + "learning_rate": 0.0001486474499398214, + "loss": 1.9747, + "step": 23116 + }, + { + "epoch": 2.222147457464193, + "grad_norm": 1.297219157218933, + "learning_rate": 0.00014863499493172877, + "loss": 2.075, + "step": 23117 + }, + { + "epoch": 2.222243583581659, + "grad_norm": 1.2761584520339966, + "learning_rate": 0.00014862253994379757, + "loss": 2.0475, + "step": 23118 + }, + { + "epoch": 2.222339709699125, + "grad_norm": 1.253216028213501, + "learning_rate": 0.00014861008497611547, + "loss": 1.9678, + "step": 23119 + }, + { + "epoch": 2.2224358358165914, + "grad_norm": 1.3341362476348877, + "learning_rate": 0.0001485976300287701, + "loss": 2.0798, + "step": 23120 + }, + { + "epoch": 2.2225319619340573, + "grad_norm": 1.355284333229065, + "learning_rate": 0.00014858517510184913, + "loss": 2.1685, + "step": 23121 + }, + { + "epoch": 2.2226280880515237, + "grad_norm": 1.2121514081954956, + "learning_rate": 0.00014857272019544018, + "loss": 2.0025, + "step": 23122 + }, + { + "epoch": 2.2227242141689896, + "grad_norm": 1.2691826820373535, + "learning_rate": 0.00014856026530963087, + "loss": 1.8373, + "step": 23123 + }, + { + "epoch": 2.222820340286456, + "grad_norm": 1.0543214082717896, + "learning_rate": 0.0001485478104445089, + "loss": 1.9682, + "step": 23124 + }, + { + "epoch": 2.222916466403922, + "grad_norm": 0.9963150024414062, + "learning_rate": 0.0001485353556001619, + "loss": 1.9871, + "step": 23125 + }, + { + "epoch": 2.223012592521388, + "grad_norm": 1.0819023847579956, + "learning_rate": 0.0001485229007766775, + "loss": 2.0129, + "step": 23126 + }, + { + "epoch": 2.223108718638854, + "grad_norm": 1.0381228923797607, + "learning_rate": 0.00014851044597414336, + "loss": 2.1033, + "step": 23127 + }, + { + "epoch": 2.2232048447563204, + "grad_norm": 1.3310264348983765, + "learning_rate": 0.00014849799119264712, + "loss": 1.9166, + "step": 23128 + }, + { + "epoch": 2.2233009708737863, + "grad_norm": 1.1208231449127197, + "learning_rate": 0.00014848553643227646, + "loss": 2.0689, + "step": 23129 + }, + { + "epoch": 2.2233970969912527, + "grad_norm": 1.096488118171692, + "learning_rate": 0.000148473081693119, + "loss": 2.1046, + "step": 23130 + }, + { + "epoch": 2.2234932231087186, + "grad_norm": 1.2517200708389282, + "learning_rate": 0.00014846062697526234, + "loss": 2.0594, + "step": 23131 + }, + { + "epoch": 2.223589349226185, + "grad_norm": 1.189373254776001, + "learning_rate": 0.00014844817227879423, + "loss": 2.1253, + "step": 23132 + }, + { + "epoch": 2.223685475343651, + "grad_norm": 1.179700493812561, + "learning_rate": 0.0001484357176038022, + "loss": 1.952, + "step": 23133 + }, + { + "epoch": 2.223781601461117, + "grad_norm": 1.159516453742981, + "learning_rate": 0.000148423262950374, + "loss": 2.0899, + "step": 23134 + }, + { + "epoch": 2.223877727578583, + "grad_norm": 1.0558598041534424, + "learning_rate": 0.0001484108083185972, + "loss": 1.9433, + "step": 23135 + }, + { + "epoch": 2.2239738536960494, + "grad_norm": 1.2737147808074951, + "learning_rate": 0.00014839835370855943, + "loss": 2.0237, + "step": 23136 + }, + { + "epoch": 2.2240699798135153, + "grad_norm": 1.2537875175476074, + "learning_rate": 0.00014838589912034844, + "loss": 2.1312, + "step": 23137 + }, + { + "epoch": 2.2241661059309816, + "grad_norm": 1.246654987335205, + "learning_rate": 0.00014837344455405178, + "loss": 2.073, + "step": 23138 + }, + { + "epoch": 2.2242622320484475, + "grad_norm": 1.1373391151428223, + "learning_rate": 0.0001483609900097571, + "loss": 1.9863, + "step": 23139 + }, + { + "epoch": 2.224358358165914, + "grad_norm": 1.0428845882415771, + "learning_rate": 0.0001483485354875521, + "loss": 1.9263, + "step": 23140 + }, + { + "epoch": 2.2244544842833798, + "grad_norm": 1.0466828346252441, + "learning_rate": 0.0001483360809875244, + "loss": 1.9528, + "step": 23141 + }, + { + "epoch": 2.224550610400846, + "grad_norm": 1.14361572265625, + "learning_rate": 0.00014832362650976164, + "loss": 1.9745, + "step": 23142 + }, + { + "epoch": 2.224646736518312, + "grad_norm": 1.0624191761016846, + "learning_rate": 0.00014831117205435144, + "loss": 1.9129, + "step": 23143 + }, + { + "epoch": 2.224742862635778, + "grad_norm": 1.2698240280151367, + "learning_rate": 0.00014829871762138142, + "loss": 2.0479, + "step": 23144 + }, + { + "epoch": 2.2248389887532443, + "grad_norm": 1.1031417846679688, + "learning_rate": 0.00014828626321093931, + "loss": 1.9509, + "step": 23145 + }, + { + "epoch": 2.2249351148707106, + "grad_norm": 1.05763840675354, + "learning_rate": 0.00014827380882311274, + "loss": 1.9904, + "step": 23146 + }, + { + "epoch": 2.2250312409881765, + "grad_norm": 1.2112001180648804, + "learning_rate": 0.00014826135445798928, + "loss": 1.9358, + "step": 23147 + }, + { + "epoch": 2.2251273671056424, + "grad_norm": 1.0487934350967407, + "learning_rate": 0.00014824890011565665, + "loss": 1.8839, + "step": 23148 + }, + { + "epoch": 2.2252234932231088, + "grad_norm": 1.330257534980774, + "learning_rate": 0.00014823644579620244, + "loss": 2.1113, + "step": 23149 + }, + { + "epoch": 2.2253196193405747, + "grad_norm": 1.0720261335372925, + "learning_rate": 0.00014822399149971432, + "loss": 1.9918, + "step": 23150 + }, + { + "epoch": 2.225415745458041, + "grad_norm": 1.01948082447052, + "learning_rate": 0.00014821153722627994, + "loss": 1.7305, + "step": 23151 + }, + { + "epoch": 2.225511871575507, + "grad_norm": 1.174883246421814, + "learning_rate": 0.0001481990829759869, + "loss": 2.0211, + "step": 23152 + }, + { + "epoch": 2.2256079976929732, + "grad_norm": 1.2989487648010254, + "learning_rate": 0.00014818662874892287, + "loss": 2.1433, + "step": 23153 + }, + { + "epoch": 2.225704123810439, + "grad_norm": 1.3510886430740356, + "learning_rate": 0.00014817417454517554, + "loss": 2.2713, + "step": 23154 + }, + { + "epoch": 2.2258002499279055, + "grad_norm": 1.203913688659668, + "learning_rate": 0.00014816172036483243, + "loss": 2.2812, + "step": 23155 + }, + { + "epoch": 2.2258963760453714, + "grad_norm": 1.0066354274749756, + "learning_rate": 0.00014814926620798134, + "loss": 1.8249, + "step": 23156 + }, + { + "epoch": 2.2259925021628377, + "grad_norm": 1.2273527383804321, + "learning_rate": 0.00014813681207470977, + "loss": 2.0769, + "step": 23157 + }, + { + "epoch": 2.2260886282803036, + "grad_norm": 1.147215485572815, + "learning_rate": 0.00014812435796510544, + "loss": 1.8851, + "step": 23158 + }, + { + "epoch": 2.22618475439777, + "grad_norm": 1.1961218118667603, + "learning_rate": 0.00014811190387925595, + "loss": 1.9695, + "step": 23159 + }, + { + "epoch": 2.226280880515236, + "grad_norm": 1.137317180633545, + "learning_rate": 0.000148099449817249, + "loss": 1.9671, + "step": 23160 + }, + { + "epoch": 2.226377006632702, + "grad_norm": 1.0834847688674927, + "learning_rate": 0.00014808699577917214, + "loss": 2.0418, + "step": 23161 + }, + { + "epoch": 2.226473132750168, + "grad_norm": 1.0756779909133911, + "learning_rate": 0.0001480745417651131, + "loss": 2.0844, + "step": 23162 + }, + { + "epoch": 2.2265692588676345, + "grad_norm": 1.2914022207260132, + "learning_rate": 0.00014806208777515948, + "loss": 2.1539, + "step": 23163 + }, + { + "epoch": 2.2266653849851004, + "grad_norm": 1.1195894479751587, + "learning_rate": 0.0001480496338093989, + "loss": 1.9752, + "step": 23164 + }, + { + "epoch": 2.2267615111025667, + "grad_norm": 1.0860601663589478, + "learning_rate": 0.00014803717986791907, + "loss": 2.1121, + "step": 23165 + }, + { + "epoch": 2.2268576372200326, + "grad_norm": 1.2239570617675781, + "learning_rate": 0.00014802472595080754, + "loss": 2.045, + "step": 23166 + }, + { + "epoch": 2.226953763337499, + "grad_norm": 1.2230579853057861, + "learning_rate": 0.00014801227205815204, + "loss": 1.8071, + "step": 23167 + }, + { + "epoch": 2.227049889454965, + "grad_norm": 1.1077196598052979, + "learning_rate": 0.0001479998181900401, + "loss": 2.0312, + "step": 23168 + }, + { + "epoch": 2.227146015572431, + "grad_norm": 1.1356292963027954, + "learning_rate": 0.0001479873643465595, + "loss": 1.9987, + "step": 23169 + }, + { + "epoch": 2.227242141689897, + "grad_norm": 1.086314082145691, + "learning_rate": 0.0001479749105277978, + "loss": 1.8092, + "step": 23170 + }, + { + "epoch": 2.2273382678073634, + "grad_norm": 1.0311806201934814, + "learning_rate": 0.00014796245673384258, + "loss": 2.0571, + "step": 23171 + }, + { + "epoch": 2.2274343939248293, + "grad_norm": 1.1245982646942139, + "learning_rate": 0.0001479500029647816, + "loss": 1.997, + "step": 23172 + }, + { + "epoch": 2.2275305200422957, + "grad_norm": 1.2022203207015991, + "learning_rate": 0.0001479375492207024, + "loss": 2.042, + "step": 23173 + }, + { + "epoch": 2.2276266461597616, + "grad_norm": 1.088234543800354, + "learning_rate": 0.00014792509550169266, + "loss": 1.7817, + "step": 23174 + }, + { + "epoch": 2.227722772277228, + "grad_norm": 1.1292788982391357, + "learning_rate": 0.00014791264180784004, + "loss": 1.9964, + "step": 23175 + }, + { + "epoch": 2.227818898394694, + "grad_norm": 1.2632256746292114, + "learning_rate": 0.00014790018813923216, + "loss": 2.1252, + "step": 23176 + }, + { + "epoch": 2.2279150245121597, + "grad_norm": 1.2529962062835693, + "learning_rate": 0.00014788773449595665, + "loss": 2.0558, + "step": 23177 + }, + { + "epoch": 2.228011150629626, + "grad_norm": 1.1343401670455933, + "learning_rate": 0.0001478752808781011, + "loss": 2.0516, + "step": 23178 + }, + { + "epoch": 2.2281072767470924, + "grad_norm": 1.1699351072311401, + "learning_rate": 0.00014786282728575326, + "loss": 2.0022, + "step": 23179 + }, + { + "epoch": 2.2282034028645583, + "grad_norm": 1.3437761068344116, + "learning_rate": 0.0001478503737190007, + "loss": 2.0536, + "step": 23180 + }, + { + "epoch": 2.228299528982024, + "grad_norm": 1.2631045579910278, + "learning_rate": 0.00014783792017793106, + "loss": 1.9142, + "step": 23181 + }, + { + "epoch": 2.2283956550994906, + "grad_norm": 1.2900323867797852, + "learning_rate": 0.000147825466662632, + "loss": 2.0348, + "step": 23182 + }, + { + "epoch": 2.2284917812169565, + "grad_norm": 1.1018218994140625, + "learning_rate": 0.00014781301317319114, + "loss": 1.9162, + "step": 23183 + }, + { + "epoch": 2.228587907334423, + "grad_norm": 1.2263661623001099, + "learning_rate": 0.00014780055970969609, + "loss": 2.1722, + "step": 23184 + }, + { + "epoch": 2.2286840334518887, + "grad_norm": 1.1914489269256592, + "learning_rate": 0.00014778810627223452, + "loss": 1.9996, + "step": 23185 + }, + { + "epoch": 2.228780159569355, + "grad_norm": 1.2075393199920654, + "learning_rate": 0.0001477756528608941, + "loss": 1.9407, + "step": 23186 + }, + { + "epoch": 2.228876285686821, + "grad_norm": 1.2403346300125122, + "learning_rate": 0.00014776319947576235, + "loss": 2.166, + "step": 23187 + }, + { + "epoch": 2.2289724118042873, + "grad_norm": 1.2262964248657227, + "learning_rate": 0.00014775074611692707, + "loss": 2.1463, + "step": 23188 + }, + { + "epoch": 2.229068537921753, + "grad_norm": 1.0531160831451416, + "learning_rate": 0.00014773829278447574, + "loss": 1.9404, + "step": 23189 + }, + { + "epoch": 2.2291646640392195, + "grad_norm": 1.1305592060089111, + "learning_rate": 0.00014772583947849608, + "loss": 2.109, + "step": 23190 + }, + { + "epoch": 2.2292607901566854, + "grad_norm": 1.3341619968414307, + "learning_rate": 0.00014771338619907573, + "loss": 2.0245, + "step": 23191 + }, + { + "epoch": 2.229356916274152, + "grad_norm": 1.1118724346160889, + "learning_rate": 0.0001477009329463023, + "loss": 1.7475, + "step": 23192 + }, + { + "epoch": 2.2294530423916177, + "grad_norm": 1.1970878839492798, + "learning_rate": 0.00014768847972026344, + "loss": 2.0021, + "step": 23193 + }, + { + "epoch": 2.229549168509084, + "grad_norm": 1.192739725112915, + "learning_rate": 0.00014767602652104677, + "loss": 1.9832, + "step": 23194 + }, + { + "epoch": 2.22964529462655, + "grad_norm": 1.1909900903701782, + "learning_rate": 0.00014766357334873988, + "loss": 2.0509, + "step": 23195 + }, + { + "epoch": 2.2297414207440163, + "grad_norm": 1.2855749130249023, + "learning_rate": 0.00014765112020343052, + "loss": 2.1628, + "step": 23196 + }, + { + "epoch": 2.229837546861482, + "grad_norm": 1.0222282409667969, + "learning_rate": 0.00014763866708520623, + "loss": 1.8914, + "step": 23197 + }, + { + "epoch": 2.2299336729789485, + "grad_norm": 1.258551836013794, + "learning_rate": 0.00014762621399415468, + "loss": 2.1579, + "step": 23198 + }, + { + "epoch": 2.2300297990964144, + "grad_norm": 1.078570008277893, + "learning_rate": 0.0001476137609303635, + "loss": 1.8169, + "step": 23199 + }, + { + "epoch": 2.2301259252138808, + "grad_norm": 1.0183234214782715, + "learning_rate": 0.00014760130789392035, + "loss": 1.8696, + "step": 23200 + }, + { + "epoch": 2.2302220513313467, + "grad_norm": 0.9986048936843872, + "learning_rate": 0.00014758885488491282, + "loss": 1.8832, + "step": 23201 + }, + { + "epoch": 2.230318177448813, + "grad_norm": 1.1771880388259888, + "learning_rate": 0.00014757640190342854, + "loss": 1.9985, + "step": 23202 + }, + { + "epoch": 2.230414303566279, + "grad_norm": 1.1162759065628052, + "learning_rate": 0.0001475639489495552, + "loss": 1.9592, + "step": 23203 + }, + { + "epoch": 2.2305104296837452, + "grad_norm": 1.2306691408157349, + "learning_rate": 0.00014755149602338035, + "loss": 2.0767, + "step": 23204 + }, + { + "epoch": 2.230606555801211, + "grad_norm": 1.267899513244629, + "learning_rate": 0.0001475390431249917, + "loss": 1.9976, + "step": 23205 + }, + { + "epoch": 2.2307026819186775, + "grad_norm": 1.181861400604248, + "learning_rate": 0.00014752659025447684, + "loss": 1.9718, + "step": 23206 + }, + { + "epoch": 2.2307988080361434, + "grad_norm": 1.2072792053222656, + "learning_rate": 0.00014751413741192343, + "loss": 2.1472, + "step": 23207 + }, + { + "epoch": 2.2308949341536097, + "grad_norm": 1.0922715663909912, + "learning_rate": 0.0001475016845974191, + "loss": 1.9401, + "step": 23208 + }, + { + "epoch": 2.2309910602710756, + "grad_norm": 1.0257409811019897, + "learning_rate": 0.00014748923181105144, + "loss": 1.9568, + "step": 23209 + }, + { + "epoch": 2.2310871863885415, + "grad_norm": 1.1800158023834229, + "learning_rate": 0.0001474767790529081, + "loss": 1.9348, + "step": 23210 + }, + { + "epoch": 2.231183312506008, + "grad_norm": 1.0916656255722046, + "learning_rate": 0.00014746432632307675, + "loss": 2.0516, + "step": 23211 + }, + { + "epoch": 2.231279438623474, + "grad_norm": 1.0566924810409546, + "learning_rate": 0.00014745187362164494, + "loss": 1.8488, + "step": 23212 + }, + { + "epoch": 2.23137556474094, + "grad_norm": 1.0649816989898682, + "learning_rate": 0.00014743942094870043, + "loss": 2.0725, + "step": 23213 + }, + { + "epoch": 2.231471690858406, + "grad_norm": 1.1806011199951172, + "learning_rate": 0.00014742696830433074, + "loss": 2.1313, + "step": 23214 + }, + { + "epoch": 2.2315678169758724, + "grad_norm": 1.3395910263061523, + "learning_rate": 0.00014741451568862354, + "loss": 2.1089, + "step": 23215 + }, + { + "epoch": 2.2316639430933383, + "grad_norm": 1.1982661485671997, + "learning_rate": 0.00014740206310166648, + "loss": 1.863, + "step": 23216 + }, + { + "epoch": 2.2317600692108046, + "grad_norm": 1.1686204671859741, + "learning_rate": 0.00014738961054354716, + "loss": 1.9334, + "step": 23217 + }, + { + "epoch": 2.2318561953282705, + "grad_norm": 1.1203278303146362, + "learning_rate": 0.00014737715801435322, + "loss": 1.8796, + "step": 23218 + }, + { + "epoch": 2.231952321445737, + "grad_norm": 1.3034169673919678, + "learning_rate": 0.0001473647055141723, + "loss": 2.1418, + "step": 23219 + }, + { + "epoch": 2.2320484475632028, + "grad_norm": 1.118516206741333, + "learning_rate": 0.000147352253043092, + "loss": 1.8825, + "step": 23220 + }, + { + "epoch": 2.232144573680669, + "grad_norm": 1.2281477451324463, + "learning_rate": 0.00014733980060119997, + "loss": 1.9305, + "step": 23221 + }, + { + "epoch": 2.232240699798135, + "grad_norm": 1.0916095972061157, + "learning_rate": 0.00014732734818858386, + "loss": 2.0556, + "step": 23222 + }, + { + "epoch": 2.2323368259156013, + "grad_norm": 1.0986061096191406, + "learning_rate": 0.0001473148958053313, + "loss": 1.9829, + "step": 23223 + }, + { + "epoch": 2.2324329520330672, + "grad_norm": 1.2147090435028076, + "learning_rate": 0.00014730244345152982, + "loss": 2.0333, + "step": 23224 + }, + { + "epoch": 2.2325290781505336, + "grad_norm": 1.081787109375, + "learning_rate": 0.0001472899911272672, + "loss": 1.9854, + "step": 23225 + }, + { + "epoch": 2.2326252042679995, + "grad_norm": 1.2197569608688354, + "learning_rate": 0.00014727753883263096, + "loss": 2.1647, + "step": 23226 + }, + { + "epoch": 2.232721330385466, + "grad_norm": 1.224462866783142, + "learning_rate": 0.00014726508656770878, + "loss": 2.1578, + "step": 23227 + }, + { + "epoch": 2.2328174565029317, + "grad_norm": 1.1800063848495483, + "learning_rate": 0.00014725263433258825, + "loss": 2.2041, + "step": 23228 + }, + { + "epoch": 2.232913582620398, + "grad_norm": 1.2080717086791992, + "learning_rate": 0.000147240182127357, + "loss": 1.9379, + "step": 23229 + }, + { + "epoch": 2.233009708737864, + "grad_norm": 1.1251354217529297, + "learning_rate": 0.00014722772995210274, + "loss": 1.9606, + "step": 23230 + }, + { + "epoch": 2.2331058348553303, + "grad_norm": 1.1390786170959473, + "learning_rate": 0.000147215277806913, + "loss": 1.9864, + "step": 23231 + }, + { + "epoch": 2.233201960972796, + "grad_norm": 1.127300500869751, + "learning_rate": 0.00014720282569187545, + "loss": 2.0566, + "step": 23232 + }, + { + "epoch": 2.2332980870902626, + "grad_norm": 1.1711199283599854, + "learning_rate": 0.0001471903736070777, + "loss": 1.9602, + "step": 23233 + }, + { + "epoch": 2.2333942132077285, + "grad_norm": 0.9705294370651245, + "learning_rate": 0.0001471779215526074, + "loss": 1.8768, + "step": 23234 + }, + { + "epoch": 2.233490339325195, + "grad_norm": 1.0822113752365112, + "learning_rate": 0.00014716546952855218, + "loss": 1.9129, + "step": 23235 + }, + { + "epoch": 2.2335864654426607, + "grad_norm": 1.194074273109436, + "learning_rate": 0.00014715301753499965, + "loss": 1.9758, + "step": 23236 + }, + { + "epoch": 2.233682591560127, + "grad_norm": 1.0145999193191528, + "learning_rate": 0.00014714056557203742, + "loss": 1.8765, + "step": 23237 + }, + { + "epoch": 2.233778717677593, + "grad_norm": 0.9851272106170654, + "learning_rate": 0.00014712811363975313, + "loss": 1.7854, + "step": 23238 + }, + { + "epoch": 2.2338748437950593, + "grad_norm": 1.3133139610290527, + "learning_rate": 0.00014711566173823443, + "loss": 1.9364, + "step": 23239 + }, + { + "epoch": 2.233970969912525, + "grad_norm": 1.098207712173462, + "learning_rate": 0.00014710320986756892, + "loss": 1.7524, + "step": 23240 + }, + { + "epoch": 2.2340670960299915, + "grad_norm": 1.1476309299468994, + "learning_rate": 0.00014709075802784425, + "loss": 2.0092, + "step": 23241 + }, + { + "epoch": 2.2341632221474574, + "grad_norm": 1.2774369716644287, + "learning_rate": 0.00014707830621914797, + "loss": 2.0378, + "step": 23242 + }, + { + "epoch": 2.234259348264924, + "grad_norm": 1.0783722400665283, + "learning_rate": 0.0001470658544415678, + "loss": 2.1892, + "step": 23243 + }, + { + "epoch": 2.2343554743823897, + "grad_norm": 1.10512113571167, + "learning_rate": 0.00014705340269519134, + "loss": 1.8602, + "step": 23244 + }, + { + "epoch": 2.234451600499856, + "grad_norm": 1.2060949802398682, + "learning_rate": 0.0001470409509801062, + "loss": 2.0626, + "step": 23245 + }, + { + "epoch": 2.234547726617322, + "grad_norm": 1.096009373664856, + "learning_rate": 0.00014702849929639994, + "loss": 2.217, + "step": 23246 + }, + { + "epoch": 2.234643852734788, + "grad_norm": 1.1068886518478394, + "learning_rate": 0.0001470160476441603, + "loss": 1.9248, + "step": 23247 + }, + { + "epoch": 2.234739978852254, + "grad_norm": 1.099102258682251, + "learning_rate": 0.00014700359602347484, + "loss": 1.9441, + "step": 23248 + }, + { + "epoch": 2.23483610496972, + "grad_norm": 1.1632148027420044, + "learning_rate": 0.00014699114443443123, + "loss": 1.9617, + "step": 23249 + }, + { + "epoch": 2.2349322310871864, + "grad_norm": 1.328458547592163, + "learning_rate": 0.000146978692877117, + "loss": 2.0593, + "step": 23250 + }, + { + "epoch": 2.2350283572046523, + "grad_norm": 1.127529263496399, + "learning_rate": 0.00014696624135161993, + "loss": 2.0453, + "step": 23251 + }, + { + "epoch": 2.2351244833221187, + "grad_norm": 1.1320525407791138, + "learning_rate": 0.00014695378985802747, + "loss": 1.9693, + "step": 23252 + }, + { + "epoch": 2.2352206094395846, + "grad_norm": 1.1342779397964478, + "learning_rate": 0.00014694133839642735, + "loss": 2.0241, + "step": 23253 + }, + { + "epoch": 2.235316735557051, + "grad_norm": 1.0816829204559326, + "learning_rate": 0.00014692888696690717, + "loss": 1.9468, + "step": 23254 + }, + { + "epoch": 2.235412861674517, + "grad_norm": 1.1087799072265625, + "learning_rate": 0.00014691643556955452, + "loss": 1.928, + "step": 23255 + }, + { + "epoch": 2.235508987791983, + "grad_norm": 1.0315123796463013, + "learning_rate": 0.00014690398420445707, + "loss": 1.8777, + "step": 23256 + }, + { + "epoch": 2.235605113909449, + "grad_norm": 1.1693220138549805, + "learning_rate": 0.0001468915328717024, + "loss": 2.0681, + "step": 23257 + }, + { + "epoch": 2.2357012400269154, + "grad_norm": 1.1247073411941528, + "learning_rate": 0.00014687908157137817, + "loss": 2.0004, + "step": 23258 + }, + { + "epoch": 2.2357973661443813, + "grad_norm": 1.0918998718261719, + "learning_rate": 0.000146866630303572, + "loss": 1.9948, + "step": 23259 + }, + { + "epoch": 2.2358934922618476, + "grad_norm": 1.0754730701446533, + "learning_rate": 0.00014685417906837146, + "loss": 1.9349, + "step": 23260 + }, + { + "epoch": 2.2359896183793135, + "grad_norm": 1.1693116426467896, + "learning_rate": 0.00014684172786586422, + "loss": 1.9975, + "step": 23261 + }, + { + "epoch": 2.23608574449678, + "grad_norm": 1.1855565309524536, + "learning_rate": 0.00014682927669613788, + "loss": 2.1306, + "step": 23262 + }, + { + "epoch": 2.236181870614246, + "grad_norm": 1.0366910696029663, + "learning_rate": 0.00014681682555928003, + "loss": 1.7286, + "step": 23263 + }, + { + "epoch": 2.236277996731712, + "grad_norm": 1.3208199739456177, + "learning_rate": 0.00014680437445537838, + "loss": 1.8862, + "step": 23264 + }, + { + "epoch": 2.236374122849178, + "grad_norm": 1.1112537384033203, + "learning_rate": 0.00014679192338452047, + "loss": 1.943, + "step": 23265 + }, + { + "epoch": 2.2364702489666444, + "grad_norm": 1.0661693811416626, + "learning_rate": 0.000146779472346794, + "loss": 2.0307, + "step": 23266 + }, + { + "epoch": 2.2365663750841103, + "grad_norm": 1.2065215110778809, + "learning_rate": 0.00014676702134228649, + "loss": 1.9808, + "step": 23267 + }, + { + "epoch": 2.2366625012015766, + "grad_norm": 1.2778998613357544, + "learning_rate": 0.0001467545703710856, + "loss": 2.1093, + "step": 23268 + }, + { + "epoch": 2.2367586273190425, + "grad_norm": 1.291813850402832, + "learning_rate": 0.00014674211943327897, + "loss": 2.0763, + "step": 23269 + }, + { + "epoch": 2.236854753436509, + "grad_norm": 1.1047017574310303, + "learning_rate": 0.0001467296685289542, + "loss": 2.0882, + "step": 23270 + }, + { + "epoch": 2.2369508795539748, + "grad_norm": 0.9805199503898621, + "learning_rate": 0.00014671721765819893, + "loss": 1.756, + "step": 23271 + }, + { + "epoch": 2.237047005671441, + "grad_norm": 1.1998927593231201, + "learning_rate": 0.00014670476682110075, + "loss": 1.9275, + "step": 23272 + }, + { + "epoch": 2.237143131788907, + "grad_norm": 1.1393954753875732, + "learning_rate": 0.00014669231601774731, + "loss": 1.9776, + "step": 23273 + }, + { + "epoch": 2.2372392579063733, + "grad_norm": 1.2159018516540527, + "learning_rate": 0.00014667986524822619, + "loss": 2.1036, + "step": 23274 + }, + { + "epoch": 2.2373353840238392, + "grad_norm": 1.299583077430725, + "learning_rate": 0.000146667414512625, + "loss": 2.1733, + "step": 23275 + }, + { + "epoch": 2.2374315101413056, + "grad_norm": 1.1138476133346558, + "learning_rate": 0.00014665496381103141, + "loss": 1.8652, + "step": 23276 + }, + { + "epoch": 2.2375276362587715, + "grad_norm": 1.114888310432434, + "learning_rate": 0.00014664251314353303, + "loss": 2.2209, + "step": 23277 + }, + { + "epoch": 2.237623762376238, + "grad_norm": 1.0481441020965576, + "learning_rate": 0.00014663006251021743, + "loss": 1.8764, + "step": 23278 + }, + { + "epoch": 2.2377198884937037, + "grad_norm": 1.336897611618042, + "learning_rate": 0.00014661761191117227, + "loss": 2.0552, + "step": 23279 + }, + { + "epoch": 2.2378160146111696, + "grad_norm": 1.120873212814331, + "learning_rate": 0.0001466051613464851, + "loss": 1.9602, + "step": 23280 + }, + { + "epoch": 2.237912140728636, + "grad_norm": 1.2822917699813843, + "learning_rate": 0.00014659271081624364, + "loss": 2.0042, + "step": 23281 + }, + { + "epoch": 2.238008266846102, + "grad_norm": 1.0700294971466064, + "learning_rate": 0.00014658026032053544, + "loss": 2.0869, + "step": 23282 + }, + { + "epoch": 2.2381043929635682, + "grad_norm": 1.0648829936981201, + "learning_rate": 0.00014656780985944815, + "loss": 1.9088, + "step": 23283 + }, + { + "epoch": 2.238200519081034, + "grad_norm": 1.3378663063049316, + "learning_rate": 0.00014655535943306934, + "loss": 2.0515, + "step": 23284 + }, + { + "epoch": 2.2382966451985005, + "grad_norm": 1.0532718896865845, + "learning_rate": 0.00014654290904148666, + "loss": 2.1419, + "step": 23285 + }, + { + "epoch": 2.2383927713159664, + "grad_norm": 1.2880336046218872, + "learning_rate": 0.0001465304586847877, + "loss": 2.1328, + "step": 23286 + }, + { + "epoch": 2.2384888974334327, + "grad_norm": 1.3492331504821777, + "learning_rate": 0.00014651800836306013, + "loss": 2.1556, + "step": 23287 + }, + { + "epoch": 2.2385850235508986, + "grad_norm": 1.3065882921218872, + "learning_rate": 0.00014650555807639153, + "loss": 2.0385, + "step": 23288 + }, + { + "epoch": 2.238681149668365, + "grad_norm": 1.0780458450317383, + "learning_rate": 0.00014649310782486948, + "loss": 2.0649, + "step": 23289 + }, + { + "epoch": 2.238777275785831, + "grad_norm": 1.2690455913543701, + "learning_rate": 0.00014648065760858162, + "loss": 2.0111, + "step": 23290 + }, + { + "epoch": 2.238873401903297, + "grad_norm": 1.3147907257080078, + "learning_rate": 0.00014646820742761558, + "loss": 2.1179, + "step": 23291 + }, + { + "epoch": 2.238969528020763, + "grad_norm": 1.1192526817321777, + "learning_rate": 0.00014645575728205895, + "loss": 1.9624, + "step": 23292 + }, + { + "epoch": 2.2390656541382294, + "grad_norm": 0.9873660802841187, + "learning_rate": 0.00014644330717199938, + "loss": 2.0057, + "step": 23293 + }, + { + "epoch": 2.2391617802556953, + "grad_norm": 1.019605040550232, + "learning_rate": 0.00014643085709752443, + "loss": 1.9188, + "step": 23294 + }, + { + "epoch": 2.2392579063731617, + "grad_norm": 1.2720363140106201, + "learning_rate": 0.00014641840705872174, + "loss": 2.0712, + "step": 23295 + }, + { + "epoch": 2.2393540324906276, + "grad_norm": 1.0199370384216309, + "learning_rate": 0.00014640595705567893, + "loss": 1.9883, + "step": 23296 + }, + { + "epoch": 2.239450158608094, + "grad_norm": 1.0001022815704346, + "learning_rate": 0.0001463935070884836, + "loss": 2.0015, + "step": 23297 + }, + { + "epoch": 2.23954628472556, + "grad_norm": 1.129944920539856, + "learning_rate": 0.00014638105715722338, + "loss": 1.9259, + "step": 23298 + }, + { + "epoch": 2.239642410843026, + "grad_norm": 1.140237808227539, + "learning_rate": 0.00014636860726198586, + "loss": 1.9599, + "step": 23299 + }, + { + "epoch": 2.239738536960492, + "grad_norm": 1.1574989557266235, + "learning_rate": 0.00014635615740285867, + "loss": 2.0629, + "step": 23300 + }, + { + "epoch": 2.2398346630779584, + "grad_norm": 1.154046893119812, + "learning_rate": 0.00014634370757992941, + "loss": 1.9893, + "step": 23301 + }, + { + "epoch": 2.2399307891954243, + "grad_norm": 1.2808555364608765, + "learning_rate": 0.0001463312577932857, + "loss": 1.9488, + "step": 23302 + }, + { + "epoch": 2.2400269153128907, + "grad_norm": 1.2022933959960938, + "learning_rate": 0.00014631880804301514, + "loss": 2.0493, + "step": 23303 + }, + { + "epoch": 2.2401230414303566, + "grad_norm": 1.2874759435653687, + "learning_rate": 0.00014630635832920533, + "loss": 2.0217, + "step": 23304 + }, + { + "epoch": 2.240219167547823, + "grad_norm": 1.0527184009552002, + "learning_rate": 0.00014629390865194395, + "loss": 1.8866, + "step": 23305 + }, + { + "epoch": 2.240315293665289, + "grad_norm": 1.0455541610717773, + "learning_rate": 0.00014628145901131851, + "loss": 1.9357, + "step": 23306 + }, + { + "epoch": 2.240411419782755, + "grad_norm": 1.292089819908142, + "learning_rate": 0.00014626900940741666, + "loss": 1.9187, + "step": 23307 + }, + { + "epoch": 2.240507545900221, + "grad_norm": 0.9804590940475464, + "learning_rate": 0.00014625655984032602, + "loss": 1.944, + "step": 23308 + }, + { + "epoch": 2.2406036720176874, + "grad_norm": 1.308350682258606, + "learning_rate": 0.00014624411031013423, + "loss": 2.0687, + "step": 23309 + }, + { + "epoch": 2.2406997981351533, + "grad_norm": 1.2135401964187622, + "learning_rate": 0.00014623166081692885, + "loss": 2.0742, + "step": 23310 + }, + { + "epoch": 2.2407959242526196, + "grad_norm": 1.1865040063858032, + "learning_rate": 0.00014621921136079747, + "loss": 2.1328, + "step": 23311 + }, + { + "epoch": 2.2408920503700855, + "grad_norm": 1.218776822090149, + "learning_rate": 0.0001462067619418278, + "loss": 2.1287, + "step": 23312 + }, + { + "epoch": 2.2409881764875514, + "grad_norm": 1.2592562437057495, + "learning_rate": 0.00014619431256010732, + "loss": 2.0502, + "step": 23313 + }, + { + "epoch": 2.241084302605018, + "grad_norm": 1.067262053489685, + "learning_rate": 0.00014618186321572368, + "loss": 2.0695, + "step": 23314 + }, + { + "epoch": 2.241180428722484, + "grad_norm": 1.2529908418655396, + "learning_rate": 0.00014616941390876452, + "loss": 2.1468, + "step": 23315 + }, + { + "epoch": 2.24127655483995, + "grad_norm": 1.2314903736114502, + "learning_rate": 0.00014615696463931744, + "loss": 1.8787, + "step": 23316 + }, + { + "epoch": 2.241372680957416, + "grad_norm": 1.2205878496170044, + "learning_rate": 0.00014614451540747006, + "loss": 2.1085, + "step": 23317 + }, + { + "epoch": 2.2414688070748823, + "grad_norm": 1.2279969453811646, + "learning_rate": 0.00014613206621330997, + "loss": 1.9192, + "step": 23318 + }, + { + "epoch": 2.241564933192348, + "grad_norm": 1.407014012336731, + "learning_rate": 0.00014611961705692474, + "loss": 2.0356, + "step": 23319 + }, + { + "epoch": 2.2416610593098145, + "grad_norm": 1.2914294004440308, + "learning_rate": 0.00014610716793840204, + "loss": 2.0642, + "step": 23320 + }, + { + "epoch": 2.2417571854272804, + "grad_norm": 1.1957212686538696, + "learning_rate": 0.00014609471885782945, + "loss": 2.0972, + "step": 23321 + }, + { + "epoch": 2.2418533115447468, + "grad_norm": 1.1997987031936646, + "learning_rate": 0.00014608226981529455, + "loss": 1.9167, + "step": 23322 + }, + { + "epoch": 2.2419494376622127, + "grad_norm": 1.0924228429794312, + "learning_rate": 0.00014606982081088495, + "loss": 2.0214, + "step": 23323 + }, + { + "epoch": 2.242045563779679, + "grad_norm": 1.1846270561218262, + "learning_rate": 0.00014605737184468832, + "loss": 2.0064, + "step": 23324 + }, + { + "epoch": 2.242141689897145, + "grad_norm": 1.5566816329956055, + "learning_rate": 0.00014604492291679218, + "loss": 2.1595, + "step": 23325 + }, + { + "epoch": 2.2422378160146113, + "grad_norm": 1.2715805768966675, + "learning_rate": 0.0001460324740272842, + "loss": 2.1354, + "step": 23326 + }, + { + "epoch": 2.242333942132077, + "grad_norm": 1.047028660774231, + "learning_rate": 0.00014602002517625197, + "loss": 1.8255, + "step": 23327 + }, + { + "epoch": 2.2424300682495435, + "grad_norm": 1.1392791271209717, + "learning_rate": 0.00014600757636378306, + "loss": 1.9819, + "step": 23328 + }, + { + "epoch": 2.2425261943670094, + "grad_norm": 1.1008809804916382, + "learning_rate": 0.0001459951275899651, + "loss": 2.0868, + "step": 23329 + }, + { + "epoch": 2.2426223204844757, + "grad_norm": 1.0877381563186646, + "learning_rate": 0.00014598267885488568, + "loss": 1.7806, + "step": 23330 + }, + { + "epoch": 2.2427184466019416, + "grad_norm": 1.259210467338562, + "learning_rate": 0.00014597023015863237, + "loss": 1.8919, + "step": 23331 + }, + { + "epoch": 2.242814572719408, + "grad_norm": 1.1820696592330933, + "learning_rate": 0.00014595778150129288, + "loss": 2.0575, + "step": 23332 + }, + { + "epoch": 2.242910698836874, + "grad_norm": 1.0681283473968506, + "learning_rate": 0.00014594533288295476, + "loss": 2.0124, + "step": 23333 + }, + { + "epoch": 2.2430068249543402, + "grad_norm": 1.137684941291809, + "learning_rate": 0.00014593288430370555, + "loss": 1.8986, + "step": 23334 + }, + { + "epoch": 2.243102951071806, + "grad_norm": 0.9996077418327332, + "learning_rate": 0.00014592043576363295, + "loss": 1.956, + "step": 23335 + }, + { + "epoch": 2.2431990771892725, + "grad_norm": 1.167482852935791, + "learning_rate": 0.0001459079872628245, + "loss": 2.0309, + "step": 23336 + }, + { + "epoch": 2.2432952033067384, + "grad_norm": 1.1520934104919434, + "learning_rate": 0.0001458955388013678, + "loss": 1.9598, + "step": 23337 + }, + { + "epoch": 2.2433913294242047, + "grad_norm": 1.0051482915878296, + "learning_rate": 0.0001458830903793505, + "loss": 1.9105, + "step": 23338 + }, + { + "epoch": 2.2434874555416706, + "grad_norm": 1.0628691911697388, + "learning_rate": 0.00014587064199686014, + "loss": 1.9367, + "step": 23339 + }, + { + "epoch": 2.243583581659137, + "grad_norm": 1.156929850578308, + "learning_rate": 0.00014585819365398438, + "loss": 1.8297, + "step": 23340 + }, + { + "epoch": 2.243679707776603, + "grad_norm": 1.1856138706207275, + "learning_rate": 0.00014584574535081077, + "loss": 1.9259, + "step": 23341 + }, + { + "epoch": 2.243775833894069, + "grad_norm": 1.1873412132263184, + "learning_rate": 0.00014583329708742698, + "loss": 1.9312, + "step": 23342 + }, + { + "epoch": 2.243871960011535, + "grad_norm": 1.213710069656372, + "learning_rate": 0.00014582084886392052, + "loss": 2.1793, + "step": 23343 + }, + { + "epoch": 2.2439680861290014, + "grad_norm": 1.125727891921997, + "learning_rate": 0.00014580840068037905, + "loss": 2.1081, + "step": 23344 + }, + { + "epoch": 2.2440642122464673, + "grad_norm": 1.110799789428711, + "learning_rate": 0.00014579595253689015, + "loss": 2.0163, + "step": 23345 + }, + { + "epoch": 2.2441603383639332, + "grad_norm": 1.0635451078414917, + "learning_rate": 0.0001457835044335414, + "loss": 1.964, + "step": 23346 + }, + { + "epoch": 2.2442564644813996, + "grad_norm": 1.1094565391540527, + "learning_rate": 0.00014577105637042045, + "loss": 1.9371, + "step": 23347 + }, + { + "epoch": 2.244352590598866, + "grad_norm": 1.0515600442886353, + "learning_rate": 0.00014575860834761483, + "loss": 2.0453, + "step": 23348 + }, + { + "epoch": 2.244448716716332, + "grad_norm": 1.2711749076843262, + "learning_rate": 0.0001457461603652122, + "loss": 2.0309, + "step": 23349 + }, + { + "epoch": 2.2445448428337977, + "grad_norm": 1.1312639713287354, + "learning_rate": 0.00014573371242330017, + "loss": 1.9991, + "step": 23350 + }, + { + "epoch": 2.244640968951264, + "grad_norm": 1.1497219800949097, + "learning_rate": 0.00014572126452196626, + "loss": 2.0718, + "step": 23351 + }, + { + "epoch": 2.24473709506873, + "grad_norm": 1.0425899028778076, + "learning_rate": 0.00014570881666129814, + "loss": 1.937, + "step": 23352 + }, + { + "epoch": 2.2448332211861963, + "grad_norm": 1.2191689014434814, + "learning_rate": 0.00014569636884138336, + "loss": 1.9806, + "step": 23353 + }, + { + "epoch": 2.2449293473036622, + "grad_norm": 1.1641393899917603, + "learning_rate": 0.00014568392106230956, + "loss": 2.0657, + "step": 23354 + }, + { + "epoch": 2.2450254734211286, + "grad_norm": 1.1553536653518677, + "learning_rate": 0.0001456714733241643, + "loss": 2.0163, + "step": 23355 + }, + { + "epoch": 2.2451215995385945, + "grad_norm": 1.2895798683166504, + "learning_rate": 0.00014565902562703518, + "loss": 2.0166, + "step": 23356 + }, + { + "epoch": 2.245217725656061, + "grad_norm": 1.283798098564148, + "learning_rate": 0.00014564657797100982, + "loss": 2.0013, + "step": 23357 + }, + { + "epoch": 2.2453138517735267, + "grad_norm": 1.0957499742507935, + "learning_rate": 0.00014563413035617579, + "loss": 1.9736, + "step": 23358 + }, + { + "epoch": 2.245409977890993, + "grad_norm": 1.1956199407577515, + "learning_rate": 0.00014562168278262072, + "loss": 2.1286, + "step": 23359 + }, + { + "epoch": 2.245506104008459, + "grad_norm": 1.3841133117675781, + "learning_rate": 0.00014560923525043217, + "loss": 2.2389, + "step": 23360 + }, + { + "epoch": 2.2456022301259253, + "grad_norm": 1.216676950454712, + "learning_rate": 0.00014559678775969773, + "loss": 2.0174, + "step": 23361 + }, + { + "epoch": 2.245698356243391, + "grad_norm": 1.262037992477417, + "learning_rate": 0.00014558434031050504, + "loss": 2.1726, + "step": 23362 + }, + { + "epoch": 2.2457944823608575, + "grad_norm": 1.387091875076294, + "learning_rate": 0.00014557189290294165, + "loss": 2.1577, + "step": 23363 + }, + { + "epoch": 2.2458906084783234, + "grad_norm": 1.091796636581421, + "learning_rate": 0.0001455594455370952, + "loss": 1.9756, + "step": 23364 + }, + { + "epoch": 2.24598673459579, + "grad_norm": 1.1973261833190918, + "learning_rate": 0.00014554699821305318, + "loss": 1.9504, + "step": 23365 + }, + { + "epoch": 2.2460828607132557, + "grad_norm": 1.110414743423462, + "learning_rate": 0.0001455345509309033, + "loss": 2.0089, + "step": 23366 + }, + { + "epoch": 2.246178986830722, + "grad_norm": 1.0911873579025269, + "learning_rate": 0.00014552210369073312, + "loss": 2.0801, + "step": 23367 + }, + { + "epoch": 2.246275112948188, + "grad_norm": 1.178171157836914, + "learning_rate": 0.00014550965649263024, + "loss": 1.9997, + "step": 23368 + }, + { + "epoch": 2.2463712390656543, + "grad_norm": 1.1691515445709229, + "learning_rate": 0.00014549720933668223, + "loss": 1.9162, + "step": 23369 + }, + { + "epoch": 2.24646736518312, + "grad_norm": 1.2555603981018066, + "learning_rate": 0.0001454847622229767, + "loss": 1.898, + "step": 23370 + }, + { + "epoch": 2.2465634913005865, + "grad_norm": 1.253362774848938, + "learning_rate": 0.0001454723151516012, + "loss": 1.9516, + "step": 23371 + }, + { + "epoch": 2.2466596174180524, + "grad_norm": 1.3325177431106567, + "learning_rate": 0.00014545986812264334, + "loss": 1.8963, + "step": 23372 + }, + { + "epoch": 2.2467557435355188, + "grad_norm": 1.2800471782684326, + "learning_rate": 0.00014544742113619077, + "loss": 1.9995, + "step": 23373 + }, + { + "epoch": 2.2468518696529847, + "grad_norm": 1.1004849672317505, + "learning_rate": 0.000145434974192331, + "loss": 1.9125, + "step": 23374 + }, + { + "epoch": 2.246947995770451, + "grad_norm": 1.4449806213378906, + "learning_rate": 0.0001454225272911517, + "loss": 2.1048, + "step": 23375 + }, + { + "epoch": 2.247044121887917, + "grad_norm": 1.113874077796936, + "learning_rate": 0.0001454100804327404, + "loss": 1.9618, + "step": 23376 + }, + { + "epoch": 2.2471402480053833, + "grad_norm": 1.2393842935562134, + "learning_rate": 0.0001453976336171847, + "loss": 2.2483, + "step": 23377 + }, + { + "epoch": 2.247236374122849, + "grad_norm": 1.1528278589248657, + "learning_rate": 0.0001453851868445722, + "loss": 1.8062, + "step": 23378 + }, + { + "epoch": 2.2473325002403155, + "grad_norm": 1.2270108461380005, + "learning_rate": 0.00014537274011499048, + "loss": 2.1828, + "step": 23379 + }, + { + "epoch": 2.2474286263577814, + "grad_norm": 0.9526183605194092, + "learning_rate": 0.00014536029342852714, + "loss": 1.8251, + "step": 23380 + }, + { + "epoch": 2.2475247524752477, + "grad_norm": 1.288541316986084, + "learning_rate": 0.00014534784678526974, + "loss": 2.152, + "step": 23381 + }, + { + "epoch": 2.2476208785927136, + "grad_norm": 1.2649935483932495, + "learning_rate": 0.00014533540018530592, + "loss": 1.9888, + "step": 23382 + }, + { + "epoch": 2.2477170047101795, + "grad_norm": 1.2236897945404053, + "learning_rate": 0.00014532295362872322, + "loss": 2.077, + "step": 23383 + }, + { + "epoch": 2.247813130827646, + "grad_norm": 1.2901114225387573, + "learning_rate": 0.0001453105071156093, + "loss": 1.8584, + "step": 23384 + }, + { + "epoch": 2.247909256945112, + "grad_norm": 1.6075057983398438, + "learning_rate": 0.00014529806064605166, + "loss": 2.4531, + "step": 23385 + }, + { + "epoch": 2.248005383062578, + "grad_norm": 1.2932491302490234, + "learning_rate": 0.00014528561422013793, + "loss": 2.0064, + "step": 23386 + }, + { + "epoch": 2.248101509180044, + "grad_norm": 1.1096279621124268, + "learning_rate": 0.00014527316783795574, + "loss": 1.9829, + "step": 23387 + }, + { + "epoch": 2.2481976352975104, + "grad_norm": 1.466333270072937, + "learning_rate": 0.00014526072149959258, + "loss": 1.9626, + "step": 23388 + }, + { + "epoch": 2.2482937614149763, + "grad_norm": 1.0894114971160889, + "learning_rate": 0.0001452482752051361, + "loss": 2.0601, + "step": 23389 + }, + { + "epoch": 2.2483898875324426, + "grad_norm": 1.0954432487487793, + "learning_rate": 0.00014523582895467385, + "loss": 1.8486, + "step": 23390 + }, + { + "epoch": 2.2484860136499085, + "grad_norm": 1.0674716234207153, + "learning_rate": 0.0001452233827482935, + "loss": 1.9073, + "step": 23391 + }, + { + "epoch": 2.248582139767375, + "grad_norm": 1.2270753383636475, + "learning_rate": 0.00014521093658608253, + "loss": 2.0116, + "step": 23392 + }, + { + "epoch": 2.2486782658848408, + "grad_norm": 1.646003246307373, + "learning_rate": 0.00014519849046812857, + "loss": 2.0312, + "step": 23393 + }, + { + "epoch": 2.248774392002307, + "grad_norm": 1.1102068424224854, + "learning_rate": 0.00014518604439451923, + "loss": 2.1037, + "step": 23394 + }, + { + "epoch": 2.248870518119773, + "grad_norm": 1.0491756200790405, + "learning_rate": 0.00014517359836534204, + "loss": 1.944, + "step": 23395 + }, + { + "epoch": 2.2489666442372394, + "grad_norm": 1.2167582511901855, + "learning_rate": 0.00014516115238068464, + "loss": 1.9996, + "step": 23396 + }, + { + "epoch": 2.2490627703547053, + "grad_norm": 1.3998911380767822, + "learning_rate": 0.0001451487064406346, + "loss": 1.8999, + "step": 23397 + }, + { + "epoch": 2.2491588964721716, + "grad_norm": 1.1637996435165405, + "learning_rate": 0.00014513626054527947, + "loss": 2.2401, + "step": 23398 + }, + { + "epoch": 2.2492550225896375, + "grad_norm": 1.0444657802581787, + "learning_rate": 0.00014512381469470684, + "loss": 1.9679, + "step": 23399 + }, + { + "epoch": 2.249351148707104, + "grad_norm": 1.1958781480789185, + "learning_rate": 0.00014511136888900432, + "loss": 1.9849, + "step": 23400 + }, + { + "epoch": 2.2494472748245697, + "grad_norm": 1.2109291553497314, + "learning_rate": 0.0001450989231282595, + "loss": 2.1411, + "step": 23401 + }, + { + "epoch": 2.249543400942036, + "grad_norm": 1.3909610509872437, + "learning_rate": 0.00014508647741255992, + "loss": 2.0777, + "step": 23402 + }, + { + "epoch": 2.249639527059502, + "grad_norm": 1.4135260581970215, + "learning_rate": 0.0001450740317419932, + "loss": 1.9692, + "step": 23403 + }, + { + "epoch": 2.2497356531769683, + "grad_norm": 1.171413779258728, + "learning_rate": 0.0001450615861166469, + "loss": 1.9485, + "step": 23404 + }, + { + "epoch": 2.2498317792944342, + "grad_norm": 1.1157532930374146, + "learning_rate": 0.00014504914053660865, + "loss": 1.8705, + "step": 23405 + }, + { + "epoch": 2.2499279054119006, + "grad_norm": 1.1456800699234009, + "learning_rate": 0.00014503669500196594, + "loss": 2.0835, + "step": 23406 + }, + { + "epoch": 2.2500240315293665, + "grad_norm": 1.1542609930038452, + "learning_rate": 0.00014502424951280644, + "loss": 2.0152, + "step": 23407 + }, + { + "epoch": 2.250120157646833, + "grad_norm": 1.059173345565796, + "learning_rate": 0.00014501180406921774, + "loss": 1.8985, + "step": 23408 + }, + { + "epoch": 2.2502162837642987, + "grad_norm": 1.1924275159835815, + "learning_rate": 0.0001449993586712873, + "loss": 1.9238, + "step": 23409 + }, + { + "epoch": 2.250312409881765, + "grad_norm": 1.1330347061157227, + "learning_rate": 0.00014498691331910277, + "loss": 1.9128, + "step": 23410 + }, + { + "epoch": 2.250408535999231, + "grad_norm": 1.1933979988098145, + "learning_rate": 0.00014497446801275176, + "loss": 1.9913, + "step": 23411 + }, + { + "epoch": 2.250504662116697, + "grad_norm": 1.2014811038970947, + "learning_rate": 0.0001449620227523218, + "loss": 1.821, + "step": 23412 + }, + { + "epoch": 2.250600788234163, + "grad_norm": 1.1243557929992676, + "learning_rate": 0.00014494957753790052, + "loss": 1.9666, + "step": 23413 + }, + { + "epoch": 2.2506969143516296, + "grad_norm": 1.1175163984298706, + "learning_rate": 0.00014493713236957546, + "loss": 1.9741, + "step": 23414 + }, + { + "epoch": 2.2507930404690955, + "grad_norm": 1.2497243881225586, + "learning_rate": 0.0001449246872474342, + "loss": 2.1082, + "step": 23415 + }, + { + "epoch": 2.2508891665865614, + "grad_norm": 1.1640788316726685, + "learning_rate": 0.00014491224217156432, + "loss": 1.8749, + "step": 23416 + }, + { + "epoch": 2.2509852927040277, + "grad_norm": 1.1647536754608154, + "learning_rate": 0.0001448997971420534, + "loss": 2.0607, + "step": 23417 + }, + { + "epoch": 2.251081418821494, + "grad_norm": 1.1324750185012817, + "learning_rate": 0.00014488735215898905, + "loss": 2.0822, + "step": 23418 + }, + { + "epoch": 2.25117754493896, + "grad_norm": 1.2207255363464355, + "learning_rate": 0.00014487490722245883, + "loss": 1.9781, + "step": 23419 + }, + { + "epoch": 2.251273671056426, + "grad_norm": 1.2458064556121826, + "learning_rate": 0.00014486246233255028, + "loss": 2.0628, + "step": 23420 + }, + { + "epoch": 2.251369797173892, + "grad_norm": 1.131654977798462, + "learning_rate": 0.000144850017489351, + "loss": 1.9924, + "step": 23421 + }, + { + "epoch": 2.251465923291358, + "grad_norm": 1.1121540069580078, + "learning_rate": 0.00014483757269294857, + "loss": 2.0363, + "step": 23422 + }, + { + "epoch": 2.2515620494088244, + "grad_norm": 1.2592195272445679, + "learning_rate": 0.00014482512794343055, + "loss": 2.1992, + "step": 23423 + }, + { + "epoch": 2.2516581755262903, + "grad_norm": 1.2306643724441528, + "learning_rate": 0.00014481268324088455, + "loss": 2.0476, + "step": 23424 + }, + { + "epoch": 2.2517543016437567, + "grad_norm": 1.2198442220687866, + "learning_rate": 0.00014480023858539812, + "loss": 2.0904, + "step": 23425 + }, + { + "epoch": 2.2518504277612226, + "grad_norm": 1.213824987411499, + "learning_rate": 0.0001447877939770588, + "loss": 1.9867, + "step": 23426 + }, + { + "epoch": 2.251946553878689, + "grad_norm": 1.0837441682815552, + "learning_rate": 0.00014477534941595424, + "loss": 2.0184, + "step": 23427 + }, + { + "epoch": 2.252042679996155, + "grad_norm": 1.1892683506011963, + "learning_rate": 0.00014476290490217201, + "loss": 2.0066, + "step": 23428 + }, + { + "epoch": 2.252138806113621, + "grad_norm": 1.2992116212844849, + "learning_rate": 0.00014475046043579962, + "loss": 1.9918, + "step": 23429 + }, + { + "epoch": 2.252234932231087, + "grad_norm": 1.1500238180160522, + "learning_rate": 0.00014473801601692468, + "loss": 2.1189, + "step": 23430 + }, + { + "epoch": 2.2523310583485534, + "grad_norm": 1.1457273960113525, + "learning_rate": 0.00014472557164563475, + "loss": 1.9155, + "step": 23431 + }, + { + "epoch": 2.2524271844660193, + "grad_norm": 1.3118560314178467, + "learning_rate": 0.0001447131273220174, + "loss": 2.0, + "step": 23432 + }, + { + "epoch": 2.2525233105834856, + "grad_norm": 1.126295566558838, + "learning_rate": 0.00014470068304616023, + "loss": 1.9993, + "step": 23433 + }, + { + "epoch": 2.2526194367009515, + "grad_norm": 1.2563846111297607, + "learning_rate": 0.0001446882388181508, + "loss": 2.065, + "step": 23434 + }, + { + "epoch": 2.252715562818418, + "grad_norm": 1.2213062047958374, + "learning_rate": 0.00014467579463807667, + "loss": 2.1599, + "step": 23435 + }, + { + "epoch": 2.252811688935884, + "grad_norm": 1.1656033992767334, + "learning_rate": 0.00014466335050602544, + "loss": 1.9854, + "step": 23436 + }, + { + "epoch": 2.25290781505335, + "grad_norm": 1.0347486734390259, + "learning_rate": 0.00014465090642208463, + "loss": 1.871, + "step": 23437 + }, + { + "epoch": 2.253003941170816, + "grad_norm": 1.057702898979187, + "learning_rate": 0.00014463846238634185, + "loss": 1.888, + "step": 23438 + }, + { + "epoch": 2.2531000672882824, + "grad_norm": 1.1160852909088135, + "learning_rate": 0.00014462601839888465, + "loss": 2.027, + "step": 23439 + }, + { + "epoch": 2.2531961934057483, + "grad_norm": 1.2614496946334839, + "learning_rate": 0.00014461357445980064, + "loss": 2.0904, + "step": 23440 + }, + { + "epoch": 2.2532923195232146, + "grad_norm": 1.3750754594802856, + "learning_rate": 0.00014460113056917733, + "loss": 2.1119, + "step": 23441 + }, + { + "epoch": 2.2533884456406805, + "grad_norm": 0.9544004201889038, + "learning_rate": 0.00014458868672710234, + "loss": 1.6179, + "step": 23442 + }, + { + "epoch": 2.253484571758147, + "grad_norm": 1.2193236351013184, + "learning_rate": 0.00014457624293366317, + "loss": 1.9394, + "step": 23443 + }, + { + "epoch": 2.2535806978756128, + "grad_norm": 1.107260823249817, + "learning_rate": 0.0001445637991889475, + "loss": 1.8126, + "step": 23444 + }, + { + "epoch": 2.2536768239930787, + "grad_norm": 1.2231143712997437, + "learning_rate": 0.0001445513554930428, + "loss": 2.0587, + "step": 23445 + }, + { + "epoch": 2.253772950110545, + "grad_norm": 1.0955933332443237, + "learning_rate": 0.0001445389118460367, + "loss": 1.8311, + "step": 23446 + }, + { + "epoch": 2.2538690762280114, + "grad_norm": 1.035355567932129, + "learning_rate": 0.00014452646824801674, + "loss": 1.9731, + "step": 23447 + }, + { + "epoch": 2.2539652023454773, + "grad_norm": 1.0797122716903687, + "learning_rate": 0.00014451402469907047, + "loss": 1.904, + "step": 23448 + }, + { + "epoch": 2.254061328462943, + "grad_norm": 1.2681224346160889, + "learning_rate": 0.0001445015811992855, + "loss": 1.9821, + "step": 23449 + }, + { + "epoch": 2.2541574545804095, + "grad_norm": 1.1768947839736938, + "learning_rate": 0.00014448913774874937, + "loss": 2.081, + "step": 23450 + }, + { + "epoch": 2.254253580697876, + "grad_norm": 1.0014081001281738, + "learning_rate": 0.0001444766943475496, + "loss": 2.0414, + "step": 23451 + }, + { + "epoch": 2.2543497068153417, + "grad_norm": 1.158573031425476, + "learning_rate": 0.00014446425099577385, + "loss": 1.936, + "step": 23452 + }, + { + "epoch": 2.2544458329328076, + "grad_norm": 1.2431976795196533, + "learning_rate": 0.00014445180769350965, + "loss": 1.98, + "step": 23453 + }, + { + "epoch": 2.254541959050274, + "grad_norm": 1.1984423398971558, + "learning_rate": 0.00014443936444084453, + "loss": 2.07, + "step": 23454 + }, + { + "epoch": 2.25463808516774, + "grad_norm": 1.0761852264404297, + "learning_rate": 0.0001444269212378661, + "loss": 1.8128, + "step": 23455 + }, + { + "epoch": 2.2547342112852062, + "grad_norm": 1.1950632333755493, + "learning_rate": 0.00014441447808466188, + "loss": 2.0305, + "step": 23456 + }, + { + "epoch": 2.254830337402672, + "grad_norm": 1.234739899635315, + "learning_rate": 0.00014440203498131948, + "loss": 1.9846, + "step": 23457 + }, + { + "epoch": 2.2549264635201385, + "grad_norm": 1.0846647024154663, + "learning_rate": 0.00014438959192792644, + "loss": 2.0606, + "step": 23458 + }, + { + "epoch": 2.2550225896376044, + "grad_norm": 1.3082213401794434, + "learning_rate": 0.00014437714892457032, + "loss": 2.1861, + "step": 23459 + }, + { + "epoch": 2.2551187157550707, + "grad_norm": 1.1978834867477417, + "learning_rate": 0.00014436470597133867, + "loss": 1.9632, + "step": 23460 + }, + { + "epoch": 2.2552148418725366, + "grad_norm": 1.2813464403152466, + "learning_rate": 0.0001443522630683191, + "loss": 1.9024, + "step": 23461 + }, + { + "epoch": 2.255310967990003, + "grad_norm": 1.1195424795150757, + "learning_rate": 0.00014433982021559914, + "loss": 1.9703, + "step": 23462 + }, + { + "epoch": 2.255407094107469, + "grad_norm": 1.2308443784713745, + "learning_rate": 0.00014432737741326636, + "loss": 1.8785, + "step": 23463 + }, + { + "epoch": 2.255503220224935, + "grad_norm": 1.0464692115783691, + "learning_rate": 0.00014431493466140832, + "loss": 1.8273, + "step": 23464 + }, + { + "epoch": 2.255599346342401, + "grad_norm": 1.3205076456069946, + "learning_rate": 0.00014430249196011256, + "loss": 1.8408, + "step": 23465 + }, + { + "epoch": 2.2556954724598675, + "grad_norm": 1.465372085571289, + "learning_rate": 0.00014429004930946668, + "loss": 2.0736, + "step": 23466 + }, + { + "epoch": 2.2557915985773334, + "grad_norm": 1.119289755821228, + "learning_rate": 0.00014427760670955825, + "loss": 1.9047, + "step": 23467 + }, + { + "epoch": 2.2558877246947997, + "grad_norm": 1.1279315948486328, + "learning_rate": 0.00014426516416047477, + "loss": 1.9287, + "step": 23468 + }, + { + "epoch": 2.2559838508122656, + "grad_norm": 1.1777926683425903, + "learning_rate": 0.00014425272166230382, + "loss": 1.9819, + "step": 23469 + }, + { + "epoch": 2.256079976929732, + "grad_norm": 1.2855898141860962, + "learning_rate": 0.000144240279215133, + "loss": 2.0004, + "step": 23470 + }, + { + "epoch": 2.256176103047198, + "grad_norm": 1.1400532722473145, + "learning_rate": 0.00014422783681904982, + "loss": 1.928, + "step": 23471 + }, + { + "epoch": 2.256272229164664, + "grad_norm": 1.1748943328857422, + "learning_rate": 0.00014421539447414186, + "loss": 2.0936, + "step": 23472 + }, + { + "epoch": 2.25636835528213, + "grad_norm": 1.1706527471542358, + "learning_rate": 0.00014420295218049668, + "loss": 1.9937, + "step": 23473 + }, + { + "epoch": 2.2564644813995964, + "grad_norm": 1.1554380655288696, + "learning_rate": 0.00014419050993820187, + "loss": 2.0771, + "step": 23474 + }, + { + "epoch": 2.2565606075170623, + "grad_norm": 1.2347406148910522, + "learning_rate": 0.0001441780677473449, + "loss": 1.8516, + "step": 23475 + }, + { + "epoch": 2.2566567336345287, + "grad_norm": 1.069109320640564, + "learning_rate": 0.00014416562560801342, + "loss": 1.9726, + "step": 23476 + }, + { + "epoch": 2.2567528597519946, + "grad_norm": 1.081477165222168, + "learning_rate": 0.0001441531835202949, + "loss": 1.8425, + "step": 23477 + }, + { + "epoch": 2.256848985869461, + "grad_norm": 1.1937167644500732, + "learning_rate": 0.00014414074148427699, + "loss": 1.945, + "step": 23478 + }, + { + "epoch": 2.256945111986927, + "grad_norm": 1.2105028629302979, + "learning_rate": 0.00014412829950004717, + "loss": 2.0398, + "step": 23479 + }, + { + "epoch": 2.257041238104393, + "grad_norm": 1.142879843711853, + "learning_rate": 0.00014411585756769307, + "loss": 2.0439, + "step": 23480 + }, + { + "epoch": 2.257137364221859, + "grad_norm": 1.10938560962677, + "learning_rate": 0.0001441034156873022, + "loss": 2.1295, + "step": 23481 + }, + { + "epoch": 2.257233490339325, + "grad_norm": 1.2190479040145874, + "learning_rate": 0.0001440909738589621, + "loss": 2.0122, + "step": 23482 + }, + { + "epoch": 2.2573296164567913, + "grad_norm": 1.276695728302002, + "learning_rate": 0.00014407853208276035, + "loss": 2.0415, + "step": 23483 + }, + { + "epoch": 2.2574257425742577, + "grad_norm": 1.2551676034927368, + "learning_rate": 0.0001440660903587845, + "loss": 2.0044, + "step": 23484 + }, + { + "epoch": 2.2575218686917236, + "grad_norm": 1.3447760343551636, + "learning_rate": 0.0001440536486871221, + "loss": 2.0894, + "step": 23485 + }, + { + "epoch": 2.2576179948091895, + "grad_norm": 1.244059443473816, + "learning_rate": 0.0001440412070678607, + "loss": 2.1178, + "step": 23486 + }, + { + "epoch": 2.257714120926656, + "grad_norm": 1.2149145603179932, + "learning_rate": 0.00014402876550108788, + "loss": 2.0003, + "step": 23487 + }, + { + "epoch": 2.2578102470441217, + "grad_norm": 1.2567145824432373, + "learning_rate": 0.00014401632398689115, + "loss": 1.9532, + "step": 23488 + }, + { + "epoch": 2.257906373161588, + "grad_norm": 1.0807271003723145, + "learning_rate": 0.00014400388252535807, + "loss": 1.8084, + "step": 23489 + }, + { + "epoch": 2.258002499279054, + "grad_norm": 1.1504451036453247, + "learning_rate": 0.00014399144111657623, + "loss": 1.8744, + "step": 23490 + }, + { + "epoch": 2.2580986253965203, + "grad_norm": 1.2129777669906616, + "learning_rate": 0.00014397899976063315, + "loss": 2.0792, + "step": 23491 + }, + { + "epoch": 2.258194751513986, + "grad_norm": 1.054901123046875, + "learning_rate": 0.0001439665584576164, + "loss": 1.8496, + "step": 23492 + }, + { + "epoch": 2.2582908776314525, + "grad_norm": 1.021154522895813, + "learning_rate": 0.0001439541172076135, + "loss": 1.8522, + "step": 23493 + }, + { + "epoch": 2.2583870037489184, + "grad_norm": 1.1722999811172485, + "learning_rate": 0.00014394167601071203, + "loss": 1.9732, + "step": 23494 + }, + { + "epoch": 2.2584831298663848, + "grad_norm": 1.0388767719268799, + "learning_rate": 0.00014392923486699954, + "loss": 1.9901, + "step": 23495 + }, + { + "epoch": 2.2585792559838507, + "grad_norm": 1.176088809967041, + "learning_rate": 0.00014391679377656359, + "loss": 1.9595, + "step": 23496 + }, + { + "epoch": 2.258675382101317, + "grad_norm": 1.2788550853729248, + "learning_rate": 0.00014390435273949168, + "loss": 1.9644, + "step": 23497 + }, + { + "epoch": 2.258771508218783, + "grad_norm": 1.2345280647277832, + "learning_rate": 0.00014389191175587143, + "loss": 2.0206, + "step": 23498 + }, + { + "epoch": 2.2588676343362493, + "grad_norm": 1.184779167175293, + "learning_rate": 0.00014387947082579032, + "loss": 2.0198, + "step": 23499 + }, + { + "epoch": 2.258963760453715, + "grad_norm": 1.150869369506836, + "learning_rate": 0.00014386702994933595, + "loss": 2.0335, + "step": 23500 + }, + { + "epoch": 2.2590598865711815, + "grad_norm": 1.2078020572662354, + "learning_rate": 0.00014385458912659582, + "loss": 1.9521, + "step": 23501 + }, + { + "epoch": 2.2591560126886474, + "grad_norm": 1.05216383934021, + "learning_rate": 0.00014384214835765755, + "loss": 1.9894, + "step": 23502 + }, + { + "epoch": 2.2592521388061138, + "grad_norm": 1.2146601676940918, + "learning_rate": 0.0001438297076426086, + "loss": 2.0256, + "step": 23503 + }, + { + "epoch": 2.2593482649235797, + "grad_norm": 1.1548800468444824, + "learning_rate": 0.00014381726698153656, + "loss": 2.0363, + "step": 23504 + }, + { + "epoch": 2.259444391041046, + "grad_norm": 1.1889357566833496, + "learning_rate": 0.000143804826374529, + "loss": 2.1588, + "step": 23505 + }, + { + "epoch": 2.259540517158512, + "grad_norm": 1.2103018760681152, + "learning_rate": 0.00014379238582167345, + "loss": 1.954, + "step": 23506 + }, + { + "epoch": 2.2596366432759782, + "grad_norm": 1.132909893989563, + "learning_rate": 0.00014377994532305746, + "loss": 2.046, + "step": 23507 + }, + { + "epoch": 2.259732769393444, + "grad_norm": 1.144545555114746, + "learning_rate": 0.0001437675048787685, + "loss": 2.101, + "step": 23508 + }, + { + "epoch": 2.2598288955109105, + "grad_norm": 1.2192347049713135, + "learning_rate": 0.00014375506448889423, + "loss": 1.8675, + "step": 23509 + }, + { + "epoch": 2.2599250216283764, + "grad_norm": 1.0648858547210693, + "learning_rate": 0.00014374262415352217, + "loss": 1.9527, + "step": 23510 + }, + { + "epoch": 2.2600211477458427, + "grad_norm": 1.4045988321304321, + "learning_rate": 0.00014373018387273974, + "loss": 1.9625, + "step": 23511 + }, + { + "epoch": 2.2601172738633086, + "grad_norm": 1.258123755455017, + "learning_rate": 0.00014371774364663466, + "loss": 1.9068, + "step": 23512 + }, + { + "epoch": 2.260213399980775, + "grad_norm": 1.3156044483184814, + "learning_rate": 0.00014370530347529437, + "loss": 1.9635, + "step": 23513 + }, + { + "epoch": 2.260309526098241, + "grad_norm": 1.1880056858062744, + "learning_rate": 0.00014369286335880646, + "loss": 1.9427, + "step": 23514 + }, + { + "epoch": 2.2604056522157068, + "grad_norm": 1.2728214263916016, + "learning_rate": 0.00014368042329725844, + "loss": 2.2425, + "step": 23515 + }, + { + "epoch": 2.260501778333173, + "grad_norm": 1.1073293685913086, + "learning_rate": 0.00014366798329073786, + "loss": 1.9467, + "step": 23516 + }, + { + "epoch": 2.2605979044506395, + "grad_norm": 1.2220711708068848, + "learning_rate": 0.00014365554333933229, + "loss": 2.1067, + "step": 23517 + }, + { + "epoch": 2.2606940305681054, + "grad_norm": 1.0885217189788818, + "learning_rate": 0.00014364310344312922, + "loss": 2.0203, + "step": 23518 + }, + { + "epoch": 2.2607901566855713, + "grad_norm": 1.2313685417175293, + "learning_rate": 0.00014363066360221623, + "loss": 2.0585, + "step": 23519 + }, + { + "epoch": 2.2608862828030376, + "grad_norm": 1.1233717203140259, + "learning_rate": 0.00014361822381668088, + "loss": 2.0362, + "step": 23520 + }, + { + "epoch": 2.260982408920504, + "grad_norm": 1.1407251358032227, + "learning_rate": 0.00014360578408661063, + "loss": 1.9479, + "step": 23521 + }, + { + "epoch": 2.26107853503797, + "grad_norm": 1.2857893705368042, + "learning_rate": 0.0001435933444120931, + "loss": 1.9959, + "step": 23522 + }, + { + "epoch": 2.2611746611554358, + "grad_norm": 1.1736363172531128, + "learning_rate": 0.0001435809047932158, + "loss": 2.0144, + "step": 23523 + }, + { + "epoch": 2.261270787272902, + "grad_norm": 1.4746876955032349, + "learning_rate": 0.00014356846523006625, + "loss": 1.9294, + "step": 23524 + }, + { + "epoch": 2.261366913390368, + "grad_norm": 1.3921798467636108, + "learning_rate": 0.00014355602572273203, + "loss": 2.1133, + "step": 23525 + }, + { + "epoch": 2.2614630395078343, + "grad_norm": 1.0138115882873535, + "learning_rate": 0.00014354358627130065, + "loss": 1.8666, + "step": 23526 + }, + { + "epoch": 2.2615591656253002, + "grad_norm": 1.0435669422149658, + "learning_rate": 0.00014353114687585964, + "loss": 2.0093, + "step": 23527 + }, + { + "epoch": 2.2616552917427666, + "grad_norm": 1.3928455114364624, + "learning_rate": 0.00014351870753649654, + "loss": 2.0667, + "step": 23528 + }, + { + "epoch": 2.2617514178602325, + "grad_norm": 0.9438033699989319, + "learning_rate": 0.00014350626825329895, + "loss": 1.9591, + "step": 23529 + }, + { + "epoch": 2.261847543977699, + "grad_norm": 1.2641475200653076, + "learning_rate": 0.00014349382902635434, + "loss": 1.9041, + "step": 23530 + }, + { + "epoch": 2.2619436700951647, + "grad_norm": 1.2355972528457642, + "learning_rate": 0.00014348138985575027, + "loss": 1.9557, + "step": 23531 + }, + { + "epoch": 2.262039796212631, + "grad_norm": 1.0594127178192139, + "learning_rate": 0.00014346895074157426, + "loss": 2.0596, + "step": 23532 + }, + { + "epoch": 2.262135922330097, + "grad_norm": 1.6079927682876587, + "learning_rate": 0.00014345651168391388, + "loss": 2.0102, + "step": 23533 + }, + { + "epoch": 2.2622320484475633, + "grad_norm": 1.0822749137878418, + "learning_rate": 0.0001434440726828566, + "loss": 1.9692, + "step": 23534 + }, + { + "epoch": 2.262328174565029, + "grad_norm": 1.184863805770874, + "learning_rate": 0.00014343163373849003, + "loss": 1.8484, + "step": 23535 + }, + { + "epoch": 2.2624243006824956, + "grad_norm": 1.163227915763855, + "learning_rate": 0.00014341919485090165, + "loss": 2.1083, + "step": 23536 + }, + { + "epoch": 2.2625204267999615, + "grad_norm": 1.1681344509124756, + "learning_rate": 0.00014340675602017903, + "loss": 2.0391, + "step": 23537 + }, + { + "epoch": 2.262616552917428, + "grad_norm": 1.3505274057388306, + "learning_rate": 0.00014339431724640967, + "loss": 2.2182, + "step": 23538 + }, + { + "epoch": 2.2627126790348937, + "grad_norm": 1.079124927520752, + "learning_rate": 0.00014338187852968115, + "loss": 1.9916, + "step": 23539 + }, + { + "epoch": 2.26280880515236, + "grad_norm": 1.1836458444595337, + "learning_rate": 0.00014336943987008097, + "loss": 1.9436, + "step": 23540 + }, + { + "epoch": 2.262904931269826, + "grad_norm": 1.2073560953140259, + "learning_rate": 0.00014335700126769666, + "loss": 2.1755, + "step": 23541 + }, + { + "epoch": 2.2630010573872923, + "grad_norm": 1.2824904918670654, + "learning_rate": 0.00014334456272261577, + "loss": 2.1767, + "step": 23542 + }, + { + "epoch": 2.263097183504758, + "grad_norm": 1.17474365234375, + "learning_rate": 0.0001433321242349258, + "loss": 2.0572, + "step": 23543 + }, + { + "epoch": 2.2631933096222245, + "grad_norm": 1.1784244775772095, + "learning_rate": 0.00014331968580471433, + "loss": 2.1121, + "step": 23544 + }, + { + "epoch": 2.2632894357396904, + "grad_norm": 1.3587543964385986, + "learning_rate": 0.0001433072474320688, + "loss": 2.0558, + "step": 23545 + }, + { + "epoch": 2.263385561857157, + "grad_norm": 1.2111645936965942, + "learning_rate": 0.00014329480911707686, + "loss": 2.1634, + "step": 23546 + }, + { + "epoch": 2.2634816879746227, + "grad_norm": 1.1841870546340942, + "learning_rate": 0.00014328237085982598, + "loss": 2.1489, + "step": 23547 + }, + { + "epoch": 2.2635778140920886, + "grad_norm": 1.184609293937683, + "learning_rate": 0.0001432699326604037, + "loss": 2.0001, + "step": 23548 + }, + { + "epoch": 2.263673940209555, + "grad_norm": 1.1482285261154175, + "learning_rate": 0.00014325749451889755, + "loss": 2.0166, + "step": 23549 + }, + { + "epoch": 2.2637700663270213, + "grad_norm": 0.9068223237991333, + "learning_rate": 0.00014324505643539503, + "loss": 1.7716, + "step": 23550 + }, + { + "epoch": 2.263866192444487, + "grad_norm": 1.0513087511062622, + "learning_rate": 0.0001432326184099837, + "loss": 1.967, + "step": 23551 + }, + { + "epoch": 2.263962318561953, + "grad_norm": 1.1179636716842651, + "learning_rate": 0.0001432201804427511, + "loss": 1.7807, + "step": 23552 + }, + { + "epoch": 2.2640584446794194, + "grad_norm": 1.3862196207046509, + "learning_rate": 0.00014320774253378472, + "loss": 2.1011, + "step": 23553 + }, + { + "epoch": 2.2641545707968858, + "grad_norm": 1.0834704637527466, + "learning_rate": 0.00014319530468317207, + "loss": 2.1297, + "step": 23554 + }, + { + "epoch": 2.2642506969143517, + "grad_norm": 1.1656378507614136, + "learning_rate": 0.00014318286689100077, + "loss": 1.8779, + "step": 23555 + }, + { + "epoch": 2.2643468230318176, + "grad_norm": 1.1643530130386353, + "learning_rate": 0.00014317042915735828, + "loss": 2.0455, + "step": 23556 + }, + { + "epoch": 2.264442949149284, + "grad_norm": 1.1077227592468262, + "learning_rate": 0.00014315799148233212, + "loss": 1.8841, + "step": 23557 + }, + { + "epoch": 2.26453907526675, + "grad_norm": 1.0839109420776367, + "learning_rate": 0.00014314555386600982, + "loss": 2.0054, + "step": 23558 + }, + { + "epoch": 2.264635201384216, + "grad_norm": 1.200184941291809, + "learning_rate": 0.00014313311630847892, + "loss": 2.0183, + "step": 23559 + }, + { + "epoch": 2.264731327501682, + "grad_norm": 1.110558032989502, + "learning_rate": 0.00014312067880982695, + "loss": 2.0047, + "step": 23560 + }, + { + "epoch": 2.2648274536191484, + "grad_norm": 1.2370796203613281, + "learning_rate": 0.0001431082413701414, + "loss": 2.0115, + "step": 23561 + }, + { + "epoch": 2.2649235797366143, + "grad_norm": 1.3212496042251587, + "learning_rate": 0.00014309580398950983, + "loss": 1.9252, + "step": 23562 + }, + { + "epoch": 2.2650197058540806, + "grad_norm": 1.029097080230713, + "learning_rate": 0.00014308336666801974, + "loss": 2.0286, + "step": 23563 + }, + { + "epoch": 2.2651158319715465, + "grad_norm": 1.2131729125976562, + "learning_rate": 0.00014307092940575868, + "loss": 1.9601, + "step": 23564 + }, + { + "epoch": 2.265211958089013, + "grad_norm": 1.140414834022522, + "learning_rate": 0.00014305849220281415, + "loss": 2.0094, + "step": 23565 + }, + { + "epoch": 2.265308084206479, + "grad_norm": 1.050445556640625, + "learning_rate": 0.00014304605505927368, + "loss": 1.9253, + "step": 23566 + }, + { + "epoch": 2.265404210323945, + "grad_norm": 1.0612152814865112, + "learning_rate": 0.00014303361797522482, + "loss": 2.0207, + "step": 23567 + }, + { + "epoch": 2.265500336441411, + "grad_norm": 1.1304291486740112, + "learning_rate": 0.00014302118095075504, + "loss": 2.0081, + "step": 23568 + }, + { + "epoch": 2.2655964625588774, + "grad_norm": 1.1206825971603394, + "learning_rate": 0.00014300874398595186, + "loss": 1.8879, + "step": 23569 + }, + { + "epoch": 2.2656925886763433, + "grad_norm": 1.1254639625549316, + "learning_rate": 0.00014299630708090286, + "loss": 2.0246, + "step": 23570 + }, + { + "epoch": 2.2657887147938096, + "grad_norm": 1.1228067874908447, + "learning_rate": 0.00014298387023569553, + "loss": 1.9709, + "step": 23571 + }, + { + "epoch": 2.2658848409112755, + "grad_norm": 1.0788947343826294, + "learning_rate": 0.0001429714334504174, + "loss": 1.9501, + "step": 23572 + }, + { + "epoch": 2.265980967028742, + "grad_norm": 1.090900182723999, + "learning_rate": 0.00014295899672515594, + "loss": 1.9801, + "step": 23573 + }, + { + "epoch": 2.2660770931462078, + "grad_norm": 1.114153504371643, + "learning_rate": 0.0001429465600599987, + "loss": 1.9542, + "step": 23574 + }, + { + "epoch": 2.266173219263674, + "grad_norm": 1.4128201007843018, + "learning_rate": 0.0001429341234550332, + "loss": 2.0905, + "step": 23575 + }, + { + "epoch": 2.26626934538114, + "grad_norm": 1.0667728185653687, + "learning_rate": 0.00014292168691034697, + "loss": 2.0742, + "step": 23576 + }, + { + "epoch": 2.2663654714986063, + "grad_norm": 1.0792533159255981, + "learning_rate": 0.00014290925042602755, + "loss": 1.8435, + "step": 23577 + }, + { + "epoch": 2.2664615976160722, + "grad_norm": 1.1322532892227173, + "learning_rate": 0.00014289681400216238, + "loss": 2.0129, + "step": 23578 + }, + { + "epoch": 2.2665577237335386, + "grad_norm": 1.1193636655807495, + "learning_rate": 0.000142884377638839, + "loss": 2.0444, + "step": 23579 + }, + { + "epoch": 2.2666538498510045, + "grad_norm": 1.2331254482269287, + "learning_rate": 0.000142871941336145, + "loss": 2.0725, + "step": 23580 + }, + { + "epoch": 2.2667499759684704, + "grad_norm": 1.2256360054016113, + "learning_rate": 0.00014285950509416783, + "loss": 1.9131, + "step": 23581 + }, + { + "epoch": 2.2668461020859367, + "grad_norm": 1.0724507570266724, + "learning_rate": 0.000142847068912995, + "loss": 2.038, + "step": 23582 + }, + { + "epoch": 2.266942228203403, + "grad_norm": 1.2378599643707275, + "learning_rate": 0.00014283463279271406, + "loss": 2.1244, + "step": 23583 + }, + { + "epoch": 2.267038354320869, + "grad_norm": 1.2288798093795776, + "learning_rate": 0.00014282219673341252, + "loss": 2.0235, + "step": 23584 + }, + { + "epoch": 2.267134480438335, + "grad_norm": 1.2275980710983276, + "learning_rate": 0.0001428097607351779, + "loss": 2.1147, + "step": 23585 + }, + { + "epoch": 2.267230606555801, + "grad_norm": 1.1541234254837036, + "learning_rate": 0.00014279732479809766, + "loss": 2.0157, + "step": 23586 + }, + { + "epoch": 2.2673267326732676, + "grad_norm": 1.4550386667251587, + "learning_rate": 0.0001427848889222594, + "loss": 2.162, + "step": 23587 + }, + { + "epoch": 2.2674228587907335, + "grad_norm": 1.133377194404602, + "learning_rate": 0.0001427724531077505, + "loss": 1.8078, + "step": 23588 + }, + { + "epoch": 2.2675189849081994, + "grad_norm": 1.1541389226913452, + "learning_rate": 0.0001427600173546586, + "loss": 1.8335, + "step": 23589 + }, + { + "epoch": 2.2676151110256657, + "grad_norm": 1.290814995765686, + "learning_rate": 0.0001427475816630712, + "loss": 1.99, + "step": 23590 + }, + { + "epoch": 2.2677112371431316, + "grad_norm": 1.1994259357452393, + "learning_rate": 0.00014273514603307575, + "loss": 2.0207, + "step": 23591 + }, + { + "epoch": 2.267807363260598, + "grad_norm": 1.098764181137085, + "learning_rate": 0.0001427227104647598, + "loss": 1.853, + "step": 23592 + }, + { + "epoch": 2.267903489378064, + "grad_norm": 1.012615442276001, + "learning_rate": 0.00014271027495821084, + "loss": 1.994, + "step": 23593 + }, + { + "epoch": 2.26799961549553, + "grad_norm": 1.2107751369476318, + "learning_rate": 0.0001426978395135164, + "loss": 2.1035, + "step": 23594 + }, + { + "epoch": 2.268095741612996, + "grad_norm": 1.0565648078918457, + "learning_rate": 0.000142685404130764, + "loss": 1.9678, + "step": 23595 + }, + { + "epoch": 2.2681918677304624, + "grad_norm": 1.19049870967865, + "learning_rate": 0.00014267296881004108, + "loss": 1.9131, + "step": 23596 + }, + { + "epoch": 2.2682879938479283, + "grad_norm": 1.0536746978759766, + "learning_rate": 0.00014266053355143523, + "loss": 1.9219, + "step": 23597 + }, + { + "epoch": 2.2683841199653947, + "grad_norm": 1.2502235174179077, + "learning_rate": 0.00014264809835503396, + "loss": 2.1018, + "step": 23598 + }, + { + "epoch": 2.2684802460828606, + "grad_norm": 1.225102186203003, + "learning_rate": 0.00014263566322092472, + "loss": 2.046, + "step": 23599 + }, + { + "epoch": 2.268576372200327, + "grad_norm": 1.2091361284255981, + "learning_rate": 0.00014262322814919505, + "loss": 1.9585, + "step": 23600 + }, + { + "epoch": 2.268672498317793, + "grad_norm": 1.30961275100708, + "learning_rate": 0.00014261079313993244, + "loss": 2.0317, + "step": 23601 + }, + { + "epoch": 2.268768624435259, + "grad_norm": 1.1114436388015747, + "learning_rate": 0.00014259835819322445, + "loss": 1.8969, + "step": 23602 + }, + { + "epoch": 2.268864750552725, + "grad_norm": 1.1884592771530151, + "learning_rate": 0.00014258592330915852, + "loss": 1.9272, + "step": 23603 + }, + { + "epoch": 2.2689608766701914, + "grad_norm": 1.114071011543274, + "learning_rate": 0.0001425734884878222, + "loss": 1.9328, + "step": 23604 + }, + { + "epoch": 2.2690570027876573, + "grad_norm": 1.3144328594207764, + "learning_rate": 0.00014256105372930294, + "loss": 2.2402, + "step": 23605 + }, + { + "epoch": 2.2691531289051237, + "grad_norm": 1.1144367456436157, + "learning_rate": 0.00014254861903368831, + "loss": 1.9416, + "step": 23606 + }, + { + "epoch": 2.2692492550225896, + "grad_norm": 1.2372584342956543, + "learning_rate": 0.0001425361844010658, + "loss": 2.1823, + "step": 23607 + }, + { + "epoch": 2.269345381140056, + "grad_norm": 1.2587032318115234, + "learning_rate": 0.00014252374983152286, + "loss": 2.0286, + "step": 23608 + }, + { + "epoch": 2.269441507257522, + "grad_norm": 1.0343536138534546, + "learning_rate": 0.00014251131532514707, + "loss": 1.8529, + "step": 23609 + }, + { + "epoch": 2.269537633374988, + "grad_norm": 0.9040254950523376, + "learning_rate": 0.0001424988808820259, + "loss": 1.9018, + "step": 23610 + }, + { + "epoch": 2.269633759492454, + "grad_norm": 1.2883557081222534, + "learning_rate": 0.00014248644650224684, + "loss": 1.9616, + "step": 23611 + }, + { + "epoch": 2.2697298856099204, + "grad_norm": 1.3563796281814575, + "learning_rate": 0.0001424740121858974, + "loss": 2.0009, + "step": 23612 + }, + { + "epoch": 2.2698260117273863, + "grad_norm": 1.328908920288086, + "learning_rate": 0.00014246157793306507, + "loss": 2.1598, + "step": 23613 + }, + { + "epoch": 2.2699221378448526, + "grad_norm": 1.0840808153152466, + "learning_rate": 0.00014244914374383742, + "loss": 1.9825, + "step": 23614 + }, + { + "epoch": 2.2700182639623185, + "grad_norm": 1.1797868013381958, + "learning_rate": 0.00014243670961830185, + "loss": 2.0461, + "step": 23615 + }, + { + "epoch": 2.270114390079785, + "grad_norm": 1.2542036771774292, + "learning_rate": 0.00014242427555654597, + "loss": 2.172, + "step": 23616 + }, + { + "epoch": 2.270210516197251, + "grad_norm": 1.063697099685669, + "learning_rate": 0.00014241184155865718, + "loss": 1.8989, + "step": 23617 + }, + { + "epoch": 2.2703066423147167, + "grad_norm": 1.1735533475875854, + "learning_rate": 0.00014239940762472301, + "loss": 2.0503, + "step": 23618 + }, + { + "epoch": 2.270402768432183, + "grad_norm": 1.2599623203277588, + "learning_rate": 0.000142386973754831, + "loss": 2.0831, + "step": 23619 + }, + { + "epoch": 2.2704988945496494, + "grad_norm": 1.2505508661270142, + "learning_rate": 0.00014237453994906857, + "loss": 2.0754, + "step": 23620 + }, + { + "epoch": 2.2705950206671153, + "grad_norm": 1.1980916261672974, + "learning_rate": 0.0001423621062075233, + "loss": 1.9854, + "step": 23621 + }, + { + "epoch": 2.270691146784581, + "grad_norm": 1.2646104097366333, + "learning_rate": 0.00014234967253028266, + "loss": 1.96, + "step": 23622 + }, + { + "epoch": 2.2707872729020475, + "grad_norm": 1.13410484790802, + "learning_rate": 0.00014233723891743415, + "loss": 2.0371, + "step": 23623 + }, + { + "epoch": 2.2708833990195134, + "grad_norm": 1.1060993671417236, + "learning_rate": 0.00014232480536906524, + "loss": 1.9365, + "step": 23624 + }, + { + "epoch": 2.2709795251369798, + "grad_norm": 1.0203946828842163, + "learning_rate": 0.00014231237188526346, + "loss": 1.8838, + "step": 23625 + }, + { + "epoch": 2.2710756512544457, + "grad_norm": 1.2368254661560059, + "learning_rate": 0.00014229993846611626, + "loss": 2.1132, + "step": 23626 + }, + { + "epoch": 2.271171777371912, + "grad_norm": 1.46670401096344, + "learning_rate": 0.0001422875051117112, + "loss": 2.0659, + "step": 23627 + }, + { + "epoch": 2.271267903489378, + "grad_norm": 1.209619164466858, + "learning_rate": 0.00014227507182213575, + "loss": 2.152, + "step": 23628 + }, + { + "epoch": 2.2713640296068442, + "grad_norm": 1.0643589496612549, + "learning_rate": 0.00014226263859747738, + "loss": 2.0161, + "step": 23629 + }, + { + "epoch": 2.27146015572431, + "grad_norm": 1.1086480617523193, + "learning_rate": 0.0001422502054378236, + "loss": 2.1137, + "step": 23630 + }, + { + "epoch": 2.2715562818417765, + "grad_norm": 1.073398232460022, + "learning_rate": 0.00014223777234326186, + "loss": 1.7852, + "step": 23631 + }, + { + "epoch": 2.2716524079592424, + "grad_norm": 1.1049085855484009, + "learning_rate": 0.00014222533931387976, + "loss": 2.1803, + "step": 23632 + }, + { + "epoch": 2.2717485340767087, + "grad_norm": 1.066968560218811, + "learning_rate": 0.0001422129063497647, + "loss": 2.0276, + "step": 23633 + }, + { + "epoch": 2.2718446601941746, + "grad_norm": 1.0932977199554443, + "learning_rate": 0.00014220047345100422, + "loss": 1.8259, + "step": 23634 + }, + { + "epoch": 2.271940786311641, + "grad_norm": 1.3029831647872925, + "learning_rate": 0.00014218804061768582, + "loss": 1.9377, + "step": 23635 + }, + { + "epoch": 2.272036912429107, + "grad_norm": 1.3916739225387573, + "learning_rate": 0.00014217560784989696, + "loss": 1.9734, + "step": 23636 + }, + { + "epoch": 2.2721330385465732, + "grad_norm": 1.2787261009216309, + "learning_rate": 0.0001421631751477251, + "loss": 1.8797, + "step": 23637 + }, + { + "epoch": 2.272229164664039, + "grad_norm": 1.1340532302856445, + "learning_rate": 0.00014215074251125781, + "loss": 2.0745, + "step": 23638 + }, + { + "epoch": 2.2723252907815055, + "grad_norm": 1.2332725524902344, + "learning_rate": 0.00014213830994058252, + "loss": 2.1538, + "step": 23639 + }, + { + "epoch": 2.2724214168989714, + "grad_norm": 1.1607669591903687, + "learning_rate": 0.00014212587743578676, + "loss": 1.9895, + "step": 23640 + }, + { + "epoch": 2.2725175430164377, + "grad_norm": 0.9914307594299316, + "learning_rate": 0.00014211344499695798, + "loss": 1.8208, + "step": 23641 + }, + { + "epoch": 2.2726136691339036, + "grad_norm": 1.3344208002090454, + "learning_rate": 0.0001421010126241837, + "loss": 1.912, + "step": 23642 + }, + { + "epoch": 2.27270979525137, + "grad_norm": 1.3008226156234741, + "learning_rate": 0.0001420885803175514, + "loss": 2.0562, + "step": 23643 + }, + { + "epoch": 2.272805921368836, + "grad_norm": 1.0821223258972168, + "learning_rate": 0.00014207614807714857, + "loss": 1.8981, + "step": 23644 + }, + { + "epoch": 2.272902047486302, + "grad_norm": 1.2375296354293823, + "learning_rate": 0.0001420637159030627, + "loss": 2.0026, + "step": 23645 + }, + { + "epoch": 2.272998173603768, + "grad_norm": 1.3001893758773804, + "learning_rate": 0.0001420512837953812, + "loss": 2.1289, + "step": 23646 + }, + { + "epoch": 2.2730942997212344, + "grad_norm": 1.2679029703140259, + "learning_rate": 0.00014203885175419168, + "loss": 2.1597, + "step": 23647 + }, + { + "epoch": 2.2731904258387003, + "grad_norm": 1.282531499862671, + "learning_rate": 0.00014202641977958152, + "loss": 1.8387, + "step": 23648 + }, + { + "epoch": 2.2732865519561667, + "grad_norm": 1.3174998760223389, + "learning_rate": 0.0001420139878716383, + "loss": 2.0113, + "step": 23649 + }, + { + "epoch": 2.2733826780736326, + "grad_norm": 1.2165578603744507, + "learning_rate": 0.00014200155603044946, + "loss": 2.1672, + "step": 23650 + }, + { + "epoch": 2.2734788041910985, + "grad_norm": 1.1511796712875366, + "learning_rate": 0.0001419891242561025, + "loss": 2.198, + "step": 23651 + }, + { + "epoch": 2.273574930308565, + "grad_norm": 1.1312015056610107, + "learning_rate": 0.00014197669254868486, + "loss": 2.0256, + "step": 23652 + }, + { + "epoch": 2.273671056426031, + "grad_norm": 1.2478550672531128, + "learning_rate": 0.00014196426090828408, + "loss": 2.2578, + "step": 23653 + }, + { + "epoch": 2.273767182543497, + "grad_norm": 1.2219798564910889, + "learning_rate": 0.00014195182933498762, + "loss": 2.0183, + "step": 23654 + }, + { + "epoch": 2.273863308660963, + "grad_norm": 1.238639235496521, + "learning_rate": 0.00014193939782888295, + "loss": 1.9734, + "step": 23655 + }, + { + "epoch": 2.2739594347784293, + "grad_norm": 1.10455322265625, + "learning_rate": 0.00014192696639005753, + "loss": 2.0182, + "step": 23656 + }, + { + "epoch": 2.2740555608958952, + "grad_norm": 1.197531819343567, + "learning_rate": 0.0001419145350185989, + "loss": 2.0186, + "step": 23657 + }, + { + "epoch": 2.2741516870133616, + "grad_norm": 1.0968079566955566, + "learning_rate": 0.00014190210371459454, + "loss": 1.7973, + "step": 23658 + }, + { + "epoch": 2.2742478131308275, + "grad_norm": 1.0603647232055664, + "learning_rate": 0.00014188967247813188, + "loss": 2.0727, + "step": 23659 + }, + { + "epoch": 2.274343939248294, + "grad_norm": 1.2966225147247314, + "learning_rate": 0.00014187724130929844, + "loss": 2.1014, + "step": 23660 + }, + { + "epoch": 2.2744400653657597, + "grad_norm": 1.1462724208831787, + "learning_rate": 0.00014186481020818167, + "loss": 2.12, + "step": 23661 + }, + { + "epoch": 2.274536191483226, + "grad_norm": 1.192313551902771, + "learning_rate": 0.00014185237917486906, + "loss": 2.0738, + "step": 23662 + }, + { + "epoch": 2.274632317600692, + "grad_norm": 1.173160195350647, + "learning_rate": 0.00014183994820944813, + "loss": 1.9064, + "step": 23663 + }, + { + "epoch": 2.2747284437181583, + "grad_norm": 1.2759355306625366, + "learning_rate": 0.00014182751731200627, + "loss": 1.7155, + "step": 23664 + }, + { + "epoch": 2.274824569835624, + "grad_norm": 1.1128618717193604, + "learning_rate": 0.00014181508648263103, + "loss": 1.9523, + "step": 23665 + }, + { + "epoch": 2.2749206959530905, + "grad_norm": 1.1371703147888184, + "learning_rate": 0.00014180265572140988, + "loss": 1.92, + "step": 23666 + }, + { + "epoch": 2.2750168220705564, + "grad_norm": 1.4302541017532349, + "learning_rate": 0.00014179022502843028, + "loss": 1.9753, + "step": 23667 + }, + { + "epoch": 2.275112948188023, + "grad_norm": 1.0143135786056519, + "learning_rate": 0.0001417777944037797, + "loss": 1.9007, + "step": 23668 + }, + { + "epoch": 2.2752090743054887, + "grad_norm": 1.342527151107788, + "learning_rate": 0.00014176536384754565, + "loss": 2.0636, + "step": 23669 + }, + { + "epoch": 2.275305200422955, + "grad_norm": 1.1577787399291992, + "learning_rate": 0.00014175293335981558, + "loss": 2.1199, + "step": 23670 + }, + { + "epoch": 2.275401326540421, + "grad_norm": 1.1861929893493652, + "learning_rate": 0.00014174050294067696, + "loss": 1.9315, + "step": 23671 + }, + { + "epoch": 2.2754974526578873, + "grad_norm": 1.283484935760498, + "learning_rate": 0.00014172807259021727, + "loss": 2.0762, + "step": 23672 + }, + { + "epoch": 2.275593578775353, + "grad_norm": 1.3153553009033203, + "learning_rate": 0.000141715642308524, + "loss": 2.136, + "step": 23673 + }, + { + "epoch": 2.2756897048928195, + "grad_norm": 1.1637715101242065, + "learning_rate": 0.0001417032120956846, + "loss": 2.0393, + "step": 23674 + }, + { + "epoch": 2.2757858310102854, + "grad_norm": 1.1096127033233643, + "learning_rate": 0.00014169078195178659, + "loss": 1.9435, + "step": 23675 + }, + { + "epoch": 2.2758819571277518, + "grad_norm": 1.3978404998779297, + "learning_rate": 0.00014167835187691734, + "loss": 2.0422, + "step": 23676 + }, + { + "epoch": 2.2759780832452177, + "grad_norm": 1.3004107475280762, + "learning_rate": 0.00014166592187116444, + "loss": 2.0582, + "step": 23677 + }, + { + "epoch": 2.276074209362684, + "grad_norm": 1.097259283065796, + "learning_rate": 0.00014165349193461527, + "loss": 1.9257, + "step": 23678 + }, + { + "epoch": 2.27617033548015, + "grad_norm": 1.1115384101867676, + "learning_rate": 0.00014164106206735738, + "loss": 2.1329, + "step": 23679 + }, + { + "epoch": 2.2762664615976163, + "grad_norm": 1.3124511241912842, + "learning_rate": 0.00014162863226947818, + "loss": 1.9313, + "step": 23680 + }, + { + "epoch": 2.276362587715082, + "grad_norm": 1.0442198514938354, + "learning_rate": 0.0001416162025410652, + "loss": 1.8346, + "step": 23681 + }, + { + "epoch": 2.2764587138325485, + "grad_norm": 1.107413649559021, + "learning_rate": 0.0001416037728822058, + "loss": 1.982, + "step": 23682 + }, + { + "epoch": 2.2765548399500144, + "grad_norm": 1.105842113494873, + "learning_rate": 0.00014159134329298754, + "loss": 1.8992, + "step": 23683 + }, + { + "epoch": 2.2766509660674803, + "grad_norm": 1.240304708480835, + "learning_rate": 0.0001415789137734979, + "loss": 2.0395, + "step": 23684 + }, + { + "epoch": 2.2767470921849466, + "grad_norm": 1.1512383222579956, + "learning_rate": 0.0001415664843238243, + "loss": 2.0613, + "step": 23685 + }, + { + "epoch": 2.276843218302413, + "grad_norm": 1.2451454401016235, + "learning_rate": 0.00014155405494405425, + "loss": 2.0283, + "step": 23686 + }, + { + "epoch": 2.276939344419879, + "grad_norm": 1.1159409284591675, + "learning_rate": 0.00014154162563427517, + "loss": 2.129, + "step": 23687 + }, + { + "epoch": 2.277035470537345, + "grad_norm": 1.244513988494873, + "learning_rate": 0.00014152919639457456, + "loss": 1.8318, + "step": 23688 + }, + { + "epoch": 2.277131596654811, + "grad_norm": 1.0706150531768799, + "learning_rate": 0.00014151676722503988, + "loss": 2.1286, + "step": 23689 + }, + { + "epoch": 2.2772277227722775, + "grad_norm": 1.0588852167129517, + "learning_rate": 0.00014150433812575857, + "loss": 1.8292, + "step": 23690 + }, + { + "epoch": 2.2773238488897434, + "grad_norm": 1.0798563957214355, + "learning_rate": 0.00014149190909681816, + "loss": 1.8591, + "step": 23691 + }, + { + "epoch": 2.2774199750072093, + "grad_norm": 1.072563886642456, + "learning_rate": 0.00014147948013830603, + "loss": 1.9396, + "step": 23692 + }, + { + "epoch": 2.2775161011246756, + "grad_norm": 1.1657054424285889, + "learning_rate": 0.00014146705125030973, + "loss": 2.0652, + "step": 23693 + }, + { + "epoch": 2.2776122272421415, + "grad_norm": 1.1795464754104614, + "learning_rate": 0.00014145462243291663, + "loss": 2.1456, + "step": 23694 + }, + { + "epoch": 2.277708353359608, + "grad_norm": 1.272194504737854, + "learning_rate": 0.0001414421936862143, + "loss": 2.0245, + "step": 23695 + }, + { + "epoch": 2.2778044794770738, + "grad_norm": 1.17998468875885, + "learning_rate": 0.00014142976501029008, + "loss": 1.9749, + "step": 23696 + }, + { + "epoch": 2.27790060559454, + "grad_norm": 1.075053334236145, + "learning_rate": 0.00014141733640523154, + "loss": 1.9008, + "step": 23697 + }, + { + "epoch": 2.277996731712006, + "grad_norm": 1.1095855236053467, + "learning_rate": 0.0001414049078711261, + "loss": 1.9398, + "step": 23698 + }, + { + "epoch": 2.2780928578294724, + "grad_norm": 1.0068455934524536, + "learning_rate": 0.00014139247940806119, + "loss": 1.8211, + "step": 23699 + }, + { + "epoch": 2.2781889839469383, + "grad_norm": 1.0528463125228882, + "learning_rate": 0.00014138005101612433, + "loss": 1.8632, + "step": 23700 + }, + { + "epoch": 2.2782851100644046, + "grad_norm": 1.0120493173599243, + "learning_rate": 0.00014136762269540293, + "loss": 1.9069, + "step": 23701 + }, + { + "epoch": 2.2783812361818705, + "grad_norm": 1.265351414680481, + "learning_rate": 0.0001413551944459845, + "loss": 2.1114, + "step": 23702 + }, + { + "epoch": 2.278477362299337, + "grad_norm": 1.1571686267852783, + "learning_rate": 0.00014134276626795648, + "loss": 2.0934, + "step": 23703 + }, + { + "epoch": 2.2785734884168027, + "grad_norm": 1.0023884773254395, + "learning_rate": 0.0001413303381614063, + "loss": 1.7807, + "step": 23704 + }, + { + "epoch": 2.278669614534269, + "grad_norm": 1.1797747611999512, + "learning_rate": 0.00014131791012642145, + "loss": 1.7252, + "step": 23705 + }, + { + "epoch": 2.278765740651735, + "grad_norm": 1.1358897686004639, + "learning_rate": 0.0001413054821630894, + "loss": 1.8442, + "step": 23706 + }, + { + "epoch": 2.2788618667692013, + "grad_norm": 1.0603467226028442, + "learning_rate": 0.00014129305427149752, + "loss": 2.133, + "step": 23707 + }, + { + "epoch": 2.2789579928866672, + "grad_norm": 1.2492069005966187, + "learning_rate": 0.0001412806264517334, + "loss": 1.9809, + "step": 23708 + }, + { + "epoch": 2.2790541190041336, + "grad_norm": 1.2341797351837158, + "learning_rate": 0.00014126819870388438, + "loss": 2.1056, + "step": 23709 + }, + { + "epoch": 2.2791502451215995, + "grad_norm": 1.155229091644287, + "learning_rate": 0.00014125577102803798, + "loss": 1.9303, + "step": 23710 + }, + { + "epoch": 2.279246371239066, + "grad_norm": 1.1706788539886475, + "learning_rate": 0.00014124334342428162, + "loss": 2.1024, + "step": 23711 + }, + { + "epoch": 2.2793424973565317, + "grad_norm": 1.1048864126205444, + "learning_rate": 0.0001412309158927028, + "loss": 1.9648, + "step": 23712 + }, + { + "epoch": 2.279438623473998, + "grad_norm": 1.1105254888534546, + "learning_rate": 0.00014121848843338895, + "loss": 1.9488, + "step": 23713 + }, + { + "epoch": 2.279534749591464, + "grad_norm": 1.3991022109985352, + "learning_rate": 0.00014120606104642752, + "loss": 2.0851, + "step": 23714 + }, + { + "epoch": 2.2796308757089303, + "grad_norm": 1.3049328327178955, + "learning_rate": 0.00014119363373190594, + "loss": 2.0885, + "step": 23715 + }, + { + "epoch": 2.279727001826396, + "grad_norm": 1.0516383647918701, + "learning_rate": 0.00014118120648991167, + "loss": 1.8648, + "step": 23716 + }, + { + "epoch": 2.279823127943862, + "grad_norm": 1.1438844203948975, + "learning_rate": 0.00014116877932053223, + "loss": 1.9767, + "step": 23717 + }, + { + "epoch": 2.2799192540613284, + "grad_norm": 1.06964111328125, + "learning_rate": 0.00014115635222385498, + "loss": 1.945, + "step": 23718 + }, + { + "epoch": 2.280015380178795, + "grad_norm": 1.1728019714355469, + "learning_rate": 0.00014114392519996746, + "loss": 2.0168, + "step": 23719 + }, + { + "epoch": 2.2801115062962607, + "grad_norm": 1.0772716999053955, + "learning_rate": 0.00014113149824895703, + "loss": 1.8847, + "step": 23720 + }, + { + "epoch": 2.2802076324137266, + "grad_norm": 1.1608086824417114, + "learning_rate": 0.0001411190713709112, + "loss": 1.9922, + "step": 23721 + }, + { + "epoch": 2.280303758531193, + "grad_norm": 1.2023537158966064, + "learning_rate": 0.0001411066445659174, + "loss": 2.0362, + "step": 23722 + }, + { + "epoch": 2.2803998846486593, + "grad_norm": 1.1784942150115967, + "learning_rate": 0.0001410942178340631, + "loss": 2.0699, + "step": 23723 + }, + { + "epoch": 2.280496010766125, + "grad_norm": 1.1068321466445923, + "learning_rate": 0.00014108179117543573, + "loss": 2.0047, + "step": 23724 + }, + { + "epoch": 2.280592136883591, + "grad_norm": 1.1409430503845215, + "learning_rate": 0.0001410693645901227, + "loss": 1.8906, + "step": 23725 + }, + { + "epoch": 2.2806882630010574, + "grad_norm": 1.0923864841461182, + "learning_rate": 0.00014105693807821154, + "loss": 1.9491, + "step": 23726 + }, + { + "epoch": 2.2807843891185233, + "grad_norm": 1.1304409503936768, + "learning_rate": 0.00014104451163978964, + "loss": 1.9262, + "step": 23727 + }, + { + "epoch": 2.2808805152359897, + "grad_norm": 1.1190820932388306, + "learning_rate": 0.00014103208527494444, + "loss": 1.9268, + "step": 23728 + }, + { + "epoch": 2.2809766413534556, + "grad_norm": 1.244009017944336, + "learning_rate": 0.00014101965898376343, + "loss": 2.0002, + "step": 23729 + }, + { + "epoch": 2.281072767470922, + "grad_norm": 1.1560511589050293, + "learning_rate": 0.00014100723276633403, + "loss": 2.0499, + "step": 23730 + }, + { + "epoch": 2.281168893588388, + "grad_norm": 1.0373082160949707, + "learning_rate": 0.00014099480662274367, + "loss": 1.992, + "step": 23731 + }, + { + "epoch": 2.281265019705854, + "grad_norm": 1.1707464456558228, + "learning_rate": 0.00014098238055307983, + "loss": 2.1679, + "step": 23732 + }, + { + "epoch": 2.28136114582332, + "grad_norm": 1.05441415309906, + "learning_rate": 0.00014096995455742988, + "loss": 2.0559, + "step": 23733 + }, + { + "epoch": 2.2814572719407864, + "grad_norm": 1.1156350374221802, + "learning_rate": 0.00014095752863588137, + "loss": 2.0685, + "step": 23734 + }, + { + "epoch": 2.2815533980582523, + "grad_norm": 1.1521626710891724, + "learning_rate": 0.00014094510278852169, + "loss": 1.9256, + "step": 23735 + }, + { + "epoch": 2.2816495241757186, + "grad_norm": 1.1010618209838867, + "learning_rate": 0.00014093267701543828, + "loss": 2.0859, + "step": 23736 + }, + { + "epoch": 2.2817456502931845, + "grad_norm": 1.0196983814239502, + "learning_rate": 0.0001409202513167186, + "loss": 1.8694, + "step": 23737 + }, + { + "epoch": 2.281841776410651, + "grad_norm": 1.1340124607086182, + "learning_rate": 0.00014090782569245005, + "loss": 2.0925, + "step": 23738 + }, + { + "epoch": 2.281937902528117, + "grad_norm": 1.112919569015503, + "learning_rate": 0.0001408954001427201, + "loss": 1.8487, + "step": 23739 + }, + { + "epoch": 2.282034028645583, + "grad_norm": 1.1084991693496704, + "learning_rate": 0.00014088297466761623, + "loss": 1.8637, + "step": 23740 + }, + { + "epoch": 2.282130154763049, + "grad_norm": 1.2284483909606934, + "learning_rate": 0.0001408705492672258, + "loss": 2.1094, + "step": 23741 + }, + { + "epoch": 2.2822262808805154, + "grad_norm": 1.3125154972076416, + "learning_rate": 0.0001408581239416363, + "loss": 2.1947, + "step": 23742 + }, + { + "epoch": 2.2823224069979813, + "grad_norm": 1.063886284828186, + "learning_rate": 0.00014084569869093514, + "loss": 2.2822, + "step": 23743 + }, + { + "epoch": 2.2824185331154476, + "grad_norm": 1.084089756011963, + "learning_rate": 0.0001408332735152098, + "loss": 1.954, + "step": 23744 + }, + { + "epoch": 2.2825146592329135, + "grad_norm": 1.144089937210083, + "learning_rate": 0.00014082084841454768, + "loss": 1.9148, + "step": 23745 + }, + { + "epoch": 2.28261078535038, + "grad_norm": 1.161445140838623, + "learning_rate": 0.00014080842338903623, + "loss": 2.1391, + "step": 23746 + }, + { + "epoch": 2.2827069114678458, + "grad_norm": 1.1150480508804321, + "learning_rate": 0.00014079599843876288, + "loss": 2.1837, + "step": 23747 + }, + { + "epoch": 2.282803037585312, + "grad_norm": 1.0818248987197876, + "learning_rate": 0.00014078357356381508, + "loss": 2.0021, + "step": 23748 + }, + { + "epoch": 2.282899163702778, + "grad_norm": 1.1080617904663086, + "learning_rate": 0.00014077114876428026, + "loss": 1.9865, + "step": 23749 + }, + { + "epoch": 2.282995289820244, + "grad_norm": 1.0732449293136597, + "learning_rate": 0.0001407587240402458, + "loss": 2.0155, + "step": 23750 + }, + { + "epoch": 2.2830914159377103, + "grad_norm": 1.0771344900131226, + "learning_rate": 0.00014074629939179925, + "loss": 1.9736, + "step": 23751 + }, + { + "epoch": 2.2831875420551766, + "grad_norm": 1.1710243225097656, + "learning_rate": 0.00014073387481902794, + "loss": 2.075, + "step": 23752 + }, + { + "epoch": 2.2832836681726425, + "grad_norm": 1.0862113237380981, + "learning_rate": 0.00014072145032201938, + "loss": 2.031, + "step": 23753 + }, + { + "epoch": 2.2833797942901084, + "grad_norm": 1.1243815422058105, + "learning_rate": 0.00014070902590086103, + "loss": 2.0222, + "step": 23754 + }, + { + "epoch": 2.2834759204075747, + "grad_norm": 1.0267329216003418, + "learning_rate": 0.00014069660155564018, + "loss": 1.9476, + "step": 23755 + }, + { + "epoch": 2.283572046525041, + "grad_norm": 1.105844497680664, + "learning_rate": 0.00014068417728644434, + "loss": 2.0292, + "step": 23756 + }, + { + "epoch": 2.283668172642507, + "grad_norm": 1.1239038705825806, + "learning_rate": 0.00014067175309336096, + "loss": 2.0532, + "step": 23757 + }, + { + "epoch": 2.283764298759973, + "grad_norm": 1.16340172290802, + "learning_rate": 0.00014065932897647744, + "loss": 2.0566, + "step": 23758 + }, + { + "epoch": 2.2838604248774392, + "grad_norm": 1.288764238357544, + "learning_rate": 0.00014064690493588122, + "loss": 1.9784, + "step": 23759 + }, + { + "epoch": 2.283956550994905, + "grad_norm": 1.109330177307129, + "learning_rate": 0.0001406344809716597, + "loss": 1.9721, + "step": 23760 + }, + { + "epoch": 2.2840526771123715, + "grad_norm": 1.4667471647262573, + "learning_rate": 0.0001406220570839004, + "loss": 2.0285, + "step": 23761 + }, + { + "epoch": 2.2841488032298374, + "grad_norm": 1.0958033800125122, + "learning_rate": 0.0001406096332726907, + "loss": 1.9546, + "step": 23762 + }, + { + "epoch": 2.2842449293473037, + "grad_norm": 0.991194486618042, + "learning_rate": 0.00014059720953811799, + "loss": 1.9154, + "step": 23763 + }, + { + "epoch": 2.2843410554647696, + "grad_norm": 1.2435814142227173, + "learning_rate": 0.0001405847858802697, + "loss": 1.8243, + "step": 23764 + }, + { + "epoch": 2.284437181582236, + "grad_norm": 1.168515920639038, + "learning_rate": 0.00014057236229923333, + "loss": 1.9548, + "step": 23765 + }, + { + "epoch": 2.284533307699702, + "grad_norm": 1.192299723625183, + "learning_rate": 0.00014055993879509625, + "loss": 2.088, + "step": 23766 + }, + { + "epoch": 2.284629433817168, + "grad_norm": 1.2051911354064941, + "learning_rate": 0.0001405475153679459, + "loss": 1.9942, + "step": 23767 + }, + { + "epoch": 2.284725559934634, + "grad_norm": 1.2082864046096802, + "learning_rate": 0.00014053509201786968, + "loss": 2.0775, + "step": 23768 + }, + { + "epoch": 2.2848216860521005, + "grad_norm": 1.3165826797485352, + "learning_rate": 0.00014052266874495505, + "loss": 2.0282, + "step": 23769 + }, + { + "epoch": 2.2849178121695664, + "grad_norm": 1.2149765491485596, + "learning_rate": 0.00014051024554928945, + "loss": 1.9793, + "step": 23770 + }, + { + "epoch": 2.2850139382870327, + "grad_norm": 1.2954845428466797, + "learning_rate": 0.00014049782243096023, + "loss": 2.0281, + "step": 23771 + }, + { + "epoch": 2.2851100644044986, + "grad_norm": 1.126232385635376, + "learning_rate": 0.00014048539939005484, + "loss": 2.0, + "step": 23772 + }, + { + "epoch": 2.285206190521965, + "grad_norm": 1.2636793851852417, + "learning_rate": 0.00014047297642666076, + "loss": 2.1638, + "step": 23773 + }, + { + "epoch": 2.285302316639431, + "grad_norm": 1.1010723114013672, + "learning_rate": 0.00014046055354086537, + "loss": 2.0226, + "step": 23774 + }, + { + "epoch": 2.285398442756897, + "grad_norm": 1.0833361148834229, + "learning_rate": 0.0001404481307327561, + "loss": 1.9737, + "step": 23775 + }, + { + "epoch": 2.285494568874363, + "grad_norm": 1.234395146369934, + "learning_rate": 0.00014043570800242032, + "loss": 2.001, + "step": 23776 + }, + { + "epoch": 2.2855906949918294, + "grad_norm": 1.2576738595962524, + "learning_rate": 0.00014042328534994553, + "loss": 2.0183, + "step": 23777 + }, + { + "epoch": 2.2856868211092953, + "grad_norm": 0.9549108147621155, + "learning_rate": 0.0001404108627754191, + "loss": 1.8667, + "step": 23778 + }, + { + "epoch": 2.2857829472267617, + "grad_norm": 1.0114691257476807, + "learning_rate": 0.0001403984402789285, + "loss": 1.9456, + "step": 23779 + }, + { + "epoch": 2.2858790733442276, + "grad_norm": 1.1717565059661865, + "learning_rate": 0.00014038601786056111, + "loss": 2.0797, + "step": 23780 + }, + { + "epoch": 2.285975199461694, + "grad_norm": 1.238800048828125, + "learning_rate": 0.00014037359552040434, + "loss": 1.8166, + "step": 23781 + }, + { + "epoch": 2.28607132557916, + "grad_norm": 1.1540242433547974, + "learning_rate": 0.00014036117325854562, + "loss": 1.8176, + "step": 23782 + }, + { + "epoch": 2.286167451696626, + "grad_norm": 1.104191541671753, + "learning_rate": 0.00014034875107507235, + "loss": 2.048, + "step": 23783 + }, + { + "epoch": 2.286263577814092, + "grad_norm": 1.0797052383422852, + "learning_rate": 0.000140336328970072, + "loss": 1.9765, + "step": 23784 + }, + { + "epoch": 2.2863597039315584, + "grad_norm": 1.1240335702896118, + "learning_rate": 0.0001403239069436319, + "loss": 1.8482, + "step": 23785 + }, + { + "epoch": 2.2864558300490243, + "grad_norm": 1.2212228775024414, + "learning_rate": 0.00014031148499583956, + "loss": 1.9514, + "step": 23786 + }, + { + "epoch": 2.28655195616649, + "grad_norm": 1.0660271644592285, + "learning_rate": 0.00014029906312678235, + "loss": 1.766, + "step": 23787 + }, + { + "epoch": 2.2866480822839566, + "grad_norm": 1.0052272081375122, + "learning_rate": 0.00014028664133654766, + "loss": 1.8924, + "step": 23788 + }, + { + "epoch": 2.286744208401423, + "grad_norm": 1.2394123077392578, + "learning_rate": 0.00014027421962522292, + "loss": 2.0849, + "step": 23789 + }, + { + "epoch": 2.286840334518889, + "grad_norm": 1.0410181283950806, + "learning_rate": 0.0001402617979928956, + "loss": 1.9478, + "step": 23790 + }, + { + "epoch": 2.2869364606363547, + "grad_norm": 1.3420655727386475, + "learning_rate": 0.000140249376439653, + "loss": 2.012, + "step": 23791 + }, + { + "epoch": 2.287032586753821, + "grad_norm": 1.3925443887710571, + "learning_rate": 0.00014023695496558262, + "loss": 1.9126, + "step": 23792 + }, + { + "epoch": 2.287128712871287, + "grad_norm": 1.0486760139465332, + "learning_rate": 0.00014022453357077184, + "loss": 1.8903, + "step": 23793 + }, + { + "epoch": 2.2872248389887533, + "grad_norm": 1.2145861387252808, + "learning_rate": 0.0001402121122553081, + "loss": 2.0738, + "step": 23794 + }, + { + "epoch": 2.287320965106219, + "grad_norm": 1.207242488861084, + "learning_rate": 0.0001401996910192788, + "loss": 2.017, + "step": 23795 + }, + { + "epoch": 2.2874170912236855, + "grad_norm": 1.1830157041549683, + "learning_rate": 0.0001401872698627713, + "loss": 1.9726, + "step": 23796 + }, + { + "epoch": 2.2875132173411514, + "grad_norm": 1.0637720823287964, + "learning_rate": 0.0001401748487858731, + "loss": 1.8457, + "step": 23797 + }, + { + "epoch": 2.2876093434586178, + "grad_norm": 1.2667545080184937, + "learning_rate": 0.00014016242778867153, + "loss": 2.0096, + "step": 23798 + }, + { + "epoch": 2.2877054695760837, + "grad_norm": 1.172438144683838, + "learning_rate": 0.00014015000687125402, + "loss": 1.8929, + "step": 23799 + }, + { + "epoch": 2.28780159569355, + "grad_norm": 1.2172757387161255, + "learning_rate": 0.00014013758603370797, + "loss": 2.0344, + "step": 23800 + }, + { + "epoch": 2.287897721811016, + "grad_norm": 1.1957749128341675, + "learning_rate": 0.00014012516527612084, + "loss": 1.9428, + "step": 23801 + }, + { + "epoch": 2.2879938479284823, + "grad_norm": 1.263360619544983, + "learning_rate": 0.00014011274459858, + "loss": 1.9329, + "step": 23802 + }, + { + "epoch": 2.288089974045948, + "grad_norm": 1.0607311725616455, + "learning_rate": 0.00014010032400117283, + "loss": 1.9151, + "step": 23803 + }, + { + "epoch": 2.2881861001634145, + "grad_norm": 1.1227926015853882, + "learning_rate": 0.00014008790348398676, + "loss": 1.8721, + "step": 23804 + }, + { + "epoch": 2.2882822262808804, + "grad_norm": 1.0913397073745728, + "learning_rate": 0.0001400754830471092, + "loss": 1.8029, + "step": 23805 + }, + { + "epoch": 2.2883783523983467, + "grad_norm": 1.0504870414733887, + "learning_rate": 0.00014006306269062754, + "loss": 1.831, + "step": 23806 + }, + { + "epoch": 2.2884744785158126, + "grad_norm": 1.0743727684020996, + "learning_rate": 0.0001400506424146292, + "loss": 1.9426, + "step": 23807 + }, + { + "epoch": 2.288570604633279, + "grad_norm": 1.3041492700576782, + "learning_rate": 0.0001400382222192016, + "loss": 2.202, + "step": 23808 + }, + { + "epoch": 2.288666730750745, + "grad_norm": 1.1951814889907837, + "learning_rate": 0.00014002580210443208, + "loss": 2.0114, + "step": 23809 + }, + { + "epoch": 2.2887628568682112, + "grad_norm": 1.2306026220321655, + "learning_rate": 0.00014001338207040808, + "loss": 1.8808, + "step": 23810 + }, + { + "epoch": 2.288858982985677, + "grad_norm": 1.2435650825500488, + "learning_rate": 0.00014000096211721702, + "loss": 2.0401, + "step": 23811 + }, + { + "epoch": 2.2889551091031435, + "grad_norm": 1.1750677824020386, + "learning_rate": 0.00013998854224494625, + "loss": 1.9767, + "step": 23812 + }, + { + "epoch": 2.2890512352206094, + "grad_norm": 1.1116341352462769, + "learning_rate": 0.00013997612245368325, + "loss": 1.9981, + "step": 23813 + }, + { + "epoch": 2.2891473613380757, + "grad_norm": 1.2081928253173828, + "learning_rate": 0.00013996370274351537, + "loss": 2.1297, + "step": 23814 + }, + { + "epoch": 2.2892434874555416, + "grad_norm": 1.261322021484375, + "learning_rate": 0.00013995128311452998, + "loss": 2.0535, + "step": 23815 + }, + { + "epoch": 2.289339613573008, + "grad_norm": 1.1405925750732422, + "learning_rate": 0.00013993886356681453, + "loss": 1.7735, + "step": 23816 + }, + { + "epoch": 2.289435739690474, + "grad_norm": 1.1871604919433594, + "learning_rate": 0.0001399264441004564, + "loss": 1.9568, + "step": 23817 + }, + { + "epoch": 2.28953186580794, + "grad_norm": 1.18953537940979, + "learning_rate": 0.000139914024715543, + "loss": 2.0276, + "step": 23818 + }, + { + "epoch": 2.289627991925406, + "grad_norm": 1.1670280694961548, + "learning_rate": 0.00013990160541216167, + "loss": 1.9666, + "step": 23819 + }, + { + "epoch": 2.289724118042872, + "grad_norm": 1.1227391958236694, + "learning_rate": 0.0001398891861903999, + "loss": 1.9565, + "step": 23820 + }, + { + "epoch": 2.2898202441603384, + "grad_norm": 1.1898764371871948, + "learning_rate": 0.000139876767050345, + "loss": 1.8339, + "step": 23821 + }, + { + "epoch": 2.2899163702778047, + "grad_norm": 1.2154762744903564, + "learning_rate": 0.0001398643479920844, + "loss": 1.9202, + "step": 23822 + }, + { + "epoch": 2.2900124963952706, + "grad_norm": 1.1708316802978516, + "learning_rate": 0.0001398519290157055, + "loss": 1.9345, + "step": 23823 + }, + { + "epoch": 2.2901086225127365, + "grad_norm": 1.2317148447036743, + "learning_rate": 0.0001398395101212957, + "loss": 1.9651, + "step": 23824 + }, + { + "epoch": 2.290204748630203, + "grad_norm": 0.9999919533729553, + "learning_rate": 0.0001398270913089424, + "loss": 1.9948, + "step": 23825 + }, + { + "epoch": 2.290300874747669, + "grad_norm": 1.1222578287124634, + "learning_rate": 0.00013981467257873296, + "loss": 1.9925, + "step": 23826 + }, + { + "epoch": 2.290397000865135, + "grad_norm": 1.2794009447097778, + "learning_rate": 0.00013980225393075477, + "loss": 2.0136, + "step": 23827 + }, + { + "epoch": 2.290493126982601, + "grad_norm": 1.1561733484268188, + "learning_rate": 0.0001397898353650952, + "loss": 2.0162, + "step": 23828 + }, + { + "epoch": 2.2905892531000673, + "grad_norm": 1.2682217359542847, + "learning_rate": 0.00013977741688184176, + "loss": 1.9835, + "step": 23829 + }, + { + "epoch": 2.2906853792175332, + "grad_norm": 0.989132285118103, + "learning_rate": 0.00013976499848108174, + "loss": 1.8766, + "step": 23830 + }, + { + "epoch": 2.2907815053349996, + "grad_norm": 1.1477566957473755, + "learning_rate": 0.00013975258016290254, + "loss": 2.1935, + "step": 23831 + }, + { + "epoch": 2.2908776314524655, + "grad_norm": 1.2263023853302002, + "learning_rate": 0.00013974016192739154, + "loss": 1.9296, + "step": 23832 + }, + { + "epoch": 2.290973757569932, + "grad_norm": 0.9382872581481934, + "learning_rate": 0.0001397277437746362, + "loss": 1.8593, + "step": 23833 + }, + { + "epoch": 2.2910698836873977, + "grad_norm": 1.360666275024414, + "learning_rate": 0.00013971532570472382, + "loss": 2.0171, + "step": 23834 + }, + { + "epoch": 2.291166009804864, + "grad_norm": 1.2101589441299438, + "learning_rate": 0.00013970290771774186, + "loss": 2.0747, + "step": 23835 + }, + { + "epoch": 2.29126213592233, + "grad_norm": 1.118517279624939, + "learning_rate": 0.00013969048981377766, + "loss": 1.9138, + "step": 23836 + }, + { + "epoch": 2.2913582620397963, + "grad_norm": 1.0345158576965332, + "learning_rate": 0.0001396780719929186, + "loss": 1.9869, + "step": 23837 + }, + { + "epoch": 2.291454388157262, + "grad_norm": 1.0729275941848755, + "learning_rate": 0.0001396656542552521, + "loss": 1.9347, + "step": 23838 + }, + { + "epoch": 2.2915505142747286, + "grad_norm": 1.2243707180023193, + "learning_rate": 0.00013965323660086553, + "loss": 2.0534, + "step": 23839 + }, + { + "epoch": 2.2916466403921945, + "grad_norm": 1.1352697610855103, + "learning_rate": 0.00013964081902984626, + "loss": 2.2053, + "step": 23840 + }, + { + "epoch": 2.291742766509661, + "grad_norm": 1.0340452194213867, + "learning_rate": 0.0001396284015422817, + "loss": 1.8833, + "step": 23841 + }, + { + "epoch": 2.2918388926271267, + "grad_norm": 1.1442344188690186, + "learning_rate": 0.00013961598413825925, + "loss": 2.0581, + "step": 23842 + }, + { + "epoch": 2.291935018744593, + "grad_norm": 1.070075511932373, + "learning_rate": 0.00013960356681786624, + "loss": 2.0764, + "step": 23843 + }, + { + "epoch": 2.292031144862059, + "grad_norm": 1.2329421043395996, + "learning_rate": 0.00013959114958119007, + "loss": 2.0194, + "step": 23844 + }, + { + "epoch": 2.2921272709795253, + "grad_norm": 1.1531773805618286, + "learning_rate": 0.00013957873242831812, + "loss": 1.9453, + "step": 23845 + }, + { + "epoch": 2.292223397096991, + "grad_norm": 1.087643027305603, + "learning_rate": 0.00013956631535933778, + "loss": 1.9746, + "step": 23846 + }, + { + "epoch": 2.2923195232144575, + "grad_norm": 1.1109894514083862, + "learning_rate": 0.00013955389837433645, + "loss": 1.9764, + "step": 23847 + }, + { + "epoch": 2.2924156493319234, + "grad_norm": 1.0703039169311523, + "learning_rate": 0.0001395414814734015, + "loss": 2.1903, + "step": 23848 + }, + { + "epoch": 2.2925117754493898, + "grad_norm": 1.0425097942352295, + "learning_rate": 0.0001395290646566203, + "loss": 1.9837, + "step": 23849 + }, + { + "epoch": 2.2926079015668557, + "grad_norm": 1.0151536464691162, + "learning_rate": 0.00013951664792408023, + "loss": 1.9466, + "step": 23850 + }, + { + "epoch": 2.292704027684322, + "grad_norm": 1.0587972402572632, + "learning_rate": 0.00013950423127586867, + "loss": 2.0986, + "step": 23851 + }, + { + "epoch": 2.292800153801788, + "grad_norm": 1.1834964752197266, + "learning_rate": 0.000139491814712073, + "loss": 2.1355, + "step": 23852 + }, + { + "epoch": 2.292896279919254, + "grad_norm": 0.968754768371582, + "learning_rate": 0.0001394793982327806, + "loss": 1.9043, + "step": 23853 + }, + { + "epoch": 2.29299240603672, + "grad_norm": 1.1849273443222046, + "learning_rate": 0.00013946698183807884, + "loss": 1.733, + "step": 23854 + }, + { + "epoch": 2.2930885321541865, + "grad_norm": 1.0627576112747192, + "learning_rate": 0.00013945456552805506, + "loss": 1.9598, + "step": 23855 + }, + { + "epoch": 2.2931846582716524, + "grad_norm": 1.0437184572219849, + "learning_rate": 0.0001394421493027967, + "loss": 1.9051, + "step": 23856 + }, + { + "epoch": 2.2932807843891183, + "grad_norm": 1.0250076055526733, + "learning_rate": 0.0001394297331623911, + "loss": 1.769, + "step": 23857 + }, + { + "epoch": 2.2933769105065847, + "grad_norm": 1.278196930885315, + "learning_rate": 0.00013941731710692562, + "loss": 2.0169, + "step": 23858 + }, + { + "epoch": 2.293473036624051, + "grad_norm": 1.0282633304595947, + "learning_rate": 0.00013940490113648768, + "loss": 2.0509, + "step": 23859 + }, + { + "epoch": 2.293569162741517, + "grad_norm": 1.231232762336731, + "learning_rate": 0.00013939248525116462, + "loss": 2.0962, + "step": 23860 + }, + { + "epoch": 2.293665288858983, + "grad_norm": 1.2436010837554932, + "learning_rate": 0.00013938006945104383, + "loss": 2.0936, + "step": 23861 + }, + { + "epoch": 2.293761414976449, + "grad_norm": 1.2012519836425781, + "learning_rate": 0.00013936765373621262, + "loss": 2.075, + "step": 23862 + }, + { + "epoch": 2.293857541093915, + "grad_norm": 1.189548373222351, + "learning_rate": 0.00013935523810675845, + "loss": 1.9264, + "step": 23863 + }, + { + "epoch": 2.2939536672113814, + "grad_norm": 1.3039336204528809, + "learning_rate": 0.00013934282256276867, + "loss": 2.0332, + "step": 23864 + }, + { + "epoch": 2.2940497933288473, + "grad_norm": 1.1349225044250488, + "learning_rate": 0.0001393304071043306, + "loss": 2.0145, + "step": 23865 + }, + { + "epoch": 2.2941459194463136, + "grad_norm": 1.2455495595932007, + "learning_rate": 0.00013931799173153166, + "loss": 2.1427, + "step": 23866 + }, + { + "epoch": 2.2942420455637795, + "grad_norm": 1.1341121196746826, + "learning_rate": 0.0001393055764444592, + "loss": 2.0293, + "step": 23867 + }, + { + "epoch": 2.294338171681246, + "grad_norm": 1.201271653175354, + "learning_rate": 0.00013929316124320056, + "loss": 1.9935, + "step": 23868 + }, + { + "epoch": 2.2944342977987118, + "grad_norm": 1.2823567390441895, + "learning_rate": 0.00013928074612784317, + "loss": 2.1466, + "step": 23869 + }, + { + "epoch": 2.294530423916178, + "grad_norm": 1.2255480289459229, + "learning_rate": 0.00013926833109847436, + "loss": 1.9789, + "step": 23870 + }, + { + "epoch": 2.294626550033644, + "grad_norm": 1.1964763402938843, + "learning_rate": 0.00013925591615518147, + "loss": 1.9388, + "step": 23871 + }, + { + "epoch": 2.2947226761511104, + "grad_norm": 1.2649043798446655, + "learning_rate": 0.00013924350129805192, + "loss": 1.8177, + "step": 23872 + }, + { + "epoch": 2.2948188022685763, + "grad_norm": 1.4406960010528564, + "learning_rate": 0.00013923108652717306, + "loss": 2.1194, + "step": 23873 + }, + { + "epoch": 2.2949149283860426, + "grad_norm": 1.1923669576644897, + "learning_rate": 0.0001392186718426322, + "loss": 1.8121, + "step": 23874 + }, + { + "epoch": 2.2950110545035085, + "grad_norm": 1.0888820886611938, + "learning_rate": 0.0001392062572445168, + "loss": 1.9369, + "step": 23875 + }, + { + "epoch": 2.295107180620975, + "grad_norm": 1.0721900463104248, + "learning_rate": 0.00013919384273291413, + "loss": 2.0348, + "step": 23876 + }, + { + "epoch": 2.2952033067384408, + "grad_norm": 1.1178886890411377, + "learning_rate": 0.00013918142830791162, + "loss": 1.9269, + "step": 23877 + }, + { + "epoch": 2.295299432855907, + "grad_norm": 1.1739741563796997, + "learning_rate": 0.00013916901396959658, + "loss": 1.9062, + "step": 23878 + }, + { + "epoch": 2.295395558973373, + "grad_norm": 1.121730923652649, + "learning_rate": 0.0001391565997180564, + "loss": 2.0578, + "step": 23879 + }, + { + "epoch": 2.2954916850908393, + "grad_norm": 1.180216908454895, + "learning_rate": 0.00013914418555337844, + "loss": 1.9757, + "step": 23880 + }, + { + "epoch": 2.2955878112083052, + "grad_norm": 1.1394709348678589, + "learning_rate": 0.00013913177147565007, + "loss": 2.039, + "step": 23881 + }, + { + "epoch": 2.2956839373257716, + "grad_norm": 1.1675770282745361, + "learning_rate": 0.00013911935748495862, + "loss": 1.8726, + "step": 23882 + }, + { + "epoch": 2.2957800634432375, + "grad_norm": 1.0396804809570312, + "learning_rate": 0.0001391069435813915, + "loss": 2.0023, + "step": 23883 + }, + { + "epoch": 2.295876189560704, + "grad_norm": 1.244295358657837, + "learning_rate": 0.00013909452976503602, + "loss": 2.0183, + "step": 23884 + }, + { + "epoch": 2.2959723156781697, + "grad_norm": 1.1361453533172607, + "learning_rate": 0.00013908211603597955, + "loss": 2.0149, + "step": 23885 + }, + { + "epoch": 2.2960684417956356, + "grad_norm": 1.1489092111587524, + "learning_rate": 0.00013906970239430946, + "loss": 1.7589, + "step": 23886 + }, + { + "epoch": 2.296164567913102, + "grad_norm": 1.3080646991729736, + "learning_rate": 0.00013905728884011306, + "loss": 2.0202, + "step": 23887 + }, + { + "epoch": 2.2962606940305683, + "grad_norm": 1.2980490922927856, + "learning_rate": 0.00013904487537347778, + "loss": 2.0571, + "step": 23888 + }, + { + "epoch": 2.296356820148034, + "grad_norm": 1.4115897417068481, + "learning_rate": 0.00013903246199449093, + "loss": 1.9828, + "step": 23889 + }, + { + "epoch": 2.2964529462655, + "grad_norm": 1.1328212022781372, + "learning_rate": 0.00013902004870323985, + "loss": 1.9005, + "step": 23890 + }, + { + "epoch": 2.2965490723829665, + "grad_norm": 1.1440033912658691, + "learning_rate": 0.00013900763549981195, + "loss": 1.9877, + "step": 23891 + }, + { + "epoch": 2.296645198500433, + "grad_norm": 1.2198524475097656, + "learning_rate": 0.00013899522238429454, + "loss": 2.2644, + "step": 23892 + }, + { + "epoch": 2.2967413246178987, + "grad_norm": 1.1323891878128052, + "learning_rate": 0.00013898280935677496, + "loss": 1.8022, + "step": 23893 + }, + { + "epoch": 2.2968374507353646, + "grad_norm": 1.383188009262085, + "learning_rate": 0.00013897039641734058, + "loss": 1.9787, + "step": 23894 + }, + { + "epoch": 2.296933576852831, + "grad_norm": 1.2862203121185303, + "learning_rate": 0.0001389579835660788, + "loss": 2.1103, + "step": 23895 + }, + { + "epoch": 2.297029702970297, + "grad_norm": 1.1836806535720825, + "learning_rate": 0.00013894557080307686, + "loss": 2.198, + "step": 23896 + }, + { + "epoch": 2.297125829087763, + "grad_norm": 1.1808884143829346, + "learning_rate": 0.00013893315812842224, + "loss": 1.9524, + "step": 23897 + }, + { + "epoch": 2.297221955205229, + "grad_norm": 1.0849026441574097, + "learning_rate": 0.0001389207455422022, + "loss": 1.9147, + "step": 23898 + }, + { + "epoch": 2.2973180813226954, + "grad_norm": 1.208886981010437, + "learning_rate": 0.00013890833304450412, + "loss": 1.9641, + "step": 23899 + }, + { + "epoch": 2.2974142074401613, + "grad_norm": 1.2683990001678467, + "learning_rate": 0.00013889592063541536, + "loss": 2.1555, + "step": 23900 + }, + { + "epoch": 2.2975103335576277, + "grad_norm": 1.0744876861572266, + "learning_rate": 0.00013888350831502326, + "loss": 1.6772, + "step": 23901 + }, + { + "epoch": 2.2976064596750936, + "grad_norm": 1.1690516471862793, + "learning_rate": 0.00013887109608341512, + "loss": 1.9604, + "step": 23902 + }, + { + "epoch": 2.29770258579256, + "grad_norm": 1.3250313997268677, + "learning_rate": 0.00013885868394067835, + "loss": 2.0165, + "step": 23903 + }, + { + "epoch": 2.297798711910026, + "grad_norm": 1.280163049697876, + "learning_rate": 0.00013884627188690026, + "loss": 2.0144, + "step": 23904 + }, + { + "epoch": 2.297894838027492, + "grad_norm": 1.141581416130066, + "learning_rate": 0.00013883385992216824, + "loss": 2.031, + "step": 23905 + }, + { + "epoch": 2.297990964144958, + "grad_norm": 1.0137996673583984, + "learning_rate": 0.00013882144804656956, + "loss": 1.9994, + "step": 23906 + }, + { + "epoch": 2.2980870902624244, + "grad_norm": 1.1421623229980469, + "learning_rate": 0.00013880903626019165, + "loss": 2.0181, + "step": 23907 + }, + { + "epoch": 2.2981832163798903, + "grad_norm": 1.2069307565689087, + "learning_rate": 0.0001387966245631218, + "loss": 1.9525, + "step": 23908 + }, + { + "epoch": 2.2982793424973567, + "grad_norm": 1.2020394802093506, + "learning_rate": 0.00013878421295544733, + "loss": 2.0275, + "step": 23909 + }, + { + "epoch": 2.2983754686148226, + "grad_norm": 1.2815991640090942, + "learning_rate": 0.00013877180143725566, + "loss": 2.1047, + "step": 23910 + }, + { + "epoch": 2.298471594732289, + "grad_norm": 1.071251392364502, + "learning_rate": 0.00013875939000863405, + "loss": 1.9697, + "step": 23911 + }, + { + "epoch": 2.298567720849755, + "grad_norm": 1.048707127571106, + "learning_rate": 0.00013874697866966988, + "loss": 1.9584, + "step": 23912 + }, + { + "epoch": 2.298663846967221, + "grad_norm": 1.0348340272903442, + "learning_rate": 0.00013873456742045047, + "loss": 1.7219, + "step": 23913 + }, + { + "epoch": 2.298759973084687, + "grad_norm": 0.9994218945503235, + "learning_rate": 0.00013872215626106323, + "loss": 1.6779, + "step": 23914 + }, + { + "epoch": 2.2988560992021534, + "grad_norm": 1.0922585725784302, + "learning_rate": 0.0001387097451915954, + "loss": 2.0803, + "step": 23915 + }, + { + "epoch": 2.2989522253196193, + "grad_norm": 1.206067681312561, + "learning_rate": 0.00013869733421213438, + "loss": 2.0228, + "step": 23916 + }, + { + "epoch": 2.2990483514370856, + "grad_norm": 1.3142927885055542, + "learning_rate": 0.00013868492332276753, + "loss": 2.0251, + "step": 23917 + }, + { + "epoch": 2.2991444775545515, + "grad_norm": 1.1520402431488037, + "learning_rate": 0.00013867251252358212, + "loss": 2.0101, + "step": 23918 + }, + { + "epoch": 2.299240603672018, + "grad_norm": 1.1686904430389404, + "learning_rate": 0.0001386601018146655, + "loss": 2.1807, + "step": 23919 + }, + { + "epoch": 2.299336729789484, + "grad_norm": 1.2598819732666016, + "learning_rate": 0.00013864769119610506, + "loss": 2.113, + "step": 23920 + }, + { + "epoch": 2.29943285590695, + "grad_norm": 1.0114972591400146, + "learning_rate": 0.00013863528066798808, + "loss": 1.9209, + "step": 23921 + }, + { + "epoch": 2.299528982024416, + "grad_norm": 1.029435634613037, + "learning_rate": 0.0001386228702304019, + "loss": 1.8579, + "step": 23922 + }, + { + "epoch": 2.299625108141882, + "grad_norm": 1.196059226989746, + "learning_rate": 0.00013861045988343388, + "loss": 1.8912, + "step": 23923 + }, + { + "epoch": 2.2997212342593483, + "grad_norm": 1.27267324924469, + "learning_rate": 0.00013859804962717133, + "loss": 1.9915, + "step": 23924 + }, + { + "epoch": 2.2998173603768146, + "grad_norm": 1.1116130352020264, + "learning_rate": 0.0001385856394617016, + "loss": 2.0824, + "step": 23925 + }, + { + "epoch": 2.2999134864942805, + "grad_norm": 1.1661815643310547, + "learning_rate": 0.000138573229387112, + "loss": 2.0834, + "step": 23926 + }, + { + "epoch": 2.3000096126117464, + "grad_norm": 1.1149756908416748, + "learning_rate": 0.00013856081940348989, + "loss": 2.0573, + "step": 23927 + }, + { + "epoch": 2.3001057387292128, + "grad_norm": 1.2843421697616577, + "learning_rate": 0.0001385484095109226, + "loss": 2.0366, + "step": 23928 + }, + { + "epoch": 2.3002018648466787, + "grad_norm": 1.1823869943618774, + "learning_rate": 0.0001385359997094974, + "loss": 2.0168, + "step": 23929 + }, + { + "epoch": 2.300297990964145, + "grad_norm": 1.2374156713485718, + "learning_rate": 0.00013852358999930168, + "loss": 1.98, + "step": 23930 + }, + { + "epoch": 2.300394117081611, + "grad_norm": 1.286683201789856, + "learning_rate": 0.00013851118038042277, + "loss": 1.9114, + "step": 23931 + }, + { + "epoch": 2.3004902431990772, + "grad_norm": 1.1155484914779663, + "learning_rate": 0.000138498770852948, + "loss": 2.1273, + "step": 23932 + }, + { + "epoch": 2.300586369316543, + "grad_norm": 1.2071138620376587, + "learning_rate": 0.00013848636141696466, + "loss": 1.9963, + "step": 23933 + }, + { + "epoch": 2.3006824954340095, + "grad_norm": 1.1094733476638794, + "learning_rate": 0.00013847395207256007, + "loss": 1.9272, + "step": 23934 + }, + { + "epoch": 2.3007786215514754, + "grad_norm": 1.180279016494751, + "learning_rate": 0.00013846154281982163, + "loss": 2.0585, + "step": 23935 + }, + { + "epoch": 2.3008747476689417, + "grad_norm": 1.165278434753418, + "learning_rate": 0.00013844913365883662, + "loss": 2.1268, + "step": 23936 + }, + { + "epoch": 2.3009708737864076, + "grad_norm": 1.2118196487426758, + "learning_rate": 0.00013843672458969237, + "loss": 2.0059, + "step": 23937 + }, + { + "epoch": 2.301066999903874, + "grad_norm": 1.203352928161621, + "learning_rate": 0.00013842431561247615, + "loss": 2.1814, + "step": 23938 + }, + { + "epoch": 2.30116312602134, + "grad_norm": 1.1496549844741821, + "learning_rate": 0.0001384119067272754, + "loss": 1.9664, + "step": 23939 + }, + { + "epoch": 2.301259252138806, + "grad_norm": 1.1427178382873535, + "learning_rate": 0.00013839949793417733, + "loss": 1.9475, + "step": 23940 + }, + { + "epoch": 2.301355378256272, + "grad_norm": 1.0983874797821045, + "learning_rate": 0.0001383870892332693, + "loss": 1.9859, + "step": 23941 + }, + { + "epoch": 2.3014515043737385, + "grad_norm": 1.3015174865722656, + "learning_rate": 0.00013837468062463865, + "loss": 2.0588, + "step": 23942 + }, + { + "epoch": 2.3015476304912044, + "grad_norm": 1.6321306228637695, + "learning_rate": 0.0001383622721083727, + "loss": 1.9259, + "step": 23943 + }, + { + "epoch": 2.3016437566086707, + "grad_norm": 1.2023223638534546, + "learning_rate": 0.00013834986368455878, + "loss": 1.924, + "step": 23944 + }, + { + "epoch": 2.3017398827261366, + "grad_norm": 1.0735024213790894, + "learning_rate": 0.00013833745535328415, + "loss": 1.7321, + "step": 23945 + }, + { + "epoch": 2.301836008843603, + "grad_norm": 1.2787801027297974, + "learning_rate": 0.00013832504711463618, + "loss": 2.0199, + "step": 23946 + }, + { + "epoch": 2.301932134961069, + "grad_norm": 1.1989500522613525, + "learning_rate": 0.00013831263896870217, + "loss": 2.0904, + "step": 23947 + }, + { + "epoch": 2.302028261078535, + "grad_norm": 1.1501455307006836, + "learning_rate": 0.00013830023091556946, + "loss": 1.8146, + "step": 23948 + }, + { + "epoch": 2.302124387196001, + "grad_norm": 1.3564871549606323, + "learning_rate": 0.00013828782295532537, + "loss": 2.206, + "step": 23949 + }, + { + "epoch": 2.3022205133134674, + "grad_norm": 1.0407923460006714, + "learning_rate": 0.00013827541508805717, + "loss": 1.9741, + "step": 23950 + }, + { + "epoch": 2.3023166394309333, + "grad_norm": 1.37770414352417, + "learning_rate": 0.00013826300731385223, + "loss": 1.9926, + "step": 23951 + }, + { + "epoch": 2.3024127655483997, + "grad_norm": 1.1118830442428589, + "learning_rate": 0.00013825059963279784, + "loss": 1.8781, + "step": 23952 + }, + { + "epoch": 2.3025088916658656, + "grad_norm": 1.0541914701461792, + "learning_rate": 0.00013823819204498132, + "loss": 2.022, + "step": 23953 + }, + { + "epoch": 2.302605017783332, + "grad_norm": 1.013685703277588, + "learning_rate": 0.00013822578455048996, + "loss": 1.9032, + "step": 23954 + }, + { + "epoch": 2.302701143900798, + "grad_norm": 1.2207636833190918, + "learning_rate": 0.0001382133771494111, + "loss": 2.048, + "step": 23955 + }, + { + "epoch": 2.3027972700182637, + "grad_norm": 1.253151297569275, + "learning_rate": 0.00013820096984183206, + "loss": 1.9022, + "step": 23956 + }, + { + "epoch": 2.30289339613573, + "grad_norm": 1.1000977754592896, + "learning_rate": 0.00013818856262784012, + "loss": 1.9925, + "step": 23957 + }, + { + "epoch": 2.3029895222531964, + "grad_norm": 1.1503164768218994, + "learning_rate": 0.00013817615550752263, + "loss": 1.934, + "step": 23958 + }, + { + "epoch": 2.3030856483706623, + "grad_norm": 1.1159826517105103, + "learning_rate": 0.00013816374848096686, + "loss": 1.8774, + "step": 23959 + }, + { + "epoch": 2.303181774488128, + "grad_norm": 1.1054108142852783, + "learning_rate": 0.00013815134154826015, + "loss": 2.0561, + "step": 23960 + }, + { + "epoch": 2.3032779006055946, + "grad_norm": 1.2354308366775513, + "learning_rate": 0.0001381389347094898, + "loss": 1.9892, + "step": 23961 + }, + { + "epoch": 2.3033740267230605, + "grad_norm": 1.0673984289169312, + "learning_rate": 0.0001381265279647431, + "loss": 1.821, + "step": 23962 + }, + { + "epoch": 2.303470152840527, + "grad_norm": 1.196180820465088, + "learning_rate": 0.00013811412131410739, + "loss": 2.0484, + "step": 23963 + }, + { + "epoch": 2.3035662789579927, + "grad_norm": 1.1637355089187622, + "learning_rate": 0.00013810171475766992, + "loss": 1.7557, + "step": 23964 + }, + { + "epoch": 2.303662405075459, + "grad_norm": 0.9971699714660645, + "learning_rate": 0.0001380893082955181, + "loss": 1.9139, + "step": 23965 + }, + { + "epoch": 2.303758531192925, + "grad_norm": 1.1542673110961914, + "learning_rate": 0.00013807690192773914, + "loss": 2.0713, + "step": 23966 + }, + { + "epoch": 2.3038546573103913, + "grad_norm": 1.2996888160705566, + "learning_rate": 0.0001380644956544204, + "loss": 1.9911, + "step": 23967 + }, + { + "epoch": 2.303950783427857, + "grad_norm": 1.010189175605774, + "learning_rate": 0.00013805208947564917, + "loss": 1.9234, + "step": 23968 + }, + { + "epoch": 2.3040469095453235, + "grad_norm": 1.2534217834472656, + "learning_rate": 0.00013803968339151274, + "loss": 2.0515, + "step": 23969 + }, + { + "epoch": 2.3041430356627894, + "grad_norm": 1.1651972532272339, + "learning_rate": 0.00013802727740209843, + "loss": 2.072, + "step": 23970 + }, + { + "epoch": 2.304239161780256, + "grad_norm": 1.2310268878936768, + "learning_rate": 0.00013801487150749353, + "loss": 1.9256, + "step": 23971 + }, + { + "epoch": 2.3043352878977217, + "grad_norm": 1.2111952304840088, + "learning_rate": 0.00013800246570778535, + "loss": 1.995, + "step": 23972 + }, + { + "epoch": 2.304431414015188, + "grad_norm": 1.1541026830673218, + "learning_rate": 0.00013799006000306117, + "loss": 2.2037, + "step": 23973 + }, + { + "epoch": 2.304527540132654, + "grad_norm": 1.082115650177002, + "learning_rate": 0.00013797765439340831, + "loss": 1.9386, + "step": 23974 + }, + { + "epoch": 2.3046236662501203, + "grad_norm": 1.1768734455108643, + "learning_rate": 0.0001379652488789141, + "loss": 1.914, + "step": 23975 + }, + { + "epoch": 2.304719792367586, + "grad_norm": 1.2208874225616455, + "learning_rate": 0.0001379528434596658, + "loss": 1.9925, + "step": 23976 + }, + { + "epoch": 2.3048159184850525, + "grad_norm": 1.246055245399475, + "learning_rate": 0.0001379404381357507, + "loss": 2.2204, + "step": 23977 + }, + { + "epoch": 2.3049120446025184, + "grad_norm": 1.2241827249526978, + "learning_rate": 0.00013792803290725613, + "loss": 2.0595, + "step": 23978 + }, + { + "epoch": 2.3050081707199848, + "grad_norm": 1.1268372535705566, + "learning_rate": 0.00013791562777426937, + "loss": 1.9858, + "step": 23979 + }, + { + "epoch": 2.3051042968374507, + "grad_norm": 1.1730469465255737, + "learning_rate": 0.00013790322273687773, + "loss": 1.9683, + "step": 23980 + }, + { + "epoch": 2.305200422954917, + "grad_norm": 1.3522480726242065, + "learning_rate": 0.00013789081779516844, + "loss": 2.1656, + "step": 23981 + }, + { + "epoch": 2.305296549072383, + "grad_norm": 1.0684576034545898, + "learning_rate": 0.0001378784129492289, + "loss": 1.9102, + "step": 23982 + }, + { + "epoch": 2.3053926751898492, + "grad_norm": 1.2338751554489136, + "learning_rate": 0.00013786600819914636, + "loss": 2.129, + "step": 23983 + }, + { + "epoch": 2.305488801307315, + "grad_norm": 1.3061883449554443, + "learning_rate": 0.0001378536035450081, + "loss": 2.0193, + "step": 23984 + }, + { + "epoch": 2.3055849274247815, + "grad_norm": 1.2692322731018066, + "learning_rate": 0.0001378411989869014, + "loss": 1.9116, + "step": 23985 + }, + { + "epoch": 2.3056810535422474, + "grad_norm": 1.0990091562271118, + "learning_rate": 0.0001378287945249136, + "loss": 1.9856, + "step": 23986 + }, + { + "epoch": 2.3057771796597137, + "grad_norm": 1.0240957736968994, + "learning_rate": 0.00013781639015913197, + "loss": 1.8748, + "step": 23987 + }, + { + "epoch": 2.3058733057771796, + "grad_norm": 1.1133649349212646, + "learning_rate": 0.0001378039858896438, + "loss": 2.0507, + "step": 23988 + }, + { + "epoch": 2.3059694318946455, + "grad_norm": 1.1767432689666748, + "learning_rate": 0.00013779158171653636, + "loss": 2.0192, + "step": 23989 + }, + { + "epoch": 2.306065558012112, + "grad_norm": 1.2725857496261597, + "learning_rate": 0.00013777917763989697, + "loss": 2.1103, + "step": 23990 + }, + { + "epoch": 2.3061616841295782, + "grad_norm": 1.3245712518692017, + "learning_rate": 0.0001377667736598129, + "loss": 1.9291, + "step": 23991 + }, + { + "epoch": 2.306257810247044, + "grad_norm": 1.2862581014633179, + "learning_rate": 0.00013775436977637147, + "loss": 1.9848, + "step": 23992 + }, + { + "epoch": 2.30635393636451, + "grad_norm": 1.1340289115905762, + "learning_rate": 0.0001377419659896599, + "loss": 1.9883, + "step": 23993 + }, + { + "epoch": 2.3064500624819764, + "grad_norm": 1.1291406154632568, + "learning_rate": 0.00013772956229976554, + "loss": 2.0409, + "step": 23994 + }, + { + "epoch": 2.3065461885994427, + "grad_norm": 1.0378950834274292, + "learning_rate": 0.0001377171587067757, + "loss": 1.831, + "step": 23995 + }, + { + "epoch": 2.3066423147169086, + "grad_norm": 1.1804927587509155, + "learning_rate": 0.00013770475521077756, + "loss": 2.0977, + "step": 23996 + }, + { + "epoch": 2.3067384408343745, + "grad_norm": 1.155547022819519, + "learning_rate": 0.0001376923518118585, + "loss": 1.9935, + "step": 23997 + }, + { + "epoch": 2.306834566951841, + "grad_norm": 1.1319178342819214, + "learning_rate": 0.00013767994851010575, + "loss": 1.9623, + "step": 23998 + }, + { + "epoch": 2.3069306930693068, + "grad_norm": 1.0859588384628296, + "learning_rate": 0.00013766754530560664, + "loss": 1.9351, + "step": 23999 + }, + { + "epoch": 2.307026819186773, + "grad_norm": 1.2207903861999512, + "learning_rate": 0.00013765514219844842, + "loss": 2.0343, + "step": 24000 + }, + { + "epoch": 2.307122945304239, + "grad_norm": 1.379183292388916, + "learning_rate": 0.0001376427391887184, + "loss": 2.0373, + "step": 24001 + }, + { + "epoch": 2.3072190714217053, + "grad_norm": 1.1142946481704712, + "learning_rate": 0.00013763033627650384, + "loss": 1.9434, + "step": 24002 + }, + { + "epoch": 2.3073151975391712, + "grad_norm": 1.0246742963790894, + "learning_rate": 0.00013761793346189202, + "loss": 2.0344, + "step": 24003 + }, + { + "epoch": 2.3074113236566376, + "grad_norm": 1.0345468521118164, + "learning_rate": 0.00013760553074497023, + "loss": 2.0872, + "step": 24004 + }, + { + "epoch": 2.3075074497741035, + "grad_norm": 1.1142767667770386, + "learning_rate": 0.00013759312812582574, + "loss": 2.1031, + "step": 24005 + }, + { + "epoch": 2.30760357589157, + "grad_norm": 1.0899275541305542, + "learning_rate": 0.00013758072560454584, + "loss": 2.0013, + "step": 24006 + }, + { + "epoch": 2.3076997020090357, + "grad_norm": 1.1570487022399902, + "learning_rate": 0.00013756832318121778, + "loss": 2.0474, + "step": 24007 + }, + { + "epoch": 2.307795828126502, + "grad_norm": 1.2022156715393066, + "learning_rate": 0.0001375559208559289, + "loss": 2.0764, + "step": 24008 + }, + { + "epoch": 2.307891954243968, + "grad_norm": 1.1278916597366333, + "learning_rate": 0.0001375435186287664, + "loss": 2.0151, + "step": 24009 + }, + { + "epoch": 2.3079880803614343, + "grad_norm": 1.1222926378250122, + "learning_rate": 0.00013753111649981761, + "loss": 1.8148, + "step": 24010 + }, + { + "epoch": 2.3080842064789002, + "grad_norm": 1.225996494293213, + "learning_rate": 0.00013751871446916977, + "loss": 2.0092, + "step": 24011 + }, + { + "epoch": 2.3081803325963666, + "grad_norm": 1.1809966564178467, + "learning_rate": 0.00013750631253691022, + "loss": 2.1474, + "step": 24012 + }, + { + "epoch": 2.3082764587138325, + "grad_norm": 1.1902409791946411, + "learning_rate": 0.00013749391070312612, + "loss": 2.0943, + "step": 24013 + }, + { + "epoch": 2.308372584831299, + "grad_norm": 1.1946600675582886, + "learning_rate": 0.00013748150896790486, + "loss": 1.9824, + "step": 24014 + }, + { + "epoch": 2.3084687109487647, + "grad_norm": 1.123397946357727, + "learning_rate": 0.0001374691073313336, + "loss": 1.9998, + "step": 24015 + }, + { + "epoch": 2.308564837066231, + "grad_norm": 1.3282740116119385, + "learning_rate": 0.00013745670579349973, + "loss": 2.0287, + "step": 24016 + }, + { + "epoch": 2.308660963183697, + "grad_norm": 1.1472952365875244, + "learning_rate": 0.00013744430435449047, + "loss": 1.7908, + "step": 24017 + }, + { + "epoch": 2.3087570893011633, + "grad_norm": 1.302027940750122, + "learning_rate": 0.00013743190301439308, + "loss": 2.2081, + "step": 24018 + }, + { + "epoch": 2.308853215418629, + "grad_norm": 1.245152235031128, + "learning_rate": 0.00013741950177329485, + "loss": 2.155, + "step": 24019 + }, + { + "epoch": 2.3089493415360955, + "grad_norm": 1.178098201751709, + "learning_rate": 0.00013740710063128302, + "loss": 1.9393, + "step": 24020 + }, + { + "epoch": 2.3090454676535614, + "grad_norm": 1.1081258058547974, + "learning_rate": 0.00013739469958844487, + "loss": 1.8297, + "step": 24021 + }, + { + "epoch": 2.3091415937710273, + "grad_norm": 1.1685887575149536, + "learning_rate": 0.0001373822986448677, + "loss": 1.9546, + "step": 24022 + }, + { + "epoch": 2.3092377198884937, + "grad_norm": 1.3782999515533447, + "learning_rate": 0.00013736989780063872, + "loss": 2.1168, + "step": 24023 + }, + { + "epoch": 2.30933384600596, + "grad_norm": 1.156073808670044, + "learning_rate": 0.00013735749705584525, + "loss": 1.9898, + "step": 24024 + }, + { + "epoch": 2.309429972123426, + "grad_norm": 1.143364429473877, + "learning_rate": 0.00013734509641057452, + "loss": 2.1701, + "step": 24025 + }, + { + "epoch": 2.309526098240892, + "grad_norm": 1.1711053848266602, + "learning_rate": 0.0001373326958649138, + "loss": 2.0422, + "step": 24026 + }, + { + "epoch": 2.309622224358358, + "grad_norm": 1.2461103200912476, + "learning_rate": 0.00013732029541895036, + "loss": 2.3032, + "step": 24027 + }, + { + "epoch": 2.3097183504758245, + "grad_norm": 1.1760754585266113, + "learning_rate": 0.00013730789507277152, + "loss": 2.0227, + "step": 24028 + }, + { + "epoch": 2.3098144765932904, + "grad_norm": 1.064687967300415, + "learning_rate": 0.00013729549482646442, + "loss": 2.0589, + "step": 24029 + }, + { + "epoch": 2.3099106027107563, + "grad_norm": 1.0755805969238281, + "learning_rate": 0.00013728309468011643, + "loss": 1.8445, + "step": 24030 + }, + { + "epoch": 2.3100067288282227, + "grad_norm": 1.3386168479919434, + "learning_rate": 0.00013727069463381476, + "loss": 2.0511, + "step": 24031 + }, + { + "epoch": 2.3101028549456886, + "grad_norm": 1.3231970071792603, + "learning_rate": 0.00013725829468764665, + "loss": 1.9867, + "step": 24032 + }, + { + "epoch": 2.310198981063155, + "grad_norm": 1.3494670391082764, + "learning_rate": 0.00013724589484169944, + "loss": 2.0961, + "step": 24033 + }, + { + "epoch": 2.310295107180621, + "grad_norm": 1.1495617628097534, + "learning_rate": 0.0001372334950960603, + "loss": 1.9484, + "step": 24034 + }, + { + "epoch": 2.310391233298087, + "grad_norm": 1.2337360382080078, + "learning_rate": 0.00013722109545081656, + "loss": 1.9439, + "step": 24035 + }, + { + "epoch": 2.310487359415553, + "grad_norm": 1.1303706169128418, + "learning_rate": 0.00013720869590605546, + "loss": 1.9625, + "step": 24036 + }, + { + "epoch": 2.3105834855330194, + "grad_norm": 1.1425424814224243, + "learning_rate": 0.00013719629646186422, + "loss": 2.028, + "step": 24037 + }, + { + "epoch": 2.3106796116504853, + "grad_norm": 1.2030143737792969, + "learning_rate": 0.00013718389711833015, + "loss": 1.9534, + "step": 24038 + }, + { + "epoch": 2.3107757377679516, + "grad_norm": 1.1744883060455322, + "learning_rate": 0.0001371714978755405, + "loss": 2.1443, + "step": 24039 + }, + { + "epoch": 2.3108718638854175, + "grad_norm": 1.120632529258728, + "learning_rate": 0.00013715909873358243, + "loss": 2.0, + "step": 24040 + }, + { + "epoch": 2.310967990002884, + "grad_norm": 1.2559128999710083, + "learning_rate": 0.00013714669969254334, + "loss": 2.1179, + "step": 24041 + }, + { + "epoch": 2.31106411612035, + "grad_norm": 1.0651235580444336, + "learning_rate": 0.00013713430075251037, + "loss": 1.9198, + "step": 24042 + }, + { + "epoch": 2.311160242237816, + "grad_norm": 1.0993733406066895, + "learning_rate": 0.00013712190191357085, + "loss": 2.1067, + "step": 24043 + }, + { + "epoch": 2.311256368355282, + "grad_norm": 1.303419589996338, + "learning_rate": 0.00013710950317581198, + "loss": 2.0542, + "step": 24044 + }, + { + "epoch": 2.3113524944727484, + "grad_norm": 1.1377344131469727, + "learning_rate": 0.000137097104539321, + "loss": 1.9948, + "step": 24045 + }, + { + "epoch": 2.3114486205902143, + "grad_norm": 1.0149673223495483, + "learning_rate": 0.00013708470600418523, + "loss": 1.9872, + "step": 24046 + }, + { + "epoch": 2.3115447467076806, + "grad_norm": 1.1644814014434814, + "learning_rate": 0.00013707230757049188, + "loss": 2.1881, + "step": 24047 + }, + { + "epoch": 2.3116408728251465, + "grad_norm": 1.0333341360092163, + "learning_rate": 0.0001370599092383282, + "loss": 1.8763, + "step": 24048 + }, + { + "epoch": 2.311736998942613, + "grad_norm": 1.1427607536315918, + "learning_rate": 0.0001370475110077814, + "loss": 2.0107, + "step": 24049 + }, + { + "epoch": 2.3118331250600788, + "grad_norm": 1.0918796062469482, + "learning_rate": 0.00013703511287893879, + "loss": 1.8922, + "step": 24050 + }, + { + "epoch": 2.311929251177545, + "grad_norm": 1.037426233291626, + "learning_rate": 0.0001370227148518876, + "loss": 1.9827, + "step": 24051 + }, + { + "epoch": 2.312025377295011, + "grad_norm": 1.1476223468780518, + "learning_rate": 0.0001370103169267151, + "loss": 1.9363, + "step": 24052 + }, + { + "epoch": 2.3121215034124774, + "grad_norm": 1.0787386894226074, + "learning_rate": 0.00013699791910350845, + "loss": 1.9279, + "step": 24053 + }, + { + "epoch": 2.3122176295299433, + "grad_norm": 1.0620473623275757, + "learning_rate": 0.000136985521382355, + "loss": 1.9458, + "step": 24054 + }, + { + "epoch": 2.312313755647409, + "grad_norm": 1.2113999128341675, + "learning_rate": 0.00013697312376334192, + "loss": 2.0167, + "step": 24055 + }, + { + "epoch": 2.3124098817648755, + "grad_norm": 1.0761792659759521, + "learning_rate": 0.00013696072624655648, + "loss": 2.0155, + "step": 24056 + }, + { + "epoch": 2.312506007882342, + "grad_norm": 1.284458041191101, + "learning_rate": 0.00013694832883208593, + "loss": 2.0028, + "step": 24057 + }, + { + "epoch": 2.3126021339998077, + "grad_norm": 1.106929898262024, + "learning_rate": 0.00013693593152001754, + "loss": 1.9959, + "step": 24058 + }, + { + "epoch": 2.3126982601172736, + "grad_norm": 1.152712345123291, + "learning_rate": 0.00013692353431043848, + "loss": 1.9652, + "step": 24059 + }, + { + "epoch": 2.31279438623474, + "grad_norm": 1.1382168531417847, + "learning_rate": 0.00013691113720343604, + "loss": 2.0274, + "step": 24060 + }, + { + "epoch": 2.3128905123522063, + "grad_norm": 1.0076783895492554, + "learning_rate": 0.00013689874019909743, + "loss": 1.8029, + "step": 24061 + }, + { + "epoch": 2.3129866384696722, + "grad_norm": 1.1564620733261108, + "learning_rate": 0.00013688634329750993, + "loss": 2.0718, + "step": 24062 + }, + { + "epoch": 2.313082764587138, + "grad_norm": 1.33171546459198, + "learning_rate": 0.00013687394649876074, + "loss": 2.0739, + "step": 24063 + }, + { + "epoch": 2.3131788907046045, + "grad_norm": 1.376187801361084, + "learning_rate": 0.00013686154980293712, + "loss": 2.0467, + "step": 24064 + }, + { + "epoch": 2.3132750168220704, + "grad_norm": 1.093043327331543, + "learning_rate": 0.0001368491532101263, + "loss": 1.947, + "step": 24065 + }, + { + "epoch": 2.3133711429395367, + "grad_norm": 1.1083523035049438, + "learning_rate": 0.0001368367567204155, + "loss": 2.0751, + "step": 24066 + }, + { + "epoch": 2.3134672690570026, + "grad_norm": 1.0302822589874268, + "learning_rate": 0.000136824360333892, + "loss": 2.0539, + "step": 24067 + }, + { + "epoch": 2.313563395174469, + "grad_norm": 1.1284593343734741, + "learning_rate": 0.000136811964050643, + "loss": 2.0072, + "step": 24068 + }, + { + "epoch": 2.313659521291935, + "grad_norm": 1.0701864957809448, + "learning_rate": 0.00013679956787075577, + "loss": 2.0904, + "step": 24069 + }, + { + "epoch": 2.313755647409401, + "grad_norm": 1.0103458166122437, + "learning_rate": 0.0001367871717943175, + "loss": 2.0804, + "step": 24070 + }, + { + "epoch": 2.313851773526867, + "grad_norm": 1.222550392150879, + "learning_rate": 0.00013677477582141545, + "loss": 2.1496, + "step": 24071 + }, + { + "epoch": 2.3139478996443335, + "grad_norm": 1.226020097732544, + "learning_rate": 0.00013676237995213683, + "loss": 2.1216, + "step": 24072 + }, + { + "epoch": 2.3140440257617994, + "grad_norm": 1.1158084869384766, + "learning_rate": 0.00013674998418656889, + "loss": 2.0285, + "step": 24073 + }, + { + "epoch": 2.3141401518792657, + "grad_norm": 1.1588711738586426, + "learning_rate": 0.00013673758852479886, + "loss": 1.9385, + "step": 24074 + }, + { + "epoch": 2.3142362779967316, + "grad_norm": 0.9447157382965088, + "learning_rate": 0.00013672519296691396, + "loss": 1.8175, + "step": 24075 + }, + { + "epoch": 2.314332404114198, + "grad_norm": 1.0178377628326416, + "learning_rate": 0.00013671279751300143, + "loss": 1.9493, + "step": 24076 + }, + { + "epoch": 2.314428530231664, + "grad_norm": 1.159162163734436, + "learning_rate": 0.00013670040216314847, + "loss": 1.9793, + "step": 24077 + }, + { + "epoch": 2.31452465634913, + "grad_norm": 1.0085431337356567, + "learning_rate": 0.00013668800691744233, + "loss": 1.8505, + "step": 24078 + }, + { + "epoch": 2.314620782466596, + "grad_norm": 1.1239756345748901, + "learning_rate": 0.00013667561177597024, + "loss": 1.9717, + "step": 24079 + }, + { + "epoch": 2.3147169085840624, + "grad_norm": 1.2238208055496216, + "learning_rate": 0.00013666321673881944, + "loss": 1.9579, + "step": 24080 + }, + { + "epoch": 2.3148130347015283, + "grad_norm": 1.1201953887939453, + "learning_rate": 0.00013665082180607714, + "loss": 1.9959, + "step": 24081 + }, + { + "epoch": 2.3149091608189947, + "grad_norm": 1.2479066848754883, + "learning_rate": 0.00013663842697783055, + "loss": 1.8709, + "step": 24082 + }, + { + "epoch": 2.3150052869364606, + "grad_norm": 1.2483667135238647, + "learning_rate": 0.0001366260322541669, + "loss": 1.9304, + "step": 24083 + }, + { + "epoch": 2.315101413053927, + "grad_norm": 1.195393681526184, + "learning_rate": 0.00013661363763517345, + "loss": 1.8909, + "step": 24084 + }, + { + "epoch": 2.315197539171393, + "grad_norm": 1.1495673656463623, + "learning_rate": 0.00013660124312093737, + "loss": 1.9592, + "step": 24085 + }, + { + "epoch": 2.315293665288859, + "grad_norm": 1.241114854812622, + "learning_rate": 0.0001365888487115459, + "loss": 2.0555, + "step": 24086 + }, + { + "epoch": 2.315389791406325, + "grad_norm": 1.2474102973937988, + "learning_rate": 0.00013657645440708633, + "loss": 1.9825, + "step": 24087 + }, + { + "epoch": 2.3154859175237914, + "grad_norm": 1.291569471359253, + "learning_rate": 0.00013656406020764574, + "loss": 2.09, + "step": 24088 + }, + { + "epoch": 2.3155820436412573, + "grad_norm": 1.1860750913619995, + "learning_rate": 0.00013655166611331148, + "loss": 1.9728, + "step": 24089 + }, + { + "epoch": 2.3156781697587236, + "grad_norm": 1.3676007986068726, + "learning_rate": 0.0001365392721241707, + "loss": 2.0565, + "step": 24090 + }, + { + "epoch": 2.3157742958761895, + "grad_norm": 1.0862188339233398, + "learning_rate": 0.00013652687824031062, + "loss": 1.9936, + "step": 24091 + }, + { + "epoch": 2.3158704219936554, + "grad_norm": 1.3260774612426758, + "learning_rate": 0.0001365144844618185, + "loss": 1.9366, + "step": 24092 + }, + { + "epoch": 2.315966548111122, + "grad_norm": 1.1745373010635376, + "learning_rate": 0.0001365020907887815, + "loss": 1.9346, + "step": 24093 + }, + { + "epoch": 2.316062674228588, + "grad_norm": 1.1995108127593994, + "learning_rate": 0.0001364896972212869, + "loss": 2.0283, + "step": 24094 + }, + { + "epoch": 2.316158800346054, + "grad_norm": 1.0895004272460938, + "learning_rate": 0.00013647730375942183, + "loss": 1.9324, + "step": 24095 + }, + { + "epoch": 2.31625492646352, + "grad_norm": 1.089719295501709, + "learning_rate": 0.0001364649104032736, + "loss": 1.9998, + "step": 24096 + }, + { + "epoch": 2.3163510525809863, + "grad_norm": 1.013120412826538, + "learning_rate": 0.00013645251715292938, + "loss": 2.0263, + "step": 24097 + }, + { + "epoch": 2.316447178698452, + "grad_norm": 1.2781652212142944, + "learning_rate": 0.00013644012400847635, + "loss": 2.0642, + "step": 24098 + }, + { + "epoch": 2.3165433048159185, + "grad_norm": 1.2006638050079346, + "learning_rate": 0.00013642773097000176, + "loss": 2.0259, + "step": 24099 + }, + { + "epoch": 2.3166394309333844, + "grad_norm": 1.2589775323867798, + "learning_rate": 0.0001364153380375928, + "loss": 1.995, + "step": 24100 + }, + { + "epoch": 2.3167355570508508, + "grad_norm": 1.222819209098816, + "learning_rate": 0.00013640294521133678, + "loss": 2.1437, + "step": 24101 + }, + { + "epoch": 2.3168316831683167, + "grad_norm": 1.2111631631851196, + "learning_rate": 0.00013639055249132077, + "loss": 2.1659, + "step": 24102 + }, + { + "epoch": 2.316927809285783, + "grad_norm": 1.2659273147583008, + "learning_rate": 0.00013637815987763204, + "loss": 2.0879, + "step": 24103 + }, + { + "epoch": 2.317023935403249, + "grad_norm": 1.104746699333191, + "learning_rate": 0.00013636576737035777, + "loss": 1.9482, + "step": 24104 + }, + { + "epoch": 2.3171200615207153, + "grad_norm": 1.004499912261963, + "learning_rate": 0.00013635337496958522, + "loss": 1.908, + "step": 24105 + }, + { + "epoch": 2.317216187638181, + "grad_norm": 1.2123240232467651, + "learning_rate": 0.00013634098267540154, + "loss": 2.0198, + "step": 24106 + }, + { + "epoch": 2.3173123137556475, + "grad_norm": 1.214743971824646, + "learning_rate": 0.00013632859048789397, + "loss": 2.0384, + "step": 24107 + }, + { + "epoch": 2.3174084398731134, + "grad_norm": 1.149328351020813, + "learning_rate": 0.00013631619840714972, + "loss": 2.0702, + "step": 24108 + }, + { + "epoch": 2.3175045659905797, + "grad_norm": 1.3051193952560425, + "learning_rate": 0.000136303806433256, + "loss": 2.1444, + "step": 24109 + }, + { + "epoch": 2.3176006921080456, + "grad_norm": 1.2658735513687134, + "learning_rate": 0.00013629141456629994, + "loss": 2.033, + "step": 24110 + }, + { + "epoch": 2.317696818225512, + "grad_norm": 1.1899759769439697, + "learning_rate": 0.00013627902280636885, + "loss": 1.9921, + "step": 24111 + }, + { + "epoch": 2.317792944342978, + "grad_norm": 1.05608069896698, + "learning_rate": 0.0001362666311535499, + "loss": 1.982, + "step": 24112 + }, + { + "epoch": 2.3178890704604442, + "grad_norm": 1.127540111541748, + "learning_rate": 0.00013625423960793024, + "loss": 1.87, + "step": 24113 + }, + { + "epoch": 2.31798519657791, + "grad_norm": 1.1280529499053955, + "learning_rate": 0.00013624184816959716, + "loss": 1.9972, + "step": 24114 + }, + { + "epoch": 2.3180813226953765, + "grad_norm": 1.2316720485687256, + "learning_rate": 0.00013622945683863775, + "loss": 1.9104, + "step": 24115 + }, + { + "epoch": 2.3181774488128424, + "grad_norm": 0.986897349357605, + "learning_rate": 0.0001362170656151393, + "loss": 1.815, + "step": 24116 + }, + { + "epoch": 2.3182735749303087, + "grad_norm": 1.1317520141601562, + "learning_rate": 0.00013620467449918896, + "loss": 2.0057, + "step": 24117 + }, + { + "epoch": 2.3183697010477746, + "grad_norm": 1.1663905382156372, + "learning_rate": 0.00013619228349087396, + "loss": 2.0415, + "step": 24118 + }, + { + "epoch": 2.318465827165241, + "grad_norm": 1.1448523998260498, + "learning_rate": 0.0001361798925902815, + "loss": 1.9366, + "step": 24119 + }, + { + "epoch": 2.318561953282707, + "grad_norm": 1.264521837234497, + "learning_rate": 0.00013616750179749873, + "loss": 2.1589, + "step": 24120 + }, + { + "epoch": 2.318658079400173, + "grad_norm": 1.1200164556503296, + "learning_rate": 0.00013615511111261287, + "loss": 2.0945, + "step": 24121 + }, + { + "epoch": 2.318754205517639, + "grad_norm": 1.252966046333313, + "learning_rate": 0.00013614272053571113, + "loss": 2.1646, + "step": 24122 + }, + { + "epoch": 2.3188503316351055, + "grad_norm": 1.0143722295761108, + "learning_rate": 0.0001361303300668807, + "loss": 2.0188, + "step": 24123 + }, + { + "epoch": 2.3189464577525714, + "grad_norm": 1.320327639579773, + "learning_rate": 0.00013611793970620873, + "loss": 2.0823, + "step": 24124 + }, + { + "epoch": 2.3190425838700373, + "grad_norm": 1.2389549016952515, + "learning_rate": 0.0001361055494537825, + "loss": 1.9462, + "step": 24125 + }, + { + "epoch": 2.3191387099875036, + "grad_norm": 1.2686145305633545, + "learning_rate": 0.00013609315930968914, + "loss": 2.0251, + "step": 24126 + }, + { + "epoch": 2.31923483610497, + "grad_norm": 1.2056561708450317, + "learning_rate": 0.00013608076927401578, + "loss": 1.9159, + "step": 24127 + }, + { + "epoch": 2.319330962222436, + "grad_norm": 1.2241462469100952, + "learning_rate": 0.00013606837934684976, + "loss": 2.0841, + "step": 24128 + }, + { + "epoch": 2.3194270883399017, + "grad_norm": 1.1991859674453735, + "learning_rate": 0.00013605598952827816, + "loss": 1.8874, + "step": 24129 + }, + { + "epoch": 2.319523214457368, + "grad_norm": 1.1356064081192017, + "learning_rate": 0.00013604359981838823, + "loss": 1.8891, + "step": 24130 + }, + { + "epoch": 2.3196193405748344, + "grad_norm": 1.239907145500183, + "learning_rate": 0.00013603121021726712, + "loss": 1.9903, + "step": 24131 + }, + { + "epoch": 2.3197154666923003, + "grad_norm": 1.014967918395996, + "learning_rate": 0.00013601882072500202, + "loss": 1.8446, + "step": 24132 + }, + { + "epoch": 2.3198115928097662, + "grad_norm": 1.0428870916366577, + "learning_rate": 0.00013600643134168013, + "loss": 1.7494, + "step": 24133 + }, + { + "epoch": 2.3199077189272326, + "grad_norm": 1.1208478212356567, + "learning_rate": 0.00013599404206738862, + "loss": 1.9813, + "step": 24134 + }, + { + "epoch": 2.3200038450446985, + "grad_norm": 1.1717028617858887, + "learning_rate": 0.0001359816529022147, + "loss": 1.999, + "step": 24135 + }, + { + "epoch": 2.320099971162165, + "grad_norm": 1.2510124444961548, + "learning_rate": 0.00013596926384624552, + "loss": 2.0277, + "step": 24136 + }, + { + "epoch": 2.3201960972796307, + "grad_norm": 1.1897575855255127, + "learning_rate": 0.0001359568748995683, + "loss": 2.0077, + "step": 24137 + }, + { + "epoch": 2.320292223397097, + "grad_norm": 1.2404065132141113, + "learning_rate": 0.00013594448606227022, + "loss": 2.0967, + "step": 24138 + }, + { + "epoch": 2.320388349514563, + "grad_norm": 1.2926424741744995, + "learning_rate": 0.0001359320973344384, + "loss": 2.0767, + "step": 24139 + }, + { + "epoch": 2.3204844756320293, + "grad_norm": 1.0573310852050781, + "learning_rate": 0.00013591970871616008, + "loss": 1.998, + "step": 24140 + }, + { + "epoch": 2.320580601749495, + "grad_norm": 1.2238552570343018, + "learning_rate": 0.00013590732020752244, + "loss": 1.941, + "step": 24141 + }, + { + "epoch": 2.3206767278669616, + "grad_norm": 1.1904528141021729, + "learning_rate": 0.00013589493180861265, + "loss": 1.8088, + "step": 24142 + }, + { + "epoch": 2.3207728539844275, + "grad_norm": 1.204083800315857, + "learning_rate": 0.0001358825435195179, + "loss": 2.0016, + "step": 24143 + }, + { + "epoch": 2.320868980101894, + "grad_norm": 1.1557743549346924, + "learning_rate": 0.0001358701553403253, + "loss": 2.0342, + "step": 24144 + }, + { + "epoch": 2.3209651062193597, + "grad_norm": 1.1806645393371582, + "learning_rate": 0.00013585776727112212, + "loss": 2.1494, + "step": 24145 + }, + { + "epoch": 2.321061232336826, + "grad_norm": 1.1954606771469116, + "learning_rate": 0.0001358453793119955, + "loss": 2.0548, + "step": 24146 + }, + { + "epoch": 2.321157358454292, + "grad_norm": 1.4532145261764526, + "learning_rate": 0.00013583299146303261, + "loss": 2.0611, + "step": 24147 + }, + { + "epoch": 2.3212534845717583, + "grad_norm": 1.348096489906311, + "learning_rate": 0.00013582060372432063, + "loss": 2.2046, + "step": 24148 + }, + { + "epoch": 2.321349610689224, + "grad_norm": 1.3290295600891113, + "learning_rate": 0.00013580821609594673, + "loss": 2.0463, + "step": 24149 + }, + { + "epoch": 2.3214457368066905, + "grad_norm": 1.171226143836975, + "learning_rate": 0.00013579582857799812, + "loss": 1.9817, + "step": 24150 + }, + { + "epoch": 2.3215418629241564, + "grad_norm": 1.3696078062057495, + "learning_rate": 0.0001357834411705619, + "loss": 2.1176, + "step": 24151 + }, + { + "epoch": 2.3216379890416228, + "grad_norm": 1.1718590259552002, + "learning_rate": 0.0001357710538737253, + "loss": 2.0539, + "step": 24152 + }, + { + "epoch": 2.3217341151590887, + "grad_norm": 1.1434696912765503, + "learning_rate": 0.00013575866668757546, + "loss": 2.1788, + "step": 24153 + }, + { + "epoch": 2.321830241276555, + "grad_norm": 1.3197541236877441, + "learning_rate": 0.0001357462796121996, + "loss": 2.0416, + "step": 24154 + }, + { + "epoch": 2.321926367394021, + "grad_norm": 1.1926926374435425, + "learning_rate": 0.00013573389264768482, + "loss": 2.0825, + "step": 24155 + }, + { + "epoch": 2.3220224935114873, + "grad_norm": 1.117964744567871, + "learning_rate": 0.0001357215057941183, + "loss": 2.1734, + "step": 24156 + }, + { + "epoch": 2.322118619628953, + "grad_norm": 1.2601176500320435, + "learning_rate": 0.00013570911905158727, + "loss": 1.9593, + "step": 24157 + }, + { + "epoch": 2.322214745746419, + "grad_norm": 1.0061577558517456, + "learning_rate": 0.00013569673242017885, + "loss": 2.0321, + "step": 24158 + }, + { + "epoch": 2.3223108718638854, + "grad_norm": 1.1630396842956543, + "learning_rate": 0.0001356843458999802, + "loss": 2.0217, + "step": 24159 + }, + { + "epoch": 2.3224069979813518, + "grad_norm": 1.1447237730026245, + "learning_rate": 0.00013567195949107852, + "loss": 1.8463, + "step": 24160 + }, + { + "epoch": 2.3225031240988177, + "grad_norm": 1.1893357038497925, + "learning_rate": 0.0001356595731935609, + "loss": 2.2552, + "step": 24161 + }, + { + "epoch": 2.3225992502162836, + "grad_norm": 1.1844178438186646, + "learning_rate": 0.0001356471870075146, + "loss": 1.88, + "step": 24162 + }, + { + "epoch": 2.32269537633375, + "grad_norm": 1.0076427459716797, + "learning_rate": 0.00013563480093302672, + "loss": 2.0528, + "step": 24163 + }, + { + "epoch": 2.3227915024512162, + "grad_norm": 1.2457913160324097, + "learning_rate": 0.00013562241497018447, + "loss": 2.0496, + "step": 24164 + }, + { + "epoch": 2.322887628568682, + "grad_norm": 1.0723086595535278, + "learning_rate": 0.000135610029119075, + "loss": 1.9145, + "step": 24165 + }, + { + "epoch": 2.322983754686148, + "grad_norm": 1.2018197774887085, + "learning_rate": 0.00013559764337978542, + "loss": 1.9138, + "step": 24166 + }, + { + "epoch": 2.3230798808036144, + "grad_norm": 1.2567418813705444, + "learning_rate": 0.00013558525775240294, + "loss": 2.0159, + "step": 24167 + }, + { + "epoch": 2.3231760069210803, + "grad_norm": 0.9752364754676819, + "learning_rate": 0.0001355728722370147, + "loss": 1.9437, + "step": 24168 + }, + { + "epoch": 2.3232721330385466, + "grad_norm": 1.1979163885116577, + "learning_rate": 0.00013556048683370788, + "loss": 2.0423, + "step": 24169 + }, + { + "epoch": 2.3233682591560125, + "grad_norm": 1.2024006843566895, + "learning_rate": 0.0001355481015425696, + "loss": 2.0328, + "step": 24170 + }, + { + "epoch": 2.323464385273479, + "grad_norm": 1.1515116691589355, + "learning_rate": 0.00013553571636368706, + "loss": 1.9964, + "step": 24171 + }, + { + "epoch": 2.3235605113909448, + "grad_norm": 1.1724530458450317, + "learning_rate": 0.0001355233312971474, + "loss": 1.9764, + "step": 24172 + }, + { + "epoch": 2.323656637508411, + "grad_norm": 1.074634075164795, + "learning_rate": 0.00013551094634303777, + "loss": 1.9647, + "step": 24173 + }, + { + "epoch": 2.323752763625877, + "grad_norm": 1.2052483558654785, + "learning_rate": 0.0001354985615014453, + "loss": 2.133, + "step": 24174 + }, + { + "epoch": 2.3238488897433434, + "grad_norm": 1.1836376190185547, + "learning_rate": 0.0001354861767724572, + "loss": 2.2232, + "step": 24175 + }, + { + "epoch": 2.3239450158608093, + "grad_norm": 1.237450361251831, + "learning_rate": 0.00013547379215616057, + "loss": 2.0714, + "step": 24176 + }, + { + "epoch": 2.3240411419782756, + "grad_norm": 1.2979861497879028, + "learning_rate": 0.0001354614076526426, + "loss": 2.0755, + "step": 24177 + }, + { + "epoch": 2.3241372680957415, + "grad_norm": 1.122730016708374, + "learning_rate": 0.0001354490232619904, + "loss": 2.0671, + "step": 24178 + }, + { + "epoch": 2.324233394213208, + "grad_norm": 1.082629680633545, + "learning_rate": 0.00013543663898429113, + "loss": 1.9629, + "step": 24179 + }, + { + "epoch": 2.3243295203306737, + "grad_norm": 1.216707468032837, + "learning_rate": 0.000135424254819632, + "loss": 2.0208, + "step": 24180 + }, + { + "epoch": 2.32442564644814, + "grad_norm": 1.274504542350769, + "learning_rate": 0.00013541187076810012, + "loss": 2.1082, + "step": 24181 + }, + { + "epoch": 2.324521772565606, + "grad_norm": 1.1828632354736328, + "learning_rate": 0.00013539948682978265, + "loss": 2.1142, + "step": 24182 + }, + { + "epoch": 2.3246178986830723, + "grad_norm": 1.1573052406311035, + "learning_rate": 0.00013538710300476666, + "loss": 2.154, + "step": 24183 + }, + { + "epoch": 2.3247140248005382, + "grad_norm": 1.081315279006958, + "learning_rate": 0.0001353747192931394, + "loss": 1.8677, + "step": 24184 + }, + { + "epoch": 2.3248101509180046, + "grad_norm": 1.1118274927139282, + "learning_rate": 0.00013536233569498798, + "loss": 2.0552, + "step": 24185 + }, + { + "epoch": 2.3249062770354705, + "grad_norm": 1.141139030456543, + "learning_rate": 0.00013534995221039953, + "loss": 2.0341, + "step": 24186 + }, + { + "epoch": 2.325002403152937, + "grad_norm": 1.225707769393921, + "learning_rate": 0.00013533756883946118, + "loss": 2.0784, + "step": 24187 + }, + { + "epoch": 2.3250985292704027, + "grad_norm": 0.9913200736045837, + "learning_rate": 0.00013532518558226014, + "loss": 1.9346, + "step": 24188 + }, + { + "epoch": 2.325194655387869, + "grad_norm": 1.0721684694290161, + "learning_rate": 0.00013531280243888347, + "loss": 1.9761, + "step": 24189 + }, + { + "epoch": 2.325290781505335, + "grad_norm": 1.2316774129867554, + "learning_rate": 0.0001353004194094184, + "loss": 2.1523, + "step": 24190 + }, + { + "epoch": 2.325386907622801, + "grad_norm": 1.3060672283172607, + "learning_rate": 0.00013528803649395197, + "loss": 2.1399, + "step": 24191 + }, + { + "epoch": 2.325483033740267, + "grad_norm": 1.0304991006851196, + "learning_rate": 0.0001352756536925714, + "loss": 2.0347, + "step": 24192 + }, + { + "epoch": 2.3255791598577336, + "grad_norm": 1.1793335676193237, + "learning_rate": 0.00013526327100536382, + "loss": 2.1355, + "step": 24193 + }, + { + "epoch": 2.3256752859751995, + "grad_norm": 1.212551474571228, + "learning_rate": 0.00013525088843241632, + "loss": 2.1545, + "step": 24194 + }, + { + "epoch": 2.3257714120926654, + "grad_norm": 0.9831458926200867, + "learning_rate": 0.00013523850597381605, + "loss": 1.8267, + "step": 24195 + }, + { + "epoch": 2.3258675382101317, + "grad_norm": 1.0144336223602295, + "learning_rate": 0.0001352261236296502, + "loss": 1.9017, + "step": 24196 + }, + { + "epoch": 2.325963664327598, + "grad_norm": 1.1560252904891968, + "learning_rate": 0.00013521374140000584, + "loss": 1.9653, + "step": 24197 + }, + { + "epoch": 2.326059790445064, + "grad_norm": 1.1133811473846436, + "learning_rate": 0.00013520135928497018, + "loss": 1.9921, + "step": 24198 + }, + { + "epoch": 2.32615591656253, + "grad_norm": 1.1637516021728516, + "learning_rate": 0.00013518897728463032, + "loss": 1.8911, + "step": 24199 + }, + { + "epoch": 2.326252042679996, + "grad_norm": 1.2112631797790527, + "learning_rate": 0.00013517659539907334, + "loss": 2.02, + "step": 24200 + }, + { + "epoch": 2.326348168797462, + "grad_norm": 1.1588956117630005, + "learning_rate": 0.00013516421362838645, + "loss": 1.9954, + "step": 24201 + }, + { + "epoch": 2.3264442949149284, + "grad_norm": 1.1027984619140625, + "learning_rate": 0.00013515183197265678, + "loss": 2.2057, + "step": 24202 + }, + { + "epoch": 2.3265404210323943, + "grad_norm": 1.104594349861145, + "learning_rate": 0.00013513945043197137, + "loss": 1.7882, + "step": 24203 + }, + { + "epoch": 2.3266365471498607, + "grad_norm": 1.1276459693908691, + "learning_rate": 0.00013512706900641746, + "loss": 2.1736, + "step": 24204 + }, + { + "epoch": 2.3267326732673266, + "grad_norm": 1.3678550720214844, + "learning_rate": 0.0001351146876960821, + "loss": 2.0762, + "step": 24205 + }, + { + "epoch": 2.326828799384793, + "grad_norm": 1.08583664894104, + "learning_rate": 0.0001351023065010525, + "loss": 1.9033, + "step": 24206 + }, + { + "epoch": 2.326924925502259, + "grad_norm": 1.0125877857208252, + "learning_rate": 0.0001350899254214157, + "loss": 1.8502, + "step": 24207 + }, + { + "epoch": 2.327021051619725, + "grad_norm": 1.2385505437850952, + "learning_rate": 0.00013507754445725887, + "loss": 2.1092, + "step": 24208 + }, + { + "epoch": 2.327117177737191, + "grad_norm": 1.174789309501648, + "learning_rate": 0.00013506516360866917, + "loss": 2.0359, + "step": 24209 + }, + { + "epoch": 2.3272133038546574, + "grad_norm": 1.1024335622787476, + "learning_rate": 0.00013505278287573365, + "loss": 1.9387, + "step": 24210 + }, + { + "epoch": 2.3273094299721233, + "grad_norm": 1.2959407567977905, + "learning_rate": 0.0001350404022585395, + "loss": 1.988, + "step": 24211 + }, + { + "epoch": 2.3274055560895897, + "grad_norm": 1.124632477760315, + "learning_rate": 0.00013502802175717376, + "loss": 2.1211, + "step": 24212 + }, + { + "epoch": 2.3275016822070556, + "grad_norm": 1.011427879333496, + "learning_rate": 0.00013501564137172368, + "loss": 2.0424, + "step": 24213 + }, + { + "epoch": 2.327597808324522, + "grad_norm": 1.1473060846328735, + "learning_rate": 0.0001350032611022763, + "loss": 1.8925, + "step": 24214 + }, + { + "epoch": 2.327693934441988, + "grad_norm": 1.3539888858795166, + "learning_rate": 0.00013499088094891875, + "loss": 1.9419, + "step": 24215 + }, + { + "epoch": 2.327790060559454, + "grad_norm": 1.0201783180236816, + "learning_rate": 0.00013497850091173818, + "loss": 1.9024, + "step": 24216 + }, + { + "epoch": 2.32788618667692, + "grad_norm": 1.118189811706543, + "learning_rate": 0.00013496612099082167, + "loss": 1.9935, + "step": 24217 + }, + { + "epoch": 2.3279823127943864, + "grad_norm": 1.066885232925415, + "learning_rate": 0.00013495374118625635, + "loss": 1.9734, + "step": 24218 + }, + { + "epoch": 2.3280784389118523, + "grad_norm": 1.2595164775848389, + "learning_rate": 0.00013494136149812936, + "loss": 2.1306, + "step": 24219 + }, + { + "epoch": 2.3281745650293186, + "grad_norm": 1.2372723817825317, + "learning_rate": 0.00013492898192652778, + "loss": 1.9959, + "step": 24220 + }, + { + "epoch": 2.3282706911467845, + "grad_norm": 1.3686820268630981, + "learning_rate": 0.00013491660247153878, + "loss": 2.0295, + "step": 24221 + }, + { + "epoch": 2.328366817264251, + "grad_norm": 1.1598143577575684, + "learning_rate": 0.00013490422313324943, + "loss": 2.0812, + "step": 24222 + }, + { + "epoch": 2.3284629433817168, + "grad_norm": 1.2057689428329468, + "learning_rate": 0.00013489184391174687, + "loss": 2.0416, + "step": 24223 + }, + { + "epoch": 2.328559069499183, + "grad_norm": 1.228786587715149, + "learning_rate": 0.0001348794648071182, + "loss": 2.1342, + "step": 24224 + }, + { + "epoch": 2.328655195616649, + "grad_norm": 1.189320683479309, + "learning_rate": 0.00013486708581945053, + "loss": 1.9968, + "step": 24225 + }, + { + "epoch": 2.3287513217341154, + "grad_norm": 1.2515640258789062, + "learning_rate": 0.000134854706948831, + "loss": 2.1961, + "step": 24226 + }, + { + "epoch": 2.3288474478515813, + "grad_norm": 1.0270583629608154, + "learning_rate": 0.0001348423281953467, + "loss": 1.983, + "step": 24227 + }, + { + "epoch": 2.328943573969047, + "grad_norm": 1.3227972984313965, + "learning_rate": 0.00013482994955908473, + "loss": 1.9418, + "step": 24228 + }, + { + "epoch": 2.3290397000865135, + "grad_norm": 1.1689515113830566, + "learning_rate": 0.0001348175710401322, + "loss": 2.1866, + "step": 24229 + }, + { + "epoch": 2.32913582620398, + "grad_norm": 1.0800833702087402, + "learning_rate": 0.00013480519263857623, + "loss": 2.0152, + "step": 24230 + }, + { + "epoch": 2.3292319523214458, + "grad_norm": 1.2516255378723145, + "learning_rate": 0.00013479281435450398, + "loss": 2.0628, + "step": 24231 + }, + { + "epoch": 2.3293280784389117, + "grad_norm": 1.036176323890686, + "learning_rate": 0.00013478043618800247, + "loss": 1.9398, + "step": 24232 + }, + { + "epoch": 2.329424204556378, + "grad_norm": 0.9944074749946594, + "learning_rate": 0.0001347680581391589, + "loss": 1.8911, + "step": 24233 + }, + { + "epoch": 2.329520330673844, + "grad_norm": 1.3133909702301025, + "learning_rate": 0.00013475568020806028, + "loss": 2.052, + "step": 24234 + }, + { + "epoch": 2.3296164567913102, + "grad_norm": 1.2445510625839233, + "learning_rate": 0.00013474330239479374, + "loss": 2.0204, + "step": 24235 + }, + { + "epoch": 2.329712582908776, + "grad_norm": 1.177922010421753, + "learning_rate": 0.00013473092469944644, + "loss": 1.86, + "step": 24236 + }, + { + "epoch": 2.3298087090262425, + "grad_norm": 1.3232121467590332, + "learning_rate": 0.00013471854712210544, + "loss": 2.0662, + "step": 24237 + }, + { + "epoch": 2.3299048351437084, + "grad_norm": 1.2453235387802124, + "learning_rate": 0.00013470616966285785, + "loss": 1.9764, + "step": 24238 + }, + { + "epoch": 2.3300009612611747, + "grad_norm": 1.1401829719543457, + "learning_rate": 0.00013469379232179074, + "loss": 1.9461, + "step": 24239 + }, + { + "epoch": 2.3300970873786406, + "grad_norm": 1.1236648559570312, + "learning_rate": 0.0001346814150989913, + "loss": 1.9824, + "step": 24240 + }, + { + "epoch": 2.330193213496107, + "grad_norm": 1.10371994972229, + "learning_rate": 0.00013466903799454652, + "loss": 1.9996, + "step": 24241 + }, + { + "epoch": 2.330289339613573, + "grad_norm": 1.0393240451812744, + "learning_rate": 0.0001346566610085436, + "loss": 1.8081, + "step": 24242 + }, + { + "epoch": 2.330385465731039, + "grad_norm": 1.1242027282714844, + "learning_rate": 0.00013464428414106954, + "loss": 2.0915, + "step": 24243 + }, + { + "epoch": 2.330481591848505, + "grad_norm": 1.2892030477523804, + "learning_rate": 0.0001346319073922115, + "loss": 2.098, + "step": 24244 + }, + { + "epoch": 2.3305777179659715, + "grad_norm": 1.122281551361084, + "learning_rate": 0.00013461953076205656, + "loss": 1.9624, + "step": 24245 + }, + { + "epoch": 2.3306738440834374, + "grad_norm": 1.0364357233047485, + "learning_rate": 0.00013460715425069183, + "loss": 1.9554, + "step": 24246 + }, + { + "epoch": 2.3307699702009037, + "grad_norm": 1.280272364616394, + "learning_rate": 0.00013459477785820437, + "loss": 1.8312, + "step": 24247 + }, + { + "epoch": 2.3308660963183696, + "grad_norm": 1.028969645500183, + "learning_rate": 0.00013458240158468134, + "loss": 1.8627, + "step": 24248 + }, + { + "epoch": 2.330962222435836, + "grad_norm": 0.9660157561302185, + "learning_rate": 0.00013457002543020978, + "loss": 1.8686, + "step": 24249 + }, + { + "epoch": 2.331058348553302, + "grad_norm": 1.0459094047546387, + "learning_rate": 0.00013455764939487681, + "loss": 1.9464, + "step": 24250 + }, + { + "epoch": 2.331154474670768, + "grad_norm": 1.2005938291549683, + "learning_rate": 0.0001345452734787695, + "loss": 1.8512, + "step": 24251 + }, + { + "epoch": 2.331250600788234, + "grad_norm": 1.1620076894760132, + "learning_rate": 0.00013453289768197493, + "loss": 2.0576, + "step": 24252 + }, + { + "epoch": 2.3313467269057004, + "grad_norm": 1.3785061836242676, + "learning_rate": 0.00013452052200458025, + "loss": 2.0479, + "step": 24253 + }, + { + "epoch": 2.3314428530231663, + "grad_norm": 1.3668181896209717, + "learning_rate": 0.00013450814644667248, + "loss": 2.0945, + "step": 24254 + }, + { + "epoch": 2.3315389791406327, + "grad_norm": 1.0680276155471802, + "learning_rate": 0.0001344957710083387, + "loss": 1.9011, + "step": 24255 + }, + { + "epoch": 2.3316351052580986, + "grad_norm": 1.1148251295089722, + "learning_rate": 0.0001344833956896661, + "loss": 2.0086, + "step": 24256 + }, + { + "epoch": 2.331731231375565, + "grad_norm": 1.2209264039993286, + "learning_rate": 0.0001344710204907417, + "loss": 1.9524, + "step": 24257 + }, + { + "epoch": 2.331827357493031, + "grad_norm": 1.0957485437393188, + "learning_rate": 0.00013445864541165257, + "loss": 1.9086, + "step": 24258 + }, + { + "epoch": 2.331923483610497, + "grad_norm": 1.1961159706115723, + "learning_rate": 0.0001344462704524858, + "loss": 2.0595, + "step": 24259 + }, + { + "epoch": 2.332019609727963, + "grad_norm": 1.1634148359298706, + "learning_rate": 0.0001344338956133285, + "loss": 2.0285, + "step": 24260 + }, + { + "epoch": 2.332115735845429, + "grad_norm": 1.4561266899108887, + "learning_rate": 0.00013442152089426773, + "loss": 2.1892, + "step": 24261 + }, + { + "epoch": 2.3322118619628953, + "grad_norm": 1.1262434720993042, + "learning_rate": 0.0001344091462953906, + "loss": 2.0621, + "step": 24262 + }, + { + "epoch": 2.3323079880803617, + "grad_norm": 1.0955713987350464, + "learning_rate": 0.00013439677181678413, + "loss": 1.9473, + "step": 24263 + }, + { + "epoch": 2.3324041141978276, + "grad_norm": 1.1367580890655518, + "learning_rate": 0.0001343843974585355, + "loss": 2.0702, + "step": 24264 + }, + { + "epoch": 2.3325002403152935, + "grad_norm": 1.2040289640426636, + "learning_rate": 0.00013437202322073171, + "loss": 1.9567, + "step": 24265 + }, + { + "epoch": 2.33259636643276, + "grad_norm": 1.3744786977767944, + "learning_rate": 0.00013435964910345988, + "loss": 2.2195, + "step": 24266 + }, + { + "epoch": 2.3326924925502257, + "grad_norm": 1.0847142934799194, + "learning_rate": 0.00013434727510680707, + "loss": 1.8465, + "step": 24267 + }, + { + "epoch": 2.332788618667692, + "grad_norm": 1.1424949169158936, + "learning_rate": 0.00013433490123086037, + "loss": 1.8689, + "step": 24268 + }, + { + "epoch": 2.332884744785158, + "grad_norm": 1.3177266120910645, + "learning_rate": 0.00013432252747570684, + "loss": 1.9634, + "step": 24269 + }, + { + "epoch": 2.3329808709026243, + "grad_norm": 1.230751872062683, + "learning_rate": 0.00013431015384143355, + "loss": 2.1161, + "step": 24270 + }, + { + "epoch": 2.33307699702009, + "grad_norm": 1.1352957487106323, + "learning_rate": 0.00013429778032812763, + "loss": 2.3269, + "step": 24271 + }, + { + "epoch": 2.3331731231375565, + "grad_norm": 1.1977053880691528, + "learning_rate": 0.00013428540693587607, + "loss": 2.0168, + "step": 24272 + }, + { + "epoch": 2.3332692492550224, + "grad_norm": 1.3553845882415771, + "learning_rate": 0.000134273033664766, + "loss": 1.917, + "step": 24273 + }, + { + "epoch": 2.333365375372489, + "grad_norm": 1.2340413331985474, + "learning_rate": 0.0001342606605148845, + "loss": 1.9957, + "step": 24274 + }, + { + "epoch": 2.3334615014899547, + "grad_norm": 1.3719992637634277, + "learning_rate": 0.00013424828748631858, + "loss": 2.1587, + "step": 24275 + }, + { + "epoch": 2.333557627607421, + "grad_norm": 1.1822725534439087, + "learning_rate": 0.00013423591457915537, + "loss": 1.9338, + "step": 24276 + }, + { + "epoch": 2.333653753724887, + "grad_norm": 1.1955909729003906, + "learning_rate": 0.00013422354179348192, + "loss": 2.1208, + "step": 24277 + }, + { + "epoch": 2.3337498798423533, + "grad_norm": 1.1066285371780396, + "learning_rate": 0.00013421116912938532, + "loss": 1.8282, + "step": 24278 + }, + { + "epoch": 2.333846005959819, + "grad_norm": 1.1512407064437866, + "learning_rate": 0.0001341987965869526, + "loss": 2.101, + "step": 24279 + }, + { + "epoch": 2.3339421320772855, + "grad_norm": 0.9753270745277405, + "learning_rate": 0.00013418642416627082, + "loss": 1.797, + "step": 24280 + }, + { + "epoch": 2.3340382581947514, + "grad_norm": 1.2207244634628296, + "learning_rate": 0.0001341740518674271, + "loss": 2.1179, + "step": 24281 + }, + { + "epoch": 2.3341343843122178, + "grad_norm": 1.1611179113388062, + "learning_rate": 0.00013416167969050846, + "loss": 2.0976, + "step": 24282 + }, + { + "epoch": 2.3342305104296837, + "grad_norm": 1.2760461568832397, + "learning_rate": 0.00013414930763560202, + "loss": 2.1018, + "step": 24283 + }, + { + "epoch": 2.33432663654715, + "grad_norm": 1.1717374324798584, + "learning_rate": 0.00013413693570279476, + "loss": 2.0155, + "step": 24284 + }, + { + "epoch": 2.334422762664616, + "grad_norm": 1.3390133380889893, + "learning_rate": 0.00013412456389217382, + "loss": 1.9977, + "step": 24285 + }, + { + "epoch": 2.3345188887820822, + "grad_norm": 1.0492421388626099, + "learning_rate": 0.0001341121922038262, + "loss": 1.959, + "step": 24286 + }, + { + "epoch": 2.334615014899548, + "grad_norm": 1.0079057216644287, + "learning_rate": 0.00013409982063783906, + "loss": 1.849, + "step": 24287 + }, + { + "epoch": 2.3347111410170145, + "grad_norm": 1.1648786067962646, + "learning_rate": 0.00013408744919429932, + "loss": 1.9061, + "step": 24288 + }, + { + "epoch": 2.3348072671344804, + "grad_norm": 1.0931538343429565, + "learning_rate": 0.00013407507787329415, + "loss": 2.1109, + "step": 24289 + }, + { + "epoch": 2.3349033932519467, + "grad_norm": 1.1457173824310303, + "learning_rate": 0.00013406270667491054, + "loss": 2.0112, + "step": 24290 + }, + { + "epoch": 2.3349995193694126, + "grad_norm": 1.0626634359359741, + "learning_rate": 0.0001340503355992356, + "loss": 2.172, + "step": 24291 + }, + { + "epoch": 2.335095645486879, + "grad_norm": 0.9885098934173584, + "learning_rate": 0.00013403796464635638, + "loss": 2.0175, + "step": 24292 + }, + { + "epoch": 2.335191771604345, + "grad_norm": 1.0887855291366577, + "learning_rate": 0.00013402559381635993, + "loss": 1.8874, + "step": 24293 + }, + { + "epoch": 2.335287897721811, + "grad_norm": 1.2230420112609863, + "learning_rate": 0.00013401322310933327, + "loss": 2.0252, + "step": 24294 + }, + { + "epoch": 2.335384023839277, + "grad_norm": 1.1175109148025513, + "learning_rate": 0.0001340008525253635, + "loss": 1.9032, + "step": 24295 + }, + { + "epoch": 2.3354801499567435, + "grad_norm": 1.2774096727371216, + "learning_rate": 0.00013398848206453763, + "loss": 2.1518, + "step": 24296 + }, + { + "epoch": 2.3355762760742094, + "grad_norm": 1.3430535793304443, + "learning_rate": 0.0001339761117269427, + "loss": 2.1041, + "step": 24297 + }, + { + "epoch": 2.3356724021916753, + "grad_norm": 1.0865856409072876, + "learning_rate": 0.00013396374151266588, + "loss": 1.906, + "step": 24298 + }, + { + "epoch": 2.3357685283091416, + "grad_norm": 1.058003544807434, + "learning_rate": 0.00013395137142179408, + "loss": 1.9892, + "step": 24299 + }, + { + "epoch": 2.335864654426608, + "grad_norm": 1.294028878211975, + "learning_rate": 0.00013393900145441446, + "loss": 2.0535, + "step": 24300 + }, + { + "epoch": 2.335960780544074, + "grad_norm": 1.2524698972702026, + "learning_rate": 0.000133926631610614, + "loss": 2.3248, + "step": 24301 + }, + { + "epoch": 2.3360569066615398, + "grad_norm": 1.1296179294586182, + "learning_rate": 0.00013391426189047977, + "loss": 1.9148, + "step": 24302 + }, + { + "epoch": 2.336153032779006, + "grad_norm": 1.2789865732192993, + "learning_rate": 0.00013390189229409882, + "loss": 1.9232, + "step": 24303 + }, + { + "epoch": 2.336249158896472, + "grad_norm": 1.2606689929962158, + "learning_rate": 0.0001338895228215582, + "loss": 2.0643, + "step": 24304 + }, + { + "epoch": 2.3363452850139383, + "grad_norm": 1.3522659540176392, + "learning_rate": 0.0001338771534729449, + "loss": 2.1394, + "step": 24305 + }, + { + "epoch": 2.3364414111314042, + "grad_norm": 0.9391958117485046, + "learning_rate": 0.00013386478424834606, + "loss": 1.7782, + "step": 24306 + }, + { + "epoch": 2.3365375372488706, + "grad_norm": 1.167536973953247, + "learning_rate": 0.00013385241514784867, + "loss": 1.9111, + "step": 24307 + }, + { + "epoch": 2.3366336633663365, + "grad_norm": 1.1835030317306519, + "learning_rate": 0.00013384004617153977, + "loss": 1.9947, + "step": 24308 + }, + { + "epoch": 2.336729789483803, + "grad_norm": 1.0677188634872437, + "learning_rate": 0.0001338276773195064, + "loss": 2.039, + "step": 24309 + }, + { + "epoch": 2.3368259156012687, + "grad_norm": 1.0870522260665894, + "learning_rate": 0.00013381530859183564, + "loss": 2.0063, + "step": 24310 + }, + { + "epoch": 2.336922041718735, + "grad_norm": 1.1107960939407349, + "learning_rate": 0.00013380293998861448, + "loss": 1.8047, + "step": 24311 + }, + { + "epoch": 2.337018167836201, + "grad_norm": 1.2623363733291626, + "learning_rate": 0.00013379057150993, + "loss": 2.1529, + "step": 24312 + }, + { + "epoch": 2.3371142939536673, + "grad_norm": 1.1861079931259155, + "learning_rate": 0.0001337782031558692, + "loss": 1.9986, + "step": 24313 + }, + { + "epoch": 2.337210420071133, + "grad_norm": 1.2446768283843994, + "learning_rate": 0.0001337658349265191, + "loss": 2.1008, + "step": 24314 + }, + { + "epoch": 2.3373065461885996, + "grad_norm": 1.1703228950500488, + "learning_rate": 0.00013375346682196687, + "loss": 2.0633, + "step": 24315 + }, + { + "epoch": 2.3374026723060655, + "grad_norm": 1.1020146608352661, + "learning_rate": 0.0001337410988422994, + "loss": 2.1575, + "step": 24316 + }, + { + "epoch": 2.337498798423532, + "grad_norm": 1.1100928783416748, + "learning_rate": 0.0001337287309876038, + "loss": 1.9735, + "step": 24317 + }, + { + "epoch": 2.3375949245409977, + "grad_norm": 1.3221746683120728, + "learning_rate": 0.0001337163632579671, + "loss": 2.1404, + "step": 24318 + }, + { + "epoch": 2.337691050658464, + "grad_norm": 1.085748314857483, + "learning_rate": 0.00013370399565347627, + "loss": 2.0426, + "step": 24319 + }, + { + "epoch": 2.33778717677593, + "grad_norm": 1.2025477886199951, + "learning_rate": 0.0001336916281742184, + "loss": 2.0701, + "step": 24320 + }, + { + "epoch": 2.3378833028933963, + "grad_norm": 1.0657942295074463, + "learning_rate": 0.0001336792608202805, + "loss": 2.1566, + "step": 24321 + }, + { + "epoch": 2.337979429010862, + "grad_norm": 1.054363489151001, + "learning_rate": 0.00013366689359174964, + "loss": 2.0193, + "step": 24322 + }, + { + "epoch": 2.3380755551283285, + "grad_norm": 1.0316513776779175, + "learning_rate": 0.00013365452648871282, + "loss": 1.8902, + "step": 24323 + }, + { + "epoch": 2.3381716812457944, + "grad_norm": 1.226576328277588, + "learning_rate": 0.00013364215951125705, + "loss": 1.99, + "step": 24324 + }, + { + "epoch": 2.338267807363261, + "grad_norm": 1.2059963941574097, + "learning_rate": 0.00013362979265946938, + "loss": 2.0236, + "step": 24325 + }, + { + "epoch": 2.3383639334807267, + "grad_norm": 1.1846059560775757, + "learning_rate": 0.00013361742593343684, + "loss": 1.9992, + "step": 24326 + }, + { + "epoch": 2.3384600595981926, + "grad_norm": 1.1302565336227417, + "learning_rate": 0.00013360505933324645, + "loss": 2.172, + "step": 24327 + }, + { + "epoch": 2.338556185715659, + "grad_norm": 1.0475282669067383, + "learning_rate": 0.00013359269285898522, + "loss": 1.9336, + "step": 24328 + }, + { + "epoch": 2.3386523118331253, + "grad_norm": 1.2780007123947144, + "learning_rate": 0.00013358032651074022, + "loss": 2.1454, + "step": 24329 + }, + { + "epoch": 2.338748437950591, + "grad_norm": 1.1868863105773926, + "learning_rate": 0.00013356796028859844, + "loss": 2.026, + "step": 24330 + }, + { + "epoch": 2.338844564068057, + "grad_norm": 1.294769525527954, + "learning_rate": 0.00013355559419264687, + "loss": 2.0765, + "step": 24331 + }, + { + "epoch": 2.3389406901855234, + "grad_norm": 1.1391477584838867, + "learning_rate": 0.0001335432282229726, + "loss": 2.0378, + "step": 24332 + }, + { + "epoch": 2.3390368163029898, + "grad_norm": 1.2338871955871582, + "learning_rate": 0.00013353086237966264, + "loss": 2.0666, + "step": 24333 + }, + { + "epoch": 2.3391329424204557, + "grad_norm": 1.1499844789505005, + "learning_rate": 0.00013351849666280396, + "loss": 2.0093, + "step": 24334 + }, + { + "epoch": 2.3392290685379216, + "grad_norm": 1.0932894945144653, + "learning_rate": 0.0001335061310724836, + "loss": 1.8841, + "step": 24335 + }, + { + "epoch": 2.339325194655388, + "grad_norm": 1.12949538230896, + "learning_rate": 0.00013349376560878862, + "loss": 2.1889, + "step": 24336 + }, + { + "epoch": 2.339421320772854, + "grad_norm": 1.0696182250976562, + "learning_rate": 0.000133481400271806, + "loss": 2.0116, + "step": 24337 + }, + { + "epoch": 2.33951744689032, + "grad_norm": 1.1705515384674072, + "learning_rate": 0.0001334690350616228, + "loss": 1.7609, + "step": 24338 + }, + { + "epoch": 2.339613573007786, + "grad_norm": 1.2021325826644897, + "learning_rate": 0.00013345666997832591, + "loss": 1.9396, + "step": 24339 + }, + { + "epoch": 2.3397096991252524, + "grad_norm": 1.0924609899520874, + "learning_rate": 0.0001334443050220025, + "loss": 1.8844, + "step": 24340 + }, + { + "epoch": 2.3398058252427183, + "grad_norm": 1.2054316997528076, + "learning_rate": 0.0001334319401927395, + "loss": 2.1817, + "step": 24341 + }, + { + "epoch": 2.3399019513601846, + "grad_norm": 1.393297553062439, + "learning_rate": 0.00013341957549062392, + "loss": 2.247, + "step": 24342 + }, + { + "epoch": 2.3399980774776505, + "grad_norm": 1.1352827548980713, + "learning_rate": 0.00013340721091574284, + "loss": 1.982, + "step": 24343 + }, + { + "epoch": 2.340094203595117, + "grad_norm": 1.0915107727050781, + "learning_rate": 0.0001333948464681832, + "loss": 2.0131, + "step": 24344 + }, + { + "epoch": 2.340190329712583, + "grad_norm": 1.0580312013626099, + "learning_rate": 0.000133382482148032, + "loss": 2.0103, + "step": 24345 + }, + { + "epoch": 2.340286455830049, + "grad_norm": 1.2229853868484497, + "learning_rate": 0.00013337011795537632, + "loss": 2.0545, + "step": 24346 + }, + { + "epoch": 2.340382581947515, + "grad_norm": 1.234950065612793, + "learning_rate": 0.00013335775389030312, + "loss": 2.0085, + "step": 24347 + }, + { + "epoch": 2.3404787080649814, + "grad_norm": 1.2697023153305054, + "learning_rate": 0.0001333453899528994, + "loss": 2.0542, + "step": 24348 + }, + { + "epoch": 2.3405748341824473, + "grad_norm": 1.196711778640747, + "learning_rate": 0.00013333302614325221, + "loss": 2.0989, + "step": 24349 + }, + { + "epoch": 2.3406709602999136, + "grad_norm": 1.1415977478027344, + "learning_rate": 0.00013332066246144852, + "loss": 2.0079, + "step": 24350 + }, + { + "epoch": 2.3407670864173795, + "grad_norm": 1.1490622758865356, + "learning_rate": 0.00013330829890757535, + "loss": 2.1397, + "step": 24351 + }, + { + "epoch": 2.340863212534846, + "grad_norm": 1.2325295209884644, + "learning_rate": 0.00013329593548171972, + "loss": 2.1014, + "step": 24352 + }, + { + "epoch": 2.3409593386523118, + "grad_norm": 1.0849323272705078, + "learning_rate": 0.0001332835721839686, + "loss": 1.7803, + "step": 24353 + }, + { + "epoch": 2.341055464769778, + "grad_norm": 1.2079449892044067, + "learning_rate": 0.000133271209014409, + "loss": 1.9652, + "step": 24354 + }, + { + "epoch": 2.341151590887244, + "grad_norm": 1.1505787372589111, + "learning_rate": 0.00013325884597312793, + "loss": 2.0376, + "step": 24355 + }, + { + "epoch": 2.3412477170047103, + "grad_norm": 1.1009714603424072, + "learning_rate": 0.0001332464830602124, + "loss": 2.0507, + "step": 24356 + }, + { + "epoch": 2.3413438431221762, + "grad_norm": 1.1230359077453613, + "learning_rate": 0.00013323412027574938, + "loss": 2.1033, + "step": 24357 + }, + { + "epoch": 2.3414399692396426, + "grad_norm": 1.1034585237503052, + "learning_rate": 0.00013322175761982592, + "loss": 2.1119, + "step": 24358 + }, + { + "epoch": 2.3415360953571085, + "grad_norm": 1.022386908531189, + "learning_rate": 0.00013320939509252894, + "loss": 1.9856, + "step": 24359 + }, + { + "epoch": 2.3416322214745744, + "grad_norm": 1.0423200130462646, + "learning_rate": 0.00013319703269394552, + "loss": 1.7934, + "step": 24360 + }, + { + "epoch": 2.3417283475920407, + "grad_norm": 1.3168411254882812, + "learning_rate": 0.0001331846704241626, + "loss": 2.0286, + "step": 24361 + }, + { + "epoch": 2.341824473709507, + "grad_norm": 1.1031025648117065, + "learning_rate": 0.00013317230828326716, + "loss": 2.1535, + "step": 24362 + }, + { + "epoch": 2.341920599826973, + "grad_norm": 1.1829147338867188, + "learning_rate": 0.00013315994627134626, + "loss": 2.1389, + "step": 24363 + }, + { + "epoch": 2.342016725944439, + "grad_norm": 0.9947364926338196, + "learning_rate": 0.00013314758438848687, + "loss": 1.9774, + "step": 24364 + }, + { + "epoch": 2.3421128520619052, + "grad_norm": 1.2006452083587646, + "learning_rate": 0.00013313522263477592, + "loss": 2.04, + "step": 24365 + }, + { + "epoch": 2.3422089781793716, + "grad_norm": 1.1598271131515503, + "learning_rate": 0.0001331228610103005, + "loss": 1.952, + "step": 24366 + }, + { + "epoch": 2.3423051042968375, + "grad_norm": 1.1612653732299805, + "learning_rate": 0.00013311049951514754, + "loss": 2.0349, + "step": 24367 + }, + { + "epoch": 2.3424012304143034, + "grad_norm": 1.0844647884368896, + "learning_rate": 0.00013309813814940403, + "loss": 1.8498, + "step": 24368 + }, + { + "epoch": 2.3424973565317697, + "grad_norm": 1.0882840156555176, + "learning_rate": 0.00013308577691315698, + "loss": 1.9826, + "step": 24369 + }, + { + "epoch": 2.3425934826492356, + "grad_norm": 1.2090226411819458, + "learning_rate": 0.00013307341580649337, + "loss": 2.1729, + "step": 24370 + }, + { + "epoch": 2.342689608766702, + "grad_norm": 1.0737251043319702, + "learning_rate": 0.0001330610548295002, + "loss": 2.0903, + "step": 24371 + }, + { + "epoch": 2.342785734884168, + "grad_norm": 1.2617424726486206, + "learning_rate": 0.00013304869398226443, + "loss": 2.0751, + "step": 24372 + }, + { + "epoch": 2.342881861001634, + "grad_norm": 1.1195682287216187, + "learning_rate": 0.00013303633326487306, + "loss": 1.9799, + "step": 24373 + }, + { + "epoch": 2.3429779871191, + "grad_norm": 1.1955190896987915, + "learning_rate": 0.00013302397267741308, + "loss": 1.9007, + "step": 24374 + }, + { + "epoch": 2.3430741132365664, + "grad_norm": 1.0784167051315308, + "learning_rate": 0.00013301161221997146, + "loss": 2.0167, + "step": 24375 + }, + { + "epoch": 2.3431702393540323, + "grad_norm": 1.2586777210235596, + "learning_rate": 0.00013299925189263516, + "loss": 2.1376, + "step": 24376 + }, + { + "epoch": 2.3432663654714987, + "grad_norm": 1.2639628648757935, + "learning_rate": 0.0001329868916954912, + "loss": 2.0984, + "step": 24377 + }, + { + "epoch": 2.3433624915889646, + "grad_norm": 1.1807355880737305, + "learning_rate": 0.00013297453162862657, + "loss": 1.898, + "step": 24378 + }, + { + "epoch": 2.343458617706431, + "grad_norm": 1.0786457061767578, + "learning_rate": 0.00013296217169212818, + "loss": 2.0738, + "step": 24379 + }, + { + "epoch": 2.343554743823897, + "grad_norm": 1.1616954803466797, + "learning_rate": 0.00013294981188608312, + "loss": 2.0713, + "step": 24380 + }, + { + "epoch": 2.343650869941363, + "grad_norm": 1.2769370079040527, + "learning_rate": 0.00013293745221057825, + "loss": 2.0708, + "step": 24381 + }, + { + "epoch": 2.343746996058829, + "grad_norm": 1.2057898044586182, + "learning_rate": 0.0001329250926657006, + "loss": 1.9719, + "step": 24382 + }, + { + "epoch": 2.3438431221762954, + "grad_norm": 1.1257277727127075, + "learning_rate": 0.00013291273325153716, + "loss": 2.1603, + "step": 24383 + }, + { + "epoch": 2.3439392482937613, + "grad_norm": 1.310401439666748, + "learning_rate": 0.0001329003739681749, + "loss": 2.1048, + "step": 24384 + }, + { + "epoch": 2.3440353744112277, + "grad_norm": 1.3111910820007324, + "learning_rate": 0.00013288801481570078, + "loss": 1.8686, + "step": 24385 + }, + { + "epoch": 2.3441315005286936, + "grad_norm": 1.213840365409851, + "learning_rate": 0.00013287565579420178, + "loss": 2.2145, + "step": 24386 + }, + { + "epoch": 2.34422762664616, + "grad_norm": 1.1374356746673584, + "learning_rate": 0.00013286329690376488, + "loss": 2.0838, + "step": 24387 + }, + { + "epoch": 2.344323752763626, + "grad_norm": 1.0905719995498657, + "learning_rate": 0.00013285093814447704, + "loss": 1.9763, + "step": 24388 + }, + { + "epoch": 2.344419878881092, + "grad_norm": 1.1283773183822632, + "learning_rate": 0.00013283857951642525, + "loss": 2.0481, + "step": 24389 + }, + { + "epoch": 2.344516004998558, + "grad_norm": 1.1061210632324219, + "learning_rate": 0.00013282622101969642, + "loss": 1.9905, + "step": 24390 + }, + { + "epoch": 2.3446121311160244, + "grad_norm": 1.2075175046920776, + "learning_rate": 0.00013281386265437758, + "loss": 2.0129, + "step": 24391 + }, + { + "epoch": 2.3447082572334903, + "grad_norm": 1.1305018663406372, + "learning_rate": 0.00013280150442055566, + "loss": 2.1878, + "step": 24392 + }, + { + "epoch": 2.3448043833509566, + "grad_norm": 1.202560544013977, + "learning_rate": 0.00013278914631831767, + "loss": 2.1034, + "step": 24393 + }, + { + "epoch": 2.3449005094684225, + "grad_norm": 1.134842872619629, + "learning_rate": 0.00013277678834775053, + "loss": 1.9829, + "step": 24394 + }, + { + "epoch": 2.344996635585889, + "grad_norm": 1.1030272245407104, + "learning_rate": 0.00013276443050894124, + "loss": 1.8751, + "step": 24395 + }, + { + "epoch": 2.345092761703355, + "grad_norm": 1.1310365200042725, + "learning_rate": 0.00013275207280197675, + "loss": 1.9644, + "step": 24396 + }, + { + "epoch": 2.3451888878208207, + "grad_norm": 1.1830352544784546, + "learning_rate": 0.000132739715226944, + "loss": 2.1201, + "step": 24397 + }, + { + "epoch": 2.345285013938287, + "grad_norm": 1.1067270040512085, + "learning_rate": 0.00013272735778393, + "loss": 2.1623, + "step": 24398 + }, + { + "epoch": 2.3453811400557534, + "grad_norm": 1.1641184091567993, + "learning_rate": 0.00013271500047302162, + "loss": 1.9445, + "step": 24399 + }, + { + "epoch": 2.3454772661732193, + "grad_norm": 1.1535180807113647, + "learning_rate": 0.00013270264329430593, + "loss": 1.955, + "step": 24400 + }, + { + "epoch": 2.345573392290685, + "grad_norm": 1.1437662839889526, + "learning_rate": 0.00013269028624786984, + "loss": 1.9962, + "step": 24401 + }, + { + "epoch": 2.3456695184081515, + "grad_norm": 1.1186115741729736, + "learning_rate": 0.00013267792933380033, + "loss": 1.8845, + "step": 24402 + }, + { + "epoch": 2.3457656445256174, + "grad_norm": 1.2320407629013062, + "learning_rate": 0.00013266557255218434, + "loss": 2.0827, + "step": 24403 + }, + { + "epoch": 2.3458617706430838, + "grad_norm": 1.0384749174118042, + "learning_rate": 0.0001326532159031088, + "loss": 2.0099, + "step": 24404 + }, + { + "epoch": 2.3459578967605497, + "grad_norm": 1.073445439338684, + "learning_rate": 0.00013264085938666072, + "loss": 2.0293, + "step": 24405 + }, + { + "epoch": 2.346054022878016, + "grad_norm": 1.140194058418274, + "learning_rate": 0.000132628503002927, + "loss": 2.0563, + "step": 24406 + }, + { + "epoch": 2.346150148995482, + "grad_norm": 0.9994754195213318, + "learning_rate": 0.00013261614675199462, + "loss": 2.0514, + "step": 24407 + }, + { + "epoch": 2.3462462751129483, + "grad_norm": 1.1141254901885986, + "learning_rate": 0.00013260379063395053, + "loss": 2.0838, + "step": 24408 + }, + { + "epoch": 2.346342401230414, + "grad_norm": 1.0965356826782227, + "learning_rate": 0.0001325914346488817, + "loss": 1.8652, + "step": 24409 + }, + { + "epoch": 2.3464385273478805, + "grad_norm": 1.3668845891952515, + "learning_rate": 0.00013257907879687505, + "loss": 2.1324, + "step": 24410 + }, + { + "epoch": 2.3465346534653464, + "grad_norm": 0.9971897602081299, + "learning_rate": 0.00013256672307801755, + "loss": 2.0756, + "step": 24411 + }, + { + "epoch": 2.3466307795828127, + "grad_norm": 1.219504475593567, + "learning_rate": 0.00013255436749239617, + "loss": 2.1967, + "step": 24412 + }, + { + "epoch": 2.3467269057002786, + "grad_norm": 1.243358850479126, + "learning_rate": 0.0001325420120400978, + "loss": 2.0378, + "step": 24413 + }, + { + "epoch": 2.346823031817745, + "grad_norm": 1.0299962759017944, + "learning_rate": 0.00013252965672120941, + "loss": 2.0736, + "step": 24414 + }, + { + "epoch": 2.346919157935211, + "grad_norm": 1.2189419269561768, + "learning_rate": 0.00013251730153581795, + "loss": 2.02, + "step": 24415 + }, + { + "epoch": 2.3470152840526772, + "grad_norm": 1.218695044517517, + "learning_rate": 0.00013250494648401037, + "loss": 2.1673, + "step": 24416 + }, + { + "epoch": 2.347111410170143, + "grad_norm": 1.086969017982483, + "learning_rate": 0.00013249259156587363, + "loss": 2.0955, + "step": 24417 + }, + { + "epoch": 2.3472075362876095, + "grad_norm": 1.1918845176696777, + "learning_rate": 0.00013248023678149462, + "loss": 2.1582, + "step": 24418 + }, + { + "epoch": 2.3473036624050754, + "grad_norm": 1.2572964429855347, + "learning_rate": 0.00013246788213096036, + "loss": 2.1703, + "step": 24419 + }, + { + "epoch": 2.3473997885225417, + "grad_norm": 1.2284513711929321, + "learning_rate": 0.00013245552761435777, + "loss": 2.0535, + "step": 24420 + }, + { + "epoch": 2.3474959146400076, + "grad_norm": 1.1699645519256592, + "learning_rate": 0.00013244317323177373, + "loss": 1.9656, + "step": 24421 + }, + { + "epoch": 2.347592040757474, + "grad_norm": 1.3625388145446777, + "learning_rate": 0.00013243081898329525, + "loss": 2.195, + "step": 24422 + }, + { + "epoch": 2.34768816687494, + "grad_norm": 1.1479430198669434, + "learning_rate": 0.0001324184648690092, + "loss": 1.9796, + "step": 24423 + }, + { + "epoch": 2.347784292992406, + "grad_norm": 1.3793545961380005, + "learning_rate": 0.00013240611088900258, + "loss": 2.0592, + "step": 24424 + }, + { + "epoch": 2.347880419109872, + "grad_norm": 1.059247612953186, + "learning_rate": 0.00013239375704336232, + "loss": 2.0135, + "step": 24425 + }, + { + "epoch": 2.3479765452273385, + "grad_norm": 1.21739661693573, + "learning_rate": 0.0001323814033321753, + "loss": 1.9753, + "step": 24426 + }, + { + "epoch": 2.3480726713448044, + "grad_norm": 1.1350008249282837, + "learning_rate": 0.00013236904975552852, + "loss": 1.8651, + "step": 24427 + }, + { + "epoch": 2.3481687974622707, + "grad_norm": 1.1250909566879272, + "learning_rate": 0.00013235669631350886, + "loss": 1.9367, + "step": 24428 + }, + { + "epoch": 2.3482649235797366, + "grad_norm": 0.9493280649185181, + "learning_rate": 0.0001323443430062033, + "loss": 1.9919, + "step": 24429 + }, + { + "epoch": 2.3483610496972025, + "grad_norm": 1.2945367097854614, + "learning_rate": 0.00013233198983369877, + "loss": 2.1617, + "step": 24430 + }, + { + "epoch": 2.348457175814669, + "grad_norm": 1.0833920240402222, + "learning_rate": 0.00013231963679608217, + "loss": 1.8238, + "step": 24431 + }, + { + "epoch": 2.348553301932135, + "grad_norm": 1.2501381635665894, + "learning_rate": 0.00013230728389344043, + "loss": 2.0521, + "step": 24432 + }, + { + "epoch": 2.348649428049601, + "grad_norm": 1.2207226753234863, + "learning_rate": 0.00013229493112586047, + "loss": 1.9355, + "step": 24433 + }, + { + "epoch": 2.348745554167067, + "grad_norm": 1.1822617053985596, + "learning_rate": 0.00013228257849342927, + "loss": 2.0964, + "step": 24434 + }, + { + "epoch": 2.3488416802845333, + "grad_norm": 1.0864731073379517, + "learning_rate": 0.00013227022599623372, + "loss": 1.9453, + "step": 24435 + }, + { + "epoch": 2.3489378064019997, + "grad_norm": 1.2354271411895752, + "learning_rate": 0.00013225787363436075, + "loss": 2.0296, + "step": 24436 + }, + { + "epoch": 2.3490339325194656, + "grad_norm": 1.333143949508667, + "learning_rate": 0.00013224552140789732, + "loss": 2.0591, + "step": 24437 + }, + { + "epoch": 2.3491300586369315, + "grad_norm": 1.1473129987716675, + "learning_rate": 0.00013223316931693031, + "loss": 2.0406, + "step": 24438 + }, + { + "epoch": 2.349226184754398, + "grad_norm": 1.217559814453125, + "learning_rate": 0.00013222081736154664, + "loss": 2.1073, + "step": 24439 + }, + { + "epoch": 2.3493223108718637, + "grad_norm": 1.0520596504211426, + "learning_rate": 0.00013220846554183328, + "loss": 2.0718, + "step": 24440 + }, + { + "epoch": 2.34941843698933, + "grad_norm": 1.119504690170288, + "learning_rate": 0.0001321961138578771, + "loss": 2.0325, + "step": 24441 + }, + { + "epoch": 2.349514563106796, + "grad_norm": 1.2242158651351929, + "learning_rate": 0.00013218376230976503, + "loss": 2.0015, + "step": 24442 + }, + { + "epoch": 2.3496106892242623, + "grad_norm": 1.1757965087890625, + "learning_rate": 0.00013217141089758402, + "loss": 1.9923, + "step": 24443 + }, + { + "epoch": 2.349706815341728, + "grad_norm": 1.139749526977539, + "learning_rate": 0.00013215905962142094, + "loss": 2.0648, + "step": 24444 + }, + { + "epoch": 2.3498029414591945, + "grad_norm": 1.0675456523895264, + "learning_rate": 0.00013214670848136277, + "loss": 2.1016, + "step": 24445 + }, + { + "epoch": 2.3498990675766604, + "grad_norm": 1.236735463142395, + "learning_rate": 0.0001321343574774964, + "loss": 1.9106, + "step": 24446 + }, + { + "epoch": 2.349995193694127, + "grad_norm": 1.1445536613464355, + "learning_rate": 0.00013212200660990873, + "loss": 2.0006, + "step": 24447 + }, + { + "epoch": 2.3500913198115927, + "grad_norm": 1.1127448081970215, + "learning_rate": 0.0001321096558786867, + "loss": 2.0181, + "step": 24448 + }, + { + "epoch": 2.350187445929059, + "grad_norm": 1.1925277709960938, + "learning_rate": 0.0001320973052839172, + "loss": 2.1824, + "step": 24449 + }, + { + "epoch": 2.350283572046525, + "grad_norm": 1.339717984199524, + "learning_rate": 0.00013208495482568714, + "loss": 2.0371, + "step": 24450 + }, + { + "epoch": 2.3503796981639913, + "grad_norm": 1.2462061643600464, + "learning_rate": 0.0001320726045040834, + "loss": 2.0738, + "step": 24451 + }, + { + "epoch": 2.350475824281457, + "grad_norm": 1.0208758115768433, + "learning_rate": 0.00013206025431919296, + "loss": 1.9407, + "step": 24452 + }, + { + "epoch": 2.3505719503989235, + "grad_norm": 1.1424585580825806, + "learning_rate": 0.00013204790427110273, + "loss": 2.1033, + "step": 24453 + }, + { + "epoch": 2.3506680765163894, + "grad_norm": 1.1759226322174072, + "learning_rate": 0.00013203555435989956, + "loss": 2.0475, + "step": 24454 + }, + { + "epoch": 2.3507642026338558, + "grad_norm": 1.2119067907333374, + "learning_rate": 0.00013202320458567042, + "loss": 2.089, + "step": 24455 + }, + { + "epoch": 2.3508603287513217, + "grad_norm": 1.3498772382736206, + "learning_rate": 0.0001320108549485022, + "loss": 1.9726, + "step": 24456 + }, + { + "epoch": 2.350956454868788, + "grad_norm": 1.2115378379821777, + "learning_rate": 0.00013199850544848174, + "loss": 1.9553, + "step": 24457 + }, + { + "epoch": 2.351052580986254, + "grad_norm": 1.199209451675415, + "learning_rate": 0.00013198615608569605, + "loss": 2.1059, + "step": 24458 + }, + { + "epoch": 2.3511487071037203, + "grad_norm": 1.277611255645752, + "learning_rate": 0.00013197380686023194, + "loss": 1.8608, + "step": 24459 + }, + { + "epoch": 2.351244833221186, + "grad_norm": 1.0560358762741089, + "learning_rate": 0.00013196145777217636, + "loss": 2.0019, + "step": 24460 + }, + { + "epoch": 2.3513409593386525, + "grad_norm": 1.0711677074432373, + "learning_rate": 0.00013194910882161623, + "loss": 1.876, + "step": 24461 + }, + { + "epoch": 2.3514370854561184, + "grad_norm": 1.220340609550476, + "learning_rate": 0.00013193676000863844, + "loss": 1.9215, + "step": 24462 + }, + { + "epoch": 2.3515332115735843, + "grad_norm": 1.0880016088485718, + "learning_rate": 0.00013192441133332984, + "loss": 1.978, + "step": 24463 + }, + { + "epoch": 2.3516293376910506, + "grad_norm": 1.1137269735336304, + "learning_rate": 0.0001319120627957774, + "loss": 1.9311, + "step": 24464 + }, + { + "epoch": 2.351725463808517, + "grad_norm": 1.1684174537658691, + "learning_rate": 0.00013189971439606798, + "loss": 2.0583, + "step": 24465 + }, + { + "epoch": 2.351821589925983, + "grad_norm": 1.2412364482879639, + "learning_rate": 0.0001318873661342885, + "loss": 2.1028, + "step": 24466 + }, + { + "epoch": 2.351917716043449, + "grad_norm": 1.0650871992111206, + "learning_rate": 0.00013187501801052584, + "loss": 1.9748, + "step": 24467 + }, + { + "epoch": 2.352013842160915, + "grad_norm": 1.0958929061889648, + "learning_rate": 0.00013186267002486687, + "loss": 2.1649, + "step": 24468 + }, + { + "epoch": 2.3521099682783815, + "grad_norm": 1.047446370124817, + "learning_rate": 0.00013185032217739854, + "loss": 2.0403, + "step": 24469 + }, + { + "epoch": 2.3522060943958474, + "grad_norm": 1.1624647378921509, + "learning_rate": 0.0001318379744682077, + "loss": 2.0672, + "step": 24470 + }, + { + "epoch": 2.3523022205133133, + "grad_norm": 1.003448486328125, + "learning_rate": 0.00013182562689738127, + "loss": 1.9354, + "step": 24471 + }, + { + "epoch": 2.3523983466307796, + "grad_norm": 1.173258662223816, + "learning_rate": 0.00013181327946500612, + "loss": 1.9239, + "step": 24472 + }, + { + "epoch": 2.3524944727482455, + "grad_norm": 1.1309654712677002, + "learning_rate": 0.00013180093217116914, + "loss": 2.0461, + "step": 24473 + }, + { + "epoch": 2.352590598865712, + "grad_norm": 1.2341063022613525, + "learning_rate": 0.00013178858501595723, + "loss": 2.1993, + "step": 24474 + }, + { + "epoch": 2.3526867249831778, + "grad_norm": 1.1357762813568115, + "learning_rate": 0.0001317762379994573, + "loss": 1.9633, + "step": 24475 + }, + { + "epoch": 2.352782851100644, + "grad_norm": 1.108078956604004, + "learning_rate": 0.0001317638911217562, + "loss": 2.0292, + "step": 24476 + }, + { + "epoch": 2.35287897721811, + "grad_norm": 1.2889431715011597, + "learning_rate": 0.0001317515443829408, + "loss": 1.8106, + "step": 24477 + }, + { + "epoch": 2.3529751033355764, + "grad_norm": 1.22144615650177, + "learning_rate": 0.00013173919778309805, + "loss": 1.9943, + "step": 24478 + }, + { + "epoch": 2.3530712294530423, + "grad_norm": 1.0492221117019653, + "learning_rate": 0.00013172685132231481, + "loss": 2.0393, + "step": 24479 + }, + { + "epoch": 2.3531673555705086, + "grad_norm": 1.0996482372283936, + "learning_rate": 0.00013171450500067797, + "loss": 1.8643, + "step": 24480 + }, + { + "epoch": 2.3532634816879745, + "grad_norm": 1.185518741607666, + "learning_rate": 0.00013170215881827438, + "loss": 1.9042, + "step": 24481 + }, + { + "epoch": 2.353359607805441, + "grad_norm": 1.1921501159667969, + "learning_rate": 0.00013168981277519095, + "loss": 1.9638, + "step": 24482 + }, + { + "epoch": 2.3534557339229067, + "grad_norm": 1.1290347576141357, + "learning_rate": 0.00013167746687151453, + "loss": 2.0102, + "step": 24483 + }, + { + "epoch": 2.353551860040373, + "grad_norm": 1.063673973083496, + "learning_rate": 0.00013166512110733203, + "loss": 1.8767, + "step": 24484 + }, + { + "epoch": 2.353647986157839, + "grad_norm": 1.1230703592300415, + "learning_rate": 0.00013165277548273034, + "loss": 1.8767, + "step": 24485 + }, + { + "epoch": 2.3537441122753053, + "grad_norm": 1.106750726699829, + "learning_rate": 0.00013164042999779633, + "loss": 2.0739, + "step": 24486 + }, + { + "epoch": 2.3538402383927712, + "grad_norm": 1.0299193859100342, + "learning_rate": 0.00013162808465261685, + "loss": 2.0253, + "step": 24487 + }, + { + "epoch": 2.3539363645102376, + "grad_norm": 1.1594417095184326, + "learning_rate": 0.00013161573944727876, + "loss": 2.0073, + "step": 24488 + }, + { + "epoch": 2.3540324906277035, + "grad_norm": 1.1888129711151123, + "learning_rate": 0.000131603394381869, + "loss": 1.9693, + "step": 24489 + }, + { + "epoch": 2.35412861674517, + "grad_norm": 1.062905192375183, + "learning_rate": 0.0001315910494564744, + "loss": 1.9001, + "step": 24490 + }, + { + "epoch": 2.3542247428626357, + "grad_norm": 1.1698857545852661, + "learning_rate": 0.00013157870467118182, + "loss": 1.9663, + "step": 24491 + }, + { + "epoch": 2.354320868980102, + "grad_norm": 1.309299349784851, + "learning_rate": 0.00013156636002607818, + "loss": 2.0168, + "step": 24492 + }, + { + "epoch": 2.354416995097568, + "grad_norm": 1.0052533149719238, + "learning_rate": 0.00013155401552125036, + "loss": 1.8114, + "step": 24493 + }, + { + "epoch": 2.3545131212150343, + "grad_norm": 1.0176825523376465, + "learning_rate": 0.00013154167115678512, + "loss": 1.8432, + "step": 24494 + }, + { + "epoch": 2.3546092473325, + "grad_norm": 1.1053314208984375, + "learning_rate": 0.00013152932693276945, + "loss": 2.07, + "step": 24495 + }, + { + "epoch": 2.354705373449966, + "grad_norm": 1.1117801666259766, + "learning_rate": 0.00013151698284929016, + "loss": 2.1922, + "step": 24496 + }, + { + "epoch": 2.3548014995674325, + "grad_norm": 1.2106889486312866, + "learning_rate": 0.00013150463890643414, + "loss": 2.1325, + "step": 24497 + }, + { + "epoch": 2.354897625684899, + "grad_norm": 1.1754205226898193, + "learning_rate": 0.00013149229510428824, + "loss": 2.0445, + "step": 24498 + }, + { + "epoch": 2.3549937518023647, + "grad_norm": 1.0537710189819336, + "learning_rate": 0.00013147995144293932, + "loss": 2.0514, + "step": 24499 + }, + { + "epoch": 2.3550898779198306, + "grad_norm": 1.1076098680496216, + "learning_rate": 0.0001314676079224743, + "loss": 2.0212, + "step": 24500 + }, + { + "epoch": 2.355186004037297, + "grad_norm": 1.2084929943084717, + "learning_rate": 0.00013145526454297996, + "loss": 1.9677, + "step": 24501 + }, + { + "epoch": 2.3552821301547633, + "grad_norm": 1.1089534759521484, + "learning_rate": 0.00013144292130454321, + "loss": 1.9817, + "step": 24502 + }, + { + "epoch": 2.355378256272229, + "grad_norm": 1.176498532295227, + "learning_rate": 0.0001314305782072509, + "loss": 2.101, + "step": 24503 + }, + { + "epoch": 2.355474382389695, + "grad_norm": 1.2998323440551758, + "learning_rate": 0.0001314182352511899, + "loss": 2.1217, + "step": 24504 + }, + { + "epoch": 2.3555705085071614, + "grad_norm": 1.0919259786605835, + "learning_rate": 0.00013140589243644705, + "loss": 1.8287, + "step": 24505 + }, + { + "epoch": 2.3556666346246273, + "grad_norm": 1.0343611240386963, + "learning_rate": 0.00013139354976310922, + "loss": 1.8433, + "step": 24506 + }, + { + "epoch": 2.3557627607420937, + "grad_norm": 1.1341042518615723, + "learning_rate": 0.00013138120723126329, + "loss": 2.0213, + "step": 24507 + }, + { + "epoch": 2.3558588868595596, + "grad_norm": 1.047274112701416, + "learning_rate": 0.00013136886484099605, + "loss": 1.9806, + "step": 24508 + }, + { + "epoch": 2.355955012977026, + "grad_norm": 1.2614755630493164, + "learning_rate": 0.00013135652259239441, + "loss": 2.2224, + "step": 24509 + }, + { + "epoch": 2.356051139094492, + "grad_norm": 1.1417709589004517, + "learning_rate": 0.00013134418048554522, + "loss": 1.8189, + "step": 24510 + }, + { + "epoch": 2.356147265211958, + "grad_norm": 1.2736961841583252, + "learning_rate": 0.0001313318385205353, + "loss": 1.9814, + "step": 24511 + }, + { + "epoch": 2.356243391329424, + "grad_norm": 1.0802057981491089, + "learning_rate": 0.00013131949669745155, + "loss": 1.8675, + "step": 24512 + }, + { + "epoch": 2.3563395174468904, + "grad_norm": 1.1591928005218506, + "learning_rate": 0.0001313071550163808, + "loss": 1.9813, + "step": 24513 + }, + { + "epoch": 2.3564356435643563, + "grad_norm": 1.1218057870864868, + "learning_rate": 0.0001312948134774099, + "loss": 1.8861, + "step": 24514 + }, + { + "epoch": 2.3565317696818227, + "grad_norm": 1.1867318153381348, + "learning_rate": 0.0001312824720806257, + "loss": 2.0729, + "step": 24515 + }, + { + "epoch": 2.3566278957992886, + "grad_norm": 1.144529938697815, + "learning_rate": 0.00013127013082611503, + "loss": 1.9972, + "step": 24516 + }, + { + "epoch": 2.356724021916755, + "grad_norm": 1.0098835229873657, + "learning_rate": 0.00013125778971396476, + "loss": 1.8917, + "step": 24517 + }, + { + "epoch": 2.356820148034221, + "grad_norm": 1.2121328115463257, + "learning_rate": 0.00013124544874426171, + "loss": 2.075, + "step": 24518 + }, + { + "epoch": 2.356916274151687, + "grad_norm": 1.2158938646316528, + "learning_rate": 0.0001312331079170928, + "loss": 2.1294, + "step": 24519 + }, + { + "epoch": 2.357012400269153, + "grad_norm": 1.020938754081726, + "learning_rate": 0.00013122076723254476, + "loss": 1.9402, + "step": 24520 + }, + { + "epoch": 2.3571085263866194, + "grad_norm": 1.2792479991912842, + "learning_rate": 0.0001312084266907045, + "loss": 1.9484, + "step": 24521 + }, + { + "epoch": 2.3572046525040853, + "grad_norm": 1.1310923099517822, + "learning_rate": 0.00013119608629165886, + "loss": 1.8509, + "step": 24522 + }, + { + "epoch": 2.3573007786215516, + "grad_norm": 1.1218459606170654, + "learning_rate": 0.0001311837460354947, + "loss": 1.9083, + "step": 24523 + }, + { + "epoch": 2.3573969047390175, + "grad_norm": 1.0793323516845703, + "learning_rate": 0.0001311714059222988, + "loss": 1.8704, + "step": 24524 + }, + { + "epoch": 2.357493030856484, + "grad_norm": 1.2338125705718994, + "learning_rate": 0.00013115906595215804, + "loss": 2.0163, + "step": 24525 + }, + { + "epoch": 2.3575891569739498, + "grad_norm": 1.2552101612091064, + "learning_rate": 0.00013114672612515925, + "loss": 1.9717, + "step": 24526 + }, + { + "epoch": 2.357685283091416, + "grad_norm": 1.031463623046875, + "learning_rate": 0.00013113438644138926, + "loss": 2.0967, + "step": 24527 + }, + { + "epoch": 2.357781409208882, + "grad_norm": 1.0839256048202515, + "learning_rate": 0.00013112204690093491, + "loss": 1.9433, + "step": 24528 + }, + { + "epoch": 2.3578775353263484, + "grad_norm": 1.1247661113739014, + "learning_rate": 0.00013110970750388306, + "loss": 1.9143, + "step": 24529 + }, + { + "epoch": 2.3579736614438143, + "grad_norm": 1.1667286157608032, + "learning_rate": 0.0001310973682503205, + "loss": 2.0834, + "step": 24530 + }, + { + "epoch": 2.3580697875612806, + "grad_norm": 1.2750523090362549, + "learning_rate": 0.0001310850291403341, + "loss": 2.2053, + "step": 24531 + }, + { + "epoch": 2.3581659136787465, + "grad_norm": 1.0854854583740234, + "learning_rate": 0.00013107269017401067, + "loss": 1.9592, + "step": 24532 + }, + { + "epoch": 2.3582620397962124, + "grad_norm": 1.1663482189178467, + "learning_rate": 0.00013106035135143708, + "loss": 1.9146, + "step": 24533 + }, + { + "epoch": 2.3583581659136788, + "grad_norm": 1.1378145217895508, + "learning_rate": 0.0001310480126727001, + "loss": 2.0287, + "step": 24534 + }, + { + "epoch": 2.358454292031145, + "grad_norm": 1.076102614402771, + "learning_rate": 0.0001310356741378866, + "loss": 2.1528, + "step": 24535 + }, + { + "epoch": 2.358550418148611, + "grad_norm": 1.1418735980987549, + "learning_rate": 0.0001310233357470834, + "loss": 1.9961, + "step": 24536 + }, + { + "epoch": 2.358646544266077, + "grad_norm": 1.1140918731689453, + "learning_rate": 0.0001310109975003773, + "loss": 2.0917, + "step": 24537 + }, + { + "epoch": 2.3587426703835432, + "grad_norm": 1.1133588552474976, + "learning_rate": 0.00013099865939785517, + "loss": 1.7933, + "step": 24538 + }, + { + "epoch": 2.358838796501009, + "grad_norm": 1.1771867275238037, + "learning_rate": 0.0001309863214396038, + "loss": 2.0005, + "step": 24539 + }, + { + "epoch": 2.3589349226184755, + "grad_norm": 1.030725359916687, + "learning_rate": 0.00013097398362571007, + "loss": 1.8688, + "step": 24540 + }, + { + "epoch": 2.3590310487359414, + "grad_norm": 1.370136022567749, + "learning_rate": 0.00013096164595626073, + "loss": 1.8654, + "step": 24541 + }, + { + "epoch": 2.3591271748534077, + "grad_norm": 1.1243456602096558, + "learning_rate": 0.0001309493084313426, + "loss": 1.9723, + "step": 24542 + }, + { + "epoch": 2.3592233009708736, + "grad_norm": 1.1467465162277222, + "learning_rate": 0.0001309369710510426, + "loss": 1.9091, + "step": 24543 + }, + { + "epoch": 2.35931942708834, + "grad_norm": 1.1276073455810547, + "learning_rate": 0.00013092463381544745, + "loss": 2.1096, + "step": 24544 + }, + { + "epoch": 2.359415553205806, + "grad_norm": 1.1473255157470703, + "learning_rate": 0.00013091229672464397, + "loss": 1.8653, + "step": 24545 + }, + { + "epoch": 2.359511679323272, + "grad_norm": 1.036295771598816, + "learning_rate": 0.00013089995977871903, + "loss": 1.9526, + "step": 24546 + }, + { + "epoch": 2.359607805440738, + "grad_norm": 0.9870808124542236, + "learning_rate": 0.00013088762297775944, + "loss": 1.923, + "step": 24547 + }, + { + "epoch": 2.3597039315582045, + "grad_norm": 1.1754337549209595, + "learning_rate": 0.00013087528632185202, + "loss": 2.0053, + "step": 24548 + }, + { + "epoch": 2.3598000576756704, + "grad_norm": 1.134343147277832, + "learning_rate": 0.00013086294981108352, + "loss": 2.0248, + "step": 24549 + }, + { + "epoch": 2.3598961837931367, + "grad_norm": 1.328657627105713, + "learning_rate": 0.00013085061344554082, + "loss": 2.1358, + "step": 24550 + }, + { + "epoch": 2.3599923099106026, + "grad_norm": 1.2374446392059326, + "learning_rate": 0.00013083827722531075, + "loss": 2.0673, + "step": 24551 + }, + { + "epoch": 2.360088436028069, + "grad_norm": 1.2453112602233887, + "learning_rate": 0.00013082594115048005, + "loss": 1.966, + "step": 24552 + }, + { + "epoch": 2.360184562145535, + "grad_norm": 1.110016107559204, + "learning_rate": 0.00013081360522113557, + "loss": 2.0015, + "step": 24553 + }, + { + "epoch": 2.360280688263001, + "grad_norm": 1.2671308517456055, + "learning_rate": 0.00013080126943736413, + "loss": 1.9421, + "step": 24554 + }, + { + "epoch": 2.360376814380467, + "grad_norm": 1.2711777687072754, + "learning_rate": 0.0001307889337992525, + "loss": 2.2348, + "step": 24555 + }, + { + "epoch": 2.3604729404979334, + "grad_norm": 1.3584601879119873, + "learning_rate": 0.00013077659830688758, + "loss": 2.0758, + "step": 24556 + }, + { + "epoch": 2.3605690666153993, + "grad_norm": 1.2530474662780762, + "learning_rate": 0.00013076426296035603, + "loss": 2.0591, + "step": 24557 + }, + { + "epoch": 2.3606651927328657, + "grad_norm": 1.3382647037506104, + "learning_rate": 0.00013075192775974476, + "loss": 2.005, + "step": 24558 + }, + { + "epoch": 2.3607613188503316, + "grad_norm": 1.120468020439148, + "learning_rate": 0.00013073959270514055, + "loss": 1.912, + "step": 24559 + }, + { + "epoch": 2.360857444967798, + "grad_norm": 1.348085641860962, + "learning_rate": 0.0001307272577966302, + "loss": 2.0544, + "step": 24560 + }, + { + "epoch": 2.360953571085264, + "grad_norm": 1.1562787294387817, + "learning_rate": 0.0001307149230343005, + "loss": 1.8222, + "step": 24561 + }, + { + "epoch": 2.36104969720273, + "grad_norm": 1.2184090614318848, + "learning_rate": 0.00013070258841823827, + "loss": 1.9864, + "step": 24562 + }, + { + "epoch": 2.361145823320196, + "grad_norm": 1.13002610206604, + "learning_rate": 0.00013069025394853035, + "loss": 1.9862, + "step": 24563 + }, + { + "epoch": 2.3612419494376624, + "grad_norm": 1.1185444593429565, + "learning_rate": 0.00013067791962526344, + "loss": 1.8246, + "step": 24564 + }, + { + "epoch": 2.3613380755551283, + "grad_norm": 1.2698159217834473, + "learning_rate": 0.00013066558544852443, + "loss": 2.0954, + "step": 24565 + }, + { + "epoch": 2.361434201672594, + "grad_norm": 1.0040638446807861, + "learning_rate": 0.00013065325141840007, + "loss": 1.8646, + "step": 24566 + }, + { + "epoch": 2.3615303277900606, + "grad_norm": 1.4158955812454224, + "learning_rate": 0.00013064091753497718, + "loss": 2.0516, + "step": 24567 + }, + { + "epoch": 2.361626453907527, + "grad_norm": 1.1881145238876343, + "learning_rate": 0.00013062858379834252, + "loss": 1.9531, + "step": 24568 + }, + { + "epoch": 2.361722580024993, + "grad_norm": 1.172107458114624, + "learning_rate": 0.00013061625020858292, + "loss": 1.9838, + "step": 24569 + }, + { + "epoch": 2.3618187061424587, + "grad_norm": 1.0877958536148071, + "learning_rate": 0.00013060391676578516, + "loss": 1.9912, + "step": 24570 + }, + { + "epoch": 2.361914832259925, + "grad_norm": 1.331859827041626, + "learning_rate": 0.00013059158347003604, + "loss": 2.0258, + "step": 24571 + }, + { + "epoch": 2.362010958377391, + "grad_norm": 1.1209477186203003, + "learning_rate": 0.00013057925032142233, + "loss": 2.0672, + "step": 24572 + }, + { + "epoch": 2.3621070844948573, + "grad_norm": 1.3764548301696777, + "learning_rate": 0.00013056691732003084, + "loss": 1.9846, + "step": 24573 + }, + { + "epoch": 2.362203210612323, + "grad_norm": 1.1716563701629639, + "learning_rate": 0.00013055458446594836, + "loss": 1.9405, + "step": 24574 + }, + { + "epoch": 2.3622993367297895, + "grad_norm": 1.092797875404358, + "learning_rate": 0.00013054225175926167, + "loss": 2.0548, + "step": 24575 + }, + { + "epoch": 2.3623954628472554, + "grad_norm": 1.050798773765564, + "learning_rate": 0.00013052991920005754, + "loss": 1.9752, + "step": 24576 + }, + { + "epoch": 2.362491588964722, + "grad_norm": 1.2214125394821167, + "learning_rate": 0.0001305175867884228, + "loss": 1.9898, + "step": 24577 + }, + { + "epoch": 2.3625877150821877, + "grad_norm": 1.2643908262252808, + "learning_rate": 0.0001305052545244442, + "loss": 2.0989, + "step": 24578 + }, + { + "epoch": 2.362683841199654, + "grad_norm": 1.3705905675888062, + "learning_rate": 0.00013049292240820847, + "loss": 2.0834, + "step": 24579 + }, + { + "epoch": 2.36277996731712, + "grad_norm": 1.102576732635498, + "learning_rate": 0.00013048059043980253, + "loss": 2.0891, + "step": 24580 + }, + { + "epoch": 2.3628760934345863, + "grad_norm": 1.31568443775177, + "learning_rate": 0.00013046825861931307, + "loss": 1.9476, + "step": 24581 + }, + { + "epoch": 2.362972219552052, + "grad_norm": 1.147078514099121, + "learning_rate": 0.00013045592694682688, + "loss": 2.1078, + "step": 24582 + }, + { + "epoch": 2.3630683456695185, + "grad_norm": 1.3228002786636353, + "learning_rate": 0.00013044359542243076, + "loss": 2.1609, + "step": 24583 + }, + { + "epoch": 2.3631644717869844, + "grad_norm": 1.071976661682129, + "learning_rate": 0.0001304312640462115, + "loss": 1.9504, + "step": 24584 + }, + { + "epoch": 2.3632605979044508, + "grad_norm": 1.168092966079712, + "learning_rate": 0.0001304189328182558, + "loss": 2.1444, + "step": 24585 + }, + { + "epoch": 2.3633567240219167, + "grad_norm": 1.0620818138122559, + "learning_rate": 0.00013040660173865054, + "loss": 2.0305, + "step": 24586 + }, + { + "epoch": 2.363452850139383, + "grad_norm": 1.1531604528427124, + "learning_rate": 0.0001303942708074824, + "loss": 2.1007, + "step": 24587 + }, + { + "epoch": 2.363548976256849, + "grad_norm": 1.0502116680145264, + "learning_rate": 0.00013038194002483824, + "loss": 1.951, + "step": 24588 + }, + { + "epoch": 2.3636451023743152, + "grad_norm": 1.0475808382034302, + "learning_rate": 0.00013036960939080477, + "loss": 1.8346, + "step": 24589 + }, + { + "epoch": 2.363741228491781, + "grad_norm": 1.0915477275848389, + "learning_rate": 0.0001303572789054688, + "loss": 1.7786, + "step": 24590 + }, + { + "epoch": 2.3638373546092475, + "grad_norm": 1.2040849924087524, + "learning_rate": 0.00013034494856891706, + "loss": 2.0927, + "step": 24591 + }, + { + "epoch": 2.3639334807267134, + "grad_norm": 1.2090502977371216, + "learning_rate": 0.00013033261838123637, + "loss": 1.9859, + "step": 24592 + }, + { + "epoch": 2.3640296068441797, + "grad_norm": 1.2219282388687134, + "learning_rate": 0.00013032028834251348, + "loss": 1.8873, + "step": 24593 + }, + { + "epoch": 2.3641257329616456, + "grad_norm": 1.0803604125976562, + "learning_rate": 0.00013030795845283514, + "loss": 2.0474, + "step": 24594 + }, + { + "epoch": 2.364221859079112, + "grad_norm": 1.2796376943588257, + "learning_rate": 0.00013029562871228815, + "loss": 2.1611, + "step": 24595 + }, + { + "epoch": 2.364317985196578, + "grad_norm": 1.0873603820800781, + "learning_rate": 0.00013028329912095923, + "loss": 2.0495, + "step": 24596 + }, + { + "epoch": 2.364414111314044, + "grad_norm": 1.135523796081543, + "learning_rate": 0.00013027096967893517, + "loss": 2.0482, + "step": 24597 + }, + { + "epoch": 2.36451023743151, + "grad_norm": 1.1383943557739258, + "learning_rate": 0.00013025864038630278, + "loss": 1.9792, + "step": 24598 + }, + { + "epoch": 2.364606363548976, + "grad_norm": 1.012345552444458, + "learning_rate": 0.00013024631124314875, + "loss": 1.7456, + "step": 24599 + }, + { + "epoch": 2.3647024896664424, + "grad_norm": 1.2306358814239502, + "learning_rate": 0.00013023398224955987, + "loss": 1.8527, + "step": 24600 + }, + { + "epoch": 2.3647986157839087, + "grad_norm": 1.0649954080581665, + "learning_rate": 0.00013022165340562293, + "loss": 2.0327, + "step": 24601 + }, + { + "epoch": 2.3648947419013746, + "grad_norm": 1.167894721031189, + "learning_rate": 0.00013020932471142463, + "loss": 1.9524, + "step": 24602 + }, + { + "epoch": 2.3649908680188405, + "grad_norm": 1.1756269931793213, + "learning_rate": 0.0001301969961670518, + "loss": 2.0015, + "step": 24603 + }, + { + "epoch": 2.365086994136307, + "grad_norm": 1.082707166671753, + "learning_rate": 0.00013018466777259114, + "loss": 1.9641, + "step": 24604 + }, + { + "epoch": 2.365183120253773, + "grad_norm": 1.2334564924240112, + "learning_rate": 0.00013017233952812942, + "loss": 2.0015, + "step": 24605 + }, + { + "epoch": 2.365279246371239, + "grad_norm": 1.1930551528930664, + "learning_rate": 0.00013016001143375343, + "loss": 1.9224, + "step": 24606 + }, + { + "epoch": 2.365375372488705, + "grad_norm": 1.2264798879623413, + "learning_rate": 0.00013014768348954984, + "loss": 2.0472, + "step": 24607 + }, + { + "epoch": 2.3654714986061713, + "grad_norm": 1.228560447692871, + "learning_rate": 0.0001301353556956055, + "loss": 2.0299, + "step": 24608 + }, + { + "epoch": 2.3655676247236372, + "grad_norm": 1.1622031927108765, + "learning_rate": 0.00013012302805200714, + "loss": 2.2506, + "step": 24609 + }, + { + "epoch": 2.3656637508411036, + "grad_norm": 1.3270577192306519, + "learning_rate": 0.00013011070055884145, + "loss": 2.0406, + "step": 24610 + }, + { + "epoch": 2.3657598769585695, + "grad_norm": 1.0588293075561523, + "learning_rate": 0.00013009837321619524, + "loss": 1.9465, + "step": 24611 + }, + { + "epoch": 2.365856003076036, + "grad_norm": 1.6222987174987793, + "learning_rate": 0.00013008604602415526, + "loss": 2.0349, + "step": 24612 + }, + { + "epoch": 2.3659521291935017, + "grad_norm": 1.157995581626892, + "learning_rate": 0.00013007371898280819, + "loss": 2.1955, + "step": 24613 + }, + { + "epoch": 2.366048255310968, + "grad_norm": 1.1626724004745483, + "learning_rate": 0.00013006139209224086, + "loss": 2.1136, + "step": 24614 + }, + { + "epoch": 2.366144381428434, + "grad_norm": 1.2712900638580322, + "learning_rate": 0.00013004906535254, + "loss": 2.2181, + "step": 24615 + }, + { + "epoch": 2.3662405075459003, + "grad_norm": 1.3024667501449585, + "learning_rate": 0.00013003673876379231, + "loss": 1.9609, + "step": 24616 + }, + { + "epoch": 2.366336633663366, + "grad_norm": 1.0981473922729492, + "learning_rate": 0.00013002441232608459, + "loss": 1.9227, + "step": 24617 + }, + { + "epoch": 2.3664327597808326, + "grad_norm": 1.2460010051727295, + "learning_rate": 0.00013001208603950355, + "loss": 2.1187, + "step": 24618 + }, + { + "epoch": 2.3665288858982985, + "grad_norm": 1.149515151977539, + "learning_rate": 0.00012999975990413593, + "loss": 2.0811, + "step": 24619 + }, + { + "epoch": 2.366625012015765, + "grad_norm": 0.9752029776573181, + "learning_rate": 0.0001299874339200685, + "loss": 1.9372, + "step": 24620 + }, + { + "epoch": 2.3667211381332307, + "grad_norm": 0.9411177635192871, + "learning_rate": 0.00012997510808738792, + "loss": 1.9817, + "step": 24621 + }, + { + "epoch": 2.366817264250697, + "grad_norm": 0.9916439056396484, + "learning_rate": 0.00012996278240618104, + "loss": 1.9848, + "step": 24622 + }, + { + "epoch": 2.366913390368163, + "grad_norm": 0.9956062436103821, + "learning_rate": 0.00012995045687653453, + "loss": 1.9868, + "step": 24623 + }, + { + "epoch": 2.3670095164856293, + "grad_norm": 1.0245343446731567, + "learning_rate": 0.00012993813149853514, + "loss": 2.0606, + "step": 24624 + }, + { + "epoch": 2.367105642603095, + "grad_norm": 1.2149394750595093, + "learning_rate": 0.0001299258062722696, + "loss": 2.1535, + "step": 24625 + }, + { + "epoch": 2.3672017687205615, + "grad_norm": 1.2386361360549927, + "learning_rate": 0.00012991348119782467, + "loss": 1.8987, + "step": 24626 + }, + { + "epoch": 2.3672978948380274, + "grad_norm": 1.1363904476165771, + "learning_rate": 0.00012990115627528704, + "loss": 1.8953, + "step": 24627 + }, + { + "epoch": 2.367394020955494, + "grad_norm": 1.1654460430145264, + "learning_rate": 0.00012988883150474346, + "loss": 1.8858, + "step": 24628 + }, + { + "epoch": 2.3674901470729597, + "grad_norm": 1.299444556236267, + "learning_rate": 0.00012987650688628069, + "loss": 2.1463, + "step": 24629 + }, + { + "epoch": 2.367586273190426, + "grad_norm": 1.2399401664733887, + "learning_rate": 0.0001298641824199854, + "loss": 2.055, + "step": 24630 + }, + { + "epoch": 2.367682399307892, + "grad_norm": 1.13931143283844, + "learning_rate": 0.00012985185810594437, + "loss": 1.9437, + "step": 24631 + }, + { + "epoch": 2.367778525425358, + "grad_norm": 1.1154447793960571, + "learning_rate": 0.00012983953394424434, + "loss": 1.8799, + "step": 24632 + }, + { + "epoch": 2.367874651542824, + "grad_norm": 1.176737904548645, + "learning_rate": 0.000129827209934972, + "loss": 2.1491, + "step": 24633 + }, + { + "epoch": 2.3679707776602905, + "grad_norm": 1.2591911554336548, + "learning_rate": 0.00012981488607821407, + "loss": 2.0733, + "step": 24634 + }, + { + "epoch": 2.3680669037777564, + "grad_norm": 1.082566261291504, + "learning_rate": 0.0001298025623740573, + "loss": 2.1311, + "step": 24635 + }, + { + "epoch": 2.3681630298952223, + "grad_norm": 1.194841980934143, + "learning_rate": 0.00012979023882258843, + "loss": 1.9681, + "step": 24636 + }, + { + "epoch": 2.3682591560126887, + "grad_norm": 1.198293924331665, + "learning_rate": 0.00012977791542389413, + "loss": 2.0345, + "step": 24637 + }, + { + "epoch": 2.368355282130155, + "grad_norm": 1.2032387256622314, + "learning_rate": 0.00012976559217806115, + "loss": 2.025, + "step": 24638 + }, + { + "epoch": 2.368451408247621, + "grad_norm": 1.1657745838165283, + "learning_rate": 0.00012975326908517624, + "loss": 1.9708, + "step": 24639 + }, + { + "epoch": 2.368547534365087, + "grad_norm": 1.1944447755813599, + "learning_rate": 0.00012974094614532603, + "loss": 2.2096, + "step": 24640 + }, + { + "epoch": 2.368643660482553, + "grad_norm": 1.2296373844146729, + "learning_rate": 0.00012972862335859734, + "loss": 1.9982, + "step": 24641 + }, + { + "epoch": 2.368739786600019, + "grad_norm": 1.1995594501495361, + "learning_rate": 0.00012971630072507683, + "loss": 2.1072, + "step": 24642 + }, + { + "epoch": 2.3688359127174854, + "grad_norm": 1.2309514284133911, + "learning_rate": 0.00012970397824485123, + "loss": 2.0574, + "step": 24643 + }, + { + "epoch": 2.3689320388349513, + "grad_norm": 1.1828653812408447, + "learning_rate": 0.00012969165591800725, + "loss": 2.2025, + "step": 24644 + }, + { + "epoch": 2.3690281649524176, + "grad_norm": 1.0413082838058472, + "learning_rate": 0.00012967933374463162, + "loss": 1.9569, + "step": 24645 + }, + { + "epoch": 2.3691242910698835, + "grad_norm": 1.1213412284851074, + "learning_rate": 0.00012966701172481104, + "loss": 1.9176, + "step": 24646 + }, + { + "epoch": 2.36922041718735, + "grad_norm": 1.0708394050598145, + "learning_rate": 0.0001296546898586322, + "loss": 2.0752, + "step": 24647 + }, + { + "epoch": 2.369316543304816, + "grad_norm": 1.1110683679580688, + "learning_rate": 0.00012964236814618185, + "loss": 2.0103, + "step": 24648 + }, + { + "epoch": 2.369412669422282, + "grad_norm": 1.277337908744812, + "learning_rate": 0.00012963004658754666, + "loss": 1.8957, + "step": 24649 + }, + { + "epoch": 2.369508795539748, + "grad_norm": 1.159541368484497, + "learning_rate": 0.0001296177251828134, + "loss": 2.0362, + "step": 24650 + }, + { + "epoch": 2.3696049216572144, + "grad_norm": 1.1481071710586548, + "learning_rate": 0.00012960540393206876, + "loss": 2.0048, + "step": 24651 + }, + { + "epoch": 2.3697010477746803, + "grad_norm": 1.2590736150741577, + "learning_rate": 0.00012959308283539935, + "loss": 1.9753, + "step": 24652 + }, + { + "epoch": 2.3697971738921466, + "grad_norm": 1.2824956178665161, + "learning_rate": 0.00012958076189289203, + "loss": 2.003, + "step": 24653 + }, + { + "epoch": 2.3698933000096125, + "grad_norm": 1.4627162218093872, + "learning_rate": 0.0001295684411046334, + "loss": 1.9678, + "step": 24654 + }, + { + "epoch": 2.369989426127079, + "grad_norm": 1.1283973455429077, + "learning_rate": 0.00012955612047071017, + "loss": 2.058, + "step": 24655 + }, + { + "epoch": 2.3700855522445448, + "grad_norm": 1.1886018514633179, + "learning_rate": 0.00012954379999120908, + "loss": 1.9686, + "step": 24656 + }, + { + "epoch": 2.370181678362011, + "grad_norm": 1.0990055799484253, + "learning_rate": 0.00012953147966621677, + "loss": 1.9305, + "step": 24657 + }, + { + "epoch": 2.370277804479477, + "grad_norm": 1.1324899196624756, + "learning_rate": 0.00012951915949582, + "loss": 1.9109, + "step": 24658 + }, + { + "epoch": 2.3703739305969433, + "grad_norm": 1.225400686264038, + "learning_rate": 0.00012950683948010548, + "loss": 2.1428, + "step": 24659 + }, + { + "epoch": 2.3704700567144092, + "grad_norm": 1.1037781238555908, + "learning_rate": 0.00012949451961915986, + "loss": 2.0462, + "step": 24660 + }, + { + "epoch": 2.3705661828318756, + "grad_norm": 1.0800846815109253, + "learning_rate": 0.00012948219991306987, + "loss": 2.1559, + "step": 24661 + }, + { + "epoch": 2.3706623089493415, + "grad_norm": 1.256089687347412, + "learning_rate": 0.00012946988036192217, + "loss": 2.0583, + "step": 24662 + }, + { + "epoch": 2.370758435066808, + "grad_norm": 1.2404720783233643, + "learning_rate": 0.00012945756096580346, + "loss": 2.072, + "step": 24663 + }, + { + "epoch": 2.3708545611842737, + "grad_norm": 1.290674090385437, + "learning_rate": 0.00012944524172480045, + "loss": 1.8804, + "step": 24664 + }, + { + "epoch": 2.3709506873017396, + "grad_norm": 1.000353217124939, + "learning_rate": 0.00012943292263899983, + "loss": 2.0241, + "step": 24665 + }, + { + "epoch": 2.371046813419206, + "grad_norm": 1.314858317375183, + "learning_rate": 0.0001294206037084883, + "loss": 2.1471, + "step": 24666 + }, + { + "epoch": 2.3711429395366723, + "grad_norm": 1.4497781991958618, + "learning_rate": 0.00012940828493335257, + "loss": 2.1434, + "step": 24667 + }, + { + "epoch": 2.3712390656541382, + "grad_norm": 1.1795982122421265, + "learning_rate": 0.00012939596631367925, + "loss": 1.9674, + "step": 24668 + }, + { + "epoch": 2.371335191771604, + "grad_norm": 0.9325761198997498, + "learning_rate": 0.00012938364784955512, + "loss": 1.8695, + "step": 24669 + }, + { + "epoch": 2.3714313178890705, + "grad_norm": 1.1419278383255005, + "learning_rate": 0.00012937132954106678, + "loss": 1.9993, + "step": 24670 + }, + { + "epoch": 2.371527444006537, + "grad_norm": 1.061888575553894, + "learning_rate": 0.000129359011388301, + "loss": 1.9813, + "step": 24671 + }, + { + "epoch": 2.3716235701240027, + "grad_norm": 1.067950963973999, + "learning_rate": 0.00012934669339134444, + "loss": 1.985, + "step": 24672 + }, + { + "epoch": 2.3717196962414686, + "grad_norm": 1.1037373542785645, + "learning_rate": 0.0001293343755502837, + "loss": 2.0131, + "step": 24673 + }, + { + "epoch": 2.371815822358935, + "grad_norm": 1.1793910264968872, + "learning_rate": 0.00012932205786520557, + "loss": 2.0107, + "step": 24674 + }, + { + "epoch": 2.371911948476401, + "grad_norm": 1.1540207862854004, + "learning_rate": 0.0001293097403361967, + "loss": 2.0146, + "step": 24675 + }, + { + "epoch": 2.372008074593867, + "grad_norm": 1.271151065826416, + "learning_rate": 0.00012929742296334374, + "loss": 2.05, + "step": 24676 + }, + { + "epoch": 2.372104200711333, + "grad_norm": 1.3614965677261353, + "learning_rate": 0.0001292851057467334, + "loss": 2.1703, + "step": 24677 + }, + { + "epoch": 2.3722003268287994, + "grad_norm": 1.1704405546188354, + "learning_rate": 0.00012927278868645235, + "loss": 2.0526, + "step": 24678 + }, + { + "epoch": 2.3722964529462653, + "grad_norm": 1.1238595247268677, + "learning_rate": 0.0001292604717825873, + "loss": 2.1099, + "step": 24679 + }, + { + "epoch": 2.3723925790637317, + "grad_norm": 1.1506929397583008, + "learning_rate": 0.00012924815503522483, + "loss": 1.955, + "step": 24680 + }, + { + "epoch": 2.3724887051811976, + "grad_norm": 1.0713741779327393, + "learning_rate": 0.00012923583844445165, + "loss": 1.9289, + "step": 24681 + }, + { + "epoch": 2.372584831298664, + "grad_norm": 1.1293145418167114, + "learning_rate": 0.00012922352201035454, + "loss": 1.932, + "step": 24682 + }, + { + "epoch": 2.37268095741613, + "grad_norm": 1.2003947496414185, + "learning_rate": 0.00012921120573302006, + "loss": 2.0331, + "step": 24683 + }, + { + "epoch": 2.372777083533596, + "grad_norm": 1.0751668214797974, + "learning_rate": 0.00012919888961253492, + "loss": 2.0828, + "step": 24684 + }, + { + "epoch": 2.372873209651062, + "grad_norm": 1.0917332172393799, + "learning_rate": 0.00012918657364898575, + "loss": 1.9734, + "step": 24685 + }, + { + "epoch": 2.3729693357685284, + "grad_norm": 1.2825901508331299, + "learning_rate": 0.00012917425784245932, + "loss": 2.0522, + "step": 24686 + }, + { + "epoch": 2.3730654618859943, + "grad_norm": 0.9511754512786865, + "learning_rate": 0.00012916194219304218, + "loss": 1.805, + "step": 24687 + }, + { + "epoch": 2.3731615880034607, + "grad_norm": 1.294579267501831, + "learning_rate": 0.00012914962670082105, + "loss": 2.0448, + "step": 24688 + }, + { + "epoch": 2.3732577141209266, + "grad_norm": 1.1330500841140747, + "learning_rate": 0.0001291373113658826, + "loss": 2.0759, + "step": 24689 + }, + { + "epoch": 2.373353840238393, + "grad_norm": 1.3324726819992065, + "learning_rate": 0.0001291249961883135, + "loss": 2.0721, + "step": 24690 + }, + { + "epoch": 2.373449966355859, + "grad_norm": 1.0255991220474243, + "learning_rate": 0.0001291126811682004, + "loss": 1.9163, + "step": 24691 + }, + { + "epoch": 2.373546092473325, + "grad_norm": 1.0039756298065186, + "learning_rate": 0.00012910036630562995, + "loss": 1.8398, + "step": 24692 + }, + { + "epoch": 2.373642218590791, + "grad_norm": 1.3037244081497192, + "learning_rate": 0.00012908805160068883, + "loss": 2.0738, + "step": 24693 + }, + { + "epoch": 2.3737383447082574, + "grad_norm": 1.1637111902236938, + "learning_rate": 0.00012907573705346368, + "loss": 1.6109, + "step": 24694 + }, + { + "epoch": 2.3738344708257233, + "grad_norm": 1.1236798763275146, + "learning_rate": 0.00012906342266404123, + "loss": 2.0519, + "step": 24695 + }, + { + "epoch": 2.3739305969431896, + "grad_norm": 1.036285638809204, + "learning_rate": 0.00012905110843250804, + "loss": 1.8757, + "step": 24696 + }, + { + "epoch": 2.3740267230606555, + "grad_norm": 1.1132760047912598, + "learning_rate": 0.00012903879435895082, + "loss": 1.9079, + "step": 24697 + }, + { + "epoch": 2.374122849178122, + "grad_norm": 1.2430108785629272, + "learning_rate": 0.00012902648044345617, + "loss": 2.0279, + "step": 24698 + }, + { + "epoch": 2.374218975295588, + "grad_norm": 1.248456597328186, + "learning_rate": 0.00012901416668611082, + "loss": 2.0183, + "step": 24699 + }, + { + "epoch": 2.374315101413054, + "grad_norm": 1.22763991355896, + "learning_rate": 0.0001290018530870014, + "loss": 2.0669, + "step": 24700 + }, + { + "epoch": 2.37441122753052, + "grad_norm": 1.2104918956756592, + "learning_rate": 0.0001289895396462146, + "loss": 1.9211, + "step": 24701 + }, + { + "epoch": 2.374507353647986, + "grad_norm": 1.328758955001831, + "learning_rate": 0.000128977226363837, + "loss": 2.0629, + "step": 24702 + }, + { + "epoch": 2.3746034797654523, + "grad_norm": 1.1589821577072144, + "learning_rate": 0.00012896491323995525, + "loss": 2.1209, + "step": 24703 + }, + { + "epoch": 2.3746996058829186, + "grad_norm": 1.073771357536316, + "learning_rate": 0.00012895260027465607, + "loss": 2.0964, + "step": 24704 + }, + { + "epoch": 2.3747957320003845, + "grad_norm": 1.1508995294570923, + "learning_rate": 0.00012894028746802602, + "loss": 1.97, + "step": 24705 + }, + { + "epoch": 2.3748918581178504, + "grad_norm": 1.2058337926864624, + "learning_rate": 0.00012892797482015183, + "loss": 2.0614, + "step": 24706 + }, + { + "epoch": 2.3749879842353168, + "grad_norm": 1.1430408954620361, + "learning_rate": 0.00012891566233112008, + "loss": 1.8449, + "step": 24707 + }, + { + "epoch": 2.3750841103527827, + "grad_norm": 1.4065126180648804, + "learning_rate": 0.00012890335000101746, + "loss": 2.1865, + "step": 24708 + }, + { + "epoch": 2.375180236470249, + "grad_norm": 1.2989418506622314, + "learning_rate": 0.0001288910378299306, + "loss": 2.0987, + "step": 24709 + }, + { + "epoch": 2.375276362587715, + "grad_norm": 1.2616219520568848, + "learning_rate": 0.00012887872581794613, + "loss": 1.9174, + "step": 24710 + }, + { + "epoch": 2.3753724887051813, + "grad_norm": 1.4085851907730103, + "learning_rate": 0.0001288664139651507, + "loss": 1.9978, + "step": 24711 + }, + { + "epoch": 2.375468614822647, + "grad_norm": 1.2067322731018066, + "learning_rate": 0.00012885410227163097, + "loss": 1.9631, + "step": 24712 + }, + { + "epoch": 2.3755647409401135, + "grad_norm": 1.1990150213241577, + "learning_rate": 0.00012884179073747353, + "loss": 2.0977, + "step": 24713 + }, + { + "epoch": 2.3756608670575794, + "grad_norm": 1.1582800149917603, + "learning_rate": 0.00012882947936276507, + "loss": 2.0351, + "step": 24714 + }, + { + "epoch": 2.3757569931750457, + "grad_norm": 1.077355980873108, + "learning_rate": 0.00012881716814759216, + "loss": 1.8747, + "step": 24715 + }, + { + "epoch": 2.3758531192925116, + "grad_norm": 1.2119600772857666, + "learning_rate": 0.0001288048570920415, + "loss": 2.0772, + "step": 24716 + }, + { + "epoch": 2.375949245409978, + "grad_norm": 1.0327895879745483, + "learning_rate": 0.00012879254619619973, + "loss": 2.0046, + "step": 24717 + }, + { + "epoch": 2.376045371527444, + "grad_norm": 1.1885356903076172, + "learning_rate": 0.00012878023546015346, + "loss": 2.1416, + "step": 24718 + }, + { + "epoch": 2.3761414976449102, + "grad_norm": 1.1255944967269897, + "learning_rate": 0.0001287679248839893, + "loss": 1.9888, + "step": 24719 + }, + { + "epoch": 2.376237623762376, + "grad_norm": 1.3064446449279785, + "learning_rate": 0.0001287556144677939, + "loss": 1.9068, + "step": 24720 + }, + { + "epoch": 2.3763337498798425, + "grad_norm": 1.158797264099121, + "learning_rate": 0.00012874330421165393, + "loss": 1.8017, + "step": 24721 + }, + { + "epoch": 2.3764298759973084, + "grad_norm": 1.1059048175811768, + "learning_rate": 0.00012873099411565595, + "loss": 2.1365, + "step": 24722 + }, + { + "epoch": 2.3765260021147747, + "grad_norm": 1.139920949935913, + "learning_rate": 0.00012871868417988662, + "loss": 1.9811, + "step": 24723 + }, + { + "epoch": 2.3766221282322406, + "grad_norm": 1.1380373239517212, + "learning_rate": 0.00012870637440443256, + "loss": 1.8888, + "step": 24724 + }, + { + "epoch": 2.376718254349707, + "grad_norm": 1.2170652151107788, + "learning_rate": 0.00012869406478938043, + "loss": 1.9495, + "step": 24725 + }, + { + "epoch": 2.376814380467173, + "grad_norm": 1.162592887878418, + "learning_rate": 0.00012868175533481678, + "loss": 2.0365, + "step": 24726 + }, + { + "epoch": 2.376910506584639, + "grad_norm": 1.2424157857894897, + "learning_rate": 0.00012866944604082833, + "loss": 1.8686, + "step": 24727 + }, + { + "epoch": 2.377006632702105, + "grad_norm": 1.1786754131317139, + "learning_rate": 0.00012865713690750162, + "loss": 1.9754, + "step": 24728 + }, + { + "epoch": 2.3771027588195714, + "grad_norm": 1.1574277877807617, + "learning_rate": 0.00012864482793492333, + "loss": 1.9535, + "step": 24729 + }, + { + "epoch": 2.3771988849370373, + "grad_norm": 1.2770503759384155, + "learning_rate": 0.00012863251912318001, + "loss": 2.1576, + "step": 24730 + }, + { + "epoch": 2.3772950110545037, + "grad_norm": 1.1946924924850464, + "learning_rate": 0.00012862021047235834, + "loss": 2.0904, + "step": 24731 + }, + { + "epoch": 2.3773911371719696, + "grad_norm": 1.1904877424240112, + "learning_rate": 0.00012860790198254487, + "loss": 2.0894, + "step": 24732 + }, + { + "epoch": 2.377487263289436, + "grad_norm": 1.0791383981704712, + "learning_rate": 0.00012859559365382633, + "loss": 2.2052, + "step": 24733 + }, + { + "epoch": 2.377583389406902, + "grad_norm": 1.0890460014343262, + "learning_rate": 0.00012858328548628927, + "loss": 1.9943, + "step": 24734 + }, + { + "epoch": 2.3776795155243677, + "grad_norm": 1.2942205667495728, + "learning_rate": 0.0001285709774800203, + "loss": 2.1076, + "step": 24735 + }, + { + "epoch": 2.377775641641834, + "grad_norm": 1.065492033958435, + "learning_rate": 0.00012855866963510602, + "loss": 2.0638, + "step": 24736 + }, + { + "epoch": 2.3778717677593004, + "grad_norm": 1.1289620399475098, + "learning_rate": 0.00012854636195163305, + "loss": 1.9911, + "step": 24737 + }, + { + "epoch": 2.3779678938767663, + "grad_norm": 1.0464321374893188, + "learning_rate": 0.00012853405442968805, + "loss": 1.8302, + "step": 24738 + }, + { + "epoch": 2.3780640199942322, + "grad_norm": 1.3068656921386719, + "learning_rate": 0.00012852174706935756, + "loss": 2.0454, + "step": 24739 + }, + { + "epoch": 2.3781601461116986, + "grad_norm": 1.007217288017273, + "learning_rate": 0.00012850943987072823, + "loss": 1.9053, + "step": 24740 + }, + { + "epoch": 2.378256272229165, + "grad_norm": 1.1924159526824951, + "learning_rate": 0.00012849713283388667, + "loss": 1.8948, + "step": 24741 + }, + { + "epoch": 2.378352398346631, + "grad_norm": 1.1995338201522827, + "learning_rate": 0.00012848482595891945, + "loss": 2.0607, + "step": 24742 + }, + { + "epoch": 2.3784485244640967, + "grad_norm": 0.9776483178138733, + "learning_rate": 0.00012847251924591322, + "loss": 1.7785, + "step": 24743 + }, + { + "epoch": 2.378544650581563, + "grad_norm": 1.1133365631103516, + "learning_rate": 0.00012846021269495453, + "loss": 1.8778, + "step": 24744 + }, + { + "epoch": 2.378640776699029, + "grad_norm": 1.1724352836608887, + "learning_rate": 0.00012844790630613004, + "loss": 2.0579, + "step": 24745 + }, + { + "epoch": 2.3787369028164953, + "grad_norm": 1.122370958328247, + "learning_rate": 0.00012843560007952634, + "loss": 1.9478, + "step": 24746 + }, + { + "epoch": 2.378833028933961, + "grad_norm": 1.2101011276245117, + "learning_rate": 0.00012842329401522998, + "loss": 1.9878, + "step": 24747 + }, + { + "epoch": 2.3789291550514275, + "grad_norm": 1.082096815109253, + "learning_rate": 0.00012841098811332762, + "loss": 1.8024, + "step": 24748 + }, + { + "epoch": 2.3790252811688934, + "grad_norm": 1.126815676689148, + "learning_rate": 0.0001283986823739058, + "loss": 2.0076, + "step": 24749 + }, + { + "epoch": 2.37912140728636, + "grad_norm": 1.3266788721084595, + "learning_rate": 0.0001283863767970512, + "loss": 2.0247, + "step": 24750 + }, + { + "epoch": 2.3792175334038257, + "grad_norm": 1.188433289527893, + "learning_rate": 0.00012837407138285034, + "loss": 2.1226, + "step": 24751 + }, + { + "epoch": 2.379313659521292, + "grad_norm": 1.293757677078247, + "learning_rate": 0.00012836176613138987, + "loss": 2.0396, + "step": 24752 + }, + { + "epoch": 2.379409785638758, + "grad_norm": 1.10433030128479, + "learning_rate": 0.00012834946104275631, + "loss": 2.0282, + "step": 24753 + }, + { + "epoch": 2.3795059117562243, + "grad_norm": 1.137672781944275, + "learning_rate": 0.00012833715611703636, + "loss": 1.9719, + "step": 24754 + }, + { + "epoch": 2.37960203787369, + "grad_norm": 1.1790231466293335, + "learning_rate": 0.00012832485135431653, + "loss": 2.1175, + "step": 24755 + }, + { + "epoch": 2.3796981639911565, + "grad_norm": 1.1343976259231567, + "learning_rate": 0.0001283125467546834, + "loss": 1.9369, + "step": 24756 + }, + { + "epoch": 2.3797942901086224, + "grad_norm": 0.985439121723175, + "learning_rate": 0.00012830024231822362, + "loss": 2.0203, + "step": 24757 + }, + { + "epoch": 2.3798904162260888, + "grad_norm": 1.166806697845459, + "learning_rate": 0.00012828793804502378, + "loss": 2.0914, + "step": 24758 + }, + { + "epoch": 2.3799865423435547, + "grad_norm": 1.0465354919433594, + "learning_rate": 0.0001282756339351704, + "loss": 1.852, + "step": 24759 + }, + { + "epoch": 2.380082668461021, + "grad_norm": 1.2223345041275024, + "learning_rate": 0.0001282633299887501, + "loss": 1.9949, + "step": 24760 + }, + { + "epoch": 2.380178794578487, + "grad_norm": 1.1265610456466675, + "learning_rate": 0.00012825102620584946, + "loss": 2.1755, + "step": 24761 + }, + { + "epoch": 2.3802749206959533, + "grad_norm": 1.1514760255813599, + "learning_rate": 0.00012823872258655507, + "loss": 2.0272, + "step": 24762 + }, + { + "epoch": 2.380371046813419, + "grad_norm": 1.1581076383590698, + "learning_rate": 0.00012822641913095355, + "loss": 2.0553, + "step": 24763 + }, + { + "epoch": 2.3804671729308855, + "grad_norm": 1.0708518028259277, + "learning_rate": 0.0001282141158391314, + "loss": 1.9662, + "step": 24764 + }, + { + "epoch": 2.3805632990483514, + "grad_norm": 1.1392226219177246, + "learning_rate": 0.00012820181271117526, + "loss": 1.9621, + "step": 24765 + }, + { + "epoch": 2.3806594251658177, + "grad_norm": 1.0783098936080933, + "learning_rate": 0.00012818950974717165, + "loss": 1.996, + "step": 24766 + }, + { + "epoch": 2.3807555512832836, + "grad_norm": 1.0919524431228638, + "learning_rate": 0.00012817720694720724, + "loss": 2.0321, + "step": 24767 + }, + { + "epoch": 2.3808516774007495, + "grad_norm": 1.074155330657959, + "learning_rate": 0.00012816490431136855, + "loss": 2.076, + "step": 24768 + }, + { + "epoch": 2.380947803518216, + "grad_norm": 1.1281709671020508, + "learning_rate": 0.00012815260183974216, + "loss": 1.9922, + "step": 24769 + }, + { + "epoch": 2.3810439296356822, + "grad_norm": 1.1667201519012451, + "learning_rate": 0.00012814029953241464, + "loss": 2.0842, + "step": 24770 + }, + { + "epoch": 2.381140055753148, + "grad_norm": 1.1766437292099, + "learning_rate": 0.00012812799738947257, + "loss": 1.8992, + "step": 24771 + }, + { + "epoch": 2.381236181870614, + "grad_norm": 1.1826282739639282, + "learning_rate": 0.0001281156954110025, + "loss": 1.9992, + "step": 24772 + }, + { + "epoch": 2.3813323079880804, + "grad_norm": 1.0404601097106934, + "learning_rate": 0.00012810339359709105, + "loss": 1.789, + "step": 24773 + }, + { + "epoch": 2.3814284341055467, + "grad_norm": 1.1019465923309326, + "learning_rate": 0.00012809109194782475, + "loss": 1.8765, + "step": 24774 + }, + { + "epoch": 2.3815245602230126, + "grad_norm": 1.2081774473190308, + "learning_rate": 0.00012807879046329018, + "loss": 2.0069, + "step": 24775 + }, + { + "epoch": 2.3816206863404785, + "grad_norm": 1.0825939178466797, + "learning_rate": 0.00012806648914357393, + "loss": 2.0215, + "step": 24776 + }, + { + "epoch": 2.381716812457945, + "grad_norm": 1.2081230878829956, + "learning_rate": 0.0001280541879887625, + "loss": 2.1564, + "step": 24777 + }, + { + "epoch": 2.3818129385754108, + "grad_norm": 1.18272066116333, + "learning_rate": 0.00012804188699894252, + "loss": 2.1844, + "step": 24778 + }, + { + "epoch": 2.381909064692877, + "grad_norm": 1.5419048070907593, + "learning_rate": 0.00012802958617420053, + "loss": 2.2745, + "step": 24779 + }, + { + "epoch": 2.382005190810343, + "grad_norm": 1.3725031614303589, + "learning_rate": 0.00012801728551462308, + "loss": 2.1606, + "step": 24780 + }, + { + "epoch": 2.3821013169278094, + "grad_norm": 1.0193285942077637, + "learning_rate": 0.00012800498502029678, + "loss": 1.9628, + "step": 24781 + }, + { + "epoch": 2.3821974430452753, + "grad_norm": 0.9931008219718933, + "learning_rate": 0.00012799268469130813, + "loss": 1.9952, + "step": 24782 + }, + { + "epoch": 2.3822935691627416, + "grad_norm": 1.183166742324829, + "learning_rate": 0.0001279803845277437, + "loss": 1.9845, + "step": 24783 + }, + { + "epoch": 2.3823896952802075, + "grad_norm": 1.1107889413833618, + "learning_rate": 0.00012796808452969006, + "loss": 2.0041, + "step": 24784 + }, + { + "epoch": 2.382485821397674, + "grad_norm": 1.2002137899398804, + "learning_rate": 0.00012795578469723378, + "loss": 1.8863, + "step": 24785 + }, + { + "epoch": 2.3825819475151397, + "grad_norm": 1.0854825973510742, + "learning_rate": 0.00012794348503046142, + "loss": 1.8987, + "step": 24786 + }, + { + "epoch": 2.382678073632606, + "grad_norm": 1.0409928560256958, + "learning_rate": 0.00012793118552945952, + "loss": 2.0475, + "step": 24787 + }, + { + "epoch": 2.382774199750072, + "grad_norm": 1.230758786201477, + "learning_rate": 0.00012791888619431463, + "loss": 1.9947, + "step": 24788 + }, + { + "epoch": 2.3828703258675383, + "grad_norm": 1.0547386407852173, + "learning_rate": 0.00012790658702511328, + "loss": 1.9188, + "step": 24789 + }, + { + "epoch": 2.3829664519850042, + "grad_norm": 1.1607569456100464, + "learning_rate": 0.00012789428802194208, + "loss": 2.0508, + "step": 24790 + }, + { + "epoch": 2.3830625781024706, + "grad_norm": 1.0804377794265747, + "learning_rate": 0.00012788198918488753, + "loss": 1.9192, + "step": 24791 + }, + { + "epoch": 2.3831587042199365, + "grad_norm": 1.089848518371582, + "learning_rate": 0.00012786969051403621, + "loss": 2.0582, + "step": 24792 + }, + { + "epoch": 2.383254830337403, + "grad_norm": 1.1421774625778198, + "learning_rate": 0.00012785739200947462, + "loss": 2.0149, + "step": 24793 + }, + { + "epoch": 2.3833509564548687, + "grad_norm": 1.108860969543457, + "learning_rate": 0.00012784509367128936, + "loss": 1.9235, + "step": 24794 + }, + { + "epoch": 2.383447082572335, + "grad_norm": 1.1483741998672485, + "learning_rate": 0.000127832795499567, + "loss": 1.9485, + "step": 24795 + }, + { + "epoch": 2.383543208689801, + "grad_norm": 1.0658133029937744, + "learning_rate": 0.00012782049749439394, + "loss": 2.0532, + "step": 24796 + }, + { + "epoch": 2.3836393348072673, + "grad_norm": 1.165108323097229, + "learning_rate": 0.00012780819965585688, + "loss": 2.1253, + "step": 24797 + }, + { + "epoch": 2.383735460924733, + "grad_norm": 1.0595238208770752, + "learning_rate": 0.0001277959019840423, + "loss": 1.907, + "step": 24798 + }, + { + "epoch": 2.3838315870421996, + "grad_norm": 1.1045571565628052, + "learning_rate": 0.00012778360447903672, + "loss": 1.9013, + "step": 24799 + }, + { + "epoch": 2.3839277131596655, + "grad_norm": 1.1484148502349854, + "learning_rate": 0.00012777130714092673, + "loss": 2.0487, + "step": 24800 + }, + { + "epoch": 2.3840238392771314, + "grad_norm": 0.978052020072937, + "learning_rate": 0.0001277590099697988, + "loss": 1.9253, + "step": 24801 + }, + { + "epoch": 2.3841199653945977, + "grad_norm": 1.097120761871338, + "learning_rate": 0.00012774671296573954, + "loss": 2.0401, + "step": 24802 + }, + { + "epoch": 2.384216091512064, + "grad_norm": 1.0528628826141357, + "learning_rate": 0.00012773441612883542, + "loss": 1.9651, + "step": 24803 + }, + { + "epoch": 2.38431221762953, + "grad_norm": 1.1988800764083862, + "learning_rate": 0.00012772211945917303, + "loss": 1.8918, + "step": 24804 + }, + { + "epoch": 2.384408343746996, + "grad_norm": 1.011873483657837, + "learning_rate": 0.00012770982295683884, + "loss": 1.8679, + "step": 24805 + }, + { + "epoch": 2.384504469864462, + "grad_norm": 1.1919581890106201, + "learning_rate": 0.00012769752662191946, + "loss": 2.0325, + "step": 24806 + }, + { + "epoch": 2.3846005959819285, + "grad_norm": 1.275876522064209, + "learning_rate": 0.0001276852304545014, + "loss": 2.1363, + "step": 24807 + }, + { + "epoch": 2.3846967220993944, + "grad_norm": 1.0435189008712769, + "learning_rate": 0.00012767293445467115, + "loss": 1.9271, + "step": 24808 + }, + { + "epoch": 2.3847928482168603, + "grad_norm": 1.2448831796646118, + "learning_rate": 0.00012766063862251526, + "loss": 2.1179, + "step": 24809 + }, + { + "epoch": 2.3848889743343267, + "grad_norm": 1.0995943546295166, + "learning_rate": 0.00012764834295812022, + "loss": 1.8556, + "step": 24810 + }, + { + "epoch": 2.3849851004517926, + "grad_norm": 1.063349723815918, + "learning_rate": 0.0001276360474615726, + "loss": 2.1321, + "step": 24811 + }, + { + "epoch": 2.385081226569259, + "grad_norm": 1.2558931112289429, + "learning_rate": 0.00012762375213295896, + "loss": 2.0428, + "step": 24812 + }, + { + "epoch": 2.385177352686725, + "grad_norm": 1.0596452951431274, + "learning_rate": 0.0001276114569723658, + "loss": 1.9736, + "step": 24813 + }, + { + "epoch": 2.385273478804191, + "grad_norm": 1.1827812194824219, + "learning_rate": 0.00012759916197987962, + "loss": 1.9189, + "step": 24814 + }, + { + "epoch": 2.385369604921657, + "grad_norm": 1.224762201309204, + "learning_rate": 0.00012758686715558693, + "loss": 2.0724, + "step": 24815 + }, + { + "epoch": 2.3854657310391234, + "grad_norm": 1.0826977491378784, + "learning_rate": 0.00012757457249957426, + "loss": 2.1162, + "step": 24816 + }, + { + "epoch": 2.3855618571565893, + "grad_norm": 1.1396465301513672, + "learning_rate": 0.00012756227801192813, + "loss": 1.8497, + "step": 24817 + }, + { + "epoch": 2.3856579832740556, + "grad_norm": 1.0582075119018555, + "learning_rate": 0.00012754998369273512, + "loss": 2.0035, + "step": 24818 + }, + { + "epoch": 2.3857541093915215, + "grad_norm": 1.1345741748809814, + "learning_rate": 0.00012753768954208163, + "loss": 2.0844, + "step": 24819 + }, + { + "epoch": 2.385850235508988, + "grad_norm": 1.2408194541931152, + "learning_rate": 0.00012752539556005427, + "loss": 2.2191, + "step": 24820 + }, + { + "epoch": 2.385946361626454, + "grad_norm": 1.036601185798645, + "learning_rate": 0.00012751310174673953, + "loss": 1.9092, + "step": 24821 + }, + { + "epoch": 2.38604248774392, + "grad_norm": 1.081804871559143, + "learning_rate": 0.0001275008081022239, + "loss": 1.9206, + "step": 24822 + }, + { + "epoch": 2.386138613861386, + "grad_norm": 1.2413280010223389, + "learning_rate": 0.00012748851462659393, + "loss": 1.916, + "step": 24823 + }, + { + "epoch": 2.3862347399788524, + "grad_norm": 1.3694775104522705, + "learning_rate": 0.00012747622131993608, + "loss": 2.0345, + "step": 24824 + }, + { + "epoch": 2.3863308660963183, + "grad_norm": 1.2551934719085693, + "learning_rate": 0.0001274639281823369, + "loss": 1.9872, + "step": 24825 + }, + { + "epoch": 2.3864269922137846, + "grad_norm": 1.0794894695281982, + "learning_rate": 0.00012745163521388287, + "loss": 1.9517, + "step": 24826 + }, + { + "epoch": 2.3865231183312505, + "grad_norm": 1.0172150135040283, + "learning_rate": 0.0001274393424146605, + "loss": 1.9195, + "step": 24827 + }, + { + "epoch": 2.386619244448717, + "grad_norm": 1.1610386371612549, + "learning_rate": 0.00012742704978475637, + "loss": 2.1497, + "step": 24828 + }, + { + "epoch": 2.3867153705661828, + "grad_norm": 1.3212487697601318, + "learning_rate": 0.0001274147573242569, + "loss": 2.4139, + "step": 24829 + }, + { + "epoch": 2.386811496683649, + "grad_norm": 1.0580838918685913, + "learning_rate": 0.0001274024650332486, + "loss": 1.8363, + "step": 24830 + }, + { + "epoch": 2.386907622801115, + "grad_norm": 1.140047311782837, + "learning_rate": 0.00012739017291181804, + "loss": 2.0965, + "step": 24831 + }, + { + "epoch": 2.3870037489185814, + "grad_norm": 1.125350832939148, + "learning_rate": 0.00012737788096005162, + "loss": 2.1081, + "step": 24832 + }, + { + "epoch": 2.3870998750360473, + "grad_norm": 1.2119741439819336, + "learning_rate": 0.00012736558917803592, + "loss": 2.1019, + "step": 24833 + }, + { + "epoch": 2.3871960011535136, + "grad_norm": 1.1566827297210693, + "learning_rate": 0.0001273532975658574, + "loss": 2.0834, + "step": 24834 + }, + { + "epoch": 2.3872921272709795, + "grad_norm": 1.0248584747314453, + "learning_rate": 0.0001273410061236026, + "loss": 1.9879, + "step": 24835 + }, + { + "epoch": 2.387388253388446, + "grad_norm": 1.1567156314849854, + "learning_rate": 0.00012732871485135797, + "loss": 2.1041, + "step": 24836 + }, + { + "epoch": 2.3874843795059117, + "grad_norm": 1.1627014875411987, + "learning_rate": 0.00012731642374921002, + "loss": 2.049, + "step": 24837 + }, + { + "epoch": 2.3875805056233776, + "grad_norm": 1.2390353679656982, + "learning_rate": 0.00012730413281724525, + "loss": 2.2121, + "step": 24838 + }, + { + "epoch": 2.387676631740844, + "grad_norm": 1.2069677114486694, + "learning_rate": 0.00012729184205555012, + "loss": 1.8843, + "step": 24839 + }, + { + "epoch": 2.3877727578583103, + "grad_norm": 1.0786579847335815, + "learning_rate": 0.0001272795514642112, + "loss": 2.0268, + "step": 24840 + }, + { + "epoch": 2.3878688839757762, + "grad_norm": 1.2703315019607544, + "learning_rate": 0.00012726726104331492, + "loss": 1.8433, + "step": 24841 + }, + { + "epoch": 2.387965010093242, + "grad_norm": 1.0946940183639526, + "learning_rate": 0.00012725497079294775, + "loss": 1.9076, + "step": 24842 + }, + { + "epoch": 2.3880611362107085, + "grad_norm": 1.1147551536560059, + "learning_rate": 0.00012724268071319625, + "loss": 1.8282, + "step": 24843 + }, + { + "epoch": 2.3881572623281744, + "grad_norm": 1.0899481773376465, + "learning_rate": 0.00012723039080414682, + "loss": 1.9259, + "step": 24844 + }, + { + "epoch": 2.3882533884456407, + "grad_norm": 1.0240836143493652, + "learning_rate": 0.000127218101065886, + "loss": 2.0796, + "step": 24845 + }, + { + "epoch": 2.3883495145631066, + "grad_norm": 1.1122777462005615, + "learning_rate": 0.0001272058114985003, + "loss": 2.0686, + "step": 24846 + }, + { + "epoch": 2.388445640680573, + "grad_norm": 1.0773015022277832, + "learning_rate": 0.00012719352210207615, + "loss": 2.0626, + "step": 24847 + }, + { + "epoch": 2.388541766798039, + "grad_norm": 1.180173635482788, + "learning_rate": 0.00012718123287670005, + "loss": 2.0398, + "step": 24848 + }, + { + "epoch": 2.388637892915505, + "grad_norm": 1.1516886949539185, + "learning_rate": 0.0001271689438224585, + "loss": 2.1777, + "step": 24849 + }, + { + "epoch": 2.388734019032971, + "grad_norm": 1.2935072183609009, + "learning_rate": 0.00012715665493943796, + "loss": 2.0, + "step": 24850 + }, + { + "epoch": 2.3888301451504375, + "grad_norm": 1.2599620819091797, + "learning_rate": 0.0001271443662277249, + "loss": 2.1127, + "step": 24851 + }, + { + "epoch": 2.3889262712679034, + "grad_norm": 0.9805333018302917, + "learning_rate": 0.0001271320776874058, + "loss": 1.7408, + "step": 24852 + }, + { + "epoch": 2.3890223973853697, + "grad_norm": 1.012631893157959, + "learning_rate": 0.00012711978931856716, + "loss": 2.0948, + "step": 24853 + }, + { + "epoch": 2.3891185235028356, + "grad_norm": 1.1322660446166992, + "learning_rate": 0.00012710750112129545, + "loss": 1.8439, + "step": 24854 + }, + { + "epoch": 2.389214649620302, + "grad_norm": 1.1149498224258423, + "learning_rate": 0.00012709521309567715, + "loss": 2.2097, + "step": 24855 + }, + { + "epoch": 2.389310775737768, + "grad_norm": 1.1604385375976562, + "learning_rate": 0.00012708292524179866, + "loss": 2.0547, + "step": 24856 + }, + { + "epoch": 2.389406901855234, + "grad_norm": 1.0600022077560425, + "learning_rate": 0.00012707063755974655, + "loss": 1.9237, + "step": 24857 + }, + { + "epoch": 2.3895030279727, + "grad_norm": 1.172135591506958, + "learning_rate": 0.00012705835004960724, + "loss": 1.966, + "step": 24858 + }, + { + "epoch": 2.3895991540901664, + "grad_norm": 1.1948398351669312, + "learning_rate": 0.00012704606271146722, + "loss": 1.9467, + "step": 24859 + }, + { + "epoch": 2.3896952802076323, + "grad_norm": 1.152999758720398, + "learning_rate": 0.0001270337755454129, + "loss": 2.1656, + "step": 24860 + }, + { + "epoch": 2.3897914063250987, + "grad_norm": 1.198507308959961, + "learning_rate": 0.0001270214885515308, + "loss": 2.1156, + "step": 24861 + }, + { + "epoch": 2.3898875324425646, + "grad_norm": 1.0731958150863647, + "learning_rate": 0.00012700920172990739, + "loss": 1.986, + "step": 24862 + }, + { + "epoch": 2.389983658560031, + "grad_norm": 1.1586884260177612, + "learning_rate": 0.00012699691508062913, + "loss": 1.8652, + "step": 24863 + }, + { + "epoch": 2.390079784677497, + "grad_norm": 1.1179691553115845, + "learning_rate": 0.0001269846286037825, + "loss": 1.8852, + "step": 24864 + }, + { + "epoch": 2.390175910794963, + "grad_norm": 1.0945760011672974, + "learning_rate": 0.00012697234229945388, + "loss": 2.0901, + "step": 24865 + }, + { + "epoch": 2.390272036912429, + "grad_norm": 1.4261940717697144, + "learning_rate": 0.0001269600561677298, + "loss": 2.2519, + "step": 24866 + }, + { + "epoch": 2.3903681630298954, + "grad_norm": 1.1951619386672974, + "learning_rate": 0.0001269477702086967, + "loss": 2.0335, + "step": 24867 + }, + { + "epoch": 2.3904642891473613, + "grad_norm": 1.2624375820159912, + "learning_rate": 0.0001269354844224411, + "loss": 2.0371, + "step": 24868 + }, + { + "epoch": 2.3905604152648277, + "grad_norm": 1.1203680038452148, + "learning_rate": 0.00012692319880904933, + "loss": 2.0823, + "step": 24869 + }, + { + "epoch": 2.3906565413822936, + "grad_norm": 1.1451572179794312, + "learning_rate": 0.00012691091336860795, + "loss": 2.0367, + "step": 24870 + }, + { + "epoch": 2.3907526674997595, + "grad_norm": 1.2413125038146973, + "learning_rate": 0.00012689862810120336, + "loss": 2.1885, + "step": 24871 + }, + { + "epoch": 2.390848793617226, + "grad_norm": 1.166332721710205, + "learning_rate": 0.00012688634300692206, + "loss": 2.0704, + "step": 24872 + }, + { + "epoch": 2.390944919734692, + "grad_norm": 1.3608534336090088, + "learning_rate": 0.00012687405808585044, + "loss": 2.1311, + "step": 24873 + }, + { + "epoch": 2.391041045852158, + "grad_norm": 1.131022334098816, + "learning_rate": 0.000126861773338075, + "loss": 1.9115, + "step": 24874 + }, + { + "epoch": 2.391137171969624, + "grad_norm": 1.145831823348999, + "learning_rate": 0.0001268494887636822, + "loss": 1.9394, + "step": 24875 + }, + { + "epoch": 2.3912332980870903, + "grad_norm": 1.2856122255325317, + "learning_rate": 0.0001268372043627584, + "loss": 1.988, + "step": 24876 + }, + { + "epoch": 2.391329424204556, + "grad_norm": 0.9998330473899841, + "learning_rate": 0.00012682492013539018, + "loss": 2.0663, + "step": 24877 + }, + { + "epoch": 2.3914255503220225, + "grad_norm": 1.2113031148910522, + "learning_rate": 0.00012681263608166384, + "loss": 2.0281, + "step": 24878 + }, + { + "epoch": 2.3915216764394884, + "grad_norm": 1.0361545085906982, + "learning_rate": 0.00012680035220166594, + "loss": 2.0645, + "step": 24879 + }, + { + "epoch": 2.3916178025569548, + "grad_norm": 1.265238881111145, + "learning_rate": 0.0001267880684954829, + "loss": 1.9176, + "step": 24880 + }, + { + "epoch": 2.3917139286744207, + "grad_norm": 1.1303027868270874, + "learning_rate": 0.00012677578496320113, + "loss": 2.1352, + "step": 24881 + }, + { + "epoch": 2.391810054791887, + "grad_norm": 1.222529649734497, + "learning_rate": 0.00012676350160490708, + "loss": 2.0318, + "step": 24882 + }, + { + "epoch": 2.391906180909353, + "grad_norm": 1.0895638465881348, + "learning_rate": 0.00012675121842068722, + "loss": 1.7997, + "step": 24883 + }, + { + "epoch": 2.3920023070268193, + "grad_norm": 1.1749200820922852, + "learning_rate": 0.00012673893541062794, + "loss": 1.9476, + "step": 24884 + }, + { + "epoch": 2.392098433144285, + "grad_norm": 1.203593373298645, + "learning_rate": 0.00012672665257481574, + "loss": 2.1335, + "step": 24885 + }, + { + "epoch": 2.3921945592617515, + "grad_norm": 1.0566576719284058, + "learning_rate": 0.000126714369913337, + "loss": 1.9407, + "step": 24886 + }, + { + "epoch": 2.3922906853792174, + "grad_norm": 1.3538384437561035, + "learning_rate": 0.00012670208742627813, + "loss": 2.1665, + "step": 24887 + }, + { + "epoch": 2.3923868114966838, + "grad_norm": 1.2539036273956299, + "learning_rate": 0.00012668980511372564, + "loss": 2.1418, + "step": 24888 + }, + { + "epoch": 2.3924829376141497, + "grad_norm": 1.1135939359664917, + "learning_rate": 0.00012667752297576594, + "loss": 1.9837, + "step": 24889 + }, + { + "epoch": 2.392579063731616, + "grad_norm": 1.1786946058273315, + "learning_rate": 0.00012666524101248542, + "loss": 2.0368, + "step": 24890 + }, + { + "epoch": 2.392675189849082, + "grad_norm": 1.2295877933502197, + "learning_rate": 0.00012665295922397056, + "loss": 1.925, + "step": 24891 + }, + { + "epoch": 2.3927713159665482, + "grad_norm": 1.3354557752609253, + "learning_rate": 0.00012664067761030778, + "loss": 2.1003, + "step": 24892 + }, + { + "epoch": 2.392867442084014, + "grad_norm": 1.3047072887420654, + "learning_rate": 0.00012662839617158346, + "loss": 1.9843, + "step": 24893 + }, + { + "epoch": 2.3929635682014805, + "grad_norm": 1.26132333278656, + "learning_rate": 0.0001266161149078841, + "loss": 2.0881, + "step": 24894 + }, + { + "epoch": 2.3930596943189464, + "grad_norm": 1.13802969455719, + "learning_rate": 0.00012660383381929603, + "loss": 2.0579, + "step": 24895 + }, + { + "epoch": 2.3931558204364127, + "grad_norm": 1.231320858001709, + "learning_rate": 0.00012659155290590577, + "loss": 2.1398, + "step": 24896 + }, + { + "epoch": 2.3932519465538786, + "grad_norm": 1.1597588062286377, + "learning_rate": 0.0001265792721677997, + "loss": 2.0013, + "step": 24897 + }, + { + "epoch": 2.393348072671345, + "grad_norm": 1.0484589338302612, + "learning_rate": 0.00012656699160506425, + "loss": 1.8941, + "step": 24898 + }, + { + "epoch": 2.393444198788811, + "grad_norm": 1.1616045236587524, + "learning_rate": 0.00012655471121778586, + "loss": 1.9539, + "step": 24899 + }, + { + "epoch": 2.393540324906277, + "grad_norm": 1.1160757541656494, + "learning_rate": 0.00012654243100605086, + "loss": 2.0935, + "step": 24900 + }, + { + "epoch": 2.393636451023743, + "grad_norm": 1.2673537731170654, + "learning_rate": 0.0001265301509699458, + "loss": 2.1359, + "step": 24901 + }, + { + "epoch": 2.3937325771412095, + "grad_norm": 1.1753073930740356, + "learning_rate": 0.00012651787110955697, + "loss": 1.9589, + "step": 24902 + }, + { + "epoch": 2.3938287032586754, + "grad_norm": 1.1398273706436157, + "learning_rate": 0.00012650559142497088, + "loss": 2.1831, + "step": 24903 + }, + { + "epoch": 2.3939248293761413, + "grad_norm": 1.2180012464523315, + "learning_rate": 0.00012649331191627388, + "loss": 2.223, + "step": 24904 + }, + { + "epoch": 2.3940209554936076, + "grad_norm": 1.111626386642456, + "learning_rate": 0.0001264810325835524, + "loss": 2.0517, + "step": 24905 + }, + { + "epoch": 2.394117081611074, + "grad_norm": 1.2462003231048584, + "learning_rate": 0.00012646875342689292, + "loss": 2.1948, + "step": 24906 + }, + { + "epoch": 2.39421320772854, + "grad_norm": 1.2584556341171265, + "learning_rate": 0.00012645647444638173, + "loss": 2.0066, + "step": 24907 + }, + { + "epoch": 2.3943093338460057, + "grad_norm": 1.102830410003662, + "learning_rate": 0.00012644419564210532, + "loss": 2.2437, + "step": 24908 + }, + { + "epoch": 2.394405459963472, + "grad_norm": 1.4094254970550537, + "learning_rate": 0.00012643191701415006, + "loss": 2.2115, + "step": 24909 + }, + { + "epoch": 2.3945015860809384, + "grad_norm": 1.282110571861267, + "learning_rate": 0.00012641963856260238, + "loss": 1.9508, + "step": 24910 + }, + { + "epoch": 2.3945977121984043, + "grad_norm": 1.0764925479888916, + "learning_rate": 0.0001264073602875487, + "loss": 1.7401, + "step": 24911 + }, + { + "epoch": 2.3946938383158702, + "grad_norm": 1.0600048303604126, + "learning_rate": 0.00012639508218907535, + "loss": 1.75, + "step": 24912 + }, + { + "epoch": 2.3947899644333366, + "grad_norm": 1.0164813995361328, + "learning_rate": 0.00012638280426726883, + "loss": 2.0778, + "step": 24913 + }, + { + "epoch": 2.3948860905508025, + "grad_norm": 1.0051506757736206, + "learning_rate": 0.0001263705265222155, + "loss": 2.0606, + "step": 24914 + }, + { + "epoch": 2.394982216668269, + "grad_norm": 1.0665810108184814, + "learning_rate": 0.00012635824895400173, + "loss": 2.0648, + "step": 24915 + }, + { + "epoch": 2.3950783427857347, + "grad_norm": 1.18904447555542, + "learning_rate": 0.00012634597156271397, + "loss": 1.984, + "step": 24916 + }, + { + "epoch": 2.395174468903201, + "grad_norm": 1.2799594402313232, + "learning_rate": 0.00012633369434843857, + "loss": 2.0445, + "step": 24917 + }, + { + "epoch": 2.395270595020667, + "grad_norm": 1.1789547204971313, + "learning_rate": 0.00012632141731126198, + "loss": 1.9956, + "step": 24918 + }, + { + "epoch": 2.3953667211381333, + "grad_norm": 1.1006208658218384, + "learning_rate": 0.00012630914045127053, + "loss": 2.0586, + "step": 24919 + }, + { + "epoch": 2.395462847255599, + "grad_norm": 1.122373104095459, + "learning_rate": 0.00012629686376855066, + "loss": 1.9187, + "step": 24920 + }, + { + "epoch": 2.3955589733730656, + "grad_norm": 1.0876376628875732, + "learning_rate": 0.00012628458726318876, + "loss": 1.8472, + "step": 24921 + }, + { + "epoch": 2.3956550994905315, + "grad_norm": 1.3482540845870972, + "learning_rate": 0.0001262723109352712, + "loss": 2.0264, + "step": 24922 + }, + { + "epoch": 2.395751225607998, + "grad_norm": 1.0397017002105713, + "learning_rate": 0.00012626003478488442, + "loss": 1.9384, + "step": 24923 + }, + { + "epoch": 2.3958473517254637, + "grad_norm": 1.1486281156539917, + "learning_rate": 0.00012624775881211472, + "loss": 2.09, + "step": 24924 + }, + { + "epoch": 2.39594347784293, + "grad_norm": 1.0167359113693237, + "learning_rate": 0.00012623548301704857, + "loss": 1.9558, + "step": 24925 + }, + { + "epoch": 2.396039603960396, + "grad_norm": 0.9562268257141113, + "learning_rate": 0.0001262232073997723, + "loss": 1.8505, + "step": 24926 + }, + { + "epoch": 2.3961357300778623, + "grad_norm": 1.2663520574569702, + "learning_rate": 0.00012621093196037237, + "loss": 1.9044, + "step": 24927 + }, + { + "epoch": 2.396231856195328, + "grad_norm": 1.2963414192199707, + "learning_rate": 0.00012619865669893509, + "loss": 2.1517, + "step": 24928 + }, + { + "epoch": 2.3963279823127945, + "grad_norm": 1.1138375997543335, + "learning_rate": 0.0001261863816155468, + "loss": 1.9851, + "step": 24929 + }, + { + "epoch": 2.3964241084302604, + "grad_norm": 1.333739995956421, + "learning_rate": 0.00012617410671029403, + "loss": 1.9707, + "step": 24930 + }, + { + "epoch": 2.396520234547727, + "grad_norm": 1.0949634313583374, + "learning_rate": 0.00012616183198326309, + "loss": 1.9377, + "step": 24931 + }, + { + "epoch": 2.3966163606651927, + "grad_norm": 1.0958000421524048, + "learning_rate": 0.0001261495574345403, + "loss": 2.1347, + "step": 24932 + }, + { + "epoch": 2.396712486782659, + "grad_norm": 1.0979467630386353, + "learning_rate": 0.00012613728306421211, + "loss": 2.1432, + "step": 24933 + }, + { + "epoch": 2.396808612900125, + "grad_norm": 1.022650957107544, + "learning_rate": 0.00012612500887236485, + "loss": 1.9604, + "step": 24934 + }, + { + "epoch": 2.3969047390175913, + "grad_norm": 1.0974235534667969, + "learning_rate": 0.00012611273485908496, + "loss": 1.8745, + "step": 24935 + }, + { + "epoch": 2.397000865135057, + "grad_norm": 1.2109651565551758, + "learning_rate": 0.00012610046102445876, + "loss": 2.099, + "step": 24936 + }, + { + "epoch": 2.397096991252523, + "grad_norm": 1.2721713781356812, + "learning_rate": 0.00012608818736857262, + "loss": 2.1231, + "step": 24937 + }, + { + "epoch": 2.3971931173699894, + "grad_norm": 1.2208166122436523, + "learning_rate": 0.00012607591389151294, + "loss": 2.1318, + "step": 24938 + }, + { + "epoch": 2.3972892434874558, + "grad_norm": 1.1159319877624512, + "learning_rate": 0.00012606364059336607, + "loss": 1.8288, + "step": 24939 + }, + { + "epoch": 2.3973853696049217, + "grad_norm": 1.1840029954910278, + "learning_rate": 0.0001260513674742184, + "loss": 2.0591, + "step": 24940 + }, + { + "epoch": 2.3974814957223876, + "grad_norm": 1.2910833358764648, + "learning_rate": 0.00012603909453415624, + "loss": 2.0341, + "step": 24941 + }, + { + "epoch": 2.397577621839854, + "grad_norm": 1.1116317510604858, + "learning_rate": 0.00012602682177326603, + "loss": 1.9159, + "step": 24942 + }, + { + "epoch": 2.3976737479573202, + "grad_norm": 1.2651125192642212, + "learning_rate": 0.00012601454919163412, + "loss": 2.0031, + "step": 24943 + }, + { + "epoch": 2.397769874074786, + "grad_norm": 1.076002597808838, + "learning_rate": 0.00012600227678934682, + "loss": 1.9459, + "step": 24944 + }, + { + "epoch": 2.397866000192252, + "grad_norm": 1.0784833431243896, + "learning_rate": 0.00012599000456649056, + "loss": 2.0104, + "step": 24945 + }, + { + "epoch": 2.3979621263097184, + "grad_norm": 1.1964285373687744, + "learning_rate": 0.0001259777325231516, + "loss": 2.0606, + "step": 24946 + }, + { + "epoch": 2.3980582524271843, + "grad_norm": 1.2236542701721191, + "learning_rate": 0.00012596546065941645, + "loss": 2.2032, + "step": 24947 + }, + { + "epoch": 2.3981543785446506, + "grad_norm": 1.0459725856781006, + "learning_rate": 0.00012595318897537137, + "loss": 2.0177, + "step": 24948 + }, + { + "epoch": 2.3982505046621165, + "grad_norm": 1.1756843328475952, + "learning_rate": 0.00012594091747110276, + "loss": 1.8695, + "step": 24949 + }, + { + "epoch": 2.398346630779583, + "grad_norm": 1.1264300346374512, + "learning_rate": 0.00012592864614669692, + "loss": 1.8999, + "step": 24950 + }, + { + "epoch": 2.3984427568970488, + "grad_norm": 1.2592661380767822, + "learning_rate": 0.00012591637500224026, + "loss": 1.8523, + "step": 24951 + }, + { + "epoch": 2.398538883014515, + "grad_norm": 1.217705249786377, + "learning_rate": 0.0001259041040378191, + "loss": 2.0747, + "step": 24952 + }, + { + "epoch": 2.398635009131981, + "grad_norm": 1.1842252016067505, + "learning_rate": 0.00012589183325351983, + "loss": 2.0273, + "step": 24953 + }, + { + "epoch": 2.3987311352494474, + "grad_norm": 1.2825690507888794, + "learning_rate": 0.00012587956264942878, + "loss": 2.0758, + "step": 24954 + }, + { + "epoch": 2.3988272613669133, + "grad_norm": 1.2097588777542114, + "learning_rate": 0.00012586729222563228, + "loss": 1.9972, + "step": 24955 + }, + { + "epoch": 2.3989233874843796, + "grad_norm": 1.468512773513794, + "learning_rate": 0.00012585502198221669, + "loss": 2.2473, + "step": 24956 + }, + { + "epoch": 2.3990195136018455, + "grad_norm": 1.1375707387924194, + "learning_rate": 0.00012584275191926836, + "loss": 1.8463, + "step": 24957 + }, + { + "epoch": 2.399115639719312, + "grad_norm": 1.2971163988113403, + "learning_rate": 0.00012583048203687364, + "loss": 1.9541, + "step": 24958 + }, + { + "epoch": 2.3992117658367778, + "grad_norm": 1.1984783411026, + "learning_rate": 0.00012581821233511887, + "loss": 1.979, + "step": 24959 + }, + { + "epoch": 2.399307891954244, + "grad_norm": 1.0113080739974976, + "learning_rate": 0.00012580594281409044, + "loss": 1.7375, + "step": 24960 + }, + { + "epoch": 2.39940401807171, + "grad_norm": 1.046794056892395, + "learning_rate": 0.00012579367347387462, + "loss": 1.9403, + "step": 24961 + }, + { + "epoch": 2.3995001441891763, + "grad_norm": 1.1085416078567505, + "learning_rate": 0.00012578140431455777, + "loss": 2.0181, + "step": 24962 + }, + { + "epoch": 2.3995962703066422, + "grad_norm": 1.2245211601257324, + "learning_rate": 0.00012576913533622622, + "loss": 2.0826, + "step": 24963 + }, + { + "epoch": 2.3996923964241086, + "grad_norm": 1.1715410947799683, + "learning_rate": 0.00012575686653896638, + "loss": 1.9689, + "step": 24964 + }, + { + "epoch": 2.3997885225415745, + "grad_norm": 1.120590090751648, + "learning_rate": 0.00012574459792286452, + "loss": 2.0103, + "step": 24965 + }, + { + "epoch": 2.399884648659041, + "grad_norm": 0.987709641456604, + "learning_rate": 0.00012573232948800698, + "loss": 2.0009, + "step": 24966 + }, + { + "epoch": 2.3999807747765067, + "grad_norm": 1.1518079042434692, + "learning_rate": 0.00012572006123448012, + "loss": 1.8899, + "step": 24967 + }, + { + "epoch": 2.400076900893973, + "grad_norm": 1.0624713897705078, + "learning_rate": 0.00012570779316237025, + "loss": 1.8238, + "step": 24968 + }, + { + "epoch": 2.400173027011439, + "grad_norm": 1.1012974977493286, + "learning_rate": 0.00012569552527176374, + "loss": 1.8103, + "step": 24969 + }, + { + "epoch": 2.400269153128905, + "grad_norm": 1.2644370794296265, + "learning_rate": 0.00012568325756274687, + "loss": 2.1226, + "step": 24970 + }, + { + "epoch": 2.400365279246371, + "grad_norm": 1.202542781829834, + "learning_rate": 0.000125670990035406, + "loss": 1.9966, + "step": 24971 + }, + { + "epoch": 2.4004614053638376, + "grad_norm": 0.9267218708992004, + "learning_rate": 0.00012565872268982745, + "loss": 1.8525, + "step": 24972 + }, + { + "epoch": 2.4005575314813035, + "grad_norm": 1.182334542274475, + "learning_rate": 0.00012564645552609754, + "loss": 1.9974, + "step": 24973 + }, + { + "epoch": 2.4006536575987694, + "grad_norm": 1.073166847229004, + "learning_rate": 0.00012563418854430263, + "loss": 1.9276, + "step": 24974 + }, + { + "epoch": 2.4007497837162357, + "grad_norm": 1.1725258827209473, + "learning_rate": 0.00012562192174452898, + "loss": 1.9087, + "step": 24975 + }, + { + "epoch": 2.400845909833702, + "grad_norm": 1.347927451133728, + "learning_rate": 0.000125609655126863, + "loss": 2.1127, + "step": 24976 + }, + { + "epoch": 2.400942035951168, + "grad_norm": 1.2586060762405396, + "learning_rate": 0.0001255973886913909, + "loss": 1.9036, + "step": 24977 + }, + { + "epoch": 2.401038162068634, + "grad_norm": 1.1703444719314575, + "learning_rate": 0.00012558512243819912, + "loss": 2.1841, + "step": 24978 + }, + { + "epoch": 2.4011342881861, + "grad_norm": 1.2641401290893555, + "learning_rate": 0.0001255728563673739, + "loss": 2.0424, + "step": 24979 + }, + { + "epoch": 2.401230414303566, + "grad_norm": 1.171639084815979, + "learning_rate": 0.00012556059047900156, + "loss": 1.9043, + "step": 24980 + }, + { + "epoch": 2.4013265404210324, + "grad_norm": 1.331895351409912, + "learning_rate": 0.00012554832477316844, + "loss": 2.0231, + "step": 24981 + }, + { + "epoch": 2.4014226665384983, + "grad_norm": 1.069913387298584, + "learning_rate": 0.0001255360592499609, + "loss": 2.052, + "step": 24982 + }, + { + "epoch": 2.4015187926559647, + "grad_norm": 1.1806429624557495, + "learning_rate": 0.00012552379390946518, + "loss": 1.989, + "step": 24983 + }, + { + "epoch": 2.4016149187734306, + "grad_norm": 1.2991931438446045, + "learning_rate": 0.00012551152875176766, + "loss": 1.9026, + "step": 24984 + }, + { + "epoch": 2.401711044890897, + "grad_norm": 1.180945634841919, + "learning_rate": 0.00012549926377695457, + "loss": 2.1002, + "step": 24985 + }, + { + "epoch": 2.401807171008363, + "grad_norm": 1.1172213554382324, + "learning_rate": 0.00012548699898511225, + "loss": 2.1201, + "step": 24986 + }, + { + "epoch": 2.401903297125829, + "grad_norm": 1.0066357851028442, + "learning_rate": 0.00012547473437632704, + "loss": 1.9296, + "step": 24987 + }, + { + "epoch": 2.401999423243295, + "grad_norm": 1.1089632511138916, + "learning_rate": 0.00012546246995068523, + "loss": 2.0599, + "step": 24988 + }, + { + "epoch": 2.4020955493607614, + "grad_norm": 1.1328221559524536, + "learning_rate": 0.00012545020570827316, + "loss": 1.9964, + "step": 24989 + }, + { + "epoch": 2.4021916754782273, + "grad_norm": 1.2776950597763062, + "learning_rate": 0.00012543794164917706, + "loss": 1.993, + "step": 24990 + }, + { + "epoch": 2.4022878015956937, + "grad_norm": 1.1314221620559692, + "learning_rate": 0.0001254256777734833, + "loss": 2.0446, + "step": 24991 + }, + { + "epoch": 2.4023839277131596, + "grad_norm": 1.1293452978134155, + "learning_rate": 0.00012541341408127814, + "loss": 2.0062, + "step": 24992 + }, + { + "epoch": 2.402480053830626, + "grad_norm": 1.0377110242843628, + "learning_rate": 0.0001254011505726479, + "loss": 1.8794, + "step": 24993 + }, + { + "epoch": 2.402576179948092, + "grad_norm": 1.063550353050232, + "learning_rate": 0.00012538888724767888, + "loss": 1.9588, + "step": 24994 + }, + { + "epoch": 2.402672306065558, + "grad_norm": 1.1256163120269775, + "learning_rate": 0.0001253766241064574, + "loss": 1.9566, + "step": 24995 + }, + { + "epoch": 2.402768432183024, + "grad_norm": 1.105684757232666, + "learning_rate": 0.0001253643611490697, + "loss": 1.9615, + "step": 24996 + }, + { + "epoch": 2.4028645583004904, + "grad_norm": 1.0832644701004028, + "learning_rate": 0.00012535209837560211, + "loss": 1.9756, + "step": 24997 + }, + { + "epoch": 2.4029606844179563, + "grad_norm": 1.5543973445892334, + "learning_rate": 0.00012533983578614093, + "loss": 2.0004, + "step": 24998 + }, + { + "epoch": 2.4030568105354226, + "grad_norm": 1.2031340599060059, + "learning_rate": 0.00012532757338077247, + "loss": 2.1117, + "step": 24999 + }, + { + "epoch": 2.4031529366528885, + "grad_norm": 1.2325756549835205, + "learning_rate": 0.00012531531115958302, + "loss": 2.0539, + "step": 25000 + }, + { + "epoch": 2.403249062770355, + "grad_norm": 1.1134079694747925, + "learning_rate": 0.0001253030491226588, + "loss": 1.9486, + "step": 25001 + }, + { + "epoch": 2.403345188887821, + "grad_norm": 1.0566272735595703, + "learning_rate": 0.00012529078727008621, + "loss": 2.0937, + "step": 25002 + }, + { + "epoch": 2.403441315005287, + "grad_norm": 1.2362076044082642, + "learning_rate": 0.00012527852560195144, + "loss": 2.1562, + "step": 25003 + }, + { + "epoch": 2.403537441122753, + "grad_norm": 1.1605830192565918, + "learning_rate": 0.00012526626411834085, + "loss": 2.0787, + "step": 25004 + }, + { + "epoch": 2.4036335672402194, + "grad_norm": 1.1478182077407837, + "learning_rate": 0.00012525400281934067, + "loss": 2.059, + "step": 25005 + }, + { + "epoch": 2.4037296933576853, + "grad_norm": 1.1491985321044922, + "learning_rate": 0.0001252417417050372, + "loss": 1.9512, + "step": 25006 + }, + { + "epoch": 2.403825819475151, + "grad_norm": 1.1787972450256348, + "learning_rate": 0.00012522948077551673, + "loss": 1.9537, + "step": 25007 + }, + { + "epoch": 2.4039219455926175, + "grad_norm": 1.243862271308899, + "learning_rate": 0.00012521722003086557, + "loss": 2.1573, + "step": 25008 + }, + { + "epoch": 2.404018071710084, + "grad_norm": 1.0309871435165405, + "learning_rate": 0.00012520495947116996, + "loss": 1.9511, + "step": 25009 + }, + { + "epoch": 2.4041141978275498, + "grad_norm": 1.0909744501113892, + "learning_rate": 0.0001251926990965162, + "loss": 1.8437, + "step": 25010 + }, + { + "epoch": 2.4042103239450157, + "grad_norm": 1.1086382865905762, + "learning_rate": 0.00012518043890699055, + "loss": 2.0104, + "step": 25011 + }, + { + "epoch": 2.404306450062482, + "grad_norm": 1.0600090026855469, + "learning_rate": 0.0001251681789026793, + "loss": 1.9853, + "step": 25012 + }, + { + "epoch": 2.404402576179948, + "grad_norm": 1.1840612888336182, + "learning_rate": 0.00012515591908366874, + "loss": 2.0724, + "step": 25013 + }, + { + "epoch": 2.4044987022974142, + "grad_norm": 1.16022527217865, + "learning_rate": 0.00012514365945004506, + "loss": 1.783, + "step": 25014 + }, + { + "epoch": 2.40459482841488, + "grad_norm": 1.1171987056732178, + "learning_rate": 0.00012513140000189468, + "loss": 2.1428, + "step": 25015 + }, + { + "epoch": 2.4046909545323465, + "grad_norm": 1.3155550956726074, + "learning_rate": 0.00012511914073930376, + "loss": 2.1087, + "step": 25016 + }, + { + "epoch": 2.4047870806498124, + "grad_norm": 1.1726438999176025, + "learning_rate": 0.00012510688166235862, + "loss": 2.0598, + "step": 25017 + }, + { + "epoch": 2.4048832067672787, + "grad_norm": 0.9468849897384644, + "learning_rate": 0.0001250946227711455, + "loss": 1.8944, + "step": 25018 + }, + { + "epoch": 2.4049793328847446, + "grad_norm": 1.1598482131958008, + "learning_rate": 0.00012508236406575066, + "loss": 1.9564, + "step": 25019 + }, + { + "epoch": 2.405075459002211, + "grad_norm": 1.383028268814087, + "learning_rate": 0.00012507010554626042, + "loss": 2.0685, + "step": 25020 + }, + { + "epoch": 2.405171585119677, + "grad_norm": 1.2329670190811157, + "learning_rate": 0.00012505784721276097, + "loss": 1.9989, + "step": 25021 + }, + { + "epoch": 2.4052677112371432, + "grad_norm": 1.2116349935531616, + "learning_rate": 0.00012504558906533864, + "loss": 2.1059, + "step": 25022 + }, + { + "epoch": 2.405363837354609, + "grad_norm": 1.0006287097930908, + "learning_rate": 0.00012503333110407965, + "loss": 2.0756, + "step": 25023 + }, + { + "epoch": 2.4054599634720755, + "grad_norm": 1.1177163124084473, + "learning_rate": 0.00012502107332907028, + "loss": 1.9852, + "step": 25024 + }, + { + "epoch": 2.4055560895895414, + "grad_norm": 1.2050647735595703, + "learning_rate": 0.00012500881574039677, + "loss": 2.3379, + "step": 25025 + }, + { + "epoch": 2.4056522157070077, + "grad_norm": 1.0624982118606567, + "learning_rate": 0.00012499655833814543, + "loss": 1.8773, + "step": 25026 + }, + { + "epoch": 2.4057483418244736, + "grad_norm": 1.159452199935913, + "learning_rate": 0.00012498430112240245, + "loss": 2.085, + "step": 25027 + }, + { + "epoch": 2.40584446794194, + "grad_norm": 1.2354158163070679, + "learning_rate": 0.00012497204409325414, + "loss": 2.1436, + "step": 25028 + }, + { + "epoch": 2.405940594059406, + "grad_norm": 1.073901891708374, + "learning_rate": 0.00012495978725078673, + "loss": 1.9572, + "step": 25029 + }, + { + "epoch": 2.406036720176872, + "grad_norm": 1.2018252611160278, + "learning_rate": 0.00012494753059508646, + "loss": 1.9905, + "step": 25030 + }, + { + "epoch": 2.406132846294338, + "grad_norm": 1.0785636901855469, + "learning_rate": 0.00012493527412623957, + "loss": 2.0441, + "step": 25031 + }, + { + "epoch": 2.4062289724118044, + "grad_norm": 1.0035582780838013, + "learning_rate": 0.00012492301784433236, + "loss": 1.9191, + "step": 25032 + }, + { + "epoch": 2.4063250985292703, + "grad_norm": 1.0754916667938232, + "learning_rate": 0.00012491076174945106, + "loss": 1.9928, + "step": 25033 + }, + { + "epoch": 2.4064212246467367, + "grad_norm": 1.2007697820663452, + "learning_rate": 0.0001248985058416819, + "loss": 2.0685, + "step": 25034 + }, + { + "epoch": 2.4065173507642026, + "grad_norm": 1.144777774810791, + "learning_rate": 0.00012488625012111116, + "loss": 2.0564, + "step": 25035 + }, + { + "epoch": 2.406613476881669, + "grad_norm": 1.0941693782806396, + "learning_rate": 0.00012487399458782503, + "loss": 2.0007, + "step": 25036 + }, + { + "epoch": 2.406709602999135, + "grad_norm": 1.1587315797805786, + "learning_rate": 0.00012486173924190983, + "loss": 2.0031, + "step": 25037 + }, + { + "epoch": 2.406805729116601, + "grad_norm": 1.0948824882507324, + "learning_rate": 0.00012484948408345175, + "loss": 1.922, + "step": 25038 + }, + { + "epoch": 2.406901855234067, + "grad_norm": 1.195064663887024, + "learning_rate": 0.00012483722911253703, + "loss": 2.0273, + "step": 25039 + }, + { + "epoch": 2.406997981351533, + "grad_norm": 1.2932841777801514, + "learning_rate": 0.00012482497432925193, + "loss": 2.1186, + "step": 25040 + }, + { + "epoch": 2.4070941074689993, + "grad_norm": 1.1760963201522827, + "learning_rate": 0.0001248127197336827, + "loss": 1.9666, + "step": 25041 + }, + { + "epoch": 2.4071902335864657, + "grad_norm": 1.1392067670822144, + "learning_rate": 0.00012480046532591555, + "loss": 1.9489, + "step": 25042 + }, + { + "epoch": 2.4072863597039316, + "grad_norm": 1.0993001461029053, + "learning_rate": 0.00012478821110603673, + "loss": 2.0226, + "step": 25043 + }, + { + "epoch": 2.4073824858213975, + "grad_norm": 1.0699713230133057, + "learning_rate": 0.00012477595707413244, + "loss": 2.0267, + "step": 25044 + }, + { + "epoch": 2.407478611938864, + "grad_norm": 1.2674859762191772, + "learning_rate": 0.00012476370323028895, + "loss": 2.0598, + "step": 25045 + }, + { + "epoch": 2.4075747380563297, + "grad_norm": 1.0354875326156616, + "learning_rate": 0.0001247514495745925, + "loss": 1.9026, + "step": 25046 + }, + { + "epoch": 2.407670864173796, + "grad_norm": 1.1721795797348022, + "learning_rate": 0.0001247391961071293, + "loss": 1.993, + "step": 25047 + }, + { + "epoch": 2.407766990291262, + "grad_norm": 1.182586908340454, + "learning_rate": 0.00012472694282798555, + "loss": 1.8863, + "step": 25048 + }, + { + "epoch": 2.4078631164087283, + "grad_norm": 1.3511686325073242, + "learning_rate": 0.00012471468973724754, + "loss": 1.9915, + "step": 25049 + }, + { + "epoch": 2.407959242526194, + "grad_norm": 1.0755724906921387, + "learning_rate": 0.0001247024368350015, + "loss": 1.9425, + "step": 25050 + }, + { + "epoch": 2.4080553686436605, + "grad_norm": 1.0647938251495361, + "learning_rate": 0.0001246901841213336, + "loss": 2.0619, + "step": 25051 + }, + { + "epoch": 2.4081514947611264, + "grad_norm": 1.1155672073364258, + "learning_rate": 0.00012467793159633007, + "loss": 2.0515, + "step": 25052 + }, + { + "epoch": 2.408247620878593, + "grad_norm": 0.9953758120536804, + "learning_rate": 0.00012466567926007715, + "loss": 1.8137, + "step": 25053 + }, + { + "epoch": 2.4083437469960587, + "grad_norm": 1.0872015953063965, + "learning_rate": 0.0001246534271126611, + "loss": 2.056, + "step": 25054 + }, + { + "epoch": 2.408439873113525, + "grad_norm": 1.252465844154358, + "learning_rate": 0.00012464117515416807, + "loss": 2.1627, + "step": 25055 + }, + { + "epoch": 2.408535999230991, + "grad_norm": 1.2320002317428589, + "learning_rate": 0.00012462892338468433, + "loss": 2.0619, + "step": 25056 + }, + { + "epoch": 2.4086321253484573, + "grad_norm": 1.0069634914398193, + "learning_rate": 0.00012461667180429609, + "loss": 1.7833, + "step": 25057 + }, + { + "epoch": 2.408728251465923, + "grad_norm": 1.0510835647583008, + "learning_rate": 0.00012460442041308952, + "loss": 1.8612, + "step": 25058 + }, + { + "epoch": 2.4088243775833895, + "grad_norm": 1.053628921508789, + "learning_rate": 0.0001245921692111509, + "loss": 1.9359, + "step": 25059 + }, + { + "epoch": 2.4089205037008554, + "grad_norm": 1.114870309829712, + "learning_rate": 0.0001245799181985664, + "loss": 1.9158, + "step": 25060 + }, + { + "epoch": 2.4090166298183218, + "grad_norm": 1.1017531156539917, + "learning_rate": 0.00012456766737542225, + "loss": 2.0276, + "step": 25061 + }, + { + "epoch": 2.4091127559357877, + "grad_norm": 1.2451249361038208, + "learning_rate": 0.00012455541674180466, + "loss": 1.9089, + "step": 25062 + }, + { + "epoch": 2.409208882053254, + "grad_norm": 1.1682119369506836, + "learning_rate": 0.00012454316629779982, + "loss": 2.026, + "step": 25063 + }, + { + "epoch": 2.40930500817072, + "grad_norm": 1.2415066957473755, + "learning_rate": 0.00012453091604349396, + "loss": 1.9413, + "step": 25064 + }, + { + "epoch": 2.4094011342881863, + "grad_norm": 1.109330654144287, + "learning_rate": 0.00012451866597897328, + "loss": 1.8083, + "step": 25065 + }, + { + "epoch": 2.409497260405652, + "grad_norm": 1.1390949487686157, + "learning_rate": 0.00012450641610432396, + "loss": 1.9806, + "step": 25066 + }, + { + "epoch": 2.4095933865231185, + "grad_norm": 1.2396371364593506, + "learning_rate": 0.00012449416641963226, + "loss": 2.1151, + "step": 25067 + }, + { + "epoch": 2.4096895126405844, + "grad_norm": 1.2323392629623413, + "learning_rate": 0.00012448191692498436, + "loss": 2.0047, + "step": 25068 + }, + { + "epoch": 2.4097856387580507, + "grad_norm": 1.2671431303024292, + "learning_rate": 0.00012446966762046644, + "loss": 1.82, + "step": 25069 + }, + { + "epoch": 2.4098817648755166, + "grad_norm": 1.1091316938400269, + "learning_rate": 0.00012445741850616472, + "loss": 1.9415, + "step": 25070 + }, + { + "epoch": 2.409977890992983, + "grad_norm": 1.0198968648910522, + "learning_rate": 0.00012444516958216539, + "loss": 1.9947, + "step": 25071 + }, + { + "epoch": 2.410074017110449, + "grad_norm": 1.154121994972229, + "learning_rate": 0.00012443292084855464, + "loss": 1.9607, + "step": 25072 + }, + { + "epoch": 2.410170143227915, + "grad_norm": 1.220780372619629, + "learning_rate": 0.0001244206723054187, + "loss": 1.9929, + "step": 25073 + }, + { + "epoch": 2.410266269345381, + "grad_norm": 1.1240530014038086, + "learning_rate": 0.0001244084239528437, + "loss": 1.9995, + "step": 25074 + }, + { + "epoch": 2.4103623954628475, + "grad_norm": 1.0475720167160034, + "learning_rate": 0.0001243961757909159, + "loss": 1.8387, + "step": 25075 + }, + { + "epoch": 2.4104585215803134, + "grad_norm": 1.1853827238082886, + "learning_rate": 0.00012438392781972146, + "loss": 1.9817, + "step": 25076 + }, + { + "epoch": 2.4105546476977793, + "grad_norm": 0.9650838971138, + "learning_rate": 0.0001243716800393466, + "loss": 1.7877, + "step": 25077 + }, + { + "epoch": 2.4106507738152456, + "grad_norm": 1.0831670761108398, + "learning_rate": 0.00012435943244987747, + "loss": 2.0376, + "step": 25078 + }, + { + "epoch": 2.410746899932712, + "grad_norm": 1.1486382484436035, + "learning_rate": 0.00012434718505140028, + "loss": 2.1266, + "step": 25079 + }, + { + "epoch": 2.410843026050178, + "grad_norm": 1.05265212059021, + "learning_rate": 0.0001243349378440012, + "loss": 2.0083, + "step": 25080 + }, + { + "epoch": 2.4109391521676438, + "grad_norm": 1.1576550006866455, + "learning_rate": 0.00012432269082776642, + "loss": 1.8694, + "step": 25081 + }, + { + "epoch": 2.41103527828511, + "grad_norm": 1.2079954147338867, + "learning_rate": 0.00012431044400278214, + "loss": 2.0781, + "step": 25082 + }, + { + "epoch": 2.411131404402576, + "grad_norm": 1.067933440208435, + "learning_rate": 0.0001242981973691345, + "loss": 1.9198, + "step": 25083 + }, + { + "epoch": 2.4112275305200424, + "grad_norm": 1.253218650817871, + "learning_rate": 0.00012428595092690975, + "loss": 2.0799, + "step": 25084 + }, + { + "epoch": 2.4113236566375083, + "grad_norm": 1.2231671810150146, + "learning_rate": 0.00012427370467619402, + "loss": 2.0154, + "step": 25085 + }, + { + "epoch": 2.4114197827549746, + "grad_norm": 1.092759609222412, + "learning_rate": 0.0001242614586170735, + "loss": 1.9823, + "step": 25086 + }, + { + "epoch": 2.4115159088724405, + "grad_norm": 1.3355786800384521, + "learning_rate": 0.0001242492127496344, + "loss": 2.193, + "step": 25087 + }, + { + "epoch": 2.411612034989907, + "grad_norm": 1.2087552547454834, + "learning_rate": 0.00012423696707396283, + "loss": 2.0208, + "step": 25088 + }, + { + "epoch": 2.4117081611073727, + "grad_norm": 1.2878053188323975, + "learning_rate": 0.000124224721590145, + "loss": 1.9721, + "step": 25089 + }, + { + "epoch": 2.411804287224839, + "grad_norm": 1.2627321481704712, + "learning_rate": 0.0001242124762982671, + "loss": 2.0879, + "step": 25090 + }, + { + "epoch": 2.411900413342305, + "grad_norm": 1.2984371185302734, + "learning_rate": 0.0001242002311984153, + "loss": 2.1079, + "step": 25091 + }, + { + "epoch": 2.4119965394597713, + "grad_norm": 1.1470224857330322, + "learning_rate": 0.00012418798629067573, + "loss": 2.0679, + "step": 25092 + }, + { + "epoch": 2.4120926655772372, + "grad_norm": 1.2573269605636597, + "learning_rate": 0.0001241757415751346, + "loss": 2.1645, + "step": 25093 + }, + { + "epoch": 2.4121887916947036, + "grad_norm": 1.124651312828064, + "learning_rate": 0.00012416349705187803, + "loss": 1.9327, + "step": 25094 + }, + { + "epoch": 2.4122849178121695, + "grad_norm": 1.149377465248108, + "learning_rate": 0.00012415125272099227, + "loss": 2.0172, + "step": 25095 + }, + { + "epoch": 2.412381043929636, + "grad_norm": 1.283462405204773, + "learning_rate": 0.0001241390085825634, + "loss": 2.0303, + "step": 25096 + }, + { + "epoch": 2.4124771700471017, + "grad_norm": 1.2412105798721313, + "learning_rate": 0.00012412676463667763, + "loss": 1.8847, + "step": 25097 + }, + { + "epoch": 2.412573296164568, + "grad_norm": 1.0843044519424438, + "learning_rate": 0.00012411452088342107, + "loss": 2.004, + "step": 25098 + }, + { + "epoch": 2.412669422282034, + "grad_norm": 1.0135068893432617, + "learning_rate": 0.00012410227732287998, + "loss": 2.0744, + "step": 25099 + }, + { + "epoch": 2.4127655483995003, + "grad_norm": 1.0316004753112793, + "learning_rate": 0.0001240900339551404, + "loss": 2.0724, + "step": 25100 + }, + { + "epoch": 2.412861674516966, + "grad_norm": 1.2417898178100586, + "learning_rate": 0.00012407779078028855, + "loss": 2.1667, + "step": 25101 + }, + { + "epoch": 2.4129578006344325, + "grad_norm": 0.9465720057487488, + "learning_rate": 0.00012406554779841064, + "loss": 1.8523, + "step": 25102 + }, + { + "epoch": 2.4130539267518984, + "grad_norm": 1.2159382104873657, + "learning_rate": 0.00012405330500959272, + "loss": 2.1658, + "step": 25103 + }, + { + "epoch": 2.413150052869365, + "grad_norm": 1.094261646270752, + "learning_rate": 0.00012404106241392104, + "loss": 2.1431, + "step": 25104 + }, + { + "epoch": 2.4132461789868307, + "grad_norm": 1.0467445850372314, + "learning_rate": 0.00012402882001148172, + "loss": 1.8173, + "step": 25105 + }, + { + "epoch": 2.4133423051042966, + "grad_norm": 1.0155876874923706, + "learning_rate": 0.00012401657780236082, + "loss": 1.9921, + "step": 25106 + }, + { + "epoch": 2.413438431221763, + "grad_norm": 1.0996195077896118, + "learning_rate": 0.00012400433578664462, + "loss": 1.992, + "step": 25107 + }, + { + "epoch": 2.4135345573392293, + "grad_norm": 0.9433556795120239, + "learning_rate": 0.00012399209396441924, + "loss": 1.9999, + "step": 25108 + }, + { + "epoch": 2.413630683456695, + "grad_norm": 1.1583330631256104, + "learning_rate": 0.00012397985233577078, + "loss": 1.8501, + "step": 25109 + }, + { + "epoch": 2.413726809574161, + "grad_norm": 1.1385763883590698, + "learning_rate": 0.00012396761090078542, + "loss": 2.0307, + "step": 25110 + }, + { + "epoch": 2.4138229356916274, + "grad_norm": 1.0042835474014282, + "learning_rate": 0.0001239553696595493, + "loss": 1.8252, + "step": 25111 + }, + { + "epoch": 2.4139190618090938, + "grad_norm": 1.114016056060791, + "learning_rate": 0.00012394312861214855, + "loss": 2.1439, + "step": 25112 + }, + { + "epoch": 2.4140151879265597, + "grad_norm": 1.1876496076583862, + "learning_rate": 0.00012393088775866933, + "loss": 1.9864, + "step": 25113 + }, + { + "epoch": 2.4141113140440256, + "grad_norm": 1.1542317867279053, + "learning_rate": 0.00012391864709919775, + "loss": 1.9981, + "step": 25114 + }, + { + "epoch": 2.414207440161492, + "grad_norm": 1.2156139612197876, + "learning_rate": 0.00012390640663381998, + "loss": 2.0278, + "step": 25115 + }, + { + "epoch": 2.414303566278958, + "grad_norm": 1.1690130233764648, + "learning_rate": 0.00012389416636262215, + "loss": 2.1394, + "step": 25116 + }, + { + "epoch": 2.414399692396424, + "grad_norm": 1.1962652206420898, + "learning_rate": 0.00012388192628569038, + "loss": 2.1143, + "step": 25117 + }, + { + "epoch": 2.41449581851389, + "grad_norm": 1.1250430345535278, + "learning_rate": 0.00012386968640311084, + "loss": 2.0458, + "step": 25118 + }, + { + "epoch": 2.4145919446313564, + "grad_norm": 1.090241551399231, + "learning_rate": 0.00012385744671496965, + "loss": 1.9677, + "step": 25119 + }, + { + "epoch": 2.4146880707488223, + "grad_norm": 1.2468634843826294, + "learning_rate": 0.00012384520722135293, + "loss": 2.1186, + "step": 25120 + }, + { + "epoch": 2.4147841968662886, + "grad_norm": 1.1400476694107056, + "learning_rate": 0.00012383296792234684, + "loss": 2.0501, + "step": 25121 + }, + { + "epoch": 2.4148803229837545, + "grad_norm": 1.2309352159500122, + "learning_rate": 0.00012382072881803746, + "loss": 2.1005, + "step": 25122 + }, + { + "epoch": 2.414976449101221, + "grad_norm": 1.254342794418335, + "learning_rate": 0.00012380848990851097, + "loss": 2.1071, + "step": 25123 + }, + { + "epoch": 2.415072575218687, + "grad_norm": 0.9897087216377258, + "learning_rate": 0.00012379625119385344, + "loss": 1.9072, + "step": 25124 + }, + { + "epoch": 2.415168701336153, + "grad_norm": 1.0419012308120728, + "learning_rate": 0.00012378401267415105, + "loss": 1.8861, + "step": 25125 + }, + { + "epoch": 2.415264827453619, + "grad_norm": 1.1552972793579102, + "learning_rate": 0.00012377177434948992, + "loss": 1.9873, + "step": 25126 + }, + { + "epoch": 2.4153609535710854, + "grad_norm": 1.033725619316101, + "learning_rate": 0.0001237595362199561, + "loss": 1.8403, + "step": 25127 + }, + { + "epoch": 2.4154570796885513, + "grad_norm": 1.1743298768997192, + "learning_rate": 0.0001237472982856358, + "loss": 1.9251, + "step": 25128 + }, + { + "epoch": 2.4155532058060176, + "grad_norm": 1.1302490234375, + "learning_rate": 0.00012373506054661515, + "loss": 2.0717, + "step": 25129 + }, + { + "epoch": 2.4156493319234835, + "grad_norm": 1.110556721687317, + "learning_rate": 0.00012372282300298016, + "loss": 1.7454, + "step": 25130 + }, + { + "epoch": 2.41574545804095, + "grad_norm": 1.1202138662338257, + "learning_rate": 0.00012371058565481703, + "loss": 1.9323, + "step": 25131 + }, + { + "epoch": 2.4158415841584158, + "grad_norm": 1.1740831136703491, + "learning_rate": 0.0001236983485022119, + "loss": 1.9603, + "step": 25132 + }, + { + "epoch": 2.415937710275882, + "grad_norm": 1.0722708702087402, + "learning_rate": 0.0001236861115452508, + "loss": 1.9545, + "step": 25133 + }, + { + "epoch": 2.416033836393348, + "grad_norm": 1.1741081476211548, + "learning_rate": 0.00012367387478401986, + "loss": 1.9893, + "step": 25134 + }, + { + "epoch": 2.4161299625108144, + "grad_norm": 1.1410123109817505, + "learning_rate": 0.00012366163821860525, + "loss": 1.9571, + "step": 25135 + }, + { + "epoch": 2.4162260886282803, + "grad_norm": 1.1025217771530151, + "learning_rate": 0.00012364940184909304, + "loss": 1.8447, + "step": 25136 + }, + { + "epoch": 2.4163222147457466, + "grad_norm": 1.1274528503417969, + "learning_rate": 0.00012363716567556938, + "loss": 2.0219, + "step": 25137 + }, + { + "epoch": 2.4164183408632125, + "grad_norm": 1.2444932460784912, + "learning_rate": 0.00012362492969812032, + "loss": 1.9375, + "step": 25138 + }, + { + "epoch": 2.416514466980679, + "grad_norm": 1.15544593334198, + "learning_rate": 0.00012361269391683198, + "loss": 1.9668, + "step": 25139 + }, + { + "epoch": 2.4166105930981447, + "grad_norm": 1.134211540222168, + "learning_rate": 0.0001236004583317905, + "loss": 2.073, + "step": 25140 + }, + { + "epoch": 2.416706719215611, + "grad_norm": 1.0512489080429077, + "learning_rate": 0.00012358822294308204, + "loss": 2.011, + "step": 25141 + }, + { + "epoch": 2.416802845333077, + "grad_norm": 1.171994924545288, + "learning_rate": 0.00012357598775079252, + "loss": 1.9195, + "step": 25142 + }, + { + "epoch": 2.416898971450543, + "grad_norm": 1.1140727996826172, + "learning_rate": 0.00012356375275500813, + "loss": 1.7783, + "step": 25143 + }, + { + "epoch": 2.4169950975680092, + "grad_norm": 1.0825574398040771, + "learning_rate": 0.000123551517955815, + "loss": 1.9918, + "step": 25144 + }, + { + "epoch": 2.4170912236854756, + "grad_norm": 1.0839805603027344, + "learning_rate": 0.00012353928335329922, + "loss": 2.0188, + "step": 25145 + }, + { + "epoch": 2.4171873498029415, + "grad_norm": 1.083211064338684, + "learning_rate": 0.00012352704894754687, + "loss": 2.0385, + "step": 25146 + }, + { + "epoch": 2.4172834759204074, + "grad_norm": 1.1705337762832642, + "learning_rate": 0.00012351481473864406, + "loss": 2.0346, + "step": 25147 + }, + { + "epoch": 2.4173796020378737, + "grad_norm": 1.0751984119415283, + "learning_rate": 0.0001235025807266769, + "loss": 1.9287, + "step": 25148 + }, + { + "epoch": 2.4174757281553396, + "grad_norm": 1.1501296758651733, + "learning_rate": 0.0001234903469117314, + "loss": 2.1443, + "step": 25149 + }, + { + "epoch": 2.417571854272806, + "grad_norm": 1.1024143695831299, + "learning_rate": 0.00012347811329389377, + "loss": 2.2084, + "step": 25150 + }, + { + "epoch": 2.417667980390272, + "grad_norm": 1.025410532951355, + "learning_rate": 0.00012346587987325, + "loss": 1.9887, + "step": 25151 + }, + { + "epoch": 2.417764106507738, + "grad_norm": 1.0143847465515137, + "learning_rate": 0.00012345364664988624, + "loss": 1.8383, + "step": 25152 + }, + { + "epoch": 2.417860232625204, + "grad_norm": 1.1332216262817383, + "learning_rate": 0.00012344141362388852, + "loss": 1.7532, + "step": 25153 + }, + { + "epoch": 2.4179563587426705, + "grad_norm": 1.1952197551727295, + "learning_rate": 0.000123429180795343, + "loss": 2.1174, + "step": 25154 + }, + { + "epoch": 2.4180524848601364, + "grad_norm": 1.0252701044082642, + "learning_rate": 0.0001234169481643357, + "loss": 2.0591, + "step": 25155 + }, + { + "epoch": 2.4181486109776027, + "grad_norm": 1.1457467079162598, + "learning_rate": 0.00012340471573095275, + "loss": 2.0116, + "step": 25156 + }, + { + "epoch": 2.4182447370950686, + "grad_norm": 1.1362006664276123, + "learning_rate": 0.00012339248349528016, + "loss": 2.0914, + "step": 25157 + }, + { + "epoch": 2.418340863212535, + "grad_norm": 1.2281310558319092, + "learning_rate": 0.0001233802514574041, + "loss": 2.0638, + "step": 25158 + }, + { + "epoch": 2.418436989330001, + "grad_norm": 1.1455421447753906, + "learning_rate": 0.00012336801961741062, + "loss": 2.0502, + "step": 25159 + }, + { + "epoch": 2.418533115447467, + "grad_norm": 0.9762647151947021, + "learning_rate": 0.0001233557879753857, + "loss": 1.8848, + "step": 25160 + }, + { + "epoch": 2.418629241564933, + "grad_norm": 1.177855134010315, + "learning_rate": 0.00012334355653141556, + "loss": 1.8173, + "step": 25161 + }, + { + "epoch": 2.4187253676823994, + "grad_norm": 1.0846729278564453, + "learning_rate": 0.0001233313252855862, + "loss": 1.9436, + "step": 25162 + }, + { + "epoch": 2.4188214937998653, + "grad_norm": 0.9779982566833496, + "learning_rate": 0.00012331909423798373, + "loss": 1.784, + "step": 25163 + }, + { + "epoch": 2.4189176199173317, + "grad_norm": 1.2036738395690918, + "learning_rate": 0.00012330686338869417, + "loss": 1.9937, + "step": 25164 + }, + { + "epoch": 2.4190137460347976, + "grad_norm": 1.143349051475525, + "learning_rate": 0.00012329463273780364, + "loss": 1.9237, + "step": 25165 + }, + { + "epoch": 2.419109872152264, + "grad_norm": 1.3442308902740479, + "learning_rate": 0.00012328240228539817, + "loss": 2.0869, + "step": 25166 + }, + { + "epoch": 2.41920599826973, + "grad_norm": 1.0806483030319214, + "learning_rate": 0.00012327017203156386, + "loss": 1.9775, + "step": 25167 + }, + { + "epoch": 2.419302124387196, + "grad_norm": 1.191280484199524, + "learning_rate": 0.00012325794197638677, + "loss": 1.9937, + "step": 25168 + }, + { + "epoch": 2.419398250504662, + "grad_norm": 0.965243935585022, + "learning_rate": 0.00012324571211995292, + "loss": 1.9852, + "step": 25169 + }, + { + "epoch": 2.4194943766221284, + "grad_norm": 1.0329210758209229, + "learning_rate": 0.00012323348246234844, + "loss": 1.764, + "step": 25170 + }, + { + "epoch": 2.4195905027395943, + "grad_norm": 1.072849988937378, + "learning_rate": 0.00012322125300365933, + "loss": 2.1605, + "step": 25171 + }, + { + "epoch": 2.4196866288570607, + "grad_norm": 1.1226083040237427, + "learning_rate": 0.0001232090237439717, + "loss": 2.0653, + "step": 25172 + }, + { + "epoch": 2.4197827549745266, + "grad_norm": 1.3661140203475952, + "learning_rate": 0.0001231967946833716, + "loss": 2.1269, + "step": 25173 + }, + { + "epoch": 2.419878881091993, + "grad_norm": 1.1373634338378906, + "learning_rate": 0.00012318456582194508, + "loss": 1.9588, + "step": 25174 + }, + { + "epoch": 2.419975007209459, + "grad_norm": 1.2556483745574951, + "learning_rate": 0.00012317233715977818, + "loss": 2.0572, + "step": 25175 + }, + { + "epoch": 2.4200711333269247, + "grad_norm": 1.1957378387451172, + "learning_rate": 0.00012316010869695693, + "loss": 2.0551, + "step": 25176 + }, + { + "epoch": 2.420167259444391, + "grad_norm": 1.0768811702728271, + "learning_rate": 0.00012314788043356743, + "loss": 2.0286, + "step": 25177 + }, + { + "epoch": 2.4202633855618574, + "grad_norm": 1.1177561283111572, + "learning_rate": 0.00012313565236969577, + "loss": 1.9879, + "step": 25178 + }, + { + "epoch": 2.4203595116793233, + "grad_norm": 1.0935593843460083, + "learning_rate": 0.00012312342450542793, + "loss": 2.1027, + "step": 25179 + }, + { + "epoch": 2.420455637796789, + "grad_norm": 1.1036365032196045, + "learning_rate": 0.00012311119684084998, + "loss": 1.92, + "step": 25180 + }, + { + "epoch": 2.4205517639142555, + "grad_norm": 1.302527904510498, + "learning_rate": 0.00012309896937604797, + "loss": 2.0885, + "step": 25181 + }, + { + "epoch": 2.4206478900317214, + "grad_norm": 1.0069963932037354, + "learning_rate": 0.00012308674211110795, + "loss": 1.7949, + "step": 25182 + }, + { + "epoch": 2.4207440161491878, + "grad_norm": 1.082224726676941, + "learning_rate": 0.00012307451504611598, + "loss": 2.1235, + "step": 25183 + }, + { + "epoch": 2.4208401422666537, + "grad_norm": 1.239912748336792, + "learning_rate": 0.00012306228818115806, + "loss": 1.9009, + "step": 25184 + }, + { + "epoch": 2.42093626838412, + "grad_norm": 0.8823050856590271, + "learning_rate": 0.00012305006151632029, + "loss": 1.7948, + "step": 25185 + }, + { + "epoch": 2.421032394501586, + "grad_norm": 1.2062112092971802, + "learning_rate": 0.00012303783505168865, + "loss": 2.0387, + "step": 25186 + }, + { + "epoch": 2.4211285206190523, + "grad_norm": 1.232405185699463, + "learning_rate": 0.00012302560878734922, + "loss": 2.0426, + "step": 25187 + }, + { + "epoch": 2.421224646736518, + "grad_norm": 1.1600885391235352, + "learning_rate": 0.00012301338272338803, + "loss": 2.0118, + "step": 25188 + }, + { + "epoch": 2.4213207728539845, + "grad_norm": 1.1316756010055542, + "learning_rate": 0.0001230011568598911, + "loss": 1.9991, + "step": 25189 + }, + { + "epoch": 2.4214168989714504, + "grad_norm": 1.0900459289550781, + "learning_rate": 0.00012298893119694446, + "loss": 1.9145, + "step": 25190 + }, + { + "epoch": 2.4215130250889167, + "grad_norm": 1.2414075136184692, + "learning_rate": 0.00012297670573463418, + "loss": 2.0078, + "step": 25191 + }, + { + "epoch": 2.4216091512063826, + "grad_norm": 1.4317269325256348, + "learning_rate": 0.0001229644804730463, + "loss": 2.0373, + "step": 25192 + }, + { + "epoch": 2.421705277323849, + "grad_norm": 1.2074906826019287, + "learning_rate": 0.0001229522554122668, + "loss": 2.0148, + "step": 25193 + }, + { + "epoch": 2.421801403441315, + "grad_norm": 1.1845839023590088, + "learning_rate": 0.00012294003055238168, + "loss": 1.9765, + "step": 25194 + }, + { + "epoch": 2.4218975295587812, + "grad_norm": 1.1263821125030518, + "learning_rate": 0.0001229278058934771, + "loss": 2.1056, + "step": 25195 + }, + { + "epoch": 2.421993655676247, + "grad_norm": 1.124272108078003, + "learning_rate": 0.00012291558143563897, + "loss": 1.8841, + "step": 25196 + }, + { + "epoch": 2.4220897817937135, + "grad_norm": 1.121279239654541, + "learning_rate": 0.00012290335717895338, + "loss": 1.8755, + "step": 25197 + }, + { + "epoch": 2.4221859079111794, + "grad_norm": 1.2969856262207031, + "learning_rate": 0.00012289113312350633, + "loss": 2.0806, + "step": 25198 + }, + { + "epoch": 2.4222820340286457, + "grad_norm": 1.110382080078125, + "learning_rate": 0.0001228789092693838, + "loss": 1.9482, + "step": 25199 + }, + { + "epoch": 2.4223781601461116, + "grad_norm": 1.3337342739105225, + "learning_rate": 0.00012286668561667187, + "loss": 2.1621, + "step": 25200 + }, + { + "epoch": 2.422474286263578, + "grad_norm": 1.3254082202911377, + "learning_rate": 0.00012285446216545655, + "loss": 2.1893, + "step": 25201 + }, + { + "epoch": 2.422570412381044, + "grad_norm": 1.1944245100021362, + "learning_rate": 0.00012284223891582385, + "loss": 1.9919, + "step": 25202 + }, + { + "epoch": 2.42266653849851, + "grad_norm": 1.2398203611373901, + "learning_rate": 0.00012283001586785978, + "loss": 2.0148, + "step": 25203 + }, + { + "epoch": 2.422762664615976, + "grad_norm": 1.2605360746383667, + "learning_rate": 0.00012281779302165035, + "loss": 2.0375, + "step": 25204 + }, + { + "epoch": 2.4228587907334425, + "grad_norm": 1.135342001914978, + "learning_rate": 0.00012280557037728162, + "loss": 2.1123, + "step": 25205 + }, + { + "epoch": 2.4229549168509084, + "grad_norm": 1.1854443550109863, + "learning_rate": 0.00012279334793483952, + "loss": 2.0334, + "step": 25206 + }, + { + "epoch": 2.4230510429683747, + "grad_norm": 1.1742192506790161, + "learning_rate": 0.00012278112569441012, + "loss": 1.99, + "step": 25207 + }, + { + "epoch": 2.4231471690858406, + "grad_norm": 1.1882743835449219, + "learning_rate": 0.00012276890365607945, + "loss": 2.1965, + "step": 25208 + }, + { + "epoch": 2.4232432952033065, + "grad_norm": 1.1961770057678223, + "learning_rate": 0.00012275668181993345, + "loss": 2.1329, + "step": 25209 + }, + { + "epoch": 2.423339421320773, + "grad_norm": 1.1444106101989746, + "learning_rate": 0.00012274446018605817, + "loss": 1.9797, + "step": 25210 + }, + { + "epoch": 2.423435547438239, + "grad_norm": 1.078829050064087, + "learning_rate": 0.0001227322387545396, + "loss": 2.1003, + "step": 25211 + }, + { + "epoch": 2.423531673555705, + "grad_norm": 1.1232075691223145, + "learning_rate": 0.00012272001752546374, + "loss": 1.7929, + "step": 25212 + }, + { + "epoch": 2.423627799673171, + "grad_norm": 1.2251237630844116, + "learning_rate": 0.00012270779649891663, + "loss": 1.9756, + "step": 25213 + }, + { + "epoch": 2.4237239257906373, + "grad_norm": 1.1237047910690308, + "learning_rate": 0.00012269557567498422, + "loss": 1.8401, + "step": 25214 + }, + { + "epoch": 2.4238200519081037, + "grad_norm": 1.0351344347000122, + "learning_rate": 0.00012268335505375254, + "loss": 1.9672, + "step": 25215 + }, + { + "epoch": 2.4239161780255696, + "grad_norm": 1.2530370950698853, + "learning_rate": 0.00012267113463530763, + "loss": 2.0566, + "step": 25216 + }, + { + "epoch": 2.4240123041430355, + "grad_norm": 1.1173094511032104, + "learning_rate": 0.0001226589144197354, + "loss": 2.0009, + "step": 25217 + }, + { + "epoch": 2.424108430260502, + "grad_norm": 1.193070411682129, + "learning_rate": 0.0001226466944071219, + "loss": 2.0099, + "step": 25218 + }, + { + "epoch": 2.4242045563779677, + "grad_norm": 0.9649573564529419, + "learning_rate": 0.0001226344745975531, + "loss": 1.8503, + "step": 25219 + }, + { + "epoch": 2.424300682495434, + "grad_norm": 1.2137463092803955, + "learning_rate": 0.000122622254991115, + "loss": 2.1466, + "step": 25220 + }, + { + "epoch": 2.4243968086129, + "grad_norm": 1.096565842628479, + "learning_rate": 0.0001226100355878936, + "loss": 2.1025, + "step": 25221 + }, + { + "epoch": 2.4244929347303663, + "grad_norm": 1.0390263795852661, + "learning_rate": 0.00012259781638797486, + "loss": 1.8576, + "step": 25222 + }, + { + "epoch": 2.424589060847832, + "grad_norm": 1.378675103187561, + "learning_rate": 0.0001225855973914448, + "loss": 2.0338, + "step": 25223 + }, + { + "epoch": 2.4246851869652986, + "grad_norm": 1.1839474439620972, + "learning_rate": 0.0001225733785983894, + "loss": 2.1048, + "step": 25224 + }, + { + "epoch": 2.4247813130827645, + "grad_norm": 1.1150667667388916, + "learning_rate": 0.00012256116000889468, + "loss": 2.157, + "step": 25225 + }, + { + "epoch": 2.424877439200231, + "grad_norm": 1.0290614366531372, + "learning_rate": 0.0001225489416230465, + "loss": 2.0189, + "step": 25226 + }, + { + "epoch": 2.4249735653176967, + "grad_norm": 1.108399510383606, + "learning_rate": 0.000122536723440931, + "loss": 1.9982, + "step": 25227 + }, + { + "epoch": 2.425069691435163, + "grad_norm": 1.0816566944122314, + "learning_rate": 0.00012252450546263406, + "loss": 1.8966, + "step": 25228 + }, + { + "epoch": 2.425165817552629, + "grad_norm": 0.9764103293418884, + "learning_rate": 0.0001225122876882417, + "loss": 1.9818, + "step": 25229 + }, + { + "epoch": 2.4252619436700953, + "grad_norm": 1.1522589921951294, + "learning_rate": 0.00012250007011783986, + "loss": 2.0712, + "step": 25230 + }, + { + "epoch": 2.425358069787561, + "grad_norm": 1.0834097862243652, + "learning_rate": 0.0001224878527515146, + "loss": 2.1666, + "step": 25231 + }, + { + "epoch": 2.4254541959050275, + "grad_norm": 0.9838570952415466, + "learning_rate": 0.0001224756355893518, + "loss": 1.8923, + "step": 25232 + }, + { + "epoch": 2.4255503220224934, + "grad_norm": 1.156899094581604, + "learning_rate": 0.00012246341863143747, + "loss": 2.0062, + "step": 25233 + }, + { + "epoch": 2.4256464481399598, + "grad_norm": 1.2714070081710815, + "learning_rate": 0.00012245120187785762, + "loss": 1.8679, + "step": 25234 + }, + { + "epoch": 2.4257425742574257, + "grad_norm": 1.1456565856933594, + "learning_rate": 0.00012243898532869816, + "loss": 1.9564, + "step": 25235 + }, + { + "epoch": 2.425838700374892, + "grad_norm": 1.0576852560043335, + "learning_rate": 0.00012242676898404508, + "loss": 1.948, + "step": 25236 + }, + { + "epoch": 2.425934826492358, + "grad_norm": 1.176115870475769, + "learning_rate": 0.00012241455284398437, + "loss": 2.0364, + "step": 25237 + }, + { + "epoch": 2.4260309526098243, + "grad_norm": 1.2362438440322876, + "learning_rate": 0.00012240233690860198, + "loss": 2.1841, + "step": 25238 + }, + { + "epoch": 2.42612707872729, + "grad_norm": 1.1614930629730225, + "learning_rate": 0.0001223901211779839, + "loss": 1.8014, + "step": 25239 + }, + { + "epoch": 2.4262232048447565, + "grad_norm": 1.1074979305267334, + "learning_rate": 0.00012237790565221605, + "loss": 2.0596, + "step": 25240 + }, + { + "epoch": 2.4263193309622224, + "grad_norm": 1.1147719621658325, + "learning_rate": 0.00012236569033138438, + "loss": 1.9321, + "step": 25241 + }, + { + "epoch": 2.4264154570796883, + "grad_norm": 1.1029659509658813, + "learning_rate": 0.0001223534752155749, + "loss": 2.0543, + "step": 25242 + }, + { + "epoch": 2.4265115831971547, + "grad_norm": 1.101418375968933, + "learning_rate": 0.0001223412603048736, + "loss": 1.9093, + "step": 25243 + }, + { + "epoch": 2.426607709314621, + "grad_norm": 1.0919086933135986, + "learning_rate": 0.00012232904559936634, + "loss": 2.0119, + "step": 25244 + }, + { + "epoch": 2.426703835432087, + "grad_norm": 1.2496395111083984, + "learning_rate": 0.0001223168310991391, + "loss": 2.0654, + "step": 25245 + }, + { + "epoch": 2.426799961549553, + "grad_norm": 1.0601186752319336, + "learning_rate": 0.00012230461680427793, + "loss": 1.8529, + "step": 25246 + }, + { + "epoch": 2.426896087667019, + "grad_norm": 1.1070151329040527, + "learning_rate": 0.00012229240271486867, + "loss": 1.9537, + "step": 25247 + }, + { + "epoch": 2.4269922137844855, + "grad_norm": 1.113150715827942, + "learning_rate": 0.00012228018883099736, + "loss": 2.1293, + "step": 25248 + }, + { + "epoch": 2.4270883399019514, + "grad_norm": 1.4572639465332031, + "learning_rate": 0.00012226797515274988, + "loss": 2.0692, + "step": 25249 + }, + { + "epoch": 2.4271844660194173, + "grad_norm": 1.1215407848358154, + "learning_rate": 0.00012225576168021222, + "loss": 2.0716, + "step": 25250 + }, + { + "epoch": 2.4272805921368836, + "grad_norm": 1.1117496490478516, + "learning_rate": 0.00012224354841347033, + "loss": 2.0002, + "step": 25251 + }, + { + "epoch": 2.4273767182543495, + "grad_norm": 1.238883137702942, + "learning_rate": 0.0001222313353526101, + "loss": 2.2924, + "step": 25252 + }, + { + "epoch": 2.427472844371816, + "grad_norm": 1.2802048921585083, + "learning_rate": 0.00012221912249771755, + "loss": 1.8891, + "step": 25253 + }, + { + "epoch": 2.4275689704892818, + "grad_norm": 1.2127697467803955, + "learning_rate": 0.00012220690984887859, + "loss": 2.1503, + "step": 25254 + }, + { + "epoch": 2.427665096606748, + "grad_norm": 1.214174747467041, + "learning_rate": 0.00012219469740617914, + "loss": 1.9796, + "step": 25255 + }, + { + "epoch": 2.427761222724214, + "grad_norm": 1.0042678117752075, + "learning_rate": 0.00012218248516970522, + "loss": 1.9735, + "step": 25256 + }, + { + "epoch": 2.4278573488416804, + "grad_norm": 1.0576494932174683, + "learning_rate": 0.00012217027313954264, + "loss": 1.9031, + "step": 25257 + }, + { + "epoch": 2.4279534749591463, + "grad_norm": 1.0610655546188354, + "learning_rate": 0.00012215806131577747, + "loss": 2.0781, + "step": 25258 + }, + { + "epoch": 2.4280496010766126, + "grad_norm": 1.1905614137649536, + "learning_rate": 0.00012214584969849555, + "loss": 2.0252, + "step": 25259 + }, + { + "epoch": 2.4281457271940785, + "grad_norm": 1.0735397338867188, + "learning_rate": 0.0001221336382877829, + "loss": 2.0615, + "step": 25260 + }, + { + "epoch": 2.428241853311545, + "grad_norm": 1.129071831703186, + "learning_rate": 0.00012212142708372536, + "loss": 2.0179, + "step": 25261 + }, + { + "epoch": 2.4283379794290108, + "grad_norm": 1.125108242034912, + "learning_rate": 0.0001221092160864089, + "loss": 1.8961, + "step": 25262 + }, + { + "epoch": 2.428434105546477, + "grad_norm": 1.1504335403442383, + "learning_rate": 0.00012209700529591949, + "loss": 2.1324, + "step": 25263 + }, + { + "epoch": 2.428530231663943, + "grad_norm": 1.1720170974731445, + "learning_rate": 0.000122084794712343, + "loss": 1.8841, + "step": 25264 + }, + { + "epoch": 2.4286263577814093, + "grad_norm": 1.089199423789978, + "learning_rate": 0.0001220725843357654, + "loss": 2.0619, + "step": 25265 + }, + { + "epoch": 2.4287224838988752, + "grad_norm": 1.3215090036392212, + "learning_rate": 0.0001220603741662726, + "loss": 2.1191, + "step": 25266 + }, + { + "epoch": 2.4288186100163416, + "grad_norm": 1.122169852256775, + "learning_rate": 0.00012204816420395055, + "loss": 2.0442, + "step": 25267 + }, + { + "epoch": 2.4289147361338075, + "grad_norm": 1.2089632749557495, + "learning_rate": 0.00012203595444888513, + "loss": 2.014, + "step": 25268 + }, + { + "epoch": 2.429010862251274, + "grad_norm": 1.1882379055023193, + "learning_rate": 0.00012202374490116226, + "loss": 1.9915, + "step": 25269 + }, + { + "epoch": 2.4291069883687397, + "grad_norm": 1.0485122203826904, + "learning_rate": 0.00012201153556086792, + "loss": 1.9581, + "step": 25270 + }, + { + "epoch": 2.429203114486206, + "grad_norm": 1.0538790225982666, + "learning_rate": 0.00012199932642808797, + "loss": 1.9688, + "step": 25271 + }, + { + "epoch": 2.429299240603672, + "grad_norm": 1.2206430435180664, + "learning_rate": 0.00012198711750290833, + "loss": 1.9491, + "step": 25272 + }, + { + "epoch": 2.4293953667211383, + "grad_norm": 1.1463412046432495, + "learning_rate": 0.00012197490878541494, + "loss": 2.0014, + "step": 25273 + }, + { + "epoch": 2.429491492838604, + "grad_norm": 1.0983097553253174, + "learning_rate": 0.00012196270027569372, + "loss": 1.8618, + "step": 25274 + }, + { + "epoch": 2.42958761895607, + "grad_norm": 1.3192845582962036, + "learning_rate": 0.00012195049197383057, + "loss": 1.9368, + "step": 25275 + }, + { + "epoch": 2.4296837450735365, + "grad_norm": 1.1489605903625488, + "learning_rate": 0.00012193828387991141, + "loss": 2.0134, + "step": 25276 + }, + { + "epoch": 2.429779871191003, + "grad_norm": 1.1385533809661865, + "learning_rate": 0.00012192607599402214, + "loss": 1.8645, + "step": 25277 + }, + { + "epoch": 2.4298759973084687, + "grad_norm": 1.0811936855316162, + "learning_rate": 0.00012191386831624864, + "loss": 1.955, + "step": 25278 + }, + { + "epoch": 2.4299721234259346, + "grad_norm": 1.060245156288147, + "learning_rate": 0.00012190166084667687, + "loss": 1.8461, + "step": 25279 + }, + { + "epoch": 2.430068249543401, + "grad_norm": 1.3399144411087036, + "learning_rate": 0.0001218894535853927, + "loss": 2.1296, + "step": 25280 + }, + { + "epoch": 2.4301643756608673, + "grad_norm": 1.2183057069778442, + "learning_rate": 0.00012187724653248204, + "loss": 2.0206, + "step": 25281 + }, + { + "epoch": 2.430260501778333, + "grad_norm": 1.0597132444381714, + "learning_rate": 0.00012186503968803082, + "loss": 2.0759, + "step": 25282 + }, + { + "epoch": 2.430356627895799, + "grad_norm": 1.128045678138733, + "learning_rate": 0.00012185283305212491, + "loss": 2.0364, + "step": 25283 + }, + { + "epoch": 2.4304527540132654, + "grad_norm": 1.0199905633926392, + "learning_rate": 0.00012184062662485023, + "loss": 2.033, + "step": 25284 + }, + { + "epoch": 2.4305488801307313, + "grad_norm": 1.0715054273605347, + "learning_rate": 0.00012182842040629268, + "loss": 1.9928, + "step": 25285 + }, + { + "epoch": 2.4306450062481977, + "grad_norm": 1.10431706905365, + "learning_rate": 0.00012181621439653815, + "loss": 1.8153, + "step": 25286 + }, + { + "epoch": 2.4307411323656636, + "grad_norm": 1.125557541847229, + "learning_rate": 0.00012180400859567251, + "loss": 1.9723, + "step": 25287 + }, + { + "epoch": 2.43083725848313, + "grad_norm": 1.0235958099365234, + "learning_rate": 0.00012179180300378169, + "loss": 1.7822, + "step": 25288 + }, + { + "epoch": 2.430933384600596, + "grad_norm": 1.0947767496109009, + "learning_rate": 0.00012177959762095155, + "loss": 2.0107, + "step": 25289 + }, + { + "epoch": 2.431029510718062, + "grad_norm": 1.0597244501113892, + "learning_rate": 0.00012176739244726802, + "loss": 1.9866, + "step": 25290 + }, + { + "epoch": 2.431125636835528, + "grad_norm": 1.1987501382827759, + "learning_rate": 0.00012175518748281695, + "loss": 1.9525, + "step": 25291 + }, + { + "epoch": 2.4312217629529944, + "grad_norm": 1.2590986490249634, + "learning_rate": 0.00012174298272768426, + "loss": 1.769, + "step": 25292 + }, + { + "epoch": 2.4313178890704603, + "grad_norm": 1.1846582889556885, + "learning_rate": 0.0001217307781819558, + "loss": 1.9945, + "step": 25293 + }, + { + "epoch": 2.4314140151879267, + "grad_norm": 1.1995244026184082, + "learning_rate": 0.00012171857384571752, + "loss": 1.9917, + "step": 25294 + }, + { + "epoch": 2.4315101413053926, + "grad_norm": 1.2566437721252441, + "learning_rate": 0.00012170636971905523, + "loss": 2.0085, + "step": 25295 + }, + { + "epoch": 2.431606267422859, + "grad_norm": 1.2456647157669067, + "learning_rate": 0.00012169416580205487, + "loss": 2.1127, + "step": 25296 + }, + { + "epoch": 2.431702393540325, + "grad_norm": 1.090614914894104, + "learning_rate": 0.00012168196209480224, + "loss": 1.9906, + "step": 25297 + }, + { + "epoch": 2.431798519657791, + "grad_norm": 1.0940793752670288, + "learning_rate": 0.00012166975859738332, + "loss": 2.0677, + "step": 25298 + }, + { + "epoch": 2.431894645775257, + "grad_norm": 1.0502562522888184, + "learning_rate": 0.00012165755530988392, + "loss": 2.1015, + "step": 25299 + }, + { + "epoch": 2.4319907718927234, + "grad_norm": 1.0559104681015015, + "learning_rate": 0.00012164535223238995, + "loss": 2.0124, + "step": 25300 + }, + { + "epoch": 2.4320868980101893, + "grad_norm": 1.1592639684677124, + "learning_rate": 0.00012163314936498726, + "loss": 2.1093, + "step": 25301 + }, + { + "epoch": 2.4321830241276556, + "grad_norm": 1.15425705909729, + "learning_rate": 0.00012162094670776179, + "loss": 1.7387, + "step": 25302 + }, + { + "epoch": 2.4322791502451215, + "grad_norm": 0.995823085308075, + "learning_rate": 0.00012160874426079929, + "loss": 1.8818, + "step": 25303 + }, + { + "epoch": 2.432375276362588, + "grad_norm": 1.074698567390442, + "learning_rate": 0.00012159654202418573, + "loss": 1.9788, + "step": 25304 + }, + { + "epoch": 2.432471402480054, + "grad_norm": 1.2826367616653442, + "learning_rate": 0.00012158433999800694, + "loss": 2.1434, + "step": 25305 + }, + { + "epoch": 2.43256752859752, + "grad_norm": 1.0872896909713745, + "learning_rate": 0.00012157213818234878, + "loss": 2.0206, + "step": 25306 + }, + { + "epoch": 2.432663654714986, + "grad_norm": 1.0390852689743042, + "learning_rate": 0.00012155993657729716, + "loss": 2.0424, + "step": 25307 + }, + { + "epoch": 2.4327597808324524, + "grad_norm": 1.132558822631836, + "learning_rate": 0.0001215477351829379, + "loss": 2.1942, + "step": 25308 + }, + { + "epoch": 2.4328559069499183, + "grad_norm": 1.3726184368133545, + "learning_rate": 0.00012153553399935687, + "loss": 2.1313, + "step": 25309 + }, + { + "epoch": 2.4329520330673846, + "grad_norm": 1.1509233713150024, + "learning_rate": 0.00012152333302663995, + "loss": 2.0132, + "step": 25310 + }, + { + "epoch": 2.4330481591848505, + "grad_norm": 1.27243971824646, + "learning_rate": 0.00012151113226487297, + "loss": 1.9895, + "step": 25311 + }, + { + "epoch": 2.4331442853023164, + "grad_norm": 1.0173616409301758, + "learning_rate": 0.00012149893171414184, + "loss": 1.9972, + "step": 25312 + }, + { + "epoch": 2.4332404114197828, + "grad_norm": 1.1500275135040283, + "learning_rate": 0.00012148673137453234, + "loss": 2.1302, + "step": 25313 + }, + { + "epoch": 2.433336537537249, + "grad_norm": 1.2037848234176636, + "learning_rate": 0.00012147453124613036, + "loss": 2.0056, + "step": 25314 + }, + { + "epoch": 2.433432663654715, + "grad_norm": 1.1852574348449707, + "learning_rate": 0.0001214623313290218, + "loss": 1.8069, + "step": 25315 + }, + { + "epoch": 2.433528789772181, + "grad_norm": 1.0371590852737427, + "learning_rate": 0.00012145013162329249, + "loss": 2.07, + "step": 25316 + }, + { + "epoch": 2.4336249158896472, + "grad_norm": 1.1246490478515625, + "learning_rate": 0.00012143793212902824, + "loss": 1.9352, + "step": 25317 + }, + { + "epoch": 2.433721042007113, + "grad_norm": 0.9901874661445618, + "learning_rate": 0.00012142573284631493, + "loss": 1.8574, + "step": 25318 + }, + { + "epoch": 2.4338171681245795, + "grad_norm": 1.298791527748108, + "learning_rate": 0.00012141353377523842, + "loss": 2.1642, + "step": 25319 + }, + { + "epoch": 2.4339132942420454, + "grad_norm": 1.2422308921813965, + "learning_rate": 0.00012140133491588452, + "loss": 2.0835, + "step": 25320 + }, + { + "epoch": 2.4340094203595117, + "grad_norm": 1.0368995666503906, + "learning_rate": 0.00012138913626833911, + "loss": 2.0885, + "step": 25321 + }, + { + "epoch": 2.4341055464769776, + "grad_norm": 1.029594898223877, + "learning_rate": 0.00012137693783268803, + "loss": 1.8379, + "step": 25322 + }, + { + "epoch": 2.434201672594444, + "grad_norm": 1.1638267040252686, + "learning_rate": 0.00012136473960901708, + "loss": 2.0236, + "step": 25323 + }, + { + "epoch": 2.43429779871191, + "grad_norm": 1.2674728631973267, + "learning_rate": 0.00012135254159741217, + "loss": 2.1556, + "step": 25324 + }, + { + "epoch": 2.434393924829376, + "grad_norm": 1.2443373203277588, + "learning_rate": 0.00012134034379795907, + "loss": 1.983, + "step": 25325 + }, + { + "epoch": 2.434490050946842, + "grad_norm": 1.2512705326080322, + "learning_rate": 0.00012132814621074367, + "loss": 2.0443, + "step": 25326 + }, + { + "epoch": 2.4345861770643085, + "grad_norm": 1.134260892868042, + "learning_rate": 0.00012131594883585177, + "loss": 1.9453, + "step": 25327 + }, + { + "epoch": 2.4346823031817744, + "grad_norm": 1.2701336145401, + "learning_rate": 0.00012130375167336922, + "loss": 1.8791, + "step": 25328 + }, + { + "epoch": 2.4347784292992407, + "grad_norm": 1.1884092092514038, + "learning_rate": 0.00012129155472338187, + "loss": 2.1571, + "step": 25329 + }, + { + "epoch": 2.4348745554167066, + "grad_norm": 1.1943283081054688, + "learning_rate": 0.00012127935798597551, + "loss": 2.1175, + "step": 25330 + }, + { + "epoch": 2.434970681534173, + "grad_norm": 1.1016404628753662, + "learning_rate": 0.00012126716146123599, + "loss": 2.0819, + "step": 25331 + }, + { + "epoch": 2.435066807651639, + "grad_norm": 1.0448118448257446, + "learning_rate": 0.00012125496514924915, + "loss": 1.9407, + "step": 25332 + }, + { + "epoch": 2.435162933769105, + "grad_norm": 1.2104624509811401, + "learning_rate": 0.00012124276905010083, + "loss": 1.9966, + "step": 25333 + }, + { + "epoch": 2.435259059886571, + "grad_norm": 1.1879042387008667, + "learning_rate": 0.00012123057316387682, + "loss": 1.9783, + "step": 25334 + }, + { + "epoch": 2.4353551860040374, + "grad_norm": 1.0093921422958374, + "learning_rate": 0.00012121837749066297, + "loss": 1.8833, + "step": 25335 + }, + { + "epoch": 2.4354513121215033, + "grad_norm": 0.9544417858123779, + "learning_rate": 0.00012120618203054509, + "loss": 1.8273, + "step": 25336 + }, + { + "epoch": 2.4355474382389697, + "grad_norm": 1.2816765308380127, + "learning_rate": 0.00012119398678360901, + "loss": 2.1334, + "step": 25337 + }, + { + "epoch": 2.4356435643564356, + "grad_norm": 0.98256516456604, + "learning_rate": 0.00012118179174994053, + "loss": 2.0425, + "step": 25338 + }, + { + "epoch": 2.435739690473902, + "grad_norm": 1.0271848440170288, + "learning_rate": 0.00012116959692962549, + "loss": 1.8927, + "step": 25339 + }, + { + "epoch": 2.435835816591368, + "grad_norm": 1.3325124979019165, + "learning_rate": 0.00012115740232274969, + "loss": 2.267, + "step": 25340 + }, + { + "epoch": 2.435931942708834, + "grad_norm": 1.0460138320922852, + "learning_rate": 0.00012114520792939897, + "loss": 1.9244, + "step": 25341 + }, + { + "epoch": 2.4360280688263, + "grad_norm": 1.1616344451904297, + "learning_rate": 0.00012113301374965912, + "loss": 2.1467, + "step": 25342 + }, + { + "epoch": 2.4361241949437664, + "grad_norm": 1.2299453020095825, + "learning_rate": 0.00012112081978361597, + "loss": 2.0338, + "step": 25343 + }, + { + "epoch": 2.4362203210612323, + "grad_norm": 1.4337875843048096, + "learning_rate": 0.0001211086260313553, + "loss": 1.964, + "step": 25344 + }, + { + "epoch": 2.436316447178698, + "grad_norm": 1.0250674486160278, + "learning_rate": 0.00012109643249296295, + "loss": 1.8762, + "step": 25345 + }, + { + "epoch": 2.4364125732961646, + "grad_norm": 1.3052639961242676, + "learning_rate": 0.00012108423916852471, + "loss": 2.0912, + "step": 25346 + }, + { + "epoch": 2.436508699413631, + "grad_norm": 0.9983817934989929, + "learning_rate": 0.0001210720460581264, + "loss": 2.1082, + "step": 25347 + }, + { + "epoch": 2.436604825531097, + "grad_norm": 1.246115803718567, + "learning_rate": 0.00012105985316185378, + "loss": 1.8469, + "step": 25348 + }, + { + "epoch": 2.4367009516485627, + "grad_norm": 1.2021163702011108, + "learning_rate": 0.00012104766047979272, + "loss": 2.1168, + "step": 25349 + }, + { + "epoch": 2.436797077766029, + "grad_norm": 1.1441850662231445, + "learning_rate": 0.00012103546801202902, + "loss": 2.103, + "step": 25350 + }, + { + "epoch": 2.436893203883495, + "grad_norm": 1.3483963012695312, + "learning_rate": 0.00012102327575864842, + "loss": 2.1659, + "step": 25351 + }, + { + "epoch": 2.4369893300009613, + "grad_norm": 1.0641648769378662, + "learning_rate": 0.00012101108371973675, + "loss": 1.7214, + "step": 25352 + }, + { + "epoch": 2.437085456118427, + "grad_norm": 1.195117712020874, + "learning_rate": 0.00012099889189537983, + "loss": 2.0987, + "step": 25353 + }, + { + "epoch": 2.4371815822358935, + "grad_norm": 1.1328988075256348, + "learning_rate": 0.00012098670028566342, + "loss": 1.9941, + "step": 25354 + }, + { + "epoch": 2.4372777083533594, + "grad_norm": 1.0132484436035156, + "learning_rate": 0.00012097450889067332, + "loss": 1.9916, + "step": 25355 + }, + { + "epoch": 2.437373834470826, + "grad_norm": 1.0541895627975464, + "learning_rate": 0.00012096231771049533, + "loss": 1.8104, + "step": 25356 + }, + { + "epoch": 2.4374699605882917, + "grad_norm": 1.3173288106918335, + "learning_rate": 0.00012095012674521525, + "loss": 2.0072, + "step": 25357 + }, + { + "epoch": 2.437566086705758, + "grad_norm": 1.2204771041870117, + "learning_rate": 0.00012093793599491886, + "loss": 1.9959, + "step": 25358 + }, + { + "epoch": 2.437662212823224, + "grad_norm": 1.1247807741165161, + "learning_rate": 0.00012092574545969194, + "loss": 2.0523, + "step": 25359 + }, + { + "epoch": 2.4377583389406903, + "grad_norm": 1.1782023906707764, + "learning_rate": 0.00012091355513962031, + "loss": 1.8728, + "step": 25360 + }, + { + "epoch": 2.437854465058156, + "grad_norm": 1.0633739233016968, + "learning_rate": 0.0001209013650347897, + "loss": 1.9842, + "step": 25361 + }, + { + "epoch": 2.4379505911756225, + "grad_norm": 1.2723950147628784, + "learning_rate": 0.00012088917514528593, + "loss": 1.9932, + "step": 25362 + }, + { + "epoch": 2.4380467172930884, + "grad_norm": 1.2462083101272583, + "learning_rate": 0.00012087698547119479, + "loss": 1.9935, + "step": 25363 + }, + { + "epoch": 2.4381428434105548, + "grad_norm": 1.1586939096450806, + "learning_rate": 0.00012086479601260204, + "loss": 1.9617, + "step": 25364 + }, + { + "epoch": 2.4382389695280207, + "grad_norm": 1.1893267631530762, + "learning_rate": 0.00012085260676959343, + "loss": 2.0382, + "step": 25365 + }, + { + "epoch": 2.438335095645487, + "grad_norm": 1.1760109663009644, + "learning_rate": 0.0001208404177422548, + "loss": 1.9722, + "step": 25366 + }, + { + "epoch": 2.438431221762953, + "grad_norm": 0.8848485946655273, + "learning_rate": 0.00012082822893067191, + "loss": 1.7818, + "step": 25367 + }, + { + "epoch": 2.4385273478804192, + "grad_norm": 1.3184196949005127, + "learning_rate": 0.00012081604033493051, + "loss": 1.8805, + "step": 25368 + }, + { + "epoch": 2.438623473997885, + "grad_norm": 1.1884241104125977, + "learning_rate": 0.00012080385195511639, + "loss": 2.0394, + "step": 25369 + }, + { + "epoch": 2.4387196001153515, + "grad_norm": 0.9309086799621582, + "learning_rate": 0.00012079166379131532, + "loss": 1.6774, + "step": 25370 + }, + { + "epoch": 2.4388157262328174, + "grad_norm": 1.1384098529815674, + "learning_rate": 0.00012077947584361306, + "loss": 1.9346, + "step": 25371 + }, + { + "epoch": 2.4389118523502837, + "grad_norm": 0.9909814596176147, + "learning_rate": 0.00012076728811209537, + "loss": 2.117, + "step": 25372 + }, + { + "epoch": 2.4390079784677496, + "grad_norm": 1.3389755487442017, + "learning_rate": 0.00012075510059684806, + "loss": 2.202, + "step": 25373 + }, + { + "epoch": 2.439104104585216, + "grad_norm": 1.1634900569915771, + "learning_rate": 0.00012074291329795685, + "loss": 1.8583, + "step": 25374 + }, + { + "epoch": 2.439200230702682, + "grad_norm": 1.0824766159057617, + "learning_rate": 0.00012073072621550753, + "loss": 1.7853, + "step": 25375 + }, + { + "epoch": 2.4392963568201482, + "grad_norm": 1.1777167320251465, + "learning_rate": 0.00012071853934958585, + "loss": 1.9588, + "step": 25376 + }, + { + "epoch": 2.439392482937614, + "grad_norm": 1.0692169666290283, + "learning_rate": 0.00012070635270027756, + "loss": 1.9716, + "step": 25377 + }, + { + "epoch": 2.43948860905508, + "grad_norm": 1.2051811218261719, + "learning_rate": 0.00012069416626766845, + "loss": 2.2035, + "step": 25378 + }, + { + "epoch": 2.4395847351725464, + "grad_norm": 0.9523755311965942, + "learning_rate": 0.00012068198005184424, + "loss": 1.7972, + "step": 25379 + }, + { + "epoch": 2.4396808612900127, + "grad_norm": 1.2166482210159302, + "learning_rate": 0.00012066979405289072, + "loss": 2.0723, + "step": 25380 + }, + { + "epoch": 2.4397769874074786, + "grad_norm": 1.1592919826507568, + "learning_rate": 0.00012065760827089364, + "loss": 1.916, + "step": 25381 + }, + { + "epoch": 2.4398731135249445, + "grad_norm": 1.1216237545013428, + "learning_rate": 0.00012064542270593869, + "loss": 1.9524, + "step": 25382 + }, + { + "epoch": 2.439969239642411, + "grad_norm": 1.0405776500701904, + "learning_rate": 0.00012063323735811173, + "loss": 1.7513, + "step": 25383 + }, + { + "epoch": 2.440065365759877, + "grad_norm": 0.930481493473053, + "learning_rate": 0.00012062105222749844, + "loss": 1.8397, + "step": 25384 + }, + { + "epoch": 2.440161491877343, + "grad_norm": 1.0833144187927246, + "learning_rate": 0.00012060886731418459, + "loss": 2.054, + "step": 25385 + }, + { + "epoch": 2.440257617994809, + "grad_norm": 1.062915325164795, + "learning_rate": 0.0001205966826182559, + "loss": 2.0759, + "step": 25386 + }, + { + "epoch": 2.4403537441122753, + "grad_norm": 1.2029221057891846, + "learning_rate": 0.00012058449813979815, + "loss": 1.9973, + "step": 25387 + }, + { + "epoch": 2.4404498702297412, + "grad_norm": 1.2304188013076782, + "learning_rate": 0.00012057231387889707, + "loss": 1.8977, + "step": 25388 + }, + { + "epoch": 2.4405459963472076, + "grad_norm": 1.1115309000015259, + "learning_rate": 0.00012056012983563841, + "loss": 2.1642, + "step": 25389 + }, + { + "epoch": 2.4406421224646735, + "grad_norm": 1.0548830032348633, + "learning_rate": 0.00012054794601010789, + "loss": 1.8627, + "step": 25390 + }, + { + "epoch": 2.44073824858214, + "grad_norm": 1.1167511940002441, + "learning_rate": 0.00012053576240239127, + "loss": 1.969, + "step": 25391 + }, + { + "epoch": 2.4408343746996057, + "grad_norm": 1.2739447355270386, + "learning_rate": 0.00012052357901257428, + "loss": 2.0109, + "step": 25392 + }, + { + "epoch": 2.440930500817072, + "grad_norm": 1.2213823795318604, + "learning_rate": 0.00012051139584074264, + "loss": 1.9756, + "step": 25393 + }, + { + "epoch": 2.441026626934538, + "grad_norm": 1.0846188068389893, + "learning_rate": 0.00012049921288698213, + "loss": 1.9495, + "step": 25394 + }, + { + "epoch": 2.4411227530520043, + "grad_norm": 1.1509497165679932, + "learning_rate": 0.00012048703015137844, + "loss": 2.0529, + "step": 25395 + }, + { + "epoch": 2.4412188791694702, + "grad_norm": 1.0785984992980957, + "learning_rate": 0.00012047484763401731, + "loss": 1.91, + "step": 25396 + }, + { + "epoch": 2.4413150052869366, + "grad_norm": 1.1334584951400757, + "learning_rate": 0.00012046266533498448, + "loss": 1.9747, + "step": 25397 + }, + { + "epoch": 2.4414111314044025, + "grad_norm": 1.2010201215744019, + "learning_rate": 0.00012045048325436567, + "loss": 2.208, + "step": 25398 + }, + { + "epoch": 2.441507257521869, + "grad_norm": 1.347041368484497, + "learning_rate": 0.0001204383013922466, + "loss": 2.0054, + "step": 25399 + }, + { + "epoch": 2.4416033836393347, + "grad_norm": 1.2486748695373535, + "learning_rate": 0.00012042611974871302, + "loss": 2.0493, + "step": 25400 + }, + { + "epoch": 2.441699509756801, + "grad_norm": 1.2922011613845825, + "learning_rate": 0.00012041393832385063, + "loss": 1.972, + "step": 25401 + }, + { + "epoch": 2.441795635874267, + "grad_norm": 1.4280847311019897, + "learning_rate": 0.00012040175711774515, + "loss": 2.0677, + "step": 25402 + }, + { + "epoch": 2.4418917619917333, + "grad_norm": 1.0693633556365967, + "learning_rate": 0.00012038957613048235, + "loss": 1.8178, + "step": 25403 + }, + { + "epoch": 2.441987888109199, + "grad_norm": 0.9596725106239319, + "learning_rate": 0.0001203773953621479, + "loss": 1.6993, + "step": 25404 + }, + { + "epoch": 2.4420840142266655, + "grad_norm": 0.9837289452552795, + "learning_rate": 0.00012036521481282754, + "loss": 1.9252, + "step": 25405 + }, + { + "epoch": 2.4421801403441314, + "grad_norm": 1.0828444957733154, + "learning_rate": 0.00012035303448260696, + "loss": 2.0609, + "step": 25406 + }, + { + "epoch": 2.442276266461598, + "grad_norm": 1.0482628345489502, + "learning_rate": 0.00012034085437157189, + "loss": 2.0703, + "step": 25407 + }, + { + "epoch": 2.4423723925790637, + "grad_norm": 1.1085008382797241, + "learning_rate": 0.00012032867447980807, + "loss": 1.967, + "step": 25408 + }, + { + "epoch": 2.44246851869653, + "grad_norm": 1.1629520654678345, + "learning_rate": 0.00012031649480740117, + "loss": 1.9386, + "step": 25409 + }, + { + "epoch": 2.442564644813996, + "grad_norm": 1.1222532987594604, + "learning_rate": 0.00012030431535443692, + "loss": 2.027, + "step": 25410 + }, + { + "epoch": 2.442660770931462, + "grad_norm": 1.1682758331298828, + "learning_rate": 0.00012029213612100102, + "loss": 2.1056, + "step": 25411 + }, + { + "epoch": 2.442756897048928, + "grad_norm": 1.016206979751587, + "learning_rate": 0.00012027995710717917, + "loss": 1.9836, + "step": 25412 + }, + { + "epoch": 2.4428530231663945, + "grad_norm": 1.1645920276641846, + "learning_rate": 0.00012026777831305709, + "loss": 1.989, + "step": 25413 + }, + { + "epoch": 2.4429491492838604, + "grad_norm": 1.1692283153533936, + "learning_rate": 0.0001202555997387205, + "loss": 2.2008, + "step": 25414 + }, + { + "epoch": 2.4430452754013263, + "grad_norm": 1.1924188137054443, + "learning_rate": 0.00012024342138425508, + "loss": 2.0685, + "step": 25415 + }, + { + "epoch": 2.4431414015187927, + "grad_norm": 1.0543771982192993, + "learning_rate": 0.00012023124324974647, + "loss": 2.0224, + "step": 25416 + }, + { + "epoch": 2.443237527636259, + "grad_norm": 1.0193989276885986, + "learning_rate": 0.00012021906533528049, + "loss": 2.1628, + "step": 25417 + }, + { + "epoch": 2.443333653753725, + "grad_norm": 1.1834306716918945, + "learning_rate": 0.0001202068876409428, + "loss": 2.0974, + "step": 25418 + }, + { + "epoch": 2.443429779871191, + "grad_norm": 1.0505454540252686, + "learning_rate": 0.00012019471016681905, + "loss": 2.0151, + "step": 25419 + }, + { + "epoch": 2.443525905988657, + "grad_norm": 1.0689823627471924, + "learning_rate": 0.00012018253291299498, + "loss": 2.004, + "step": 25420 + }, + { + "epoch": 2.443622032106123, + "grad_norm": 1.1593135595321655, + "learning_rate": 0.00012017035587955625, + "loss": 1.989, + "step": 25421 + }, + { + "epoch": 2.4437181582235894, + "grad_norm": 1.2161576747894287, + "learning_rate": 0.00012015817906658855, + "loss": 1.9874, + "step": 25422 + }, + { + "epoch": 2.4438142843410553, + "grad_norm": 1.1816341876983643, + "learning_rate": 0.00012014600247417761, + "loss": 1.9436, + "step": 25423 + }, + { + "epoch": 2.4439104104585216, + "grad_norm": 1.0866119861602783, + "learning_rate": 0.00012013382610240909, + "loss": 1.9763, + "step": 25424 + }, + { + "epoch": 2.4440065365759875, + "grad_norm": 1.1047965288162231, + "learning_rate": 0.00012012164995136868, + "loss": 2.2204, + "step": 25425 + }, + { + "epoch": 2.444102662693454, + "grad_norm": 0.894714891910553, + "learning_rate": 0.00012010947402114208, + "loss": 1.8642, + "step": 25426 + }, + { + "epoch": 2.44419878881092, + "grad_norm": 1.0218572616577148, + "learning_rate": 0.00012009729831181495, + "loss": 1.8853, + "step": 25427 + }, + { + "epoch": 2.444294914928386, + "grad_norm": 1.0603573322296143, + "learning_rate": 0.00012008512282347298, + "loss": 1.8872, + "step": 25428 + }, + { + "epoch": 2.444391041045852, + "grad_norm": 1.168684959411621, + "learning_rate": 0.00012007294755620185, + "loss": 1.9839, + "step": 25429 + }, + { + "epoch": 2.4444871671633184, + "grad_norm": 1.3044449090957642, + "learning_rate": 0.00012006077251008726, + "loss": 2.1046, + "step": 25430 + }, + { + "epoch": 2.4445832932807843, + "grad_norm": 0.9787259697914124, + "learning_rate": 0.00012004859768521485, + "loss": 2.0378, + "step": 25431 + }, + { + "epoch": 2.4446794193982506, + "grad_norm": 1.0184355974197388, + "learning_rate": 0.00012003642308167031, + "loss": 1.9143, + "step": 25432 + }, + { + "epoch": 2.4447755455157165, + "grad_norm": 1.2053579092025757, + "learning_rate": 0.0001200242486995393, + "loss": 1.9562, + "step": 25433 + }, + { + "epoch": 2.444871671633183, + "grad_norm": 1.1152886152267456, + "learning_rate": 0.00012001207453890753, + "loss": 2.0607, + "step": 25434 + }, + { + "epoch": 2.4449677977506488, + "grad_norm": 1.2057870626449585, + "learning_rate": 0.00011999990059986068, + "loss": 2.0128, + "step": 25435 + }, + { + "epoch": 2.445063923868115, + "grad_norm": 1.18195378780365, + "learning_rate": 0.00011998772688248438, + "loss": 2.0333, + "step": 25436 + }, + { + "epoch": 2.445160049985581, + "grad_norm": 1.3530325889587402, + "learning_rate": 0.00011997555338686432, + "loss": 2.0782, + "step": 25437 + }, + { + "epoch": 2.4452561761030474, + "grad_norm": 1.10814368724823, + "learning_rate": 0.00011996338011308614, + "loss": 1.9503, + "step": 25438 + }, + { + "epoch": 2.4453523022205133, + "grad_norm": 1.1337066888809204, + "learning_rate": 0.00011995120706123554, + "loss": 1.8365, + "step": 25439 + }, + { + "epoch": 2.4454484283379796, + "grad_norm": 1.1445077657699585, + "learning_rate": 0.00011993903423139814, + "loss": 2.2011, + "step": 25440 + }, + { + "epoch": 2.4455445544554455, + "grad_norm": 1.1837687492370605, + "learning_rate": 0.00011992686162365966, + "loss": 2.1392, + "step": 25441 + }, + { + "epoch": 2.445640680572912, + "grad_norm": 1.1649973392486572, + "learning_rate": 0.00011991468923810572, + "loss": 2.0047, + "step": 25442 + }, + { + "epoch": 2.4457368066903777, + "grad_norm": 1.1999866962432861, + "learning_rate": 0.00011990251707482198, + "loss": 2.0047, + "step": 25443 + }, + { + "epoch": 2.445832932807844, + "grad_norm": 1.2143948078155518, + "learning_rate": 0.00011989034513389413, + "loss": 2.1397, + "step": 25444 + }, + { + "epoch": 2.44592905892531, + "grad_norm": 1.2323192358016968, + "learning_rate": 0.00011987817341540776, + "loss": 1.9454, + "step": 25445 + }, + { + "epoch": 2.4460251850427763, + "grad_norm": 1.2665601968765259, + "learning_rate": 0.00011986600191944859, + "loss": 1.908, + "step": 25446 + }, + { + "epoch": 2.4461213111602422, + "grad_norm": 1.0411947965621948, + "learning_rate": 0.00011985383064610223, + "loss": 1.9363, + "step": 25447 + }, + { + "epoch": 2.446217437277708, + "grad_norm": 1.070572853088379, + "learning_rate": 0.00011984165959545435, + "loss": 1.9547, + "step": 25448 + }, + { + "epoch": 2.4463135633951745, + "grad_norm": 0.9648131728172302, + "learning_rate": 0.00011982948876759063, + "loss": 1.9442, + "step": 25449 + }, + { + "epoch": 2.446409689512641, + "grad_norm": 1.13060462474823, + "learning_rate": 0.00011981731816259662, + "loss": 1.8129, + "step": 25450 + }, + { + "epoch": 2.4465058156301067, + "grad_norm": 1.267921805381775, + "learning_rate": 0.00011980514778055808, + "loss": 2.0688, + "step": 25451 + }, + { + "epoch": 2.4466019417475726, + "grad_norm": 1.213947057723999, + "learning_rate": 0.00011979297762156059, + "loss": 2.0868, + "step": 25452 + }, + { + "epoch": 2.446698067865039, + "grad_norm": 1.2381625175476074, + "learning_rate": 0.00011978080768568984, + "loss": 2.0167, + "step": 25453 + }, + { + "epoch": 2.446794193982505, + "grad_norm": 1.1479732990264893, + "learning_rate": 0.00011976863797303141, + "loss": 2.0607, + "step": 25454 + }, + { + "epoch": 2.446890320099971, + "grad_norm": 1.175029993057251, + "learning_rate": 0.00011975646848367097, + "loss": 2.0117, + "step": 25455 + }, + { + "epoch": 2.446986446217437, + "grad_norm": 1.2046828269958496, + "learning_rate": 0.00011974429921769417, + "loss": 2.0821, + "step": 25456 + }, + { + "epoch": 2.4470825723349034, + "grad_norm": 1.078338861465454, + "learning_rate": 0.00011973213017518665, + "loss": 1.9624, + "step": 25457 + }, + { + "epoch": 2.4471786984523693, + "grad_norm": 1.054466724395752, + "learning_rate": 0.00011971996135623402, + "loss": 2.1861, + "step": 25458 + }, + { + "epoch": 2.4472748245698357, + "grad_norm": 1.2694456577301025, + "learning_rate": 0.00011970779276092192, + "loss": 2.0932, + "step": 25459 + }, + { + "epoch": 2.4473709506873016, + "grad_norm": 1.1180559396743774, + "learning_rate": 0.00011969562438933598, + "loss": 1.9213, + "step": 25460 + }, + { + "epoch": 2.447467076804768, + "grad_norm": 1.3817391395568848, + "learning_rate": 0.00011968345624156185, + "loss": 2.0922, + "step": 25461 + }, + { + "epoch": 2.447563202922234, + "grad_norm": 1.1034377813339233, + "learning_rate": 0.00011967128831768513, + "loss": 1.9416, + "step": 25462 + }, + { + "epoch": 2.4476593290397, + "grad_norm": 1.2628642320632935, + "learning_rate": 0.00011965912061779149, + "loss": 2.0766, + "step": 25463 + }, + { + "epoch": 2.447755455157166, + "grad_norm": 1.126621961593628, + "learning_rate": 0.00011964695314196649, + "loss": 1.8379, + "step": 25464 + }, + { + "epoch": 2.4478515812746324, + "grad_norm": 1.3135405778884888, + "learning_rate": 0.00011963478589029583, + "loss": 2.1314, + "step": 25465 + }, + { + "epoch": 2.4479477073920983, + "grad_norm": 1.146533489227295, + "learning_rate": 0.00011962261886286509, + "loss": 1.9816, + "step": 25466 + }, + { + "epoch": 2.4480438335095647, + "grad_norm": 1.0591483116149902, + "learning_rate": 0.00011961045205975987, + "loss": 2.051, + "step": 25467 + }, + { + "epoch": 2.4481399596270306, + "grad_norm": 1.2318514585494995, + "learning_rate": 0.00011959828548106582, + "loss": 2.0741, + "step": 25468 + }, + { + "epoch": 2.448236085744497, + "grad_norm": 1.2326735258102417, + "learning_rate": 0.00011958611912686858, + "loss": 1.8206, + "step": 25469 + }, + { + "epoch": 2.448332211861963, + "grad_norm": 1.2808173894882202, + "learning_rate": 0.00011957395299725372, + "loss": 2.1412, + "step": 25470 + }, + { + "epoch": 2.448428337979429, + "grad_norm": 1.2863588333129883, + "learning_rate": 0.00011956178709230692, + "loss": 1.9827, + "step": 25471 + }, + { + "epoch": 2.448524464096895, + "grad_norm": 1.136353850364685, + "learning_rate": 0.0001195496214121137, + "loss": 1.8973, + "step": 25472 + }, + { + "epoch": 2.4486205902143614, + "grad_norm": 1.2818987369537354, + "learning_rate": 0.00011953745595675975, + "loss": 1.9867, + "step": 25473 + }, + { + "epoch": 2.4487167163318273, + "grad_norm": 1.1279149055480957, + "learning_rate": 0.00011952529072633063, + "loss": 2.0626, + "step": 25474 + }, + { + "epoch": 2.4488128424492936, + "grad_norm": 0.9493167400360107, + "learning_rate": 0.00011951312572091199, + "loss": 1.9118, + "step": 25475 + }, + { + "epoch": 2.4489089685667595, + "grad_norm": 1.1632429361343384, + "learning_rate": 0.00011950096094058941, + "loss": 2.1005, + "step": 25476 + }, + { + "epoch": 2.449005094684226, + "grad_norm": 1.1762073040008545, + "learning_rate": 0.00011948879638544849, + "loss": 2.172, + "step": 25477 + }, + { + "epoch": 2.449101220801692, + "grad_norm": 1.2839912176132202, + "learning_rate": 0.00011947663205557485, + "loss": 2.1577, + "step": 25478 + }, + { + "epoch": 2.449197346919158, + "grad_norm": 1.0079869031906128, + "learning_rate": 0.00011946446795105409, + "loss": 2.025, + "step": 25479 + }, + { + "epoch": 2.449293473036624, + "grad_norm": 1.1595453023910522, + "learning_rate": 0.00011945230407197179, + "loss": 2.0368, + "step": 25480 + }, + { + "epoch": 2.44938959915409, + "grad_norm": 1.245025873184204, + "learning_rate": 0.00011944014041841359, + "loss": 2.13, + "step": 25481 + }, + { + "epoch": 2.4494857252715563, + "grad_norm": 1.087690830230713, + "learning_rate": 0.00011942797699046506, + "loss": 2.0408, + "step": 25482 + }, + { + "epoch": 2.4495818513890226, + "grad_norm": 1.0514183044433594, + "learning_rate": 0.00011941581378821179, + "loss": 2.0473, + "step": 25483 + }, + { + "epoch": 2.4496779775064885, + "grad_norm": 1.1288902759552002, + "learning_rate": 0.00011940365081173935, + "loss": 1.8798, + "step": 25484 + }, + { + "epoch": 2.4497741036239544, + "grad_norm": 0.9557623863220215, + "learning_rate": 0.0001193914880611334, + "loss": 1.8353, + "step": 25485 + }, + { + "epoch": 2.4498702297414208, + "grad_norm": 1.356776237487793, + "learning_rate": 0.0001193793255364795, + "loss": 2.0429, + "step": 25486 + }, + { + "epoch": 2.4499663558588867, + "grad_norm": 1.2281906604766846, + "learning_rate": 0.00011936716323786323, + "loss": 2.098, + "step": 25487 + }, + { + "epoch": 2.450062481976353, + "grad_norm": 0.94427889585495, + "learning_rate": 0.00011935500116537025, + "loss": 1.9743, + "step": 25488 + }, + { + "epoch": 2.450158608093819, + "grad_norm": 1.1472141742706299, + "learning_rate": 0.00011934283931908604, + "loss": 1.912, + "step": 25489 + }, + { + "epoch": 2.4502547342112853, + "grad_norm": 1.1425986289978027, + "learning_rate": 0.00011933067769909619, + "loss": 2.1068, + "step": 25490 + }, + { + "epoch": 2.450350860328751, + "grad_norm": 1.216129183769226, + "learning_rate": 0.00011931851630548635, + "loss": 2.1318, + "step": 25491 + }, + { + "epoch": 2.4504469864462175, + "grad_norm": 1.1147910356521606, + "learning_rate": 0.00011930635513834205, + "loss": 2.0323, + "step": 25492 + }, + { + "epoch": 2.4505431125636834, + "grad_norm": 1.0681767463684082, + "learning_rate": 0.00011929419419774888, + "loss": 1.9356, + "step": 25493 + }, + { + "epoch": 2.4506392386811497, + "grad_norm": 1.119694709777832, + "learning_rate": 0.00011928203348379241, + "loss": 2.0662, + "step": 25494 + }, + { + "epoch": 2.4507353647986156, + "grad_norm": 1.1118638515472412, + "learning_rate": 0.00011926987299655826, + "loss": 2.0336, + "step": 25495 + }, + { + "epoch": 2.450831490916082, + "grad_norm": 1.2462553977966309, + "learning_rate": 0.00011925771273613197, + "loss": 1.8574, + "step": 25496 + }, + { + "epoch": 2.450927617033548, + "grad_norm": 1.0713589191436768, + "learning_rate": 0.00011924555270259913, + "loss": 2.076, + "step": 25497 + }, + { + "epoch": 2.4510237431510142, + "grad_norm": 1.1183429956436157, + "learning_rate": 0.00011923339289604533, + "loss": 1.9173, + "step": 25498 + }, + { + "epoch": 2.45111986926848, + "grad_norm": 1.1202359199523926, + "learning_rate": 0.00011922123331655609, + "loss": 2.0729, + "step": 25499 + }, + { + "epoch": 2.4512159953859465, + "grad_norm": 1.060414433479309, + "learning_rate": 0.000119209073964217, + "loss": 1.992, + "step": 25500 + }, + { + "epoch": 2.4513121215034124, + "grad_norm": 1.0795661211013794, + "learning_rate": 0.00011919691483911362, + "loss": 1.9838, + "step": 25501 + }, + { + "epoch": 2.4514082476208787, + "grad_norm": 1.222452163696289, + "learning_rate": 0.00011918475594133155, + "loss": 1.8811, + "step": 25502 + }, + { + "epoch": 2.4515043737383446, + "grad_norm": 1.0027713775634766, + "learning_rate": 0.00011917259727095632, + "loss": 1.9497, + "step": 25503 + }, + { + "epoch": 2.451600499855811, + "grad_norm": 1.124904751777649, + "learning_rate": 0.0001191604388280735, + "loss": 2.0705, + "step": 25504 + }, + { + "epoch": 2.451696625973277, + "grad_norm": 1.207985281944275, + "learning_rate": 0.00011914828061276866, + "loss": 1.9587, + "step": 25505 + }, + { + "epoch": 2.451792752090743, + "grad_norm": 1.060646653175354, + "learning_rate": 0.00011913612262512736, + "loss": 2.0157, + "step": 25506 + }, + { + "epoch": 2.451888878208209, + "grad_norm": 1.2500191926956177, + "learning_rate": 0.00011912396486523512, + "loss": 1.9649, + "step": 25507 + }, + { + "epoch": 2.4519850043256755, + "grad_norm": 1.3261500597000122, + "learning_rate": 0.00011911180733317754, + "loss": 1.9907, + "step": 25508 + }, + { + "epoch": 2.4520811304431414, + "grad_norm": 1.4009751081466675, + "learning_rate": 0.00011909965002904015, + "loss": 2.0054, + "step": 25509 + }, + { + "epoch": 2.4521772565606077, + "grad_norm": 1.0555514097213745, + "learning_rate": 0.00011908749295290852, + "loss": 2.0929, + "step": 25510 + }, + { + "epoch": 2.4522733826780736, + "grad_norm": 1.2085086107254028, + "learning_rate": 0.00011907533610486815, + "loss": 2.0217, + "step": 25511 + }, + { + "epoch": 2.45236950879554, + "grad_norm": 1.1269906759262085, + "learning_rate": 0.00011906317948500468, + "loss": 2.0892, + "step": 25512 + }, + { + "epoch": 2.452465634913006, + "grad_norm": 1.2207084894180298, + "learning_rate": 0.00011905102309340361, + "loss": 2.0856, + "step": 25513 + }, + { + "epoch": 2.4525617610304717, + "grad_norm": 1.2213573455810547, + "learning_rate": 0.00011903886693015048, + "loss": 2.0854, + "step": 25514 + }, + { + "epoch": 2.452657887147938, + "grad_norm": 1.0513068437576294, + "learning_rate": 0.00011902671099533082, + "loss": 1.9199, + "step": 25515 + }, + { + "epoch": 2.4527540132654044, + "grad_norm": 1.2200431823730469, + "learning_rate": 0.00011901455528903023, + "loss": 2.1038, + "step": 25516 + }, + { + "epoch": 2.4528501393828703, + "grad_norm": 1.0321446657180786, + "learning_rate": 0.0001190023998113342, + "loss": 2.1613, + "step": 25517 + }, + { + "epoch": 2.4529462655003362, + "grad_norm": 1.115297794342041, + "learning_rate": 0.00011899024456232826, + "loss": 2.1986, + "step": 25518 + }, + { + "epoch": 2.4530423916178026, + "grad_norm": 1.2532624006271362, + "learning_rate": 0.00011897808954209802, + "loss": 1.8957, + "step": 25519 + }, + { + "epoch": 2.453138517735269, + "grad_norm": 1.1748554706573486, + "learning_rate": 0.00011896593475072895, + "loss": 2.1555, + "step": 25520 + }, + { + "epoch": 2.453234643852735, + "grad_norm": 0.9961651563644409, + "learning_rate": 0.00011895378018830659, + "loss": 1.943, + "step": 25521 + }, + { + "epoch": 2.4533307699702007, + "grad_norm": 1.2678474187850952, + "learning_rate": 0.0001189416258549165, + "loss": 1.967, + "step": 25522 + }, + { + "epoch": 2.453426896087667, + "grad_norm": 1.0386353731155396, + "learning_rate": 0.00011892947175064418, + "loss": 1.8708, + "step": 25523 + }, + { + "epoch": 2.453523022205133, + "grad_norm": 1.4242972135543823, + "learning_rate": 0.00011891731787557521, + "loss": 2.023, + "step": 25524 + }, + { + "epoch": 2.4536191483225993, + "grad_norm": 1.2658156156539917, + "learning_rate": 0.00011890516422979507, + "loss": 2.0698, + "step": 25525 + }, + { + "epoch": 2.453715274440065, + "grad_norm": 1.099816083908081, + "learning_rate": 0.0001188930108133893, + "loss": 2.0677, + "step": 25526 + }, + { + "epoch": 2.4538114005575316, + "grad_norm": 1.0577301979064941, + "learning_rate": 0.00011888085762644345, + "loss": 1.7237, + "step": 25527 + }, + { + "epoch": 2.4539075266749975, + "grad_norm": 0.9938634634017944, + "learning_rate": 0.00011886870466904298, + "loss": 1.8237, + "step": 25528 + }, + { + "epoch": 2.454003652792464, + "grad_norm": 1.1872233152389526, + "learning_rate": 0.00011885655194127348, + "loss": 2.0478, + "step": 25529 + }, + { + "epoch": 2.4540997789099297, + "grad_norm": 1.112820029258728, + "learning_rate": 0.00011884439944322047, + "loss": 2.0084, + "step": 25530 + }, + { + "epoch": 2.454195905027396, + "grad_norm": 1.2442313432693481, + "learning_rate": 0.00011883224717496943, + "loss": 2.1957, + "step": 25531 + }, + { + "epoch": 2.454292031144862, + "grad_norm": 1.0776338577270508, + "learning_rate": 0.0001188200951366059, + "loss": 1.9067, + "step": 25532 + }, + { + "epoch": 2.4543881572623283, + "grad_norm": 1.087473750114441, + "learning_rate": 0.00011880794332821538, + "loss": 1.9333, + "step": 25533 + }, + { + "epoch": 2.454484283379794, + "grad_norm": 1.2203679084777832, + "learning_rate": 0.00011879579174988338, + "loss": 2.211, + "step": 25534 + }, + { + "epoch": 2.4545804094972605, + "grad_norm": 1.0277754068374634, + "learning_rate": 0.00011878364040169546, + "loss": 1.9464, + "step": 25535 + }, + { + "epoch": 2.4546765356147264, + "grad_norm": 1.5431712865829468, + "learning_rate": 0.00011877148928373705, + "loss": 2.1155, + "step": 25536 + }, + { + "epoch": 2.4547726617321928, + "grad_norm": 1.1801284551620483, + "learning_rate": 0.00011875933839609375, + "loss": 2.0841, + "step": 25537 + }, + { + "epoch": 2.4548687878496587, + "grad_norm": 1.295279860496521, + "learning_rate": 0.000118747187738851, + "loss": 1.9068, + "step": 25538 + }, + { + "epoch": 2.454964913967125, + "grad_norm": 1.2007546424865723, + "learning_rate": 0.0001187350373120943, + "loss": 2.1495, + "step": 25539 + }, + { + "epoch": 2.455061040084591, + "grad_norm": 0.991865336894989, + "learning_rate": 0.00011872288711590922, + "loss": 1.9609, + "step": 25540 + }, + { + "epoch": 2.4551571662020573, + "grad_norm": 1.1855945587158203, + "learning_rate": 0.0001187107371503812, + "loss": 1.8804, + "step": 25541 + }, + { + "epoch": 2.455253292319523, + "grad_norm": 1.0305708646774292, + "learning_rate": 0.00011869858741559579, + "loss": 1.725, + "step": 25542 + }, + { + "epoch": 2.4553494184369895, + "grad_norm": 1.1889592409133911, + "learning_rate": 0.00011868643791163845, + "loss": 2.0665, + "step": 25543 + }, + { + "epoch": 2.4554455445544554, + "grad_norm": 1.0983363389968872, + "learning_rate": 0.0001186742886385947, + "loss": 1.896, + "step": 25544 + }, + { + "epoch": 2.4555416706719218, + "grad_norm": 1.1527513265609741, + "learning_rate": 0.00011866213959654999, + "loss": 2.1325, + "step": 25545 + }, + { + "epoch": 2.4556377967893877, + "grad_norm": 0.9886757135391235, + "learning_rate": 0.00011864999078558988, + "loss": 2.0396, + "step": 25546 + }, + { + "epoch": 2.4557339229068536, + "grad_norm": 1.1143022775650024, + "learning_rate": 0.00011863784220579985, + "loss": 2.1263, + "step": 25547 + }, + { + "epoch": 2.45583004902432, + "grad_norm": 1.0725468397140503, + "learning_rate": 0.00011862569385726536, + "loss": 2.1325, + "step": 25548 + }, + { + "epoch": 2.4559261751417862, + "grad_norm": 1.1676242351531982, + "learning_rate": 0.00011861354574007193, + "loss": 2.1064, + "step": 25549 + }, + { + "epoch": 2.456022301259252, + "grad_norm": 1.2612369060516357, + "learning_rate": 0.00011860139785430503, + "loss": 2.0475, + "step": 25550 + }, + { + "epoch": 2.456118427376718, + "grad_norm": 1.295333743095398, + "learning_rate": 0.00011858925020005014, + "loss": 1.9569, + "step": 25551 + }, + { + "epoch": 2.4562145534941844, + "grad_norm": 1.2015700340270996, + "learning_rate": 0.00011857710277739278, + "loss": 1.9438, + "step": 25552 + }, + { + "epoch": 2.4563106796116507, + "grad_norm": 1.1090220212936401, + "learning_rate": 0.0001185649555864184, + "loss": 1.9665, + "step": 25553 + }, + { + "epoch": 2.4564068057291166, + "grad_norm": 1.1087230443954468, + "learning_rate": 0.0001185528086272125, + "loss": 2.0652, + "step": 25554 + }, + { + "epoch": 2.4565029318465825, + "grad_norm": 1.1370757818222046, + "learning_rate": 0.00011854066189986051, + "loss": 2.0301, + "step": 25555 + }, + { + "epoch": 2.456599057964049, + "grad_norm": 1.1500608921051025, + "learning_rate": 0.00011852851540444798, + "loss": 1.9695, + "step": 25556 + }, + { + "epoch": 2.4566951840815148, + "grad_norm": 1.1563178300857544, + "learning_rate": 0.00011851636914106033, + "loss": 1.9394, + "step": 25557 + }, + { + "epoch": 2.456791310198981, + "grad_norm": 1.1821517944335938, + "learning_rate": 0.0001185042231097831, + "loss": 2.1684, + "step": 25558 + }, + { + "epoch": 2.456887436316447, + "grad_norm": 1.0512275695800781, + "learning_rate": 0.0001184920773107017, + "loss": 1.8122, + "step": 25559 + }, + { + "epoch": 2.4569835624339134, + "grad_norm": 1.0680274963378906, + "learning_rate": 0.00011847993174390164, + "loss": 1.9656, + "step": 25560 + }, + { + "epoch": 2.4570796885513793, + "grad_norm": 1.5176746845245361, + "learning_rate": 0.00011846778640946836, + "loss": 2.1496, + "step": 25561 + }, + { + "epoch": 2.4571758146688456, + "grad_norm": 1.2273719310760498, + "learning_rate": 0.00011845564130748732, + "loss": 1.925, + "step": 25562 + }, + { + "epoch": 2.4572719407863115, + "grad_norm": 1.1492680311203003, + "learning_rate": 0.00011844349643804402, + "loss": 2.0031, + "step": 25563 + }, + { + "epoch": 2.457368066903778, + "grad_norm": 1.1965839862823486, + "learning_rate": 0.00011843135180122394, + "loss": 1.94, + "step": 25564 + }, + { + "epoch": 2.4574641930212437, + "grad_norm": 1.1965751647949219, + "learning_rate": 0.00011841920739711251, + "loss": 2.1585, + "step": 25565 + }, + { + "epoch": 2.45756031913871, + "grad_norm": 1.1176508665084839, + "learning_rate": 0.0001184070632257952, + "loss": 1.9213, + "step": 25566 + }, + { + "epoch": 2.457656445256176, + "grad_norm": 1.169679880142212, + "learning_rate": 0.00011839491928735746, + "loss": 1.9345, + "step": 25567 + }, + { + "epoch": 2.4577525713736423, + "grad_norm": 1.1737345457077026, + "learning_rate": 0.00011838277558188479, + "loss": 2.0522, + "step": 25568 + }, + { + "epoch": 2.4578486974911082, + "grad_norm": 1.1109813451766968, + "learning_rate": 0.00011837063210946258, + "loss": 2.0775, + "step": 25569 + }, + { + "epoch": 2.4579448236085746, + "grad_norm": 1.4658089876174927, + "learning_rate": 0.00011835848887017635, + "loss": 2.1302, + "step": 25570 + }, + { + "epoch": 2.4580409497260405, + "grad_norm": 1.1433167457580566, + "learning_rate": 0.00011834634586411152, + "loss": 2.0966, + "step": 25571 + }, + { + "epoch": 2.458137075843507, + "grad_norm": 1.0424038171768188, + "learning_rate": 0.00011833420309135353, + "loss": 1.9828, + "step": 25572 + }, + { + "epoch": 2.4582332019609727, + "grad_norm": 1.2590749263763428, + "learning_rate": 0.00011832206055198787, + "loss": 1.9967, + "step": 25573 + }, + { + "epoch": 2.458329328078439, + "grad_norm": 1.2977681159973145, + "learning_rate": 0.00011830991824609996, + "loss": 2.1728, + "step": 25574 + }, + { + "epoch": 2.458425454195905, + "grad_norm": 1.2689969539642334, + "learning_rate": 0.00011829777617377525, + "loss": 2.1519, + "step": 25575 + }, + { + "epoch": 2.4585215803133713, + "grad_norm": 1.059136152267456, + "learning_rate": 0.00011828563433509919, + "loss": 1.9524, + "step": 25576 + }, + { + "epoch": 2.458617706430837, + "grad_norm": 1.1990050077438354, + "learning_rate": 0.00011827349273015721, + "loss": 2.0958, + "step": 25577 + }, + { + "epoch": 2.4587138325483036, + "grad_norm": 1.1359717845916748, + "learning_rate": 0.00011826135135903477, + "loss": 2.0299, + "step": 25578 + }, + { + "epoch": 2.4588099586657695, + "grad_norm": 1.147946834564209, + "learning_rate": 0.00011824921022181729, + "loss": 1.9309, + "step": 25579 + }, + { + "epoch": 2.4589060847832354, + "grad_norm": 1.099563717842102, + "learning_rate": 0.00011823706931859024, + "loss": 1.9981, + "step": 25580 + }, + { + "epoch": 2.4590022109007017, + "grad_norm": 1.219955325126648, + "learning_rate": 0.00011822492864943904, + "loss": 2.069, + "step": 25581 + }, + { + "epoch": 2.459098337018168, + "grad_norm": 1.4019948244094849, + "learning_rate": 0.00011821278821444915, + "loss": 1.9692, + "step": 25582 + }, + { + "epoch": 2.459194463135634, + "grad_norm": 1.0561829805374146, + "learning_rate": 0.00011820064801370594, + "loss": 1.8165, + "step": 25583 + }, + { + "epoch": 2.4592905892531, + "grad_norm": 1.2870852947235107, + "learning_rate": 0.00011818850804729492, + "loss": 2.1493, + "step": 25584 + }, + { + "epoch": 2.459386715370566, + "grad_norm": 1.1173557043075562, + "learning_rate": 0.00011817636831530148, + "loss": 2.2776, + "step": 25585 + }, + { + "epoch": 2.4594828414880325, + "grad_norm": 1.2394944429397583, + "learning_rate": 0.00011816422881781104, + "loss": 2.0782, + "step": 25586 + }, + { + "epoch": 2.4595789676054984, + "grad_norm": 0.986686110496521, + "learning_rate": 0.00011815208955490905, + "loss": 1.8499, + "step": 25587 + }, + { + "epoch": 2.4596750937229643, + "grad_norm": 1.232016682624817, + "learning_rate": 0.00011813995052668093, + "loss": 2.0479, + "step": 25588 + }, + { + "epoch": 2.4597712198404307, + "grad_norm": 1.2895859479904175, + "learning_rate": 0.00011812781173321208, + "loss": 2.0828, + "step": 25589 + }, + { + "epoch": 2.4598673459578966, + "grad_norm": 1.2453429698944092, + "learning_rate": 0.00011811567317458796, + "loss": 2.1142, + "step": 25590 + }, + { + "epoch": 2.459963472075363, + "grad_norm": 1.1624709367752075, + "learning_rate": 0.00011810353485089395, + "loss": 2.1393, + "step": 25591 + }, + { + "epoch": 2.460059598192829, + "grad_norm": 1.2223976850509644, + "learning_rate": 0.00011809139676221553, + "loss": 1.9665, + "step": 25592 + }, + { + "epoch": 2.460155724310295, + "grad_norm": 1.042525291442871, + "learning_rate": 0.00011807925890863805, + "loss": 2.0221, + "step": 25593 + }, + { + "epoch": 2.460251850427761, + "grad_norm": 1.1855266094207764, + "learning_rate": 0.00011806712129024695, + "loss": 1.9318, + "step": 25594 + }, + { + "epoch": 2.4603479765452274, + "grad_norm": 1.1860706806182861, + "learning_rate": 0.00011805498390712767, + "loss": 2.0599, + "step": 25595 + }, + { + "epoch": 2.4604441026626933, + "grad_norm": 1.1015416383743286, + "learning_rate": 0.00011804284675936555, + "loss": 2.0177, + "step": 25596 + }, + { + "epoch": 2.4605402287801597, + "grad_norm": 1.0571345090866089, + "learning_rate": 0.00011803070984704609, + "loss": 1.9336, + "step": 25597 + }, + { + "epoch": 2.4606363548976256, + "grad_norm": 1.2015725374221802, + "learning_rate": 0.00011801857317025466, + "loss": 2.0222, + "step": 25598 + }, + { + "epoch": 2.460732481015092, + "grad_norm": 1.262048363685608, + "learning_rate": 0.0001180064367290767, + "loss": 2.1659, + "step": 25599 + }, + { + "epoch": 2.460828607132558, + "grad_norm": 1.1254115104675293, + "learning_rate": 0.00011799430052359755, + "loss": 1.97, + "step": 25600 + }, + { + "epoch": 2.460924733250024, + "grad_norm": 1.2337454557418823, + "learning_rate": 0.00011798216455390267, + "loss": 2.0768, + "step": 25601 + }, + { + "epoch": 2.46102085936749, + "grad_norm": 1.0451476573944092, + "learning_rate": 0.00011797002882007743, + "loss": 2.0346, + "step": 25602 + }, + { + "epoch": 2.4611169854849564, + "grad_norm": 1.099139928817749, + "learning_rate": 0.00011795789332220725, + "loss": 1.8418, + "step": 25603 + }, + { + "epoch": 2.4612131116024223, + "grad_norm": 1.2322291135787964, + "learning_rate": 0.00011794575806037749, + "loss": 2.0334, + "step": 25604 + }, + { + "epoch": 2.4613092377198886, + "grad_norm": 1.0315614938735962, + "learning_rate": 0.00011793362303467363, + "loss": 2.0217, + "step": 25605 + }, + { + "epoch": 2.4614053638373545, + "grad_norm": 1.271045446395874, + "learning_rate": 0.00011792148824518098, + "loss": 2.0686, + "step": 25606 + }, + { + "epoch": 2.461501489954821, + "grad_norm": 1.1521347761154175, + "learning_rate": 0.00011790935369198497, + "loss": 2.1136, + "step": 25607 + }, + { + "epoch": 2.4615976160722868, + "grad_norm": 1.0819950103759766, + "learning_rate": 0.00011789721937517101, + "loss": 1.9088, + "step": 25608 + }, + { + "epoch": 2.461693742189753, + "grad_norm": 1.2007479667663574, + "learning_rate": 0.00011788508529482447, + "loss": 2.0442, + "step": 25609 + }, + { + "epoch": 2.461789868307219, + "grad_norm": 1.138363242149353, + "learning_rate": 0.00011787295145103074, + "loss": 2.0657, + "step": 25610 + }, + { + "epoch": 2.4618859944246854, + "grad_norm": 1.104137897491455, + "learning_rate": 0.00011786081784387518, + "loss": 2.0381, + "step": 25611 + }, + { + "epoch": 2.4619821205421513, + "grad_norm": 1.1431361436843872, + "learning_rate": 0.00011784868447344324, + "loss": 2.0569, + "step": 25612 + }, + { + "epoch": 2.4620782466596176, + "grad_norm": 0.9568375945091248, + "learning_rate": 0.00011783655133982022, + "loss": 1.7239, + "step": 25613 + }, + { + "epoch": 2.4621743727770835, + "grad_norm": 1.2773112058639526, + "learning_rate": 0.00011782441844309161, + "loss": 2.0169, + "step": 25614 + }, + { + "epoch": 2.46227049889455, + "grad_norm": 1.1126724481582642, + "learning_rate": 0.00011781228578334271, + "loss": 1.9025, + "step": 25615 + }, + { + "epoch": 2.4623666250120158, + "grad_norm": 1.1430493593215942, + "learning_rate": 0.00011780015336065893, + "loss": 1.8864, + "step": 25616 + }, + { + "epoch": 2.4624627511294817, + "grad_norm": 1.0839794874191284, + "learning_rate": 0.00011778802117512564, + "loss": 1.9674, + "step": 25617 + }, + { + "epoch": 2.462558877246948, + "grad_norm": 1.2554548978805542, + "learning_rate": 0.00011777588922682821, + "loss": 1.9401, + "step": 25618 + }, + { + "epoch": 2.4626550033644143, + "grad_norm": 1.119448184967041, + "learning_rate": 0.00011776375751585201, + "loss": 1.8846, + "step": 25619 + }, + { + "epoch": 2.4627511294818802, + "grad_norm": 1.0603234767913818, + "learning_rate": 0.00011775162604228243, + "loss": 1.9714, + "step": 25620 + }, + { + "epoch": 2.462847255599346, + "grad_norm": 1.0379818677902222, + "learning_rate": 0.00011773949480620484, + "loss": 1.78, + "step": 25621 + }, + { + "epoch": 2.4629433817168125, + "grad_norm": 1.2095754146575928, + "learning_rate": 0.0001177273638077046, + "loss": 1.9179, + "step": 25622 + }, + { + "epoch": 2.4630395078342784, + "grad_norm": 1.0699037313461304, + "learning_rate": 0.00011771523304686706, + "loss": 1.8356, + "step": 25623 + }, + { + "epoch": 2.4631356339517447, + "grad_norm": 1.1063779592514038, + "learning_rate": 0.00011770310252377764, + "loss": 1.9104, + "step": 25624 + }, + { + "epoch": 2.4632317600692106, + "grad_norm": 1.1387513875961304, + "learning_rate": 0.00011769097223852166, + "loss": 2.1264, + "step": 25625 + }, + { + "epoch": 2.463327886186677, + "grad_norm": 1.140595555305481, + "learning_rate": 0.00011767884219118447, + "loss": 1.8854, + "step": 25626 + }, + { + "epoch": 2.463424012304143, + "grad_norm": 1.3767192363739014, + "learning_rate": 0.00011766671238185149, + "loss": 2.0895, + "step": 25627 + }, + { + "epoch": 2.463520138421609, + "grad_norm": 1.1713365316390991, + "learning_rate": 0.00011765458281060801, + "loss": 2.0566, + "step": 25628 + }, + { + "epoch": 2.463616264539075, + "grad_norm": 1.1712942123413086, + "learning_rate": 0.00011764245347753944, + "loss": 2.0866, + "step": 25629 + }, + { + "epoch": 2.4637123906565415, + "grad_norm": 1.2154207229614258, + "learning_rate": 0.00011763032438273107, + "loss": 2.2154, + "step": 25630 + }, + { + "epoch": 2.4638085167740074, + "grad_norm": 1.108077883720398, + "learning_rate": 0.00011761819552626834, + "loss": 2.0535, + "step": 25631 + }, + { + "epoch": 2.4639046428914737, + "grad_norm": 1.1446515321731567, + "learning_rate": 0.00011760606690823655, + "loss": 2.0699, + "step": 25632 + }, + { + "epoch": 2.4640007690089396, + "grad_norm": 1.1180225610733032, + "learning_rate": 0.00011759393852872106, + "loss": 1.9375, + "step": 25633 + }, + { + "epoch": 2.464096895126406, + "grad_norm": 1.0971959829330444, + "learning_rate": 0.00011758181038780725, + "loss": 2.0835, + "step": 25634 + }, + { + "epoch": 2.464193021243872, + "grad_norm": 1.2325788736343384, + "learning_rate": 0.0001175696824855804, + "loss": 1.9203, + "step": 25635 + }, + { + "epoch": 2.464289147361338, + "grad_norm": 0.9690203666687012, + "learning_rate": 0.00011755755482212593, + "loss": 1.96, + "step": 25636 + }, + { + "epoch": 2.464385273478804, + "grad_norm": 1.0197584629058838, + "learning_rate": 0.00011754542739752914, + "loss": 1.9727, + "step": 25637 + }, + { + "epoch": 2.4644813995962704, + "grad_norm": 1.2811065912246704, + "learning_rate": 0.00011753330021187537, + "loss": 2.1446, + "step": 25638 + }, + { + "epoch": 2.4645775257137363, + "grad_norm": 1.103116512298584, + "learning_rate": 0.00011752117326525, + "loss": 1.801, + "step": 25639 + }, + { + "epoch": 2.4646736518312027, + "grad_norm": 1.0485737323760986, + "learning_rate": 0.00011750904655773831, + "loss": 2.0242, + "step": 25640 + }, + { + "epoch": 2.4647697779486686, + "grad_norm": 1.126259684562683, + "learning_rate": 0.00011749692008942568, + "loss": 1.9887, + "step": 25641 + }, + { + "epoch": 2.464865904066135, + "grad_norm": 1.3592486381530762, + "learning_rate": 0.00011748479386039744, + "loss": 2.1088, + "step": 25642 + }, + { + "epoch": 2.464962030183601, + "grad_norm": 1.0307252407073975, + "learning_rate": 0.0001174726678707389, + "loss": 1.9413, + "step": 25643 + }, + { + "epoch": 2.465058156301067, + "grad_norm": 1.0956236124038696, + "learning_rate": 0.00011746054212053544, + "loss": 2.0535, + "step": 25644 + }, + { + "epoch": 2.465154282418533, + "grad_norm": 0.9828141927719116, + "learning_rate": 0.00011744841660987234, + "loss": 2.0046, + "step": 25645 + }, + { + "epoch": 2.4652504085359994, + "grad_norm": 1.3197150230407715, + "learning_rate": 0.00011743629133883494, + "loss": 2.1579, + "step": 25646 + }, + { + "epoch": 2.4653465346534653, + "grad_norm": 1.3757390975952148, + "learning_rate": 0.00011742416630750856, + "loss": 2.2019, + "step": 25647 + }, + { + "epoch": 2.4654426607709317, + "grad_norm": 1.222287893295288, + "learning_rate": 0.00011741204151597857, + "loss": 2.1421, + "step": 25648 + }, + { + "epoch": 2.4655387868883976, + "grad_norm": 1.3402601480484009, + "learning_rate": 0.00011739991696433026, + "loss": 2.1072, + "step": 25649 + }, + { + "epoch": 2.4656349130058635, + "grad_norm": 1.1295053958892822, + "learning_rate": 0.00011738779265264898, + "loss": 1.9671, + "step": 25650 + }, + { + "epoch": 2.46573103912333, + "grad_norm": 1.0638374090194702, + "learning_rate": 0.00011737566858102, + "loss": 1.9944, + "step": 25651 + }, + { + "epoch": 2.465827165240796, + "grad_norm": 1.0619521141052246, + "learning_rate": 0.00011736354474952871, + "loss": 1.9161, + "step": 25652 + }, + { + "epoch": 2.465923291358262, + "grad_norm": 1.062989354133606, + "learning_rate": 0.00011735142115826035, + "loss": 1.9883, + "step": 25653 + }, + { + "epoch": 2.466019417475728, + "grad_norm": 1.155479073524475, + "learning_rate": 0.00011733929780730027, + "loss": 1.8067, + "step": 25654 + }, + { + "epoch": 2.4661155435931943, + "grad_norm": 1.1168582439422607, + "learning_rate": 0.0001173271746967338, + "loss": 1.9018, + "step": 25655 + }, + { + "epoch": 2.46621166971066, + "grad_norm": 1.2266178131103516, + "learning_rate": 0.00011731505182664623, + "loss": 2.1765, + "step": 25656 + }, + { + "epoch": 2.4663077958281265, + "grad_norm": 1.0507246255874634, + "learning_rate": 0.00011730292919712287, + "loss": 1.8976, + "step": 25657 + }, + { + "epoch": 2.4664039219455924, + "grad_norm": 1.372437596321106, + "learning_rate": 0.00011729080680824903, + "loss": 2.159, + "step": 25658 + }, + { + "epoch": 2.466500048063059, + "grad_norm": 1.193766474723816, + "learning_rate": 0.00011727868466011004, + "loss": 2.0417, + "step": 25659 + }, + { + "epoch": 2.4665961741805247, + "grad_norm": 1.1334749460220337, + "learning_rate": 0.00011726656275279118, + "loss": 2.0052, + "step": 25660 + }, + { + "epoch": 2.466692300297991, + "grad_norm": 1.0758166313171387, + "learning_rate": 0.00011725444108637778, + "loss": 1.9588, + "step": 25661 + }, + { + "epoch": 2.466788426415457, + "grad_norm": 1.2624541521072388, + "learning_rate": 0.00011724231966095508, + "loss": 2.1779, + "step": 25662 + }, + { + "epoch": 2.4668845525329233, + "grad_norm": 1.152161955833435, + "learning_rate": 0.00011723019847660845, + "loss": 1.8435, + "step": 25663 + }, + { + "epoch": 2.466980678650389, + "grad_norm": 1.3121466636657715, + "learning_rate": 0.00011721807753342312, + "loss": 1.861, + "step": 25664 + }, + { + "epoch": 2.4670768047678555, + "grad_norm": 1.0123835802078247, + "learning_rate": 0.00011720595683148446, + "loss": 1.8866, + "step": 25665 + }, + { + "epoch": 2.4671729308853214, + "grad_norm": 1.2869747877120972, + "learning_rate": 0.00011719383637087775, + "loss": 2.0117, + "step": 25666 + }, + { + "epoch": 2.4672690570027878, + "grad_norm": 1.2234485149383545, + "learning_rate": 0.00011718171615168825, + "loss": 1.9238, + "step": 25667 + }, + { + "epoch": 2.4673651831202537, + "grad_norm": 1.3165091276168823, + "learning_rate": 0.00011716959617400127, + "loss": 2.171, + "step": 25668 + }, + { + "epoch": 2.46746130923772, + "grad_norm": 1.1368162631988525, + "learning_rate": 0.0001171574764379021, + "loss": 1.911, + "step": 25669 + }, + { + "epoch": 2.467557435355186, + "grad_norm": 1.0363264083862305, + "learning_rate": 0.00011714535694347601, + "loss": 1.9511, + "step": 25670 + }, + { + "epoch": 2.4676535614726522, + "grad_norm": 1.222570538520813, + "learning_rate": 0.00011713323769080834, + "loss": 2.0372, + "step": 25671 + }, + { + "epoch": 2.467749687590118, + "grad_norm": 1.1310542821884155, + "learning_rate": 0.00011712111867998432, + "loss": 2.0041, + "step": 25672 + }, + { + "epoch": 2.4678458137075845, + "grad_norm": 1.247814655303955, + "learning_rate": 0.00011710899991108925, + "loss": 2.1217, + "step": 25673 + }, + { + "epoch": 2.4679419398250504, + "grad_norm": 1.1396551132202148, + "learning_rate": 0.0001170968813842084, + "loss": 2.0567, + "step": 25674 + }, + { + "epoch": 2.4680380659425167, + "grad_norm": 1.0983721017837524, + "learning_rate": 0.00011708476309942706, + "loss": 1.7968, + "step": 25675 + }, + { + "epoch": 2.4681341920599826, + "grad_norm": 1.202335000038147, + "learning_rate": 0.00011707264505683052, + "loss": 2.0754, + "step": 25676 + }, + { + "epoch": 2.468230318177449, + "grad_norm": 1.1233632564544678, + "learning_rate": 0.00011706052725650405, + "loss": 2.1462, + "step": 25677 + }, + { + "epoch": 2.468326444294915, + "grad_norm": 1.0478535890579224, + "learning_rate": 0.00011704840969853292, + "loss": 2.1244, + "step": 25678 + }, + { + "epoch": 2.4684225704123812, + "grad_norm": 1.0149734020233154, + "learning_rate": 0.0001170362923830024, + "loss": 2.0369, + "step": 25679 + }, + { + "epoch": 2.468518696529847, + "grad_norm": 1.2537566423416138, + "learning_rate": 0.00011702417530999776, + "loss": 2.04, + "step": 25680 + }, + { + "epoch": 2.4686148226473135, + "grad_norm": 1.1460098028182983, + "learning_rate": 0.00011701205847960425, + "loss": 2.0128, + "step": 25681 + }, + { + "epoch": 2.4687109487647794, + "grad_norm": 1.0744993686676025, + "learning_rate": 0.00011699994189190718, + "loss": 1.9229, + "step": 25682 + }, + { + "epoch": 2.4688070748822453, + "grad_norm": 1.0463249683380127, + "learning_rate": 0.00011698782554699182, + "loss": 2.1408, + "step": 25683 + }, + { + "epoch": 2.4689032009997116, + "grad_norm": 1.1053194999694824, + "learning_rate": 0.00011697570944494341, + "loss": 2.0721, + "step": 25684 + }, + { + "epoch": 2.468999327117178, + "grad_norm": 1.2442911863327026, + "learning_rate": 0.00011696359358584721, + "loss": 1.9438, + "step": 25685 + }, + { + "epoch": 2.469095453234644, + "grad_norm": 1.0204240083694458, + "learning_rate": 0.00011695147796978848, + "loss": 1.9592, + "step": 25686 + }, + { + "epoch": 2.4691915793521098, + "grad_norm": 1.182969331741333, + "learning_rate": 0.0001169393625968525, + "loss": 2.0449, + "step": 25687 + }, + { + "epoch": 2.469287705469576, + "grad_norm": 1.3409080505371094, + "learning_rate": 0.00011692724746712451, + "loss": 2.1211, + "step": 25688 + }, + { + "epoch": 2.4693838315870424, + "grad_norm": 1.1121344566345215, + "learning_rate": 0.00011691513258068977, + "loss": 1.9117, + "step": 25689 + }, + { + "epoch": 2.4694799577045083, + "grad_norm": 1.0197550058364868, + "learning_rate": 0.00011690301793763352, + "loss": 2.0152, + "step": 25690 + }, + { + "epoch": 2.4695760838219742, + "grad_norm": 1.1407663822174072, + "learning_rate": 0.00011689090353804103, + "loss": 2.2287, + "step": 25691 + }, + { + "epoch": 2.4696722099394406, + "grad_norm": 1.0092923641204834, + "learning_rate": 0.00011687878938199754, + "loss": 1.8657, + "step": 25692 + }, + { + "epoch": 2.4697683360569065, + "grad_norm": 1.2250871658325195, + "learning_rate": 0.00011686667546958833, + "loss": 1.9992, + "step": 25693 + }, + { + "epoch": 2.469864462174373, + "grad_norm": 1.1648545265197754, + "learning_rate": 0.00011685456180089862, + "loss": 1.9082, + "step": 25694 + }, + { + "epoch": 2.4699605882918387, + "grad_norm": 1.197845220565796, + "learning_rate": 0.00011684244837601365, + "loss": 1.966, + "step": 25695 + }, + { + "epoch": 2.470056714409305, + "grad_norm": 1.1408835649490356, + "learning_rate": 0.00011683033519501865, + "loss": 2.0304, + "step": 25696 + }, + { + "epoch": 2.470152840526771, + "grad_norm": 1.09013032913208, + "learning_rate": 0.0001168182222579989, + "loss": 2.1328, + "step": 25697 + }, + { + "epoch": 2.4702489666442373, + "grad_norm": 1.3352750539779663, + "learning_rate": 0.00011680610956503961, + "loss": 2.2015, + "step": 25698 + }, + { + "epoch": 2.470345092761703, + "grad_norm": 1.1770206689834595, + "learning_rate": 0.00011679399711622606, + "loss": 1.9516, + "step": 25699 + }, + { + "epoch": 2.4704412188791696, + "grad_norm": 1.1291886568069458, + "learning_rate": 0.00011678188491164345, + "loss": 2.0913, + "step": 25700 + }, + { + "epoch": 2.4705373449966355, + "grad_norm": 1.2057085037231445, + "learning_rate": 0.00011676977295137704, + "loss": 2.0904, + "step": 25701 + }, + { + "epoch": 2.470633471114102, + "grad_norm": 1.189396858215332, + "learning_rate": 0.00011675766123551203, + "loss": 1.9987, + "step": 25702 + }, + { + "epoch": 2.4707295972315677, + "grad_norm": 1.1328582763671875, + "learning_rate": 0.00011674554976413367, + "loss": 2.1954, + "step": 25703 + }, + { + "epoch": 2.470825723349034, + "grad_norm": 1.0159038305282593, + "learning_rate": 0.00011673343853732722, + "loss": 1.9447, + "step": 25704 + }, + { + "epoch": 2.4709218494665, + "grad_norm": 1.1156277656555176, + "learning_rate": 0.00011672132755517784, + "loss": 2.19, + "step": 25705 + }, + { + "epoch": 2.4710179755839663, + "grad_norm": 1.0912379026412964, + "learning_rate": 0.00011670921681777082, + "loss": 2.1138, + "step": 25706 + }, + { + "epoch": 2.471114101701432, + "grad_norm": 0.9982404708862305, + "learning_rate": 0.00011669710632519135, + "loss": 1.8193, + "step": 25707 + }, + { + "epoch": 2.4712102278188985, + "grad_norm": 1.2110525369644165, + "learning_rate": 0.00011668499607752468, + "loss": 2.123, + "step": 25708 + }, + { + "epoch": 2.4713063539363644, + "grad_norm": 1.2082513570785522, + "learning_rate": 0.00011667288607485601, + "loss": 2.1621, + "step": 25709 + }, + { + "epoch": 2.471402480053831, + "grad_norm": 1.079542636871338, + "learning_rate": 0.00011666077631727057, + "loss": 2.1233, + "step": 25710 + }, + { + "epoch": 2.4714986061712967, + "grad_norm": 1.1925442218780518, + "learning_rate": 0.00011664866680485355, + "loss": 2.0673, + "step": 25711 + }, + { + "epoch": 2.471594732288763, + "grad_norm": 1.0923103094100952, + "learning_rate": 0.00011663655753769022, + "loss": 1.8376, + "step": 25712 + }, + { + "epoch": 2.471690858406229, + "grad_norm": 1.4959790706634521, + "learning_rate": 0.00011662444851586574, + "loss": 2.0837, + "step": 25713 + }, + { + "epoch": 2.4717869845236953, + "grad_norm": 1.0987135171890259, + "learning_rate": 0.00011661233973946538, + "loss": 1.9957, + "step": 25714 + }, + { + "epoch": 2.471883110641161, + "grad_norm": 1.1609219312667847, + "learning_rate": 0.00011660023120857426, + "loss": 1.8882, + "step": 25715 + }, + { + "epoch": 2.471979236758627, + "grad_norm": 0.944667637348175, + "learning_rate": 0.00011658812292327767, + "loss": 1.8978, + "step": 25716 + }, + { + "epoch": 2.4720753628760934, + "grad_norm": 1.2198723554611206, + "learning_rate": 0.00011657601488366084, + "loss": 2.0925, + "step": 25717 + }, + { + "epoch": 2.4721714889935598, + "grad_norm": 0.9915568232536316, + "learning_rate": 0.0001165639070898089, + "loss": 1.9618, + "step": 25718 + }, + { + "epoch": 2.4722676151110257, + "grad_norm": 1.0606452226638794, + "learning_rate": 0.00011655179954180706, + "loss": 1.9156, + "step": 25719 + }, + { + "epoch": 2.4723637412284916, + "grad_norm": 1.1548728942871094, + "learning_rate": 0.00011653969223974059, + "loss": 1.9486, + "step": 25720 + }, + { + "epoch": 2.472459867345958, + "grad_norm": 1.0737193822860718, + "learning_rate": 0.00011652758518369465, + "loss": 1.9189, + "step": 25721 + }, + { + "epoch": 2.4725559934634243, + "grad_norm": 1.1392102241516113, + "learning_rate": 0.00011651547837375442, + "loss": 1.9643, + "step": 25722 + }, + { + "epoch": 2.47265211958089, + "grad_norm": 1.2601063251495361, + "learning_rate": 0.0001165033718100051, + "loss": 1.8687, + "step": 25723 + }, + { + "epoch": 2.472748245698356, + "grad_norm": 1.1887849569320679, + "learning_rate": 0.00011649126549253194, + "loss": 1.9524, + "step": 25724 + }, + { + "epoch": 2.4728443718158224, + "grad_norm": 1.2942979335784912, + "learning_rate": 0.00011647915942142008, + "loss": 2.1977, + "step": 25725 + }, + { + "epoch": 2.4729404979332883, + "grad_norm": 1.2394386529922485, + "learning_rate": 0.00011646705359675471, + "loss": 1.9797, + "step": 25726 + }, + { + "epoch": 2.4730366240507546, + "grad_norm": 1.4115004539489746, + "learning_rate": 0.00011645494801862105, + "loss": 2.1146, + "step": 25727 + }, + { + "epoch": 2.4731327501682205, + "grad_norm": 1.1512646675109863, + "learning_rate": 0.00011644284268710428, + "loss": 2.1627, + "step": 25728 + }, + { + "epoch": 2.473228876285687, + "grad_norm": 1.1024540662765503, + "learning_rate": 0.00011643073760228957, + "loss": 1.9862, + "step": 25729 + }, + { + "epoch": 2.473325002403153, + "grad_norm": 1.1906850337982178, + "learning_rate": 0.00011641863276426213, + "loss": 1.9727, + "step": 25730 + }, + { + "epoch": 2.473421128520619, + "grad_norm": 1.001156210899353, + "learning_rate": 0.00011640652817310713, + "loss": 1.9108, + "step": 25731 + }, + { + "epoch": 2.473517254638085, + "grad_norm": 1.0096194744110107, + "learning_rate": 0.00011639442382890972, + "loss": 2.089, + "step": 25732 + }, + { + "epoch": 2.4736133807555514, + "grad_norm": 1.103507161140442, + "learning_rate": 0.00011638231973175515, + "loss": 1.9892, + "step": 25733 + }, + { + "epoch": 2.4737095068730173, + "grad_norm": 1.111741304397583, + "learning_rate": 0.00011637021588172856, + "loss": 2.0729, + "step": 25734 + }, + { + "epoch": 2.4738056329904836, + "grad_norm": 1.137297511100769, + "learning_rate": 0.0001163581122789151, + "loss": 1.983, + "step": 25735 + }, + { + "epoch": 2.4739017591079495, + "grad_norm": 1.0895068645477295, + "learning_rate": 0.00011634600892340001, + "loss": 2.0803, + "step": 25736 + }, + { + "epoch": 2.473997885225416, + "grad_norm": 1.0663484334945679, + "learning_rate": 0.00011633390581526841, + "loss": 1.8855, + "step": 25737 + }, + { + "epoch": 2.4740940113428818, + "grad_norm": 1.1271533966064453, + "learning_rate": 0.0001163218029546055, + "loss": 1.9352, + "step": 25738 + }, + { + "epoch": 2.474190137460348, + "grad_norm": 1.1513471603393555, + "learning_rate": 0.00011630970034149641, + "loss": 2.0924, + "step": 25739 + }, + { + "epoch": 2.474286263577814, + "grad_norm": 1.2551881074905396, + "learning_rate": 0.00011629759797602636, + "loss": 2.0693, + "step": 25740 + }, + { + "epoch": 2.4743823896952803, + "grad_norm": 1.0738532543182373, + "learning_rate": 0.00011628549585828048, + "loss": 2.0231, + "step": 25741 + }, + { + "epoch": 2.4744785158127462, + "grad_norm": 1.100343108177185, + "learning_rate": 0.00011627339398834393, + "loss": 1.9306, + "step": 25742 + }, + { + "epoch": 2.4745746419302126, + "grad_norm": 1.1103267669677734, + "learning_rate": 0.0001162612923663019, + "loss": 1.9868, + "step": 25743 + }, + { + "epoch": 2.4746707680476785, + "grad_norm": 1.1616319417953491, + "learning_rate": 0.00011624919099223954, + "loss": 1.8679, + "step": 25744 + }, + { + "epoch": 2.474766894165145, + "grad_norm": 1.2139426469802856, + "learning_rate": 0.000116237089866242, + "loss": 1.8658, + "step": 25745 + }, + { + "epoch": 2.4748630202826107, + "grad_norm": 1.179884910583496, + "learning_rate": 0.00011622498898839445, + "loss": 1.9095, + "step": 25746 + }, + { + "epoch": 2.474959146400077, + "grad_norm": 1.1816109418869019, + "learning_rate": 0.00011621288835878204, + "loss": 2.1233, + "step": 25747 + }, + { + "epoch": 2.475055272517543, + "grad_norm": 1.1326642036437988, + "learning_rate": 0.00011620078797748992, + "loss": 2.0436, + "step": 25748 + }, + { + "epoch": 2.4751513986350093, + "grad_norm": 1.2205199003219604, + "learning_rate": 0.0001161886878446032, + "loss": 1.9318, + "step": 25749 + }, + { + "epoch": 2.4752475247524752, + "grad_norm": 1.219718098640442, + "learning_rate": 0.00011617658796020713, + "loss": 2.0825, + "step": 25750 + }, + { + "epoch": 2.4753436508699416, + "grad_norm": 1.2453649044036865, + "learning_rate": 0.0001161644883243868, + "loss": 2.0426, + "step": 25751 + }, + { + "epoch": 2.4754397769874075, + "grad_norm": 1.1759357452392578, + "learning_rate": 0.00011615238893722736, + "loss": 2.1451, + "step": 25752 + }, + { + "epoch": 2.4755359031048734, + "grad_norm": 1.162792682647705, + "learning_rate": 0.00011614028979881396, + "loss": 1.9996, + "step": 25753 + }, + { + "epoch": 2.4756320292223397, + "grad_norm": 1.28296959400177, + "learning_rate": 0.00011612819090923171, + "loss": 2.0374, + "step": 25754 + }, + { + "epoch": 2.475728155339806, + "grad_norm": 1.0221081972122192, + "learning_rate": 0.00011611609226856582, + "loss": 1.9344, + "step": 25755 + }, + { + "epoch": 2.475824281457272, + "grad_norm": 1.1869357824325562, + "learning_rate": 0.00011610399387690136, + "loss": 2.0655, + "step": 25756 + }, + { + "epoch": 2.475920407574738, + "grad_norm": 1.2346251010894775, + "learning_rate": 0.00011609189573432352, + "loss": 2.0424, + "step": 25757 + }, + { + "epoch": 2.476016533692204, + "grad_norm": 1.1647998094558716, + "learning_rate": 0.00011607979784091741, + "loss": 1.9067, + "step": 25758 + }, + { + "epoch": 2.47611265980967, + "grad_norm": 1.1103012561798096, + "learning_rate": 0.00011606770019676816, + "loss": 2.0132, + "step": 25759 + }, + { + "epoch": 2.4762087859271364, + "grad_norm": 1.1918494701385498, + "learning_rate": 0.00011605560280196093, + "loss": 2.0717, + "step": 25760 + }, + { + "epoch": 2.4763049120446023, + "grad_norm": 1.2669769525527954, + "learning_rate": 0.00011604350565658082, + "loss": 2.0387, + "step": 25761 + }, + { + "epoch": 2.4764010381620687, + "grad_norm": 1.1899656057357788, + "learning_rate": 0.00011603140876071297, + "loss": 2.1009, + "step": 25762 + }, + { + "epoch": 2.4764971642795346, + "grad_norm": 0.9874704480171204, + "learning_rate": 0.00011601931211444251, + "loss": 2.0336, + "step": 25763 + }, + { + "epoch": 2.476593290397001, + "grad_norm": 1.0559861660003662, + "learning_rate": 0.00011600721571785458, + "loss": 1.8049, + "step": 25764 + }, + { + "epoch": 2.476689416514467, + "grad_norm": 1.1813337802886963, + "learning_rate": 0.00011599511957103428, + "loss": 2.0155, + "step": 25765 + }, + { + "epoch": 2.476785542631933, + "grad_norm": 1.3506498336791992, + "learning_rate": 0.00011598302367406674, + "loss": 2.1105, + "step": 25766 + }, + { + "epoch": 2.476881668749399, + "grad_norm": 1.0904481410980225, + "learning_rate": 0.00011597092802703706, + "loss": 1.98, + "step": 25767 + }, + { + "epoch": 2.4769777948668654, + "grad_norm": 1.0882600545883179, + "learning_rate": 0.00011595883263003044, + "loss": 1.9928, + "step": 25768 + }, + { + "epoch": 2.4770739209843313, + "grad_norm": 1.091405987739563, + "learning_rate": 0.0001159467374831319, + "loss": 1.8297, + "step": 25769 + }, + { + "epoch": 2.4771700471017977, + "grad_norm": 1.1561931371688843, + "learning_rate": 0.00011593464258642659, + "loss": 1.9613, + "step": 25770 + }, + { + "epoch": 2.4772661732192636, + "grad_norm": 1.166256070137024, + "learning_rate": 0.00011592254793999963, + "loss": 2.0733, + "step": 25771 + }, + { + "epoch": 2.47736229933673, + "grad_norm": 1.196244239807129, + "learning_rate": 0.00011591045354393615, + "loss": 1.9992, + "step": 25772 + }, + { + "epoch": 2.477458425454196, + "grad_norm": 1.287947416305542, + "learning_rate": 0.00011589835939832124, + "loss": 2.0159, + "step": 25773 + }, + { + "epoch": 2.477554551571662, + "grad_norm": 1.1862058639526367, + "learning_rate": 0.00011588626550323998, + "loss": 2.0778, + "step": 25774 + }, + { + "epoch": 2.477650677689128, + "grad_norm": 1.1046442985534668, + "learning_rate": 0.00011587417185877752, + "loss": 1.8622, + "step": 25775 + }, + { + "epoch": 2.4777468038065944, + "grad_norm": 1.1446641683578491, + "learning_rate": 0.00011586207846501894, + "loss": 1.9322, + "step": 25776 + }, + { + "epoch": 2.4778429299240603, + "grad_norm": 1.0133875608444214, + "learning_rate": 0.00011584998532204934, + "loss": 1.9695, + "step": 25777 + }, + { + "epoch": 2.4779390560415266, + "grad_norm": 1.1356680393218994, + "learning_rate": 0.00011583789242995387, + "loss": 1.9174, + "step": 25778 + }, + { + "epoch": 2.4780351821589925, + "grad_norm": 1.090743064880371, + "learning_rate": 0.00011582579978881758, + "loss": 2.0839, + "step": 25779 + }, + { + "epoch": 2.478131308276459, + "grad_norm": 1.0206633806228638, + "learning_rate": 0.00011581370739872556, + "loss": 1.9492, + "step": 25780 + }, + { + "epoch": 2.478227434393925, + "grad_norm": 1.076798677444458, + "learning_rate": 0.00011580161525976295, + "loss": 1.9906, + "step": 25781 + }, + { + "epoch": 2.478323560511391, + "grad_norm": 1.2473030090332031, + "learning_rate": 0.00011578952337201482, + "loss": 1.9947, + "step": 25782 + }, + { + "epoch": 2.478419686628857, + "grad_norm": 1.2450968027114868, + "learning_rate": 0.0001157774317355662, + "loss": 2.099, + "step": 25783 + }, + { + "epoch": 2.4785158127463234, + "grad_norm": 0.9960737228393555, + "learning_rate": 0.0001157653403505023, + "loss": 1.8436, + "step": 25784 + }, + { + "epoch": 2.4786119388637893, + "grad_norm": 1.1366041898727417, + "learning_rate": 0.00011575324921690812, + "loss": 1.9062, + "step": 25785 + }, + { + "epoch": 2.478708064981255, + "grad_norm": 1.0352678298950195, + "learning_rate": 0.00011574115833486879, + "loss": 1.8689, + "step": 25786 + }, + { + "epoch": 2.4788041910987215, + "grad_norm": 1.2775591611862183, + "learning_rate": 0.00011572906770446941, + "loss": 2.2611, + "step": 25787 + }, + { + "epoch": 2.478900317216188, + "grad_norm": 1.0610684156417847, + "learning_rate": 0.00011571697732579501, + "loss": 1.8738, + "step": 25788 + }, + { + "epoch": 2.4789964433336538, + "grad_norm": 1.1484177112579346, + "learning_rate": 0.0001157048871989307, + "loss": 2.1277, + "step": 25789 + }, + { + "epoch": 2.4790925694511197, + "grad_norm": 1.0710798501968384, + "learning_rate": 0.00011569279732396155, + "loss": 1.9159, + "step": 25790 + }, + { + "epoch": 2.479188695568586, + "grad_norm": 1.2627086639404297, + "learning_rate": 0.00011568070770097266, + "loss": 2.0746, + "step": 25791 + }, + { + "epoch": 2.479284821686052, + "grad_norm": 1.1128759384155273, + "learning_rate": 0.0001156686183300491, + "loss": 1.9955, + "step": 25792 + }, + { + "epoch": 2.4793809478035183, + "grad_norm": 1.2876793146133423, + "learning_rate": 0.00011565652921127592, + "loss": 2.1908, + "step": 25793 + }, + { + "epoch": 2.479477073920984, + "grad_norm": 1.0658354759216309, + "learning_rate": 0.00011564444034473823, + "loss": 1.7937, + "step": 25794 + }, + { + "epoch": 2.4795732000384505, + "grad_norm": 1.2545533180236816, + "learning_rate": 0.00011563235173052104, + "loss": 1.9776, + "step": 25795 + }, + { + "epoch": 2.4796693261559164, + "grad_norm": 1.0519682168960571, + "learning_rate": 0.0001156202633687095, + "loss": 2.0257, + "step": 25796 + }, + { + "epoch": 2.4797654522733827, + "grad_norm": 1.1333060264587402, + "learning_rate": 0.00011560817525938863, + "loss": 2.0738, + "step": 25797 + }, + { + "epoch": 2.4798615783908486, + "grad_norm": 1.2414093017578125, + "learning_rate": 0.00011559608740264351, + "loss": 1.9294, + "step": 25798 + }, + { + "epoch": 2.479957704508315, + "grad_norm": 1.1191658973693848, + "learning_rate": 0.00011558399979855918, + "loss": 1.9496, + "step": 25799 + }, + { + "epoch": 2.480053830625781, + "grad_norm": 1.036632776260376, + "learning_rate": 0.0001155719124472207, + "loss": 1.8015, + "step": 25800 + }, + { + "epoch": 2.4801499567432472, + "grad_norm": 1.4207690954208374, + "learning_rate": 0.00011555982534871317, + "loss": 1.9992, + "step": 25801 + }, + { + "epoch": 2.480246082860713, + "grad_norm": 1.0301529169082642, + "learning_rate": 0.00011554773850312162, + "loss": 2.0094, + "step": 25802 + }, + { + "epoch": 2.4803422089781795, + "grad_norm": 1.0565097332000732, + "learning_rate": 0.00011553565191053113, + "loss": 1.926, + "step": 25803 + }, + { + "epoch": 2.4804383350956454, + "grad_norm": 1.0585006475448608, + "learning_rate": 0.00011552356557102672, + "loss": 1.9882, + "step": 25804 + }, + { + "epoch": 2.4805344612131117, + "grad_norm": 1.3419291973114014, + "learning_rate": 0.0001155114794846935, + "loss": 2.0864, + "step": 25805 + }, + { + "epoch": 2.4806305873305776, + "grad_norm": 1.190883755683899, + "learning_rate": 0.00011549939365161646, + "loss": 2.0063, + "step": 25806 + }, + { + "epoch": 2.480726713448044, + "grad_norm": 1.2142645120620728, + "learning_rate": 0.00011548730807188068, + "loss": 1.8805, + "step": 25807 + }, + { + "epoch": 2.48082283956551, + "grad_norm": 1.1310503482818604, + "learning_rate": 0.0001154752227455712, + "loss": 2.0644, + "step": 25808 + }, + { + "epoch": 2.480918965682976, + "grad_norm": 1.078210711479187, + "learning_rate": 0.00011546313767277306, + "loss": 2.0244, + "step": 25809 + }, + { + "epoch": 2.481015091800442, + "grad_norm": 1.1206992864608765, + "learning_rate": 0.00011545105285357134, + "loss": 2.1407, + "step": 25810 + }, + { + "epoch": 2.4811112179179085, + "grad_norm": 1.255691647529602, + "learning_rate": 0.00011543896828805105, + "loss": 1.9695, + "step": 25811 + }, + { + "epoch": 2.4812073440353744, + "grad_norm": 1.1779624223709106, + "learning_rate": 0.0001154268839762972, + "loss": 1.9779, + "step": 25812 + }, + { + "epoch": 2.4813034701528407, + "grad_norm": 1.141563057899475, + "learning_rate": 0.0001154147999183949, + "loss": 1.9572, + "step": 25813 + }, + { + "epoch": 2.4813995962703066, + "grad_norm": 1.1688214540481567, + "learning_rate": 0.00011540271611442914, + "loss": 1.9587, + "step": 25814 + }, + { + "epoch": 2.481495722387773, + "grad_norm": 1.2733243703842163, + "learning_rate": 0.00011539063256448497, + "loss": 2.0761, + "step": 25815 + }, + { + "epoch": 2.481591848505239, + "grad_norm": 1.0147485733032227, + "learning_rate": 0.00011537854926864743, + "loss": 1.8606, + "step": 25816 + }, + { + "epoch": 2.481687974622705, + "grad_norm": 1.1718767881393433, + "learning_rate": 0.00011536646622700148, + "loss": 2.0064, + "step": 25817 + }, + { + "epoch": 2.481784100740171, + "grad_norm": 1.363858699798584, + "learning_rate": 0.00011535438343963228, + "loss": 1.9642, + "step": 25818 + }, + { + "epoch": 2.481880226857637, + "grad_norm": 1.1133322715759277, + "learning_rate": 0.00011534230090662478, + "loss": 2.0721, + "step": 25819 + }, + { + "epoch": 2.4819763529751033, + "grad_norm": 1.0874882936477661, + "learning_rate": 0.00011533021862806401, + "loss": 1.986, + "step": 25820 + }, + { + "epoch": 2.4820724790925697, + "grad_norm": 1.2012193202972412, + "learning_rate": 0.00011531813660403502, + "loss": 2.093, + "step": 25821 + }, + { + "epoch": 2.4821686052100356, + "grad_norm": 1.062402606010437, + "learning_rate": 0.00011530605483462277, + "loss": 1.7741, + "step": 25822 + }, + { + "epoch": 2.4822647313275015, + "grad_norm": 1.1660888195037842, + "learning_rate": 0.00011529397331991237, + "loss": 1.9387, + "step": 25823 + }, + { + "epoch": 2.482360857444968, + "grad_norm": 1.3010128736495972, + "learning_rate": 0.0001152818920599888, + "loss": 2.0603, + "step": 25824 + }, + { + "epoch": 2.482456983562434, + "grad_norm": 1.2032856941223145, + "learning_rate": 0.00011526981105493706, + "loss": 1.963, + "step": 25825 + }, + { + "epoch": 2.4825531096799, + "grad_norm": 1.1370913982391357, + "learning_rate": 0.00011525773030484217, + "loss": 1.9971, + "step": 25826 + }, + { + "epoch": 2.482649235797366, + "grad_norm": 1.125898003578186, + "learning_rate": 0.00011524564980978916, + "loss": 2.0342, + "step": 25827 + }, + { + "epoch": 2.4827453619148323, + "grad_norm": 1.1491512060165405, + "learning_rate": 0.00011523356956986303, + "loss": 2.1206, + "step": 25828 + }, + { + "epoch": 2.482841488032298, + "grad_norm": 1.0749242305755615, + "learning_rate": 0.00011522148958514882, + "loss": 2.0775, + "step": 25829 + }, + { + "epoch": 2.4829376141497645, + "grad_norm": 1.1906906366348267, + "learning_rate": 0.0001152094098557315, + "loss": 1.9191, + "step": 25830 + }, + { + "epoch": 2.4830337402672304, + "grad_norm": 0.9930627942085266, + "learning_rate": 0.00011519733038169606, + "loss": 2.0847, + "step": 25831 + }, + { + "epoch": 2.483129866384697, + "grad_norm": 1.2336379289627075, + "learning_rate": 0.00011518525116312756, + "loss": 2.0408, + "step": 25832 + }, + { + "epoch": 2.4832259925021627, + "grad_norm": 1.232784628868103, + "learning_rate": 0.000115173172200111, + "loss": 2.0934, + "step": 25833 + }, + { + "epoch": 2.483322118619629, + "grad_norm": 1.1519970893859863, + "learning_rate": 0.00011516109349273129, + "loss": 1.8983, + "step": 25834 + }, + { + "epoch": 2.483418244737095, + "grad_norm": 1.2431855201721191, + "learning_rate": 0.00011514901504107362, + "loss": 2.0665, + "step": 25835 + }, + { + "epoch": 2.4835143708545613, + "grad_norm": 1.1008073091506958, + "learning_rate": 0.0001151369368452228, + "loss": 1.9093, + "step": 25836 + }, + { + "epoch": 2.483610496972027, + "grad_norm": 0.990661084651947, + "learning_rate": 0.00011512485890526389, + "loss": 2.0444, + "step": 25837 + }, + { + "epoch": 2.4837066230894935, + "grad_norm": 1.2564489841461182, + "learning_rate": 0.00011511278122128188, + "loss": 1.8839, + "step": 25838 + }, + { + "epoch": 2.4838027492069594, + "grad_norm": 1.1572864055633545, + "learning_rate": 0.00011510070379336177, + "loss": 1.9921, + "step": 25839 + }, + { + "epoch": 2.4838988753244258, + "grad_norm": 1.0951975584030151, + "learning_rate": 0.00011508862662158855, + "loss": 1.9211, + "step": 25840 + }, + { + "epoch": 2.4839950014418917, + "grad_norm": 1.3154360055923462, + "learning_rate": 0.00011507654970604722, + "loss": 2.0626, + "step": 25841 + }, + { + "epoch": 2.484091127559358, + "grad_norm": 0.9674065113067627, + "learning_rate": 0.00011506447304682275, + "loss": 1.9997, + "step": 25842 + }, + { + "epoch": 2.484187253676824, + "grad_norm": 1.0997587442398071, + "learning_rate": 0.00011505239664400015, + "loss": 2.0546, + "step": 25843 + }, + { + "epoch": 2.4842833797942903, + "grad_norm": 1.0084853172302246, + "learning_rate": 0.00011504032049766433, + "loss": 1.9318, + "step": 25844 + }, + { + "epoch": 2.484379505911756, + "grad_norm": 1.2340091466903687, + "learning_rate": 0.00011502824460790037, + "loss": 1.996, + "step": 25845 + }, + { + "epoch": 2.4844756320292225, + "grad_norm": 1.0863019227981567, + "learning_rate": 0.0001150161689747932, + "loss": 1.9696, + "step": 25846 + }, + { + "epoch": 2.4845717581466884, + "grad_norm": 1.2051604986190796, + "learning_rate": 0.00011500409359842778, + "loss": 1.9694, + "step": 25847 + }, + { + "epoch": 2.4846678842641547, + "grad_norm": 1.2564759254455566, + "learning_rate": 0.00011499201847888913, + "loss": 2.071, + "step": 25848 + }, + { + "epoch": 2.4847640103816206, + "grad_norm": 1.1304595470428467, + "learning_rate": 0.0001149799436162622, + "loss": 2.1479, + "step": 25849 + }, + { + "epoch": 2.484860136499087, + "grad_norm": 1.1002544164657593, + "learning_rate": 0.00011496786901063198, + "loss": 1.9669, + "step": 25850 + }, + { + "epoch": 2.484956262616553, + "grad_norm": 0.9682755470275879, + "learning_rate": 0.00011495579466208342, + "loss": 1.7623, + "step": 25851 + }, + { + "epoch": 2.485052388734019, + "grad_norm": 1.280466914176941, + "learning_rate": 0.0001149437205707015, + "loss": 1.8443, + "step": 25852 + }, + { + "epoch": 2.485148514851485, + "grad_norm": 1.136550784111023, + "learning_rate": 0.00011493164673657116, + "loss": 1.9382, + "step": 25853 + }, + { + "epoch": 2.4852446409689515, + "grad_norm": 1.2135969400405884, + "learning_rate": 0.00011491957315977743, + "loss": 2.181, + "step": 25854 + }, + { + "epoch": 2.4853407670864174, + "grad_norm": 1.0411438941955566, + "learning_rate": 0.00011490749984040517, + "loss": 1.9036, + "step": 25855 + }, + { + "epoch": 2.4854368932038833, + "grad_norm": 1.056333065032959, + "learning_rate": 0.00011489542677853947, + "loss": 2.0039, + "step": 25856 + }, + { + "epoch": 2.4855330193213496, + "grad_norm": 1.1463730335235596, + "learning_rate": 0.00011488335397426517, + "loss": 2.1991, + "step": 25857 + }, + { + "epoch": 2.485629145438816, + "grad_norm": 1.2155817747116089, + "learning_rate": 0.00011487128142766731, + "loss": 2.0536, + "step": 25858 + }, + { + "epoch": 2.485725271556282, + "grad_norm": 0.9917870759963989, + "learning_rate": 0.00011485920913883081, + "loss": 1.9465, + "step": 25859 + }, + { + "epoch": 2.4858213976737478, + "grad_norm": 1.0090047121047974, + "learning_rate": 0.00011484713710784063, + "loss": 1.9585, + "step": 25860 + }, + { + "epoch": 2.485917523791214, + "grad_norm": 1.2681885957717896, + "learning_rate": 0.00011483506533478169, + "loss": 2.1773, + "step": 25861 + }, + { + "epoch": 2.48601364990868, + "grad_norm": 1.0690802335739136, + "learning_rate": 0.00011482299381973902, + "loss": 1.9366, + "step": 25862 + }, + { + "epoch": 2.4861097760261464, + "grad_norm": 1.3493231534957886, + "learning_rate": 0.00011481092256279751, + "loss": 2.0981, + "step": 25863 + }, + { + "epoch": 2.4862059021436123, + "grad_norm": 1.1065577268600464, + "learning_rate": 0.00011479885156404212, + "loss": 1.909, + "step": 25864 + }, + { + "epoch": 2.4863020282610786, + "grad_norm": 1.1018168926239014, + "learning_rate": 0.00011478678082355781, + "loss": 2.011, + "step": 25865 + }, + { + "epoch": 2.4863981543785445, + "grad_norm": 1.107753038406372, + "learning_rate": 0.0001147747103414295, + "loss": 1.9883, + "step": 25866 + }, + { + "epoch": 2.486494280496011, + "grad_norm": 1.1887178421020508, + "learning_rate": 0.00011476264011774215, + "loss": 1.9453, + "step": 25867 + }, + { + "epoch": 2.4865904066134767, + "grad_norm": 1.13785719871521, + "learning_rate": 0.00011475057015258068, + "loss": 1.8881, + "step": 25868 + }, + { + "epoch": 2.486686532730943, + "grad_norm": 1.1232870817184448, + "learning_rate": 0.00011473850044603003, + "loss": 1.893, + "step": 25869 + }, + { + "epoch": 2.486782658848409, + "grad_norm": 1.065226435661316, + "learning_rate": 0.00011472643099817516, + "loss": 2.097, + "step": 25870 + }, + { + "epoch": 2.4868787849658753, + "grad_norm": 1.1764442920684814, + "learning_rate": 0.00011471436180910099, + "loss": 1.9735, + "step": 25871 + }, + { + "epoch": 2.4869749110833412, + "grad_norm": 1.1667253971099854, + "learning_rate": 0.00011470229287889244, + "loss": 2.0624, + "step": 25872 + }, + { + "epoch": 2.4870710372008076, + "grad_norm": 1.326170802116394, + "learning_rate": 0.00011469022420763447, + "loss": 2.1184, + "step": 25873 + }, + { + "epoch": 2.4871671633182735, + "grad_norm": 1.0734212398529053, + "learning_rate": 0.00011467815579541197, + "loss": 2.0692, + "step": 25874 + }, + { + "epoch": 2.48726328943574, + "grad_norm": 1.0771645307540894, + "learning_rate": 0.00011466608764230992, + "loss": 2.0044, + "step": 25875 + }, + { + "epoch": 2.4873594155532057, + "grad_norm": 1.249368667602539, + "learning_rate": 0.0001146540197484132, + "loss": 2.0463, + "step": 25876 + }, + { + "epoch": 2.487455541670672, + "grad_norm": 1.1181124448776245, + "learning_rate": 0.00011464195211380674, + "loss": 2.0322, + "step": 25877 + }, + { + "epoch": 2.487551667788138, + "grad_norm": 1.0358333587646484, + "learning_rate": 0.00011462988473857543, + "loss": 2.1143, + "step": 25878 + }, + { + "epoch": 2.4876477939056043, + "grad_norm": 1.0210328102111816, + "learning_rate": 0.00011461781762280428, + "loss": 1.9628, + "step": 25879 + }, + { + "epoch": 2.48774392002307, + "grad_norm": 1.2905820608139038, + "learning_rate": 0.00011460575076657814, + "loss": 1.9305, + "step": 25880 + }, + { + "epoch": 2.4878400461405366, + "grad_norm": 1.2913596630096436, + "learning_rate": 0.00011459368416998199, + "loss": 2.0345, + "step": 25881 + }, + { + "epoch": 2.4879361722580025, + "grad_norm": 1.0048426389694214, + "learning_rate": 0.00011458161783310065, + "loss": 1.963, + "step": 25882 + }, + { + "epoch": 2.488032298375469, + "grad_norm": 0.9454376697540283, + "learning_rate": 0.00011456955175601908, + "loss": 1.947, + "step": 25883 + }, + { + "epoch": 2.4881284244929347, + "grad_norm": 1.2357150316238403, + "learning_rate": 0.0001145574859388222, + "loss": 2.1944, + "step": 25884 + }, + { + "epoch": 2.4882245506104006, + "grad_norm": 1.0624768733978271, + "learning_rate": 0.00011454542038159491, + "loss": 2.0001, + "step": 25885 + }, + { + "epoch": 2.488320676727867, + "grad_norm": 1.3325458765029907, + "learning_rate": 0.00011453335508442212, + "loss": 2.0048, + "step": 25886 + }, + { + "epoch": 2.4884168028453333, + "grad_norm": 1.171491026878357, + "learning_rate": 0.00011452129004738873, + "loss": 1.9611, + "step": 25887 + }, + { + "epoch": 2.488512928962799, + "grad_norm": 1.1756936311721802, + "learning_rate": 0.00011450922527057964, + "loss": 1.8811, + "step": 25888 + }, + { + "epoch": 2.488609055080265, + "grad_norm": 1.122603178024292, + "learning_rate": 0.00011449716075407976, + "loss": 1.9881, + "step": 25889 + }, + { + "epoch": 2.4887051811977314, + "grad_norm": 1.2020870447158813, + "learning_rate": 0.00011448509649797399, + "loss": 2.0189, + "step": 25890 + }, + { + "epoch": 2.4888013073151978, + "grad_norm": 1.064932107925415, + "learning_rate": 0.00011447303250234722, + "loss": 1.9083, + "step": 25891 + }, + { + "epoch": 2.4888974334326637, + "grad_norm": 1.152085542678833, + "learning_rate": 0.00011446096876728435, + "loss": 2.067, + "step": 25892 + }, + { + "epoch": 2.4889935595501296, + "grad_norm": 1.049275517463684, + "learning_rate": 0.00011444890529287026, + "loss": 1.9229, + "step": 25893 + }, + { + "epoch": 2.489089685667596, + "grad_norm": 0.972698986530304, + "learning_rate": 0.00011443684207918989, + "loss": 2.0163, + "step": 25894 + }, + { + "epoch": 2.489185811785062, + "grad_norm": 1.20158851146698, + "learning_rate": 0.00011442477912632804, + "loss": 1.9766, + "step": 25895 + }, + { + "epoch": 2.489281937902528, + "grad_norm": 1.2717978954315186, + "learning_rate": 0.0001144127164343697, + "loss": 2.2377, + "step": 25896 + }, + { + "epoch": 2.489378064019994, + "grad_norm": 1.0723587274551392, + "learning_rate": 0.0001144006540033997, + "loss": 2.0563, + "step": 25897 + }, + { + "epoch": 2.4894741901374604, + "grad_norm": 1.0721293687820435, + "learning_rate": 0.00011438859183350294, + "loss": 2.1272, + "step": 25898 + }, + { + "epoch": 2.4895703162549263, + "grad_norm": 1.2978981733322144, + "learning_rate": 0.00011437652992476433, + "loss": 2.0124, + "step": 25899 + }, + { + "epoch": 2.4896664423723927, + "grad_norm": 1.1966320276260376, + "learning_rate": 0.00011436446827726869, + "loss": 1.9378, + "step": 25900 + }, + { + "epoch": 2.4897625684898586, + "grad_norm": 0.9910979866981506, + "learning_rate": 0.00011435240689110092, + "loss": 1.7761, + "step": 25901 + }, + { + "epoch": 2.489858694607325, + "grad_norm": 1.288313627243042, + "learning_rate": 0.00011434034576634593, + "loss": 2.1755, + "step": 25902 + }, + { + "epoch": 2.489954820724791, + "grad_norm": 1.166824221611023, + "learning_rate": 0.00011432828490308857, + "loss": 2.1529, + "step": 25903 + }, + { + "epoch": 2.490050946842257, + "grad_norm": 1.194657564163208, + "learning_rate": 0.00011431622430141374, + "loss": 2.1297, + "step": 25904 + }, + { + "epoch": 2.490147072959723, + "grad_norm": 1.1084446907043457, + "learning_rate": 0.00011430416396140627, + "loss": 2.0258, + "step": 25905 + }, + { + "epoch": 2.4902431990771894, + "grad_norm": 1.1271380186080933, + "learning_rate": 0.00011429210388315103, + "loss": 1.9366, + "step": 25906 + }, + { + "epoch": 2.4903393251946553, + "grad_norm": 0.946304976940155, + "learning_rate": 0.00011428004406673295, + "loss": 1.8944, + "step": 25907 + }, + { + "epoch": 2.4904354513121216, + "grad_norm": 1.1754848957061768, + "learning_rate": 0.00011426798451223685, + "loss": 2.0032, + "step": 25908 + }, + { + "epoch": 2.4905315774295875, + "grad_norm": 1.2148836851119995, + "learning_rate": 0.00011425592521974757, + "loss": 2.2157, + "step": 25909 + }, + { + "epoch": 2.490627703547054, + "grad_norm": 1.2971240282058716, + "learning_rate": 0.00011424386618935003, + "loss": 2.1662, + "step": 25910 + }, + { + "epoch": 2.4907238296645198, + "grad_norm": 1.3219064474105835, + "learning_rate": 0.00011423180742112906, + "loss": 2.1223, + "step": 25911 + }, + { + "epoch": 2.490819955781986, + "grad_norm": 1.103844404220581, + "learning_rate": 0.00011421974891516949, + "loss": 1.8826, + "step": 25912 + }, + { + "epoch": 2.490916081899452, + "grad_norm": 1.106972575187683, + "learning_rate": 0.00011420769067155626, + "loss": 1.863, + "step": 25913 + }, + { + "epoch": 2.4910122080169184, + "grad_norm": 1.1521072387695312, + "learning_rate": 0.00011419563269037416, + "loss": 1.9441, + "step": 25914 + }, + { + "epoch": 2.4911083341343843, + "grad_norm": 1.09621262550354, + "learning_rate": 0.00011418357497170805, + "loss": 1.9193, + "step": 25915 + }, + { + "epoch": 2.4912044602518506, + "grad_norm": 1.1826870441436768, + "learning_rate": 0.00011417151751564282, + "loss": 2.0116, + "step": 25916 + }, + { + "epoch": 2.4913005863693165, + "grad_norm": 1.0799105167388916, + "learning_rate": 0.00011415946032226329, + "loss": 1.8815, + "step": 25917 + }, + { + "epoch": 2.491396712486783, + "grad_norm": 1.0780127048492432, + "learning_rate": 0.00011414740339165428, + "loss": 1.896, + "step": 25918 + }, + { + "epoch": 2.4914928386042487, + "grad_norm": 0.9699863195419312, + "learning_rate": 0.00011413534672390071, + "loss": 1.8284, + "step": 25919 + }, + { + "epoch": 2.491588964721715, + "grad_norm": 1.0893990993499756, + "learning_rate": 0.00011412329031908735, + "loss": 1.8765, + "step": 25920 + }, + { + "epoch": 2.491685090839181, + "grad_norm": 1.072396993637085, + "learning_rate": 0.00011411123417729909, + "loss": 2.0718, + "step": 25921 + }, + { + "epoch": 2.491781216956647, + "grad_norm": 1.027087926864624, + "learning_rate": 0.00011409917829862075, + "loss": 1.9305, + "step": 25922 + }, + { + "epoch": 2.4918773430741132, + "grad_norm": 1.204291820526123, + "learning_rate": 0.00011408712268313716, + "loss": 2.2256, + "step": 25923 + }, + { + "epoch": 2.4919734691915796, + "grad_norm": 1.0082775354385376, + "learning_rate": 0.00011407506733093318, + "loss": 2.0278, + "step": 25924 + }, + { + "epoch": 2.4920695953090455, + "grad_norm": 1.2612037658691406, + "learning_rate": 0.00011406301224209365, + "loss": 1.9587, + "step": 25925 + }, + { + "epoch": 2.4921657214265114, + "grad_norm": 1.1864967346191406, + "learning_rate": 0.00011405095741670337, + "loss": 2.0183, + "step": 25926 + }, + { + "epoch": 2.4922618475439777, + "grad_norm": 1.1190993785858154, + "learning_rate": 0.0001140389028548472, + "loss": 1.8083, + "step": 25927 + }, + { + "epoch": 2.4923579736614436, + "grad_norm": 1.017469048500061, + "learning_rate": 0.00011402684855660997, + "loss": 2.0077, + "step": 25928 + }, + { + "epoch": 2.49245409977891, + "grad_norm": 1.1361104249954224, + "learning_rate": 0.00011401479452207645, + "loss": 2.0169, + "step": 25929 + }, + { + "epoch": 2.492550225896376, + "grad_norm": 1.271039366722107, + "learning_rate": 0.00011400274075133157, + "loss": 2.2732, + "step": 25930 + }, + { + "epoch": 2.492646352013842, + "grad_norm": 1.0539652109146118, + "learning_rate": 0.00011399068724446007, + "loss": 1.8148, + "step": 25931 + }, + { + "epoch": 2.492742478131308, + "grad_norm": 1.1653484106063843, + "learning_rate": 0.0001139786340015468, + "loss": 2.1033, + "step": 25932 + }, + { + "epoch": 2.4928386042487745, + "grad_norm": 1.0258163213729858, + "learning_rate": 0.00011396658102267659, + "loss": 1.9961, + "step": 25933 + }, + { + "epoch": 2.4929347303662404, + "grad_norm": 1.0431771278381348, + "learning_rate": 0.00011395452830793425, + "loss": 1.8677, + "step": 25934 + }, + { + "epoch": 2.4930308564837067, + "grad_norm": 1.0505518913269043, + "learning_rate": 0.00011394247585740459, + "loss": 1.9916, + "step": 25935 + }, + { + "epoch": 2.4931269826011726, + "grad_norm": 0.9628477096557617, + "learning_rate": 0.00011393042367117245, + "loss": 1.8465, + "step": 25936 + }, + { + "epoch": 2.493223108718639, + "grad_norm": 1.17107355594635, + "learning_rate": 0.00011391837174932261, + "loss": 1.947, + "step": 25937 + }, + { + "epoch": 2.493319234836105, + "grad_norm": 1.0956350564956665, + "learning_rate": 0.0001139063200919399, + "loss": 1.9875, + "step": 25938 + }, + { + "epoch": 2.493415360953571, + "grad_norm": 1.124393105506897, + "learning_rate": 0.00011389426869910912, + "loss": 1.9739, + "step": 25939 + }, + { + "epoch": 2.493511487071037, + "grad_norm": 1.123551368713379, + "learning_rate": 0.00011388221757091507, + "loss": 2.0478, + "step": 25940 + }, + { + "epoch": 2.4936076131885034, + "grad_norm": 1.1879972219467163, + "learning_rate": 0.00011387016670744257, + "loss": 2.0629, + "step": 25941 + }, + { + "epoch": 2.4937037393059693, + "grad_norm": 1.0053499937057495, + "learning_rate": 0.00011385811610877645, + "loss": 1.8837, + "step": 25942 + }, + { + "epoch": 2.4937998654234357, + "grad_norm": 1.2512129545211792, + "learning_rate": 0.00011384606577500145, + "loss": 2.1942, + "step": 25943 + }, + { + "epoch": 2.4938959915409016, + "grad_norm": 1.3097758293151855, + "learning_rate": 0.0001138340157062024, + "loss": 2.0421, + "step": 25944 + }, + { + "epoch": 2.493992117658368, + "grad_norm": 1.1163002252578735, + "learning_rate": 0.00011382196590246411, + "loss": 2.105, + "step": 25945 + }, + { + "epoch": 2.494088243775834, + "grad_norm": 1.2801988124847412, + "learning_rate": 0.00011380991636387133, + "loss": 2.0822, + "step": 25946 + }, + { + "epoch": 2.4941843698933, + "grad_norm": 1.0667357444763184, + "learning_rate": 0.00011379786709050894, + "loss": 2.1848, + "step": 25947 + }, + { + "epoch": 2.494280496010766, + "grad_norm": 1.081318736076355, + "learning_rate": 0.00011378581808246167, + "loss": 1.9677, + "step": 25948 + }, + { + "epoch": 2.4943766221282324, + "grad_norm": 1.1528102159500122, + "learning_rate": 0.00011377376933981433, + "loss": 2.0444, + "step": 25949 + }, + { + "epoch": 2.4944727482456983, + "grad_norm": 1.175714373588562, + "learning_rate": 0.00011376172086265167, + "loss": 2.0566, + "step": 25950 + }, + { + "epoch": 2.4945688743631647, + "grad_norm": 1.0596749782562256, + "learning_rate": 0.00011374967265105855, + "loss": 2.1136, + "step": 25951 + }, + { + "epoch": 2.4946650004806306, + "grad_norm": 1.102931261062622, + "learning_rate": 0.00011373762470511968, + "loss": 2.025, + "step": 25952 + }, + { + "epoch": 2.494761126598097, + "grad_norm": 1.1870273351669312, + "learning_rate": 0.0001137255770249199, + "loss": 2.0436, + "step": 25953 + }, + { + "epoch": 2.494857252715563, + "grad_norm": 1.321833610534668, + "learning_rate": 0.00011371352961054396, + "loss": 2.019, + "step": 25954 + }, + { + "epoch": 2.4949533788330287, + "grad_norm": 1.0592267513275146, + "learning_rate": 0.00011370148246207665, + "loss": 1.9503, + "step": 25955 + }, + { + "epoch": 2.495049504950495, + "grad_norm": 1.2477076053619385, + "learning_rate": 0.00011368943557960274, + "loss": 2.14, + "step": 25956 + }, + { + "epoch": 2.4951456310679614, + "grad_norm": 1.0297338962554932, + "learning_rate": 0.00011367738896320702, + "loss": 2.1765, + "step": 25957 + }, + { + "epoch": 2.4952417571854273, + "grad_norm": 1.3043063879013062, + "learning_rate": 0.00011366534261297424, + "loss": 2.1183, + "step": 25958 + }, + { + "epoch": 2.495337883302893, + "grad_norm": 1.1272677183151245, + "learning_rate": 0.00011365329652898919, + "loss": 2.1394, + "step": 25959 + }, + { + "epoch": 2.4954340094203595, + "grad_norm": 1.0543527603149414, + "learning_rate": 0.00011364125071133662, + "loss": 1.5759, + "step": 25960 + }, + { + "epoch": 2.4955301355378254, + "grad_norm": 1.1179167032241821, + "learning_rate": 0.00011362920516010133, + "loss": 2.0375, + "step": 25961 + }, + { + "epoch": 2.4956262616552918, + "grad_norm": 1.120676875114441, + "learning_rate": 0.00011361715987536808, + "loss": 2.0165, + "step": 25962 + }, + { + "epoch": 2.4957223877727577, + "grad_norm": 1.0735247135162354, + "learning_rate": 0.0001136051148572216, + "loss": 2.0392, + "step": 25963 + }, + { + "epoch": 2.495818513890224, + "grad_norm": 1.1110106706619263, + "learning_rate": 0.00011359307010574668, + "loss": 1.9471, + "step": 25964 + }, + { + "epoch": 2.49591464000769, + "grad_norm": 1.1720603704452515, + "learning_rate": 0.0001135810256210281, + "loss": 2.1066, + "step": 25965 + }, + { + "epoch": 2.4960107661251563, + "grad_norm": 0.97132408618927, + "learning_rate": 0.00011356898140315057, + "loss": 1.8546, + "step": 25966 + }, + { + "epoch": 2.496106892242622, + "grad_norm": 1.0612711906433105, + "learning_rate": 0.00011355693745219888, + "loss": 1.9812, + "step": 25967 + }, + { + "epoch": 2.4962030183600885, + "grad_norm": 1.1281756162643433, + "learning_rate": 0.00011354489376825779, + "loss": 1.8193, + "step": 25968 + }, + { + "epoch": 2.4962991444775544, + "grad_norm": 1.0960019826889038, + "learning_rate": 0.00011353285035141204, + "loss": 2.0892, + "step": 25969 + }, + { + "epoch": 2.4963952705950208, + "grad_norm": 1.0332976579666138, + "learning_rate": 0.00011352080720174637, + "loss": 2.0327, + "step": 25970 + }, + { + "epoch": 2.4964913967124867, + "grad_norm": 1.0639458894729614, + "learning_rate": 0.00011350876431934556, + "loss": 2.0061, + "step": 25971 + }, + { + "epoch": 2.496587522829953, + "grad_norm": 0.9808492064476013, + "learning_rate": 0.00011349672170429433, + "loss": 1.8878, + "step": 25972 + }, + { + "epoch": 2.496683648947419, + "grad_norm": 1.1143027544021606, + "learning_rate": 0.00011348467935667743, + "loss": 1.9876, + "step": 25973 + }, + { + "epoch": 2.4967797750648852, + "grad_norm": 1.084523320198059, + "learning_rate": 0.00011347263727657962, + "loss": 2.0944, + "step": 25974 + }, + { + "epoch": 2.496875901182351, + "grad_norm": 1.3578921556472778, + "learning_rate": 0.0001134605954640856, + "loss": 1.929, + "step": 25975 + }, + { + "epoch": 2.4969720272998175, + "grad_norm": 1.1839090585708618, + "learning_rate": 0.00011344855391928017, + "loss": 2.1288, + "step": 25976 + }, + { + "epoch": 2.4970681534172834, + "grad_norm": 1.036880612373352, + "learning_rate": 0.00011343651264224803, + "loss": 1.945, + "step": 25977 + }, + { + "epoch": 2.4971642795347497, + "grad_norm": 1.3425090312957764, + "learning_rate": 0.00011342447163307392, + "loss": 2.0237, + "step": 25978 + }, + { + "epoch": 2.4972604056522156, + "grad_norm": 1.2142033576965332, + "learning_rate": 0.00011341243089184257, + "loss": 1.9709, + "step": 25979 + }, + { + "epoch": 2.497356531769682, + "grad_norm": 1.1990216970443726, + "learning_rate": 0.00011340039041863871, + "loss": 2.1362, + "step": 25980 + }, + { + "epoch": 2.497452657887148, + "grad_norm": 0.9726925492286682, + "learning_rate": 0.0001133883502135471, + "loss": 1.8819, + "step": 25981 + }, + { + "epoch": 2.497548784004614, + "grad_norm": 1.0284295082092285, + "learning_rate": 0.00011337631027665244, + "loss": 1.8939, + "step": 25982 + }, + { + "epoch": 2.49764491012208, + "grad_norm": 1.256235957145691, + "learning_rate": 0.00011336427060803948, + "loss": 2.0659, + "step": 25983 + }, + { + "epoch": 2.4977410362395465, + "grad_norm": 1.1007893085479736, + "learning_rate": 0.00011335223120779292, + "loss": 1.9434, + "step": 25984 + }, + { + "epoch": 2.4978371623570124, + "grad_norm": 1.2163513898849487, + "learning_rate": 0.00011334019207599752, + "loss": 2.085, + "step": 25985 + }, + { + "epoch": 2.4979332884744787, + "grad_norm": 1.1862883567810059, + "learning_rate": 0.00011332815321273794, + "loss": 2.0709, + "step": 25986 + }, + { + "epoch": 2.4980294145919446, + "grad_norm": 1.0388041734695435, + "learning_rate": 0.00011331611461809896, + "loss": 1.9306, + "step": 25987 + }, + { + "epoch": 2.4981255407094105, + "grad_norm": 1.1402249336242676, + "learning_rate": 0.00011330407629216526, + "loss": 2.0559, + "step": 25988 + }, + { + "epoch": 2.498221666826877, + "grad_norm": 1.1547659635543823, + "learning_rate": 0.00011329203823502157, + "loss": 1.882, + "step": 25989 + }, + { + "epoch": 2.498317792944343, + "grad_norm": 1.2466391324996948, + "learning_rate": 0.00011328000044675262, + "loss": 2.1017, + "step": 25990 + }, + { + "epoch": 2.498413919061809, + "grad_norm": 1.1602953672409058, + "learning_rate": 0.00011326796292744306, + "loss": 2.0323, + "step": 25991 + }, + { + "epoch": 2.498510045179275, + "grad_norm": 1.0783404111862183, + "learning_rate": 0.00011325592567717766, + "loss": 1.979, + "step": 25992 + }, + { + "epoch": 2.4986061712967413, + "grad_norm": 1.2524255514144897, + "learning_rate": 0.00011324388869604111, + "loss": 1.975, + "step": 25993 + }, + { + "epoch": 2.4987022974142077, + "grad_norm": 0.9838381409645081, + "learning_rate": 0.00011323185198411814, + "loss": 1.9271, + "step": 25994 + }, + { + "epoch": 2.4987984235316736, + "grad_norm": 1.1033594608306885, + "learning_rate": 0.00011321981554149339, + "loss": 2.2084, + "step": 25995 + }, + { + "epoch": 2.4988945496491395, + "grad_norm": 1.2035661935806274, + "learning_rate": 0.0001132077793682516, + "loss": 1.9718, + "step": 25996 + }, + { + "epoch": 2.498990675766606, + "grad_norm": 1.0407562255859375, + "learning_rate": 0.00011319574346447748, + "loss": 2.0051, + "step": 25997 + }, + { + "epoch": 2.4990868018840717, + "grad_norm": 1.1475324630737305, + "learning_rate": 0.00011318370783025574, + "loss": 2.1292, + "step": 25998 + }, + { + "epoch": 2.499182928001538, + "grad_norm": 1.120094895362854, + "learning_rate": 0.00011317167246567104, + "loss": 1.925, + "step": 25999 + }, + { + "epoch": 2.499279054119004, + "grad_norm": 1.0955960750579834, + "learning_rate": 0.00011315963737080809, + "loss": 1.9927, + "step": 26000 + }, + { + "epoch": 2.4993751802364703, + "grad_norm": 1.3705166578292847, + "learning_rate": 0.0001131476025457516, + "loss": 2.1268, + "step": 26001 + }, + { + "epoch": 2.499471306353936, + "grad_norm": 1.3374273777008057, + "learning_rate": 0.00011313556799058624, + "loss": 2.2458, + "step": 26002 + }, + { + "epoch": 2.4995674324714026, + "grad_norm": 1.175514817237854, + "learning_rate": 0.00011312353370539668, + "loss": 1.9178, + "step": 26003 + }, + { + "epoch": 2.4996635585888685, + "grad_norm": 1.0960900783538818, + "learning_rate": 0.00011311149969026766, + "loss": 2.0953, + "step": 26004 + }, + { + "epoch": 2.499759684706335, + "grad_norm": 1.0346922874450684, + "learning_rate": 0.00011309946594528383, + "loss": 1.9963, + "step": 26005 + }, + { + "epoch": 2.4998558108238007, + "grad_norm": 1.1304564476013184, + "learning_rate": 0.00011308743247052984, + "loss": 1.9211, + "step": 26006 + }, + { + "epoch": 2.499951936941267, + "grad_norm": 1.0623911619186401, + "learning_rate": 0.00011307539926609047, + "loss": 1.7232, + "step": 26007 + }, + { + "epoch": 2.500048063058733, + "grad_norm": 1.084266185760498, + "learning_rate": 0.00011306336633205029, + "loss": 2.0761, + "step": 26008 + }, + { + "epoch": 2.5001441891761993, + "grad_norm": 0.9297289848327637, + "learning_rate": 0.00011305133366849405, + "loss": 1.754, + "step": 26009 + }, + { + "epoch": 2.500240315293665, + "grad_norm": 1.1196115016937256, + "learning_rate": 0.0001130393012755064, + "loss": 2.1115, + "step": 26010 + }, + { + "epoch": 2.5003364414111315, + "grad_norm": 1.2527918815612793, + "learning_rate": 0.00011302726915317203, + "loss": 2.1783, + "step": 26011 + }, + { + "epoch": 2.5004325675285974, + "grad_norm": 1.09394109249115, + "learning_rate": 0.00011301523730157557, + "loss": 1.9675, + "step": 26012 + }, + { + "epoch": 2.500528693646064, + "grad_norm": 0.929645836353302, + "learning_rate": 0.00011300320572080171, + "loss": 1.7008, + "step": 26013 + }, + { + "epoch": 2.5006248197635297, + "grad_norm": 1.0995125770568848, + "learning_rate": 0.00011299117441093512, + "loss": 2.1334, + "step": 26014 + }, + { + "epoch": 2.500720945880996, + "grad_norm": 1.0673243999481201, + "learning_rate": 0.00011297914337206048, + "loss": 1.9054, + "step": 26015 + }, + { + "epoch": 2.500817071998462, + "grad_norm": 1.1370110511779785, + "learning_rate": 0.00011296711260426247, + "loss": 1.8452, + "step": 26016 + }, + { + "epoch": 2.500913198115928, + "grad_norm": 1.3333625793457031, + "learning_rate": 0.0001129550821076257, + "loss": 2.1206, + "step": 26017 + }, + { + "epoch": 2.501009324233394, + "grad_norm": 1.141524314880371, + "learning_rate": 0.00011294305188223487, + "loss": 2.0198, + "step": 26018 + }, + { + "epoch": 2.5011054503508605, + "grad_norm": 1.0517059564590454, + "learning_rate": 0.00011293102192817463, + "loss": 2.0061, + "step": 26019 + }, + { + "epoch": 2.5012015764683264, + "grad_norm": 1.069074273109436, + "learning_rate": 0.00011291899224552964, + "loss": 2.0011, + "step": 26020 + }, + { + "epoch": 2.5012977025857923, + "grad_norm": 1.2238746881484985, + "learning_rate": 0.00011290696283438453, + "loss": 1.9522, + "step": 26021 + }, + { + "epoch": 2.5013938287032587, + "grad_norm": 1.3018981218338013, + "learning_rate": 0.00011289493369482398, + "loss": 2.0123, + "step": 26022 + }, + { + "epoch": 2.501489954820725, + "grad_norm": 1.1675536632537842, + "learning_rate": 0.00011288290482693261, + "loss": 2.01, + "step": 26023 + }, + { + "epoch": 2.501586080938191, + "grad_norm": 1.1432526111602783, + "learning_rate": 0.00011287087623079511, + "loss": 2.0557, + "step": 26024 + }, + { + "epoch": 2.501682207055657, + "grad_norm": 1.2032601833343506, + "learning_rate": 0.0001128588479064961, + "loss": 2.066, + "step": 26025 + }, + { + "epoch": 2.501778333173123, + "grad_norm": 1.1346988677978516, + "learning_rate": 0.00011284681985412022, + "loss": 1.8904, + "step": 26026 + }, + { + "epoch": 2.5018744592905895, + "grad_norm": 1.111461877822876, + "learning_rate": 0.00011283479207375214, + "loss": 1.9861, + "step": 26027 + }, + { + "epoch": 2.5019705854080554, + "grad_norm": 1.206666350364685, + "learning_rate": 0.00011282276456547648, + "loss": 1.7969, + "step": 26028 + }, + { + "epoch": 2.5020667115255213, + "grad_norm": 1.1764614582061768, + "learning_rate": 0.00011281073732937788, + "loss": 1.9326, + "step": 26029 + }, + { + "epoch": 2.5021628376429876, + "grad_norm": 1.0684614181518555, + "learning_rate": 0.00011279871036554097, + "loss": 1.9766, + "step": 26030 + }, + { + "epoch": 2.502258963760454, + "grad_norm": 1.0416324138641357, + "learning_rate": 0.00011278668367405038, + "loss": 1.9596, + "step": 26031 + }, + { + "epoch": 2.50235508987792, + "grad_norm": 1.1124866008758545, + "learning_rate": 0.00011277465725499079, + "loss": 1.9996, + "step": 26032 + }, + { + "epoch": 2.502451215995386, + "grad_norm": 1.2665150165557861, + "learning_rate": 0.00011276263110844681, + "loss": 1.9591, + "step": 26033 + }, + { + "epoch": 2.502547342112852, + "grad_norm": 1.151847004890442, + "learning_rate": 0.00011275060523450306, + "loss": 1.9199, + "step": 26034 + }, + { + "epoch": 2.502643468230318, + "grad_norm": 0.99146968126297, + "learning_rate": 0.00011273857963324416, + "loss": 1.9806, + "step": 26035 + }, + { + "epoch": 2.5027395943477844, + "grad_norm": 0.9573439359664917, + "learning_rate": 0.00011272655430475473, + "loss": 1.7796, + "step": 26036 + }, + { + "epoch": 2.5028357204652503, + "grad_norm": 1.0927355289459229, + "learning_rate": 0.00011271452924911944, + "loss": 2.0339, + "step": 26037 + }, + { + "epoch": 2.5029318465827166, + "grad_norm": 1.0602176189422607, + "learning_rate": 0.00011270250446642286, + "loss": 1.9075, + "step": 26038 + }, + { + "epoch": 2.5030279727001825, + "grad_norm": 1.2629978656768799, + "learning_rate": 0.00011269047995674964, + "loss": 2.1636, + "step": 26039 + }, + { + "epoch": 2.503124098817649, + "grad_norm": 1.3219184875488281, + "learning_rate": 0.0001126784557201844, + "loss": 2.2221, + "step": 26040 + }, + { + "epoch": 2.5032202249351148, + "grad_norm": 1.194250226020813, + "learning_rate": 0.00011266643175681174, + "loss": 2.1262, + "step": 26041 + }, + { + "epoch": 2.503316351052581, + "grad_norm": 1.2089133262634277, + "learning_rate": 0.00011265440806671625, + "loss": 2.1829, + "step": 26042 + }, + { + "epoch": 2.503412477170047, + "grad_norm": 1.1471333503723145, + "learning_rate": 0.0001126423846499826, + "loss": 1.9841, + "step": 26043 + }, + { + "epoch": 2.5035086032875133, + "grad_norm": 1.1370766162872314, + "learning_rate": 0.00011263036150669537, + "loss": 2.0219, + "step": 26044 + }, + { + "epoch": 2.5036047294049792, + "grad_norm": 1.0561349391937256, + "learning_rate": 0.00011261833863693917, + "loss": 1.9792, + "step": 26045 + }, + { + "epoch": 2.5037008555224456, + "grad_norm": 0.9927887320518494, + "learning_rate": 0.0001126063160407986, + "loss": 2.0708, + "step": 26046 + }, + { + "epoch": 2.5037969816399115, + "grad_norm": 1.1719573736190796, + "learning_rate": 0.0001125942937183583, + "loss": 1.9413, + "step": 26047 + }, + { + "epoch": 2.503893107757378, + "grad_norm": 1.1222074031829834, + "learning_rate": 0.0001125822716697028, + "loss": 1.9946, + "step": 26048 + }, + { + "epoch": 2.5039892338748437, + "grad_norm": 1.076575517654419, + "learning_rate": 0.00011257024989491675, + "loss": 1.9872, + "step": 26049 + }, + { + "epoch": 2.5040853599923096, + "grad_norm": 0.9608566761016846, + "learning_rate": 0.00011255822839408479, + "loss": 2.0672, + "step": 26050 + }, + { + "epoch": 2.504181486109776, + "grad_norm": 1.114154577255249, + "learning_rate": 0.00011254620716729141, + "loss": 1.9911, + "step": 26051 + }, + { + "epoch": 2.5042776122272423, + "grad_norm": 1.309372067451477, + "learning_rate": 0.00011253418621462132, + "loss": 2.0109, + "step": 26052 + }, + { + "epoch": 2.5043737383447082, + "grad_norm": 1.1165587902069092, + "learning_rate": 0.00011252216553615903, + "loss": 2.0238, + "step": 26053 + }, + { + "epoch": 2.504469864462174, + "grad_norm": 1.2375553846359253, + "learning_rate": 0.00011251014513198918, + "loss": 2.1429, + "step": 26054 + }, + { + "epoch": 2.5045659905796405, + "grad_norm": 1.1673482656478882, + "learning_rate": 0.00011249812500219634, + "loss": 1.9638, + "step": 26055 + }, + { + "epoch": 2.504662116697107, + "grad_norm": 1.1848187446594238, + "learning_rate": 0.00011248610514686509, + "loss": 2.0296, + "step": 26056 + }, + { + "epoch": 2.5047582428145727, + "grad_norm": 1.2125717401504517, + "learning_rate": 0.00011247408556608004, + "loss": 2.2823, + "step": 26057 + }, + { + "epoch": 2.5048543689320386, + "grad_norm": 1.2832375764846802, + "learning_rate": 0.00011246206625992574, + "loss": 2.0873, + "step": 26058 + }, + { + "epoch": 2.504950495049505, + "grad_norm": 1.294636845588684, + "learning_rate": 0.00011245004722848677, + "loss": 2.1335, + "step": 26059 + }, + { + "epoch": 2.5050466211669713, + "grad_norm": 1.1506396532058716, + "learning_rate": 0.00011243802847184776, + "loss": 2.0667, + "step": 26060 + }, + { + "epoch": 2.505142747284437, + "grad_norm": 1.153383731842041, + "learning_rate": 0.00011242600999009322, + "loss": 2.003, + "step": 26061 + }, + { + "epoch": 2.505238873401903, + "grad_norm": 1.1210565567016602, + "learning_rate": 0.00011241399178330782, + "loss": 1.92, + "step": 26062 + }, + { + "epoch": 2.5053349995193694, + "grad_norm": 1.466090440750122, + "learning_rate": 0.00011240197385157604, + "loss": 2.0804, + "step": 26063 + }, + { + "epoch": 2.505431125636836, + "grad_norm": 1.0432568788528442, + "learning_rate": 0.00011238995619498247, + "loss": 2.0532, + "step": 26064 + }, + { + "epoch": 2.5055272517543017, + "grad_norm": 1.1323188543319702, + "learning_rate": 0.00011237793881361167, + "loss": 1.9591, + "step": 26065 + }, + { + "epoch": 2.5056233778717676, + "grad_norm": 1.24342679977417, + "learning_rate": 0.0001123659217075483, + "loss": 2.0692, + "step": 26066 + }, + { + "epoch": 2.505719503989234, + "grad_norm": 1.4437659978866577, + "learning_rate": 0.00011235390487687684, + "loss": 2.0668, + "step": 26067 + }, + { + "epoch": 2.5058156301067, + "grad_norm": 1.506571650505066, + "learning_rate": 0.00011234188832168187, + "loss": 2.0876, + "step": 26068 + }, + { + "epoch": 2.505911756224166, + "grad_norm": 1.1329994201660156, + "learning_rate": 0.00011232987204204794, + "loss": 1.9835, + "step": 26069 + }, + { + "epoch": 2.506007882341632, + "grad_norm": 1.1974706649780273, + "learning_rate": 0.00011231785603805967, + "loss": 1.954, + "step": 26070 + }, + { + "epoch": 2.5061040084590984, + "grad_norm": 1.0568361282348633, + "learning_rate": 0.00011230584030980154, + "loss": 2.1363, + "step": 26071 + }, + { + "epoch": 2.5062001345765643, + "grad_norm": 1.1697800159454346, + "learning_rate": 0.00011229382485735813, + "loss": 2.0417, + "step": 26072 + }, + { + "epoch": 2.5062962606940307, + "grad_norm": 1.0662301778793335, + "learning_rate": 0.00011228180968081403, + "loss": 1.8964, + "step": 26073 + }, + { + "epoch": 2.5063923868114966, + "grad_norm": 1.2834736108779907, + "learning_rate": 0.00011226979478025378, + "loss": 2.0186, + "step": 26074 + }, + { + "epoch": 2.506488512928963, + "grad_norm": 0.9817519187927246, + "learning_rate": 0.00011225778015576191, + "loss": 1.9247, + "step": 26075 + }, + { + "epoch": 2.506584639046429, + "grad_norm": 0.9834118485450745, + "learning_rate": 0.00011224576580742298, + "loss": 1.8498, + "step": 26076 + }, + { + "epoch": 2.506680765163895, + "grad_norm": 1.1624815464019775, + "learning_rate": 0.00011223375173532151, + "loss": 2.0246, + "step": 26077 + }, + { + "epoch": 2.506776891281361, + "grad_norm": 1.0888460874557495, + "learning_rate": 0.00011222173793954208, + "loss": 1.9846, + "step": 26078 + }, + { + "epoch": 2.5068730173988274, + "grad_norm": 1.094221830368042, + "learning_rate": 0.00011220972442016925, + "loss": 1.9262, + "step": 26079 + }, + { + "epoch": 2.5069691435162933, + "grad_norm": 0.999553918838501, + "learning_rate": 0.0001121977111772875, + "loss": 1.8002, + "step": 26080 + }, + { + "epoch": 2.5070652696337596, + "grad_norm": 1.255437970161438, + "learning_rate": 0.00011218569821098142, + "loss": 1.9346, + "step": 26081 + }, + { + "epoch": 2.5071613957512255, + "grad_norm": 1.1953984498977661, + "learning_rate": 0.0001121736855213355, + "loss": 1.8342, + "step": 26082 + }, + { + "epoch": 2.507257521868692, + "grad_norm": 1.358019471168518, + "learning_rate": 0.0001121616731084343, + "loss": 2.1818, + "step": 26083 + }, + { + "epoch": 2.507353647986158, + "grad_norm": 1.1920733451843262, + "learning_rate": 0.0001121496609723624, + "loss": 2.0795, + "step": 26084 + }, + { + "epoch": 2.507449774103624, + "grad_norm": 1.1703957319259644, + "learning_rate": 0.00011213764911320427, + "loss": 2.0881, + "step": 26085 + }, + { + "epoch": 2.50754590022109, + "grad_norm": 1.1154428720474243, + "learning_rate": 0.00011212563753104448, + "loss": 1.9854, + "step": 26086 + }, + { + "epoch": 2.507642026338556, + "grad_norm": 1.0225493907928467, + "learning_rate": 0.00011211362622596749, + "loss": 2.0207, + "step": 26087 + }, + { + "epoch": 2.5077381524560223, + "grad_norm": 1.119193196296692, + "learning_rate": 0.00011210161519805789, + "loss": 1.8034, + "step": 26088 + }, + { + "epoch": 2.5078342785734886, + "grad_norm": 1.2517675161361694, + "learning_rate": 0.00011208960444740016, + "loss": 2.0763, + "step": 26089 + }, + { + "epoch": 2.5079304046909545, + "grad_norm": 1.0993008613586426, + "learning_rate": 0.00011207759397407887, + "loss": 2.0191, + "step": 26090 + }, + { + "epoch": 2.5080265308084204, + "grad_norm": 1.0425922870635986, + "learning_rate": 0.0001120655837781785, + "loss": 1.8267, + "step": 26091 + }, + { + "epoch": 2.5081226569258868, + "grad_norm": 1.1632966995239258, + "learning_rate": 0.0001120535738597836, + "loss": 2.05, + "step": 26092 + }, + { + "epoch": 2.508218783043353, + "grad_norm": 1.2238308191299438, + "learning_rate": 0.00011204156421897863, + "loss": 2.0328, + "step": 26093 + }, + { + "epoch": 2.508314909160819, + "grad_norm": 1.2641675472259521, + "learning_rate": 0.00011202955485584816, + "loss": 2.0154, + "step": 26094 + }, + { + "epoch": 2.508411035278285, + "grad_norm": 1.0645071268081665, + "learning_rate": 0.0001120175457704767, + "loss": 1.965, + "step": 26095 + }, + { + "epoch": 2.5085071613957513, + "grad_norm": 1.2962303161621094, + "learning_rate": 0.00011200553696294871, + "loss": 2.0691, + "step": 26096 + }, + { + "epoch": 2.5086032875132176, + "grad_norm": 1.2597732543945312, + "learning_rate": 0.00011199352843334874, + "loss": 1.9804, + "step": 26097 + }, + { + "epoch": 2.5086994136306835, + "grad_norm": 1.1084764003753662, + "learning_rate": 0.0001119815201817613, + "loss": 2.1641, + "step": 26098 + }, + { + "epoch": 2.5087955397481494, + "grad_norm": 1.1400140523910522, + "learning_rate": 0.00011196951220827082, + "loss": 1.9358, + "step": 26099 + }, + { + "epoch": 2.5088916658656157, + "grad_norm": 1.0498840808868408, + "learning_rate": 0.00011195750451296191, + "loss": 1.8358, + "step": 26100 + }, + { + "epoch": 2.5089877919830816, + "grad_norm": 1.134519100189209, + "learning_rate": 0.00011194549709591899, + "loss": 2.1298, + "step": 26101 + }, + { + "epoch": 2.509083918100548, + "grad_norm": 1.1946009397506714, + "learning_rate": 0.00011193348995722662, + "loss": 2.1324, + "step": 26102 + }, + { + "epoch": 2.509180044218014, + "grad_norm": 1.2136452198028564, + "learning_rate": 0.00011192148309696925, + "loss": 2.1711, + "step": 26103 + }, + { + "epoch": 2.5092761703354802, + "grad_norm": 1.1050859689712524, + "learning_rate": 0.00011190947651523138, + "loss": 2.0253, + "step": 26104 + }, + { + "epoch": 2.509372296452946, + "grad_norm": 1.2133632898330688, + "learning_rate": 0.0001118974702120975, + "loss": 2.0493, + "step": 26105 + }, + { + "epoch": 2.5094684225704125, + "grad_norm": 1.0569469928741455, + "learning_rate": 0.00011188546418765215, + "loss": 2.0264, + "step": 26106 + }, + { + "epoch": 2.5095645486878784, + "grad_norm": 1.280572772026062, + "learning_rate": 0.00011187345844197972, + "loss": 2.1247, + "step": 26107 + }, + { + "epoch": 2.5096606748053447, + "grad_norm": 1.1410760879516602, + "learning_rate": 0.00011186145297516479, + "loss": 1.9452, + "step": 26108 + }, + { + "epoch": 2.5097568009228106, + "grad_norm": 1.2175145149230957, + "learning_rate": 0.0001118494477872918, + "loss": 2.0625, + "step": 26109 + }, + { + "epoch": 2.509852927040277, + "grad_norm": 1.2906534671783447, + "learning_rate": 0.00011183744287844525, + "loss": 1.9591, + "step": 26110 + }, + { + "epoch": 2.509949053157743, + "grad_norm": 1.1161417961120605, + "learning_rate": 0.0001118254382487096, + "loss": 2.1087, + "step": 26111 + }, + { + "epoch": 2.510045179275209, + "grad_norm": 0.9485589265823364, + "learning_rate": 0.00011181343389816933, + "loss": 1.8157, + "step": 26112 + }, + { + "epoch": 2.510141305392675, + "grad_norm": 1.119785189628601, + "learning_rate": 0.00011180142982690892, + "loss": 2.0318, + "step": 26113 + }, + { + "epoch": 2.5102374315101414, + "grad_norm": 1.253459095954895, + "learning_rate": 0.00011178942603501285, + "loss": 2.1202, + "step": 26114 + }, + { + "epoch": 2.5103335576276073, + "grad_norm": 1.0290446281433105, + "learning_rate": 0.0001117774225225656, + "loss": 1.8934, + "step": 26115 + }, + { + "epoch": 2.5104296837450737, + "grad_norm": 1.2630701065063477, + "learning_rate": 0.00011176541928965161, + "loss": 2.1965, + "step": 26116 + }, + { + "epoch": 2.5105258098625396, + "grad_norm": 0.9654363393783569, + "learning_rate": 0.0001117534163363554, + "loss": 1.7363, + "step": 26117 + }, + { + "epoch": 2.510621935980006, + "grad_norm": 1.1489466428756714, + "learning_rate": 0.00011174141366276138, + "loss": 1.8681, + "step": 26118 + }, + { + "epoch": 2.510718062097472, + "grad_norm": 1.2178572416305542, + "learning_rate": 0.00011172941126895405, + "loss": 2.2603, + "step": 26119 + }, + { + "epoch": 2.5108141882149377, + "grad_norm": 0.9484860301017761, + "learning_rate": 0.00011171740915501788, + "loss": 1.8847, + "step": 26120 + }, + { + "epoch": 2.510910314332404, + "grad_norm": 1.016210913658142, + "learning_rate": 0.0001117054073210373, + "loss": 1.8068, + "step": 26121 + }, + { + "epoch": 2.5110064404498704, + "grad_norm": 1.107150673866272, + "learning_rate": 0.00011169340576709678, + "loss": 2.0017, + "step": 26122 + }, + { + "epoch": 2.5111025665673363, + "grad_norm": 1.42222261428833, + "learning_rate": 0.00011168140449328078, + "loss": 2.0156, + "step": 26123 + }, + { + "epoch": 2.5111986926848022, + "grad_norm": 1.1279096603393555, + "learning_rate": 0.00011166940349967373, + "loss": 1.9837, + "step": 26124 + }, + { + "epoch": 2.5112948188022686, + "grad_norm": 1.2473194599151611, + "learning_rate": 0.00011165740278636012, + "loss": 1.9043, + "step": 26125 + }, + { + "epoch": 2.511390944919735, + "grad_norm": 1.152144432067871, + "learning_rate": 0.00011164540235342441, + "loss": 2.021, + "step": 26126 + }, + { + "epoch": 2.511487071037201, + "grad_norm": 1.4787191152572632, + "learning_rate": 0.00011163340220095102, + "loss": 2.0371, + "step": 26127 + }, + { + "epoch": 2.5115831971546667, + "grad_norm": 1.2296981811523438, + "learning_rate": 0.00011162140232902439, + "loss": 2.0859, + "step": 26128 + }, + { + "epoch": 2.511679323272133, + "grad_norm": 1.0655434131622314, + "learning_rate": 0.00011160940273772898, + "loss": 2.0107, + "step": 26129 + }, + { + "epoch": 2.5117754493895994, + "grad_norm": 1.0982630252838135, + "learning_rate": 0.00011159740342714923, + "loss": 2.107, + "step": 26130 + }, + { + "epoch": 2.5118715755070653, + "grad_norm": 1.1016534566879272, + "learning_rate": 0.00011158540439736956, + "loss": 1.9911, + "step": 26131 + }, + { + "epoch": 2.511967701624531, + "grad_norm": 1.0138475894927979, + "learning_rate": 0.00011157340564847446, + "loss": 1.9364, + "step": 26132 + }, + { + "epoch": 2.5120638277419975, + "grad_norm": 1.089463472366333, + "learning_rate": 0.00011156140718054828, + "loss": 1.9431, + "step": 26133 + }, + { + "epoch": 2.512159953859464, + "grad_norm": 1.2413077354431152, + "learning_rate": 0.00011154940899367557, + "loss": 2.0397, + "step": 26134 + }, + { + "epoch": 2.51225607997693, + "grad_norm": 1.3153873682022095, + "learning_rate": 0.00011153741108794067, + "loss": 2.0734, + "step": 26135 + }, + { + "epoch": 2.5123522060943957, + "grad_norm": 1.0828101634979248, + "learning_rate": 0.00011152541346342806, + "loss": 2.0812, + "step": 26136 + }, + { + "epoch": 2.512448332211862, + "grad_norm": 1.0921757221221924, + "learning_rate": 0.00011151341612022214, + "loss": 2.045, + "step": 26137 + }, + { + "epoch": 2.512544458329328, + "grad_norm": 1.3625963926315308, + "learning_rate": 0.0001115014190584074, + "loss": 2.1013, + "step": 26138 + }, + { + "epoch": 2.5126405844467943, + "grad_norm": 1.15708589553833, + "learning_rate": 0.00011148942227806817, + "loss": 2.079, + "step": 26139 + }, + { + "epoch": 2.51273671056426, + "grad_norm": 1.2365466356277466, + "learning_rate": 0.00011147742577928895, + "loss": 2.0928, + "step": 26140 + }, + { + "epoch": 2.5128328366817265, + "grad_norm": 1.1119378805160522, + "learning_rate": 0.00011146542956215411, + "loss": 2.0412, + "step": 26141 + }, + { + "epoch": 2.5129289627991924, + "grad_norm": 0.9735493063926697, + "learning_rate": 0.00011145343362674807, + "loss": 1.856, + "step": 26142 + }, + { + "epoch": 2.5130250889166588, + "grad_norm": 1.1289924383163452, + "learning_rate": 0.00011144143797315527, + "loss": 1.9221, + "step": 26143 + }, + { + "epoch": 2.5131212150341247, + "grad_norm": 1.0379502773284912, + "learning_rate": 0.00011142944260146015, + "loss": 1.9701, + "step": 26144 + }, + { + "epoch": 2.513217341151591, + "grad_norm": 1.1957536935806274, + "learning_rate": 0.00011141744751174708, + "loss": 2.049, + "step": 26145 + }, + { + "epoch": 2.513313467269057, + "grad_norm": 1.2713501453399658, + "learning_rate": 0.00011140545270410047, + "loss": 2.117, + "step": 26146 + }, + { + "epoch": 2.5134095933865233, + "grad_norm": 1.3811641931533813, + "learning_rate": 0.00011139345817860475, + "loss": 2.1905, + "step": 26147 + }, + { + "epoch": 2.513505719503989, + "grad_norm": 1.3077389001846313, + "learning_rate": 0.00011138146393534433, + "loss": 2.1629, + "step": 26148 + }, + { + "epoch": 2.5136018456214555, + "grad_norm": 1.2290608882904053, + "learning_rate": 0.00011136946997440362, + "loss": 2.0197, + "step": 26149 + }, + { + "epoch": 2.5136979717389214, + "grad_norm": 1.1223636865615845, + "learning_rate": 0.00011135747629586696, + "loss": 2.037, + "step": 26150 + }, + { + "epoch": 2.5137940978563877, + "grad_norm": 1.1945021152496338, + "learning_rate": 0.00011134548289981882, + "loss": 1.9622, + "step": 26151 + }, + { + "epoch": 2.5138902239738536, + "grad_norm": 1.097387433052063, + "learning_rate": 0.0001113334897863436, + "loss": 2.0369, + "step": 26152 + }, + { + "epoch": 2.5139863500913195, + "grad_norm": 1.1726435422897339, + "learning_rate": 0.0001113214969555257, + "loss": 1.8599, + "step": 26153 + }, + { + "epoch": 2.514082476208786, + "grad_norm": 1.1747653484344482, + "learning_rate": 0.00011130950440744944, + "loss": 2.0765, + "step": 26154 + }, + { + "epoch": 2.5141786023262522, + "grad_norm": 1.0383684635162354, + "learning_rate": 0.00011129751214219928, + "loss": 1.9896, + "step": 26155 + }, + { + "epoch": 2.514274728443718, + "grad_norm": 1.1211307048797607, + "learning_rate": 0.0001112855201598596, + "loss": 2.0478, + "step": 26156 + }, + { + "epoch": 2.514370854561184, + "grad_norm": 1.1093002557754517, + "learning_rate": 0.00011127352846051483, + "loss": 2.0071, + "step": 26157 + }, + { + "epoch": 2.5144669806786504, + "grad_norm": 1.1964991092681885, + "learning_rate": 0.00011126153704424926, + "loss": 1.9326, + "step": 26158 + }, + { + "epoch": 2.5145631067961167, + "grad_norm": 1.4377962350845337, + "learning_rate": 0.00011124954591114732, + "loss": 2.0627, + "step": 26159 + }, + { + "epoch": 2.5146592329135826, + "grad_norm": 1.1697760820388794, + "learning_rate": 0.00011123755506129344, + "loss": 2.1059, + "step": 26160 + }, + { + "epoch": 2.5147553590310485, + "grad_norm": 1.0250324010849, + "learning_rate": 0.00011122556449477195, + "loss": 1.9759, + "step": 26161 + }, + { + "epoch": 2.514851485148515, + "grad_norm": 1.4795364141464233, + "learning_rate": 0.00011121357421166725, + "loss": 2.0869, + "step": 26162 + }, + { + "epoch": 2.514947611265981, + "grad_norm": 1.1997803449630737, + "learning_rate": 0.0001112015842120637, + "loss": 2.1476, + "step": 26163 + }, + { + "epoch": 2.515043737383447, + "grad_norm": 1.3141409158706665, + "learning_rate": 0.00011118959449604569, + "loss": 1.9692, + "step": 26164 + }, + { + "epoch": 2.515139863500913, + "grad_norm": 1.0249552726745605, + "learning_rate": 0.00011117760506369757, + "loss": 1.9904, + "step": 26165 + }, + { + "epoch": 2.5152359896183794, + "grad_norm": 1.1549571752548218, + "learning_rate": 0.00011116561591510373, + "loss": 2.0057, + "step": 26166 + }, + { + "epoch": 2.5153321157358457, + "grad_norm": 1.257513165473938, + "learning_rate": 0.00011115362705034853, + "loss": 2.1302, + "step": 26167 + }, + { + "epoch": 2.5154282418533116, + "grad_norm": 1.3828282356262207, + "learning_rate": 0.00011114163846951639, + "loss": 2.2325, + "step": 26168 + }, + { + "epoch": 2.5155243679707775, + "grad_norm": 1.2126964330673218, + "learning_rate": 0.00011112965017269159, + "loss": 2.0315, + "step": 26169 + }, + { + "epoch": 2.515620494088244, + "grad_norm": 1.2092416286468506, + "learning_rate": 0.00011111766215995855, + "loss": 2.0911, + "step": 26170 + }, + { + "epoch": 2.5157166202057097, + "grad_norm": 1.0981626510620117, + "learning_rate": 0.00011110567443140162, + "loss": 1.8878, + "step": 26171 + }, + { + "epoch": 2.515812746323176, + "grad_norm": 1.1405128240585327, + "learning_rate": 0.00011109368698710516, + "loss": 2.1267, + "step": 26172 + }, + { + "epoch": 2.515908872440642, + "grad_norm": 1.150469422340393, + "learning_rate": 0.00011108169982715349, + "loss": 2.1531, + "step": 26173 + }, + { + "epoch": 2.5160049985581083, + "grad_norm": 1.152699589729309, + "learning_rate": 0.00011106971295163104, + "loss": 1.8035, + "step": 26174 + }, + { + "epoch": 2.5161011246755742, + "grad_norm": 1.161218523979187, + "learning_rate": 0.00011105772636062213, + "loss": 2.1032, + "step": 26175 + }, + { + "epoch": 2.5161972507930406, + "grad_norm": 1.2332026958465576, + "learning_rate": 0.00011104574005421105, + "loss": 1.9605, + "step": 26176 + }, + { + "epoch": 2.5162933769105065, + "grad_norm": 0.9862735867500305, + "learning_rate": 0.00011103375403248226, + "loss": 1.7688, + "step": 26177 + }, + { + "epoch": 2.516389503027973, + "grad_norm": 1.227017879486084, + "learning_rate": 0.00011102176829552, + "loss": 2.0444, + "step": 26178 + }, + { + "epoch": 2.5164856291454387, + "grad_norm": 1.2725955247879028, + "learning_rate": 0.00011100978284340869, + "loss": 2.2118, + "step": 26179 + }, + { + "epoch": 2.516581755262905, + "grad_norm": 0.9971721172332764, + "learning_rate": 0.00011099779767623264, + "loss": 1.9547, + "step": 26180 + }, + { + "epoch": 2.516677881380371, + "grad_norm": 1.2347421646118164, + "learning_rate": 0.00011098581279407621, + "loss": 2.086, + "step": 26181 + }, + { + "epoch": 2.5167740074978373, + "grad_norm": 0.9934170842170715, + "learning_rate": 0.00011097382819702372, + "loss": 1.8858, + "step": 26182 + }, + { + "epoch": 2.516870133615303, + "grad_norm": 0.9308106899261475, + "learning_rate": 0.00011096184388515953, + "loss": 1.7843, + "step": 26183 + }, + { + "epoch": 2.5169662597327696, + "grad_norm": 1.1535485982894897, + "learning_rate": 0.00011094985985856796, + "loss": 1.9952, + "step": 26184 + }, + { + "epoch": 2.5170623858502355, + "grad_norm": 1.052822470664978, + "learning_rate": 0.00011093787611733333, + "loss": 2.0336, + "step": 26185 + }, + { + "epoch": 2.5171585119677014, + "grad_norm": 1.0966402292251587, + "learning_rate": 0.00011092589266153997, + "loss": 1.9627, + "step": 26186 + }, + { + "epoch": 2.5172546380851677, + "grad_norm": 1.1147546768188477, + "learning_rate": 0.00011091390949127225, + "loss": 2.2019, + "step": 26187 + }, + { + "epoch": 2.517350764202634, + "grad_norm": 0.9669939875602722, + "learning_rate": 0.00011090192660661445, + "loss": 1.9931, + "step": 26188 + }, + { + "epoch": 2.5174468903201, + "grad_norm": 1.3781805038452148, + "learning_rate": 0.00011088994400765093, + "loss": 2.0041, + "step": 26189 + }, + { + "epoch": 2.517543016437566, + "grad_norm": 0.9455859065055847, + "learning_rate": 0.000110877961694466, + "loss": 1.7282, + "step": 26190 + }, + { + "epoch": 2.517639142555032, + "grad_norm": 1.227410912513733, + "learning_rate": 0.00011086597966714397, + "loss": 2.1302, + "step": 26191 + }, + { + "epoch": 2.5177352686724985, + "grad_norm": 1.0074775218963623, + "learning_rate": 0.00011085399792576918, + "loss": 2.0491, + "step": 26192 + }, + { + "epoch": 2.5178313947899644, + "grad_norm": 1.0695302486419678, + "learning_rate": 0.00011084201647042594, + "loss": 1.8256, + "step": 26193 + }, + { + "epoch": 2.5179275209074303, + "grad_norm": 1.1051692962646484, + "learning_rate": 0.00011083003530119854, + "loss": 1.8745, + "step": 26194 + }, + { + "epoch": 2.5180236470248967, + "grad_norm": 1.006849765777588, + "learning_rate": 0.00011081805441817133, + "loss": 1.8992, + "step": 26195 + }, + { + "epoch": 2.518119773142363, + "grad_norm": 1.0658836364746094, + "learning_rate": 0.00011080607382142862, + "loss": 1.8925, + "step": 26196 + }, + { + "epoch": 2.518215899259829, + "grad_norm": 1.231728196144104, + "learning_rate": 0.00011079409351105469, + "loss": 1.943, + "step": 26197 + }, + { + "epoch": 2.518312025377295, + "grad_norm": 1.0926858186721802, + "learning_rate": 0.00011078211348713387, + "loss": 1.7244, + "step": 26198 + }, + { + "epoch": 2.518408151494761, + "grad_norm": 1.0637469291687012, + "learning_rate": 0.00011077013374975048, + "loss": 2.0785, + "step": 26199 + }, + { + "epoch": 2.5185042776122275, + "grad_norm": 1.2220661640167236, + "learning_rate": 0.00011075815429898877, + "loss": 1.9385, + "step": 26200 + }, + { + "epoch": 2.5186004037296934, + "grad_norm": 1.114520788192749, + "learning_rate": 0.00011074617513493309, + "loss": 1.9682, + "step": 26201 + }, + { + "epoch": 2.5186965298471593, + "grad_norm": 1.168748378753662, + "learning_rate": 0.00011073419625766771, + "loss": 2.0784, + "step": 26202 + }, + { + "epoch": 2.5187926559646256, + "grad_norm": 1.1823327541351318, + "learning_rate": 0.00011072221766727693, + "loss": 2.1068, + "step": 26203 + }, + { + "epoch": 2.5188887820820915, + "grad_norm": 1.1112635135650635, + "learning_rate": 0.00011071023936384507, + "loss": 1.9998, + "step": 26204 + }, + { + "epoch": 2.518984908199558, + "grad_norm": 1.2026472091674805, + "learning_rate": 0.00011069826134745642, + "loss": 1.9402, + "step": 26205 + }, + { + "epoch": 2.519081034317024, + "grad_norm": 1.1928837299346924, + "learning_rate": 0.00011068628361819523, + "loss": 2.1031, + "step": 26206 + }, + { + "epoch": 2.51917716043449, + "grad_norm": 1.1626918315887451, + "learning_rate": 0.00011067430617614584, + "loss": 1.8943, + "step": 26207 + }, + { + "epoch": 2.519273286551956, + "grad_norm": 1.2383878231048584, + "learning_rate": 0.00011066232902139251, + "loss": 2.0843, + "step": 26208 + }, + { + "epoch": 2.5193694126694224, + "grad_norm": 1.1198316812515259, + "learning_rate": 0.0001106503521540195, + "loss": 1.8393, + "step": 26209 + }, + { + "epoch": 2.5194655387868883, + "grad_norm": 1.1818128824234009, + "learning_rate": 0.00011063837557411114, + "loss": 2.0974, + "step": 26210 + }, + { + "epoch": 2.5195616649043546, + "grad_norm": 1.1551671028137207, + "learning_rate": 0.00011062639928175169, + "loss": 2.0137, + "step": 26211 + }, + { + "epoch": 2.5196577910218205, + "grad_norm": 1.3096141815185547, + "learning_rate": 0.00011061442327702544, + "loss": 1.9752, + "step": 26212 + }, + { + "epoch": 2.519753917139287, + "grad_norm": 1.0673699378967285, + "learning_rate": 0.00011060244756001663, + "loss": 1.8197, + "step": 26213 + }, + { + "epoch": 2.5198500432567528, + "grad_norm": 1.0318161249160767, + "learning_rate": 0.0001105904721308096, + "loss": 1.8633, + "step": 26214 + }, + { + "epoch": 2.519946169374219, + "grad_norm": 1.0869905948638916, + "learning_rate": 0.00011057849698948857, + "loss": 1.9405, + "step": 26215 + }, + { + "epoch": 2.520042295491685, + "grad_norm": 1.0320886373519897, + "learning_rate": 0.00011056652213613783, + "loss": 1.8451, + "step": 26216 + }, + { + "epoch": 2.5201384216091514, + "grad_norm": 1.1353086233139038, + "learning_rate": 0.00011055454757084161, + "loss": 2.0475, + "step": 26217 + }, + { + "epoch": 2.5202345477266173, + "grad_norm": 1.1919915676116943, + "learning_rate": 0.00011054257329368428, + "loss": 2.0886, + "step": 26218 + }, + { + "epoch": 2.5203306738440836, + "grad_norm": 1.1042633056640625, + "learning_rate": 0.00011053059930474998, + "loss": 1.9794, + "step": 26219 + }, + { + "epoch": 2.5204267999615495, + "grad_norm": 1.0355900526046753, + "learning_rate": 0.00011051862560412305, + "loss": 1.7935, + "step": 26220 + }, + { + "epoch": 2.520522926079016, + "grad_norm": 1.0170209407806396, + "learning_rate": 0.00011050665219188772, + "loss": 1.7173, + "step": 26221 + }, + { + "epoch": 2.5206190521964817, + "grad_norm": 1.1619665622711182, + "learning_rate": 0.00011049467906812827, + "loss": 1.9323, + "step": 26222 + }, + { + "epoch": 2.5207151783139476, + "grad_norm": 1.1038625240325928, + "learning_rate": 0.00011048270623292895, + "loss": 2.0228, + "step": 26223 + }, + { + "epoch": 2.520811304431414, + "grad_norm": 1.1640843152999878, + "learning_rate": 0.00011047073368637399, + "loss": 2.0623, + "step": 26224 + }, + { + "epoch": 2.5209074305488803, + "grad_norm": 1.270972728729248, + "learning_rate": 0.00011045876142854766, + "loss": 2.0337, + "step": 26225 + }, + { + "epoch": 2.5210035566663462, + "grad_norm": 1.4145616292953491, + "learning_rate": 0.00011044678945953422, + "loss": 2.1364, + "step": 26226 + }, + { + "epoch": 2.521099682783812, + "grad_norm": 1.1452716588974, + "learning_rate": 0.00011043481777941791, + "loss": 1.8846, + "step": 26227 + }, + { + "epoch": 2.5211958089012785, + "grad_norm": 1.0023322105407715, + "learning_rate": 0.00011042284638828294, + "loss": 1.9458, + "step": 26228 + }, + { + "epoch": 2.521291935018745, + "grad_norm": 1.209504246711731, + "learning_rate": 0.00011041087528621363, + "loss": 2.1517, + "step": 26229 + }, + { + "epoch": 2.5213880611362107, + "grad_norm": 1.1903997659683228, + "learning_rate": 0.00011039890447329419, + "loss": 1.9932, + "step": 26230 + }, + { + "epoch": 2.5214841872536766, + "grad_norm": 1.0651110410690308, + "learning_rate": 0.00011038693394960885, + "loss": 1.8839, + "step": 26231 + }, + { + "epoch": 2.521580313371143, + "grad_norm": 1.0478311777114868, + "learning_rate": 0.00011037496371524183, + "loss": 2.1335, + "step": 26232 + }, + { + "epoch": 2.5216764394886093, + "grad_norm": 1.1744229793548584, + "learning_rate": 0.00011036299377027741, + "loss": 2.0133, + "step": 26233 + }, + { + "epoch": 2.521772565606075, + "grad_norm": 1.2345013618469238, + "learning_rate": 0.00011035102411479981, + "loss": 2.1076, + "step": 26234 + }, + { + "epoch": 2.521868691723541, + "grad_norm": 1.1717162132263184, + "learning_rate": 0.00011033905474889322, + "loss": 1.9995, + "step": 26235 + }, + { + "epoch": 2.5219648178410075, + "grad_norm": 1.0108344554901123, + "learning_rate": 0.00011032708567264194, + "loss": 2.0272, + "step": 26236 + }, + { + "epoch": 2.5220609439584734, + "grad_norm": 1.2070283889770508, + "learning_rate": 0.00011031511688613015, + "loss": 2.0328, + "step": 26237 + }, + { + "epoch": 2.5221570700759397, + "grad_norm": 1.0986981391906738, + "learning_rate": 0.00011030314838944207, + "loss": 2.0204, + "step": 26238 + }, + { + "epoch": 2.5222531961934056, + "grad_norm": 1.2127571105957031, + "learning_rate": 0.00011029118018266197, + "loss": 2.2472, + "step": 26239 + }, + { + "epoch": 2.522349322310872, + "grad_norm": 1.2124583721160889, + "learning_rate": 0.00011027921226587402, + "loss": 2.1107, + "step": 26240 + }, + { + "epoch": 2.522445448428338, + "grad_norm": 1.2320243120193481, + "learning_rate": 0.00011026724463916248, + "loss": 2.0124, + "step": 26241 + }, + { + "epoch": 2.522541574545804, + "grad_norm": 1.1484644412994385, + "learning_rate": 0.00011025527730261154, + "loss": 2.0624, + "step": 26242 + }, + { + "epoch": 2.52263770066327, + "grad_norm": 1.0936936140060425, + "learning_rate": 0.00011024331025630544, + "loss": 1.806, + "step": 26243 + }, + { + "epoch": 2.5227338267807364, + "grad_norm": 1.0656793117523193, + "learning_rate": 0.00011023134350032836, + "loss": 1.9355, + "step": 26244 + }, + { + "epoch": 2.5228299528982023, + "grad_norm": 1.0455559492111206, + "learning_rate": 0.00011021937703476454, + "loss": 1.9075, + "step": 26245 + }, + { + "epoch": 2.5229260790156687, + "grad_norm": 1.2022138833999634, + "learning_rate": 0.00011020741085969819, + "loss": 2.0947, + "step": 26246 + }, + { + "epoch": 2.5230222051331346, + "grad_norm": 1.110875129699707, + "learning_rate": 0.00011019544497521351, + "loss": 1.8091, + "step": 26247 + }, + { + "epoch": 2.523118331250601, + "grad_norm": 1.1334983110427856, + "learning_rate": 0.0001101834793813947, + "loss": 1.9759, + "step": 26248 + }, + { + "epoch": 2.523214457368067, + "grad_norm": 1.0332019329071045, + "learning_rate": 0.00011017151407832596, + "loss": 2.2256, + "step": 26249 + }, + { + "epoch": 2.523310583485533, + "grad_norm": 1.194009780883789, + "learning_rate": 0.00011015954906609153, + "loss": 2.1665, + "step": 26250 + }, + { + "epoch": 2.523406709602999, + "grad_norm": 1.229883074760437, + "learning_rate": 0.00011014758434477558, + "loss": 1.9182, + "step": 26251 + }, + { + "epoch": 2.5235028357204654, + "grad_norm": 1.0576483011245728, + "learning_rate": 0.0001101356199144623, + "loss": 2.0119, + "step": 26252 + }, + { + "epoch": 2.5235989618379313, + "grad_norm": 1.0987842082977295, + "learning_rate": 0.00011012365577523589, + "loss": 2.0536, + "step": 26253 + }, + { + "epoch": 2.5236950879553977, + "grad_norm": 1.2382776737213135, + "learning_rate": 0.00011011169192718054, + "loss": 1.904, + "step": 26254 + }, + { + "epoch": 2.5237912140728636, + "grad_norm": 1.2714990377426147, + "learning_rate": 0.00011009972837038044, + "loss": 2.0822, + "step": 26255 + }, + { + "epoch": 2.5238873401903295, + "grad_norm": 1.205736517906189, + "learning_rate": 0.00011008776510491983, + "loss": 2.1121, + "step": 26256 + }, + { + "epoch": 2.523983466307796, + "grad_norm": 1.0037689208984375, + "learning_rate": 0.00011007580213088281, + "loss": 2.0014, + "step": 26257 + }, + { + "epoch": 2.524079592425262, + "grad_norm": 1.4638620615005493, + "learning_rate": 0.00011006383944835362, + "loss": 2.0731, + "step": 26258 + }, + { + "epoch": 2.524175718542728, + "grad_norm": 1.117897629737854, + "learning_rate": 0.00011005187705741643, + "loss": 2.0716, + "step": 26259 + }, + { + "epoch": 2.524271844660194, + "grad_norm": 1.1347050666809082, + "learning_rate": 0.00011003991495815546, + "loss": 2.0116, + "step": 26260 + }, + { + "epoch": 2.5243679707776603, + "grad_norm": 0.995302677154541, + "learning_rate": 0.00011002795315065481, + "loss": 1.8438, + "step": 26261 + }, + { + "epoch": 2.5244640968951266, + "grad_norm": 1.0154743194580078, + "learning_rate": 0.00011001599163499867, + "loss": 1.7714, + "step": 26262 + }, + { + "epoch": 2.5245602230125925, + "grad_norm": 1.1266320943832397, + "learning_rate": 0.00011000403041127129, + "loss": 1.9728, + "step": 26263 + }, + { + "epoch": 2.5246563491300584, + "grad_norm": 1.1831378936767578, + "learning_rate": 0.0001099920694795568, + "loss": 2.0659, + "step": 26264 + }, + { + "epoch": 2.5247524752475248, + "grad_norm": 1.007794976234436, + "learning_rate": 0.00010998010883993937, + "loss": 1.9542, + "step": 26265 + }, + { + "epoch": 2.524848601364991, + "grad_norm": 1.1346958875656128, + "learning_rate": 0.00010996814849250315, + "loss": 1.9316, + "step": 26266 + }, + { + "epoch": 2.524944727482457, + "grad_norm": 1.1086368560791016, + "learning_rate": 0.00010995618843733234, + "loss": 2.0383, + "step": 26267 + }, + { + "epoch": 2.525040853599923, + "grad_norm": 1.1520055532455444, + "learning_rate": 0.00010994422867451107, + "loss": 2.0495, + "step": 26268 + }, + { + "epoch": 2.5251369797173893, + "grad_norm": 1.3715674877166748, + "learning_rate": 0.00010993226920412353, + "loss": 1.9008, + "step": 26269 + }, + { + "epoch": 2.5252331058348556, + "grad_norm": 1.3973667621612549, + "learning_rate": 0.00010992031002625386, + "loss": 2.0612, + "step": 26270 + }, + { + "epoch": 2.5253292319523215, + "grad_norm": 1.07880699634552, + "learning_rate": 0.00010990835114098625, + "loss": 2.0383, + "step": 26271 + }, + { + "epoch": 2.5254253580697874, + "grad_norm": 0.947979211807251, + "learning_rate": 0.00010989639254840481, + "loss": 1.7881, + "step": 26272 + }, + { + "epoch": 2.5255214841872538, + "grad_norm": 1.1754052639007568, + "learning_rate": 0.00010988443424859374, + "loss": 1.7987, + "step": 26273 + }, + { + "epoch": 2.5256176103047197, + "grad_norm": 1.2011560201644897, + "learning_rate": 0.00010987247624163716, + "loss": 1.9382, + "step": 26274 + }, + { + "epoch": 2.525713736422186, + "grad_norm": 1.1869934797286987, + "learning_rate": 0.00010986051852761922, + "loss": 1.953, + "step": 26275 + }, + { + "epoch": 2.525809862539652, + "grad_norm": 1.1015022993087769, + "learning_rate": 0.0001098485611066241, + "loss": 2.0832, + "step": 26276 + }, + { + "epoch": 2.5259059886571182, + "grad_norm": 1.365705132484436, + "learning_rate": 0.00010983660397873592, + "loss": 2.1036, + "step": 26277 + }, + { + "epoch": 2.526002114774584, + "grad_norm": 1.131351113319397, + "learning_rate": 0.00010982464714403885, + "loss": 1.9036, + "step": 26278 + }, + { + "epoch": 2.5260982408920505, + "grad_norm": 1.2136141061782837, + "learning_rate": 0.00010981269060261696, + "loss": 1.9498, + "step": 26279 + }, + { + "epoch": 2.5261943670095164, + "grad_norm": 1.2807948589324951, + "learning_rate": 0.00010980073435455447, + "loss": 1.9403, + "step": 26280 + }, + { + "epoch": 2.5262904931269827, + "grad_norm": 1.1564139127731323, + "learning_rate": 0.0001097887783999355, + "loss": 1.8426, + "step": 26281 + }, + { + "epoch": 2.5263866192444486, + "grad_norm": 1.1278150081634521, + "learning_rate": 0.00010977682273884415, + "loss": 2.226, + "step": 26282 + }, + { + "epoch": 2.526482745361915, + "grad_norm": 1.0680651664733887, + "learning_rate": 0.00010976486737136462, + "loss": 2.192, + "step": 26283 + }, + { + "epoch": 2.526578871479381, + "grad_norm": 1.12980318069458, + "learning_rate": 0.00010975291229758095, + "loss": 1.8817, + "step": 26284 + }, + { + "epoch": 2.526674997596847, + "grad_norm": 1.0740095376968384, + "learning_rate": 0.00010974095751757734, + "loss": 2.054, + "step": 26285 + }, + { + "epoch": 2.526771123714313, + "grad_norm": 1.1614009141921997, + "learning_rate": 0.00010972900303143793, + "loss": 2.0032, + "step": 26286 + }, + { + "epoch": 2.5268672498317795, + "grad_norm": 1.0096181631088257, + "learning_rate": 0.00010971704883924677, + "loss": 1.9654, + "step": 26287 + }, + { + "epoch": 2.5269633759492454, + "grad_norm": 1.1434520483016968, + "learning_rate": 0.00010970509494108804, + "loss": 1.8891, + "step": 26288 + }, + { + "epoch": 2.5270595020667113, + "grad_norm": 1.0321524143218994, + "learning_rate": 0.00010969314133704588, + "loss": 1.808, + "step": 26289 + }, + { + "epoch": 2.5271556281841776, + "grad_norm": 1.086409330368042, + "learning_rate": 0.00010968118802720432, + "loss": 2.0036, + "step": 26290 + }, + { + "epoch": 2.527251754301644, + "grad_norm": 1.1484655141830444, + "learning_rate": 0.00010966923501164758, + "loss": 1.9473, + "step": 26291 + }, + { + "epoch": 2.52734788041911, + "grad_norm": 1.2939445972442627, + "learning_rate": 0.0001096572822904597, + "loss": 1.8742, + "step": 26292 + }, + { + "epoch": 2.5274440065365757, + "grad_norm": 1.271740198135376, + "learning_rate": 0.00010964532986372482, + "loss": 2.0324, + "step": 26293 + }, + { + "epoch": 2.527540132654042, + "grad_norm": 1.1648615598678589, + "learning_rate": 0.00010963337773152707, + "loss": 2.0059, + "step": 26294 + }, + { + "epoch": 2.5276362587715084, + "grad_norm": 0.9938161969184875, + "learning_rate": 0.00010962142589395052, + "loss": 2.0439, + "step": 26295 + }, + { + "epoch": 2.5277323848889743, + "grad_norm": 0.9675464630126953, + "learning_rate": 0.00010960947435107929, + "loss": 1.9818, + "step": 26296 + }, + { + "epoch": 2.5278285110064402, + "grad_norm": 1.151637315750122, + "learning_rate": 0.00010959752310299751, + "loss": 1.9831, + "step": 26297 + }, + { + "epoch": 2.5279246371239066, + "grad_norm": 0.913083016872406, + "learning_rate": 0.00010958557214978927, + "loss": 2.0204, + "step": 26298 + }, + { + "epoch": 2.528020763241373, + "grad_norm": 1.0300976037979126, + "learning_rate": 0.00010957362149153867, + "loss": 1.9872, + "step": 26299 + }, + { + "epoch": 2.528116889358839, + "grad_norm": 1.0844777822494507, + "learning_rate": 0.0001095616711283298, + "loss": 2.1551, + "step": 26300 + }, + { + "epoch": 2.5282130154763047, + "grad_norm": 1.1036806106567383, + "learning_rate": 0.00010954972106024674, + "loss": 2.0404, + "step": 26301 + }, + { + "epoch": 2.528309141593771, + "grad_norm": 1.1993297338485718, + "learning_rate": 0.00010953777128737365, + "loss": 2.2567, + "step": 26302 + }, + { + "epoch": 2.5284052677112374, + "grad_norm": 1.2642199993133545, + "learning_rate": 0.00010952582180979455, + "loss": 1.9434, + "step": 26303 + }, + { + "epoch": 2.5285013938287033, + "grad_norm": 1.292862057685852, + "learning_rate": 0.00010951387262759354, + "loss": 1.9663, + "step": 26304 + }, + { + "epoch": 2.528597519946169, + "grad_norm": 1.022843360900879, + "learning_rate": 0.00010950192374085476, + "loss": 1.8759, + "step": 26305 + }, + { + "epoch": 2.5286936460636356, + "grad_norm": 1.1307649612426758, + "learning_rate": 0.00010948997514966224, + "loss": 1.9982, + "step": 26306 + }, + { + "epoch": 2.5287897721811015, + "grad_norm": 1.0890172719955444, + "learning_rate": 0.00010947802685410006, + "loss": 2.0094, + "step": 26307 + }, + { + "epoch": 2.528885898298568, + "grad_norm": 1.1576403379440308, + "learning_rate": 0.00010946607885425238, + "loss": 1.9764, + "step": 26308 + }, + { + "epoch": 2.5289820244160337, + "grad_norm": 1.2032930850982666, + "learning_rate": 0.0001094541311502032, + "loss": 2.0931, + "step": 26309 + }, + { + "epoch": 2.5290781505335, + "grad_norm": 1.2823625802993774, + "learning_rate": 0.00010944218374203663, + "loss": 2.033, + "step": 26310 + }, + { + "epoch": 2.529174276650966, + "grad_norm": 1.1230807304382324, + "learning_rate": 0.00010943023662983674, + "loss": 2.1184, + "step": 26311 + }, + { + "epoch": 2.5292704027684323, + "grad_norm": 1.2549585103988647, + "learning_rate": 0.00010941828981368758, + "loss": 2.0548, + "step": 26312 + }, + { + "epoch": 2.529366528885898, + "grad_norm": 1.0718097686767578, + "learning_rate": 0.00010940634329367325, + "loss": 1.9821, + "step": 26313 + }, + { + "epoch": 2.5294626550033645, + "grad_norm": 1.276160717010498, + "learning_rate": 0.00010939439706987781, + "loss": 2.0681, + "step": 26314 + }, + { + "epoch": 2.5295587811208304, + "grad_norm": 1.1477254629135132, + "learning_rate": 0.00010938245114238536, + "loss": 1.9619, + "step": 26315 + }, + { + "epoch": 2.529654907238297, + "grad_norm": 1.1081308126449585, + "learning_rate": 0.00010937050551127992, + "loss": 1.8829, + "step": 26316 + }, + { + "epoch": 2.5297510333557627, + "grad_norm": 1.2206465005874634, + "learning_rate": 0.00010935856017664558, + "loss": 1.9209, + "step": 26317 + }, + { + "epoch": 2.529847159473229, + "grad_norm": 1.1626827716827393, + "learning_rate": 0.00010934661513856638, + "loss": 1.8715, + "step": 26318 + }, + { + "epoch": 2.529943285590695, + "grad_norm": 1.1635607481002808, + "learning_rate": 0.00010933467039712638, + "loss": 2.0628, + "step": 26319 + }, + { + "epoch": 2.5300394117081613, + "grad_norm": 1.1427397727966309, + "learning_rate": 0.00010932272595240967, + "loss": 1.9864, + "step": 26320 + }, + { + "epoch": 2.530135537825627, + "grad_norm": 1.2336891889572144, + "learning_rate": 0.00010931078180450027, + "loss": 2.2217, + "step": 26321 + }, + { + "epoch": 2.530231663943093, + "grad_norm": 1.134140133857727, + "learning_rate": 0.00010929883795348225, + "loss": 1.9589, + "step": 26322 + }, + { + "epoch": 2.5303277900605594, + "grad_norm": 1.0224753618240356, + "learning_rate": 0.00010928689439943964, + "loss": 1.9541, + "step": 26323 + }, + { + "epoch": 2.5304239161780258, + "grad_norm": 0.9834718108177185, + "learning_rate": 0.0001092749511424565, + "loss": 1.9159, + "step": 26324 + }, + { + "epoch": 2.5305200422954917, + "grad_norm": 1.321487545967102, + "learning_rate": 0.00010926300818261688, + "loss": 2.0661, + "step": 26325 + }, + { + "epoch": 2.5306161684129576, + "grad_norm": 1.0958055257797241, + "learning_rate": 0.00010925106552000484, + "loss": 1.9215, + "step": 26326 + }, + { + "epoch": 2.530712294530424, + "grad_norm": 1.122398853302002, + "learning_rate": 0.00010923912315470441, + "loss": 2.0246, + "step": 26327 + }, + { + "epoch": 2.5308084206478902, + "grad_norm": 1.0767697095870972, + "learning_rate": 0.0001092271810867996, + "loss": 2.0015, + "step": 26328 + }, + { + "epoch": 2.530904546765356, + "grad_norm": 1.1771873235702515, + "learning_rate": 0.00010921523931637449, + "loss": 1.8817, + "step": 26329 + }, + { + "epoch": 2.531000672882822, + "grad_norm": 1.0358368158340454, + "learning_rate": 0.00010920329784351308, + "loss": 1.893, + "step": 26330 + }, + { + "epoch": 2.5310967990002884, + "grad_norm": 1.1161267757415771, + "learning_rate": 0.0001091913566682994, + "loss": 2.0218, + "step": 26331 + }, + { + "epoch": 2.5311929251177547, + "grad_norm": 1.1689958572387695, + "learning_rate": 0.00010917941579081755, + "loss": 1.8461, + "step": 26332 + }, + { + "epoch": 2.5312890512352206, + "grad_norm": 1.1350116729736328, + "learning_rate": 0.00010916747521115154, + "loss": 2.1027, + "step": 26333 + }, + { + "epoch": 2.5313851773526865, + "grad_norm": 1.075192928314209, + "learning_rate": 0.00010915553492938531, + "loss": 2.0356, + "step": 26334 + }, + { + "epoch": 2.531481303470153, + "grad_norm": 1.1632626056671143, + "learning_rate": 0.00010914359494560298, + "loss": 2.0249, + "step": 26335 + }, + { + "epoch": 2.531577429587619, + "grad_norm": 1.2844109535217285, + "learning_rate": 0.00010913165525988854, + "loss": 2.2484, + "step": 26336 + }, + { + "epoch": 2.531673555705085, + "grad_norm": 1.0350196361541748, + "learning_rate": 0.00010911971587232599, + "loss": 2.0975, + "step": 26337 + }, + { + "epoch": 2.531769681822551, + "grad_norm": 1.1940374374389648, + "learning_rate": 0.0001091077767829994, + "loss": 2.0302, + "step": 26338 + }, + { + "epoch": 2.5318658079400174, + "grad_norm": 1.065619945526123, + "learning_rate": 0.00010909583799199273, + "loss": 1.8351, + "step": 26339 + }, + { + "epoch": 2.5319619340574833, + "grad_norm": 1.1397627592086792, + "learning_rate": 0.00010908389949939004, + "loss": 2.1096, + "step": 26340 + }, + { + "epoch": 2.5320580601749496, + "grad_norm": 1.2222405672073364, + "learning_rate": 0.00010907196130527532, + "loss": 2.0435, + "step": 26341 + }, + { + "epoch": 2.5321541862924155, + "grad_norm": 0.8865599632263184, + "learning_rate": 0.00010906002340973258, + "loss": 1.8505, + "step": 26342 + }, + { + "epoch": 2.532250312409882, + "grad_norm": 1.203758955001831, + "learning_rate": 0.00010904808581284587, + "loss": 2.0089, + "step": 26343 + }, + { + "epoch": 2.5323464385273478, + "grad_norm": 1.0478428602218628, + "learning_rate": 0.00010903614851469913, + "loss": 2.1312, + "step": 26344 + }, + { + "epoch": 2.532442564644814, + "grad_norm": 1.0120956897735596, + "learning_rate": 0.00010902421151537641, + "loss": 1.8946, + "step": 26345 + }, + { + "epoch": 2.53253869076228, + "grad_norm": 1.1672813892364502, + "learning_rate": 0.00010901227481496167, + "loss": 2.0782, + "step": 26346 + }, + { + "epoch": 2.5326348168797463, + "grad_norm": 1.1272565126419067, + "learning_rate": 0.00010900033841353895, + "loss": 1.9429, + "step": 26347 + }, + { + "epoch": 2.5327309429972122, + "grad_norm": 1.2849278450012207, + "learning_rate": 0.00010898840231119222, + "loss": 2.2077, + "step": 26348 + }, + { + "epoch": 2.5328270691146786, + "grad_norm": 1.0549124479293823, + "learning_rate": 0.00010897646650800553, + "loss": 1.9317, + "step": 26349 + }, + { + "epoch": 2.5329231952321445, + "grad_norm": 1.146396279335022, + "learning_rate": 0.00010896453100406283, + "loss": 1.9588, + "step": 26350 + }, + { + "epoch": 2.533019321349611, + "grad_norm": 1.130397081375122, + "learning_rate": 0.0001089525957994481, + "loss": 2.0531, + "step": 26351 + }, + { + "epoch": 2.5331154474670767, + "grad_norm": 1.2212227582931519, + "learning_rate": 0.00010894066089424536, + "loss": 2.0465, + "step": 26352 + }, + { + "epoch": 2.533211573584543, + "grad_norm": 1.0660902261734009, + "learning_rate": 0.00010892872628853859, + "loss": 1.8657, + "step": 26353 + }, + { + "epoch": 2.533307699702009, + "grad_norm": 1.0623610019683838, + "learning_rate": 0.00010891679198241178, + "loss": 2.0155, + "step": 26354 + }, + { + "epoch": 2.533403825819475, + "grad_norm": 1.0567591190338135, + "learning_rate": 0.00010890485797594887, + "loss": 2.0247, + "step": 26355 + }, + { + "epoch": 2.533499951936941, + "grad_norm": 1.066619873046875, + "learning_rate": 0.0001088929242692339, + "loss": 1.9512, + "step": 26356 + }, + { + "epoch": 2.5335960780544076, + "grad_norm": 0.96141517162323, + "learning_rate": 0.00010888099086235083, + "loss": 1.8932, + "step": 26357 + }, + { + "epoch": 2.5336922041718735, + "grad_norm": 0.9130444526672363, + "learning_rate": 0.00010886905775538361, + "loss": 1.8631, + "step": 26358 + }, + { + "epoch": 2.5337883302893394, + "grad_norm": 1.2878447771072388, + "learning_rate": 0.00010885712494841623, + "loss": 1.9636, + "step": 26359 + }, + { + "epoch": 2.5338844564068057, + "grad_norm": 1.193580150604248, + "learning_rate": 0.0001088451924415327, + "loss": 2.0232, + "step": 26360 + }, + { + "epoch": 2.533980582524272, + "grad_norm": 1.0375385284423828, + "learning_rate": 0.00010883326023481696, + "loss": 1.9706, + "step": 26361 + }, + { + "epoch": 2.534076708641738, + "grad_norm": 1.1901023387908936, + "learning_rate": 0.00010882132832835296, + "loss": 1.9621, + "step": 26362 + }, + { + "epoch": 2.534172834759204, + "grad_norm": 1.1186169385910034, + "learning_rate": 0.0001088093967222247, + "loss": 1.9963, + "step": 26363 + }, + { + "epoch": 2.53426896087667, + "grad_norm": 0.994782567024231, + "learning_rate": 0.0001087974654165161, + "loss": 1.8853, + "step": 26364 + }, + { + "epoch": 2.5343650869941365, + "grad_norm": 1.2489652633666992, + "learning_rate": 0.00010878553441131117, + "loss": 1.9485, + "step": 26365 + }, + { + "epoch": 2.5344612131116024, + "grad_norm": 1.140884280204773, + "learning_rate": 0.00010877360370669386, + "loss": 1.964, + "step": 26366 + }, + { + "epoch": 2.5345573392290683, + "grad_norm": 1.0403509140014648, + "learning_rate": 0.0001087616733027481, + "loss": 1.8343, + "step": 26367 + }, + { + "epoch": 2.5346534653465347, + "grad_norm": 1.1147229671478271, + "learning_rate": 0.0001087497431995579, + "loss": 2.0487, + "step": 26368 + }, + { + "epoch": 2.534749591464001, + "grad_norm": 1.1367652416229248, + "learning_rate": 0.00010873781339720715, + "loss": 1.9439, + "step": 26369 + }, + { + "epoch": 2.534845717581467, + "grad_norm": 1.113653302192688, + "learning_rate": 0.00010872588389577984, + "loss": 2.0332, + "step": 26370 + }, + { + "epoch": 2.534941843698933, + "grad_norm": 1.0875153541564941, + "learning_rate": 0.00010871395469535993, + "loss": 2.0091, + "step": 26371 + }, + { + "epoch": 2.535037969816399, + "grad_norm": 1.1619597673416138, + "learning_rate": 0.00010870202579603132, + "loss": 2.0275, + "step": 26372 + }, + { + "epoch": 2.535134095933865, + "grad_norm": 1.074486494064331, + "learning_rate": 0.000108690097197878, + "loss": 2.1367, + "step": 26373 + }, + { + "epoch": 2.5352302220513314, + "grad_norm": 0.9959986209869385, + "learning_rate": 0.0001086781689009839, + "loss": 2.005, + "step": 26374 + }, + { + "epoch": 2.5353263481687973, + "grad_norm": 1.072219729423523, + "learning_rate": 0.00010866624090543294, + "loss": 2.0456, + "step": 26375 + }, + { + "epoch": 2.5354224742862637, + "grad_norm": 1.1511682271957397, + "learning_rate": 0.0001086543132113091, + "loss": 2.0474, + "step": 26376 + }, + { + "epoch": 2.5355186004037296, + "grad_norm": 1.0492435693740845, + "learning_rate": 0.00010864238581869627, + "loss": 1.9637, + "step": 26377 + }, + { + "epoch": 2.535614726521196, + "grad_norm": 1.1505041122436523, + "learning_rate": 0.00010863045872767842, + "loss": 1.9392, + "step": 26378 + }, + { + "epoch": 2.535710852638662, + "grad_norm": 1.040657877922058, + "learning_rate": 0.00010861853193833948, + "loss": 1.8448, + "step": 26379 + }, + { + "epoch": 2.535806978756128, + "grad_norm": 1.1789920330047607, + "learning_rate": 0.00010860660545076335, + "loss": 2.1806, + "step": 26380 + }, + { + "epoch": 2.535903104873594, + "grad_norm": 1.0362274646759033, + "learning_rate": 0.00010859467926503397, + "loss": 1.9843, + "step": 26381 + }, + { + "epoch": 2.5359992309910604, + "grad_norm": 1.1170200109481812, + "learning_rate": 0.0001085827533812353, + "loss": 2.126, + "step": 26382 + }, + { + "epoch": 2.5360953571085263, + "grad_norm": 1.2913964986801147, + "learning_rate": 0.00010857082779945123, + "loss": 1.9658, + "step": 26383 + }, + { + "epoch": 2.5361914832259926, + "grad_norm": 1.0463130474090576, + "learning_rate": 0.0001085589025197657, + "loss": 1.7849, + "step": 26384 + }, + { + "epoch": 2.5362876093434585, + "grad_norm": 1.156986117362976, + "learning_rate": 0.00010854697754226263, + "loss": 1.977, + "step": 26385 + }, + { + "epoch": 2.536383735460925, + "grad_norm": 1.212814450263977, + "learning_rate": 0.00010853505286702592, + "loss": 1.9826, + "step": 26386 + }, + { + "epoch": 2.536479861578391, + "grad_norm": 1.061689019203186, + "learning_rate": 0.0001085231284941395, + "loss": 1.9267, + "step": 26387 + }, + { + "epoch": 2.536575987695857, + "grad_norm": 1.279759407043457, + "learning_rate": 0.00010851120442368728, + "loss": 1.9026, + "step": 26388 + }, + { + "epoch": 2.536672113813323, + "grad_norm": 1.1182324886322021, + "learning_rate": 0.00010849928065575317, + "loss": 1.9988, + "step": 26389 + }, + { + "epoch": 2.5367682399307894, + "grad_norm": 1.2951444387435913, + "learning_rate": 0.00010848735719042109, + "loss": 2.0598, + "step": 26390 + }, + { + "epoch": 2.5368643660482553, + "grad_norm": 1.1259015798568726, + "learning_rate": 0.00010847543402777491, + "loss": 1.862, + "step": 26391 + }, + { + "epoch": 2.536960492165721, + "grad_norm": 1.1224169731140137, + "learning_rate": 0.00010846351116789859, + "loss": 1.9176, + "step": 26392 + }, + { + "epoch": 2.5370566182831875, + "grad_norm": 1.0302335023880005, + "learning_rate": 0.00010845158861087596, + "loss": 1.9291, + "step": 26393 + }, + { + "epoch": 2.537152744400654, + "grad_norm": 1.112481713294983, + "learning_rate": 0.000108439666356791, + "loss": 1.8476, + "step": 26394 + }, + { + "epoch": 2.5372488705181198, + "grad_norm": 1.1520946025848389, + "learning_rate": 0.00010842774440572758, + "loss": 2.1471, + "step": 26395 + }, + { + "epoch": 2.5373449966355857, + "grad_norm": 1.0095959901809692, + "learning_rate": 0.00010841582275776957, + "loss": 2.0902, + "step": 26396 + }, + { + "epoch": 2.537441122753052, + "grad_norm": 1.221278429031372, + "learning_rate": 0.00010840390141300088, + "loss": 2.1617, + "step": 26397 + }, + { + "epoch": 2.5375372488705183, + "grad_norm": 1.037042260169983, + "learning_rate": 0.0001083919803715054, + "loss": 1.8649, + "step": 26398 + }, + { + "epoch": 2.5376333749879842, + "grad_norm": 1.2486037015914917, + "learning_rate": 0.00010838005963336704, + "loss": 2.1592, + "step": 26399 + }, + { + "epoch": 2.53772950110545, + "grad_norm": 1.069984793663025, + "learning_rate": 0.00010836813919866968, + "loss": 2.0261, + "step": 26400 + }, + { + "epoch": 2.5378256272229165, + "grad_norm": 1.2151974439620972, + "learning_rate": 0.00010835621906749719, + "loss": 2.0526, + "step": 26401 + }, + { + "epoch": 2.537921753340383, + "grad_norm": 1.0764015913009644, + "learning_rate": 0.00010834429923993347, + "loss": 1.9289, + "step": 26402 + }, + { + "epoch": 2.5380178794578487, + "grad_norm": 1.3242557048797607, + "learning_rate": 0.0001083323797160624, + "loss": 1.9617, + "step": 26403 + }, + { + "epoch": 2.5381140055753146, + "grad_norm": 1.181166172027588, + "learning_rate": 0.00010832046049596786, + "loss": 1.9943, + "step": 26404 + }, + { + "epoch": 2.538210131692781, + "grad_norm": 1.1026337146759033, + "learning_rate": 0.0001083085415797337, + "loss": 1.9502, + "step": 26405 + }, + { + "epoch": 2.538306257810247, + "grad_norm": 1.2666889429092407, + "learning_rate": 0.00010829662296744385, + "loss": 1.9498, + "step": 26406 + }, + { + "epoch": 2.5384023839277132, + "grad_norm": 1.2842504978179932, + "learning_rate": 0.00010828470465918211, + "loss": 1.9964, + "step": 26407 + }, + { + "epoch": 2.538498510045179, + "grad_norm": 1.261741280555725, + "learning_rate": 0.00010827278665503241, + "loss": 2.1007, + "step": 26408 + }, + { + "epoch": 2.5385946361626455, + "grad_norm": 1.1684306859970093, + "learning_rate": 0.0001082608689550786, + "loss": 1.8655, + "step": 26409 + }, + { + "epoch": 2.5386907622801114, + "grad_norm": 1.2022439241409302, + "learning_rate": 0.00010824895155940455, + "loss": 1.9559, + "step": 26410 + }, + { + "epoch": 2.5387868883975777, + "grad_norm": 1.0995973348617554, + "learning_rate": 0.00010823703446809412, + "loss": 2.0953, + "step": 26411 + }, + { + "epoch": 2.5388830145150436, + "grad_norm": 1.1383177042007446, + "learning_rate": 0.00010822511768123115, + "loss": 2.07, + "step": 26412 + }, + { + "epoch": 2.53897914063251, + "grad_norm": 1.1576052904129028, + "learning_rate": 0.00010821320119889954, + "loss": 2.1187, + "step": 26413 + }, + { + "epoch": 2.539075266749976, + "grad_norm": 1.0636287927627563, + "learning_rate": 0.00010820128502118313, + "loss": 1.9876, + "step": 26414 + }, + { + "epoch": 2.539171392867442, + "grad_norm": 1.0243836641311646, + "learning_rate": 0.00010818936914816575, + "loss": 1.9726, + "step": 26415 + }, + { + "epoch": 2.539267518984908, + "grad_norm": 1.2342851161956787, + "learning_rate": 0.00010817745357993131, + "loss": 1.9293, + "step": 26416 + }, + { + "epoch": 2.5393636451023744, + "grad_norm": 1.0150344371795654, + "learning_rate": 0.00010816553831656361, + "loss": 1.9163, + "step": 26417 + }, + { + "epoch": 2.5394597712198403, + "grad_norm": 1.0558463335037231, + "learning_rate": 0.00010815362335814655, + "loss": 1.981, + "step": 26418 + }, + { + "epoch": 2.5395558973373067, + "grad_norm": 1.127908706665039, + "learning_rate": 0.00010814170870476392, + "loss": 2.0538, + "step": 26419 + }, + { + "epoch": 2.5396520234547726, + "grad_norm": 1.3539880514144897, + "learning_rate": 0.00010812979435649961, + "loss": 2.0589, + "step": 26420 + }, + { + "epoch": 2.539748149572239, + "grad_norm": 1.1695387363433838, + "learning_rate": 0.00010811788031343743, + "loss": 2.0616, + "step": 26421 + }, + { + "epoch": 2.539844275689705, + "grad_norm": 1.137434959411621, + "learning_rate": 0.00010810596657566125, + "loss": 1.9743, + "step": 26422 + }, + { + "epoch": 2.539940401807171, + "grad_norm": 1.180977463722229, + "learning_rate": 0.0001080940531432549, + "loss": 2.0487, + "step": 26423 + }, + { + "epoch": 2.540036527924637, + "grad_norm": 1.297582745552063, + "learning_rate": 0.0001080821400163022, + "loss": 1.9236, + "step": 26424 + }, + { + "epoch": 2.540132654042103, + "grad_norm": 1.227787733078003, + "learning_rate": 0.00010807022719488702, + "loss": 1.7523, + "step": 26425 + }, + { + "epoch": 2.5402287801595693, + "grad_norm": 0.9104532599449158, + "learning_rate": 0.00010805831467909313, + "loss": 1.9855, + "step": 26426 + }, + { + "epoch": 2.5403249062770357, + "grad_norm": 1.4028724431991577, + "learning_rate": 0.00010804640246900441, + "loss": 2.2464, + "step": 26427 + }, + { + "epoch": 2.5404210323945016, + "grad_norm": 1.2670103311538696, + "learning_rate": 0.00010803449056470468, + "loss": 1.9413, + "step": 26428 + }, + { + "epoch": 2.5405171585119675, + "grad_norm": 1.0830365419387817, + "learning_rate": 0.00010802257896627776, + "loss": 2.0437, + "step": 26429 + }, + { + "epoch": 2.540613284629434, + "grad_norm": 1.0889972448349, + "learning_rate": 0.00010801066767380748, + "loss": 2.0294, + "step": 26430 + }, + { + "epoch": 2.5407094107469, + "grad_norm": 1.025262713432312, + "learning_rate": 0.00010799875668737767, + "loss": 2.011, + "step": 26431 + }, + { + "epoch": 2.540805536864366, + "grad_norm": 0.9091854095458984, + "learning_rate": 0.00010798684600707207, + "loss": 1.7081, + "step": 26432 + }, + { + "epoch": 2.540901662981832, + "grad_norm": 1.1193827390670776, + "learning_rate": 0.00010797493563297463, + "loss": 2.0134, + "step": 26433 + }, + { + "epoch": 2.5409977890992983, + "grad_norm": 1.2456305027008057, + "learning_rate": 0.00010796302556516908, + "loss": 2.0409, + "step": 26434 + }, + { + "epoch": 2.5410939152167646, + "grad_norm": 1.1138349771499634, + "learning_rate": 0.00010795111580373925, + "loss": 2.0508, + "step": 26435 + }, + { + "epoch": 2.5411900413342305, + "grad_norm": 1.1542924642562866, + "learning_rate": 0.00010793920634876896, + "loss": 1.945, + "step": 26436 + }, + { + "epoch": 2.5412861674516964, + "grad_norm": 1.3258235454559326, + "learning_rate": 0.000107927297200342, + "loss": 2.0574, + "step": 26437 + }, + { + "epoch": 2.541382293569163, + "grad_norm": 1.0364038944244385, + "learning_rate": 0.00010791538835854218, + "loss": 2.0039, + "step": 26438 + }, + { + "epoch": 2.541478419686629, + "grad_norm": 1.3305671215057373, + "learning_rate": 0.00010790347982345332, + "loss": 2.0295, + "step": 26439 + }, + { + "epoch": 2.541574545804095, + "grad_norm": 1.0004817247390747, + "learning_rate": 0.00010789157159515923, + "loss": 1.9329, + "step": 26440 + }, + { + "epoch": 2.541670671921561, + "grad_norm": 1.1581788063049316, + "learning_rate": 0.00010787966367374365, + "loss": 2.029, + "step": 26441 + }, + { + "epoch": 2.5417667980390273, + "grad_norm": 1.2811107635498047, + "learning_rate": 0.00010786775605929045, + "loss": 2.0465, + "step": 26442 + }, + { + "epoch": 2.541862924156493, + "grad_norm": 1.2081575393676758, + "learning_rate": 0.0001078558487518834, + "loss": 1.9988, + "step": 26443 + }, + { + "epoch": 2.5419590502739595, + "grad_norm": 1.1457258462905884, + "learning_rate": 0.00010784394175160627, + "loss": 1.8299, + "step": 26444 + }, + { + "epoch": 2.5420551763914254, + "grad_norm": 1.0825086832046509, + "learning_rate": 0.00010783203505854288, + "loss": 1.8672, + "step": 26445 + }, + { + "epoch": 2.5421513025088918, + "grad_norm": 1.0694503784179688, + "learning_rate": 0.00010782012867277702, + "loss": 2.103, + "step": 26446 + }, + { + "epoch": 2.5422474286263577, + "grad_norm": 1.1361756324768066, + "learning_rate": 0.00010780822259439244, + "loss": 2.0708, + "step": 26447 + }, + { + "epoch": 2.542343554743824, + "grad_norm": 1.1771811246871948, + "learning_rate": 0.00010779631682347296, + "loss": 2.1232, + "step": 26448 + }, + { + "epoch": 2.54243968086129, + "grad_norm": 1.1653213500976562, + "learning_rate": 0.00010778441136010234, + "loss": 1.974, + "step": 26449 + }, + { + "epoch": 2.5425358069787563, + "grad_norm": 0.9693575501441956, + "learning_rate": 0.00010777250620436438, + "loss": 2.007, + "step": 26450 + }, + { + "epoch": 2.542631933096222, + "grad_norm": 1.1059075593948364, + "learning_rate": 0.00010776060135634287, + "loss": 1.9088, + "step": 26451 + }, + { + "epoch": 2.5427280592136885, + "grad_norm": 1.1422916650772095, + "learning_rate": 0.00010774869681612157, + "loss": 2.2362, + "step": 26452 + }, + { + "epoch": 2.5428241853311544, + "grad_norm": 1.0697251558303833, + "learning_rate": 0.00010773679258378422, + "loss": 2.0829, + "step": 26453 + }, + { + "epoch": 2.5429203114486207, + "grad_norm": 1.2777420282363892, + "learning_rate": 0.00010772488865941466, + "loss": 1.9399, + "step": 26454 + }, + { + "epoch": 2.5430164375660866, + "grad_norm": 1.2364795207977295, + "learning_rate": 0.00010771298504309663, + "loss": 2.1619, + "step": 26455 + }, + { + "epoch": 2.543112563683553, + "grad_norm": 1.1376590728759766, + "learning_rate": 0.00010770108173491384, + "loss": 1.8293, + "step": 26456 + }, + { + "epoch": 2.543208689801019, + "grad_norm": 1.069011926651001, + "learning_rate": 0.00010768917873495013, + "loss": 1.909, + "step": 26457 + }, + { + "epoch": 2.543304815918485, + "grad_norm": 1.2567166090011597, + "learning_rate": 0.00010767727604328924, + "loss": 2.0378, + "step": 26458 + }, + { + "epoch": 2.543400942035951, + "grad_norm": 1.133927822113037, + "learning_rate": 0.00010766537366001495, + "loss": 1.8649, + "step": 26459 + }, + { + "epoch": 2.5434970681534175, + "grad_norm": 1.1084389686584473, + "learning_rate": 0.000107653471585211, + "loss": 2.0507, + "step": 26460 + }, + { + "epoch": 2.5435931942708834, + "grad_norm": 1.1229299306869507, + "learning_rate": 0.00010764156981896112, + "loss": 2.0879, + "step": 26461 + }, + { + "epoch": 2.5436893203883493, + "grad_norm": 1.008509635925293, + "learning_rate": 0.00010762966836134909, + "loss": 1.9285, + "step": 26462 + }, + { + "epoch": 2.5437854465058156, + "grad_norm": 1.1135410070419312, + "learning_rate": 0.00010761776721245867, + "loss": 1.9696, + "step": 26463 + }, + { + "epoch": 2.543881572623282, + "grad_norm": 0.948769748210907, + "learning_rate": 0.0001076058663723736, + "loss": 1.8128, + "step": 26464 + }, + { + "epoch": 2.543977698740748, + "grad_norm": 1.056174635887146, + "learning_rate": 0.00010759396584117762, + "loss": 1.9514, + "step": 26465 + }, + { + "epoch": 2.5440738248582138, + "grad_norm": 1.3140243291854858, + "learning_rate": 0.00010758206561895447, + "loss": 2.1464, + "step": 26466 + }, + { + "epoch": 2.54416995097568, + "grad_norm": 1.2749791145324707, + "learning_rate": 0.00010757016570578792, + "loss": 2.1116, + "step": 26467 + }, + { + "epoch": 2.5442660770931464, + "grad_norm": 1.008447289466858, + "learning_rate": 0.00010755826610176174, + "loss": 1.9204, + "step": 26468 + }, + { + "epoch": 2.5443622032106123, + "grad_norm": 1.3397490978240967, + "learning_rate": 0.0001075463668069596, + "loss": 1.9572, + "step": 26469 + }, + { + "epoch": 2.5444583293280782, + "grad_norm": 1.3686414957046509, + "learning_rate": 0.00010753446782146525, + "loss": 1.9391, + "step": 26470 + }, + { + "epoch": 2.5445544554455446, + "grad_norm": 1.1858404874801636, + "learning_rate": 0.00010752256914536246, + "loss": 1.8873, + "step": 26471 + }, + { + "epoch": 2.544650581563011, + "grad_norm": 1.0024102926254272, + "learning_rate": 0.00010751067077873494, + "loss": 1.8981, + "step": 26472 + }, + { + "epoch": 2.544746707680477, + "grad_norm": 0.9926580786705017, + "learning_rate": 0.00010749877272166643, + "loss": 2.2011, + "step": 26473 + }, + { + "epoch": 2.5448428337979427, + "grad_norm": 1.0336496829986572, + "learning_rate": 0.00010748687497424066, + "loss": 1.8853, + "step": 26474 + }, + { + "epoch": 2.544938959915409, + "grad_norm": 0.9889883995056152, + "learning_rate": 0.00010747497753654133, + "loss": 2.0757, + "step": 26475 + }, + { + "epoch": 2.545035086032875, + "grad_norm": 1.1026568412780762, + "learning_rate": 0.0001074630804086522, + "loss": 1.9367, + "step": 26476 + }, + { + "epoch": 2.5451312121503413, + "grad_norm": 1.1084275245666504, + "learning_rate": 0.00010745118359065696, + "loss": 1.9442, + "step": 26477 + }, + { + "epoch": 2.5452273382678072, + "grad_norm": 1.1051265001296997, + "learning_rate": 0.00010743928708263936, + "loss": 2.0307, + "step": 26478 + }, + { + "epoch": 2.5453234643852736, + "grad_norm": 1.2865616083145142, + "learning_rate": 0.00010742739088468307, + "loss": 1.9618, + "step": 26479 + }, + { + "epoch": 2.5454195905027395, + "grad_norm": 1.0318844318389893, + "learning_rate": 0.00010741549499687186, + "loss": 1.8032, + "step": 26480 + }, + { + "epoch": 2.545515716620206, + "grad_norm": 1.119508147239685, + "learning_rate": 0.0001074035994192894, + "loss": 1.9076, + "step": 26481 + }, + { + "epoch": 2.5456118427376717, + "grad_norm": 1.0161799192428589, + "learning_rate": 0.00010739170415201944, + "loss": 2.072, + "step": 26482 + }, + { + "epoch": 2.545707968855138, + "grad_norm": 1.3099175691604614, + "learning_rate": 0.00010737980919514564, + "loss": 1.9536, + "step": 26483 + }, + { + "epoch": 2.545804094972604, + "grad_norm": 1.297607183456421, + "learning_rate": 0.00010736791454875174, + "loss": 2.1833, + "step": 26484 + }, + { + "epoch": 2.5459002210900703, + "grad_norm": 0.9982551336288452, + "learning_rate": 0.00010735602021292147, + "loss": 1.8206, + "step": 26485 + }, + { + "epoch": 2.545996347207536, + "grad_norm": 1.0941507816314697, + "learning_rate": 0.00010734412618773848, + "loss": 1.9624, + "step": 26486 + }, + { + "epoch": 2.5460924733250025, + "grad_norm": 1.0624291896820068, + "learning_rate": 0.00010733223247328649, + "loss": 1.907, + "step": 26487 + }, + { + "epoch": 2.5461885994424684, + "grad_norm": 1.208025574684143, + "learning_rate": 0.00010732033906964923, + "loss": 2.0136, + "step": 26488 + }, + { + "epoch": 2.546284725559935, + "grad_norm": 0.9504314661026001, + "learning_rate": 0.00010730844597691033, + "loss": 1.8927, + "step": 26489 + }, + { + "epoch": 2.5463808516774007, + "grad_norm": 1.2148653268814087, + "learning_rate": 0.00010729655319515353, + "loss": 2.0646, + "step": 26490 + }, + { + "epoch": 2.5464769777948666, + "grad_norm": 1.2403175830841064, + "learning_rate": 0.00010728466072446251, + "loss": 2.0803, + "step": 26491 + }, + { + "epoch": 2.546573103912333, + "grad_norm": 1.1160295009613037, + "learning_rate": 0.00010727276856492095, + "loss": 1.9848, + "step": 26492 + }, + { + "epoch": 2.5466692300297993, + "grad_norm": 1.11397385597229, + "learning_rate": 0.00010726087671661256, + "loss": 2.0683, + "step": 26493 + }, + { + "epoch": 2.546765356147265, + "grad_norm": 1.3047834634780884, + "learning_rate": 0.000107248985179621, + "loss": 2.1292, + "step": 26494 + }, + { + "epoch": 2.546861482264731, + "grad_norm": 1.0271122455596924, + "learning_rate": 0.00010723709395402998, + "loss": 2.0314, + "step": 26495 + }, + { + "epoch": 2.5469576083821974, + "grad_norm": 1.1176940202713013, + "learning_rate": 0.00010722520303992315, + "loss": 1.9935, + "step": 26496 + }, + { + "epoch": 2.5470537344996638, + "grad_norm": 1.1720303297042847, + "learning_rate": 0.00010721331243738421, + "loss": 2.0042, + "step": 26497 + }, + { + "epoch": 2.5471498606171297, + "grad_norm": 1.23979651927948, + "learning_rate": 0.00010720142214649679, + "loss": 2.1269, + "step": 26498 + }, + { + "epoch": 2.5472459867345956, + "grad_norm": 1.133212685585022, + "learning_rate": 0.00010718953216734462, + "loss": 1.8816, + "step": 26499 + }, + { + "epoch": 2.547342112852062, + "grad_norm": 1.1611411571502686, + "learning_rate": 0.00010717764250001132, + "loss": 2.0605, + "step": 26500 + }, + { + "epoch": 2.5474382389695283, + "grad_norm": 1.3255773782730103, + "learning_rate": 0.00010716575314458063, + "loss": 2.1406, + "step": 26501 + }, + { + "epoch": 2.547534365086994, + "grad_norm": 1.269155740737915, + "learning_rate": 0.00010715386410113618, + "loss": 1.9733, + "step": 26502 + }, + { + "epoch": 2.54763049120446, + "grad_norm": 1.1191881895065308, + "learning_rate": 0.00010714197536976161, + "loss": 2.1865, + "step": 26503 + }, + { + "epoch": 2.5477266173219264, + "grad_norm": 1.0905935764312744, + "learning_rate": 0.00010713008695054061, + "loss": 2.1892, + "step": 26504 + }, + { + "epoch": 2.5478227434393927, + "grad_norm": 1.0258409976959229, + "learning_rate": 0.00010711819884355685, + "loss": 1.84, + "step": 26505 + }, + { + "epoch": 2.5479188695568586, + "grad_norm": 1.238076090812683, + "learning_rate": 0.00010710631104889397, + "loss": 2.0924, + "step": 26506 + }, + { + "epoch": 2.5480149956743245, + "grad_norm": 1.071506142616272, + "learning_rate": 0.00010709442356663562, + "loss": 2.0826, + "step": 26507 + }, + { + "epoch": 2.548111121791791, + "grad_norm": 1.1260857582092285, + "learning_rate": 0.00010708253639686547, + "loss": 1.9388, + "step": 26508 + }, + { + "epoch": 2.548207247909257, + "grad_norm": 1.0987423658370972, + "learning_rate": 0.00010707064953966717, + "loss": 2.0789, + "step": 26509 + }, + { + "epoch": 2.548303374026723, + "grad_norm": 1.1069985628128052, + "learning_rate": 0.00010705876299512436, + "loss": 2.0368, + "step": 26510 + }, + { + "epoch": 2.548399500144189, + "grad_norm": 1.0776554346084595, + "learning_rate": 0.0001070468767633207, + "loss": 2.0148, + "step": 26511 + }, + { + "epoch": 2.5484956262616554, + "grad_norm": 1.166542887687683, + "learning_rate": 0.0001070349908443398, + "loss": 2.0422, + "step": 26512 + }, + { + "epoch": 2.5485917523791213, + "grad_norm": 1.1513439416885376, + "learning_rate": 0.00010702310523826536, + "loss": 2.0243, + "step": 26513 + }, + { + "epoch": 2.5486878784965876, + "grad_norm": 1.00316321849823, + "learning_rate": 0.00010701121994518097, + "loss": 1.9483, + "step": 26514 + }, + { + "epoch": 2.5487840046140535, + "grad_norm": 1.0525420904159546, + "learning_rate": 0.0001069993349651703, + "loss": 1.7096, + "step": 26515 + }, + { + "epoch": 2.54888013073152, + "grad_norm": 1.3717570304870605, + "learning_rate": 0.00010698745029831697, + "loss": 2.1813, + "step": 26516 + }, + { + "epoch": 2.5489762568489858, + "grad_norm": 0.9731457233428955, + "learning_rate": 0.00010697556594470456, + "loss": 1.9167, + "step": 26517 + }, + { + "epoch": 2.549072382966452, + "grad_norm": 1.0196304321289062, + "learning_rate": 0.00010696368190441684, + "loss": 1.7592, + "step": 26518 + }, + { + "epoch": 2.549168509083918, + "grad_norm": 1.0371745824813843, + "learning_rate": 0.00010695179817753734, + "loss": 1.952, + "step": 26519 + }, + { + "epoch": 2.5492646352013844, + "grad_norm": 1.0419316291809082, + "learning_rate": 0.00010693991476414971, + "loss": 2.0363, + "step": 26520 + }, + { + "epoch": 2.5493607613188503, + "grad_norm": 1.2027156352996826, + "learning_rate": 0.00010692803166433756, + "loss": 2.1001, + "step": 26521 + }, + { + "epoch": 2.5494568874363166, + "grad_norm": 1.0092113018035889, + "learning_rate": 0.00010691614887818454, + "loss": 2.0921, + "step": 26522 + }, + { + "epoch": 2.5495530135537825, + "grad_norm": 1.1355350017547607, + "learning_rate": 0.00010690426640577426, + "loss": 1.8292, + "step": 26523 + }, + { + "epoch": 2.549649139671249, + "grad_norm": 1.0486749410629272, + "learning_rate": 0.00010689238424719032, + "loss": 1.9664, + "step": 26524 + }, + { + "epoch": 2.5497452657887147, + "grad_norm": 1.2785775661468506, + "learning_rate": 0.00010688050240251638, + "loss": 2.1096, + "step": 26525 + }, + { + "epoch": 2.549841391906181, + "grad_norm": 1.099959373474121, + "learning_rate": 0.000106868620871836, + "loss": 1.9856, + "step": 26526 + }, + { + "epoch": 2.549937518023647, + "grad_norm": 1.2633857727050781, + "learning_rate": 0.00010685673965523284, + "loss": 2.048, + "step": 26527 + }, + { + "epoch": 2.550033644141113, + "grad_norm": 1.0954225063323975, + "learning_rate": 0.0001068448587527905, + "loss": 1.876, + "step": 26528 + }, + { + "epoch": 2.5501297702585792, + "grad_norm": 1.010472297668457, + "learning_rate": 0.00010683297816459253, + "loss": 1.9527, + "step": 26529 + }, + { + "epoch": 2.5502258963760456, + "grad_norm": 1.1434576511383057, + "learning_rate": 0.0001068210978907226, + "loss": 2.1422, + "step": 26530 + }, + { + "epoch": 2.5503220224935115, + "grad_norm": 1.1656831502914429, + "learning_rate": 0.00010680921793126432, + "loss": 2.0274, + "step": 26531 + }, + { + "epoch": 2.5504181486109774, + "grad_norm": 0.9521569013595581, + "learning_rate": 0.00010679733828630123, + "loss": 1.9952, + "step": 26532 + }, + { + "epoch": 2.5505142747284437, + "grad_norm": 1.1216639280319214, + "learning_rate": 0.00010678545895591697, + "loss": 2.0243, + "step": 26533 + }, + { + "epoch": 2.55061040084591, + "grad_norm": 1.0899609327316284, + "learning_rate": 0.00010677357994019513, + "loss": 2.1526, + "step": 26534 + }, + { + "epoch": 2.550706526963376, + "grad_norm": 1.2456077337265015, + "learning_rate": 0.00010676170123921931, + "loss": 2.1747, + "step": 26535 + }, + { + "epoch": 2.550802653080842, + "grad_norm": 1.1940044164657593, + "learning_rate": 0.00010674982285307308, + "loss": 1.9419, + "step": 26536 + }, + { + "epoch": 2.550898779198308, + "grad_norm": 1.3452203273773193, + "learning_rate": 0.00010673794478184007, + "loss": 2.0356, + "step": 26537 + }, + { + "epoch": 2.5509949053157746, + "grad_norm": 1.1069700717926025, + "learning_rate": 0.00010672606702560385, + "loss": 2.0564, + "step": 26538 + }, + { + "epoch": 2.5510910314332405, + "grad_norm": 1.0346596240997314, + "learning_rate": 0.00010671418958444796, + "loss": 2.0056, + "step": 26539 + }, + { + "epoch": 2.5511871575507064, + "grad_norm": 1.0664963722229004, + "learning_rate": 0.00010670231245845606, + "loss": 2.0863, + "step": 26540 + }, + { + "epoch": 2.5512832836681727, + "grad_norm": 1.1815214157104492, + "learning_rate": 0.00010669043564771168, + "loss": 2.0577, + "step": 26541 + }, + { + "epoch": 2.5513794097856386, + "grad_norm": 1.227607250213623, + "learning_rate": 0.00010667855915229842, + "loss": 2.1811, + "step": 26542 + }, + { + "epoch": 2.551475535903105, + "grad_norm": 1.0554012060165405, + "learning_rate": 0.00010666668297229983, + "loss": 1.9601, + "step": 26543 + }, + { + "epoch": 2.551571662020571, + "grad_norm": 1.028031349182129, + "learning_rate": 0.00010665480710779948, + "loss": 1.9694, + "step": 26544 + }, + { + "epoch": 2.551667788138037, + "grad_norm": 1.2527104616165161, + "learning_rate": 0.00010664293155888102, + "loss": 2.071, + "step": 26545 + }, + { + "epoch": 2.551763914255503, + "grad_norm": 1.2664148807525635, + "learning_rate": 0.00010663105632562792, + "loss": 2.2478, + "step": 26546 + }, + { + "epoch": 2.5518600403729694, + "grad_norm": 1.015048623085022, + "learning_rate": 0.00010661918140812382, + "loss": 2.0556, + "step": 26547 + }, + { + "epoch": 2.5519561664904353, + "grad_norm": 1.2017065286636353, + "learning_rate": 0.00010660730680645226, + "loss": 1.9513, + "step": 26548 + }, + { + "epoch": 2.5520522926079017, + "grad_norm": 1.2670542001724243, + "learning_rate": 0.0001065954325206968, + "loss": 2.0122, + "step": 26549 + }, + { + "epoch": 2.5521484187253676, + "grad_norm": 1.0642694234848022, + "learning_rate": 0.000106583558550941, + "loss": 2.0024, + "step": 26550 + }, + { + "epoch": 2.552244544842834, + "grad_norm": 1.2429436445236206, + "learning_rate": 0.00010657168489726844, + "loss": 1.9956, + "step": 26551 + }, + { + "epoch": 2.5523406709603, + "grad_norm": 1.0544297695159912, + "learning_rate": 0.00010655981155976261, + "loss": 2.1231, + "step": 26552 + }, + { + "epoch": 2.552436797077766, + "grad_norm": 0.998018741607666, + "learning_rate": 0.00010654793853850716, + "loss": 1.967, + "step": 26553 + }, + { + "epoch": 2.552532923195232, + "grad_norm": 1.017246961593628, + "learning_rate": 0.00010653606583358556, + "loss": 2.0559, + "step": 26554 + }, + { + "epoch": 2.5526290493126984, + "grad_norm": 1.2099120616912842, + "learning_rate": 0.00010652419344508142, + "loss": 1.9966, + "step": 26555 + }, + { + "epoch": 2.5527251754301643, + "grad_norm": 1.0157047510147095, + "learning_rate": 0.00010651232137307823, + "loss": 1.8301, + "step": 26556 + }, + { + "epoch": 2.5528213015476307, + "grad_norm": 1.080996036529541, + "learning_rate": 0.00010650044961765959, + "loss": 1.9892, + "step": 26557 + }, + { + "epoch": 2.5529174276650966, + "grad_norm": 1.314913034439087, + "learning_rate": 0.00010648857817890902, + "loss": 2.1467, + "step": 26558 + }, + { + "epoch": 2.553013553782563, + "grad_norm": 1.002534031867981, + "learning_rate": 0.00010647670705691005, + "loss": 1.9745, + "step": 26559 + }, + { + "epoch": 2.553109679900029, + "grad_norm": 1.058899164199829, + "learning_rate": 0.00010646483625174625, + "loss": 1.897, + "step": 26560 + }, + { + "epoch": 2.5532058060174947, + "grad_norm": 1.066092610359192, + "learning_rate": 0.00010645296576350109, + "loss": 1.77, + "step": 26561 + }, + { + "epoch": 2.553301932134961, + "grad_norm": 1.0698655843734741, + "learning_rate": 0.0001064410955922582, + "loss": 1.8904, + "step": 26562 + }, + { + "epoch": 2.5533980582524274, + "grad_norm": 1.1091591119766235, + "learning_rate": 0.00010642922573810104, + "loss": 1.9598, + "step": 26563 + }, + { + "epoch": 2.5534941843698933, + "grad_norm": 1.2193973064422607, + "learning_rate": 0.00010641735620111317, + "loss": 2.0514, + "step": 26564 + }, + { + "epoch": 2.553590310487359, + "grad_norm": 1.0423474311828613, + "learning_rate": 0.00010640548698137812, + "loss": 1.8806, + "step": 26565 + }, + { + "epoch": 2.5536864366048255, + "grad_norm": 1.1945996284484863, + "learning_rate": 0.00010639361807897939, + "loss": 2.0415, + "step": 26566 + }, + { + "epoch": 2.553782562722292, + "grad_norm": 1.1258082389831543, + "learning_rate": 0.00010638174949400054, + "loss": 1.9802, + "step": 26567 + }, + { + "epoch": 2.5538786888397578, + "grad_norm": 1.0911887884140015, + "learning_rate": 0.00010636988122652503, + "loss": 1.9644, + "step": 26568 + }, + { + "epoch": 2.5539748149572237, + "grad_norm": 1.1147119998931885, + "learning_rate": 0.00010635801327663646, + "loss": 2.0843, + "step": 26569 + }, + { + "epoch": 2.55407094107469, + "grad_norm": 1.0100291967391968, + "learning_rate": 0.00010634614564441828, + "loss": 1.7866, + "step": 26570 + }, + { + "epoch": 2.5541670671921564, + "grad_norm": 1.122625708580017, + "learning_rate": 0.00010633427832995403, + "loss": 2.0836, + "step": 26571 + }, + { + "epoch": 2.5542631933096223, + "grad_norm": 1.0405328273773193, + "learning_rate": 0.00010632241133332725, + "loss": 2.0465, + "step": 26572 + }, + { + "epoch": 2.554359319427088, + "grad_norm": 1.0085443258285522, + "learning_rate": 0.00010631054465462137, + "loss": 1.9379, + "step": 26573 + }, + { + "epoch": 2.5544554455445545, + "grad_norm": 1.1431920528411865, + "learning_rate": 0.00010629867829391999, + "loss": 1.7605, + "step": 26574 + }, + { + "epoch": 2.554551571662021, + "grad_norm": 1.1991233825683594, + "learning_rate": 0.00010628681225130654, + "loss": 2.0915, + "step": 26575 + }, + { + "epoch": 2.5546476977794867, + "grad_norm": 1.1831632852554321, + "learning_rate": 0.0001062749465268646, + "loss": 1.9496, + "step": 26576 + }, + { + "epoch": 2.5547438238969526, + "grad_norm": 1.2073768377304077, + "learning_rate": 0.0001062630811206776, + "loss": 1.9507, + "step": 26577 + }, + { + "epoch": 2.554839950014419, + "grad_norm": 1.1337229013442993, + "learning_rate": 0.00010625121603282903, + "loss": 1.9185, + "step": 26578 + }, + { + "epoch": 2.554936076131885, + "grad_norm": 1.2059146165847778, + "learning_rate": 0.00010623935126340244, + "loss": 2.066, + "step": 26579 + }, + { + "epoch": 2.5550322022493512, + "grad_norm": 1.2048407793045044, + "learning_rate": 0.00010622748681248134, + "loss": 2.0379, + "step": 26580 + }, + { + "epoch": 2.555128328366817, + "grad_norm": 1.1804633140563965, + "learning_rate": 0.00010621562268014915, + "loss": 1.8372, + "step": 26581 + }, + { + "epoch": 2.5552244544842835, + "grad_norm": 1.0557457208633423, + "learning_rate": 0.00010620375886648944, + "loss": 1.9353, + "step": 26582 + }, + { + "epoch": 2.5553205806017494, + "grad_norm": 1.0098414421081543, + "learning_rate": 0.00010619189537158563, + "loss": 1.8625, + "step": 26583 + }, + { + "epoch": 2.5554167067192157, + "grad_norm": 1.2814562320709229, + "learning_rate": 0.00010618003219552122, + "loss": 2.0631, + "step": 26584 + }, + { + "epoch": 2.5555128328366816, + "grad_norm": 1.0779494047164917, + "learning_rate": 0.00010616816933837972, + "loss": 1.8232, + "step": 26585 + }, + { + "epoch": 2.555608958954148, + "grad_norm": 1.1346014738082886, + "learning_rate": 0.00010615630680024457, + "loss": 1.9502, + "step": 26586 + }, + { + "epoch": 2.555705085071614, + "grad_norm": 1.0965640544891357, + "learning_rate": 0.00010614444458119931, + "loss": 1.8139, + "step": 26587 + }, + { + "epoch": 2.55580121118908, + "grad_norm": 1.4153096675872803, + "learning_rate": 0.00010613258268132735, + "loss": 2.1143, + "step": 26588 + }, + { + "epoch": 2.555897337306546, + "grad_norm": 1.257341980934143, + "learning_rate": 0.00010612072110071219, + "loss": 2.0156, + "step": 26589 + }, + { + "epoch": 2.5559934634240125, + "grad_norm": 0.9612097144126892, + "learning_rate": 0.00010610885983943732, + "loss": 2.008, + "step": 26590 + }, + { + "epoch": 2.5560895895414784, + "grad_norm": 1.175810694694519, + "learning_rate": 0.00010609699889758617, + "loss": 2.0782, + "step": 26591 + }, + { + "epoch": 2.5561857156589447, + "grad_norm": 1.058829665184021, + "learning_rate": 0.00010608513827524223, + "loss": 1.8902, + "step": 26592 + }, + { + "epoch": 2.5562818417764106, + "grad_norm": 0.9896097779273987, + "learning_rate": 0.00010607327797248898, + "loss": 2.0274, + "step": 26593 + }, + { + "epoch": 2.5563779678938765, + "grad_norm": 1.0828291177749634, + "learning_rate": 0.00010606141798940983, + "loss": 1.8713, + "step": 26594 + }, + { + "epoch": 2.556474094011343, + "grad_norm": 1.1509833335876465, + "learning_rate": 0.00010604955832608828, + "loss": 2.085, + "step": 26595 + }, + { + "epoch": 2.556570220128809, + "grad_norm": 0.9809055924415588, + "learning_rate": 0.00010603769898260783, + "loss": 2.0832, + "step": 26596 + }, + { + "epoch": 2.556666346246275, + "grad_norm": 0.9876195192337036, + "learning_rate": 0.00010602583995905187, + "loss": 1.8983, + "step": 26597 + }, + { + "epoch": 2.556762472363741, + "grad_norm": 1.0796809196472168, + "learning_rate": 0.00010601398125550388, + "loss": 2.0579, + "step": 26598 + }, + { + "epoch": 2.5568585984812073, + "grad_norm": 1.0034387111663818, + "learning_rate": 0.00010600212287204729, + "loss": 1.8737, + "step": 26599 + }, + { + "epoch": 2.5569547245986737, + "grad_norm": 1.0463086366653442, + "learning_rate": 0.00010599026480876555, + "loss": 1.9898, + "step": 26600 + }, + { + "epoch": 2.5570508507161396, + "grad_norm": 1.195807695388794, + "learning_rate": 0.00010597840706574216, + "loss": 1.9624, + "step": 26601 + }, + { + "epoch": 2.5571469768336055, + "grad_norm": 1.2406647205352783, + "learning_rate": 0.00010596654964306054, + "loss": 2.0238, + "step": 26602 + }, + { + "epoch": 2.557243102951072, + "grad_norm": 1.2141088247299194, + "learning_rate": 0.00010595469254080406, + "loss": 2.1285, + "step": 26603 + }, + { + "epoch": 2.557339229068538, + "grad_norm": 1.2236130237579346, + "learning_rate": 0.00010594283575905628, + "loss": 2.0126, + "step": 26604 + }, + { + "epoch": 2.557435355186004, + "grad_norm": 0.914088249206543, + "learning_rate": 0.00010593097929790054, + "loss": 1.7523, + "step": 26605 + }, + { + "epoch": 2.55753148130347, + "grad_norm": 1.157112717628479, + "learning_rate": 0.00010591912315742032, + "loss": 2.0668, + "step": 26606 + }, + { + "epoch": 2.5576276074209363, + "grad_norm": 1.1005669832229614, + "learning_rate": 0.00010590726733769903, + "loss": 1.9352, + "step": 26607 + }, + { + "epoch": 2.5577237335384027, + "grad_norm": 1.2029812335968018, + "learning_rate": 0.00010589541183882012, + "loss": 2.0769, + "step": 26608 + }, + { + "epoch": 2.5578198596558686, + "grad_norm": 1.1526150703430176, + "learning_rate": 0.00010588355666086702, + "loss": 2.1428, + "step": 26609 + }, + { + "epoch": 2.5579159857733345, + "grad_norm": 1.075269103050232, + "learning_rate": 0.00010587170180392315, + "loss": 1.8986, + "step": 26610 + }, + { + "epoch": 2.558012111890801, + "grad_norm": 1.1493388414382935, + "learning_rate": 0.00010585984726807195, + "loss": 1.9964, + "step": 26611 + }, + { + "epoch": 2.5581082380082667, + "grad_norm": 1.2366302013397217, + "learning_rate": 0.00010584799305339679, + "loss": 2.0679, + "step": 26612 + }, + { + "epoch": 2.558204364125733, + "grad_norm": 1.2652531862258911, + "learning_rate": 0.00010583613915998115, + "loss": 1.9751, + "step": 26613 + }, + { + "epoch": 2.558300490243199, + "grad_norm": 1.272829294204712, + "learning_rate": 0.00010582428558790845, + "loss": 1.999, + "step": 26614 + }, + { + "epoch": 2.5583966163606653, + "grad_norm": 1.025146484375, + "learning_rate": 0.00010581243233726204, + "loss": 1.8219, + "step": 26615 + }, + { + "epoch": 2.558492742478131, + "grad_norm": 1.0429739952087402, + "learning_rate": 0.0001058005794081254, + "loss": 1.9009, + "step": 26616 + }, + { + "epoch": 2.5585888685955975, + "grad_norm": 1.1916779279708862, + "learning_rate": 0.0001057887268005819, + "loss": 1.9501, + "step": 26617 + }, + { + "epoch": 2.5586849947130634, + "grad_norm": 0.9879503846168518, + "learning_rate": 0.00010577687451471498, + "loss": 1.9743, + "step": 26618 + }, + { + "epoch": 2.5587811208305298, + "grad_norm": 1.2370473146438599, + "learning_rate": 0.00010576502255060802, + "loss": 2.22, + "step": 26619 + }, + { + "epoch": 2.5588772469479957, + "grad_norm": 1.1416208744049072, + "learning_rate": 0.00010575317090834443, + "loss": 1.9257, + "step": 26620 + }, + { + "epoch": 2.558973373065462, + "grad_norm": 1.2283246517181396, + "learning_rate": 0.00010574131958800762, + "loss": 2.1913, + "step": 26621 + }, + { + "epoch": 2.559069499182928, + "grad_norm": 1.125930666923523, + "learning_rate": 0.00010572946858968098, + "loss": 2.07, + "step": 26622 + }, + { + "epoch": 2.5591656253003943, + "grad_norm": 1.090022087097168, + "learning_rate": 0.0001057176179134479, + "loss": 1.8772, + "step": 26623 + }, + { + "epoch": 2.55926175141786, + "grad_norm": 1.130383014678955, + "learning_rate": 0.0001057057675593918, + "loss": 1.9583, + "step": 26624 + }, + { + "epoch": 2.5593578775353265, + "grad_norm": 0.9740765690803528, + "learning_rate": 0.00010569391752759605, + "loss": 1.9534, + "step": 26625 + }, + { + "epoch": 2.5594540036527924, + "grad_norm": 1.121755599975586, + "learning_rate": 0.00010568206781814404, + "loss": 2.0277, + "step": 26626 + }, + { + "epoch": 2.5595501297702583, + "grad_norm": 1.1988170146942139, + "learning_rate": 0.00010567021843111919, + "loss": 1.9741, + "step": 26627 + }, + { + "epoch": 2.5596462558877247, + "grad_norm": 1.2176034450531006, + "learning_rate": 0.00010565836936660484, + "loss": 1.9531, + "step": 26628 + }, + { + "epoch": 2.559742382005191, + "grad_norm": 1.1923474073410034, + "learning_rate": 0.00010564652062468438, + "loss": 2.068, + "step": 26629 + }, + { + "epoch": 2.559838508122657, + "grad_norm": 1.1222611665725708, + "learning_rate": 0.00010563467220544124, + "loss": 1.9166, + "step": 26630 + }, + { + "epoch": 2.559934634240123, + "grad_norm": 1.0573174953460693, + "learning_rate": 0.00010562282410895875, + "loss": 1.992, + "step": 26631 + }, + { + "epoch": 2.560030760357589, + "grad_norm": 1.2029058933258057, + "learning_rate": 0.0001056109763353203, + "loss": 2.102, + "step": 26632 + }, + { + "epoch": 2.5601268864750555, + "grad_norm": 1.1158065795898438, + "learning_rate": 0.00010559912888460927, + "loss": 1.898, + "step": 26633 + }, + { + "epoch": 2.5602230125925214, + "grad_norm": 1.1340978145599365, + "learning_rate": 0.00010558728175690905, + "loss": 1.9699, + "step": 26634 + }, + { + "epoch": 2.5603191387099873, + "grad_norm": 1.1358393430709839, + "learning_rate": 0.00010557543495230298, + "loss": 2.0581, + "step": 26635 + }, + { + "epoch": 2.5604152648274536, + "grad_norm": 1.2212375402450562, + "learning_rate": 0.00010556358847087443, + "loss": 1.9788, + "step": 26636 + }, + { + "epoch": 2.56051139094492, + "grad_norm": 1.110861897468567, + "learning_rate": 0.00010555174231270678, + "loss": 2.0152, + "step": 26637 + }, + { + "epoch": 2.560607517062386, + "grad_norm": 1.098535180091858, + "learning_rate": 0.00010553989647788338, + "loss": 2.1287, + "step": 26638 + }, + { + "epoch": 2.5607036431798518, + "grad_norm": 1.087152361869812, + "learning_rate": 0.0001055280509664876, + "loss": 1.9897, + "step": 26639 + }, + { + "epoch": 2.560799769297318, + "grad_norm": 1.0727449655532837, + "learning_rate": 0.00010551620577860277, + "loss": 2.0189, + "step": 26640 + }, + { + "epoch": 2.5608958954147845, + "grad_norm": 1.1443325281143188, + "learning_rate": 0.00010550436091431231, + "loss": 1.9688, + "step": 26641 + }, + { + "epoch": 2.5609920215322504, + "grad_norm": 1.2726008892059326, + "learning_rate": 0.00010549251637369951, + "loss": 2.0265, + "step": 26642 + }, + { + "epoch": 2.5610881476497163, + "grad_norm": 1.165264368057251, + "learning_rate": 0.00010548067215684776, + "loss": 1.8597, + "step": 26643 + }, + { + "epoch": 2.5611842737671826, + "grad_norm": 1.2770427465438843, + "learning_rate": 0.00010546882826384037, + "loss": 2.109, + "step": 26644 + }, + { + "epoch": 2.5612803998846485, + "grad_norm": 1.1493418216705322, + "learning_rate": 0.00010545698469476074, + "loss": 1.937, + "step": 26645 + }, + { + "epoch": 2.561376526002115, + "grad_norm": 1.174609661102295, + "learning_rate": 0.00010544514144969215, + "loss": 2.0776, + "step": 26646 + }, + { + "epoch": 2.5614726521195808, + "grad_norm": 1.123721718788147, + "learning_rate": 0.000105433298528718, + "loss": 2.0195, + "step": 26647 + }, + { + "epoch": 2.561568778237047, + "grad_norm": 1.249485731124878, + "learning_rate": 0.00010542145593192162, + "loss": 2.0487, + "step": 26648 + }, + { + "epoch": 2.561664904354513, + "grad_norm": 1.1980311870574951, + "learning_rate": 0.00010540961365938632, + "loss": 1.8642, + "step": 26649 + }, + { + "epoch": 2.5617610304719793, + "grad_norm": 1.0659260749816895, + "learning_rate": 0.00010539777171119547, + "loss": 1.8266, + "step": 26650 + }, + { + "epoch": 2.5618571565894452, + "grad_norm": 1.1112033128738403, + "learning_rate": 0.00010538593008743239, + "loss": 2.0753, + "step": 26651 + }, + { + "epoch": 2.5619532827069116, + "grad_norm": 1.208418607711792, + "learning_rate": 0.0001053740887881804, + "loss": 2.0144, + "step": 26652 + }, + { + "epoch": 2.5620494088243775, + "grad_norm": 1.1137229204177856, + "learning_rate": 0.00010536224781352285, + "loss": 1.8663, + "step": 26653 + }, + { + "epoch": 2.562145534941844, + "grad_norm": 1.2064238786697388, + "learning_rate": 0.00010535040716354305, + "loss": 2.0179, + "step": 26654 + }, + { + "epoch": 2.5622416610593097, + "grad_norm": 1.125327229499817, + "learning_rate": 0.00010533856683832431, + "loss": 2.1426, + "step": 26655 + }, + { + "epoch": 2.562337787176776, + "grad_norm": 1.0950698852539062, + "learning_rate": 0.00010532672683795, + "loss": 1.8174, + "step": 26656 + }, + { + "epoch": 2.562433913294242, + "grad_norm": 1.0518825054168701, + "learning_rate": 0.0001053148871625034, + "loss": 1.8897, + "step": 26657 + }, + { + "epoch": 2.5625300394117083, + "grad_norm": 1.012621283531189, + "learning_rate": 0.00010530304781206783, + "loss": 1.97, + "step": 26658 + }, + { + "epoch": 2.562626165529174, + "grad_norm": 1.0712032318115234, + "learning_rate": 0.0001052912087867266, + "loss": 1.9956, + "step": 26659 + }, + { + "epoch": 2.56272229164664, + "grad_norm": 1.110304594039917, + "learning_rate": 0.00010527937008656307, + "loss": 1.9523, + "step": 26660 + }, + { + "epoch": 2.5628184177641065, + "grad_norm": 1.0578821897506714, + "learning_rate": 0.00010526753171166048, + "loss": 2.0122, + "step": 26661 + }, + { + "epoch": 2.562914543881573, + "grad_norm": 1.190301775932312, + "learning_rate": 0.0001052556936621022, + "loss": 2.2677, + "step": 26662 + }, + { + "epoch": 2.5630106699990387, + "grad_norm": 1.1523112058639526, + "learning_rate": 0.00010524385593797148, + "loss": 1.98, + "step": 26663 + }, + { + "epoch": 2.5631067961165046, + "grad_norm": 1.010854721069336, + "learning_rate": 0.00010523201853935167, + "loss": 2.0486, + "step": 26664 + }, + { + "epoch": 2.563202922233971, + "grad_norm": 1.0474469661712646, + "learning_rate": 0.00010522018146632605, + "loss": 1.8436, + "step": 26665 + }, + { + "epoch": 2.5632990483514373, + "grad_norm": 1.1802382469177246, + "learning_rate": 0.00010520834471897796, + "loss": 2.0224, + "step": 26666 + }, + { + "epoch": 2.563395174468903, + "grad_norm": 1.0730702877044678, + "learning_rate": 0.00010519650829739064, + "loss": 1.9854, + "step": 26667 + }, + { + "epoch": 2.563491300586369, + "grad_norm": 1.3250762224197388, + "learning_rate": 0.0001051846722016474, + "loss": 2.0591, + "step": 26668 + }, + { + "epoch": 2.5635874267038354, + "grad_norm": 1.119551420211792, + "learning_rate": 0.00010517283643183153, + "loss": 1.9996, + "step": 26669 + }, + { + "epoch": 2.563683552821302, + "grad_norm": 1.1487523317337036, + "learning_rate": 0.00010516100098802636, + "loss": 2.0987, + "step": 26670 + }, + { + "epoch": 2.5637796789387677, + "grad_norm": 1.0663084983825684, + "learning_rate": 0.00010514916587031509, + "loss": 1.9589, + "step": 26671 + }, + { + "epoch": 2.5638758050562336, + "grad_norm": 1.121482014656067, + "learning_rate": 0.00010513733107878112, + "loss": 1.9679, + "step": 26672 + }, + { + "epoch": 2.5639719311737, + "grad_norm": 1.1335474252700806, + "learning_rate": 0.00010512549661350768, + "loss": 1.9422, + "step": 26673 + }, + { + "epoch": 2.5640680572911663, + "grad_norm": 1.0418214797973633, + "learning_rate": 0.00010511366247457798, + "loss": 1.9433, + "step": 26674 + }, + { + "epoch": 2.564164183408632, + "grad_norm": 1.230681300163269, + "learning_rate": 0.00010510182866207541, + "loss": 1.9921, + "step": 26675 + }, + { + "epoch": 2.564260309526098, + "grad_norm": 1.1108450889587402, + "learning_rate": 0.00010508999517608319, + "loss": 2.2145, + "step": 26676 + }, + { + "epoch": 2.5643564356435644, + "grad_norm": 1.3009198904037476, + "learning_rate": 0.00010507816201668461, + "loss": 2.1412, + "step": 26677 + }, + { + "epoch": 2.5644525617610303, + "grad_norm": 1.0505565404891968, + "learning_rate": 0.00010506632918396293, + "loss": 2.2094, + "step": 26678 + }, + { + "epoch": 2.5645486878784967, + "grad_norm": 1.1591060161590576, + "learning_rate": 0.00010505449667800144, + "loss": 1.8997, + "step": 26679 + }, + { + "epoch": 2.5646448139959626, + "grad_norm": 1.1107587814331055, + "learning_rate": 0.00010504266449888332, + "loss": 2.1568, + "step": 26680 + }, + { + "epoch": 2.564740940113429, + "grad_norm": 1.1249194145202637, + "learning_rate": 0.00010503083264669195, + "loss": 2.0961, + "step": 26681 + }, + { + "epoch": 2.564837066230895, + "grad_norm": 1.0368136167526245, + "learning_rate": 0.00010501900112151056, + "loss": 2.0843, + "step": 26682 + }, + { + "epoch": 2.564933192348361, + "grad_norm": 1.227358341217041, + "learning_rate": 0.00010500716992342241, + "loss": 1.8879, + "step": 26683 + }, + { + "epoch": 2.565029318465827, + "grad_norm": 1.0076782703399658, + "learning_rate": 0.00010499533905251074, + "loss": 1.9844, + "step": 26684 + }, + { + "epoch": 2.5651254445832934, + "grad_norm": 0.9664653539657593, + "learning_rate": 0.0001049835085088588, + "loss": 1.9329, + "step": 26685 + }, + { + "epoch": 2.5652215707007593, + "grad_norm": 1.0522266626358032, + "learning_rate": 0.00010497167829254985, + "loss": 1.8699, + "step": 26686 + }, + { + "epoch": 2.5653176968182256, + "grad_norm": 1.0610358715057373, + "learning_rate": 0.00010495984840366717, + "loss": 1.9444, + "step": 26687 + }, + { + "epoch": 2.5654138229356915, + "grad_norm": 1.0856420993804932, + "learning_rate": 0.00010494801884229396, + "loss": 1.9315, + "step": 26688 + }, + { + "epoch": 2.565509949053158, + "grad_norm": 1.171402931213379, + "learning_rate": 0.0001049361896085135, + "loss": 1.9928, + "step": 26689 + }, + { + "epoch": 2.565606075170624, + "grad_norm": 1.0964646339416504, + "learning_rate": 0.00010492436070240905, + "loss": 2.2016, + "step": 26690 + }, + { + "epoch": 2.56570220128809, + "grad_norm": 1.0624014139175415, + "learning_rate": 0.00010491253212406381, + "loss": 1.9505, + "step": 26691 + }, + { + "epoch": 2.565798327405556, + "grad_norm": 1.0477737188339233, + "learning_rate": 0.00010490070387356101, + "loss": 1.8572, + "step": 26692 + }, + { + "epoch": 2.5658944535230224, + "grad_norm": 1.26865816116333, + "learning_rate": 0.00010488887595098394, + "loss": 2.0214, + "step": 26693 + }, + { + "epoch": 2.5659905796404883, + "grad_norm": 1.1183757781982422, + "learning_rate": 0.0001048770483564158, + "loss": 1.9966, + "step": 26694 + }, + { + "epoch": 2.5660867057579546, + "grad_norm": 1.0885273218154907, + "learning_rate": 0.00010486522108993982, + "loss": 1.9149, + "step": 26695 + }, + { + "epoch": 2.5661828318754205, + "grad_norm": 1.2364193201065063, + "learning_rate": 0.00010485339415163924, + "loss": 1.9722, + "step": 26696 + }, + { + "epoch": 2.5662789579928864, + "grad_norm": 1.1293597221374512, + "learning_rate": 0.00010484156754159728, + "loss": 1.9633, + "step": 26697 + }, + { + "epoch": 2.5663750841103528, + "grad_norm": 1.08078134059906, + "learning_rate": 0.0001048297412598972, + "loss": 1.8803, + "step": 26698 + }, + { + "epoch": 2.566471210227819, + "grad_norm": 1.27510666847229, + "learning_rate": 0.00010481791530662216, + "loss": 1.9848, + "step": 26699 + }, + { + "epoch": 2.566567336345285, + "grad_norm": 1.1302284002304077, + "learning_rate": 0.00010480608968185546, + "loss": 1.8816, + "step": 26700 + }, + { + "epoch": 2.566663462462751, + "grad_norm": 1.3026708364486694, + "learning_rate": 0.00010479426438568025, + "loss": 2.0681, + "step": 26701 + }, + { + "epoch": 2.5667595885802172, + "grad_norm": 1.09413480758667, + "learning_rate": 0.00010478243941817978, + "loss": 2.153, + "step": 26702 + }, + { + "epoch": 2.5668557146976836, + "grad_norm": 2.5858752727508545, + "learning_rate": 0.00010477061477943727, + "loss": 1.8828, + "step": 26703 + }, + { + "epoch": 2.5669518408151495, + "grad_norm": 1.2184090614318848, + "learning_rate": 0.00010475879046953591, + "loss": 1.8992, + "step": 26704 + }, + { + "epoch": 2.5670479669326154, + "grad_norm": 0.9885724186897278, + "learning_rate": 0.0001047469664885589, + "loss": 1.9768, + "step": 26705 + }, + { + "epoch": 2.5671440930500817, + "grad_norm": 1.1109819412231445, + "learning_rate": 0.00010473514283658949, + "loss": 1.9984, + "step": 26706 + }, + { + "epoch": 2.567240219167548, + "grad_norm": 0.9768446087837219, + "learning_rate": 0.00010472331951371084, + "loss": 1.9445, + "step": 26707 + }, + { + "epoch": 2.567336345285014, + "grad_norm": 1.3965896368026733, + "learning_rate": 0.00010471149652000616, + "loss": 2.363, + "step": 26708 + }, + { + "epoch": 2.56743247140248, + "grad_norm": 1.2359521389007568, + "learning_rate": 0.00010469967385555868, + "loss": 1.8777, + "step": 26709 + }, + { + "epoch": 2.567528597519946, + "grad_norm": 1.0129616260528564, + "learning_rate": 0.00010468785152045159, + "loss": 1.962, + "step": 26710 + }, + { + "epoch": 2.567624723637412, + "grad_norm": 1.0841084718704224, + "learning_rate": 0.00010467602951476806, + "loss": 2.0793, + "step": 26711 + }, + { + "epoch": 2.5677208497548785, + "grad_norm": 0.9727306365966797, + "learning_rate": 0.00010466420783859131, + "loss": 1.9181, + "step": 26712 + }, + { + "epoch": 2.5678169758723444, + "grad_norm": 1.2364319562911987, + "learning_rate": 0.00010465238649200447, + "loss": 2.1993, + "step": 26713 + }, + { + "epoch": 2.5679131019898107, + "grad_norm": 1.1168346405029297, + "learning_rate": 0.0001046405654750908, + "loss": 1.9951, + "step": 26714 + }, + { + "epoch": 2.5680092281072766, + "grad_norm": 1.0531041622161865, + "learning_rate": 0.00010462874478793348, + "loss": 1.8218, + "step": 26715 + }, + { + "epoch": 2.568105354224743, + "grad_norm": 1.2179206609725952, + "learning_rate": 0.00010461692443061568, + "loss": 2.2156, + "step": 26716 + }, + { + "epoch": 2.568201480342209, + "grad_norm": 1.1739799976348877, + "learning_rate": 0.00010460510440322054, + "loss": 2.1106, + "step": 26717 + }, + { + "epoch": 2.568297606459675, + "grad_norm": 1.1779251098632812, + "learning_rate": 0.0001045932847058313, + "loss": 2.0808, + "step": 26718 + }, + { + "epoch": 2.568393732577141, + "grad_norm": 1.0655286312103271, + "learning_rate": 0.00010458146533853114, + "loss": 1.9386, + "step": 26719 + }, + { + "epoch": 2.5684898586946074, + "grad_norm": 1.0201938152313232, + "learning_rate": 0.00010456964630140319, + "loss": 1.8758, + "step": 26720 + }, + { + "epoch": 2.5685859848120733, + "grad_norm": 1.045023798942566, + "learning_rate": 0.00010455782759453063, + "loss": 2.0862, + "step": 26721 + }, + { + "epoch": 2.5686821109295397, + "grad_norm": 1.1337794065475464, + "learning_rate": 0.00010454600921799666, + "loss": 1.9782, + "step": 26722 + }, + { + "epoch": 2.5687782370470056, + "grad_norm": 1.296898365020752, + "learning_rate": 0.00010453419117188442, + "loss": 1.9942, + "step": 26723 + }, + { + "epoch": 2.568874363164472, + "grad_norm": 1.217366337776184, + "learning_rate": 0.00010452237345627709, + "loss": 1.916, + "step": 26724 + }, + { + "epoch": 2.568970489281938, + "grad_norm": 1.0733146667480469, + "learning_rate": 0.00010451055607125781, + "loss": 1.9952, + "step": 26725 + }, + { + "epoch": 2.569066615399404, + "grad_norm": 1.112778663635254, + "learning_rate": 0.00010449873901690978, + "loss": 1.9701, + "step": 26726 + }, + { + "epoch": 2.56916274151687, + "grad_norm": 1.3096864223480225, + "learning_rate": 0.00010448692229331612, + "loss": 2.1297, + "step": 26727 + }, + { + "epoch": 2.5692588676343364, + "grad_norm": 1.0078996419906616, + "learning_rate": 0.00010447510590055997, + "loss": 1.9972, + "step": 26728 + }, + { + "epoch": 2.5693549937518023, + "grad_norm": 1.2830758094787598, + "learning_rate": 0.00010446328983872457, + "loss": 2.0659, + "step": 26729 + }, + { + "epoch": 2.569451119869268, + "grad_norm": 1.0057876110076904, + "learning_rate": 0.00010445147410789298, + "loss": 1.946, + "step": 26730 + }, + { + "epoch": 2.5695472459867346, + "grad_norm": 0.9730482697486877, + "learning_rate": 0.00010443965870814836, + "loss": 1.8498, + "step": 26731 + }, + { + "epoch": 2.569643372104201, + "grad_norm": 1.0662813186645508, + "learning_rate": 0.00010442784363957393, + "loss": 1.8472, + "step": 26732 + }, + { + "epoch": 2.569739498221667, + "grad_norm": 1.1842176914215088, + "learning_rate": 0.00010441602890225278, + "loss": 2.0708, + "step": 26733 + }, + { + "epoch": 2.5698356243391327, + "grad_norm": 1.0581170320510864, + "learning_rate": 0.00010440421449626804, + "loss": 1.9485, + "step": 26734 + }, + { + "epoch": 2.569931750456599, + "grad_norm": 1.2144203186035156, + "learning_rate": 0.00010439240042170287, + "loss": 2.1235, + "step": 26735 + }, + { + "epoch": 2.5700278765740654, + "grad_norm": 1.1528751850128174, + "learning_rate": 0.00010438058667864042, + "loss": 1.8769, + "step": 26736 + }, + { + "epoch": 2.5701240026915313, + "grad_norm": 1.056777000427246, + "learning_rate": 0.00010436877326716379, + "loss": 2.03, + "step": 26737 + }, + { + "epoch": 2.570220128808997, + "grad_norm": 1.3442343473434448, + "learning_rate": 0.00010435696018735615, + "loss": 1.9468, + "step": 26738 + }, + { + "epoch": 2.5703162549264635, + "grad_norm": 1.2061240673065186, + "learning_rate": 0.00010434514743930063, + "loss": 2.1334, + "step": 26739 + }, + { + "epoch": 2.57041238104393, + "grad_norm": 1.2450371980667114, + "learning_rate": 0.0001043333350230803, + "loss": 2.078, + "step": 26740 + }, + { + "epoch": 2.570508507161396, + "grad_norm": 1.1645946502685547, + "learning_rate": 0.00010432152293877835, + "loss": 1.9415, + "step": 26741 + }, + { + "epoch": 2.5706046332788617, + "grad_norm": 1.1273046731948853, + "learning_rate": 0.00010430971118647787, + "loss": 1.8072, + "step": 26742 + }, + { + "epoch": 2.570700759396328, + "grad_norm": 1.1125261783599854, + "learning_rate": 0.00010429789976626199, + "loss": 1.9107, + "step": 26743 + }, + { + "epoch": 2.5707968855137944, + "grad_norm": 1.0759711265563965, + "learning_rate": 0.00010428608867821384, + "loss": 1.9952, + "step": 26744 + }, + { + "epoch": 2.5708930116312603, + "grad_norm": 1.1809799671173096, + "learning_rate": 0.00010427427792241651, + "loss": 1.9398, + "step": 26745 + }, + { + "epoch": 2.570989137748726, + "grad_norm": 1.2077327966690063, + "learning_rate": 0.00010426246749895312, + "loss": 2.1091, + "step": 26746 + }, + { + "epoch": 2.5710852638661925, + "grad_norm": 1.0728014707565308, + "learning_rate": 0.00010425065740790681, + "loss": 2.0206, + "step": 26747 + }, + { + "epoch": 2.5711813899836584, + "grad_norm": 1.0797210931777954, + "learning_rate": 0.00010423884764936066, + "loss": 1.9008, + "step": 26748 + }, + { + "epoch": 2.5712775161011248, + "grad_norm": 1.2298228740692139, + "learning_rate": 0.00010422703822339776, + "loss": 2.1961, + "step": 26749 + }, + { + "epoch": 2.5713736422185907, + "grad_norm": 1.186780571937561, + "learning_rate": 0.00010421522913010128, + "loss": 1.7378, + "step": 26750 + }, + { + "epoch": 2.571469768336057, + "grad_norm": 1.1693192720413208, + "learning_rate": 0.00010420342036955429, + "loss": 2.0643, + "step": 26751 + }, + { + "epoch": 2.571565894453523, + "grad_norm": 1.0893325805664062, + "learning_rate": 0.00010419161194183986, + "loss": 2.1572, + "step": 26752 + }, + { + "epoch": 2.5716620205709892, + "grad_norm": 1.1435017585754395, + "learning_rate": 0.00010417980384704112, + "loss": 2.0812, + "step": 26753 + }, + { + "epoch": 2.571758146688455, + "grad_norm": 1.089245319366455, + "learning_rate": 0.00010416799608524117, + "loss": 1.7995, + "step": 26754 + }, + { + "epoch": 2.5718542728059215, + "grad_norm": 1.16781747341156, + "learning_rate": 0.00010415618865652307, + "loss": 2.1003, + "step": 26755 + }, + { + "epoch": 2.5719503989233874, + "grad_norm": 1.0891283750534058, + "learning_rate": 0.00010414438156096992, + "loss": 2.1632, + "step": 26756 + }, + { + "epoch": 2.5720465250408537, + "grad_norm": 1.126325249671936, + "learning_rate": 0.00010413257479866484, + "loss": 2.0009, + "step": 26757 + }, + { + "epoch": 2.5721426511583196, + "grad_norm": 0.9442245364189148, + "learning_rate": 0.00010412076836969088, + "loss": 2.0753, + "step": 26758 + }, + { + "epoch": 2.572238777275786, + "grad_norm": 1.2551813125610352, + "learning_rate": 0.00010410896227413115, + "loss": 1.9171, + "step": 26759 + }, + { + "epoch": 2.572334903393252, + "grad_norm": 1.1598888635635376, + "learning_rate": 0.00010409715651206872, + "loss": 2.0184, + "step": 26760 + }, + { + "epoch": 2.5724310295107182, + "grad_norm": 1.0086588859558105, + "learning_rate": 0.00010408535108358663, + "loss": 1.7189, + "step": 26761 + }, + { + "epoch": 2.572527155628184, + "grad_norm": 1.226473093032837, + "learning_rate": 0.00010407354598876804, + "loss": 2.0297, + "step": 26762 + }, + { + "epoch": 2.57262328174565, + "grad_norm": 1.0957247018814087, + "learning_rate": 0.00010406174122769597, + "loss": 1.8558, + "step": 26763 + }, + { + "epoch": 2.5727194078631164, + "grad_norm": 1.1918625831604004, + "learning_rate": 0.00010404993680045347, + "loss": 2.1316, + "step": 26764 + }, + { + "epoch": 2.5728155339805827, + "grad_norm": 1.0206643342971802, + "learning_rate": 0.00010403813270712366, + "loss": 1.9801, + "step": 26765 + }, + { + "epoch": 2.5729116600980486, + "grad_norm": 1.0380420684814453, + "learning_rate": 0.00010402632894778955, + "loss": 1.9366, + "step": 26766 + }, + { + "epoch": 2.5730077862155145, + "grad_norm": 1.1725164651870728, + "learning_rate": 0.00010401452552253427, + "loss": 1.9382, + "step": 26767 + }, + { + "epoch": 2.573103912332981, + "grad_norm": 1.1033490896224976, + "learning_rate": 0.00010400272243144087, + "loss": 2.0585, + "step": 26768 + }, + { + "epoch": 2.573200038450447, + "grad_norm": 1.0375694036483765, + "learning_rate": 0.00010399091967459238, + "loss": 1.8346, + "step": 26769 + }, + { + "epoch": 2.573296164567913, + "grad_norm": 1.2000455856323242, + "learning_rate": 0.00010397911725207184, + "loss": 2.0771, + "step": 26770 + }, + { + "epoch": 2.573392290685379, + "grad_norm": 1.4199961423873901, + "learning_rate": 0.0001039673151639624, + "loss": 1.9247, + "step": 26771 + }, + { + "epoch": 2.5734884168028453, + "grad_norm": 1.0122164487838745, + "learning_rate": 0.00010395551341034701, + "loss": 1.9847, + "step": 26772 + }, + { + "epoch": 2.5735845429203117, + "grad_norm": 1.1874312162399292, + "learning_rate": 0.00010394371199130873, + "loss": 2.1745, + "step": 26773 + }, + { + "epoch": 2.5736806690377776, + "grad_norm": 1.1727715730667114, + "learning_rate": 0.00010393191090693067, + "loss": 2.1155, + "step": 26774 + }, + { + "epoch": 2.5737767951552435, + "grad_norm": 1.1103941202163696, + "learning_rate": 0.00010392011015729584, + "loss": 2.0781, + "step": 26775 + }, + { + "epoch": 2.57387292127271, + "grad_norm": 1.1771328449249268, + "learning_rate": 0.0001039083097424873, + "loss": 1.7813, + "step": 26776 + }, + { + "epoch": 2.573969047390176, + "grad_norm": 1.1683560609817505, + "learning_rate": 0.00010389650966258806, + "loss": 2.0067, + "step": 26777 + }, + { + "epoch": 2.574065173507642, + "grad_norm": 1.044822335243225, + "learning_rate": 0.0001038847099176812, + "loss": 2.0549, + "step": 26778 + }, + { + "epoch": 2.574161299625108, + "grad_norm": 1.1144713163375854, + "learning_rate": 0.00010387291050784968, + "loss": 1.8325, + "step": 26779 + }, + { + "epoch": 2.5742574257425743, + "grad_norm": 1.2018040418624878, + "learning_rate": 0.00010386111143317664, + "loss": 1.9869, + "step": 26780 + }, + { + "epoch": 2.5743535518600402, + "grad_norm": 1.0869977474212646, + "learning_rate": 0.00010384931269374503, + "loss": 1.9877, + "step": 26781 + }, + { + "epoch": 2.5744496779775066, + "grad_norm": 1.2141592502593994, + "learning_rate": 0.00010383751428963792, + "loss": 1.9813, + "step": 26782 + }, + { + "epoch": 2.5745458040949725, + "grad_norm": 1.168933629989624, + "learning_rate": 0.00010382571622093829, + "loss": 1.9083, + "step": 26783 + }, + { + "epoch": 2.574641930212439, + "grad_norm": 1.156440258026123, + "learning_rate": 0.00010381391848772924, + "loss": 1.9426, + "step": 26784 + }, + { + "epoch": 2.5747380563299047, + "grad_norm": 1.2130646705627441, + "learning_rate": 0.00010380212109009373, + "loss": 1.8998, + "step": 26785 + }, + { + "epoch": 2.574834182447371, + "grad_norm": 1.114393711090088, + "learning_rate": 0.00010379032402811483, + "loss": 1.9612, + "step": 26786 + }, + { + "epoch": 2.574930308564837, + "grad_norm": 1.2384368181228638, + "learning_rate": 0.00010377852730187553, + "loss": 2.1808, + "step": 26787 + }, + { + "epoch": 2.5750264346823033, + "grad_norm": 1.2928541898727417, + "learning_rate": 0.00010376673091145882, + "loss": 2.0292, + "step": 26788 + }, + { + "epoch": 2.575122560799769, + "grad_norm": 0.9841133952140808, + "learning_rate": 0.00010375493485694774, + "loss": 2.0326, + "step": 26789 + }, + { + "epoch": 2.5752186869172355, + "grad_norm": 1.0844049453735352, + "learning_rate": 0.00010374313913842531, + "loss": 1.936, + "step": 26790 + }, + { + "epoch": 2.5753148130347014, + "grad_norm": 1.0631132125854492, + "learning_rate": 0.00010373134375597452, + "loss": 2.0326, + "step": 26791 + }, + { + "epoch": 2.575410939152168, + "grad_norm": 1.1514407396316528, + "learning_rate": 0.00010371954870967838, + "loss": 1.9019, + "step": 26792 + }, + { + "epoch": 2.5755070652696337, + "grad_norm": 1.2622324228286743, + "learning_rate": 0.0001037077539996199, + "loss": 2.092, + "step": 26793 + }, + { + "epoch": 2.5756031913871, + "grad_norm": 1.050207257270813, + "learning_rate": 0.00010369595962588207, + "loss": 1.9432, + "step": 26794 + }, + { + "epoch": 2.575699317504566, + "grad_norm": 1.1298948526382446, + "learning_rate": 0.00010368416558854791, + "loss": 1.9231, + "step": 26795 + }, + { + "epoch": 2.575795443622032, + "grad_norm": 1.1787710189819336, + "learning_rate": 0.00010367237188770038, + "loss": 1.9925, + "step": 26796 + }, + { + "epoch": 2.575891569739498, + "grad_norm": 1.1839467287063599, + "learning_rate": 0.00010366057852342252, + "loss": 1.9312, + "step": 26797 + }, + { + "epoch": 2.5759876958569645, + "grad_norm": 1.1477785110473633, + "learning_rate": 0.00010364878549579728, + "loss": 1.9623, + "step": 26798 + }, + { + "epoch": 2.5760838219744304, + "grad_norm": 0.9798493385314941, + "learning_rate": 0.00010363699280490767, + "loss": 1.9436, + "step": 26799 + }, + { + "epoch": 2.5761799480918963, + "grad_norm": 1.1594722270965576, + "learning_rate": 0.00010362520045083665, + "loss": 2.0574, + "step": 26800 + }, + { + "epoch": 2.5762760742093627, + "grad_norm": 1.1058645248413086, + "learning_rate": 0.00010361340843366723, + "loss": 2.0026, + "step": 26801 + }, + { + "epoch": 2.576372200326829, + "grad_norm": 1.1527528762817383, + "learning_rate": 0.00010360161675348242, + "loss": 2.0937, + "step": 26802 + }, + { + "epoch": 2.576468326444295, + "grad_norm": 1.4940345287322998, + "learning_rate": 0.00010358982541036515, + "loss": 1.9872, + "step": 26803 + }, + { + "epoch": 2.576564452561761, + "grad_norm": 1.1369671821594238, + "learning_rate": 0.00010357803440439843, + "loss": 1.8859, + "step": 26804 + }, + { + "epoch": 2.576660578679227, + "grad_norm": 1.069109559059143, + "learning_rate": 0.0001035662437356652, + "loss": 2.0311, + "step": 26805 + }, + { + "epoch": 2.5767567047966935, + "grad_norm": 1.2163944244384766, + "learning_rate": 0.00010355445340424846, + "loss": 2.0486, + "step": 26806 + }, + { + "epoch": 2.5768528309141594, + "grad_norm": 1.1946359872817993, + "learning_rate": 0.0001035426634102312, + "loss": 1.8759, + "step": 26807 + }, + { + "epoch": 2.5769489570316253, + "grad_norm": 1.0928726196289062, + "learning_rate": 0.00010353087375369634, + "loss": 1.9221, + "step": 26808 + }, + { + "epoch": 2.5770450831490916, + "grad_norm": 1.1876051425933838, + "learning_rate": 0.00010351908443472687, + "loss": 2.1208, + "step": 26809 + }, + { + "epoch": 2.577141209266558, + "grad_norm": 1.0573450326919556, + "learning_rate": 0.00010350729545340576, + "loss": 1.9119, + "step": 26810 + }, + { + "epoch": 2.577237335384024, + "grad_norm": 1.2188044786453247, + "learning_rate": 0.00010349550680981595, + "loss": 2.2056, + "step": 26811 + }, + { + "epoch": 2.57733346150149, + "grad_norm": 1.1407898664474487, + "learning_rate": 0.0001034837185040404, + "loss": 2.0583, + "step": 26812 + }, + { + "epoch": 2.577429587618956, + "grad_norm": 1.1365993022918701, + "learning_rate": 0.00010347193053616208, + "loss": 2.009, + "step": 26813 + }, + { + "epoch": 2.577525713736422, + "grad_norm": 1.032354712486267, + "learning_rate": 0.00010346014290626396, + "loss": 1.9442, + "step": 26814 + }, + { + "epoch": 2.5776218398538884, + "grad_norm": 0.9911150336265564, + "learning_rate": 0.00010344835561442893, + "loss": 1.7086, + "step": 26815 + }, + { + "epoch": 2.5777179659713543, + "grad_norm": 1.1961010694503784, + "learning_rate": 0.00010343656866073999, + "loss": 2.1046, + "step": 26816 + }, + { + "epoch": 2.5778140920888206, + "grad_norm": 1.1144720315933228, + "learning_rate": 0.00010342478204528009, + "loss": 2.045, + "step": 26817 + }, + { + "epoch": 2.5779102182062865, + "grad_norm": 0.9392250776290894, + "learning_rate": 0.00010341299576813214, + "loss": 1.932, + "step": 26818 + }, + { + "epoch": 2.578006344323753, + "grad_norm": 1.0329277515411377, + "learning_rate": 0.00010340120982937913, + "loss": 2.018, + "step": 26819 + }, + { + "epoch": 2.5781024704412188, + "grad_norm": 1.0839306116104126, + "learning_rate": 0.00010338942422910394, + "loss": 2.1924, + "step": 26820 + }, + { + "epoch": 2.578198596558685, + "grad_norm": 1.0816785097122192, + "learning_rate": 0.00010337763896738955, + "loss": 2.0471, + "step": 26821 + }, + { + "epoch": 2.578294722676151, + "grad_norm": 1.114695429801941, + "learning_rate": 0.00010336585404431889, + "loss": 2.1473, + "step": 26822 + }, + { + "epoch": 2.5783908487936174, + "grad_norm": 0.9857603907585144, + "learning_rate": 0.0001033540694599749, + "loss": 1.7605, + "step": 26823 + }, + { + "epoch": 2.5784869749110833, + "grad_norm": 1.035765290260315, + "learning_rate": 0.00010334228521444046, + "loss": 1.9534, + "step": 26824 + }, + { + "epoch": 2.5785831010285496, + "grad_norm": 1.1695575714111328, + "learning_rate": 0.00010333050130779854, + "loss": 2.0006, + "step": 26825 + }, + { + "epoch": 2.5786792271460155, + "grad_norm": 1.1080946922302246, + "learning_rate": 0.00010331871774013207, + "loss": 2.0458, + "step": 26826 + }, + { + "epoch": 2.578775353263482, + "grad_norm": 1.0549180507659912, + "learning_rate": 0.00010330693451152393, + "loss": 1.932, + "step": 26827 + }, + { + "epoch": 2.5788714793809477, + "grad_norm": 1.0244134664535522, + "learning_rate": 0.00010329515162205709, + "loss": 1.8288, + "step": 26828 + }, + { + "epoch": 2.578967605498414, + "grad_norm": 1.019692063331604, + "learning_rate": 0.00010328336907181446, + "loss": 1.9951, + "step": 26829 + }, + { + "epoch": 2.57906373161588, + "grad_norm": 1.0784343481063843, + "learning_rate": 0.0001032715868608789, + "loss": 2.2131, + "step": 26830 + }, + { + "epoch": 2.5791598577333463, + "grad_norm": 1.1739977598190308, + "learning_rate": 0.00010325980498933339, + "loss": 2.0964, + "step": 26831 + }, + { + "epoch": 2.5792559838508122, + "grad_norm": 0.9902849197387695, + "learning_rate": 0.00010324802345726082, + "loss": 1.8144, + "step": 26832 + }, + { + "epoch": 2.579352109968278, + "grad_norm": 1.1509711742401123, + "learning_rate": 0.00010323624226474409, + "loss": 1.8604, + "step": 26833 + }, + { + "epoch": 2.5794482360857445, + "grad_norm": 1.1870863437652588, + "learning_rate": 0.00010322446141186606, + "loss": 2.1527, + "step": 26834 + }, + { + "epoch": 2.579544362203211, + "grad_norm": 1.0890235900878906, + "learning_rate": 0.00010321268089870974, + "loss": 1.8197, + "step": 26835 + }, + { + "epoch": 2.5796404883206767, + "grad_norm": 1.0529919862747192, + "learning_rate": 0.00010320090072535796, + "loss": 1.8232, + "step": 26836 + }, + { + "epoch": 2.5797366144381426, + "grad_norm": 1.049748420715332, + "learning_rate": 0.00010318912089189364, + "loss": 2.0323, + "step": 26837 + }, + { + "epoch": 2.579832740555609, + "grad_norm": 1.1176480054855347, + "learning_rate": 0.00010317734139839968, + "loss": 2.1762, + "step": 26838 + }, + { + "epoch": 2.5799288666730753, + "grad_norm": 1.1542835235595703, + "learning_rate": 0.00010316556224495894, + "loss": 2.034, + "step": 26839 + }, + { + "epoch": 2.580024992790541, + "grad_norm": 1.0539411306381226, + "learning_rate": 0.00010315378343165434, + "loss": 2.0586, + "step": 26840 + }, + { + "epoch": 2.580121118908007, + "grad_norm": 1.1647685766220093, + "learning_rate": 0.00010314200495856878, + "loss": 1.8936, + "step": 26841 + }, + { + "epoch": 2.5802172450254734, + "grad_norm": 1.0136321783065796, + "learning_rate": 0.0001031302268257851, + "loss": 1.8223, + "step": 26842 + }, + { + "epoch": 2.58031337114294, + "grad_norm": 1.1291117668151855, + "learning_rate": 0.00010311844903338625, + "loss": 2.0571, + "step": 26843 + }, + { + "epoch": 2.5804094972604057, + "grad_norm": 1.0615098476409912, + "learning_rate": 0.00010310667158145505, + "loss": 1.8023, + "step": 26844 + }, + { + "epoch": 2.5805056233778716, + "grad_norm": 1.1293054819107056, + "learning_rate": 0.00010309489447007442, + "loss": 2.0401, + "step": 26845 + }, + { + "epoch": 2.580601749495338, + "grad_norm": 1.0680590867996216, + "learning_rate": 0.00010308311769932723, + "loss": 2.0546, + "step": 26846 + }, + { + "epoch": 2.580697875612804, + "grad_norm": 1.199584722518921, + "learning_rate": 0.00010307134126929634, + "loss": 2.0702, + "step": 26847 + }, + { + "epoch": 2.58079400173027, + "grad_norm": 1.0061237812042236, + "learning_rate": 0.00010305956518006465, + "loss": 1.809, + "step": 26848 + }, + { + "epoch": 2.580890127847736, + "grad_norm": 1.016016960144043, + "learning_rate": 0.00010304778943171498, + "loss": 1.7716, + "step": 26849 + }, + { + "epoch": 2.5809862539652024, + "grad_norm": 1.2091323137283325, + "learning_rate": 0.00010303601402433023, + "loss": 2.0269, + "step": 26850 + }, + { + "epoch": 2.5810823800826683, + "grad_norm": 1.0328530073165894, + "learning_rate": 0.00010302423895799323, + "loss": 1.8793, + "step": 26851 + }, + { + "epoch": 2.5811785062001347, + "grad_norm": 1.1383079290390015, + "learning_rate": 0.00010301246423278693, + "loss": 1.8586, + "step": 26852 + }, + { + "epoch": 2.5812746323176006, + "grad_norm": 1.0324627161026, + "learning_rate": 0.00010300068984879413, + "loss": 1.9731, + "step": 26853 + }, + { + "epoch": 2.581370758435067, + "grad_norm": 1.1645522117614746, + "learning_rate": 0.00010298891580609769, + "loss": 2.1381, + "step": 26854 + }, + { + "epoch": 2.581466884552533, + "grad_norm": 1.3001686334609985, + "learning_rate": 0.00010297714210478046, + "loss": 1.8878, + "step": 26855 + }, + { + "epoch": 2.581563010669999, + "grad_norm": 1.1791627407073975, + "learning_rate": 0.0001029653687449253, + "loss": 2.1868, + "step": 26856 + }, + { + "epoch": 2.581659136787465, + "grad_norm": 1.1168010234832764, + "learning_rate": 0.00010295359572661511, + "loss": 1.8676, + "step": 26857 + }, + { + "epoch": 2.5817552629049314, + "grad_norm": 1.137424111366272, + "learning_rate": 0.00010294182304993265, + "loss": 2.0121, + "step": 26858 + }, + { + "epoch": 2.5818513890223973, + "grad_norm": 1.133851170539856, + "learning_rate": 0.00010293005071496081, + "loss": 1.9066, + "step": 26859 + }, + { + "epoch": 2.5819475151398636, + "grad_norm": 0.9953070282936096, + "learning_rate": 0.00010291827872178245, + "loss": 1.8972, + "step": 26860 + }, + { + "epoch": 2.5820436412573295, + "grad_norm": 1.1119389533996582, + "learning_rate": 0.00010290650707048039, + "loss": 1.9257, + "step": 26861 + }, + { + "epoch": 2.582139767374796, + "grad_norm": 1.1394200325012207, + "learning_rate": 0.00010289473576113746, + "loss": 1.9518, + "step": 26862 + }, + { + "epoch": 2.582235893492262, + "grad_norm": 1.015708565711975, + "learning_rate": 0.00010288296479383652, + "loss": 1.7987, + "step": 26863 + }, + { + "epoch": 2.582332019609728, + "grad_norm": 0.9890961050987244, + "learning_rate": 0.00010287119416866037, + "loss": 1.8359, + "step": 26864 + }, + { + "epoch": 2.582428145727194, + "grad_norm": 1.0739682912826538, + "learning_rate": 0.0001028594238856919, + "loss": 1.989, + "step": 26865 + }, + { + "epoch": 2.58252427184466, + "grad_norm": 1.2429279088974, + "learning_rate": 0.00010284765394501388, + "loss": 2.1037, + "step": 26866 + }, + { + "epoch": 2.5826203979621263, + "grad_norm": 1.1850181818008423, + "learning_rate": 0.00010283588434670917, + "loss": 2.0451, + "step": 26867 + }, + { + "epoch": 2.5827165240795926, + "grad_norm": 1.106319546699524, + "learning_rate": 0.00010282411509086052, + "loss": 1.8587, + "step": 26868 + }, + { + "epoch": 2.5828126501970585, + "grad_norm": 1.1005234718322754, + "learning_rate": 0.0001028123461775509, + "loss": 2.0536, + "step": 26869 + }, + { + "epoch": 2.5829087763145244, + "grad_norm": 1.1097235679626465, + "learning_rate": 0.00010280057760686299, + "loss": 1.8977, + "step": 26870 + }, + { + "epoch": 2.5830049024319908, + "grad_norm": 1.1760636568069458, + "learning_rate": 0.0001027888093788797, + "loss": 2.1253, + "step": 26871 + }, + { + "epoch": 2.583101028549457, + "grad_norm": 1.1056759357452393, + "learning_rate": 0.00010277704149368381, + "loss": 1.9915, + "step": 26872 + }, + { + "epoch": 2.583197154666923, + "grad_norm": 1.0281683206558228, + "learning_rate": 0.00010276527395135811, + "loss": 1.8709, + "step": 26873 + }, + { + "epoch": 2.583293280784389, + "grad_norm": 1.019711971282959, + "learning_rate": 0.00010275350675198544, + "loss": 2.0455, + "step": 26874 + }, + { + "epoch": 2.5833894069018553, + "grad_norm": 1.18809974193573, + "learning_rate": 0.00010274173989564864, + "loss": 2.0206, + "step": 26875 + }, + { + "epoch": 2.5834855330193216, + "grad_norm": 1.0842760801315308, + "learning_rate": 0.00010272997338243042, + "loss": 1.8168, + "step": 26876 + }, + { + "epoch": 2.5835816591367875, + "grad_norm": 1.056963562965393, + "learning_rate": 0.00010271820721241361, + "loss": 1.8752, + "step": 26877 + }, + { + "epoch": 2.5836777852542534, + "grad_norm": 1.0979907512664795, + "learning_rate": 0.00010270644138568106, + "loss": 2.0877, + "step": 26878 + }, + { + "epoch": 2.5837739113717197, + "grad_norm": 1.012597680091858, + "learning_rate": 0.00010269467590231555, + "loss": 1.8581, + "step": 26879 + }, + { + "epoch": 2.583870037489186, + "grad_norm": 1.2058556079864502, + "learning_rate": 0.00010268291076239985, + "loss": 2.1214, + "step": 26880 + }, + { + "epoch": 2.583966163606652, + "grad_norm": 1.0745238065719604, + "learning_rate": 0.00010267114596601677, + "loss": 1.838, + "step": 26881 + }, + { + "epoch": 2.584062289724118, + "grad_norm": 1.1284282207489014, + "learning_rate": 0.00010265938151324913, + "loss": 2.1199, + "step": 26882 + }, + { + "epoch": 2.5841584158415842, + "grad_norm": 1.0136592388153076, + "learning_rate": 0.00010264761740417966, + "loss": 1.7828, + "step": 26883 + }, + { + "epoch": 2.58425454195905, + "grad_norm": 0.9631736278533936, + "learning_rate": 0.0001026358536388912, + "loss": 1.8622, + "step": 26884 + }, + { + "epoch": 2.5843506680765165, + "grad_norm": 1.1952297687530518, + "learning_rate": 0.00010262409021746647, + "loss": 1.9264, + "step": 26885 + }, + { + "epoch": 2.5844467941939824, + "grad_norm": 1.1943248510360718, + "learning_rate": 0.00010261232713998832, + "loss": 2.0325, + "step": 26886 + }, + { + "epoch": 2.5845429203114487, + "grad_norm": 1.1514527797698975, + "learning_rate": 0.00010260056440653948, + "loss": 2.0028, + "step": 26887 + }, + { + "epoch": 2.5846390464289146, + "grad_norm": 1.0760287046432495, + "learning_rate": 0.00010258880201720273, + "loss": 1.9145, + "step": 26888 + }, + { + "epoch": 2.584735172546381, + "grad_norm": 1.1463145017623901, + "learning_rate": 0.00010257703997206089, + "loss": 1.9667, + "step": 26889 + }, + { + "epoch": 2.584831298663847, + "grad_norm": 1.2667558193206787, + "learning_rate": 0.00010256527827119669, + "loss": 2.1887, + "step": 26890 + }, + { + "epoch": 2.584927424781313, + "grad_norm": 1.3497487306594849, + "learning_rate": 0.00010255351691469289, + "loss": 1.8686, + "step": 26891 + }, + { + "epoch": 2.585023550898779, + "grad_norm": 1.067055106163025, + "learning_rate": 0.00010254175590263228, + "loss": 2.025, + "step": 26892 + }, + { + "epoch": 2.5851196770162455, + "grad_norm": 1.072033405303955, + "learning_rate": 0.00010252999523509762, + "loss": 1.9705, + "step": 26893 + }, + { + "epoch": 2.5852158031337114, + "grad_norm": 1.1156939268112183, + "learning_rate": 0.00010251823491217165, + "loss": 2.1869, + "step": 26894 + }, + { + "epoch": 2.5853119292511777, + "grad_norm": 1.1931630373001099, + "learning_rate": 0.00010250647493393715, + "loss": 1.9977, + "step": 26895 + }, + { + "epoch": 2.5854080553686436, + "grad_norm": 1.0130313634872437, + "learning_rate": 0.00010249471530047689, + "loss": 1.8785, + "step": 26896 + }, + { + "epoch": 2.58550418148611, + "grad_norm": 1.1568241119384766, + "learning_rate": 0.00010248295601187361, + "loss": 1.8847, + "step": 26897 + }, + { + "epoch": 2.585600307603576, + "grad_norm": 1.09099280834198, + "learning_rate": 0.00010247119706821003, + "loss": 1.7911, + "step": 26898 + }, + { + "epoch": 2.5856964337210417, + "grad_norm": 1.177769660949707, + "learning_rate": 0.00010245943846956894, + "loss": 1.956, + "step": 26899 + }, + { + "epoch": 2.585792559838508, + "grad_norm": 1.053615927696228, + "learning_rate": 0.00010244768021603311, + "loss": 1.9128, + "step": 26900 + }, + { + "epoch": 2.5858886859559744, + "grad_norm": 1.1484949588775635, + "learning_rate": 0.00010243592230768523, + "loss": 2.0879, + "step": 26901 + }, + { + "epoch": 2.5859848120734403, + "grad_norm": 1.2242181301116943, + "learning_rate": 0.00010242416474460805, + "loss": 2.037, + "step": 26902 + }, + { + "epoch": 2.5860809381909062, + "grad_norm": 1.0006648302078247, + "learning_rate": 0.00010241240752688432, + "loss": 1.9065, + "step": 26903 + }, + { + "epoch": 2.5861770643083726, + "grad_norm": 1.0591483116149902, + "learning_rate": 0.0001024006506545968, + "loss": 1.9859, + "step": 26904 + }, + { + "epoch": 2.586273190425839, + "grad_norm": 1.2771512269973755, + "learning_rate": 0.0001023888941278282, + "loss": 1.914, + "step": 26905 + }, + { + "epoch": 2.586369316543305, + "grad_norm": 1.078371286392212, + "learning_rate": 0.00010237713794666124, + "loss": 2.0487, + "step": 26906 + }, + { + "epoch": 2.5864654426607707, + "grad_norm": 1.0554254055023193, + "learning_rate": 0.00010236538211117869, + "loss": 1.9226, + "step": 26907 + }, + { + "epoch": 2.586561568778237, + "grad_norm": 1.1181750297546387, + "learning_rate": 0.00010235362662146323, + "loss": 1.9731, + "step": 26908 + }, + { + "epoch": 2.5866576948957034, + "grad_norm": 1.1587330102920532, + "learning_rate": 0.00010234187147759762, + "loss": 1.9844, + "step": 26909 + }, + { + "epoch": 2.5867538210131693, + "grad_norm": 1.2717952728271484, + "learning_rate": 0.00010233011667966454, + "loss": 2.0336, + "step": 26910 + }, + { + "epoch": 2.586849947130635, + "grad_norm": 1.1288281679153442, + "learning_rate": 0.00010231836222774676, + "loss": 2.0225, + "step": 26911 + }, + { + "epoch": 2.5869460732481016, + "grad_norm": 1.0887726545333862, + "learning_rate": 0.00010230660812192698, + "loss": 2.0267, + "step": 26912 + }, + { + "epoch": 2.587042199365568, + "grad_norm": 1.1274596452713013, + "learning_rate": 0.00010229485436228793, + "loss": 1.9667, + "step": 26913 + }, + { + "epoch": 2.587138325483034, + "grad_norm": 1.176434874534607, + "learning_rate": 0.00010228310094891228, + "loss": 2.0679, + "step": 26914 + }, + { + "epoch": 2.5872344516004997, + "grad_norm": 1.234127402305603, + "learning_rate": 0.0001022713478818828, + "loss": 2.082, + "step": 26915 + }, + { + "epoch": 2.587330577717966, + "grad_norm": 1.2075743675231934, + "learning_rate": 0.00010225959516128214, + "loss": 2.0827, + "step": 26916 + }, + { + "epoch": 2.587426703835432, + "grad_norm": 0.9883179664611816, + "learning_rate": 0.00010224784278719303, + "loss": 1.7309, + "step": 26917 + }, + { + "epoch": 2.5875228299528983, + "grad_norm": 1.1680210828781128, + "learning_rate": 0.00010223609075969821, + "loss": 2.0542, + "step": 26918 + }, + { + "epoch": 2.587618956070364, + "grad_norm": 1.2469916343688965, + "learning_rate": 0.00010222433907888029, + "loss": 2.1729, + "step": 26919 + }, + { + "epoch": 2.5877150821878305, + "grad_norm": 1.0065865516662598, + "learning_rate": 0.00010221258774482205, + "loss": 1.7779, + "step": 26920 + }, + { + "epoch": 2.5878112083052964, + "grad_norm": 1.1007542610168457, + "learning_rate": 0.00010220083675760617, + "loss": 1.922, + "step": 26921 + }, + { + "epoch": 2.5879073344227628, + "grad_norm": 1.1781284809112549, + "learning_rate": 0.00010218908611731534, + "loss": 1.887, + "step": 26922 + }, + { + "epoch": 2.5880034605402287, + "grad_norm": 1.2038044929504395, + "learning_rate": 0.00010217733582403219, + "loss": 2.089, + "step": 26923 + }, + { + "epoch": 2.588099586657695, + "grad_norm": 1.0165215730667114, + "learning_rate": 0.00010216558587783951, + "loss": 1.9065, + "step": 26924 + }, + { + "epoch": 2.588195712775161, + "grad_norm": 1.0671006441116333, + "learning_rate": 0.00010215383627881991, + "loss": 2.0254, + "step": 26925 + }, + { + "epoch": 2.5882918388926273, + "grad_norm": 1.140158772468567, + "learning_rate": 0.00010214208702705612, + "loss": 1.9718, + "step": 26926 + }, + { + "epoch": 2.588387965010093, + "grad_norm": 1.1859114170074463, + "learning_rate": 0.00010213033812263079, + "loss": 1.8934, + "step": 26927 + }, + { + "epoch": 2.5884840911275595, + "grad_norm": 1.0152870416641235, + "learning_rate": 0.00010211858956562658, + "loss": 1.9881, + "step": 26928 + }, + { + "epoch": 2.5885802172450254, + "grad_norm": 1.1587094068527222, + "learning_rate": 0.00010210684135612622, + "loss": 1.8714, + "step": 26929 + }, + { + "epoch": 2.5886763433624917, + "grad_norm": 1.2923592329025269, + "learning_rate": 0.00010209509349421238, + "loss": 1.8465, + "step": 26930 + }, + { + "epoch": 2.5887724694799576, + "grad_norm": 1.1397604942321777, + "learning_rate": 0.0001020833459799677, + "loss": 1.964, + "step": 26931 + }, + { + "epoch": 2.5888685955974235, + "grad_norm": 1.1942739486694336, + "learning_rate": 0.00010207159881347487, + "loss": 2.0129, + "step": 26932 + }, + { + "epoch": 2.58896472171489, + "grad_norm": 1.2743430137634277, + "learning_rate": 0.00010205985199481656, + "loss": 2.0929, + "step": 26933 + }, + { + "epoch": 2.5890608478323562, + "grad_norm": 0.9289365410804749, + "learning_rate": 0.0001020481055240754, + "loss": 1.8955, + "step": 26934 + }, + { + "epoch": 2.589156973949822, + "grad_norm": 1.2761396169662476, + "learning_rate": 0.00010203635940133406, + "loss": 2.1069, + "step": 26935 + }, + { + "epoch": 2.589253100067288, + "grad_norm": 1.164149284362793, + "learning_rate": 0.00010202461362667523, + "loss": 2.0533, + "step": 26936 + }, + { + "epoch": 2.5893492261847544, + "grad_norm": 1.2963300943374634, + "learning_rate": 0.00010201286820018152, + "loss": 2.119, + "step": 26937 + }, + { + "epoch": 2.5894453523022207, + "grad_norm": 1.2743215560913086, + "learning_rate": 0.00010200112312193565, + "loss": 1.9864, + "step": 26938 + }, + { + "epoch": 2.5895414784196866, + "grad_norm": 1.1250956058502197, + "learning_rate": 0.00010198937839202021, + "loss": 2.0308, + "step": 26939 + }, + { + "epoch": 2.5896376045371525, + "grad_norm": 1.1256836652755737, + "learning_rate": 0.00010197763401051789, + "loss": 2.0945, + "step": 26940 + }, + { + "epoch": 2.589733730654619, + "grad_norm": 1.3116110563278198, + "learning_rate": 0.0001019658899775113, + "loss": 2.0748, + "step": 26941 + }, + { + "epoch": 2.589829856772085, + "grad_norm": 1.1111445426940918, + "learning_rate": 0.00010195414629308315, + "loss": 1.9441, + "step": 26942 + }, + { + "epoch": 2.589925982889551, + "grad_norm": 1.1939771175384521, + "learning_rate": 0.00010194240295731599, + "loss": 2.1996, + "step": 26943 + }, + { + "epoch": 2.590022109007017, + "grad_norm": 1.1544156074523926, + "learning_rate": 0.00010193065997029252, + "loss": 1.9811, + "step": 26944 + }, + { + "epoch": 2.5901182351244834, + "grad_norm": 1.1064696311950684, + "learning_rate": 0.00010191891733209536, + "loss": 2.0766, + "step": 26945 + }, + { + "epoch": 2.5902143612419497, + "grad_norm": 1.0102819204330444, + "learning_rate": 0.00010190717504280712, + "loss": 1.9218, + "step": 26946 + }, + { + "epoch": 2.5903104873594156, + "grad_norm": 0.9970163106918335, + "learning_rate": 0.00010189543310251048, + "loss": 1.8515, + "step": 26947 + }, + { + "epoch": 2.5904066134768815, + "grad_norm": 1.0244163274765015, + "learning_rate": 0.00010188369151128808, + "loss": 2.0074, + "step": 26948 + }, + { + "epoch": 2.590502739594348, + "grad_norm": 1.066219687461853, + "learning_rate": 0.00010187195026922248, + "loss": 1.9149, + "step": 26949 + }, + { + "epoch": 2.5905988657118137, + "grad_norm": 1.061664342880249, + "learning_rate": 0.00010186020937639637, + "loss": 1.9173, + "step": 26950 + }, + { + "epoch": 2.59069499182928, + "grad_norm": 1.1406407356262207, + "learning_rate": 0.00010184846883289233, + "loss": 2.1495, + "step": 26951 + }, + { + "epoch": 2.590791117946746, + "grad_norm": 1.0465835332870483, + "learning_rate": 0.000101836728638793, + "loss": 1.8555, + "step": 26952 + }, + { + "epoch": 2.5908872440642123, + "grad_norm": 1.1036728620529175, + "learning_rate": 0.00010182498879418097, + "loss": 1.9875, + "step": 26953 + }, + { + "epoch": 2.5909833701816782, + "grad_norm": 1.1239960193634033, + "learning_rate": 0.0001018132492991389, + "loss": 2.0723, + "step": 26954 + }, + { + "epoch": 2.5910794962991446, + "grad_norm": 1.1064060926437378, + "learning_rate": 0.00010180151015374937, + "loss": 2.0378, + "step": 26955 + }, + { + "epoch": 2.5911756224166105, + "grad_norm": 1.076240062713623, + "learning_rate": 0.00010178977135809499, + "loss": 1.9605, + "step": 26956 + }, + { + "epoch": 2.591271748534077, + "grad_norm": 1.1167540550231934, + "learning_rate": 0.0001017780329122584, + "loss": 1.9864, + "step": 26957 + }, + { + "epoch": 2.5913678746515427, + "grad_norm": 1.1718746423721313, + "learning_rate": 0.00010176629481632217, + "loss": 2.0424, + "step": 26958 + }, + { + "epoch": 2.591464000769009, + "grad_norm": 1.2355129718780518, + "learning_rate": 0.00010175455707036891, + "loss": 1.9739, + "step": 26959 + }, + { + "epoch": 2.591560126886475, + "grad_norm": 1.3106518983840942, + "learning_rate": 0.00010174281967448122, + "loss": 1.9793, + "step": 26960 + }, + { + "epoch": 2.5916562530039413, + "grad_norm": 1.1492466926574707, + "learning_rate": 0.00010173108262874172, + "loss": 1.9215, + "step": 26961 + }, + { + "epoch": 2.591752379121407, + "grad_norm": 1.0877892971038818, + "learning_rate": 0.00010171934593323297, + "loss": 1.8334, + "step": 26962 + }, + { + "epoch": 2.5918485052388736, + "grad_norm": 1.0358272790908813, + "learning_rate": 0.00010170760958803753, + "loss": 1.9772, + "step": 26963 + }, + { + "epoch": 2.5919446313563395, + "grad_norm": 1.2626559734344482, + "learning_rate": 0.00010169587359323808, + "loss": 2.0365, + "step": 26964 + }, + { + "epoch": 2.5920407574738054, + "grad_norm": 1.0953460931777954, + "learning_rate": 0.00010168413794891718, + "loss": 1.931, + "step": 26965 + }, + { + "epoch": 2.5921368835912717, + "grad_norm": 1.0422755479812622, + "learning_rate": 0.00010167240265515741, + "loss": 2.0783, + "step": 26966 + }, + { + "epoch": 2.592233009708738, + "grad_norm": 1.1189395189285278, + "learning_rate": 0.00010166066771204134, + "loss": 1.9613, + "step": 26967 + }, + { + "epoch": 2.592329135826204, + "grad_norm": 1.1663144826889038, + "learning_rate": 0.00010164893311965156, + "loss": 2.119, + "step": 26968 + }, + { + "epoch": 2.59242526194367, + "grad_norm": 1.2637709379196167, + "learning_rate": 0.00010163719887807062, + "loss": 2.1781, + "step": 26969 + }, + { + "epoch": 2.592521388061136, + "grad_norm": 1.2142143249511719, + "learning_rate": 0.00010162546498738114, + "loss": 2.0236, + "step": 26970 + }, + { + "epoch": 2.5926175141786025, + "grad_norm": 1.2303402423858643, + "learning_rate": 0.00010161373144766565, + "loss": 2.041, + "step": 26971 + }, + { + "epoch": 2.5927136402960684, + "grad_norm": 1.062108039855957, + "learning_rate": 0.00010160199825900677, + "loss": 2.0985, + "step": 26972 + }, + { + "epoch": 2.5928097664135343, + "grad_norm": 1.0804234743118286, + "learning_rate": 0.00010159026542148704, + "loss": 2.0931, + "step": 26973 + }, + { + "epoch": 2.5929058925310007, + "grad_norm": 1.162099003791809, + "learning_rate": 0.00010157853293518902, + "loss": 2.0506, + "step": 26974 + }, + { + "epoch": 2.593002018648467, + "grad_norm": 1.055609941482544, + "learning_rate": 0.00010156680080019528, + "loss": 2.0302, + "step": 26975 + }, + { + "epoch": 2.593098144765933, + "grad_norm": 1.008613109588623, + "learning_rate": 0.00010155506901658836, + "loss": 1.7708, + "step": 26976 + }, + { + "epoch": 2.593194270883399, + "grad_norm": 0.9913798570632935, + "learning_rate": 0.00010154333758445084, + "loss": 2.0405, + "step": 26977 + }, + { + "epoch": 2.593290397000865, + "grad_norm": 1.1214600801467896, + "learning_rate": 0.00010153160650386532, + "loss": 1.9093, + "step": 26978 + }, + { + "epoch": 2.5933865231183315, + "grad_norm": 1.0820744037628174, + "learning_rate": 0.00010151987577491429, + "loss": 1.9703, + "step": 26979 + }, + { + "epoch": 2.5934826492357974, + "grad_norm": 0.9939950108528137, + "learning_rate": 0.00010150814539768027, + "loss": 1.8746, + "step": 26980 + }, + { + "epoch": 2.5935787753532633, + "grad_norm": 1.1118412017822266, + "learning_rate": 0.0001014964153722459, + "loss": 1.8182, + "step": 26981 + }, + { + "epoch": 2.5936749014707297, + "grad_norm": 1.2037780284881592, + "learning_rate": 0.00010148468569869366, + "loss": 2.173, + "step": 26982 + }, + { + "epoch": 2.5937710275881956, + "grad_norm": 1.242720365524292, + "learning_rate": 0.00010147295637710615, + "loss": 2.2804, + "step": 26983 + }, + { + "epoch": 2.593867153705662, + "grad_norm": 1.2514649629592896, + "learning_rate": 0.00010146122740756584, + "loss": 1.9168, + "step": 26984 + }, + { + "epoch": 2.593963279823128, + "grad_norm": 1.0644140243530273, + "learning_rate": 0.00010144949879015533, + "loss": 1.9894, + "step": 26985 + }, + { + "epoch": 2.594059405940594, + "grad_norm": 1.3099079132080078, + "learning_rate": 0.00010143777052495712, + "loss": 2.1292, + "step": 26986 + }, + { + "epoch": 2.59415553205806, + "grad_norm": 1.0576794147491455, + "learning_rate": 0.00010142604261205377, + "loss": 1.9145, + "step": 26987 + }, + { + "epoch": 2.5942516581755264, + "grad_norm": 1.1257768869400024, + "learning_rate": 0.00010141431505152778, + "loss": 2.0022, + "step": 26988 + }, + { + "epoch": 2.5943477842929923, + "grad_norm": 1.0750898122787476, + "learning_rate": 0.00010140258784346171, + "loss": 2.0777, + "step": 26989 + }, + { + "epoch": 2.5944439104104586, + "grad_norm": 1.0758874416351318, + "learning_rate": 0.00010139086098793807, + "loss": 1.996, + "step": 26990 + }, + { + "epoch": 2.5945400365279245, + "grad_norm": 1.105574131011963, + "learning_rate": 0.00010137913448503935, + "loss": 1.8433, + "step": 26991 + }, + { + "epoch": 2.594636162645391, + "grad_norm": 1.0771547555923462, + "learning_rate": 0.00010136740833484812, + "loss": 1.987, + "step": 26992 + }, + { + "epoch": 2.5947322887628568, + "grad_norm": 1.1671139001846313, + "learning_rate": 0.00010135568253744691, + "loss": 2.0747, + "step": 26993 + }, + { + "epoch": 2.594828414880323, + "grad_norm": 1.243701457977295, + "learning_rate": 0.00010134395709291819, + "loss": 2.1117, + "step": 26994 + }, + { + "epoch": 2.594924540997789, + "grad_norm": 1.2364418506622314, + "learning_rate": 0.00010133223200134447, + "loss": 2.1487, + "step": 26995 + }, + { + "epoch": 2.5950206671152554, + "grad_norm": 1.1142761707305908, + "learning_rate": 0.00010132050726280834, + "loss": 2.0372, + "step": 26996 + }, + { + "epoch": 2.5951167932327213, + "grad_norm": 1.1543865203857422, + "learning_rate": 0.00010130878287739217, + "loss": 2.0972, + "step": 26997 + }, + { + "epoch": 2.5952129193501876, + "grad_norm": 1.3084708452224731, + "learning_rate": 0.0001012970588451786, + "loss": 2.0412, + "step": 26998 + }, + { + "epoch": 2.5953090454676535, + "grad_norm": 1.1485671997070312, + "learning_rate": 0.00010128533516625008, + "loss": 1.948, + "step": 26999 + }, + { + "epoch": 2.59540517158512, + "grad_norm": 1.1829938888549805, + "learning_rate": 0.0001012736118406891, + "loss": 2.1082, + "step": 27000 + }, + { + "epoch": 2.5955012977025858, + "grad_norm": 1.0043039321899414, + "learning_rate": 0.00010126188886857818, + "loss": 2.0147, + "step": 27001 + }, + { + "epoch": 2.5955974238200517, + "grad_norm": 1.1676933765411377, + "learning_rate": 0.00010125016624999983, + "loss": 2.1623, + "step": 27002 + }, + { + "epoch": 2.595693549937518, + "grad_norm": 1.156273365020752, + "learning_rate": 0.00010123844398503649, + "loss": 1.7997, + "step": 27003 + }, + { + "epoch": 2.5957896760549843, + "grad_norm": 1.4359486103057861, + "learning_rate": 0.00010122672207377069, + "loss": 2.1609, + "step": 27004 + }, + { + "epoch": 2.5958858021724502, + "grad_norm": 1.0855152606964111, + "learning_rate": 0.00010121500051628492, + "loss": 2.0197, + "step": 27005 + }, + { + "epoch": 2.595981928289916, + "grad_norm": 1.1598517894744873, + "learning_rate": 0.00010120327931266168, + "loss": 2.034, + "step": 27006 + }, + { + "epoch": 2.5960780544073825, + "grad_norm": 1.3245526552200317, + "learning_rate": 0.00010119155846298339, + "loss": 1.9829, + "step": 27007 + }, + { + "epoch": 2.596174180524849, + "grad_norm": 1.0467510223388672, + "learning_rate": 0.00010117983796733258, + "loss": 2.0552, + "step": 27008 + }, + { + "epoch": 2.5962703066423147, + "grad_norm": 1.1249445676803589, + "learning_rate": 0.00010116811782579173, + "loss": 2.0273, + "step": 27009 + }, + { + "epoch": 2.5963664327597806, + "grad_norm": 0.9443672299385071, + "learning_rate": 0.00010115639803844332, + "loss": 1.944, + "step": 27010 + }, + { + "epoch": 2.596462558877247, + "grad_norm": 1.2940735816955566, + "learning_rate": 0.0001011446786053698, + "loss": 2.1793, + "step": 27011 + }, + { + "epoch": 2.5965586849947133, + "grad_norm": 1.1062915325164795, + "learning_rate": 0.00010113295952665364, + "loss": 2.0749, + "step": 27012 + }, + { + "epoch": 2.596654811112179, + "grad_norm": 1.1476023197174072, + "learning_rate": 0.00010112124080237734, + "loss": 1.829, + "step": 27013 + }, + { + "epoch": 2.596750937229645, + "grad_norm": 1.1246209144592285, + "learning_rate": 0.00010110952243262332, + "loss": 1.9622, + "step": 27014 + }, + { + "epoch": 2.5968470633471115, + "grad_norm": 1.081581711769104, + "learning_rate": 0.00010109780441747409, + "loss": 1.9285, + "step": 27015 + }, + { + "epoch": 2.5969431894645774, + "grad_norm": 0.9982064962387085, + "learning_rate": 0.0001010860867570121, + "loss": 2.0244, + "step": 27016 + }, + { + "epoch": 2.5970393155820437, + "grad_norm": 1.3119114637374878, + "learning_rate": 0.0001010743694513198, + "loss": 2.1323, + "step": 27017 + }, + { + "epoch": 2.5971354416995096, + "grad_norm": 1.0777900218963623, + "learning_rate": 0.00010106265250047963, + "loss": 1.9424, + "step": 27018 + }, + { + "epoch": 2.597231567816976, + "grad_norm": 0.9387509226799011, + "learning_rate": 0.00010105093590457411, + "loss": 1.9011, + "step": 27019 + }, + { + "epoch": 2.597327693934442, + "grad_norm": 0.9662867188453674, + "learning_rate": 0.00010103921966368559, + "loss": 1.7352, + "step": 27020 + }, + { + "epoch": 2.597423820051908, + "grad_norm": 1.016586184501648, + "learning_rate": 0.0001010275037778966, + "loss": 1.7851, + "step": 27021 + }, + { + "epoch": 2.597519946169374, + "grad_norm": 1.2266056537628174, + "learning_rate": 0.00010101578824728956, + "loss": 1.9736, + "step": 27022 + }, + { + "epoch": 2.5976160722868404, + "grad_norm": 1.0391788482666016, + "learning_rate": 0.00010100407307194691, + "loss": 1.9647, + "step": 27023 + }, + { + "epoch": 2.5977121984043063, + "grad_norm": 1.0971769094467163, + "learning_rate": 0.00010099235825195108, + "loss": 1.7886, + "step": 27024 + }, + { + "epoch": 2.5978083245217727, + "grad_norm": 1.1157764196395874, + "learning_rate": 0.00010098064378738455, + "loss": 2.1862, + "step": 27025 + }, + { + "epoch": 2.5979044506392386, + "grad_norm": 1.0690178871154785, + "learning_rate": 0.00010096892967832971, + "loss": 1.8743, + "step": 27026 + }, + { + "epoch": 2.598000576756705, + "grad_norm": 1.2411218881607056, + "learning_rate": 0.00010095721592486898, + "loss": 1.9931, + "step": 27027 + }, + { + "epoch": 2.598096702874171, + "grad_norm": 1.039959192276001, + "learning_rate": 0.00010094550252708488, + "loss": 1.9916, + "step": 27028 + }, + { + "epoch": 2.598192828991637, + "grad_norm": 1.100919485092163, + "learning_rate": 0.00010093378948505974, + "loss": 2.1226, + "step": 27029 + }, + { + "epoch": 2.598288955109103, + "grad_norm": 1.0556098222732544, + "learning_rate": 0.00010092207679887608, + "loss": 1.8522, + "step": 27030 + }, + { + "epoch": 2.5983850812265694, + "grad_norm": 0.9562926888465881, + "learning_rate": 0.00010091036446861618, + "loss": 1.9554, + "step": 27031 + }, + { + "epoch": 2.5984812073440353, + "grad_norm": 1.0886638164520264, + "learning_rate": 0.00010089865249436263, + "loss": 1.8703, + "step": 27032 + }, + { + "epoch": 2.5985773334615017, + "grad_norm": 1.1949101686477661, + "learning_rate": 0.00010088694087619775, + "loss": 2.0779, + "step": 27033 + }, + { + "epoch": 2.5986734595789676, + "grad_norm": 1.234192967414856, + "learning_rate": 0.00010087522961420399, + "loss": 1.9581, + "step": 27034 + }, + { + "epoch": 2.5987695856964335, + "grad_norm": 1.1054736375808716, + "learning_rate": 0.00010086351870846374, + "loss": 2.0293, + "step": 27035 + }, + { + "epoch": 2.5988657118139, + "grad_norm": 0.9628943800926208, + "learning_rate": 0.00010085180815905944, + "loss": 1.7883, + "step": 27036 + }, + { + "epoch": 2.598961837931366, + "grad_norm": 1.2007004022598267, + "learning_rate": 0.00010084009796607348, + "loss": 2.254, + "step": 27037 + }, + { + "epoch": 2.599057964048832, + "grad_norm": 1.1002027988433838, + "learning_rate": 0.00010082838812958825, + "loss": 2.0006, + "step": 27038 + }, + { + "epoch": 2.599154090166298, + "grad_norm": 1.1163805723190308, + "learning_rate": 0.0001008166786496862, + "loss": 2.0103, + "step": 27039 + }, + { + "epoch": 2.5992502162837643, + "grad_norm": 0.9602311253547668, + "learning_rate": 0.00010080496952644968, + "loss": 1.7754, + "step": 27040 + }, + { + "epoch": 2.5993463424012306, + "grad_norm": 1.130199670791626, + "learning_rate": 0.00010079326075996113, + "loss": 1.9785, + "step": 27041 + }, + { + "epoch": 2.5994424685186965, + "grad_norm": 1.0369558334350586, + "learning_rate": 0.00010078155235030292, + "loss": 2.1473, + "step": 27042 + }, + { + "epoch": 2.5995385946361624, + "grad_norm": 1.1007099151611328, + "learning_rate": 0.00010076984429755746, + "loss": 2.0269, + "step": 27043 + }, + { + "epoch": 2.599634720753629, + "grad_norm": 1.0831111669540405, + "learning_rate": 0.00010075813660180714, + "loss": 2.0181, + "step": 27044 + }, + { + "epoch": 2.599730846871095, + "grad_norm": 1.1234501600265503, + "learning_rate": 0.00010074642926313432, + "loss": 1.9187, + "step": 27045 + }, + { + "epoch": 2.599826972988561, + "grad_norm": 1.006381869316101, + "learning_rate": 0.00010073472228162142, + "loss": 1.926, + "step": 27046 + }, + { + "epoch": 2.599923099106027, + "grad_norm": 1.4045900106430054, + "learning_rate": 0.00010072301565735081, + "loss": 2.0076, + "step": 27047 + }, + { + "epoch": 2.6000192252234933, + "grad_norm": 1.2931385040283203, + "learning_rate": 0.00010071130939040486, + "loss": 1.9471, + "step": 27048 + }, + { + "epoch": 2.6001153513409596, + "grad_norm": 1.225730538368225, + "learning_rate": 0.00010069960348086597, + "loss": 2.0791, + "step": 27049 + }, + { + "epoch": 2.6002114774584255, + "grad_norm": 1.2025845050811768, + "learning_rate": 0.00010068789792881651, + "loss": 1.9081, + "step": 27050 + }, + { + "epoch": 2.6003076035758914, + "grad_norm": 1.0215260982513428, + "learning_rate": 0.00010067619273433886, + "loss": 1.7463, + "step": 27051 + }, + { + "epoch": 2.6004037296933578, + "grad_norm": 1.050104022026062, + "learning_rate": 0.00010066448789751538, + "loss": 2.1269, + "step": 27052 + }, + { + "epoch": 2.6004998558108237, + "grad_norm": 1.1259714365005493, + "learning_rate": 0.00010065278341842846, + "loss": 2.0669, + "step": 27053 + }, + { + "epoch": 2.60059598192829, + "grad_norm": 1.060837745666504, + "learning_rate": 0.00010064107929716043, + "loss": 1.9118, + "step": 27054 + }, + { + "epoch": 2.600692108045756, + "grad_norm": 1.1781010627746582, + "learning_rate": 0.00010062937553379365, + "loss": 2.019, + "step": 27055 + }, + { + "epoch": 2.6007882341632222, + "grad_norm": 1.2101280689239502, + "learning_rate": 0.00010061767212841055, + "loss": 2.0569, + "step": 27056 + }, + { + "epoch": 2.600884360280688, + "grad_norm": 1.1316155195236206, + "learning_rate": 0.0001006059690810934, + "loss": 1.9348, + "step": 27057 + }, + { + "epoch": 2.6009804863981545, + "grad_norm": 1.2607547044754028, + "learning_rate": 0.0001005942663919246, + "loss": 1.9619, + "step": 27058 + }, + { + "epoch": 2.6010766125156204, + "grad_norm": 1.0989848375320435, + "learning_rate": 0.00010058256406098652, + "loss": 1.9129, + "step": 27059 + }, + { + "epoch": 2.6011727386330867, + "grad_norm": 1.165588140487671, + "learning_rate": 0.0001005708620883615, + "loss": 2.0281, + "step": 27060 + }, + { + "epoch": 2.6012688647505526, + "grad_norm": 1.0224989652633667, + "learning_rate": 0.00010055916047413184, + "loss": 1.8878, + "step": 27061 + }, + { + "epoch": 2.601364990868019, + "grad_norm": 1.1820893287658691, + "learning_rate": 0.00010054745921837994, + "loss": 1.9415, + "step": 27062 + }, + { + "epoch": 2.601461116985485, + "grad_norm": 1.4350340366363525, + "learning_rate": 0.00010053575832118814, + "loss": 2.1222, + "step": 27063 + }, + { + "epoch": 2.6015572431029512, + "grad_norm": 1.0584149360656738, + "learning_rate": 0.00010052405778263876, + "loss": 1.8683, + "step": 27064 + }, + { + "epoch": 2.601653369220417, + "grad_norm": 0.9984741806983948, + "learning_rate": 0.00010051235760281412, + "loss": 2.0843, + "step": 27065 + }, + { + "epoch": 2.6017494953378835, + "grad_norm": 1.0758556127548218, + "learning_rate": 0.00010050065778179662, + "loss": 1.8988, + "step": 27066 + }, + { + "epoch": 2.6018456214553494, + "grad_norm": 1.2636736631393433, + "learning_rate": 0.00010048895831966854, + "loss": 2.2532, + "step": 27067 + }, + { + "epoch": 2.6019417475728153, + "grad_norm": 1.0925655364990234, + "learning_rate": 0.00010047725921651224, + "loss": 2.0053, + "step": 27068 + }, + { + "epoch": 2.6020378736902816, + "grad_norm": 1.1133968830108643, + "learning_rate": 0.00010046556047241005, + "loss": 2.1861, + "step": 27069 + }, + { + "epoch": 2.602133999807748, + "grad_norm": 1.1423176527023315, + "learning_rate": 0.00010045386208744423, + "loss": 2.0248, + "step": 27070 + }, + { + "epoch": 2.602230125925214, + "grad_norm": 1.1840858459472656, + "learning_rate": 0.00010044216406169721, + "loss": 1.9455, + "step": 27071 + }, + { + "epoch": 2.6023262520426798, + "grad_norm": 1.2579118013381958, + "learning_rate": 0.00010043046639525124, + "loss": 1.9973, + "step": 27072 + }, + { + "epoch": 2.602422378160146, + "grad_norm": 1.188677430152893, + "learning_rate": 0.00010041876908818864, + "loss": 2.2378, + "step": 27073 + }, + { + "epoch": 2.6025185042776124, + "grad_norm": 0.9373351335525513, + "learning_rate": 0.00010040707214059175, + "loss": 1.8685, + "step": 27074 + }, + { + "epoch": 2.6026146303950783, + "grad_norm": 1.1878762245178223, + "learning_rate": 0.00010039537555254287, + "loss": 2.0345, + "step": 27075 + }, + { + "epoch": 2.6027107565125442, + "grad_norm": 1.0999984741210938, + "learning_rate": 0.00010038367932412431, + "loss": 1.9736, + "step": 27076 + }, + { + "epoch": 2.6028068826300106, + "grad_norm": 0.9209535121917725, + "learning_rate": 0.00010037198345541838, + "loss": 1.8454, + "step": 27077 + }, + { + "epoch": 2.602903008747477, + "grad_norm": 1.0734105110168457, + "learning_rate": 0.00010036028794650737, + "loss": 1.957, + "step": 27078 + }, + { + "epoch": 2.602999134864943, + "grad_norm": 1.1550430059432983, + "learning_rate": 0.00010034859279747362, + "loss": 2.1928, + "step": 27079 + }, + { + "epoch": 2.6030952609824087, + "grad_norm": 1.060453176498413, + "learning_rate": 0.00010033689800839941, + "loss": 2.2147, + "step": 27080 + }, + { + "epoch": 2.603191387099875, + "grad_norm": 1.0031877756118774, + "learning_rate": 0.00010032520357936703, + "loss": 1.8809, + "step": 27081 + }, + { + "epoch": 2.6032875132173414, + "grad_norm": 1.2444097995758057, + "learning_rate": 0.00010031350951045877, + "loss": 1.985, + "step": 27082 + }, + { + "epoch": 2.6033836393348073, + "grad_norm": 1.1527765989303589, + "learning_rate": 0.00010030181580175695, + "loss": 2.1418, + "step": 27083 + }, + { + "epoch": 2.603479765452273, + "grad_norm": 1.0703628063201904, + "learning_rate": 0.00010029012245334381, + "loss": 1.9471, + "step": 27084 + }, + { + "epoch": 2.6035758915697396, + "grad_norm": 0.9823930263519287, + "learning_rate": 0.00010027842946530173, + "loss": 2.0171, + "step": 27085 + }, + { + "epoch": 2.6036720176872055, + "grad_norm": 1.0476936101913452, + "learning_rate": 0.00010026673683771289, + "loss": 1.8961, + "step": 27086 + }, + { + "epoch": 2.603768143804672, + "grad_norm": 1.0441120862960815, + "learning_rate": 0.00010025504457065967, + "loss": 1.9459, + "step": 27087 + }, + { + "epoch": 2.6038642699221377, + "grad_norm": 0.9857841730117798, + "learning_rate": 0.00010024335266422428, + "loss": 1.95, + "step": 27088 + }, + { + "epoch": 2.603960396039604, + "grad_norm": 1.0185164213180542, + "learning_rate": 0.000100231661118489, + "loss": 1.8813, + "step": 27089 + }, + { + "epoch": 2.60405652215707, + "grad_norm": 1.1762975454330444, + "learning_rate": 0.00010021996993353614, + "loss": 1.9248, + "step": 27090 + }, + { + "epoch": 2.6041526482745363, + "grad_norm": 1.0305579900741577, + "learning_rate": 0.00010020827910944796, + "loss": 2.0386, + "step": 27091 + }, + { + "epoch": 2.604248774392002, + "grad_norm": 1.1066385507583618, + "learning_rate": 0.00010019658864630672, + "loss": 1.8892, + "step": 27092 + }, + { + "epoch": 2.6043449005094685, + "grad_norm": 1.2307482957839966, + "learning_rate": 0.00010018489854419469, + "loss": 2.0939, + "step": 27093 + }, + { + "epoch": 2.6044410266269344, + "grad_norm": 1.1420457363128662, + "learning_rate": 0.00010017320880319414, + "loss": 2.2156, + "step": 27094 + }, + { + "epoch": 2.604537152744401, + "grad_norm": 1.2219007015228271, + "learning_rate": 0.00010016151942338732, + "loss": 2.0811, + "step": 27095 + }, + { + "epoch": 2.6046332788618667, + "grad_norm": 1.1809041500091553, + "learning_rate": 0.00010014983040485652, + "loss": 1.9617, + "step": 27096 + }, + { + "epoch": 2.604729404979333, + "grad_norm": 0.9834439158439636, + "learning_rate": 0.00010013814174768398, + "loss": 1.8055, + "step": 27097 + }, + { + "epoch": 2.604825531096799, + "grad_norm": 1.1586264371871948, + "learning_rate": 0.00010012645345195193, + "loss": 1.9707, + "step": 27098 + }, + { + "epoch": 2.6049216572142653, + "grad_norm": 1.0246456861495972, + "learning_rate": 0.00010011476551774263, + "loss": 1.9512, + "step": 27099 + }, + { + "epoch": 2.605017783331731, + "grad_norm": 1.0889073610305786, + "learning_rate": 0.00010010307794513835, + "loss": 1.8829, + "step": 27100 + }, + { + "epoch": 2.605113909449197, + "grad_norm": 1.1624494791030884, + "learning_rate": 0.00010009139073422137, + "loss": 1.9337, + "step": 27101 + }, + { + "epoch": 2.6052100355666634, + "grad_norm": 1.0751184225082397, + "learning_rate": 0.00010007970388507386, + "loss": 1.9138, + "step": 27102 + }, + { + "epoch": 2.6053061616841298, + "grad_norm": 1.028856635093689, + "learning_rate": 0.00010006801739777813, + "loss": 1.8558, + "step": 27103 + }, + { + "epoch": 2.6054022878015957, + "grad_norm": 1.3030171394348145, + "learning_rate": 0.00010005633127241636, + "loss": 1.9965, + "step": 27104 + }, + { + "epoch": 2.6054984139190616, + "grad_norm": 1.0774309635162354, + "learning_rate": 0.00010004464550907082, + "loss": 1.9342, + "step": 27105 + }, + { + "epoch": 2.605594540036528, + "grad_norm": 1.1556453704833984, + "learning_rate": 0.00010003296010782374, + "loss": 1.8923, + "step": 27106 + }, + { + "epoch": 2.6056906661539943, + "grad_norm": 1.122602939605713, + "learning_rate": 0.00010002127506875735, + "loss": 1.8331, + "step": 27107 + }, + { + "epoch": 2.60578679227146, + "grad_norm": 1.0866235494613647, + "learning_rate": 0.00010000959039195391, + "loss": 2.0106, + "step": 27108 + }, + { + "epoch": 2.605882918388926, + "grad_norm": 1.0594348907470703, + "learning_rate": 9.999790607749557e-05, + "loss": 1.9094, + "step": 27109 + }, + { + "epoch": 2.6059790445063924, + "grad_norm": 1.2342458963394165, + "learning_rate": 9.998622212546462e-05, + "loss": 2.1029, + "step": 27110 + }, + { + "epoch": 2.6060751706238587, + "grad_norm": 1.0477317571640015, + "learning_rate": 9.997453853594326e-05, + "loss": 1.9376, + "step": 27111 + }, + { + "epoch": 2.6061712967413246, + "grad_norm": 1.0406283140182495, + "learning_rate": 9.996285530901372e-05, + "loss": 2.0153, + "step": 27112 + }, + { + "epoch": 2.6062674228587905, + "grad_norm": 1.0832241773605347, + "learning_rate": 9.995117244475819e-05, + "loss": 1.9457, + "step": 27113 + }, + { + "epoch": 2.606363548976257, + "grad_norm": 1.1903256177902222, + "learning_rate": 9.993948994325892e-05, + "loss": 2.0904, + "step": 27114 + }, + { + "epoch": 2.6064596750937232, + "grad_norm": 1.2323521375656128, + "learning_rate": 9.992780780459807e-05, + "loss": 1.9855, + "step": 27115 + }, + { + "epoch": 2.606555801211189, + "grad_norm": 1.1124334335327148, + "learning_rate": 9.991612602885787e-05, + "loss": 2.0026, + "step": 27116 + }, + { + "epoch": 2.606651927328655, + "grad_norm": 1.3054691553115845, + "learning_rate": 9.990444461612057e-05, + "loss": 1.885, + "step": 27117 + }, + { + "epoch": 2.6067480534461214, + "grad_norm": 0.9799697995185852, + "learning_rate": 9.989276356646833e-05, + "loss": 1.8262, + "step": 27118 + }, + { + "epoch": 2.6068441795635873, + "grad_norm": 1.0957986116409302, + "learning_rate": 9.988108287998338e-05, + "loss": 1.8516, + "step": 27119 + }, + { + "epoch": 2.6069403056810536, + "grad_norm": 1.0676674842834473, + "learning_rate": 9.98694025567479e-05, + "loss": 1.9778, + "step": 27120 + }, + { + "epoch": 2.6070364317985195, + "grad_norm": 1.0700269937515259, + "learning_rate": 9.985772259684407e-05, + "loss": 2.0492, + "step": 27121 + }, + { + "epoch": 2.607132557915986, + "grad_norm": 1.1432489156723022, + "learning_rate": 9.984604300035412e-05, + "loss": 2.1223, + "step": 27122 + }, + { + "epoch": 2.6072286840334518, + "grad_norm": 1.081493854522705, + "learning_rate": 9.983436376736022e-05, + "loss": 1.7916, + "step": 27123 + }, + { + "epoch": 2.607324810150918, + "grad_norm": 1.1392543315887451, + "learning_rate": 9.982268489794456e-05, + "loss": 1.9959, + "step": 27124 + }, + { + "epoch": 2.607420936268384, + "grad_norm": 1.0927809476852417, + "learning_rate": 9.98110063921893e-05, + "loss": 2.1459, + "step": 27125 + }, + { + "epoch": 2.6075170623858503, + "grad_norm": 1.2407454252243042, + "learning_rate": 9.979932825017667e-05, + "loss": 1.941, + "step": 27126 + }, + { + "epoch": 2.6076131885033162, + "grad_norm": 1.0233662128448486, + "learning_rate": 9.978765047198884e-05, + "loss": 1.9834, + "step": 27127 + }, + { + "epoch": 2.6077093146207826, + "grad_norm": 1.0937774181365967, + "learning_rate": 9.977597305770798e-05, + "loss": 1.9666, + "step": 27128 + }, + { + "epoch": 2.6078054407382485, + "grad_norm": 1.0630459785461426, + "learning_rate": 9.976429600741624e-05, + "loss": 2.0586, + "step": 27129 + }, + { + "epoch": 2.607901566855715, + "grad_norm": 1.4328254461288452, + "learning_rate": 9.975261932119582e-05, + "loss": 2.0532, + "step": 27130 + }, + { + "epoch": 2.6079976929731807, + "grad_norm": 1.2362710237503052, + "learning_rate": 9.974094299912889e-05, + "loss": 2.1458, + "step": 27131 + }, + { + "epoch": 2.608093819090647, + "grad_norm": 1.0760685205459595, + "learning_rate": 9.97292670412976e-05, + "loss": 2.01, + "step": 27132 + }, + { + "epoch": 2.608189945208113, + "grad_norm": 1.1934138536453247, + "learning_rate": 9.971759144778413e-05, + "loss": 1.8548, + "step": 27133 + }, + { + "epoch": 2.6082860713255793, + "grad_norm": 1.173855185508728, + "learning_rate": 9.970591621867067e-05, + "loss": 1.9806, + "step": 27134 + }, + { + "epoch": 2.6083821974430452, + "grad_norm": 1.0176494121551514, + "learning_rate": 9.969424135403933e-05, + "loss": 1.923, + "step": 27135 + }, + { + "epoch": 2.6084783235605116, + "grad_norm": 1.076596736907959, + "learning_rate": 9.968256685397232e-05, + "loss": 2.0018, + "step": 27136 + }, + { + "epoch": 2.6085744496779775, + "grad_norm": 1.0073624849319458, + "learning_rate": 9.967089271855174e-05, + "loss": 1.9667, + "step": 27137 + }, + { + "epoch": 2.6086705757954434, + "grad_norm": 1.129422664642334, + "learning_rate": 9.965921894785978e-05, + "loss": 1.9775, + "step": 27138 + }, + { + "epoch": 2.6087667019129097, + "grad_norm": 1.141644835472107, + "learning_rate": 9.964754554197858e-05, + "loss": 2.2064, + "step": 27139 + }, + { + "epoch": 2.608862828030376, + "grad_norm": 1.2153351306915283, + "learning_rate": 9.963587250099028e-05, + "loss": 1.9816, + "step": 27140 + }, + { + "epoch": 2.608958954147842, + "grad_norm": 1.1373037099838257, + "learning_rate": 9.962419982497703e-05, + "loss": 2.0615, + "step": 27141 + }, + { + "epoch": 2.609055080265308, + "grad_norm": 1.037482738494873, + "learning_rate": 9.961252751402097e-05, + "loss": 1.8172, + "step": 27142 + }, + { + "epoch": 2.609151206382774, + "grad_norm": 1.2333825826644897, + "learning_rate": 9.960085556820426e-05, + "loss": 1.8815, + "step": 27143 + }, + { + "epoch": 2.6092473325002405, + "grad_norm": 1.1504544019699097, + "learning_rate": 9.9589183987609e-05, + "loss": 1.9954, + "step": 27144 + }, + { + "epoch": 2.6093434586177064, + "grad_norm": 1.0797882080078125, + "learning_rate": 9.957751277231735e-05, + "loss": 2.0715, + "step": 27145 + }, + { + "epoch": 2.6094395847351723, + "grad_norm": 0.9892483353614807, + "learning_rate": 9.956584192241144e-05, + "loss": 1.7657, + "step": 27146 + }, + { + "epoch": 2.6095357108526387, + "grad_norm": 0.9829080104827881, + "learning_rate": 9.95541714379734e-05, + "loss": 1.9792, + "step": 27147 + }, + { + "epoch": 2.609631836970105, + "grad_norm": 1.0426303148269653, + "learning_rate": 9.954250131908534e-05, + "loss": 2.0083, + "step": 27148 + }, + { + "epoch": 2.609727963087571, + "grad_norm": 1.1070917844772339, + "learning_rate": 9.95308315658294e-05, + "loss": 2.0305, + "step": 27149 + }, + { + "epoch": 2.609824089205037, + "grad_norm": 1.2695878744125366, + "learning_rate": 9.951916217828768e-05, + "loss": 2.1799, + "step": 27150 + }, + { + "epoch": 2.609920215322503, + "grad_norm": 1.0607690811157227, + "learning_rate": 9.950749315654234e-05, + "loss": 1.9037, + "step": 27151 + }, + { + "epoch": 2.610016341439969, + "grad_norm": 1.2236016988754272, + "learning_rate": 9.949582450067548e-05, + "loss": 2.0843, + "step": 27152 + }, + { + "epoch": 2.6101124675574354, + "grad_norm": 1.0152939558029175, + "learning_rate": 9.948415621076917e-05, + "loss": 1.8584, + "step": 27153 + }, + { + "epoch": 2.6102085936749013, + "grad_norm": 0.9876336455345154, + "learning_rate": 9.947248828690558e-05, + "loss": 2.0333, + "step": 27154 + }, + { + "epoch": 2.6103047197923677, + "grad_norm": 0.9839642643928528, + "learning_rate": 9.946082072916682e-05, + "loss": 1.7937, + "step": 27155 + }, + { + "epoch": 2.6104008459098336, + "grad_norm": 1.135324239730835, + "learning_rate": 9.944915353763496e-05, + "loss": 1.9735, + "step": 27156 + }, + { + "epoch": 2.6104969720273, + "grad_norm": 1.2241404056549072, + "learning_rate": 9.94374867123921e-05, + "loss": 2.0255, + "step": 27157 + }, + { + "epoch": 2.610593098144766, + "grad_norm": 1.0993698835372925, + "learning_rate": 9.942582025352038e-05, + "loss": 1.931, + "step": 27158 + }, + { + "epoch": 2.610689224262232, + "grad_norm": 1.0507614612579346, + "learning_rate": 9.94141541611019e-05, + "loss": 1.965, + "step": 27159 + }, + { + "epoch": 2.610785350379698, + "grad_norm": 1.2747670412063599, + "learning_rate": 9.94024884352187e-05, + "loss": 2.0883, + "step": 27160 + }, + { + "epoch": 2.6108814764971644, + "grad_norm": 1.1649678945541382, + "learning_rate": 9.93908230759529e-05, + "loss": 2.0114, + "step": 27161 + }, + { + "epoch": 2.6109776026146303, + "grad_norm": 1.196607232093811, + "learning_rate": 9.937915808338662e-05, + "loss": 2.134, + "step": 27162 + }, + { + "epoch": 2.6110737287320966, + "grad_norm": 1.1548014879226685, + "learning_rate": 9.936749345760191e-05, + "loss": 2.0282, + "step": 27163 + }, + { + "epoch": 2.6111698548495625, + "grad_norm": 1.157052993774414, + "learning_rate": 9.935582919868087e-05, + "loss": 2.1788, + "step": 27164 + }, + { + "epoch": 2.611265980967029, + "grad_norm": 1.4082539081573486, + "learning_rate": 9.93441653067056e-05, + "loss": 2.1578, + "step": 27165 + }, + { + "epoch": 2.611362107084495, + "grad_norm": 0.9906690716743469, + "learning_rate": 9.933250178175816e-05, + "loss": 1.9096, + "step": 27166 + }, + { + "epoch": 2.611458233201961, + "grad_norm": 1.1147512197494507, + "learning_rate": 9.932083862392058e-05, + "loss": 1.8647, + "step": 27167 + }, + { + "epoch": 2.611554359319427, + "grad_norm": 1.0422515869140625, + "learning_rate": 9.930917583327503e-05, + "loss": 2.0383, + "step": 27168 + }, + { + "epoch": 2.6116504854368934, + "grad_norm": 1.0500575304031372, + "learning_rate": 9.929751340990355e-05, + "loss": 1.9446, + "step": 27169 + }, + { + "epoch": 2.6117466115543593, + "grad_norm": 1.0422275066375732, + "learning_rate": 9.928585135388818e-05, + "loss": 1.8416, + "step": 27170 + }, + { + "epoch": 2.611842737671825, + "grad_norm": 0.9093446135520935, + "learning_rate": 9.927418966531103e-05, + "loss": 1.8075, + "step": 27171 + }, + { + "epoch": 2.6119388637892915, + "grad_norm": 1.1910938024520874, + "learning_rate": 9.926252834425414e-05, + "loss": 2.0299, + "step": 27172 + }, + { + "epoch": 2.612034989906758, + "grad_norm": 1.0354435443878174, + "learning_rate": 9.925086739079957e-05, + "loss": 1.9242, + "step": 27173 + }, + { + "epoch": 2.6121311160242238, + "grad_norm": 0.9790925979614258, + "learning_rate": 9.923920680502938e-05, + "loss": 1.842, + "step": 27174 + }, + { + "epoch": 2.6122272421416897, + "grad_norm": 1.1011146306991577, + "learning_rate": 9.922754658702564e-05, + "loss": 1.9885, + "step": 27175 + }, + { + "epoch": 2.612323368259156, + "grad_norm": 1.2662724256515503, + "learning_rate": 9.921588673687039e-05, + "loss": 2.0204, + "step": 27176 + }, + { + "epoch": 2.6124194943766224, + "grad_norm": 1.0250582695007324, + "learning_rate": 9.920422725464568e-05, + "loss": 1.8963, + "step": 27177 + }, + { + "epoch": 2.6125156204940883, + "grad_norm": 1.0932762622833252, + "learning_rate": 9.91925681404336e-05, + "loss": 1.8439, + "step": 27178 + }, + { + "epoch": 2.612611746611554, + "grad_norm": 1.2024911642074585, + "learning_rate": 9.918090939431614e-05, + "loss": 2.1181, + "step": 27179 + }, + { + "epoch": 2.6127078727290205, + "grad_norm": 1.1348423957824707, + "learning_rate": 9.91692510163754e-05, + "loss": 2.1059, + "step": 27180 + }, + { + "epoch": 2.612803998846487, + "grad_norm": 1.1385051012039185, + "learning_rate": 9.915759300669334e-05, + "loss": 1.9591, + "step": 27181 + }, + { + "epoch": 2.6129001249639527, + "grad_norm": 1.1920201778411865, + "learning_rate": 9.914593536535207e-05, + "loss": 2.111, + "step": 27182 + }, + { + "epoch": 2.6129962510814186, + "grad_norm": 1.2953171730041504, + "learning_rate": 9.913427809243362e-05, + "loss": 2.0637, + "step": 27183 + }, + { + "epoch": 2.613092377198885, + "grad_norm": 1.2121498584747314, + "learning_rate": 9.912262118801997e-05, + "loss": 2.06, + "step": 27184 + }, + { + "epoch": 2.6131885033163513, + "grad_norm": 1.0758711099624634, + "learning_rate": 9.911096465219321e-05, + "loss": 2.0559, + "step": 27185 + }, + { + "epoch": 2.6132846294338172, + "grad_norm": 1.1852482557296753, + "learning_rate": 9.909930848503535e-05, + "loss": 1.9374, + "step": 27186 + }, + { + "epoch": 2.613380755551283, + "grad_norm": 1.2364424467086792, + "learning_rate": 9.908765268662843e-05, + "loss": 2.2249, + "step": 27187 + }, + { + "epoch": 2.6134768816687495, + "grad_norm": 1.1105564832687378, + "learning_rate": 9.907599725705445e-05, + "loss": 1.9771, + "step": 27188 + }, + { + "epoch": 2.6135730077862154, + "grad_norm": 1.248314380645752, + "learning_rate": 9.906434219639545e-05, + "loss": 2.0564, + "step": 27189 + }, + { + "epoch": 2.6136691339036817, + "grad_norm": 1.0855655670166016, + "learning_rate": 9.905268750473344e-05, + "loss": 1.9543, + "step": 27190 + }, + { + "epoch": 2.6137652600211476, + "grad_norm": 1.2841167449951172, + "learning_rate": 9.904103318215043e-05, + "loss": 1.9162, + "step": 27191 + }, + { + "epoch": 2.613861386138614, + "grad_norm": 1.0381121635437012, + "learning_rate": 9.902937922872842e-05, + "loss": 1.9039, + "step": 27192 + }, + { + "epoch": 2.61395751225608, + "grad_norm": 1.2470439672470093, + "learning_rate": 9.901772564454947e-05, + "loss": 2.1012, + "step": 27193 + }, + { + "epoch": 2.614053638373546, + "grad_norm": 0.997094452381134, + "learning_rate": 9.900607242969552e-05, + "loss": 1.8427, + "step": 27194 + }, + { + "epoch": 2.614149764491012, + "grad_norm": 1.1531099081039429, + "learning_rate": 9.899441958424863e-05, + "loss": 2.0695, + "step": 27195 + }, + { + "epoch": 2.6142458906084785, + "grad_norm": 1.136942982673645, + "learning_rate": 9.898276710829078e-05, + "loss": 2.0583, + "step": 27196 + }, + { + "epoch": 2.6143420167259444, + "grad_norm": 1.2281630039215088, + "learning_rate": 9.8971115001904e-05, + "loss": 2.0103, + "step": 27197 + }, + { + "epoch": 2.6144381428434107, + "grad_norm": 1.1669598817825317, + "learning_rate": 9.895946326517022e-05, + "loss": 1.9696, + "step": 27198 + }, + { + "epoch": 2.6145342689608766, + "grad_norm": 0.9493838548660278, + "learning_rate": 9.894781189817151e-05, + "loss": 2.0343, + "step": 27199 + }, + { + "epoch": 2.614630395078343, + "grad_norm": 1.3233892917633057, + "learning_rate": 9.893616090098977e-05, + "loss": 2.0369, + "step": 27200 + }, + { + "epoch": 2.614726521195809, + "grad_norm": 1.0690792798995972, + "learning_rate": 9.892451027370708e-05, + "loss": 1.8085, + "step": 27201 + }, + { + "epoch": 2.614822647313275, + "grad_norm": 1.1878405809402466, + "learning_rate": 9.891286001640541e-05, + "loss": 2.134, + "step": 27202 + }, + { + "epoch": 2.614918773430741, + "grad_norm": 0.9712536931037903, + "learning_rate": 9.890121012916673e-05, + "loss": 1.949, + "step": 27203 + }, + { + "epoch": 2.615014899548207, + "grad_norm": 1.064374327659607, + "learning_rate": 9.888956061207302e-05, + "loss": 2.0043, + "step": 27204 + }, + { + "epoch": 2.6151110256656733, + "grad_norm": 1.178959846496582, + "learning_rate": 9.887791146520623e-05, + "loss": 2.1345, + "step": 27205 + }, + { + "epoch": 2.6152071517831397, + "grad_norm": 1.2154690027236938, + "learning_rate": 9.88662626886484e-05, + "loss": 2.0705, + "step": 27206 + }, + { + "epoch": 2.6153032779006056, + "grad_norm": 1.2783225774765015, + "learning_rate": 9.885461428248148e-05, + "loss": 2.0222, + "step": 27207 + }, + { + "epoch": 2.6153994040180715, + "grad_norm": 1.090733528137207, + "learning_rate": 9.88429662467874e-05, + "loss": 2.1253, + "step": 27208 + }, + { + "epoch": 2.615495530135538, + "grad_norm": 1.3350147008895874, + "learning_rate": 9.883131858164817e-05, + "loss": 2.0971, + "step": 27209 + }, + { + "epoch": 2.615591656253004, + "grad_norm": 1.1653413772583008, + "learning_rate": 9.881967128714576e-05, + "loss": 1.8605, + "step": 27210 + }, + { + "epoch": 2.61568778237047, + "grad_norm": 1.1039749383926392, + "learning_rate": 9.880802436336211e-05, + "loss": 2.0497, + "step": 27211 + }, + { + "epoch": 2.615783908487936, + "grad_norm": 1.1705150604248047, + "learning_rate": 9.879637781037921e-05, + "loss": 2.0253, + "step": 27212 + }, + { + "epoch": 2.6158800346054023, + "grad_norm": 1.1189628839492798, + "learning_rate": 9.878473162827899e-05, + "loss": 2.0518, + "step": 27213 + }, + { + "epoch": 2.6159761607228686, + "grad_norm": 1.0799036026000977, + "learning_rate": 9.877308581714341e-05, + "loss": 2.0487, + "step": 27214 + }, + { + "epoch": 2.6160722868403345, + "grad_norm": 1.1287071704864502, + "learning_rate": 9.876144037705447e-05, + "loss": 1.9618, + "step": 27215 + }, + { + "epoch": 2.6161684129578004, + "grad_norm": 1.1107789278030396, + "learning_rate": 9.874979530809403e-05, + "loss": 1.9041, + "step": 27216 + }, + { + "epoch": 2.616264539075267, + "grad_norm": 1.2335885763168335, + "learning_rate": 9.873815061034409e-05, + "loss": 2.0158, + "step": 27217 + }, + { + "epoch": 2.616360665192733, + "grad_norm": 1.0862271785736084, + "learning_rate": 9.87265062838866e-05, + "loss": 1.9207, + "step": 27218 + }, + { + "epoch": 2.616456791310199, + "grad_norm": 1.3591784238815308, + "learning_rate": 9.87148623288035e-05, + "loss": 1.9284, + "step": 27219 + }, + { + "epoch": 2.616552917427665, + "grad_norm": 1.0955990552902222, + "learning_rate": 9.870321874517672e-05, + "loss": 1.9818, + "step": 27220 + }, + { + "epoch": 2.6166490435451313, + "grad_norm": 1.1184595823287964, + "learning_rate": 9.869157553308824e-05, + "loss": 2.1506, + "step": 27221 + }, + { + "epoch": 2.616745169662597, + "grad_norm": 1.1850146055221558, + "learning_rate": 9.867993269261999e-05, + "loss": 2.1294, + "step": 27222 + }, + { + "epoch": 2.6168412957800635, + "grad_norm": 0.9748621582984924, + "learning_rate": 9.866829022385383e-05, + "loss": 1.9228, + "step": 27223 + }, + { + "epoch": 2.6169374218975294, + "grad_norm": 1.2115116119384766, + "learning_rate": 9.865664812687171e-05, + "loss": 2.0505, + "step": 27224 + }, + { + "epoch": 2.6170335480149958, + "grad_norm": 1.1298160552978516, + "learning_rate": 9.864500640175559e-05, + "loss": 2.0092, + "step": 27225 + }, + { + "epoch": 2.6171296741324617, + "grad_norm": 1.0971417427062988, + "learning_rate": 9.863336504858738e-05, + "loss": 1.997, + "step": 27226 + }, + { + "epoch": 2.617225800249928, + "grad_norm": 0.9371147155761719, + "learning_rate": 9.862172406744901e-05, + "loss": 1.8639, + "step": 27227 + }, + { + "epoch": 2.617321926367394, + "grad_norm": 1.2486509084701538, + "learning_rate": 9.861008345842235e-05, + "loss": 2.1241, + "step": 27228 + }, + { + "epoch": 2.6174180524848603, + "grad_norm": 1.0963674783706665, + "learning_rate": 9.859844322158939e-05, + "loss": 1.9604, + "step": 27229 + }, + { + "epoch": 2.617514178602326, + "grad_norm": 1.235185146331787, + "learning_rate": 9.858680335703201e-05, + "loss": 2.1988, + "step": 27230 + }, + { + "epoch": 2.6176103047197925, + "grad_norm": 1.2849483489990234, + "learning_rate": 9.857516386483217e-05, + "loss": 2.0542, + "step": 27231 + }, + { + "epoch": 2.6177064308372584, + "grad_norm": 1.1169079542160034, + "learning_rate": 9.856352474507169e-05, + "loss": 1.8783, + "step": 27232 + }, + { + "epoch": 2.6178025569547247, + "grad_norm": 1.0239837169647217, + "learning_rate": 9.855188599783254e-05, + "loss": 1.8122, + "step": 27233 + }, + { + "epoch": 2.6178986830721906, + "grad_norm": 0.9535133838653564, + "learning_rate": 9.854024762319658e-05, + "loss": 1.8839, + "step": 27234 + }, + { + "epoch": 2.617994809189657, + "grad_norm": 1.115934133529663, + "learning_rate": 9.852860962124575e-05, + "loss": 1.9601, + "step": 27235 + }, + { + "epoch": 2.618090935307123, + "grad_norm": 1.2645593881607056, + "learning_rate": 9.851697199206194e-05, + "loss": 2.0995, + "step": 27236 + }, + { + "epoch": 2.618187061424589, + "grad_norm": 1.1019537448883057, + "learning_rate": 9.850533473572702e-05, + "loss": 1.9163, + "step": 27237 + }, + { + "epoch": 2.618283187542055, + "grad_norm": 1.1441715955734253, + "learning_rate": 9.849369785232292e-05, + "loss": 1.9596, + "step": 27238 + }, + { + "epoch": 2.6183793136595215, + "grad_norm": 1.156940221786499, + "learning_rate": 9.84820613419315e-05, + "loss": 1.98, + "step": 27239 + }, + { + "epoch": 2.6184754397769874, + "grad_norm": 0.9582567811012268, + "learning_rate": 9.847042520463467e-05, + "loss": 1.8921, + "step": 27240 + }, + { + "epoch": 2.6185715658944533, + "grad_norm": 1.1025155782699585, + "learning_rate": 9.845878944051429e-05, + "loss": 2.0387, + "step": 27241 + }, + { + "epoch": 2.6186676920119196, + "grad_norm": 1.1460161209106445, + "learning_rate": 9.844715404965227e-05, + "loss": 2.1481, + "step": 27242 + }, + { + "epoch": 2.618763818129386, + "grad_norm": 1.245130181312561, + "learning_rate": 9.843551903213047e-05, + "loss": 1.9202, + "step": 27243 + }, + { + "epoch": 2.618859944246852, + "grad_norm": 0.9907774329185486, + "learning_rate": 9.842388438803077e-05, + "loss": 2.1089, + "step": 27244 + }, + { + "epoch": 2.6189560703643178, + "grad_norm": 1.0386728048324585, + "learning_rate": 9.841225011743504e-05, + "loss": 1.7936, + "step": 27245 + }, + { + "epoch": 2.619052196481784, + "grad_norm": 1.1506249904632568, + "learning_rate": 9.840061622042514e-05, + "loss": 1.9932, + "step": 27246 + }, + { + "epoch": 2.6191483225992505, + "grad_norm": 1.1347017288208008, + "learning_rate": 9.838898269708303e-05, + "loss": 1.9656, + "step": 27247 + }, + { + "epoch": 2.6192444487167164, + "grad_norm": 1.129646897315979, + "learning_rate": 9.837734954749045e-05, + "loss": 1.9634, + "step": 27248 + }, + { + "epoch": 2.6193405748341823, + "grad_norm": 1.1703020334243774, + "learning_rate": 9.836571677172936e-05, + "loss": 1.98, + "step": 27249 + }, + { + "epoch": 2.6194367009516486, + "grad_norm": 1.2059844732284546, + "learning_rate": 9.835408436988158e-05, + "loss": 1.9452, + "step": 27250 + }, + { + "epoch": 2.619532827069115, + "grad_norm": 1.1316192150115967, + "learning_rate": 9.834245234202893e-05, + "loss": 1.7883, + "step": 27251 + }, + { + "epoch": 2.619628953186581, + "grad_norm": 1.139428973197937, + "learning_rate": 9.833082068825335e-05, + "loss": 2.1099, + "step": 27252 + }, + { + "epoch": 2.6197250793040467, + "grad_norm": 1.0519682168960571, + "learning_rate": 9.831918940863662e-05, + "loss": 2.0202, + "step": 27253 + }, + { + "epoch": 2.619821205421513, + "grad_norm": 1.2360444068908691, + "learning_rate": 9.830755850326064e-05, + "loss": 2.0071, + "step": 27254 + }, + { + "epoch": 2.619917331538979, + "grad_norm": 0.924727201461792, + "learning_rate": 9.829592797220723e-05, + "loss": 1.8562, + "step": 27255 + }, + { + "epoch": 2.6200134576564453, + "grad_norm": 1.0390372276306152, + "learning_rate": 9.828429781555826e-05, + "loss": 1.9966, + "step": 27256 + }, + { + "epoch": 2.6201095837739112, + "grad_norm": 1.2264909744262695, + "learning_rate": 9.827266803339554e-05, + "loss": 2.0177, + "step": 27257 + }, + { + "epoch": 2.6202057098913776, + "grad_norm": 1.1740702390670776, + "learning_rate": 9.826103862580096e-05, + "loss": 1.8994, + "step": 27258 + }, + { + "epoch": 2.6203018360088435, + "grad_norm": 1.3867040872573853, + "learning_rate": 9.824940959285628e-05, + "loss": 2.0458, + "step": 27259 + }, + { + "epoch": 2.62039796212631, + "grad_norm": 1.0888534784317017, + "learning_rate": 9.823778093464342e-05, + "loss": 1.8803, + "step": 27260 + }, + { + "epoch": 2.6204940882437757, + "grad_norm": 1.0730050802230835, + "learning_rate": 9.822615265124416e-05, + "loss": 1.8555, + "step": 27261 + }, + { + "epoch": 2.620590214361242, + "grad_norm": 1.135585069656372, + "learning_rate": 9.821452474274031e-05, + "loss": 2.1301, + "step": 27262 + }, + { + "epoch": 2.620686340478708, + "grad_norm": 1.0861481428146362, + "learning_rate": 9.820289720921379e-05, + "loss": 1.9598, + "step": 27263 + }, + { + "epoch": 2.6207824665961743, + "grad_norm": 1.0768436193466187, + "learning_rate": 9.819127005074634e-05, + "loss": 2.0166, + "step": 27264 + }, + { + "epoch": 2.62087859271364, + "grad_norm": 1.1457648277282715, + "learning_rate": 9.817964326741981e-05, + "loss": 1.8424, + "step": 27265 + }, + { + "epoch": 2.6209747188311066, + "grad_norm": 1.1567935943603516, + "learning_rate": 9.816801685931601e-05, + "loss": 2.0577, + "step": 27266 + }, + { + "epoch": 2.6210708449485725, + "grad_norm": 1.0688350200653076, + "learning_rate": 9.815639082651679e-05, + "loss": 2.0578, + "step": 27267 + }, + { + "epoch": 2.621166971066039, + "grad_norm": 1.117729902267456, + "learning_rate": 9.814476516910393e-05, + "loss": 2.1134, + "step": 27268 + }, + { + "epoch": 2.6212630971835047, + "grad_norm": 1.0200848579406738, + "learning_rate": 9.813313988715923e-05, + "loss": 1.7765, + "step": 27269 + }, + { + "epoch": 2.6213592233009706, + "grad_norm": 1.2010855674743652, + "learning_rate": 9.812151498076453e-05, + "loss": 2.0326, + "step": 27270 + }, + { + "epoch": 2.621455349418437, + "grad_norm": 1.1404820680618286, + "learning_rate": 9.810989045000162e-05, + "loss": 2.141, + "step": 27271 + }, + { + "epoch": 2.6215514755359033, + "grad_norm": 1.0832035541534424, + "learning_rate": 9.809826629495232e-05, + "loss": 1.9885, + "step": 27272 + }, + { + "epoch": 2.621647601653369, + "grad_norm": 1.1085090637207031, + "learning_rate": 9.808664251569841e-05, + "loss": 1.9487, + "step": 27273 + }, + { + "epoch": 2.621743727770835, + "grad_norm": 1.1150060892105103, + "learning_rate": 9.807501911232171e-05, + "loss": 1.8937, + "step": 27274 + }, + { + "epoch": 2.6218398538883014, + "grad_norm": 1.1266828775405884, + "learning_rate": 9.806339608490398e-05, + "loss": 2.0481, + "step": 27275 + }, + { + "epoch": 2.6219359800057678, + "grad_norm": 1.1628962755203247, + "learning_rate": 9.805177343352705e-05, + "loss": 2.1384, + "step": 27276 + }, + { + "epoch": 2.6220321061232337, + "grad_norm": 1.042460560798645, + "learning_rate": 9.804015115827268e-05, + "loss": 1.8849, + "step": 27277 + }, + { + "epoch": 2.6221282322406996, + "grad_norm": 1.1621932983398438, + "learning_rate": 9.802852925922269e-05, + "loss": 1.9494, + "step": 27278 + }, + { + "epoch": 2.622224358358166, + "grad_norm": 1.0973836183547974, + "learning_rate": 9.801690773645881e-05, + "loss": 2.0582, + "step": 27279 + }, + { + "epoch": 2.6223204844756323, + "grad_norm": 0.965397298336029, + "learning_rate": 9.800528659006289e-05, + "loss": 1.897, + "step": 27280 + }, + { + "epoch": 2.622416610593098, + "grad_norm": 1.0887653827667236, + "learning_rate": 9.799366582011667e-05, + "loss": 2.195, + "step": 27281 + }, + { + "epoch": 2.622512736710564, + "grad_norm": 1.1378154754638672, + "learning_rate": 9.798204542670194e-05, + "loss": 2.0544, + "step": 27282 + }, + { + "epoch": 2.6226088628280304, + "grad_norm": 1.455252766609192, + "learning_rate": 9.797042540990049e-05, + "loss": 2.1199, + "step": 27283 + }, + { + "epoch": 2.6227049889454968, + "grad_norm": 1.0518183708190918, + "learning_rate": 9.795880576979405e-05, + "loss": 2.1281, + "step": 27284 + }, + { + "epoch": 2.6228011150629627, + "grad_norm": 1.1755021810531616, + "learning_rate": 9.794718650646441e-05, + "loss": 2.0963, + "step": 27285 + }, + { + "epoch": 2.6228972411804286, + "grad_norm": 1.2062033414840698, + "learning_rate": 9.793556761999336e-05, + "loss": 2.0302, + "step": 27286 + }, + { + "epoch": 2.622993367297895, + "grad_norm": 1.1110846996307373, + "learning_rate": 9.79239491104626e-05, + "loss": 2.0554, + "step": 27287 + }, + { + "epoch": 2.623089493415361, + "grad_norm": 1.0213216543197632, + "learning_rate": 9.7912330977954e-05, + "loss": 1.8107, + "step": 27288 + }, + { + "epoch": 2.623185619532827, + "grad_norm": 1.2584415674209595, + "learning_rate": 9.790071322254918e-05, + "loss": 2.037, + "step": 27289 + }, + { + "epoch": 2.623281745650293, + "grad_norm": 1.0953584909439087, + "learning_rate": 9.788909584433e-05, + "loss": 1.9374, + "step": 27290 + }, + { + "epoch": 2.6233778717677594, + "grad_norm": 1.1849030256271362, + "learning_rate": 9.78774788433782e-05, + "loss": 2.1042, + "step": 27291 + }, + { + "epoch": 2.6234739978852253, + "grad_norm": 1.2673755884170532, + "learning_rate": 9.786586221977548e-05, + "loss": 2.0986, + "step": 27292 + }, + { + "epoch": 2.6235701240026916, + "grad_norm": 1.1501235961914062, + "learning_rate": 9.785424597360363e-05, + "loss": 1.9967, + "step": 27293 + }, + { + "epoch": 2.6236662501201575, + "grad_norm": 1.1553839445114136, + "learning_rate": 9.78426301049444e-05, + "loss": 2.0638, + "step": 27294 + }, + { + "epoch": 2.623762376237624, + "grad_norm": 1.1286137104034424, + "learning_rate": 9.783101461387948e-05, + "loss": 1.8878, + "step": 27295 + }, + { + "epoch": 2.6238585023550898, + "grad_norm": 1.057157039642334, + "learning_rate": 9.781939950049062e-05, + "loss": 1.9309, + "step": 27296 + }, + { + "epoch": 2.623954628472556, + "grad_norm": 1.1580548286437988, + "learning_rate": 9.780778476485962e-05, + "loss": 1.8468, + "step": 27297 + }, + { + "epoch": 2.624050754590022, + "grad_norm": 1.2699756622314453, + "learning_rate": 9.779617040706819e-05, + "loss": 2.2059, + "step": 27298 + }, + { + "epoch": 2.6241468807074884, + "grad_norm": 1.2645072937011719, + "learning_rate": 9.778455642719804e-05, + "loss": 1.942, + "step": 27299 + }, + { + "epoch": 2.6242430068249543, + "grad_norm": 1.0338494777679443, + "learning_rate": 9.777294282533091e-05, + "loss": 1.8876, + "step": 27300 + }, + { + "epoch": 2.6243391329424206, + "grad_norm": 1.0694948434829712, + "learning_rate": 9.776132960154852e-05, + "loss": 1.7919, + "step": 27301 + }, + { + "epoch": 2.6244352590598865, + "grad_norm": 1.4066238403320312, + "learning_rate": 9.77497167559326e-05, + "loss": 2.1201, + "step": 27302 + }, + { + "epoch": 2.624531385177353, + "grad_norm": 1.0494035482406616, + "learning_rate": 9.773810428856485e-05, + "loss": 1.9531, + "step": 27303 + }, + { + "epoch": 2.6246275112948187, + "grad_norm": 1.0503383874893188, + "learning_rate": 9.772649219952705e-05, + "loss": 2.0366, + "step": 27304 + }, + { + "epoch": 2.624723637412285, + "grad_norm": 1.2502037286758423, + "learning_rate": 9.771488048890083e-05, + "loss": 2.166, + "step": 27305 + }, + { + "epoch": 2.624819763529751, + "grad_norm": 1.0256812572479248, + "learning_rate": 9.770326915676798e-05, + "loss": 1.9584, + "step": 27306 + }, + { + "epoch": 2.624915889647217, + "grad_norm": 0.9928293824195862, + "learning_rate": 9.769165820321018e-05, + "loss": 1.9869, + "step": 27307 + }, + { + "epoch": 2.6250120157646832, + "grad_norm": 1.142275094985962, + "learning_rate": 9.768004762830912e-05, + "loss": 2.0935, + "step": 27308 + }, + { + "epoch": 2.6251081418821496, + "grad_norm": 1.062665343284607, + "learning_rate": 9.766843743214654e-05, + "loss": 1.993, + "step": 27309 + }, + { + "epoch": 2.6252042679996155, + "grad_norm": 1.273120641708374, + "learning_rate": 9.765682761480411e-05, + "loss": 2.028, + "step": 27310 + }, + { + "epoch": 2.6253003941170814, + "grad_norm": 1.0789088010787964, + "learning_rate": 9.764521817636352e-05, + "loss": 1.9479, + "step": 27311 + }, + { + "epoch": 2.6253965202345477, + "grad_norm": 1.1563167572021484, + "learning_rate": 9.763360911690654e-05, + "loss": 2.1784, + "step": 27312 + }, + { + "epoch": 2.625492646352014, + "grad_norm": 1.308190107345581, + "learning_rate": 9.762200043651474e-05, + "loss": 1.9855, + "step": 27313 + }, + { + "epoch": 2.62558877246948, + "grad_norm": 1.1906853914260864, + "learning_rate": 9.761039213526994e-05, + "loss": 2.0226, + "step": 27314 + }, + { + "epoch": 2.625684898586946, + "grad_norm": 1.0333142280578613, + "learning_rate": 9.759878421325377e-05, + "loss": 1.8589, + "step": 27315 + }, + { + "epoch": 2.625781024704412, + "grad_norm": 1.2424055337905884, + "learning_rate": 9.758717667054794e-05, + "loss": 2.0866, + "step": 27316 + }, + { + "epoch": 2.6258771508218786, + "grad_norm": 1.1200201511383057, + "learning_rate": 9.75755695072341e-05, + "loss": 2.0448, + "step": 27317 + }, + { + "epoch": 2.6259732769393445, + "grad_norm": 1.1329447031021118, + "learning_rate": 9.756396272339395e-05, + "loss": 1.9825, + "step": 27318 + }, + { + "epoch": 2.6260694030568104, + "grad_norm": 1.2328746318817139, + "learning_rate": 9.755235631910915e-05, + "loss": 1.9819, + "step": 27319 + }, + { + "epoch": 2.6261655291742767, + "grad_norm": 1.1128476858139038, + "learning_rate": 9.754075029446141e-05, + "loss": 2.0529, + "step": 27320 + }, + { + "epoch": 2.6262616552917426, + "grad_norm": 1.1603895425796509, + "learning_rate": 9.752914464953239e-05, + "loss": 2.0578, + "step": 27321 + }, + { + "epoch": 2.626357781409209, + "grad_norm": 1.2129337787628174, + "learning_rate": 9.751753938440375e-05, + "loss": 2.0454, + "step": 27322 + }, + { + "epoch": 2.626453907526675, + "grad_norm": 1.21676766872406, + "learning_rate": 9.750593449915717e-05, + "loss": 2.0511, + "step": 27323 + }, + { + "epoch": 2.626550033644141, + "grad_norm": 1.0437633991241455, + "learning_rate": 9.749432999387429e-05, + "loss": 1.9782, + "step": 27324 + }, + { + "epoch": 2.626646159761607, + "grad_norm": 1.203524112701416, + "learning_rate": 9.748272586863683e-05, + "loss": 2.0529, + "step": 27325 + }, + { + "epoch": 2.6267422858790734, + "grad_norm": 1.1805789470672607, + "learning_rate": 9.74711221235264e-05, + "loss": 1.9674, + "step": 27326 + }, + { + "epoch": 2.6268384119965393, + "grad_norm": 1.1220327615737915, + "learning_rate": 9.745951875862466e-05, + "loss": 2.1348, + "step": 27327 + }, + { + "epoch": 2.6269345381140057, + "grad_norm": 1.1541372537612915, + "learning_rate": 9.744791577401325e-05, + "loss": 2.0157, + "step": 27328 + }, + { + "epoch": 2.6270306642314716, + "grad_norm": 1.1250042915344238, + "learning_rate": 9.743631316977387e-05, + "loss": 2.1229, + "step": 27329 + }, + { + "epoch": 2.627126790348938, + "grad_norm": 1.0225709676742554, + "learning_rate": 9.74247109459881e-05, + "loss": 2.1377, + "step": 27330 + }, + { + "epoch": 2.627222916466404, + "grad_norm": 1.3380719423294067, + "learning_rate": 9.741310910273767e-05, + "loss": 1.9985, + "step": 27331 + }, + { + "epoch": 2.62731904258387, + "grad_norm": 1.174697756767273, + "learning_rate": 9.740150764010419e-05, + "loss": 2.0554, + "step": 27332 + }, + { + "epoch": 2.627415168701336, + "grad_norm": 1.154489517211914, + "learning_rate": 9.738990655816928e-05, + "loss": 1.913, + "step": 27333 + }, + { + "epoch": 2.6275112948188024, + "grad_norm": 1.1092841625213623, + "learning_rate": 9.737830585701462e-05, + "loss": 1.9568, + "step": 27334 + }, + { + "epoch": 2.6276074209362683, + "grad_norm": 1.1933047771453857, + "learning_rate": 9.73667055367218e-05, + "loss": 1.994, + "step": 27335 + }, + { + "epoch": 2.6277035470537347, + "grad_norm": 1.031928539276123, + "learning_rate": 9.735510559737246e-05, + "loss": 2.0313, + "step": 27336 + }, + { + "epoch": 2.6277996731712006, + "grad_norm": 1.1197545528411865, + "learning_rate": 9.734350603904826e-05, + "loss": 1.9278, + "step": 27337 + }, + { + "epoch": 2.627895799288667, + "grad_norm": 1.1815382242202759, + "learning_rate": 9.733190686183082e-05, + "loss": 2.0662, + "step": 27338 + }, + { + "epoch": 2.627991925406133, + "grad_norm": 1.218977689743042, + "learning_rate": 9.732030806580173e-05, + "loss": 1.95, + "step": 27339 + }, + { + "epoch": 2.6280880515235987, + "grad_norm": 0.9758931994438171, + "learning_rate": 9.730870965104267e-05, + "loss": 1.8537, + "step": 27340 + }, + { + "epoch": 2.628184177641065, + "grad_norm": 1.0464627742767334, + "learning_rate": 9.729711161763519e-05, + "loss": 2.0675, + "step": 27341 + }, + { + "epoch": 2.6282803037585314, + "grad_norm": 1.0873390436172485, + "learning_rate": 9.728551396566098e-05, + "loss": 1.7903, + "step": 27342 + }, + { + "epoch": 2.6283764298759973, + "grad_norm": 1.2277100086212158, + "learning_rate": 9.727391669520161e-05, + "loss": 2.021, + "step": 27343 + }, + { + "epoch": 2.628472555993463, + "grad_norm": 0.9783762097358704, + "learning_rate": 9.72623198063387e-05, + "loss": 1.9481, + "step": 27344 + }, + { + "epoch": 2.6285686821109295, + "grad_norm": 1.0336188077926636, + "learning_rate": 9.725072329915387e-05, + "loss": 1.9702, + "step": 27345 + }, + { + "epoch": 2.628664808228396, + "grad_norm": 1.0014259815216064, + "learning_rate": 9.72391271737287e-05, + "loss": 1.9606, + "step": 27346 + }, + { + "epoch": 2.6287609343458618, + "grad_norm": 1.1292988061904907, + "learning_rate": 9.722753143014477e-05, + "loss": 1.9124, + "step": 27347 + }, + { + "epoch": 2.6288570604633277, + "grad_norm": 1.1545524597167969, + "learning_rate": 9.721593606848375e-05, + "loss": 2.1265, + "step": 27348 + }, + { + "epoch": 2.628953186580794, + "grad_norm": 1.1156249046325684, + "learning_rate": 9.720434108882725e-05, + "loss": 2.1316, + "step": 27349 + }, + { + "epoch": 2.6290493126982604, + "grad_norm": 1.133538842201233, + "learning_rate": 9.719274649125679e-05, + "loss": 2.1683, + "step": 27350 + }, + { + "epoch": 2.6291454388157263, + "grad_norm": 1.1066374778747559, + "learning_rate": 9.7181152275854e-05, + "loss": 1.939, + "step": 27351 + }, + { + "epoch": 2.629241564933192, + "grad_norm": 1.0709612369537354, + "learning_rate": 9.716955844270045e-05, + "loss": 1.9874, + "step": 27352 + }, + { + "epoch": 2.6293376910506585, + "grad_norm": 1.1702758073806763, + "learning_rate": 9.715796499187777e-05, + "loss": 2.0648, + "step": 27353 + }, + { + "epoch": 2.629433817168125, + "grad_norm": 1.135675072669983, + "learning_rate": 9.71463719234675e-05, + "loss": 1.9968, + "step": 27354 + }, + { + "epoch": 2.6295299432855908, + "grad_norm": 1.0477372407913208, + "learning_rate": 9.713477923755126e-05, + "loss": 2.014, + "step": 27355 + }, + { + "epoch": 2.6296260694030567, + "grad_norm": 1.2512681484222412, + "learning_rate": 9.71231869342106e-05, + "loss": 2.0594, + "step": 27356 + }, + { + "epoch": 2.629722195520523, + "grad_norm": 1.09530770778656, + "learning_rate": 9.711159501352711e-05, + "loss": 2.1552, + "step": 27357 + }, + { + "epoch": 2.629818321637989, + "grad_norm": 1.0405105352401733, + "learning_rate": 9.710000347558236e-05, + "loss": 1.9017, + "step": 27358 + }, + { + "epoch": 2.6299144477554552, + "grad_norm": 1.1023656129837036, + "learning_rate": 9.708841232045793e-05, + "loss": 1.9834, + "step": 27359 + }, + { + "epoch": 2.630010573872921, + "grad_norm": 1.101034164428711, + "learning_rate": 9.707682154823537e-05, + "loss": 2.0088, + "step": 27360 + }, + { + "epoch": 2.6301066999903875, + "grad_norm": 1.107828974723816, + "learning_rate": 9.706523115899625e-05, + "loss": 2.1083, + "step": 27361 + }, + { + "epoch": 2.6302028261078534, + "grad_norm": 1.1880559921264648, + "learning_rate": 9.705364115282217e-05, + "loss": 2.0592, + "step": 27362 + }, + { + "epoch": 2.6302989522253197, + "grad_norm": 1.157116174697876, + "learning_rate": 9.704205152979463e-05, + "loss": 1.9912, + "step": 27363 + }, + { + "epoch": 2.6303950783427856, + "grad_norm": 1.0952500104904175, + "learning_rate": 9.70304622899952e-05, + "loss": 1.8998, + "step": 27364 + }, + { + "epoch": 2.630491204460252, + "grad_norm": 1.0078470706939697, + "learning_rate": 9.701887343350548e-05, + "loss": 1.95, + "step": 27365 + }, + { + "epoch": 2.630587330577718, + "grad_norm": 1.2406269311904907, + "learning_rate": 9.700728496040698e-05, + "loss": 2.2709, + "step": 27366 + }, + { + "epoch": 2.630683456695184, + "grad_norm": 1.0560275316238403, + "learning_rate": 9.69956968707813e-05, + "loss": 1.9856, + "step": 27367 + }, + { + "epoch": 2.63077958281265, + "grad_norm": 1.1468489170074463, + "learning_rate": 9.698410916470991e-05, + "loss": 2.0304, + "step": 27368 + }, + { + "epoch": 2.6308757089301165, + "grad_norm": 1.0325368642807007, + "learning_rate": 9.697252184227442e-05, + "loss": 2.06, + "step": 27369 + }, + { + "epoch": 2.6309718350475824, + "grad_norm": 1.1486172676086426, + "learning_rate": 9.696093490355634e-05, + "loss": 2.0319, + "step": 27370 + }, + { + "epoch": 2.6310679611650487, + "grad_norm": 1.0148372650146484, + "learning_rate": 9.694934834863724e-05, + "loss": 1.94, + "step": 27371 + }, + { + "epoch": 2.6311640872825146, + "grad_norm": 0.9931276440620422, + "learning_rate": 9.693776217759863e-05, + "loss": 1.9708, + "step": 27372 + }, + { + "epoch": 2.6312602133999805, + "grad_norm": 1.1284184455871582, + "learning_rate": 9.692617639052203e-05, + "loss": 1.9573, + "step": 27373 + }, + { + "epoch": 2.631356339517447, + "grad_norm": 1.1405938863754272, + "learning_rate": 9.691459098748898e-05, + "loss": 2.0041, + "step": 27374 + }, + { + "epoch": 2.631452465634913, + "grad_norm": 1.0334498882293701, + "learning_rate": 9.690300596858105e-05, + "loss": 2.0616, + "step": 27375 + }, + { + "epoch": 2.631548591752379, + "grad_norm": 1.1224538087844849, + "learning_rate": 9.689142133387968e-05, + "loss": 1.8441, + "step": 27376 + }, + { + "epoch": 2.631644717869845, + "grad_norm": 0.9909238815307617, + "learning_rate": 9.687983708346649e-05, + "loss": 1.9709, + "step": 27377 + }, + { + "epoch": 2.6317408439873113, + "grad_norm": 1.0556485652923584, + "learning_rate": 9.686825321742294e-05, + "loss": 1.9352, + "step": 27378 + }, + { + "epoch": 2.6318369701047777, + "grad_norm": 1.2442268133163452, + "learning_rate": 9.685666973583054e-05, + "loss": 2.1128, + "step": 27379 + }, + { + "epoch": 2.6319330962222436, + "grad_norm": 1.2840662002563477, + "learning_rate": 9.684508663877084e-05, + "loss": 2.0999, + "step": 27380 + }, + { + "epoch": 2.6320292223397095, + "grad_norm": 1.239487648010254, + "learning_rate": 9.683350392632532e-05, + "loss": 2.0057, + "step": 27381 + }, + { + "epoch": 2.632125348457176, + "grad_norm": 1.1345027685165405, + "learning_rate": 9.682192159857552e-05, + "loss": 1.9176, + "step": 27382 + }, + { + "epoch": 2.632221474574642, + "grad_norm": 1.270456075668335, + "learning_rate": 9.681033965560295e-05, + "loss": 2.2132, + "step": 27383 + }, + { + "epoch": 2.632317600692108, + "grad_norm": 0.9997849464416504, + "learning_rate": 9.679875809748906e-05, + "loss": 1.7366, + "step": 27384 + }, + { + "epoch": 2.632413726809574, + "grad_norm": 1.1300894021987915, + "learning_rate": 9.678717692431543e-05, + "loss": 2.0618, + "step": 27385 + }, + { + "epoch": 2.6325098529270403, + "grad_norm": 1.0424596071243286, + "learning_rate": 9.677559613616349e-05, + "loss": 1.8973, + "step": 27386 + }, + { + "epoch": 2.6326059790445067, + "grad_norm": 1.2518813610076904, + "learning_rate": 9.676401573311476e-05, + "loss": 2.0435, + "step": 27387 + }, + { + "epoch": 2.6327021051619726, + "grad_norm": 1.1205304861068726, + "learning_rate": 9.675243571525076e-05, + "loss": 1.8907, + "step": 27388 + }, + { + "epoch": 2.6327982312794385, + "grad_norm": 1.1043245792388916, + "learning_rate": 9.674085608265293e-05, + "loss": 1.9188, + "step": 27389 + }, + { + "epoch": 2.632894357396905, + "grad_norm": 1.1371639966964722, + "learning_rate": 9.672927683540279e-05, + "loss": 2.0209, + "step": 27390 + }, + { + "epoch": 2.6329904835143707, + "grad_norm": 1.1631569862365723, + "learning_rate": 9.671769797358182e-05, + "loss": 1.9457, + "step": 27391 + }, + { + "epoch": 2.633086609631837, + "grad_norm": 1.1858338117599487, + "learning_rate": 9.67061194972715e-05, + "loss": 1.9131, + "step": 27392 + }, + { + "epoch": 2.633182735749303, + "grad_norm": 1.06905198097229, + "learning_rate": 9.66945414065533e-05, + "loss": 2.0184, + "step": 27393 + }, + { + "epoch": 2.6332788618667693, + "grad_norm": 1.0704405307769775, + "learning_rate": 9.668296370150871e-05, + "loss": 1.988, + "step": 27394 + }, + { + "epoch": 2.633374987984235, + "grad_norm": 1.1239831447601318, + "learning_rate": 9.667138638221921e-05, + "loss": 2.0475, + "step": 27395 + }, + { + "epoch": 2.6334711141017015, + "grad_norm": 1.1399294137954712, + "learning_rate": 9.665980944876623e-05, + "loss": 2.0397, + "step": 27396 + }, + { + "epoch": 2.6335672402191674, + "grad_norm": 1.0262738466262817, + "learning_rate": 9.664823290123129e-05, + "loss": 1.9379, + "step": 27397 + }, + { + "epoch": 2.633663366336634, + "grad_norm": 1.1924588680267334, + "learning_rate": 9.66366567396958e-05, + "loss": 1.9741, + "step": 27398 + }, + { + "epoch": 2.6337594924540997, + "grad_norm": 1.2207280397415161, + "learning_rate": 9.662508096424128e-05, + "loss": 2.0078, + "step": 27399 + }, + { + "epoch": 2.633855618571566, + "grad_norm": 1.1806378364562988, + "learning_rate": 9.661350557494918e-05, + "loss": 1.9727, + "step": 27400 + }, + { + "epoch": 2.633951744689032, + "grad_norm": 1.0934809446334839, + "learning_rate": 9.660193057190094e-05, + "loss": 1.8762, + "step": 27401 + }, + { + "epoch": 2.6340478708064983, + "grad_norm": 1.1487420797348022, + "learning_rate": 9.659035595517803e-05, + "loss": 2.1317, + "step": 27402 + }, + { + "epoch": 2.634143996923964, + "grad_norm": 1.290636420249939, + "learning_rate": 9.657878172486188e-05, + "loss": 2.036, + "step": 27403 + }, + { + "epoch": 2.6342401230414305, + "grad_norm": 1.201427936553955, + "learning_rate": 9.656720788103396e-05, + "loss": 1.9494, + "step": 27404 + }, + { + "epoch": 2.6343362491588964, + "grad_norm": 1.1203861236572266, + "learning_rate": 9.65556344237757e-05, + "loss": 1.8303, + "step": 27405 + }, + { + "epoch": 2.6344323752763623, + "grad_norm": 1.0363975763320923, + "learning_rate": 9.654406135316853e-05, + "loss": 1.9854, + "step": 27406 + }, + { + "epoch": 2.6345285013938287, + "grad_norm": 1.1121575832366943, + "learning_rate": 9.653248866929394e-05, + "loss": 2.0321, + "step": 27407 + }, + { + "epoch": 2.634624627511295, + "grad_norm": 1.153202772140503, + "learning_rate": 9.652091637223336e-05, + "loss": 2.0358, + "step": 27408 + }, + { + "epoch": 2.634720753628761, + "grad_norm": 0.9222634434700012, + "learning_rate": 9.65093444620682e-05, + "loss": 1.8956, + "step": 27409 + }, + { + "epoch": 2.634816879746227, + "grad_norm": 1.2733020782470703, + "learning_rate": 9.649777293887989e-05, + "loss": 2.1297, + "step": 27410 + }, + { + "epoch": 2.634913005863693, + "grad_norm": 1.0822908878326416, + "learning_rate": 9.648620180274987e-05, + "loss": 1.8836, + "step": 27411 + }, + { + "epoch": 2.6350091319811595, + "grad_norm": 1.2232366800308228, + "learning_rate": 9.647463105375958e-05, + "loss": 2.0084, + "step": 27412 + }, + { + "epoch": 2.6351052580986254, + "grad_norm": 1.2398221492767334, + "learning_rate": 9.646306069199043e-05, + "loss": 2.194, + "step": 27413 + }, + { + "epoch": 2.6352013842160913, + "grad_norm": 1.049198031425476, + "learning_rate": 9.645149071752383e-05, + "loss": 1.9537, + "step": 27414 + }, + { + "epoch": 2.6352975103335576, + "grad_norm": 1.3393605947494507, + "learning_rate": 9.64399211304412e-05, + "loss": 1.9689, + "step": 27415 + }, + { + "epoch": 2.635393636451024, + "grad_norm": 1.2962888479232788, + "learning_rate": 9.642835193082402e-05, + "loss": 2.0536, + "step": 27416 + }, + { + "epoch": 2.63548976256849, + "grad_norm": 1.256187915802002, + "learning_rate": 9.641678311875363e-05, + "loss": 2.1782, + "step": 27417 + }, + { + "epoch": 2.635585888685956, + "grad_norm": 1.2153512239456177, + "learning_rate": 9.640521469431149e-05, + "loss": 2.0435, + "step": 27418 + }, + { + "epoch": 2.635682014803422, + "grad_norm": 1.0694249868392944, + "learning_rate": 9.639364665757898e-05, + "loss": 2.0982, + "step": 27419 + }, + { + "epoch": 2.6357781409208885, + "grad_norm": 0.9931066632270813, + "learning_rate": 9.638207900863751e-05, + "loss": 2.0277, + "step": 27420 + }, + { + "epoch": 2.6358742670383544, + "grad_norm": 1.0755456686019897, + "learning_rate": 9.637051174756848e-05, + "loss": 1.959, + "step": 27421 + }, + { + "epoch": 2.6359703931558203, + "grad_norm": 1.0973989963531494, + "learning_rate": 9.635894487445328e-05, + "loss": 1.9595, + "step": 27422 + }, + { + "epoch": 2.6360665192732866, + "grad_norm": 1.1490792036056519, + "learning_rate": 9.634737838937337e-05, + "loss": 2.024, + "step": 27423 + }, + { + "epoch": 2.6361626453907525, + "grad_norm": 1.0467784404754639, + "learning_rate": 9.633581229241006e-05, + "loss": 1.9499, + "step": 27424 + }, + { + "epoch": 2.636258771508219, + "grad_norm": 1.1642597913742065, + "learning_rate": 9.63242465836448e-05, + "loss": 2.0927, + "step": 27425 + }, + { + "epoch": 2.6363548976256848, + "grad_norm": 1.1254559755325317, + "learning_rate": 9.631268126315896e-05, + "loss": 1.9607, + "step": 27426 + }, + { + "epoch": 2.636451023743151, + "grad_norm": 1.1215544939041138, + "learning_rate": 9.630111633103394e-05, + "loss": 1.9727, + "step": 27427 + }, + { + "epoch": 2.636547149860617, + "grad_norm": 1.1480716466903687, + "learning_rate": 9.628955178735108e-05, + "loss": 1.8377, + "step": 27428 + }, + { + "epoch": 2.6366432759780833, + "grad_norm": 1.1758508682250977, + "learning_rate": 9.627798763219182e-05, + "loss": 2.0911, + "step": 27429 + }, + { + "epoch": 2.6367394020955492, + "grad_norm": 1.097348690032959, + "learning_rate": 9.62664238656375e-05, + "loss": 1.9924, + "step": 27430 + }, + { + "epoch": 2.6368355282130156, + "grad_norm": 1.106726884841919, + "learning_rate": 9.625486048776952e-05, + "loss": 1.8463, + "step": 27431 + }, + { + "epoch": 2.6369316543304815, + "grad_norm": 1.1055091619491577, + "learning_rate": 9.624329749866918e-05, + "loss": 2.0031, + "step": 27432 + }, + { + "epoch": 2.637027780447948, + "grad_norm": 1.0128690004348755, + "learning_rate": 9.623173489841797e-05, + "loss": 2.0058, + "step": 27433 + }, + { + "epoch": 2.6371239065654137, + "grad_norm": 0.9849504828453064, + "learning_rate": 9.622017268709719e-05, + "loss": 1.7618, + "step": 27434 + }, + { + "epoch": 2.63722003268288, + "grad_norm": 0.9666386842727661, + "learning_rate": 9.620861086478821e-05, + "loss": 1.7788, + "step": 27435 + }, + { + "epoch": 2.637316158800346, + "grad_norm": 1.0680248737335205, + "learning_rate": 9.619704943157239e-05, + "loss": 2.0973, + "step": 27436 + }, + { + "epoch": 2.6374122849178123, + "grad_norm": 1.2086663246154785, + "learning_rate": 9.61854883875311e-05, + "loss": 1.9702, + "step": 27437 + }, + { + "epoch": 2.637508411035278, + "grad_norm": 1.1958082914352417, + "learning_rate": 9.61739277327457e-05, + "loss": 1.8947, + "step": 27438 + }, + { + "epoch": 2.6376045371527446, + "grad_norm": 1.3885252475738525, + "learning_rate": 9.616236746729751e-05, + "loss": 2.1453, + "step": 27439 + }, + { + "epoch": 2.6377006632702105, + "grad_norm": 1.1436641216278076, + "learning_rate": 9.615080759126794e-05, + "loss": 1.9946, + "step": 27440 + }, + { + "epoch": 2.637796789387677, + "grad_norm": 1.0489349365234375, + "learning_rate": 9.613924810473829e-05, + "loss": 2.1486, + "step": 27441 + }, + { + "epoch": 2.6378929155051427, + "grad_norm": 1.1055841445922852, + "learning_rate": 9.612768900778992e-05, + "loss": 2.0914, + "step": 27442 + }, + { + "epoch": 2.6379890416226086, + "grad_norm": 1.0305759906768799, + "learning_rate": 9.611613030050419e-05, + "loss": 1.9398, + "step": 27443 + }, + { + "epoch": 2.638085167740075, + "grad_norm": 1.238753318786621, + "learning_rate": 9.61045719829624e-05, + "loss": 2.1353, + "step": 27444 + }, + { + "epoch": 2.6381812938575413, + "grad_norm": 1.0778019428253174, + "learning_rate": 9.60930140552459e-05, + "loss": 1.9485, + "step": 27445 + }, + { + "epoch": 2.638277419975007, + "grad_norm": 1.191454291343689, + "learning_rate": 9.608145651743607e-05, + "loss": 1.9182, + "step": 27446 + }, + { + "epoch": 2.638373546092473, + "grad_norm": 0.9610220193862915, + "learning_rate": 9.606989936961417e-05, + "loss": 1.867, + "step": 27447 + }, + { + "epoch": 2.6384696722099394, + "grad_norm": 1.232327938079834, + "learning_rate": 9.60583426118616e-05, + "loss": 1.9844, + "step": 27448 + }, + { + "epoch": 2.638565798327406, + "grad_norm": 1.1269582509994507, + "learning_rate": 9.604678624425962e-05, + "loss": 1.9678, + "step": 27449 + }, + { + "epoch": 2.6386619244448717, + "grad_norm": 1.0691856145858765, + "learning_rate": 9.60352302668896e-05, + "loss": 2.0193, + "step": 27450 + }, + { + "epoch": 2.6387580505623376, + "grad_norm": 1.1374397277832031, + "learning_rate": 9.602367467983287e-05, + "loss": 1.8494, + "step": 27451 + }, + { + "epoch": 2.638854176679804, + "grad_norm": 1.0840867757797241, + "learning_rate": 9.601211948317073e-05, + "loss": 1.9246, + "step": 27452 + }, + { + "epoch": 2.6389503027972703, + "grad_norm": 1.1147923469543457, + "learning_rate": 9.600056467698448e-05, + "loss": 1.8454, + "step": 27453 + }, + { + "epoch": 2.639046428914736, + "grad_norm": 1.2763638496398926, + "learning_rate": 9.598901026135545e-05, + "loss": 2.1223, + "step": 27454 + }, + { + "epoch": 2.639142555032202, + "grad_norm": 1.3351194858551025, + "learning_rate": 9.597745623636495e-05, + "loss": 2.0743, + "step": 27455 + }, + { + "epoch": 2.6392386811496684, + "grad_norm": 1.1930729150772095, + "learning_rate": 9.596590260209428e-05, + "loss": 2.0057, + "step": 27456 + }, + { + "epoch": 2.6393348072671343, + "grad_norm": 1.0653477907180786, + "learning_rate": 9.595434935862474e-05, + "loss": 1.9547, + "step": 27457 + }, + { + "epoch": 2.6394309333846007, + "grad_norm": 1.1191236972808838, + "learning_rate": 9.594279650603762e-05, + "loss": 2.0075, + "step": 27458 + }, + { + "epoch": 2.6395270595020666, + "grad_norm": 1.1871161460876465, + "learning_rate": 9.593124404441426e-05, + "loss": 2.0733, + "step": 27459 + }, + { + "epoch": 2.639623185619533, + "grad_norm": 1.1431037187576294, + "learning_rate": 9.591969197383594e-05, + "loss": 2.0696, + "step": 27460 + }, + { + "epoch": 2.639719311736999, + "grad_norm": 1.2153228521347046, + "learning_rate": 9.590814029438396e-05, + "loss": 2.0704, + "step": 27461 + }, + { + "epoch": 2.639815437854465, + "grad_norm": 1.1102941036224365, + "learning_rate": 9.589658900613958e-05, + "loss": 1.8984, + "step": 27462 + }, + { + "epoch": 2.639911563971931, + "grad_norm": 1.1649503707885742, + "learning_rate": 9.58850381091841e-05, + "loss": 2.0134, + "step": 27463 + }, + { + "epoch": 2.6400076900893974, + "grad_norm": 1.0877050161361694, + "learning_rate": 9.587348760359881e-05, + "loss": 1.8819, + "step": 27464 + }, + { + "epoch": 2.6401038162068633, + "grad_norm": 1.1356563568115234, + "learning_rate": 9.5861937489465e-05, + "loss": 2.106, + "step": 27465 + }, + { + "epoch": 2.6401999423243296, + "grad_norm": 1.1283513307571411, + "learning_rate": 9.58503877668639e-05, + "loss": 1.9955, + "step": 27466 + }, + { + "epoch": 2.6402960684417955, + "grad_norm": 0.9700493216514587, + "learning_rate": 9.58388384358769e-05, + "loss": 1.987, + "step": 27467 + }, + { + "epoch": 2.640392194559262, + "grad_norm": 1.1995258331298828, + "learning_rate": 9.582728949658517e-05, + "loss": 1.9998, + "step": 27468 + }, + { + "epoch": 2.640488320676728, + "grad_norm": 1.136001467704773, + "learning_rate": 9.581574094907005e-05, + "loss": 2.0097, + "step": 27469 + }, + { + "epoch": 2.640584446794194, + "grad_norm": 1.3555454015731812, + "learning_rate": 9.580419279341272e-05, + "loss": 2.1558, + "step": 27470 + }, + { + "epoch": 2.64068057291166, + "grad_norm": 1.1703169345855713, + "learning_rate": 9.579264502969455e-05, + "loss": 1.9479, + "step": 27471 + }, + { + "epoch": 2.6407766990291264, + "grad_norm": 1.3822685480117798, + "learning_rate": 9.578109765799674e-05, + "loss": 2.1324, + "step": 27472 + }, + { + "epoch": 2.6408728251465923, + "grad_norm": 1.0749566555023193, + "learning_rate": 9.576955067840055e-05, + "loss": 1.9756, + "step": 27473 + }, + { + "epoch": 2.6409689512640586, + "grad_norm": 1.0737557411193848, + "learning_rate": 9.575800409098729e-05, + "loss": 2.0611, + "step": 27474 + }, + { + "epoch": 2.6410650773815245, + "grad_norm": 1.027976155281067, + "learning_rate": 9.574645789583814e-05, + "loss": 1.9671, + "step": 27475 + }, + { + "epoch": 2.6411612034989904, + "grad_norm": 1.108088493347168, + "learning_rate": 9.573491209303442e-05, + "loss": 1.9643, + "step": 27476 + }, + { + "epoch": 2.6412573296164568, + "grad_norm": 1.045654058456421, + "learning_rate": 9.572336668265733e-05, + "loss": 1.9523, + "step": 27477 + }, + { + "epoch": 2.641353455733923, + "grad_norm": 1.0335601568222046, + "learning_rate": 9.571182166478812e-05, + "loss": 1.8465, + "step": 27478 + }, + { + "epoch": 2.641449581851389, + "grad_norm": 1.1766958236694336, + "learning_rate": 9.57002770395081e-05, + "loss": 2.0081, + "step": 27479 + }, + { + "epoch": 2.641545707968855, + "grad_norm": 1.1559723615646362, + "learning_rate": 9.568873280689841e-05, + "loss": 2.1627, + "step": 27480 + }, + { + "epoch": 2.6416418340863212, + "grad_norm": 1.0399631261825562, + "learning_rate": 9.567718896704038e-05, + "loss": 1.8675, + "step": 27481 + }, + { + "epoch": 2.6417379602037876, + "grad_norm": 1.124542474746704, + "learning_rate": 9.566564552001517e-05, + "loss": 1.8995, + "step": 27482 + }, + { + "epoch": 2.6418340863212535, + "grad_norm": 1.1835989952087402, + "learning_rate": 9.565410246590405e-05, + "loss": 1.9529, + "step": 27483 + }, + { + "epoch": 2.6419302124387194, + "grad_norm": 1.2032114267349243, + "learning_rate": 9.564255980478824e-05, + "loss": 1.9225, + "step": 27484 + }, + { + "epoch": 2.6420263385561857, + "grad_norm": 1.1098880767822266, + "learning_rate": 9.563101753674901e-05, + "loss": 1.9862, + "step": 27485 + }, + { + "epoch": 2.642122464673652, + "grad_norm": 1.1242040395736694, + "learning_rate": 9.561947566186754e-05, + "loss": 2.0789, + "step": 27486 + }, + { + "epoch": 2.642218590791118, + "grad_norm": 1.1926559209823608, + "learning_rate": 9.560793418022505e-05, + "loss": 2.2089, + "step": 27487 + }, + { + "epoch": 2.642314716908584, + "grad_norm": 1.1405613422393799, + "learning_rate": 9.559639309190278e-05, + "loss": 2.0854, + "step": 27488 + }, + { + "epoch": 2.6424108430260502, + "grad_norm": 1.1140466928482056, + "learning_rate": 9.558485239698194e-05, + "loss": 2.0224, + "step": 27489 + }, + { + "epoch": 2.6425069691435166, + "grad_norm": 1.3460965156555176, + "learning_rate": 9.557331209554372e-05, + "loss": 2.0097, + "step": 27490 + }, + { + "epoch": 2.6426030952609825, + "grad_norm": 1.1625046730041504, + "learning_rate": 9.556177218766937e-05, + "loss": 1.9131, + "step": 27491 + }, + { + "epoch": 2.6426992213784484, + "grad_norm": 1.0516878366470337, + "learning_rate": 9.555023267344006e-05, + "loss": 1.9551, + "step": 27492 + }, + { + "epoch": 2.6427953474959147, + "grad_norm": 1.0704413652420044, + "learning_rate": 9.553869355293705e-05, + "loss": 2.2379, + "step": 27493 + }, + { + "epoch": 2.6428914736133806, + "grad_norm": 1.0036230087280273, + "learning_rate": 9.552715482624148e-05, + "loss": 1.8772, + "step": 27494 + }, + { + "epoch": 2.642987599730847, + "grad_norm": 1.0733680725097656, + "learning_rate": 9.551561649343457e-05, + "loss": 1.9909, + "step": 27495 + }, + { + "epoch": 2.643083725848313, + "grad_norm": 1.0899319648742676, + "learning_rate": 9.550407855459755e-05, + "loss": 1.9917, + "step": 27496 + }, + { + "epoch": 2.643179851965779, + "grad_norm": 1.1284003257751465, + "learning_rate": 9.549254100981156e-05, + "loss": 2.0281, + "step": 27497 + }, + { + "epoch": 2.643275978083245, + "grad_norm": 1.031206727027893, + "learning_rate": 9.548100385915783e-05, + "loss": 1.9005, + "step": 27498 + }, + { + "epoch": 2.6433721042007114, + "grad_norm": 1.2251687049865723, + "learning_rate": 9.546946710271754e-05, + "loss": 2.0134, + "step": 27499 + }, + { + "epoch": 2.6434682303181773, + "grad_norm": 1.1205445528030396, + "learning_rate": 9.545793074057183e-05, + "loss": 1.7976, + "step": 27500 + }, + { + "epoch": 2.6435643564356437, + "grad_norm": 1.2893681526184082, + "learning_rate": 9.544639477280195e-05, + "loss": 2.101, + "step": 27501 + }, + { + "epoch": 2.6436604825531096, + "grad_norm": 1.0702623128890991, + "learning_rate": 9.543485919948907e-05, + "loss": 2.0765, + "step": 27502 + }, + { + "epoch": 2.643756608670576, + "grad_norm": 1.115605115890503, + "learning_rate": 9.542332402071436e-05, + "loss": 2.0252, + "step": 27503 + }, + { + "epoch": 2.643852734788042, + "grad_norm": 1.2194416522979736, + "learning_rate": 9.541178923655897e-05, + "loss": 2.0019, + "step": 27504 + }, + { + "epoch": 2.643948860905508, + "grad_norm": 1.1748698949813843, + "learning_rate": 9.540025484710411e-05, + "loss": 2.05, + "step": 27505 + }, + { + "epoch": 2.644044987022974, + "grad_norm": 0.9800822734832764, + "learning_rate": 9.538872085243092e-05, + "loss": 1.8302, + "step": 27506 + }, + { + "epoch": 2.6441411131404404, + "grad_norm": 1.0600316524505615, + "learning_rate": 9.537718725262059e-05, + "loss": 1.849, + "step": 27507 + }, + { + "epoch": 2.6442372392579063, + "grad_norm": 1.1277844905853271, + "learning_rate": 9.536565404775424e-05, + "loss": 1.8883, + "step": 27508 + }, + { + "epoch": 2.6443333653753722, + "grad_norm": 1.1037729978561401, + "learning_rate": 9.535412123791307e-05, + "loss": 1.873, + "step": 27509 + }, + { + "epoch": 2.6444294914928386, + "grad_norm": 1.050053358078003, + "learning_rate": 9.534258882317823e-05, + "loss": 1.8508, + "step": 27510 + }, + { + "epoch": 2.644525617610305, + "grad_norm": 1.109379529953003, + "learning_rate": 9.533105680363088e-05, + "loss": 2.0139, + "step": 27511 + }, + { + "epoch": 2.644621743727771, + "grad_norm": 1.0684391260147095, + "learning_rate": 9.531952517935215e-05, + "loss": 2.0868, + "step": 27512 + }, + { + "epoch": 2.6447178698452367, + "grad_norm": 1.0308496952056885, + "learning_rate": 9.53079939504232e-05, + "loss": 2.1395, + "step": 27513 + }, + { + "epoch": 2.644813995962703, + "grad_norm": 0.9713638424873352, + "learning_rate": 9.52964631169252e-05, + "loss": 2.0006, + "step": 27514 + }, + { + "epoch": 2.6449101220801694, + "grad_norm": 1.2023353576660156, + "learning_rate": 9.528493267893927e-05, + "loss": 2.0296, + "step": 27515 + }, + { + "epoch": 2.6450062481976353, + "grad_norm": 1.4670634269714355, + "learning_rate": 9.527340263654655e-05, + "loss": 2.2056, + "step": 27516 + }, + { + "epoch": 2.645102374315101, + "grad_norm": 1.164759635925293, + "learning_rate": 9.526187298982815e-05, + "loss": 1.9794, + "step": 27517 + }, + { + "epoch": 2.6451985004325675, + "grad_norm": 1.192465901374817, + "learning_rate": 9.52503437388653e-05, + "loss": 2.0529, + "step": 27518 + }, + { + "epoch": 2.645294626550034, + "grad_norm": 1.2591606378555298, + "learning_rate": 9.523881488373903e-05, + "loss": 2.0924, + "step": 27519 + }, + { + "epoch": 2.6453907526675, + "grad_norm": 1.0842074155807495, + "learning_rate": 9.522728642453055e-05, + "loss": 1.8404, + "step": 27520 + }, + { + "epoch": 2.6454868787849657, + "grad_norm": 1.1287676095962524, + "learning_rate": 9.521575836132094e-05, + "loss": 1.9778, + "step": 27521 + }, + { + "epoch": 2.645583004902432, + "grad_norm": 1.0094692707061768, + "learning_rate": 9.520423069419133e-05, + "loss": 1.9686, + "step": 27522 + }, + { + "epoch": 2.6456791310198984, + "grad_norm": 1.0553778409957886, + "learning_rate": 9.519270342322286e-05, + "loss": 1.9168, + "step": 27523 + }, + { + "epoch": 2.6457752571373643, + "grad_norm": 0.9568527936935425, + "learning_rate": 9.518117654849664e-05, + "loss": 1.8904, + "step": 27524 + }, + { + "epoch": 2.64587138325483, + "grad_norm": 1.1637961864471436, + "learning_rate": 9.516965007009375e-05, + "loss": 2.0341, + "step": 27525 + }, + { + "epoch": 2.6459675093722965, + "grad_norm": 1.0361589193344116, + "learning_rate": 9.515812398809538e-05, + "loss": 1.8999, + "step": 27526 + }, + { + "epoch": 2.6460636354897624, + "grad_norm": 1.1169748306274414, + "learning_rate": 9.514659830258257e-05, + "loss": 1.9416, + "step": 27527 + }, + { + "epoch": 2.6461597616072288, + "grad_norm": 1.1638120412826538, + "learning_rate": 9.513507301363645e-05, + "loss": 1.9838, + "step": 27528 + }, + { + "epoch": 2.6462558877246947, + "grad_norm": 1.0210742950439453, + "learning_rate": 9.512354812133814e-05, + "loss": 1.864, + "step": 27529 + }, + { + "epoch": 2.646352013842161, + "grad_norm": 1.1220653057098389, + "learning_rate": 9.511202362576875e-05, + "loss": 1.7743, + "step": 27530 + }, + { + "epoch": 2.646448139959627, + "grad_norm": 1.1238858699798584, + "learning_rate": 9.510049952700935e-05, + "loss": 1.9717, + "step": 27531 + }, + { + "epoch": 2.6465442660770933, + "grad_norm": 1.3250179290771484, + "learning_rate": 9.508897582514104e-05, + "loss": 2.106, + "step": 27532 + }, + { + "epoch": 2.646640392194559, + "grad_norm": 1.175326943397522, + "learning_rate": 9.507745252024492e-05, + "loss": 1.9444, + "step": 27533 + }, + { + "epoch": 2.6467365183120255, + "grad_norm": 1.1066055297851562, + "learning_rate": 9.506592961240206e-05, + "loss": 2.0263, + "step": 27534 + }, + { + "epoch": 2.6468326444294914, + "grad_norm": 1.1149178743362427, + "learning_rate": 9.50544071016936e-05, + "loss": 2.1848, + "step": 27535 + }, + { + "epoch": 2.6469287705469577, + "grad_norm": 1.077883243560791, + "learning_rate": 9.504288498820062e-05, + "loss": 1.962, + "step": 27536 + }, + { + "epoch": 2.6470248966644236, + "grad_norm": 1.1135011911392212, + "learning_rate": 9.503136327200416e-05, + "loss": 1.9819, + "step": 27537 + }, + { + "epoch": 2.64712102278189, + "grad_norm": 1.1280343532562256, + "learning_rate": 9.50198419531853e-05, + "loss": 1.9057, + "step": 27538 + }, + { + "epoch": 2.647217148899356, + "grad_norm": 1.174781084060669, + "learning_rate": 9.500832103182517e-05, + "loss": 2.0587, + "step": 27539 + }, + { + "epoch": 2.6473132750168222, + "grad_norm": 1.2578011751174927, + "learning_rate": 9.49968005080048e-05, + "loss": 1.9773, + "step": 27540 + }, + { + "epoch": 2.647409401134288, + "grad_norm": 1.1375997066497803, + "learning_rate": 9.498528038180524e-05, + "loss": 2.0902, + "step": 27541 + }, + { + "epoch": 2.647505527251754, + "grad_norm": 1.1054704189300537, + "learning_rate": 9.497376065330763e-05, + "loss": 1.8222, + "step": 27542 + }, + { + "epoch": 2.6476016533692204, + "grad_norm": 1.1866285800933838, + "learning_rate": 9.4962241322593e-05, + "loss": 1.9959, + "step": 27543 + }, + { + "epoch": 2.6476977794866867, + "grad_norm": 1.10966956615448, + "learning_rate": 9.49507223897424e-05, + "loss": 2.13, + "step": 27544 + }, + { + "epoch": 2.6477939056041526, + "grad_norm": 1.1801731586456299, + "learning_rate": 9.49392038548369e-05, + "loss": 2.0966, + "step": 27545 + }, + { + "epoch": 2.6478900317216185, + "grad_norm": 1.144183874130249, + "learning_rate": 9.492768571795753e-05, + "loss": 1.8962, + "step": 27546 + }, + { + "epoch": 2.647986157839085, + "grad_norm": 0.9963379502296448, + "learning_rate": 9.491616797918544e-05, + "loss": 1.8198, + "step": 27547 + }, + { + "epoch": 2.648082283956551, + "grad_norm": 1.1846929788589478, + "learning_rate": 9.490465063860155e-05, + "loss": 2.134, + "step": 27548 + }, + { + "epoch": 2.648178410074017, + "grad_norm": 1.2565759420394897, + "learning_rate": 9.4893133696287e-05, + "loss": 1.984, + "step": 27549 + }, + { + "epoch": 2.648274536191483, + "grad_norm": 1.23198664188385, + "learning_rate": 9.488161715232283e-05, + "loss": 2.0782, + "step": 27550 + }, + { + "epoch": 2.6483706623089494, + "grad_norm": 1.251273512840271, + "learning_rate": 9.487010100679e-05, + "loss": 2.0867, + "step": 27551 + }, + { + "epoch": 2.6484667884264157, + "grad_norm": 1.4543591737747192, + "learning_rate": 9.485858525976968e-05, + "loss": 2.0084, + "step": 27552 + }, + { + "epoch": 2.6485629145438816, + "grad_norm": 1.1029964685440063, + "learning_rate": 9.48470699113428e-05, + "loss": 1.9016, + "step": 27553 + }, + { + "epoch": 2.6486590406613475, + "grad_norm": 1.1822928190231323, + "learning_rate": 9.483555496159047e-05, + "loss": 1.9994, + "step": 27554 + }, + { + "epoch": 2.648755166778814, + "grad_norm": 1.2020684480667114, + "learning_rate": 9.482404041059367e-05, + "loss": 2.2888, + "step": 27555 + }, + { + "epoch": 2.64885129289628, + "grad_norm": 1.1533658504486084, + "learning_rate": 9.481252625843346e-05, + "loss": 1.9871, + "step": 27556 + }, + { + "epoch": 2.648947419013746, + "grad_norm": 1.2534122467041016, + "learning_rate": 9.480101250519087e-05, + "loss": 2.0104, + "step": 27557 + }, + { + "epoch": 2.649043545131212, + "grad_norm": 1.056008219718933, + "learning_rate": 9.47894991509469e-05, + "loss": 1.8568, + "step": 27558 + }, + { + "epoch": 2.6491396712486783, + "grad_norm": 1.1970171928405762, + "learning_rate": 9.477798619578257e-05, + "loss": 2.2203, + "step": 27559 + }, + { + "epoch": 2.6492357973661442, + "grad_norm": 1.0988928079605103, + "learning_rate": 9.476647363977892e-05, + "loss": 2.0703, + "step": 27560 + }, + { + "epoch": 2.6493319234836106, + "grad_norm": 1.1461619138717651, + "learning_rate": 9.475496148301695e-05, + "loss": 2.1641, + "step": 27561 + }, + { + "epoch": 2.6494280496010765, + "grad_norm": 0.9981667399406433, + "learning_rate": 9.474344972557768e-05, + "loss": 1.9061, + "step": 27562 + }, + { + "epoch": 2.649524175718543, + "grad_norm": 1.2767796516418457, + "learning_rate": 9.473193836754211e-05, + "loss": 2.0113, + "step": 27563 + }, + { + "epoch": 2.6496203018360087, + "grad_norm": 1.015819787979126, + "learning_rate": 9.472042740899127e-05, + "loss": 1.93, + "step": 27564 + }, + { + "epoch": 2.649716427953475, + "grad_norm": 1.1721373796463013, + "learning_rate": 9.470891685000613e-05, + "loss": 2.2366, + "step": 27565 + }, + { + "epoch": 2.649812554070941, + "grad_norm": 1.1060370206832886, + "learning_rate": 9.46974066906677e-05, + "loss": 1.9668, + "step": 27566 + }, + { + "epoch": 2.6499086801884073, + "grad_norm": 1.035300612449646, + "learning_rate": 9.468589693105705e-05, + "loss": 1.7841, + "step": 27567 + }, + { + "epoch": 2.650004806305873, + "grad_norm": 1.3455463647842407, + "learning_rate": 9.467438757125502e-05, + "loss": 2.1066, + "step": 27568 + }, + { + "epoch": 2.6501009324233396, + "grad_norm": 1.2583295106887817, + "learning_rate": 9.466287861134282e-05, + "loss": 2.0203, + "step": 27569 + }, + { + "epoch": 2.6501970585408055, + "grad_norm": 1.1144429445266724, + "learning_rate": 9.465137005140125e-05, + "loss": 2.1415, + "step": 27570 + }, + { + "epoch": 2.650293184658272, + "grad_norm": 1.1091240644454956, + "learning_rate": 9.463986189151137e-05, + "loss": 2.0472, + "step": 27571 + }, + { + "epoch": 2.6503893107757377, + "grad_norm": 1.0318270921707153, + "learning_rate": 9.462835413175416e-05, + "loss": 2.0618, + "step": 27572 + }, + { + "epoch": 2.650485436893204, + "grad_norm": 1.2221057415008545, + "learning_rate": 9.461684677221057e-05, + "loss": 1.9976, + "step": 27573 + }, + { + "epoch": 2.65058156301067, + "grad_norm": 1.1410789489746094, + "learning_rate": 9.460533981296163e-05, + "loss": 2.0574, + "step": 27574 + }, + { + "epoch": 2.650677689128136, + "grad_norm": 1.0755882263183594, + "learning_rate": 9.45938332540883e-05, + "loss": 2.042, + "step": 27575 + }, + { + "epoch": 2.650773815245602, + "grad_norm": 1.0910338163375854, + "learning_rate": 9.458232709567155e-05, + "loss": 1.9667, + "step": 27576 + }, + { + "epoch": 2.6508699413630685, + "grad_norm": 1.102988839149475, + "learning_rate": 9.457082133779234e-05, + "loss": 2.0691, + "step": 27577 + }, + { + "epoch": 2.6509660674805344, + "grad_norm": 1.1954021453857422, + "learning_rate": 9.455931598053162e-05, + "loss": 2.045, + "step": 27578 + }, + { + "epoch": 2.6510621935980003, + "grad_norm": 1.0114824771881104, + "learning_rate": 9.454781102397044e-05, + "loss": 2.0382, + "step": 27579 + }, + { + "epoch": 2.6511583197154667, + "grad_norm": 1.1331640481948853, + "learning_rate": 9.453630646818967e-05, + "loss": 1.9429, + "step": 27580 + }, + { + "epoch": 2.651254445832933, + "grad_norm": 1.0665459632873535, + "learning_rate": 9.452480231327031e-05, + "loss": 1.8739, + "step": 27581 + }, + { + "epoch": 2.651350571950399, + "grad_norm": 1.058164119720459, + "learning_rate": 9.45132985592933e-05, + "loss": 2.0352, + "step": 27582 + }, + { + "epoch": 2.651446698067865, + "grad_norm": 1.2325505018234253, + "learning_rate": 9.450179520633963e-05, + "loss": 2.0436, + "step": 27583 + }, + { + "epoch": 2.651542824185331, + "grad_norm": 1.1576083898544312, + "learning_rate": 9.449029225449018e-05, + "loss": 2.0672, + "step": 27584 + }, + { + "epoch": 2.6516389503027975, + "grad_norm": 1.1589030027389526, + "learning_rate": 9.4478789703826e-05, + "loss": 1.8494, + "step": 27585 + }, + { + "epoch": 2.6517350764202634, + "grad_norm": 1.2373300790786743, + "learning_rate": 9.446728755442794e-05, + "loss": 2.1783, + "step": 27586 + }, + { + "epoch": 2.6518312025377293, + "grad_norm": 1.1517140865325928, + "learning_rate": 9.445578580637699e-05, + "loss": 1.8323, + "step": 27587 + }, + { + "epoch": 2.6519273286551956, + "grad_norm": 1.2103519439697266, + "learning_rate": 9.444428445975406e-05, + "loss": 2.024, + "step": 27588 + }, + { + "epoch": 2.652023454772662, + "grad_norm": 1.138296365737915, + "learning_rate": 9.443278351464014e-05, + "loss": 1.966, + "step": 27589 + }, + { + "epoch": 2.652119580890128, + "grad_norm": 0.9951560497283936, + "learning_rate": 9.44212829711161e-05, + "loss": 1.8252, + "step": 27590 + }, + { + "epoch": 2.652215707007594, + "grad_norm": 1.1677613258361816, + "learning_rate": 9.44097828292629e-05, + "loss": 2.0562, + "step": 27591 + }, + { + "epoch": 2.65231183312506, + "grad_norm": 1.097126841545105, + "learning_rate": 9.439828308916149e-05, + "loss": 1.9319, + "step": 27592 + }, + { + "epoch": 2.652407959242526, + "grad_norm": 1.2353616952896118, + "learning_rate": 9.438678375089275e-05, + "loss": 1.9987, + "step": 27593 + }, + { + "epoch": 2.6525040853599924, + "grad_norm": 0.9864879250526428, + "learning_rate": 9.437528481453763e-05, + "loss": 1.973, + "step": 27594 + }, + { + "epoch": 2.6526002114774583, + "grad_norm": 0.8551456928253174, + "learning_rate": 9.436378628017701e-05, + "loss": 1.7506, + "step": 27595 + }, + { + "epoch": 2.6526963375949246, + "grad_norm": 1.036637783050537, + "learning_rate": 9.435228814789188e-05, + "loss": 1.9697, + "step": 27596 + }, + { + "epoch": 2.6527924637123905, + "grad_norm": 1.0014036893844604, + "learning_rate": 9.434079041776312e-05, + "loss": 1.9103, + "step": 27597 + }, + { + "epoch": 2.652888589829857, + "grad_norm": 1.0722439289093018, + "learning_rate": 9.432929308987162e-05, + "loss": 1.993, + "step": 27598 + }, + { + "epoch": 2.6529847159473228, + "grad_norm": 1.3031713962554932, + "learning_rate": 9.431779616429832e-05, + "loss": 2.1453, + "step": 27599 + }, + { + "epoch": 2.653080842064789, + "grad_norm": 1.1688123941421509, + "learning_rate": 9.430629964112412e-05, + "loss": 1.9504, + "step": 27600 + }, + { + "epoch": 2.653176968182255, + "grad_norm": 1.039182424545288, + "learning_rate": 9.42948035204299e-05, + "loss": 1.7862, + "step": 27601 + }, + { + "epoch": 2.6532730942997214, + "grad_norm": 1.1108736991882324, + "learning_rate": 9.428330780229655e-05, + "loss": 2.0186, + "step": 27602 + }, + { + "epoch": 2.6533692204171873, + "grad_norm": 1.2208759784698486, + "learning_rate": 9.427181248680503e-05, + "loss": 1.968, + "step": 27603 + }, + { + "epoch": 2.6534653465346536, + "grad_norm": 1.0834072828292847, + "learning_rate": 9.426031757403617e-05, + "loss": 1.9732, + "step": 27604 + }, + { + "epoch": 2.6535614726521195, + "grad_norm": 1.0880051851272583, + "learning_rate": 9.424882306407089e-05, + "loss": 1.7653, + "step": 27605 + }, + { + "epoch": 2.653657598769586, + "grad_norm": 1.2185004949569702, + "learning_rate": 9.423732895699008e-05, + "loss": 1.9984, + "step": 27606 + }, + { + "epoch": 2.6537537248870517, + "grad_norm": 1.0978102684020996, + "learning_rate": 9.422583525287462e-05, + "loss": 1.9415, + "step": 27607 + }, + { + "epoch": 2.653849851004518, + "grad_norm": 1.2670478820800781, + "learning_rate": 9.421434195180539e-05, + "loss": 2.0581, + "step": 27608 + }, + { + "epoch": 2.653945977121984, + "grad_norm": 1.1271849870681763, + "learning_rate": 9.420284905386326e-05, + "loss": 1.9923, + "step": 27609 + }, + { + "epoch": 2.6540421032394503, + "grad_norm": 1.1720190048217773, + "learning_rate": 9.419135655912913e-05, + "loss": 1.9952, + "step": 27610 + }, + { + "epoch": 2.6541382293569162, + "grad_norm": 1.2801686525344849, + "learning_rate": 9.417986446768386e-05, + "loss": 2.0417, + "step": 27611 + }, + { + "epoch": 2.654234355474382, + "grad_norm": 1.1321419477462769, + "learning_rate": 9.416837277960829e-05, + "loss": 1.9867, + "step": 27612 + }, + { + "epoch": 2.6543304815918485, + "grad_norm": 1.3145825862884521, + "learning_rate": 9.415688149498334e-05, + "loss": 2.0357, + "step": 27613 + }, + { + "epoch": 2.654426607709315, + "grad_norm": 1.0879637002944946, + "learning_rate": 9.41453906138899e-05, + "loss": 2.0139, + "step": 27614 + }, + { + "epoch": 2.6545227338267807, + "grad_norm": 1.0623066425323486, + "learning_rate": 9.413390013640876e-05, + "loss": 1.8847, + "step": 27615 + }, + { + "epoch": 2.6546188599442466, + "grad_norm": 1.0340735912322998, + "learning_rate": 9.412241006262084e-05, + "loss": 1.9714, + "step": 27616 + }, + { + "epoch": 2.654714986061713, + "grad_norm": 1.1635913848876953, + "learning_rate": 9.411092039260693e-05, + "loss": 1.986, + "step": 27617 + }, + { + "epoch": 2.6548111121791793, + "grad_norm": 1.1588020324707031, + "learning_rate": 9.409943112644796e-05, + "loss": 1.8867, + "step": 27618 + }, + { + "epoch": 2.654907238296645, + "grad_norm": 1.1202566623687744, + "learning_rate": 9.408794226422473e-05, + "loss": 2.0914, + "step": 27619 + }, + { + "epoch": 2.655003364414111, + "grad_norm": 1.2253029346466064, + "learning_rate": 9.407645380601811e-05, + "loss": 1.903, + "step": 27620 + }, + { + "epoch": 2.6550994905315775, + "grad_norm": 1.0821807384490967, + "learning_rate": 9.406496575190894e-05, + "loss": 2.0427, + "step": 27621 + }, + { + "epoch": 2.655195616649044, + "grad_norm": 1.0022650957107544, + "learning_rate": 9.405347810197805e-05, + "loss": 1.8773, + "step": 27622 + }, + { + "epoch": 2.6552917427665097, + "grad_norm": 1.1172828674316406, + "learning_rate": 9.404199085630631e-05, + "loss": 1.8431, + "step": 27623 + }, + { + "epoch": 2.6553878688839756, + "grad_norm": 1.0651288032531738, + "learning_rate": 9.403050401497454e-05, + "loss": 1.9314, + "step": 27624 + }, + { + "epoch": 2.655483995001442, + "grad_norm": 1.18901789188385, + "learning_rate": 9.401901757806357e-05, + "loss": 2.0175, + "step": 27625 + }, + { + "epoch": 2.655580121118908, + "grad_norm": 1.317147970199585, + "learning_rate": 9.400753154565422e-05, + "loss": 2.2487, + "step": 27626 + }, + { + "epoch": 2.655676247236374, + "grad_norm": 1.0146039724349976, + "learning_rate": 9.399604591782736e-05, + "loss": 1.8812, + "step": 27627 + }, + { + "epoch": 2.65577237335384, + "grad_norm": 1.1397196054458618, + "learning_rate": 9.398456069466378e-05, + "loss": 2.1419, + "step": 27628 + }, + { + "epoch": 2.6558684994713064, + "grad_norm": 1.2056485414505005, + "learning_rate": 9.397307587624427e-05, + "loss": 2.1309, + "step": 27629 + }, + { + "epoch": 2.6559646255887723, + "grad_norm": 1.2068618535995483, + "learning_rate": 9.396159146264975e-05, + "loss": 2.2173, + "step": 27630 + }, + { + "epoch": 2.6560607517062387, + "grad_norm": 1.1375871896743774, + "learning_rate": 9.395010745396096e-05, + "loss": 2.0315, + "step": 27631 + }, + { + "epoch": 2.6561568778237046, + "grad_norm": 1.2667229175567627, + "learning_rate": 9.393862385025876e-05, + "loss": 2.1434, + "step": 27632 + }, + { + "epoch": 2.656253003941171, + "grad_norm": 1.3985646963119507, + "learning_rate": 9.392714065162393e-05, + "loss": 2.1872, + "step": 27633 + }, + { + "epoch": 2.656349130058637, + "grad_norm": 1.0850645303726196, + "learning_rate": 9.39156578581373e-05, + "loss": 2.0292, + "step": 27634 + }, + { + "epoch": 2.656445256176103, + "grad_norm": 1.0392227172851562, + "learning_rate": 9.390417546987962e-05, + "loss": 2.0827, + "step": 27635 + }, + { + "epoch": 2.656541382293569, + "grad_norm": 1.1221463680267334, + "learning_rate": 9.389269348693178e-05, + "loss": 1.6553, + "step": 27636 + }, + { + "epoch": 2.6566375084110354, + "grad_norm": 1.2673639059066772, + "learning_rate": 9.388121190937453e-05, + "loss": 2.1828, + "step": 27637 + }, + { + "epoch": 2.6567336345285013, + "grad_norm": 1.037676453590393, + "learning_rate": 9.386973073728868e-05, + "loss": 2.0411, + "step": 27638 + }, + { + "epoch": 2.6568297606459677, + "grad_norm": 1.1685807704925537, + "learning_rate": 9.385824997075497e-05, + "loss": 2.0343, + "step": 27639 + }, + { + "epoch": 2.6569258867634336, + "grad_norm": 0.9180979132652283, + "learning_rate": 9.38467696098543e-05, + "loss": 2.0056, + "step": 27640 + }, + { + "epoch": 2.6570220128809, + "grad_norm": 0.967477560043335, + "learning_rate": 9.383528965466738e-05, + "loss": 1.8868, + "step": 27641 + }, + { + "epoch": 2.657118138998366, + "grad_norm": 1.0900111198425293, + "learning_rate": 9.382381010527502e-05, + "loss": 1.9394, + "step": 27642 + }, + { + "epoch": 2.657214265115832, + "grad_norm": 1.3064086437225342, + "learning_rate": 9.381233096175797e-05, + "loss": 2.0089, + "step": 27643 + }, + { + "epoch": 2.657310391233298, + "grad_norm": 1.1036936044692993, + "learning_rate": 9.380085222419706e-05, + "loss": 1.9616, + "step": 27644 + }, + { + "epoch": 2.657406517350764, + "grad_norm": 1.0363891124725342, + "learning_rate": 9.378937389267305e-05, + "loss": 1.9841, + "step": 27645 + }, + { + "epoch": 2.6575026434682303, + "grad_norm": 1.1711972951889038, + "learning_rate": 9.377789596726668e-05, + "loss": 1.9473, + "step": 27646 + }, + { + "epoch": 2.6575987695856966, + "grad_norm": 1.0040545463562012, + "learning_rate": 9.376641844805878e-05, + "loss": 1.843, + "step": 27647 + }, + { + "epoch": 2.6576948957031625, + "grad_norm": 1.2767059803009033, + "learning_rate": 9.37549413351301e-05, + "loss": 2.0415, + "step": 27648 + }, + { + "epoch": 2.6577910218206284, + "grad_norm": 1.0584447383880615, + "learning_rate": 9.374346462856137e-05, + "loss": 2.086, + "step": 27649 + }, + { + "epoch": 2.6578871479380948, + "grad_norm": 0.9956802725791931, + "learning_rate": 9.373198832843342e-05, + "loss": 1.8756, + "step": 27650 + }, + { + "epoch": 2.657983274055561, + "grad_norm": 1.0209287405014038, + "learning_rate": 9.372051243482693e-05, + "loss": 1.9699, + "step": 27651 + }, + { + "epoch": 2.658079400173027, + "grad_norm": 1.1554722785949707, + "learning_rate": 9.370903694782274e-05, + "loss": 1.946, + "step": 27652 + }, + { + "epoch": 2.658175526290493, + "grad_norm": 1.1876040697097778, + "learning_rate": 9.369756186750155e-05, + "loss": 2.241, + "step": 27653 + }, + { + "epoch": 2.6582716524079593, + "grad_norm": 1.0066585540771484, + "learning_rate": 9.36860871939441e-05, + "loss": 1.9881, + "step": 27654 + }, + { + "epoch": 2.6583677785254256, + "grad_norm": 0.9630988836288452, + "learning_rate": 9.36746129272312e-05, + "loss": 1.903, + "step": 27655 + }, + { + "epoch": 2.6584639046428915, + "grad_norm": 1.0568125247955322, + "learning_rate": 9.366313906744353e-05, + "loss": 2.0723, + "step": 27656 + }, + { + "epoch": 2.6585600307603574, + "grad_norm": 1.0362074375152588, + "learning_rate": 9.365166561466189e-05, + "loss": 1.8638, + "step": 27657 + }, + { + "epoch": 2.6586561568778238, + "grad_norm": 1.067445158958435, + "learning_rate": 9.364019256896698e-05, + "loss": 1.7618, + "step": 27658 + }, + { + "epoch": 2.65875228299529, + "grad_norm": 0.9897101521492004, + "learning_rate": 9.362871993043954e-05, + "loss": 2.0606, + "step": 27659 + }, + { + "epoch": 2.658848409112756, + "grad_norm": 1.34538996219635, + "learning_rate": 9.361724769916032e-05, + "loss": 2.0367, + "step": 27660 + }, + { + "epoch": 2.658944535230222, + "grad_norm": 1.0909219980239868, + "learning_rate": 9.360577587521008e-05, + "loss": 2.0257, + "step": 27661 + }, + { + "epoch": 2.6590406613476882, + "grad_norm": 1.4456158876419067, + "learning_rate": 9.359430445866948e-05, + "loss": 2.0781, + "step": 27662 + }, + { + "epoch": 2.659136787465154, + "grad_norm": 1.2306289672851562, + "learning_rate": 9.358283344961926e-05, + "loss": 2.1379, + "step": 27663 + }, + { + "epoch": 2.6592329135826205, + "grad_norm": 1.019844651222229, + "learning_rate": 9.357136284814018e-05, + "loss": 1.9625, + "step": 27664 + }, + { + "epoch": 2.6593290397000864, + "grad_norm": 1.0192984342575073, + "learning_rate": 9.355989265431298e-05, + "loss": 2.1029, + "step": 27665 + }, + { + "epoch": 2.6594251658175527, + "grad_norm": 1.1450365781784058, + "learning_rate": 9.354842286821833e-05, + "loss": 1.9294, + "step": 27666 + }, + { + "epoch": 2.6595212919350186, + "grad_norm": 1.2580152750015259, + "learning_rate": 9.353695348993693e-05, + "loss": 2.0033, + "step": 27667 + }, + { + "epoch": 2.659617418052485, + "grad_norm": 0.991101086139679, + "learning_rate": 9.352548451954953e-05, + "loss": 2.0553, + "step": 27668 + }, + { + "epoch": 2.659713544169951, + "grad_norm": 1.0725942850112915, + "learning_rate": 9.351401595713684e-05, + "loss": 1.9936, + "step": 27669 + }, + { + "epoch": 2.659809670287417, + "grad_norm": 1.1737585067749023, + "learning_rate": 9.350254780277955e-05, + "loss": 2.0788, + "step": 27670 + }, + { + "epoch": 2.659905796404883, + "grad_norm": 1.0635576248168945, + "learning_rate": 9.349108005655838e-05, + "loss": 2.1247, + "step": 27671 + }, + { + "epoch": 2.6600019225223495, + "grad_norm": 1.215745449066162, + "learning_rate": 9.3479612718554e-05, + "loss": 1.9907, + "step": 27672 + }, + { + "epoch": 2.6600980486398154, + "grad_norm": 1.031640648841858, + "learning_rate": 9.346814578884712e-05, + "loss": 1.9035, + "step": 27673 + }, + { + "epoch": 2.6601941747572817, + "grad_norm": 1.3557668924331665, + "learning_rate": 9.345667926751845e-05, + "loss": 2.0137, + "step": 27674 + }, + { + "epoch": 2.6602903008747476, + "grad_norm": 1.1160348653793335, + "learning_rate": 9.344521315464866e-05, + "loss": 2.1193, + "step": 27675 + }, + { + "epoch": 2.660386426992214, + "grad_norm": 1.2146492004394531, + "learning_rate": 9.343374745031843e-05, + "loss": 2.1402, + "step": 27676 + }, + { + "epoch": 2.66048255310968, + "grad_norm": 0.9427669644355774, + "learning_rate": 9.342228215460849e-05, + "loss": 1.8161, + "step": 27677 + }, + { + "epoch": 2.6605786792271457, + "grad_norm": 1.09123694896698, + "learning_rate": 9.341081726759949e-05, + "loss": 1.991, + "step": 27678 + }, + { + "epoch": 2.660674805344612, + "grad_norm": 1.2293637990951538, + "learning_rate": 9.33993527893721e-05, + "loss": 2.0742, + "step": 27679 + }, + { + "epoch": 2.6607709314620784, + "grad_norm": 1.0287816524505615, + "learning_rate": 9.338788872000702e-05, + "loss": 2.1521, + "step": 27680 + }, + { + "epoch": 2.6608670575795443, + "grad_norm": 0.8937469720840454, + "learning_rate": 9.337642505958492e-05, + "loss": 1.7582, + "step": 27681 + }, + { + "epoch": 2.6609631836970102, + "grad_norm": 1.2108832597732544, + "learning_rate": 9.336496180818649e-05, + "loss": 2.0795, + "step": 27682 + }, + { + "epoch": 2.6610593098144766, + "grad_norm": 0.9519547820091248, + "learning_rate": 9.335349896589237e-05, + "loss": 1.93, + "step": 27683 + }, + { + "epoch": 2.661155435931943, + "grad_norm": 1.0164955854415894, + "learning_rate": 9.334203653278322e-05, + "loss": 1.9635, + "step": 27684 + }, + { + "epoch": 2.661251562049409, + "grad_norm": 1.177566409111023, + "learning_rate": 9.33305745089397e-05, + "loss": 1.9767, + "step": 27685 + }, + { + "epoch": 2.6613476881668747, + "grad_norm": 1.0799152851104736, + "learning_rate": 9.331911289444252e-05, + "loss": 2.0455, + "step": 27686 + }, + { + "epoch": 2.661443814284341, + "grad_norm": 1.2129110097885132, + "learning_rate": 9.33076516893723e-05, + "loss": 2.0137, + "step": 27687 + }, + { + "epoch": 2.6615399404018074, + "grad_norm": 1.0328607559204102, + "learning_rate": 9.32961908938097e-05, + "loss": 1.9593, + "step": 27688 + }, + { + "epoch": 2.6616360665192733, + "grad_norm": 1.1508455276489258, + "learning_rate": 9.328473050783536e-05, + "loss": 1.951, + "step": 27689 + }, + { + "epoch": 2.661732192636739, + "grad_norm": 1.0371991395950317, + "learning_rate": 9.327327053152993e-05, + "loss": 2.0293, + "step": 27690 + }, + { + "epoch": 2.6618283187542056, + "grad_norm": 1.1727951765060425, + "learning_rate": 9.326181096497406e-05, + "loss": 2.1204, + "step": 27691 + }, + { + "epoch": 2.661924444871672, + "grad_norm": 1.162739634513855, + "learning_rate": 9.325035180824842e-05, + "loss": 1.9779, + "step": 27692 + }, + { + "epoch": 2.662020570989138, + "grad_norm": 1.1999047994613647, + "learning_rate": 9.323889306143359e-05, + "loss": 2.1339, + "step": 27693 + }, + { + "epoch": 2.6621166971066037, + "grad_norm": 0.9939467310905457, + "learning_rate": 9.322743472461025e-05, + "loss": 1.9992, + "step": 27694 + }, + { + "epoch": 2.66221282322407, + "grad_norm": 1.1945123672485352, + "learning_rate": 9.321597679785905e-05, + "loss": 2.0042, + "step": 27695 + }, + { + "epoch": 2.662308949341536, + "grad_norm": 1.180564284324646, + "learning_rate": 9.320451928126058e-05, + "loss": 1.8702, + "step": 27696 + }, + { + "epoch": 2.6624050754590023, + "grad_norm": 1.1913275718688965, + "learning_rate": 9.319306217489544e-05, + "loss": 2.0569, + "step": 27697 + }, + { + "epoch": 2.662501201576468, + "grad_norm": 1.0063618421554565, + "learning_rate": 9.318160547884434e-05, + "loss": 1.822, + "step": 27698 + }, + { + "epoch": 2.6625973276939345, + "grad_norm": 1.04045832157135, + "learning_rate": 9.317014919318788e-05, + "loss": 2.0955, + "step": 27699 + }, + { + "epoch": 2.6626934538114004, + "grad_norm": 1.1587090492248535, + "learning_rate": 9.315869331800664e-05, + "loss": 1.9162, + "step": 27700 + }, + { + "epoch": 2.662789579928867, + "grad_norm": 1.0611153841018677, + "learning_rate": 9.314723785338126e-05, + "loss": 2.0713, + "step": 27701 + }, + { + "epoch": 2.6628857060463327, + "grad_norm": 1.3449534177780151, + "learning_rate": 9.313578279939235e-05, + "loss": 1.9409, + "step": 27702 + }, + { + "epoch": 2.662981832163799, + "grad_norm": 1.038962721824646, + "learning_rate": 9.312432815612052e-05, + "loss": 2.0777, + "step": 27703 + }, + { + "epoch": 2.663077958281265, + "grad_norm": 1.3147997856140137, + "learning_rate": 9.311287392364641e-05, + "loss": 1.9454, + "step": 27704 + }, + { + "epoch": 2.6631740843987313, + "grad_norm": 1.2107787132263184, + "learning_rate": 9.310142010205055e-05, + "loss": 2.0726, + "step": 27705 + }, + { + "epoch": 2.663270210516197, + "grad_norm": 1.1245430707931519, + "learning_rate": 9.30899666914136e-05, + "loss": 1.9178, + "step": 27706 + }, + { + "epoch": 2.6633663366336635, + "grad_norm": 1.1699509620666504, + "learning_rate": 9.307851369181617e-05, + "loss": 2.033, + "step": 27707 + }, + { + "epoch": 2.6634624627511294, + "grad_norm": 0.9878363013267517, + "learning_rate": 9.306706110333881e-05, + "loss": 1.8519, + "step": 27708 + }, + { + "epoch": 2.6635585888685958, + "grad_norm": 1.1204464435577393, + "learning_rate": 9.305560892606214e-05, + "loss": 2.0089, + "step": 27709 + }, + { + "epoch": 2.6636547149860617, + "grad_norm": 1.0385180711746216, + "learning_rate": 9.304415716006675e-05, + "loss": 1.9786, + "step": 27710 + }, + { + "epoch": 2.6637508411035276, + "grad_norm": 1.2259272336959839, + "learning_rate": 9.303270580543324e-05, + "loss": 2.0883, + "step": 27711 + }, + { + "epoch": 2.663846967220994, + "grad_norm": 1.0911121368408203, + "learning_rate": 9.302125486224215e-05, + "loss": 1.9946, + "step": 27712 + }, + { + "epoch": 2.6639430933384602, + "grad_norm": 1.0138510465621948, + "learning_rate": 9.300980433057411e-05, + "loss": 1.8751, + "step": 27713 + }, + { + "epoch": 2.664039219455926, + "grad_norm": 1.0029278993606567, + "learning_rate": 9.299835421050966e-05, + "loss": 2.0546, + "step": 27714 + }, + { + "epoch": 2.664039219455926, + "eval_train_loss": 1.9114065170288086, + "eval_train_mean_batch_perplexity": 7.680344964919399, + "eval_train_runtime": 13146.4053, + "eval_train_samples_per_second": 12.661, + "eval_train_steps_per_second": 0.791, + "step": 27714 + }, + { + "epoch": 2.664039219455926, + "eval_test_loss": 2.0666396617889404, + "eval_test_mean_batch_perplexity": 9.027373768004038, + "eval_test_runtime": 2832.4175, + "eval_test_samples_per_second": 12.593, + "eval_test_steps_per_second": 0.787, + "step": 27714 } ], "logging_steps": 1, @@ -161799,7 +194150,7 @@ "attributes": {} } }, - "total_flos": 5.849053308504244e+18, + "total_flos": 7.018844896573784e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null